From 7c1705712dddc0d80ad33fdacc6e37e9272d83aa Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Sat, 1 Mar 2025 18:17:11 +0530 Subject: [PATCH 01/78] fix: https://github.com/unclecode/crawl4ai/issues/756 --- crawl4ai/content_scraping_strategy.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 46761013..719cab8e 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -471,6 +471,9 @@ class WebScrapingStrategy(ContentScrapingStrategy): return False keep_element = False + # Special case for table elements - always preserve structure + if element.name in ["tr", "td", "th"]: + keep_element = True exclude_domains = kwargs.get("exclude_domains", []) # exclude_social_media_domains = kwargs.get('exclude_social_media_domains', set(SOCIAL_MEDIA_DOMAINS)) @@ -1130,6 +1133,9 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): "source", "track", "wbr", + "tr", + "td", + "th", } for el in reversed(list(root.iterdescendants())): From 5edfea279d6add5a2a2914f862a5d6af67e7b6b5 Mon Sep 17 00:00:00 2001 From: jawshoeadan <62785552+jawshoeadan@users.noreply.github.com> Date: Sun, 2 Mar 2025 16:58:00 +0100 Subject: [PATCH 02/78] Fix LiteLLM branding and link --- docs/md_v2/extraction/llm-strategies.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/md_v2/extraction/llm-strategies.md b/docs/md_v2/extraction/llm-strategies.md index dc2dba1a..d1f68239 100644 --- a/docs/md_v2/extraction/llm-strategies.md +++ b/docs/md_v2/extraction/llm-strategies.md @@ -2,7 +2,7 @@ In some cases, you need to extract **complex or unstructured** information from a webpage that a simple CSS/XPath schema cannot easily parse. Or you want **AI**-driven insights, classification, or summarization. For these scenarios, Crawl4AI provides an **LLM-based extraction strategy** that: -1. Works with **any** large language model supported by [LightLLM](https://github.com/LightLLM) (Ollama, OpenAI, Claude, and more). +1. Works with **any** large language model supported by [LiteLLM](https://github.com/BerriAI/litellm) (Ollama, OpenAI, Claude, and more). 2. Automatically splits content into chunks (if desired) to handle token limits, then combines results. 3. Lets you define a **schema** (like a Pydantic model) or a simpler “block” extraction approach. @@ -18,9 +18,9 @@ In some cases, you need to extract **complex or unstructured** information from --- -## 2. Provider-Agnostic via LightLLM +## 2. Provider-Agnostic via LiteLLM -Crawl4AI uses a “provider string” (e.g., `"openai/gpt-4o"`, `"ollama/llama2.0"`, `"aws/titan"`) to identify your LLM. **Any** model that LightLLM supports is fair game. You just provide: +Crawl4AI uses a “provider string” (e.g., `"openai/gpt-4o"`, `"ollama/llama2.0"`, `"aws/titan"`) to identify your LLM. **Any** model that LiteLLM supports is fair game. You just provide: - **`provider`**: The `/` identifier (e.g., `"openai/gpt-4"`, `"ollama/llama2"`, `"huggingface/google-flan"`, etc.). - **`api_token`**: If needed (for OpenAI, HuggingFace, etc.); local models or Ollama might not require it. @@ -288,7 +288,7 @@ if __name__ == "__main__": ## 11. Conclusion -**LLM-based extraction** in Crawl4AI is **provider-agnostic**, letting you choose from hundreds of models via LightLLM. It’s perfect for **semantically complex** tasks or generating advanced structures like knowledge graphs. However, it’s **slower** and potentially costlier than schema-based approaches. 
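A minimal end-to-end sketch, assuming the `provider` / `api_token` keyword arguments described in the section above (the model name, environment variable, and URL are placeholders):

```python
import os, asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy

# Sketch only: swap in any LiteLLM provider string and a matching token.
strategy = LLMExtractionStrategy(
    provider="openai/gpt-4o-mini",
    api_token=os.getenv("OPENAI_API_KEY"),
    extraction_type="block",
    instruction="Summarize the key points of this page as short bullets.",
)

async def main():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://example.com",
            config=CrawlerRunConfig(extraction_strategy=strategy),
        )
        print(result.extracted_content)

asyncio.run(main())
```
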
Keep these tips in mind: +**LLM-based extraction** in Crawl4AI is **provider-agnostic**, letting you choose from hundreds of models via LiteLLM. It’s perfect for **semantically complex** tasks or generating advanced structures like knowledge graphs. However, it’s **slower** and potentially costlier than schema-based approaches. Keep these tips in mind: - Put your LLM strategy **in `CrawlerRunConfig`**. - Use **`input_format`** to pick which form (markdown, HTML, fit_markdown) the LLM sees. @@ -319,4 +319,4 @@ If your site’s data is consistent or repetitive, consider [`JsonCssExtractionS --- -That’s it for **Extracting JSON (LLM)**—now you can harness AI to parse, classify, or reorganize data on the web. Happy crawling! \ No newline at end of file +That’s it for **Extracting JSON (LLM)**—now you can harness AI to parse, classify, or reorganize data on the web. Happy crawling! From 1e819cdb2663d93d3d204760c107182a58d9c77c Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Mon, 3 Mar 2025 11:53:15 +0530 Subject: [PATCH 03/78] fixes: https://github.com/unclecode/crawl4ai/issues/774 --- docs/md_v2/api/parameters.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/md_v2/api/parameters.md b/docs/md_v2/api/parameters.md index ed3828c8..b8a1a213 100644 --- a/docs/md_v2/api/parameters.md +++ b/docs/md_v2/api/parameters.md @@ -230,6 +230,7 @@ async def main(): if __name__ == "__main__": asyncio.run(main()) +``` ## 2.4 Compliance & Ethics From 504207faa61c8b52f8e9e781529248a898288310 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Mon, 3 Mar 2025 19:24:44 +0530 Subject: [PATCH 04/78] docs: update text in llm-strategies.md to reflect new changes in LlmConfig --- docs/md_v2/extraction/llm-strategies.md | 32 ++++++++++++++----------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/docs/md_v2/extraction/llm-strategies.md b/docs/md_v2/extraction/llm-strategies.md index 4effb74b..d40be2db 100644 --- a/docs/md_v2/extraction/llm-strategies.md +++ b/docs/md_v2/extraction/llm-strategies.md @@ -20,11 +20,17 @@ In some cases, you need to extract **complex or unstructured** information from ## 2. Provider-Agnostic via LiteLLM +You can use LlmConfig, to quickly configure multiple variations of LLMs and experiment with them to find the optimal one for your use case. You can read more about LlmConfig [here](/api/parameters). + +```python +llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")) +``` + Crawl4AI uses a “provider string” (e.g., `"openai/gpt-4o"`, `"ollama/llama2.0"`, `"aws/titan"`) to identify your LLM. **Any** model that LiteLLM supports is fair game. You just provide: - **`provider`**: The `/` identifier (e.g., `"openai/gpt-4"`, `"ollama/llama2"`, `"huggingface/google-flan"`, etc.). - **`api_token`**: If needed (for OpenAI, HuggingFace, etc.); local models or Ollama might not require it. -- **`api_base`** (optional): If your provider has a custom endpoint. +- **`base_url`** (optional): If your provider has a custom endpoint. This means you **aren’t locked** into a single LLM vendor. Switch or experiment easily. @@ -52,20 +58,19 @@ For structured data, `"schema"` is recommended. You provide `schema=YourPydantic Below is an overview of important LLM extraction parameters. All are typically set inside `LLMExtractionStrategy(...)`. You then put that strategy in your `CrawlerRunConfig(..., extraction_strategy=...)`. -1. **`provider`** (str): e.g., `"openai/gpt-4"`, `"ollama/llama2"`. -2. **`api_token`** (str): The API key or token for that model. 
May not be needed for local models. -3. **`schema`** (dict): A JSON schema describing the fields you want. Usually generated by `YourModel.model_json_schema()`. -4. **`extraction_type`** (str): `"schema"` or `"block"`. -5. **`instruction`** (str): Prompt text telling the LLM what you want extracted. E.g., “Extract these fields as a JSON array.” -6. **`chunk_token_threshold`** (int): Maximum tokens per chunk. If your content is huge, you can break it up for the LLM. -7. **`overlap_rate`** (float): Overlap ratio between adjacent chunks. E.g., `0.1` means 10% of each chunk is repeated to preserve context continuity. -8. **`apply_chunking`** (bool): Set `True` to chunk automatically. If you want a single pass, set `False`. -9. **`input_format`** (str): Determines **which** crawler result is passed to the LLM. Options include: +1. **`llmConfig`** (LlmConfig): e.g., `"openai/gpt-4"`, `"ollama/llama2"`. +2. **`schema`** (dict): A JSON schema describing the fields you want. Usually generated by `YourModel.model_json_schema()`. +3. **`extraction_type`** (str): `"schema"` or `"block"`. +4. **`instruction`** (str): Prompt text telling the LLM what you want extracted. E.g., “Extract these fields as a JSON array.” +5. **`chunk_token_threshold`** (int): Maximum tokens per chunk. If your content is huge, you can break it up for the LLM. +6. **`overlap_rate`** (float): Overlap ratio between adjacent chunks. E.g., `0.1` means 10% of each chunk is repeated to preserve context continuity. +7. **`apply_chunking`** (bool): Set `True` to chunk automatically. If you want a single pass, set `False`. +8. **`input_format`** (str): Determines **which** crawler result is passed to the LLM. Options include: - `"markdown"`: The raw markdown (default). - `"fit_markdown"`: The filtered “fit” markdown if you used a content filter. - `"html"`: The cleaned or raw HTML. -10. **`extra_args`** (dict): Additional LLM parameters like `temperature`, `max_tokens`, `top_p`, etc. -11. **`show_usage()`**: A method you can call to print out usage info (token usage per chunk, total cost if known). +9. **`extra_args`** (dict): Additional LLM parameters like `temperature`, `max_tokens`, `top_p`, etc. +10. **`show_usage()`**: A method you can call to print out usage info (token usage per chunk, total cost if known). **Example**: @@ -233,8 +238,7 @@ class KnowledgeGraph(BaseModel): async def main(): # LLM extraction strategy llm_strat = LLMExtractionStrategy( - provider="openai/gpt-4", - api_token=os.getenv('OPENAI_API_KEY'), + llmConfig = LlmConfig(provider="openai/gpt-4", api_token=os.getenv('OPENAI_API_KEY')), schema=KnowledgeGraph.schema_json(), extraction_type="schema", instruction="Extract entities and relationships from the content. 
Return valid JSON.", From fc425023f57c92295357b60f95c759b4443ddc64 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 5 Mar 2025 12:51:07 +0800 Subject: [PATCH 05/78] Update config.yml --- deploy/docker/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deploy/docker/config.yml b/deploy/docker/config.yml index fc118bf4..413f2c6b 100644 --- a/deploy/docker/config.yml +++ b/deploy/docker/config.yml @@ -38,7 +38,7 @@ rate_limiting: # Security Configuration security: - enabled: true + enabled: false jwt_enabled: true https_redirect: false trusted_hosts: ["*"] @@ -68,4 +68,4 @@ observability: enabled: True endpoint: "/metrics" health_check: - endpoint: "/health" \ No newline at end of file + endpoint: "/health" From 14fe5ef873d2a8427d634534eb58a6e06ae4152e Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 5 Mar 2025 14:16:24 +0800 Subject: [PATCH 06/78] Update config.yml --- deploy/docker/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/docker/config.yml b/deploy/docker/config.yml index 413f2c6b..8f819827 100644 --- a/deploy/docker/config.yml +++ b/deploy/docker/config.yml @@ -39,7 +39,7 @@ rate_limiting: # Security Configuration security: enabled: false - jwt_enabled: true + jwt_enabled: false https_redirect: false trusted_hosts: ["*"] headers: From 341b7a5f2a4ff900242b7847389d7f6caf28fe2e Mon Sep 17 00:00:00 2001 From: dvschuyl Date: Tue, 11 Mar 2025 11:05:14 +0100 Subject: [PATCH 07/78] =?UTF-8?q?=F0=9F=90=9B=20Truncate=20width=20to=20in?= =?UTF-8?q?teger=20string=20in=20parse=5Fsrcset?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crawl4ai/content_scraping_strategy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 46761013..a7c51dd0 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -48,7 +48,7 @@ def parse_srcset(s: str) -> List[Dict]: if len(parts) >= 1: url = parts[0] width = ( - parts[1].rstrip("w") + parts[1].rstrip("w").split('.')[0] if len(parts) > 1 and parts[1].endswith("w") else None ) From a3954dd4c69a73ec1561e0dd695a72cfcd13abf7 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Fri, 14 Mar 2025 09:39:10 +0530 Subject: [PATCH 08/78] refactor: Move the checking of protocol and prepending protocol inside api handlers --- deploy/docker/api.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/deploy/docker/api.py b/deploy/docker/api.py index cc103905..c5700a9e 100644 --- a/deploy/docker/api.py +++ b/deploy/docker/api.py @@ -48,6 +48,8 @@ async def handle_llm_qa( ) -> str: """Process QA using LLM with crawled content as context.""" try: + if not url.startswith(('http://', 'https://')): + url = 'https://' + url # Extract base URL by finding last '?q=' occurrence last_q_index = url.rfind('?q=') if last_q_index != -1: @@ -61,7 +63,7 @@ async def handle_llm_qa( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=result.error_message ) - content = result.markdown.fit_markdown + content = result.markdown.fit_markdown or result.markdown.raw_markdown # Create prompt and get LLM response prompt = f"""Use the following content as context to answer the question. 
@@ -377,6 +379,7 @@ async def handle_crawl_request( ) -> dict: """Handle non-streaming crawl requests.""" try: + urls = [('https://' + url) if not url.startswith(('http://', 'https://')) else url for url in urls] browser_config = BrowserConfig.load(browser_config) crawler_config = CrawlerRunConfig.load(crawler_config) From c190ba816d88753bb0bc927a8225898b7c3e9de6 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Fri, 14 Mar 2025 09:40:50 +0530 Subject: [PATCH 09/78] refactor: Instead of custom validation of question, rely on the built in FastAPI validator, so generated API docs also reflects this expectation correctly --- deploy/docker/server.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/deploy/docker/server.py b/deploy/docker/server.py index edb55130..40df17d5 100644 --- a/deploy/docker/server.py +++ b/deploy/docker/server.py @@ -101,13 +101,9 @@ async def get_markdown( async def llm_endpoint( request: Request, url: str = Path(...), - q: Optional[str] = Query(None), + q: str = Query(...), token_data: Optional[Dict] = Depends(token_dependency) ): - if not q: - raise HTTPException(status_code=400, detail="Query parameter 'q' is required") - if not url.startswith(('http://', 'https://')): - url = 'https://' + url try: answer = await handle_llm_qa(url, q, config) return JSONResponse({"answer": answer}) @@ -136,7 +132,6 @@ async def crawl( ): if not crawl_request.urls: raise HTTPException(status_code=400, detail="At least one URL required") - results = await handle_crawl_request( urls=crawl_request.urls, browser_config=crawl_request.browser_config, From 79328e42925c9ce8c030a1cadfe68c88cbe02c36 Mon Sep 17 00:00:00 2001 From: Aravind Date: Mon, 17 Mar 2025 18:17:57 +0530 Subject: [PATCH 10/78] Create main.yml (#846) * Create main.yml GH actions to post notifications in discord for new issues, PRs and discussions * Add comments on bugs to the trigger --- .github/workflows/main.yml | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 .github/workflows/main.yml diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 00000000..2d51a74b --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,35 @@ +name: Discord GitHub Notifications + +on: + issues: + types: [opened] + issue_comment: + types: [created] + pull_request: + types: [opened] + discussion: + types: [created] + +jobs: + notify-discord: + runs-on: ubuntu-latest + steps: + - name: Set webhook based on event type + id: set-webhook + run: | + if [ "${{ github.event_name }}" == "discussion" ]; then + echo "webhook=${{ secrets.DISCORD_DISCUSSIONS_WEBHOOK }}" >> $GITHUB_OUTPUT + else + echo "webhook=${{ secrets.DISCORD_WEBHOOK }}" >> $GITHUB_OUTPUT + fi + + - name: Discord Notification + uses: Ilshidur/action-discord@master + env: + DISCORD_WEBHOOK: ${{ steps.set-webhook.outputs.webhook }} + with: + args: | + ${{ github.event_name == 'issues' && format('📣 New issue created: **{0}** by {1} - {2}', github.event.issue.title, github.event.issue.user.login, github.event.issue.html_url) || + github.event_name == 'issue_comment' && format('💬 New comment on issue **{0}** by {1} - {2}', github.event.issue.title, github.event.comment.user.login, github.event.comment.html_url) || + github.event_name == 'pull_request' && format('🔄 New PR opened: **{0}** by {1} - {2}', github.event.pull_request.title, github.event.pull_request.user.login, github.event.pull_request.html_url) || + format('💬 New discussion started: **{0}** by {1} - {2}', 
github.event.discussion.title, github.event.discussion.user.login, github.event.discussion.html_url) }} From 9109ecd8fc50ce9c9b87bd8e58aa863648556f82 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Tue, 18 Mar 2025 15:26:20 +0530 Subject: [PATCH 11/78] chore: Raise an exception with clear messaging when body tag is missing in the fetched html. The message should warn users to add appropriate wait_for condition to wait until body tag is loaded into DOM. fixes: https://github.com/unclecode/crawl4ai/issues/804 --- crawl4ai/content_scraping_strategy.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index ef622abe..215e7cda 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -862,6 +862,8 @@ class WebScrapingStrategy(ContentScrapingStrategy): parser_type = kwargs.get("parser", "lxml") soup = BeautifulSoup(html, parser_type) body = soup.body + if body is None: + raise Exception("'' tag is not found in fetched html. Consider adding wait_for=\"css:body\" to wait for body tag to be loaded into DOM.") base_domain = get_base_domain(url) try: From 529a79725e267e0abd119482bc498d74a414176d Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Tue, 18 Mar 2025 16:14:00 +0530 Subject: [PATCH 12/78] docs: remove hallucinations from docs for CrawlerRunConfig + Add chunking strategy docs in the table --- docs/md_v2/api/parameters.md | 3 ++- docs/md_v2/core/browser-crawler-config.md | 26 ----------------------- 2 files changed, 2 insertions(+), 27 deletions(-) diff --git a/docs/md_v2/api/parameters.md b/docs/md_v2/api/parameters.md index b3e4349b..7e615a8c 100644 --- a/docs/md_v2/api/parameters.md +++ b/docs/md_v2/api/parameters.md @@ -69,7 +69,8 @@ We group them by category. | **Parameter** | **Type / Default** | **What It Does** | |------------------------------|--------------------------------------|-------------------------------------------------------------------------------------------------| | **`word_count_threshold`** | `int` (default: ~200) | Skips text blocks below X words. Helps ignore trivial sections. | -| **`extraction_strategy`** | `ExtractionStrategy` (default: None) | If set, extracts structured data (CSS-based, LLM-based, etc.). | +| **`extraction_strategy`** | `ExtractionStrategy` (default: None) | If set, extracts structured data (CSS-based, LLM-based, etc.). +| **`chunking_strategy`** | `ChunkingStrategy` (default: RegexChunking) | If set, extracts structured data (CSS-based, LLM-based, etc.). | | **`markdown_generator`** | `MarkdownGenerationStrategy` (None) | If you want specialized markdown output (citations, filtering, chunking, etc.). | | **`css_selector`** | `str` (None) | Retains only the part of the page matching this selector. Affects the entire extraction process. | | **`target_elements`** | `List[str]` (None) | List of CSS selectors for elements to focus on for markdown generation and data extraction, while still processing the entire page for links, media, etc. Provides more flexibility than `css_selector`. 
| diff --git a/docs/md_v2/core/browser-crawler-config.md b/docs/md_v2/core/browser-crawler-config.md index 0d97e0fc..a080fca3 100644 --- a/docs/md_v2/core/browser-crawler-config.md +++ b/docs/md_v2/core/browser-crawler-config.md @@ -136,11 +136,6 @@ class CrawlerRunConfig: wait_for=None, screenshot=False, pdf=False, - enable_rate_limiting=False, - rate_limit_config=None, - memory_threshold_percent=70.0, - check_interval=1.0, - max_session_permit=20, display_mode=None, verbose=True, stream=False, # Enable streaming for arun_many() @@ -183,25 +178,7 @@ class CrawlerRunConfig: - Logs additional runtime details. - Overlaps with the browser’s verbosity if also set to `True` in `BrowserConfig`. -9. **`enable_rate_limiting`**: - - If `True`, enables rate limiting for batch processing. - - Requires `rate_limit_config` to be set. -10. **`memory_threshold_percent`**: - - The memory threshold (as a percentage) to monitor. - - If exceeded, the crawler will pause or slow down. - -11. **`check_interval`**: - - The interval (in seconds) to check system resources. - - Affects how often memory and CPU usage are monitored. - -12. **`max_session_permit`**: - - The maximum number of concurrent crawl sessions. - - Helps prevent overwhelming the system. - -13. **`display_mode`**: - - The display mode for progress information (`DETAILED`, `BRIEF`, etc.). - - Affects how much information is printed during the crawl. ### Helper Methods @@ -236,9 +213,6 @@ The `clone()` method: --- - - - ## 3. LLMConfig Essentials ### Key fields to note From 4359b1200377d86af3cd10fa98f91cf599b16d6a Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Tue, 18 Mar 2025 17:20:24 +0530 Subject: [PATCH 13/78] docs + fix: Update example for full page screenshot & PDF export. Fix the bug Error: crawl4ai.async_webcrawler.AsyncWebCrawler.aprocess_html() got multiple values for keyword argument - for screenshot param. 
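The error named in that message is the standard duplicate-keyword pitfall: an explicit `screenshot=` argument collides with a `screenshot` key forwarded through `**kwargs`. A stripped-down, hypothetical illustration (invented names, not the real crawl4ai signatures):

```python
def process(url, screenshot=None, **kwargs):
    return {"url": url, "screenshot": screenshot, **kwargs}

opts = {"screenshot": True}  # the same flag arrives again via **opts
process("https://example.com", screenshot=b"png-bytes", **opts)
# TypeError: process() got multiple values for keyword argument 'screenshot'
```

Renaming the explicit parameter (the diff below uses `screenshot_data`) removes the collision.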
https://github.com/unclecode/crawl4ai/issues/822#issuecomment-2732602118 --- crawl4ai/async_webcrawler.py | 10 +++------- .../full_page_screenshot_and_pdf_export.md | 16 +++++++++------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 430e26a0..3aa7701a 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -398,7 +398,7 @@ class AsyncWebCrawler: html=html, extracted_content=extracted_content, config=config, # Pass the config object instead of individual parameters - screenshot=screenshot_data, + screenshot_data=screenshot_data, pdf_data=pdf_data, verbose=config.verbose, is_raw_html=True if url.startswith("raw:") else False, @@ -482,7 +482,7 @@ class AsyncWebCrawler: html: str, extracted_content: str, config: CrawlerRunConfig, - screenshot: str, + screenshot_data: str, pdf_data: str, verbose: bool, **kwargs, @@ -495,7 +495,7 @@ class AsyncWebCrawler: html: Raw HTML content extracted_content: Previously extracted content (if any) config: Configuration object controlling processing behavior - screenshot: Screenshot data (if any) + screenshot_data: Screenshot data (if any) pdf_data: PDF data (if any) verbose: Whether to enable verbose logging **kwargs: Additional parameters for backwards compatibility @@ -620,10 +620,6 @@ class AsyncWebCrawler: params={"url": _url, "timing": time.perf_counter() - t1}, ) - # Handle screenshot and PDF data - screenshot_data = None if not screenshot else screenshot - pdf_data = None if not pdf_data else pdf_data - # Apply HTML formatting if requested if config.prettiify: cleaned_html = fast_format_html(cleaned_html) diff --git a/docs/examples/full_page_screenshot_and_pdf_export.md b/docs/examples/full_page_screenshot_and_pdf_export.md index 8522675c..bf11f8db 100644 --- a/docs/examples/full_page_screenshot_and_pdf_export.md +++ b/docs/examples/full_page_screenshot_and_pdf_export.md @@ -12,9 +12,10 @@ We’ve introduced a new feature that effortlessly handles even the biggest page **Simple Example:** ```python -import os, sys +import os +import sys import asyncio -from crawl4ai import AsyncWebCrawler, CacheMode +from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig # Adjust paths as needed parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) @@ -26,9 +27,11 @@ async def main(): # Request both PDF and screenshot result = await crawler.arun( url='https://en.wikipedia.org/wiki/List_of_common_misconceptions', - cache_mode=CacheMode.BYPASS, - pdf=True, - screenshot=True + config=CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + pdf=True, + screenshot=True + ) ) if result.success: @@ -40,9 +43,8 @@ async def main(): # Save PDF if result.pdf: - pdf_bytes = b64decode(result.pdf) with open(os.path.join(__location__, "page.pdf"), "wb") as f: - f.write(pdf_bytes) + f.write(result.pdf) if __name__ == "__main__": asyncio.run(main()) From eedda1ae5ca0fa38ee72fa424a7255bab698efc3 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Thu, 20 Mar 2025 18:56:19 +0530 Subject: [PATCH 14/78] fix: Truncate long urls in middle than end since users are confused that same url is being scraped several times. 
Also remove labels on status and timer to be replaced with symbols to save space and display more URL --- crawl4ai/async_logger.py | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/crawl4ai/async_logger.py b/crawl4ai/async_logger.py index 6f89c217..c733c31a 100644 --- a/crawl4ai/async_logger.py +++ b/crawl4ai/async_logger.py @@ -37,11 +37,11 @@ class AsyncLoggerBase(ABC): pass @abstractmethod - def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50): + def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 100): pass @abstractmethod - def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50): + def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 100): pass class AsyncLogger(AsyncLoggerBase): @@ -110,6 +110,14 @@ class AsyncLogger(AsyncLoggerBase): def _get_icon(self, tag: str) -> str: """Get the icon for a tag, defaulting to info icon if not found.""" return self.icons.get(tag, self.icons["INFO"]) + + def _shorten(self, text, length, placeholder="..."): + """Truncate text in the middle if longer than length, or pad if shorter.""" + if len(text) <= length: + return text.ljust(length) # Pad with spaces to reach desired length + half = (length - len(placeholder)) // 2 + shortened = text[:half] + placeholder + text[-half:] + return shortened.ljust(length) # Also pad shortened text to consistent length def _write_to_file(self, message: str): """Write a message to the log file if configured.""" @@ -210,7 +218,7 @@ class AsyncLogger(AsyncLoggerBase): success: bool, timing: float, tag: str = "FETCH", - url_length: int = 50, + url_length: int = 100, ): """ Convenience method for logging URL fetch status. @@ -224,12 +232,11 @@ class AsyncLogger(AsyncLoggerBase): """ self._log( level=LogLevel.SUCCESS if success else LogLevel.ERROR, - message="{url:.{url_length}}... | Status: {status} | Time: {timing:.2f}s", + message="{url} | {status} | ⏱: {timing:.2f}s", tag=tag, params={ - "url": url, - "url_length": url_length, - "status": success, + "url": self._shorten(url, url_length), + "status": "✓" if success else "✗", "timing": timing, }, colors={ @@ -252,9 +259,9 @@ class AsyncLogger(AsyncLoggerBase): """ self._log( level=LogLevel.ERROR, - message="{url:.{url_length}}... | Error: {error}", + message="{url} | Error: {error}", tag=tag, - params={"url": url, "url_length": url_length, "error": error}, + params={"url": self.shorten(url,url_length), "error": error}, ) class AsyncFileLogger(AsyncLoggerBase): @@ -298,13 +305,13 @@ class AsyncFileLogger(AsyncLoggerBase): """Log an error message to file.""" self._write_to_file("ERROR", message, tag) - def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50): + def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 100): """Log URL fetch status to file.""" status = "SUCCESS" if success else "FAILED" message = f"{url[:url_length]}... | Status: {status} | Time: {timing:.2f}s" self._write_to_file("URL_STATUS", message, tag) - def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50): + def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 100): """Log error status to file.""" message = f"{url[:url_length]}... 
| Error: {error}" self._write_to_file("ERROR", message, tag) From ac2f9ae533b7560f057d8558ff84c8fca4f647ee Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Thu, 20 Mar 2025 18:59:15 +0530 Subject: [PATCH 15/78] fix: streamline url status logging via single entrypoint i.e. logger.url_status --- crawl4ai/async_webcrawler.py | 158 ++++++++++++++++++++--------------- deps.txt | 115 +++++++++++++++++++++++++ 2 files changed, 205 insertions(+), 68 deletions(-) create mode 100644 deps.txt diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index a6374e89..98111e4b 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -10,12 +10,17 @@ import asyncio # from contextlib import nullcontext, asynccontextmanager from contextlib import asynccontextmanager -from .models import CrawlResult, MarkdownGenerationResult, DispatchResult, ScrapingResult +from .models import ( + CrawlResult, + MarkdownGenerationResult, + DispatchResult, + ScrapingResult, +) from .async_database import async_db_manager from .chunking_strategy import * # noqa: F403 from .chunking_strategy import IdentityChunking from .content_filter_strategy import * # noqa: F403 -from .extraction_strategy import * # noqa: F403 +from .extraction_strategy import * # noqa: F403 from .extraction_strategy import NoExtractionStrategy from .async_crawler_strategy import ( AsyncCrawlerStrategy, @@ -30,7 +35,7 @@ from .markdown_generation_strategy import ( from .deep_crawling import DeepCrawlDecorator from .async_logger import AsyncLogger, AsyncLoggerBase from .async_configs import BrowserConfig, CrawlerRunConfig -from .async_dispatcher import * # noqa: F403 +from .async_dispatcher import * # noqa: F403 from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter from .utils import ( @@ -44,9 +49,10 @@ from .utils import ( from typing import Union, AsyncGenerator -CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult) +CrawlResultT = TypeVar("CrawlResultT", bound=CrawlResult) # RunManyReturn = Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]] + class CrawlResultContainer(Generic[CrawlResultT]): def __init__(self, results: Union[CrawlResultT, List[CrawlResultT]]): # Normalize to a list @@ -68,20 +74,21 @@ class CrawlResultContainer(Generic[CrawlResultT]): # Delegate attribute access to the first element. if self._results: return getattr(self._results[0], attr) - raise AttributeError(f"{self.__class__.__name__} object has no attribute '{attr}'") + raise AttributeError( + f"{self.__class__.__name__} object has no attribute '{attr}'" + ) def __repr__(self): return f"{self.__class__.__name__}({self._results!r})" + # Redefine the union type. Now synchronous calls always return a container, # while stream mode is handled with an AsyncGenerator. RunManyReturn = Union[ - CrawlResultContainer[CrawlResultT], - AsyncGenerator[CrawlResultT, None] + CrawlResultContainer[CrawlResultT], AsyncGenerator[CrawlResultT, None] ] - class AsyncWebCrawler: """ Asynchronous web crawler with flexible caching capabilities. 
@@ -193,7 +200,7 @@ class AsyncWebCrawler: # Decorate arun method with deep crawling capabilities self._deep_handler = DeepCrawlDecorator(self) - self.arun = self._deep_handler(self.arun) + self.arun = self._deep_handler(self.arun) async def start(self): """ @@ -210,26 +217,39 @@ class AsyncWebCrawler: AsyncWebCrawler: The initialized crawler instance """ # Check for builtin browser if requested - if self.browser_config.browser_mode == "builtin" and not self.browser_config.cdp_url: + if ( + self.browser_config.browser_mode == "builtin" + and not self.browser_config.cdp_url + ): # Import here to avoid circular imports from .browser_profiler import BrowserProfiler + profiler = BrowserProfiler(logger=self.logger) - + # Get builtin browser info or launch if needed browser_info = profiler.get_builtin_browser_info() if not browser_info: - self.logger.info("Builtin browser not found, launching new instance...", tag="BROWSER") + self.logger.info( + "Builtin browser not found, launching new instance...", + tag="BROWSER", + ) cdp_url = await profiler.launch_builtin_browser() if not cdp_url: - self.logger.warning("Failed to launch builtin browser, falling back to dedicated browser", tag="BROWSER") + self.logger.warning( + "Failed to launch builtin browser, falling back to dedicated browser", + tag="BROWSER", + ) else: self.browser_config.cdp_url = cdp_url self.browser_config.use_managed_browser = True else: - self.logger.info(f"Using existing builtin browser at {browser_info.get('cdp_url')}", tag="BROWSER") - self.browser_config.cdp_url = browser_info.get('cdp_url') + self.logger.info( + f"Using existing builtin browser at {browser_info.get('cdp_url')}", + tag="BROWSER", + ) + self.browser_config.cdp_url = browser_info.get("cdp_url") self.browser_config.use_managed_browser = True - + await self.crawler_strategy.__aenter__() await self.awarmup() return self @@ -305,7 +325,7 @@ class AsyncWebCrawler: # Auto-start if not ready if not self.ready: await self.start() - + config = config or CrawlerRunConfig() if not isinstance(url, str) or not url: raise ValueError("Invalid URL, make sure the URL is a non-empty string") @@ -319,9 +339,7 @@ class AsyncWebCrawler: config.cache_mode = CacheMode.ENABLED # Create cache context - cache_context = CacheContext( - url, config.cache_mode, False - ) + cache_context = CacheContext(url, config.cache_mode, False) # Initialize processing variables async_response: AsyncCrawlResponse = None @@ -351,7 +369,7 @@ class AsyncWebCrawler: # if config.screenshot and not screenshot or config.pdf and not pdf: if config.screenshot and not screenshot_data: cached_result = None - + if config.pdf and not pdf_data: cached_result = None @@ -383,14 +401,18 @@ class AsyncWebCrawler: # Check robots.txt if enabled if config and config.check_robots_txt: - if not await self.robots_parser.can_fetch(url, self.browser_config.user_agent): + if not await self.robots_parser.can_fetch( + url, self.browser_config.user_agent + ): return CrawlResult( url=url, html="", success=False, status_code=403, error_message="Access denied by robots.txt", - response_headers={"X-Robots-Status": "Blocked by robots.txt"} + response_headers={ + "X-Robots-Status": "Blocked by robots.txt" + }, ) ############################## @@ -417,7 +439,7 @@ class AsyncWebCrawler: ############################################################### # Process the HTML content, Call CrawlerStrategy.process_html # ############################################################### - crawl_result : CrawlResult = await self.aprocess_html( + 
crawl_result: CrawlResult = await self.aprocess_html( url=url, html=html, extracted_content=extracted_content, @@ -441,18 +463,11 @@ class AsyncWebCrawler: crawl_result.success = bool(html) crawl_result.session_id = getattr(config, "session_id", None) - self.logger.success( - message="{url:.50}... | Status: {status} | Total: {timing}", + self.logger.url_status( + url=cache_context.display_url, + success=crawl_result.success, + timing=time.perf_counter() - start_time, tag="COMPLETE", - params={ - "url": cache_context.display_url, - "status": crawl_result.success, - "timing": f"{time.perf_counter() - start_time:.2f}s", - }, - colors={ - "status": Fore.GREEN if crawl_result.success else Fore.RED, - "timing": Fore.YELLOW, - }, ) # Update cache if appropriate @@ -462,17 +477,12 @@ class AsyncWebCrawler: return CrawlResultContainer(crawl_result) else: - self.logger.success( - message="{url:.50}... | Status: {status} | Total: {timing}", - tag="COMPLETE", - params={ - "url": cache_context.display_url, - "status": True, - "timing": f"{time.perf_counter() - start_time:.2f}s", - }, - colors={"status": Fore.GREEN, "timing": Fore.YELLOW}, + self.logger.url_status( + url=cache_context.display_url, + success=True, + timing=time.perf_counter() - start_time, + tag="COMPLETE" ) - cached_result.success = bool(html) cached_result.session_id = getattr(config, "session_id", None) cached_result.redirected_url = cached_result.redirected_url or url @@ -494,7 +504,7 @@ class AsyncWebCrawler: tag="ERROR", ) - return CrawlResultContainer( + return CrawlResultContainer( CrawlResult( url=url, html="", success=False, error_message=error_message ) @@ -539,15 +549,14 @@ class AsyncWebCrawler: # Process HTML content params = config.__dict__.copy() - params.pop("url", None) + params.pop("url", None) # add keys from kwargs to params that doesn't exist in params params.update({k: v for k, v in kwargs.items() if k not in params.keys()}) - ################################ # Scraping Strategy Execution # ################################ - result : ScrapingResult = scraping_strategy.scrap(url, html, **params) + result: ScrapingResult = scraping_strategy.scrap(url, html, **params) if result is None: raise ValueError( @@ -593,11 +602,17 @@ class AsyncWebCrawler: ) # Log processing completion - self.logger.info( - message="{url:.50}... | Time: {timing}s", + self.logger.url_status( + url=_url, + success=True, + timing=int((time.perf_counter() - t1) * 1000) / 1000, tag="SCRAPE", - params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000) / 1000}, ) + # self.logger.info( + # message="{url:.50}... | Time: {timing}s", + # tag="SCRAPE", + # params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000) / 1000}, + # ) ################################ # Structured Content Extraction # @@ -667,7 +682,7 @@ class AsyncWebCrawler: async def arun_many( self, urls: List[str], - config: Optional[CrawlerRunConfig] = None, + config: Optional[CrawlerRunConfig] = None, dispatcher: Optional[BaseDispatcher] = None, # Legacy parameters maintained for backwards compatibility # word_count_threshold=MIN_WORD_THRESHOLD, @@ -681,8 +696,8 @@ class AsyncWebCrawler: # pdf: bool = False, # user_agent: str = None, # verbose=True, - **kwargs - ) -> RunManyReturn: + **kwargs, + ) -> RunManyReturn: """ Runs the crawler for multiple URLs concurrently using a configurable dispatcher strategy. 
@@ -738,28 +753,35 @@ class AsyncWebCrawler: def transform_result(task_result): return ( - setattr(task_result.result, 'dispatch_result', - DispatchResult( - task_id=task_result.task_id, - memory_usage=task_result.memory_usage, - peak_memory=task_result.peak_memory, - start_time=task_result.start_time, - end_time=task_result.end_time, - error_message=task_result.error_message, - ) - ) or task_result.result + setattr( + task_result.result, + "dispatch_result", + DispatchResult( + task_id=task_result.task_id, + memory_usage=task_result.memory_usage, + peak_memory=task_result.peak_memory, + start_time=task_result.start_time, + end_time=task_result.end_time, + error_message=task_result.error_message, + ), ) + or task_result.result + ) stream = config.stream - + if stream: + async def result_transformer(): - async for task_result in dispatcher.run_urls_stream(crawler=self, urls=urls, config=config): + async for task_result in dispatcher.run_urls_stream( + crawler=self, urls=urls, config=config + ): yield transform_result(task_result) + return result_transformer() else: _results = await dispatcher.run_urls(crawler=self, urls=urls, config=config) - return [transform_result(res) for res in _results] + return [transform_result(res) for res in _results] async def aclear_cache(self): """Clear the cache database.""" diff --git a/deps.txt b/deps.txt new file mode 100644 index 00000000..1d085f0f --- /dev/null +++ b/deps.txt @@ -0,0 +1,115 @@ +aiofiles==24.1.0 +aiohappyeyeballs==2.4.4 +aiohttp==3.11.11 +aiolimiter==1.2.1 +aiosignal==1.3.2 +aiosqlite==0.20.0 +annotated-types==0.7.0 +anyio==4.8.0 +attrs==24.3.0 +beautifulsoup4==4.12.3 +certifi==2024.12.14 +cffi==1.17.1 +chardet==5.2.0 +charset-normalizer==3.4.1 +click==8.1.8 +colorama==0.4.6 +-e git+https://github.com/unclecode/crawl4ai.git@4359b1200377d86af3cd10fa98f91cf599b16d6a#egg=Crawl4AI +cryptography==44.0.0 +cssselect==1.2.0 +Cython==3.0.12 +Deprecated==1.2.18 +distro==1.9.0 +dnspython==2.7.0 +email_validator==2.2.0 +fake-http-header==0.3.5 +fake-useragent==2.0.3 +fastapi==0.115.11 +faust-cchardet==2.1.19 +filelock==3.16.1 +frozenlist==1.5.0 +fsspec==2024.12.0 +ghp-import==2.1.0 +greenlet==3.1.1 +gunicorn==23.0.0 +h11==0.14.0 +httpcore==1.0.7 +httpx==0.27.2 +huggingface-hub==0.27.1 +humanize==4.12.1 +idna==3.10 +importlib_metadata==8.5.0 +iniconfig==2.0.0 +Jinja2==3.1.5 +jiter==0.8.2 +joblib==1.4.2 +jsonschema==4.23.0 +jsonschema-specifications==2024.10.1 +jwt==1.3.1 +limits==4.2 +litellm==1.59.0 +lxml==5.3.0 +Markdown==3.7 +markdown-it-py==3.0.0 +MarkupSafe==3.0.2 +mdurl==0.1.2 +mergedeep==1.3.4 +mkdocs==1.6.1 +mkdocs-get-deps==0.2.0 +mkdocs-terminal==4.7.0 +mockito==1.5.3 +multidict==6.1.0 +nltk==3.9.1 +numpy==2.2.2 +openai==1.59.9 +packaging==24.2 +pathspec==0.12.1 +pdf2image==1.17.0 +pillow==10.4.0 +platformdirs==4.3.6 +playwright==1.49.1 +pluggy==1.5.0 +prometheus-fastapi-instrumentator==7.0.2 +prometheus_client==0.21.1 +propcache==0.2.1 +psutil==6.1.1 +pycparser==2.22 +pydantic==2.10.5 +pydantic_core==2.27.2 +pyee==12.0.0 +Pygments==2.19.1 +pymdown-extensions==10.14.3 +pyOpenSSL==25.0.0 +pytest==8.3.4 +pytest-mockito==0.0.4 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +PyYAML==6.0.2 +pyyaml_env_tag==0.1 +rank-bm25==0.2.2 +redis==5.2.1 +referencing==0.36.1 +regex==2024.11.6 +requests==2.32.3 +rich==13.9.4 +rpds-py==0.22.3 +six==1.17.0 +slowapi==0.1.9 +sniffio==1.3.1 +snowballstemmer==2.2.0 +soupsieve==2.6 +starlette==0.46.1 +tenacity==9.0.0 +tf-playwright-stealth==1.1.0 +tiktoken==0.8.0 +tokenizers==0.21.0 +tqdm==4.67.1 
+typing_extensions==4.12.2 +urllib3==2.3.0 +uvicorn==0.34.0 +validators==0.34.0 +watchdog==6.0.0 +wrapt==1.17.2 +xxhash==3.5.0 +yarl==1.18.3 +zipp==3.21.0 From e0c2a7c2848102bc2001392f0ef4a33d679507f1 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Fri, 21 Mar 2025 11:06:46 +0530 Subject: [PATCH 16/78] chore: remove mistakenly commited deps.txt file --- deps.txt | 115 ------------------------------------------------------- 1 file changed, 115 deletions(-) delete mode 100644 deps.txt diff --git a/deps.txt b/deps.txt deleted file mode 100644 index 1d085f0f..00000000 --- a/deps.txt +++ /dev/null @@ -1,115 +0,0 @@ -aiofiles==24.1.0 -aiohappyeyeballs==2.4.4 -aiohttp==3.11.11 -aiolimiter==1.2.1 -aiosignal==1.3.2 -aiosqlite==0.20.0 -annotated-types==0.7.0 -anyio==4.8.0 -attrs==24.3.0 -beautifulsoup4==4.12.3 -certifi==2024.12.14 -cffi==1.17.1 -chardet==5.2.0 -charset-normalizer==3.4.1 -click==8.1.8 -colorama==0.4.6 --e git+https://github.com/unclecode/crawl4ai.git@4359b1200377d86af3cd10fa98f91cf599b16d6a#egg=Crawl4AI -cryptography==44.0.0 -cssselect==1.2.0 -Cython==3.0.12 -Deprecated==1.2.18 -distro==1.9.0 -dnspython==2.7.0 -email_validator==2.2.0 -fake-http-header==0.3.5 -fake-useragent==2.0.3 -fastapi==0.115.11 -faust-cchardet==2.1.19 -filelock==3.16.1 -frozenlist==1.5.0 -fsspec==2024.12.0 -ghp-import==2.1.0 -greenlet==3.1.1 -gunicorn==23.0.0 -h11==0.14.0 -httpcore==1.0.7 -httpx==0.27.2 -huggingface-hub==0.27.1 -humanize==4.12.1 -idna==3.10 -importlib_metadata==8.5.0 -iniconfig==2.0.0 -Jinja2==3.1.5 -jiter==0.8.2 -joblib==1.4.2 -jsonschema==4.23.0 -jsonschema-specifications==2024.10.1 -jwt==1.3.1 -limits==4.2 -litellm==1.59.0 -lxml==5.3.0 -Markdown==3.7 -markdown-it-py==3.0.0 -MarkupSafe==3.0.2 -mdurl==0.1.2 -mergedeep==1.3.4 -mkdocs==1.6.1 -mkdocs-get-deps==0.2.0 -mkdocs-terminal==4.7.0 -mockito==1.5.3 -multidict==6.1.0 -nltk==3.9.1 -numpy==2.2.2 -openai==1.59.9 -packaging==24.2 -pathspec==0.12.1 -pdf2image==1.17.0 -pillow==10.4.0 -platformdirs==4.3.6 -playwright==1.49.1 -pluggy==1.5.0 -prometheus-fastapi-instrumentator==7.0.2 -prometheus_client==0.21.1 -propcache==0.2.1 -psutil==6.1.1 -pycparser==2.22 -pydantic==2.10.5 -pydantic_core==2.27.2 -pyee==12.0.0 -Pygments==2.19.1 -pymdown-extensions==10.14.3 -pyOpenSSL==25.0.0 -pytest==8.3.4 -pytest-mockito==0.0.4 -python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -PyYAML==6.0.2 -pyyaml_env_tag==0.1 -rank-bm25==0.2.2 -redis==5.2.1 -referencing==0.36.1 -regex==2024.11.6 -requests==2.32.3 -rich==13.9.4 -rpds-py==0.22.3 -six==1.17.0 -slowapi==0.1.9 -sniffio==1.3.1 -snowballstemmer==2.2.0 -soupsieve==2.6 -starlette==0.46.1 -tenacity==9.0.0 -tf-playwright-stealth==1.1.0 -tiktoken==0.8.0 -tokenizers==0.21.0 -tqdm==4.67.1 -typing_extensions==4.12.2 -urllib3==2.3.0 -uvicorn==0.34.0 -validators==0.34.0 -watchdog==6.0.0 -wrapt==1.17.2 -xxhash==3.5.0 -yarl==1.18.3 -zipp==3.21.0 From 8b761f232be85acc5d480bcc999b59348a22fcbc Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Fri, 21 Mar 2025 13:40:23 +0530 Subject: [PATCH 17/78] fix: improve logged url readability by decoding encoded urls --- crawl4ai/async_logger.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/crawl4ai/async_logger.py b/crawl4ai/async_logger.py index c733c31a..7a7b08ac 100644 --- a/crawl4ai/async_logger.py +++ b/crawl4ai/async_logger.py @@ -4,6 +4,7 @@ from typing import Optional, Dict, Any from colorama import Fore, Style, init import os from datetime import datetime +from urllib.parse import unquote class LogLevel(Enum): @@ -230,12 +231,14 @@ class 
AsyncLogger(AsyncLoggerBase): tag: Tag for the message url_length: Maximum length for URL in log """ + decoded_url = unquote(url) + readable_url = self._shorten(decoded_url, url_length) self._log( level=LogLevel.SUCCESS if success else LogLevel.ERROR, message="{url} | {status} | ⏱: {timing:.2f}s", tag=tag, params={ - "url": self._shorten(url, url_length), + "url": readable_url, "status": "✓" if success else "✗", "timing": timing, }, @@ -257,11 +260,13 @@ class AsyncLogger(AsyncLoggerBase): tag: Tag for the message url_length: Maximum length for URL in log """ + decoded_url = unquote(url) + readable_url = self._shorten(decoded_url, url_length) self._log( level=LogLevel.ERROR, message="{url} | Error: {error}", tag=tag, - params={"url": self.shorten(url,url_length), "error": error}, + params={"url": readable_url, "error": error}, ) class AsyncFileLogger(AsyncLoggerBase): From 6740e87b4d24e5e5904a8100419f3b1e0eed501a Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Fri, 21 Mar 2025 13:41:31 +0530 Subject: [PATCH 18/78] fix: remove trailing slash when the path is empty. This is causing dupicate crawls --- crawl4ai/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index acaf7933..5b8af794 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -2002,7 +2002,7 @@ def normalize_url_for_deep_crawl(href, base_url): normalized = urlunparse(( parsed.scheme, netloc, - parsed.path.rstrip('/') or '/', # Normalize trailing slash + parsed.path.rstrip('/'), # Normalize trailing slash parsed.params, query, fragment @@ -2030,7 +2030,7 @@ def efficient_normalize_url_for_deep_crawl(href, base_url): normalized = urlunparse(( parsed.scheme, parsed.netloc.lower(), - parsed.path, + parsed.path.rstrip('/'), parsed.params, parsed.query, '' # Remove fragment From f89113377aa2e7ac40023976e63cb2d1d9a93255 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Fri, 21 Mar 2025 13:44:57 +0530 Subject: [PATCH 19/78] fix: Move adding of visited urls to the 'visited' set, when queueing the URLs instead of after dequeuing, this is to prevent duplicate crawls. https://github.com/unclecode/crawl4ai/issues/843 --- crawl4ai/deep_crawling/bfs_strategy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crawl4ai/deep_crawling/bfs_strategy.py b/crawl4ai/deep_crawling/bfs_strategy.py index 54b72ea3..48c116dd 100644 --- a/crawl4ai/deep_crawling/bfs_strategy.py +++ b/crawl4ai/deep_crawling/bfs_strategy.py @@ -117,7 +117,8 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy): self.logger.debug(f"URL {url} skipped: score {score} below threshold {self.score_threshold}") self.stats.urls_skipped += 1 continue - + + visited.add(base_url) valid_links.append((base_url, score)) # If we have more valid links than capacity, sort by score and take the top ones @@ -158,7 +159,6 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy): while current_level and not self._cancel_event.is_set(): next_level: List[Tuple[str, Optional[str]]] = [] urls = [url for url, _ in current_level] - visited.update(urls) # Clone the config to disable deep crawling recursion and enforce batch mode. 
batch_config = config.clone(deep_crawl_strategy=None, stream=False) From 471d110c5e496a1334422ee177e95cf1675ad37b Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Fri, 21 Mar 2025 16:48:07 +0530 Subject: [PATCH 20/78] fix: url normalisation ref: https://github.com/unclecode/crawl4ai/issues/841 --- crawl4ai/utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 5b8af794..fe725317 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -1958,6 +1958,10 @@ def normalize_url(href, base_url): if not parsed_base.scheme or not parsed_base.netloc: raise ValueError(f"Invalid base URL format: {base_url}") + # Ensure base_url ends with a trailing slash if it's a directory path + if not base_url.endswith('/'): + base_url = base_url + '/' + # Use urljoin to handle all cases normalized = urljoin(base_url, href.strip()) return normalized From e01d1e73e167bb89d6656f0bdda359555a1c0be0 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Fri, 21 Mar 2025 17:34:13 +0530 Subject: [PATCH 21/78] fix: link normalisation in BestFirstStrategy --- crawl4ai/deep_crawling/bff_strategy.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/crawl4ai/deep_crawling/bff_strategy.py b/crawl4ai/deep_crawling/bff_strategy.py index 4811ba14..65d4e819 100644 --- a/crawl4ai/deep_crawling/bff_strategy.py +++ b/crawl4ai/deep_crawling/bff_strategy.py @@ -11,6 +11,7 @@ from .scorers import URLScorer from . import DeepCrawlStrategy from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult, RunManyReturn +from ..utils import normalize_url_for_deep_crawl from math import inf as infinity @@ -106,13 +107,14 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy): valid_links = [] for link in links: url = link.get("href") - if url in visited: + base_url = normalize_url_for_deep_crawl(url, source_url) + if base_url in visited: continue if not await self.can_process_url(url, new_depth): self.stats.urls_skipped += 1 continue - valid_links.append(url) + valid_links.append(base_url) # If we have more valid links than capacity, limit them if len(valid_links) > remaining_capacity: From 2f0e2177512369f89ed7579e8e261c3a7133deda Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Tue, 25 Mar 2025 13:44:41 +0530 Subject: [PATCH 22/78] Chore: Add brotli as dependancy to fix: https://github.com/unclecode/crawl4ai/issues/867 --- pyproject.toml | 1 + requirements.txt | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index ad07548d..247974c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,6 +42,7 @@ dependencies = [ "pyperclip>=1.8.2", "faust-cchardet>=2.1.19", "aiohttp>=3.11.11", + "brotli>=1.1.0", "humanize>=4.10.0", ] classifiers = [ diff --git a/requirements.txt b/requirements.txt index c1f36c56..5fe0cc4c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,4 +21,5 @@ psutil>=6.1.1 nltk>=3.9.1 rich>=13.9.4 cssselect>=1.2.0 -faust-cchardet>=2.1.19 \ No newline at end of file +faust-cchardet>=2.1.19 +brotli>=1.1.0 \ No newline at end of file From e3111d0a328ae2a0c78464de83cfc986f807c28b Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Tue, 25 Mar 2025 13:46:55 +0530 Subject: [PATCH 23/78] fix: prevent session closing after each request to maintain connection pool. 
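The underlying aiohttp pattern is to open one `ClientSession` lazily, reuse it for every request so pooled connections survive, and close it only at shutdown. A generic sketch of that pattern (invented class, not crawl4ai's actual strategy):

```python
import aiohttp
from typing import Optional

class HttpFetcher:
    """Generic illustration of session reuse; names here are invented."""

    def __init__(self) -> None:
        self._session: Optional[aiohttp.ClientSession] = None

    async def _ensure_session(self) -> None:
        # Open lazily, and re-open only if a previous shutdown closed it.
        if self._session is None or self._session.closed:
            self._session = aiohttp.ClientSession()

    async def fetch(self, url: str) -> str:
        await self._ensure_session()
        async with self._session.get(url) as resp:  # connection comes from the pool
            return await resp.text()

    async def close(self) -> None:
        # Called once at shutdown, never after each request.
        if self._session and not self._session.closed:
            await self._session.close()
```
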
Fixes: https://github.com/unclecode/crawl4ai/issues/867 --- crawl4ai/async_crawler_strategy.py | 133 ++++++++++++++--------------- 1 file changed, 63 insertions(+), 70 deletions(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 37aa0962..2330b3f3 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -1702,15 +1702,6 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy): async def __aexit__(self, exc_type, exc_val, exc_tb) -> None: await self.close() - @contextlib.asynccontextmanager - async def _session_context(self): - try: - if not self._session: - await self.start() - yield self._session - finally: - await self.close() - def set_hook(self, hook_type: str, hook_func: Callable) -> None: if hook_type in self.hooks: self.hooks[hook_type] = partial(self._execute_hook, hook_type, hook_func) @@ -1787,75 +1778,77 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy): url: str, config: CrawlerRunConfig ) -> AsyncCrawlResponse: - async with self._session_context() as session: - timeout = ClientTimeout( - total=config.page_timeout or self.DEFAULT_TIMEOUT, - connect=10, - sock_read=30 - ) - - headers = dict(self._BASE_HEADERS) - if self.browser_config.headers: - headers.update(self.browser_config.headers) + if not self._session or self._session.closed: + await self.start() + + timeout = ClientTimeout( + total=config.page_timeout or self.DEFAULT_TIMEOUT, + connect=10, + sock_read=30 + ) + + headers = dict(self._BASE_HEADERS) + if self.browser_config.headers: + headers.update(self.browser_config.headers) - request_kwargs = { - 'timeout': timeout, - 'allow_redirects': self.browser_config.follow_redirects, - 'ssl': self.browser_config.verify_ssl, - 'headers': headers - } + request_kwargs = { + 'timeout': timeout, + 'allow_redirects': self.browser_config.follow_redirects, + 'ssl': self.browser_config.verify_ssl, + 'headers': headers + } - if self.browser_config.method == "POST": - if self.browser_config.data: - request_kwargs['data'] = self.browser_config.data - if self.browser_config.json: - request_kwargs['json'] = self.browser_config.json + if self.browser_config.method == "POST": + if self.browser_config.data: + request_kwargs['data'] = self.browser_config.data + if self.browser_config.json: + request_kwargs['json'] = self.browser_config.json - await self.hooks['before_request'](url, request_kwargs) + await self.hooks['before_request'](url, request_kwargs) - try: - async with session.request(self.browser_config.method, url, **request_kwargs) as response: - content = memoryview(await response.read()) - - if not (200 <= response.status < 300): - raise HTTPStatusError( - response.status, - f"Unexpected status code for {url}" - ) - - encoding = response.charset - if not encoding: - encoding = cchardet.detect(content.tobytes())['encoding'] or 'utf-8' - - result = AsyncCrawlResponse( - html=content.tobytes().decode(encoding, errors='replace'), - response_headers=dict(response.headers), - status_code=response.status, - redirected_url=str(response.url) + try: + async with self._session.request(self.browser_config.method, url, **request_kwargs) as response: + content = memoryview(await response.read()) + + if not (200 <= response.status < 300): + raise HTTPStatusError( + response.status, + f"Unexpected status code for {url}" ) - - await self.hooks['after_request'](result) - return result + + encoding = response.charset + if not encoding: + encoding = cchardet.detect(content.tobytes())['encoding'] or 'utf-8' + + result = 
AsyncCrawlResponse( + html=content.tobytes().decode(encoding, errors='replace'), + response_headers=dict(response.headers), + status_code=response.status, + redirected_url=str(response.url) + ) + + await self.hooks['after_request'](result) + return result - except aiohttp.ServerTimeoutError as e: - await self.hooks['on_error'](e) - raise ConnectionTimeoutError(f"Request timed out: {str(e)}") - - except aiohttp.ClientConnectorError as e: - await self.hooks['on_error'](e) - raise ConnectionError(f"Connection failed: {str(e)}") - - except aiohttp.ClientError as e: - await self.hooks['on_error'](e) - raise HTTPCrawlerError(f"HTTP client error: {str(e)}") + except aiohttp.ServerTimeoutError as e: + await self.hooks['on_error'](e) + raise ConnectionTimeoutError(f"Request timed out: {str(e)}") - except asyncio.exceptions.TimeoutError as e: - await self.hooks['on_error'](e) - raise ConnectionTimeoutError(f"Request timed out: {str(e)}") + except aiohttp.ClientConnectorError as e: + await self.hooks['on_error'](e) + raise ConnectionError(f"Connection failed: {str(e)}") - except Exception as e: - await self.hooks['on_error'](e) - raise HTTPCrawlerError(f"HTTP request failed: {str(e)}") + except aiohttp.ClientError as e: + await self.hooks['on_error'](e) + raise HTTPCrawlerError(f"HTTP client error: {str(e)}") + + except asyncio.exceptions.TimeoutError as e: + await self.hooks['on_error'](e) + raise ConnectionTimeoutError(f"Request timed out: {str(e)}") + + except Exception as e: + await self.hooks['on_error'](e) + raise HTTPCrawlerError(f"HTTP request failed: {str(e)}") async def crawl( self, From 585e5e5973a264ac22343f9a4fdef54048b3b31f Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Tue, 25 Mar 2025 15:17:59 +0530 Subject: [PATCH 24/78] fix: https://github.com/unclecode/crawl4ai/issues/733 --- crawl4ai/async_webcrawler.py | 3 ++- crawl4ai/content_scraping_strategy.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 98111e4b..91b98d7f 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -448,6 +448,7 @@ class AsyncWebCrawler: pdf_data=pdf_data, verbose=config.verbose, is_raw_html=True if url.startswith("raw:") else False, + redirected_url=async_response.redirected_url, **kwargs, ) @@ -596,7 +597,7 @@ class AsyncWebCrawler: markdown_result: MarkdownGenerationResult = ( markdown_generator.generate_markdown( cleaned_html=cleaned_html, - base_url=url, + base_url=params.get("redirected_url", url), # html2text_options=kwargs.get('html2text', {}) ) ) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 215e7cda..0848d655 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -128,7 +128,8 @@ class WebScrapingStrategy(ContentScrapingStrategy): Returns: ScrapingResult: A structured result containing the scraped content. """ - raw_result = self._scrap(url, html, is_async=False, **kwargs) + actual_url = kwargs.get("redirected_url", url) + raw_result = self._scrap(actual_url, html, is_async=False, **kwargs) if raw_result is None: return ScrapingResult( cleaned_html="", From 57e0423b3a6ddb9147fce898a2e5c0afaaead90d Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Fri, 28 Mar 2025 12:56:37 +0530 Subject: [PATCH 25/78] fix:target_element should not affect link extraction. 
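The fix parses the HTML again for each target selector, so pulling content elements out cannot mutate the tree that link extraction still reads. A small self-contained sketch of the idea (toy HTML, not the real scraping strategy):

```python
from bs4 import BeautifulSoup

html = '<div id="main"><a href="/inside">In</a></div><footer><a href="/outside">Out</a></footer>'

soup = BeautifulSoup(html, "html.parser")   # tree kept intact for link extraction

# Take the "target element" from a *fresh* parse instead of from `soup`,
# so moving or extracting it cannot remove nodes from `soup`.
fresh = BeautifulSoup(html, "html.parser")
content_root = fresh.select_one("#main")

links = [a["href"] for a in soup.find_all("a")]
print(links)          # ['/inside', '/outside'] -- both links are still there
print(content_root)   # <div id="main"><a href="/inside">In</a></div>
```
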
-> https://github.com/unclecode/crawl4ai/issues/902 --- crawl4ai/content_scraping_strategy.py | 59 ++++++++------------------- 1 file changed, 16 insertions(+), 43 deletions(-) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 0848d655..11835d62 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -897,29 +897,16 @@ class WebScrapingStrategy(ContentScrapingStrategy): for element in body.select(excluded_selector): element.extract() - # if False and css_selector: - # selected_elements = body.select(css_selector) - # if not selected_elements: - # return { - # "markdown": "", - # "cleaned_html": "", - # "success": True, - # "media": {"images": [], "videos": [], "audios": []}, - # "links": {"internal": [], "external": []}, - # "metadata": {}, - # "message": f"No elements found for CSS selector: {css_selector}", - # } - # # raise InvalidCSSSelectorError(f"Invalid CSS selector, No elements found for CSS selector: {css_selector}") - # body = soup.new_tag("div") - # for el in selected_elements: - # body.append(el) - content_element = None if target_elements: try: for_content_targeted_element = [] for target_element in target_elements: - for_content_targeted_element.extend(body.select(target_element)) + # Creating a fresh parse of HTML for each selector to prevent element extraction + # from modifying the original DOM tree; this keeps the original body + # intact for link processing. This is better performant than deepcopy. + fresh_body = BeautifulSoup(html, "html.parser") + for_content_targeted_element.extend(fresh_body.select(target_element)) content_element = soup.new_tag("div") for el in for_content_targeted_element: content_element.append(el) @@ -927,7 +914,7 @@ class WebScrapingStrategy(ContentScrapingStrategy): self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") return None else: - content_element = body + content_element = body kwargs["exclude_social_media_domains"] = set( kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS @@ -1536,34 +1523,20 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): self._log("error", f"Error extracting metadata: {str(e)}", "SCRAPE") meta = {} - # Handle CSS selector targeting - # if css_selector: - # try: - # selected_elements = body.cssselect(css_selector) - # if not selected_elements: - # return { - # "markdown": "", - # "cleaned_html": "", - # "success": True, - # "media": {"images": [], "videos": [], "audios": []}, - # "links": {"internal": [], "external": []}, - # "metadata": meta, - # "message": f"No elements found for CSS selector: {css_selector}", - # } - # body = lhtml.Element("div") - # body.extend(selected_elements) - # except Exception as e: - # self._log("error", f"Error with CSS selector: {str(e)}", "SCRAPE") - # return None - content_element = None if target_elements: try: - for_content_targeted_element = [] - for target_element in target_elements: - for_content_targeted_element.extend(body.cssselect(target_element)) content_element = lhtml.Element("div") - content_element.extend(for_content_targeted_element) + for target_element in target_elements: + # Creating a fresh parse of HTML for each selector to prevent element extraction + # from modifying the original DOM tree; this keeps the original body + # intact for link processing. This is better performant than deepcopy. 
+ fresh_body = lhtml.document_fromstring(html) + for_content_targeted_element = [] + for target_element in target_elements: + for_content_targeted_element.extend(fresh_body.cssselect(target_element)) + content_element = lhtml.Element("div") + content_element.extend(for_content_targeted_element) except Exception as e: self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") return None From d8cbeff38643a119cc1534aa6176a5b45effc685 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Fri, 28 Mar 2025 19:31:05 +0530 Subject: [PATCH 26/78] fix: https://github.com/unclecode/crawl4ai/issues/842 --- crawl4ai/async_crawler_strategy.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 2330b3f3..ddd6348e 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -130,6 +130,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): Close the browser and clean up resources. """ await self.browser_manager.close() + # Explicitly reset the static Playwright instance + BrowserManager._playwright_instance = None async def kill_session(self, session_id: str): """ From 1119f2f5b50a3e8ae77c0baf93490329ed678ef9 Mon Sep 17 00:00:00 2001 From: "maggie.wang" Date: Mon, 31 Mar 2025 14:05:54 +0800 Subject: [PATCH 27/78] fix: https://github.com/unclecode/crawl4ai/issues/911 --- crawl4ai/async_crawler_strategy.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index ddd6348e..7eef0196 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -820,7 +820,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): for selector in selectors: try: - content = await page.evaluate(f"document.querySelector('{selector}')?.outerHTML || ''") + content = await page.evaluate( + f"""Array.from(document.querySelectorAll("{selector}")) + .map(el => el.outerHTML) + .join('')""" + ) html_parts.append(content) except Error as e: print(f"Warning: Could not get content for selector '{selector}': {str(e)}") From ef1f0c410246c77ed6e68cb17574cde8a8aaab94 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Mon, 31 Mar 2025 12:43:32 +0530 Subject: [PATCH 28/78] fix:https://github.com/unclecode/crawl4ai/issues/701 --- crawl4ai/js_snippet/remove_overlay_elements.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crawl4ai/js_snippet/remove_overlay_elements.js b/crawl4ai/js_snippet/remove_overlay_elements.js index 0400d89c..9d93b4ac 100644 --- a/crawl4ai/js_snippet/remove_overlay_elements.js +++ b/crawl4ai/js_snippet/remove_overlay_elements.js @@ -115,5 +115,6 @@ async () => { document.body.style.overflow = "auto"; // Wait a bit for any animations to complete - await new Promise((resolve) => setTimeout(resolve, 100)); + document.body.scrollIntoView(false); + await new Promise((resolve) => setTimeout(resolve, 250)); }; From 757e3177ed6cfed0cbd9b9f01c0c330ba5d6f18f Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Mon, 31 Mar 2025 17:10:04 +0530 Subject: [PATCH 29/78] fix: https://github.com/unclecode/crawl4ai/issues/839 --- crawl4ai/async_crawler_strategy.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 7eef0196..f18a3c1d 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -532,14 +532,12 @@ class 
AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if console_log_type == "error": self.logger.error( message=f"Console error: {msg}", # Use f-string for variable interpolation - tag="CONSOLE", - params={"msg": msg.text}, + tag="CONSOLE" ) elif console_log_type == "debug": self.logger.debug( message=f"Console: {msg}", # Use f-string for variable interpolation - tag="CONSOLE", - params={"msg": msg.text}, + tag="CONSOLE" ) page.on("console", log_consol) From 73fda8a6ec8ef35cdb63e1bae74411976d4e63b9 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Thu, 3 Apr 2025 13:47:13 +0530 Subject: [PATCH 30/78] fix: address the PR review: https://github.com/unclecode/crawl4ai/pull/899#discussion_r2024639193 --- crawl4ai/content_scraping_strategy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 91b1c674..eaed0816 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -905,7 +905,7 @@ class WebScrapingStrategy(ContentScrapingStrategy): # Creating a fresh parse of HTML for each selector to prevent element extraction # from modifying the original DOM tree; this keeps the original body # intact for link processing. This is better performant than deepcopy. - fresh_body = BeautifulSoup(html, "html.parser") + fresh_body = BeautifulSoup(html, "lxml") for_content_targeted_element.extend(fresh_body.select(target_element)) content_element = soup.new_tag("div") for el in for_content_targeted_element: From 4133e5460d734262f621bfa1edc9c4f168579fd9 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Thu, 3 Apr 2025 17:42:24 +0530 Subject: [PATCH 31/78] typo-fix: https://github.com/unclecode/crawl4ai/pull/918 --- crawl4ai/content_scraping_strategy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index eaed0816..0a157a08 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -1605,7 +1605,7 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): # Remove empty elements self.remove_empty_elements_fast(body, 1) - # Remvoe unneeded attributes + # Remove unneeded attributes self.remove_unwanted_attributes_fast( body, keep_data_attributes=kwargs.get("keep_data_attributes", False) ) From 7155778eac65d9e9d7b09a4e6a4d6526ece2f476 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Thu, 3 Apr 2025 17:42:51 +0530 Subject: [PATCH 32/78] chore: move from faust-cchardet to chardet --- crawl4ai/async_crawler_strategy.py | 4 ++-- pyproject.toml | 2 +- requirements.txt | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index f18a3c1d..301d925f 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -24,7 +24,7 @@ from .browser_manager import BrowserManager import aiofiles import aiohttp -import cchardet +import chardet from aiohttp.client import ClientTimeout from urllib.parse import urlparse from types import MappingProxyType @@ -1822,7 +1822,7 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy): encoding = response.charset if not encoding: - encoding = cchardet.detect(content.tobytes())['encoding'] or 'utf-8' + encoding = chardet.detect(content.tobytes())['encoding'] or 'utf-8' result = AsyncCrawlResponse( html=content.tobytes().decode(encoding, errors='replace'), diff --git a/pyproject.toml b/pyproject.toml index 
247974c5..032e5cd6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,7 @@ dependencies = [ "fake-useragent>=2.0.3", "click>=8.1.7", "pyperclip>=1.8.2", - "faust-cchardet>=2.1.19", + "chardet>=5.2.0", "aiohttp>=3.11.11", "brotli>=1.1.0", "humanize>=4.10.0", diff --git a/requirements.txt b/requirements.txt index 5fe0cc4c..0bb596d1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,5 +21,5 @@ psutil>=6.1.1 nltk>=3.9.1 rich>=13.9.4 cssselect>=1.2.0 -faust-cchardet>=2.1.19 +chardet>=5.2.0 brotli>=1.1.0 \ No newline at end of file From 935d9d39f85f4a398db61221473a37486f564c0d Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 5 Apr 2025 21:37:25 +0800 Subject: [PATCH 33/78] Add quickstart example set --- docs/examples/quickstart.py | 562 ++++++++++++++++++++ docs/examples/quickstart_examples.py | 404 +++++++++++++++ docs/examples/quickstart_v0.ipynb | 735 --------------------------- 3 files changed, 966 insertions(+), 735 deletions(-) create mode 100644 docs/examples/quickstart.py create mode 100644 docs/examples/quickstart_examples.py delete mode 100644 docs/examples/quickstart_v0.ipynb diff --git a/docs/examples/quickstart.py b/docs/examples/quickstart.py new file mode 100644 index 00000000..3adbfc0d --- /dev/null +++ b/docs/examples/quickstart.py @@ -0,0 +1,562 @@ +import os, sys + +from crawl4ai.types import LLMConfig + +sys.path.append( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +) + +import asyncio +import time +import json +import re +from typing import Dict +from bs4 import BeautifulSoup +from pydantic import BaseModel, Field +from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator +from crawl4ai.content_filter_strategy import PruningContentFilter +from crawl4ai.extraction_strategy import ( + JsonCssExtractionStrategy, + LLMExtractionStrategy, +) + +__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) + +print("Crawl4AI: Advanced Web Crawling and Data Extraction") +print("GitHub Repository: https://github.com/unclecode/crawl4ai") +print("Twitter: @unclecode") +print("Website: https://crawl4ai.com") + + +# Basic Example - Simple Crawl +async def simple_crawl(): + print("\n--- Basic Usage ---") + browser_config = BrowserConfig(headless=True) + crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", config=crawler_config + ) + print(result.markdown[:500]) + + +async def clean_content(): + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + excluded_tags=["nav", "footer", "aside"], + remove_overlay_elements=True, + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter( + threshold=0.48, threshold_type="fixed", min_word_threshold=0 + ), + options={"ignore_links": True}, + ), + ) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://en.wikipedia.org/wiki/Apple", + config=crawler_config, + ) + full_markdown_length = len(result.markdown.raw_markdown) + fit_markdown_length = len(result.markdown.fit_markdown) + print(f"Full Markdown Length: {full_markdown_length}") + print(f"Fit Markdown Length: {fit_markdown_length}") + + +async def link_analysis(): + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.ENABLED, + exclude_external_links=True, + exclude_social_media_links=True, + ) 
+ async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", + config=crawler_config, + ) + print(f"Found {len(result.links['internal'])} internal links") + print(f"Found {len(result.links['external'])} external links") + + for link in result.links["internal"][:5]: + print(f"Href: {link['href']}\nText: {link['text']}\n") + + +# JavaScript Execution Example +async def simple_example_with_running_js_code(): + print("\n--- Executing JavaScript and Using CSS Selectors ---") + + browser_config = BrowserConfig(headless=True, java_script_enabled=True) + + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + js_code="const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();", + # wait_for="() => { return Array.from(document.querySelectorAll('article.tease-card')).length > 10; }" + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", config=crawler_config + ) + print(result.markdown[:500]) + + +# CSS Selector Example +async def simple_example_with_css_selector(): + print("\n--- Using CSS Selectors ---") + browser_config = BrowserConfig(headless=True) + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, css_selector=".wide-tease-item__description" + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", config=crawler_config + ) + print(result.markdown[:500]) + + +async def media_handling(): + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, exclude_external_images=True, screenshot=True + ) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", config=crawler_config + ) + for img in result.media["images"][:5]: + print(f"Image URL: {img['src']}, Alt: {img['alt']}, Score: {img['score']}") + + +async def custom_hook_workflow(verbose=True): + async with AsyncWebCrawler() as crawler: + # Set a 'before_goto' hook to run custom code just before navigation + crawler.crawler_strategy.set_hook( + "before_goto", + lambda page, context: print("[Hook] Preparing to navigate..."), + ) + + # Perform the crawl operation + result = await crawler.arun(url="https://crawl4ai.com") + print(result.markdown.raw_markdown[:500].replace("\n", " -- ")) + + +# Proxy Example +async def use_proxy(): + print("\n--- Using a Proxy ---") + browser_config = BrowserConfig( + headless=True, + proxy_config={ + "server": "http://proxy.example.com:8080", + "username": "username", + "password": "password", + }, + ) + crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", config=crawler_config + ) + if result.success: + print(result.markdown[:500]) + + +# Screenshot Example +async def capture_and_save_screenshot(url: str, output_path: str): + browser_config = BrowserConfig(headless=True) + crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, screenshot=True) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url=url, config=crawler_config) + + if result.success and result.screenshot: + import base64 + + screenshot_data = base64.b64decode(result.screenshot) + with open(output_path, "wb") as f: + 
f.write(screenshot_data) + print(f"Screenshot saved successfully to {output_path}") + else: + print("Failed to capture screenshot") + + +# LLM Extraction Example +class OpenAIModelFee(BaseModel): + model_name: str = Field(..., description="Name of the OpenAI model.") + input_fee: str = Field(..., description="Fee for input token for the OpenAI model.") + output_fee: str = Field( + ..., description="Fee for output token for the OpenAI model." + ) + + +async def extract_structured_data_using_llm( + provider: str, api_token: str = None, extra_headers: Dict[str, str] = None +): + print(f"\n--- Extracting Structured Data with {provider} ---") + + if api_token is None and provider != "ollama": + print(f"API token is required for {provider}. Skipping this example.") + return + + browser_config = BrowserConfig(headless=True) + + extra_args = {"temperature": 0, "top_p": 0.9, "max_tokens": 2000} + if extra_headers: + extra_args["extra_headers"] = extra_headers + + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + word_count_threshold=1, + page_timeout=80000, + extraction_strategy=LLMExtractionStrategy( + llm_config=LLMConfig(provider=provider,api_token=api_token), + schema=OpenAIModelFee.model_json_schema(), + extraction_type="schema", + instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. + Do not miss any models in the entire content.""", + extra_args=extra_args, + ), + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://openai.com/api/pricing/", config=crawler_config + ) + print(result.extracted_content) + + +# CSS Extraction Example +async def extract_structured_data_using_css_extractor(): + print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---") + schema = { + "name": "KidoCode Courses", + "baseSelector": "section.charge-methodology .framework-collection-item.w-dyn-item", + "fields": [ + { + "name": "section_title", + "selector": "h3.heading-50", + "type": "text", + }, + { + "name": "section_description", + "selector": ".charge-content", + "type": "text", + }, + { + "name": "course_name", + "selector": ".text-block-93", + "type": "text", + }, + { + "name": "course_description", + "selector": ".course-content-text", + "type": "text", + }, + { + "name": "course_icon", + "selector": ".image-92", + "type": "attribute", + "attribute": "src", + }, + ], + } + + browser_config = BrowserConfig(headless=True, java_script_enabled=True) + + js_click_tabs = """ + (async () => { + const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div"); + for(let tab of tabs) { + tab.scrollIntoView(); + tab.click(); + await new Promise(r => setTimeout(r, 500)); + } + })(); + """ + + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + extraction_strategy=JsonCssExtractionStrategy(schema), + js_code=[js_click_tabs], + delay_before_return_html=1 + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://www.kidocode.com/degrees/technology", config=crawler_config + ) + + companies = json.loads(result.extracted_content) + print(f"Successfully extracted {len(companies)} companies") + print(json.dumps(companies[0], indent=2)) + + +# Dynamic Content Examples - Method 1 +async def crawl_dynamic_content_pages_method_1(): + print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---") + first_commit = "" + + async def on_execution_started(page, 
**kwargs): + nonlocal first_commit + try: + while True: + await page.wait_for_selector("li.Box-sc-g0xbh4-0 h4") + commit = await page.query_selector("li.Box-sc-g0xbh4-0 h4") + commit = await commit.evaluate("(element) => element.textContent") + commit = re.sub(r"\s+", "", commit) + if commit and commit != first_commit: + first_commit = commit + break + await asyncio.sleep(0.5) + except Exception as e: + print(f"Warning: New content didn't appear after JavaScript execution: {e}") + + browser_config = BrowserConfig(headless=False, java_script_enabled=True) + + async with AsyncWebCrawler(config=browser_config) as crawler: + crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started) + + url = "https://github.com/microsoft/TypeScript/commits/main" + session_id = "typescript_commits_session" + all_commits = [] + + js_next_page = """ + const button = document.querySelector('a[data-testid="pagination-next-button"]'); + if (button) button.click(); + """ + + for page in range(3): + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + css_selector="li.Box-sc-g0xbh4-0", + js_code=js_next_page if page > 0 else None, + js_only=page > 0, + session_id=session_id, + ) + + result = await crawler.arun(url=url, config=crawler_config) + assert result.success, f"Failed to crawl page {page + 1}" + + soup = BeautifulSoup(result.cleaned_html, "html.parser") + commits = soup.select("li") + all_commits.extend(commits) + + print(f"Page {page + 1}: Found {len(commits)} commits") + + print(f"Successfully crawled {len(all_commits)} commits across 3 pages") + + +# Dynamic Content Examples - Method 2 +async def crawl_dynamic_content_pages_method_2(): + print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---") + + browser_config = BrowserConfig(headless=False, java_script_enabled=True) + + js_next_page_and_wait = """ + (async () => { + const getCurrentCommit = () => { + const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4'); + return commits.length > 0 ? 
commits[0].textContent.trim() : null; + }; + + const initialCommit = getCurrentCommit(); + const button = document.querySelector('a[data-testid="pagination-next-button"]'); + if (button) button.click(); + + while (true) { + await new Promise(resolve => setTimeout(resolve, 100)); + const newCommit = getCurrentCommit(); + if (newCommit && newCommit !== initialCommit) { + break; + } + } + })(); + """ + + schema = { + "name": "Commit Extractor", + "baseSelector": "li.Box-sc-g0xbh4-0", + "fields": [ + { + "name": "title", + "selector": "h4.markdown-title", + "type": "text", + "transform": "strip", + }, + ], + } + + async with AsyncWebCrawler(config=browser_config) as crawler: + url = "https://github.com/microsoft/TypeScript/commits/main" + session_id = "typescript_commits_session" + all_commits = [] + + extraction_strategy = JsonCssExtractionStrategy(schema) + + for page in range(3): + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + css_selector="li.Box-sc-g0xbh4-0", + extraction_strategy=extraction_strategy, + js_code=js_next_page_and_wait if page > 0 else None, + js_only=page > 0, + session_id=session_id, + ) + + result = await crawler.arun(url=url, config=crawler_config) + assert result.success, f"Failed to crawl page {page + 1}" + + commits = json.loads(result.extracted_content) + all_commits.extend(commits) + print(f"Page {page + 1}: Found {len(commits)} commits") + + print(f"Successfully crawled {len(all_commits)} commits across 3 pages") + + +async def cosine_similarity_extraction(): + from crawl4ai.extraction_strategy import CosineStrategy + crawl_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + extraction_strategy=CosineStrategy( + word_count_threshold=10, + max_dist=0.2, # Maximum distance between two words + linkage_method="ward", # Linkage method for hierarchical clustering (ward, complete, average, single) + top_k=3, # Number of top keywords to extract + sim_threshold=0.3, # Similarity threshold for clustering + semantic_filter="McDonald's economic impact, American consumer trends", # Keywords to filter the content semantically using embeddings + verbose=True, + ), + ) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business/consumer/how-mcdonalds-e-coli-crisis-inflation-politics-reflect-american-story-rcna177156", + config=crawl_config, + ) + print(json.loads(result.extracted_content)[:5]) + + +# Browser Comparison +async def crawl_custom_browser_type(): + print("\n--- Browser Comparison ---") + + # Firefox + browser_config_firefox = BrowserConfig(browser_type="firefox", headless=True) + start = time.time() + async with AsyncWebCrawler(config=browser_config_firefox) as crawler: + result = await crawler.arun( + url="https://www.example.com", + config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS), + ) + print("Firefox:", time.time() - start) + print(result.markdown[:500]) + + # WebKit + browser_config_webkit = BrowserConfig(browser_type="webkit", headless=True) + start = time.time() + async with AsyncWebCrawler(config=browser_config_webkit) as crawler: + result = await crawler.arun( + url="https://www.example.com", + config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS), + ) + print("WebKit:", time.time() - start) + print(result.markdown[:500]) + + # Chromium (default) + browser_config_chromium = BrowserConfig(browser_type="chromium", headless=True) + start = time.time() + async with AsyncWebCrawler(config=browser_config_chromium) as crawler: + result = await crawler.arun( + 
url="https://www.example.com", + config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS), + ) + print("Chromium:", time.time() - start) + print(result.markdown[:500]) + + +# Anti-Bot and User Simulation +async def crawl_with_user_simulation(): + browser_config = BrowserConfig( + headless=True, + user_agent_mode="random", + user_agent_generator_config={"device_type": "mobile", "os_type": "android"}, + ) + + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + magic=True, + simulate_user=True, + override_navigator=True, + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url="YOUR-URL-HERE", config=crawler_config) + print(result.markdown) + + +async def ssl_certification(): + # Configure crawler to fetch SSL certificate + config = CrawlerRunConfig( + fetch_ssl_certificate=True, + cache_mode=CacheMode.BYPASS, # Bypass cache to always get fresh certificates + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="https://example.com", config=config) + + if result.success and result.ssl_certificate: + cert = result.ssl_certificate + + tmp_dir = os.path.join(__location__, "tmp") + os.makedirs(tmp_dir, exist_ok=True) + + # 1. Access certificate properties directly + print("\nCertificate Information:") + print(f"Issuer: {cert.issuer.get('CN', '')}") + print(f"Valid until: {cert.valid_until}") + print(f"Fingerprint: {cert.fingerprint}") + + # 2. Export certificate in different formats + cert.to_json(os.path.join(tmp_dir, "certificate.json")) # For analysis + print("\nCertificate exported to:") + print(f"- JSON: {os.path.join(tmp_dir, 'certificate.json')}") + + pem_data = cert.to_pem( + os.path.join(tmp_dir, "certificate.pem") + ) # For web servers + print(f"- PEM: {os.path.join(tmp_dir, 'certificate.pem')}") + + der_data = cert.to_der( + os.path.join(tmp_dir, "certificate.der") + ) # For Java apps + print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}") + + +# Main execution +async def main(): + # Basic examples + await simple_crawl() + await simple_example_with_running_js_code() + await simple_example_with_css_selector() + + # Advanced examples + await extract_structured_data_using_css_extractor() + await extract_structured_data_using_llm( + "openai/gpt-4o", os.getenv("OPENAI_API_KEY") + ) + await crawl_dynamic_content_pages_method_1() + await crawl_dynamic_content_pages_method_2() + + # Browser comparisons + await crawl_custom_browser_type() + + # Screenshot example + await capture_and_save_screenshot( + "https://www.example.com", + os.path.join(__location__, "tmp/example_screenshot.jpg") + ) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/docs/examples/quickstart_examples.py b/docs/examples/quickstart_examples.py new file mode 100644 index 00000000..f9829f2d --- /dev/null +++ b/docs/examples/quickstart_examples.py @@ -0,0 +1,404 @@ +import asyncio +import os +import json +import base64 +from pathlib import Path +from typing import List +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, CrawlResult +from crawl4ai.configs import ProxyConfig +from crawl4ai import RoundRobinProxyStrategy +from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy +from crawl4ai import LLMConfig +from crawl4ai import PruningContentFilter +from crawl4ai import DefaultMarkdownGenerator +from crawl4ai import BFSDeepCrawlStrategy, DomainFilter, FilterChain +from crawl4ai import BrowserConfig + +__cur_dir__ = Path(__file__).parent + +async def demo_basic_crawl(): + """Basic web 
crawling with markdown generation""" + print("\n=== 1. Basic Web Crawling ===") + + async with AsyncWebCrawler() as crawler: + results: List[CrawlResult] = await crawler.arun( + url="https://news.ycombinator.com/", + ) + + for i, result in enumerate(results): + print(f"Result {i + 1}:") + print(f"Success: {result.success}") + if result.success: + print(f"Markdown length: {len(result.markdown.raw_markdown)} chars") + print(f"First 100 chars: {result.markdown.raw_markdown[:100]}...") + else: + print("Failed to crawl the URL") + + +async def demo_parallel_crawl(): + """Crawl multiple URLs in parallel""" + print("\n=== 2. Parallel Crawling ===") + + urls = [ + "https://news.ycombinator.com/", + "https://example.com/", + "https://httpbin.org/html", + ] + + async with AsyncWebCrawler() as crawler: + results: List[CrawlResult] = await crawler.arun_many( + urls=urls, + ) + + print(f"Crawled {len(results)} URLs in parallel:") + for i, result in enumerate(results): + print( + f" {i + 1}. {result.url} - {'Success' if result.success else 'Failed'}" + ) + + +async def demo_fit_markdown(): + """Generate focused markdown with LLM content filter""" + print("\n=== 3. Fit Markdown with LLM Content Filter ===") + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + "https://en.wikipedia.org/wiki/Python_(programming_language)", + config=CrawlerRunConfig( + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter() + ) + ), + ) + + # Print stats and save the fit markdown + print(f"Raw: {len(result.markdown.raw_markdown)} chars") + print(f"Fit: {len(result.markdown.fit_markdown)} chars") + + +async def demo_llm_structured_extraction_no_schema(): + # Create a simple LLM extraction strategy (no schema required) + extraction_strategy = LLMExtractionStrategy( + llm_config=LLMConfig( + provider="groq/qwen-2.5-32b", + api_token="env:GROQ_API_KEY", + ), + instruction="This is news.ycombinator.com, extract all news for each. title, source url, number of comments.", + extract_type="schema", + schema="{title: string, url: string, comments: int}", + extra_args={ + "temperature": 0.0, + "max_tokens": 4096, + }, + verbose=True, + ) + + config = CrawlerRunConfig(extraction_strategy=extraction_strategy) + + async with AsyncWebCrawler() as crawler: + results: List[CrawlResult] = await crawler.arun( + "https://news.ycombinator.com/", config=config + ) + + for result in results: + print(f"URL: {result.url}") + print(f"Success: {result.success}") + if result.success: + data = json.loads(result.extracted_content) + print(json.dumps(data, indent=2)) + else: + print("Failed to extract structured data") + + +async def demo_css_structured_extraction_no_schema(): + """Extract structured data using CSS selectors""" + print("\n=== 5. CSS-Based Structured Extraction ===") + # Sample HTML for schema generation (one-time cost) + sample_html = """ +
+ +
+
+
+ ... +
+
+
+

+    <div class="body-post clear">
+        <h2 class="home-title">Malicious Python Packages on PyPI Downloaded 39,000+ Times, Steal Sensitive Data</h2>
+        <div class="item-label">
+            <span class="h-datetime">Apr 05, 2025</span>
+            <span class="h-tags">Malware / Supply Chain Attack</span>
+        </div>
+        <div class="home-desc">Cybersecurity researchers have uncovered malicious libraries in the Python Package Index (PyPI) repository that are designed to steal sensitive information. Two of the packages, bitcoinlibdbfix and bitcoinlib-dev, masquerade as fixes for recent issues detected in a legitimate Python module called bitcoinlib, according to ReversingLabs . A third package discovered by Socket, disgrasya, contained a fully automated carding script targeting WooCommerce stores. The packages attracted hundreds of downloads before being taken down, according to statistics from pepy.tech - bitcoinlibdbfix - 1,101 downloads bitcoinlib-dev - 735 downloads disgrasya - 37,217 downloads "The malicious libraries both attempt a similar attack, overwriting the legitimate 'clw cli' command with malicious code that attempts to exfiltrate sensitive database files," ReversingLabs said. In an interesting twist, the authors of the counterfeit libraries are said to have joined a GitHub issue...</div>
+    </div>
+ """ + + # Generate schema using LLM (one-time setup) + schema = JsonCssExtractionStrategy.generate_schema( + html=sample_html, + llm_config=LLMConfig( + provider="groq/qwen-2.5-32b", + api_token="env:GROQ_API_KEY", + ), + query="From https://thehackernews.com/, I have shares a sample of one news div with a title, date, and description. Please generate a schema for this news div.", + ) + + print(f"Generated schema: {json.dumps(schema, indent=2)}") + # Save the schema to a file , and use it for future extractions, in result for such extraction you will call LLM once + # with open("schema.json", "w") as f: + # json.dump(schema, f, indent=2) + + # Create no-LLM extraction strategy with the generated schema + extraction_strategy = JsonCssExtractionStrategy(schema) + config = CrawlerRunConfig(extraction_strategy=extraction_strategy) + + # Use the fast CSS extraction (no LLM calls during extraction) + async with AsyncWebCrawler() as crawler: + results: List[CrawlResult] = await crawler.arun( + "https://thehackernews.com", config=config + ) + + for result in results: + print(f"URL: {result.url}") + print(f"Success: {result.success}") + if result.success: + data = json.loads(result.extracted_content) + print(json.dumps(data, indent=2)) + else: + print("Failed to extract structured data") + + +async def demo_deep_crawl(): + """Deep crawling with BFS strategy""" + print("\n=== 6. Deep Crawling ===") + + filter_chain = FilterChain([DomainFilter(allowed_domains=["crawl4ai.com"])]) + + deep_crawl_strategy = BFSDeepCrawlStrategy( + max_depth=1, max_pages=5, filter_chain=filter_chain + ) + + async with AsyncWebCrawler() as crawler: + results: List[CrawlResult] = await crawler.arun( + url="https://docs.crawl4ai.com", + config=CrawlerRunConfig(deep_crawl_strategy=deep_crawl_strategy), + ) + + print(f"Deep crawl returned {len(results)} pages:") + for i, result in enumerate(results): + depth = result.metadata.get("depth", "unknown") + print(f" {i + 1}. {result.url} (Depth: {depth})") + + +async def demo_js_interaction(): + """Execute JavaScript to load more content""" + print("\n=== 7. 
JavaScript Interaction ===") + + # A simple page that needs JS to reveal content + async with AsyncWebCrawler(config=BrowserConfig(headless=False)) as crawler: + # Initial load + + news_schema = { + "name": "news", + "baseSelector": "tr.athing", + "fields": [ + { + "name": "title", + "selector": "span.titleline", + "type": "text", + } + ], + } + results: List[CrawlResult] = await crawler.arun( + url="https://news.ycombinator.com", + config=CrawlerRunConfig( + session_id="hn_session", # Keep session + extraction_strategy=JsonCssExtractionStrategy(schema=news_schema), + ), + ) + + news = [] + for result in results: + if result.success: + data = json.loads(result.extracted_content) + news.extend(data) + print(json.dumps(data, indent=2)) + else: + print("Failed to extract structured data") + + print(f"Initial items: {len(news)}") + + # Click "More" link + more_config = CrawlerRunConfig( + js_code="document.querySelector('a.morelink').click();", + js_only=True, # Continue in same page + session_id="hn_session", # Keep session + extraction_strategy=JsonCssExtractionStrategy( + schema=news_schema, + ), + ) + + result: List[CrawlResult] = await crawler.arun( + url="https://news.ycombinator.com", config=more_config + ) + + # Extract new items + for result in results: + if result.success: + data = json.loads(result.extracted_content) + news.extend(data) + print(json.dumps(data, indent=2)) + else: + print("Failed to extract structured data") + print(f"Total items: {len(news)}") + + + +async def demo_media_and_links(): + """Extract media and links from a page""" + print("\n=== 8. Media and Links Extraction ===") + + async with AsyncWebCrawler() as crawler: + result: List[CrawlResult] = await crawler.arun("https://en.wikipedia.org/wiki/Main_Page") + + for i, result in enumerate(result): + # Extract and save all images + images = result.media.get("images", []) + print(f"Found {len(images)} images") + + # Extract and save all links (internal and external) + internal_links = result.links.get("internal", []) + external_links = result.links.get("external", []) + print(f"Found {len(internal_links)} internal links") + print(f"Found {len(external_links)} external links") + + # Save everything to files + with open("images.json", "w") as f: + json.dump(images, f, indent=2) + + with open("links.json", "w") as f: + json.dump( + {"internal": internal_links, "external": external_links}, + f, + indent=2, + ) + + +async def demo_screenshot_and_pdf(): + """Capture screenshot and PDF of a page""" + print("\n=== 9. Screenshot and PDF Capture ===") + + async with AsyncWebCrawler() as crawler: + result: List[CrawlResult] = await crawler.arun( + # url="https://example.com", + url="https://en.wikipedia.org/wiki/Giant_anteater", + config=CrawlerRunConfig(screenshot=True, pdf=True), + ) + + for i, result in enumerate(result): + if result.screenshot: + # Save screenshot + screenshot_path = f"{__cur_dir__}/tmp/example_screenshot.png" + with open(screenshot_path, "wb") as f: + f.write(base64.b64decode(result.screenshot)) + print(f"Screenshot saved to {screenshot_path}") + + if result.pdf: + # Save PDF + pdf_path = f"{__cur_dir__}/tmp/example.pdf" + with open(pdf_path, "wb") as f: + f.write(result.pdf) + print(f"PDF saved to {pdf_path}") + + +async def demo_proxy_rotation(): + """Proxy rotation for multiple requests""" + print("\n=== 10. 
Proxy Rotation ===") + + # Example proxies (replace with real ones) + proxies = [ + ProxyConfig(server="http://proxy1.example.com:8080"), + ProxyConfig(server="http://proxy2.example.com:8080"), + ] + + proxy_strategy = RoundRobinProxyStrategy(proxies) + + print(f"Using {len(proxies)} proxies in rotation") + print( + "Note: This example uses placeholder proxies - replace with real ones to test" + ) + + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig( + proxy_rotation_strategy=proxy_strategy, cache_mode=CacheMode.BYPASS + ) + + # In a real scenario, these would be run and the proxies would rotate + print("In a real scenario, requests would rotate through the available proxies") + + +async def demo_raw_html_and_file(): + """Process raw HTML and local files""" + print("\n=== 11. Raw HTML and Local Files ===") + + raw_html = """ + +

+    <html><body>
+        <h1>Sample Article</h1>
+        <p>This is sample content for testing Crawl4AI's raw HTML processing.</p>
+    </body></html>
+ + """ + + # Save to file + file_path = Path("docs/examples/tmp/sample.html").absolute() + with open(file_path, "w") as f: + f.write(raw_html) + + async with AsyncWebCrawler() as crawler: + # Crawl raw HTML + raw_result = await crawler.arun( + url="raw:" + raw_html, config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + ) + print("Raw HTML processing:") + print(f" Markdown: {raw_result.markdown.raw_markdown[:50]}...") + + # Crawl local file + file_result = await crawler.arun( + url=f"file://{file_path}", + config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS), + ) + print("\nLocal file processing:") + print(f" Markdown: {file_result.markdown.raw_markdown[:50]}...") + + # Clean up + os.remove(file_path) + print(f"Processed both raw HTML and local file ({file_path})") + + +async def main(): + """Run all demo functions sequentially""" + print("=== Comprehensive Crawl4AI Demo ===") + print("Note: Some examples require API keys or other configurations") + + # Run all demos + await demo_basic_crawl() + await demo_parallel_crawl() + await demo_fit_markdown() + await demo_llm_structured_extraction_no_schema() + await demo_css_structured_extraction_no_schema() + await demo_deep_crawl() + await demo_js_interaction() + await demo_media_and_links() + await demo_screenshot_and_pdf() + # await demo_proxy_rotation() + await demo_raw_html_and_file() + + # Clean up any temp files that may have been created + print("\n=== Demo Complete ===") + print("Check for any generated files (screenshots, PDFs) in the current directory") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/docs/examples/quickstart_v0.ipynb b/docs/examples/quickstart_v0.ipynb deleted file mode 100644 index 0282aa12..00000000 --- a/docs/examples/quickstart_v0.ipynb +++ /dev/null @@ -1,735 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "6yLvrXn7yZQI" - }, - "source": [ - "# Crawl4AI: Advanced Web Crawling and Data Extraction\n", - "\n", - "Welcome to this interactive notebook showcasing Crawl4AI, an advanced asynchronous web crawling and data extraction library.\n", - "\n", - "- GitHub Repository: [https://github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)\n", - "- Twitter: [@unclecode](https://twitter.com/unclecode)\n", - "- Website: [https://crawl4ai.com](https://crawl4ai.com)\n", - "\n", - "Let's explore the powerful features of Crawl4AI!" 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KIn_9nxFyZQK" - }, - "source": [ - "## Installation\n", - "\n", - "First, let's install Crawl4AI from GitHub:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "mSnaxLf3zMog" - }, - "outputs": [], - "source": [ - "!sudo apt-get update && sudo apt-get install -y libwoff1 libopus0 libwebp6 libwebpdemux2 libenchant1c2a libgudev-1.0-0 libsecret-1-0 libhyphen0 libgdk-pixbuf2.0-0 libegl1 libnotify4 libxslt1.1 libevent-2.1-7 libgles2 libvpx6 libxcomposite1 libatk1.0-0 libatk-bridge2.0-0 libepoxy0 libgtk-3-0 libharfbuzz-icu0" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xlXqaRtayZQK" - }, - "outputs": [], - "source": [ - "!pip install crawl4ai\n", - "!pip install nest-asyncio\n", - "!playwright install" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qKCE7TI7yZQL" - }, - "source": [ - "Now, let's import the necessary libraries:" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "I67tr7aAyZQL" - }, - "outputs": [], - "source": [ - "import asyncio\n", - "import nest_asyncio\n", - "from crawl4ai import AsyncWebCrawler\n", - "from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy\n", - "import json\n", - "import time\n", - "from pydantic import BaseModel, Field\n", - "\n", - "nest_asyncio.apply()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "h7yR_Rt_yZQM" - }, - "source": [ - "## Basic Usage\n", - "\n", - "Let's start with a simple crawl example:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "yBh6hf4WyZQM", - "outputId": "0f83af5c-abba-4175-ed95-70b7512e6bcc" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[LOG] 🌤️ Warming up the AsyncWebCrawler\n", - "[LOG] 🌞 AsyncWebCrawler is ready to crawl\n", - "[LOG] 🚀 Content extracted for https://www.nbcnews.com/business, success: True, time taken: 0.05 seconds\n", - "[LOG] 🚀 Extraction done for https://www.nbcnews.com/business, time taken: 0.05 seconds.\n", - "18102\n" - ] - } - ], - "source": [ - "async def simple_crawl():\n", - " async with AsyncWebCrawler(verbose=True) as crawler:\n", - " result = await crawler.arun(url=\"https://www.nbcnews.com/business\")\n", - " print(len(result.markdown))\n", - "await simple_crawl()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9rtkgHI28uI4" - }, - "source": [ - "💡 By default, **Crawl4AI** caches the result of every URL, so the next time you call it, you’ll get an instant result. But if you want to bypass the cache, just set `bypass_cache=True`." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MzZ0zlJ9yZQM" - }, - "source": [ - "## Advanced Features\n", - "\n", - "### Executing JavaScript and Using CSS Selectors" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "gHStF86xyZQM", - "outputId": "34d0fb6d-4dec-4677-f76e-85a1f082829b" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[LOG] 🌤️ Warming up the AsyncWebCrawler\n", - "[LOG] 🌞 AsyncWebCrawler is ready to crawl\n", - "[LOG] 🕸️ Crawling https://www.nbcnews.com/business using AsyncPlaywrightCrawlerStrategy...\n", - "[LOG] ✅ Crawled https://www.nbcnews.com/business successfully!\n", - "[LOG] 🚀 Crawling done for https://www.nbcnews.com/business, success: True, time taken: 6.06 seconds\n", - "[LOG] 🚀 Content extracted for https://www.nbcnews.com/business, success: True, time taken: 0.10 seconds\n", - "[LOG] 🔥 Extracting semantic blocks for https://www.nbcnews.com/business, Strategy: AsyncWebCrawler\n", - "[LOG] 🚀 Extraction done for https://www.nbcnews.com/business, time taken: 0.11 seconds.\n", - "41135\n" - ] - } - ], - "source": [ - "async def js_and_css():\n", - " async with AsyncWebCrawler(verbose=True) as crawler:\n", - " js_code = [\"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();\"]\n", - " result = await crawler.arun(\n", - " url=\"https://www.nbcnews.com/business\",\n", - " js_code=js_code,\n", - " # css_selector=\"YOUR_CSS_SELECTOR_HERE\",\n", - " bypass_cache=True\n", - " )\n", - " print(len(result.markdown))\n", - "\n", - "await js_and_css()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "cqE_W4coyZQM" - }, - "source": [ - "### Using a Proxy\n", - "\n", - "Note: You'll need to replace the proxy URL with a working proxy for this example to run successfully." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QjAyiAGqyZQM" - }, - "outputs": [], - "source": [ - "async def use_proxy():\n", - " async with AsyncWebCrawler(verbose=True, proxy=\"http://your-proxy-url:port\") as crawler:\n", - " result = await crawler.arun(\n", - " url=\"https://www.nbcnews.com/business\",\n", - " bypass_cache=True\n", - " )\n", - " print(result.markdown[:500]) # Print first 500 characters\n", - "\n", - "# Uncomment the following line to run the proxy example\n", - "# await use_proxy()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "XTZ88lbayZQN" - }, - "source": [ - "### Extracting Structured Data with OpenAI\n", - "\n", - "Note: You'll need to set your OpenAI API key as an environment variable for this example to work." 
- ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "fIOlDayYyZQN", - "outputId": "cb8359cc-dee0-4762-9698-5dfdcee055b8" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[LOG] 🌤️ Warming up the AsyncWebCrawler\n", - "[LOG] 🌞 AsyncWebCrawler is ready to crawl\n", - "[LOG] 🕸️ Crawling https://openai.com/api/pricing/ using AsyncPlaywrightCrawlerStrategy...\n", - "[LOG] ✅ Crawled https://openai.com/api/pricing/ successfully!\n", - "[LOG] 🚀 Crawling done for https://openai.com/api/pricing/, success: True, time taken: 3.77 seconds\n", - "[LOG] 🚀 Content extracted for https://openai.com/api/pricing/, success: True, time taken: 0.21 seconds\n", - "[LOG] 🔥 Extracting semantic blocks for https://openai.com/api/pricing/, Strategy: AsyncWebCrawler\n", - "[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 0\n", - "[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 1\n", - "[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 2\n", - "[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 3\n", - "[LOG] Extracted 4 blocks from URL: https://openai.com/api/pricing/ block index: 3\n", - "[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 4\n", - "[LOG] Extracted 5 blocks from URL: https://openai.com/api/pricing/ block index: 0\n", - "[LOG] Extracted 1 blocks from URL: https://openai.com/api/pricing/ block index: 4\n", - "[LOG] Extracted 8 blocks from URL: https://openai.com/api/pricing/ block index: 1\n", - "[LOG] Extracted 12 blocks from URL: https://openai.com/api/pricing/ block index: 2\n", - "[LOG] 🚀 Extraction done for https://openai.com/api/pricing/, time taken: 8.55 seconds.\n", - "5029\n" - ] - } - ], - "source": [ - "import os\n", - "from google.colab import userdata\n", - "os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')\n", - "\n", - "class OpenAIModelFee(BaseModel):\n", - " model_name: str = Field(..., description=\"Name of the OpenAI model.\")\n", - " input_fee: str = Field(..., description=\"Fee for input token for the OpenAI model.\")\n", - " output_fee: str = Field(..., description=\"Fee for output token for the OpenAI model.\")\n", - "\n", - "async def extract_openai_fees():\n", - " async with AsyncWebCrawler(verbose=True) as crawler:\n", - " result = await crawler.arun(\n", - " url='https://openai.com/api/pricing/',\n", - " word_count_threshold=1,\n", - " extraction_strategy=LLMExtractionStrategy(\n", - " provider=\"openai/gpt-4o\", api_token=os.getenv('OPENAI_API_KEY'),\n", - " schema=OpenAIModelFee.schema(),\n", - " extraction_type=\"schema\",\n", - " instruction=\"\"\"From the crawled content, extract all mentioned model names along with their fees for input and output tokens.\n", - " Do not miss any models in the entire content. 
One extracted model JSON format should look like this:\n", - " {\"model_name\": \"GPT-4\", \"input_fee\": \"US$10.00 / 1M tokens\", \"output_fee\": \"US$30.00 / 1M tokens\"}.\"\"\"\n", - " ),\n", - " bypass_cache=True,\n", - " )\n", - " print(len(result.extracted_content))\n", - "\n", - "# Uncomment the following line to run the OpenAI extraction example\n", - "await extract_openai_fees()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BypA5YxEyZQN" - }, - "source": [ - "### Advanced Multi-Page Crawling with JavaScript Execution" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tfkcVQ0b7mw-" - }, - "source": [ - "## Advanced Multi-Page Crawling with JavaScript Execution\n", - "\n", - "This example demonstrates Crawl4AI's ability to handle complex crawling scenarios, specifically extracting commits from multiple pages of a GitHub repository. The challenge here is that clicking the \"Next\" button doesn't load a new page, but instead uses asynchronous JavaScript to update the content. This is a common hurdle in modern web crawling.\n", - "\n", - "To overcome this, we use Crawl4AI's custom JavaScript execution to simulate clicking the \"Next\" button, and implement a custom hook to detect when new data has loaded. Our strategy involves comparing the first commit's text before and after \"clicking\" Next, waiting until it changes to confirm new data has rendered. This showcases Crawl4AI's flexibility in handling dynamic content and its ability to implement custom logic for even the most challenging crawling tasks." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "qUBKGpn3yZQN", - "outputId": "3e555b6a-ed33-42f4-cce9-499a923fbe17" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[LOG] 🌤️ Warming up the AsyncWebCrawler\n", - "[LOG] 🌞 AsyncWebCrawler is ready to crawl\n", - "[LOG] 🕸️ Crawling https://github.com/microsoft/TypeScript/commits/main using AsyncPlaywrightCrawlerStrategy...\n", - "[LOG] ✅ Crawled https://github.com/microsoft/TypeScript/commits/main successfully!\n", - "[LOG] 🚀 Crawling done for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 5.16 seconds\n", - "[LOG] 🚀 Content extracted for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 0.28 seconds\n", - "[LOG] 🔥 Extracting semantic blocks for https://github.com/microsoft/TypeScript/commits/main, Strategy: AsyncWebCrawler\n", - "[LOG] 🚀 Extraction done for https://github.com/microsoft/TypeScript/commits/main, time taken: 0.28 seconds.\n", - "Page 1: Found 35 commits\n", - "[LOG] 🕸️ Crawling https://github.com/microsoft/TypeScript/commits/main using AsyncPlaywrightCrawlerStrategy...\n", - "[LOG] ✅ Crawled https://github.com/microsoft/TypeScript/commits/main successfully!\n", - "[LOG] 🚀 Crawling done for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 0.78 seconds\n", - "[LOG] 🚀 Content extracted for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 0.90 seconds\n", - "[LOG] 🔥 Extracting semantic blocks for https://github.com/microsoft/TypeScript/commits/main, Strategy: AsyncWebCrawler\n", - "[LOG] 🚀 Extraction done for https://github.com/microsoft/TypeScript/commits/main, time taken: 0.90 seconds.\n", - "Page 2: Found 35 commits\n", - "[LOG] 🕸️ Crawling https://github.com/microsoft/TypeScript/commits/main using 
AsyncPlaywrightCrawlerStrategy...\n", - "[LOG] ✅ Crawled https://github.com/microsoft/TypeScript/commits/main successfully!\n", - "[LOG] 🚀 Crawling done for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 2.00 seconds\n", - "[LOG] 🚀 Content extracted for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 0.74 seconds\n", - "[LOG] 🔥 Extracting semantic blocks for https://github.com/microsoft/TypeScript/commits/main, Strategy: AsyncWebCrawler\n", - "[LOG] 🚀 Extraction done for https://github.com/microsoft/TypeScript/commits/main, time taken: 0.75 seconds.\n", - "Page 3: Found 35 commits\n", - "Successfully crawled 105 commits across 3 pages\n" - ] - } - ], - "source": [ - "import re\n", - "from bs4 import BeautifulSoup\n", - "\n", - "async def crawl_typescript_commits():\n", - " first_commit = \"\"\n", - " async def on_execution_started(page):\n", - " nonlocal first_commit\n", - " try:\n", - " while True:\n", - " await page.wait_for_selector('li.Box-sc-g0xbh4-0 h4')\n", - " commit = await page.query_selector('li.Box-sc-g0xbh4-0 h4')\n", - " commit = await commit.evaluate('(element) => element.textContent')\n", - " commit = re.sub(r'\\s+', '', commit)\n", - " if commit and commit != first_commit:\n", - " first_commit = commit\n", - " break\n", - " await asyncio.sleep(0.5)\n", - " except Exception as e:\n", - " print(f\"Warning: New content didn't appear after JavaScript execution: {e}\")\n", - "\n", - " async with AsyncWebCrawler(verbose=True) as crawler:\n", - " crawler.crawler_strategy.set_hook('on_execution_started', on_execution_started)\n", - "\n", - " url = \"https://github.com/microsoft/TypeScript/commits/main\"\n", - " session_id = \"typescript_commits_session\"\n", - " all_commits = []\n", - "\n", - " js_next_page = \"\"\"\n", - " const button = document.querySelector('a[data-testid=\"pagination-next-button\"]');\n", - " if (button) button.click();\n", - " \"\"\"\n", - "\n", - " for page in range(3): # Crawl 3 pages\n", - " result = await crawler.arun(\n", - " url=url,\n", - " session_id=session_id,\n", - " css_selector=\"li.Box-sc-g0xbh4-0\",\n", - " js=js_next_page if page > 0 else None,\n", - " bypass_cache=True,\n", - " js_only=page > 0\n", - " )\n", - "\n", - " assert result.success, f\"Failed to crawl page {page + 1}\"\n", - "\n", - " soup = BeautifulSoup(result.cleaned_html, 'html.parser')\n", - " commits = soup.select(\"li\")\n", - " all_commits.extend(commits)\n", - "\n", - " print(f\"Page {page + 1}: Found {len(commits)} commits\")\n", - "\n", - " await crawler.crawler_strategy.kill_session(session_id)\n", - " print(f\"Successfully crawled {len(all_commits)} commits across 3 pages\")\n", - "\n", - "await crawl_typescript_commits()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EJRnYsp6yZQN" - }, - "source": [ - "### Using JsonCssExtractionStrategy for Fast Structured Output" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1ZMqIzB_8SYp" - }, - "source": [ - "The JsonCssExtractionStrategy is a powerful feature of Crawl4AI that allows for precise, structured data extraction from web pages. Here's how it works:\n", - "\n", - "1. You define a schema that describes the pattern of data you're interested in extracting.\n", - "2. The schema includes a base selector that identifies repeating elements on the page.\n", - "3. Within the schema, you define fields, each with its own selector and type.\n", - "4. 
These field selectors are applied within the context of each base selector element.\n", - "5. The strategy supports nested structures, lists within lists, and various data types.\n", - "6. You can even include computed fields for more complex data manipulation.\n", - "\n", - "This approach allows for highly flexible and precise data extraction, transforming semi-structured web content into clean, structured JSON data. It's particularly useful for extracting consistent data patterns from pages like product listings, news articles, or search results.\n", - "\n", - "For more details and advanced usage, check out the full documentation on the Crawl4AI website." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "trCMR2T9yZQN", - "outputId": "718d36f4-cccf-40f4-8d8c-c3ba73524d16" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[LOG] 🌤️ Warming up the AsyncWebCrawler\n", - "[LOG] 🌞 AsyncWebCrawler is ready to crawl\n", - "[LOG] 🕸️ Crawling https://www.nbcnews.com/business using AsyncPlaywrightCrawlerStrategy...\n", - "[LOG] ✅ Crawled https://www.nbcnews.com/business successfully!\n", - "[LOG] 🚀 Crawling done for https://www.nbcnews.com/business, success: True, time taken: 7.00 seconds\n", - "[LOG] 🚀 Content extracted for https://www.nbcnews.com/business, success: True, time taken: 0.32 seconds\n", - "[LOG] 🔥 Extracting semantic blocks for https://www.nbcnews.com/business, Strategy: AsyncWebCrawler\n", - "[LOG] 🚀 Extraction done for https://www.nbcnews.com/business, time taken: 0.48 seconds.\n", - "Successfully extracted 11 news teasers\n", - "{\n", - " \"category\": \"Business News\",\n", - " \"headline\": \"NBC ripped up its Olympics playbook for 2024 \\u2014 so far, the new strategy paid off\",\n", - " \"summary\": \"The Olympics have long been key to NBCUniversal. 
Paris marked the 18th Olympic Games broadcast by NBC in the U.S.\",\n", - " \"time\": \"13h ago\",\n", - " \"image\": {\n", - " \"src\": \"https://media-cldnry.s-nbcnews.com/image/upload/t_focal-200x100,f_auto,q_auto:best/rockcms/2024-09/240903-nbc-olympics-ch-1344-c7a486.jpg\",\n", - " \"alt\": \"Mike Tirico.\"\n", - " },\n", - " \"link\": \"https://www.nbcnews.com/business\"\n", - "}\n" - ] - } - ], - "source": [ - "async def extract_news_teasers():\n", - " schema = {\n", - " \"name\": \"News Teaser Extractor\",\n", - " \"baseSelector\": \".wide-tease-item__wrapper\",\n", - " \"fields\": [\n", - " {\n", - " \"name\": \"category\",\n", - " \"selector\": \".unibrow span[data-testid='unibrow-text']\",\n", - " \"type\": \"text\",\n", - " },\n", - " {\n", - " \"name\": \"headline\",\n", - " \"selector\": \".wide-tease-item__headline\",\n", - " \"type\": \"text\",\n", - " },\n", - " {\n", - " \"name\": \"summary\",\n", - " \"selector\": \".wide-tease-item__description\",\n", - " \"type\": \"text\",\n", - " },\n", - " {\n", - " \"name\": \"time\",\n", - " \"selector\": \"[data-testid='wide-tease-date']\",\n", - " \"type\": \"text\",\n", - " },\n", - " {\n", - " \"name\": \"image\",\n", - " \"type\": \"nested\",\n", - " \"selector\": \"picture.teasePicture img\",\n", - " \"fields\": [\n", - " {\"name\": \"src\", \"type\": \"attribute\", \"attribute\": \"src\"},\n", - " {\"name\": \"alt\", \"type\": \"attribute\", \"attribute\": \"alt\"},\n", - " ],\n", - " },\n", - " {\n", - " \"name\": \"link\",\n", - " \"selector\": \"a[href]\",\n", - " \"type\": \"attribute\",\n", - " \"attribute\": \"href\",\n", - " },\n", - " ],\n", - " }\n", - "\n", - " extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)\n", - "\n", - " async with AsyncWebCrawler(verbose=True) as crawler:\n", - " result = await crawler.arun(\n", - " url=\"https://www.nbcnews.com/business\",\n", - " extraction_strategy=extraction_strategy,\n", - " bypass_cache=True,\n", - " )\n", - "\n", - " assert result.success, \"Failed to crawl the page\"\n", - "\n", - " news_teasers = json.loads(result.extracted_content)\n", - " print(f\"Successfully extracted {len(news_teasers)} news teasers\")\n", - " print(json.dumps(news_teasers[0], indent=2))\n", - "\n", - "await extract_news_teasers()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FnyVhJaByZQN" - }, - "source": [ - "## Speed Comparison\n", - "\n", - "Let's compare the speed of Crawl4AI with Firecrawl, a paid service. Note that we can't run Firecrawl in this Colab environment, so we'll simulate its performance based on previously recorded data." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "agDD186f3wig" - }, - "source": [ - "💡 **Note on Speed Comparison:**\n", - "\n", - "The speed test conducted here is running on Google Colab, where the internet speed and performance can vary and may not reflect optimal conditions. When we call Firecrawl's API, we're seeing its best performance, while Crawl4AI's performance is limited by Colab's network speed.\n", - "\n", - "For a more accurate comparison, it's recommended to run these tests on your own servers or computers with a stable and fast internet connection. Despite these limitations, Crawl4AI still demonstrates faster performance in this environment.\n", - "\n", - "If you run these tests locally, you may observe an even more significant speed advantage for Crawl4AI compared to other services." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "F7KwHv8G1LbY" - }, - "outputs": [], - "source": [ - "!pip install firecrawl" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "91813zILyZQN", - "outputId": "663223db-ab89-4976-b233-05ceca62b19b" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Firecrawl (simulated):\n", - "Time taken: 4.38 seconds\n", - "Content length: 41967 characters\n", - "Images found: 49\n", - "\n", - "Crawl4AI (simple crawl):\n", - "Time taken: 4.22 seconds\n", - "Content length: 18221 characters\n", - "Images found: 49\n", - "\n", - "Crawl4AI (with JavaScript execution):\n", - "Time taken: 9.13 seconds\n", - "Content length: 34243 characters\n", - "Images found: 89\n" - ] - } - ], - "source": [ - "import os\n", - "from google.colab import userdata\n", - "os.environ['FIRECRAWL_API_KEY'] = userdata.get('FIRECRAWL_API_KEY')\n", - "import time\n", - "from firecrawl import FirecrawlApp\n", - "\n", - "async def speed_comparison():\n", - " # Simulated Firecrawl performance\n", - " app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY'])\n", - " start = time.time()\n", - " scrape_status = app.scrape_url(\n", - " 'https://www.nbcnews.com/business',\n", - " params={'formats': ['markdown', 'html']}\n", - " )\n", - " end = time.time()\n", - " print(\"Firecrawl (simulated):\")\n", - " print(f\"Time taken: {end - start:.2f} seconds\")\n", - " print(f\"Content length: {len(scrape_status['markdown'])} characters\")\n", - " print(f\"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}\")\n", - " print()\n", - "\n", - " async with AsyncWebCrawler() as crawler:\n", - " # Crawl4AI simple crawl\n", - " start = time.time()\n", - " result = await crawler.arun(\n", - " url=\"https://www.nbcnews.com/business\",\n", - " word_count_threshold=0,\n", - " bypass_cache=True,\n", - " verbose=False\n", - " )\n", - " end = time.time()\n", - " print(\"Crawl4AI (simple crawl):\")\n", - " print(f\"Time taken: {end - start:.2f} seconds\")\n", - " print(f\"Content length: {len(result.markdown)} characters\")\n", - " print(f\"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}\")\n", - " print()\n", - "\n", - " # Crawl4AI with JavaScript execution\n", - " start = time.time()\n", - " result = await crawler.arun(\n", - " url=\"https://www.nbcnews.com/business\",\n", - " js_code=[\"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();\"],\n", - " word_count_threshold=0,\n", - " bypass_cache=True,\n", - " verbose=False\n", - " )\n", - " end = time.time()\n", - " print(\"Crawl4AI (with JavaScript execution):\")\n", - " print(f\"Time taken: {end - start:.2f} seconds\")\n", - " print(f\"Content length: {len(result.markdown)} characters\")\n", - " print(f\"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}\")\n", - "\n", - "await speed_comparison()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "OBFFYVJIyZQN" - }, - "source": [ - "If you run on a local machine with a proper internet speed:\n", - "- Simple crawl: Crawl4AI is typically over 3-4 times faster than Firecrawl.\n", - "- With JavaScript execution: Even when executing JavaScript to load more content (potentially doubling the number of images found), Crawl4AI is still faster than Firecrawl's simple crawl.\n", - "\n", 
- "Please note that actual performance may vary depending on network conditions and the specific content being crawled." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "A6_1RK1_yZQO" - }, - "source": [ - "## Conclusion\n", - "\n", - "In this notebook, we've explored the powerful features of Crawl4AI, including:\n", - "\n", - "1. Basic crawling\n", - "2. JavaScript execution and CSS selector usage\n", - "3. Proxy support\n", - "4. Structured data extraction with OpenAI\n", - "5. Advanced multi-page crawling with JavaScript execution\n", - "6. Fast structured output using JsonCssExtractionStrategy\n", - "7. Speed comparison with other services\n", - "\n", - "Crawl4AI offers a fast, flexible, and powerful solution for web crawling and data extraction tasks. Its asynchronous architecture and advanced features make it suitable for a wide range of applications, from simple web scraping to complex, multi-page data extraction scenarios.\n", - "\n", - "For more information and advanced usage, please visit the [Crawl4AI documentation](https://docs.crawl4ai.com/).\n", - "\n", - "Happy crawling!" - ] - } - ], - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} From ca9351252a20797acd3d6a7e8adfedbd4317a100 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 5 Apr 2025 22:55:56 +0800 Subject: [PATCH 34/78] refactor(docs): update import paths and clean up example code in quickstart_examples.py --- docs/examples/quickstart_examples.py | 100 ++++++++++++++------------- 1 file changed, 51 insertions(+), 49 deletions(-) diff --git a/docs/examples/quickstart_examples.py b/docs/examples/quickstart_examples.py index f9829f2d..e94b8486 100644 --- a/docs/examples/quickstart_examples.py +++ b/docs/examples/quickstart_examples.py @@ -5,7 +5,7 @@ import base64 from pathlib import Path from typing import List from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, CrawlResult -from crawl4ai.configs import ProxyConfig +from crawl4ai.proxy_strategy import ProxyConfig from crawl4ai import RoundRobinProxyStrategy from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy from crawl4ai import LLMConfig @@ -19,10 +19,9 @@ __cur_dir__ = Path(__file__).parent async def demo_basic_crawl(): """Basic web crawling with markdown generation""" print("\n=== 1. Basic Web Crawling ===") - async with AsyncWebCrawler() as crawler: results: List[CrawlResult] = await crawler.arun( - url="https://news.ycombinator.com/", + url="https://news.ycombinator.com/" ) for i, result in enumerate(results): @@ -34,7 +33,6 @@ async def demo_basic_crawl(): else: print("Failed to crawl the URL") - async def demo_parallel_crawl(): """Crawl multiple URLs in parallel""" print("\n=== 2. Parallel Crawling ===") @@ -56,14 +54,13 @@ async def demo_parallel_crawl(): f" {i + 1}. {result.url} - {'Success' if result.success else 'Failed'}" ) - async def demo_fit_markdown(): """Generate focused markdown with LLM content filter""" print("\n=== 3. 
Fit Markdown with LLM Content Filter ===") async with AsyncWebCrawler() as crawler: - result = await crawler.arun( - "https://en.wikipedia.org/wiki/Python_(programming_language)", + result: CrawlResult = await crawler.arun( + url = "https://en.wikipedia.org/wiki/Python_(programming_language)", config=CrawlerRunConfig( markdown_generator=DefaultMarkdownGenerator( content_filter=PruningContentFilter() @@ -75,7 +72,6 @@ async def demo_fit_markdown(): print(f"Raw: {len(result.markdown.raw_markdown)} chars") print(f"Fit: {len(result.markdown.fit_markdown)} chars") - async def demo_llm_structured_extraction_no_schema(): # Create a simple LLM extraction strategy (no schema required) extraction_strategy = LLMExtractionStrategy( @@ -83,7 +79,7 @@ async def demo_llm_structured_extraction_no_schema(): provider="groq/qwen-2.5-32b", api_token="env:GROQ_API_KEY", ), - instruction="This is news.ycombinator.com, extract all news for each. title, source url, number of comments.", + instruction="This is news.ycombinator.com, extract all news, and for each, I want title, source url, number of comments.", extract_type="schema", schema="{title: string, url: string, comments: int}", extra_args={ @@ -109,7 +105,6 @@ async def demo_llm_structured_extraction_no_schema(): else: print("Failed to extract structured data") - async def demo_css_structured_extraction_no_schema(): """Extract structured data using CSS selectors""" print("\n=== 5. CSS-Based Structured Extraction ===") @@ -129,27 +124,33 @@ async def demo_css_structured_extraction_no_schema(): Apr 05, 2025 Malware / Supply Chain Attack -
Cybersecurity researchers have uncovered malicious libraries in the Python Package Index (PyPI) repository that are designed to steal sensitive information. Two of the packages, bitcoinlibdbfix and bitcoinlib-dev, masquerade as fixes for recent issues detected in a legitimate Python module called bitcoinlib, according to ReversingLabs . A third package discovered by Socket, disgrasya, contained a fully automated carding script targeting WooCommerce stores. The packages attracted hundreds of downloads before being taken down, according to statistics from pepy.tech - bitcoinlibdbfix - 1,101 downloads bitcoinlib-dev - 735 downloads disgrasya - 37,217 downloads "The malicious libraries both attempt a similar attack, overwriting the legitimate 'clw cli' command with malicious code that attempts to exfiltrate sensitive database files," ReversingLabs said. In an interesting twist, the authors of the counterfeit libraries are said to have joined a GitHub issue...
+
Cybersecurity researchers have...
""" - # Generate schema using LLM (one-time setup) - schema = JsonCssExtractionStrategy.generate_schema( - html=sample_html, - llm_config=LLMConfig( - provider="groq/qwen-2.5-32b", - api_token="env:GROQ_API_KEY", - ), - query="From https://thehackernews.com/, I have shares a sample of one news div with a title, date, and description. Please generate a schema for this news div.", - ) + # Check if schema file exists + schema_file_path = f"{__cur_dir__}/tmp/schema.json" + if os.path.exists(schema_file_path): + with open(schema_file_path, "r") as f: + schema = json.load(f) + else: + # Generate schema using LLM (one-time setup) + schema = JsonCssExtractionStrategy.generate_schema( + html=sample_html, + llm_config=LLMConfig( + provider="groq/qwen-2.5-32b", + api_token="env:GROQ_API_KEY", + ), + query="From https://thehackernews.com/, I have shared a sample of one news div with a title, date, and description. Please generate a schema for this news div.", + ) print(f"Generated schema: {json.dumps(schema, indent=2)}") # Save the schema to a file , and use it for future extractions, in result for such extraction you will call LLM once - # with open("schema.json", "w") as f: - # json.dump(schema, f, indent=2) + with open(f"{__cur_dir__}/tmp/schema.json", "w") as f: + json.dump(schema, f, indent=2) # Create no-LLM extraction strategy with the generated schema extraction_strategy = JsonCssExtractionStrategy(schema) @@ -170,7 +171,6 @@ async def demo_css_structured_extraction_no_schema(): else: print("Failed to extract structured data") - async def demo_deep_crawl(): """Deep crawling with BFS strategy""" print("\n=== 6. Deep Crawling ===") @@ -192,7 +192,6 @@ async def demo_deep_crawl(): depth = result.metadata.get("depth", "unknown") print(f" {i + 1}. {result.url} (Depth: {depth})") - async def demo_js_interaction(): """Execute JavaScript to load more content""" print("\n=== 7. JavaScript Interaction ===") @@ -255,8 +254,6 @@ async def demo_js_interaction(): print("Failed to extract structured data") print(f"Total items: {len(news)}") - - async def demo_media_and_links(): """Extract media and links from a page""" print("\n=== 8. 
Media and Links Extraction ===") @@ -275,17 +272,24 @@ async def demo_media_and_links(): print(f"Found {len(internal_links)} internal links") print(f"Found {len(external_links)} external links") - # Save everything to files - with open("images.json", "w") as f: - json.dump(images, f, indent=2) + # Print some of the images and links + for image in images[:3]: + print(f"Image: {image['src']}") + for link in internal_links[:3]: + print(f"Internal link: {link['href']}") + for link in external_links[:3]: + print(f"External link: {link['href']}") - with open("links.json", "w") as f: - json.dump( - {"internal": internal_links, "external": external_links}, - f, - indent=2, - ) + # # Save everything to files + # with open("images.json", "w") as f: + # json.dump(images, f, indent=2) + # with open("links.json", "w") as f: + # json.dump( + # {"internal": internal_links, "external": external_links}, + # f, + # indent=2, + # ) async def demo_screenshot_and_pdf(): """Capture screenshot and PDF of a page""" @@ -299,6 +303,7 @@ async def demo_screenshot_and_pdf(): ) for i, result in enumerate(result): + # if result.screenshot_data: if result.screenshot: # Save screenshot screenshot_path = f"{__cur_dir__}/tmp/example_screenshot.png" @@ -306,6 +311,7 @@ async def demo_screenshot_and_pdf(): f.write(base64.b64decode(result.screenshot)) print(f"Screenshot saved to {screenshot_path}") + # if result.pdf_data: if result.pdf: # Save PDF pdf_path = f"{__cur_dir__}/tmp/example.pdf" @@ -313,7 +319,6 @@ async def demo_screenshot_and_pdf(): f.write(result.pdf) print(f"PDF saved to {pdf_path}") - async def demo_proxy_rotation(): """Proxy rotation for multiple requests""" print("\n=== 10. Proxy Rotation ===") @@ -339,7 +344,6 @@ async def demo_proxy_rotation(): # In a real scenario, these would be run and the proxies would rotate print("In a real scenario, requests would rotate through the available proxies") - async def demo_raw_html_and_file(): """Process raw HTML and local files""" print("\n=== 11. 
Raw HTML and Local Files ===") @@ -376,29 +380,27 @@ async def demo_raw_html_and_file(): os.remove(file_path) print(f"Processed both raw HTML and local file ({file_path})") - async def main(): """Run all demo functions sequentially""" print("=== Comprehensive Crawl4AI Demo ===") print("Note: Some examples require API keys or other configurations") # Run all demos - await demo_basic_crawl() - await demo_parallel_crawl() - await demo_fit_markdown() - await demo_llm_structured_extraction_no_schema() - await demo_css_structured_extraction_no_schema() + # await demo_basic_crawl() + # await demo_parallel_crawl() + # await demo_fit_markdown() + # await demo_llm_structured_extraction_no_schema() + # await demo_css_structured_extraction_no_schema() await demo_deep_crawl() - await demo_js_interaction() - await demo_media_and_links() - await demo_screenshot_and_pdf() - # await demo_proxy_rotation() - await demo_raw_html_and_file() + # await demo_js_interaction() + # await demo_media_and_links() + # await demo_screenshot_and_pdf() + # # await demo_proxy_rotation() + # await demo_raw_html_and_file() # Clean up any temp files that may have been created print("\n=== Demo Complete ===") print("Check for any generated files (screenshots, PDFs) in the current directory") - if __name__ == "__main__": asyncio.run(main()) From 49d904ca0aa34fedaa3c9527bcc568046c53b10c Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 5 Apr 2025 22:57:45 +0800 Subject: [PATCH 35/78] refactor(docs): enhance quickstart_examples.py with improved configuration and file handling --- docs/examples/quickstart_examples.py | 48 ++++++++++++++++------------ 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/docs/examples/quickstart_examples.py b/docs/examples/quickstart_examples.py index e94b8486..76224746 100644 --- a/docs/examples/quickstart_examples.py +++ b/docs/examples/quickstart_examples.py @@ -4,12 +4,13 @@ import json import base64 from pathlib import Path from typing import List -from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, CrawlResult from crawl4ai.proxy_strategy import ProxyConfig + +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, CrawlResult from crawl4ai import RoundRobinProxyStrategy from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy from crawl4ai import LLMConfig -from crawl4ai import PruningContentFilter +from crawl4ai import PruningContentFilter, BM25ContentFilter from crawl4ai import DefaultMarkdownGenerator from crawl4ai import BFSDeepCrawlStrategy, DomainFilter, FilterChain from crawl4ai import BrowserConfig @@ -19,7 +20,12 @@ __cur_dir__ = Path(__file__).parent async def demo_basic_crawl(): """Basic web crawling with markdown generation""" print("\n=== 1. 
Basic Web Crawling ===") - async with AsyncWebCrawler() as crawler: + async with AsyncWebCrawler(config = BrowserConfig( + viewport_height=800, + viewport_width=1200, + headless=True, + verbose=True, + )) as crawler: results: List[CrawlResult] = await crawler.arun( url="https://news.ycombinator.com/" ) @@ -281,15 +287,15 @@ async def demo_media_and_links(): print(f"External link: {link['href']}") # # Save everything to files - # with open("images.json", "w") as f: - # json.dump(images, f, indent=2) + with open(f"{__cur_dir__}/tmp/images.json", "w") as f: + json.dump(images, f, indent=2) - # with open("links.json", "w") as f: - # json.dump( - # {"internal": internal_links, "external": external_links}, - # f, - # indent=2, - # ) + with open(f"{__cur_dir__}/tmp/links.json", "w") as f: + json.dump( + {"internal": internal_links, "external": external_links}, + f, + indent=2, + ) async def demo_screenshot_and_pdf(): """Capture screenshot and PDF of a page""" @@ -338,7 +344,7 @@ async def demo_proxy_rotation(): async with AsyncWebCrawler() as crawler: config = CrawlerRunConfig( - proxy_rotation_strategy=proxy_strategy, cache_mode=CacheMode.BYPASS + proxy_rotation_strategy=proxy_strategy ) # In a real scenario, these would be run and the proxies would rotate @@ -386,17 +392,17 @@ async def main(): print("Note: Some examples require API keys or other configurations") # Run all demos - # await demo_basic_crawl() - # await demo_parallel_crawl() - # await demo_fit_markdown() - # await demo_llm_structured_extraction_no_schema() - # await demo_css_structured_extraction_no_schema() + await demo_basic_crawl() + await demo_parallel_crawl() + await demo_fit_markdown() + await demo_llm_structured_extraction_no_schema() + await demo_css_structured_extraction_no_schema() await demo_deep_crawl() - # await demo_js_interaction() - # await demo_media_and_links() - # await demo_screenshot_and_pdf() + await demo_js_interaction() + await demo_media_and_links() + await demo_screenshot_and_pdf() # # await demo_proxy_rotation() - # await demo_raw_html_and_file() + await demo_raw_html_and_file() # Clean up any temp files that may have been created print("\n=== Demo Complete ===") From b1693b1c215bc7c7bbf9379e9e311a6f843d9dc3 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 5 Apr 2025 23:10:25 +0800 Subject: [PATCH 36/78] Remove old quickstart files --- ...ickstart_async.config.py => quickstart.py} | 0 docs/examples/quickstart_async.py | 675 ---------------- docs/examples/quickstart_sync.py | 405 ---------- docs/examples/quickstart_v0.ipynb | 735 ------------------ 4 files changed, 1815 deletions(-) rename docs/examples/{quickstart_async.config.py => quickstart.py} (100%) delete mode 100644 docs/examples/quickstart_async.py delete mode 100644 docs/examples/quickstart_sync.py delete mode 100644 docs/examples/quickstart_v0.ipynb diff --git a/docs/examples/quickstart_async.config.py b/docs/examples/quickstart.py similarity index 100% rename from docs/examples/quickstart_async.config.py rename to docs/examples/quickstart.py diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py deleted file mode 100644 index aeb0d20a..00000000 --- a/docs/examples/quickstart_async.py +++ /dev/null @@ -1,675 +0,0 @@ -import os, sys - -from crawl4ai import LLMConfig - -# append parent directory to system path -sys.path.append( - os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -) -os.environ["FIRECRAWL_API_KEY"] = "fc-84b370ccfad44beabc686b38f1769692" - -import asyncio -# import 
nest_asyncio -# nest_asyncio.apply() - -import time -import json -import os -import re -from typing import Dict, List -from bs4 import BeautifulSoup -from pydantic import BaseModel, Field -from crawl4ai import AsyncWebCrawler, CacheMode -from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator -from crawl4ai.content_filter_strategy import PruningContentFilter -from crawl4ai.extraction_strategy import ( - JsonCssExtractionStrategy, - LLMExtractionStrategy, -) - -__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) - -print("Crawl4AI: Advanced Web Crawling and Data Extraction") -print("GitHub Repository: https://github.com/unclecode/crawl4ai") -print("Twitter: @unclecode") -print("Website: https://crawl4ai.com") - - -async def simple_crawl(): - print("\n--- Basic Usage ---") - async with AsyncWebCrawler(verbose=True) as crawler: - result = await crawler.arun( - url="https://www.nbcnews.com/business", cache_mode=CacheMode.BYPASS - ) - print(result.markdown[:500]) # Print first 500 characters - - -async def simple_example_with_running_js_code(): - print("\n--- Executing JavaScript and Using CSS Selectors ---") - # New code to handle the wait_for parameter - wait_for = """() => { - return Array.from(document.querySelectorAll('article.tease-card')).length > 10; - }""" - - # wait_for can be also just a css selector - # wait_for = "article.tease-card:nth-child(10)" - - async with AsyncWebCrawler(verbose=True) as crawler: - js_code = [ - "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();" - ] - result = await crawler.arun( - url="https://www.nbcnews.com/business", - js_code=js_code, - # wait_for=wait_for, - cache_mode=CacheMode.BYPASS, - ) - print(result.markdown[:500]) # Print first 500 characters - - -async def simple_example_with_css_selector(): - print("\n--- Using CSS Selectors ---") - async with AsyncWebCrawler(verbose=True) as crawler: - result = await crawler.arun( - url="https://www.nbcnews.com/business", - css_selector=".wide-tease-item__description", - cache_mode=CacheMode.BYPASS, - ) - print(result.markdown[:500]) # Print first 500 characters - - -async def use_proxy(): - print("\n--- Using a Proxy ---") - print( - "Note: Replace 'http://your-proxy-url:port' with a working proxy to run this example." 
- ) - # Uncomment and modify the following lines to use a proxy - async with AsyncWebCrawler( - verbose=True, proxy="http://your-proxy-url:port" - ) as crawler: - result = await crawler.arun( - url="https://www.nbcnews.com/business", cache_mode=CacheMode.BYPASS - ) - if result.success: - print(result.markdown[:500]) # Print first 500 characters - - -async def capture_and_save_screenshot(url: str, output_path: str): - async with AsyncWebCrawler(verbose=True) as crawler: - result = await crawler.arun( - url=url, screenshot=True, cache_mode=CacheMode.BYPASS - ) - - if result.success and result.screenshot: - import base64 - - # Decode the base64 screenshot data - screenshot_data = base64.b64decode(result.screenshot) - - # Save the screenshot as a JPEG file - with open(output_path, "wb") as f: - f.write(screenshot_data) - - print(f"Screenshot saved successfully to {output_path}") - else: - print("Failed to capture screenshot") - - -class OpenAIModelFee(BaseModel): - model_name: str = Field(..., description="Name of the OpenAI model.") - input_fee: str = Field(..., description="Fee for input token for the OpenAI model.") - output_fee: str = Field( - ..., description="Fee for output token for the OpenAI model." - ) - - -async def extract_structured_data_using_llm( - provider: str, api_token: str = None, extra_headers: Dict[str, str] = None -): - print(f"\n--- Extracting Structured Data with {provider} ---") - - if api_token is None and provider != "ollama": - print(f"API token is required for {provider}. Skipping this example.") - return - - # extra_args = {} - extra_args = { - "temperature": 0, - "top_p": 0.9, - "max_tokens": 2000, - # any other supported parameters for litellm - } - if extra_headers: - extra_args["extra_headers"] = extra_headers - - async with AsyncWebCrawler(verbose=True) as crawler: - result = await crawler.arun( - url="https://openai.com/api/pricing/", - word_count_threshold=1, - extraction_strategy=LLMExtractionStrategy( - llm_config=LLMConfig(provider=provider,api_token=api_token), - schema=OpenAIModelFee.model_json_schema(), - extraction_type="schema", - instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. - Do not miss any models in the entire content. 
One extracted model JSON format should look like this: - {"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}.""", - extra_args=extra_args, - ), - cache_mode=CacheMode.BYPASS, - ) - print(result.extracted_content) - - -async def extract_structured_data_using_css_extractor(): - print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---") - schema = { - "name": "KidoCode Courses", - "baseSelector": "section.charge-methodology .w-tab-content > div", - "fields": [ - { - "name": "section_title", - "selector": "h3.heading-50", - "type": "text", - }, - { - "name": "section_description", - "selector": ".charge-content", - "type": "text", - }, - { - "name": "course_name", - "selector": ".text-block-93", - "type": "text", - }, - { - "name": "course_description", - "selector": ".course-content-text", - "type": "text", - }, - { - "name": "course_icon", - "selector": ".image-92", - "type": "attribute", - "attribute": "src", - }, - ], - } - - async with AsyncWebCrawler(headless=True, verbose=True) as crawler: - # Create the JavaScript that handles clicking multiple times - js_click_tabs = """ - (async () => { - const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div"); - - for(let tab of tabs) { - // scroll to the tab - tab.scrollIntoView(); - tab.click(); - // Wait for content to load and animations to complete - await new Promise(r => setTimeout(r, 500)); - } - })(); - """ - - result = await crawler.arun( - url="https://www.kidocode.com/degrees/technology", - extraction_strategy=JsonCssExtractionStrategy(schema, verbose=True), - js_code=[js_click_tabs], - cache_mode=CacheMode.BYPASS, - ) - - companies = json.loads(result.extracted_content) - print(f"Successfully extracted {len(companies)} companies") - print(json.dumps(companies[0], indent=2)) - - -# Advanced Session-Based Crawling with Dynamic Content 🔄 -async def crawl_dynamic_content_pages_method_1(): - print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---") - first_commit = "" - - async def on_execution_started(page): - nonlocal first_commit - try: - while True: - await page.wait_for_selector("li.Box-sc-g0xbh4-0 h4") - commit = await page.query_selector("li.Box-sc-g0xbh4-0 h4") - commit = await commit.evaluate("(element) => element.textContent") - commit = re.sub(r"\s+", "", commit) - if commit and commit != first_commit: - first_commit = commit - break - await asyncio.sleep(0.5) - except Exception as e: - print(f"Warning: New content didn't appear after JavaScript execution: {e}") - - async with AsyncWebCrawler(verbose=True) as crawler: - crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started) - - url = "https://github.com/microsoft/TypeScript/commits/main" - session_id = "typescript_commits_session" - all_commits = [] - - js_next_page = """ - (() => { - const button = document.querySelector('a[data-testid="pagination-next-button"]'); - if (button) button.click(); - })(); - """ - - for page in range(3): # Crawl 3 pages - result = await crawler.arun( - url=url, - session_id=session_id, - css_selector="li.Box-sc-g0xbh4-0", - js=js_next_page if page > 0 else None, - cache_mode=CacheMode.BYPASS, - js_only=page > 0, - headless=False, - ) - - assert result.success, f"Failed to crawl page {page + 1}" - - soup = BeautifulSoup(result.cleaned_html, "html.parser") - commits = soup.select("li") - all_commits.extend(commits) - - print(f"Page {page + 1}: Found {len(commits)} commits") - - await 
crawler.crawler_strategy.kill_session(session_id) - print(f"Successfully crawled {len(all_commits)} commits across 3 pages") - - -async def crawl_dynamic_content_pages_method_2(): - print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---") - - async with AsyncWebCrawler(verbose=True) as crawler: - url = "https://github.com/microsoft/TypeScript/commits/main" - session_id = "typescript_commits_session" - all_commits = [] - last_commit = "" - - js_next_page_and_wait = """ - (async () => { - const getCurrentCommit = () => { - const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4'); - return commits.length > 0 ? commits[0].textContent.trim() : null; - }; - - const initialCommit = getCurrentCommit(); - const button = document.querySelector('a[data-testid="pagination-next-button"]'); - if (button) button.click(); - - // Poll for changes - while (true) { - await new Promise(resolve => setTimeout(resolve, 100)); // Wait 100ms - const newCommit = getCurrentCommit(); - if (newCommit && newCommit !== initialCommit) { - break; - } - } - })(); - """ - - schema = { - "name": "Commit Extractor", - "baseSelector": "li.Box-sc-g0xbh4-0", - "fields": [ - { - "name": "title", - "selector": "h4.markdown-title", - "type": "text", - "transform": "strip", - }, - ], - } - extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True) - - for page in range(3): # Crawl 3 pages - result = await crawler.arun( - url=url, - session_id=session_id, - css_selector="li.Box-sc-g0xbh4-0", - extraction_strategy=extraction_strategy, - js_code=js_next_page_and_wait if page > 0 else None, - js_only=page > 0, - cache_mode=CacheMode.BYPASS, - headless=False, - ) - - assert result.success, f"Failed to crawl page {page + 1}" - - commits = json.loads(result.extracted_content) - all_commits.extend(commits) - - print(f"Page {page + 1}: Found {len(commits)} commits") - - await crawler.crawler_strategy.kill_session(session_id) - print(f"Successfully crawled {len(all_commits)} commits across 3 pages") - - -async def crawl_dynamic_content_pages_method_3(): - print( - "\n--- Advanced Multi-Page Crawling with JavaScript Execution using `wait_for` ---" - ) - - async with AsyncWebCrawler(verbose=True) as crawler: - url = "https://github.com/microsoft/TypeScript/commits/main" - session_id = "typescript_commits_session" - all_commits = [] - - js_next_page = """ - const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4'); - if (commits.length > 0) { - window.firstCommit = commits[0].textContent.trim(); - } - const button = document.querySelector('a[data-testid="pagination-next-button"]'); - if (button) button.click(); - """ - - wait_for = """() => { - const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4'); - if (commits.length === 0) return false; - const firstCommit = commits[0].textContent.trim(); - return firstCommit !== window.firstCommit; - }""" - - schema = { - "name": "Commit Extractor", - "baseSelector": "li.Box-sc-g0xbh4-0", - "fields": [ - { - "name": "title", - "selector": "h4.markdown-title", - "type": "text", - "transform": "strip", - }, - ], - } - extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True) - - for page in range(3): # Crawl 3 pages - result = await crawler.arun( - url=url, - session_id=session_id, - css_selector="li.Box-sc-g0xbh4-0", - extraction_strategy=extraction_strategy, - js_code=js_next_page if page > 0 else None, - wait_for=wait_for if page > 0 else None, - js_only=page > 0, - cache_mode=CacheMode.BYPASS, - headless=False, - ) - - assert 
result.success, f"Failed to crawl page {page + 1}" - - commits = json.loads(result.extracted_content) - all_commits.extend(commits) - - print(f"Page {page + 1}: Found {len(commits)} commits") - - await crawler.crawler_strategy.kill_session(session_id) - print(f"Successfully crawled {len(all_commits)} commits across 3 pages") - - -async def crawl_custom_browser_type(): - # Use Firefox - start = time.time() - async with AsyncWebCrawler( - browser_type="firefox", verbose=True, headless=True - ) as crawler: - result = await crawler.arun( - url="https://www.example.com", cache_mode=CacheMode.BYPASS - ) - print(result.markdown[:500]) - print("Time taken: ", time.time() - start) - - # Use WebKit - start = time.time() - async with AsyncWebCrawler( - browser_type="webkit", verbose=True, headless=True - ) as crawler: - result = await crawler.arun( - url="https://www.example.com", cache_mode=CacheMode.BYPASS - ) - print(result.markdown[:500]) - print("Time taken: ", time.time() - start) - - # Use Chromium (default) - start = time.time() - async with AsyncWebCrawler(verbose=True, headless=True) as crawler: - result = await crawler.arun( - url="https://www.example.com", cache_mode=CacheMode.BYPASS - ) - print(result.markdown[:500]) - print("Time taken: ", time.time() - start) - - -async def crawl_with_user_simultion(): - async with AsyncWebCrawler(verbose=True, headless=True) as crawler: - url = "YOUR-URL-HERE" - result = await crawler.arun( - url=url, - cache_mode=CacheMode.BYPASS, - magic=True, # Automatically detects and removes overlays, popups, and other elements that block content - # simulate_user = True,# Causes a series of random mouse movements and clicks to simulate user interaction - # override_navigator = True # Overrides the navigator object to make it look like a real user - ) - - print(result.markdown) - - -async def speed_comparison(): - # print("\n--- Speed Comparison ---") - # print("Firecrawl (simulated):") - # print("Time taken: 7.02 seconds") - # print("Content length: 42074 characters") - # print("Images found: 49") - # print() - # Simulated Firecrawl performance - from firecrawl import FirecrawlApp - - app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"]) - start = time.time() - scrape_status = app.scrape_url( - "https://www.nbcnews.com/business", params={"formats": ["markdown", "html"]} - ) - end = time.time() - print("Firecrawl:") - print(f"Time taken: {end - start:.2f} seconds") - print(f"Content length: {len(scrape_status['markdown'])} characters") - print(f"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}") - print() - - async with AsyncWebCrawler() as crawler: - # Crawl4AI simple crawl - start = time.time() - result = await crawler.arun( - url="https://www.nbcnews.com/business", - word_count_threshold=0, - cache_mode=CacheMode.BYPASS, - verbose=False, - ) - end = time.time() - print("Crawl4AI (simple crawl):") - print(f"Time taken: {end - start:.2f} seconds") - print(f"Content length: {len(result.markdown)} characters") - print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}") - print() - - # Crawl4AI with advanced content filtering - start = time.time() - result = await crawler.arun( - url="https://www.nbcnews.com/business", - word_count_threshold=0, - markdown_generator=DefaultMarkdownGenerator( - content_filter=PruningContentFilter( - threshold=0.48, threshold_type="fixed", min_word_threshold=0 - ) - # content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0) - ), - cache_mode=CacheMode.BYPASS, - verbose=False, - 
) - end = time.time() - print("Crawl4AI (Markdown Plus):") - print(f"Time taken: {end - start:.2f} seconds") - print(f"Content length: {len(result.markdown.raw_markdown)} characters") - print(f"Fit Markdown: {len(result.markdown.fit_markdown)} characters") - print(f"Images found: {result.markdown.raw_markdown.count('cldnry.s-nbcnews.com')}") - print() - - # Crawl4AI with JavaScript execution - start = time.time() - result = await crawler.arun( - url="https://www.nbcnews.com/business", - js_code=[ - "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();" - ], - word_count_threshold=0, - cache_mode=CacheMode.BYPASS, - markdown_generator=DefaultMarkdownGenerator( - content_filter=PruningContentFilter( - threshold=0.48, threshold_type="fixed", min_word_threshold=0 - ) - # content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0) - ), - verbose=False, - ) - end = time.time() - print("Crawl4AI (with JavaScript execution):") - print(f"Time taken: {end - start:.2f} seconds") - print(f"Content length: {len(result.markdown.raw_markdown)} characters") - print(f"Fit Markdown: {len(result.markdown.fit_markdown)} characters") - print(f"Images found: {result.markdown.raw_markdown.count('cldnry.s-nbcnews.com')}") - - print("\nNote on Speed Comparison:") - print("The speed test conducted here may not reflect optimal conditions.") - print("When we call Firecrawl's API, we're seeing its best performance,") - print("while Crawl4AI's performance is limited by the local network speed.") - print("For a more accurate comparison, it's recommended to run these tests") - print("on servers with a stable and fast internet connection.") - print("Despite these limitations, Crawl4AI still demonstrates faster performance.") - print("If you run these tests in an environment with better network conditions,") - print("you may observe an even more significant speed advantage for Crawl4AI.") - - -async def generate_knowledge_graph(): - class Entity(BaseModel): - name: str - description: str - - class Relationship(BaseModel): - entity1: Entity - entity2: Entity - description: str - relation_type: str - - class KnowledgeGraph(BaseModel): - entities: List[Entity] - relationships: List[Relationship] - - extraction_strategy = LLMExtractionStrategy( - llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")), # In case of Ollama just pass "no-token" - schema=KnowledgeGraph.model_json_schema(), - extraction_type="schema", - instruction="""Extract entities and relationships from the given text.""", - ) - async with AsyncWebCrawler() as crawler: - url = "https://paulgraham.com/love.html" - result = await crawler.arun( - url=url, - cache_mode=CacheMode.BYPASS, - extraction_strategy=extraction_strategy, - # magic=True - ) - # print(result.extracted_content) - with open(os.path.join(__location__, "kb.json"), "w") as f: - f.write(result.extracted_content) - - -async def fit_markdown_remove_overlay(): - async with AsyncWebCrawler( - headless=True, # Set to False to see what is happening - verbose=True, - user_agent_mode="random", - user_agent_generator_config={"device_type": "mobile", "os_type": "android"}, - ) as crawler: - result = await crawler.arun( - url="https://www.kidocode.com/degrees/technology", - cache_mode=CacheMode.BYPASS, - markdown_generator=DefaultMarkdownGenerator( - content_filter=PruningContentFilter( - threshold=0.48, threshold_type="fixed", min_word_threshold=0 
- ), - options={"ignore_links": True}, - ), - # markdown_generator=DefaultMarkdownGenerator( - # content_filter=BM25ContentFilter(user_query="", bm25_threshold=1.0), - # options={ - # "ignore_links": True - # } - # ), - ) - - if result.success: - print(len(result.markdown.raw_markdown)) - print(len(result.markdown.markdown_with_citations)) - print(len(result.markdown.fit_markdown)) - - # Save clean html - with open(os.path.join(__location__, "output/cleaned_html.html"), "w") as f: - f.write(result.cleaned_html) - - with open( - os.path.join(__location__, "output/output_raw_markdown.md"), "w" - ) as f: - f.write(result.markdown.raw_markdown) - - with open( - os.path.join(__location__, "output/output_markdown_with_citations.md"), - "w", - ) as f: - f.write(result.markdown.markdown_with_citations) - - with open( - os.path.join(__location__, "output/output_fit_markdown.md"), "w" - ) as f: - f.write(result.markdown.fit_markdown) - - print("Done") - - -async def main(): - # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) - - # await simple_crawl() - # await simple_example_with_running_js_code() - # await simple_example_with_css_selector() - # # await use_proxy() - # await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg")) - # await extract_structured_data_using_css_extractor() - - # LLM extraction examples - # await extract_structured_data_using_llm() - # await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY")) - # await extract_structured_data_using_llm("ollama/llama3.2") - - # You always can pass custom headers to the extraction strategy - # custom_headers = { - # "Authorization": "Bearer your-custom-token", - # "X-Custom-Header": "Some-Value" - # } - # await extract_structured_data_using_llm(extra_headers=custom_headers) - - # await crawl_dynamic_content_pages_method_1() - # await crawl_dynamic_content_pages_method_2() - await crawl_dynamic_content_pages_method_3() - - # await crawl_custom_browser_type() - - # await speed_comparison() - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/docs/examples/quickstart_sync.py b/docs/examples/quickstart_sync.py deleted file mode 100644 index 78f3e56c..00000000 --- a/docs/examples/quickstart_sync.py +++ /dev/null @@ -1,405 +0,0 @@ -import os -import time -from crawl4ai import LLMConfig -from crawl4ai.web_crawler import WebCrawler -from crawl4ai.chunking_strategy import * -from crawl4ai.extraction_strategy import * -from crawl4ai.crawler_strategy import * -from rich import print -from rich.console import Console -from functools import lru_cache - -console = Console() - - -@lru_cache() -def create_crawler(): - crawler = WebCrawler(verbose=True) - crawler.warmup() - return crawler - - -def print_result(result): - # Print each key in one line and just the first 10 characters of each one's value and three dots - console.print("\t[bold]Result:[/bold]") - for key, value in result.model_dump().items(): - if isinstance(value, str) and value: - console.print(f"\t{key}: [green]{value[:20]}...[/green]") - if result.extracted_content: - items = json.loads(result.extracted_content) - print(f"\t[bold]{len(items)} blocks is extracted![/bold]") - - -def cprint(message, press_any_key=False): - console.print(message) - if press_any_key: - console.print("Press any key to continue...", style="") - input() - - -def basic_usage(crawler): - cprint( - "🛠️ [bold cyan]Basic Usage: Simply provide a 
URL and let Crawl4ai do the magic![/bold cyan]" - ) - result = crawler.run(url="https://www.nbcnews.com/business", only_text=True) - cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]") - print_result(result) - - -def basic_usage_some_params(crawler): - cprint( - "🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]" - ) - result = crawler.run( - url="https://www.nbcnews.com/business", word_count_threshold=1, only_text=True - ) - cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]") - print_result(result) - - -def screenshot_usage(crawler): - cprint("\n📸 [bold cyan]Let's take a screenshot of the page![/bold cyan]") - result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True) - cprint("[LOG] 📦 [bold yellow]Screenshot result:[/bold yellow]") - # Save the screenshot to a file - with open("screenshot.png", "wb") as f: - f.write(base64.b64decode(result.screenshot)) - cprint("Screenshot saved to 'screenshot.png'!") - print_result(result) - - -def understanding_parameters(crawler): - cprint( - "\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]" - ) - cprint( - "By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action." - ) - - # First crawl (reads from cache) - cprint("1️⃣ First crawl (caches the result):", True) - start_time = time.time() - result = crawler.run(url="https://www.nbcnews.com/business") - end_time = time.time() - cprint( - f"[LOG] 📦 [bold yellow]First crawl took {end_time - start_time} seconds and result (from cache):[/bold yellow]" - ) - print_result(result) - - # Force to crawl again - cprint("2️⃣ Second crawl (Force to crawl again):", True) - start_time = time.time() - result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True) - end_time = time.time() - cprint( - f"[LOG] 📦 [bold yellow]Second crawl took {end_time - start_time} seconds and result (forced to crawl):[/bold yellow]" - ) - print_result(result) - - -def add_chunking_strategy(crawler): - # Adding a chunking strategy: RegexChunking - cprint( - "\n🧩 [bold cyan]Let's add a chunking strategy: RegexChunking![/bold cyan]", - True, - ) - cprint( - "RegexChunking is a simple chunking strategy that splits the text based on a given regex pattern. Let's see it in action!" - ) - result = crawler.run( - url="https://www.nbcnews.com/business", - chunking_strategy=RegexChunking(patterns=["\n\n"]), - ) - cprint("[LOG] 📦 [bold yellow]RegexChunking result:[/bold yellow]") - print_result(result) - - # Adding another chunking strategy: NlpSentenceChunking - cprint( - "\n🔍 [bold cyan]Time to explore another chunking strategy: NlpSentenceChunking![/bold cyan]", - True, - ) - cprint( - "NlpSentenceChunking uses NLP techniques to split the text into sentences. Let's see how it performs!" - ) - result = crawler.run( - url="https://www.nbcnews.com/business", chunking_strategy=NlpSentenceChunking() - ) - cprint("[LOG] 📦 [bold yellow]NlpSentenceChunking result:[/bold yellow]") - print_result(result) - - -def add_extraction_strategy(crawler): - # Adding an extraction strategy: CosineStrategy - cprint( - "\n🧠 [bold cyan]Let's get smarter with an extraction strategy: CosineStrategy![/bold cyan]", - True, - ) - cprint( - "CosineStrategy uses cosine similarity to extract semantically similar blocks of text. Let's see it in action!" 
- ) - result = crawler.run( - url="https://www.nbcnews.com/business", - extraction_strategy=CosineStrategy( - word_count_threshold=10, - max_dist=0.2, - linkage_method="ward", - top_k=3, - sim_threshold=0.3, - verbose=True, - ), - ) - cprint("[LOG] 📦 [bold yellow]CosineStrategy result:[/bold yellow]") - print_result(result) - - # Using semantic_filter with CosineStrategy - cprint( - "You can pass other parameters like 'semantic_filter' to the CosineStrategy to extract semantically similar blocks of text. Let's see it in action!" - ) - result = crawler.run( - url="https://www.nbcnews.com/business", - extraction_strategy=CosineStrategy( - semantic_filter="inflation rent prices", - ), - ) - cprint( - "[LOG] 📦 [bold yellow]CosineStrategy result with semantic filter:[/bold yellow]" - ) - print_result(result) - - -def add_llm_extraction_strategy(crawler): - # Adding an LLM extraction strategy without instructions - cprint( - "\n🤖 [bold cyan]Time to bring in the big guns: LLMExtractionStrategy without instructions![/bold cyan]", - True, - ) - cprint( - "LLMExtractionStrategy uses a large language model to extract relevant information from the web page. Let's see it in action!" - ) - result = crawler.run( - url="https://www.nbcnews.com/business", - extraction_strategy=LLMExtractionStrategy( - llm_config = LLMConfig(provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY")) - ), - ) - cprint( - "[LOG] 📦 [bold yellow]LLMExtractionStrategy (no instructions) result:[/bold yellow]" - ) - print_result(result) - - # Adding an LLM extraction strategy with instructions - cprint( - "\n📜 [bold cyan]Let's make it even more interesting: LLMExtractionStrategy with instructions![/bold cyan]", - True, - ) - cprint( - "Let's say we are only interested in financial news. Let's see how LLMExtractionStrategy performs with instructions!" - ) - result = crawler.run( - url="https://www.nbcnews.com/business", - extraction_strategy=LLMExtractionStrategy( - llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")), - instruction="I am interested in only financial news", - ), - ) - cprint( - "[LOG] 📦 [bold yellow]LLMExtractionStrategy (with instructions) result:[/bold yellow]" - ) - print_result(result) - - result = crawler.run( - url="https://www.nbcnews.com/business", - extraction_strategy=LLMExtractionStrategy( - llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")), - instruction="Extract only content related to technology", - ), - ) - cprint( - "[LOG] 📦 [bold yellow]LLMExtractionStrategy (with technology instruction) result:[/bold yellow]" - ) - print_result(result) - - -def targeted_extraction(crawler): - # Using a CSS selector to extract only H2 tags - cprint( - "\n🎯 [bold cyan]Targeted extraction: Let's use a CSS selector to extract only H2 tags![/bold cyan]", - True, - ) - result = crawler.run(url="https://www.nbcnews.com/business", css_selector="h2") - cprint("[LOG] 📦 [bold yellow]CSS Selector (H2 tags) result:[/bold yellow]") - print_result(result) - - -def interactive_extraction(crawler): - # Passing JavaScript code to interact with the page - cprint( - "\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]", - True, - ) - cprint( - "In this example we try to click the 'Load More' button on the page using JavaScript code." 
- ) - js_code = """ - const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); - loadMoreButton && loadMoreButton.click(); - """ - # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code) - # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True) - result = crawler.run(url="https://www.nbcnews.com/business", js=js_code) - cprint( - "[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]" - ) - print_result(result) - - -def multiple_scrip(crawler): - # Passing JavaScript code to interact with the page - cprint( - "\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]", - True, - ) - cprint( - "In this example we try to click the 'Load More' button on the page using JavaScript code." - ) - js_code = [ - """ - const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); - loadMoreButton && loadMoreButton.click(); - """ - ] * 2 - # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code) - # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True) - result = crawler.run(url="https://www.nbcnews.com/business", js=js_code) - cprint( - "[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]" - ) - print_result(result) - - -def using_crawler_hooks(crawler): - # Example usage of the hooks for authentication and setting a cookie - def on_driver_created(driver): - print("[HOOK] on_driver_created") - # Example customization: maximize the window - driver.maximize_window() - - # Example customization: logging in to a hypothetical website - driver.get("https://example.com/login") - - from selenium.webdriver.support.ui import WebDriverWait - from selenium.webdriver.common.by import By - from selenium.webdriver.support import expected_conditions as EC - - WebDriverWait(driver, 10).until( - EC.presence_of_element_located((By.NAME, "username")) - ) - driver.find_element(By.NAME, "username").send_keys("testuser") - driver.find_element(By.NAME, "password").send_keys("password123") - driver.find_element(By.NAME, "login").click() - WebDriverWait(driver, 10).until( - EC.presence_of_element_located((By.ID, "welcome")) - ) - # Add a custom cookie - driver.add_cookie({"name": "test_cookie", "value": "cookie_value"}) - return driver - - def before_get_url(driver): - print("[HOOK] before_get_url") - # Example customization: add a custom header - # Enable Network domain for sending headers - driver.execute_cdp_cmd("Network.enable", {}) - # Add a custom header - driver.execute_cdp_cmd( - "Network.setExtraHTTPHeaders", {"headers": {"X-Test-Header": "test"}} - ) - return driver - - def after_get_url(driver): - print("[HOOK] after_get_url") - # Example customization: log the URL - print(driver.current_url) - return driver - - def before_return_html(driver, html): - print("[HOOK] before_return_html") - # Example customization: log the HTML - print(len(html)) - return driver - - cprint( - "\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", - True, - ) - - crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True) - crawler_strategy.set_hook("on_driver_created", on_driver_created) - crawler_strategy.set_hook("before_get_url", before_get_url) - crawler_strategy.set_hook("after_get_url", after_get_url) - crawler_strategy.set_hook("before_return_html", 
before_return_html) - - crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy) - crawler.warmup() - result = crawler.run(url="https://example.com") - - cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]") - print_result(result=result) - - -def using_crawler_hooks_dleay_example(crawler): - def delay(driver): - print("Delaying for 5 seconds...") - time.sleep(5) - print("Resuming...") - - def create_crawler(): - crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True) - crawler_strategy.set_hook("after_get_url", delay) - crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy) - crawler.warmup() - return crawler - - cprint( - "\n🔗 [bold cyan]Using Crawler Hooks: Let's add a delay after fetching the url to make sure entire page is fetched.[/bold cyan]" - ) - crawler = create_crawler() - result = crawler.run(url="https://google.com", bypass_cache=True) - - cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]") - print_result(result) - - -def main(): - cprint( - "🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]" - ) - cprint( - "⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]" - ) - cprint( - "If this is the first time you're running Crawl4ai, this might take a few seconds to load required model files." - ) - - crawler = create_crawler() - - crawler.always_by_pass_cache = True - basic_usage(crawler) - # basic_usage_some_params(crawler) - understanding_parameters(crawler) - - crawler.always_by_pass_cache = True - screenshot_usage(crawler) - add_chunking_strategy(crawler) - add_extraction_strategy(crawler) - add_llm_extraction_strategy(crawler) - targeted_extraction(crawler) - interactive_extraction(crawler) - multiple_scrip(crawler) - - cprint( - "\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]" - ) - - -if __name__ == "__main__": - main() diff --git a/docs/examples/quickstart_v0.ipynb b/docs/examples/quickstart_v0.ipynb deleted file mode 100644 index 0282aa12..00000000 --- a/docs/examples/quickstart_v0.ipynb +++ /dev/null @@ -1,735 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "6yLvrXn7yZQI" - }, - "source": [ - "# Crawl4AI: Advanced Web Crawling and Data Extraction\n", - "\n", - "Welcome to this interactive notebook showcasing Crawl4AI, an advanced asynchronous web crawling and data extraction library.\n", - "\n", - "- GitHub Repository: [https://github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)\n", - "- Twitter: [@unclecode](https://twitter.com/unclecode)\n", - "- Website: [https://crawl4ai.com](https://crawl4ai.com)\n", - "\n", - "Let's explore the powerful features of Crawl4AI!" 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KIn_9nxFyZQK" - }, - "source": [ - "## Installation\n", - "\n", - "First, let's install Crawl4AI from GitHub:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "mSnaxLf3zMog" - }, - "outputs": [], - "source": [ - "!sudo apt-get update && sudo apt-get install -y libwoff1 libopus0 libwebp6 libwebpdemux2 libenchant1c2a libgudev-1.0-0 libsecret-1-0 libhyphen0 libgdk-pixbuf2.0-0 libegl1 libnotify4 libxslt1.1 libevent-2.1-7 libgles2 libvpx6 libxcomposite1 libatk1.0-0 libatk-bridge2.0-0 libepoxy0 libgtk-3-0 libharfbuzz-icu0" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xlXqaRtayZQK" - }, - "outputs": [], - "source": [ - "!pip install crawl4ai\n", - "!pip install nest-asyncio\n", - "!playwright install" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qKCE7TI7yZQL" - }, - "source": [ - "Now, let's import the necessary libraries:" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "I67tr7aAyZQL" - }, - "outputs": [], - "source": [ - "import asyncio\n", - "import nest_asyncio\n", - "from crawl4ai import AsyncWebCrawler\n", - "from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy\n", - "import json\n", - "import time\n", - "from pydantic import BaseModel, Field\n", - "\n", - "nest_asyncio.apply()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "h7yR_Rt_yZQM" - }, - "source": [ - "## Basic Usage\n", - "\n", - "Let's start with a simple crawl example:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "yBh6hf4WyZQM", - "outputId": "0f83af5c-abba-4175-ed95-70b7512e6bcc" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[LOG] 🌤️ Warming up the AsyncWebCrawler\n", - "[LOG] 🌞 AsyncWebCrawler is ready to crawl\n", - "[LOG] 🚀 Content extracted for https://www.nbcnews.com/business, success: True, time taken: 0.05 seconds\n", - "[LOG] 🚀 Extraction done for https://www.nbcnews.com/business, time taken: 0.05 seconds.\n", - "18102\n" - ] - } - ], - "source": [ - "async def simple_crawl():\n", - " async with AsyncWebCrawler(verbose=True) as crawler:\n", - " result = await crawler.arun(url=\"https://www.nbcnews.com/business\")\n", - " print(len(result.markdown))\n", - "await simple_crawl()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9rtkgHI28uI4" - }, - "source": [ - "💡 By default, **Crawl4AI** caches the result of every URL, so the next time you call it, you’ll get an instant result. But if you want to bypass the cache, just set `bypass_cache=True`." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MzZ0zlJ9yZQM" - }, - "source": [ - "## Advanced Features\n", - "\n", - "### Executing JavaScript and Using CSS Selectors" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "gHStF86xyZQM", - "outputId": "34d0fb6d-4dec-4677-f76e-85a1f082829b" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[LOG] 🌤️ Warming up the AsyncWebCrawler\n", - "[LOG] 🌞 AsyncWebCrawler is ready to crawl\n", - "[LOG] 🕸️ Crawling https://www.nbcnews.com/business using AsyncPlaywrightCrawlerStrategy...\n", - "[LOG] ✅ Crawled https://www.nbcnews.com/business successfully!\n", - "[LOG] 🚀 Crawling done for https://www.nbcnews.com/business, success: True, time taken: 6.06 seconds\n", - "[LOG] 🚀 Content extracted for https://www.nbcnews.com/business, success: True, time taken: 0.10 seconds\n", - "[LOG] 🔥 Extracting semantic blocks for https://www.nbcnews.com/business, Strategy: AsyncWebCrawler\n", - "[LOG] 🚀 Extraction done for https://www.nbcnews.com/business, time taken: 0.11 seconds.\n", - "41135\n" - ] - } - ], - "source": [ - "async def js_and_css():\n", - " async with AsyncWebCrawler(verbose=True) as crawler:\n", - " js_code = [\"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();\"]\n", - " result = await crawler.arun(\n", - " url=\"https://www.nbcnews.com/business\",\n", - " js_code=js_code,\n", - " # css_selector=\"YOUR_CSS_SELECTOR_HERE\",\n", - " bypass_cache=True\n", - " )\n", - " print(len(result.markdown))\n", - "\n", - "await js_and_css()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "cqE_W4coyZQM" - }, - "source": [ - "### Using a Proxy\n", - "\n", - "Note: You'll need to replace the proxy URL with a working proxy for this example to run successfully." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QjAyiAGqyZQM" - }, - "outputs": [], - "source": [ - "async def use_proxy():\n", - " async with AsyncWebCrawler(verbose=True, proxy=\"http://your-proxy-url:port\") as crawler:\n", - " result = await crawler.arun(\n", - " url=\"https://www.nbcnews.com/business\",\n", - " bypass_cache=True\n", - " )\n", - " print(result.markdown[:500]) # Print first 500 characters\n", - "\n", - "# Uncomment the following line to run the proxy example\n", - "# await use_proxy()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "XTZ88lbayZQN" - }, - "source": [ - "### Extracting Structured Data with OpenAI\n", - "\n", - "Note: You'll need to set your OpenAI API key as an environment variable for this example to work." 
- ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "fIOlDayYyZQN", - "outputId": "cb8359cc-dee0-4762-9698-5dfdcee055b8" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[LOG] 🌤️ Warming up the AsyncWebCrawler\n", - "[LOG] 🌞 AsyncWebCrawler is ready to crawl\n", - "[LOG] 🕸️ Crawling https://openai.com/api/pricing/ using AsyncPlaywrightCrawlerStrategy...\n", - "[LOG] ✅ Crawled https://openai.com/api/pricing/ successfully!\n", - "[LOG] 🚀 Crawling done for https://openai.com/api/pricing/, success: True, time taken: 3.77 seconds\n", - "[LOG] 🚀 Content extracted for https://openai.com/api/pricing/, success: True, time taken: 0.21 seconds\n", - "[LOG] 🔥 Extracting semantic blocks for https://openai.com/api/pricing/, Strategy: AsyncWebCrawler\n", - "[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 0\n", - "[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 1\n", - "[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 2\n", - "[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 3\n", - "[LOG] Extracted 4 blocks from URL: https://openai.com/api/pricing/ block index: 3\n", - "[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 4\n", - "[LOG] Extracted 5 blocks from URL: https://openai.com/api/pricing/ block index: 0\n", - "[LOG] Extracted 1 blocks from URL: https://openai.com/api/pricing/ block index: 4\n", - "[LOG] Extracted 8 blocks from URL: https://openai.com/api/pricing/ block index: 1\n", - "[LOG] Extracted 12 blocks from URL: https://openai.com/api/pricing/ block index: 2\n", - "[LOG] 🚀 Extraction done for https://openai.com/api/pricing/, time taken: 8.55 seconds.\n", - "5029\n" - ] - } - ], - "source": [ - "import os\n", - "from google.colab import userdata\n", - "os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')\n", - "\n", - "class OpenAIModelFee(BaseModel):\n", - " model_name: str = Field(..., description=\"Name of the OpenAI model.\")\n", - " input_fee: str = Field(..., description=\"Fee for input token for the OpenAI model.\")\n", - " output_fee: str = Field(..., description=\"Fee for output token for the OpenAI model.\")\n", - "\n", - "async def extract_openai_fees():\n", - " async with AsyncWebCrawler(verbose=True) as crawler:\n", - " result = await crawler.arun(\n", - " url='https://openai.com/api/pricing/',\n", - " word_count_threshold=1,\n", - " extraction_strategy=LLMExtractionStrategy(\n", - " provider=\"openai/gpt-4o\", api_token=os.getenv('OPENAI_API_KEY'),\n", - " schema=OpenAIModelFee.schema(),\n", - " extraction_type=\"schema\",\n", - " instruction=\"\"\"From the crawled content, extract all mentioned model names along with their fees for input and output tokens.\n", - " Do not miss any models in the entire content. 
One extracted model JSON format should look like this:\n", - " {\"model_name\": \"GPT-4\", \"input_fee\": \"US$10.00 / 1M tokens\", \"output_fee\": \"US$30.00 / 1M tokens\"}.\"\"\"\n", - " ),\n", - " bypass_cache=True,\n", - " )\n", - " print(len(result.extracted_content))\n", - "\n", - "# Uncomment the following line to run the OpenAI extraction example\n", - "await extract_openai_fees()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BypA5YxEyZQN" - }, - "source": [ - "### Advanced Multi-Page Crawling with JavaScript Execution" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tfkcVQ0b7mw-" - }, - "source": [ - "## Advanced Multi-Page Crawling with JavaScript Execution\n", - "\n", - "This example demonstrates Crawl4AI's ability to handle complex crawling scenarios, specifically extracting commits from multiple pages of a GitHub repository. The challenge here is that clicking the \"Next\" button doesn't load a new page, but instead uses asynchronous JavaScript to update the content. This is a common hurdle in modern web crawling.\n", - "\n", - "To overcome this, we use Crawl4AI's custom JavaScript execution to simulate clicking the \"Next\" button, and implement a custom hook to detect when new data has loaded. Our strategy involves comparing the first commit's text before and after \"clicking\" Next, waiting until it changes to confirm new data has rendered. This showcases Crawl4AI's flexibility in handling dynamic content and its ability to implement custom logic for even the most challenging crawling tasks." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "qUBKGpn3yZQN", - "outputId": "3e555b6a-ed33-42f4-cce9-499a923fbe17" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[LOG] 🌤️ Warming up the AsyncWebCrawler\n", - "[LOG] 🌞 AsyncWebCrawler is ready to crawl\n", - "[LOG] 🕸️ Crawling https://github.com/microsoft/TypeScript/commits/main using AsyncPlaywrightCrawlerStrategy...\n", - "[LOG] ✅ Crawled https://github.com/microsoft/TypeScript/commits/main successfully!\n", - "[LOG] 🚀 Crawling done for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 5.16 seconds\n", - "[LOG] 🚀 Content extracted for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 0.28 seconds\n", - "[LOG] 🔥 Extracting semantic blocks for https://github.com/microsoft/TypeScript/commits/main, Strategy: AsyncWebCrawler\n", - "[LOG] 🚀 Extraction done for https://github.com/microsoft/TypeScript/commits/main, time taken: 0.28 seconds.\n", - "Page 1: Found 35 commits\n", - "[LOG] 🕸️ Crawling https://github.com/microsoft/TypeScript/commits/main using AsyncPlaywrightCrawlerStrategy...\n", - "[LOG] ✅ Crawled https://github.com/microsoft/TypeScript/commits/main successfully!\n", - "[LOG] 🚀 Crawling done for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 0.78 seconds\n", - "[LOG] 🚀 Content extracted for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 0.90 seconds\n", - "[LOG] 🔥 Extracting semantic blocks for https://github.com/microsoft/TypeScript/commits/main, Strategy: AsyncWebCrawler\n", - "[LOG] 🚀 Extraction done for https://github.com/microsoft/TypeScript/commits/main, time taken: 0.90 seconds.\n", - "Page 2: Found 35 commits\n", - "[LOG] 🕸️ Crawling https://github.com/microsoft/TypeScript/commits/main using 
AsyncPlaywrightCrawlerStrategy...\n", - "[LOG] ✅ Crawled https://github.com/microsoft/TypeScript/commits/main successfully!\n", - "[LOG] 🚀 Crawling done for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 2.00 seconds\n", - "[LOG] 🚀 Content extracted for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 0.74 seconds\n", - "[LOG] 🔥 Extracting semantic blocks for https://github.com/microsoft/TypeScript/commits/main, Strategy: AsyncWebCrawler\n", - "[LOG] 🚀 Extraction done for https://github.com/microsoft/TypeScript/commits/main, time taken: 0.75 seconds.\n", - "Page 3: Found 35 commits\n", - "Successfully crawled 105 commits across 3 pages\n" - ] - } - ], - "source": [ - "import re\n", - "from bs4 import BeautifulSoup\n", - "\n", - "async def crawl_typescript_commits():\n", - " first_commit = \"\"\n", - " async def on_execution_started(page):\n", - " nonlocal first_commit\n", - " try:\n", - " while True:\n", - " await page.wait_for_selector('li.Box-sc-g0xbh4-0 h4')\n", - " commit = await page.query_selector('li.Box-sc-g0xbh4-0 h4')\n", - " commit = await commit.evaluate('(element) => element.textContent')\n", - " commit = re.sub(r'\\s+', '', commit)\n", - " if commit and commit != first_commit:\n", - " first_commit = commit\n", - " break\n", - " await asyncio.sleep(0.5)\n", - " except Exception as e:\n", - " print(f\"Warning: New content didn't appear after JavaScript execution: {e}\")\n", - "\n", - " async with AsyncWebCrawler(verbose=True) as crawler:\n", - " crawler.crawler_strategy.set_hook('on_execution_started', on_execution_started)\n", - "\n", - " url = \"https://github.com/microsoft/TypeScript/commits/main\"\n", - " session_id = \"typescript_commits_session\"\n", - " all_commits = []\n", - "\n", - " js_next_page = \"\"\"\n", - " const button = document.querySelector('a[data-testid=\"pagination-next-button\"]');\n", - " if (button) button.click();\n", - " \"\"\"\n", - "\n", - " for page in range(3): # Crawl 3 pages\n", - " result = await crawler.arun(\n", - " url=url,\n", - " session_id=session_id,\n", - " css_selector=\"li.Box-sc-g0xbh4-0\",\n", - " js=js_next_page if page > 0 else None,\n", - " bypass_cache=True,\n", - " js_only=page > 0\n", - " )\n", - "\n", - " assert result.success, f\"Failed to crawl page {page + 1}\"\n", - "\n", - " soup = BeautifulSoup(result.cleaned_html, 'html.parser')\n", - " commits = soup.select(\"li\")\n", - " all_commits.extend(commits)\n", - "\n", - " print(f\"Page {page + 1}: Found {len(commits)} commits\")\n", - "\n", - " await crawler.crawler_strategy.kill_session(session_id)\n", - " print(f\"Successfully crawled {len(all_commits)} commits across 3 pages\")\n", - "\n", - "await crawl_typescript_commits()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EJRnYsp6yZQN" - }, - "source": [ - "### Using JsonCssExtractionStrategy for Fast Structured Output" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1ZMqIzB_8SYp" - }, - "source": [ - "The JsonCssExtractionStrategy is a powerful feature of Crawl4AI that allows for precise, structured data extraction from web pages. Here's how it works:\n", - "\n", - "1. You define a schema that describes the pattern of data you're interested in extracting.\n", - "2. The schema includes a base selector that identifies repeating elements on the page.\n", - "3. Within the schema, you define fields, each with its own selector and type.\n", - "4. 
These field selectors are applied within the context of each base selector element.\n", - "5. The strategy supports nested structures, lists within lists, and various data types.\n", - "6. You can even include computed fields for more complex data manipulation.\n", - "\n", - "This approach allows for highly flexible and precise data extraction, transforming semi-structured web content into clean, structured JSON data. It's particularly useful for extracting consistent data patterns from pages like product listings, news articles, or search results.\n", - "\n", - "For more details and advanced usage, check out the full documentation on the Crawl4AI website." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "trCMR2T9yZQN", - "outputId": "718d36f4-cccf-40f4-8d8c-c3ba73524d16" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[LOG] 🌤️ Warming up the AsyncWebCrawler\n", - "[LOG] 🌞 AsyncWebCrawler is ready to crawl\n", - "[LOG] 🕸️ Crawling https://www.nbcnews.com/business using AsyncPlaywrightCrawlerStrategy...\n", - "[LOG] ✅ Crawled https://www.nbcnews.com/business successfully!\n", - "[LOG] 🚀 Crawling done for https://www.nbcnews.com/business, success: True, time taken: 7.00 seconds\n", - "[LOG] 🚀 Content extracted for https://www.nbcnews.com/business, success: True, time taken: 0.32 seconds\n", - "[LOG] 🔥 Extracting semantic blocks for https://www.nbcnews.com/business, Strategy: AsyncWebCrawler\n", - "[LOG] 🚀 Extraction done for https://www.nbcnews.com/business, time taken: 0.48 seconds.\n", - "Successfully extracted 11 news teasers\n", - "{\n", - " \"category\": \"Business News\",\n", - " \"headline\": \"NBC ripped up its Olympics playbook for 2024 \\u2014 so far, the new strategy paid off\",\n", - " \"summary\": \"The Olympics have long been key to NBCUniversal. 
Paris marked the 18th Olympic Games broadcast by NBC in the U.S.\",\n", - " \"time\": \"13h ago\",\n", - " \"image\": {\n", - " \"src\": \"https://media-cldnry.s-nbcnews.com/image/upload/t_focal-200x100,f_auto,q_auto:best/rockcms/2024-09/240903-nbc-olympics-ch-1344-c7a486.jpg\",\n", - " \"alt\": \"Mike Tirico.\"\n", - " },\n", - " \"link\": \"https://www.nbcnews.com/business\"\n", - "}\n" - ] - } - ], - "source": [ - "async def extract_news_teasers():\n", - " schema = {\n", - " \"name\": \"News Teaser Extractor\",\n", - " \"baseSelector\": \".wide-tease-item__wrapper\",\n", - " \"fields\": [\n", - " {\n", - " \"name\": \"category\",\n", - " \"selector\": \".unibrow span[data-testid='unibrow-text']\",\n", - " \"type\": \"text\",\n", - " },\n", - " {\n", - " \"name\": \"headline\",\n", - " \"selector\": \".wide-tease-item__headline\",\n", - " \"type\": \"text\",\n", - " },\n", - " {\n", - " \"name\": \"summary\",\n", - " \"selector\": \".wide-tease-item__description\",\n", - " \"type\": \"text\",\n", - " },\n", - " {\n", - " \"name\": \"time\",\n", - " \"selector\": \"[data-testid='wide-tease-date']\",\n", - " \"type\": \"text\",\n", - " },\n", - " {\n", - " \"name\": \"image\",\n", - " \"type\": \"nested\",\n", - " \"selector\": \"picture.teasePicture img\",\n", - " \"fields\": [\n", - " {\"name\": \"src\", \"type\": \"attribute\", \"attribute\": \"src\"},\n", - " {\"name\": \"alt\", \"type\": \"attribute\", \"attribute\": \"alt\"},\n", - " ],\n", - " },\n", - " {\n", - " \"name\": \"link\",\n", - " \"selector\": \"a[href]\",\n", - " \"type\": \"attribute\",\n", - " \"attribute\": \"href\",\n", - " },\n", - " ],\n", - " }\n", - "\n", - " extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)\n", - "\n", - " async with AsyncWebCrawler(verbose=True) as crawler:\n", - " result = await crawler.arun(\n", - " url=\"https://www.nbcnews.com/business\",\n", - " extraction_strategy=extraction_strategy,\n", - " bypass_cache=True,\n", - " )\n", - "\n", - " assert result.success, \"Failed to crawl the page\"\n", - "\n", - " news_teasers = json.loads(result.extracted_content)\n", - " print(f\"Successfully extracted {len(news_teasers)} news teasers\")\n", - " print(json.dumps(news_teasers[0], indent=2))\n", - "\n", - "await extract_news_teasers()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FnyVhJaByZQN" - }, - "source": [ - "## Speed Comparison\n", - "\n", - "Let's compare the speed of Crawl4AI with Firecrawl, a paid service. Note that we can't run Firecrawl in this Colab environment, so we'll simulate its performance based on previously recorded data." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "agDD186f3wig" - }, - "source": [ - "💡 **Note on Speed Comparison:**\n", - "\n", - "The speed test conducted here is running on Google Colab, where the internet speed and performance can vary and may not reflect optimal conditions. When we call Firecrawl's API, we're seeing its best performance, while Crawl4AI's performance is limited by Colab's network speed.\n", - "\n", - "For a more accurate comparison, it's recommended to run these tests on your own servers or computers with a stable and fast internet connection. Despite these limitations, Crawl4AI still demonstrates faster performance in this environment.\n", - "\n", - "If you run these tests locally, you may observe an even more significant speed advantage for Crawl4AI compared to other services." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "F7KwHv8G1LbY" - }, - "outputs": [], - "source": [ - "!pip install firecrawl" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "91813zILyZQN", - "outputId": "663223db-ab89-4976-b233-05ceca62b19b" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Firecrawl (simulated):\n", - "Time taken: 4.38 seconds\n", - "Content length: 41967 characters\n", - "Images found: 49\n", - "\n", - "Crawl4AI (simple crawl):\n", - "Time taken: 4.22 seconds\n", - "Content length: 18221 characters\n", - "Images found: 49\n", - "\n", - "Crawl4AI (with JavaScript execution):\n", - "Time taken: 9.13 seconds\n", - "Content length: 34243 characters\n", - "Images found: 89\n" - ] - } - ], - "source": [ - "import os\n", - "from google.colab import userdata\n", - "os.environ['FIRECRAWL_API_KEY'] = userdata.get('FIRECRAWL_API_KEY')\n", - "import time\n", - "from firecrawl import FirecrawlApp\n", - "\n", - "async def speed_comparison():\n", - " # Simulated Firecrawl performance\n", - " app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY'])\n", - " start = time.time()\n", - " scrape_status = app.scrape_url(\n", - " 'https://www.nbcnews.com/business',\n", - " params={'formats': ['markdown', 'html']}\n", - " )\n", - " end = time.time()\n", - " print(\"Firecrawl (simulated):\")\n", - " print(f\"Time taken: {end - start:.2f} seconds\")\n", - " print(f\"Content length: {len(scrape_status['markdown'])} characters\")\n", - " print(f\"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}\")\n", - " print()\n", - "\n", - " async with AsyncWebCrawler() as crawler:\n", - " # Crawl4AI simple crawl\n", - " start = time.time()\n", - " result = await crawler.arun(\n", - " url=\"https://www.nbcnews.com/business\",\n", - " word_count_threshold=0,\n", - " bypass_cache=True,\n", - " verbose=False\n", - " )\n", - " end = time.time()\n", - " print(\"Crawl4AI (simple crawl):\")\n", - " print(f\"Time taken: {end - start:.2f} seconds\")\n", - " print(f\"Content length: {len(result.markdown)} characters\")\n", - " print(f\"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}\")\n", - " print()\n", - "\n", - " # Crawl4AI with JavaScript execution\n", - " start = time.time()\n", - " result = await crawler.arun(\n", - " url=\"https://www.nbcnews.com/business\",\n", - " js_code=[\"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();\"],\n", - " word_count_threshold=0,\n", - " bypass_cache=True,\n", - " verbose=False\n", - " )\n", - " end = time.time()\n", - " print(\"Crawl4AI (with JavaScript execution):\")\n", - " print(f\"Time taken: {end - start:.2f} seconds\")\n", - " print(f\"Content length: {len(result.markdown)} characters\")\n", - " print(f\"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}\")\n", - "\n", - "await speed_comparison()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "OBFFYVJIyZQN" - }, - "source": [ - "If you run on a local machine with a proper internet speed:\n", - "- Simple crawl: Crawl4AI is typically over 3-4 times faster than Firecrawl.\n", - "- With JavaScript execution: Even when executing JavaScript to load more content (potentially doubling the number of images found), Crawl4AI is still faster than Firecrawl's simple crawl.\n", - "\n", 
- "Please note that actual performance may vary depending on network conditions and the specific content being crawled." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "A6_1RK1_yZQO" - }, - "source": [ - "## Conclusion\n", - "\n", - "In this notebook, we've explored the powerful features of Crawl4AI, including:\n", - "\n", - "1. Basic crawling\n", - "2. JavaScript execution and CSS selector usage\n", - "3. Proxy support\n", - "4. Structured data extraction with OpenAI\n", - "5. Advanced multi-page crawling with JavaScript execution\n", - "6. Fast structured output using JsonCssExtractionStrategy\n", - "7. Speed comparison with other services\n", - "\n", - "Crawl4AI offers a fast, flexible, and powerful solution for web crawling and data extraction tasks. Its asynchronous architecture and advanced features make it suitable for a wide range of applications, from simple web scraping to complex, multi-page data extraction scenarios.\n", - "\n", - "For more information and advanced usage, please visit the [Crawl4AI documentation](https://docs.crawl4ai.com/).\n", - "\n", - "Happy crawling!" - ] - } - ], - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} From e1d9e2489cd736d3af9992209268c0f601222c1a Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 5 Apr 2025 23:12:06 +0800 Subject: [PATCH 37/78] refactor(docs): update import statement in quickstart.py for improved clarity --- docs/examples/quickstart.py | 2 +- docs/examples/quickstart_async.py | 675 ------------------ ...amples.py => quickstart_examples_set_1.py} | 0 ...config.py => quickstart_examples_set_2.py} | 2 +- docs/examples/quickstart_sync.py | 405 ----------- 5 files changed, 2 insertions(+), 1082 deletions(-) delete mode 100644 docs/examples/quickstart_async.py rename docs/examples/{quickstart_examples.py => quickstart_examples_set_1.py} (100%) rename docs/examples/{quickstart_async.config.py => quickstart_examples_set_2.py} (99%) delete mode 100644 docs/examples/quickstart_sync.py diff --git a/docs/examples/quickstart.py b/docs/examples/quickstart.py index 3adbfc0d..5efb785d 100644 --- a/docs/examples/quickstart.py +++ b/docs/examples/quickstart.py @@ -1,6 +1,6 @@ import os, sys -from crawl4ai.types import LLMConfig +from crawl4ai import LLMConfig sys.path.append( os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py deleted file mode 100644 index aeb0d20a..00000000 --- a/docs/examples/quickstart_async.py +++ /dev/null @@ -1,675 +0,0 @@ -import os, sys - -from crawl4ai import LLMConfig - -# append parent directory to system path -sys.path.append( - os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -) -os.environ["FIRECRAWL_API_KEY"] = "fc-84b370ccfad44beabc686b38f1769692" - -import asyncio -# import nest_asyncio -# nest_asyncio.apply() - -import time -import json -import os -import re -from typing import Dict, List -from bs4 import BeautifulSoup -from pydantic import BaseModel, Field -from crawl4ai import AsyncWebCrawler, CacheMode -from crawl4ai.markdown_generation_strategy import 
DefaultMarkdownGenerator -from crawl4ai.content_filter_strategy import PruningContentFilter -from crawl4ai.extraction_strategy import ( - JsonCssExtractionStrategy, - LLMExtractionStrategy, -) - -__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) - -print("Crawl4AI: Advanced Web Crawling and Data Extraction") -print("GitHub Repository: https://github.com/unclecode/crawl4ai") -print("Twitter: @unclecode") -print("Website: https://crawl4ai.com") - - -async def simple_crawl(): - print("\n--- Basic Usage ---") - async with AsyncWebCrawler(verbose=True) as crawler: - result = await crawler.arun( - url="https://www.nbcnews.com/business", cache_mode=CacheMode.BYPASS - ) - print(result.markdown[:500]) # Print first 500 characters - - -async def simple_example_with_running_js_code(): - print("\n--- Executing JavaScript and Using CSS Selectors ---") - # New code to handle the wait_for parameter - wait_for = """() => { - return Array.from(document.querySelectorAll('article.tease-card')).length > 10; - }""" - - # wait_for can be also just a css selector - # wait_for = "article.tease-card:nth-child(10)" - - async with AsyncWebCrawler(verbose=True) as crawler: - js_code = [ - "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();" - ] - result = await crawler.arun( - url="https://www.nbcnews.com/business", - js_code=js_code, - # wait_for=wait_for, - cache_mode=CacheMode.BYPASS, - ) - print(result.markdown[:500]) # Print first 500 characters - - -async def simple_example_with_css_selector(): - print("\n--- Using CSS Selectors ---") - async with AsyncWebCrawler(verbose=True) as crawler: - result = await crawler.arun( - url="https://www.nbcnews.com/business", - css_selector=".wide-tease-item__description", - cache_mode=CacheMode.BYPASS, - ) - print(result.markdown[:500]) # Print first 500 characters - - -async def use_proxy(): - print("\n--- Using a Proxy ---") - print( - "Note: Replace 'http://your-proxy-url:port' with a working proxy to run this example." - ) - # Uncomment and modify the following lines to use a proxy - async with AsyncWebCrawler( - verbose=True, proxy="http://your-proxy-url:port" - ) as crawler: - result = await crawler.arun( - url="https://www.nbcnews.com/business", cache_mode=CacheMode.BYPASS - ) - if result.success: - print(result.markdown[:500]) # Print first 500 characters - - -async def capture_and_save_screenshot(url: str, output_path: str): - async with AsyncWebCrawler(verbose=True) as crawler: - result = await crawler.arun( - url=url, screenshot=True, cache_mode=CacheMode.BYPASS - ) - - if result.success and result.screenshot: - import base64 - - # Decode the base64 screenshot data - screenshot_data = base64.b64decode(result.screenshot) - - # Save the screenshot as a JPEG file - with open(output_path, "wb") as f: - f.write(screenshot_data) - - print(f"Screenshot saved successfully to {output_path}") - else: - print("Failed to capture screenshot") - - -class OpenAIModelFee(BaseModel): - model_name: str = Field(..., description="Name of the OpenAI model.") - input_fee: str = Field(..., description="Fee for input token for the OpenAI model.") - output_fee: str = Field( - ..., description="Fee for output token for the OpenAI model." 
- ) - - -async def extract_structured_data_using_llm( - provider: str, api_token: str = None, extra_headers: Dict[str, str] = None -): - print(f"\n--- Extracting Structured Data with {provider} ---") - - if api_token is None and provider != "ollama": - print(f"API token is required for {provider}. Skipping this example.") - return - - # extra_args = {} - extra_args = { - "temperature": 0, - "top_p": 0.9, - "max_tokens": 2000, - # any other supported parameters for litellm - } - if extra_headers: - extra_args["extra_headers"] = extra_headers - - async with AsyncWebCrawler(verbose=True) as crawler: - result = await crawler.arun( - url="https://openai.com/api/pricing/", - word_count_threshold=1, - extraction_strategy=LLMExtractionStrategy( - llm_config=LLMConfig(provider=provider,api_token=api_token), - schema=OpenAIModelFee.model_json_schema(), - extraction_type="schema", - instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. - Do not miss any models in the entire content. One extracted model JSON format should look like this: - {"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}.""", - extra_args=extra_args, - ), - cache_mode=CacheMode.BYPASS, - ) - print(result.extracted_content) - - -async def extract_structured_data_using_css_extractor(): - print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---") - schema = { - "name": "KidoCode Courses", - "baseSelector": "section.charge-methodology .w-tab-content > div", - "fields": [ - { - "name": "section_title", - "selector": "h3.heading-50", - "type": "text", - }, - { - "name": "section_description", - "selector": ".charge-content", - "type": "text", - }, - { - "name": "course_name", - "selector": ".text-block-93", - "type": "text", - }, - { - "name": "course_description", - "selector": ".course-content-text", - "type": "text", - }, - { - "name": "course_icon", - "selector": ".image-92", - "type": "attribute", - "attribute": "src", - }, - ], - } - - async with AsyncWebCrawler(headless=True, verbose=True) as crawler: - # Create the JavaScript that handles clicking multiple times - js_click_tabs = """ - (async () => { - const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div"); - - for(let tab of tabs) { - // scroll to the tab - tab.scrollIntoView(); - tab.click(); - // Wait for content to load and animations to complete - await new Promise(r => setTimeout(r, 500)); - } - })(); - """ - - result = await crawler.arun( - url="https://www.kidocode.com/degrees/technology", - extraction_strategy=JsonCssExtractionStrategy(schema, verbose=True), - js_code=[js_click_tabs], - cache_mode=CacheMode.BYPASS, - ) - - companies = json.loads(result.extracted_content) - print(f"Successfully extracted {len(companies)} companies") - print(json.dumps(companies[0], indent=2)) - - -# Advanced Session-Based Crawling with Dynamic Content 🔄 -async def crawl_dynamic_content_pages_method_1(): - print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---") - first_commit = "" - - async def on_execution_started(page): - nonlocal first_commit - try: - while True: - await page.wait_for_selector("li.Box-sc-g0xbh4-0 h4") - commit = await page.query_selector("li.Box-sc-g0xbh4-0 h4") - commit = await commit.evaluate("(element) => element.textContent") - commit = re.sub(r"\s+", "", commit) - if commit and commit != first_commit: - first_commit = commit - break - await asyncio.sleep(0.5) - except Exception 
as e: - print(f"Warning: New content didn't appear after JavaScript execution: {e}") - - async with AsyncWebCrawler(verbose=True) as crawler: - crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started) - - url = "https://github.com/microsoft/TypeScript/commits/main" - session_id = "typescript_commits_session" - all_commits = [] - - js_next_page = """ - (() => { - const button = document.querySelector('a[data-testid="pagination-next-button"]'); - if (button) button.click(); - })(); - """ - - for page in range(3): # Crawl 3 pages - result = await crawler.arun( - url=url, - session_id=session_id, - css_selector="li.Box-sc-g0xbh4-0", - js=js_next_page if page > 0 else None, - cache_mode=CacheMode.BYPASS, - js_only=page > 0, - headless=False, - ) - - assert result.success, f"Failed to crawl page {page + 1}" - - soup = BeautifulSoup(result.cleaned_html, "html.parser") - commits = soup.select("li") - all_commits.extend(commits) - - print(f"Page {page + 1}: Found {len(commits)} commits") - - await crawler.crawler_strategy.kill_session(session_id) - print(f"Successfully crawled {len(all_commits)} commits across 3 pages") - - -async def crawl_dynamic_content_pages_method_2(): - print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---") - - async with AsyncWebCrawler(verbose=True) as crawler: - url = "https://github.com/microsoft/TypeScript/commits/main" - session_id = "typescript_commits_session" - all_commits = [] - last_commit = "" - - js_next_page_and_wait = """ - (async () => { - const getCurrentCommit = () => { - const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4'); - return commits.length > 0 ? commits[0].textContent.trim() : null; - }; - - const initialCommit = getCurrentCommit(); - const button = document.querySelector('a[data-testid="pagination-next-button"]'); - if (button) button.click(); - - // Poll for changes - while (true) { - await new Promise(resolve => setTimeout(resolve, 100)); // Wait 100ms - const newCommit = getCurrentCommit(); - if (newCommit && newCommit !== initialCommit) { - break; - } - } - })(); - """ - - schema = { - "name": "Commit Extractor", - "baseSelector": "li.Box-sc-g0xbh4-0", - "fields": [ - { - "name": "title", - "selector": "h4.markdown-title", - "type": "text", - "transform": "strip", - }, - ], - } - extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True) - - for page in range(3): # Crawl 3 pages - result = await crawler.arun( - url=url, - session_id=session_id, - css_selector="li.Box-sc-g0xbh4-0", - extraction_strategy=extraction_strategy, - js_code=js_next_page_and_wait if page > 0 else None, - js_only=page > 0, - cache_mode=CacheMode.BYPASS, - headless=False, - ) - - assert result.success, f"Failed to crawl page {page + 1}" - - commits = json.loads(result.extracted_content) - all_commits.extend(commits) - - print(f"Page {page + 1}: Found {len(commits)} commits") - - await crawler.crawler_strategy.kill_session(session_id) - print(f"Successfully crawled {len(all_commits)} commits across 3 pages") - - -async def crawl_dynamic_content_pages_method_3(): - print( - "\n--- Advanced Multi-Page Crawling with JavaScript Execution using `wait_for` ---" - ) - - async with AsyncWebCrawler(verbose=True) as crawler: - url = "https://github.com/microsoft/TypeScript/commits/main" - session_id = "typescript_commits_session" - all_commits = [] - - js_next_page = """ - const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4'); - if (commits.length > 0) { - window.firstCommit = 
commits[0].textContent.trim(); - } - const button = document.querySelector('a[data-testid="pagination-next-button"]'); - if (button) button.click(); - """ - - wait_for = """() => { - const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4'); - if (commits.length === 0) return false; - const firstCommit = commits[0].textContent.trim(); - return firstCommit !== window.firstCommit; - }""" - - schema = { - "name": "Commit Extractor", - "baseSelector": "li.Box-sc-g0xbh4-0", - "fields": [ - { - "name": "title", - "selector": "h4.markdown-title", - "type": "text", - "transform": "strip", - }, - ], - } - extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True) - - for page in range(3): # Crawl 3 pages - result = await crawler.arun( - url=url, - session_id=session_id, - css_selector="li.Box-sc-g0xbh4-0", - extraction_strategy=extraction_strategy, - js_code=js_next_page if page > 0 else None, - wait_for=wait_for if page > 0 else None, - js_only=page > 0, - cache_mode=CacheMode.BYPASS, - headless=False, - ) - - assert result.success, f"Failed to crawl page {page + 1}" - - commits = json.loads(result.extracted_content) - all_commits.extend(commits) - - print(f"Page {page + 1}: Found {len(commits)} commits") - - await crawler.crawler_strategy.kill_session(session_id) - print(f"Successfully crawled {len(all_commits)} commits across 3 pages") - - -async def crawl_custom_browser_type(): - # Use Firefox - start = time.time() - async with AsyncWebCrawler( - browser_type="firefox", verbose=True, headless=True - ) as crawler: - result = await crawler.arun( - url="https://www.example.com", cache_mode=CacheMode.BYPASS - ) - print(result.markdown[:500]) - print("Time taken: ", time.time() - start) - - # Use WebKit - start = time.time() - async with AsyncWebCrawler( - browser_type="webkit", verbose=True, headless=True - ) as crawler: - result = await crawler.arun( - url="https://www.example.com", cache_mode=CacheMode.BYPASS - ) - print(result.markdown[:500]) - print("Time taken: ", time.time() - start) - - # Use Chromium (default) - start = time.time() - async with AsyncWebCrawler(verbose=True, headless=True) as crawler: - result = await crawler.arun( - url="https://www.example.com", cache_mode=CacheMode.BYPASS - ) - print(result.markdown[:500]) - print("Time taken: ", time.time() - start) - - -async def crawl_with_user_simultion(): - async with AsyncWebCrawler(verbose=True, headless=True) as crawler: - url = "YOUR-URL-HERE" - result = await crawler.arun( - url=url, - cache_mode=CacheMode.BYPASS, - magic=True, # Automatically detects and removes overlays, popups, and other elements that block content - # simulate_user = True,# Causes a series of random mouse movements and clicks to simulate user interaction - # override_navigator = True # Overrides the navigator object to make it look like a real user - ) - - print(result.markdown) - - -async def speed_comparison(): - # print("\n--- Speed Comparison ---") - # print("Firecrawl (simulated):") - # print("Time taken: 7.02 seconds") - # print("Content length: 42074 characters") - # print("Images found: 49") - # print() - # Simulated Firecrawl performance - from firecrawl import FirecrawlApp - - app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"]) - start = time.time() - scrape_status = app.scrape_url( - "https://www.nbcnews.com/business", params={"formats": ["markdown", "html"]} - ) - end = time.time() - print("Firecrawl:") - print(f"Time taken: {end - start:.2f} seconds") - print(f"Content length: {len(scrape_status['markdown'])} 
characters") - print(f"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}") - print() - - async with AsyncWebCrawler() as crawler: - # Crawl4AI simple crawl - start = time.time() - result = await crawler.arun( - url="https://www.nbcnews.com/business", - word_count_threshold=0, - cache_mode=CacheMode.BYPASS, - verbose=False, - ) - end = time.time() - print("Crawl4AI (simple crawl):") - print(f"Time taken: {end - start:.2f} seconds") - print(f"Content length: {len(result.markdown)} characters") - print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}") - print() - - # Crawl4AI with advanced content filtering - start = time.time() - result = await crawler.arun( - url="https://www.nbcnews.com/business", - word_count_threshold=0, - markdown_generator=DefaultMarkdownGenerator( - content_filter=PruningContentFilter( - threshold=0.48, threshold_type="fixed", min_word_threshold=0 - ) - # content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0) - ), - cache_mode=CacheMode.BYPASS, - verbose=False, - ) - end = time.time() - print("Crawl4AI (Markdown Plus):") - print(f"Time taken: {end - start:.2f} seconds") - print(f"Content length: {len(result.markdown.raw_markdown)} characters") - print(f"Fit Markdown: {len(result.markdown.fit_markdown)} characters") - print(f"Images found: {result.markdown.raw_markdown.count('cldnry.s-nbcnews.com')}") - print() - - # Crawl4AI with JavaScript execution - start = time.time() - result = await crawler.arun( - url="https://www.nbcnews.com/business", - js_code=[ - "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();" - ], - word_count_threshold=0, - cache_mode=CacheMode.BYPASS, - markdown_generator=DefaultMarkdownGenerator( - content_filter=PruningContentFilter( - threshold=0.48, threshold_type="fixed", min_word_threshold=0 - ) - # content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0) - ), - verbose=False, - ) - end = time.time() - print("Crawl4AI (with JavaScript execution):") - print(f"Time taken: {end - start:.2f} seconds") - print(f"Content length: {len(result.markdown.raw_markdown)} characters") - print(f"Fit Markdown: {len(result.markdown.fit_markdown)} characters") - print(f"Images found: {result.markdown.raw_markdown.count('cldnry.s-nbcnews.com')}") - - print("\nNote on Speed Comparison:") - print("The speed test conducted here may not reflect optimal conditions.") - print("When we call Firecrawl's API, we're seeing its best performance,") - print("while Crawl4AI's performance is limited by the local network speed.") - print("For a more accurate comparison, it's recommended to run these tests") - print("on servers with a stable and fast internet connection.") - print("Despite these limitations, Crawl4AI still demonstrates faster performance.") - print("If you run these tests in an environment with better network conditions,") - print("you may observe an even more significant speed advantage for Crawl4AI.") - - -async def generate_knowledge_graph(): - class Entity(BaseModel): - name: str - description: str - - class Relationship(BaseModel): - entity1: Entity - entity2: Entity - description: str - relation_type: str - - class KnowledgeGraph(BaseModel): - entities: List[Entity] - relationships: List[Relationship] - - extraction_strategy = LLMExtractionStrategy( - llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")), # In case of Ollama just pass 
"no-token" - schema=KnowledgeGraph.model_json_schema(), - extraction_type="schema", - instruction="""Extract entities and relationships from the given text.""", - ) - async with AsyncWebCrawler() as crawler: - url = "https://paulgraham.com/love.html" - result = await crawler.arun( - url=url, - cache_mode=CacheMode.BYPASS, - extraction_strategy=extraction_strategy, - # magic=True - ) - # print(result.extracted_content) - with open(os.path.join(__location__, "kb.json"), "w") as f: - f.write(result.extracted_content) - - -async def fit_markdown_remove_overlay(): - async with AsyncWebCrawler( - headless=True, # Set to False to see what is happening - verbose=True, - user_agent_mode="random", - user_agent_generator_config={"device_type": "mobile", "os_type": "android"}, - ) as crawler: - result = await crawler.arun( - url="https://www.kidocode.com/degrees/technology", - cache_mode=CacheMode.BYPASS, - markdown_generator=DefaultMarkdownGenerator( - content_filter=PruningContentFilter( - threshold=0.48, threshold_type="fixed", min_word_threshold=0 - ), - options={"ignore_links": True}, - ), - # markdown_generator=DefaultMarkdownGenerator( - # content_filter=BM25ContentFilter(user_query="", bm25_threshold=1.0), - # options={ - # "ignore_links": True - # } - # ), - ) - - if result.success: - print(len(result.markdown.raw_markdown)) - print(len(result.markdown.markdown_with_citations)) - print(len(result.markdown.fit_markdown)) - - # Save clean html - with open(os.path.join(__location__, "output/cleaned_html.html"), "w") as f: - f.write(result.cleaned_html) - - with open( - os.path.join(__location__, "output/output_raw_markdown.md"), "w" - ) as f: - f.write(result.markdown.raw_markdown) - - with open( - os.path.join(__location__, "output/output_markdown_with_citations.md"), - "w", - ) as f: - f.write(result.markdown.markdown_with_citations) - - with open( - os.path.join(__location__, "output/output_fit_markdown.md"), "w" - ) as f: - f.write(result.markdown.fit_markdown) - - print("Done") - - -async def main(): - # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) - - # await simple_crawl() - # await simple_example_with_running_js_code() - # await simple_example_with_css_selector() - # # await use_proxy() - # await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg")) - # await extract_structured_data_using_css_extractor() - - # LLM extraction examples - # await extract_structured_data_using_llm() - # await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY")) - # await extract_structured_data_using_llm("ollama/llama3.2") - - # You always can pass custom headers to the extraction strategy - # custom_headers = { - # "Authorization": "Bearer your-custom-token", - # "X-Custom-Header": "Some-Value" - # } - # await extract_structured_data_using_llm(extra_headers=custom_headers) - - # await crawl_dynamic_content_pages_method_1() - # await crawl_dynamic_content_pages_method_2() - await crawl_dynamic_content_pages_method_3() - - # await crawl_custom_browser_type() - - # await speed_comparison() - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/docs/examples/quickstart_examples.py b/docs/examples/quickstart_examples_set_1.py similarity index 100% rename from docs/examples/quickstart_examples.py rename to docs/examples/quickstart_examples_set_1.py diff --git a/docs/examples/quickstart_async.config.py 
b/docs/examples/quickstart_examples_set_2.py similarity index 99% rename from docs/examples/quickstart_async.config.py rename to docs/examples/quickstart_examples_set_2.py index 5efb785d..3adbfc0d 100644 --- a/docs/examples/quickstart_async.config.py +++ b/docs/examples/quickstart_examples_set_2.py @@ -1,6 +1,6 @@ import os, sys -from crawl4ai import LLMConfig +from crawl4ai.types import LLMConfig sys.path.append( os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) diff --git a/docs/examples/quickstart_sync.py b/docs/examples/quickstart_sync.py deleted file mode 100644 index 78f3e56c..00000000 --- a/docs/examples/quickstart_sync.py +++ /dev/null @@ -1,405 +0,0 @@ -import os -import time -from crawl4ai import LLMConfig -from crawl4ai.web_crawler import WebCrawler -from crawl4ai.chunking_strategy import * -from crawl4ai.extraction_strategy import * -from crawl4ai.crawler_strategy import * -from rich import print -from rich.console import Console -from functools import lru_cache - -console = Console() - - -@lru_cache() -def create_crawler(): - crawler = WebCrawler(verbose=True) - crawler.warmup() - return crawler - - -def print_result(result): - # Print each key in one line and just the first 10 characters of each one's value and three dots - console.print("\t[bold]Result:[/bold]") - for key, value in result.model_dump().items(): - if isinstance(value, str) and value: - console.print(f"\t{key}: [green]{value[:20]}...[/green]") - if result.extracted_content: - items = json.loads(result.extracted_content) - print(f"\t[bold]{len(items)} blocks is extracted![/bold]") - - -def cprint(message, press_any_key=False): - console.print(message) - if press_any_key: - console.print("Press any key to continue...", style="") - input() - - -def basic_usage(crawler): - cprint( - "🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]" - ) - result = crawler.run(url="https://www.nbcnews.com/business", only_text=True) - cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]") - print_result(result) - - -def basic_usage_some_params(crawler): - cprint( - "🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]" - ) - result = crawler.run( - url="https://www.nbcnews.com/business", word_count_threshold=1, only_text=True - ) - cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]") - print_result(result) - - -def screenshot_usage(crawler): - cprint("\n📸 [bold cyan]Let's take a screenshot of the page![/bold cyan]") - result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True) - cprint("[LOG] 📦 [bold yellow]Screenshot result:[/bold yellow]") - # Save the screenshot to a file - with open("screenshot.png", "wb") as f: - f.write(base64.b64decode(result.screenshot)) - cprint("Screenshot saved to 'screenshot.png'!") - print_result(result) - - -def understanding_parameters(crawler): - cprint( - "\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]" - ) - cprint( - "By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action." 
- ) - - # First crawl (reads from cache) - cprint("1️⃣ First crawl (caches the result):", True) - start_time = time.time() - result = crawler.run(url="https://www.nbcnews.com/business") - end_time = time.time() - cprint( - f"[LOG] 📦 [bold yellow]First crawl took {end_time - start_time} seconds and result (from cache):[/bold yellow]" - ) - print_result(result) - - # Force to crawl again - cprint("2️⃣ Second crawl (Force to crawl again):", True) - start_time = time.time() - result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True) - end_time = time.time() - cprint( - f"[LOG] 📦 [bold yellow]Second crawl took {end_time - start_time} seconds and result (forced to crawl):[/bold yellow]" - ) - print_result(result) - - -def add_chunking_strategy(crawler): - # Adding a chunking strategy: RegexChunking - cprint( - "\n🧩 [bold cyan]Let's add a chunking strategy: RegexChunking![/bold cyan]", - True, - ) - cprint( - "RegexChunking is a simple chunking strategy that splits the text based on a given regex pattern. Let's see it in action!" - ) - result = crawler.run( - url="https://www.nbcnews.com/business", - chunking_strategy=RegexChunking(patterns=["\n\n"]), - ) - cprint("[LOG] 📦 [bold yellow]RegexChunking result:[/bold yellow]") - print_result(result) - - # Adding another chunking strategy: NlpSentenceChunking - cprint( - "\n🔍 [bold cyan]Time to explore another chunking strategy: NlpSentenceChunking![/bold cyan]", - True, - ) - cprint( - "NlpSentenceChunking uses NLP techniques to split the text into sentences. Let's see how it performs!" - ) - result = crawler.run( - url="https://www.nbcnews.com/business", chunking_strategy=NlpSentenceChunking() - ) - cprint("[LOG] 📦 [bold yellow]NlpSentenceChunking result:[/bold yellow]") - print_result(result) - - -def add_extraction_strategy(crawler): - # Adding an extraction strategy: CosineStrategy - cprint( - "\n🧠 [bold cyan]Let's get smarter with an extraction strategy: CosineStrategy![/bold cyan]", - True, - ) - cprint( - "CosineStrategy uses cosine similarity to extract semantically similar blocks of text. Let's see it in action!" - ) - result = crawler.run( - url="https://www.nbcnews.com/business", - extraction_strategy=CosineStrategy( - word_count_threshold=10, - max_dist=0.2, - linkage_method="ward", - top_k=3, - sim_threshold=0.3, - verbose=True, - ), - ) - cprint("[LOG] 📦 [bold yellow]CosineStrategy result:[/bold yellow]") - print_result(result) - - # Using semantic_filter with CosineStrategy - cprint( - "You can pass other parameters like 'semantic_filter' to the CosineStrategy to extract semantically similar blocks of text. Let's see it in action!" - ) - result = crawler.run( - url="https://www.nbcnews.com/business", - extraction_strategy=CosineStrategy( - semantic_filter="inflation rent prices", - ), - ) - cprint( - "[LOG] 📦 [bold yellow]CosineStrategy result with semantic filter:[/bold yellow]" - ) - print_result(result) - - -def add_llm_extraction_strategy(crawler): - # Adding an LLM extraction strategy without instructions - cprint( - "\n🤖 [bold cyan]Time to bring in the big guns: LLMExtractionStrategy without instructions![/bold cyan]", - True, - ) - cprint( - "LLMExtractionStrategy uses a large language model to extract relevant information from the web page. Let's see it in action!" 
- ) - result = crawler.run( - url="https://www.nbcnews.com/business", - extraction_strategy=LLMExtractionStrategy( - llm_config = LLMConfig(provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY")) - ), - ) - cprint( - "[LOG] 📦 [bold yellow]LLMExtractionStrategy (no instructions) result:[/bold yellow]" - ) - print_result(result) - - # Adding an LLM extraction strategy with instructions - cprint( - "\n📜 [bold cyan]Let's make it even more interesting: LLMExtractionStrategy with instructions![/bold cyan]", - True, - ) - cprint( - "Let's say we are only interested in financial news. Let's see how LLMExtractionStrategy performs with instructions!" - ) - result = crawler.run( - url="https://www.nbcnews.com/business", - extraction_strategy=LLMExtractionStrategy( - llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")), - instruction="I am interested in only financial news", - ), - ) - cprint( - "[LOG] 📦 [bold yellow]LLMExtractionStrategy (with instructions) result:[/bold yellow]" - ) - print_result(result) - - result = crawler.run( - url="https://www.nbcnews.com/business", - extraction_strategy=LLMExtractionStrategy( - llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")), - instruction="Extract only content related to technology", - ), - ) - cprint( - "[LOG] 📦 [bold yellow]LLMExtractionStrategy (with technology instruction) result:[/bold yellow]" - ) - print_result(result) - - -def targeted_extraction(crawler): - # Using a CSS selector to extract only H2 tags - cprint( - "\n🎯 [bold cyan]Targeted extraction: Let's use a CSS selector to extract only H2 tags![/bold cyan]", - True, - ) - result = crawler.run(url="https://www.nbcnews.com/business", css_selector="h2") - cprint("[LOG] 📦 [bold yellow]CSS Selector (H2 tags) result:[/bold yellow]") - print_result(result) - - -def interactive_extraction(crawler): - # Passing JavaScript code to interact with the page - cprint( - "\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]", - True, - ) - cprint( - "In this example we try to click the 'Load More' button on the page using JavaScript code." - ) - js_code = """ - const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); - loadMoreButton && loadMoreButton.click(); - """ - # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code) - # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True) - result = crawler.run(url="https://www.nbcnews.com/business", js=js_code) - cprint( - "[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]" - ) - print_result(result) - - -def multiple_scrip(crawler): - # Passing JavaScript code to interact with the page - cprint( - "\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]", - True, - ) - cprint( - "In this example we try to click the 'Load More' button on the page using JavaScript code." 
- ) - js_code = [ - """ - const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); - loadMoreButton && loadMoreButton.click(); - """ - ] * 2 - # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code) - # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True) - result = crawler.run(url="https://www.nbcnews.com/business", js=js_code) - cprint( - "[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]" - ) - print_result(result) - - -def using_crawler_hooks(crawler): - # Example usage of the hooks for authentication and setting a cookie - def on_driver_created(driver): - print("[HOOK] on_driver_created") - # Example customization: maximize the window - driver.maximize_window() - - # Example customization: logging in to a hypothetical website - driver.get("https://example.com/login") - - from selenium.webdriver.support.ui import WebDriverWait - from selenium.webdriver.common.by import By - from selenium.webdriver.support import expected_conditions as EC - - WebDriverWait(driver, 10).until( - EC.presence_of_element_located((By.NAME, "username")) - ) - driver.find_element(By.NAME, "username").send_keys("testuser") - driver.find_element(By.NAME, "password").send_keys("password123") - driver.find_element(By.NAME, "login").click() - WebDriverWait(driver, 10).until( - EC.presence_of_element_located((By.ID, "welcome")) - ) - # Add a custom cookie - driver.add_cookie({"name": "test_cookie", "value": "cookie_value"}) - return driver - - def before_get_url(driver): - print("[HOOK] before_get_url") - # Example customization: add a custom header - # Enable Network domain for sending headers - driver.execute_cdp_cmd("Network.enable", {}) - # Add a custom header - driver.execute_cdp_cmd( - "Network.setExtraHTTPHeaders", {"headers": {"X-Test-Header": "test"}} - ) - return driver - - def after_get_url(driver): - print("[HOOK] after_get_url") - # Example customization: log the URL - print(driver.current_url) - return driver - - def before_return_html(driver, html): - print("[HOOK] before_return_html") - # Example customization: log the HTML - print(len(html)) - return driver - - cprint( - "\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", - True, - ) - - crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True) - crawler_strategy.set_hook("on_driver_created", on_driver_created) - crawler_strategy.set_hook("before_get_url", before_get_url) - crawler_strategy.set_hook("after_get_url", after_get_url) - crawler_strategy.set_hook("before_return_html", before_return_html) - - crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy) - crawler.warmup() - result = crawler.run(url="https://example.com") - - cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]") - print_result(result=result) - - -def using_crawler_hooks_dleay_example(crawler): - def delay(driver): - print("Delaying for 5 seconds...") - time.sleep(5) - print("Resuming...") - - def create_crawler(): - crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True) - crawler_strategy.set_hook("after_get_url", delay) - crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy) - crawler.warmup() - return crawler - - cprint( - "\n🔗 [bold cyan]Using Crawler Hooks: Let's add a delay after fetching the url to make sure entire page is fetched.[/bold cyan]" - ) - crawler = create_crawler() - result = crawler.run(url="https://google.com", 
bypass_cache=True) - - cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]") - print_result(result) - - -def main(): - cprint( - "🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]" - ) - cprint( - "⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]" - ) - cprint( - "If this is the first time you're running Crawl4ai, this might take a few seconds to load required model files." - ) - - crawler = create_crawler() - - crawler.always_by_pass_cache = True - basic_usage(crawler) - # basic_usage_some_params(crawler) - understanding_parameters(crawler) - - crawler.always_by_pass_cache = True - screenshot_usage(crawler) - add_chunking_strategy(crawler) - add_extraction_strategy(crawler) - add_llm_extraction_strategy(crawler) - targeted_extraction(crawler) - interactive_extraction(crawler) - multiple_scrip(crawler) - - cprint( - "\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]" - ) - - -if __name__ == "__main__": - main() From 591f55edc7aa1bc07c2ac4e2a619870ac1752ee2 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 6 Apr 2025 18:22:05 +0800 Subject: [PATCH 38/78] refactor(browser): rename methods and update type hints in BrowserHub for clarity --- crawl4ai/browser/browser_hub.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/crawl4ai/browser/browser_hub.py b/crawl4ai/browser/browser_hub.py index 33144319..47b742b5 100644 --- a/crawl4ai/browser/browser_hub.py +++ b/crawl4ai/browser/browser_hub.py @@ -2,9 +2,9 @@ import hashlib import json import asyncio -from typing import Dict, Optional +from typing import Dict, Optional, List, Tuple from .manager import BrowserManager, UnavailableBehavior -from ..async_configs import BrowserConfig +from ..async_configs import BrowserConfig, CrawlerRunConfig from ..async_logger import AsyncLogger class BrowserHub: @@ -19,7 +19,7 @@ class BrowserHub: _lock = asyncio.Lock() @classmethod - async def get_or_create_hub( + async def get_browser_manager( cls, config: Optional[BrowserConfig] = None, hub_id: Optional[str] = None, @@ -28,10 +28,10 @@ class BrowserHub: max_browsers_per_config: int = 10, max_pages_per_browser: int = 5, initial_pool_size: int = 1, - page_configs: Optional[list] = None + page_configs: Optional[List[Tuple[BrowserConfig, CrawlerRunConfig, int]]] = None ) -> BrowserManager: """ - Get an existing Browser-Hub or create a new one based on parameters. + Get an existing BrowserManager or create a new one based on parameters. 
Args: config: Browser configuration for new hub @@ -61,7 +61,7 @@ class BrowserHub: config_hash = cls._hash_config(config) instance_key = hub_id or f"config:{config_hash}" if instance_key not in cls._instances: - cls._instances[instance_key] = await cls._create_browser_hub( + cls._instances[instance_key] = await cls._create_browser_manager( config, logger, max_browsers_per_config, @@ -83,21 +83,22 @@ class BrowserHub: return cls._instances[instance_key] @classmethod - async def _create_browser_hub( + async def _create_browser_manager( cls, config: BrowserConfig, logger: Optional[AsyncLogger], max_browsers_per_config: int, max_pages_per_browser: int, initial_pool_size: int, - page_configs: Optional[list] + page_configs: Optional[List[Tuple[BrowserConfig, CrawlerRunConfig, int]]] = None ) -> BrowserManager: """Create a new browser hub with the specified configuration.""" manager = BrowserManager( browser_config=config, logger=logger, unavailable_behavior=UnavailableBehavior.ON_DEMAND, - max_browsers_per_config=max_browsers_per_config + max_browsers_per_config=max_browsers_per_config, + max_pages_per_browser=max_pages_per_browser, ) # Initialize the pool @@ -119,7 +120,7 @@ class BrowserHub: ) -> BrowserManager: """Create a default browser hub with standard settings.""" config = BrowserConfig(headless=True) - return await cls._create_browser_hub( + return await cls._create_browser_manager( config, logger, max_browsers_per_config, From 5b66208a7ebcb04c62c3822591f497f1a6ba9f79 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 6 Apr 2025 18:33:09 +0800 Subject: [PATCH 39/78] Refactor next branch --- crawl4ai/browser/__init__.py | 22 - crawl4ai/browser/browser_hub.py | 184 ---- .../browser/docker/alpine/connect.Dockerfile | 34 - .../browser/docker/alpine/launch.Dockerfile | 27 - .../browser/docker/debian/connect.Dockerfile | 23 - crawl4ai/browser/docker_registry.py | 264 ------ crawl4ai/browser/docker_utils.py | 661 -------------- crawl4ai/browser/manager copy.py | 177 ---- crawl4ai/browser/manager.py | 853 ------------------ crawl4ai/browser/models.py | 143 --- crawl4ai/browser/profiles.py | 457 ---------- crawl4ai/browser/strategies/__init__.py | 13 - crawl4ai/browser/strategies/base.py | 601 ------------ crawl4ai/browser/strategies/builtin.py | 468 ---------- crawl4ai/browser/strategies/cdp.py | 281 ------ .../browser/strategies/docker_strategy.py | 430 --------- crawl4ai/browser/strategies/playwright.py | 134 --- crawl4ai/browser/utils.py | 465 ---------- 18 files changed, 5237 deletions(-) delete mode 100644 crawl4ai/browser/__init__.py delete mode 100644 crawl4ai/browser/browser_hub.py delete mode 100644 crawl4ai/browser/docker/alpine/connect.Dockerfile delete mode 100644 crawl4ai/browser/docker/alpine/launch.Dockerfile delete mode 100644 crawl4ai/browser/docker/debian/connect.Dockerfile delete mode 100644 crawl4ai/browser/docker_registry.py delete mode 100644 crawl4ai/browser/docker_utils.py delete mode 100644 crawl4ai/browser/manager copy.py delete mode 100644 crawl4ai/browser/manager.py delete mode 100644 crawl4ai/browser/models.py delete mode 100644 crawl4ai/browser/profiles.py delete mode 100644 crawl4ai/browser/strategies/__init__.py delete mode 100644 crawl4ai/browser/strategies/base.py delete mode 100644 crawl4ai/browser/strategies/builtin.py delete mode 100644 crawl4ai/browser/strategies/cdp.py delete mode 100644 crawl4ai/browser/strategies/docker_strategy.py delete mode 100644 crawl4ai/browser/strategies/playwright.py delete mode 100644 crawl4ai/browser/utils.py diff --git 
a/crawl4ai/browser/__init__.py b/crawl4ai/browser/__init__.py deleted file mode 100644 index af4d74c7..00000000 --- a/crawl4ai/browser/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -"""Browser management module for Crawl4AI. - -This module provides browser management capabilities using different strategies -for browser creation and interaction. -""" - -from .manager import BrowserManager -from .profiles import BrowserProfileManager -from .models import DockerConfig -from .docker_registry import DockerRegistry -from .docker_utils import DockerUtils -from .strategies import ( - BaseBrowserStrategy, - PlaywrightBrowserStrategy, - CDPBrowserStrategy, - BuiltinBrowserStrategy, - DockerBrowserStrategy -) - -__all__ = ['BrowserManager', 'BrowserProfileManager', 'DockerConfig', 'DockerRegistry', 'DockerUtils', 'BaseBrowserStrategy', - 'PlaywrightBrowserStrategy', 'CDPBrowserStrategy', 'BuiltinBrowserStrategy', - 'DockerBrowserStrategy'] \ No newline at end of file diff --git a/crawl4ai/browser/browser_hub.py b/crawl4ai/browser/browser_hub.py deleted file mode 100644 index 47b742b5..00000000 --- a/crawl4ai/browser/browser_hub.py +++ /dev/null @@ -1,184 +0,0 @@ -# browser_hub_manager.py -import hashlib -import json -import asyncio -from typing import Dict, Optional, List, Tuple -from .manager import BrowserManager, UnavailableBehavior -from ..async_configs import BrowserConfig, CrawlerRunConfig -from ..async_logger import AsyncLogger - -class BrowserHub: - """ - Manages Browser-Hub instances for sharing across multiple pipelines. - - This class provides centralized management for browser resources, allowing - multiple pipelines to share browser instances efficiently, connect to - existing browser hubs, or create new ones with custom configurations. - """ - _instances: Dict[str, BrowserManager] = {} - _lock = asyncio.Lock() - - @classmethod - async def get_browser_manager( - cls, - config: Optional[BrowserConfig] = None, - hub_id: Optional[str] = None, - connection_info: Optional[str] = None, - logger: Optional[AsyncLogger] = None, - max_browsers_per_config: int = 10, - max_pages_per_browser: int = 5, - initial_pool_size: int = 1, - page_configs: Optional[List[Tuple[BrowserConfig, CrawlerRunConfig, int]]] = None - ) -> BrowserManager: - """ - Get an existing BrowserManager or create a new one based on parameters. 
- - Args: - config: Browser configuration for new hub - hub_id: Identifier for the hub instance - connection_info: Connection string for existing hub - logger: Logger for recording events and errors - max_browsers_per_config: Maximum browsers per configuration - max_pages_per_browser: Maximum pages per browser - initial_pool_size: Initial number of browsers to create - page_configs: Optional configurations for pre-warming pages - - Returns: - BrowserManager: The requested browser manager instance - """ - async with cls._lock: - # Scenario 3: Use existing hub via connection info - if connection_info: - instance_key = f"connection:{connection_info}" - if instance_key not in cls._instances: - cls._instances[instance_key] = await cls._connect_to_browser_hub( - connection_info, logger - ) - return cls._instances[instance_key] - - # Scenario 2: Custom configured hub - if config: - config_hash = cls._hash_config(config) - instance_key = hub_id or f"config:{config_hash}" - if instance_key not in cls._instances: - cls._instances[instance_key] = await cls._create_browser_manager( - config, - logger, - max_browsers_per_config, - max_pages_per_browser, - initial_pool_size, - page_configs - ) - return cls._instances[instance_key] - - # Scenario 1: Default hub - instance_key = "default" - if instance_key not in cls._instances: - cls._instances[instance_key] = await cls._create_default_browser_hub( - logger, - max_browsers_per_config, - max_pages_per_browser, - initial_pool_size - ) - return cls._instances[instance_key] - - @classmethod - async def _create_browser_manager( - cls, - config: BrowserConfig, - logger: Optional[AsyncLogger], - max_browsers_per_config: int, - max_pages_per_browser: int, - initial_pool_size: int, - page_configs: Optional[List[Tuple[BrowserConfig, CrawlerRunConfig, int]]] = None - ) -> BrowserManager: - """Create a new browser hub with the specified configuration.""" - manager = BrowserManager( - browser_config=config, - logger=logger, - unavailable_behavior=UnavailableBehavior.ON_DEMAND, - max_browsers_per_config=max_browsers_per_config, - max_pages_per_browser=max_pages_per_browser, - ) - - # Initialize the pool - await manager.initialize_pool( - browser_configs=[config] if config else None, - browsers_per_config=initial_pool_size, - page_configs=page_configs - ) - - return manager - - @classmethod - async def _create_default_browser_hub( - cls, - logger: Optional[AsyncLogger], - max_browsers_per_config: int, - max_pages_per_browser: int, - initial_pool_size: int - ) -> BrowserManager: - """Create a default browser hub with standard settings.""" - config = BrowserConfig(headless=True) - return await cls._create_browser_manager( - config, - logger, - max_browsers_per_config, - max_pages_per_browser, - initial_pool_size, - None - ) - - @classmethod - async def _connect_to_browser_hub( - cls, - connection_info: str, - logger: Optional[AsyncLogger] - ) -> BrowserManager: - """ - Connect to an existing browser hub. - - Note: This is a placeholder for future remote connection functionality. - Currently creates a local instance. - """ - if logger: - logger.info( - message="Remote browser hub connections not yet implemented. 
Creating local instance.", - tag="BROWSER_HUB" - ) - # For now, create a default local instance - return await cls._create_default_browser_hub( - logger, - max_browsers_per_config=10, - max_pages_per_browser=5, - initial_pool_size=1 - ) - - @classmethod - def _hash_config(cls, config: BrowserConfig) -> str: - """Create a hash of the browser configuration for identification.""" - # Convert config to dictionary, excluding any callable objects - config_dict = config.__dict__.copy() - for key in list(config_dict.keys()): - if callable(config_dict[key]): - del config_dict[key] - - # Convert to canonical JSON string - config_json = json.dumps(config_dict, sort_keys=True, default=str) - - # Hash the JSON - config_hash = hashlib.sha256(config_json.encode()).hexdigest() - return config_hash - - @classmethod - async def shutdown_all(cls): - """Close all browser hub instances and clear the registry.""" - async with cls._lock: - shutdown_tasks = [] - for hub in cls._instances.values(): - shutdown_tasks.append(hub.close()) - - if shutdown_tasks: - await asyncio.gather(*shutdown_tasks) - - cls._instances.clear() \ No newline at end of file diff --git a/crawl4ai/browser/docker/alpine/connect.Dockerfile b/crawl4ai/browser/docker/alpine/connect.Dockerfile deleted file mode 100644 index 96f77cef..00000000 --- a/crawl4ai/browser/docker/alpine/connect.Dockerfile +++ /dev/null @@ -1,34 +0,0 @@ -# ---------- Dockerfile ---------- - FROM alpine:latest - - # Combine everything in one RUN to keep layers minimal. - RUN apk update && apk upgrade && \ - apk add --no-cache \ - chromium \ - nss \ - freetype \ - harfbuzz \ - ca-certificates \ - ttf-freefont \ - socat \ - curl && \ - addgroup -S chromium && adduser -S chromium -G chromium && \ - mkdir -p /data && chown chromium:chromium /data && \ - rm -rf /var/cache/apk/* - - # Copy start script, then chown/chmod in one step - COPY start.sh /home/chromium/start.sh - RUN chown chromium:chromium /home/chromium/start.sh && \ - chmod +x /home/chromium/start.sh - - USER chromium - WORKDIR /home/chromium - - # Expose port used by socat (mapping 9222→9223 or whichever you prefer) - EXPOSE 9223 - - # Simple healthcheck: is the remote debug endpoint responding? 
- HEALTHCHECK --interval=30s --timeout=5s --retries=3 CMD curl -f http://localhost:9222/json/version || exit 1 - - CMD ["./start.sh"] - \ No newline at end of file diff --git a/crawl4ai/browser/docker/alpine/launch.Dockerfile b/crawl4ai/browser/docker/alpine/launch.Dockerfile deleted file mode 100644 index 17e3c660..00000000 --- a/crawl4ai/browser/docker/alpine/launch.Dockerfile +++ /dev/null @@ -1,27 +0,0 @@ -# ---------- Dockerfile (Idle Version) ---------- - FROM alpine:latest - - # Install only Chromium and its dependencies in a single layer - RUN apk update && apk upgrade && \ - apk add --no-cache \ - chromium \ - nss \ - freetype \ - harfbuzz \ - ca-certificates \ - ttf-freefont \ - socat \ - curl && \ - addgroup -S chromium && adduser -S chromium -G chromium && \ - mkdir -p /data && chown chromium:chromium /data && \ - rm -rf /var/cache/apk/* - - ENV PATH="/usr/bin:/bin:/usr/sbin:/sbin" - - # Switch to a non-root user for security - USER chromium - WORKDIR /home/chromium - - # Idle: container does nothing except stay alive - CMD ["tail", "-f", "/dev/null"] - \ No newline at end of file diff --git a/crawl4ai/browser/docker/debian/connect.Dockerfile b/crawl4ai/browser/docker/debian/connect.Dockerfile deleted file mode 100644 index ee0f25b4..00000000 --- a/crawl4ai/browser/docker/debian/connect.Dockerfile +++ /dev/null @@ -1,23 +0,0 @@ -# Use Debian 12 (Bookworm) slim for a small, stable base image -FROM debian:bookworm-slim - -ENV DEBIAN_FRONTEND=noninteractive - -# Install Chromium, socat, and basic fonts -RUN apt-get update && apt-get install -y --no-install-recommends \ - chromium \ - wget \ - curl \ - socat \ - fonts-freefont-ttf \ - fonts-noto-color-emoji && \ - apt-get clean && rm -rf /var/lib/apt/lists/* - -# Copy start.sh and make it executable -COPY start.sh /start.sh -RUN chmod +x /start.sh - -# Expose socat port (use host mapping, e.g. -p 9225:9223) -EXPOSE 9223 - -ENTRYPOINT ["/start.sh"] diff --git a/crawl4ai/browser/docker_registry.py b/crawl4ai/browser/docker_registry.py deleted file mode 100644 index 03594e2e..00000000 --- a/crawl4ai/browser/docker_registry.py +++ /dev/null @@ -1,264 +0,0 @@ -"""Docker registry module for Crawl4AI. - -This module provides a registry system for tracking and reusing Docker containers -across browser sessions, improving performance and resource utilization. -""" - -import os -import json -import time -from typing import Dict, Optional - -from ..utils import get_home_folder - - -class DockerRegistry: - """Manages a registry of Docker containers used for browser automation. - - This registry tracks containers by configuration hash, allowing reuse of appropriately - configured containers instead of creating new ones for each session. - - Attributes: - registry_file (str): Path to the registry file - containers (dict): Dictionary of container information - port_map (dict): Map of host ports to container IDs - last_port (int): Last port assigned - """ - - def __init__(self, registry_file: Optional[str] = None): - """Initialize the registry with an optional path to the registry file. - - Args: - registry_file: Path to the registry file. If None, uses default path. 
- """ - # Use the same file path as BuiltinBrowserStrategy by default - self.registry_file = registry_file or os.path.join(get_home_folder(), "builtin-browser", "browser_config.json") - self.containers = {} # Still maintain this for backward compatibility - self.port_map = {} # Will be populated from the shared file - self.last_port = 9222 - self.load() - - def load(self): - """Load container registry from file.""" - if os.path.exists(self.registry_file): - try: - with open(self.registry_file, 'r') as f: - registry_data = json.load(f) - - # Initialize port_map if not present - if "port_map" not in registry_data: - registry_data["port_map"] = {} - - self.port_map = registry_data.get("port_map", {}) - - # Extract container information from port_map entries of type "docker" - self.containers = {} - for port_str, browser_info in self.port_map.items(): - if browser_info.get("browser_type") == "docker" and "container_id" in browser_info: - container_id = browser_info["container_id"] - self.containers[container_id] = { - "host_port": int(port_str), - "config_hash": browser_info.get("config_hash", ""), - "created_at": browser_info.get("created_at", time.time()) - } - - # Get last port if available - if "last_port" in registry_data: - self.last_port = registry_data["last_port"] - else: - # Find highest port in port_map - ports = [int(p) for p in self.port_map.keys() if p.isdigit()] - self.last_port = max(ports + [9222]) - - except Exception as e: - # Reset to defaults on error - print(f"Error loading registry: {e}") - self.containers = {} - self.port_map = {} - self.last_port = 9222 - else: - # Initialize with defaults if file doesn't exist - self.containers = {} - self.port_map = {} - self.last_port = 9222 - - def save(self): - """Save container registry to file.""" - # First load the current file to avoid overwriting other browser types - current_data = {"port_map": {}, "last_port": self.last_port} - if os.path.exists(self.registry_file): - try: - with open(self.registry_file, 'r') as f: - current_data = json.load(f) - except Exception: - pass - - # Create a new port_map dictionary - updated_port_map = {} - - # First, copy all non-docker entries from the existing port_map - for port_str, browser_info in current_data.get("port_map", {}).items(): - if browser_info.get("browser_type") != "docker": - updated_port_map[port_str] = browser_info - - # Then add all current docker container entries - for container_id, container_info in self.containers.items(): - port_str = str(container_info["host_port"]) - updated_port_map[port_str] = { - "browser_type": "docker", - "container_id": container_id, - "cdp_url": f"http://localhost:{port_str}", - "config_hash": container_info["config_hash"], - "created_at": container_info["created_at"] - } - - # Replace the port_map with our updated version - current_data["port_map"] = updated_port_map - - # Update last_port - current_data["last_port"] = self.last_port - - # Ensure directory exists - os.makedirs(os.path.dirname(self.registry_file), exist_ok=True) - - # Save the updated data - with open(self.registry_file, 'w') as f: - json.dump(current_data, f, indent=2) - - def register_container(self, container_id: str, host_port: int, config_hash: str, cdp_json_config: Optional[str] = None): - """Register a container with its configuration hash and port mapping. 
- - Args: - container_id: Docker container ID - host_port: Host port mapped to container - config_hash: Hash of configuration used to create container - cdp_json_config: CDP JSON configuration if available - """ - self.containers[container_id] = { - "host_port": host_port, - "config_hash": config_hash, - "created_at": time.time() - } - - # Update port_map to maintain compatibility with BuiltinBrowserStrategy - port_str = str(host_port) - self.port_map[port_str] = { - "browser_type": "docker", - "container_id": container_id, - "cdp_url": f"http://localhost:{port_str}", - "config_hash": config_hash, - "created_at": time.time() - } - - if cdp_json_config: - self.port_map[port_str]["cdp_json_config"] = cdp_json_config - - self.save() - - def unregister_container(self, container_id: str): - """Unregister a container. - - Args: - container_id: Docker container ID to unregister - """ - if container_id in self.containers: - host_port = self.containers[container_id]["host_port"] - port_str = str(host_port) - - # Remove from port_map - if port_str in self.port_map: - del self.port_map[port_str] - - # Remove from containers - del self.containers[container_id] - - self.save() - - async def find_container_by_config(self, config_hash: str, docker_utils) -> Optional[str]: - """Find a container that matches the given configuration hash. - - Args: - config_hash: Hash of configuration to match - docker_utils: DockerUtils instance to check running containers - - Returns: - Container ID if found, None otherwise - """ - # Search through port_map for entries with matching config_hash - for port_str, browser_info in self.port_map.items(): - if (browser_info.get("browser_type") == "docker" and - browser_info.get("config_hash") == config_hash and - "container_id" in browser_info): - - container_id = browser_info["container_id"] - if await docker_utils.is_container_running(container_id): - return container_id - - return None - - def get_container_host_port(self, container_id: str) -> Optional[int]: - """Get the host port mapped to the container. - - Args: - container_id: Docker container ID - - Returns: - Host port if container is registered, None otherwise - """ - if container_id in self.containers: - return self.containers[container_id]["host_port"] - return None - - def get_next_available_port(self, docker_utils) -> int: - """Get the next available host port for Docker mapping. - - Args: - docker_utils: DockerUtils instance to check port availability - - Returns: - Available port number - """ - # Start from last port + 1 - port = self.last_port + 1 - - # Check if port is in use (either in our registry or system-wide) - while str(port) in self.port_map or docker_utils.is_port_in_use(port): - port += 1 - - # Update last port - self.last_port = port - self.save() - - return port - - def get_container_config_hash(self, container_id: str) -> Optional[str]: - """Get the configuration hash for a container. - - Args: - container_id: Docker container ID - - Returns: - Configuration hash if container is registered, None otherwise - """ - if container_id in self.containers: - return self.containers[container_id]["config_hash"] - return None - - def cleanup_stale_containers(self, docker_utils): - """Clean up containers that are no longer running. 
- - Args: - docker_utils: DockerUtils instance to check container status - """ - to_remove = [] - - # Find containers that are no longer running - for port_str, browser_info in self.port_map.items(): - if browser_info.get("browser_type") == "docker" and "container_id" in browser_info: - container_id = browser_info["container_id"] - if not docker_utils.is_container_running(container_id): - to_remove.append(container_id) - - # Remove stale containers - for container_id in to_remove: - self.unregister_container(container_id) \ No newline at end of file diff --git a/crawl4ai/browser/docker_utils.py b/crawl4ai/browser/docker_utils.py deleted file mode 100644 index f93a51b9..00000000 --- a/crawl4ai/browser/docker_utils.py +++ /dev/null @@ -1,661 +0,0 @@ -import os -import json -import asyncio -import hashlib -import tempfile -import shutil -import socket -import subprocess -from typing import Dict, List, Optional, Tuple, Union - - -class DockerUtils: - """Utility class for Docker operations in browser automation. - - This class provides methods for managing Docker images, containers, - and related operations needed for browser automation. It handles - image building, container lifecycle, port management, and registry operations. - - Attributes: - DOCKER_FOLDER (str): Path to folder containing Docker files - DOCKER_CONNECT_FILE (str): Path to Dockerfile for connect mode - DOCKER_LAUNCH_FILE (str): Path to Dockerfile for launch mode - DOCKER_START_SCRIPT (str): Path to startup script for connect mode - DEFAULT_CONNECT_IMAGE (str): Default image name for connect mode - DEFAULT_LAUNCH_IMAGE (str): Default image name for launch mode - logger: Optional logger instance - """ - - # File paths for Docker resources - DOCKER_FOLDER = os.path.join(os.path.dirname(__file__), "docker") - DOCKER_CONNECT_FILE = os.path.join(DOCKER_FOLDER, "connect.Dockerfile") - DOCKER_LAUNCH_FILE = os.path.join(DOCKER_FOLDER, "launch.Dockerfile") - DOCKER_START_SCRIPT = os.path.join(DOCKER_FOLDER, "start.sh") - - # Default image names - DEFAULT_CONNECT_IMAGE = "crawl4ai/browser-connect:latest" - DEFAULT_LAUNCH_IMAGE = "crawl4ai/browser-launch:latest" - - def __init__(self, logger=None): - """Initialize Docker utilities. - - Args: - logger: Optional logger for recording operations - """ - self.logger = logger - - # Image Management Methods - - async def check_image_exists(self, image_name: str) -> bool: - """Check if a Docker image exists. - - Args: - image_name: Name of the Docker image to check - - Returns: - bool: True if the image exists, False otherwise - """ - cmd = ["docker", "image", "inspect", image_name] - - try: - process = await asyncio.create_subprocess_exec( - *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE - ) - _, _ = await process.communicate() - return process.returncode == 0 - except Exception as e: - if self.logger: - self.logger.debug( - f"Error checking if image exists: {str(e)}", tag="DOCKER" - ) - return False - - async def build_docker_image( - self, - image_name: str, - dockerfile_path: str, - files_to_copy: Dict[str, str] = None, - ) -> bool: - """Build a Docker image from a Dockerfile. 
- - Args: - image_name: Name to give the built image - dockerfile_path: Path to the Dockerfile - files_to_copy: Dict of {dest_name: source_path} for files to copy to build context - - Returns: - bool: True if image was built successfully, False otherwise - """ - # Create a temporary build context - with tempfile.TemporaryDirectory() as temp_dir: - # Copy the Dockerfile - shutil.copy(dockerfile_path, os.path.join(temp_dir, "Dockerfile")) - - # Copy any additional files needed - if files_to_copy: - for dest_name, source_path in files_to_copy.items(): - shutil.copy(source_path, os.path.join(temp_dir, dest_name)) - - # Build the image - cmd = ["docker", "build", "-t", image_name, temp_dir] - - if self.logger: - self.logger.debug( - f"Building Docker image with command: {' '.join(cmd)}", tag="DOCKER" - ) - - process = await asyncio.create_subprocess_exec( - *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE - ) - stdout, stderr = await process.communicate() - - if process.returncode != 0: - if self.logger: - self.logger.error( - message="Failed to build Docker image: {error}", - tag="DOCKER", - params={"error": stderr.decode()}, - ) - return False - - if self.logger: - self.logger.success( - f"Successfully built Docker image: {image_name}", tag="DOCKER" - ) - return True - - async def ensure_docker_image_exists( - self, image_name: str, mode: str = "connect" - ) -> str: - """Ensure the required Docker image exists, creating it if necessary. - - Args: - image_name: Name of the Docker image - mode: Either "connect" or "launch" to determine which image to build - - Returns: - str: Name of the available Docker image - - Raises: - Exception: If image doesn't exist and can't be built - """ - # If image name is not specified, use default based on mode - if not image_name: - image_name = ( - self.DEFAULT_CONNECT_IMAGE - if mode == "connect" - else self.DEFAULT_LAUNCH_IMAGE - ) - - # Check if the image already exists - if await self.check_image_exists(image_name): - if self.logger: - self.logger.debug( - f"Docker image {image_name} already exists", tag="DOCKER" - ) - return image_name - - # If we're using a custom image that doesn't exist, warn and fail - if ( - image_name != self.DEFAULT_CONNECT_IMAGE - and image_name != self.DEFAULT_LAUNCH_IMAGE - ): - if self.logger: - self.logger.warning( - f"Custom Docker image {image_name} not found and cannot be automatically created", - tag="DOCKER", - ) - raise Exception(f"Docker image {image_name} not found") - - # Build the appropriate default image - if self.logger: - self.logger.info( - f"Docker image {image_name} not found, creating it now...", tag="DOCKER" - ) - - if mode == "connect": - success = await self.build_docker_image( - image_name, - self.DOCKER_CONNECT_FILE, - {"start.sh": self.DOCKER_START_SCRIPT}, - ) - else: - success = await self.build_docker_image(image_name, self.DOCKER_LAUNCH_FILE) - - if not success: - raise Exception(f"Failed to create Docker image {image_name}") - - return image_name - - # Container Management Methods - - async def create_container( - self, - image_name: str, - host_port: int, - container_name: Optional[str] = None, - volumes: List[str] = None, - network: Optional[str] = None, - env_vars: Dict[str, str] = None, - cpu_limit: float = 1.0, - memory_limit: str = "1.5g", - extra_args: List[str] = None, - ) -> Optional[str]: - """Create a new Docker container. 
- - Args: - image_name: Docker image to use - host_port: Port on host to map to container port 9223 - container_name: Optional name for the container - volumes: List of volume mappings (e.g., ["host_path:container_path"]) - network: Optional Docker network to use - env_vars: Dictionary of environment variables - cpu_limit: CPU limit for the container - memory_limit: Memory limit for the container - extra_args: Additional docker run arguments - - Returns: - str: Container ID if successful, None otherwise - """ - # Prepare container command - cmd = [ - "docker", - "run", - "--detach", - ] - - # Add container name if specified - if container_name: - cmd.extend(["--name", container_name]) - - # Add port mapping - cmd.extend(["-p", f"{host_port}:9223"]) - - # Add volumes - if volumes: - for volume in volumes: - cmd.extend(["-v", volume]) - - # Add network if specified - if network: - cmd.extend(["--network", network]) - - # Add environment variables - if env_vars: - for key, value in env_vars.items(): - cmd.extend(["-e", f"{key}={value}"]) - - # Add CPU and memory limits - if cpu_limit: - cmd.extend(["--cpus", str(cpu_limit)]) - if memory_limit: - cmd.extend(["--memory", memory_limit]) - cmd.extend(["--memory-swap", memory_limit]) - if self.logger: - self.logger.debug( - f"Setting CPU limit: {cpu_limit}, Memory limit: {memory_limit}", - tag="DOCKER", - ) - - # Add extra args - if extra_args: - cmd.extend(extra_args) - - # Add image - cmd.append(image_name) - - if self.logger: - self.logger.debug( - f"Creating Docker container with command: {' '.join(cmd)}", tag="DOCKER" - ) - - # Run docker command - try: - process = await asyncio.create_subprocess_exec( - *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE - ) - stdout, stderr = await process.communicate() - - if process.returncode != 0: - if self.logger: - self.logger.error( - message="Failed to create Docker container: {error}", - tag="DOCKER", - params={"error": stderr.decode()}, - ) - return None - - # Get container ID - container_id = stdout.decode().strip() - - if self.logger: - self.logger.success( - f"Created Docker container: {container_id[:12]}", tag="DOCKER" - ) - - return container_id - - except Exception as e: - if self.logger: - self.logger.error( - message="Error creating Docker container: {error}", - tag="DOCKER", - params={"error": str(e)}, - ) - return None - - async def is_container_running(self, container_id: str) -> bool: - """Check if a container is running. - - Args: - container_id: ID of the container to check - - Returns: - bool: True if the container is running, False otherwise - """ - cmd = ["docker", "inspect", "--format", "{{.State.Running}}", container_id] - - try: - process = await asyncio.create_subprocess_exec( - *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE - ) - stdout, _ = await process.communicate() - - return process.returncode == 0 and stdout.decode().strip() == "true" - except Exception as e: - if self.logger: - self.logger.debug( - f"Error checking if container is running: {str(e)}", tag="DOCKER" - ) - return False - - async def wait_for_container_ready( - self, container_id: str, timeout: int = 30 - ) -> bool: - """Wait for the container to be in running state. 
- - Args: - container_id: ID of the container to wait for - timeout: Maximum time to wait in seconds - - Returns: - bool: True if container is ready, False if timeout occurred - """ - for _ in range(timeout): - if await self.is_container_running(container_id): - return True - await asyncio.sleep(1) - - if self.logger: - self.logger.warning( - f"Container {container_id[:12]} not ready after {timeout}s timeout", - tag="DOCKER", - ) - return False - - async def stop_container(self, container_id: str) -> bool: - """Stop a Docker container. - - Args: - container_id: ID of the container to stop - - Returns: - bool: True if stopped successfully, False otherwise - """ - cmd = ["docker", "stop", container_id] - - try: - process = await asyncio.create_subprocess_exec(*cmd) - await process.communicate() - - if self.logger: - self.logger.debug( - f"Stopped container: {container_id[:12]}", tag="DOCKER" - ) - - return process.returncode == 0 - except Exception as e: - if self.logger: - self.logger.warning( - message="Failed to stop container: {error}", - tag="DOCKER", - params={"error": str(e)}, - ) - return False - - async def remove_container(self, container_id: str, force: bool = True) -> bool: - """Remove a Docker container. - - Args: - container_id: ID of the container to remove - force: Whether to force removal - - Returns: - bool: True if removed successfully, False otherwise - """ - cmd = ["docker", "rm"] - if force: - cmd.append("-f") - cmd.append(container_id) - - try: - process = await asyncio.create_subprocess_exec(*cmd) - await process.communicate() - - if self.logger: - self.logger.debug( - f"Removed container: {container_id[:12]}", tag="DOCKER" - ) - - return process.returncode == 0 - except Exception as e: - if self.logger: - self.logger.warning( - message="Failed to remove container: {error}", - tag="DOCKER", - params={"error": str(e)}, - ) - return False - - # Container Command Execution Methods - - async def exec_in_container( - self, container_id: str, command: List[str], detach: bool = False - ) -> Tuple[int, str, str]: - """Execute a command in a running container. - - Args: - container_id: ID of the container - command: Command to execute as a list of strings - detach: Whether to run the command in detached mode - - Returns: - Tuple of (return_code, stdout, stderr) - """ - cmd = ["docker", "exec"] - if detach: - cmd.append("-d") - cmd.append(container_id) - cmd.extend(command) - - try: - process = await asyncio.create_subprocess_exec( - *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE - ) - stdout, stderr = await process.communicate() - - return process.returncode, stdout.decode(), stderr.decode() - except Exception as e: - if self.logger: - self.logger.error( - message="Error executing command in container: {error}", - tag="DOCKER", - params={"error": str(e)}, - ) - return -1, "", str(e) - - async def start_socat_in_container(self, container_id: str) -> bool: - """Start socat in the container to map port 9222 to 9223. 
- - Args: - container_id: ID of the container - - Returns: - bool: True if socat started successfully, False otherwise - """ - # Command to run socat as a background process - cmd = ["socat", "TCP-LISTEN:9223,fork", "TCP:localhost:9222"] - - returncode, _, stderr = await self.exec_in_container( - container_id, cmd, detach=True - ) - - if returncode != 0: - if self.logger: - self.logger.error( - message="Failed to start socat in container: {error}", - tag="DOCKER", - params={"error": stderr}, - ) - return False - - if self.logger: - self.logger.debug( - f"Started socat in container: {container_id[:12]}", tag="DOCKER" - ) - - # Wait a moment for socat to start - await asyncio.sleep(1) - return True - - async def launch_chrome_in_container( - self, container_id: str, browser_args: List[str] - ) -> bool: - """Launch Chrome inside the container with specified arguments. - - Args: - container_id: ID of the container - browser_args: Chrome command line arguments - - Returns: - bool: True if Chrome started successfully, False otherwise - """ - # Build Chrome command - chrome_cmd = ["chromium"] - chrome_cmd.extend(browser_args) - - returncode, _, stderr = await self.exec_in_container( - container_id, chrome_cmd, detach=True - ) - - if returncode != 0: - if self.logger: - self.logger.error( - message="Failed to launch Chrome in container: {error}", - tag="DOCKER", - params={"error": stderr}, - ) - return False - - if self.logger: - self.logger.debug( - f"Launched Chrome in container: {container_id[:12]}", tag="DOCKER" - ) - - return True - - async def get_process_id_in_container( - self, container_id: str, process_name: str - ) -> Optional[int]: - """Get the process ID for a process in the container. - - Args: - container_id: ID of the container - process_name: Name pattern to search for - - Returns: - int: Process ID if found, None otherwise - """ - cmd = ["pgrep", "-f", process_name] - - returncode, stdout, _ = await self.exec_in_container(container_id, cmd) - - if returncode == 0 and stdout.strip(): - pid = int(stdout.strip().split("\n")[0]) - return pid - - return None - - async def stop_process_in_container(self, container_id: str, pid: int) -> bool: - """Stop a process in the container by PID. - - Args: - container_id: ID of the container - pid: Process ID to stop - - Returns: - bool: True if process was stopped, False otherwise - """ - cmd = ["kill", "-TERM", str(pid)] - - returncode, _, stderr = await self.exec_in_container(container_id, cmd) - - if returncode != 0: - if self.logger: - self.logger.warning( - message="Failed to stop process in container: {error}", - tag="DOCKER", - params={"error": stderr}, - ) - return False - - if self.logger: - self.logger.debug( - f"Stopped process {pid} in container: {container_id[:12]}", tag="DOCKER" - ) - - return True - - # Network and Port Methods - - async def wait_for_cdp_ready(self, host_port: int, timeout: int = 10) -> dict: - """Wait for the CDP endpoint to be ready. 
- - Args: - host_port: Port to check for CDP endpoint - timeout: Maximum time to wait in seconds - - Returns: - dict: CDP JSON config if ready, None if timeout occurred - """ - import aiohttp - - url = f"http://localhost:{host_port}/json/version" - - for _ in range(timeout): - try: - async with aiohttp.ClientSession() as session: - async with session.get(url, timeout=1) as response: - if response.status == 200: - if self.logger: - self.logger.debug( - f"CDP endpoint ready on port {host_port}", - tag="DOCKER", - ) - cdp_json_config = await response.json() - if self.logger: - self.logger.debug( - f"CDP JSON config: {cdp_json_config}", tag="DOCKER" - ) - return cdp_json_config - except Exception: - pass - await asyncio.sleep(1) - - if self.logger: - self.logger.warning( - f"CDP endpoint not ready on port {host_port} after {timeout}s timeout", - tag="DOCKER", - ) - return None - - def is_port_in_use(self, port: int) -> bool: - """Check if a port is already in use on the host. - - Args: - port: Port number to check - - Returns: - bool: True if port is in use, False otherwise - """ - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - return s.connect_ex(("localhost", port)) == 0 - - def get_next_available_port(self, start_port: int = 9223) -> int: - """Get the next available port starting from a given port. - - Args: - start_port: Port number to start checking from - - Returns: - int: First available port number - """ - port = start_port - while self.is_port_in_use(port): - port += 1 - return port - - # Configuration Hash Methods - - def generate_config_hash(self, config_dict: Dict) -> str: - """Generate a hash of the configuration for container matching. - - Args: - config_dict: Dictionary of configuration parameters - - Returns: - str: Hash string uniquely identifying this configuration - """ - # Convert to canonical JSON string and hash - config_json = json.dumps(config_dict, sort_keys=True) - return hashlib.sha256(config_json.encode()).hexdigest() diff --git a/crawl4ai/browser/manager copy.py b/crawl4ai/browser/manager copy.py deleted file mode 100644 index 97aaf587..00000000 --- a/crawl4ai/browser/manager copy.py +++ /dev/null @@ -1,177 +0,0 @@ -"""Browser manager module for Crawl4AI. - -This module provides a central browser management class that uses the -strategy pattern internally while maintaining the existing API. -It also implements a page pooling mechanism for improved performance. -""" - -from typing import Optional, Tuple, List - -from playwright.async_api import Page, BrowserContext - -from ..async_logger import AsyncLogger -from ..async_configs import BrowserConfig, CrawlerRunConfig - -from .strategies import ( - BaseBrowserStrategy, - PlaywrightBrowserStrategy, - CDPBrowserStrategy, - BuiltinBrowserStrategy, - DockerBrowserStrategy -) - -class BrowserManager: - """Main interface for browser management in Crawl4AI. - - This class maintains backward compatibility with the existing implementation - while using the strategy pattern internally for different browser types. 
- - Attributes: - config (BrowserConfig): Configuration object containing all browser settings - logger: Logger instance for recording events and errors - browser: The browser instance - default_context: The default browser context - managed_browser: The managed browser instance - playwright: The Playwright instance - sessions: Dictionary to store session information - session_ttl: Session timeout in seconds - """ - - def __init__(self, browser_config: Optional[BrowserConfig] = None, logger: Optional[AsyncLogger] = None): - """Initialize the BrowserManager with a browser configuration. - - Args: - browser_config: Configuration object containing all browser settings - logger: Logger instance for recording events and errors - """ - self.config = browser_config or BrowserConfig() - self.logger = logger - - # Create strategy based on configuration - self.strategy = self._create_strategy() - - # Initialize state variables for compatibility with existing code - self.browser = None - self.default_context = None - self.managed_browser = None - self.playwright = None - - # For session management (from existing implementation) - self.sessions = {} - self.session_ttl = 1800 # 30 minutes - - def _create_strategy(self) -> BaseBrowserStrategy: - """Create appropriate browser strategy based on configuration. - - Returns: - BaseBrowserStrategy: The selected browser strategy - """ - if self.config.browser_mode == "builtin": - return BuiltinBrowserStrategy(self.config, self.logger) - elif self.config.browser_mode == "docker": - if DockerBrowserStrategy is None: - if self.logger: - self.logger.error( - "Docker browser strategy requested but not available. " - "Falling back to PlaywrightBrowserStrategy.", - tag="BROWSER" - ) - return PlaywrightBrowserStrategy(self.config, self.logger) - return DockerBrowserStrategy(self.config, self.logger) - elif self.config.browser_mode == "cdp" or self.config.cdp_url or self.config.use_managed_browser: - return CDPBrowserStrategy(self.config, self.logger) - else: - return PlaywrightBrowserStrategy(self.config, self.logger) - - async def start(self): - """Start the browser instance and set up the default context. - - Returns: - self: For method chaining - """ - # Start the strategy - await self.strategy.start() - - # Update legacy references - self.browser = self.strategy.browser - self.default_context = self.strategy.default_context - - # Set browser process reference (for CDP strategy) - if hasattr(self.strategy, 'browser_process'): - self.managed_browser = self.strategy - - # Set Playwright reference - self.playwright = self.strategy.playwright - - # Sync sessions if needed - if hasattr(self.strategy, 'sessions'): - self.sessions = self.strategy.sessions - self.session_ttl = self.strategy.session_ttl - - return self - - async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: - """Get a page for the given configuration. - - Args: - crawlerRunConfig: Configuration object for the crawler run - - Returns: - Tuple of (Page, BrowserContext) - """ - # Delegate to strategy - page, context = await self.strategy.get_page(crawlerRunConfig) - - # Sync sessions if needed - if hasattr(self.strategy, 'sessions'): - self.sessions = self.strategy.sessions - - return page, context - - async def get_pages(self, crawlerRunConfig: CrawlerRunConfig, count: int = 1) -> List[Tuple[Page, BrowserContext]]: - """Get multiple pages with the same configuration. 
- - This method efficiently creates multiple browser pages using the same configuration, - which is useful for parallel crawling of multiple URLs. - - Args: - crawlerRunConfig: Configuration for the pages - count: Number of pages to create - - Returns: - List of (Page, Context) tuples - """ - # Delegate to strategy - pages = await self.strategy.get_pages(crawlerRunConfig, count) - - # Sync sessions if needed - if hasattr(self.strategy, 'sessions'): - self.sessions = self.strategy.sessions - - return pages - - # Just for legacy compatibility - async def kill_session(self, session_id: str): - """Kill a browser session and clean up resources. - - Args: - session_id: The session ID to kill - """ - # Handle kill_session via our strategy if it supports it - await self.strategy.kill_session(session_id) - - # sync sessions if needed - if hasattr(self.strategy, 'sessions'): - self.sessions = self.strategy.sessions - - async def close(self): - """Close the browser and clean up resources.""" - # Delegate to strategy - await self.strategy.close() - - # Reset legacy references - self.browser = None - self.default_context = None - self.managed_browser = None - self.playwright = None - self.sessions = {} diff --git a/crawl4ai/browser/manager.py b/crawl4ai/browser/manager.py deleted file mode 100644 index 429d2516..00000000 --- a/crawl4ai/browser/manager.py +++ /dev/null @@ -1,853 +0,0 @@ -"""Browser manager module for Crawl4AI. - -This module provides a central browser management class that uses the -strategy pattern internally while maintaining the existing API. -It also implements browser pooling for improved performance. -""" - -import asyncio -import hashlib -import json -import math -from enum import Enum -from typing import Dict, List, Optional, Tuple, Any - -from playwright.async_api import Page, BrowserContext - -from ..async_logger import AsyncLogger -from ..async_configs import BrowserConfig, CrawlerRunConfig - -from .strategies import ( - BaseBrowserStrategy, - PlaywrightBrowserStrategy, - CDPBrowserStrategy, - BuiltinBrowserStrategy, - DockerBrowserStrategy -) - -class UnavailableBehavior(Enum): - """Behavior when no browser is available.""" - ON_DEMAND = "on_demand" # Create new browser on demand - PENDING = "pending" # Wait until a browser is available - EXCEPTION = "exception" # Raise an exception - - -class BrowserManager: - """Main interface for browser management and pooling in Crawl4AI. - - This class maintains backward compatibility with the existing implementation - while using the strategy pattern internally for different browser types. - It also implements browser pooling for improved performance. - - Attributes: - config (BrowserConfig): Default configuration object for browsers - logger (AsyncLogger): Logger instance for recording events and errors - browser_pool (Dict): Dictionary to store browser instances by configuration - browser_in_use (Dict): Dictionary to track which browsers are in use - request_queues (Dict): Queues for pending requests by configuration - unavailable_behavior (UnavailableBehavior): Behavior when no browser is available - """ - - def __init__( - self, - browser_config: Optional[BrowserConfig] = None, - logger: Optional[AsyncLogger] = None, - unavailable_behavior: UnavailableBehavior = UnavailableBehavior.EXCEPTION, - max_browsers_per_config: int = 10, - max_pages_per_browser: int = 5 - ): - """Initialize the BrowserManager with a browser configuration. 
- - Args: - browser_config: Configuration object containing all browser settings - logger: Logger instance for recording events and errors - unavailable_behavior: Behavior when no browser is available - max_browsers_per_config: Maximum number of browsers per configuration - max_pages_per_browser: Maximum number of pages per browser - """ - self.config = browser_config or BrowserConfig() - self.logger = logger - self.unavailable_behavior = unavailable_behavior - self.max_browsers_per_config = max_browsers_per_config - self.max_pages_per_browser = max_pages_per_browser - - # Browser pool management - self.browser_pool = {} # config_hash -> list of browser strategies - self.browser_in_use = {} # strategy instance -> Boolean - self.request_queues = {} # config_hash -> asyncio.Queue() - self._browser_locks = {} # config_hash -> asyncio.Lock() - self._browser_pool_lock = asyncio.Lock() # Global lock for pool modifications - - # Page pool management - self.page_pool = {} # (browser_config_hash, crawler_config_hash) -> list of (page, context, strategy) - self._page_pool_lock = asyncio.Lock() - - self.browser_page_counts = {} # strategy instance -> current page count - self._page_count_lock = asyncio.Lock() # Lock for thread-safe access to page counts - - # For session management (from existing implementation) - self.sessions = {} - self.session_ttl = 1800 # 30 minutes - - # For legacy compatibility - self.browser = None - self.default_context = None - self.managed_browser = None - self.playwright = None - self.strategy = None - - def _create_browser_config_hash(self, browser_config: BrowserConfig) -> str: - """Create a hash of the browser configuration for browser pooling. - - Args: - browser_config: Browser configuration - - Returns: - str: Hash of the browser configuration - """ - # Convert config to dictionary, excluding any callable objects - config_dict = browser_config.__dict__.copy() - for key in list(config_dict.keys()): - if callable(config_dict[key]): - del config_dict[key] - - # Convert to canonical JSON string - config_json = json.dumps(config_dict, sort_keys=True, default=str) - - # Hash the JSON - config_hash = hashlib.sha256(config_json.encode()).hexdigest() - return config_hash - - def _create_strategy(self, browser_config: BrowserConfig) -> BaseBrowserStrategy: - """Create appropriate browser strategy based on configuration. - - Args: - browser_config: Browser configuration - - Returns: - BaseBrowserStrategy: The selected browser strategy - """ - if browser_config.browser_mode == "builtin": - return BuiltinBrowserStrategy(browser_config, self.logger) - elif browser_config.browser_mode == "docker": - if DockerBrowserStrategy is None: - if self.logger: - self.logger.error( - "Docker browser strategy requested but not available. " - "Falling back to PlaywrightBrowserStrategy.", - tag="BROWSER" - ) - return PlaywrightBrowserStrategy(browser_config, self.logger) - return DockerBrowserStrategy(browser_config, self.logger) - elif browser_config.browser_mode == "cdp" or browser_config.cdp_url or browser_config.use_managed_browser: - return CDPBrowserStrategy(browser_config, self.logger) - else: - return PlaywrightBrowserStrategy(browser_config, self.logger) - - async def initialize_pool( - self, - browser_configs: List[BrowserConfig] = None, - browsers_per_config: int = 1, - page_configs: Optional[List[Tuple[BrowserConfig, CrawlerRunConfig, int]]] = None - ): - """Initialize the browser pool with multiple browser configurations. 
- - Args: - browser_configs: List of browser configurations to initialize - browsers_per_config: Number of browser instances per configuration - page_configs: Optional list of (browser_config, crawler_run_config, count) tuples - for pre-warming pages - - Returns: - self: For method chaining - """ - if not browser_configs: - browser_configs = [self.config] - - # Calculate how many browsers we'll need based on page_configs - browsers_needed = {} - if page_configs: - for browser_config, _, page_count in page_configs: - config_hash = self._create_browser_config_hash(browser_config) - # Calculate browsers based on max_pages_per_browser - browsers_needed_for_config = math.ceil(page_count / self.max_pages_per_browser) - browsers_needed[config_hash] = max( - browsers_needed.get(config_hash, 0), - browsers_needed_for_config - ) - - # Adjust browsers_per_config if needed to ensure enough capacity - config_browsers_needed = {} - for browser_config in browser_configs: - config_hash = self._create_browser_config_hash(browser_config) - - # Estimate browsers needed based on page requirements - browsers_for_config = browsers_per_config - if config_hash in browsers_needed: - browsers_for_config = max(browsers_for_config, browsers_needed[config_hash]) - - config_browsers_needed[config_hash] = browsers_for_config - - # Update max_browsers_per_config if needed - if browsers_for_config > self.max_browsers_per_config: - self.max_browsers_per_config = browsers_for_config - if self.logger: - self.logger.info( - f"Increased max_browsers_per_config to {browsers_for_config} to accommodate page requirements", - tag="POOL" - ) - - # Initialize locks and queues for each config - async with self._browser_pool_lock: - for browser_config in browser_configs: - config_hash = self._create_browser_config_hash(browser_config) - - # Initialize lock for this config if needed - if config_hash not in self._browser_locks: - self._browser_locks[config_hash] = asyncio.Lock() - - # Initialize queue for this config if needed - if config_hash not in self.request_queues: - self.request_queues[config_hash] = asyncio.Queue() - - # Initialize pool for this config if needed - if config_hash not in self.browser_pool: - self.browser_pool[config_hash] = [] - - # Create browser instances for each configuration in parallel - browser_tasks = [] - - for browser_config in browser_configs: - config_hash = self._create_browser_config_hash(browser_config) - browsers_to_create = config_browsers_needed.get( - config_hash, - browsers_per_config - ) - len(self.browser_pool.get(config_hash, [])) - - if browsers_to_create <= 0: - continue - - for _ in range(browsers_to_create): - # Create a task for each browser initialization - task = self._create_and_add_browser(browser_config, config_hash) - browser_tasks.append(task) - - # Wait for all browser initializations to complete - if browser_tasks: - if self.logger: - self.logger.info(f"Initializing {len(browser_tasks)} browsers in parallel...", tag="POOL") - await asyncio.gather(*browser_tasks) - - # Pre-warm pages if requested - if page_configs: - page_tasks = [] - for browser_config, crawler_run_config, count in page_configs: - task = self._prewarm_pages(browser_config, crawler_run_config, count) - page_tasks.append(task) - - if page_tasks: - if self.logger: - self.logger.info(f"Pre-warming pages with {len(page_tasks)} configurations...", tag="POOL") - await asyncio.gather(*page_tasks) - - # Update legacy references - if self.browser_pool and next(iter(self.browser_pool.values()), []): - strategy = 
next(iter(self.browser_pool.values()))[0] - self.strategy = strategy - self.browser = strategy.browser - self.default_context = strategy.default_context - self.playwright = strategy.playwright - - return self - - async def _create_and_add_browser(self, browser_config: BrowserConfig, config_hash: str): - """Create and add a browser to the pool. - - Args: - browser_config: Browser configuration - config_hash: Hash of the configuration - """ - try: - strategy = self._create_strategy(browser_config) - await strategy.start() - - async with self._browser_pool_lock: - if config_hash not in self.browser_pool: - self.browser_pool[config_hash] = [] - self.browser_pool[config_hash].append(strategy) - self.browser_in_use[strategy] = False - - if self.logger: - self.logger.debug( - f"Added browser to pool: {browser_config.browser_type} " - f"({browser_config.browser_mode})", - tag="POOL" - ) - except Exception as e: - if self.logger: - self.logger.error( - f"Failed to create browser: {str(e)}", - tag="POOL" - ) - raise - - def _make_config_signature(self, crawlerRunConfig: CrawlerRunConfig) -> str: - """Create a signature hash from crawler configuration. - - Args: - crawlerRunConfig: Crawler run configuration - - Returns: - str: Hash of the crawler configuration - """ - config_dict = crawlerRunConfig.__dict__.copy() - # Exclude items that do not affect page creation - ephemeral_keys = [ - "session_id", - "js_code", - "scraping_strategy", - "extraction_strategy", - "chunking_strategy", - "cache_mode", - "content_filter", - "semaphore_count", - "url" - ] - for key in ephemeral_keys: - if key in config_dict: - del config_dict[key] - - # Convert to canonical JSON string - config_json = json.dumps(config_dict, sort_keys=True, default=str) - - # Hash the JSON - config_hash = hashlib.sha256(config_json.encode("utf-8")).hexdigest() - return config_hash - - async def _prewarm_pages( - self, - browser_config: BrowserConfig, - crawler_run_config: CrawlerRunConfig, - count: int - ): - """Pre-warm pages for a specific configuration. 
- - Args: - browser_config: Browser configuration - crawler_run_config: Crawler run configuration - count: Number of pages to pre-warm - """ - try: - # Create individual page tasks and run them in parallel - browser_config_hash = self._create_browser_config_hash(browser_config) - crawler_config_hash = self._make_config_signature(crawler_run_config) - async def get_single_page(): - strategy = await self.get_available_browser(browser_config) - try: - page, context = await strategy.get_page(crawler_run_config) - # Store config hashes on the page object for later retrieval - setattr(page, "_browser_config_hash", browser_config_hash) - setattr(page, "_crawler_config_hash", crawler_config_hash) - return page, context, strategy - except Exception as e: - # Release the browser back to the pool - await self.release_browser(strategy, browser_config) - raise e - - # Create tasks for parallel execution - page_tasks = [get_single_page() for _ in range(count)] - - # Execute all page creation tasks in parallel - pages_contexts_strategies = await asyncio.gather(*page_tasks) - - # Add pages to the page pool - browser_config_hash = self._create_browser_config_hash(browser_config) - crawler_config_hash = self._make_config_signature(crawler_run_config) - pool_key = (browser_config_hash, crawler_config_hash) - - async with self._page_pool_lock: - if pool_key not in self.page_pool: - self.page_pool[pool_key] = [] - - # Add all pages to the pool - self.page_pool[pool_key].extend(pages_contexts_strategies) - - if self.logger: - self.logger.debug( - f"Pre-warmed {count} pages in parallel with config {crawler_run_config}", - tag="POOL" - ) - except Exception as e: - if self.logger: - self.logger.error( - f"Failed to pre-warm pages: {str(e)}", - tag="POOL" - ) - raise - - async def get_available_browser( - self, - browser_config: Optional[BrowserConfig] = None - ) -> BaseBrowserStrategy: - """Get an available browser from the pool for the given configuration. 
- - Args: - browser_config: Browser configuration to match - - Returns: - BaseBrowserStrategy: An available browser strategy - - Raises: - Exception: If no browser is available and behavior is EXCEPTION - """ - browser_config = browser_config or self.config - config_hash = self._create_browser_config_hash(browser_config) - - async with self._browser_locks.get(config_hash, asyncio.Lock()): - # Check if we have browsers for this config - if config_hash not in self.browser_pool or not self.browser_pool[config_hash]: - if self.unavailable_behavior == UnavailableBehavior.ON_DEMAND: - # Create a new browser on demand - if self.logger: - self.logger.info( - f"1> Creating new browser on demand for config {config_hash[:8]}", - tag="POOL" - ) - - # Initialize pool for this config if needed - async with self._browser_pool_lock: - if config_hash not in self.browser_pool: - self.browser_pool[config_hash] = [] - - strategy = self._create_strategy(browser_config) - await strategy.start() - - self.browser_pool[config_hash].append(strategy) - self.browser_in_use[strategy] = False - - elif self.unavailable_behavior == UnavailableBehavior.EXCEPTION: - raise Exception(f"No browsers available for configuration {config_hash[:8]}") - - # Check for an available browser with capacity in the pool - for strategy in self.browser_pool[config_hash]: - # Check if this browser has capacity for more pages - async with self._page_count_lock: - current_pages = self.browser_page_counts.get(strategy, 0) - - if current_pages < self.max_pages_per_browser: - # Increment the page count - self.browser_page_counts[strategy] = current_pages + 1 - - self.browser_in_use[strategy] = True - - # Get browser information for better logging - browser_type = getattr(strategy.config, 'browser_type', 'unknown') - browser_mode = getattr(strategy.config, 'browser_mode', 'unknown') - strategy_id = id(strategy) # Use object ID as a unique identifier - - if self.logger: - self.logger.debug( - f"Selected browser #{strategy_id} ({browser_type}/{browser_mode}) - " - f"pages: {current_pages+1}/{self.max_pages_per_browser}", - tag="POOL" - ) - - return strategy - - # All browsers are at capacity or in use - if self.unavailable_behavior == UnavailableBehavior.ON_DEMAND: - # Check if we've reached the maximum number of browsers - if len(self.browser_pool[config_hash]) >= self.max_browsers_per_config: - if self.logger: - self.logger.warning( - f"Maximum browsers reached for config {config_hash[:8]} and all at page capacity", - tag="POOL" - ) - if self.unavailable_behavior == UnavailableBehavior.EXCEPTION: - raise Exception("Maximum browsers reached and all at page capacity") - - # Create a new browser on demand - if self.logger: - self.logger.info( - f"2> Creating new browser on demand for config {config_hash[:8]}", - tag="POOL" - ) - - strategy = self._create_strategy(browser_config) - await strategy.start() - - async with self._browser_pool_lock: - self.browser_pool[config_hash].append(strategy) - self.browser_in_use[strategy] = True - - return strategy - - # If we get here, either behavior is EXCEPTION or PENDING - if self.unavailable_behavior == UnavailableBehavior.EXCEPTION: - raise Exception(f"All browsers in use or at page capacity for configuration {config_hash[:8]}") - - # For PENDING behavior, set up waiting mechanism - if config_hash not in self.request_queues: - self.request_queues[config_hash] = asyncio.Queue() - - # Create a future to wait on - future = asyncio.Future() - await self.request_queues[config_hash].put(future) - - if self.logger: 
- self.logger.debug( - f"Waiting for available browser for config {config_hash[:8]}", - tag="POOL" - ) - - # Wait for a browser to become available - strategy = await future - return strategy - - async def get_page( - self, - crawlerRunConfig: CrawlerRunConfig, - browser_config: Optional[BrowserConfig] = None - ) -> Tuple[Page, BrowserContext, BaseBrowserStrategy]: - """Get a page from the browser pool.""" - browser_config = browser_config or self.config - - # Check if we have a pre-warmed page available - browser_config_hash = self._create_browser_config_hash(browser_config) - crawler_config_hash = self._make_config_signature(crawlerRunConfig) - pool_key = (browser_config_hash, crawler_config_hash) - - # Try to get a page from the pool - async with self._page_pool_lock: - if pool_key in self.page_pool and self.page_pool[pool_key]: - # Get a page from the pool - page, context, strategy = self.page_pool[pool_key].pop() - - # Mark browser as in use (it already is, but ensure consistency) - self.browser_in_use[strategy] = True - - if self.logger: - self.logger.debug( - f"Using pre-warmed page for config {crawler_config_hash[:8]}", - tag="POOL" - ) - - # Note: We don't increment page count since it was already counted when created - - return page, context, strategy - - # No pre-warmed page available, create a new one - # get_available_browser already increments the page count - strategy = await self.get_available_browser(browser_config) - - try: - # Get a page from the browser - page, context = await strategy.get_page(crawlerRunConfig) - - # Store config hashes on the page object for later retrieval - setattr(page, "_browser_config_hash", browser_config_hash) - setattr(page, "_crawler_config_hash", crawler_config_hash) - - return page, context, strategy - except Exception as e: - # Release the browser back to the pool and decrement the page count - await self.release_browser(strategy, browser_config, decrement_page_count=True) - raise e - - async def release_page( - self, - page: Page, - strategy: BaseBrowserStrategy, - browser_config: Optional[BrowserConfig] = None, - keep_alive: bool = True, - return_to_pool: bool = True - ): - """Release a page back to the pool.""" - browser_config = browser_config or self.config - - page_url = page.url if page else None - - # If not keeping the page alive, close it and decrement count - if not keep_alive: - try: - await page.close() - except Exception as e: - if self.logger: - self.logger.error( - f"Error closing page: {str(e)}", - tag="POOL" - ) - # Release the browser with page count decrement - await self.release_browser(strategy, browser_config, decrement_page_count=True) - return - - # If returning to pool - if return_to_pool: - # Get the configuration hashes from the page object - browser_config_hash = getattr(page, "_browser_config_hash", None) - crawler_config_hash = getattr(page, "_crawler_config_hash", None) - - if browser_config_hash and crawler_config_hash: - pool_key = (browser_config_hash, crawler_config_hash) - - async with self._page_pool_lock: - if pool_key not in self.page_pool: - self.page_pool[pool_key] = [] - - # Add page back to the pool - self.page_pool[pool_key].append((page, page.context, strategy)) - - if self.logger: - self.logger.debug( - f"Returned page to pool for config {crawler_config_hash[:8]}, url: {page_url}", - tag="POOL" - ) - - # Note: We don't decrement the page count here since the page is still "in use" - # from the browser's perspective, just in our pool - return - else: - # If we can't identify the configuration, 
log a warning - if self.logger: - self.logger.warning( - "Cannot return page to pool - missing configuration hashes", - tag="POOL" - ) - - # If we got here, we couldn't return to pool, so just release the browser - await self.release_browser(strategy, browser_config, decrement_page_count=True) - - async def release_browser( - self, - strategy: BaseBrowserStrategy, - browser_config: Optional[BrowserConfig] = None, - decrement_page_count: bool = True - ): - """Release a browser back to the pool.""" - browser_config = browser_config or self.config - config_hash = self._create_browser_config_hash(browser_config) - - # Decrement page count - if decrement_page_count: - async with self._page_count_lock: - current_count = self.browser_page_counts.get(strategy, 1) - self.browser_page_counts[strategy] = max(0, current_count - 1) - - if self.logger: - self.logger.debug( - f"Decremented page count for browser (now: {self.browser_page_counts[strategy]})", - tag="POOL" - ) - - # Mark as not in use - self.browser_in_use[strategy] = False - - # Process any waiting requests - if config_hash in self.request_queues and not self.request_queues[config_hash].empty(): - future = await self.request_queues[config_hash].get() - if not future.done(): - future.set_result(strategy) - - async def get_pages( - self, - crawlerRunConfig: CrawlerRunConfig, - count: int = 1, - browser_config: Optional[BrowserConfig] = None - ) -> List[Tuple[Page, BrowserContext, BaseBrowserStrategy]]: - """Get multiple pages from the browser pool. - - Args: - crawlerRunConfig: Configuration for the crawler run - count: Number of pages to get - browser_config: Browser configuration to use - - Returns: - List of (Page, Context, Strategy) tuples - """ - results = [] - for _ in range(count): - try: - result = await self.get_page(crawlerRunConfig, browser_config) - results.append(result) - except Exception as e: - # Release any pages we've already gotten - for page, _, strategy in results: - await self.release_page(page, strategy, browser_config) - raise e - - return results - - async def get_page_pool_status(self) -> Dict[str, Any]: - """Get information about the page pool status. - - Returns: - Dict with page pool status information - """ - status = { - "total_pooled_pages": 0, - "configs": {} - } - - async with self._page_pool_lock: - for (browser_hash, crawler_hash), pages in self.page_pool.items(): - config_key = f"{browser_hash[:8]}_{crawler_hash[:8]}" - status["configs"][config_key] = len(pages) - status["total_pooled_pages"] += len(pages) - - if self.logger: - self.logger.debug( - f"Page pool status: {status['total_pooled_pages']} pages available", - tag="POOL" - ) - - return status - - async def get_pool_status(self) -> Dict[str, Any]: - """Get information about the browser pool status. 
- - Returns: - Dict with pool status information - """ - status = { - "total_browsers": 0, - "browsers_in_use": 0, - "total_pages": 0, - "configs": {} - } - - for config_hash, strategies in self.browser_pool.items(): - config_pages = 0 - in_use = 0 - - for strategy in strategies: - is_in_use = self.browser_in_use.get(strategy, False) - if is_in_use: - in_use += 1 - - # Get page count for this browser - try: - page_count = len(await strategy.get_opened_pages()) - config_pages += page_count - except Exception as e: - if self.logger: - self.logger.error(f"Error getting page count: {str(e)}", tag="POOL") - - config_status = { - "total_browsers": len(strategies), - "browsers_in_use": in_use, - "pages_open": config_pages, - "waiting_requests": self.request_queues.get(config_hash, asyncio.Queue()).qsize(), - "max_capacity": len(strategies) * self.max_pages_per_browser, - "utilization_pct": round((config_pages / (len(strategies) * self.max_pages_per_browser)) * 100, 1) - if strategies else 0 - } - - status["configs"][config_hash] = config_status - status["total_browsers"] += config_status["total_browsers"] - status["browsers_in_use"] += config_status["browsers_in_use"] - status["total_pages"] += config_pages - - # Add overall utilization - if status["total_browsers"] > 0: - max_capacity = status["total_browsers"] * self.max_pages_per_browser - status["overall_utilization_pct"] = round((status["total_pages"] / max_capacity) * 100, 1) - else: - status["overall_utilization_pct"] = 0 - - return status - - async def start(self): - """Start at least one browser instance in the pool. - - This method is kept for backward compatibility. - - Returns: - self: For method chaining - """ - await self.initialize_pool([self.config], 1) - return self - - async def kill_session(self, session_id: str): - """Kill a browser session and clean up resources. - - Delegated to the strategy. This method is kept for backward compatibility. - - Args: - session_id: The session ID to kill - """ - if not self.strategy: - return - - await self.strategy.kill_session(session_id) - - # Sync sessions - if hasattr(self.strategy, 'sessions'): - self.sessions = self.strategy.sessions - - async def close(self): - """Close all browsers in the pool and clean up resources.""" - # Close all browsers in the pool - for strategies in self.browser_pool.values(): - for strategy in strategies: - try: - await strategy.close() - except Exception as e: - if self.logger: - self.logger.error( - f"Error closing browser: {str(e)}", - tag="POOL" - ) - - # Clear pool data - self.browser_pool = {} - self.browser_in_use = {} - - # Reset legacy references - self.browser = None - self.default_context = None - self.managed_browser = None - self.playwright = None - self.strategy = None - self.sessions = {} - - -async def create_browser_manager( - browser_config: Optional[BrowserConfig] = None, - logger: Optional[AsyncLogger] = None, - unavailable_behavior: UnavailableBehavior = UnavailableBehavior.EXCEPTION, - max_browsers_per_config: int = 10, - initial_pool_size: int = 1, - page_configs: Optional[List[Tuple[BrowserConfig, CrawlerRunConfig, int]]] = None -) -> BrowserManager: - """Factory function to create and initialize a BrowserManager. 
- - Args: - browser_config: Configuration for the browsers - logger: Logger for recording events - unavailable_behavior: Behavior when no browser is available - max_browsers_per_config: Maximum browsers per configuration - initial_pool_size: Initial number of browsers per configuration - page_configs: Optional configurations for pre-warming pages - - Returns: - Initialized BrowserManager - """ - manager = BrowserManager( - browser_config=browser_config, - logger=logger, - unavailable_behavior=unavailable_behavior, - max_browsers_per_config=max_browsers_per_config - ) - - await manager.initialize_pool( - [browser_config] if browser_config else None, - initial_pool_size, - page_configs - ) - - return manager - - - - - diff --git a/crawl4ai/browser/models.py b/crawl4ai/browser/models.py deleted file mode 100644 index e2ac2b3f..00000000 --- a/crawl4ai/browser/models.py +++ /dev/null @@ -1,143 +0,0 @@ -"""Docker configuration module for Crawl4AI browser automation. - -This module provides configuration classes for Docker-based browser automation, -allowing flexible configuration of Docker containers for browsing. -""" - -from typing import Dict, List, Optional - - -class DockerConfig: - """Configuration for Docker-based browser automation. - - This class contains Docker-specific settings to avoid cluttering BrowserConfig. - - Attributes: - mode (str): Docker operation mode - "connect" or "launch". - - "connect": Uses a container with Chrome already running - - "launch": Dynamically configures and starts Chrome in container - image (str): Docker image to use. If None, defaults from DockerUtils are used. - registry_file (str): Path to container registry file for persistence. - persistent (bool): Keep container running after browser closes. - remove_on_exit (bool): Remove container on exit when not persistent. - network (str): Docker network to use. - volumes (List[str]): Volume mappings (e.g., ["host_path:container_path"]). - env_vars (Dict[str, str]): Environment variables to set in container. - extra_args (List[str]): Additional docker run arguments. - host_port (int): Host port to map to container's 9223 port. - user_data_dir (str): Path to user data directory on host. - container_user_data_dir (str): Path to user data directory in container. - """ - - def __init__( - self, - mode: str = "connect", # "connect" or "launch" - image: Optional[str] = None, # Docker image to use - registry_file: Optional[str] = None, # Path to registry file - persistent: bool = False, # Keep container running after browser closes - remove_on_exit: bool = True, # Remove container on exit when not persistent - network: Optional[str] = None, # Docker network to use - volumes: List[str] = None, # Volume mappings - cpu_limit: float = 1.0, # CPU limit for the container - memory_limit: str = "1.5g", # Memory limit for the container - env_vars: Dict[str, str] = None, # Environment variables - host_port: Optional[int] = None, # Host port to map to container's 9223 - user_data_dir: Optional[str] = None, # Path to user data directory on host - container_user_data_dir: str = "/data", # Path to user data directory in container - extra_args: List[str] = None, # Additional docker run arguments - ): - """Initialize Docker configuration. 
- - Args: - mode: Docker operation mode ("connect" or "launch") - image: Docker image to use - registry_file: Path to container registry file - persistent: Whether to keep container running after browser closes - remove_on_exit: Whether to remove container on exit when not persistent - network: Docker network to use - volumes: Volume mappings as list of strings - cpu_limit: CPU limit for the container - memory_limit: Memory limit for the container - env_vars: Environment variables as dictionary - extra_args: Additional docker run arguments - host_port: Host port to map to container's 9223 - user_data_dir: Path to user data directory on host - container_user_data_dir: Path to user data directory in container - """ - self.mode = mode - self.image = image # If None, defaults will be used from DockerUtils - self.registry_file = registry_file - self.persistent = persistent - self.remove_on_exit = remove_on_exit - self.network = network - self.volumes = volumes or [] - self.cpu_limit = cpu_limit - self.memory_limit = memory_limit - self.env_vars = env_vars or {} - self.extra_args = extra_args or [] - self.host_port = host_port - self.user_data_dir = user_data_dir - self.container_user_data_dir = container_user_data_dir - - def to_dict(self) -> Dict: - """Convert this configuration to a dictionary. - - Returns: - Dictionary representation of this configuration - """ - return { - "mode": self.mode, - "image": self.image, - "registry_file": self.registry_file, - "persistent": self.persistent, - "remove_on_exit": self.remove_on_exit, - "network": self.network, - "volumes": self.volumes, - "cpu_limit": self.cpu_limit, - "memory_limit": self.memory_limit, - "env_vars": self.env_vars, - "extra_args": self.extra_args, - "host_port": self.host_port, - "user_data_dir": self.user_data_dir, - "container_user_data_dir": self.container_user_data_dir - } - - @staticmethod - def from_kwargs(kwargs: Dict) -> "DockerConfig": - """Create a DockerConfig from a dictionary of keyword arguments. - - Args: - kwargs: Dictionary of configuration options - - Returns: - New DockerConfig instance - """ - return DockerConfig( - mode=kwargs.get("mode", "connect"), - image=kwargs.get("image"), - registry_file=kwargs.get("registry_file"), - persistent=kwargs.get("persistent", False), - remove_on_exit=kwargs.get("remove_on_exit", True), - network=kwargs.get("network"), - volumes=kwargs.get("volumes"), - cpu_limit=kwargs.get("cpu_limit", 1.0), - memory_limit=kwargs.get("memory_limit", "1.5g"), - env_vars=kwargs.get("env_vars"), - extra_args=kwargs.get("extra_args"), - host_port=kwargs.get("host_port"), - user_data_dir=kwargs.get("user_data_dir"), - container_user_data_dir=kwargs.get("container_user_data_dir", "/data") - ) - - def clone(self, **kwargs) -> "DockerConfig": - """Create a copy of this configuration with updated values. - - Args: - **kwargs: Key-value pairs of configuration options to update - - Returns: - DockerConfig: A new instance with the specified updates - """ - config_dict = self.to_dict() - config_dict.update(kwargs) - return DockerConfig.from_kwargs(config_dict) \ No newline at end of file diff --git a/crawl4ai/browser/profiles.py b/crawl4ai/browser/profiles.py deleted file mode 100644 index afd0d78a..00000000 --- a/crawl4ai/browser/profiles.py +++ /dev/null @@ -1,457 +0,0 @@ -"""Browser profile management module for Crawl4AI. - -This module provides functionality for creating and managing browser profiles -that can be used for authenticated browsing. 
-""" - -import os -import asyncio -import signal -import sys -import datetime -import uuid -import shutil -from typing import List, Dict, Optional, Any -from colorama import Fore, Style, init - -from ..async_configs import BrowserConfig -from ..async_logger import AsyncLogger, AsyncLoggerBase -from ..utils import get_home_folder - -class BrowserProfileManager: - """Manages browser profiles for Crawl4AI. - - This class provides functionality to create and manage browser profiles - that can be used for authenticated browsing with Crawl4AI. - - Profiles are stored by default in ~/.crawl4ai/profiles/ - """ - - def __init__(self, logger: Optional[AsyncLoggerBase] = None): - """Initialize the BrowserProfileManager. - - Args: - logger: Logger for outputting messages. If None, a default AsyncLogger is created. - """ - # Initialize colorama for colorful terminal output - init() - - # Create a logger if not provided - if logger is None: - self.logger = AsyncLogger(verbose=True) - elif not isinstance(logger, AsyncLoggerBase): - self.logger = AsyncLogger(verbose=True) - else: - self.logger = logger - - # Ensure profiles directory exists - self.profiles_dir = os.path.join(get_home_folder(), "profiles") - os.makedirs(self.profiles_dir, exist_ok=True) - - async def create_profile(self, - profile_name: Optional[str] = None, - browser_config: Optional[BrowserConfig] = None) -> Optional[str]: - """Create a browser profile interactively. - - Args: - profile_name: Name for the profile. If None, a name is generated. - browser_config: Configuration for the browser. If None, a default configuration is used. - - Returns: - Path to the created profile directory, or None if creation failed - """ - # Create default browser config if none provided - if browser_config is None: - browser_config = BrowserConfig( - browser_type="chromium", - headless=False, # Must be visible for user interaction - verbose=True - ) - else: - # Ensure headless is False for user interaction - browser_config.headless = False - - # Generate profile name if not provided - if not profile_name: - timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") - profile_name = f"profile_{timestamp}_{uuid.uuid4().hex[:6]}" - - # Sanitize profile name (replace spaces and special chars) - profile_name = "".join(c if c.isalnum() or c in "-_" else "_" for c in profile_name) - - # Set user data directory - profile_path = os.path.join(self.profiles_dir, profile_name) - os.makedirs(profile_path, exist_ok=True) - - # Print instructions for the user with colorama formatting - border = f"{Fore.CYAN}{'='*80}{Style.RESET_ALL}" - self.logger.info(f"\n{border}", tag="PROFILE") - self.logger.info(f"Creating browser profile: {Fore.GREEN}{profile_name}{Style.RESET_ALL}", tag="PROFILE") - self.logger.info(f"Profile directory: {Fore.YELLOW}{profile_path}{Style.RESET_ALL}", tag="PROFILE") - - self.logger.info("\nInstructions:", tag="PROFILE") - self.logger.info("1. A browser window will open for you to set up your profile.", tag="PROFILE") - self.logger.info(f"2. {Fore.CYAN}Log in to websites{Style.RESET_ALL}, configure settings, etc. as needed.", tag="PROFILE") - self.logger.info(f"3. When you're done, {Fore.YELLOW}press 'q' in this terminal{Style.RESET_ALL} to close the browser.", tag="PROFILE") - self.logger.info("4. 
The profile will be saved and ready to use with Crawl4AI.", tag="PROFILE") - self.logger.info(f"{border}\n", tag="PROFILE") - - # Import the necessary classes with local imports to avoid circular references - from .strategies import CDPBrowserStrategy - - # Set browser config to use the profile path - browser_config.user_data_dir = profile_path - - # Create a CDP browser strategy for the profile creation - browser_strategy = CDPBrowserStrategy(browser_config, self.logger) - - # Set up signal handlers to ensure cleanup on interrupt - original_sigint = signal.getsignal(signal.SIGINT) - original_sigterm = signal.getsignal(signal.SIGTERM) - - # Define cleanup handler for signals - async def cleanup_handler(sig, frame): - self.logger.warning("\nCleaning up browser process...", tag="PROFILE") - await browser_strategy.close() - # Restore original signal handlers - signal.signal(signal.SIGINT, original_sigint) - signal.signal(signal.SIGTERM, original_sigterm) - if sig == signal.SIGINT: - self.logger.error("Profile creation interrupted. Profile may be incomplete.", tag="PROFILE") - sys.exit(1) - - # Set signal handlers - def sigint_handler(sig, frame): - asyncio.create_task(cleanup_handler(sig, frame)) - - signal.signal(signal.SIGINT, sigint_handler) - signal.signal(signal.SIGTERM, sigint_handler) - - # Event to signal when user is done with the browser - user_done_event = asyncio.Event() - - # Run keyboard input loop in a separate task - async def listen_for_quit_command(): - import termios - import tty - import select - - # First output the prompt - self.logger.info(f"{Fore.CYAN}Press '{Fore.WHITE}q{Fore.CYAN}' when you've finished using the browser...{Style.RESET_ALL}", tag="PROFILE") - - # Save original terminal settings - fd = sys.stdin.fileno() - old_settings = termios.tcgetattr(fd) - - try: - # Switch to non-canonical mode (no line buffering) - tty.setcbreak(fd) - - while True: - # Check if input is available (non-blocking) - readable, _, _ = select.select([sys.stdin], [], [], 0.5) - if readable: - key = sys.stdin.read(1) - if key.lower() == 'q': - self.logger.info(f"{Fore.GREEN}Closing browser and saving profile...{Style.RESET_ALL}", tag="PROFILE") - user_done_event.set() - return - - # Check if the browser process has already exited - if browser_strategy.browser_process and browser_strategy.browser_process.poll() is not None: - self.logger.info("Browser already closed. Ending input listener.", tag="PROFILE") - user_done_event.set() - return - - await asyncio.sleep(0.1) - - finally: - # Restore terminal settings - termios.tcsetattr(fd, termios.TCSADRAIN, old_settings) - - try: - # Start the browser - await browser_strategy.start() - - # Check if browser started successfully - if not browser_strategy.browser_process: - self.logger.error("Failed to start browser process.", tag="PROFILE") - return None - - self.logger.info(f"Browser launched. 
{Fore.CYAN}Waiting for you to finish...{Style.RESET_ALL}", tag="PROFILE") - - # Start listening for keyboard input - listener_task = asyncio.create_task(listen_for_quit_command()) - - # Wait for either the user to press 'q' or for the browser process to exit naturally - while not user_done_event.is_set() and browser_strategy.browser_process.poll() is None: - await asyncio.sleep(0.5) - - # Cancel the listener task if it's still running - if not listener_task.done(): - listener_task.cancel() - try: - await listener_task - except asyncio.CancelledError: - pass - - # If the browser is still running and the user pressed 'q', terminate it - if browser_strategy.browser_process.poll() is None and user_done_event.is_set(): - self.logger.info("Terminating browser process...", tag="PROFILE") - await browser_strategy.close() - - self.logger.success(f"Browser closed. Profile saved at: {Fore.GREEN}{profile_path}{Style.RESET_ALL}", tag="PROFILE") - - except Exception as e: - self.logger.error(f"Error creating profile: {str(e)}", tag="PROFILE") - await browser_strategy.close() - return None - finally: - # Restore original signal handlers - signal.signal(signal.SIGINT, original_sigint) - signal.signal(signal.SIGTERM, original_sigterm) - - # Make sure browser is fully cleaned up - await browser_strategy.close() - - # Return the profile path - return profile_path - - def list_profiles(self) -> List[Dict[str, Any]]: - """List all available browser profiles. - - Returns: - List of dictionaries containing profile information - """ - if not os.path.exists(self.profiles_dir): - return [] - - profiles = [] - - for name in os.listdir(self.profiles_dir): - profile_path = os.path.join(self.profiles_dir, name) - - # Skip if not a directory - if not os.path.isdir(profile_path): - continue - - # Check if this looks like a valid browser profile - # For Chromium: Look for Preferences file - # For Firefox: Look for prefs.js file - is_valid = False - - if os.path.exists(os.path.join(profile_path, "Preferences")) or \ - os.path.exists(os.path.join(profile_path, "Default", "Preferences")): - is_valid = "chromium" - elif os.path.exists(os.path.join(profile_path, "prefs.js")): - is_valid = "firefox" - - if is_valid: - # Get creation time - created = datetime.datetime.fromtimestamp( - os.path.getctime(profile_path) - ) - - profiles.append({ - "name": name, - "path": profile_path, - "created": created, - "type": is_valid - }) - - # Sort by creation time, newest first - profiles.sort(key=lambda x: x["created"], reverse=True) - - return profiles - - def get_profile_path(self, profile_name: str) -> Optional[str]: - """Get the full path to a profile by name. - - Args: - profile_name: Name of the profile (not the full path) - - Returns: - Full path to the profile directory, or None if not found - """ - profile_path = os.path.join(self.profiles_dir, profile_name) - - # Check if path exists and is a valid profile - if not os.path.isdir(profile_path): - # Check if profile_name itself is full path - if os.path.isabs(profile_name): - profile_path = profile_name - else: - return None - - # Look for profile indicators - is_profile = ( - os.path.exists(os.path.join(profile_path, "Preferences")) or - os.path.exists(os.path.join(profile_path, "Default", "Preferences")) or - os.path.exists(os.path.join(profile_path, "prefs.js")) - ) - - if not is_profile: - return None # Not a valid browser profile - - return profile_path - - def delete_profile(self, profile_name_or_path: str) -> bool: - """Delete a browser profile by name or path. 
- - Args: - profile_name_or_path: Name of the profile or full path to profile directory - - Returns: - True if the profile was deleted successfully, False otherwise - """ - # Determine if input is a name or a path - if os.path.isabs(profile_name_or_path): - # Full path provided - profile_path = profile_name_or_path - else: - # Just a name provided, construct path - profile_path = os.path.join(self.profiles_dir, profile_name_or_path) - - # Check if path exists and is a valid profile - if not os.path.isdir(profile_path): - return False - - # Look for profile indicators - is_profile = ( - os.path.exists(os.path.join(profile_path, "Preferences")) or - os.path.exists(os.path.join(profile_path, "Default", "Preferences")) or - os.path.exists(os.path.join(profile_path, "prefs.js")) - ) - - if not is_profile: - return False # Not a valid browser profile - - # Delete the profile directory - try: - shutil.rmtree(profile_path) - return True - except Exception: - return False - - async def interactive_manager(self, crawl_callback=None): - """Launch an interactive profile management console. - - Args: - crawl_callback: Function to call when selecting option to use - a profile for crawling. It will be called with (profile_path, url). - """ - while True: - self.logger.info(f"\n{Fore.CYAN}Profile Management Options:{Style.RESET_ALL}", tag="MENU") - self.logger.info(f"1. {Fore.GREEN}Create a new profile{Style.RESET_ALL}", tag="MENU") - self.logger.info(f"2. {Fore.YELLOW}List available profiles{Style.RESET_ALL}", tag="MENU") - self.logger.info(f"3. {Fore.RED}Delete a profile{Style.RESET_ALL}", tag="MENU") - - # Only show crawl option if callback provided - if crawl_callback: - self.logger.info(f"4. {Fore.CYAN}Use a profile to crawl a website{Style.RESET_ALL}", tag="MENU") - self.logger.info(f"5. {Fore.MAGENTA}Exit{Style.RESET_ALL}", tag="MENU") - exit_option = "5" - else: - self.logger.info(f"4. {Fore.MAGENTA}Exit{Style.RESET_ALL}", tag="MENU") - exit_option = "4" - - choice = input(f"\n{Fore.CYAN}Enter your choice (1-{exit_option}): {Style.RESET_ALL}") - - if choice == "1": - # Create new profile - name = input(f"{Fore.GREEN}Enter a name for the new profile (or press Enter for auto-generated name): {Style.RESET_ALL}") - await self.create_profile(name or None) - - elif choice == "2": - # List profiles - profiles = self.list_profiles() - - if not profiles: - self.logger.warning(" No profiles found. 
Create one first with option 1.", tag="PROFILES") - continue - - # Print profile information with colorama formatting - self.logger.info("\nAvailable profiles:", tag="PROFILES") - for i, profile in enumerate(profiles): - self.logger.info(f"[{i+1}] {Fore.CYAN}{profile['name']}{Style.RESET_ALL}", tag="PROFILES") - self.logger.info(f" Path: {Fore.YELLOW}{profile['path']}{Style.RESET_ALL}", tag="PROFILES") - self.logger.info(f" Created: {profile['created'].strftime('%Y-%m-%d %H:%M:%S')}", tag="PROFILES") - self.logger.info(f" Browser type: {profile['type']}", tag="PROFILES") - self.logger.info("", tag="PROFILES") # Empty line for spacing - - elif choice == "3": - # Delete profile - profiles = self.list_profiles() - if not profiles: - self.logger.warning("No profiles found to delete", tag="PROFILES") - continue - - # Display numbered list - self.logger.info(f"\n{Fore.YELLOW}Available profiles:{Style.RESET_ALL}", tag="PROFILES") - for i, profile in enumerate(profiles): - self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES") - - # Get profile to delete - profile_idx = input(f"{Fore.RED}Enter the number of the profile to delete (or 'c' to cancel): {Style.RESET_ALL}") - if profile_idx.lower() == 'c': - continue - - try: - idx = int(profile_idx) - 1 - if 0 <= idx < len(profiles): - profile_name = profiles[idx]["name"] - self.logger.info(f"Deleting profile: {Fore.YELLOW}{profile_name}{Style.RESET_ALL}", tag="PROFILES") - - # Confirm deletion - confirm = input(f"{Fore.RED}Are you sure you want to delete this profile? (y/n): {Style.RESET_ALL}") - if confirm.lower() == 'y': - success = self.delete_profile(profiles[idx]["path"]) - - if success: - self.logger.success(f"Profile {Fore.GREEN}{profile_name}{Style.RESET_ALL} deleted successfully", tag="PROFILES") - else: - self.logger.error(f"Failed to delete profile {Fore.RED}{profile_name}{Style.RESET_ALL}", tag="PROFILES") - else: - self.logger.error("Invalid profile number", tag="PROFILES") - except ValueError: - self.logger.error("Please enter a valid number", tag="PROFILES") - - elif choice == "4" and crawl_callback: - # Use profile to crawl a site - profiles = self.list_profiles() - if not profiles: - self.logger.warning("No profiles found. Create one first.", tag="PROFILES") - continue - - # Display numbered list - self.logger.info(f"\n{Fore.YELLOW}Available profiles:{Style.RESET_ALL}", tag="PROFILES") - for i, profile in enumerate(profiles): - self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES") - - # Get profile to use - profile_idx = input(f"{Fore.CYAN}Enter the number of the profile to use (or 'c' to cancel): {Style.RESET_ALL}") - if profile_idx.lower() == 'c': - continue - - try: - idx = int(profile_idx) - 1 - if 0 <= idx < len(profiles): - profile_path = profiles[idx]["path"] - url = input(f"{Fore.CYAN}Enter the URL to crawl: {Style.RESET_ALL}") - if url: - # Call the provided crawl callback - await crawl_callback(profile_path, url) - else: - self.logger.error("No URL provided", tag="CRAWL") - else: - self.logger.error("Invalid profile number", tag="PROFILES") - except ValueError: - self.logger.error("Please enter a valid number", tag="PROFILES") - - elif (choice == "4" and not crawl_callback) or (choice == "5" and crawl_callback): - # Exit - self.logger.info("Exiting profile management", tag="MENU") - break - - else: - self.logger.error(f"Invalid choice. 
Please enter a number between 1 and {exit_option}.", tag="MENU") diff --git a/crawl4ai/browser/strategies/__init__.py b/crawl4ai/browser/strategies/__init__.py deleted file mode 100644 index c4f17fd9..00000000 --- a/crawl4ai/browser/strategies/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -from .base import BaseBrowserStrategy -from .cdp import CDPBrowserStrategy -from .docker_strategy import DockerBrowserStrategy -from .playwright import PlaywrightBrowserStrategy -from .builtin import BuiltinBrowserStrategy - -__all__ = [ - "BrowserStrategy", - "CDPBrowserStrategy", - "DockerBrowserStrategy", - "PlaywrightBrowserStrategy", - "BuiltinBrowserStrategy", -] \ No newline at end of file diff --git a/crawl4ai/browser/strategies/base.py b/crawl4ai/browser/strategies/base.py deleted file mode 100644 index 14f7464d..00000000 --- a/crawl4ai/browser/strategies/base.py +++ /dev/null @@ -1,601 +0,0 @@ -"""Browser strategies module for Crawl4AI. - -This module implements the browser strategy pattern for different -browser implementations, including Playwright, CDP, and builtin browsers. -""" - -from abc import ABC, abstractmethod -import asyncio -import json -import hashlib -import os -import time -from typing import Optional, Tuple, List - -from playwright.async_api import BrowserContext, Page - -from ...async_logger import AsyncLogger -from ...async_configs import BrowserConfig, CrawlerRunConfig -from ...config import DOWNLOAD_PAGE_TIMEOUT -from ...js_snippet import load_js_script -from ..utils import get_playwright - - -class BaseBrowserStrategy(ABC): - """Base class for all browser strategies. - - This abstract class defines the interface that all browser strategies - must implement. It handles common functionality like context caching, - browser configuration, and session management. - """ - - _playwright_instance = None - - @classmethod - async def get_playwright(cls): - """Get or create a shared Playwright instance. - - Returns: - Playwright: The shared Playwright instance - """ - # For now I dont want Singleton pattern for Playwright - if cls._playwright_instance is None or True: - cls._playwright_instance = await get_playwright() - return cls._playwright_instance - - def __init__(self, config: BrowserConfig, logger: Optional[AsyncLogger] = None): - """Initialize the strategy with configuration and logger. - - Args: - config: Browser configuration - logger: Logger for recording events and errors - """ - self.config = config - self.logger = logger - self.browser = None - self.default_context = None - - # Context management - self.contexts_by_config = {} # config_signature -> context - - self._contexts_lock = asyncio.Lock() - - # Session management - self.sessions = {} - self.session_ttl = 1800 # 30 minutes default - - # Playwright instance - self.playwright = None - - @abstractmethod - async def start(self): - """Start the browser. - - This method should be implemented by concrete strategies to initialize - the browser in the appropriate way (direct launch, CDP connection, etc.) - - Returns: - self: For method chaining - """ - # Base implementation gets the playwright instance - self.playwright = await self.get_playwright() - return self - - @abstractmethod - async def _generate_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: - pass - - async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: - """Get a page with specified configuration. 
- - This method should be implemented by concrete strategies to create - or retrieve a page according to their browser management approach. - - Args: - crawlerRunConfig: Crawler run configuration - - Returns: - Tuple of (Page, BrowserContext) - """ - # Clean up expired sessions first - self._cleanup_expired_sessions() - - # If a session_id is provided and we already have it, reuse that page + context - if crawlerRunConfig.session_id and crawlerRunConfig.session_id in self.sessions: - context, page, _ = self.sessions[crawlerRunConfig.session_id] - # Update last-used timestamp - self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) - return page, context - - page, context = await self._generate_page(crawlerRunConfig) - - import uuid - setattr(page, "guid", uuid.uuid4()) - - # If a session_id is specified, store this session so we can reuse later - if crawlerRunConfig.session_id: - self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) - - return page, context - pass - - async def get_pages(self, crawlerRunConfig: CrawlerRunConfig, count: int = 1) -> List[Tuple[Page, BrowserContext]]: - """Get multiple pages with the same configuration. - - Args: - crawlerRunConfig: Configuration for the pages - count: Number of pages to create - - Returns: - List of (Page, Context) tuples - """ - pages = [] - for _ in range(count): - page, context = await self.get_page(crawlerRunConfig) - pages.append((page, context)) - return pages - - async def get_opened_pages(self) -> List[Page]: - """Get all opened pages in the - browser. - """ - return [page for context in self.contexts_by_config.values() for page in context.pages] - - def _build_browser_args(self) -> dict: - """Build browser launch arguments from config. - - Returns: - dict: Browser launch arguments for Playwright - """ - # Define common browser arguments that improve performance and stability - args = [ - "--no-sandbox", - "--no-first-run", - "--no-default-browser-check", - "--window-position=0,0", - "--ignore-certificate-errors", - "--ignore-certificate-errors-spki-list", - "--window-position=400,0", - "--force-color-profile=srgb", - "--mute-audio", - "--disable-gpu", - "--disable-gpu-compositing", - "--disable-software-rasterizer", - "--disable-dev-shm-usage", - "--disable-infobars", - "--disable-blink-features=AutomationControlled", - "--disable-renderer-backgrounding", - "--disable-ipc-flooding-protection", - "--disable-background-timer-throttling", - f"--window-size={self.config.viewport_width},{self.config.viewport_height}", - ] - - # Define browser disable options for light mode - browser_disable_options = [ - "--disable-backgrounding-occluded-windows", - "--disable-breakpad", - "--disable-client-side-phishing-detection", - "--disable-component-extensions-with-background-pages", - "--disable-default-apps", - "--disable-extensions", - "--disable-features=TranslateUI", - "--disable-hang-monitor", - "--disable-popup-blocking", - "--disable-prompt-on-repost", - "--disable-sync", - "--metrics-recording-only", - "--password-store=basic", - "--use-mock-keychain", - ] - - # Apply light mode settings if enabled - if self.config.light_mode: - args.extend(browser_disable_options) - - # Apply text mode settings if enabled (disables images, JS, etc) - if self.config.text_mode: - args.extend([ - "--blink-settings=imagesEnabled=false", - "--disable-remote-fonts", - "--disable-images", - "--disable-javascript", - "--disable-software-rasterizer", - "--disable-dev-shm-usage", - ]) - - # Add any extra arguments from the 
config - if self.config.extra_args: - args.extend(self.config.extra_args) - - # Build the core browser args dictionary - browser_args = {"headless": self.config.headless, "args": args} - - # Add chrome channel if specified - if self.config.chrome_channel: - browser_args["channel"] = self.config.chrome_channel - - # Configure downloads - if self.config.accept_downloads: - browser_args["downloads_path"] = self.config.downloads_path or os.path.join( - os.getcwd(), "downloads" - ) - os.makedirs(browser_args["downloads_path"], exist_ok=True) - - # Check for user data directory - if self.config.user_data_dir: - # Ensure the directory exists - os.makedirs(self.config.user_data_dir, exist_ok=True) - browser_args["user_data_dir"] = self.config.user_data_dir - - # Configure proxy settings - if self.config.proxy or self.config.proxy_config: - from playwright.async_api import ProxySettings - - proxy_settings = ( - ProxySettings(server=self.config.proxy) - if self.config.proxy - else ProxySettings( - server=self.config.proxy_config.server, - username=self.config.proxy_config.username, - password=self.config.proxy_config.password, - ) - ) - browser_args["proxy"] = proxy_settings - - return browser_args - - def _make_config_signature(self, crawlerRunConfig: CrawlerRunConfig) -> str: - """Create a signature hash from configuration for context caching. - - Converts the crawlerRunConfig into a dict, excludes ephemeral fields, - then returns a hash of the sorted JSON. This yields a stable signature - that identifies configurations requiring a unique browser context. - - Args: - crawlerRunConfig: Crawler run configuration - - Returns: - str: Unique hash for this configuration - """ - config_dict = crawlerRunConfig.__dict__.copy() - # Exclude items that do not affect browser-level setup - ephemeral_keys = [ - "session_id", - "js_code", - "scraping_strategy", - "extraction_strategy", - "chunking_strategy", - "cache_mode", - "content_filter", - "semaphore_count", - "url" - ] - for key in ephemeral_keys: - if key in config_dict: - del config_dict[key] - - # Convert to canonical JSON string - signature_json = json.dumps(config_dict, sort_keys=True, default=str) - - # Hash the JSON so we get a compact, unique string - signature_hash = hashlib.sha256(signature_json.encode("utf-8")).hexdigest() - return signature_hash - - async def create_browser_context(self, crawlerRunConfig: Optional[CrawlerRunConfig] = None) -> BrowserContext: - """Creates and returns a new browser context with configured settings. 
- - Args: - crawlerRunConfig: Configuration object for the crawler run - - Returns: - BrowserContext: Browser context object with the specified configurations - """ - if not self.browser: - raise ValueError("Browser must be initialized before creating context") - - # Base settings - user_agent = self.config.headers.get("User-Agent", self.config.user_agent) - viewport_settings = { - "width": self.config.viewport_width, - "height": self.config.viewport_height, - } - proxy_settings = {"server": self.config.proxy} if self.config.proxy else None - - # Define blocked extensions for resource optimization - blocked_extensions = [ - # Images - "jpg", "jpeg", "png", "gif", "webp", "svg", "ico", "bmp", "tiff", "psd", - # Fonts - "woff", "woff2", "ttf", "otf", "eot", - # Media - "mp4", "webm", "ogg", "avi", "mov", "wmv", "flv", "m4v", "mp3", "wav", "aac", - "m4a", "opus", "flac", - # Documents - "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx", - # Archives - "zip", "rar", "7z", "tar", "gz", - # Scripts and data - "xml", "swf", "wasm", - ] - - # Common context settings - context_settings = { - "user_agent": user_agent, - "viewport": viewport_settings, - "proxy": proxy_settings, - "accept_downloads": self.config.accept_downloads, - "storage_state": self.config.storage_state, - "ignore_https_errors": self.config.ignore_https_errors, - "device_scale_factor": 1.0, - "java_script_enabled": self.config.java_script_enabled, - } - - # Apply text mode settings if enabled - if self.config.text_mode: - text_mode_settings = { - "has_touch": False, - "is_mobile": False, - "java_script_enabled": False, # Disable javascript in text mode - } - # Update context settings with text mode settings - context_settings.update(text_mode_settings) - if self.logger: - self.logger.debug("Text mode enabled for browser context", tag="BROWSER") - - # Handle storage state properly - this is key for persistence - if self.config.storage_state: - if self.logger: - if isinstance(self.config.storage_state, str): - self.logger.debug(f"Using storage state from file: {self.config.storage_state}", tag="BROWSER") - else: - self.logger.debug("Using storage state from config object", tag="BROWSER") - - if self.config.user_data_dir: - # For CDP-based browsers, storage persistence is typically handled by the user_data_dir - # at the browser level, but we'll create a storage_state location for Playwright as well - storage_path = os.path.join(self.config.user_data_dir, "storage_state.json") - if not os.path.exists(storage_path): - # Create parent directory if it doesn't exist - os.makedirs(os.path.dirname(storage_path), exist_ok=True) - with open(storage_path, "w") as f: - json.dump({}, f) - self.config.storage_state = storage_path - - if self.logger: - self.logger.debug(f"Using user data directory: {self.config.user_data_dir}", tag="BROWSER") - - # Apply crawler-specific configurations if provided - if crawlerRunConfig: - # Check if there is value for crawlerRunConfig.proxy_config set add that to context - if crawlerRunConfig.proxy_config: - proxy_settings = { - "server": crawlerRunConfig.proxy_config.server, - } - if crawlerRunConfig.proxy_config.username: - proxy_settings.update({ - "username": crawlerRunConfig.proxy_config.username, - "password": crawlerRunConfig.proxy_config.password, - }) - context_settings["proxy"] = proxy_settings - - # Create and return the context - try: - # Create the context with appropriate settings - context = await self.browser.new_context(**context_settings) - - # Apply text mode resource blocking if enabled - if 
self.config.text_mode: - # Create and apply route patterns for each extension - for ext in blocked_extensions: - await context.route(f"**/*.{ext}", lambda route: route.abort()) - - return context - except Exception as e: - if self.logger: - self.logger.error(f"Error creating browser context: {str(e)}", tag="BROWSER") - # Fallback to basic context creation if the advanced settings fail - return await self.browser.new_context() - - async def setup_context(self, context: BrowserContext, crawlerRunConfig: Optional[CrawlerRunConfig] = None): - """Set up a browser context with the configured options. - - Args: - context: The browser context to set up - crawlerRunConfig: Configuration object containing all browser settings - """ - # Set HTTP headers - if self.config.headers: - await context.set_extra_http_headers(self.config.headers) - - # Add cookies - if self.config.cookies: - await context.add_cookies(self.config.cookies) - - # Apply storage state if provided - if self.config.storage_state: - await context.storage_state(path=None) - - # Configure downloads - if self.config.accept_downloads: - context.set_default_timeout(DOWNLOAD_PAGE_TIMEOUT) - context.set_default_navigation_timeout(DOWNLOAD_PAGE_TIMEOUT) - if self.config.downloads_path: - context._impl_obj._options["accept_downloads"] = True - context._impl_obj._options["downloads_path"] = self.config.downloads_path - - # Handle user agent and browser hints - if self.config.user_agent: - combined_headers = { - "User-Agent": self.config.user_agent, - "sec-ch-ua": self.config.browser_hint, - } - combined_headers.update(self.config.headers) - await context.set_extra_http_headers(combined_headers) - - # Add default cookie - target_url = (crawlerRunConfig and crawlerRunConfig.url) or "https://crawl4ai.com/" - await context.add_cookies( - [ - { - "name": "cookiesEnabled", - "value": "true", - "url": target_url, - } - ] - ) - - # Handle navigator overrides - if crawlerRunConfig: - if ( - crawlerRunConfig.override_navigator - or crawlerRunConfig.simulate_user - or crawlerRunConfig.magic - ): - await context.add_init_script(load_js_script("navigator_overrider")) - - async def kill_session(self, session_id: str): - """Kill a browser session and clean up resources. - - Args: - session_id (str): The session ID to kill. 
- """ - if session_id not in self.sessions: - return - - context, page, _ = self.sessions[session_id] - - # Close the page - try: - await page.close() - except Exception as e: - if self.logger: - self.logger.error(f"Error closing page for session {session_id}: {str(e)}", tag="BROWSER") - - # Remove session from tracking - del self.sessions[session_id] - - # Clean up any contexts that no longer have pages - await self._cleanup_unused_contexts() - - if self.logger: - self.logger.debug(f"Killed session: {session_id}", tag="BROWSER") - - async def _cleanup_unused_contexts(self): - """Clean up contexts that no longer have any pages.""" - async with self._contexts_lock: - # Get all contexts we're managing - contexts_to_check = list(self.contexts_by_config.values()) - - for context in contexts_to_check: - # Check if the context has any pages left - if not context.pages: - # No pages left, we can close this context - config_signature = next((sig for sig, ctx in self.contexts_by_config.items() - if ctx == context), None) - if config_signature: - try: - await context.close() - del self.contexts_by_config[config_signature] - if self.logger: - self.logger.debug(f"Closed unused context", tag="BROWSER") - except Exception as e: - if self.logger: - self.logger.error(f"Error closing unused context: {str(e)}", tag="BROWSER") - - def _cleanup_expired_sessions(self): - """Clean up expired sessions based on TTL.""" - current_time = time.time() - expired_sessions = [ - sid - for sid, (_, _, last_used) in self.sessions.items() - if current_time - last_used > self.session_ttl - ] - - for sid in expired_sessions: - if self.logger: - self.logger.debug(f"Session expired: {sid}", tag="BROWSER") - asyncio.create_task(self.kill_session(sid)) - - async def close(self): - """Close the browser and clean up resources. - - This method handles common cleanup tasks like: - 1. Persisting storage state if a user_data_dir is configured - 2. Closing all sessions - 3. Closing all browser contexts - 4. Closing the browser - 5. Stopping Playwright - - Child classes should override this method to add their specific cleanup logic, - but should call super().close() to ensure common cleanup tasks are performed. 
- """ - # Set a flag to prevent race conditions during cleanup - self.shutting_down = True - - try: - # Add brief delay if configured - if self.config.sleep_on_close: - await asyncio.sleep(0.5) - - # Persist storage state if using a user data directory - if self.config.user_data_dir and self.browser: - for context in self.browser.contexts: - try: - # Ensure the directory exists - storage_dir = os.path.join(self.config.user_data_dir, "Default") - os.makedirs(storage_dir, exist_ok=True) - - # Save storage state - storage_path = os.path.join(storage_dir, "storage_state.json") - await context.storage_state(path=storage_path) - - if self.logger: - self.logger.debug("Storage state persisted before closing browser", tag="BROWSER") - except Exception as e: - if self.logger: - self.logger.warning( - message="Failed to ensure storage persistence: {error}", - tag="BROWSER", - params={"error": str(e)} - ) - - # Close all active sessions - session_ids = list(self.sessions.keys()) - for session_id in session_ids: - await self.kill_session(session_id) - - # Close all cached contexts - for ctx in self.contexts_by_config.values(): - try: - await ctx.close() - except Exception as e: - if self.logger: - self.logger.error( - message="Error closing context: {error}", - tag="BROWSER", - params={"error": str(e)} - ) - self.contexts_by_config.clear() - - # Close the browser if it exists - if self.browser: - await self.browser.close() - self.browser = None - - # Stop playwright - if self.playwright: - await self.playwright.stop() - self.playwright = None - - except Exception as e: - if self.logger: - self.logger.error( - message="Error during browser cleanup: {error}", - tag="BROWSER", - params={"error": str(e)} - ) - finally: - # Reset shutting down flag - self.shutting_down = False - - \ No newline at end of file diff --git a/crawl4ai/browser/strategies/builtin.py b/crawl4ai/browser/strategies/builtin.py deleted file mode 100644 index 678346fc..00000000 --- a/crawl4ai/browser/strategies/builtin.py +++ /dev/null @@ -1,468 +0,0 @@ -import asyncio -import os -import time -import json -import subprocess -import shutil -import signal -from typing import Optional, Dict, Any, Tuple - - -from ...async_logger import AsyncLogger -from ...async_configs import CrawlerRunConfig -from playwright.async_api import Page, BrowserContext -from ...async_logger import AsyncLogger -from ...async_configs import BrowserConfig -from ...utils import get_home_folder -from ..utils import get_browser_executable, is_windows, is_browser_running, find_process_by_port, terminate_process - - -from .cdp import CDPBrowserStrategy -from .base import BaseBrowserStrategy - -class BuiltinBrowserStrategy(CDPBrowserStrategy): - """Built-in browser strategy. - - This strategy extends the CDP strategy to use the built-in browser. - """ - - def __init__(self, config: BrowserConfig, logger: Optional[AsyncLogger] = None): - """Initialize the built-in browser strategy. 
- - Args: - config: Browser configuration - logger: Logger for recording events and errors - """ - super().__init__(config, logger) - self.builtin_browser_dir = os.path.join(get_home_folder(), "builtin-browser") if not self.config.user_data_dir else self.config.user_data_dir - self.builtin_config_file = os.path.join(self.builtin_browser_dir, "browser_config.json") - - # Raise error if user data dir is already engaged - if self._check_user_dir_is_engaged(self.builtin_browser_dir): - raise Exception(f"User data directory {self.builtin_browser_dir} is already engaged by another browser instance.") - - os.makedirs(self.builtin_browser_dir, exist_ok=True) - - def _check_user_dir_is_engaged(self, user_data_dir: str) -> bool: - """Check if the user data directory is already in use. - - Returns: - bool: True if the directory is engaged, False otherwise - """ - # Load browser config file, then iterate in port_map values, check "user_data_dir" key if it matches - # the current user data directory - if os.path.exists(self.builtin_config_file): - try: - with open(self.builtin_config_file, 'r') as f: - browser_info_dict = json.load(f) - - # Check if user data dir is already engaged - for port_str, browser_info in browser_info_dict.get("port_map", {}).items(): - if browser_info.get("user_data_dir") == user_data_dir: - return True - except Exception as e: - if self.logger: - self.logger.error(f"Error reading built-in browser config: {str(e)}", tag="BUILTIN") - return False - - async def start(self): - """Start or connect to the built-in browser. - - Returns: - self: For method chaining - """ - # Initialize Playwright instance via base class method - await BaseBrowserStrategy.start(self) - - try: - # Check for existing built-in browser (get_browser_info already checks if running) - browser_info = self.get_browser_info() - if browser_info: - if self.logger: - self.logger.info(f"Using existing built-in browser at {browser_info.get('cdp_url')}", tag="BROWSER") - self.config.cdp_url = browser_info.get('cdp_url') - else: - if self.logger: - self.logger.info("Built-in browser not found, launching new instance...", tag="BROWSER") - cdp_url = await self.launch_builtin_browser( - browser_type=self.config.browser_type, - debugging_port=self.config.debugging_port, - headless=self.config.headless, - ) - if not cdp_url: - if self.logger: - self.logger.warning("Failed to launch built-in browser, falling back to regular CDP strategy", tag="BROWSER") - # Call CDP's start but skip BaseBrowserStrategy.start() since we already called it - return await CDPBrowserStrategy.start(self) - self.config.cdp_url = cdp_url - - # Connect to the browser using CDP protocol - self.browser = await self.playwright.chromium.connect_over_cdp(self.config.cdp_url) - - # Get or create default context - contexts = self.browser.contexts - if contexts: - self.default_context = contexts[0] - else: - self.default_context = await self.create_browser_context() - - await self.setup_context(self.default_context) - - if self.logger: - self.logger.debug(f"Connected to built-in browser at {self.config.cdp_url}", tag="BUILTIN") - - return self - except Exception as e: - if self.logger: - self.logger.error(f"Failed to start built-in browser: {str(e)}", tag="BUILTIN") - - # There is a possibility that at this point I need to clean up some resourece - raise - - def _get_builtin_browser_info(cls, debugging_port: int, config_file: str, logger: Optional[AsyncLogger] = None) -> Optional[Dict[str, Any]]: - """Get information about the built-in browser for a 
specific debugging port. - - Args: - debugging_port: The debugging port to look for - config_file: Path to the config file - logger: Optional logger for recording events - - Returns: - dict: Browser information or None if no running browser is configured for this port - """ - if not os.path.exists(config_file): - return None - - try: - with open(config_file, 'r') as f: - browser_info_dict = json.load(f) - - # Get browser info from port map - if isinstance(browser_info_dict, dict) and "port_map" in browser_info_dict: - port_str = str(debugging_port) - if port_str in browser_info_dict["port_map"]: - browser_info = browser_info_dict["port_map"][port_str] - - # Check if the browser is still running - pids = browser_info.get('pid', '') - if isinstance(pids, str): - pids = [int(pid) for pid in pids.split() if pid.isdigit()] - elif isinstance(pids, int): - pids = [pids] - else: - pids = [] - - # Check if any of the PIDs are running - if not pids: - if logger: - logger.warning(f"Built-in browser on port {debugging_port} has no valid PID", tag="BUILTIN") - # Remove this port from the dictionary - del browser_info_dict["port_map"][port_str] - with open(config_file, 'w') as f: - json.dump(browser_info_dict, f, indent=2) - return None - # Check if any of the PIDs are running - for pid in pids: - if is_browser_running(pid): - browser_info['pid'] = pid - break - else: - # If none of the PIDs are running, remove this port from the dictionary - if logger: - logger.warning(f"Built-in browser on port {debugging_port} is not running", tag="BUILTIN") - # Remove this port from the dictionary - del browser_info_dict["port_map"][port_str] - with open(config_file, 'w') as f: - json.dump(browser_info_dict, f, indent=2) - return None - - return browser_info - - return None - - except Exception as e: - if logger: - logger.error(f"Error reading built-in browser config: {str(e)}", tag="BUILTIN") - return None - - def get_browser_info(self) -> Optional[Dict[str, Any]]: - """Get information about the current built-in browser instance. - - Returns: - dict: Browser information or None if no running browser is configured - """ - return self._get_builtin_browser_info( - debugging_port=self.config.debugging_port, - config_file=self.builtin_config_file, - logger=self.logger - ) - - async def launch_builtin_browser(self, - browser_type: str = "chromium", - debugging_port: int = 9222, - headless: bool = True) -> Optional[str]: - """Launch a browser in the background for use as the built-in browser. 
- - Args: - browser_type: Type of browser to launch ('chromium' or 'firefox') - debugging_port: Port to use for CDP debugging - headless: Whether to run in headless mode - - Returns: - str: CDP URL for the browser, or None if launch failed - """ - # Check if there's an existing browser still running - browser_info = self._get_builtin_browser_info( - debugging_port=debugging_port, - config_file=self.builtin_config_file, - logger=self.logger - ) - if browser_info: - if self.logger: - self.logger.info(f"Built-in browser is already running on port {debugging_port}", tag="BUILTIN") - return browser_info.get('cdp_url') - - # Create a user data directory for the built-in browser - user_data_dir = os.path.join(self.builtin_browser_dir, "user_data") - - # Raise error if user data dir is already engaged - if self._check_user_dir_is_engaged(user_data_dir): - raise Exception(f"User data directory {user_data_dir} is already engaged by another browser instance.") - - # Create the user data directory if it doesn't exist - os.makedirs(user_data_dir, exist_ok=True) - - # Prepare browser launch arguments - browser_args = super()._build_browser_args() - browser_path = await get_browser_executable(browser_type) - base_args = [browser_path] - - if browser_type == "chromium": - args = [ - browser_path, - f"--remote-debugging-port={debugging_port}", - f"--user-data-dir={user_data_dir}", - ] - # if headless: - # args.append("--headless=new") - - elif browser_type == "firefox": - args = [ - browser_path, - "--remote-debugging-port", - str(debugging_port), - "--profile", - user_data_dir, - ] - if headless: - args.append("--headless") - else: - if self.logger: - self.logger.error(f"Browser type {browser_type} not supported for built-in browser", tag="BUILTIN") - return None - - args = base_args + browser_args + args - - try: - - # Check if the port is already in use - PID = "" - cdp_url = f"http://localhost:{debugging_port}" - config_json = await self._check_port_in_use(cdp_url) - if config_json: - if self.logger: - self.logger.info(f"Port {debugging_port} is already in use.", tag="BUILTIN") - PID = find_process_by_port(debugging_port) - else: - # Start the browser process detached - process = None - if is_windows(): - process = subprocess.Popen( - args, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - creationflags=subprocess.DETACHED_PROCESS | subprocess.CREATE_NEW_PROCESS_GROUP - ) - else: - process = subprocess.Popen( - args, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - preexec_fn=os.setpgrp # Start in a new process group - ) - - # Wait briefly to ensure the process starts successfully - await asyncio.sleep(2.0) - - # Check if the process is still running - if process and process.poll() is not None: - if self.logger: - self.logger.error(f"Browser process exited immediately with code {process.returncode}", tag="BUILTIN") - return None - - PID = process.pid - # Construct CDP URL - config_json = await self._check_port_in_use(cdp_url) - - - # Create browser info - browser_info = { - 'pid': PID, - 'cdp_url': cdp_url, - 'user_data_dir': user_data_dir, - 'browser_type': browser_type, - 'debugging_port': debugging_port, - 'start_time': time.time(), - 'config': config_json - } - - # Read existing config file if it exists - port_map = {} - if os.path.exists(self.builtin_config_file): - try: - with open(self.builtin_config_file, 'r') as f: - existing_data = json.load(f) - - # Check if it already uses port mapping - if isinstance(existing_data, dict) and "port_map" in existing_data: - port_map = 
existing_data["port_map"] - - # # Convert legacy format to port mapping - # elif isinstance(existing_data, dict) and "debugging_port" in existing_data: - # old_port = str(existing_data.get("debugging_port")) - # if self._is_browser_running(existing_data.get("pid")): - # port_map[old_port] = existing_data - except Exception as e: - if self.logger: - self.logger.warning(f"Could not read existing config: {str(e)}", tag="BUILTIN") - - # Add/update this browser in the port map - port_map[str(debugging_port)] = browser_info - - # Write updated config - with open(self.builtin_config_file, 'w') as f: - json.dump({"port_map": port_map}, f, indent=2) - - # Detach from the browser process - don't keep any references - # This is important to allow the Python script to exit while the browser continues running - process = None - - if self.logger: - self.logger.success(f"Built-in browser launched at CDP URL: {cdp_url}", tag="BUILTIN") - return cdp_url - - except Exception as e: - if self.logger: - self.logger.error(f"Error launching built-in browser: {str(e)}", tag="BUILTIN") - return None - - async def _check_port_in_use(self, cdp_url: str) -> dict: - """Check if a port is already in use by a Chrome DevTools instance. - - Args: - cdp_url: The CDP URL to check - - Returns: - dict: Chrome DevTools protocol version information or None if not found - """ - import aiohttp - json_url = f"{cdp_url}/json/version" - json_config = None - - try: - async with aiohttp.ClientSession() as session: - try: - async with session.get(json_url, timeout=2.0) as response: - if response.status == 200: - json_config = await response.json() - if self.logger: - self.logger.debug(f"Found CDP server running at {cdp_url}", tag="BUILTIN") - return json_config - except (aiohttp.ClientError, asyncio.TimeoutError): - pass - return None - except Exception as e: - if self.logger: - self.logger.debug(f"Error checking CDP port: {str(e)}", tag="BUILTIN") - return None - - async def kill_builtin_browser(self) -> bool: - """Kill the built-in browser if it's running. - - Returns: - bool: True if the browser was killed, False otherwise - """ - browser_info = self.get_browser_info() - if not browser_info: - if self.logger: - self.logger.warning(f"No built-in browser found on port {self.config.debugging_port}", tag="BUILTIN") - return False - - pid = browser_info.get('pid') - if not pid: - return False - - success, error_msg = terminate_process(pid, logger=self.logger) - if success: - # Update config file to remove this browser - with open(self.builtin_config_file, 'r') as f: - browser_info_dict = json.load(f) - - # Remove this port from the dictionary - port_str = str(self.config.debugging_port) - if port_str in browser_info_dict.get("port_map", {}): - del browser_info_dict["port_map"][port_str] - - with open(self.builtin_config_file, 'w') as f: - json.dump(browser_info_dict, f, indent=2) - - # Remove user data directory if it exists - if os.path.exists(self.builtin_browser_dir): - shutil.rmtree(self.builtin_browser_dir) - - # Clear the browser info cache - self.browser = None - self.temp_dir = None - self.shutting_down = True - - if self.logger: - self.logger.success("Built-in browser terminated", tag="BUILTIN") - return True - else: - if self.logger: - self.logger.error(f"Error killing built-in browser: {error_msg}", tag="BUILTIN") - return False - - async def get_builtin_browser_status(self) -> Dict[str, Any]: - """Get status information about the built-in browser. 
- - Returns: - dict: Status information with running, cdp_url, and info fields - """ - browser_info = self.get_browser_info() - - if not browser_info: - return { - 'running': False, - 'cdp_url': None, - 'info': None, - 'port': self.config.debugging_port - } - - return { - 'running': True, - 'cdp_url': browser_info.get('cdp_url'), - 'info': browser_info, - 'port': self.config.debugging_port - } - - async def close(self): - """Close the built-in browser and clean up resources.""" - # Call parent class close method - await super().close() - - # Clean up built-in browser if we created it and were in shutdown mode - if self.shutting_down: - await self.kill_builtin_browser() - if self.logger: - self.logger.debug("Killed built-in browser during shutdown", tag="BUILTIN") \ No newline at end of file diff --git a/crawl4ai/browser/strategies/cdp.py b/crawl4ai/browser/strategies/cdp.py deleted file mode 100644 index 0bef6fec..00000000 --- a/crawl4ai/browser/strategies/cdp.py +++ /dev/null @@ -1,281 +0,0 @@ -"""Browser strategies module for Crawl4AI. - -This module implements the browser strategy pattern for different -browser implementations, including Playwright, CDP, and builtin browsers. -""" - -import asyncio -import os -import time -import json -import subprocess -import shutil -from typing import Optional, Tuple, List - -from playwright.async_api import BrowserContext, Page - -from ...async_logger import AsyncLogger -from ...async_configs import BrowserConfig, CrawlerRunConfig -from ..utils import get_playwright, get_browser_executable, create_temp_directory, is_windows, check_process_is_running, terminate_process - -from .base import BaseBrowserStrategy - -class CDPBrowserStrategy(BaseBrowserStrategy): - """CDP-based browser strategy. - - This strategy connects to an existing browser using CDP protocol or - launches and connects to a browser using CDP. - """ - - def __init__(self, config: BrowserConfig, logger: Optional[AsyncLogger] = None): - """Initialize the CDP browser strategy. - - Args: - config: Browser configuration - logger: Logger for recording events and errors - """ - super().__init__(config, logger) - self.sessions = {} - self.session_ttl = 1800 # 30 minutes - self.browser_process = None - self.temp_dir = None - self.shutting_down = False - - async def start(self): - """Start or connect to the browser using CDP. - - Returns: - self: For method chaining - """ - # Call the base class start to initialize Playwright - await super().start() - - try: - # Get or create CDP URL - cdp_url = await self._get_or_create_cdp_url() - - # Connect to the browser using CDP - self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) - - # Get or create default context - contexts = self.browser.contexts - if contexts: - self.default_context = contexts[0] - else: - self.default_context = await self.create_browser_context() - - await self.setup_context(self.default_context) - - if self.logger: - self.logger.debug(f"Connected to CDP browser at {cdp_url}", tag="CDP") - - except Exception as e: - if self.logger: - self.logger.error(f"Failed to connect to CDP browser: {str(e)}", tag="CDP") - - # Clean up any resources before re-raising - await self._cleanup_process() - raise - - return self - - async def _get_or_create_cdp_url(self) -> str: - """Get existing CDP URL or launch a browser and return its CDP URL. 
- - Returns: - str: CDP URL for connecting to the browser - """ - # If CDP URL is provided, just return it - if self.config.cdp_url: - return self.config.cdp_url - - # Create temp dir if needed - if not self.config.user_data_dir: - self.temp_dir = create_temp_directory() - user_data_dir = self.temp_dir - else: - user_data_dir = self.config.user_data_dir - - # Get browser args based on OS and browser type - # args = await self._get_browser_args(user_data_dir) - browser_args = super()._build_browser_args() - browser_path = await get_browser_executable(self.config.browser_type) - base_args = [browser_path] - - if self.config.browser_type == "chromium": - args = [ - f"--remote-debugging-port={self.config.debugging_port}", - f"--user-data-dir={user_data_dir}", - ] - # if self.config.headless: - # args.append("--headless=new") - - elif self.config.browser_type == "firefox": - args = [ - "--remote-debugging-port", - str(self.config.debugging_port), - "--profile", - user_data_dir, - ] - if self.config.headless: - args.append("--headless") - else: - raise NotImplementedError(f"Browser type {self.config.browser_type} not supported") - - args = base_args + browser_args['args'] + args - - # Start browser process - try: - # Use DETACHED_PROCESS flag on Windows to fully detach the process - # On Unix, we'll use preexec_fn=os.setpgrp to start the process in a new process group - if is_windows(): - self.browser_process = subprocess.Popen( - args, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - creationflags=subprocess.DETACHED_PROCESS | subprocess.CREATE_NEW_PROCESS_GROUP - ) - else: - self.browser_process = subprocess.Popen( - args, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - preexec_fn=os.setpgrp # Start in a new process group - ) - - # Monitor for a short time to make sure it starts properly - is_running, return_code, stdout, stderr = await check_process_is_running(self.browser_process, delay=2) - if not is_running: - if self.logger: - self.logger.error( - message="Browser process terminated unexpectedly | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}", - tag="ERROR", - params={ - "code": return_code, - "stdout": stdout.decode() if stdout else "", - "stderr": stderr.decode() if stderr else "", - }, - ) - await self._cleanup_process() - raise Exception("Browser process terminated unexpectedly") - - return f"http://localhost:{self.config.debugging_port}" - except Exception as e: - await self._cleanup_process() - raise Exception(f"Failed to start browser: {e}") - - async def _cleanup_process(self): - """Cleanup browser process and temporary directory.""" - # Set shutting_down flag BEFORE any termination actions - self.shutting_down = True - - if self.browser_process: - try: - # Only attempt termination if the process is still running - if self.browser_process.poll() is None: - # Use our robust cross-platform termination utility - success = terminate_process( - pid=self.browser_process.pid, - timeout=1.0, # Equivalent to the previous 10*0.1s wait - logger=self.logger - ) - - if not success and self.logger: - self.logger.warning( - message="Failed to terminate browser process cleanly", - tag="PROCESS" - ) - - except Exception as e: - if self.logger: - self.logger.error( - message="Error during browser process cleanup: {error}", - tag="ERROR", - params={"error": str(e)}, - ) - - if self.temp_dir and os.path.exists(self.temp_dir): - try: - shutil.rmtree(self.temp_dir) - self.temp_dir = None - if self.logger: - self.logger.debug("Removed temporary directory", tag="CDP") - except Exception 
as e: - if self.logger: - self.logger.error( - message="Error removing temporary directory: {error}", - tag="CDP", - params={"error": str(e)} - ) - - self.browser_process = None - - async def _generate_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: - # For CDP, we typically use the shared default_context - context = self.default_context - pages = context.pages - - # Otherwise, check if we have an existing context for this config - config_signature = self._make_config_signature(crawlerRunConfig) - self.contexts_by_config[config_signature] = context - - await self.setup_context(context, crawlerRunConfig) - - # Check if there's already a page with the target URL - page = next((p for p in pages if p.url == crawlerRunConfig.url), None) - - # If not found, create a new page - if not page: - page = await context.new_page() - - return page, context - - async def _get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: - """Get a page for the given configuration. - - Args: - crawlerRunConfig: Configuration object for the crawler run - - Returns: - Tuple of (Page, BrowserContext) - """ - # Call parent method to ensure browser is started - await super().get_page(crawlerRunConfig) - - # For CDP, we typically use the shared default_context - context = self.default_context - pages = context.pages - - # Otherwise, check if we have an existing context for this config - config_signature = self._make_config_signature(crawlerRunConfig) - self.contexts_by_config[config_signature] = context - - await self.setup_context(context, crawlerRunConfig) - - # Check if there's already a page with the target URL - page = next((p for p in pages if p.url == crawlerRunConfig.url), None) - - # If not found, create a new page - if not page: - page = await context.new_page() - - # If a session_id is specified, store this session for reuse - if crawlerRunConfig.session_id: - self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) - - return page, context - - async def close(self): - """Close the CDP browser and clean up resources.""" - # Skip cleanup if using external CDP URL and not launched by us - if self.config.cdp_url and not self.browser_process: - if self.logger: - self.logger.debug("Skipping cleanup for external CDP browser", tag="CDP") - return - - # Call parent implementation for common cleanup - await super().close() - - # Additional CDP-specific cleanup - await asyncio.sleep(0.5) - await self._cleanup_process() diff --git a/crawl4ai/browser/strategies/docker_strategy.py b/crawl4ai/browser/strategies/docker_strategy.py deleted file mode 100644 index 5390fc8a..00000000 --- a/crawl4ai/browser/strategies/docker_strategy.py +++ /dev/null @@ -1,430 +0,0 @@ -"""Docker browser strategy module for Crawl4AI. - -This module provides browser strategies for running browsers in Docker containers, -which offers better isolation, consistency across platforms, and easy scaling. -""" - -import os -import uuid -from typing import List, Optional - - -from ...async_logger import AsyncLogger -from ...async_configs import BrowserConfig -from ..models import DockerConfig -from ..docker_registry import DockerRegistry -from ..docker_utils import DockerUtils -from .builtin import CDPBrowserStrategy -from .base import BaseBrowserStrategy - -class DockerBrowserStrategy(CDPBrowserStrategy): - """Docker-based browser strategy. - - Extends the CDPBrowserStrategy to run browsers in Docker containers. - Supports two modes: - 1. 
"connect" - Uses a Docker image with Chrome already running - 2. "launch" - Starts Chrome within the container with custom settings - - Attributes: - docker_config: Docker-specific configuration options - container_id: ID of current Docker container - container_name: Name assigned to the container - registry: Registry for tracking and reusing containers - docker_utils: Utilities for Docker operations - chrome_process_id: Process ID of Chrome within container - socat_process_id: Process ID of socat within container - internal_cdp_port: Chrome's internal CDP port - internal_mapped_port: Port that socat maps to internally - """ - - def __init__(self, config: BrowserConfig, logger: Optional[AsyncLogger] = None): - """Initialize the Docker browser strategy. - - Args: - config: Browser configuration including Docker-specific settings - logger: Logger for recording events and errors - """ - super().__init__(config, logger) - - # Initialize Docker-specific attributes - self.docker_config = self.config.docker_config or DockerConfig() - self.container_id = None - self.container_name = f"crawl4ai-browser-{uuid.uuid4().hex[:8]}" - - # Use the shared registry file path for consistency with BuiltinBrowserStrategy - registry_file = self.docker_config.registry_file - if registry_file is None and self.config.user_data_dir: - # Use the same registry file as BuiltinBrowserStrategy if possible - registry_file = os.path.join( - os.path.dirname(self.config.user_data_dir), "browser_config.json" - ) - - self.registry = DockerRegistry(self.docker_config.registry_file) - self.docker_utils = DockerUtils(logger) - self.chrome_process_id = None - self.socat_process_id = None - self.internal_cdp_port = 9222 # Chrome's internal CDP port - self.internal_mapped_port = 9223 # Port that socat maps to internally - self.shutting_down = False - - async def start(self): - """Start or connect to a browser running in a Docker container. - - This method initializes Playwright and establishes a connection to - a browser running in a Docker container. 
Depending on the configured mode: - - "connect": Connects to a container with Chrome already running - - "launch": Creates a container and launches Chrome within it - - Returns: - self: For method chaining - """ - # Initialize Playwright - await BaseBrowserStrategy.start(self) - - if self.logger: - self.logger.info( - f"Starting Docker browser strategy in {self.docker_config.mode} mode", - tag="DOCKER", - ) - - try: - # Get CDP URL by creating or reusing a Docker container - # This handles the container management and browser startup - cdp_url = await self._get_or_create_cdp_url() - - if not cdp_url: - raise Exception( - "Failed to establish CDP connection to Docker container" - ) - - if self.logger: - self.logger.info( - f"Connecting to browser in Docker via CDP: {cdp_url}", tag="DOCKER" - ) - - # Connect to the browser using CDP - self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) - - # Get existing context or create default context - contexts = self.browser.contexts - if contexts: - self.default_context = contexts[0] - if self.logger: - self.logger.debug("Using existing browser context", tag="DOCKER") - else: - if self.logger: - self.logger.debug("Creating new browser context", tag="DOCKER") - self.default_context = await self.create_browser_context() - await self.setup_context(self.default_context) - - return self - - except Exception as e: - # Clean up resources if startup fails - if self.container_id and not self.docker_config.persistent: - if self.logger: - self.logger.warning( - f"Cleaning up container after failed start: {self.container_id[:12]}", - tag="DOCKER", - ) - await self.docker_utils.remove_container(self.container_id) - self.registry.unregister_container(self.container_id) - self.container_id = None - - if self.playwright: - await self.playwright.stop() - self.playwright = None - - # Re-raise the exception - if self.logger: - self.logger.error( - f"Failed to start Docker browser: {str(e)}", tag="DOCKER" - ) - raise - - async def _generate_config_hash(self) -> str: - """Generate a hash of the configuration for container matching. - - Returns: - Hash string uniquely identifying this configuration - """ - # Create a dict with the relevant parts of the config - config_dict = { - "image": self.docker_config.image, - "mode": self.docker_config.mode, - "browser_type": self.config.browser_type, - "headless": self.config.headless, - } - - # Add browser-specific config if in launch mode - if self.docker_config.mode == "launch": - config_dict.update( - { - "text_mode": self.config.text_mode, - "light_mode": self.config.light_mode, - "viewport_width": self.config.viewport_width, - "viewport_height": self.config.viewport_height, - } - ) - - # Use the utility method to generate the hash - return self.docker_utils.generate_config_hash(config_dict) - - async def _get_or_create_cdp_url(self) -> str: - """Get CDP URL by either creating a new container or using an existing one. 
- - Returns: - CDP URL for connecting to the browser - - Raises: - Exception: If container creation or browser launch fails - """ - # If CDP URL is explicitly provided, use it - if self.config.cdp_url: - return self.config.cdp_url - - # Ensure Docker image exists (will build if needed) - image_name = await self.docker_utils.ensure_docker_image_exists( - self.docker_config.image, self.docker_config.mode - ) - - # Generate config hash for container matching - config_hash = await self._generate_config_hash() - - # Look for existing container with matching config - container_id = await self.registry.find_container_by_config( - config_hash, self.docker_utils - ) - - if container_id: - # Use existing container - self.container_id = container_id - host_port = self.registry.get_container_host_port(container_id) - if self.logger: - self.logger.info( - f"Using existing Docker container: {container_id[:12]}", - tag="DOCKER", - ) - else: - # Get a port for the new container - host_port = ( - self.docker_config.host_port - or self.registry.get_next_available_port(self.docker_utils) - ) - - # Prepare volumes list - volumes = list(self.docker_config.volumes) - - # Add user data directory if specified - if self.docker_config.user_data_dir: - # Ensure user data directory exists - os.makedirs(self.docker_config.user_data_dir, exist_ok=True) - volumes.append( - f"{self.docker_config.user_data_dir}:{self.docker_config.container_user_data_dir}" - ) - - # # Update config user_data_dir to point to container path - # self.config.user_data_dir = self.docker_config.container_user_data_dir - - # Create a new container - container_id = await self.docker_utils.create_container( - image_name=image_name, - host_port=host_port, - container_name=self.container_name, - volumes=volumes, - network=self.docker_config.network, - env_vars=self.docker_config.env_vars, - cpu_limit=self.docker_config.cpu_limit, - memory_limit=self.docker_config.memory_limit, - extra_args=self.docker_config.extra_args, - ) - - if not container_id: - raise Exception("Failed to create Docker container") - - self.container_id = container_id - - # Wait for container to be ready - await self.docker_utils.wait_for_container_ready(container_id) - - # Handle specific setup based on mode - if self.docker_config.mode == "launch": - # In launch mode, we need to start socat and Chrome - await self.docker_utils.start_socat_in_container(container_id) - - # Build browser arguments - browser_args = self._build_browser_args() - - # Launch Chrome - await self.docker_utils.launch_chrome_in_container( - container_id, browser_args - ) - - # Get PIDs for later cleanup - self.chrome_process_id = ( - await self.docker_utils.get_process_id_in_container( - container_id, "chromium" - ) - ) - self.socat_process_id = ( - await self.docker_utils.get_process_id_in_container( - container_id, "socat" - ) - ) - - # Wait for CDP to be ready - cdp_json_config = await self.docker_utils.wait_for_cdp_ready(host_port) - - if cdp_json_config: - # Register the container in the shared registry - self.registry.register_container( - container_id, host_port, config_hash, cdp_json_config - ) - else: - raise Exception("Failed to get CDP JSON config from Docker container") - - if self.logger: - self.logger.success( - f"Docker container ready: {container_id[:12]} on port {host_port}", - tag="DOCKER", - ) - - # Return CDP URL - return f"http://localhost:{host_port}" - - def _build_browser_args(self) -> List[str]: - """Build Chrome command line arguments based on BrowserConfig. 
- - Returns: - List of command line arguments for Chrome - """ - # Call parent method to get common arguments - browser_args = super()._build_browser_args() - return browser_args["args"] + [ - f"--remote-debugging-port={self.internal_cdp_port}", - "--remote-debugging-address=0.0.0.0", # Allow external connections - "--disable-dev-shm-usage", - "--headless=new", - ] - - # args = [ - # "--no-sandbox", - # "--disable-gpu", - # f"--remote-debugging-port={self.internal_cdp_port}", - # "--remote-debugging-address=0.0.0.0", # Allow external connections - # "--disable-dev-shm-usage", - # ] - - # if self.config.headless: - # args.append("--headless=new") - - # if self.config.viewport_width and self.config.viewport_height: - # args.append(f"--window-size={self.config.viewport_width},{self.config.viewport_height}") - - # if self.config.user_agent: - # args.append(f"--user-agent={self.config.user_agent}") - - # if self.config.text_mode: - # args.extend([ - # "--blink-settings=imagesEnabled=false", - # "--disable-remote-fonts", - # "--disable-images", - # "--disable-javascript", - # ]) - - # if self.config.light_mode: - # # Import here to avoid circular import - # from ..utils import get_browser_disable_options - # args.extend(get_browser_disable_options()) - - # if self.config.user_data_dir: - # args.append(f"--user-data-dir={self.config.user_data_dir}") - - # if self.config.extra_args: - # args.extend(self.config.extra_args) - - # return args - - async def close(self): - """Close the browser and clean up Docker container if needed.""" - # Set flag to track if we were the ones initiating shutdown - initiated_shutdown = not self.shutting_down - # Storage persistence for Docker needs special handling - # We need to store state before calling super().close() which will close the browser - if ( - self.browser - and self.docker_config.user_data_dir - and self.docker_config.persistent - ): - for context in self.browser.contexts: - try: - # Ensure directory exists - os.makedirs(self.docker_config.user_data_dir, exist_ok=True) - - # Save storage state to user data directory - storage_path = os.path.join( - self.docker_config.user_data_dir, "storage_state.json" - ) - await context.storage_state(path=storage_path) - if self.logger: - self.logger.debug( - "Persisted Docker-specific storage state", tag="DOCKER" - ) - except Exception as e: - if self.logger: - self.logger.warning( - message="Failed to persist Docker storage state: {error}", - tag="DOCKER", - params={"error": str(e)}, - ) - - # Call parent method to handle common cleanup - await super().close() - - # Only perform container cleanup if we initiated shutdown - # and we need to handle Docker-specific resources - if initiated_shutdown: - # Only clean up container if not persistent - if self.container_id and not self.docker_config.persistent: - # Stop Chrome process in "launch" mode - if self.docker_config.mode == "launch" and self.chrome_process_id: - await self.docker_utils.stop_process_in_container( - self.container_id, self.chrome_process_id - ) - if self.logger: - self.logger.debug( - f"Stopped Chrome process {self.chrome_process_id} in container", - tag="DOCKER", - ) - - # Stop socat process in "launch" mode - if self.docker_config.mode == "launch" and self.socat_process_id: - await self.docker_utils.stop_process_in_container( - self.container_id, self.socat_process_id - ) - if self.logger: - self.logger.debug( - f"Stopped socat process {self.socat_process_id} in container", - tag="DOCKER", - ) - - # Remove or stop container based on configuration 
- if self.docker_config.remove_on_exit: - await self.docker_utils.remove_container(self.container_id) - # Unregister from registry - if hasattr(self, "registry") and self.registry: - self.registry.unregister_container(self.container_id) - if self.logger: - self.logger.debug( - f"Removed Docker container {self.container_id}", - tag="DOCKER", - ) - else: - await self.docker_utils.stop_container(self.container_id) - if self.logger: - self.logger.debug( - f"Stopped Docker container {self.container_id}", - tag="DOCKER", - ) - - self.container_id = None diff --git a/crawl4ai/browser/strategies/playwright.py b/crawl4ai/browser/strategies/playwright.py deleted file mode 100644 index bea99753..00000000 --- a/crawl4ai/browser/strategies/playwright.py +++ /dev/null @@ -1,134 +0,0 @@ -"""Browser strategies module for Crawl4AI. - -This module implements the browser strategy pattern for different -browser implementations, including Playwright, CDP, and builtin browsers. -""" - -import time -from typing import Optional, Tuple - -from playwright.async_api import BrowserContext, Page - -from ...async_logger import AsyncLogger -from ...async_configs import BrowserConfig, CrawlerRunConfig - -from playwright_stealth import StealthConfig - -from .base import BaseBrowserStrategy - -stealth_config = StealthConfig( - webdriver=True, - chrome_app=True, - chrome_csi=True, - chrome_load_times=True, - chrome_runtime=True, - navigator_languages=True, - navigator_plugins=True, - navigator_permissions=True, - webgl_vendor=True, - outerdimensions=True, - navigator_hardware_concurrency=True, - media_codecs=True, -) - -class PlaywrightBrowserStrategy(BaseBrowserStrategy): - """Standard Playwright browser strategy. - - This strategy launches a new browser instance using Playwright - and manages browser contexts. - """ - - def __init__(self, config: BrowserConfig, logger: Optional[AsyncLogger] = None): - """Initialize the Playwright browser strategy. - - Args: - config: Browser configuration - logger: Logger for recording events and errors - """ - super().__init__(config, logger) - # No need to re-initialize sessions and session_ttl as they're now in the base class - - async def start(self): - """Start the browser instance. 
- - Returns: - self: For method chaining - """ - # Call the base class start to initialize Playwright - await super().start() - - # Build browser arguments using the base class method - browser_args = self._build_browser_args() - - try: - # Launch appropriate browser type - if self.config.browser_type == "firefox": - self.browser = await self.playwright.firefox.launch(**browser_args) - elif self.config.browser_type == "webkit": - self.browser = await self.playwright.webkit.launch(**browser_args) - else: - self.browser = await self.playwright.chromium.launch(**browser_args) - - self.default_context = self.browser - - if self.logger: - self.logger.debug(f"Launched {self.config.browser_type} browser", tag="BROWSER") - - except Exception as e: - if self.logger: - self.logger.error(f"Failed to launch browser: {str(e)}", tag="BROWSER") - raise - - return self - - async def _generate_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: - # Otherwise, check if we have an existing context for this config - config_signature = self._make_config_signature(crawlerRunConfig) - - async with self._contexts_lock: - if config_signature in self.contexts_by_config: - context = self.contexts_by_config[config_signature] - else: - # Create and setup a new context - context = await self.create_browser_context(crawlerRunConfig) - await self.setup_context(context, crawlerRunConfig) - self.contexts_by_config[config_signature] = context - - # Create a new page from the chosen context - page = await context.new_page() - - return page, context - - async def _get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: - """Get a page for the given configuration. - - Args: - crawlerRunConfig: Configuration object for the crawler run - - Returns: - Tuple of (Page, BrowserContext) - """ - # Call parent method to ensure browser is started - await super().get_page(crawlerRunConfig) - - # Otherwise, check if we have an existing context for this config - config_signature = self._make_config_signature(crawlerRunConfig) - - async with self._contexts_lock: - if config_signature in self.contexts_by_config: - context = self.contexts_by_config[config_signature] - else: - # Create and setup a new context - context = await self.create_browser_context(crawlerRunConfig) - await self.setup_context(context, crawlerRunConfig) - self.contexts_by_config[config_signature] = context - - # Create a new page from the chosen context - page = await context.new_page() - - # If a session_id is specified, store this session so we can reuse later - if crawlerRunConfig.session_id: - self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) - - return page, context - diff --git a/crawl4ai/browser/utils.py b/crawl4ai/browser/utils.py deleted file mode 100644 index 421230bf..00000000 --- a/crawl4ai/browser/utils.py +++ /dev/null @@ -1,465 +0,0 @@ -"""Browser utilities module for Crawl4AI. - -This module provides utility functions for browser management, -including process management, CDP connection utilities, -and Playwright instance management. 
-""" - -import asyncio -import os -import sys -import time -import tempfile -import subprocess -from typing import Optional, Tuple, Union -import signal -import psutil - -from playwright.async_api import async_playwright - -from ..utils import get_chromium_path -from ..async_configs import BrowserConfig, CrawlerRunConfig - -from ..async_logger import AsyncLogger - - -_playwright_instance = None - -async def get_playwright(): - """Get or create the Playwright instance (singleton pattern). - - Returns: - Playwright: The Playwright instance - """ - global _playwright_instance - if _playwright_instance is None or True: - _playwright_instance = await async_playwright().start() - return _playwright_instance - -async def get_browser_executable(browser_type: str) -> str: - """Get the path to browser executable, with platform-specific handling. - - Args: - browser_type: Type of browser (chromium, firefox, webkit) - - Returns: - Path to browser executable - """ - return await get_chromium_path(browser_type) - -def create_temp_directory(prefix="browser-profile-") -> str: - """Create a temporary directory for browser data. - - Args: - prefix: Prefix for the temporary directory name - - Returns: - Path to the created temporary directory - """ - return tempfile.mkdtemp(prefix=prefix) - -def is_windows() -> bool: - """Check if the current platform is Windows. - - Returns: - True if Windows, False otherwise - """ - return sys.platform == "win32" - -def is_macos() -> bool: - """Check if the current platform is macOS. - - Returns: - True if macOS, False otherwise - """ - return sys.platform == "darwin" - -def is_linux() -> bool: - """Check if the current platform is Linux. - - Returns: - True if Linux, False otherwise - """ - return not (is_windows() or is_macos()) - -def is_browser_running(pid: Optional[int]) -> bool: - """Check if a process with the given PID is running. - - Args: - pid: Process ID to check - - Returns: - bool: True if the process is running, False otherwise - """ - if not pid: - return False - - try: - if type(pid) is str: - pid = int(pid) - # Check if the process exists - if is_windows(): - process = subprocess.run(["tasklist", "/FI", f"PID eq {pid}"], - capture_output=True, text=True) - return str(pid) in process.stdout - else: - # Unix-like systems - os.kill(pid, 0) # This doesn't actually kill the process, just checks if it exists - return True - except (ProcessLookupError, PermissionError, OSError): - return False - -def get_browser_disable_options() -> list: - """Get standard list of browser disable options for performance. - - Returns: - List of command-line options to disable various browser features - """ - return [ - "--disable-background-networking", - "--disable-background-timer-throttling", - "--disable-backgrounding-occluded-windows", - "--disable-breakpad", - "--disable-client-side-phishing-detection", - "--disable-component-extensions-with-background-pages", - "--disable-default-apps", - "--disable-extensions", - "--disable-features=TranslateUI", - "--disable-hang-monitor", - "--disable-ipc-flooding-protection", - "--disable-popup-blocking", - "--disable-prompt-on-repost", - "--disable-sync", - "--force-color-profile=srgb", - "--metrics-recording-only", - "--no-first-run", - "--password-store=basic", - "--use-mock-keychain", - ] - - -async def find_optimal_browser_config(total_urls=50, verbose=True, rate_limit_delay=0.2): - """Find optimal browser configuration for crawling a specific number of URLs. 
- - Args: - total_urls: Number of URLs to crawl - verbose: Whether to print progress - rate_limit_delay: Delay between page loads to avoid rate limiting - - Returns: - dict: Contains fastest, lowest_memory, and optimal configurations - """ - from .manager import BrowserManager - if verbose: - print(f"\n=== Finding optimal configuration for crawling {total_urls} URLs ===\n") - - # Generate test URLs with timestamp to avoid caching - timestamp = int(time.time()) - urls = [f"https://example.com/page_{i}?t={timestamp}" for i in range(total_urls)] - - # Limit browser configurations to test (1 browser to max 10) - max_browsers = min(10, total_urls) - configs_to_test = [] - - # Generate configurations (browser count, pages distribution) - for num_browsers in range(1, max_browsers + 1): - base_pages = total_urls // num_browsers - remainder = total_urls % num_browsers - - # Create distribution array like [3, 3, 2, 2] (some browsers get one more page) - if remainder > 0: - distribution = [base_pages + 1] * remainder + [base_pages] * (num_browsers - remainder) - else: - distribution = [base_pages] * num_browsers - - configs_to_test.append((num_browsers, distribution)) - - results = [] - - # Test each configuration - for browser_count, page_distribution in configs_to_test: - if verbose: - print(f"Testing {browser_count} browsers with distribution {tuple(page_distribution)}") - - try: - # Track memory if possible - try: - import psutil - process = psutil.Process() - start_memory = process.memory_info().rss / (1024 * 1024) # MB - except ImportError: - if verbose: - print("Memory tracking not available (psutil not installed)") - start_memory = 0 - - # Start browsers in parallel - managers = [] - start_tasks = [] - start_time = time.time() - - logger = AsyncLogger(verbose=True, log_file=None) - - for i in range(browser_count): - config = BrowserConfig(headless=True) - manager = BrowserManager(browser_config=config, logger=logger) - start_tasks.append(manager.start()) - managers.append(manager) - - await asyncio.gather(*start_tasks) - - # Distribute URLs among browsers - urls_per_manager = {} - url_index = 0 - - for i, manager in enumerate(managers): - pages_for_this_browser = page_distribution[i] - end_index = url_index + pages_for_this_browser - urls_per_manager[manager] = urls[url_index:end_index] - url_index = end_index - - # Create pages for each browser - all_pages = [] - for manager, manager_urls in urls_per_manager.items(): - if not manager_urls: - continue - pages = await manager.get_pages(CrawlerRunConfig(), count=len(manager_urls)) - all_pages.extend(zip(pages, manager_urls)) - - # Crawl pages with delay to avoid rate limiting - async def crawl_page(page_ctx, url): - page, _ = page_ctx - try: - await page.goto(url) - if rate_limit_delay > 0: - await asyncio.sleep(rate_limit_delay) - title = await page.title() - return title - finally: - await page.close() - - crawl_start = time.time() - crawl_tasks = [crawl_page(page_ctx, url) for page_ctx, url in all_pages] - await asyncio.gather(*crawl_tasks) - crawl_time = time.time() - crawl_start - total_time = time.time() - start_time - - # Measure final memory usage - if start_memory > 0: - end_memory = process.memory_info().rss / (1024 * 1024) - memory_used = end_memory - start_memory - else: - memory_used = 0 - - # Close all browsers - for manager in managers: - await manager.close() - - # Calculate metrics - pages_per_second = total_urls / crawl_time - - # Calculate efficiency score (higher is better) - # This balances speed vs memory - if memory_used 
> 0: - efficiency = pages_per_second / (memory_used + 1) - else: - efficiency = pages_per_second - - # Store result - result = { - "browser_count": browser_count, - "distribution": tuple(page_distribution), - "crawl_time": crawl_time, - "total_time": total_time, - "memory_used": memory_used, - "pages_per_second": pages_per_second, - "efficiency": efficiency - } - - results.append(result) - - if verbose: - print(f" ✓ Crawled {total_urls} pages in {crawl_time:.2f}s ({pages_per_second:.1f} pages/sec)") - if memory_used > 0: - print(f" ✓ Memory used: {memory_used:.1f}MB ({memory_used/total_urls:.1f}MB per page)") - print(f" ✓ Efficiency score: {efficiency:.4f}") - - except Exception as e: - if verbose: - print(f" ✗ Error: {str(e)}") - - # Clean up - for manager in managers: - try: - await manager.close() - except: - pass - - # If no successful results, return None - if not results: - return None - - # Find best configurations - fastest = sorted(results, key=lambda x: x["crawl_time"])[0] - - # Only consider memory if available - memory_results = [r for r in results if r["memory_used"] > 0] - if memory_results: - lowest_memory = sorted(memory_results, key=lambda x: x["memory_used"])[0] - else: - lowest_memory = fastest - - # Find most efficient (balanced speed vs memory) - optimal = sorted(results, key=lambda x: x["efficiency"], reverse=True)[0] - - # Print summary - if verbose: - print("\n=== OPTIMAL CONFIGURATIONS ===") - print(f"⚡ Fastest: {fastest['browser_count']} browsers {fastest['distribution']}") - print(f" {fastest['crawl_time']:.2f}s, {fastest['pages_per_second']:.1f} pages/sec") - - print(f"💾 Memory-efficient: {lowest_memory['browser_count']} browsers {lowest_memory['distribution']}") - if lowest_memory["memory_used"] > 0: - print(f" {lowest_memory['memory_used']:.1f}MB, {lowest_memory['memory_used']/total_urls:.2f}MB per page") - - print(f"🌟 Balanced optimal: {optimal['browser_count']} browsers {optimal['distribution']}") - print(f" {optimal['crawl_time']:.2f}s, {optimal['pages_per_second']:.1f} pages/sec, score: {optimal['efficiency']:.4f}") - - return { - "fastest": fastest, - "lowest_memory": lowest_memory, - "optimal": optimal, - "all_configs": results - } - - -# Find process ID of the existing browser using os -def find_process_by_port(port: int) -> str: - """Find process ID listening on a specific port. 
- - Args: - port: Port number to check - - Returns: - str: Process ID or empty string if not found - """ - try: - if is_windows(): - cmd = f"netstat -ano | findstr :{port}" - result = subprocess.check_output(cmd, shell=True).decode() - return result.strip().split()[-1] if result else "" - else: - cmd = f"lsof -i :{port} -t" - return subprocess.check_output(cmd, shell=True).decode().strip() - except subprocess.CalledProcessError: - return "" - -async def check_process_is_running(process: subprocess.Popen, delay: float = 0.5) -> Tuple[bool, Optional[int], bytes, bytes]: - """Perform a quick check to make sure the browser started successfully.""" - if not process: - return False, None, b"", b"" - - # Check that process started without immediate termination - await asyncio.sleep(delay) - if process.poll() is not None: - # Process already terminated - stdout, stderr = b"", b"" - try: - stdout, stderr = process.communicate(timeout=0.5) - except subprocess.TimeoutExpired: - pass - - return False, process.returncode, stdout, stderr - - - return True, 0, b"", b"" - - -def terminate_process( - pid: Union[int, str], - timeout: float = 5.0, - force_kill_timeout: float = 3.0, - logger = None -) -> Tuple[bool, Optional[str]]: - """ - Robustly terminate a process across platforms with verification. - - Args: - pid: Process ID to terminate (int or string) - timeout: Seconds to wait for graceful termination before force killing - force_kill_timeout: Seconds to wait after force kill before considering it failed - logger: Optional logger object with error, warning, and info methods - - Returns: - Tuple of (success: bool, error_message: Optional[str]) - """ - # Convert pid to int if it's a string - if isinstance(pid, str): - try: - pid = int(pid) - except ValueError: - error_msg = f"Invalid PID format: {pid}" - if logger: - logger.error(error_msg) - return False, error_msg - - # Check if process exists - if not psutil.pid_exists(pid): - return True, None # Process already terminated - - try: - process = psutil.Process(pid) - - # First attempt: graceful termination - if logger: - logger.info(f"Attempting graceful termination of process {pid}") - - if os.name == 'nt': # Windows - subprocess.run(["taskkill", "/PID", str(pid)], - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - check=False) - else: # Unix/Linux/MacOS - process.send_signal(signal.SIGTERM) - - # Wait for process to terminate - try: - process.wait(timeout=timeout) - if logger: - logger.info(f"Process {pid} terminated gracefully") - return True, None - except psutil.TimeoutExpired: - if logger: - logger.warning(f"Process {pid} did not terminate gracefully within {timeout} seconds, forcing termination") - - # Second attempt: force kill - if os.name == 'nt': # Windows - subprocess.run(["taskkill", "/F", "/PID", str(pid)], - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - check=False) - else: # Unix/Linux/MacOS - process.send_signal(signal.SIGKILL) - - # Verify process is killed - gone, alive = psutil.wait_procs([process], timeout=force_kill_timeout) - if process in alive: - error_msg = f"Failed to kill process {pid} even after force kill" - if logger: - logger.error(error_msg) - return False, error_msg - - if logger: - logger.info(f"Process {pid} terminated by force") - return True, None - - except psutil.NoSuchProcess: - # Process terminated while we were working with it - if logger: - logger.info(f"Process {pid} already terminated") - return True, None - - except Exception as e: - error_msg = f"Error terminating process {pid}: 
{str(e)}" - if logger: - logger.error(error_msg) - return False, error_msg \ No newline at end of file From 02e627e0bd9f0c6b43a296ddc7dd69942be4984c Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 8 Apr 2025 17:43:36 +0800 Subject: [PATCH 40/78] fix(crawler): simplify page retrieval logic in AsyncPlaywrightCrawlerStrategy --- crawl4ai/async_crawler_strategy.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 89b4df84..37aa0962 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -505,10 +505,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): ) # Get page for session - try: - page, context, _ = await self.browser_manager.get_page(crawlerRunConfig=config) - except Exception as e: - page, context = await self.browser_manager.get_page(crawlerRunConfig=config) + page, context = await self.browser_manager.get_page(crawlerRunConfig=config) # await page.goto(URL) From 6f7ab9c92722f85db0e8aaa5fcf4d4275c6bc230 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Tue, 8 Apr 2025 18:31:00 +0530 Subject: [PATCH 41/78] fix: Revert changes to session management in AsyncHttpWebcrawler and solve the underlying issue by removing the session closure in finally block of session context. --- crawl4ai/async_crawler_strategy.py | 133 +++++++++++++++-------------- 1 file changed, 70 insertions(+), 63 deletions(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 301d925f..1e987450 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -1706,6 +1706,15 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy): async def __aexit__(self, exc_type, exc_val, exc_tb) -> None: await self.close() + @contextlib.asynccontextmanager + async def _session_context(self): + try: + if not self._session: + await self.start() + yield self._session + finally: + pass + def set_hook(self, hook_type: str, hook_func: Callable) -> None: if hook_type in self.hooks: self.hooks[hook_type] = partial(self._execute_hook, hook_type, hook_func) @@ -1782,77 +1791,75 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy): url: str, config: CrawlerRunConfig ) -> AsyncCrawlResponse: - if not self._session or self._session.closed: - await self.start() - - timeout = ClientTimeout( - total=config.page_timeout or self.DEFAULT_TIMEOUT, - connect=10, - sock_read=30 - ) - - headers = dict(self._BASE_HEADERS) - if self.browser_config.headers: - headers.update(self.browser_config.headers) + async with self._session_context() as session: + timeout = ClientTimeout( + total=config.page_timeout or self.DEFAULT_TIMEOUT, + connect=10, + sock_read=30 + ) + + headers = dict(self._BASE_HEADERS) + if self.browser_config.headers: + headers.update(self.browser_config.headers) - request_kwargs = { - 'timeout': timeout, - 'allow_redirects': self.browser_config.follow_redirects, - 'ssl': self.browser_config.verify_ssl, - 'headers': headers - } + request_kwargs = { + 'timeout': timeout, + 'allow_redirects': self.browser_config.follow_redirects, + 'ssl': self.browser_config.verify_ssl, + 'headers': headers + } - if self.browser_config.method == "POST": - if self.browser_config.data: - request_kwargs['data'] = self.browser_config.data - if self.browser_config.json: - request_kwargs['json'] = self.browser_config.json + if self.browser_config.method == "POST": + if self.browser_config.data: + request_kwargs['data'] = self.browser_config.data + if 
self.browser_config.json: + request_kwargs['json'] = self.browser_config.json - await self.hooks['before_request'](url, request_kwargs) + await self.hooks['before_request'](url, request_kwargs) - try: - async with self._session.request(self.browser_config.method, url, **request_kwargs) as response: - content = memoryview(await response.read()) - - if not (200 <= response.status < 300): - raise HTTPStatusError( - response.status, - f"Unexpected status code for {url}" + try: + async with session.request(self.browser_config.method, url, **request_kwargs) as response: + content = memoryview(await response.read()) + + if not (200 <= response.status < 300): + raise HTTPStatusError( + response.status, + f"Unexpected status code for {url}" + ) + + encoding = response.charset + if not encoding: + encoding = chardet.detect(content.tobytes())['encoding'] or 'utf-8' + + result = AsyncCrawlResponse( + html=content.tobytes().decode(encoding, errors='replace'), + response_headers=dict(response.headers), + status_code=response.status, + redirected_url=str(response.url) ) - - encoding = response.charset - if not encoding: - encoding = chardet.detect(content.tobytes())['encoding'] or 'utf-8' - - result = AsyncCrawlResponse( - html=content.tobytes().decode(encoding, errors='replace'), - response_headers=dict(response.headers), - status_code=response.status, - redirected_url=str(response.url) - ) - - await self.hooks['after_request'](result) - return result + + await self.hooks['after_request'](result) + return result - except aiohttp.ServerTimeoutError as e: - await self.hooks['on_error'](e) - raise ConnectionTimeoutError(f"Request timed out: {str(e)}") + except aiohttp.ServerTimeoutError as e: + await self.hooks['on_error'](e) + raise ConnectionTimeoutError(f"Request timed out: {str(e)}") + + except aiohttp.ClientConnectorError as e: + await self.hooks['on_error'](e) + raise ConnectionError(f"Connection failed: {str(e)}") + + except aiohttp.ClientError as e: + await self.hooks['on_error'](e) + raise HTTPCrawlerError(f"HTTP client error: {str(e)}") - except aiohttp.ClientConnectorError as e: - await self.hooks['on_error'](e) - raise ConnectionError(f"Connection failed: {str(e)}") + except asyncio.exceptions.TimeoutError as e: + await self.hooks['on_error'](e) + raise ConnectionTimeoutError(f"Request timed out: {str(e)}") - except aiohttp.ClientError as e: - await self.hooks['on_error'](e) - raise HTTPCrawlerError(f"HTTP client error: {str(e)}") - - except asyncio.exceptions.TimeoutError as e: - await self.hooks['on_error'](e) - raise ConnectionTimeoutError(f"Request timed out: {str(e)}") - - except Exception as e: - await self.hooks['on_error'](e) - raise HTTPCrawlerError(f"HTTP request failed: {str(e)}") + except Exception as e: + await self.hooks['on_error'](e) + raise HTTPCrawlerError(f"HTTP request failed: {str(e)}") async def crawl( self, From a2061bf31ec6bfc3fa8b2e526ed24c8044d09273 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 9 Apr 2025 15:39:04 +0800 Subject: [PATCH 42/78] feat(crawler): add MHTML capture functionality Add ability to capture web pages as MHTML format, which includes all page resources in a single file. This enables complete page archival and offline viewing. 
- Add capture_mhtml parameter to CrawlerRunConfig - Implement MHTML capture using CDP in AsyncPlaywrightCrawlerStrategy - Add mhtml field to CrawlResult and AsyncCrawlResponse models - Add comprehensive tests for MHTML capture functionality - Update documentation with MHTML capture details - Add exclude_all_images option for better memory management Breaking changes: None --- JOURNAL.md | 49 +++++ crawl4ai/async_configs.py | 8 + crawl4ai/async_crawler_strategy.py | 75 +++++++- crawl4ai/async_webcrawler.py | 1 + crawl4ai/browser_manager.py | 4 +- crawl4ai/content_scraping_strategy.py | 13 ++ crawl4ai/models.py | 20 +- docs/md_v2/api/crawl-result.md | 16 +- docs/md_v2/api/parameters.md | 1 + docs/md_v2/core/browser-crawler-config.md | 8 +- docs/md_v2/core/crawler-result.md | 17 +- docs/md_v2/core/link-media.md | 63 ++++++- temp.txt | 3 + tests/20241401/test_mhtml.py | 213 ++++++++++++++++++++++ 14 files changed, 467 insertions(+), 24 deletions(-) create mode 100644 JOURNAL.md create mode 100644 temp.txt create mode 100644 tests/20241401/test_mhtml.py diff --git a/JOURNAL.md b/JOURNAL.md new file mode 100644 index 00000000..31e86131 --- /dev/null +++ b/JOURNAL.md @@ -0,0 +1,49 @@ +# Development Journal + +This journal tracks significant feature additions, bug fixes, and architectural decisions in the crawl4ai project. It serves as both documentation and a historical record of the project's evolution. + +## [2025-04-09] Added MHTML Capture Feature + +**Feature:** MHTML snapshot capture of crawled pages + +**Changes Made:** +1. Added `capture_mhtml: bool = False` parameter to `CrawlerRunConfig` class +2. Added `mhtml: Optional[str] = None` field to `CrawlResult` model +3. Added `mhtml_data: Optional[str] = None` field to `AsyncCrawlResponse` class +4. Implemented `capture_mhtml()` method in `AsyncPlaywrightCrawlerStrategy` class to capture MHTML via CDP +5. Modified the crawler to capture MHTML when enabled and pass it to the result + +**Implementation Details:** +- MHTML capture uses Chrome DevTools Protocol (CDP) via Playwright's CDP session API +- The implementation waits for page to fully load before capturing MHTML content +- Enhanced waiting for JavaScript content with requestAnimationFrame for better JS content capture +- We ensure all browser resources are properly cleaned up after capture + +**Files Modified:** +- `crawl4ai/models.py`: Added the mhtml field to CrawlResult +- `crawl4ai/async_configs.py`: Added capture_mhtml parameter to CrawlerRunConfig +- `crawl4ai/async_crawler_strategy.py`: Implemented MHTML capture logic +- `crawl4ai/async_webcrawler.py`: Added mapping from AsyncCrawlResponse.mhtml_data to CrawlResult.mhtml + +**Testing:** +- Created comprehensive tests in `tests/20241401/test_mhtml.py` covering: + - Capturing MHTML when enabled + - Ensuring mhtml is None when disabled explicitly + - Ensuring mhtml is None by default + - Capturing MHTML on JavaScript-enabled pages + +**Challenges:** +- Had to improve page loading detection to ensure JavaScript content was fully rendered +- Tests needed to be run independently due to Playwright browser instance management +- Modified test expected content to match actual MHTML output + +**Why This Feature:** +The MHTML capture feature allows users to capture complete web pages including all resources (CSS, images, etc.) in a single file. This is valuable for: +1. Offline viewing of captured pages +2. Creating permanent snapshots of web content for archival +3. 
Ensuring consistent content for later analysis, even if the original site changes + +**Future Enhancements to Consider:** +- Add option to save MHTML to file +- Support for filtering what resources get included in MHTML +- Add support for specifying MHTML capture options \ No newline at end of file diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index 2f0efe90..079afdee 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -772,10 +772,12 @@ class CrawlerRunConfig(): screenshot_wait_for: float = None, screenshot_height_threshold: int = SCREENSHOT_HEIGHT_TRESHOLD, pdf: bool = False, + capture_mhtml: bool = False, image_description_min_word_threshold: int = IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, image_score_threshold: int = IMAGE_SCORE_THRESHOLD, table_score_threshold: int = 7, exclude_external_images: bool = False, + exclude_all_images: bool = False, # Link and Domain Handling Parameters exclude_social_media_domains: list = None, exclude_external_links: bool = False, @@ -860,9 +862,11 @@ class CrawlerRunConfig(): self.screenshot_wait_for = screenshot_wait_for self.screenshot_height_threshold = screenshot_height_threshold self.pdf = pdf + self.capture_mhtml = capture_mhtml self.image_description_min_word_threshold = image_description_min_word_threshold self.image_score_threshold = image_score_threshold self.exclude_external_images = exclude_external_images + self.exclude_all_images = exclude_all_images self.table_score_threshold = table_score_threshold # Link and Domain Handling Parameters @@ -991,6 +995,7 @@ class CrawlerRunConfig(): "screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD ), pdf=kwargs.get("pdf", False), + capture_mhtml=kwargs.get("capture_mhtml", False), image_description_min_word_threshold=kwargs.get( "image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, @@ -999,6 +1004,7 @@ class CrawlerRunConfig(): "image_score_threshold", IMAGE_SCORE_THRESHOLD ), table_score_threshold=kwargs.get("table_score_threshold", 7), + exclude_all_images=kwargs.get("exclude_all_images", False), exclude_external_images=kwargs.get("exclude_external_images", False), # Link and Domain Handling Parameters exclude_social_media_domains=kwargs.get( @@ -1088,9 +1094,11 @@ class CrawlerRunConfig(): "screenshot_wait_for": self.screenshot_wait_for, "screenshot_height_threshold": self.screenshot_height_threshold, "pdf": self.pdf, + "capture_mhtml": self.capture_mhtml, "image_description_min_word_threshold": self.image_description_min_word_threshold, "image_score_threshold": self.image_score_threshold, "table_score_threshold": self.table_score_threshold, + "exclude_all_images": self.exclude_all_images, "exclude_external_images": self.exclude_external_images, "exclude_social_media_domains": self.exclude_social_media_domains, "exclude_external_links": self.exclude_external_links, diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 37aa0962..bdb7bfca 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -836,14 +836,18 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): "before_return_html", page=page, html=html, context=context, config=config ) - # Handle PDF and screenshot generation + # Handle PDF, MHTML and screenshot generation start_export_time = time.perf_counter() pdf_data = None screenshot_data = None + mhtml_data = None if config.pdf: pdf_data = await self.export_pdf(page) + if config.capture_mhtml: + mhtml_data = await self.capture_mhtml(page) + if 
config.screenshot: if config.screenshot_wait_for: await asyncio.sleep(config.screenshot_wait_for) @@ -851,9 +855,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): page, screenshot_height_threshold=config.screenshot_height_threshold ) - if screenshot_data or pdf_data: + if screenshot_data or pdf_data or mhtml_data: self.logger.info( - message="Exporting PDF and taking screenshot took {duration:.2f}s", + message="Exporting media (PDF/MHTML/screenshot) took {duration:.2f}s", tag="EXPORT", params={"duration": time.perf_counter() - start_export_time}, ) @@ -876,6 +880,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): status_code=status_code, screenshot=screenshot_data, pdf_data=pdf_data, + mhtml_data=mhtml_data, get_delayed_content=get_delayed_content, ssl_certificate=ssl_cert, downloaded_files=( @@ -1052,6 +1057,70 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): """ pdf_data = await page.pdf(print_background=True) return pdf_data + + async def capture_mhtml(self, page: Page) -> Optional[str]: + """ + Captures the current page as MHTML using CDP. + + MHTML (MIME HTML) is a web page archive format that combines the HTML content + with its resources (images, CSS, etc.) into a single MIME-encoded file. + + Args: + page (Page): The Playwright page object + + Returns: + Optional[str]: The MHTML content as a string, or None if there was an error + """ + try: + # Ensure the page is fully loaded before capturing + try: + # Wait for DOM content and network to be idle + await page.wait_for_load_state("domcontentloaded", timeout=5000) + await page.wait_for_load_state("networkidle", timeout=5000) + + # Give a little extra time for JavaScript execution + await page.wait_for_timeout(1000) + + # Wait for any animations to complete + await page.evaluate(""" + () => new Promise(resolve => { + // First requestAnimationFrame gets scheduled after the next repaint + requestAnimationFrame(() => { + // Second requestAnimationFrame gets called after all animations complete + requestAnimationFrame(resolve); + }); + }) + """) + except Error as e: + if self.logger: + self.logger.warning( + message="Wait for load state timed out: {error}", + tag="MHTML", + params={"error": str(e)}, + ) + + # Create a new CDP session + cdp_session = await page.context.new_cdp_session(page) + + # Call Page.captureSnapshot with format "mhtml" + result = await cdp_session.send("Page.captureSnapshot", {"format": "mhtml"}) + + # The result contains a 'data' field with the MHTML content + mhtml_content = result.get("data") + + # Detach the CDP session to clean up resources + await cdp_session.detach() + + return mhtml_content + except Exception as e: + # Log the error but don't raise it - we'll just return None for the MHTML + if self.logger: + self.logger.error( + message="Failed to capture MHTML: {error}", + tag="MHTML", + params={"error": str(e)}, + ) + return None async def take_screenshot(self, page, **kwargs) -> str: """ diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index fca2d673..16bd5f57 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -365,6 +365,7 @@ class AsyncWebCrawler: crawl_result.response_headers = async_response.response_headers crawl_result.downloaded_files = async_response.downloaded_files crawl_result.js_execution_result = js_execution_result + crawl_result.mhtml = async_response.mhtml_data crawl_result.ssl_certificate = ( async_response.ssl_certificate ) # Add SSL certificate diff --git a/crawl4ai/browser_manager.py 
b/crawl4ai/browser_manager.py index df0886c7..7fc819e0 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -440,8 +440,7 @@ class BrowserManager: @classmethod async def get_playwright(cls): from playwright.async_api import async_playwright - if cls._playwright_instance is None: - cls._playwright_instance = await async_playwright().start() + cls._playwright_instance = await async_playwright().start() return cls._playwright_instance def __init__(self, browser_config: BrowserConfig, logger=None): @@ -492,7 +491,6 @@ class BrowserManager: Note: This method should be called in a separate task to avoid blocking the main event loop. """ - self.playwright = await self.get_playwright() if self.playwright is None: from playwright.async_api import async_playwright diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index a806b045..d6cf7b8c 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -860,6 +860,12 @@ class WebScrapingStrategy(ContentScrapingStrategy): soup = BeautifulSoup(html, parser_type) body = soup.body base_domain = get_base_domain(url) + + # Early removal of all images if exclude_all_images is set + # This happens before any processing to minimize memory usage + if kwargs.get("exclude_all_images", False): + for img in body.find_all('img'): + img.decompose() try: meta = extract_metadata("", soup) @@ -1491,6 +1497,13 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): body = doc base_domain = get_base_domain(url) + + # Early removal of all images if exclude_all_images is set + # This is more efficient in lxml as we remove elements before any processing + if kwargs.get("exclude_all_images", False): + for img in body.xpath('//img'): + if img.getparent() is not None: + img.getparent().remove(img) # Add comment removal if kwargs.get("remove_comments", False): diff --git a/crawl4ai/models.py b/crawl4ai/models.py index aad14a1d..f132dc16 100644 --- a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -95,15 +95,7 @@ class UrlModel(BaseModel): url: HttpUrl forced: bool = False -class MarkdownGenerationResult(BaseModel): - raw_markdown: str - markdown_with_citations: str - references_markdown: str - fit_markdown: Optional[str] = None - fit_html: Optional[str] = None - def __str__(self): - return self.raw_markdown @dataclass class TraversalStats: @@ -124,6 +116,16 @@ class DispatchResult(BaseModel): end_time: Union[datetime, float] error_message: str = "" +class MarkdownGenerationResult(BaseModel): + raw_markdown: str + markdown_with_citations: str + references_markdown: str + fit_markdown: Optional[str] = None + fit_html: Optional[str] = None + + def __str__(self): + return self.raw_markdown + class CrawlResult(BaseModel): url: str html: str @@ -135,6 +137,7 @@ class CrawlResult(BaseModel): js_execution_result: Optional[Dict[str, Any]] = None screenshot: Optional[str] = None pdf: Optional[bytes] = None + mhtml: Optional[str] = None _markdown: Optional[MarkdownGenerationResult] = PrivateAttr(default=None) extracted_content: Optional[str] = None metadata: Optional[dict] = None @@ -307,6 +310,7 @@ class AsyncCrawlResponse(BaseModel): status_code: int screenshot: Optional[str] = None pdf_data: Optional[bytes] = None + mhtml_data: Optional[str] = None get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None downloaded_files: Optional[List[str]] = None ssl_certificate: Optional[SSLCertificate] = None diff --git a/docs/md_v2/api/crawl-result.md 
b/docs/md_v2/api/crawl-result.md index 4c42009b..43967886 100644 --- a/docs/md_v2/api/crawl-result.md +++ b/docs/md_v2/api/crawl-result.md @@ -15,6 +15,7 @@ class CrawlResult(BaseModel): downloaded_files: Optional[List[str]] = None screenshot: Optional[str] = None pdf : Optional[bytes] = None + mhtml: Optional[str] = None markdown: Optional[Union[str, MarkdownGenerationResult]] = None extracted_content: Optional[str] = None metadata: Optional[dict] = None @@ -236,7 +237,16 @@ if result.pdf: f.write(result.pdf) ``` -### 5.5 **`metadata`** *(Optional[dict])* +### 5.5 **`mhtml`** *(Optional[str])* +**What**: MHTML snapshot of the page if `capture_mhtml=True` in `CrawlerRunConfig`. MHTML (MIME HTML) format preserves the entire web page with all its resources (CSS, images, scripts, etc.) in a single file. +**Usage**: +```python +if result.mhtml: + with open("page.mhtml", "w", encoding="utf-8") as f: + f.write(result.mhtml) +``` + +### 5.6 **`metadata`** *(Optional[dict])* **What**: Page-level metadata if discovered (title, description, OG data, etc.). **Usage**: ```python @@ -304,11 +314,13 @@ async def handle_result(result: CrawlResult): if result.extracted_content: print("Structured data:", result.extracted_content) - # Screenshot/PDF + # Screenshot/PDF/MHTML if result.screenshot: print("Screenshot length:", len(result.screenshot)) if result.pdf: print("PDF bytes length:", len(result.pdf)) + if result.mhtml: + print("MHTML length:", len(result.mhtml)) ``` --- diff --git a/docs/md_v2/api/parameters.md b/docs/md_v2/api/parameters.md index d352e162..de4ba467 100644 --- a/docs/md_v2/api/parameters.md +++ b/docs/md_v2/api/parameters.md @@ -140,6 +140,7 @@ If your page is a single-page app with repeated JS updates, set `js_only=True` i | **`screenshot_wait_for`** | `float or None` | Extra wait time before the screenshot. | | **`screenshot_height_threshold`** | `int` (~20000) | If the page is taller than this, alternate screenshot strategies are used. | | **`pdf`** | `bool` (False) | If `True`, returns a PDF in `result.pdf`. | +| **`capture_mhtml`** | `bool` (False) | If `True`, captures an MHTML snapshot of the page in `result.mhtml`. MHTML includes all page resources (CSS, images, etc.) in a single file. | | **`image_description_min_word_threshold`** | `int` (~50) | Minimum words for an image’s alt text or description to be considered valid. | | **`image_score_threshold`** | `int` (~3) | Filter out low-scoring images. The crawler scores images by relevance (size, context, etc.). | | **`exclude_external_images`** | `bool` (False) | Exclude images from other domains. | diff --git a/docs/md_v2/core/browser-crawler-config.md b/docs/md_v2/core/browser-crawler-config.md index 0d97e0fc..1f7e5ee2 100644 --- a/docs/md_v2/core/browser-crawler-config.md +++ b/docs/md_v2/core/browser-crawler-config.md @@ -136,6 +136,7 @@ class CrawlerRunConfig: wait_for=None, screenshot=False, pdf=False, + capture_mhtml=False, enable_rate_limiting=False, rate_limit_config=None, memory_threshold_percent=70.0, @@ -175,10 +176,9 @@ class CrawlerRunConfig: - A CSS or JS expression to wait for before extracting content. - Common usage: `wait_for="css:.main-loaded"` or `wait_for="js:() => window.loaded === true"`. -7. **`screenshot`** & **`pdf`**: - - If `True`, captures a screenshot or PDF after the page is fully loaded. - - The results go to `result.screenshot` (base64) or `result.pdf` (bytes). - +7. 
**`screenshot`**, **`pdf`**, & **`capture_mhtml`**: + - If `True`, captures a screenshot, PDF, or MHTML snapshot after the page is fully loaded. + - The results go to `result.screenshot` (base64), `result.pdf` (bytes), or `result.mhtml` (string). 8. **`verbose`**: - Logs additional runtime details. - Overlaps with the browser’s verbosity if also set to `True` in `BrowserConfig`. diff --git a/docs/md_v2/core/crawler-result.md b/docs/md_v2/core/crawler-result.md index 961b38f6..d7648ecb 100644 --- a/docs/md_v2/core/crawler-result.md +++ b/docs/md_v2/core/crawler-result.md @@ -26,6 +26,7 @@ class CrawlResult(BaseModel): downloaded_files: Optional[List[str]] = None screenshot: Optional[str] = None pdf : Optional[bytes] = None + mhtml: Optional[str] = None markdown: Optional[Union[str, MarkdownGenerationResult]] = None extracted_content: Optional[str] = None metadata: Optional[dict] = None @@ -51,6 +52,7 @@ class CrawlResult(BaseModel): | **downloaded_files (`Optional[List[str]]`)** | If `accept_downloads=True` in `BrowserConfig`, this lists the filepaths of saved downloads. | | **screenshot (`Optional[str]`)** | Screenshot of the page (base64-encoded) if `screenshot=True`. | | **pdf (`Optional[bytes]`)** | PDF of the page if `pdf=True`. | +| **mhtml (`Optional[str]`)** | MHTML snapshot of the page if `capture_mhtml=True`. Contains the full page with all resources. | | **markdown (`Optional[str or MarkdownGenerationResult]`)** | It holds a `MarkdownGenerationResult`. Over time, this will be consolidated into `markdown`. The generator can provide raw markdown, citations, references, and optionally `fit_markdown`. | | **extracted_content (`Optional[str]`)** | The output of a structured extraction (CSS/LLM-based) stored as JSON string or other text. | | **metadata (`Optional[dict]`)** | Additional info about the crawl or extracted data. | @@ -190,18 +192,27 @@ for img in images: print("Image URL:", img["src"], "Alt:", img.get("alt")) ``` -### 5.3 `screenshot` and `pdf` +### 5.3 `screenshot`, `pdf`, and `mhtml` -If you set `screenshot=True` or `pdf=True` in **`CrawlerRunConfig`**, then: +If you set `screenshot=True`, `pdf=True`, or `capture_mhtml=True` in **`CrawlerRunConfig`**, then: -- `result.screenshot` contains a base64-encoded PNG string. +- `result.screenshot` contains a base64-encoded PNG string. - `result.pdf` contains raw PDF bytes (you can write them to a file). +- `result.mhtml` contains the MHTML snapshot of the page as a string (you can write it to a .mhtml file). ```python +# Save the PDF with open("page.pdf", "wb") as f: f.write(result.pdf) + +# Save the MHTML +if result.mhtml: + with open("page.mhtml", "w", encoding="utf-8") as f: + f.write(result.mhtml) ``` +The MHTML (MIME HTML) format is particularly useful as it captures the entire web page including all of its resources (CSS, images, scripts, etc.) in a single file, making it perfect for archiving or offline viewing. + ### 5.4 `ssl_certificate` If `fetch_ssl_certificate=True`, `result.ssl_certificate` holds details about the site’s SSL cert, such as issuer, validity dates, etc. diff --git a/docs/md_v2/core/link-media.md b/docs/md_v2/core/link-media.md index cccc8df0..58bedcbc 100644 --- a/docs/md_v2/core/link-media.md +++ b/docs/md_v2/core/link-media.md @@ -4,7 +4,35 @@ In this tutorial, you’ll learn how to: 1. Extract links (internal, external) from crawled pages 2. Filter or exclude specific domains (e.g., social media or custom domains) -3. Access and manage media data (especially images) in the crawl result +3. 
Access and manage media data (especially images) in the crawl result
+
+### 3.2 Excluding Images
+
+#### Excluding External Images
+
+If you're dealing with heavy pages or want to skip third-party images (advertisements, for example), you can turn on:
+
+```python
+crawler_cfg = CrawlerRunConfig(
+    exclude_external_images=True
+)
+```
+
+This setting attempts to discard images from outside the primary domain, keeping only those from the site you're crawling.
+
+#### Excluding All Images
+
+If you want to completely remove all images from the page to maximize performance and reduce memory usage, use:
+
+```python
+crawler_cfg = CrawlerRunConfig(
+    exclude_all_images=True
+)
+```
+
+This setting removes all images very early in the processing pipeline, which significantly improves memory efficiency and processing speed. This is particularly useful when:
+- You don't need image data in your results
+- You're crawling image-heavy pages that cause memory issues
+- You want to focus only on text content
+- You need to maximize crawling speed
 4. Configure your crawler to exclude or prioritize certain images
 
 > **Prerequisites** 
@@ -271,8 +299,41 @@ Each extracted table contains:
 
 - **`screenshot`**: Set to `True` if you want a full-page screenshot stored as `base64` in `result.screenshot`. 
 - **`pdf`**: Set to `True` if you want a PDF version of the page in `result.pdf`. 
+- **`capture_mhtml`**: Set to `True` if you want an MHTML snapshot of the page in `result.mhtml`. This format preserves the entire web page with all its resources (CSS, images, scripts) in a single file, making it perfect for archiving or offline viewing.
 - **`wait_for_images`**: If `True`, attempts to wait until images are fully loaded before final extraction.
 
+#### Example: Capturing Page as MHTML
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async def main():
+    crawler_cfg = CrawlerRunConfig(
+        capture_mhtml=True  # Enable MHTML capture
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun("https://example.com", config=crawler_cfg)
+
+        if result.success and result.mhtml:
+            # Save the MHTML snapshot to a file
+            with open("example.mhtml", "w", encoding="utf-8") as f:
+                f.write(result.mhtml)
+            print("MHTML snapshot saved to example.mhtml")
+        else:
+            print("Failed to capture MHTML:", result.error_message)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+The MHTML format is particularly useful because:
+- It captures the complete page state including all resources
+- It can be opened in most modern browsers for offline viewing
+- It preserves the page exactly as it appeared during crawling
+- It's a single file, making it easy to store and transfer
+
 ---
 
 ## 4. Putting It All Together: Link & Media Filtering
diff --git a/temp.txt b/temp.txt
new file mode 100644
index 00000000..a9fd218d
--- /dev/null
+++ b/temp.txt
@@ -0,0 +1,3 @@
+7. **`screenshot`**, **`pdf`**, & **`capture_mhtml`**:
+   - If `True`, captures a screenshot, PDF, or MHTML snapshot after the page is fully loaded.
+   - The results go to `result.screenshot` (base64), `result.pdf` (bytes), or `result.mhtml` (string). 
diff --git a/tests/20241401/test_mhtml.py b/tests/20241401/test_mhtml.py new file mode 100644 index 00000000..06e0e294 --- /dev/null +++ b/tests/20241401/test_mhtml.py @@ -0,0 +1,213 @@ +# test_mhtml_capture.py + +import pytest +import asyncio +import re # For more robust MHTML checks + +# Assuming these can be imported directly from the crawl4ai library +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CrawlResult + +# A reliable, simple static HTML page for testing +# Using httpbin as it's designed for testing clients +TEST_URL_SIMPLE = "https://httpbin.org/html" +EXPECTED_CONTENT_SIMPLE = "Herman Melville - Moby-Dick" + +# A slightly more complex page that might involve JS (good secondary test) +TEST_URL_JS = "https://quotes.toscrape.com/js/" +EXPECTED_CONTENT_JS = "Quotes to Scrape" # Title of the page, which should be present in MHTML + +# Removed the custom event_loop fixture as pytest-asyncio provides a default one. + +@pytest.mark.asyncio +async def test_mhtml_capture_when_enabled(): + """ + Verify that when CrawlerRunConfig has capture_mhtml=True, + the CrawlResult contains valid MHTML content. + """ + # Create a fresh browser config and crawler instance for this test + browser_config = BrowserConfig(headless=True) # Use headless for testing CI/CD + # --- Key: Enable MHTML capture in the run config --- + run_config = CrawlerRunConfig(capture_mhtml=True) + + # Create a fresh crawler instance + crawler = AsyncWebCrawler(config=browser_config) + + try: + # Start the browser + await crawler.start() + + # Perform the crawl with the MHTML-enabled config + result: CrawlResult = await crawler.arun(TEST_URL_SIMPLE, config=run_config) + + # --- Assertions --- + assert result is not None, "Crawler should return a result object" + assert result.success is True, f"Crawling {TEST_URL_SIMPLE} should succeed. Error: {result.error_message}" + + # 1. Check if the mhtml attribute exists (will fail if CrawlResult not updated) + assert hasattr(result, 'mhtml'), "CrawlResult object must have an 'mhtml' attribute" + + # 2. Check if mhtml is populated + assert result.mhtml is not None, "MHTML content should be captured when enabled" + assert isinstance(result.mhtml, str), "MHTML content should be a string" + assert len(result.mhtml) > 500, "MHTML content seems too short, likely invalid" # Basic sanity check + + # 3. Check for MHTML structure indicators (more robust than simple string contains) + # MHTML files are multipart MIME messages + assert re.search(r"Content-Type: multipart/related;", result.mhtml, re.IGNORECASE), \ + "MHTML should contain 'Content-Type: multipart/related;'" + # Should contain a boundary definition + assert re.search(r"boundary=\"----MultipartBoundary", result.mhtml), \ + "MHTML should contain a multipart boundary" + # Should contain the main HTML part + assert re.search(r"Content-Type: text/html", result.mhtml, re.IGNORECASE), \ + "MHTML should contain a 'Content-Type: text/html' part" + + # 4. Check if the *actual page content* is within the MHTML string + # This confirms the snapshot captured the rendered page + assert EXPECTED_CONTENT_SIMPLE in result.mhtml, \ + f"Expected content '{EXPECTED_CONTENT_SIMPLE}' not found within the captured MHTML" + + # 5. 
Ensure standard HTML is still present and correct + assert result.html is not None, "Standard HTML should still be present" + assert isinstance(result.html, str), "Standard HTML should be a string" + assert EXPECTED_CONTENT_SIMPLE in result.html, \ + f"Expected content '{EXPECTED_CONTENT_SIMPLE}' not found within the standard HTML" + + finally: + # Important: Ensure browser is completely closed even if assertions fail + await crawler.close() + # Help the garbage collector clean up + crawler = None + + +@pytest.mark.asyncio +async def test_mhtml_capture_when_disabled_explicitly(): + """ + Verify that when CrawlerRunConfig explicitly has capture_mhtml=False, + the CrawlResult.mhtml attribute is None. + """ + # Create a fresh browser config and crawler instance for this test + browser_config = BrowserConfig(headless=True) + # --- Key: Explicitly disable MHTML capture --- + run_config = CrawlerRunConfig(capture_mhtml=False) + + # Create a fresh crawler instance + crawler = AsyncWebCrawler(config=browser_config) + + try: + # Start the browser + await crawler.start() + result: CrawlResult = await crawler.arun(TEST_URL_SIMPLE, config=run_config) + + assert result is not None + assert result.success is True, f"Crawling {TEST_URL_SIMPLE} should succeed. Error: {result.error_message}" + + # 1. Check attribute existence (important for TDD start) + assert hasattr(result, 'mhtml'), "CrawlResult object must have an 'mhtml' attribute" + + # 2. Check mhtml is None + assert result.mhtml is None, "MHTML content should be None when explicitly disabled" + + # 3. Ensure standard HTML is still present + assert result.html is not None + assert EXPECTED_CONTENT_SIMPLE in result.html + + finally: + # Important: Ensure browser is completely closed even if assertions fail + await crawler.close() + # Help the garbage collector clean up + crawler = None + + +@pytest.mark.asyncio +async def test_mhtml_capture_when_disabled_by_default(): + """ + Verify that if capture_mhtml is not specified (using its default), + the CrawlResult.mhtml attribute is None. + (This assumes the default value for capture_mhtml in CrawlerRunConfig is False) + """ + # Create a fresh browser config and crawler instance for this test + browser_config = BrowserConfig(headless=True) + # --- Key: Use default run config --- + run_config = CrawlerRunConfig() # Do not specify capture_mhtml + + # Create a fresh crawler instance + crawler = AsyncWebCrawler(config=browser_config) + + try: + # Start the browser + await crawler.start() + result: CrawlResult = await crawler.arun(TEST_URL_SIMPLE, config=run_config) + + assert result is not None + assert result.success is True, f"Crawling {TEST_URL_SIMPLE} should succeed. Error: {result.error_message}" + + # 1. Check attribute existence + assert hasattr(result, 'mhtml'), "CrawlResult object must have an 'mhtml' attribute" + + # 2. Check mhtml is None (assuming default is False) + assert result.mhtml is None, "MHTML content should be None when using default config (assuming default=False)" + + # 3. Ensure standard HTML is still present + assert result.html is not None + assert EXPECTED_CONTENT_SIMPLE in result.html + + finally: + # Important: Ensure browser is completely closed even if assertions fail + await crawler.close() + # Help the garbage collector clean up + crawler = None + +# Optional: Add a test for a JS-heavy page if needed +@pytest.mark.asyncio +async def test_mhtml_capture_on_js_page_when_enabled(): + """ + Verify MHTML capture works on a page requiring JavaScript execution. 
+ """ + # Create a fresh browser config and crawler instance for this test + browser_config = BrowserConfig(headless=True) + run_config = CrawlerRunConfig( + capture_mhtml=True, + # Add a small wait or JS execution if needed for the JS page to fully render + # For quotes.toscrape.com/js/, it renders quickly, but a wait might be safer + # wait_for_timeout=2000 # Example: wait up to 2 seconds + js_code="await new Promise(r => setTimeout(r, 500));" # Small delay after potential load + ) + + # Create a fresh crawler instance + crawler = AsyncWebCrawler(config=browser_config) + + try: + # Start the browser + await crawler.start() + result: CrawlResult = await crawler.arun(TEST_URL_JS, config=run_config) + + assert result is not None + assert result.success is True, f"Crawling {TEST_URL_JS} should succeed. Error: {result.error_message}" + assert hasattr(result, 'mhtml'), "CrawlResult object must have an 'mhtml' attribute" + assert result.mhtml is not None, "MHTML content should be captured on JS page when enabled" + assert isinstance(result.mhtml, str), "MHTML content should be a string" + assert len(result.mhtml) > 500, "MHTML content from JS page seems too short" + + # Check for MHTML structure + assert re.search(r"Content-Type: multipart/related;", result.mhtml, re.IGNORECASE) + assert re.search(r"Content-Type: text/html", result.mhtml, re.IGNORECASE) + + # Check for content rendered by JS within the MHTML + assert EXPECTED_CONTENT_JS in result.mhtml, \ + f"Expected JS-rendered content '{EXPECTED_CONTENT_JS}' not found within the captured MHTML" + + # Check standard HTML too + assert result.html is not None + assert EXPECTED_CONTENT_JS in result.html, \ + f"Expected JS-rendered content '{EXPECTED_CONTENT_JS}' not found within the standard HTML" + + finally: + # Important: Ensure browser is completely closed even if assertions fail + await crawler.close() + # Help the garbage collector clean up + crawler = None + +if __name__ == "__main__": + # Use pytest for async tests + pytest.main(["-xvs", __file__]) From 66ac07b4f3f1c6d8a756ef86b580c667eb3cd598 Mon Sep 17 00:00:00 2001 From: unclecode Date: Thu, 10 Apr 2025 16:03:48 +0800 Subject: [PATCH 43/78] feat(crawler): add network request and console message capturing Implement comprehensive network request and console message capturing functionality: - Add capture_network_requests and capture_console_messages config parameters - Add network_requests and console_messages fields to models - Implement Playwright event listeners to capture requests, responses, and console output - Create detailed documentation and examples - Add comprehensive tests This feature enables deep visibility into web page activity for debugging, security analysis, performance profiling, and API discovery in web applications. 
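
A minimal usage sketch of the new flags, assuming the `CrawlerRunConfig` parameters and
`CrawlResult` fields introduced by this change:

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def main():
    # Enable both capture types; each captured event is a plain dict with a timestamp
    config = CrawlerRunConfig(
        capture_network_requests=True,
        capture_console_messages=True,
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://example.com", config=config)
        if result.success:
            print(f"{len(result.network_requests or [])} network events captured")
            print(f"{len(result.console_messages or [])} console messages captured")

if __name__ == "__main__":
    asyncio.run(main())
```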
--- JOURNAL.md | 61 ++- crawl4ai/async_configs.py | 12 + crawl4ai/async_crawler_strategy.py | 154 +++++- crawl4ai/async_webcrawler.py | 7 +- crawl4ai/models.py | 4 + .../network_console_capture_example.py | 471 +++++++++++++++++ .../md_v2/advanced/network-console-capture.md | 205 ++++++++ docs/md_v2/api/crawl-result.md | 84 ++- mkdocs.yml | 1 + parameter_updates.txt | 20 + prompts/prompt_net_requests.md | 489 ++++++++++++++++++ temp.txt | 3 - ...t_acyn_crawl_wuth_http_crawler_strategy.py | 0 .../test_advanced_deep_crawl.py | 0 .../test_async_crawler_strategy.py | 0 .../test_async_markdown_generator.py | 0 .../test_async_webcrawler.py | 0 .../test_cache_context.py | 0 tests/{20241401 => general}/test_crawlers.py | 0 .../{20241401 => general}/test_deep_crawl.py | 0 .../test_deep_crawl_filters.py | 0 .../test_deep_crawl_scorers.py | 0 .../test_http_crawler_strategy.py | 0 .../{20241401 => general}/test_llm_filter.py | 0 tests/{20241401 => general}/test_mhtml.py | 0 tests/general/test_network_console_capture.py | 185 +++++++ .../test_robot_parser.py | 0 .../test_schema_builder.py | 0 tests/{20241401 => general}/test_stream.py | 0 .../test_stream_dispatch.py | 0 tests/{20241401 => general}/tets_robot.py | 0 31 files changed, 1686 insertions(+), 10 deletions(-) create mode 100644 docs/examples/network_console_capture_example.py create mode 100644 docs/md_v2/advanced/network-console-capture.md create mode 100644 parameter_updates.txt create mode 100644 prompts/prompt_net_requests.md delete mode 100644 temp.txt rename tests/{20241401 => general}/test_acyn_crawl_wuth_http_crawler_strategy.py (100%) rename tests/{20241401 => general}/test_advanced_deep_crawl.py (100%) rename tests/{20241401 => general}/test_async_crawler_strategy.py (100%) rename tests/{20241401 => general}/test_async_markdown_generator.py (100%) rename tests/{20241401 => general}/test_async_webcrawler.py (100%) rename tests/{20241401 => general}/test_cache_context.py (100%) rename tests/{20241401 => general}/test_crawlers.py (100%) rename tests/{20241401 => general}/test_deep_crawl.py (100%) rename tests/{20241401 => general}/test_deep_crawl_filters.py (100%) rename tests/{20241401 => general}/test_deep_crawl_scorers.py (100%) rename tests/{20241401 => general}/test_http_crawler_strategy.py (100%) rename tests/{20241401 => general}/test_llm_filter.py (100%) rename tests/{20241401 => general}/test_mhtml.py (100%) create mode 100644 tests/general/test_network_console_capture.py rename tests/{20241401 => general}/test_robot_parser.py (100%) rename tests/{20241401 => general}/test_schema_builder.py (100%) rename tests/{20241401 => general}/test_stream.py (100%) rename tests/{20241401 => general}/test_stream_dispatch.py (100%) rename tests/{20241401 => general}/tets_robot.py (100%) diff --git a/JOURNAL.md b/JOURNAL.md index 31e86131..ac00e890 100644 --- a/JOURNAL.md +++ b/JOURNAL.md @@ -46,4 +46,63 @@ The MHTML capture feature allows users to capture complete web pages including a **Future Enhancements to Consider:** - Add option to save MHTML to file - Support for filtering what resources get included in MHTML -- Add support for specifying MHTML capture options \ No newline at end of file +- Add support for specifying MHTML capture options + +## [2025-04-10] Added Network Request and Console Message Capturing + +**Feature:** Comprehensive capturing of network requests/responses and browser console messages during crawling + +**Changes Made:** +1. 
Added `capture_network_requests: bool = False` and `capture_console_messages: bool = False` parameters to `CrawlerRunConfig` class +2. Added `network_requests: Optional[List[Dict[str, Any]]] = None` and `console_messages: Optional[List[Dict[str, Any]]] = None` fields to both `AsyncCrawlResponse` and `CrawlResult` models +3. Implemented event listeners in `AsyncPlaywrightCrawlerStrategy._crawl_web()` to capture browser network events and console messages +4. Added proper event listener cleanup in the finally block to prevent resource leaks +5. Modified the crawler flow to pass captured data from AsyncCrawlResponse to CrawlResult + +**Implementation Details:** +- Network capture uses Playwright event listeners (`request`, `response`, and `requestfailed`) to record all network activity +- Console capture uses Playwright event listeners (`console` and `pageerror`) to record console messages and errors +- Each network event includes metadata like URL, headers, status, and timing information +- Each console message includes type, text content, and source location when available +- All captured events include timestamps for chronological analysis +- Error handling ensures even failed capture attempts won't crash the main crawling process + +**Files Modified:** +- `crawl4ai/models.py`: Added new fields to AsyncCrawlResponse and CrawlResult +- `crawl4ai/async_configs.py`: Added new configuration parameters to CrawlerRunConfig +- `crawl4ai/async_crawler_strategy.py`: Implemented capture logic using event listeners +- `crawl4ai/async_webcrawler.py`: Added data transfer from AsyncCrawlResponse to CrawlResult + +**Documentation:** +- Created detailed documentation in `docs/md_v2/advanced/network-console-capture.md` +- Added feature to site navigation in `mkdocs.yml` +- Updated CrawlResult documentation in `docs/md_v2/api/crawl-result.md` +- Created comprehensive example in `docs/examples/network_console_capture_example.py` + +**Testing:** +- Created `tests/general/test_network_console_capture.py` with tests for: + - Verifying capture is disabled by default + - Testing network request capturing + - Testing console message capturing + - Ensuring both capture types can be enabled simultaneously + - Checking correct content is captured in expected formats + +**Challenges:** +- Initial implementation had synchronous/asynchronous mismatches in event handlers +- Needed to fix type of property access vs. method calls in handlers +- Required careful cleanup of event listeners to prevent memory leaks + +**Why This Feature:** +The network and console capture feature provides deep visibility into web page activity, enabling: +1. Debugging complex web applications by seeing all network requests and errors +2. Security analysis to detect unexpected third-party requests and data flows +3. Performance profiling to identify slow-loading resources +4. API discovery in single-page applications +5. 
Comprehensive analysis of web application behavior + +**Future Enhancements to Consider:** +- Option to filter captured events by type, domain, or content +- Support for capturing response bodies (with size limits) +- Aggregate statistics calculation for performance metrics +- Integration with visualization tools for network waterfall analysis +- Exporting captures in HAR format for use with external tools \ No newline at end of file diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index 079afdee..af98e607 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -787,6 +787,9 @@ class CrawlerRunConfig(): # Debugging and Logging Parameters verbose: bool = True, log_console: bool = False, + # Network and Console Capturing Parameters + capture_network_requests: bool = False, + capture_console_messages: bool = False, # Connection Parameters method: str = "GET", stream: bool = False, @@ -881,6 +884,10 @@ class CrawlerRunConfig(): # Debugging and Logging Parameters self.verbose = verbose self.log_console = log_console + + # Network and Console Capturing Parameters + self.capture_network_requests = capture_network_requests + self.capture_console_messages = capture_console_messages # Connection Parameters self.stream = stream @@ -1017,6 +1024,9 @@ class CrawlerRunConfig(): # Debugging and Logging Parameters verbose=kwargs.get("verbose", True), log_console=kwargs.get("log_console", False), + # Network and Console Capturing Parameters + capture_network_requests=kwargs.get("capture_network_requests", False), + capture_console_messages=kwargs.get("capture_console_messages", False), # Connection Parameters method=kwargs.get("method", "GET"), stream=kwargs.get("stream", False), @@ -1107,6 +1117,8 @@ class CrawlerRunConfig(): "exclude_internal_links": self.exclude_internal_links, "verbose": self.verbose, "log_console": self.log_console, + "capture_network_requests": self.capture_network_requests, + "capture_console_messages": self.capture_console_messages, "method": self.method, "stream": self.stream, "check_robots_txt": self.check_robots_txt, diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index bdb7bfca..f99d1cb9 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -478,6 +478,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): ) -> AsyncCrawlResponse: """ Internal method to crawl web URLs with the specified configuration. + Includes optional network and console capturing. 
Args: url (str): The web URL to crawl @@ -494,6 +495,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Reset downloaded files list for new crawl self._downloaded_files = [] + + # Initialize capture lists + captured_requests = [] + captured_console = [] # Handle user agent with magic mode user_agent_to_override = config.user_agent @@ -521,9 +526,144 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Call hook after page creation await self.execute_hook("on_page_context_created", page, context=context, config=config) + # Network Request Capturing + if config.capture_network_requests: + async def handle_request_capture(request): + try: + post_data_str = None + try: + # Be cautious with large post data + post_data = request.post_data_buffer + if post_data: + # Attempt to decode, fallback to base64 or size indication + try: + post_data_str = post_data.decode('utf-8', errors='replace') + except UnicodeDecodeError: + post_data_str = f"[Binary data: {len(post_data)} bytes]" + except Exception: + post_data_str = "[Error retrieving post data]" + + captured_requests.append({ + "event_type": "request", + "url": request.url, + "method": request.method, + "headers": dict(request.headers), # Convert Header dict + "post_data": post_data_str, + "resource_type": request.resource_type, + "is_navigation_request": request.is_navigation_request(), + "timestamp": time.time() + }) + except Exception as e: + if self.logger: + self.logger.warning(f"Error capturing request details for {request.url}: {e}", tag="CAPTURE") + captured_requests.append({"event_type": "request_capture_error", "url": request.url, "error": str(e), "timestamp": time.time()}) + + async def handle_response_capture(response): + try: + captured_requests.append({ + "event_type": "response", + "url": response.url, + "status": response.status, + "status_text": response.status_text, + "headers": dict(response.headers), # Convert Header dict + "from_service_worker": response.from_service_worker, + "request_timing": response.request.timing, # Detailed timing info + "timestamp": time.time() + }) + except Exception as e: + if self.logger: + self.logger.warning(f"Error capturing response details for {response.url}: {e}", tag="CAPTURE") + captured_requests.append({"event_type": "response_capture_error", "url": response.url, "error": str(e), "timestamp": time.time()}) + + async def handle_request_failed_capture(request): + try: + captured_requests.append({ + "event_type": "request_failed", + "url": request.url, + "method": request.method, + "resource_type": request.resource_type, + "failure_text": request.failure.error_text if request.failure else "Unknown failure", + "timestamp": time.time() + }) + except Exception as e: + if self.logger: + self.logger.warning(f"Error capturing request failed details for {request.url}: {e}", tag="CAPTURE") + captured_requests.append({"event_type": "request_failed_capture_error", "url": request.url, "error": str(e), "timestamp": time.time()}) + + page.on("request", handle_request_capture) + page.on("response", handle_response_capture) + page.on("requestfailed", handle_request_failed_capture) + + # Console Message Capturing + if config.capture_console_messages: + def handle_console_capture(msg): + try: + message_type = "unknown" + try: + message_type = msg.type + except: + pass + + message_text = "unknown" + try: + message_text = msg.text + except: + pass + + # Basic console message with minimal content + entry = { + "type": message_type, + "text": message_text, + "timestamp": time.time() + } 
+ + captured_console.append(entry) + + except Exception as e: + if self.logger: + self.logger.warning(f"Error capturing console message: {e}", tag="CAPTURE") + # Still add something to the list even on error + captured_console.append({ + "type": "console_capture_error", + "error": str(e), + "timestamp": time.time() + }) + + def handle_pageerror_capture(err): + try: + error_message = "Unknown error" + try: + error_message = err.message + except: + pass + + error_stack = "" + try: + error_stack = err.stack + except: + pass + + captured_console.append({ + "type": "error", + "text": error_message, + "stack": error_stack, + "timestamp": time.time() + }) + except Exception as e: + if self.logger: + self.logger.warning(f"Error capturing page error: {e}", tag="CAPTURE") + captured_console.append({ + "type": "pageerror_capture_error", + "error": str(e), + "timestamp": time.time() + }) + + # Add event listeners directly + page.on("console", handle_console_capture) + page.on("pageerror", handle_pageerror_capture) + # Set up console logging if requested if config.log_console: - def log_consol( msg, console_log_type="debug" ): # Corrected the parameter syntax @@ -887,6 +1027,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self._downloaded_files if self._downloaded_files else None ), redirected_url=redirected_url, + # Include captured data if enabled + network_requests=captured_requests if config.capture_network_requests else None, + console_messages=captured_console if config.capture_console_messages else None, ) except Exception as e: @@ -895,6 +1038,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): finally: # If no session_id is given we should close the page if not config.session_id: + # Detach listeners before closing to prevent potential errors during close + if config.capture_network_requests: + page.remove_listener("request", handle_request_capture) + page.remove_listener("response", handle_response_capture) + page.remove_listener("requestfailed", handle_request_failed_capture) + if config.capture_console_messages: + page.remove_listener("console", handle_console_capture) + page.remove_listener("pageerror", handle_pageerror_capture) + await page.close() async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1): diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 16bd5f57..1cd1b8c9 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -366,9 +366,10 @@ class AsyncWebCrawler: crawl_result.downloaded_files = async_response.downloaded_files crawl_result.js_execution_result = js_execution_result crawl_result.mhtml = async_response.mhtml_data - crawl_result.ssl_certificate = ( - async_response.ssl_certificate - ) # Add SSL certificate + crawl_result.ssl_certificate = async_response.ssl_certificate + # Add captured network and console data if available + crawl_result.network_requests = async_response.network_requests + crawl_result.console_messages = async_response.console_messages crawl_result.success = bool(html) crawl_result.session_id = getattr(config, "session_id", None) diff --git a/crawl4ai/models.py b/crawl4ai/models.py index f132dc16..32cca3ed 100644 --- a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -148,6 +148,8 @@ class CrawlResult(BaseModel): ssl_certificate: Optional[SSLCertificate] = None dispatch_result: Optional[DispatchResult] = None redirected_url: Optional[str] = None + network_requests: Optional[List[Dict[str, Any]]] = None + console_messages: Optional[List[Dict[str, Any]]] = 
None class Config: arbitrary_types_allowed = True @@ -315,6 +317,8 @@ class AsyncCrawlResponse(BaseModel): downloaded_files: Optional[List[str]] = None ssl_certificate: Optional[SSLCertificate] = None redirected_url: Optional[str] = None + network_requests: Optional[List[Dict[str, Any]]] = None + console_messages: Optional[List[Dict[str, Any]]] = None class Config: arbitrary_types_allowed = True diff --git a/docs/examples/network_console_capture_example.py b/docs/examples/network_console_capture_example.py new file mode 100644 index 00000000..5305ddc3 --- /dev/null +++ b/docs/examples/network_console_capture_example.py @@ -0,0 +1,471 @@ +import asyncio +import json +import os +import base64 +from pathlib import Path +from typing import List, Dict, Any +from datetime import datetime + +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, CrawlResult +from crawl4ai import BrowserConfig + +__cur_dir__ = Path(__file__).parent + +# Create temp directory if it doesn't exist +os.makedirs(os.path.join(__cur_dir__, "tmp"), exist_ok=True) + +async def demo_basic_network_capture(): + """Basic network request capturing example""" + print("\n=== 1. Basic Network Request Capturing ===") + + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig( + capture_network_requests=True, + wait_until="networkidle" # Wait for network to be idle + ) + + result = await crawler.arun( + url="https://example.com/", + config=config + ) + + if result.success and result.network_requests: + print(f"Captured {len(result.network_requests)} network events") + + # Count by event type + event_types = {} + for req in result.network_requests: + event_type = req.get("event_type", "unknown") + event_types[event_type] = event_types.get(event_type, 0) + 1 + + print("Event types:") + for event_type, count in event_types.items(): + print(f" - {event_type}: {count}") + + # Show a sample request and response + request = next((r for r in result.network_requests if r.get("event_type") == "request"), None) + response = next((r for r in result.network_requests if r.get("event_type") == "response"), None) + + if request: + print("\nSample request:") + print(f" URL: {request.get('url')}") + print(f" Method: {request.get('method')}") + print(f" Headers: {list(request.get('headers', {}).keys())}") + + if response: + print("\nSample response:") + print(f" URL: {response.get('url')}") + print(f" Status: {response.get('status')} {response.get('status_text', '')}") + print(f" Headers: {list(response.get('headers', {}).keys())}") + +async def demo_basic_console_capture(): + """Basic console message capturing example""" + print("\n=== 2. Basic Console Message Capturing ===") + + # Create a simple HTML file with console messages + html_file = os.path.join(__cur_dir__, "tmp", "console_test.html") + with open(html_file, "w") as f: + f.write(""" + + + + Console Test + + +

Console Message Test

+ + + + """) + + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig( + capture_console_messages=True, + wait_until="networkidle" # Wait to make sure all scripts execute + ) + + result = await crawler.arun( + url=f"file://{html_file}", + config=config + ) + + if result.success and result.console_messages: + print(f"Captured {len(result.console_messages)} console messages") + + # Count by message type + message_types = {} + for msg in result.console_messages: + msg_type = msg.get("type", "unknown") + message_types[msg_type] = message_types.get(msg_type, 0) + 1 + + print("Message types:") + for msg_type, count in message_types.items(): + print(f" - {msg_type}: {count}") + + # Show all messages + print("\nAll console messages:") + for i, msg in enumerate(result.console_messages, 1): + print(f" {i}. [{msg.get('type', 'unknown')}] {msg.get('text', '')}") + +async def demo_combined_capture(): + """Capturing both network requests and console messages""" + print("\n=== 3. Combined Network and Console Capture ===") + + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig( + capture_network_requests=True, + capture_console_messages=True, + wait_until="networkidle" + ) + + result = await crawler.arun( + url="https://httpbin.org/html", + config=config + ) + + if result.success: + network_count = len(result.network_requests) if result.network_requests else 0 + console_count = len(result.console_messages) if result.console_messages else 0 + + print(f"Captured {network_count} network events and {console_count} console messages") + + # Save the captured data to a JSON file for analysis + output_file = os.path.join(__cur_dir__, "tmp", "capture_data.json") + with open(output_file, "w") as f: + json.dump({ + "url": result.url, + "timestamp": datetime.now().isoformat(), + "network_requests": result.network_requests, + "console_messages": result.console_messages + }, f, indent=2) + + print(f"Full capture data saved to {output_file}") + +async def analyze_spa_network_traffic(): + """Analyze network traffic of a Single-Page Application""" + print("\n=== 4. 
Analyzing SPA Network Traffic ===") + + async with AsyncWebCrawler(config=BrowserConfig( + headless=True, + viewport_width=1280, + viewport_height=800 + )) as crawler: + config = CrawlerRunConfig( + capture_network_requests=True, + capture_console_messages=True, + # Wait longer to ensure all resources are loaded + wait_until="networkidle", + page_timeout=60000, # 60 seconds + ) + + result = await crawler.arun( + url="https://weather.com", + config=config + ) + + if result.success and result.network_requests: + # Extract different types of requests + requests = [] + responses = [] + failures = [] + + for event in result.network_requests: + event_type = event.get("event_type") + if event_type == "request": + requests.append(event) + elif event_type == "response": + responses.append(event) + elif event_type == "request_failed": + failures.append(event) + + print(f"Captured {len(requests)} requests, {len(responses)} responses, and {len(failures)} failures") + + # Analyze request types + resource_types = {} + for req in requests: + resource_type = req.get("resource_type", "unknown") + resource_types[resource_type] = resource_types.get(resource_type, 0) + 1 + + print("\nResource types:") + for resource_type, count in sorted(resource_types.items(), key=lambda x: x[1], reverse=True): + print(f" - {resource_type}: {count}") + + # Analyze API calls + api_calls = [r for r in requests if "api" in r.get("url", "").lower()] + if api_calls: + print(f"\nDetected {len(api_calls)} API calls:") + for i, call in enumerate(api_calls[:5], 1): # Show first 5 + print(f" {i}. {call.get('method')} {call.get('url')}") + if len(api_calls) > 5: + print(f" ... and {len(api_calls) - 5} more") + + # Analyze response status codes + status_codes = {} + for resp in responses: + status = resp.get("status", 0) + status_codes[status] = status_codes.get(status, 0) + 1 + + print("\nResponse status codes:") + for status, count in sorted(status_codes.items()): + print(f" - {status}: {count}") + + # Analyze failures + if failures: + print("\nFailed requests:") + for i, failure in enumerate(failures[:5], 1): # Show first 5 + print(f" {i}. {failure.get('url')} - {failure.get('failure_text')}") + if len(failures) > 5: + print(f" ... and {len(failures) - 5} more") + + # Check for console errors + if result.console_messages: + errors = [msg for msg in result.console_messages if msg.get("type") == "error"] + if errors: + print(f"\nDetected {len(errors)} console errors:") + for i, error in enumerate(errors[:3], 1): # Show first 3 + print(f" {i}. {error.get('text', '')[:100]}...") + if len(errors) > 3: + print(f" ... and {len(errors) - 3} more") + + # Save analysis to file + output_file = os.path.join(__cur_dir__, "tmp", "weather_network_analysis.json") + with open(output_file, "w") as f: + json.dump({ + "url": result.url, + "timestamp": datetime.now().isoformat(), + "statistics": { + "request_count": len(requests), + "response_count": len(responses), + "failure_count": len(failures), + "resource_types": resource_types, + "status_codes": {str(k): v for k, v in status_codes.items()}, + "api_call_count": len(api_calls), + "console_error_count": len(errors) if result.console_messages else 0 + }, + "network_requests": result.network_requests, + "console_messages": result.console_messages + }, f, indent=2) + + print(f"\nFull analysis saved to {output_file}") + +async def demo_security_analysis(): + """Using network capture for security analysis""" + print("\n=== 5. 
Security Analysis with Network Capture ===") + + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig( + capture_network_requests=True, + capture_console_messages=True, + wait_until="networkidle" + ) + + # A site that makes multiple third-party requests + result = await crawler.arun( + url="https://www.nytimes.com/", + config=config + ) + + if result.success and result.network_requests: + print(f"Captured {len(result.network_requests)} network events") + + # Extract all domains + domains = set() + for req in result.network_requests: + if req.get("event_type") == "request": + url = req.get("url", "") + try: + from urllib.parse import urlparse + domain = urlparse(url).netloc + if domain: + domains.add(domain) + except: + pass + + print(f"\nDetected requests to {len(domains)} unique domains:") + main_domain = urlparse(result.url).netloc + + # Separate first-party vs third-party domains + first_party = [d for d in domains if main_domain in d] + third_party = [d for d in domains if main_domain not in d] + + print(f" - First-party domains: {len(first_party)}") + print(f" - Third-party domains: {len(third_party)}") + + # Look for potential trackers/analytics + tracking_keywords = ["analytics", "tracker", "pixel", "tag", "stats", "metric", "collect", "beacon"] + potential_trackers = [] + + for domain in third_party: + if any(keyword in domain.lower() for keyword in tracking_keywords): + potential_trackers.append(domain) + + if potential_trackers: + print(f"\nPotential tracking/analytics domains ({len(potential_trackers)}):") + for i, domain in enumerate(sorted(potential_trackers)[:10], 1): + print(f" {i}. {domain}") + if len(potential_trackers) > 10: + print(f" ... and {len(potential_trackers) - 10} more") + + # Check for insecure (HTTP) requests + insecure_requests = [ + req.get("url") for req in result.network_requests + if req.get("event_type") == "request" and req.get("url", "").startswith("http://") + ] + + if insecure_requests: + print(f"\nWarning: Found {len(insecure_requests)} insecure (HTTP) requests:") + for i, url in enumerate(insecure_requests[:5], 1): + print(f" {i}. {url}") + if len(insecure_requests) > 5: + print(f" ... and {len(insecure_requests) - 5} more") + + # Save security analysis to file + output_file = os.path.join(__cur_dir__, "tmp", "security_analysis.json") + with open(output_file, "w") as f: + json.dump({ + "url": result.url, + "main_domain": main_domain, + "timestamp": datetime.now().isoformat(), + "analysis": { + "total_requests": len([r for r in result.network_requests if r.get("event_type") == "request"]), + "unique_domains": len(domains), + "first_party_domains": first_party, + "third_party_domains": third_party, + "potential_trackers": potential_trackers, + "insecure_requests": insecure_requests + } + }, f, indent=2) + + print(f"\nFull security analysis saved to {output_file}") + +async def demo_performance_analysis(): + """Using network capture for performance analysis""" + print("\n=== 6. 
Performance Analysis with Network Capture ===") + + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig( + capture_network_requests=True, + wait_until="networkidle", + page_timeout=60000 # 60 seconds + ) + + result = await crawler.arun( + url="https://www.cnn.com/", + config=config + ) + + if result.success and result.network_requests: + # Filter only response events with timing information + responses_with_timing = [ + r for r in result.network_requests + if r.get("event_type") == "response" and r.get("request_timing") + ] + + if responses_with_timing: + print(f"Analyzing timing for {len(responses_with_timing)} network responses") + + # Group by resource type + resource_timings = {} + for resp in responses_with_timing: + url = resp.get("url", "") + timing = resp.get("request_timing", {}) + + # Determine resource type from URL extension + ext = url.split(".")[-1].lower() if "." in url.split("/")[-1] else "unknown" + if ext in ["jpg", "jpeg", "png", "gif", "webp", "svg", "ico"]: + resource_type = "image" + elif ext in ["js"]: + resource_type = "javascript" + elif ext in ["css"]: + resource_type = "css" + elif ext in ["woff", "woff2", "ttf", "otf", "eot"]: + resource_type = "font" + else: + resource_type = "other" + + if resource_type not in resource_timings: + resource_timings[resource_type] = [] + + # Calculate request duration if timing information is available + if isinstance(timing, dict) and "requestTime" in timing and "receiveHeadersEnd" in timing: + # Convert to milliseconds + duration = (timing["receiveHeadersEnd"] - timing["requestTime"]) * 1000 + resource_timings[resource_type].append({ + "url": url, + "duration_ms": duration + }) + + # Calculate statistics for each resource type + print("\nPerformance by resource type:") + for resource_type, timings in resource_timings.items(): + if timings: + durations = [t["duration_ms"] for t in timings] + avg_duration = sum(durations) / len(durations) + max_duration = max(durations) + slowest_resource = next(t["url"] for t in timings if t["duration_ms"] == max_duration) + + print(f" {resource_type.upper()}:") + print(f" - Count: {len(timings)}") + print(f" - Avg time: {avg_duration:.2f} ms") + print(f" - Max time: {max_duration:.2f} ms") + print(f" - Slowest: {slowest_resource}") + + # Identify the slowest resources overall + all_timings = [] + for resource_type, timings in resource_timings.items(): + for timing in timings: + timing["type"] = resource_type + all_timings.append(timing) + + all_timings.sort(key=lambda x: x["duration_ms"], reverse=True) + + print("\nTop 5 slowest resources:") + for i, timing in enumerate(all_timings[:5], 1): + print(f" {i}. 
[{timing['type']}] {timing['url']} - {timing['duration_ms']:.2f} ms") + + # Save performance analysis to file + output_file = os.path.join(__cur_dir__, "tmp", "performance_analysis.json") + with open(output_file, "w") as f: + json.dump({ + "url": result.url, + "timestamp": datetime.now().isoformat(), + "resource_timings": resource_timings, + "slowest_resources": all_timings[:10] # Save top 10 + }, f, indent=2) + + print(f"\nFull performance analysis saved to {output_file}") + +async def main(): + """Run all demo functions sequentially""" + print("=== Network and Console Capture Examples ===") + + # Make sure tmp directory exists + os.makedirs(os.path.join(__cur_dir__, "tmp"), exist_ok=True) + + # Run basic examples + await demo_basic_network_capture() + await demo_basic_console_capture() + await demo_combined_capture() + + # Run advanced examples + await analyze_spa_network_traffic() + await demo_security_analysis() + await demo_performance_analysis() + + print("\n=== Examples Complete ===") + print(f"Check the tmp directory for output files: {os.path.join(__cur_dir__, 'tmp')}") + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/docs/md_v2/advanced/network-console-capture.md b/docs/md_v2/advanced/network-console-capture.md new file mode 100644 index 00000000..4305a25f --- /dev/null +++ b/docs/md_v2/advanced/network-console-capture.md @@ -0,0 +1,205 @@ +# Network Requests & Console Message Capturing + +Crawl4AI can capture all network requests and browser console messages during a crawl, which is invaluable for debugging, security analysis, or understanding page behavior. + +## Configuration + +To enable network and console capturing, use these configuration options: + +```python +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig + +# Enable both network request capture and console message capture +config = CrawlerRunConfig( + capture_network_requests=True, # Capture all network requests and responses + capture_console_messages=True # Capture all browser console output +) +``` + +## Example Usage + +```python +import asyncio +import json +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig + +async def main(): + # Enable both network request capture and console message capture + config = CrawlerRunConfig( + capture_network_requests=True, + capture_console_messages=True + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com", + config=config + ) + + if result.success: + # Analyze network requests + if result.network_requests: + print(f"Captured {len(result.network_requests)} network events") + + # Count request types + request_count = len([r for r in result.network_requests if r.get("event_type") == "request"]) + response_count = len([r for r in result.network_requests if r.get("event_type") == "response"]) + failed_count = len([r for r in result.network_requests if r.get("event_type") == "request_failed"]) + + print(f"Requests: {request_count}, Responses: {response_count}, Failed: {failed_count}") + + # Find API calls + api_calls = [r for r in result.network_requests + if r.get("event_type") == "request" and "api" in r.get("url", "")] + if api_calls: + print(f"Detected {len(api_calls)} API calls:") + for call in api_calls[:3]: # Show first 3 + print(f" - {call.get('method')} {call.get('url')}") + + # Analyze console messages + if result.console_messages: + print(f"Captured {len(result.console_messages)} console messages") + + # Group by type + message_types = {} + for msg in 
result.console_messages: + msg_type = msg.get("type", "unknown") + message_types[msg_type] = message_types.get(msg_type, 0) + 1 + + print("Message types:", message_types) + + # Show errors (often the most important) + errors = [msg for msg in result.console_messages if msg.get("type") == "error"] + if errors: + print(f"Found {len(errors)} console errors:") + for err in errors[:2]: # Show first 2 + print(f" - {err.get('text', '')[:100]}") + + # Export all captured data to a file for detailed analysis + with open("network_capture.json", "w") as f: + json.dump({ + "url": result.url, + "network_requests": result.network_requests or [], + "console_messages": result.console_messages or [] + }, f, indent=2) + + print("Exported detailed capture data to network_capture.json") + +if __name__ == "__main__": + asyncio.run(main()) +``` + +## Captured Data Structure + +### Network Requests + +The `result.network_requests` contains a list of dictionaries, each representing a network event with these common fields: + +| Field | Description | +|-------|-------------| +| `event_type` | Type of event: `"request"`, `"response"`, or `"request_failed"` | +| `url` | The URL of the request | +| `timestamp` | Unix timestamp when the event was captured | + +#### Request Event Fields + +```json +{ + "event_type": "request", + "url": "https://example.com/api/data.json", + "method": "GET", + "headers": {"User-Agent": "...", "Accept": "..."}, + "post_data": "key=value&otherkey=value", + "resource_type": "fetch", + "is_navigation_request": false, + "timestamp": 1633456789.123 +} +``` + +#### Response Event Fields + +```json +{ + "event_type": "response", + "url": "https://example.com/api/data.json", + "status": 200, + "status_text": "OK", + "headers": {"Content-Type": "application/json", "Cache-Control": "..."}, + "from_service_worker": false, + "request_timing": {"requestTime": 1234.56, "receiveHeadersEnd": 1234.78}, + "timestamp": 1633456789.456 +} +``` + +#### Failed Request Event Fields + +```json +{ + "event_type": "request_failed", + "url": "https://example.com/missing.png", + "method": "GET", + "resource_type": "image", + "failure_text": "net::ERR_ABORTED 404", + "timestamp": 1633456789.789 +} +``` + +### Console Messages + +The `result.console_messages` contains a list of dictionaries, each representing a console message with these common fields: + +| Field | Description | +|-------|-------------| +| `type` | Message type: `"log"`, `"error"`, `"warning"`, `"info"`, etc. 
| +| `text` | The message text | +| `timestamp` | Unix timestamp when the message was captured | + +#### Console Message Example + +```json +{ + "type": "error", + "text": "Uncaught TypeError: Cannot read property 'length' of undefined", + "location": "https://example.com/script.js:123:45", + "timestamp": 1633456790.123 +} +``` + +## Key Benefits + +- **Full Request Visibility**: Capture all network activity including: + - Requests (URLs, methods, headers, post data) + - Responses (status codes, headers, timing) + - Failed requests (with error messages) + +- **Console Message Access**: View all JavaScript console output: + - Log messages + - Warnings + - Errors with stack traces + - Developer debugging information + +- **Debugging Power**: Identify issues such as: + - Failed API calls or resource loading + - JavaScript errors affecting page functionality + - CORS or other security issues + - Hidden API endpoints and data flows + +- **Security Analysis**: Detect: + - Unexpected third-party requests + - Data leakage in request payloads + - Suspicious script behavior + +- **Performance Insights**: Analyze: + - Request timing data + - Resource loading patterns + - Potential bottlenecks + +## Use Cases + +1. **API Discovery**: Identify hidden endpoints and data flows in single-page applications +2. **Debugging**: Track down JavaScript errors affecting page functionality +3. **Security Auditing**: Detect unwanted third-party requests or data leakage +4. **Performance Analysis**: Identify slow-loading resources +5. **Ad/Tracker Analysis**: Detect and catalog advertising or tracking calls + +This capability is especially valuable for complex sites with heavy JavaScript, single-page applications, or when you need to understand the exact communication happening between a browser and servers. \ No newline at end of file diff --git a/docs/md_v2/api/crawl-result.md b/docs/md_v2/api/crawl-result.md index 43967886..52cf6ace 100644 --- a/docs/md_v2/api/crawl-result.md +++ b/docs/md_v2/api/crawl-result.md @@ -281,7 +281,69 @@ for result in results: --- -## 7. Example: Accessing Everything +## 7. Network Requests & Console Messages + +When you enable network and console message capturing in `CrawlerRunConfig` using `capture_network_requests=True` and `capture_console_messages=True`, the `CrawlResult` will include these fields: + +### 7.1 **`network_requests`** *(Optional[List[Dict[str, Any]]])* +**What**: A list of dictionaries containing information about all network requests, responses, and failures captured during the crawl. +**Structure**: +- Each item has an `event_type` field that can be `"request"`, `"response"`, or `"request_failed"`. +- Request events include `url`, `method`, `headers`, `post_data`, `resource_type`, and `is_navigation_request`. +- Response events include `url`, `status`, `status_text`, `headers`, and `request_timing`. +- Failed request events include `url`, `method`, `resource_type`, and `failure_text`. +- All events include a `timestamp` field. 
+ +**Usage**: +```python +if result.network_requests: + # Count different types of events + requests = [r for r in result.network_requests if r.get("event_type") == "request"] + responses = [r for r in result.network_requests if r.get("event_type") == "response"] + failures = [r for r in result.network_requests if r.get("event_type") == "request_failed"] + + print(f"Captured {len(requests)} requests, {len(responses)} responses, and {len(failures)} failures") + + # Analyze API calls + api_calls = [r for r in requests if "api" in r.get("url", "")] + + # Identify failed resources + for failure in failures: + print(f"Failed to load: {failure.get('url')} - {failure.get('failure_text')}") +``` + +### 7.2 **`console_messages`** *(Optional[List[Dict[str, Any]]])* +**What**: A list of dictionaries containing all browser console messages captured during the crawl. +**Structure**: +- Each item has a `type` field indicating the message type (e.g., `"log"`, `"error"`, `"warning"`, etc.). +- The `text` field contains the actual message text. +- Some messages include `location` information (URL, line, column). +- All messages include a `timestamp` field. + +**Usage**: +```python +if result.console_messages: + # Count messages by type + message_types = {} + for msg in result.console_messages: + msg_type = msg.get("type", "unknown") + message_types[msg_type] = message_types.get(msg_type, 0) + 1 + + print(f"Message type counts: {message_types}") + + # Display errors (which are usually most important) + for msg in result.console_messages: + if msg.get("type") == "error": + print(f"Error: {msg.get('text')}") +``` + +These fields provide deep visibility into the page's network activity and browser console, which is invaluable for debugging, security analysis, and understanding complex web applications. + +For more details on network and console capturing, see the [Network & Console Capture documentation](../advanced/network-console-capture.md). + +--- + +## 8. Example: Accessing Everything ```python async def handle_result(result: CrawlResult): @@ -321,11 +383,29 @@ async def handle_result(result: CrawlResult): print("PDF bytes length:", len(result.pdf)) if result.mhtml: print("MHTML length:", len(result.mhtml)) + + # Network and console capturing + if result.network_requests: + print(f"Network requests captured: {len(result.network_requests)}") + # Analyze request types + req_types = {} + for req in result.network_requests: + if "resource_type" in req: + req_types[req["resource_type"]] = req_types.get(req["resource_type"], 0) + 1 + print(f"Resource types: {req_types}") + + if result.console_messages: + print(f"Console messages captured: {len(result.console_messages)}") + # Count by message type + msg_types = {} + for msg in result.console_messages: + msg_types[msg.get("type", "unknown")] = msg_types.get(msg.get("type", "unknown"), 0) + 1 + print(f"Message types: {msg_types}") ``` --- -## 8. Key Points & Future +## 9. Key Points & Future 1. **Deprecated legacy properties of CrawlResult** - `markdown_v2` - Deprecated in v0.5. Just use `markdown`. It holds the `MarkdownGenerationResult` now! 
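As a quick debugging aid built on the `network_requests` and `console_messages` fields documented in the hunk above, here is a minimal sketch that is not part of the patch itself; `summarize_failures` is a hypothetical helper name, and it assumes both `capture_network_requests=True` and `capture_console_messages=True` were set on the run config.

```python
def summarize_failures(result) -> dict:
    """Collect failed network requests and console errors from one CrawlResult (sketch)."""
    failed_requests = [
        f"{event.get('url')} -> {event.get('failure_text')}"
        for event in (result.network_requests or [])
        if event.get("event_type") == "request_failed"
    ]
    console_errors = [
        msg.get("text", "")
        for msg in (result.console_messages or [])
        if msg.get("type") == "error"
    ]
    return {"failed_requests": failed_requests, "console_errors": console_errors}
```

Because both fields default to `None` when capturing is disabled, the `or []` guards keep the helper safe to call on any result.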
diff --git a/mkdocs.yml b/mkdocs.yml index 3082d041..82b2fa02 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -38,6 +38,7 @@ nav: - "Crawl Dispatcher": "advanced/crawl-dispatcher.md" - "Identity Based Crawling": "advanced/identity-based-crawling.md" - "SSL Certificate": "advanced/ssl-certificate.md" + - "Network & Console Capture": "advanced/network-console-capture.md" - Extraction: - "LLM-Free Strategies": "extraction/no-llm-strategies.md" - "LLM Strategies": "extraction/llm-strategies.md" diff --git a/parameter_updates.txt b/parameter_updates.txt new file mode 100644 index 00000000..5a5027d0 --- /dev/null +++ b/parameter_updates.txt @@ -0,0 +1,20 @@ +The file /docs/md_v2/api/parameters.md should be updated to include the new network and console capturing parameters. + +Here's what needs to be updated: + +1. Change section title from: +``` +### G) **Debug & Logging** +``` +to: +``` +### G) **Debug, Logging & Capturing** +``` + +2. Add new parameters to the table: +``` +| **`capture_network_requests`** | `bool` (False) | Captures all network requests, responses, and failures during the crawl. Available in `result.network_requests`. | +| **`capture_console_messages`** | `bool` (False) | Captures all browser console messages (logs, warnings, errors) during the crawl. Available in `result.console_messages`. | +``` + +These changes demonstrate how to use the new network and console capturing features in the CrawlerRunConfig. \ No newline at end of file diff --git a/prompts/prompt_net_requests.md b/prompts/prompt_net_requests.md new file mode 100644 index 00000000..d033591e --- /dev/null +++ b/prompts/prompt_net_requests.md @@ -0,0 +1,489 @@ +I want to enhance the `AsyncPlaywrightCrawlerStrategy` to optionally capture network requests and console messages during a crawl, storing them in the final `CrawlResult`. + +Here's a breakdown of the proposed changes across the relevant files: + +**1. Configuration (`crawl4ai/async_configs.py`)** + +* **Goal:** Add flags to `CrawlerRunConfig` to enable/disable capturing. +* **Changes:** + * Add two new boolean attributes to `CrawlerRunConfig`: + * `capture_network_requests: bool = False` + * `capture_console_messages: bool = False` + * Update `__init__`, `from_kwargs`, `to_dict`, and implicitly `clone`/`dump`/`load` to include these new attributes. + +```python +# ==== File: crawl4ai/async_configs.py ==== +# ... (imports) ... + +class CrawlerRunConfig(): + # ... (existing attributes) ... + + # NEW: Network and Console Capturing Parameters + capture_network_requests: bool = False + capture_console_messages: bool = False + + # Experimental Parameters + experimental: Dict[str, Any] = None, + + def __init__( + self, + # ... (existing parameters) ... + + # NEW: Network and Console Capturing Parameters + capture_network_requests: bool = False, + capture_console_messages: bool = False, + + # Experimental Parameters + experimental: Dict[str, Any] = None, + ): + # ... (existing assignments) ... + + # NEW: Assign new parameters + self.capture_network_requests = capture_network_requests + self.capture_console_messages = capture_console_messages + + # Experimental Parameters + self.experimental = experimental or {} + + # ... (rest of __init__) ... + + @staticmethod + def from_kwargs(kwargs: dict) -> "CrawlerRunConfig": + return CrawlerRunConfig( + # ... (existing kwargs gets) ... 
+ + # NEW: Get new parameters + capture_network_requests=kwargs.get("capture_network_requests", False), + capture_console_messages=kwargs.get("capture_console_messages", False), + + # Experimental Parameters + experimental=kwargs.get("experimental"), + ) + + def to_dict(self): + return { + # ... (existing dict entries) ... + + # NEW: Add new parameters to dict + "capture_network_requests": self.capture_network_requests, + "capture_console_messages": self.capture_console_messages, + + "experimental": self.experimental, + } + + # clone(), dump(), load() should work automatically if they rely on to_dict() and from_kwargs() + # or the serialization logic correctly handles all attributes. +``` + +**2. Data Models (`crawl4ai/models.py`)** + +* **Goal:** Add fields to store the captured data in the response/result objects. +* **Changes:** + * Add `network_requests: Optional[List[Dict[str, Any]]] = None` and `console_messages: Optional[List[Dict[str, Any]]] = None` to `AsyncCrawlResponse`. + * Add the same fields to `CrawlResult`. + +```python +# ==== File: crawl4ai/models.py ==== +# ... (imports) ... + +# ... (Existing dataclasses/models) ... + +class AsyncCrawlResponse(BaseModel): + html: str + response_headers: Dict[str, str] + js_execution_result: Optional[Dict[str, Any]] = None + status_code: int + screenshot: Optional[str] = None + pdf_data: Optional[bytes] = None + get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None + downloaded_files: Optional[List[str]] = None + ssl_certificate: Optional[SSLCertificate] = None + redirected_url: Optional[str] = None + # NEW: Fields for captured data + network_requests: Optional[List[Dict[str, Any]]] = None + console_messages: Optional[List[Dict[str, Any]]] = None + + class Config: + arbitrary_types_allowed = True + +# ... (Existing models like MediaItem, Link, etc.) ... + +class CrawlResult(BaseModel): + url: str + html: str + success: bool + cleaned_html: Optional[str] = None + media: Dict[str, List[Dict]] = {} + links: Dict[str, List[Dict]] = {} + downloaded_files: Optional[List[str]] = None + js_execution_result: Optional[Dict[str, Any]] = None + screenshot: Optional[str] = None + pdf: Optional[bytes] = None + mhtml: Optional[str] = None # Added mhtml based on the provided models.py + _markdown: Optional[MarkdownGenerationResult] = PrivateAttr(default=None) + extracted_content: Optional[str] = None + metadata: Optional[dict] = None + error_message: Optional[str] = None + session_id: Optional[str] = None + response_headers: Optional[dict] = None + status_code: Optional[int] = None + ssl_certificate: Optional[SSLCertificate] = None + dispatch_result: Optional[DispatchResult] = None + redirected_url: Optional[str] = None + # NEW: Fields for captured data + network_requests: Optional[List[Dict[str, Any]]] = None + console_messages: Optional[List[Dict[str, Any]]] = None + + class Config: + arbitrary_types_allowed = True + + # ... (Existing __init__, properties, model_dump for markdown compatibility) ... + +# ... (Rest of the models) ... +``` + +**3. Crawler Strategy (`crawl4ai/async_crawler_strategy.py`)** + +* **Goal:** Implement the actual capturing logic within `AsyncPlaywrightCrawlerStrategy._crawl_web`. +* **Changes:** + * Inside `_crawl_web`, initialize empty lists `captured_requests = []` and `captured_console = []`. + * Conditionally attach Playwright event listeners (`page.on(...)`) based on the `config.capture_network_requests` and `config.capture_console_messages` flags. 
+ * Define handler functions for these listeners to extract relevant data and append it to the respective lists. Include timestamps. + * Pass the captured lists to the `AsyncCrawlResponse` constructor at the end of the method. + +```python +# ==== File: crawl4ai/async_crawler_strategy.py ==== +# ... (imports) ... +import time # Make sure time is imported + +class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): + # ... (existing methods like __init__, start, close, etc.) ... + + async def _crawl_web( + self, url: str, config: CrawlerRunConfig + ) -> AsyncCrawlResponse: + """ + Internal method to crawl web URLs with the specified configuration. + Includes optional network and console capturing. # MODIFIED DOCSTRING + """ + config.url = url + response_headers = {} + execution_result = None + status_code = None + redirected_url = url + + # Reset downloaded files list for new crawl + self._downloaded_files = [] + + # Initialize capture lists - IMPORTANT: Reset per crawl + captured_requests: List[Dict[str, Any]] = [] + captured_console: List[Dict[str, Any]] = [] + + # Handle user agent ... (existing code) ... + + # Get page for session + page, context = await self.browser_manager.get_page(crawlerRunConfig=config) + + # ... (existing code for cookies, navigator overrides, hooks) ... + + # --- Setup Capturing Listeners --- + # NOTE: These listeners are attached *before* page.goto() + + # Network Request Capturing + if config.capture_network_requests: + async def handle_request_capture(request): + try: + post_data_str = None + try: + # Be cautious with large post data + post_data = request.post_data_buffer + if post_data: + # Attempt to decode, fallback to base64 or size indication + try: + post_data_str = post_data.decode('utf-8', errors='replace') + except UnicodeDecodeError: + post_data_str = f"[Binary data: {len(post_data)} bytes]" + except Exception: + post_data_str = "[Error retrieving post data]" + + captured_requests.append({ + "event_type": "request", + "url": request.url, + "method": request.method, + "headers": dict(request.headers), # Convert Header dict + "post_data": post_data_str, + "resource_type": request.resource_type, + "is_navigation_request": request.is_navigation_request(), + "timestamp": time.time() + }) + except Exception as e: + self.logger.warning(f"Error capturing request details for {request.url}: {e}", tag="CAPTURE") + captured_requests.append({"event_type": "request_capture_error", "url": request.url, "error": str(e), "timestamp": time.time()}) + + async def handle_response_capture(response): + try: + # Avoid capturing full response body by default due to size/security + # security_details = await response.security_details() # Optional: More SSL info + captured_requests.append({ + "event_type": "response", + "url": response.url, + "status": response.status, + "status_text": response.status_text, + "headers": dict(response.headers), # Convert Header dict + "from_service_worker": response.from_service_worker, + # "security_details": security_details, # Uncomment if needed + "request_timing": response.request.timing, # Detailed timing info + "timestamp": time.time() + }) + except Exception as e: + self.logger.warning(f"Error capturing response details for {response.url}: {e}", tag="CAPTURE") + captured_requests.append({"event_type": "response_capture_error", "url": response.url, "error": str(e), "timestamp": time.time()}) + + async def handle_request_failed_capture(request): + try: + captured_requests.append({ + "event_type": "request_failed", + "url": request.url, 
+ "method": request.method, + "resource_type": request.resource_type, + "failure_text": request.failure.error_text if request.failure else "Unknown failure", + "timestamp": time.time() + }) + except Exception as e: + self.logger.warning(f"Error capturing request failed details for {request.url}: {e}", tag="CAPTURE") + captured_requests.append({"event_type": "request_failed_capture_error", "url": request.url, "error": str(e), "timestamp": time.time()}) + + page.on("request", handle_request_capture) + page.on("response", handle_response_capture) + page.on("requestfailed", handle_request_failed_capture) + + # Console Message Capturing + if config.capture_console_messages: + def handle_console_capture(msg): + try: + location = msg.location() + # Attempt to resolve JSHandle args to primitive values + resolved_args = [] + try: + for arg in msg.args: + resolved_args.append(arg.json_value()) # May fail for complex objects + except Exception: + resolved_args.append("[Could not resolve JSHandle args]") + + captured_console.append({ + "type": msg.type(), # e.g., 'log', 'error', 'warning' + "text": msg.text(), + "args": resolved_args, # Captured arguments + "location": f"{location['url']}:{location['lineNumber']}:{location['columnNumber']}" if location else "N/A", + "timestamp": time.time() + }) + except Exception as e: + self.logger.warning(f"Error capturing console message: {e}", tag="CAPTURE") + captured_console.append({"type": "console_capture_error", "error": str(e), "timestamp": time.time()}) + + def handle_pageerror_capture(err): + try: + captured_console.append({ + "type": "error", # Consistent type for page errors + "text": err.message, + "stack": err.stack, + "timestamp": time.time() + }) + except Exception as e: + self.logger.warning(f"Error capturing page error: {e}", tag="CAPTURE") + captured_console.append({"type": "pageerror_capture_error", "error": str(e), "timestamp": time.time()}) + + page.on("console", handle_console_capture) + page.on("pageerror", handle_pageerror_capture) + # --- End Setup Capturing Listeners --- + + + # Set up console logging if requested (Keep original logging logic separate or merge carefully) + if config.log_console: + # ... (original log_console setup using page.on(...) remains here) ... + # This allows logging to screen *and* capturing to the list if both flags are True + def log_consol(msg, console_log_type="debug"): + # ... existing implementation ... + pass # Placeholder for existing code + + page.on("console", lambda msg: log_consol(msg, "debug")) + page.on("pageerror", lambda e: log_consol(e, "error")) + + + try: + # ... (existing code for SSL, downloads, goto, waits, JS execution, etc.) ... + + # Get final HTML content + # ... (existing code for selector logic or page.content()) ... + if config.css_selector: + # ... existing selector logic ... + html = f"
\n" + "\n".join(html_parts) + "\n
" + else: + html = await page.content() + + await self.execute_hook( + "before_return_html", page=page, html=html, context=context, config=config + ) + + # Handle PDF and screenshot generation + # ... (existing code) ... + + # Define delayed content getter + # ... (existing code) ... + + # Return complete response - ADD CAPTURED DATA HERE + return AsyncCrawlResponse( + html=html, + response_headers=response_headers, + js_execution_result=execution_result, + status_code=status_code, + screenshot=screenshot_data, + pdf_data=pdf_data, + get_delayed_content=get_delayed_content, + ssl_certificate=ssl_cert, + downloaded_files=( + self._downloaded_files if self._downloaded_files else None + ), + redirected_url=redirected_url, + # NEW: Pass captured data conditionally + network_requests=captured_requests if config.capture_network_requests else None, + console_messages=captured_console if config.capture_console_messages else None, + ) + + except Exception as e: + raise e # Re-raise the original exception + + finally: + # If no session_id is given we should close the page + if not config.session_id: + # Detach listeners before closing to prevent potential errors during close + if config.capture_network_requests: + page.remove_listener("request", handle_request_capture) + page.remove_listener("response", handle_response_capture) + page.remove_listener("requestfailed", handle_request_failed_capture) + if config.capture_console_messages: + page.remove_listener("console", handle_console_capture) + page.remove_listener("pageerror", handle_pageerror_capture) + # Also remove logging listeners if they were attached + if config.log_console: + # Need to figure out how to remove the lambdas if necessary, + # or ensure they don't cause issues on close. Often, it's fine. + pass + + await page.close() + + # ... (rest of AsyncPlaywrightCrawlerStrategy methods) ... + +``` + +**4. Core Crawler (`crawl4ai/async_webcrawler.py`)** + +* **Goal:** Ensure the captured data from `AsyncCrawlResponse` is transferred to the final `CrawlResult`. +* **Changes:** + * In `arun`, when processing a non-cached result (inside the `if not cached_result or not html:` block), after receiving `async_response` and calling `aprocess_html` to get `crawl_result`, copy the `network_requests` and `console_messages` from `async_response` to `crawl_result`. + +```python +# ==== File: crawl4ai/async_webcrawler.py ==== +# ... (imports) ... + +class AsyncWebCrawler: + # ... (existing methods) ... + + async def arun( + self, + url: str, + config: CrawlerRunConfig = None, + **kwargs, + ) -> RunManyReturn: + # ... (existing setup, cache check) ... + + async with self._lock or self.nullcontext(): + try: + # ... (existing logging, cache context setup) ... + + if cached_result: + # ... (existing cache handling logic) ... + # Note: Captured network/console usually not useful from cache + # Ensure they are None or empty if read from cache, unless stored explicitly + cached_result.network_requests = cached_result.network_requests or None + cached_result.console_messages = cached_result.console_messages or None + # ... (rest of cache logic) ... + + # Fetch fresh content if needed + if not cached_result or not html: + t1 = time.perf_counter() + + # ... (existing user agent update, robots.txt check) ... + + ############################## + # Call CrawlerStrategy.crawl # + ############################## + async_response = await self.crawler_strategy.crawl( + url, + config=config, + ) + + # ... 
(existing assignment of html, screenshot, pdf, js_result from async_response) ... + + t2 = time.perf_counter() + # ... (existing logging) ... + + ############################################################### + # Process the HTML content, Call CrawlerStrategy.process_html # + ############################################################### + crawl_result: CrawlResult = await self.aprocess_html( + # ... (existing args) ... + ) + + # --- Transfer data from AsyncCrawlResponse to CrawlResult --- + crawl_result.status_code = async_response.status_code + crawl_result.redirected_url = async_response.redirected_url or url + crawl_result.response_headers = async_response.response_headers + crawl_result.downloaded_files = async_response.downloaded_files + crawl_result.js_execution_result = js_execution_result + crawl_result.ssl_certificate = async_response.ssl_certificate + # NEW: Copy captured data + crawl_result.network_requests = async_response.network_requests + crawl_result.console_messages = async_response.console_messages + # ------------------------------------------------------------ + + crawl_result.success = bool(html) + crawl_result.session_id = getattr(config, "session_id", None) + + # ... (existing logging) ... + + # Update cache if appropriate + if cache_context.should_write() and not bool(cached_result): + # crawl_result now includes network/console data if captured + await async_db_manager.acache_url(crawl_result) + + return CrawlResultContainer(crawl_result) + + else: # Cached result was used + # ... (existing logging for cache hit) ... + cached_result.success = bool(html) + cached_result.session_id = getattr(config, "session_id", None) + cached_result.redirected_url = cached_result.redirected_url or url + return CrawlResultContainer(cached_result) + + except Exception as e: + # ... (existing error handling) ... + return CrawlResultContainer( + CrawlResult( + url=url, html="", success=False, error_message=error_message + ) + ) + + # ... (aprocess_html remains unchanged regarding capture) ... + + # ... (arun_many remains unchanged regarding capture) ... +``` + +**Summary of Changes:** + +1. **Configuration:** Added `capture_network_requests` and `capture_console_messages` flags to `CrawlerRunConfig`. +2. **Models:** Added corresponding `network_requests` and `console_messages` fields (List of Dicts) to `AsyncCrawlResponse` and `CrawlResult`. +3. **Strategy:** Implemented conditional event listeners in `AsyncPlaywrightCrawlerStrategy._crawl_web` to capture data into lists when flags are true. Populated these fields in the returned `AsyncCrawlResponse`. Added basic error handling within capture handlers. Added timestamps. +4. **Crawler:** Modified `AsyncWebCrawler.arun` to copy the captured data from `AsyncCrawlResponse` into the final `CrawlResult` for non-cached fetches. + +This approach keeps the capturing logic contained within the Playwright strategy, uses clear configuration flags, and integrates the results into the existing data flow. The data format (list of dictionaries) is flexible for storing varied information from requests/responses/console messages. \ No newline at end of file diff --git a/temp.txt b/temp.txt deleted file mode 100644 index a9fd218d..00000000 --- a/temp.txt +++ /dev/null @@ -1,3 +0,0 @@ -7. **`screenshot`**, **`pdf`**, & **`capture_mhtml`**: - - If `True`, captures a screenshot, PDF, or MHTML snapshot after the page is fully loaded. - - The results go to `result.screenshot` (base64), `result.pdf` (bytes), or `result.mhtml` (string). 
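The deleted note above described the existing snapshot flags (`screenshot`, `pdf`, `capture_mhtml`). For orientation only, a minimal sketch (not part of the patch, with a placeholder URL and helper name) showing those flags combined with the new capture options in a single `CrawlerRunConfig`:

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def snapshot_and_capture(url: str):
    # Combine page snapshots with the new network/console capture flags
    config = CrawlerRunConfig(
        screenshot=True,                  # -> result.screenshot (base64 string)
        pdf=True,                         # -> result.pdf (bytes)
        capture_mhtml=True,               # -> result.mhtml (string)
        capture_network_requests=True,    # -> result.network_requests
        capture_console_messages=True,    # -> result.console_messages
    )
    async with AsyncWebCrawler() as crawler:
        return await crawler.arun(url=url, config=config)

if __name__ == "__main__":
    result = asyncio.run(snapshot_and_capture("https://example.com"))
    print(bool(result.screenshot), len(result.network_requests or []))
```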
diff --git a/tests/20241401/test_acyn_crawl_wuth_http_crawler_strategy.py b/tests/general/test_acyn_crawl_wuth_http_crawler_strategy.py similarity index 100% rename from tests/20241401/test_acyn_crawl_wuth_http_crawler_strategy.py rename to tests/general/test_acyn_crawl_wuth_http_crawler_strategy.py diff --git a/tests/20241401/test_advanced_deep_crawl.py b/tests/general/test_advanced_deep_crawl.py similarity index 100% rename from tests/20241401/test_advanced_deep_crawl.py rename to tests/general/test_advanced_deep_crawl.py diff --git a/tests/20241401/test_async_crawler_strategy.py b/tests/general/test_async_crawler_strategy.py similarity index 100% rename from tests/20241401/test_async_crawler_strategy.py rename to tests/general/test_async_crawler_strategy.py diff --git a/tests/20241401/test_async_markdown_generator.py b/tests/general/test_async_markdown_generator.py similarity index 100% rename from tests/20241401/test_async_markdown_generator.py rename to tests/general/test_async_markdown_generator.py diff --git a/tests/20241401/test_async_webcrawler.py b/tests/general/test_async_webcrawler.py similarity index 100% rename from tests/20241401/test_async_webcrawler.py rename to tests/general/test_async_webcrawler.py diff --git a/tests/20241401/test_cache_context.py b/tests/general/test_cache_context.py similarity index 100% rename from tests/20241401/test_cache_context.py rename to tests/general/test_cache_context.py diff --git a/tests/20241401/test_crawlers.py b/tests/general/test_crawlers.py similarity index 100% rename from tests/20241401/test_crawlers.py rename to tests/general/test_crawlers.py diff --git a/tests/20241401/test_deep_crawl.py b/tests/general/test_deep_crawl.py similarity index 100% rename from tests/20241401/test_deep_crawl.py rename to tests/general/test_deep_crawl.py diff --git a/tests/20241401/test_deep_crawl_filters.py b/tests/general/test_deep_crawl_filters.py similarity index 100% rename from tests/20241401/test_deep_crawl_filters.py rename to tests/general/test_deep_crawl_filters.py diff --git a/tests/20241401/test_deep_crawl_scorers.py b/tests/general/test_deep_crawl_scorers.py similarity index 100% rename from tests/20241401/test_deep_crawl_scorers.py rename to tests/general/test_deep_crawl_scorers.py diff --git a/tests/20241401/test_http_crawler_strategy.py b/tests/general/test_http_crawler_strategy.py similarity index 100% rename from tests/20241401/test_http_crawler_strategy.py rename to tests/general/test_http_crawler_strategy.py diff --git a/tests/20241401/test_llm_filter.py b/tests/general/test_llm_filter.py similarity index 100% rename from tests/20241401/test_llm_filter.py rename to tests/general/test_llm_filter.py diff --git a/tests/20241401/test_mhtml.py b/tests/general/test_mhtml.py similarity index 100% rename from tests/20241401/test_mhtml.py rename to tests/general/test_mhtml.py diff --git a/tests/general/test_network_console_capture.py b/tests/general/test_network_console_capture.py new file mode 100644 index 00000000..da41ecec --- /dev/null +++ b/tests/general/test_network_console_capture.py @@ -0,0 +1,185 @@ +from crawl4ai.async_webcrawler import AsyncWebCrawler +from crawl4ai.async_configs import CrawlerRunConfig, BrowserConfig +import asyncio +import aiohttp +from aiohttp import web +import tempfile +import shutil +import os, sys, time, json + + +async def start_test_server(): + app = web.Application() + + async def basic_page(request): + return web.Response(text=""" + + + + Network Request Test + + +

Test Page for Network Capture

+

This page performs network requests and console logging.

+ Test Image + + + + """, content_type="text/html") + + async def image(request): + # Return a small 1x1 transparent PNG + return web.Response(body=bytes.fromhex('89504E470D0A1A0A0000000D49484452000000010000000108060000001F15C4890000000D4944415478DA63FAFFFF3F030079DB00018D959DE70000000049454E44AE426082'), content_type="image/png") + + async def api_data(request): + return web.Response(text="sample data") + + async def api_json(request): + return web.json_response({"status": "success", "message": "JSON data"}) + + # Register routes + app.router.add_get('/', basic_page) + app.router.add_get('/image.png', image) + app.router.add_get('/api/data', api_data) + app.router.add_get('/api/json', api_json) + + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, 'localhost', 8080) + await site.start() + + return runner + + +async def test_network_console_capture(): + print("\n=== Testing Network and Console Capture ===\n") + + # Start test server + runner = await start_test_server() + try: + browser_config = BrowserConfig(headless=True) + + # Test with capture disabled (default) + print("\n1. Testing with capture disabled (default)...") + async with AsyncWebCrawler(config=browser_config) as crawler: + config = CrawlerRunConfig( + wait_until="networkidle", # Wait for network to be idle + ) + result = await crawler.arun(url="http://localhost:8080/", config=config) + + assert result.network_requests is None, "Network requests should be None when capture is disabled" + assert result.console_messages is None, "Console messages should be None when capture is disabled" + print("✓ Default config correctly returns None for network_requests and console_messages") + + # Test with network capture enabled + print("\n2. Testing with network capture enabled...") + async with AsyncWebCrawler(config=browser_config) as crawler: + config = CrawlerRunConfig( + wait_until="networkidle", # Wait for network to be idle + capture_network_requests=True + ) + result = await crawler.arun(url="http://localhost:8080/", config=config) + + assert result.network_requests is not None, "Network requests should be captured" + print(f"✓ Captured {len(result.network_requests)} network requests") + + # Check if we have both requests and responses + request_count = len([r for r in result.network_requests if r.get("event_type") == "request"]) + response_count = len([r for r in result.network_requests if r.get("event_type") == "response"]) + print(f" - {request_count} requests, {response_count} responses") + + # Check if we captured specific resources + urls = [r.get("url") for r in result.network_requests] + has_image = any("/image.png" in url for url in urls) + has_api_data = any("/api/data" in url for url in urls) + has_api_json = any("/api/json" in url for url in urls) + + assert has_image, "Should have captured image request" + assert has_api_data, "Should have captured API data request" + assert has_api_json, "Should have captured API JSON request" + print("✓ Captured expected network requests (image, API endpoints)") + + # Test with console capture enabled + print("\n3. 
Testing with console capture enabled...") + async with AsyncWebCrawler(config=browser_config) as crawler: + config = CrawlerRunConfig( + wait_until="networkidle", # Wait for network to be idle + capture_console_messages=True + ) + result = await crawler.arun(url="http://localhost:8080/", config=config) + + assert result.console_messages is not None, "Console messages should be captured" + print(f"✓ Captured {len(result.console_messages)} console messages") + + # Check if we have different types of console messages + message_types = set(msg.get("type") for msg in result.console_messages if "type" in msg) + print(f" - Message types: {', '.join(message_types)}") + + # Print all captured messages for debugging + print(" - Captured messages:") + for msg in result.console_messages: + print(f" * Type: {msg.get('type', 'N/A')}, Text: {msg.get('text', 'N/A')}") + + # Look for specific messages + messages = [msg.get("text") for msg in result.console_messages if "text" in msg] + has_basic_log = any("Basic console log" in msg for msg in messages) + has_error_msg = any("Error message" in msg for msg in messages) + has_warning_msg = any("Warning message" in msg for msg in messages) + + assert has_basic_log, "Should have captured basic console.log message" + assert has_error_msg, "Should have captured console.error message" + assert has_warning_msg, "Should have captured console.warn message" + print("✓ Captured expected console messages (log, error, warning)") + + # Test with both captures enabled + print("\n4. Testing with both network and console capture enabled...") + async with AsyncWebCrawler(config=browser_config) as crawler: + config = CrawlerRunConfig( + wait_until="networkidle", # Wait for network to be idle + capture_network_requests=True, + capture_console_messages=True + ) + result = await crawler.arun(url="http://localhost:8080/", config=config) + + assert result.network_requests is not None, "Network requests should be captured" + assert result.console_messages is not None, "Console messages should be captured" + print(f"✓ Successfully captured both {len(result.network_requests)} network requests and {len(result.console_messages)} console messages") + + finally: + await runner.cleanup() + print("\nTest server shutdown") + + +async def main(): + try: + await test_network_console_capture() + print("\n✅ All tests passed successfully!") + except Exception as e: + print(f"\n❌ Test failed: {str(e)}") + raise + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/tests/20241401/test_robot_parser.py b/tests/general/test_robot_parser.py similarity index 100% rename from tests/20241401/test_robot_parser.py rename to tests/general/test_robot_parser.py diff --git a/tests/20241401/test_schema_builder.py b/tests/general/test_schema_builder.py similarity index 100% rename from tests/20241401/test_schema_builder.py rename to tests/general/test_schema_builder.py diff --git a/tests/20241401/test_stream.py b/tests/general/test_stream.py similarity index 100% rename from tests/20241401/test_stream.py rename to tests/general/test_stream.py diff --git a/tests/20241401/test_stream_dispatch.py b/tests/general/test_stream_dispatch.py similarity index 100% rename from tests/20241401/test_stream_dispatch.py rename to tests/general/test_stream_dispatch.py diff --git a/tests/20241401/tets_robot.py b/tests/general/tets_robot.py similarity index 100% rename from tests/20241401/tets_robot.py rename to tests/general/tets_robot.py From 108b2a8bfbfdca6b928603596002a91b608af860 Mon Sep 
17 00:00:00 2001 From: UncleCode Date: Thu, 10 Apr 2025 23:22:38 +0800 Subject: [PATCH 44/78] Fixed capturing console messages for case the url is the local file. Update docker configuration (work in progress) --- Dockerfile | 33 +- crawl4ai/async_crawler_strategy.py | 49 +- crawl4ai/browser_manager.py | 2 +- deploy/docker/requirements.txt | 1 - deploy/docker/supervisord.conf | 24 +- docker-compose.yml | 72 +- .../network_console_capture_example.py | 20 +- docs/md_v2/core/docker-deployment.md | 1361 +++++++++-------- docs/tutorials/coming_soon.md | 0 9 files changed, 898 insertions(+), 664 deletions(-) create mode 100644 docs/tutorials/coming_soon.md diff --git a/Dockerfile b/Dockerfile index 9796bcb6..8b84f797 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,7 +24,7 @@ ARG TARGETARCH LABEL maintainer="unclecode" LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper" -LABEL version="1.0" +LABEL version="1.0" RUN apt-get update && apt-get install -y --no-install-recommends \ build-essential \ @@ -38,6 +38,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ libjpeg-dev \ redis-server \ supervisor \ + && apt-get clean \ && rm -rf /var/lib/apt/lists/* RUN apt-get update && apt-get install -y --no-install-recommends \ @@ -62,11 +63,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ libcairo2 \ libasound2 \ libatspi2.0-0 \ + && apt-get clean \ && rm -rf /var/lib/apt/lists/* RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \ apt-get update && apt-get install -y --no-install-recommends \ nvidia-cuda-toolkit \ + && apt-get clean \ && rm -rf /var/lib/apt/lists/* ; \ else \ echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \ @@ -76,16 +79,24 @@ RUN if [ "$TARGETARCH" = "arm64" ]; then \ echo "🦾 Installing ARM-specific optimizations"; \ apt-get update && apt-get install -y --no-install-recommends \ libopenblas-dev \ + && apt-get clean \ && rm -rf /var/lib/apt/lists/*; \ elif [ "$TARGETARCH" = "amd64" ]; then \ echo "🖥️ Installing AMD64-specific optimizations"; \ apt-get update && apt-get install -y --no-install-recommends \ libomp-dev \ + && apt-get clean \ && rm -rf /var/lib/apt/lists/*; \ else \ echo "Skipping platform-specific optimizations (unsupported platform)"; \ fi +# Create a non-root user and group +RUN groupadd -r appuser && useradd --no-log-init -r -g appuser appuser + +# Create and set permissions for appuser home directory +RUN mkdir -p /home/appuser && chown -R appuser:appuser /home/appuser + WORKDIR ${APP_HOME} RUN echo '#!/bin/bash\n\ @@ -103,6 +114,7 @@ fi' > /tmp/install.sh && chmod +x /tmp/install.sh COPY . /tmp/project/ +# Copy supervisor config first (might need root later, but okay for now) COPY deploy/docker/supervisord.conf . COPY deploy/docker/requirements.txt . 
@@ -131,16 +143,23 @@ RUN if [ "$INSTALL_TYPE" = "all" ] ; then \ else \ pip install "/tmp/project" ; \ fi - + RUN pip install --no-cache-dir --upgrade pip && \ /tmp/install.sh && \ python -c "import crawl4ai; print('✅ crawl4ai is ready to rock!')" && \ python -c "from playwright.sync_api import sync_playwright; print('✅ Playwright is feeling dramatic!')" - + RUN playwright install --with-deps chromium +# Copy application code COPY deploy/docker/* ${APP_HOME}/ +# Change ownership of the application directory to the non-root user +RUN chown -R appuser:appuser ${APP_HOME} + +# give permissions to redis persistence dirs if used +RUN mkdir -p /var/lib/redis /var/log/redis && chown -R appuser:appuser /var/lib/redis /var/log/redis + HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ CMD bash -c '\ MEM=$(free -m | awk "/^Mem:/{print \$2}"); \ @@ -149,8 +168,10 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ exit 1; \ fi && \ redis-cli ping > /dev/null && \ - curl -f http://localhost:8000/health || exit 1' + curl -f http://localhost:11235/health || exit 1' EXPOSE 6379 -CMD ["supervisord", "-c", "supervisord.conf"] - +# Switch to the non-root user before starting the application +USER appuser + +CMD ["supervisord", "-c", "supervisord.conf"] \ No newline at end of file diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index f99d1cb9..3278c731 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -409,7 +409,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): user_agent = kwargs.get("user_agent", self.user_agent) # Use browser_manager to get a fresh page & context assigned to this session_id - page, context = await self.browser_manager.get_page(session_id, user_agent) + page, context = await self.browser_manager.get_page(CrawlerRunConfig( + session_id=session_id, + user_agent=user_agent, + **kwargs, + )) return session_id async def crawl( @@ -447,12 +451,17 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): html = f.read() if config.screenshot: screenshot_data = await self._generate_screenshot_from_html(html) + if config.capture_console_messages: + page, context = await self.browser_manager.get_page(crawlerRunConfig=config) + captured_console = await self._capture_console_messages(page, url) + return AsyncCrawlResponse( html=html, response_headers=response_headers, status_code=status_code, screenshot=screenshot_data, get_delayed_content=None, + console_messages=captured_console, ) elif url.startswith("raw:") or url.startswith("raw://"): @@ -582,7 +591,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): "url": request.url, "method": request.method, "resource_type": request.resource_type, - "failure_text": request.failure.error_text if request.failure else "Unknown failure", + "failure_text": str(request.failure) if request.failure else "Unknown failure", "timestamp": time.time() }) except Exception as e: @@ -1274,6 +1283,42 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): ) return None + async def _capture_console_messages( + self, page: Page, file_path: str + ) -> List[Dict[str, Union[str, float]]]: + """ + Captures console messages from the page. 
+ Args: + + page (Page): The Playwright page object + Returns: + List[Dict[str, Union[str, float]]]: A list of captured console messages + """ + captured_console = [] + + def handle_console_message(msg): + try: + message_type = msg.type + message_text = msg.text + + entry = { + "type": message_type, + "text": message_text, + "timestamp": time.time(), + } + captured_console.append(entry) + except Exception as e: + if self.logger: + self.logger.warning( + f"Error capturing console message: {e}", tag="CAPTURE" + ) + + page.on("console", handle_console_message) + + await page.goto(file_path) + + return captured_console + async def take_screenshot(self, page, **kwargs) -> str: """ Take a screenshot of the current page. diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index 7fc819e0..f3c7d861 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -658,7 +658,7 @@ class BrowserManager: "name": "cookiesEnabled", "value": "true", "url": crawlerRunConfig.url - if crawlerRunConfig + if crawlerRunConfig and crawlerRunConfig.url else "https://crawl4ai.com/", } ] diff --git a/deploy/docker/requirements.txt b/deploy/docker/requirements.txt index b7e6d8ad..40a33a79 100644 --- a/deploy/docker/requirements.txt +++ b/deploy/docker/requirements.txt @@ -1,4 +1,3 @@ -crawl4ai fastapi uvicorn gunicorn>=23.0.0 diff --git a/deploy/docker/supervisord.conf b/deploy/docker/supervisord.conf index 1274f2c3..d51cc953 100644 --- a/deploy/docker/supervisord.conf +++ b/deploy/docker/supervisord.conf @@ -1,12 +1,28 @@ [supervisord] -nodaemon=true +nodaemon=true ; Run supervisord in the foreground +logfile=/dev/null ; Log supervisord output to stdout/stderr +logfile_maxbytes=0 [program:redis] -command=redis-server +command=/usr/bin/redis-server --loglevel notice ; Path to redis-server on Alpine +user=appuser ; Run redis as our non-root user autorestart=true priority=10 +stdout_logfile=/dev/stdout ; Redirect redis stdout to container stdout +stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr ; Redirect redis stderr to container stderr +stderr_logfile_maxbytes=0 [program:gunicorn] -command=gunicorn --bind 0.0.0.0:8000 --workers 4 --threads 2 --timeout 300 --graceful-timeout 60 --keep-alive 65 --log-level debug --worker-class uvicorn.workers.UvicornWorker --max-requests 1000 --max-requests-jitter 50 server:app +command=/usr/local/bin/gunicorn --bind 0.0.0.0:11235 --workers 2 --threads 2 --timeout 120 --graceful-timeout 30 --keep-alive 60 --log-level info --worker-class uvicorn.workers.UvicornWorker server:app +directory=/app ; Working directory for the app +user=appuser ; Run gunicorn as our non-root user autorestart=true -priority=20 \ No newline at end of file +priority=20 +environment=PYTHONUNBUFFERED=1 ; Ensure Python output is sent straight to logs +stdout_logfile=/dev/stdout ; Redirect gunicorn stdout to container stdout +stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr ; Redirect gunicorn stderr to container stderr +stderr_logfile_maxbytes=0 + +# Optional: Add filebeat or other logging agents here if needed \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 6a7bf7cb..f112f9fd 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,15 +1,31 @@ -# Base configuration (not a service, just a reusable config block) +# docker-compose.yml +# This file is in the root directory alongside Dockerfile + +# Base configuration anchor for reusability x-base-config: &base-config ports: + # Map host port 11235 to container port 11235 (where 
Gunicorn will listen) - "11235:11235" - - "8000:8000" - - "9222:9222" - - "8080:8080" + # - "8080:8080" # Uncomment if needed + + # Load API keys primarily from .llm.env file + # Create .llm.env in the root directory from deploy/docker/.llm.env.example + env_file: + - .llm.env + + # Define environment variables, allowing overrides from host environment + # Syntax ${VAR:-} uses host env var 'VAR' if set, otherwise uses value from .llm.env environment: - - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} - OPENAI_API_KEY=${OPENAI_API_KEY:-} - - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} + - DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-} + - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-} + - GROQ_API_KEY=${GROQ_API_KEY:-} + - TOGETHER_API_KEY=${TOGETHER_API_KEY:-} + - MISTRAL_API_KEY=${MISTRAL_API_KEY:-} + - GEMINI_API_TOKEN=${GEMINI_API_TOKEN:-} + volumes: + # Mount /dev/shm for Chromium/Playwright performance - /dev/shm:/dev/shm deploy: resources: @@ -19,47 +35,47 @@ x-base-config: &base-config memory: 1G restart: unless-stopped healthcheck: + # IMPORTANT: Ensure Gunicorn binds to 11235 in supervisord.conf test: ["CMD", "curl", "-f", "http://localhost:11235/health"] interval: 30s timeout: 10s retries: 3 - start_period: 40s + start_period: 40s # Give the server time to start + # Run the container as the non-root user defined in the Dockerfile + user: "appuser" services: - # Local build services for different platforms - crawl4ai-amd64: + # --- Local Build Services --- + crawl4ai-local-amd64: build: - context: . - dockerfile: Dockerfile + context: . # Build context is the root directory + dockerfile: Dockerfile # Dockerfile is in the root directory args: - PYTHON_VERSION: "3.10" - INSTALL_TYPE: ${INSTALL_TYPE:-basic} - ENABLE_GPU: false - platforms: - - linux/amd64 + INSTALL_TYPE: ${INSTALL_TYPE:-default} + ENABLE_GPU: ${ENABLE_GPU:-false} + # PYTHON_VERSION arg is omitted as it's fixed by 'FROM python:3.10-slim' in Dockerfile + platform: linux/amd64 profiles: ["local-amd64"] - <<: *base-config # extends yerine doğrudan yapılandırmayı dahil ettik + <<: *base-config # Inherit base configuration - crawl4ai-arm64: + crawl4ai-local-arm64: build: - context: . - dockerfile: Dockerfile + context: . 
# Build context is the root directory + dockerfile: Dockerfile # Dockerfile is in the root directory args: - PYTHON_VERSION: "3.10" - INSTALL_TYPE: ${INSTALL_TYPE:-basic} - ENABLE_GPU: false - platforms: - - linux/arm64 + INSTALL_TYPE: ${INSTALL_TYPE:-default} + ENABLE_GPU: ${ENABLE_GPU:-false} + platform: linux/arm64 profiles: ["local-arm64"] <<: *base-config - # Hub services for different platforms and versions + # --- Docker Hub Image Services --- crawl4ai-hub-amd64: - image: unclecode/crawl4ai:${VERSION:-basic}-amd64 + image: unclecode/crawl4ai:${VERSION:-latest}-amd64 profiles: ["hub-amd64"] <<: *base-config crawl4ai-hub-arm64: - image: unclecode/crawl4ai:${VERSION:-basic}-arm64 + image: unclecode/crawl4ai:${VERSION:-latest}-arm64 profiles: ["hub-arm64"] <<: *base-config \ No newline at end of file diff --git a/docs/examples/network_console_capture_example.py b/docs/examples/network_console_capture_example.py index 5305ddc3..0208bdce 100644 --- a/docs/examples/network_console_capture_example.py +++ b/docs/examples/network_console_capture_example.py @@ -357,8 +357,7 @@ async def demo_performance_analysis(): async with AsyncWebCrawler() as crawler: config = CrawlerRunConfig( capture_network_requests=True, - wait_until="networkidle", - page_timeout=60000 # 60 seconds + page_timeout=60 * 2 * 1000 # 120 seconds ) result = await crawler.arun( @@ -406,6 +405,13 @@ async def demo_performance_analysis(): "url": url, "duration_ms": duration }) + if isinstance(timing, dict) and "requestStart" in timing and "responseStart" in timing and "startTime" in timing: + # Convert to milliseconds + duration = (timing["responseStart"] - timing["requestStart"]) * 1000 + resource_timings[resource_type].append({ + "url": url, + "duration_ms": duration + }) # Calculate statistics for each resource type print("\nPerformance by resource type:") @@ -455,14 +461,14 @@ async def main(): os.makedirs(os.path.join(__cur_dir__, "tmp"), exist_ok=True) # Run basic examples - await demo_basic_network_capture() + # await demo_basic_network_capture() await demo_basic_console_capture() - await demo_combined_capture() + # await demo_combined_capture() # Run advanced examples - await analyze_spa_network_traffic() - await demo_security_analysis() - await demo_performance_analysis() + # await analyze_spa_network_traffic() + # await demo_security_analysis() + # await demo_performance_analysis() print("\n=== Examples Complete ===") print(f"Check the tmp directory for output files: {os.path.join(__cur_dir__, 'tmp')}") diff --git a/docs/md_v2/core/docker-deployment.md b/docs/md_v2/core/docker-deployment.md index a3d0def1..b4b6e414 100644 --- a/docs/md_v2/core/docker-deployment.md +++ b/docs/md_v2/core/docker-deployment.md @@ -1,702 +1,833 @@ -# Docker Deployment +# Crawl4AI Docker Guide 🐳 -Crawl4AI provides official Docker images for easy deployment and scalability. This guide covers installation, configuration, and usage of Crawl4AI in Docker environments. 
+## Table of Contents +- [Prerequisites](#prerequisites) +- [Installation](#installation) + - [Local Build](#local-build) + - [Docker Hub](#docker-hub) +- [Dockerfile Parameters](#dockerfile-parameters) +- [Using the API](#using-the-api) + - [Understanding Request Schema](#understanding-request-schema) + - [REST API Examples](#rest-api-examples) + - [Python SDK](#python-sdk) +- [Metrics & Monitoring](#metrics--monitoring) +- [Deployment Scenarios](#deployment-scenarios) +- [Complete Examples](#complete-examples) +- [Getting Help](#getting-help) -## Quick Start 🚀 +## Prerequisites -Pull and run the basic version: +Before we dive in, make sure you have: +- Docker installed and running (version 20.10.0 or higher) +- At least 4GB of RAM available for the container +- Python 3.10+ (if using the Python SDK) +- Node.js 16+ (if using the Node.js examples) + +> 💡 **Pro tip**: Run `docker info` to check your Docker installation and available resources. + +## Installation + +### Local Build + +Let's get your local environment set up step by step! + +#### 1. Building the Image + +First, clone the repository and build the Docker image: ```bash -# Basic run without security -docker pull unclecode/crawl4ai:basic -docker run -p 11235:11235 unclecode/crawl4ai:basic +# Clone the repository +git clone https://github.com/unclecode/crawl4ai.git +cd crawl4ai/deploy -# Run with API security enabled -docker run -p 11235:11235 -e CRAWL4AI_API_TOKEN=your_secret_token unclecode/crawl4ai:basic +# Build the Docker image +docker build --platform=linux/amd64 --no-cache -t crawl4ai . + +# Or build for arm64 +docker build --platform=linux/arm64 --no-cache -t crawl4ai . ``` -## Running with Docker Compose 🐳 +#### 2. Environment Setup -### Use Docker Compose (From Local Dockerfile or Docker Hub) +If you plan to use LLMs (Language Models), you'll need to set up your API keys. Create a `.llm.env` file: -Crawl4AI provides flexibility to use Docker Compose for managing your containerized services. You can either build the image locally from the provided `Dockerfile` or use the pre-built image from Docker Hub. - -### **Option 1: Using Docker Compose to Build Locally** -If you want to build the image locally, use the provided `docker-compose.local.yml` file. - -```bash -docker-compose -f docker-compose.local.yml up -d -``` - -This will: -1. Build the Docker image from the provided `Dockerfile`. -2. Start the container and expose it on `http://localhost:11235`. - ---- - -### **Option 2: Using Docker Compose with Pre-Built Image from Hub** -If you prefer using the pre-built image on Docker Hub, use the `docker-compose.hub.yml` file. - -```bash -docker-compose -f docker-compose.hub.yml up -d -``` - -This will: -1. Pull the pre-built image `unclecode/crawl4ai:basic` (or `all`, depending on your configuration). -2. Start the container and expose it on `http://localhost:11235`. 
- ---- - -### **Stopping the Running Services** - -To stop the services started via Docker Compose, you can use: - -```bash -docker-compose -f docker-compose.local.yml down -# OR -docker-compose -f docker-compose.hub.yml down -``` - -If the containers don’t stop and the application is still running, check the running containers: - -```bash -docker ps -``` - -Find the `CONTAINER ID` of the running service and stop it forcefully: - -```bash -docker stop -``` - ---- - -### **Debugging with Docker Compose** - -- **Check Logs**: To view the container logs: - ```bash - docker-compose -f docker-compose.local.yml logs -f - ``` - -- **Remove Orphaned Containers**: If the service is still running unexpectedly: - ```bash - docker-compose -f docker-compose.local.yml down --remove-orphans - ``` - -- **Manually Remove Network**: If the network is still in use: - ```bash - docker network ls - docker network rm crawl4ai_default - ``` - ---- - -### Why Use Docker Compose? - -Docker Compose is the recommended way to deploy Crawl4AI because: -1. It simplifies multi-container setups. -2. Allows you to define environment variables, resources, and ports in a single file. -3. Makes it easier to switch between local development and production-ready images. - -For example, your `docker-compose.yml` could include API keys, token settings, and memory limits, making deployment quick and consistent. - - - - -## API Security 🔒 - -### Understanding CRAWL4AI_API_TOKEN - -The `CRAWL4AI_API_TOKEN` provides optional security for your Crawl4AI instance: - -- If `CRAWL4AI_API_TOKEN` is set: All API endpoints (except `/health`) require authentication -- If `CRAWL4AI_API_TOKEN` is not set: The API is publicly accessible - -```bash -# Secured Instance -docker run -p 11235:11235 -e CRAWL4AI_API_TOKEN=your_secret_token unclecode/crawl4ai:all - -# Unsecured Instance -docker run -p 11235:11235 unclecode/crawl4ai:all -``` - -### Making API Calls - -For secured instances, include the token in all requests: - -```python -import requests - -# Setup headers if token is being used -api_token = "your_secret_token" # Same token set in CRAWL4AI_API_TOKEN -headers = {"Authorization": f"Bearer {api_token}"} if api_token else {} - -# Making authenticated requests -response = requests.post( - "http://localhost:11235/crawl", - headers=headers, - json={ - "urls": "https://example.com", - "priority": 10 - } -) - -# Checking task status -task_id = response.json()["task_id"] -status = requests.get( - f"http://localhost:11235/task/{task_id}", - headers=headers -) -``` - -### Using with Docker Compose - -In your `docker-compose.yml`: -```yaml -services: - crawl4ai: - image: unclecode/crawl4ai:all - environment: - - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional - # ... other configuration -``` - -Then either: -1. Set in `.env` file: ```env -CRAWL4AI_API_TOKEN=your_secret_token +# OpenAI +OPENAI_API_KEY=sk-your-key + +# Anthropic +ANTHROPIC_API_KEY=your-anthropic-key + +# DeepSeek +DEEPSEEK_API_KEY=your-deepseek-key + +# Check out https://docs.litellm.ai/docs/providers for more providers! ``` -2. Or set via command line: +> 🔑 **Note**: Keep your API keys secure! Never commit them to version control. + +#### 3. 
Running the Container + +You have several options for running the container: + +Basic run (no LLM support): ```bash -CRAWL4AI_API_TOKEN=your_secret_token docker-compose up +docker run -d -p 8000:8000 --name crawl4ai crawl4ai ``` -> **Security Note**: If you enable the API token, make sure to keep it secure and never commit it to version control. The token will be required for all API endpoints except the health check endpoint (`/health`). +With LLM support: +```bash +docker run -d -p 8000:8000 \ + --env-file .llm.env \ + --name crawl4ai \ + crawl4ai +``` -## Configuration Options 🔧 +Using host environment variables (Not a good practice, but works for local testing): +```bash +docker run -d -p 8000:8000 \ + --env-file .llm.env \ + --env "$(env)" \ + --name crawl4ai \ + crawl4ai +``` -### Environment Variables - -You can configure the service using environment variables: +#### Multi-Platform Build +For distributing your image across different architectures, use `buildx`: ```bash -# Basic configuration -docker run -p 11235:11235 \ - -e MAX_CONCURRENT_TASKS=5 \ - unclecode/crawl4ai:all +# Set up buildx builder +docker buildx create --use -# With security and LLM support -docker run -p 11235:11235 \ - -e CRAWL4AI_API_TOKEN=your_secret_token \ - -e OPENAI_API_KEY=sk-... \ - -e ANTHROPIC_API_KEY=sk-ant-... \ - unclecode/crawl4ai:all +# Build for multiple platforms +docker buildx build \ + --platform linux/amd64,linux/arm64 \ + -t crawl4ai \ + --push \ + . ``` -### Using Docker Compose (Recommended) 🐳 +> 💡 **Note**: Multi-platform builds require Docker Buildx and need to be pushed to a registry. -Create a `docker-compose.yml`: +#### Development Build +For development, you might want to enable all features: -```yaml -version: '3.8' - -services: - crawl4ai: - image: unclecode/crawl4ai:all - ports: - - "11235:11235" - environment: - - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API security - - MAX_CONCURRENT_TASKS=5 - # LLM Provider Keys - - OPENAI_API_KEY=${OPENAI_API_KEY:-} - - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-} - volumes: - - /dev/shm:/dev/shm - deploy: - resources: - limits: - memory: 4G - reservations: - memory: 1G -``` - -You can run it in two ways: - -1. Using environment variables directly: ```bash -CRAWL4AI_API_TOKEN=secret123 OPENAI_API_KEY=sk-... docker-compose up +docker build -t crawl4ai + --build-arg INSTALL_TYPE=all \ + --build-arg PYTHON_VERSION=3.10 \ + --build-arg ENABLE_GPU=true \ + . ``` -2. Using a `.env` file (recommended): -Create a `.env` file in the same directory: -```env -# API Security (optional) -CRAWL4AI_API_TOKEN=your_secret_token +#### GPU-Enabled Build +If you plan to use GPU acceleration: -# LLM Provider Keys -OPENAI_API_KEY=sk-... -ANTHROPIC_API_KEY=sk-ant-... - -# Other Configuration -MAX_CONCURRENT_TASKS=5 -``` - -Then simply run: ```bash -docker-compose up +docker build -t crawl4ai + --build-arg ENABLE_GPU=true \ + deploy/docker/ ``` -### Testing the Deployment 🧪 +### Build Arguments Explained + +| Argument | Description | Default | Options | +|----------|-------------|---------|----------| +| PYTHON_VERSION | Python version | 3.10 | 3.8, 3.9, 3.10 | +| INSTALL_TYPE | Feature set | default | default, all, torch, transformer | +| ENABLE_GPU | GPU support | false | true, false | +| APP_HOME | Install path | /app | any valid path | + +### Build Best Practices + +1. **Choose the Right Install Type** + - `default`: Basic installation, smallest image, to be honest, I use this most of the time. 
+ - `all`: Full features, larger image (include transformer, and nltk, make sure you really need them) + +2. **Platform Considerations** + - Let Docker auto-detect platform unless you need cross-compilation + - Use --platform for specific architecture requirements + - Consider buildx for multi-architecture distribution + +3. **Performance Optimization** + - The image automatically includes platform-specific optimizations + - AMD64 gets OpenMP optimizations + - ARM64 gets OpenBLAS optimizations + +### Docker Hub + +> 🚧 Coming soon! The image will be available at `crawl4ai`. Stay tuned! + +## Using the API + +In the following sections, we discuss two ways to communicate with the Docker server. One option is to use the client SDK that I developed for Python, and I will soon develop one for Node.js. I highly recommend this approach to avoid mistakes. Alternatively, you can take a more technical route by using the JSON structure and passing it to all the URLs, which I will explain in detail. + +### Python SDK + +The SDK makes things easier! Here's how to use it: ```python -import requests +from crawl4ai.docker_client import Crawl4aiDockerClient +from crawl4ai import BrowserConfig, CrawlerRunConfig -# For unsecured instances -def test_unsecured(): - # Health check - health = requests.get("http://localhost:11235/health") - print("Health check:", health.json()) - - # Basic crawl - response = requests.post( - "http://localhost:11235/crawl", - json={ - "urls": "https://www.nbcnews.com/business", - "priority": 10 - } - ) - task_id = response.json()["task_id"] - print("Task ID:", task_id) - -# For secured instances -def test_secured(api_token): - headers = {"Authorization": f"Bearer {api_token}"} - - # Basic crawl with authentication - response = requests.post( - "http://localhost:11235/crawl", - headers=headers, - json={ - "urls": "https://www.nbcnews.com/business", - "priority": 10 - } - ) - task_id = response.json()["task_id"] - print("Task ID:", task_id) -``` - -### LLM Extraction Example 🤖 - -When you've configured your LLM provider keys (via environment variables or `.env`), you can use LLM extraction: - -```python -request = { - "urls": "https://example.com", - "extraction_config": { - "type": "llm", - "params": { - "provider": "openai/gpt-4", - "instruction": "Extract main topics from the page" - } - } -} - -# Make the request (add headers if using API security) -response = requests.post("http://localhost:11235/crawl", json=request) -``` - -> **Note**: Remember to add `.env` to your `.gitignore` to keep your API keys secure! 
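To round out the example above, here is a short sketch of retrieving the finished extraction via the task endpoint shown earlier (hedged: the `extracted_content` key is an assumption about where the LLM output lands in the task result):

```python
import time
import requests

# Continues from the `response` returned by the POST /crawl request above
task_id = response.json()["task_id"]

# Poll the task endpoint until the crawl and extraction complete
while True:
    status = requests.get(f"http://localhost:11235/task/{task_id}").json()
    if status["status"] == "completed":
        break
    time.sleep(2)

# Assumed key: the LLM output is expected under "extracted_content" in the result
print(status["result"].get("extracted_content"))
```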
- - -## Usage Examples 📝 - -### Basic Crawling - -```python -request = { - "urls": "https://www.nbcnews.com/business", - "priority": 10 -} - -response = requests.post("http://localhost:11235/crawl", json=request) -task_id = response.json()["task_id"] - -# Get results -result = requests.get(f"http://localhost:11235/task/{task_id}") -``` - -### Structured Data Extraction - -```python -schema = { - "name": "Crypto Prices", - "baseSelector": ".cds-tableRow-t45thuk", - "fields": [ - { - "name": "crypto", - "selector": "td:nth-child(1) h2", - "type": "text", - }, - { - "name": "price", - "selector": "td:nth-child(2)", - "type": "text", - } - ], -} - -request = { - "urls": "https://www.coinbase.com/explore", - "extraction_config": { - "type": "json_css", - "params": {"schema": schema} - } -} -``` - -### Dynamic Content Handling - -```python -request = { - "urls": "https://www.nbcnews.com/business", - "js_code": [ - "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();" - ], - "wait_for": "article.tease-card:nth-child(10)" -} -``` - -### AI-Powered Extraction (Full Version) - -```python -request = { - "urls": "https://www.nbcnews.com/business", - "extraction_config": { - "type": "cosine", - "params": { - "semantic_filter": "business finance economy", - "word_count_threshold": 10, - "max_dist": 0.2, - "top_k": 3 - } - } -} -``` - -## Platform-Specific Instructions 💻 - -### macOS -```bash -docker pull unclecode/crawl4ai:basic -docker run -p 11235:11235 unclecode/crawl4ai:basic -``` - -### Ubuntu -```bash -# Basic version -docker pull unclecode/crawl4ai:basic -docker run -p 11235:11235 unclecode/crawl4ai:basic - -# With GPU support -docker pull unclecode/crawl4ai:gpu -docker run --gpus all -p 11235:11235 unclecode/crawl4ai:gpu -``` - -### Windows (PowerShell) -```powershell -docker pull unclecode/crawl4ai:basic -docker run -p 11235:11235 unclecode/crawl4ai:basic -``` - -## Testing 🧪 - -Save this as `test_docker.py`: - -```python -import requests -import json -import time -import sys - -class Crawl4AiTester: - def __init__(self, base_url: str = "http://localhost:11235"): - self.base_url = base_url +async def main(): + async with Crawl4aiDockerClient(base_url="http://localhost:8000", verbose=True) as client: + # If JWT is enabled, you can authenticate like this: (more on this later) + # await client.authenticate("test@example.com") - def submit_and_wait(self, request_data: dict, timeout: int = 300) -> dict: - # Submit crawl job - response = requests.post(f"{self.base_url}/crawl", json=request_data) - task_id = response.json()["task_id"] - print(f"Task ID: {task_id}") + # Non-streaming crawl + results = await client.crawl( + ["https://example.com", "https://python.org"], + browser_config=BrowserConfig(headless=True), + crawler_config=CrawlerRunConfig() + ) + print(f"Non-streaming results: {results}") - # Poll for result - start_time = time.time() - while True: - if time.time() - start_time > timeout: - raise TimeoutError(f"Task {task_id} timeout") - - result = requests.get(f"{self.base_url}/task/{task_id}") - status = result.json() - - if status["status"] == "completed": - return status - - time.sleep(2) - -def test_deployment(): - tester = Crawl4AiTester() - - # Test basic crawl - request = { - "urls": "https://www.nbcnews.com/business", - "priority": 10 - } - - result = tester.submit_and_wait(request) - print("Basic crawl successful!") - print(f"Content length: 
{len(result['result']['markdown'])}") + # Streaming crawl + crawler_config = CrawlerRunConfig(stream=True) + async for result in await client.crawl( + ["https://example.com", "https://python.org"], + browser_config=BrowserConfig(headless=True), + crawler_config=crawler_config + ): + print(f"Streamed result: {result}") + + # Get schema + schema = await client.get_schema() + print(f"Schema: {schema}") if __name__ == "__main__": - test_deployment() + asyncio.run(main()) ``` -## Advanced Configuration ⚙️ +`Crawl4aiDockerClient` is an async context manager that handles the connection for you. You can pass in optional parameters for more control: -### Crawler Parameters +- `base_url` (str): Base URL of the Crawl4AI Docker server +- `timeout` (float): Default timeout for requests in seconds +- `verify_ssl` (bool): Whether to verify SSL certificates +- `verbose` (bool): Whether to show logging output +- `log_file` (str, optional): Path to log file if file logging is desired -The `crawler_params` field allows you to configure the browser instance and crawling behavior. Here are key parameters you can use: +This client SDK generates a properly structured JSON request for the server's HTTP API. +## Second Approach: Direct API Calls + +This is super important! The API expects a specific structure that matches our Python classes. Let me show you how it works. + +### Understanding Configuration Structure + +Let's dive deep into how configurations work in Crawl4AI. Every configuration object follows a consistent pattern of `type` and `params`. This structure enables complex, nested configurations while maintaining clarity. + +#### The Basic Pattern + +Try this in Python to understand the structure: ```python -request = { - "urls": "https://example.com", - "crawler_params": { - # Browser Configuration - "headless": True, # Run in headless mode - "browser_type": "chromium", # chromium/firefox/webkit - "user_agent": "custom-agent", # Custom user agent - "proxy": "http://proxy:8080", # Proxy configuration - - # Performance & Behavior - "page_timeout": 30000, # Page load timeout (ms) - "verbose": True, # Enable detailed logging - "semaphore_count": 5, # Concurrent request limit - - # Anti-Detection Features - "simulate_user": True, # Simulate human behavior - "magic": True, # Advanced anti-detection - "override_navigator": True, # Override navigator properties - - # Session Management - "user_data_dir": "./browser-data", # Browser profile location - "use_managed_browser": True, # Use persistent browser +from crawl4ai import BrowserConfig + +# Create a config and see its structure +config = BrowserConfig(headless=True) +print(config.dump()) +``` + +This outputs: +```json +{ + "type": "BrowserConfig", + "params": { + "headless": true } } ``` -### Extra Parameters +#### Simple vs Complex Values -The `extra` field allows passing additional parameters directly to the crawler's `arun` function: +The structure follows these rules: +- Simple values (strings, numbers, booleans, lists) are passed directly +- Complex values (classes, dictionaries) use the type-params pattern -```python -request = { - "urls": "https://example.com", - "extra": { - "word_count_threshold": 10, # Min words per block - "only_text": True, # Extract only text - "bypass_cache": True, # Force fresh crawl - "process_iframes": True, # Include iframe content - } -} -``` - -### Complete Examples - -1. 
**Advanced News Crawling** -```python -request = { - "urls": "https://www.nbcnews.com/business", - "crawler_params": { - "headless": True, - "page_timeout": 30000, - "remove_overlay_elements": True # Remove popups - }, - "extra": { - "word_count_threshold": 50, # Longer content blocks - "bypass_cache": True # Fresh content - }, - "css_selector": ".article-body" -} -``` - -2. **Anti-Detection Configuration** -```python -request = { - "urls": "https://example.com", - "crawler_params": { - "simulate_user": True, - "magic": True, - "override_navigator": True, - "user_agent": "Mozilla/5.0 ...", - "headers": { - "Accept-Language": "en-US,en;q=0.9" - } - } -} -``` - -3. **LLM Extraction with Custom Parameters** -```python -request = { - "urls": "https://openai.com/pricing", - "extraction_config": { - "type": "llm", +For example, with dictionaries: +```json +{ + "browser_config": { + "type": "BrowserConfig", "params": { - "provider": "openai/gpt-4", - "schema": pricing_schema + "headless": true, // Simple boolean - direct value + "viewport": { // Complex dictionary - needs type-params + "type": "dict", + "value": { + "width": 1200, + "height": 800 + } + } } - }, - "crawler_params": { - "verbose": True, - "page_timeout": 60000 - }, - "extra": { - "word_count_threshold": 1, - "only_text": True } } ``` -4. **Session-Based Dynamic Content** +#### Strategy Pattern and Nesting + +Strategies (like chunking or content filtering) demonstrate why we need this structure. Consider this chunking configuration: + +```json +{ + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "chunking_strategy": { + "type": "RegexChunking", // Strategy implementation + "params": { + "patterns": ["\n\n", "\\.\\s+"] + } + } + } + } +} +``` + +Here, `chunking_strategy` accepts any chunking implementation. The `type` field tells the system which strategy to use, and `params` configures that specific strategy. + +#### Complex Nested Example + +Let's look at a more complex example with content filtering: + +```json +{ + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "markdown_generator": { + "type": "DefaultMarkdownGenerator", + "params": { + "content_filter": { + "type": "PruningContentFilter", + "params": { + "threshold": 0.48, + "threshold_type": "fixed" + } + } + } + } + } + } +} +``` + +This shows how deeply configurations can nest while maintaining a consistent structure. + +#### Quick Grammar Overview +``` +config := { + "type": string, + "params": { + key: simple_value | complex_value + } +} + +simple_value := string | number | boolean | [simple_value] +complex_value := config | dict_value + +dict_value := { + "type": "dict", + "value": object +} +``` + +#### Important Rules 🚨 + +- Always use the type-params pattern for class instances +- Use direct values for primitives (numbers, strings, booleans) +- Wrap dictionaries with {"type": "dict", "value": {...}} +- Arrays/lists are passed directly without type-params +- All parameters are optional unless specifically required + +#### Pro Tip 💡 + +The easiest way to get the correct structure is to: +1. Create configuration objects in Python +2. Use the `dump()` method to see their JSON representation +3. 
Use that JSON in your API calls + +Example: ```python -request = { - "urls": "https://example.com", - "crawler_params": { - "session_id": "dynamic_session", - "headless": False, - "page_timeout": 60000 - }, - "js_code": ["window.scrollTo(0, document.body.scrollHeight);"], - "wait_for": "js:() => document.querySelectorAll('.item').length > 10", - "extra": { - "delay_before_return_html": 2.0 +from crawl4ai import CrawlerRunConfig, PruningContentFilter + +config = CrawlerRunConfig( + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed") + ), + cache_mode= CacheMode.BYPASS +) +print(config.dump()) # Use this JSON in your API calls +``` + + +#### More Examples + +**Advanced Crawler Configuration** + +```json +{ + "urls": ["https://example.com"], + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "cache_mode": "bypass", + "markdown_generator": { + "type": "DefaultMarkdownGenerator", + "params": { + "content_filter": { + "type": "PruningContentFilter", + "params": { + "threshold": 0.48, + "threshold_type": "fixed", + "min_word_threshold": 0 + } + } + } + } + } } } ``` -5. **Screenshot with Custom Timing** +**Extraction Strategy**: + +```json +{ + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "extraction_strategy": { + "type": "JsonCssExtractionStrategy", + "params": { + "schema": { + "baseSelector": "article.post", + "fields": [ + {"name": "title", "selector": "h1", "type": "text"}, + {"name": "content", "selector": ".content", "type": "html"} + ] + } + } + } + } + } +} +``` + +**LLM Extraction Strategy** + +```json +{ + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "extraction_strategy": { + "type": "LLMExtractionStrategy", + "params": { + "instruction": "Extract article title, author, publication date and main content", + "provider": "openai/gpt-4", + "api_token": "your-api-token", + "schema": { + "type": "dict", + "value": { + "title": "Article Schema", + "type": "object", + "properties": { + "title": { + "type": "string", + "description": "The article's headline" + }, + "author": { + "type": "string", + "description": "The author's name" + }, + "published_date": { + "type": "string", + "format": "date-time", + "description": "Publication date and time" + }, + "content": { + "type": "string", + "description": "The main article content" + } + }, + "required": ["title", "content"] + } + } + } + } + } + } +} +``` + +**Deep Crawler Example** + +```json +{ + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "deep_crawl_strategy": { + "type": "BFSDeepCrawlStrategy", + "params": { + "max_depth": 3, + "filter_chain": { + "type": "FilterChain", + "params": { + "filters": [ + { + "type": "ContentTypeFilter", + "params": { + "allowed_types": ["text/html", "application/xhtml+xml"] + } + }, + { + "type": "DomainFilter", + "params": { + "allowed_domains": ["blog.*", "docs.*"], + } + } + ] + } + }, + "url_scorer": { + "type": "CompositeScorer", + "params": { + "scorers": [ + { + "type": "KeywordRelevanceScorer", + "params": { + "keywords": ["tutorial", "guide", "documentation"], + } + }, + { + "type": "PathDepthScorer", + "params": { + "weight": 0.5, + "optimal_depth": 3 + } + } + ] + } + } + } + } + } + } +} +``` + +### REST API Examples + +Let's look at some practical examples: + +#### Simple Crawl + ```python -request = { - "urls": "https://example.com", - "screenshot": True, - "crawler_params": { - "headless": True, - "screenshot_wait_for": ".main-content" - }, - 
"extra": { - "delay_before_return_html": 3.0 - } +import requests + +crawl_payload = { + "urls": ["https://example.com"], + "browser_config": {"headless": True}, + "crawler_config": {"stream": False} } +response = requests.post( + "http://localhost:8000/crawl", + # headers={"Authorization": f"Bearer {token}"}, # If JWT is enabled, more on this later + json=crawl_payload +) +print(response.json()) # Print the response for debugging ``` -### Parameter Reference Table +#### Streaming Results -| Category | Parameter | Type | Description | -|----------|-----------|------|-------------| -| Browser | headless | bool | Run browser in headless mode | -| Browser | browser_type | str | Browser engine selection | -| Browser | user_agent | str | Custom user agent string | -| Network | proxy | str | Proxy server URL | -| Network | headers | dict | Custom HTTP headers | -| Timing | page_timeout | int | Page load timeout (ms) | -| Timing | delay_before_return_html | float | Wait before capture | -| Anti-Detection | simulate_user | bool | Human behavior simulation | -| Anti-Detection | magic | bool | Advanced protection | -| Session | session_id | str | Browser session ID | -| Session | user_data_dir | str | Profile directory | -| Content | word_count_threshold | int | Minimum words per block | -| Content | only_text | bool | Text-only extraction | -| Content | process_iframes | bool | Include iframe content | -| Debug | verbose | bool | Detailed logging | -| Debug | log_console | bool | Browser console logs | +```python +async def test_stream_crawl(session, token: str): + """Test the /crawl/stream endpoint with multiple URLs.""" + url = "http://localhost:8000/crawl/stream" + payload = { + "urls": [ + "https://example.com", + "https://example.com/page1", + "https://example.com/page2", + "https://example.com/page3", + ], + "browser_config": {"headless": True, "viewport": {"width": 1200}}, + "crawler_config": {"stream": True, "cache_mode": "bypass"} + } -## Troubleshooting 🔍 + # headers = {"Authorization": f"Bearer {token}"} # If JWT is enabled, more on this later + + try: + async with session.post(url, json=payload, headers=headers) as response: + status = response.status + print(f"Status: {status} (Expected: 200)") + assert status == 200, f"Expected 200, got {status}" + + # Read streaming response line-by-line (NDJSON) + async for line in response.content: + if line: + data = json.loads(line.decode('utf-8').strip()) + print(f"Streamed Result: {json.dumps(data, indent=2)}") + except Exception as e: + print(f"Error in streaming crawl test: {str(e)}") +``` -### Common Issues +## Metrics & Monitoring -1. **Connection Refused** - ``` - Error: Connection refused at localhost:11235 - ``` - Solution: Ensure the container is running and ports are properly mapped. +Keep an eye on your crawler with these endpoints: -2. **Resource Limits** - ``` - Error: No available slots - ``` - Solution: Increase MAX_CONCURRENT_TASKS or container resources. +- `/health` - Quick health check +- `/metrics` - Detailed Prometheus metrics +- `/schema` - Full API schema -3. **GPU Access** - ``` - Error: GPU not found - ``` - Solution: Ensure proper NVIDIA drivers and use `--gpus all` flag. - -### Debug Mode - -Access container for debugging: +Example health check: ```bash -docker run -it --entrypoint /bin/bash unclecode/crawl4ai:all +curl http://localhost:8000/health ``` -View container logs: -```bash -docker logs [container_id] +## Deployment Scenarios + +> 🚧 Coming soon! 
We'll cover: +> - Kubernetes deployment +> - Cloud provider setups (AWS, GCP, Azure) +> - High-availability configurations +> - Load balancing strategies + +## Complete Examples + +Check out the `examples` folder in our repository for full working examples! Here are two to get you started: +[Using Client SDK](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_python_sdk.py) +[Using REST API](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_python_rest_api.py) + +## Server Configuration + +The server's behavior can be customized through the `config.yml` file. Let's explore how to configure your Crawl4AI server for optimal performance and security. + +### Understanding config.yml + +The configuration file is located at `deploy/docker/config.yml`. You can either modify this file before building the image or mount a custom configuration when running the container. + +Here's a detailed breakdown of the configuration options: + +```yaml +# Application Configuration +app: + title: "Crawl4AI API" # Server title in OpenAPI docs + version: "1.0.0" # API version + host: "0.0.0.0" # Listen on all interfaces + port: 8000 # Server port + reload: True # Enable hot reloading (development only) + timeout_keep_alive: 300 # Keep-alive timeout in seconds + +# Rate Limiting Configuration +rate_limiting: + enabled: True # Enable/disable rate limiting + default_limit: "100/minute" # Rate limit format: "number/timeunit" + trusted_proxies: [] # List of trusted proxy IPs + storage_uri: "memory://" # Use "redis://localhost:6379" for production + +# Security Configuration +security: + enabled: false # Master toggle for security features + jwt_enabled: true # Enable JWT authentication + https_redirect: True # Force HTTPS + trusted_hosts: ["*"] # Allowed hosts (use specific domains in production) + headers: # Security headers + x_content_type_options: "nosniff" + x_frame_options: "DENY" + content_security_policy: "default-src 'self'" + strict_transport_security: "max-age=63072000; includeSubDomains" + +# Crawler Configuration +crawler: + memory_threshold_percent: 95.0 # Memory usage threshold + rate_limiter: + base_delay: [1.0, 2.0] # Min and max delay between requests + timeouts: + stream_init: 30.0 # Stream initialization timeout + batch_process: 300.0 # Batch processing timeout + +# Logging Configuration +logging: + level: "INFO" # Log level (DEBUG, INFO, WARNING, ERROR) + format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + +# Observability Configuration +observability: + prometheus: + enabled: True # Enable Prometheus metrics + endpoint: "/metrics" # Metrics endpoint + health_check: + endpoint: "/health" # Health check endpoint ``` -## Best Practices 🌟 +### JWT Authentication -1. **Resource Management** - - Set appropriate memory and CPU limits - - Monitor resource usage via health endpoint - - Use basic version for simple crawling tasks +When `security.jwt_enabled` is set to `true` in your config.yml, all endpoints require JWT authentication via bearer tokens. Here's how it works: -2. **Scaling** - - Use multiple containers for high load - - Implement proper load balancing - - Monitor performance metrics - -3. 
**Security** - - Use environment variables for sensitive data - - Implement proper network isolation - - Regular security updates - -## API Reference 📚 - -### Health Check -```http -GET /health -``` - -### Submit Crawl Task -```http -POST /crawl +#### Getting a Token +```python +POST /token Content-Type: application/json { - "urls": "string or array", - "extraction_config": { - "type": "basic|llm|cosine|json_css", - "params": {} - }, - "priority": 1-10, - "ttl": 3600 + "email": "user@example.com" } ``` -### Get Task Status -```http -GET /task/{task_id} +The endpoint returns: +```json +{ + "email": "user@example.com", + "access_token": "eyJ0eXAiOiJKV1QiLCJhbGciOi...", + "token_type": "bearer" +} ``` -For more details, visit the [official documentation](https://docs.crawl4ai.com/). \ No newline at end of file +#### Using the Token +Add the token to your requests: +```bash +curl -H "Authorization: Bearer eyJ0eXAiOiJKV1QiLCJhbGci..." http://localhost:8000/crawl +``` + +Using the Python SDK: +```python +from crawl4ai.docker_client import Crawl4aiDockerClient + +async with Crawl4aiDockerClient() as client: + # Authenticate first + await client.authenticate("user@example.com") + + # Now all requests will include the token automatically + result = await client.crawl(urls=["https://example.com"]) +``` + +#### Production Considerations 💡 +The default implementation uses a simple email verification. For production use, consider: +- Email verification via OTP/magic links +- OAuth2 integration +- Rate limiting token generation +- Token expiration and refresh mechanisms +- IP-based restrictions + +### Configuration Tips and Best Practices + +1. **Production Settings** 🏭 + + ```yaml + app: + reload: False # Disable reload in production + timeout_keep_alive: 120 # Lower timeout for better resource management + + rate_limiting: + storage_uri: "redis://redis:6379" # Use Redis for distributed rate limiting + default_limit: "50/minute" # More conservative rate limit + + security: + enabled: true # Enable all security features + trusted_hosts: ["your-domain.com"] # Restrict to your domain + ``` + +2. **Development Settings** 🛠️ + + ```yaml + app: + reload: True # Enable hot reloading + timeout_keep_alive: 300 # Longer timeout for debugging + + logging: + level: "DEBUG" # More verbose logging + ``` + +3. **High-Traffic Settings** 🚦 + + ```yaml + crawler: + memory_threshold_percent: 85.0 # More conservative memory limit + rate_limiter: + base_delay: [2.0, 4.0] # More aggressive rate limiting + ``` + +### Customizing Your Configuration + +#### Method 1: Pre-build Configuration + +```bash +# Copy and modify config before building +cd crawl4ai/deploy +vim custom-config.yml # Or use any editor + +# Build with custom config +docker build --platform=linux/amd64 --no-cache -t crawl4ai:latest . +``` + +#### Method 2: Build-time Configuration + +Use a custom config during build: + +```bash +# Build with custom config +docker build --platform=linux/amd64 --no-cache \ + --build-arg CONFIG_PATH=/path/to/custom-config.yml \ + -t crawl4ai:latest . +``` + +#### Method 3: Runtime Configuration +```bash +# Mount custom config at runtime +docker run -d -p 8000:8000 \ + -v $(pwd)/custom-config.yml:/app/config.yml \ + crawl4ai-server:prod +``` + +> 💡 Note: When using Method 2, `/path/to/custom-config.yml` is relative to deploy directory. +> 💡 Note: When using Method 3, ensure your custom config file has all required fields as the container will use this instead of the built-in config. + +### Configuration Recommendations + +1. 
**Security First** 🔒 + - Always enable security in production + - Use specific trusted_hosts instead of wildcards + - Set up proper rate limiting to protect your server + - Consider your environment before enabling HTTPS redirect + +2. **Resource Management** 💻 + - Adjust memory_threshold_percent based on available RAM + - Set timeouts according to your content size and network conditions + - Use Redis for rate limiting in multi-container setups + +3. **Monitoring** 📊 + - Enable Prometheus if you need metrics + - Set DEBUG logging in development, INFO in production + - Regular health check monitoring is crucial + +4. **Performance Tuning** ⚡ + - Start with conservative rate limiter delays + - Increase batch_process timeout for large content + - Adjust stream_init timeout based on initial response times + +## Getting Help + +We're here to help you succeed with Crawl4AI! Here's how to get support: + +- 📖 Check our [full documentation](https://docs.crawl4ai.com) +- 🐛 Found a bug? [Open an issue](https://github.com/unclecode/crawl4ai/issues) +- 💬 Join our [Discord community](https://discord.gg/crawl4ai) +- ⭐ Star us on GitHub to show support! + +## Summary + +In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment: +- Building and running the Docker container +- Configuring the environment +- Making API requests with proper typing +- Using the Python SDK +- Monitoring your deployment + +Remember, the examples in the `examples` folder are your friends - they show real-world usage patterns that you can adapt for your needs. + +Keep exploring, and don't hesitate to reach out if you need help! We're building something amazing together. 🚀 + +Happy crawling! 🕷️ \ No newline at end of file diff --git a/docs/tutorials/coming_soon.md b/docs/tutorials/coming_soon.md new file mode 100644 index 00000000..e69de29b From 7c358a1aee209eb6a79074307f6fe6a2068050af Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 10 Apr 2025 23:25:07 +0800 Subject: [PATCH 45/78] fix(browser): add null check for crawlerRunConfig.url Add additional null check when accessing crawlerRunConfig.url in cookie configuration to prevent potential null pointer exceptions. Previously, the code only checked if crawlerRunConfig existed but not its url property. Fixes potential runtime error when crawlerRunConfig.url is undefined. --- crawl4ai/browser_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index 7fc819e0..f3c7d861 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -658,7 +658,7 @@ class BrowserManager: "name": "cookiesEnabled", "value": "true", "url": crawlerRunConfig.url - if crawlerRunConfig + if crawlerRunConfig and crawlerRunConfig.url else "https://crawl4ai.com/", } ] From 18e8227dfb5df47fc5725e9d56d0bcbfd062075f Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 10 Apr 2025 23:26:09 +0800 Subject: [PATCH 46/78] feat(crawler): add console message capture functionality Add ability to capture browser console messages during crawling: - Implement _capture_console_messages method to collect console logs - Update crawl method to support console message capture - Modify browser_manager page creation to accept full CrawlerRunConfig - Fix request failure text formatting This enhancement allows debugging and monitoring of JavaScript console output during crawling operations. 
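A minimal usage sketch of the new capability (hedged: it assumes the captured entries surface on the crawl result as `console_messages`, mirroring the `console_messages` field added to `AsyncCrawlResponse` below; the local `file://` path is purely illustrative):

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def main():
    # capture_console_messages is the flag this change wires through the crawl path
    config = CrawlerRunConfig(capture_console_messages=True)
    async with AsyncWebCrawler() as crawler:
        # Local file URLs are the case this change targets
        result = await crawler.arun(url="file:///tmp/page_with_logs.html", config=config)
        # Assumed attribute: each entry is a dict with "type", "text", and "timestamp"
        for entry in (result.console_messages or []):
            print(entry["type"], entry["text"])

if __name__ == "__main__":
    asyncio.run(main())
```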
--- crawl4ai/async_crawler_strategy.py | 49 ++++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 2 deletions(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index f99d1cb9..3278c731 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -409,7 +409,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): user_agent = kwargs.get("user_agent", self.user_agent) # Use browser_manager to get a fresh page & context assigned to this session_id - page, context = await self.browser_manager.get_page(session_id, user_agent) + page, context = await self.browser_manager.get_page(CrawlerRunConfig( + session_id=session_id, + user_agent=user_agent, + **kwargs, + )) return session_id async def crawl( @@ -447,12 +451,17 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): html = f.read() if config.screenshot: screenshot_data = await self._generate_screenshot_from_html(html) + if config.capture_console_messages: + page, context = await self.browser_manager.get_page(crawlerRunConfig=config) + captured_console = await self._capture_console_messages(page, url) + return AsyncCrawlResponse( html=html, response_headers=response_headers, status_code=status_code, screenshot=screenshot_data, get_delayed_content=None, + console_messages=captured_console, ) elif url.startswith("raw:") or url.startswith("raw://"): @@ -582,7 +591,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): "url": request.url, "method": request.method, "resource_type": request.resource_type, - "failure_text": request.failure.error_text if request.failure else "Unknown failure", + "failure_text": str(request.failure) if request.failure else "Unknown failure", "timestamp": time.time() }) except Exception as e: @@ -1274,6 +1283,42 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): ) return None + async def _capture_console_messages( + self, page: Page, file_path: str + ) -> List[Dict[str, Union[str, float]]]: + """ + Captures console messages from the page. + Args: + + page (Page): The Playwright page object + Returns: + List[Dict[str, Union[str, float]]]: A list of captured console messages + """ + captured_console = [] + + def handle_console_message(msg): + try: + message_type = msg.type + message_text = msg.text + + entry = { + "type": message_type, + "text": message_text, + "timestamp": time.time(), + } + captured_console.append(entry) + except Exception as e: + if self.logger: + self.logger.warning( + f"Error capturing console message: {e}", tag="CAPTURE" + ) + + page.on("console", handle_console_message) + + await page.goto(file_path) + + return captured_console + async def take_screenshot(self, page, **kwargs) -> str: """ Take a screenshot of the current page. 
From 3179d6ad0c03e40080ba1ec8274f4690019a39bb Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 11 Apr 2025 20:58:39 +0800 Subject: [PATCH 47/78] fix(core): improve error handling and stability in core components Enhance error handling and stability across multiple components: - Add safety checks in async_configs.py for type and params existence - Fix browser manager initialization and cleanup logic - Add default LLM config fallback in extraction strategy - Add comprehensive Docker deployment guide and server tests BREAKING CHANGE: BrowserManager.start() now automatically closes existing instances --- crawl4ai/async_configs.py | 22 +- crawl4ai/browser_manager.py | 8 +- crawl4ai/extraction_strategy.py | 9 +- deploy/docker/README-new.md | 644 ++++++++++++++++++++++++++ deploy/docker/api.py | 28 +- deploy/docker/config.yml | 2 +- tests/docker/test_server_requests.py | 650 +++++++++++++++++++++++++++ 7 files changed, 1336 insertions(+), 27 deletions(-) create mode 100644 deploy/docker/README-new.md create mode 100644 tests/docker/test_server_requests.py diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index af98e607..2f421178 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -122,23 +122,25 @@ def from_serializable_dict(data: Any) -> Any: # Handle typed data if isinstance(data, dict) and "type" in data: # Handle plain dictionaries - if data["type"] == "dict": + if data["type"] == "dict" and "value" in data: return {k: from_serializable_dict(v) for k, v in data["value"].items()} # Import from crawl4ai for class instances import crawl4ai - cls = getattr(crawl4ai, data["type"]) + if hasattr(crawl4ai, data["type"]): + cls = getattr(crawl4ai, data["type"]) - # Handle Enum - if issubclass(cls, Enum): - return cls(data["params"]) + # Handle Enum + if issubclass(cls, Enum): + return cls(data["params"]) - # Handle class instances - constructor_args = { - k: from_serializable_dict(v) for k, v in data["params"].items() - } - return cls(**constructor_args) + if "params" in data: + # Handle class instances + constructor_args = { + k: from_serializable_dict(v) for k, v in data["params"].items() + } + return cls(**constructor_args) # Handle lists if isinstance(data, list): diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index f3c7d861..bfe22f4e 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -491,10 +491,12 @@ class BrowserManager: Note: This method should be called in a separate task to avoid blocking the main event loop. 
""" - if self.playwright is None: - from playwright.async_api import async_playwright + if self.playwright is not None: + await self.close() + + from playwright.async_api import async_playwright - self.playwright = await async_playwright().start() + self.playwright = await async_playwright().start() if self.config.cdp_url or self.config.use_managed_browser: self.config.use_managed_browser = True diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index bf4825cc..954fe37e 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -7,7 +7,9 @@ import time from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH, PROMPT_EXTRACT_INFERRED_SCHEMA from .config import ( - DEFAULT_PROVIDER, CHUNK_TOKEN_THRESHOLD, + DEFAULT_PROVIDER, + DEFAULT_PROVIDER_API_KEY, + CHUNK_TOKEN_THRESHOLD, OVERLAP_RATE, WORD_TOKEN_RATE, ) @@ -542,6 +544,11 @@ class LLMExtractionStrategy(ExtractionStrategy): """ super().__init__( input_format=input_format, **kwargs) self.llm_config = llm_config + if not self.llm_config: + self.llm_config = create_llm_config( + provider=DEFAULT_PROVIDER, + api_token=os.environ.get(DEFAULT_PROVIDER_API_KEY), + ) self.instruction = instruction self.extract_type = extraction_type self.schema = schema diff --git a/deploy/docker/README-new.md b/deploy/docker/README-new.md new file mode 100644 index 00000000..3a9bdf52 --- /dev/null +++ b/deploy/docker/README-new.md @@ -0,0 +1,644 @@ +# Crawl4AI Docker Guide 🐳 + +## Table of Contents +- [Prerequisites](#prerequisites) +- [Installation](#installation) + - [Option 1: Using Docker Compose (Recommended)](#option-1-using-docker-compose-recommended) + - [Option 2: Manual Local Build & Run](#option-2-manual-local-build--run) + - [Option 3: Using Pre-built Docker Hub Images](#option-3-using-pre-built-docker-hub-images) +- [Dockerfile Parameters](#dockerfile-parameters) +- [Using the API](#using-the-api) + - [Understanding Request Schema](#understanding-request-schema) + - [REST API Examples](#rest-api-examples) + - [Python SDK](#python-sdk) +- [Metrics & Monitoring](#metrics--monitoring) +- [Deployment Scenarios](#deployment-scenarios) +- [Complete Examples](#complete-examples) +- [Server Configuration](#server-configuration) + - [Understanding config.yml](#understanding-configyml) + - [JWT Authentication](#jwt-authentication) + - [Configuration Tips and Best Practices](#configuration-tips-and-best-practices) + - [Customizing Your Configuration](#customizing-your-configuration) + - [Configuration Recommendations](#configuration-recommendations) +- [Getting Help](#getting-help) + +## Prerequisites + +Before we dive in, make sure you have: +- Docker installed and running (version 20.10.0 or higher), including `docker compose` (usually bundled with Docker Desktop). +- `git` for cloning the repository. +- At least 4GB of RAM available for the container (more recommended for heavy use). +- Python 3.10+ (if using the Python SDK). +- Node.js 16+ (if using the Node.js examples). + +> 💡 **Pro tip**: Run `docker info` to check your Docker installation and available resources. + +## Installation + +We offer several ways to get the Crawl4AI server running. Docker Compose is the easiest way to manage local builds and runs. + +### Option 1: Using Docker Compose (Recommended) + +Docker Compose simplifies building and running the service, especially for local development and testing across different platforms. + +#### 1. 
Clone Repository + +```bash +git clone https://github.com/unclecode/crawl4ai.git +cd crawl4ai +``` + +#### 2. Environment Setup (API Keys) + +If you plan to use LLMs, copy the example environment file and add your API keys. This file should be in the **project root directory**. + +```bash +# Make sure you are in the 'crawl4ai' root directory +cp deploy/docker/.llm.env.example .llm.env + +# Now edit .llm.env and add your API keys +# Example content: +# OPENAI_API_KEY=sk-your-key +# ANTHROPIC_API_KEY=your-anthropic-key +# ... +``` +> 🔑 **Note**: Keep your API keys secure! Never commit `.llm.env` to version control. + +#### 3. Build and Run with Compose + +The `docker-compose.yml` file in the project root defines services for different scenarios using **profiles**. + +* **Build and Run Locally (AMD64):** + ```bash + # Builds the image locally using Dockerfile and runs it + docker compose --profile local-amd64 up --build -d + ``` + +* **Build and Run Locally (ARM64):** + ```bash + # Builds the image locally using Dockerfile and runs it + docker compose --profile local-arm64 up --build -d + ``` + +* **Run Pre-built Image from Docker Hub (AMD64):** + ```bash + # Pulls and runs the specified AMD64 image from Docker Hub + # (Set VERSION env var for specific tags, e.g., VERSION=0.5.1-d1) + docker compose --profile hub-amd64 up -d + ``` + +* **Run Pre-built Image from Docker Hub (ARM64):** + ```bash + # Pulls and runs the specified ARM64 image from Docker Hub + docker compose --profile hub-arm64 up -d + ``` + +> The server will be available at `http://localhost:11235`. + +#### 4. Stopping Compose Services + +```bash +# Stop the service(s) associated with a profile (e.g., local-amd64) +docker compose --profile local-amd64 down +``` + +### Option 2: Manual Local Build & Run + +If you prefer not to use Docker Compose for local builds. + +#### 1. Clone Repository & Setup Environment + +Follow steps 1 and 2 from the Docker Compose section above (clone repo, `cd crawl4ai`, create `.llm.env` in the root). + +#### 2. Build the Image (Multi-Arch) + +Use `docker buildx` to build the image. This example builds for multiple platforms and loads the image matching your host architecture into the local Docker daemon. + +```bash +# Make sure you are in the 'crawl4ai' root directory +docker buildx build --platform linux/amd64,linux/arm64 -t crawl4ai-local:latest --load . +``` + +#### 3. Run the Container + +* **Basic run (no LLM support):** + ```bash + # Replace --platform if your host is ARM64 + docker run -d \ + -p 11235:11235 \ + --name crawl4ai-standalone \ + --shm-size=1g \ + --platform linux/amd64 \ + crawl4ai-local:latest + ``` + +* **With LLM support:** + ```bash + # Make sure .llm.env is in the current directory (project root) + # Replace --platform if your host is ARM64 + docker run -d \ + -p 11235:11235 \ + --name crawl4ai-standalone \ + --env-file .llm.env \ + --shm-size=1g \ + --platform linux/amd64 \ + crawl4ai-local:latest + ``` + +> The server will be available at `http://localhost:11235`. + +#### 4. Stopping the Manual Container + +```bash +docker stop crawl4ai-standalone && docker rm crawl4ai-standalone +``` + +### Option 3: Using Pre-built Docker Hub Images + +Pull and run images directly from Docker Hub without building locally. + +#### 1. Pull the Image + +We use a versioning scheme like `LIBRARY_VERSION-dREVISION` (e.g., `0.5.1-d1`). The `latest` tag points to the most recent stable release. 
Images are built with multi-arch manifests, so Docker usually pulls the correct version for your system automatically. + +```bash +# Pull a specific version (recommended for stability) +docker pull unclecode/crawl4ai:0.5.1-d1 + +# Or pull the latest stable version +docker pull unclecode/crawl4ai:latest +``` + +#### 2. Setup Environment (API Keys) + +If using LLMs, create the `.llm.env` file in a directory of your choice, similar to Step 2 in the Compose section. + +#### 3. Run the Container + +* **Basic run:** + ```bash + docker run -d \ + -p 11235:11235 \ + --name crawl4ai-hub \ + --shm-size=1g \ + unclecode/crawl4ai:0.5.1-d1 # Or use :latest + ``` + +* **With LLM support:** + ```bash + # Make sure .llm.env is in the current directory you are running docker from + docker run -d \ + -p 11235:11235 \ + --name crawl4ai-hub \ + --env-file .llm.env \ + --shm-size=1g \ + unclecode/crawl4ai:0.5.1-d1 # Or use :latest + ``` + +> The server will be available at `http://localhost:11235`. + +#### 4. Stopping the Hub Container + +```bash +docker stop crawl4ai-hub && docker rm crawl4ai-hub +``` + +#### Docker Hub Versioning Explained + +* **Image Name:** `unclecode/crawl4ai` +* **Tag Format:** `LIBRARY_VERSION-dREVISION` + * `LIBRARY_VERSION`: The Semantic Version of the core `crawl4ai` Python library included (e.g., `0.5.1`). + * `dREVISION`: An incrementing number (starting at `d1`) for Docker build changes made *without* changing the library version (e.g., base image updates, dependency fixes). Resets to `d1` for each new `LIBRARY_VERSION`. +* **Example:** `unclecode/crawl4ai:0.5.1-d1` +* **`latest` Tag:** Points to the most recent stable `LIBRARY_VERSION-dREVISION`. +* **Multi-Arch:** Images support `linux/amd64` and `linux/arm64`. Docker automatically selects the correct architecture. + +--- + +*(Rest of the document remains largely the same, but with key updates below)* + +--- + +## Dockerfile Parameters + +You can customize the image build process using build arguments (`--build-arg`). These are typically used via `docker buildx build` or within the `docker-compose.yml` file. + +```bash +# Example: Build with 'all' features using buildx +docker buildx build \ + --platform linux/amd64,linux/arm64 \ + --build-arg INSTALL_TYPE=all \ + -t yourname/crawl4ai-all:latest \ + --load \ + . # Build from root context +``` + +### Build Arguments Explained + +| Argument | Description | Default | Options | +| :----------- | :--------------------------------------- | :-------- | :--------------------------------- | +| INSTALL_TYPE | Feature set | `default` | `default`, `all`, `torch`, `transformer` | +| ENABLE_GPU | GPU support (CUDA for AMD64) | `false` | `true`, `false` | +| APP_HOME | Install path inside container (advanced) | `/app` | any valid path | +| USE_LOCAL | Install library from local source | `true` | `true`, `false` | +| GITHUB_REPO | Git repo to clone if USE_LOCAL=false | *(see Dockerfile)* | any git URL | +| GITHUB_BRANCH| Git branch to clone if USE_LOCAL=false | `main` | any branch name | + +*(Note: PYTHON_VERSION is fixed by the `FROM` instruction in the Dockerfile)* + +### Build Best Practices + +1. **Choose the Right Install Type** + * `default`: Basic installation, smallest image size. Suitable for most standard web scraping and markdown generation. + * `all`: Full features including `torch` and `transformers` for advanced extraction strategies (e.g., CosineStrategy, certain LLM filters). Significantly larger image. Ensure you need these extras. +2. 
**Platform Considerations** + * Use `buildx` for building multi-architecture images, especially for pushing to registries. + * Use `docker compose` profiles (`local-amd64`, `local-arm64`) for easy platform-specific local builds. +3. **Performance Optimization** + * The image automatically includes platform-specific optimizations (OpenMP for AMD64, OpenBLAS for ARM64). + +--- + +## Using the API + +Communicate with the running Docker server via its REST API (defaulting to `http://localhost:11235`). You can use the Python SDK or make direct HTTP requests. + +### Python SDK + +Install the SDK: `pip install crawl4ai` + +```python +import asyncio +from crawl4ai.docker_client import Crawl4aiDockerClient +from crawl4ai import BrowserConfig, CrawlerRunConfig, CacheMode # Assuming you have crawl4ai installed + +async def main(): + # Point to the correct server port + async with Crawl4aiDockerClient(base_url="http://localhost:11235", verbose=True) as client: + # If JWT is enabled on the server, authenticate first: + # await client.authenticate("user@example.com") # See Server Configuration section + + # Example Non-streaming crawl + print("--- Running Non-Streaming Crawl ---") + results = await client.crawl( + ["https://httpbin.org/html"], + browser_config=BrowserConfig(headless=True), # Use library classes for config aid + crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + ) + if results: # client.crawl returns None on failure + print(f"Non-streaming results success: {results.success}") + if results.success: + for result in results: # Iterate through the CrawlResultContainer + print(f"URL: {result.url}, Success: {result.success}") + else: + print("Non-streaming crawl failed.") + + + # Example Streaming crawl + print("\n--- Running Streaming Crawl ---") + stream_config = CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS) + try: + async for result in await client.crawl( # client.crawl returns an async generator for streaming + ["https://httpbin.org/html", "https://httpbin.org/links/5/0"], + browser_config=BrowserConfig(headless=True), + crawler_config=stream_config + ): + print(f"Streamed result: URL: {result.url}, Success: {result.success}") + except Exception as e: + print(f"Streaming crawl failed: {e}") + + + # Example Get schema + print("\n--- Getting Schema ---") + schema = await client.get_schema() + print(f"Schema received: {bool(schema)}") # Print whether schema was received + +if __name__ == "__main__": + asyncio.run(main()) +``` + +*(SDK parameters like timeout, verify_ssl etc. remain the same)* + +### Second Approach: Direct API Calls + +Crucially, when sending configurations directly via JSON, they **must** follow the `{"type": "ClassName", "params": {...}}` structure for any non-primitive value (like config objects or strategies). Dictionaries must be wrapped as `{"type": "dict", "value": {...}}`. 
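For example, here is a minimal `/crawl` payload that follows this convention — the `BrowserConfig` and `CrawlerRunConfig` objects use the `type`/`params` wrapper, and the plain `viewport` dictionary uses the `type`/`value` wrapper. Values are illustrative only, mirroring the fuller examples further below:

```json
{
  "urls": ["https://httpbin.org/html"],
  "browser_config": {
    "type": "BrowserConfig",
    "params": {
      "headless": true,
      "viewport": {"type": "dict", "value": {"width": 1200, "height": 800}}
    }
  },
  "crawler_config": {
    "type": "CrawlerRunConfig",
    "params": {"stream": false, "cache_mode": "bypass"}
  }
}
```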
+ +*(Keep the detailed explanation of Configuration Structure, Basic Pattern, Simple vs Complex, Strategy Pattern, Complex Nested Example, Quick Grammar Overview, Important Rules, Pro Tip)* + +#### More Examples *(Ensure Schema example uses type/value wrapper)* + +**Advanced Crawler Configuration** +*(Keep example, ensure cache_mode uses valid enum value like "bypass")* + +**Extraction Strategy** +```json +{ + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "extraction_strategy": { + "type": "JsonCssExtractionStrategy", + "params": { + "schema": { + "type": "dict", + "value": { + "baseSelector": "article.post", + "fields": [ + {"name": "title", "selector": "h1", "type": "text"}, + {"name": "content", "selector": ".content", "type": "html"} + ] + } + } + } + } + } + } +} +``` + +**LLM Extraction Strategy** *(Keep example, ensure schema uses type/value wrapper)* +*(Keep Deep Crawler Example)* + +### REST API Examples + +Update URLs to use port `11235`. + +#### Simple Crawl + +```python +import requests + +# Configuration objects converted to the required JSON structure +browser_config_payload = { + "type": "BrowserConfig", + "params": {"headless": True} +} +crawler_config_payload = { + "type": "CrawlerRunConfig", + "params": {"stream": False, "cache_mode": "bypass"} # Use string value of enum +} + +crawl_payload = { + "urls": ["https://httpbin.org/html"], + "browser_config": browser_config_payload, + "crawler_config": crawler_config_payload +} +response = requests.post( + "http://localhost:11235/crawl", # Updated port + # headers={"Authorization": f"Bearer {token}"}, # If JWT is enabled + json=crawl_payload +) +print(f"Status Code: {response.status_code}") +if response.ok: + print(response.json()) +else: + print(f"Error: {response.text}") + +``` + +#### Streaming Results + +```python +import json +import httpx # Use httpx for async streaming example + +async def test_stream_crawl(token: str = None): # Made token optional + """Test the /crawl/stream endpoint with multiple URLs.""" + url = "http://localhost:11235/crawl/stream" # Updated port + payload = { + "urls": [ + "https://httpbin.org/html", + "https://httpbin.org/links/5/0", + ], + "browser_config": { + "type": "BrowserConfig", + "params": {"headless": True, "viewport": {"type": "dict", "value": {"width": 1200, "height": 800}}} # Viewport needs type:dict + }, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": {"stream": True, "cache_mode": "bypass"} + } + } + + headers = {} + # if token: + # headers = {"Authorization": f"Bearer {token}"} # If JWT is enabled + + try: + async with httpx.AsyncClient() as client: + async with client.stream("POST", url, json=payload, headers=headers, timeout=120.0) as response: + print(f"Status: {response.status_code} (Expected: 200)") + response.raise_for_status() # Raise exception for bad status codes + + # Read streaming response line-by-line (NDJSON) + async for line in response.aiter_lines(): + if line: + try: + data = json.loads(line) + # Check for completion marker + if data.get("status") == "completed": + print("Stream completed.") + break + print(f"Streamed Result: {json.dumps(data, indent=2)}") + except json.JSONDecodeError: + print(f"Warning: Could not decode JSON line: {line}") + + except httpx.HTTPStatusError as e: + print(f"HTTP error occurred: {e.response.status_code} - {e.response.text}") + except Exception as e: + print(f"Error in streaming crawl test: {str(e)}") + +# To run this example: +# import asyncio +# asyncio.run(test_stream_crawl()) +``` + +--- + +## 
Metrics & Monitoring + +Keep an eye on your crawler with these endpoints: + +- `/health` - Quick health check +- `/metrics` - Detailed Prometheus metrics +- `/schema` - Full API schema + +Example health check: +```bash +curl http://localhost:11235/health +``` + +--- + +*(Deployment Scenarios and Complete Examples sections remain the same, maybe update links if examples moved)* + +--- + +## Server Configuration + +The server's behavior can be customized through the `config.yml` file. + +### Understanding config.yml + +The configuration file is loaded from `/app/config.yml` inside the container. By default, the file from `deploy/docker/config.yml` in the repository is copied there during the build. + +Here's a detailed breakdown of the configuration options (using defaults from `deploy/docker/config.yml`): + +```yaml +# Application Configuration +app: + title: "Crawl4AI API" + version: "1.0.0" # Consider setting this to match library version, e.g., "0.5.1" + host: "0.0.0.0" + port: 8020 # NOTE: This port is used ONLY when running server.py directly. Gunicorn overrides this (see supervisord.conf). + reload: False # Default set to False - suitable for production + timeout_keep_alive: 300 + +# Default LLM Configuration +llm: + provider: "openai/gpt-4o-mini" + api_key_env: "OPENAI_API_KEY" + # api_key: sk-... # If you pass the API key directly then api_key_env will be ignored + +# Redis Configuration (Used by internal Redis server managed by supervisord) +redis: + host: "localhost" + port: 6379 + db: 0 + password: "" + # ... other redis options ... + +# Rate Limiting Configuration +rate_limiting: + enabled: True + default_limit: "1000/minute" + trusted_proxies: [] + storage_uri: "memory://" # Use "redis://localhost:6379" if you need persistent/shared limits + +# Security Configuration +security: + enabled: false # Master toggle for security features + jwt_enabled: false # Enable JWT authentication (requires security.enabled=true) + https_redirect: false # Force HTTPS (requires security.enabled=true) + trusted_hosts: ["*"] # Allowed hosts (use specific domains in production) + headers: # Security headers (applied if security.enabled=true) + x_content_type_options: "nosniff" + x_frame_options: "DENY" + content_security_policy: "default-src 'self'" + strict_transport_security: "max-age=63072000; includeSubDomains" + +# Crawler Configuration +crawler: + memory_threshold_percent: 95.0 + rate_limiter: + base_delay: [1.0, 2.0] # Min/max delay between requests in seconds for dispatcher + timeouts: + stream_init: 30.0 # Timeout for stream initialization + batch_process: 300.0 # Timeout for non-streaming /crawl processing + +# Logging Configuration +logging: + level: "INFO" + format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + +# Observability Configuration +observability: + prometheus: + enabled: True + endpoint: "/metrics" + health_check: + endpoint: "/health" +``` + +*(JWT Authentication section remains the same, just note the default port is now 11235 for requests)* + +*(Configuration Tips and Best Practices remain the same)* + +### Customizing Your Configuration + +You can override the default `config.yml`. + +#### Method 1: Modify Before Build + +1. Edit the `deploy/docker/config.yml` file in your local repository clone. +2. Build the image using `docker buildx` or `docker compose --profile local-... up --build`. The modified file will be copied into the image. + +#### Method 2: Runtime Mount (Recommended for Custom Deploys) + +1. 
Create your custom configuration file, e.g., `my-custom-config.yml` locally. Ensure it contains all necessary sections. +2. Mount it when running the container: + + * **Using `docker run`:** + ```bash + # Assumes my-custom-config.yml is in the current directory + docker run -d -p 11235:11235 \ + --name crawl4ai-custom-config \ + --env-file .llm.env \ + --shm-size=1g \ + -v $(pwd)/my-custom-config.yml:/app/config.yml \ + unclecode/crawl4ai:latest # Or your specific tag + ``` + + * **Using `docker-compose.yml`:** Add a `volumes` section to the service definition: + ```yaml + services: + crawl4ai-hub-amd64: # Or your chosen service + image: unclecode/crawl4ai:latest + profiles: ["hub-amd64"] + <<: *base-config + volumes: + # Mount local custom config over the default one in the container + - ./my-custom-config.yml:/app/config.yml + # Keep the shared memory volume from base-config + - /dev/shm:/dev/shm + ``` + *(Note: Ensure `my-custom-config.yml` is in the same directory as `docker-compose.yml`)* + +> 💡 When mounting, your custom file *completely replaces* the default one. Ensure it's a valid and complete configuration. + +### Configuration Recommendations + +1. **Security First** 🔒 + - Always enable security in production + - Use specific trusted_hosts instead of wildcards + - Set up proper rate limiting to protect your server + - Consider your environment before enabling HTTPS redirect + +2. **Resource Management** 💻 + - Adjust memory_threshold_percent based on available RAM + - Set timeouts according to your content size and network conditions + - Use Redis for rate limiting in multi-container setups + +3. **Monitoring** 📊 + - Enable Prometheus if you need metrics + - Set DEBUG logging in development, INFO in production + - Regular health check monitoring is crucial + +4. **Performance Tuning** ⚡ + - Start with conservative rate limiter delays + - Increase batch_process timeout for large content + - Adjust stream_init timeout based on initial response times + +## Getting Help + +We're here to help you succeed with Crawl4AI! Here's how to get support: + +- 📖 Check our [full documentation](https://docs.crawl4ai.com) +- 🐛 Found a bug? [Open an issue](https://github.com/unclecode/crawl4ai/issues) +- 💬 Join our [Discord community](https://discord.gg/crawl4ai) +- ⭐ Star us on GitHub to show support! + +## Summary + +In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment: +- Building and running the Docker container +- Configuring the environment +- Making API requests with proper typing +- Using the Python SDK +- Monitoring your deployment + +Remember, the examples in the `examples` folder are your friends - they show real-world usage patterns that you can adapt for your needs. + +Keep exploring, and don't hesitate to reach out if you need help! We're building something amazing together. 🚀 + +Happy crawling! 
🕷️ diff --git a/deploy/docker/api.py b/deploy/docker/api.py index 33802772..c01696b2 100644 --- a/deploy/docker/api.py +++ b/deploy/docker/api.py @@ -388,21 +388,25 @@ async def handle_crawl_request( ) ) - async with AsyncWebCrawler(config=browser_config) as crawler: - results = [] - func = getattr(crawler, "arun" if len(urls) == 1 else "arun_many") - partial_func = partial(func, - urls[0] if len(urls) == 1 else urls, - config=crawler_config, - dispatcher=dispatcher) - results = await partial_func() - return { - "success": True, - "results": [result.model_dump() for result in results] - } + crawler: AsyncWebCrawler = AsyncWebCrawler(config=browser_config) + await crawler.start() + results = [] + func = getattr(crawler, "arun" if len(urls) == 1 else "arun_many") + partial_func = partial(func, + urls[0] if len(urls) == 1 else urls, + config=crawler_config, + dispatcher=dispatcher) + results = await partial_func() + await crawler.close() + return { + "success": True, + "results": [result.model_dump() for result in results] + } except Exception as e: logger.error(f"Crawl error: {str(e)}", exc_info=True) + if 'crawler' in locals(): + await crawler.close() raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e) diff --git a/deploy/docker/config.yml b/deploy/docker/config.yml index b7ef4885..3b5fead6 100644 --- a/deploy/docker/config.yml +++ b/deploy/docker/config.yml @@ -4,7 +4,7 @@ app: version: "1.0.0" host: "0.0.0.0" port: 8020 - reload: True + reload: False timeout_keep_alive: 300 # Default LLM Configuration diff --git a/tests/docker/test_server_requests.py b/tests/docker/test_server_requests.py new file mode 100644 index 00000000..ab8b8ced --- /dev/null +++ b/tests/docker/test_server_requests.py @@ -0,0 +1,650 @@ +import pytest +import pytest_asyncio +import httpx +import json +import asyncio +import os +from typing import List, Dict, Any, AsyncGenerator + +# Optional: Import crawl4ai classes directly for reference/easier payload creation aid +# You don't strictly NEED these imports for the tests to run against the server, +# but they help in understanding the structure you are mimicking in JSON. 
+from crawl4ai import ( + BrowserConfig, + CrawlerRunConfig, + CacheMode, + DefaultMarkdownGenerator, + PruningContentFilter, + BM25ContentFilter, + BFSDeepCrawlStrategy, + FilterChain, + ContentTypeFilter, + DomainFilter, + CompositeScorer, + KeywordRelevanceScorer, + PathDepthScorer, + JsonCssExtractionStrategy, + LLMExtractionStrategy, + LLMConfig +) + +# --- Test Configuration --- +BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") # Make base URL configurable +# Use a known simple HTML page for basic tests +SIMPLE_HTML_URL = "https://httpbin.org/html" +# Use a site suitable for scraping tests +SCRAPE_TARGET_URL = "http://books.toscrape.com/" +# Use a site with internal links for deep crawl tests +DEEP_CRAWL_URL = "https://python.org" + +# --- Pytest Fixtures --- + +# Use the built-in event_loop fixture from pytest_asyncio +# The custom implementation was causing issues with closing the loop + +@pytest_asyncio.fixture(scope="function") # Changed to function scope to avoid event loop issues +async def async_client() -> AsyncGenerator[httpx.AsyncClient, None]: + """Provides an async HTTP client""" + client = httpx.AsyncClient(base_url=BASE_URL, timeout=120.0) + yield client + await client.aclose() + +# --- Helper Functions --- + +async def check_server_health(client: httpx.AsyncClient): + """Check if the server is healthy before running tests.""" + try: + response = await client.get("/health") + response.raise_for_status() + print(f"\nServer healthy: {response.json()}") + return True + except (httpx.RequestError, httpx.HTTPStatusError) as e: + pytest.fail(f"Server health check failed: {e}. Is the server running at {BASE_URL}?", pytrace=False) + +async def assert_crawl_result_structure(result: Dict[str, Any]): + """Asserts the basic structure of a single crawl result.""" + assert isinstance(result, dict) + assert "url" in result + assert "success" in result + assert "html" in result + # Add more common checks if needed + +async def process_streaming_response(response: httpx.Response) -> List[Dict[str, Any]]: + """Processes an NDJSON streaming response.""" + results = [] + completed = False + async for line in response.aiter_lines(): + if line: + try: + data = json.loads(line) + if data.get("status") == "completed": + completed = True + break # Stop processing after completion marker + else: + results.append(data) + except json.JSONDecodeError: + pytest.fail(f"Failed to decode JSON line: {line}") + assert completed, "Streaming response did not end with a completion marker." + return results + + +# --- Test Class --- + +@pytest.mark.asyncio +class TestCrawlEndpoints: + + @pytest_asyncio.fixture(autouse=True) + async def check_health_before_tests(self, async_client: httpx.AsyncClient): + """Fixture to ensure server is healthy before each test in the class.""" + await check_server_health(async_client) + + # 1. 
Simple Requests (Primitives) + async def test_simple_crawl_single_url(self, async_client: httpx.AsyncClient): + """Test /crawl with a single URL and simple config values.""" + payload = { + "urls": [SIMPLE_HTML_URL], + "browser_config": { + "type": "BrowserConfig", + "params": { + "headless": True, + } + }, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "stream": False, # Explicitly false for /crawl + "screenshot": False, + "cache_mode": CacheMode.BYPASS.value # Use enum value + } + } + } + try: + response = await async_client.post("/crawl", json=payload) + print(f"Response status: {response.status_code}") + response.raise_for_status() + data = response.json() + except httpx.HTTPStatusError as e: + print(f"Server error: {e}") + print(f"Response content: {e.response.text}") + raise + + assert data["success"] is True + assert isinstance(data["results"], list) + assert len(data["results"]) == 1 + result = data["results"][0] + await assert_crawl_result_structure(result) + assert result["success"] is True + assert result["url"] == SIMPLE_HTML_URL + assert "
<h1>Herman Melville - Moby-Dick</h1>
" in result["html"] + # We don't specify a markdown generator in this test, so don't make assumptions about markdown field + # It might be null, missing, or populated depending on the server's default behavior + + async def test_simple_crawl_single_url_streaming(self, async_client: httpx.AsyncClient): + """Test /crawl/stream with a single URL and simple config values.""" + payload = { + "urls": [SIMPLE_HTML_URL], + "browser_config": { + "type": "BrowserConfig", + "params": { + "headless": True, + } + }, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "stream": True, # Must be true for /crawl/stream + "screenshot": False, + "cache_mode": CacheMode.BYPASS.value + } + } + } + async with async_client.stream("POST", "/crawl/stream", json=payload) as response: + response.raise_for_status() + results = await process_streaming_response(response) + + assert len(results) == 1 + result = results[0] + await assert_crawl_result_structure(result) + assert result["success"] is True + assert result["url"] == SIMPLE_HTML_URL + assert "
<h1>Herman Melville - Moby-Dick</h1>
" in result["html"] + + + # 2. Multi-URL and Dispatcher + async def test_multi_url_crawl(self, async_client: httpx.AsyncClient): + """Test /crawl with multiple URLs, implicitly testing dispatcher.""" + urls = [SIMPLE_HTML_URL, "https://httpbin.org/links/10/0"] + payload = { + "urls": urls, + "browser_config": { + "type": "BrowserConfig", + "params": {"headless": True} + }, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": {"stream": False, "cache_mode": CacheMode.BYPASS.value} + } + } + try: + print(f"Sending deep crawl request to server...") + response = await async_client.post("/crawl", json=payload) + print(f"Response status: {response.status_code}") + + if response.status_code >= 400: + error_detail = response.json().get('detail', 'No detail provided') + print(f"Error detail: {error_detail}") + print(f"Full response: {response.text}") + + response.raise_for_status() + data = response.json() + except httpx.HTTPStatusError as e: + print(f"Server error status: {e.response.status_code}") + print(f"Server error response: {e.response.text}") + try: + error_json = e.response.json() + print(f"Parsed error: {error_json}") + except: + print("Could not parse error response as JSON") + raise + + assert data["success"] is True + assert isinstance(data["results"], list) + assert len(data["results"]) == len(urls) + for result in data["results"]: + await assert_crawl_result_structure(result) + assert result["success"] is True + assert result["url"] in urls + + async def test_multi_url_crawl_streaming(self, async_client: httpx.AsyncClient): + """Test /crawl/stream with multiple URLs.""" + urls = [SIMPLE_HTML_URL, "https://httpbin.org/links/10/0"] + payload = { + "urls": urls, + "browser_config": { + "type": "BrowserConfig", + "params": {"headless": True} + }, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": {"stream": True, "cache_mode": CacheMode.BYPASS.value} + } + } + async with async_client.stream("POST", "/crawl/stream", json=payload) as response: + response.raise_for_status() + results = await process_streaming_response(response) + + assert len(results) == len(urls) + processed_urls = set() + for result in results: + await assert_crawl_result_structure(result) + assert result["success"] is True + assert result["url"] in urls + processed_urls.add(result["url"]) + assert processed_urls == set(urls) # Ensure all URLs were processed + + + # 3. 
Class Values and Nested Classes (Markdown Generator) + async def test_crawl_with_markdown_pruning_filter(self, async_client: httpx.AsyncClient): + """Test /crawl with MarkdownGenerator using PruningContentFilter.""" + payload = { + "urls": [SIMPLE_HTML_URL], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "cache_mode": CacheMode.ENABLED.value, # Test different cache mode + "markdown_generator": { + "type": "DefaultMarkdownGenerator", + "params": { + "content_filter": { + "type": "PruningContentFilter", + "params": { + "threshold": 0.5, # Example param + "threshold_type": "relative" + } + } + } + } + } + } + } + try: + print(f"Sending deep crawl request to server...") + response = await async_client.post("/crawl", json=payload) + print(f"Response status: {response.status_code}") + + if response.status_code >= 400: + error_detail = response.json().get('detail', 'No detail provided') + print(f"Error detail: {error_detail}") + print(f"Full response: {response.text}") + + response.raise_for_status() + data = response.json() + except httpx.HTTPStatusError as e: + print(f"Server error status: {e.response.status_code}") + print(f"Server error response: {e.response.text}") + try: + error_json = e.response.json() + print(f"Parsed error: {error_json}") + except: + print("Could not parse error response as JSON") + raise + + assert data["success"] is True + assert len(data["results"]) == 1 + result = data["results"][0] + await assert_crawl_result_structure(result) + assert result["success"] is True + assert "markdown" in result + assert isinstance(result["markdown"], dict) + assert "raw_markdown" in result["markdown"] + assert "fit_markdown" in result["markdown"] # Pruning creates fit_markdown + assert "Moby-Dick" in result["markdown"]["raw_markdown"] + # Fit markdown content might be different/shorter due to pruning + assert len(result["markdown"]["fit_markdown"]) <= len(result["markdown"]["raw_markdown"]) + + async def test_crawl_with_markdown_bm25_filter(self, async_client: httpx.AsyncClient): + """Test /crawl with MarkdownGenerator using BM25ContentFilter.""" + payload = { + "urls": [SIMPLE_HTML_URL], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "markdown_generator": { + "type": "DefaultMarkdownGenerator", + "params": { + "content_filter": { + "type": "BM25ContentFilter", + "params": { + "user_query": "Herman Melville", # Query for BM25 + "bm25_threshold": 0.1, # Lower threshold to increase matches + "language": "english" # Valid parameters + } + } + } + } + } + } + } + try: + print(f"Payload for BM25 test: {json.dumps(payload)}") + response = await async_client.post("/crawl", json=payload) + print(f"Response status: {response.status_code}") + + if response.status_code >= 400: + error_detail = response.json().get('detail', 'No detail provided') + print(f"Error detail: {error_detail}") + print(f"Full response: {response.text}") + + response.raise_for_status() + data = response.json() + except httpx.HTTPStatusError as e: + print(f"Server error status: {e.response.status_code}") + print(f"Server error response: {e.response.text}") + try: + error_json = e.response.json() + print(f"Parsed error: {error_json}") + except: + print("Could not parse error response as JSON") + raise + + assert data["success"] is True + assert len(data["results"]) == 1 + result = data["results"][0] + await 
assert_crawl_result_structure(result) + assert result["success"] is True + assert "markdown" in result + assert isinstance(result["markdown"], dict) + assert "raw_markdown" in result["markdown"] + assert "fit_markdown" in result["markdown"] # BM25 creates fit_markdown + + # Print values for debug + print(f"Raw markdown length: {len(result['markdown']['raw_markdown'])}") + print(f"Fit markdown length: {len(result['markdown']['fit_markdown'])}") + + # Either fit_markdown has content (possibly including our query terms) + # or it might be empty if no good BM25 matches were found + # Don't assert specific content since it can be environment-dependent + + + # 4. Deep Crawling + async def test_deep_crawl(self, async_client: httpx.AsyncClient): + """Test /crawl with a deep crawl strategy.""" + payload = { + "urls": [DEEP_CRAWL_URL], # Start URL + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "stream": False, + "cache_mode": CacheMode.BYPASS.value, + "deep_crawl_strategy": { + "type": "BFSDeepCrawlStrategy", + "params": { + "max_depth": 1, # Limit depth for testing speed + "max_pages": 5, # Limit pages to crawl + "filter_chain": { + "type": "FilterChain", + "params": { + "filters": [ + { + "type": "ContentTypeFilter", + "params": {"allowed_types": ["text/html"]} + }, + { + "type": "DomainFilter", + "params": {"allowed_domains": ["python.org", "docs.python.org"]} # Include important subdomains + } + ] + } + }, + "url_scorer": { + "type": "CompositeScorer", + "params": { + "scorers": [ + { + "type": "KeywordRelevanceScorer", + "params": {"keywords": ["documentation", "tutorial"]} + }, + { + "type": "PathDepthScorer", + "params": {"weight": 0.5, "optimal_depth": 2} + } + ] + } + } + } + } + } + } + } + try: + print(f"Sending deep crawl request to server...") + response = await async_client.post("/crawl", json=payload) + print(f"Response status: {response.status_code}") + + if response.status_code >= 400: + error_detail = response.json().get('detail', 'No detail provided') + print(f"Error detail: {error_detail}") + print(f"Full response: {response.text}") + + response.raise_for_status() + data = response.json() + except httpx.HTTPStatusError as e: + print(f"Server error status: {e.response.status_code}") + print(f"Server error response: {e.response.text}") + try: + error_json = e.response.json() + print(f"Parsed error: {error_json}") + except: + print("Could not parse error response as JSON") + raise + + assert data["success"] is True + assert isinstance(data["results"], list) + # Expect more than 1 result due to deep crawl (start URL + crawled links) + assert len(data["results"]) > 1 + assert len(data["results"]) <= 6 # Start URL + max_links=5 + + start_url_found = False + crawled_urls_found = False + for result in data["results"]: + await assert_crawl_result_structure(result) + assert result["success"] is True + + # Print URL for debugging + print(f"Crawled URL: {result['url']}") + + # Allow URLs that contain python.org (including subdomains like docs.python.org) + assert "python.org" in result["url"] + if result["url"] == DEEP_CRAWL_URL: + start_url_found = True + else: + crawled_urls_found = True + + assert start_url_found + assert crawled_urls_found + + + # 5. 
Extraction without LLM (JSON/CSS) + async def test_json_css_extraction(self, async_client: httpx.AsyncClient): + """Test /crawl with JsonCssExtractionStrategy.""" + payload = { + "urls": [SCRAPE_TARGET_URL], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "cache_mode": CacheMode.BYPASS.value, + "extraction_strategy": { + "type": "JsonCssExtractionStrategy", + "params": { + "schema": { + "type": "dict", # IMPORTANT: Wrap schema dict with type/value structure + "value": { + "name": "BookList", + "baseSelector": "ol.row li.col-xs-6", # Select each book item + "fields": [ + {"name": "title", "selector": "article.product_pod h3 a", "type": "attribute", "attribute": "title"}, + {"name": "price", "selector": "article.product_pod .price_color", "type": "text"}, + {"name": "rating", "selector": "article.product_pod p.star-rating", "type": "attribute", "attribute": "class"} + ] + } + } + } + } + } + } + } + try: + print(f"Sending deep crawl request to server...") + response = await async_client.post("/crawl", json=payload) + print(f"Response status: {response.status_code}") + + if response.status_code >= 400: + error_detail = response.json().get('detail', 'No detail provided') + print(f"Error detail: {error_detail}") + print(f"Full response: {response.text}") + + response.raise_for_status() + data = response.json() + except httpx.HTTPStatusError as e: + print(f"Server error status: {e.response.status_code}") + print(f"Server error response: {e.response.text}") + try: + error_json = e.response.json() + print(f"Parsed error: {error_json}") + except: + print("Could not parse error response as JSON") + raise + + assert data["success"] is True + assert len(data["results"]) == 1 + result = data["results"][0] + await assert_crawl_result_structure(result) + assert result["success"] is True + assert "extracted_content" in result + assert result["extracted_content"] is not None + + # Extracted content should be a JSON string representing a list of dicts + try: + extracted_data = json.loads(result["extracted_content"]) + assert isinstance(extracted_data, list) + assert len(extracted_data) > 0 # Should find some books + # Check structure of the first extracted item + first_item = extracted_data[0] + assert "title" in first_item + assert "price" in first_item + assert "rating" in first_item + assert "star-rating" in first_item["rating"] # e.g., "star-rating Three" + except (json.JSONDecodeError, AssertionError) as e: + pytest.fail(f"Extracted content parsing or validation failed: {e}\nContent: {result['extracted_content']}") + + + # 6. Extraction with LLM + async def test_llm_extraction(self, async_client: httpx.AsyncClient): + """ + Test /crawl with LLMExtractionStrategy. + NOTE: Requires the server to have appropriate LLM API keys (e.g., OPENAI_API_KEY) + configured via .llm.env or environment variables. + This test uses the default provider configured in the server's config.yml. 
+ """ + payload = { + "urls": [SIMPLE_HTML_URL], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "cache_mode": CacheMode.BYPASS.value, + "extraction_strategy": { + "type": "LLMExtractionStrategy", + "params": { + "instruction": "Extract the main title and the author mentioned in the text into JSON.", + # LLMConfig is implicitly defined by server's config.yml and .llm.env + # If you needed to override provider/token PER REQUEST: + "llm_config": { + "type": "LLMConfig", + "params": { + "provider": "openai/gpt-4o", # Example override + "api_token": os.getenv("OPENAI_API_KEY") # Example override + } + }, + "schema": { # Optional: Provide a schema for structured output + "type": "dict", # IMPORTANT: Wrap schema dict + "value": { + "title": "Book Info", + "type": "object", + "properties": { + "title": {"type": "string", "description": "The main title of the work"}, + "author": {"type": "string", "description": "The author of the work"} + }, + "required": ["title", "author"] + } + } + } + } + } + } + } + + try: + response = await async_client.post("/crawl", json=payload) + response.raise_for_status() # Will raise if server returns 500 (e.g., bad API key) + data = response.json() + except httpx.HTTPStatusError as e: + # Catch potential server errors (like 500 due to missing/invalid API keys) + pytest.fail(f"LLM extraction request failed: {e}. Response: {e.response.text}. Check server logs and ensure API keys are correctly configured for the server.") + except httpx.RequestError as e: + pytest.fail(f"LLM extraction request failed: {e}.") + + assert data["success"] is True + assert len(data["results"]) == 1 + result = data["results"][0] + await assert_crawl_result_structure(result) + assert result["success"] is True + assert "extracted_content" in result + assert result["extracted_content"] is not None + + # Extracted content should be JSON (because we provided a schema) + try: + extracted_data = json.loads(result["extracted_content"]) + print(f"\nLLM Extracted Data: {extracted_data}") # Print for verification + + # Handle both dict and list formats (server returns a list) + if isinstance(extracted_data, list): + assert len(extracted_data) > 0 + extracted_item = extracted_data[0] # Take first item + assert isinstance(extracted_item, dict) + assert "title" in extracted_item + assert "author" in extracted_item + assert "Moby-Dick" in extracted_item.get("title", "") + assert "Herman Melville" in extracted_item.get("author", "") + else: + assert isinstance(extracted_data, dict) + assert "title" in extracted_data + assert "author" in extracted_data + assert "Moby-Dick" in extracted_data.get("title", "") + assert "Herman Melville" in extracted_data.get("author", "") + except (json.JSONDecodeError, AssertionError) as e: + pytest.fail(f"LLM extracted content parsing or validation failed: {e}\nContent: {result['extracted_content']}") + except Exception as e: # Catch any other unexpected error + pytest.fail(f"An unexpected error occurred during LLM result processing: {e}\nContent: {result['extracted_content']}") + +if __name__ == "__main__": + # Define arguments for pytest programmatically + # -v: verbose output + # -s: show print statements immediately (useful for debugging) + # __file__: tells pytest to run tests in the current file + pytest_args = ["-v", "-s", __file__] + + # You can add more pytest arguments here if needed, for example: + # '-k test_llm_extraction': Run only the LLM test function + # 
pytest_args.append("-k test_llm_extraction") + + print(f"Running pytest with args: {pytest_args}") + + # Execute pytest + exit_code = pytest.main(pytest_args) + + print(f"Pytest finished with exit code: {exit_code}") \ No newline at end of file From d84508b4d5dad7c3b8f9b772cedfdc08c89ab2a9 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Sat, 12 Apr 2025 12:05:17 +0530 Subject: [PATCH 48/78] fix: revert the old target_elms code in regular webscraping strategy --- crawl4ai/content_scraping_strategy.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 81fe9d4e..0a93352b 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -908,11 +908,7 @@ class WebScrapingStrategy(ContentScrapingStrategy): try: for_content_targeted_element = [] for target_element in target_elements: - # Creating a fresh parse of HTML for each selector to prevent element extraction - # from modifying the original DOM tree; this keeps the original body - # intact for link processing. This is better performant than deepcopy. - fresh_body = BeautifulSoup(html, "lxml") - for_content_targeted_element.extend(fresh_body.select(target_element)) + for_content_targeted_element.extend(body.select(target_element)) content_element = soup.new_tag("div") for el in for_content_targeted_element: content_element.append(el) @@ -920,7 +916,7 @@ class WebScrapingStrategy(ContentScrapingStrategy): self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") return None else: - content_element = body + content_element = body kwargs["exclude_social_media_domains"] = set( kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS From 9fc5d315af570f51c5068f7aea95e6597c9773c9 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Sat, 12 Apr 2025 12:07:04 +0530 Subject: [PATCH 49/78] fix: revert the old target_elms code in LXMLwebscraping strategy --- crawl4ai/content_scraping_strategy.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 0a93352b..814e4b2b 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -1535,17 +1535,11 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): content_element = None if target_elements: try: - content_element = lhtml.Element("div") + for_content_targeted_element = [] for target_element in target_elements: - # Creating a fresh parse of HTML for each selector to prevent element extraction - # from modifying the original DOM tree; this keeps the original body - # intact for link processing. This is better performant than deepcopy. 
- fresh_body = lhtml.document_fromstring(html) - for_content_targeted_element = [] - for target_element in target_elements: - for_content_targeted_element.extend(fresh_body.cssselect(target_element)) - content_element = lhtml.Element("div") - content_element.extend(for_content_targeted_element) + for_content_targeted_element.extend(body.cssselect(target_element)) + content_element = lhtml.Element("div") + content_element.extend(for_content_targeted_element) except Exception as e: self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") return None From 7d8e81fb2e04b4c0844b37491664b05f65441567 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Sat, 12 Apr 2025 12:44:00 +0530 Subject: [PATCH 50/78] fix: fix target_elements, in a less invasive and more efficient way simply by changing order of execution :) https://github.com/unclecode/crawl4ai/issues/902 --- crawl4ai/content_scraping_strategy.py | 58 +++++++++++++-------------- 1 file changed, 28 insertions(+), 30 deletions(-) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 814e4b2b..aa69c5fb 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -901,22 +901,7 @@ class WebScrapingStrategy(ContentScrapingStrategy): element.extract() else: for element in body.select(excluded_selector): - element.extract() - - content_element = None - if target_elements: - try: - for_content_targeted_element = [] - for target_element in target_elements: - for_content_targeted_element.extend(body.select(target_element)) - content_element = soup.new_tag("div") - for el in for_content_targeted_element: - content_element.append(el) - except Exception as e: - self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") - return None - else: - content_element = body + element.extract() kwargs["exclude_social_media_domains"] = set( kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS @@ -976,6 +961,20 @@ class WebScrapingStrategy(ContentScrapingStrategy): str_body = "" try: + content_element = None + if target_elements: + try: + for_content_targeted_element = [] + for target_element in target_elements: + for_content_targeted_element.extend(body.select(target_element)) + content_element = soup.new_tag("div") + for el in for_content_targeted_element: + content_element.append(el) + except Exception as e: + self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") + return None + else: + content_element = body str_body = content_element.encode_contents().decode("utf-8") except Exception: # Reset body to the original HTML @@ -1532,20 +1531,6 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): self._log("error", f"Error extracting metadata: {str(e)}", "SCRAPE") meta = {} - content_element = None - if target_elements: - try: - for_content_targeted_element = [] - for target_element in target_elements: - for_content_targeted_element.extend(body.cssselect(target_element)) - content_element = lhtml.Element("div") - content_element.extend(for_content_targeted_element) - except Exception as e: - self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") - return None - else: - content_element = body - # Remove script and style tags for tag in ["script", "style", "link", "meta", "noscript"]: for element in body.xpath(f".//{tag}"): @@ -1614,6 +1599,19 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): ) # Generate output HTML + content_element = None + if target_elements: + try: + 
for_content_targeted_element = [] + for target_element in target_elements: + for_content_targeted_element.extend(body.cssselect(target_element)) + content_element = lhtml.Element("div") + content_element.extend(for_content_targeted_element) + except Exception as e: + self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") + return None + else: + content_element = body cleaned_html = lhtml.tostring( # body, content_element, From ecec53a8c1560b082bfe8f9cb1f5223a83f5e2f7 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 13 Apr 2025 20:14:41 +0800 Subject: [PATCH 51/78] Docker tested on Windows machine. --- Dockerfile | 14 +++++++++++++- docker-compose.yml | 3 +-- tests/docker/test_server_requests.py | 7 ++++++- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 8b84f797..a4ab56df 100644 --- a/Dockerfile +++ b/Dockerfile @@ -149,7 +149,15 @@ RUN pip install --no-cache-dir --upgrade pip && \ python -c "import crawl4ai; print('✅ crawl4ai is ready to rock!')" && \ python -c "from playwright.sync_api import sync_playwright; print('✅ Playwright is feeling dramatic!')" -RUN playwright install --with-deps chromium +RUN crawl4ai-setup + +RUN playwright install --with-deps + +RUN mkdir -p /home/appuser/.cache/ms-playwright \ + && cp -r /root/.cache/ms-playwright/chromium-* /home/appuser/.cache/ms-playwright/ \ + && chown -R appuser:appuser /home/appuser/.cache/ms-playwright + +RUN crawl4ai-doctor # Copy application code COPY deploy/docker/* ${APP_HOME}/ @@ -174,4 +182,8 @@ EXPOSE 6379 # Switch to the non-root user before starting the application USER appuser +# Set environment variables to ptoduction +ENV PYTHON_ENV=production + +# Start the application using supervisord CMD ["supervisord", "-c", "supervisord.conf"] \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index f112f9fd..4331d219 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,5 +1,4 @@ # docker-compose.yml -# This file is in the root directory alongside Dockerfile # Base configuration anchor for reusability x-base-config: &base-config @@ -9,7 +8,7 @@ x-base-config: &base-config # - "8080:8080" # Uncomment if needed # Load API keys primarily from .llm.env file - # Create .llm.env in the root directory from deploy/docker/.llm.env.example + # Create .llm.env in the root directory .llm.env.example env_file: - .llm.env diff --git a/tests/docker/test_server_requests.py b/tests/docker/test_server_requests.py index ab8b8ced..56d2ada4 100644 --- a/tests/docker/test_server_requests.py +++ b/tests/docker/test_server_requests.py @@ -6,6 +6,10 @@ import asyncio import os from typing import List, Dict, Any, AsyncGenerator +from dotenv import load_dotenv +load_dotenv() + + # Optional: Import crawl4ai classes directly for reference/easier payload creation aid # You don't strictly NEED these imports for the tests to run against the server, # but they help in understanding the structure you are mimicking in JSON. 
@@ -29,7 +33,8 @@ from crawl4ai import ( ) # --- Test Configuration --- -BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") # Make base URL configurable +# BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") # Make base URL configurable +BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235") # Make base URL configurable # Use a known simple HTML page for basic tests SIMPLE_HTML_URL = "https://httpbin.org/html" # Use a site suitable for scraping tests From dcc265458cef022a6b03bcaa47686e08869bcb02 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Mon, 14 Apr 2025 12:39:05 +0530 Subject: [PATCH 52/78] fix: Add a nominal wait time for remove overlay elements since it's already controllable through delay_before_return_html --- crawl4ai/js_snippet/remove_overlay_elements.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/js_snippet/remove_overlay_elements.js b/crawl4ai/js_snippet/remove_overlay_elements.js index 9d93b4ac..a50d9427 100644 --- a/crawl4ai/js_snippet/remove_overlay_elements.js +++ b/crawl4ai/js_snippet/remove_overlay_elements.js @@ -116,5 +116,5 @@ async () => { // Wait a bit for any animations to complete document.body.scrollIntoView(false); - await new Promise((resolve) => setTimeout(resolve, 250)); + await new Promise((resolve) => setTimeout(resolve, 50)); }; From c56974cf5996302deb80a489163258607ec3cfde Mon Sep 17 00:00:00 2001 From: UncleCode Date: Mon, 14 Apr 2025 20:46:32 +0800 Subject: [PATCH 53/78] feat(docs): enhance documentation UI with ToC and GitHub stats Add new features to documentation UI: - Add table of contents with scroll spy functionality - Add GitHub repository statistics badge - Implement new centered layout system with fixed sidebar - Add conditional Playwright installation based on CRAWL4AI_MODE Breaking changes: None --- crawl4ai/install.py | 19 +- docs/md_v2/assets/github_stats.js | 119 ++++++++++++ docs/md_v2/assets/layout.css | 297 ++++++++++++++++++++++++++++++ docs/md_v2/assets/styles.css | 13 +- docs/md_v2/assets/toc.js | 144 +++++++++++++++ mkdocs.yml | 5 +- 6 files changed, 593 insertions(+), 4 deletions(-) create mode 100644 docs/md_v2/assets/github_stats.js create mode 100644 docs/md_v2/assets/layout.css create mode 100644 docs/md_v2/assets/toc.js diff --git a/crawl4ai/install.py b/crawl4ai/install.py index c0c3ab0d..b2fcca78 100644 --- a/crawl4ai/install.py +++ b/crawl4ai/install.py @@ -40,10 +40,25 @@ def setup_home_directory(): f.write("") def post_install(): - """Run all post-installation tasks""" + """ + Run all post-installation tasks. + Checks CRAWL4AI_MODE environment variable. If set to 'api', + skips Playwright browser installation. + """ logger.info("Running post-installation setup...", tag="INIT") setup_home_directory() - install_playwright() + + # Check environment variable to conditionally skip Playwright install + run_mode = os.getenv('CRAWL4AI_MODE') + if run_mode == 'api': + logger.warning( + "CRAWL4AI_MODE=api detected. 
Skipping Playwright browser installation.", + tag="SETUP" + ) + else: + # Proceed with installation only if mode is not 'api' + install_playwright() + run_migration() # TODO: Will be added in the future # setup_builtin_browser() diff --git a/docs/md_v2/assets/github_stats.js b/docs/md_v2/assets/github_stats.js new file mode 100644 index 00000000..a48b3de1 --- /dev/null +++ b/docs/md_v2/assets/github_stats.js @@ -0,0 +1,119 @@ +// ==== File: assets/github_stats.js ==== + +document.addEventListener('DOMContentLoaded', async () => { + // --- Configuration --- + const targetHeaderSelector = '.terminal .container:first-child'; // Selector for your header container + const insertBeforeSelector = '.terminal-nav'; // Selector for the element to insert the badge BEFORE (e.g., the main nav) + // Or set to null to append at the end of the header. + + // --- Find elements --- + const headerContainer = document.querySelector(targetHeaderSelector); + if (!headerContainer) { + console.warn('GitHub Stats: Header container not found with selector:', targetHeaderSelector); + return; + } + + const repoLinkElement = headerContainer.querySelector('a[href*="github.com/"]'); // Find the existing GitHub link + let repoUrl = 'https://github.com/unclecode/crawl4ai'; + // if (repoLinkElement) { + // repoUrl = repoLinkElement.href; + // } else { + // // Fallback: Try finding from config (requires template injection - harder) + // // Or hardcode if necessary, but reading from the link is better. + // console.warn('GitHub Stats: GitHub repo link not found in header.'); + // // Try to get repo_url from mkdocs config if available globally (less likely) + // // repoUrl = window.mkdocs_config?.repo_url; // Requires setting this variable + // // if (!repoUrl) return; // Exit if still no URL + // return; // Exit for now if link isn't found + // } + + + // --- Extract Repo Owner/Name --- + let owner = ''; + let repo = ''; + try { + const url = new URL(repoUrl); + const pathParts = url.pathname.split('/').filter(part => part.length > 0); + if (pathParts.length >= 2) { + owner = pathParts[0]; + repo = pathParts[1]; + } + } catch (e) { + console.error('GitHub Stats: Could not parse repository URL:', repoUrl, e); + return; + } + + if (!owner || !repo) { + console.warn('GitHub Stats: Could not extract owner/repo from URL:', repoUrl); + return; + } + + // --- Get Version (Attempt to extract from site title) --- + let version = ''; + const siteTitleElement = headerContainer.querySelector('.terminal-title, .site-title'); // Adjust selector based on theme's title element + // Example title: "Crawl4AI Documentation (v0.5.x)" + if (siteTitleElement) { + const match = siteTitleElement.textContent.match(/\((v?[^)]+)\)/); // Look for text in parentheses starting with 'v' (optional) + if (match && match[1]) { + version = match[1].trim(); + } + } + if (!version) { + console.info('GitHub Stats: Could not extract version from title. You might need to adjust the selector or regex.'); + // You could fallback to config.extra.version if injected into JS + // version = window.mkdocs_config?.extra?.version || 'N/A'; + } + + + // --- Fetch GitHub API Data --- + let stars = '...'; + let forks = '...'; + try { + const apiUrl = `https://api.github.com/repos/${owner}/${repo}`; + const response = await fetch(apiUrl); + + if (response.ok) { + const data = await response.json(); + // Format large numbers (optional) + stars = data.stargazers_count > 1000 ? 
`${(data.stargazers_count / 1000).toFixed(1)}k` : data.stargazers_count; + forks = data.forks_count > 1000 ? `${(data.forks_count / 1000).toFixed(1)}k` : data.forks_count; + } else { + console.warn(`GitHub Stats: API request failed with status ${response.status}. Rate limit exceeded?`); + stars = 'N/A'; + forks = 'N/A'; + } + } catch (error) { + console.error('GitHub Stats: Error fetching repository data:', error); + stars = 'N/A'; + forks = 'N/A'; + } + + // --- Create Badge HTML --- + const badgeContainer = document.createElement('div'); + badgeContainer.className = 'github-stats-badge'; + + // Use innerHTML for simplicity, including potential icons (requires FontAwesome or similar) + // Ensure your theme loads FontAwesome or add it yourself if you want icons. + badgeContainer.innerHTML = ` + + + + ${owner}/${repo} + ${version ? ` ${version}` : ''} + ${stars} + ${forks} + + `; + + // --- Inject Badge into Header --- + const insertBeforeElement = insertBeforeSelector ? headerContainer.querySelector(insertBeforeSelector) : null; + if (insertBeforeElement) { + // headerContainer.insertBefore(badgeContainer, insertBeforeElement); + headerContainer.querySelector(insertBeforeSelector).appendChild(badgeContainer); + } else { + headerContainer.appendChild(badgeContainer); + } + + console.info('GitHub Stats: Badge added to header.'); + +}); \ No newline at end of file diff --git a/docs/md_v2/assets/layout.css b/docs/md_v2/assets/layout.css new file mode 100644 index 00000000..db5fac55 --- /dev/null +++ b/docs/md_v2/assets/layout.css @@ -0,0 +1,297 @@ +/* ==== File: assets/layout.css (Non-Fluid Centered Layout) ==== */ + +:root { + --header-height: 55px; /* Adjust if needed */ + --sidebar-width: 280px; /* Adjust if needed */ + --toc-width: 340px; /* As specified */ + --content-max-width: 90em; /* Max width for the centered content */ + --layout-transition-speed: 0.2s; + --global-space: 10px; +} + +/* --- Basic Setup --- */ +html { + scroll-behavior: smooth; + scroll-padding-top: calc(var(--header-height) + 15px); + box-sizing: border-box; +} +*, *:before, *:after { + box-sizing: inherit; +} + +body { + padding-top: 0; + padding-bottom: 0; + background-color: var(--background-color); + color: var(--font-color); + /* Prevents horizontal scrollbars during transitions */ + overflow-x: hidden; +} + +/* --- Fixed Header --- */ +/* Full width, fixed header */ +.terminal .container:first-child { /* Assuming this targets the header container */ + position: fixed; + top: 0; + left: 0; + right: 0; + height: var(--header-height); + background-color: var(--background-color); + z-index: 1000; + border-bottom: 1px solid var(--progress-bar-background); + max-width: none; /* Override any container max-width */ + padding: 0 calc(var(--global-space) * 2); +} + +/* --- Main Layout Container (Below Header) --- */ +/* This container just provides space for the fixed header */ +.container:has(.terminal-mkdocs-main-grid) { + margin: 0 auto; + padding: 0; + padding-top: var(--header-height); /* Space for fixed header */ +} + +/* --- Flex Container: Grid holding content and toc (CENTERED) --- */ +/* THIS is the main centered block */ +.terminal-mkdocs-main-grid { + display: flex; + align-items: flex-start; + /* Enforce max-width and center */ + max-width: var(--content-max-width); + margin-left: auto; + margin-right: auto; + position: relative; + /* Apply side padding within the centered block */ + padding-left: calc(var(--global-space) * 2); + padding-right: calc(var(--global-space) * 2); + /* Add margin-left to clear the 
fixed sidebar */ + margin-left: var(--sidebar-width); +} + +/* --- 1. Fixed Left Sidebar (Viewport Relative) --- */ +#terminal-mkdocs-side-panel { + position: fixed; + top: var(--header-height); + left: max(0px, calc((100vw - var(--content-max-width)) / 2)); + bottom: 0; + width: var(--sidebar-width); + background-color: var(--background-color); + border-right: 1px solid var(--progress-bar-background); + overflow-y: auto; + z-index: 900; + padding: 1em calc(var(--global-space) * 2); + padding-bottom: 2em; + /* transition: left var(--layout-transition-speed) ease-in-out; */ +} + +/* --- 2. Main Content Area (Within Centered Grid) --- */ +#terminal-mkdocs-main-content { + flex-grow: 1; + flex-shrink: 1; + min-width: 0; /* Flexbox shrink fix */ + + /* No left/right margins needed here - handled by parent grid */ + margin-left: 0; + margin-right: 0; + + /* Internal Padding */ + padding: 1.5em 2em; + + position: relative; + z-index: 1; +} + +/* --- 3. Right Table of Contents (Sticky, Within Centered Grid) --- */ +#toc-sidebar { + flex-basis: var(--toc-width); + flex-shrink: 0; + width: var(--toc-width); + + position: sticky; /* Sticks within the centered grid */ + top: var(--header-height); + align-self: stretch; + height: calc(100vh - var(--header-height)); + overflow-y: auto; + + padding: 1.5em 1em; + font-size: 0.85em; + border-left: 1px solid var(--progress-bar-background); + z-index: 800; + /* display: none; /* JS handles */ +} + +/* (ToC link styles remain the same) */ +#toc-sidebar h4 { margin-top: 0; margin-bottom: 1em; font-size: 1.1em; color: var(--secondary-color); padding-left: 0.8em; } +#toc-sidebar ul { list-style: none; padding: 0; margin: 0; } +#toc-sidebar ul li a { display: block; padding: 0.3em 0; color: var(--secondary-color); text-decoration: none; border-left: 3px solid transparent; padding-left: 0.8em; transition: all 0.1s ease-in-out; line-height: 1.4; word-break: break-word; } +#toc-sidebar ul li.toc-level-3 a { padding-left: 1.8em; } +#toc-sidebar ul li.toc-level-4 a { padding-left: 2.8em; } +#toc-sidebar ul li a:hover { color: var(--font-color); background-color: rgba(255, 255, 255, 0.05); } +#toc-sidebar ul li a.active { color: var(--primary-color); border-left-color: var(--primary-color); background-color: rgba(80, 255, 255, 0.08); } + + +/* --- Footer Styling (Respects Centered Layout) --- */ +footer { + background-color: var(--code-bg-color); + color: var(--secondary-color); + position: relative; + z-index: 10; + margin-top: 2em; + + /* Apply margin-left to clear the fixed sidebar */ + margin-left: var(--sidebar-width); + + /* Constrain width relative to the centered grid it follows */ + max-width: calc(var(--content-max-width) - var(--sidebar-width)); + margin-right: auto; /* Keep it left-aligned within the space next to sidebar */ + + /* Use padding consistent with the grid */ + padding: 2em calc(var(--global-space) * 2); +} + +/* Adjust footer grid if needed */ +.terminal-mkdocs-footer-grid { + display: grid; + grid-template-columns: 1fr auto; + gap: 1em; + align-items: center; +} + +/* ========================================================================== + RESPONSIVENESS (Adapting the Non-Fluid Layout) + ========================================================================== */ + +/* --- Medium screens: Hide ToC --- */ +@media screen and (max-width: 1200px) { + #toc-sidebar { + display: none; + } + + .terminal-mkdocs-main-grid { + /* Grid adjusts automatically as ToC is removed */ + /* Ensure grid padding remains */ + padding-left: 
calc(var(--global-space) * 2); + padding-right: calc(var(--global-space) * 2); + } + + #terminal-mkdocs-main-content { + /* Content area naturally expands */ + } + + footer { + /* Footer still respects the left sidebar and overall max width */ + margin-left: var(--sidebar-width); + max-width: calc(var(--content-max-width) - var(--sidebar-width)); + /* Padding remains consistent */ + padding-left: calc(var(--global-space) * 2); + padding-right: calc(var(--global-space) * 2); + } +} + +/* --- Small screens: Hide left sidebar, full width content & footer --- */ +@media screen and (max-width: 768px) { + + #terminal-mkdocs-side-panel { + left: calc(-1 * var(--sidebar-width)); + z-index: 1100; + box-shadow: 2px 0 10px rgba(0,0,0,0.3); + } + #terminal-mkdocs-side-panel.sidebar-visible { + left: 0; + } + + .terminal-mkdocs-main-grid { + /* Grid now takes full width (minus body padding) */ + margin-left: 0; /* Override sidebar margin */ + margin-right: 0; /* Override auto margin */ + max-width: 100%; /* Allow full width */ + padding-left: var(--global-space); /* Reduce padding */ + padding-right: var(--global-space); + } + + #terminal-mkdocs-main-content { + padding: 1.5em 1em; /* Adjust internal padding */ + } + + footer { + margin-left: 0; /* Full width footer */ + max-width: 100%; /* Allow full width */ + padding: 2em 1em; /* Adjust internal padding */ + } + + .terminal-mkdocs-footer-grid { + grid-template-columns: 1fr; /* Stack footer items */ + text-align: center; + gap: 0.5em; + } + /* Remember JS for toggle button & overlay */ +} + + +/* ==== GitHub Stats Badge Styling ==== */ + +.github-stats-badge { + display: inline-block; /* Or flex if needed */ + margin-left: 2em; /* Adjust spacing */ + vertical-align: middle; /* Align with other header items */ + font-size: 0.9em; /* Slightly smaller font */ +} + +.github-stats-badge a { + color: var(--secondary-color); /* Use secondary color */ + text-decoration: none; + display: flex; /* Use flex for alignment */ + align-items: center; + gap: 0.8em; /* Space between items */ + padding: 0.2em 0.5em; + border: 1px solid var(--progress-bar-background); /* Subtle border */ + border-radius: 4px; + transition: color 0.2s, background-color 0.2s; +} + +.github-stats-badge a:hover { + color: var(--font-color); /* Brighter color on hover */ + background-color: var(--progress-bar-background); /* Subtle background on hover */ +} + +.github-stats-badge .repo-name { + color: var(--font-color); /* Make repo name stand out slightly */ + font-weight: 500; /* Optional bolder weight */ +} + +.github-stats-badge .stat { + /* Styles for individual stats (version, stars, forks) */ + white-space: nowrap; /* Prevent wrapping */ +} + +.github-stats-badge .stat i { + /* Optional: Style for FontAwesome icons */ + margin-right: 0.3em; + color: var(--secondary-dimmed-color); /* Dimmer color for icons */ +} + + +/* Adjust positioning relative to search/nav if needed */ +/* Example: If search is floated right */ +/* .terminal-nav { float: left; } */ +/* .github-stats-badge { float: left; } */ +/* #mkdocs-search-query { float: right; } */ + +/* --- Responsive adjustments --- */ +@media screen and (max-width: 900px) { /* Example breakpoint */ + .github-stats-badge .repo-name { + display: none; /* Hide full repo name on smaller screens */ + } + .github-stats-badge { + margin-left: 1em; + } + .github-stats-badge a { + gap: 0.5em; + } +} +@media screen and (max-width: 768px) { + /* Further hide or simplify on mobile if needed */ + .github-stats-badge { + display: none; /* Example: Hide 
completely on smallest screens */ + } +} \ No newline at end of file diff --git a/docs/md_v2/assets/styles.css b/docs/md_v2/assets/styles.css index 8ee8cbb1..751aabb7 100644 --- a/docs/md_v2/assets/styles.css +++ b/docs/md_v2/assets/styles.css @@ -50,8 +50,17 @@ --display-h1-decoration: none; --display-h1-decoration: none; + + --header-height: 65px; /* Adjust based on your actual header height */ + --sidebar-width: 280px; /* Adjust based on your desired sidebar width */ + --toc-width: 240px; /* Adjust based on your desired ToC width */ + --layout-transition-speed: 0.2s; /* For potential future animations */ + + --page-width : 90em; /* Adjust based on your design */ } + + /* body { background-color: var(--background-color); color: var(--font-color); @@ -256,4 +265,6 @@ div.badges a { } div.badges a > img { width: auto; -} \ No newline at end of file +} + + diff --git a/docs/md_v2/assets/toc.js b/docs/md_v2/assets/toc.js new file mode 100644 index 00000000..8dad06b2 --- /dev/null +++ b/docs/md_v2/assets/toc.js @@ -0,0 +1,144 @@ +// ==== File: assets/toc.js ==== + +document.addEventListener('DOMContentLoaded', () => { + const mainContent = document.getElementById('terminal-mkdocs-main-content'); + const tocContainer = document.getElementById('toc-sidebar'); + const mainGrid = document.querySelector('.terminal-mkdocs-main-grid'); // Get the flex container + + if (!mainContent) { + console.warn("TOC Generator: Main content area '#terminal-mkdocs-main-content' not found."); + return; + } + + // --- Create ToC container if it doesn't exist --- + let tocElement = tocContainer; + if (!tocElement) { + if (!mainGrid) { + console.warn("TOC Generator: Flex container '.terminal-mkdocs-main-grid' not found to append ToC."); + return; + } + tocElement = document.createElement('aside'); + tocElement.id = 'toc-sidebar'; + tocElement.style.display = 'none'; // Keep hidden initially + // Append it as the last child of the flex grid + mainGrid.appendChild(tocElement); + console.info("TOC Generator: Created '#toc-sidebar' element."); + } + + // --- Find Headings (h2, h3, h4 are common for ToC) --- + const headings = mainContent.querySelectorAll('h2, h3, h4'); + if (headings.length === 0) { + console.info("TOC Generator: No headings found on this page. 
ToC not generated."); + tocElement.style.display = 'none'; // Ensure it's hidden + return; + } + + // --- Generate ToC List --- + const tocList = document.createElement('ul'); + const observerTargets = []; // Store headings for IntersectionObserver + + headings.forEach((heading, index) => { + // Ensure heading has an ID for linking + if (!heading.id) { + // Create a simple slug-like ID + heading.id = `toc-heading-${index}-${heading.textContent.toLowerCase().replace(/\s+/g, '-').replace(/[^a-z0-9-]/g, '')}`; + } + + const listItem = document.createElement('li'); + const link = document.createElement('a'); + + link.href = `#${heading.id}`; + link.textContent = heading.textContent; + + // Add class for styling based on heading level + const level = parseInt(heading.tagName.substring(1), 10); // Get 2, 3, or 4 + listItem.classList.add(`toc-level-${level}`); + + listItem.appendChild(link); + tocList.appendChild(listItem); + observerTargets.push(heading); // Add to observer list + }); + + // --- Populate and Show ToC --- + // Optional: Add a title + const tocTitle = document.createElement('h4'); + tocTitle.textContent = 'On this page'; // Customize title if needed + + tocElement.innerHTML = ''; // Clear previous content if any + tocElement.appendChild(tocTitle); + tocElement.appendChild(tocList); + tocElement.style.display = ''; // Show the ToC container + + console.info(`TOC Generator: Generated ToC with ${headings.length} items.`); + + // --- Scroll Spy using Intersection Observer --- + const tocLinks = tocElement.querySelectorAll('a'); + let activeLink = null; // Keep track of the current active link + + const observerOptions = { + // Observe changes relative to the viewport, offset by the header height + // Negative top margin pushes the intersection trigger point down + // Negative bottom margin ensures elements low on the screen can trigger before they exit + rootMargin: `-${getComputedStyle(document.documentElement).getPropertyValue('--header-height').trim()} 0px -60% 0px`, + threshold: 0 // Trigger as soon as any part enters/exits the boundary + }; + + const observerCallback = (entries) => { + let topmostVisibleHeading = null; + + entries.forEach(entry => { + const link = tocElement.querySelector(`a[href="#${entry.target.id}"]`); + if (!link) return; + + // Check if the heading is intersecting (partially or fully visible within rootMargin) + if (entry.isIntersecting) { + // Among visible headings, find the one closest to the top edge (within the rootMargin) + if (!topmostVisibleHeading || entry.boundingClientRect.top < topmostVisibleHeading.boundingClientRect.top) { + topmostVisibleHeading = entry.target; + } + } + }); + + // If we found a topmost visible heading, activate its link + if (topmostVisibleHeading) { + const newActiveLink = tocElement.querySelector(`a[href="#${topmostVisibleHeading.id}"]`); + if (newActiveLink && newActiveLink !== activeLink) { + // Remove active class from previous link + if (activeLink) { + activeLink.classList.remove('active'); + activeLink.parentElement.classList.remove('active-parent'); // Optional parent styling + } + // Add active class to the new link + newActiveLink.classList.add('active'); + newActiveLink.parentElement.classList.add('active-parent'); // Optional parent styling + activeLink = newActiveLink; + + // Optional: Scroll the ToC sidebar to keep the active link visible + // newActiveLink.scrollIntoView({ behavior: 'smooth', block: 'nearest' }); + } + } + // If no headings are intersecting (scrolled past the last one?), maybe deactivate all + 
// Or keep the last one active - depends on desired behavior. Current logic keeps last active. + }; + + const observer = new IntersectionObserver(observerCallback, observerOptions); + + // Observe all target headings + observerTargets.forEach(heading => observer.observe(heading)); + + // Initial check in case a heading is already in view on load + // (Requires slight delay for accurate layout calculation) + setTimeout(() => { + observerCallback(observer.takeRecords()); // Process initial state + }, 100); + + // move footer and the hr before footer to the end of the main content + const footer = document.querySelector('footer'); + const hr = footer.previousElementSibling; + if (hr && hr.tagName === 'HR') { + mainContent.appendChild(hr); + } + mainContent.appendChild(footer); + console.info("TOC Generator: Footer moved to the end of the main content."); + +}); \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index 82b2fa02..1c7be7a3 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -76,6 +76,7 @@ extra: version: !ENV [CRAWL4AI_VERSION, 'development'] extra_css: + - assets/layout.css - assets/styles.css - assets/highlight.css - assets/dmvendor.css @@ -83,4 +84,6 @@ extra_css: extra_javascript: - assets/highlight.min.js - assets/highlight_init.js - - https://buttons.github.io/buttons.js \ No newline at end of file + - https://buttons.github.io/buttons.js + - assets/toc.js + - assets/github_stats.js \ No newline at end of file From cd7ff6f9c137348003493606b1b453637c624fac Mon Sep 17 00:00:00 2001 From: UncleCode Date: Mon, 14 Apr 2025 23:00:47 +0800 Subject: [PATCH 54/78] feat(docs): add AI assistant interface and code copy button Add new AI assistant chat interface with features: - Real-time chat with markdown support - Chat history management - Citation tracking - Selection-to-query functionality Also adds code copy button to documentation code blocks and adjusts layout/styling. 
Breaking changes: None --- docs/md_v2/ask_ai/ask-ai.css | 444 ++++++++++++++ docs/md_v2/ask_ai/ask-ai.js | 603 ++++++++++++++++++++ docs/md_v2/ask_ai/index.html | 64 +++ docs/md_v2/assets/copy_code.js | 62 ++ docs/md_v2/assets/floating_ask_ai_button.js | 39 ++ docs/md_v2/assets/layout.css | 146 ++++- docs/md_v2/assets/selection_ask_ai.js | 109 ++++ docs/md_v2/assets/styles.css | 6 +- docs/md_v2/core/ask-ai.md | 74 +++ mkdocs.yml | 8 +- 10 files changed, 1549 insertions(+), 6 deletions(-) create mode 100644 docs/md_v2/ask_ai/ask-ai.css create mode 100644 docs/md_v2/ask_ai/ask-ai.js create mode 100644 docs/md_v2/ask_ai/index.html create mode 100644 docs/md_v2/assets/copy_code.js create mode 100644 docs/md_v2/assets/floating_ask_ai_button.js create mode 100644 docs/md_v2/assets/selection_ask_ai.js create mode 100644 docs/md_v2/core/ask-ai.md diff --git a/docs/md_v2/ask_ai/ask-ai.css b/docs/md_v2/ask_ai/ask-ai.css new file mode 100644 index 00000000..c464d43b --- /dev/null +++ b/docs/md_v2/ask_ai/ask-ai.css @@ -0,0 +1,444 @@ +/* ==== File: docs/ask_ai/ask_ai.css ==== */ + +/* --- Basic Reset & Font --- */ +body { + /* Attempt to inherit variables from parent window (iframe context) */ + /* Fallback values if variables are not inherited */ + --fallback-bg: #070708; + --fallback-font: #e8e9ed; + --fallback-secondary: #a3abba; + --fallback-primary: #50ffff; + --fallback-primary-dimmed: #09b5a5; + --fallback-border: #1d1d20; + --fallback-code-bg: #1e1e1e; + --fallback-invert-font: #222225; + --font-stack: dm, Monaco, Courier New, monospace, serif; + + font-family: var(--font-stack, "Courier New", monospace); /* Use theme font stack */ + background-color: var(--background-color, var(--fallback-bg)); + color: var(--font-color, var(--fallback-font)); + margin: 0; + padding: 0; + font-size: 14px; /* Match global font size */ + line-height: 1.5em; /* Match global line height */ + height: 100vh; /* Ensure body takes full height */ + overflow: hidden; /* Prevent body scrollbars, panels handle scroll */ + display: flex; /* Use flex for the main container */ +} + +a { + color: var(--secondary-color, var(--fallback-secondary)); + text-decoration: none; + transition: color 0.2s; +} +a:hover { + color: var(--primary-color, var(--fallback-primary)); +} + +/* --- Main Container Layout --- */ +.ai-assistant-container { + display: flex; + width: 100%; + height: 100%; + background-color: var(--background-color, var(--fallback-bg)); +} + +/* --- Sidebar Styling --- */ +.sidebar { + flex-shrink: 0; /* Prevent sidebars from shrinking */ + height: 100%; + display: flex; + flex-direction: column; + /* background-color: var(--code-bg-color, var(--fallback-code-bg)); */ + overflow-y: hidden; /* Header fixed, list scrolls */ +} + +.left-sidebar { + flex-basis: 240px; /* Width of history panel */ + border-right: 1px solid var(--progress-bar-background, var(--fallback-border)); +} + +.right-sidebar { + flex-basis: 280px; /* Width of citations panel */ + border-left: 1px solid var(--progress-bar-background, var(--fallback-border)); +} + +.sidebar header { + padding: 0.6em 1em; + border-bottom: 1px solid var(--progress-bar-background, var(--fallback-border)); + flex-shrink: 0; + display: flex; + justify-content: space-between; + align-items: center; +} + +.sidebar header h3 { + margin: 0; + font-size: 1.1em; + color: var(--font-color, var(--fallback-font)); +} + +.sidebar ul { + list-style: none; + padding: 0; + margin: 0; + overflow-y: auto; /* Enable scrolling for the list */ + flex-grow: 1; /* Allow list to take remaining 
space */ + padding: 0.5em 0; +} + +.sidebar ul li { + padding: 0.3em 1em; +} +.sidebar ul li.no-citations, +.sidebar ul li.no-history { + color: var(--secondary-color, var(--fallback-secondary)); + font-style: italic; + font-size: 0.9em; + padding-left: 1em; +} + +.sidebar ul li a { + color: var(--secondary-color, var(--fallback-secondary)); + text-decoration: none; + display: block; + padding: 0.2em 0.5em; + border-radius: 3px; + transition: background-color 0.2s, color 0.2s; +} + +.sidebar ul li a:hover { + color: var(--primary-color, var(--fallback-primary)); + background-color: rgba(80, 255, 255, 0.08); /* Use primary color with alpha */ +} +/* Style for active history item */ +#history-list li.active a { + color: var(--primary-dimmed-color, var(--fallback-primary-dimmed)); + font-weight: bold; + background-color: rgba(80, 255, 255, 0.12); +} + +/* --- Chat Panel Styling --- */ +#chat-panel { + flex-grow: 1; /* Take remaining space */ + display: flex; + flex-direction: column; + height: 100%; + overflow: hidden; /* Prevent overflow, internal elements handle scroll */ +} + +#chat-messages { + flex-grow: 1; + overflow-y: auto; /* Scrollable chat history */ + padding: 1em 1.5em; + border-bottom: 1px solid var(--progress-bar-background, var(--fallback-border)); +} + +.message { + margin-bottom: 1em; + padding: 0.8em 1.2em; + border-radius: 8px; + max-width: 90%; /* Slightly wider */ + line-height: 1.6; + /* Apply pre-wrap for better handling of spaces/newlines AND wrapping */ + white-space: pre-wrap; + word-wrap: break-word; /* Ensure long words break */ +} + +.user-message { + background-color: var(--progress-bar-background, var(--fallback-border)); /* User message background */ + color: var(--font-color, var(--fallback-font)); + margin-left: auto; /* Align user messages to the right */ + text-align: left; +} + +.ai-message { + background-color: var(--code-bg-color, var(--fallback-code-bg)); /* AI message background */ + color: var(--font-color, var(--fallback-font)); + margin-right: auto; /* Align AI messages to the left */ + border: 1px solid var(--progress-bar-background, var(--fallback-border)); +} +.ai-message.welcome-message { + border: none; + background-color: transparent; + max-width: 100%; + text-align: center; + color: var(--secondary-color, var(--fallback-secondary)); + white-space: normal; +} + +/* Styles for code within messages */ +.ai-message code { + background-color: var(--invert-font-color, var(--fallback-invert-font)) !important; /* Use light bg for code */ + /* color: var(--background-color, var(--fallback-bg)) !important; Dark text */ + padding: 0.1em 0.4em; + border-radius: 4px; + font-size: 0.9em; +} +.ai-message pre { + background-color: var(--invert-font-color, var(--fallback-invert-font)) !important; + color: var(--background-color, var(--fallback-bg)) !important; + padding: 1em; + border-radius: 5px; + overflow-x: auto; + margin: 0.8em 0; + white-space: pre; +} +.ai-message pre code { + background-color: transparent !important; + padding: 0; + font-size: inherit; +} + +/* Override white-space for specific elements generated by Markdown */ +.ai-message p, +.ai-message ul, +.ai-message ol, +.ai-message blockquote { + white-space: normal; /* Allow standard wrapping for block elements */ +} + +/* --- Markdown Element Styling within Messages --- */ +.message p { + margin-top: 0; + margin-bottom: 0.5em; +} +.message p:last-child { + margin-bottom: 0; +} +.message ul, +.message ol { + margin: 0.5em 0 0.5em 1.5em; + padding: 0; +} +.message li { + margin-bottom: 0.2em; 
+} + +/* Code block styling (adjusts previous rules slightly) */ +.message code { + /* Inline code */ + background-color: var(--invert-font-color, var(--fallback-invert-font)) !important; + color: var(--font-color); + padding: 0.1em 0.4em; + border-radius: 4px; + font-size: 0.9em; + /* Ensure inline code breaks nicely */ + word-break: break-all; + white-space: normal; /* Allow inline code to wrap if needed */ +} +.message pre { + /* Code block container */ + background-color: var(--invert-font-color, var(--fallback-invert-font)) !important; + color: var(--background-color, var(--fallback-bg)) !important; + padding: 1em; + border-radius: 5px; + overflow-x: auto; + margin: 0.8em 0; + font-size: 0.9em; /* Slightly smaller code blocks */ +} +.message pre code { + /* Code within code block */ + background-color: transparent !important; + padding: 0; + font-size: inherit; + word-break: normal; /* Don't break words in code blocks */ + white-space: pre; /* Preserve whitespace strictly in code blocks */ +} + +/* Thinking indicator */ +.message-thinking { + display: inline-block; + width: 5px; + height: 5px; + background-color: var(--primary-color, var(--fallback-primary)); + border-radius: 50%; + margin-left: 8px; + vertical-align: middle; + animation: thinking 1s infinite ease-in-out; +} +@keyframes thinking { + 0%, + 100% { + opacity: 0.5; + transform: scale(0.8); + } + 50% { + opacity: 1; + transform: scale(1.2); + } +} + +/* --- Thinking Indicator (Blinking Cursor Style) --- */ +.thinking-indicator-cursor { + display: inline-block; + width: 10px; /* Width of the cursor */ + height: 1.1em; /* Match line height */ + background-color: var(--primary-color, var(--fallback-primary)); + margin-left: 5px; + vertical-align: text-bottom; /* Align with text baseline */ + animation: blink-cursor 1s step-end infinite; +} + +@keyframes blink-cursor { + from, + to { + background-color: transparent; + } + 50% { + background-color: var(--primary-color, var(--fallback-primary)); + } +} + +#chat-input-area { + flex-shrink: 0; /* Prevent input area from shrinking */ + padding: 1em 1.5em; + display: flex; + align-items: flex-end; /* Align items to bottom */ + gap: 10px; + background-color: var(--code-bg-color, var(--fallback-code-bg)); /* Match sidebars */ +} + +#chat-input-area textarea { + flex-grow: 1; + padding: 0.8em 1em; + border: 1px solid var(--progress-bar-background, var(--fallback-border)); + background-color: var(--background-color, var(--fallback-bg)); + color: var(--font-color, var(--fallback-font)); + border-radius: 5px; + resize: none; /* Disable manual resize */ + font-family: inherit; + font-size: 1em; + line-height: 1.4; + max-height: 150px; /* Limit excessive height */ + overflow-y: auto; + /* rows: 2; */ +} + +#chat-input-area button { + /* Basic button styling - maybe inherit from main theme? 
*/ + padding: 0.6em 1.2em; + border: 1px solid var(--primary-dimmed-color, var(--fallback-primary-dimmed)); + background-color: var(--primary-dimmed-color, var(--fallback-primary-dimmed)); + color: var(--background-color, var(--fallback-bg)); + border-radius: 5px; + cursor: pointer; + font-size: 0.9em; + transition: background-color 0.2s, border-color 0.2s; + height: min-content; /* Align with bottom of textarea */ +} + +#chat-input-area button:hover { + background-color: var(--primary-color, var(--fallback-primary)); + border-color: var(--primary-color, var(--fallback-primary)); +} +#chat-input-area button:disabled { + opacity: 0.6; + cursor: not-allowed; +} + +.loading-indicator { + font-size: 0.9em; + color: var(--secondary-color, var(--fallback-secondary)); + margin-right: 10px; + align-self: center; +} + +/* --- Buttons --- */ +/* Inherit some button styles if possible */ +.btn.btn-sm { + color: var(--font-color, var(--fallback-font)); + padding: 0.2em 0.5em; + font-size: 0.8em; + border: 1px solid var(--secondary-color, var(--fallback-secondary)); + background: none; + border-radius: 3px; + cursor: pointer; +} +.btn.btn-sm:hover { + border-color: var(--font-color, var(--fallback-font)); + background-color: var(--progress-bar-background, var(--fallback-border)); +} + +/* --- Basic Responsiveness --- */ +@media screen and (max-width: 900px) { + .left-sidebar { + flex-basis: 200px; /* Shrink history */ + } + .right-sidebar { + flex-basis: 240px; /* Shrink citations */ + } +} + +@media screen and (max-width: 768px) { + /* Stack layout on mobile? Or hide sidebars? Hiding for now */ + .sidebar { + display: none; /* Hide sidebars on small screens */ + } + /* Could add toggle buttons later */ +} + + +/* ==== File: docs/ask_ai/ask-ai.css (Updates V4 - Delete Button) ==== */ + + +.sidebar ul li { + /* Use flexbox to align link and delete button */ + display: flex; + justify-content: space-between; + align-items: center; + padding: 0; /* Remove padding from li, add to link/button */ + margin: 0.1em 0; /* Small vertical margin */ +} + +.sidebar ul li a { + /* Link takes most space */ + flex-grow: 1; + padding: 0.3em 0.5em 0.3em 1em; /* Adjust padding */ + /* Make ellipsis work for long titles */ + white-space: nowrap; + overflow: hidden; + text-overflow: ellipsis; + /* Keep existing link styles */ + color: var(--secondary-color, var(--fallback-secondary)); + text-decoration: none; + display: block; + border-radius: 3px; + transition: background-color 0.2s, color 0.2s; +} +.sidebar ul li a:hover { + color: var(--primary-color, var(--fallback-primary)); + background-color: rgba(80, 255, 255, 0.08); +} + +/* Style for active history item's link */ +#history-list li.active a { + color: var(--primary-dimmed-color, var(--fallback-primary-dimmed)); + font-weight: bold; + background-color: rgba(80, 255, 255, 0.12); +} + +/* --- Delete Chat Button --- */ +.delete-chat-btn { + flex-shrink: 0; /* Don't shrink */ + background: none; + border: none; + color: var(--secondary-color, var(--fallback-secondary)); + cursor: pointer; + padding: 0.4em 0.8em; /* Padding around icon */ + font-size: 0.9em; + opacity: 0.5; /* Dimmed by default */ + transition: opacity 0.2s, color 0.2s; + margin-left: 5px; /* Space between link and button */ + border-radius: 3px; +} + +.sidebar ul li:hover .delete-chat-btn, +.delete-chat-btn:hover { + opacity: 1; /* Show fully on hover */ + color: var(--error-color, #ff3c74); /* Use error color on hover */ +} +.delete-chat-btn:focus { + outline: 1px dashed var(--error-color, #ff3c74); /* 
Accessibility */ + opacity: 1; +} diff --git a/docs/md_v2/ask_ai/ask-ai.js b/docs/md_v2/ask_ai/ask-ai.js new file mode 100644 index 00000000..2710923e --- /dev/null +++ b/docs/md_v2/ask_ai/ask-ai.js @@ -0,0 +1,603 @@ +// ==== File: docs/ask_ai/ask-ai.js (Marked, Streaming, History) ==== + +document.addEventListener("DOMContentLoaded", () => { + console.log("AI Assistant JS V2 Loaded"); + + // --- DOM Element Selectors --- + const historyList = document.getElementById("history-list"); + const newChatButton = document.getElementById("new-chat-button"); + const chatMessages = document.getElementById("chat-messages"); + const chatInput = document.getElementById("chat-input"); + const sendButton = document.getElementById("send-button"); + const citationsList = document.getElementById("citations-list"); + + // --- Constants --- + const CHAT_INDEX_KEY = "aiAssistantChatIndex_v1"; + const CHAT_PREFIX = "aiAssistantChat_v1_"; + + // --- State --- + let currentChatId = null; + let conversationHistory = []; // Holds message objects { sender: 'user'/'ai', text: '...' } + let isThinking = false; + let streamInterval = null; // To control the streaming interval + + // --- Event Listeners --- + sendButton.addEventListener("click", handleSendMessage); + chatInput.addEventListener("keydown", handleInputKeydown); + newChatButton.addEventListener("click", handleNewChat); + chatInput.addEventListener("input", autoGrowTextarea); + + // --- Initialization --- + loadChatHistoryIndex(); // Load history list on startup + const initialQuery = checkForInitialQuery(window.parent.location); // Check for query param + if (!initialQuery) { + loadInitialChat(); // Load normally if no query + } + + // --- Core Functions --- + + function handleSendMessage() { + const userMessageText = chatInput.value.trim(); + if (!userMessageText || isThinking) return; + + setThinking(true); // Start thinking state + + // Add user message to state and UI + const userMessage = { sender: "user", text: userMessageText }; + conversationHistory.push(userMessage); + addMessageToChat(userMessage, false); // Add user message without parsing markdown + + chatInput.value = ""; + autoGrowTextarea(); // Reset textarea height + + // Prepare for AI response (create empty div) + const aiMessageDiv = addMessageToChat({ sender: "ai", text: "" }, true); // Add empty div with thinking indicator + + // TODO: Generate fingerprint/JWT here + + // TODO: Send `conversationHistory` + JWT to backend API + // Replace placeholder below with actual API call + // The backend should ideally return a stream of text tokens + + // --- Placeholder Streaming Simulation --- + const simulatedFullResponse = `Okay, Here’s a minimal Python script that creates an AsyncWebCrawler, fetches a webpage, and prints the first 300 characters of its Markdown output: + +\`\`\`python +import asyncio +from crawl4ai import AsyncWebCrawler + +async def main(): + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com") + print(result.markdown[:300]) # Print first 300 chars + +if __name__ == "__main__": + asyncio.run(main()) +\`\`\` + +A code snippet: \`crawler.run()\`. 
Check the [quickstart](/core/quickstart).`; + + // Simulate receiving the response stream + streamSimulatedResponse(aiMessageDiv, simulatedFullResponse); + + // // Simulate receiving citations *after* stream starts (or with first chunk) + // setTimeout(() => { + // addCitations([ + // { title: "Simulated Doc 1", url: "#sim1" }, + // { title: "Another Concept", url: "#sim2" }, + // ]); + // }, 500); // Citations appear shortly after thinking starts + } + + function handleInputKeydown(event) { + if (event.key === "Enter" && !event.shiftKey) { + event.preventDefault(); + handleSendMessage(); + } + } + + function addMessageToChat(message, addThinkingIndicator = false) { + const messageDiv = document.createElement("div"); + messageDiv.classList.add("message", `${message.sender}-message`); + + // Parse markdown and set HTML + messageDiv.innerHTML = message.text ? marked.parse(message.text) : ""; + + if (message.sender === "ai") { + // Apply Syntax Highlighting AFTER setting innerHTML + messageDiv.querySelectorAll("pre code:not(.hljs)").forEach((block) => { + if (typeof hljs !== "undefined") { + // Check if already highlighted to prevent double-highlighting issues + if (!block.classList.contains("hljs")) { + hljs.highlightElement(block); + } + } else { + console.warn("highlight.js (hljs) not found for syntax highlighting."); + } + }); + + // Add thinking indicator if needed (and not already present) + if (addThinkingIndicator && !message.text && !messageDiv.querySelector(".thinking-indicator-cursor")) { + const thinkingDiv = document.createElement("div"); + thinkingDiv.className = "thinking-indicator-cursor"; + messageDiv.appendChild(thinkingDiv); + } + } else { + // User messages remain plain text + // messageDiv.textContent = message.text; + } + + // wrap each pre in a div.terminal + messageDiv.querySelectorAll("pre").forEach((block) => { + const wrapper = document.createElement("div"); + wrapper.className = "terminal"; + block.parentNode.insertBefore(wrapper, block); + wrapper.appendChild(block); + }); + + chatMessages.appendChild(messageDiv); + // Scroll only if user is near the bottom? (More advanced) + // Simple scroll for now: + scrollToBottom(); + return messageDiv; // Return the created element + } + + function streamSimulatedResponse(messageDiv, fullText) { + const thinkingIndicator = messageDiv.querySelector(".thinking-indicator-cursor"); + if (thinkingIndicator) thinkingIndicator.remove(); + + const tokens = fullText.split(/(\s+)/); + let currentText = ""; + let tokenIndex = 0; + // Clear previous interval just in case + if (streamInterval) clearInterval(streamInterval); + + streamInterval = setInterval(() => { + const cursorSpan = ''; // Cursor for streaming + if (tokenIndex < tokens.length) { + currentText += tokens[tokenIndex]; + // Render intermediate markdown + cursor + messageDiv.innerHTML = marked.parse(currentText + cursorSpan); + // Re-highlight code blocks on each stream update - might be slightly inefficient + // but ensures partial code blocks look okay. Highlight only final on completion. 
+ // messageDiv.querySelectorAll('pre code:not(.hljs)').forEach((block) => { + // hljs.highlightElement(block); + // }); + scrollToBottom(); // Keep scrolling as content streams + tokenIndex++; + } else { + // Streaming finished + clearInterval(streamInterval); + streamInterval = null; + + // Final render without cursor + messageDiv.innerHTML = marked.parse(currentText); + + // === Final Syntax Highlighting === + messageDiv.querySelectorAll("pre code:not(.hljs)").forEach((block) => { + if (typeof hljs !== "undefined" && !block.classList.contains("hljs")) { + hljs.highlightElement(block); + } + }); + + // === Extract Citations === + const citations = extractMarkdownLinks(currentText); + + // Wrap each pre in a div.terminal + messageDiv.querySelectorAll("pre").forEach((block) => { + const wrapper = document.createElement("div"); + wrapper.className = "terminal"; + block.parentNode.insertBefore(wrapper, block); + wrapper.appendChild(block); + }); + + const aiMessage = { sender: "ai", text: currentText, citations: citations }; + conversationHistory.push(aiMessage); + updateCitationsDisplay(); + saveCurrentChat(); + setThinking(false); + } + }, 50); // Adjust speed + } + + // === NEW Function to Extract Links === + function extractMarkdownLinks(markdownText) { + const regex = /\[([^\]]+)\]\(([^)]+)\)/g; // [text](url) + const citations = []; + let match; + while ((match = regex.exec(markdownText)) !== null) { + // Avoid adding self-links from within the citations list if AI includes them + if (!match[2].startsWith("#citation-")) { + citations.push({ + title: match[1].trim(), + url: match[2].trim(), + }); + } + } + // Optional: Deduplicate links based on URL + const uniqueCitations = citations.filter( + (citation, index, self) => index === self.findIndex((c) => c.url === citation.url) + ); + return uniqueCitations; + } + + // === REVISED Function to Display Citations === + function updateCitationsDisplay() { + let lastCitations = null; + // Find the most recent AI message with citations + for (let i = conversationHistory.length - 1; i >= 0; i--) { + if ( + conversationHistory[i].sender === "ai" && + conversationHistory[i].citations && + conversationHistory[i].citations.length > 0 + ) { + lastCitations = conversationHistory[i].citations; + break; // Found the latest citations + } + } + + citationsList.innerHTML = ""; // Clear previous + if (!lastCitations) { + citationsList.innerHTML = '
<li class="no-citations">No citations available.</li>
  • '; + return; + } + + lastCitations.forEach((citation, index) => { + const li = document.createElement("li"); + const a = document.createElement("a"); + // Generate a unique ID for potential internal linking if needed + // a.id = `citation-${index}`; + a.href = citation.url || "#"; + a.textContent = citation.title; + a.target = "_top"; // Open in main window + li.appendChild(a); + citationsList.appendChild(li); + }); + } + + function addCitations(citations) { + citationsList.innerHTML = ""; // Clear + if (!citations || citations.length === 0) { + citationsList.innerHTML = '
<li class="no-citations">No citations available.</li>
  • '; + return; + } + citations.forEach((citation) => { + const li = document.createElement("li"); + const a = document.createElement("a"); + a.href = citation.url || "#"; + a.textContent = citation.title; + a.target = "_top"; // Open in main window + li.appendChild(a); + citationsList.appendChild(li); + }); + } + + function setThinking(thinking) { + isThinking = thinking; + sendButton.disabled = thinking; + chatInput.disabled = thinking; + chatInput.placeholder = thinking ? "AI is responding..." : "Ask about Crawl4AI..."; + // Stop any existing stream if we start thinking again (e.g., rapid resend) + if (thinking && streamInterval) { + clearInterval(streamInterval); + streamInterval = null; + } + } + + function autoGrowTextarea() { + chatInput.style.height = "auto"; + chatInput.style.height = `${chatInput.scrollHeight}px`; + } + + function scrollToBottom() { + chatMessages.scrollTop = chatMessages.scrollHeight; + } + + // --- Query Parameter Handling --- + function checkForInitialQuery(locationToCheck) { + // <-- Receive location object + if (!locationToCheck) { + console.warn("Ask AI: Could not access parent window location."); + return false; + } + const urlParams = new URLSearchParams(locationToCheck.search); // <-- Use passed location's search string + const encodedQuery = urlParams.get("qq"); // <-- Use 'qq' + + if (encodedQuery) { + console.log("Initial query found (qq):", encodedQuery); + try { + const decodedText = decodeURIComponent(escape(atob(encodedQuery))); + console.log("Decoded query:", decodedText); + + // Start new chat immediately + handleNewChat(true); + + // Delay setting input and sending message slightly + setTimeout(() => { + chatInput.value = decodedText; + autoGrowTextarea(); + handleSendMessage(); + + // Clean the PARENT window's URL + try { + const cleanUrl = locationToCheck.pathname; + // Use parent's history object + window.parent.history.replaceState({}, window.parent.document.title, cleanUrl); + } catch (e) { + console.warn("Ask AI: Could not clean parent URL using replaceState.", e); + // This might fail due to cross-origin restrictions if served differently, + // but should work fine with mkdocs serve on the same origin. + } + }, 100); + + return true; // Query processed + } catch (e) { + console.error("Error decoding initial query (qq):", e); + // Clean the PARENT window's URL even on error + try { + const cleanUrl = locationToCheck.pathname; + window.parent.history.replaceState({}, window.parent.document.title, cleanUrl); + } catch (cleanError) { + console.warn("Ask AI: Could not clean parent URL after decode error.", cleanError); + } + return false; + } + } + return false; // No 'qq' query found + } + + // --- History Management --- + + function handleNewChat(isFromQuery = false) { + if (isThinking) return; // Don't allow new chat while responding + + // Only save if NOT triggered immediately by a query parameter load + if (!isFromQuery) { + saveCurrentChat(); + } + + currentChatId = `chat_${Date.now()}`; + conversationHistory = []; // Clear message history state + chatMessages.innerHTML = ""; // Start with clean slate for query + if (!isFromQuery) { + // Show welcome only if manually started + chatMessages.innerHTML = + '
<div class="message ai-message welcome-message">Started a new chat! Ask me anything about Crawl4AI.</div>
    '; + } + addCitations([]); // Clear citations + updateCitationsDisplay(); // Clear UI + + // Add to index and save + let index = loadChatIndex(); + // Generate a generic title initially, update later + const newTitle = isFromQuery ? "Chat from Selection" : `Chat ${new Date().toLocaleString()}`; + // index.unshift({ id: currentChatId, title: `Chat ${new Date().toLocaleString()}` }); // Add to start + index.unshift({ id: currentChatId, title: newTitle }); + saveChatIndex(index); + + renderHistoryList(index); // Update UI + setActiveHistoryItem(currentChatId); + saveCurrentChat(); // Save the empty new chat state + } + + function loadChat(chatId) { + if (isThinking || chatId === currentChatId) return; + + // Check if chat data actually exists before proceeding + const storedChat = localStorage.getItem(CHAT_PREFIX + chatId); + if (storedChat === null) { + console.warn(`Attempted to load non-existent chat: ${chatId}. Removing from index.`); + deleteChatData(chatId); // Clean up index + loadChatHistoryIndex(); // Reload history list + loadInitialChat(); // Load next available chat + return; + } + + console.log(`Loading chat: ${chatId}`); + saveCurrentChat(); // Save current before switching + + try { + conversationHistory = JSON.parse(storedChat); + currentChatId = chatId; + renderChatMessages(conversationHistory); + updateCitationsDisplay(); + setActiveHistoryItem(chatId); + } catch (e) { + console.error("Error loading chat:", chatId, e); + alert("Failed to load chat data."); + conversationHistory = []; + renderChatMessages(conversationHistory); + updateCitationsDisplay(); + } + } + + function saveCurrentChat() { + if (currentChatId && conversationHistory.length > 0) { + try { + localStorage.setItem(CHAT_PREFIX + currentChatId, JSON.stringify(conversationHistory)); + console.log(`Chat ${currentChatId} saved.`); + + // Update title in index (e.g., use first user message) + let index = loadChatIndex(); + const currentItem = index.find((item) => item.id === currentChatId); + if ( + currentItem && + conversationHistory[0]?.sender === "user" && + !currentItem.title.startsWith("Chat about:") + ) { + currentItem.title = `Chat about: ${conversationHistory[0].text.substring(0, 30)}...`; + saveChatIndex(index); + // Re-render history list if title changed - small optimization needed here maybe + renderHistoryList(index); + setActiveHistoryItem(currentChatId); // Re-set active after re-render + } + } catch (e) { + console.error("Error saving chat:", currentChatId, e); + // Handle potential storage full errors + if (e.name === "QuotaExceededError") { + alert("Local storage is full. Cannot save chat history."); + // Consider implementing history pruning logic here + } + } + } else if (currentChatId) { + // Save empty state for newly created chats if needed, or remove? + localStorage.setItem(CHAT_PREFIX + currentChatId, JSON.stringify([])); + } + } + + function loadChatIndex() { + try { + const storedIndex = localStorage.getItem(CHAT_INDEX_KEY); + return storedIndex ? JSON.parse(storedIndex) : []; + } catch (e) { + console.error("Error loading chat index:", e); + return []; // Return empty array on error + } + } + + function saveChatIndex(indexArray) { + try { + localStorage.setItem(CHAT_INDEX_KEY, JSON.stringify(indexArray)); + } catch (e) { + console.error("Error saving chat index:", e); + } + } + + function renderHistoryList(indexArray) { + historyList.innerHTML = ""; // Clear existing + if (!indexArray || indexArray.length === 0) { + historyList.innerHTML = '
<li class="no-history">No past chats found.</li>
  • '; + return; + } + indexArray.forEach((item) => { + const li = document.createElement("li"); + li.dataset.chatId = item.id; // Add ID to li for easier selection + + const a = document.createElement("a"); + a.href = "#"; + a.dataset.chatId = item.id; + a.textContent = item.title || `Chat ${item.id.split("_")[1] || item.id}`; + a.title = a.textContent; // Tooltip for potentially long titles + a.addEventListener("click", (e) => { + e.preventDefault(); + loadChat(item.id); + }); + + // === Add Delete Button === + const deleteBtn = document.createElement("button"); + deleteBtn.className = "delete-chat-btn"; + deleteBtn.innerHTML = "✕"; // Trash can emoji/icon (or use text/SVG/FontAwesome) + deleteBtn.title = "Delete Chat"; + deleteBtn.dataset.chatId = item.id; // Store ID on button too + deleteBtn.addEventListener("click", handleDeleteChat); + + li.appendChild(a); + li.appendChild(deleteBtn); // Append button to the list item + historyList.appendChild(li); + }); + } + + function renderChatMessages(messages) { + chatMessages.innerHTML = ""; // Clear existing messages + messages.forEach((message) => { + // Ensure highlighting is applied when loading from history + addMessageToChat(message, false); + }); + if (messages.length === 0) { + chatMessages.innerHTML = + '
<div class="message ai-message welcome-message">Chat history loaded. Ask a question!</div>
    '; + } + // Scroll to bottom after loading messages + scrollToBottom(); + } + + function setActiveHistoryItem(chatId) { + document.querySelectorAll("#history-list li").forEach((li) => li.classList.remove("active")); + // Select the LI element directly now + const activeLi = document.querySelector(`#history-list li[data-chat-id="${chatId}"]`); + if (activeLi) { + activeLi.classList.add("active"); + } + } + + function loadInitialChat() { + const index = loadChatIndex(); + if (index.length > 0) { + loadChat(index[0].id); + } else { + // Check if handleNewChat wasn't already called by query handler + if (!currentChatId) { + handleNewChat(); + } + } + } + + function loadChatHistoryIndex() { + const index = loadChatIndex(); + renderHistoryList(index); + if (currentChatId) setActiveHistoryItem(currentChatId); + } + + // === NEW Function to Handle Delete Click === + function handleDeleteChat(event) { + event.stopPropagation(); // Prevent triggering loadChat on the link behind it + const button = event.currentTarget; + const chatIdToDelete = button.dataset.chatId; + + if (!chatIdToDelete) return; + + // Confirmation dialog + if ( + window.confirm( + `Are you sure you want to delete this chat session?\n"${ + button.previousElementSibling?.textContent || "Chat " + chatIdToDelete + }"` + ) + ) { + console.log(`Deleting chat: ${chatIdToDelete}`); + + // Perform deletion + const updatedIndex = deleteChatData(chatIdToDelete); + + // If the deleted chat was the currently active one, load another chat + if (currentChatId === chatIdToDelete) { + currentChatId = null; // Reset current ID + conversationHistory = []; // Clear state + if (updatedIndex.length > 0) { + // Load the new top chat (most recent remaining) + loadChat(updatedIndex[0].id); + } else { + // No chats left, start a new one + handleNewChat(); + } + } else { + // If a different chat was deleted, just re-render the list + renderHistoryList(updatedIndex); + // Re-apply active state in case IDs shifted (though they shouldn't) + setActiveHistoryItem(currentChatId); + } + } + } + + // === NEW Function to Delete Chat Data === + function deleteChatData(chatId) { + // Remove chat data + localStorage.removeItem(CHAT_PREFIX + chatId); + + // Update index + let index = loadChatIndex(); + index = index.filter((item) => item.id !== chatId); + saveChatIndex(index); + + console.log(`Chat ${chatId} data and index entry removed.`); + return index; // Return the updated index + } + + // --- Virtual Scrolling Placeholder --- + // NOTE: Virtual scrolling is complex. For now, we do direct rendering. + // If performance becomes an issue with very long chats/history, + // investigate libraries like 'simple-virtual-scroll' or 'virtual-scroller'. + // You would replace parts of `renderChatMessages` and `renderHistoryList` + // to work with the chosen library's API (providing data and item renderers). + console.warn("Virtual scrolling not implemented. Performance may degrade with very long chat histories."); +}); diff --git a/docs/md_v2/ask_ai/index.html b/docs/md_v2/ask_ai/index.html new file mode 100644 index 00000000..5fe79b12 --- /dev/null +++ b/docs/md_v2/ask_ai/index.html @@ -0,0 +1,64 @@ + + + + + + Crawl4AI Assistant + + + + + + + + +
    + + + + + +
    +
    + +
    + Welcome to the Crawl4AI Assistant! How can I help you today? +
    +
    +
    + + + + +
    +
    + + + + +
    + + + + + + + + + \ No newline at end of file diff --git a/docs/md_v2/assets/copy_code.js b/docs/md_v2/assets/copy_code.js new file mode 100644 index 00000000..20e6be4f --- /dev/null +++ b/docs/md_v2/assets/copy_code.js @@ -0,0 +1,62 @@ +// ==== File: docs/assets/copy_code.js ==== + +document.addEventListener('DOMContentLoaded', () => { + // Target specifically code blocks within the main content area + const codeBlocks = document.querySelectorAll('#terminal-mkdocs-main-content pre > code'); + + codeBlocks.forEach((codeElement) => { + const preElement = codeElement.parentElement; // The
<pre> tag
    +
+        // Ensure the <pre> tag can contain a positioned button
    +        if (window.getComputedStyle(preElement).position === 'static') {
    +            preElement.style.position = 'relative';
    +        }
    +
    +        // Create the button
    +        const copyButton = document.createElement('button');
    +        copyButton.className = 'copy-code-button';
    +        copyButton.type = 'button';
    +        copyButton.setAttribute('aria-label', 'Copy code to clipboard');
    +        copyButton.title = 'Copy code to clipboard';
    +        copyButton.innerHTML = 'Copy'; // Or use an icon like an SVG or FontAwesome class
    +
+        // Append the button to the <pre> element
    +        preElement.appendChild(copyButton);
    +
    +        // Add click event listener
    +        copyButton.addEventListener('click', () => {
    +            copyCodeToClipboard(codeElement, copyButton);
    +        });
    +    });
    +
    +    async function copyCodeToClipboard(codeElement, button) {
    +        // Use innerText to get the rendered text content, preserving line breaks
    +        const textToCopy = codeElement.innerText;
    +
    +        try {
    +            await navigator.clipboard.writeText(textToCopy);
    +
    +            // Visual feedback
    +            button.innerHTML = 'Copied!';
    +            button.classList.add('copied');
    +            button.disabled = true; // Temporarily disable
    +
    +            // Revert button state after a short delay
    +            setTimeout(() => {
    +                button.innerHTML = 'Copy';
    +                button.classList.remove('copied');
    +                button.disabled = false;
    +            }, 2000); // Show "Copied!" for 2 seconds
    +
    +        } catch (err) {
    +            console.error('Failed to copy code: ', err);
    +            // Optional: Provide error feedback on the button
    +            button.innerHTML = 'Error';
    +            setTimeout(() => {
    +                button.innerHTML = 'Copy';
    +            }, 2000);
    +        }
    +    }
    +
    +    console.log("Copy Code Button script loaded.");
    +});
    \ No newline at end of file
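Note: `copy_code.js` above relies on `navigator.clipboard.writeText`, which browsers expose only in secure contexts (HTTPS or `localhost`). A minimal fallback sketch for insecure previews is shown below; the helper name `copyTextFallback` is illustrative and not part of this patch.

```javascript
// Illustrative fallback (not part of the patch): copy text without the async
// Clipboard API by selecting a temporary, off-screen textarea.
function copyTextFallback(text) {
    const textarea = document.createElement('textarea');
    textarea.value = text;
    textarea.style.position = 'fixed';  // keep it from scrolling the page
    textarea.style.opacity = '0';       // keep it invisible
    document.body.appendChild(textarea);
    textarea.select();
    try {
        document.execCommand('copy');   // deprecated but still widely supported
    } finally {
        document.body.removeChild(textarea);
    }
}
```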
    diff --git a/docs/md_v2/assets/floating_ask_ai_button.js b/docs/md_v2/assets/floating_ask_ai_button.js
    new file mode 100644
    index 00000000..177c2356
    --- /dev/null
    +++ b/docs/md_v2/assets/floating_ask_ai_button.js
    @@ -0,0 +1,39 @@
    +// ==== File: docs/assets/floating_ask_ai_button.js ====
    +
    +document.addEventListener('DOMContentLoaded', () => {
    +    const askAiPagePath = '/core/ask-ai/'; // IMPORTANT: Adjust this path if needed!
    +    const currentPath = window.location.pathname;
    +
    +    // Determine the base URL for constructing the link correctly,
    +    // especially if deployed in a sub-directory.
    +    // This assumes a simple structure; adjust if needed.
    +    const baseUrl = window.location.origin + (currentPath.startsWith('/core/') ? '../..' : '');
    +
    +
    +    // Check if the current page IS the Ask AI page
    +    // Use includes() for flexibility (handles trailing slash or .html)
    +    if (currentPath.includes(askAiPagePath.replace(/\/$/, ''))) { // Remove trailing slash for includes check
    +        console.log("Floating Ask AI Button: Not adding button on the Ask AI page itself.");
    +        return; // Don't add the button on the target page
    +    }
    +
    +    // --- Create the button ---
    +    const fabLink = document.createElement('a');
    +    fabLink.className = 'floating-ask-ai-button';
+    fabLink.href = askAiPagePath; // Root-relative link; the baseUrl computed above is not applied here
    +    fabLink.title = 'Ask Crawl4AI Assistant';
    +    fabLink.setAttribute('aria-label', 'Ask Crawl4AI Assistant');
    +
    +    // Add content (using SVG icon for better visuals)
    +    fabLink.innerHTML = `
    +        
    +            
    +        
    +        Ask AI
    +    `;
    +
    +    // Append to body
    +    document.body.appendChild(fabLink);
    +
    +    console.log("Floating Ask AI Button added.");
    +});
    \ No newline at end of file
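For reference, the button content pairs an inline SVG icon with an "Ask AI" label, which is what the stylesheet additions below target via `.floating-ask-ai-button svg` and `.floating-ask-ai-button span`. The icon markup assigned to `fabLink.innerHTML` is not preserved above, so the following is only an illustrative stand-in for that structure (the original SVG path data is unknown):

```js
// Illustrative structure only — not the actual icon shipped in the patch
fabLink.innerHTML = `
    <svg viewBox="0 0 24 24" width="24" height="24" fill="currentColor" aria-hidden="true">
        <!-- original <path> data omitted -->
    </svg>
    <span>Ask AI</span>
`;
```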
    diff --git a/docs/md_v2/assets/layout.css b/docs/md_v2/assets/layout.css
    index db5fac55..f8dbedde 100644
    --- a/docs/md_v2/assets/layout.css
    +++ b/docs/md_v2/assets/layout.css
    @@ -72,7 +72,7 @@ body {
     #terminal-mkdocs-side-panel {
         position: fixed;
         top: var(--header-height);
    -    left: max(0px, calc((100vw - var(--content-max-width)) / 2)); 
    +    left: max(0px, calc((90vw - var(--content-max-width)) / 2)); 
         bottom: 0;
         width: var(--sidebar-width);
         background-color: var(--background-color);
    @@ -294,4 +294,148 @@ footer {
          .github-stats-badge {
             display: none; /* Example: Hide completely on smallest screens */
          }
    +}
    +
    +/* --- Ask AI Selection Button --- */
    +.ask-ai-selection-button {
    +    background-color: var(--primary-dimmed-color, #09b5a5);
    +    color: var(--background-color, #070708);
    +    border: none;
    +    padding: 4px 8px;
    +    font-size: 0.8em;
    +    border-radius: 4px;
    +    cursor: pointer;
    +    box-shadow: 0 2px 5px rgba(0, 0, 0, 0.3);
    +    transition: background-color 0.2s ease;
    +    white-space: nowrap;
    +}
    +
    +.ask-ai-selection-button:hover {
    +    background-color: var(--primary-color, #50ffff);
    +}
    +
    +/* ==== File: docs/assets/layout.css (Additions) ==== */
    +
    +/* ... (keep all existing layout CSS) ... */
    +
    +/* --- Copy Code Button Styling --- */
    +
+/* Ensure the parent <pre> can contain the absolutely positioned button */
    +#terminal-mkdocs-main-content pre {
    +    position: relative; /* Needed for absolute positioning of child */
    +    /* Add a little padding top/right to make space for the button */
    +    padding-top: 2.5em;
    +    padding-right: 1em; /* Ensure padding is sufficient */
    +}
    +
    +.copy-code-button {
    +    position: absolute;
    +    top: 0.5em; /* Adjust spacing from top */
    +    left: 0.5em; /* Adjust spacing from left */
    +    z-index: 1; /* Sit on top of code */
    +
    +    background-color: var(--progress-bar-background, #444); /* Use a background */
    +    color: var(--font-color, #eaeaea);
    +    border: 1px solid var(--secondary-color, #727578);
    +    padding: 3px 8px;
    +    font-size: 0.8em;
    +    font-family: var(--font-stack, monospace);
    +    border-radius: 4px;
    +    cursor: pointer;
    +    opacity: 0; /* Hidden by default */
    +    transition: opacity 0.2s ease-in-out, background-color 0.2s ease, color 0.2s ease;
    +    white-space: nowrap;
    +}
    +
+/* Show button on hover of the <pre> container */
    +#terminal-mkdocs-main-content pre:hover .copy-code-button {
    +    opacity: 0.8; /* Show partially */
    +}
    +
    +.copy-code-button:hover {
    +    opacity: 1; /* Fully visible on button hover */
    +    background-color: var(--secondary-color, #727578);
    +}
    +
    +.copy-code-button:focus {
    +     opacity: 1; /* Ensure visible when focused */
    +     outline: 1px dashed var(--primary-color);
    +}
    +
    +
    +/* Style for "Copied!" state */
    +.copy-code-button.copied {
    +    background-color: var(--primary-dimmed-color, #09b5a5);
    +    color: var(--background-color, #070708);
    +    border-color: var(--primary-dimmed-color, #09b5a5);
    +    opacity: 1; /* Ensure visible */
    +}
    +.copy-code-button.copied:hover {
    +     background-color: var(--primary-dimmed-color, #09b5a5); /* Prevent hover change */
    +}
    +
    +/* ==== File: docs/assets/layout.css (Additions) ==== */
    +
    +/* ... (keep all existing layout CSS) ... */
    +
    +/* --- Floating Ask AI Button --- */
    +.floating-ask-ai-button {
    +    position: fixed;
    +    bottom: 25px;
    +    right: 25px;
    +    z-index: 1050; /* Below modals, above most content */
    +
    +    background-color: var(--primary-dimmed-color, #09b5a5);
    +    color: var(--background-color, #070708);
    +    border: none;
    +    border-radius: 50%; /* Make it circular */
    +    width: 60px; /* Adjust size */
    +    height: 60px; /* Adjust size */
    +    padding: 10px; /* Adjust padding */
    +    box-shadow: 0 4px 10px rgba(0, 0, 0, 0.4);
    +    cursor: pointer;
    +    transition: background-color 0.2s ease, transform 0.2s ease;
    +
    +    display: flex;
    +    flex-direction: column; /* Stack icon and text */
    +    align-items: center;
    +    justify-content: center;
    +    text-decoration: none;
    +    text-align: center;
    +}
    +
    +.floating-ask-ai-button svg {
    +    width: 24px; /* Control icon size */
    +    height: 24px;
    +}
    +
    +.floating-ask-ai-button span {
    +    font-size: 0.7em;
    +    margin-top: 2px; /* Space between icon and text */
    +    display: block; /* Ensure it takes space */
    +     line-height: 1;
    +}
    +
    +
    +.floating-ask-ai-button:hover {
    +    background-color: var(--primary-color, #50ffff);
    +    transform: scale(1.05); /* Slight grow effect */
    +}
    +
    +.floating-ask-ai-button:focus {
    +     outline: 2px solid var(--primary-color);
    +     outline-offset: 2px;
    +}
    +
    +/* Optional: Hide text on smaller screens if needed */
    +@media screen and (max-width: 768px) {
    +     .floating-ask-ai-button span {
    +        /* display: none; */ /* Uncomment to hide text */
    +     }
    +     .floating-ask-ai-button {
    +        width: 55px;
    +        height: 55px;
    +        bottom: 20px;
    +        right: 20px;
    +     }
     }
    \ No newline at end of file
    diff --git a/docs/md_v2/assets/selection_ask_ai.js b/docs/md_v2/assets/selection_ask_ai.js
    new file mode 100644
    index 00000000..b5cb471d
    --- /dev/null
    +++ b/docs/md_v2/assets/selection_ask_ai.js
    @@ -0,0 +1,109 @@
    +// ==== File: docs/assets/selection_ask_ai.js ====
    +
    +document.addEventListener('DOMContentLoaded', () => {
    +    let askAiButton = null;
    +    const askAiPageUrl = '/core/ask-ai/'; // Adjust if your Ask AI page path is different
    +
    +    function createAskAiButton() {
    +        const button = document.createElement('button');
    +        button.id = 'ask-ai-selection-btn';
    +        button.className = 'ask-ai-selection-button';
    +        button.textContent = 'Ask AI'; // Or use an icon
    +        button.style.display = 'none'; // Initially hidden
    +        button.style.position = 'absolute';
    +        button.style.zIndex = '1500'; // Ensure it's on top
    +        document.body.appendChild(button);
    +
    +        button.addEventListener('click', handleAskAiClick);
    +        return button;
    +    }
    +
    +    function getSafeSelectedText() {
    +        const selection = window.getSelection();
    +        if (!selection || selection.rangeCount === 0) {
    +            return null;
    +        }
    +        // Avoid selecting text within the button itself if it was somehow selected
    +        const container = selection.getRangeAt(0).commonAncestorContainer;
    +        if (askAiButton && askAiButton.contains(container)) {
    +             return null;
    +        }
    +
    +        const text = selection.toString().trim();
    +        return text.length > 0 ? text : null;
    +    }
    +
    +    function positionButton(event) {
    +         const selection = window.getSelection();
    +         if (!selection || selection.rangeCount === 0 || selection.isCollapsed) {
    +             hideButton();
    +             return;
    +         }
    +
    +        const range = selection.getRangeAt(0);
    +        const rect = range.getBoundingClientRect();
    +
    +        // Calculate position: top-right of the selection
    +        const scrollX = window.scrollX;
    +        const scrollY = window.scrollY;
    +        const buttonTop = rect.top + scrollY - askAiButton.offsetHeight - 5; // 5px above
    +        const buttonLeft = rect.right + scrollX + 5; // 5px to the right
    +
    +        askAiButton.style.top = `${buttonTop}px`;
    +        askAiButton.style.left = `${buttonLeft}px`;
    +        askAiButton.style.display = 'block'; // Show the button
    +    }
    +
    +    function hideButton() {
    +        if (askAiButton) {
    +            askAiButton.style.display = 'none';
    +        }
    +    }
    +
    +    function handleAskAiClick(event) {
    +        event.stopPropagation(); // Prevent mousedown from hiding button immediately
    +        const selectedText = getSafeSelectedText();
    +        if (selectedText) {
    +            console.log("Selected Text:", selectedText);
    +            // Base64 encode for URL safety (handles special chars, line breaks)
    +            // Use encodeURIComponent first for proper Unicode handling before btoa
    +            const encodedText = btoa(unescape(encodeURIComponent(selectedText)));
    +            const targetUrl = `${askAiPageUrl}?qq=${encodedText}`;
    +            console.log("Navigating to:", targetUrl);
    +            window.location.href = targetUrl; // Navigate to Ask AI page
    +        }
    +        hideButton(); // Hide after click
    +    }
    +
    +    // --- Event Listeners ---
    +
    +    // Show button on mouse up after selection
    +    document.addEventListener('mouseup', (event) => {
    +        // Slight delay to ensure selection is registered
    +        setTimeout(() => {
    +            const selectedText = getSafeSelectedText();
    +            if (selectedText) {
    +                if (!askAiButton) {
    +                    askAiButton = createAskAiButton();
    +                }
    +                // Don't position if the click was ON the button itself
    +                if (event.target !== askAiButton) {
    +                     positionButton(event);
    +                }
    +            } else {
    +                hideButton();
    +            }
    +        }, 10); // Small delay
    +    });
    +
    +    // Hide button on scroll or click elsewhere
    +    document.addEventListener('mousedown', (event) => {
    +        // Hide if clicking anywhere EXCEPT the button itself
    +        if (askAiButton && event.target !== askAiButton) {
    +            hideButton();
    +        }
    +    });
    +    document.addEventListener('scroll', hideButton, true); // Capture scroll events
    +
    +    console.log("Selection Ask AI script loaded.");
    +});
    \ No newline at end of file
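For context, the target page has to reverse the encoding performed in `handleAskAiClick`. A minimal decode sketch, assuming the Ask AI page reads the `qq` query parameter produced above (that page's own script is not part of this hunk):

```js
// Sketch: decode the ?qq= value created with btoa(unescape(encodeURIComponent(text)))
const params = new URLSearchParams(window.location.search);
const encoded = params.get('qq');
if (encoded) {
    const selectedText = decodeURIComponent(escape(atob(encoded)));
    console.log('Selection passed from the docs page:', selectedText);
    // e.g. pre-fill the Ask AI prompt box with selectedText here
}
```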
    diff --git a/docs/md_v2/assets/styles.css b/docs/md_v2/assets/styles.css
    index 751aabb7..92e01f85 100644
    --- a/docs/md_v2/assets/styles.css
    +++ b/docs/md_v2/assets/styles.css
    @@ -6,8 +6,8 @@
     }
     
     :root {
    -    --global-font-size: 16px;
    -    --global-code-font-size: 16px;
    +    --global-font-size: 14px;
    +    --global-code-font-size: 13px;
         --global-line-height: 1.5em;
         --global-space: 10px;
         --font-stack: Menlo, Monaco, Lucida Console, Liberation Mono, DejaVu Sans Mono, Bitstream Vera Sans Mono,
    @@ -56,7 +56,7 @@
         --toc-width: 240px; /* Adjust based on your desired ToC width */
         --layout-transition-speed: 0.2s; /* For potential future animations */
     
    -    --page-width : 90em; /* Adjust based on your design */
    +    --page-width : 100em; /* Adjust based on your design */
     }
     
     
    diff --git a/docs/md_v2/core/ask-ai.md b/docs/md_v2/core/ask-ai.md
    new file mode 100644
    index 00000000..9122bd29
    --- /dev/null
    +++ b/docs/md_v2/core/ask-ai.md
    @@ -0,0 +1,74 @@
    + + + + diff --git a/mkdocs.yml b/mkdocs.yml index 1c7be7a3..39e03a88 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -7,10 +7,11 @@ docs_dir: docs/md_v2 nav: - Home: 'index.md' + - "Ask AI": "core/ask-ai.md" + - "Quick Start": "core/quickstart.md" - Setup & Installation: - "Installation": "core/installation.md" - "Docker Deployment": "core/docker-deployment.md" - - "Quick Start": "core/quickstart.md" - "Blog & Changelog": - "Blog Home": "blog/index.md" - "Changelog": "https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md" @@ -86,4 +87,7 @@ extra_javascript: - assets/highlight_init.js - https://buttons.github.io/buttons.js - assets/toc.js - - assets/github_stats.js \ No newline at end of file + - assets/github_stats.js + - assets/selection_ask_ai.js + - assets/copy_code.js + - assets/floating_ask_ai_button.js \ No newline at end of file From 793668a413bddc65f9d421dc294d50ed08b06ab7 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Mon, 14 Apr 2025 23:05:24 +0800 Subject: [PATCH 55/78] Remove parameter_updates.txt --- parameter_updates.txt | 20 -------------------- 1 file changed, 20 deletions(-) delete mode 100644 parameter_updates.txt diff --git a/parameter_updates.txt b/parameter_updates.txt deleted file mode 100644 index 5a5027d0..00000000 --- a/parameter_updates.txt +++ /dev/null @@ -1,20 +0,0 @@ -The file /docs/md_v2/api/parameters.md should be updated to include the new network and console capturing parameters. - -Here's what needs to be updated: - -1. Change section title from: -``` -### G) **Debug & Logging** -``` -to: -``` -### G) **Debug, Logging & Capturing** -``` - -2. Add new parameters to the table: -``` -| **`capture_network_requests`** | `bool` (False) | Captures all network requests, responses, and failures during the crawl. Available in `result.network_requests`. | -| **`capture_console_messages`** | `bool` (False) | Captures all browser console messages (logs, warnings, errors) during the crawl. Available in `result.console_messages`. | -``` - -These changes demonstrate how to use the new network and console capturing features in the CrawlerRunConfig. \ No newline at end of file From 230f22da86fae0db3fd09fe0dfe7c9e4820b708b Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 15 Apr 2025 22:27:18 +0800 Subject: [PATCH 56/78] refactor(proxy): move ProxyConfig to async_configs and improve LLM token handling Moved ProxyConfig class from proxy_strategy.py to async_configs.py for better organization. Improved LLM token handling with new PROVIDER_MODELS_PREFIXES. Added test cases for deep crawling and proxy rotation. Removed docker_config from BrowserConfig as it's handled separately. 
BREAKING CHANGE: ProxyConfig import path changed from crawl4ai.proxy_strategy to crawl4ai --- crawl4ai/__init__.py | 3 +- crawl4ai/async_configs.py | 154 +++++- crawl4ai/async_webcrawler.py | 6 +- crawl4ai/browser_manager.py | 19 +- crawl4ai/config.py | 8 + crawl4ai/proxy_strategy.py | 7 +- crawl4ai/ssl_certificate.py | 260 ++++----- docs/examples/quickstart_examples_set_1.py | 2 +- docs/examples/tutorial_v0.5.py | 2 +- docs/md_v2/blog/releases/0.5.0.md | 2 +- tests/docker/test_rest_api_deep_crawl.py | 596 +++++++++++++++++++++ tests/general/generate_dummy_site.py | 335 ++++++++++++ 12 files changed, 1232 insertions(+), 162 deletions(-) create mode 100644 tests/docker/test_rest_api_deep_crawl.py create mode 100644 tests/general/generate_dummy_site.py diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 0ab808f3..37dd8366 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -2,7 +2,7 @@ import warnings from .async_webcrawler import AsyncWebCrawler, CacheMode -from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig +from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig from .content_scraping_strategy import ( ContentScrapingStrategy, @@ -121,6 +121,7 @@ __all__ = [ "Crawl4aiDockerClient", "ProxyRotationStrategy", "RoundRobinProxyStrategy", + "ProxyConfig" ] diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index 2f421178..faa29024 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -5,6 +5,7 @@ from .config import ( MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, PROVIDER_MODELS, + PROVIDER_MODELS_PREFIXES, SCREENSHOT_HEIGHT_TRESHOLD, PAGE_TIMEOUT, IMAGE_SCORE_THRESHOLD, @@ -27,11 +28,8 @@ import inspect from typing import Any, Dict, Optional from enum import Enum -from .proxy_strategy import ProxyConfig -try: - from .browser.models import DockerConfig -except ImportError: - DockerConfig = None +# from .proxy_strategy import ProxyConfig + def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict: @@ -161,6 +159,117 @@ def is_empty_value(value: Any) -> bool: return True return False +class ProxyConfig: + def __init__( + self, + server: str, + username: Optional[str] = None, + password: Optional[str] = None, + ip: Optional[str] = None, + ): + """Configuration class for a single proxy. 
+ + Args: + server: Proxy server URL (e.g., "http://127.0.0.1:8080") + username: Optional username for proxy authentication + password: Optional password for proxy authentication + ip: Optional IP address for verification purposes + """ + self.server = server + self.username = username + self.password = password + + # Extract IP from server if not explicitly provided + self.ip = ip or self._extract_ip_from_server() + + def _extract_ip_from_server(self) -> Optional[str]: + """Extract IP address from server URL.""" + try: + # Simple extraction assuming http://ip:port format + if "://" in self.server: + parts = self.server.split("://")[1].split(":") + return parts[0] + else: + parts = self.server.split(":") + return parts[0] + except Exception: + return None + + @staticmethod + def from_string(proxy_str: str) -> "ProxyConfig": + """Create a ProxyConfig from a string in the format 'ip:port:username:password'.""" + parts = proxy_str.split(":") + if len(parts) == 4: # ip:port:username:password + ip, port, username, password = parts + return ProxyConfig( + server=f"http://{ip}:{port}", + username=username, + password=password, + ip=ip + ) + elif len(parts) == 2: # ip:port only + ip, port = parts + return ProxyConfig( + server=f"http://{ip}:{port}", + ip=ip + ) + else: + raise ValueError(f"Invalid proxy string format: {proxy_str}") + + @staticmethod + def from_dict(proxy_dict: Dict) -> "ProxyConfig": + """Create a ProxyConfig from a dictionary.""" + return ProxyConfig( + server=proxy_dict.get("server"), + username=proxy_dict.get("username"), + password=proxy_dict.get("password"), + ip=proxy_dict.get("ip") + ) + + @staticmethod + def from_env(env_var: str = "PROXIES") -> List["ProxyConfig"]: + """Load proxies from environment variable. + + Args: + env_var: Name of environment variable containing comma-separated proxy strings + + Returns: + List of ProxyConfig objects + """ + proxies = [] + try: + proxy_list = os.getenv(env_var, "").split(",") + for proxy in proxy_list: + if not proxy: + continue + proxies.append(ProxyConfig.from_string(proxy)) + except Exception as e: + print(f"Error loading proxies from environment: {e}") + return proxies + + def to_dict(self) -> Dict: + """Convert to dictionary representation.""" + return { + "server": self.server, + "username": self.username, + "password": self.password, + "ip": self.ip + } + + def clone(self, **kwargs) -> "ProxyConfig": + """Create a copy of this configuration with updated values. + + Args: + **kwargs: Key-value pairs of configuration options to update + + Returns: + ProxyConfig: A new instance with the specified updates + """ + config_dict = self.to_dict() + config_dict.update(kwargs) + return ProxyConfig.from_dict(config_dict) + + class BrowserConfig: """ @@ -197,8 +306,6 @@ class BrowserConfig: Default: None. proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}. If None, no additional proxy config. Default: None. - docker_config (DockerConfig or dict or None): Configuration for Docker-based browser automation. - Contains settings for Docker container operation. Default: None. viewport_width (int): Default viewport width for pages. Default: 1080. viewport_height (int): Default viewport height for pages. Default: 600. viewport (dict): Default viewport dimensions for pages. If set, overrides viewport_width and viewport_height. 
@@ -244,7 +351,6 @@ class BrowserConfig: channel: str = "chromium", proxy: str = None, proxy_config: Union[ProxyConfig, dict, None] = None, - docker_config: Union[DockerConfig, dict, None] = None, viewport_width: int = 1080, viewport_height: int = 600, viewport: dict = None, @@ -285,15 +391,7 @@ class BrowserConfig: self.chrome_channel = "" self.proxy = proxy self.proxy_config = proxy_config - - # Handle docker configuration - if isinstance(docker_config, dict) and DockerConfig is not None: - self.docker_config = DockerConfig.from_kwargs(docker_config) - else: - self.docker_config = docker_config - if self.docker_config: - self.user_data_dir = self.docker_config.user_data_dir self.viewport_width = viewport_width self.viewport_height = viewport_height @@ -364,7 +462,6 @@ class BrowserConfig: channel=kwargs.get("channel", "chromium"), proxy=kwargs.get("proxy"), proxy_config=kwargs.get("proxy_config", None), - docker_config=kwargs.get("docker_config", None), viewport_width=kwargs.get("viewport_width", 1080), viewport_height=kwargs.get("viewport_height", 600), accept_downloads=kwargs.get("accept_downloads", False), @@ -421,13 +518,7 @@ class BrowserConfig: "debugging_port": self.debugging_port, "host": self.host, } - - # Include docker_config if it exists - if hasattr(self, "docker_config") and self.docker_config is not None: - if hasattr(self.docker_config, "to_dict"): - result["docker_config"] = self.docker_config.to_dict() - else: - result["docker_config"] = self.docker_config + return result @@ -1180,9 +1271,18 @@ class LLMConfig: elif api_token and api_token.startswith("env:"): self.api_token = os.getenv(api_token[4:]) else: - self.api_token = PROVIDER_MODELS.get(provider, "no-token") or os.getenv( - DEFAULT_PROVIDER_API_KEY - ) + # Check if given provider starts with any of key in PROVIDER_MODELS_PREFIXES + # If not, check if it is in PROVIDER_MODELS + prefixes = PROVIDER_MODELS_PREFIXES.keys() + if any(provider.startswith(prefix) for prefix in prefixes): + selected_prefix = next( + (prefix for prefix in prefixes if provider.startswith(prefix)), + None, + ) + self.api_token = PROVIDER_MODELS_PREFIXES.get(selected_prefix) + else: + self.provider = DEFAULT_PROVIDER + self.api_token = os.getenv(DEFAULT_PROVIDER_API_KEY) self.base_url = base_url self.temprature = temprature self.max_tokens = max_tokens diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 1cd1b8c9..9ba508b2 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -36,7 +36,7 @@ from .markdown_generation_strategy import ( ) from .deep_crawling import DeepCrawlDecorator from .async_logger import AsyncLogger, AsyncLoggerBase -from .async_configs import BrowserConfig, CrawlerRunConfig +from .async_configs import BrowserConfig, CrawlerRunConfig, ProxyConfig from .async_dispatcher import * # noqa: F403 from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter @@ -291,12 +291,12 @@ class AsyncWebCrawler: # Update proxy configuration from rotation strategy if available if config and config.proxy_rotation_strategy: - next_proxy = await config.proxy_rotation_strategy.get_next_proxy() + next_proxy : ProxyConfig = await config.proxy_rotation_strategy.get_next_proxy() if next_proxy: self.logger.info( message="Switch proxy: {proxy}", tag="PROXY", - params={"proxy": next_proxy.server}, + params={"proxy": next_proxy.server} ) config.proxy_config = next_proxy # config = config.clone(proxy_config=next_proxy) diff --git a/crawl4ai/browser_manager.py 
b/crawl4ai/browser_manager.py index bfe22f4e..a338d71d 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -94,6 +94,7 @@ class ManagedBrowser: host: str = "localhost", debugging_port: int = 9222, cdp_url: Optional[str] = None, + browser_config: Optional[BrowserConfig] = None, ): """ Initialize the ManagedBrowser instance. @@ -109,17 +110,19 @@ class ManagedBrowser: host (str): Host for debugging the browser. Default: "localhost". debugging_port (int): Port for debugging the browser. Default: 9222. cdp_url (str or None): CDP URL to connect to the browser. Default: None. + browser_config (BrowserConfig): Configuration object containing all browser settings. Default: None. """ - self.browser_type = browser_type - self.user_data_dir = user_data_dir - self.headless = headless + self.browser_type = browser_config.browser_type + self.user_data_dir = browser_config.user_data_dir + self.headless = browser_config.headless self.browser_process = None self.temp_dir = None - self.debugging_port = debugging_port - self.host = host + self.debugging_port = browser_config.debugging_port + self.host = browser_config.host self.logger = logger self.shutting_down = False - self.cdp_url = cdp_url + self.cdp_url = browser_config.cdp_url + self.browser_config = browser_config async def start(self) -> str: """ @@ -142,6 +145,9 @@ class ManagedBrowser: # Get browser path and args based on OS and browser type # browser_path = self._get_browser_path() args = await self._get_browser_args() + + if self.browser_config.extra_args: + args.extend(self.browser_config.extra_args) # Start browser process try: @@ -477,6 +483,7 @@ class BrowserManager: logger=self.logger, debugging_port=self.config.debugging_port, cdp_url=self.config.cdp_url, + browser_config=self.config, ) async def start(self): diff --git a/crawl4ai/config.py b/crawl4ai/config.py index 103dc1b7..08f56b83 100644 --- a/crawl4ai/config.py +++ b/crawl4ai/config.py @@ -29,6 +29,14 @@ PROVIDER_MODELS = { 'gemini/gemini-2.0-flash-lite-preview-02-05': os.getenv("GEMINI_API_KEY"), "deepseek/deepseek-chat": os.getenv("DEEPSEEK_API_KEY"), } +PROVIDER_MODELS_PREFIXES = { + "ollama": "no-token-needed", # Any model from Ollama no need for API token + "groq": os.getenv("GROQ_API_KEY"), + "openai": os.getenv("OPENAI_API_KEY"), + "anthropic": os.getenv("ANTHROPIC_API_KEY"), + "gemini": os.getenv("GEMINI_API_KEY"), + "deepseek": os.getenv("DEEPSEEK_API_KEY"), +} # Chunk token threshold CHUNK_TOKEN_THRESHOLD = 2**11 # 2048 tokens diff --git a/crawl4ai/proxy_strategy.py b/crawl4ai/proxy_strategy.py index 6821c566..2c01a2f5 100644 --- a/crawl4ai/proxy_strategy.py +++ b/crawl4ai/proxy_strategy.py @@ -4,6 +4,9 @@ from itertools import cycle import os +########### ATTENTION PEOPLE OF EARTH ########### +# I have moved this config to async_configs.py, kept it here, in case someone still importing it, however +# be a dear and follow `from crawl4ai import ProxyConfig` instead :) class ProxyConfig: def __init__( self, @@ -119,12 +122,12 @@ class ProxyRotationStrategy(ABC): """Base abstract class for proxy rotation strategies""" @abstractmethod - async def get_next_proxy(self) -> Optional[Dict]: + async def get_next_proxy(self) -> Optional[ProxyConfig]: """Get next proxy configuration from the strategy""" pass @abstractmethod - def add_proxies(self, proxies: List[Dict]): + def add_proxies(self, proxies: List[ProxyConfig]): """Add proxy configurations to the strategy""" pass diff --git a/crawl4ai/ssl_certificate.py b/crawl4ai/ssl_certificate.py index 
722bb7f9..a60b7cbc 100644 --- a/crawl4ai/ssl_certificate.py +++ b/crawl4ai/ssl_certificate.py @@ -9,83 +9,44 @@ from urllib.parse import urlparse import OpenSSL.crypto from pathlib import Path - -class SSLCertificate: +# === Inherit from dict === +class SSLCertificate(dict): """ - A class representing an SSL certificate with methods to export in various formats. + A class representing an SSL certificate, behaving like a dictionary + for direct JSON serialization. It stores the certificate information internally + and provides methods for export and property access. - Attributes: - cert_info (Dict[str, Any]): The certificate information. - - Methods: - from_url(url: str, timeout: int = 10) -> Optional['SSLCertificate']: Create SSLCertificate instance from a URL. - from_file(file_path: str) -> Optional['SSLCertificate']: Create SSLCertificate instance from a file. - from_binary(binary_data: bytes) -> Optional['SSLCertificate']: Create SSLCertificate instance from binary data. - export_as_pem() -> str: Export the certificate as PEM format. - export_as_der() -> bytes: Export the certificate as DER format. - export_as_json() -> Dict[str, Any]: Export the certificate as JSON format. - export_as_text() -> str: Export the certificate as text format. + Inherits from dict, so instances are directly JSON serializable. """ + # Use __slots__ for potential memory optimization if desired, though less common when inheriting dict + # __slots__ = ("_cert_info",) # If using slots, be careful with dict inheritance interaction + def __init__(self, cert_info: Dict[str, Any]): - self._cert_info = self._decode_cert_data(cert_info) - - @staticmethod - def from_url(url: str, timeout: int = 10) -> Optional["SSLCertificate"]: """ - Create SSLCertificate instance from a URL. + Initializes the SSLCertificate object. Args: - url (str): URL of the website. - timeout (int): Timeout for the connection (default: 10). - - Returns: - Optional[SSLCertificate]: SSLCertificate instance if successful, None otherwise. + cert_info (Dict[str, Any]): The raw certificate dictionary. """ - try: - hostname = urlparse(url).netloc - if ":" in hostname: - hostname = hostname.split(":")[0] + # 1. Decode the data (handle bytes -> str) + decoded_info = self._decode_cert_data(cert_info) - context = ssl.create_default_context() - with socket.create_connection((hostname, 443), timeout=timeout) as sock: - with context.wrap_socket(sock, server_hostname=hostname) as ssock: - cert_binary = ssock.getpeercert(binary_form=True) - x509 = OpenSSL.crypto.load_certificate( - OpenSSL.crypto.FILETYPE_ASN1, cert_binary - ) + # 2. Store the decoded info internally (optional but good practice) + # self._cert_info = decoded_info # You can keep this if methods rely on it - cert_info = { - "subject": dict(x509.get_subject().get_components()), - "issuer": dict(x509.get_issuer().get_components()), - "version": x509.get_version(), - "serial_number": hex(x509.get_serial_number()), - "not_before": x509.get_notBefore(), - "not_after": x509.get_notAfter(), - "fingerprint": x509.digest("sha256").hex(), - "signature_algorithm": x509.get_signature_algorithm(), - "raw_cert": base64.b64encode(cert_binary), - } - - # Add extensions - extensions = [] - for i in range(x509.get_extension_count()): - ext = x509.get_extension(i) - extensions.append( - {"name": ext.get_short_name(), "value": str(ext)} - ) - cert_info["extensions"] = extensions - - return SSLCertificate(cert_info) - - except Exception: - return None + # 3. 
Initialize the dictionary part of the object with the decoded data + super().__init__(decoded_info) @staticmethod def _decode_cert_data(data: Any) -> Any: """Helper method to decode bytes in certificate data.""" if isinstance(data, bytes): - return data.decode("utf-8") + try: + # Try UTF-8 first, fallback to latin-1 for arbitrary bytes + return data.decode("utf-8") + except UnicodeDecodeError: + return data.decode("latin-1") # Or handle as needed, maybe hex representation elif isinstance(data, dict): return { ( @@ -97,36 +58,119 @@ class SSLCertificate: return [SSLCertificate._decode_cert_data(item) for item in data] return data + @staticmethod + def from_url(url: str, timeout: int = 10) -> Optional["SSLCertificate"]: + """ + Create SSLCertificate instance from a URL. Fetches cert info and initializes. + (Fetching logic remains the same) + """ + cert_info_raw = None # Variable to hold the fetched dict + try: + hostname = urlparse(url).netloc + if ":" in hostname: + hostname = hostname.split(":")[0] + + context = ssl.create_default_context() + # Set check_hostname to False and verify_mode to CERT_NONE temporarily + # for potentially problematic certificates during fetch, but parse the result regardless. + # context.check_hostname = False + # context.verify_mode = ssl.CERT_NONE + + with socket.create_connection((hostname, 443), timeout=timeout) as sock: + with context.wrap_socket(sock, server_hostname=hostname) as ssock: + cert_binary = ssock.getpeercert(binary_form=True) + if not cert_binary: + print(f"Warning: No certificate returned for {hostname}") + return None + + x509 = OpenSSL.crypto.load_certificate( + OpenSSL.crypto.FILETYPE_ASN1, cert_binary + ) + + # Create the dictionary directly + cert_info_raw = { + "subject": dict(x509.get_subject().get_components()), + "issuer": dict(x509.get_issuer().get_components()), + "version": x509.get_version(), + "serial_number": hex(x509.get_serial_number()), + "not_before": x509.get_notBefore(), # Keep as bytes initially, _decode handles it + "not_after": x509.get_notAfter(), # Keep as bytes initially + "fingerprint": x509.digest("sha256").hex(), # hex() is already string + "signature_algorithm": x509.get_signature_algorithm(), # Keep as bytes + "raw_cert": base64.b64encode(cert_binary), # Base64 is bytes, _decode handles it + } + + # Add extensions + extensions = [] + for i in range(x509.get_extension_count()): + ext = x509.get_extension(i) + # get_short_name() returns bytes, str(ext) handles value conversion + extensions.append( + {"name": ext.get_short_name(), "value": str(ext)} + ) + cert_info_raw["extensions"] = extensions + + except ssl.SSLCertVerificationError as e: + print(f"SSL Verification Error for {url}: {e}") + # Decide if you want to proceed or return None based on your needs + # You might try fetching without verification here if needed, but be cautious. 
+ return None + except socket.gaierror: + print(f"Could not resolve hostname: {hostname}") + return None + except socket.timeout: + print(f"Connection timed out for {url}") + return None + except Exception as e: + print(f"Error fetching/processing certificate for {url}: {e}") + # Log the full error details if needed: logging.exception("Cert fetch error") + return None + + # If successful, create the SSLCertificate instance from the dictionary + if cert_info_raw: + return SSLCertificate(cert_info_raw) + else: + return None + + + # --- Properties now access the dictionary items directly via self[] --- + @property + def issuer(self) -> Dict[str, str]: + return self.get("issuer", {}) # Use self.get for safety + + @property + def subject(self) -> Dict[str, str]: + return self.get("subject", {}) + + @property + def valid_from(self) -> str: + return self.get("not_before", "") + + @property + def valid_until(self) -> str: + return self.get("not_after", "") + + @property + def fingerprint(self) -> str: + return self.get("fingerprint", "") + + # --- Export methods can use `self` directly as it is the dict --- def to_json(self, filepath: Optional[str] = None) -> Optional[str]: - """ - Export certificate as JSON. - - Args: - filepath (Optional[str]): Path to save the JSON file (default: None). - - Returns: - Optional[str]: JSON string if successful, None otherwise. - """ - json_str = json.dumps(self._cert_info, indent=2, ensure_ascii=False) + """Export certificate as JSON.""" + # `self` is already the dictionary we want to serialize + json_str = json.dumps(self, indent=2, ensure_ascii=False) if filepath: Path(filepath).write_text(json_str, encoding="utf-8") return None return json_str def to_pem(self, filepath: Optional[str] = None) -> Optional[str]: - """ - Export certificate as PEM. - - Args: - filepath (Optional[str]): Path to save the PEM file (default: None). - - Returns: - Optional[str]: PEM string if successful, None otherwise. - """ + """Export certificate as PEM.""" try: + # Decode the raw_cert (which should be string due to _decode) + raw_cert_bytes = base64.b64decode(self.get("raw_cert", "")) x509 = OpenSSL.crypto.load_certificate( - OpenSSL.crypto.FILETYPE_ASN1, - base64.b64decode(self._cert_info["raw_cert"]), + OpenSSL.crypto.FILETYPE_ASN1, raw_cert_bytes ) pem_data = OpenSSL.crypto.dump_certificate( OpenSSL.crypto.FILETYPE_PEM, x509 @@ -136,49 +180,25 @@ class SSLCertificate: Path(filepath).write_text(pem_data, encoding="utf-8") return None return pem_data - except Exception: - return None + except Exception as e: + print(f"Error converting to PEM: {e}") + return None def to_der(self, filepath: Optional[str] = None) -> Optional[bytes]: - """ - Export certificate as DER. - - Args: - filepath (Optional[str]): Path to save the DER file (default: None). - - Returns: - Optional[bytes]: DER bytes if successful, None otherwise. 
- """ + """Export certificate as DER.""" try: - der_data = base64.b64decode(self._cert_info["raw_cert"]) + # Decode the raw_cert (which should be string due to _decode) + der_data = base64.b64decode(self.get("raw_cert", "")) if filepath: Path(filepath).write_bytes(der_data) return None return der_data - except Exception: - return None + except Exception as e: + print(f"Error converting to DER: {e}") + return None - @property - def issuer(self) -> Dict[str, str]: - """Get certificate issuer information.""" - return self._cert_info.get("issuer", {}) - - @property - def subject(self) -> Dict[str, str]: - """Get certificate subject information.""" - return self._cert_info.get("subject", {}) - - @property - def valid_from(self) -> str: - """Get certificate validity start date.""" - return self._cert_info.get("not_before", "") - - @property - def valid_until(self) -> str: - """Get certificate validity end date.""" - return self._cert_info.get("not_after", "") - - @property - def fingerprint(self) -> str: - """Get certificate fingerprint.""" - return self._cert_info.get("fingerprint", "") + # Optional: Add __repr__ for better debugging + def __repr__(self) -> str: + subject_cn = self.subject.get('CN', 'N/A') + issuer_cn = self.issuer.get('CN', 'N/A') + return f"" \ No newline at end of file diff --git a/docs/examples/quickstart_examples_set_1.py b/docs/examples/quickstart_examples_set_1.py index 76224746..078d1c4a 100644 --- a/docs/examples/quickstart_examples_set_1.py +++ b/docs/examples/quickstart_examples_set_1.py @@ -4,7 +4,7 @@ import json import base64 from pathlib import Path from typing import List -from crawl4ai.proxy_strategy import ProxyConfig +from crawl4ai import ProxyConfig from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, CrawlResult from crawl4ai import RoundRobinProxyStrategy diff --git a/docs/examples/tutorial_v0.5.py b/docs/examples/tutorial_v0.5.py index 3cbbdb7b..fe8e0a2b 100644 --- a/docs/examples/tutorial_v0.5.py +++ b/docs/examples/tutorial_v0.5.py @@ -13,7 +13,7 @@ from crawl4ai.deep_crawling import ( ) from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy -from crawl4ai.proxy_strategy import ProxyConfig +from crawl4ai import ProxyConfig from crawl4ai import RoundRobinProxyStrategy from crawl4ai.content_filter_strategy import LLMContentFilter from crawl4ai import DefaultMarkdownGenerator diff --git a/docs/md_v2/blog/releases/0.5.0.md b/docs/md_v2/blog/releases/0.5.0.md index 24b0feda..30269a29 100644 --- a/docs/md_v2/blog/releases/0.5.0.md +++ b/docs/md_v2/blog/releases/0.5.0.md @@ -251,7 +251,7 @@ from crawl4ai import ( RoundRobinProxyStrategy, ) import asyncio -from crawl4ai.proxy_strategy import ProxyConfig +from crawl4ai import ProxyConfig async def main(): # Load proxies and create rotation strategy proxies = ProxyConfig.from_env() diff --git a/tests/docker/test_rest_api_deep_crawl.py b/tests/docker/test_rest_api_deep_crawl.py new file mode 100644 index 00000000..64afefff --- /dev/null +++ b/tests/docker/test_rest_api_deep_crawl.py @@ -0,0 +1,596 @@ +# ==== File: test_rest_api_deep_crawl.py ==== + +import pytest +import pytest_asyncio +import httpx +import json +import asyncio +import os +from typing import List, Dict, Any, AsyncGenerator + +from dotenv import load_dotenv +load_dotenv() # Load environment variables from .env file if present + +# --- Test Configuration --- +BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235") # Ensure this points to your 
running server +BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") # Ensure this points to your running server +DEEP_CRAWL_BASE_URL = "https://docs.crawl4ai.com/samples/deepcrawl/" +DEEP_CRAWL_DOMAIN = "docs.crawl4ai.com" # Used for domain filter + +# --- Helper Functions --- +def load_proxies_from_env() -> List[Dict]: + """Load proxies from PROXIES environment variable""" + proxies = [] + proxies_str = os.getenv("PROXIES", "") + if not proxies_str: + print("PROXIES environment variable not set or empty.") + return proxies + try: + proxy_list = proxies_str.split(",") + for proxy in proxy_list: + proxy = proxy.strip() + if not proxy: + continue + parts = proxy.split(":") + if len(parts) == 4: + ip, port, username, password = parts + proxies.append({ + "server": f"http://{ip}:{port}", # Assuming http, adjust if needed + "username": username, + "password": password, + "ip": ip # Store original IP if available + }) + elif len(parts) == 2: # ip:port only + ip, port = parts + proxies.append({ + "server": f"http://{ip}:{port}", + "ip": ip + }) + else: + print(f"Skipping invalid proxy string format: {proxy}") + + except Exception as e: + print(f"Error loading proxies from environment: {e}") + return proxies + + +async def check_server_health(client: httpx.AsyncClient): + """Check if the server is healthy before running tests.""" + try: + response = await client.get("/health") + response.raise_for_status() + print(f"\nServer healthy: {response.json()}") + return True + except (httpx.RequestError, httpx.HTTPStatusError) as e: + pytest.fail(f"Server health check failed: {e}. Is the server running at {BASE_URL}?", pytrace=False) + +async def assert_crawl_result_structure(result: Dict[str, Any], check_ssl=False): + """Asserts the basic structure of a single crawl result.""" + assert isinstance(result, dict) + assert "url" in result + assert "success" in result + assert "html" in result # Basic crawls should return HTML + assert "metadata" in result + assert isinstance(result["metadata"], dict) + assert "depth" in result["metadata"] # Deep crawls add depth + + if check_ssl: + assert "ssl_certificate" in result # Check if SSL info is present + assert isinstance(result["ssl_certificate"], dict) or result["ssl_certificate"] is None + + +async def process_streaming_response(response: httpx.Response) -> List[Dict[str, Any]]: + """Processes an NDJSON streaming response.""" + results = [] + completed = False + async for line in response.aiter_lines(): + if line: + try: + data = json.loads(line) + if data.get("status") == "completed": + completed = True + break # Stop processing after completion marker + elif data.get("url"): # Ensure it looks like a result object + results.append(data) + else: + print(f"Received non-result JSON line: {data}") # Log other status messages if needed + except json.JSONDecodeError: + pytest.fail(f"Failed to decode JSON line: {line}") + assert completed, "Streaming response did not end with a completion marker." 
+ return results + + +# --- Pytest Fixtures --- +@pytest_asyncio.fixture(scope="function") +async def async_client() -> AsyncGenerator[httpx.AsyncClient, None]: + """Provides an async HTTP client""" + # Increased timeout for potentially longer deep crawls + async with httpx.AsyncClient(base_url=BASE_URL, timeout=300.0) as client: + yield client + # No explicit close needed with 'async with' + +# --- Test Class --- +@pytest.mark.asyncio +class TestDeepCrawlEndpoints: + + @pytest_asyncio.fixture(autouse=True) + async def check_health_before_tests(self, async_client: httpx.AsyncClient): + """Fixture to ensure server is healthy before each test in the class.""" + await check_server_health(async_client) + + # 1. Basic Deep Crawl + # async def test_deep_crawl_basic_bfs(self, async_client: httpx.AsyncClient): + # """Test BFS deep crawl with limited depth and pages.""" + # max_depth = 1 + # max_pages = 3 # start_url + 2 more + # payload = { + # "urls": [DEEP_CRAWL_BASE_URL], + # "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + # "crawler_config": { + # "type": "CrawlerRunConfig", + # "params": { + # "stream": False, + # "cache_mode": "BYPASS", # Use string value for CacheMode + # "deep_crawl_strategy": { + # "type": "BFSDeepCrawlStrategy", + # "params": { + # "max_depth": max_depth, + # "max_pages": max_pages, + # # Minimal filters for basic test + # "filter_chain": { + # "type": "FilterChain", + # "params": { + # "filters": [ + # { + # "type": "DomainFilter", + # "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]} + # } + # ] + # } + # } + # } + # } + # } + # } + # } + # response = await async_client.post("/crawl", json=payload) + # response.raise_for_status() + # data = response.json() + + # assert data["success"] is True + # assert isinstance(data["results"], list) + # assert len(data["results"]) > 1 # Should be more than just the start URL + # assert len(data["results"]) <= max_pages # Respect max_pages + + # found_depth_0 = False + # found_depth_1 = False + # for result in data["results"]: + # await assert_crawl_result_structure(result) + # assert result["success"] is True + # assert DEEP_CRAWL_DOMAIN in result["url"] + # depth = result["metadata"]["depth"] + # assert depth <= max_depth + # if depth == 0: found_depth_0 = True + # if depth == 1: found_depth_1 = True + + # assert found_depth_0 + # assert found_depth_1 + + # # 2. 
Deep Crawl with Filtering + # async def test_deep_crawl_with_filters(self, async_client: httpx.AsyncClient): + # """Test BFS deep crawl with content type and domain filters.""" + # max_depth = 1 + # max_pages = 5 + # payload = { + # "urls": [DEEP_CRAWL_BASE_URL], + # "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + # "crawler_config": { + # "type": "CrawlerRunConfig", + # "params": { + # "stream": False, + # "cache_mode": "BYPASS", + # "deep_crawl_strategy": { + # "type": "BFSDeepCrawlStrategy", + # "params": { + # "max_depth": max_depth, + # "max_pages": max_pages, + # "filter_chain": { + # "type": "FilterChain", + # "params": { + # "filters": [ + # { + # "type": "DomainFilter", + # "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]} + # }, + # { + # "type": "ContentTypeFilter", + # "params": {"allowed_types": ["text/html"]} + # }, + # # Example: Exclude specific paths using regex + # { + # "type": "URLPatternFilter", + # "params": { + # "patterns": ["*/category-3/*"], # Block category 3 + # "reverse": True # Block if match + # } + # } + # ] + # } + # } + # } + # } + # } + # } + # } + # response = await async_client.post("/crawl", json=payload) + # response.raise_for_status() + # data = response.json() + + # assert data["success"] is True + # assert len(data["results"]) > 0 + # assert len(data["results"]) <= max_pages + + # for result in data["results"]: + # await assert_crawl_result_structure(result) + # assert result["success"] is True + # assert DEEP_CRAWL_DOMAIN in result["url"] + # assert "category-3" not in result["url"] # Check if filter worked + # assert result["metadata"]["depth"] <= max_depth + + # # 3. Deep Crawl with Scoring + # async def test_deep_crawl_with_scoring(self, async_client: httpx.AsyncClient): + # """Test BFS deep crawl with URL scoring.""" + # max_depth = 1 + # max_pages = 4 + # payload = { + # "urls": [DEEP_CRAWL_BASE_URL], + # "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + # "crawler_config": { + # "type": "CrawlerRunConfig", + # "params": { + # "stream": False, + # "cache_mode": "BYPASS", + # "deep_crawl_strategy": { + # "type": "BFSDeepCrawlStrategy", + # "params": { + # "max_depth": max_depth, + # "max_pages": max_pages, + # "filter_chain": { # Keep basic domain filter + # "type": "FilterChain", + # "params": { "filters": [{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}]} + # }, + # "url_scorer": { # Add scorer + # "type": "CompositeScorer", + # "params": { + # "scorers": [ + # { # Favor pages with 'product' in the URL + # "type": "KeywordRelevanceScorer", + # "params": {"keywords": ["product"], "weight": 1.0} + # }, + # { # Penalize deep paths slightly + # "type": "PathDepthScorer", + # "params": {"optimal_depth": 2, "weight": -0.2} + # } + # ] + # } + # }, + # # Set a threshold if needed: "score_threshold": 0.1 + # } + # } + # } + # } + # } + # response = await async_client.post("/crawl", json=payload) + # response.raise_for_status() + # data = response.json() + + # assert data["success"] is True + # assert len(data["results"]) > 0 + # assert len(data["results"]) <= max_pages + + # # Check if results seem biased towards products (harder to assert strictly without knowing exact scores) + # product_urls_found = any("product_" in result["url"] for result in data["results"] if result["metadata"]["depth"] > 0) + # print(f"Product URLs found among depth > 0 results: {product_urls_found}") + # # We expect scoring to prioritize product pages if available within limits + # # 
assert product_urls_found # This might be too strict depending on site structure and limits + + # for result in data["results"]: + # await assert_crawl_result_structure(result) + # assert result["success"] is True + # assert result["metadata"]["depth"] <= max_depth + + # # 4. Deep Crawl with CSS Extraction + # async def test_deep_crawl_with_css_extraction(self, async_client: httpx.AsyncClient): + # """Test BFS deep crawl combined with JsonCssExtractionStrategy.""" + # max_depth = 6 # Go deep enough to reach product pages + # max_pages = 20 + # # Schema to extract product details + # product_schema = { + # "name": "ProductDetails", + # "baseSelector": "div.container", # Base for product page + # "fields": [ + # {"name": "product_title", "selector": "h1", "type": "text"}, + # {"name": "price", "selector": ".product-price", "type": "text"}, + # {"name": "description", "selector": ".product-description p", "type": "text"}, + # {"name": "specs", "selector": ".product-specs li", "type": "list", "fields":[ + # {"name": "spec_name", "selector": ".spec-name", "type": "text"}, + # {"name": "spec_value", "selector": ".spec-value", "type": "text"} + # ]} + # ] + # } + # payload = { + # "urls": [DEEP_CRAWL_BASE_URL], + # "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + # "crawler_config": { + # "type": "CrawlerRunConfig", + # "params": { + # "stream": False, + # "cache_mode": "BYPASS", + # "extraction_strategy": { # Apply extraction to ALL crawled pages + # "type": "JsonCssExtractionStrategy", + # "params": {"schema": {"type": "dict", "value": product_schema}} + # }, + # "deep_crawl_strategy": { + # "type": "BFSDeepCrawlStrategy", + # "params": { + # "max_depth": max_depth, + # "max_pages": max_pages, + # "filter_chain": { # Only crawl HTML on our domain + # "type": "FilterChain", + # "params": { + # "filters": [ + # {"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}, + # {"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}} + # ] + # } + # } + # # Optional: Add scoring to prioritize product pages for extraction + # } + # } + # } + # } + # } + # response = await async_client.post("/crawl", json=payload) + # response.raise_for_status() + # data = response.json() + + # assert data["success"] is True + # assert len(data["results"]) > 0 + # # assert len(data["results"]) <= max_pages + + # found_extracted_product = False + # for result in data["results"]: + # await assert_crawl_result_structure(result) + # assert result["success"] is True + # assert "extracted_content" in result + # if "product_" in result["url"]: # Check product pages specifically + # assert result["extracted_content"] is not None + # try: + # extracted = json.loads(result["extracted_content"]) + # # Schema returns list even if one base match + # assert isinstance(extracted, list) + # if extracted: + # item = extracted[0] + # assert "product_title" in item and item["product_title"] + # assert "price" in item and item["price"] + # # Specs might be empty list if not found + # assert "specs" in item and isinstance(item["specs"], list) + # found_extracted_product = True + # print(f"Extracted product: {item.get('product_title')}") + # except (json.JSONDecodeError, AssertionError, IndexError) as e: + # pytest.fail(f"Extraction validation failed for {result['url']}: {e}\nContent: {result['extracted_content']}") + # # else: + # # # Non-product pages might have None or empty list depending on schema match + # # assert result["extracted_content"] is None or 
json.loads(result["extracted_content"]) == [] + + # assert found_extracted_product, "Did not find any pages where product data was successfully extracted." + + # # 5. Deep Crawl with LLM Extraction (Requires Server LLM Setup) + # async def test_deep_crawl_with_llm_extraction(self, async_client: httpx.AsyncClient): + # """Test BFS deep crawl combined with LLMExtractionStrategy.""" + # max_depth = 1 # Limit depth to keep LLM calls manageable + # max_pages = 3 + # payload = { + # "urls": [DEEP_CRAWL_BASE_URL], + # "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + # "crawler_config": { + # "type": "CrawlerRunConfig", + # "params": { + # "stream": False, + # "cache_mode": "BYPASS", + # "extraction_strategy": { # Apply LLM extraction to crawled pages + # "type": "LLMExtractionStrategy", + # "params": { + # "instruction": "Extract the main H1 title and the text content of the first paragraph.", + # "llm_config": { # Example override, rely on server default if possible + # "type": "LLMConfig", + # "params": {"provider": "openai/gpt-4.1-mini"} # Use a cheaper model for testing + # }, + # "schema": { # Expected JSON output + # "type": "dict", + # "value": { + # "title": "PageContent", "type": "object", + # "properties": { + # "h1_title": {"type": "string"}, + # "first_paragraph": {"type": "string"} + # } + # } + # } + # } + # }, + # "deep_crawl_strategy": { + # "type": "BFSDeepCrawlStrategy", + # "params": { + # "max_depth": max_depth, + # "max_pages": max_pages, + # "filter_chain": { + # "type": "FilterChain", + # "params": { + # "filters": [ + # {"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}, + # {"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}} + # ] + # } + # } + # } + # } + # } + # } + # } + + # try: + # response = await async_client.post("/crawl", json=payload) + # response.raise_for_status() + # data = response.json() + # except httpx.HTTPStatusError as e: + # pytest.fail(f"Deep Crawl + LLM extraction request failed: {e}. Response: {e.response.text}. Check server logs and LLM API key setup.") + # except httpx.RequestError as e: + # pytest.fail(f"Deep Crawl + LLM extraction request failed: {e}.") + + + # assert data["success"] is True + # assert len(data["results"]) > 0 + # assert len(data["results"]) <= max_pages + + # found_llm_extraction = False + # for result in data["results"]: + # await assert_crawl_result_structure(result) + # assert result["success"] is True + # assert "extracted_content" in result + # assert result["extracted_content"] is not None + # try: + # extracted = json.loads(result["extracted_content"]) + # if isinstance(extracted, list): extracted = extracted[0] # Handle list output + # assert isinstance(extracted, dict) + # assert "h1_title" in extracted # Check keys based on schema + # assert "first_paragraph" in extracted + # found_llm_extraction = True + # print(f"LLM extracted from {result['url']}: Title='{extracted.get('h1_title')}'") + # except (json.JSONDecodeError, AssertionError, IndexError, TypeError) as e: + # pytest.fail(f"LLM extraction validation failed for {result['url']}: {e}\nContent: {result['extracted_content']}") + + # assert found_llm_extraction, "LLM extraction did not yield expected data on any crawled page." + + + # # 6. 
Deep Crawl with SSL Certificate Fetching + # async def test_deep_crawl_with_ssl(self, async_client: httpx.AsyncClient): + # """Test BFS deep crawl with fetch_ssl_certificate enabled.""" + # max_depth = 0 # Only fetch for start URL to keep test fast + # max_pages = 1 + # payload = { + # "urls": [DEEP_CRAWL_BASE_URL], + # "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + # "crawler_config": { + # "type": "CrawlerRunConfig", + # "params": { + # "stream": False, + # "cache_mode": "BYPASS", + # "fetch_ssl_certificate": True, # <-- Enable SSL fetching + # "deep_crawl_strategy": { + # "type": "BFSDeepCrawlStrategy", + # "params": { + # "max_depth": max_depth, + # "max_pages": max_pages, + # } + # } + # } + # } + # } + # response = await async_client.post("/crawl", json=payload) + # response.raise_for_status() + # data = response.json() + + # assert data["success"] is True + # assert len(data["results"]) == 1 + # result = data["results"][0] + + # await assert_crawl_result_structure(result, check_ssl=True) # <-- Tell helper to check SSL field + # assert result["success"] is True + # # Check if SSL info was actually retrieved + # if result["ssl_certificate"]: + # # Assert directly using dictionary keys + # assert isinstance(result["ssl_certificate"], dict) # Verify it's a dict + # assert "issuer" in result["ssl_certificate"] + # assert "subject" in result["ssl_certificate"] + # # --- MODIFIED ASSERTIONS --- + # assert "not_before" in result["ssl_certificate"] # Check for the actual key + # assert "not_after" in result["ssl_certificate"] # Check for the actual key + # # --- END MODIFICATIONS --- + # assert "fingerprint" in result["ssl_certificate"] # Check another key + + # # This print statement using .get() already works correctly with dictionaries + # print(f"SSL Issuer Org: {result['ssl_certificate'].get('issuer', {}).get('O', 'N/A')}") + # print(f"SSL Valid From: {result['ssl_certificate'].get('not_before', 'N/A')}") + # else: + # # This part remains the same + # print("SSL Certificate was null in the result.") + + + # 7. 
Deep Crawl with Proxy Rotation (Requires PROXIES env var) + async def test_deep_crawl_with_proxies(self, async_client: httpx.AsyncClient): + """Test BFS deep crawl using proxy rotation.""" + proxies = load_proxies_from_env() + if not proxies: + pytest.skip("Skipping proxy test: PROXIES environment variable not set or empty.") + + print(f"\nTesting with {len(proxies)} proxies loaded from environment.") + + max_depth = 1 + max_pages = 3 + payload = { + "urls": [DEEP_CRAWL_BASE_URL], # Use the dummy site + # Use a BrowserConfig that *might* pick up proxy if set, but rely on CrawlerRunConfig + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "stream": False, + "cache_mode": "BYPASS", + "proxy_rotation_strategy": { # <-- Define the strategy + "type": "RoundRobinProxyStrategy", + "params": { + # Convert ProxyConfig dicts back to the serialized format expected by server + "proxies": [{"type": "ProxyConfig", "params": p} for p in proxies] + } + }, + "deep_crawl_strategy": { + "type": "BFSDeepCrawlStrategy", + "params": { + "max_depth": max_depth, + "max_pages": max_pages, + "filter_chain": { + "type": "FilterChain", + "params": { "filters": [{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}]} + } + } + } + } + } + } + try: + response = await async_client.post("/crawl", json=payload) + response.raise_for_status() + data = response.json() + except httpx.HTTPStatusError as e: + # Proxies often cause connection errors, catch them + pytest.fail(f"Proxy deep crawl failed: {e}. Response: {e.response.text}. Are proxies valid and accessible by the server?") + except httpx.RequestError as e: + pytest.fail(f"Proxy deep crawl request failed: {e}. Are proxies valid and accessible?") + + assert data["success"] is True + assert len(data["results"]) > 0 + assert len(data["results"]) <= max_pages + # Primary assertion is that the crawl succeeded *with* proxy config + print(f"Proxy deep crawl completed successfully for {len(data['results'])} pages.") + + # Verifying specific proxy usage requires server logs or custom headers/responses + + +# --- Main Execution Block (for running script directly) --- +if __name__ == "__main__": + pytest_args = ["-v", "-s", __file__] + # Example: Run only proxy test + # pytest_args.append("-k test_deep_crawl_with_proxies") + print(f"Running pytest with args: {pytest_args}") + exit_code = pytest.main(pytest_args) + print(f"Pytest finished with exit code: {exit_code}") \ No newline at end of file diff --git a/tests/general/generate_dummy_site.py b/tests/general/generate_dummy_site.py new file mode 100644 index 00000000..d4218b6b --- /dev/null +++ b/tests/general/generate_dummy_site.py @@ -0,0 +1,335 @@ +# ==== File: build_dummy_site.py ==== + +import os +import random +import argparse +from pathlib import Path +from urllib.parse import quote + +# --- Configuration --- +NUM_CATEGORIES = 3 +NUM_SUBCATEGORIES_PER_CAT = 2 # Results in NUM_CATEGORIES * NUM_SUBCATEGORIES_PER_CAT total L2 categories +NUM_PRODUCTS_PER_SUBCAT = 5 # Products listed on L3 pages +MAX_DEPTH_TARGET = 5 # Explicitly set target depth + +# --- Helper Functions --- + +def generate_lorem(words=20): + """Generates simple placeholder text.""" + lorem_words = ["lorem", "ipsum", "dolor", "sit", "amet", "consectetur", + "adipiscing", "elit", "sed", "do", "eiusmod", "tempor", + "incididunt", "ut", "labore", "et", "dolore", "magna", "aliqua"] + return " ".join(random.choice(lorem_words) for _ in 
range(words)).capitalize() + "." + +def create_html_page(filepath: Path, title: str, body_content: str, breadcrumbs: list = [], head_extras: str = ""): + """Creates an HTML file with basic structure and inline CSS.""" + os.makedirs(filepath.parent, exist_ok=True) + + # Generate breadcrumb HTML using the 'link' provided in the breadcrumbs list + breadcrumb_html = "" + if breadcrumbs: + links_html = " » ".join(f'{bc["name"]}' for bc in breadcrumbs) + breadcrumb_html = f"" + + # Basic CSS for structure identification (kept the same) + css = """ + + """ + html_content = f""" + + + + + {title} - FakeShop + {head_extras} + {css} + + +
    + {breadcrumb_html} +

    <h1>{title}</h1>

    + {body_content} +
    + +""" + with open(filepath, "w", encoding="utf-8") as f: + f.write(html_content) + # Keep print statement concise for clarity + # print(f"Created: {filepath}") + +def generate_site(base_dir: Path, site_name: str = "FakeShop", base_path: str = ""): + """Generates the dummy website structure.""" + base_dir.mkdir(parents=True, exist_ok=True) + + # --- Clean and prepare the base path for URL construction --- + # Ensure it starts with '/' if not empty, and remove any trailing '/' + if base_path: + full_base_path = "/" + base_path.strip('/') + else: + full_base_path = "" # Represents the root + + print(f"Using base path for links: '{full_base_path}'") + + # --- Level 0: Homepage --- + home_body = "

    Welcome to FakeShop!

    Your one-stop shop for imaginary items.

    Categories:

    \n
      " + # Define the *actual* link path for the homepage breadcrumb + home_link_path = f"{full_base_path}/index.html" + breadcrumbs_home = [{"name": "Home", "link": home_link_path}] # Base breadcrumb + + # Links *within* the page content should remain relative + for i in range(NUM_CATEGORIES): + cat_name = f"Category-{i+1}" + cat_folder_name = quote(cat_name.lower().replace(" ", "-")) + # This path is relative to the current directory (index.html) + cat_relative_page_path = f"{cat_folder_name}/index.html" + home_body += f'
    • {cat_name} - {generate_lorem(10)}
    • ' + home_body += "
    " + create_html_page(base_dir / "index.html", "Homepage", home_body, []) # No breadcrumbs *on* the homepage itself + + # --- Levels 1-5 --- + for i in range(NUM_CATEGORIES): + cat_name = f"Category-{i+1}" + cat_folder_name = quote(cat_name.lower().replace(" ", "-")) + cat_dir = base_dir / cat_folder_name + # This is the *absolute* path for the breadcrumb link + cat_link_path = f"{full_base_path}/{cat_folder_name}/index.html" + # Update breadcrumbs list for this level + breadcrumbs_cat = breadcrumbs_home + [{"name": cat_name, "link": cat_link_path}] + + # --- Level 1: Category Page --- + cat_body = f"

    {generate_lorem(15)} for {cat_name}.

    Sub-Categories:

    \n
      " + for j in range(NUM_SUBCATEGORIES_PER_CAT): + subcat_name = f"{cat_name}-Sub-{j+1}" + subcat_folder_name = quote(subcat_name.lower().replace(" ", "-")) + # Path relative to the category page + subcat_relative_page_path = f"{subcat_folder_name}/index.html" + cat_body += f'
    • {subcat_name} - {generate_lorem(8)}
    • ' + cat_body += "
    " + # Pass the updated breadcrumbs list + create_html_page(cat_dir / "index.html", cat_name, cat_body, breadcrumbs_home) # Parent breadcrumb needed here + + for j in range(NUM_SUBCATEGORIES_PER_CAT): + subcat_name = f"{cat_name}-Sub-{j+1}" + subcat_folder_name = quote(subcat_name.lower().replace(" ", "-")) + subcat_dir = cat_dir / subcat_folder_name + # Absolute path for the breadcrumb link + subcat_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/index.html" + # Update breadcrumbs list for this level + breadcrumbs_subcat = breadcrumbs_cat + [{"name": subcat_name, "link": subcat_link_path}] + + # --- Level 2: Sub-Category Page (Product List) --- + subcat_body = f"

    Explore products in {subcat_name}. {generate_lorem(12)}

    Products:

    \n
      " + for k in range(NUM_PRODUCTS_PER_SUBCAT): + prod_id = f"P{i+1}{j+1}{k+1:03d}" # e.g., P11001 + prod_name = f"{subcat_name} Product {k+1} ({prod_id})" + # Filename relative to the subcategory page + prod_filename = f"product_{prod_id}.html" + # Absolute path for the breadcrumb link + prod_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/{prod_filename}" + + # Preview on list page (link remains relative) + subcat_body += f""" +
    • +
      + {prod_name} +

      {generate_lorem(10)}

      + £{random.uniform(10, 500):.2f} +
      +
    • """ + + # --- Level 3: Product Page --- + prod_price = random.uniform(10, 500) + prod_desc = generate_lorem(40) + prod_specs = {f"Spec {s+1}": generate_lorem(3) for s in range(random.randint(3,6))} + prod_reviews_count = random.randint(0, 150) + # Relative filenames for links on this page + details_filename_relative = f"product_{prod_id}_details.html" + reviews_filename_relative = f"product_{prod_id}_reviews.html" + + prod_body = f""" +

      Price: £{prod_price:.2f}

      +
      +

      Description

      +

      <p>{prod_desc}</p>

      +
      +
      +

      Specifications

      +
        + {''.join(f'
      • {name}: {value}
      • ' for name, value in prod_specs.items())} +
      +
      +
      +

      Reviews

      +

      Total Reviews: {prod_reviews_count}

      +
      +
      +

      + View More Details | + See All Reviews +

      + """ + # Update breadcrumbs list for this level + breadcrumbs_prod = breadcrumbs_subcat + [{"name": prod_name, "link": prod_link_path}] + # Pass the updated breadcrumbs list + create_html_page(subcat_dir / prod_filename, prod_name, prod_body, breadcrumbs_subcat) # Parent breadcrumb needed here + + # --- Level 4: Product Details Page --- + details_filename = f"product_{prod_id}_details.html" # Actual filename + # Absolute path for the breadcrumb link + details_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/{details_filename}" + details_body = f"

      This page contains extremely detailed information about {prod_name}.

      {generate_lorem(100)}" + # Update breadcrumbs list for this level + breadcrumbs_details = breadcrumbs_prod + [{"name": "Details", "link": details_link_path}] + # Pass the updated breadcrumbs list + create_html_page(subcat_dir / details_filename, f"{prod_name} - Details", details_body, breadcrumbs_prod) # Parent breadcrumb needed here + + # --- Level 5: Product Reviews Page --- + reviews_filename = f"product_{prod_id}_reviews.html" # Actual filename + # Absolute path for the breadcrumb link + reviews_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/{reviews_filename}" + reviews_body = f"

      All {prod_reviews_count} reviews for {prod_name} are listed here.

        " + for r in range(prod_reviews_count): + reviews_body += f"
      • Review {r+1}: {generate_lorem(random.randint(15, 50))}
      • " + reviews_body += "
      " + # Update breadcrumbs list for this level + breadcrumbs_reviews = breadcrumbs_prod + [{"name": "Reviews", "link": reviews_link_path}] + # Pass the updated breadcrumbs list + create_html_page(subcat_dir / reviews_filename, f"{prod_name} - Reviews", reviews_body, breadcrumbs_prod) # Parent breadcrumb needed here + + + subcat_body += "
    " # Close product-list ul + # Pass the correct breadcrumbs list for the subcategory index page + create_html_page(subcat_dir / "index.html", subcat_name, subcat_body, breadcrumbs_cat) # Parent breadcrumb needed here + + +# --- Main Execution --- +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Generate a dummy multi-level retail website.") + parser.add_argument( + "-o", "--output-dir", + type=str, + default="dummy_retail_site", + help="Directory to generate the website in." + ) + parser.add_argument( + "-n", "--site-name", + type=str, + default="FakeShop", + help="Name of the fake shop." + ) + parser.add_argument( + "-b", "--base-path", + type=str, + default="", + help="Base path for hosting the site (e.g., 'samples/deepcrawl'). Leave empty if hosted at the root." + ) + # Optional: Add more args to configure counts if needed + + args = parser.parse_args() + + output_directory = Path(args.output_dir) + site_name = args.site_name + base_path = args.base_path + + print(f"Generating dummy site '{site_name}' in '{output_directory}'...") + # Pass the base_path to the generation function + generate_site(output_directory, site_name, base_path) + print(f"\nCreated {sum(1 for _ in output_directory.rglob('*.html'))} HTML pages.") + print("Dummy site generation complete.") + print(f"To serve locally (example): python -m http.server --directory {output_directory} 8000") + if base_path: + print(f"Access the site at: http://localhost:8000/{base_path.strip('/')}/index.html") + else: + print(f"Access the site at: http://localhost:8000/index.html") \ No newline at end of file From 5206c6f2d6b2a80a909ab4ae5ff4b6b4b788a2e2 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 15 Apr 2025 22:28:01 +0800 Subject: [PATCH 57/78] Modify the test file --- tests/docker/test_rest_api_deep_crawl.py | 760 +++++++++++------------ 1 file changed, 380 insertions(+), 380 deletions(-) diff --git a/tests/docker/test_rest_api_deep_crawl.py b/tests/docker/test_rest_api_deep_crawl.py index 64afefff..8995881d 100644 --- a/tests/docker/test_rest_api_deep_crawl.py +++ b/tests/docker/test_rest_api_deep_crawl.py @@ -119,411 +119,411 @@ class TestDeepCrawlEndpoints: await check_server_health(async_client) # 1. 
Basic Deep Crawl - # async def test_deep_crawl_basic_bfs(self, async_client: httpx.AsyncClient): - # """Test BFS deep crawl with limited depth and pages.""" - # max_depth = 1 - # max_pages = 3 # start_url + 2 more - # payload = { - # "urls": [DEEP_CRAWL_BASE_URL], - # "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, - # "crawler_config": { - # "type": "CrawlerRunConfig", - # "params": { - # "stream": False, - # "cache_mode": "BYPASS", # Use string value for CacheMode - # "deep_crawl_strategy": { - # "type": "BFSDeepCrawlStrategy", - # "params": { - # "max_depth": max_depth, - # "max_pages": max_pages, - # # Minimal filters for basic test - # "filter_chain": { - # "type": "FilterChain", - # "params": { - # "filters": [ - # { - # "type": "DomainFilter", - # "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]} - # } - # ] - # } - # } - # } - # } - # } - # } - # } - # response = await async_client.post("/crawl", json=payload) - # response.raise_for_status() - # data = response.json() + async def test_deep_crawl_basic_bfs(self, async_client: httpx.AsyncClient): + """Test BFS deep crawl with limited depth and pages.""" + max_depth = 1 + max_pages = 3 # start_url + 2 more + payload = { + "urls": [DEEP_CRAWL_BASE_URL], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "stream": False, + "cache_mode": "BYPASS", # Use string value for CacheMode + "deep_crawl_strategy": { + "type": "BFSDeepCrawlStrategy", + "params": { + "max_depth": max_depth, + "max_pages": max_pages, + # Minimal filters for basic test + "filter_chain": { + "type": "FilterChain", + "params": { + "filters": [ + { + "type": "DomainFilter", + "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]} + } + ] + } + } + } + } + } + } + } + response = await async_client.post("/crawl", json=payload) + response.raise_for_status() + data = response.json() - # assert data["success"] is True - # assert isinstance(data["results"], list) - # assert len(data["results"]) > 1 # Should be more than just the start URL - # assert len(data["results"]) <= max_pages # Respect max_pages + assert data["success"] is True + assert isinstance(data["results"], list) + assert len(data["results"]) > 1 # Should be more than just the start URL + assert len(data["results"]) <= max_pages # Respect max_pages - # found_depth_0 = False - # found_depth_1 = False - # for result in data["results"]: - # await assert_crawl_result_structure(result) - # assert result["success"] is True - # assert DEEP_CRAWL_DOMAIN in result["url"] - # depth = result["metadata"]["depth"] - # assert depth <= max_depth - # if depth == 0: found_depth_0 = True - # if depth == 1: found_depth_1 = True + found_depth_0 = False + found_depth_1 = False + for result in data["results"]: + await assert_crawl_result_structure(result) + assert result["success"] is True + assert DEEP_CRAWL_DOMAIN in result["url"] + depth = result["metadata"]["depth"] + assert depth <= max_depth + if depth == 0: found_depth_0 = True + if depth == 1: found_depth_1 = True - # assert found_depth_0 - # assert found_depth_1 + assert found_depth_0 + assert found_depth_1 - # # 2. 
Deep Crawl with Filtering - # async def test_deep_crawl_with_filters(self, async_client: httpx.AsyncClient): - # """Test BFS deep crawl with content type and domain filters.""" - # max_depth = 1 - # max_pages = 5 - # payload = { - # "urls": [DEEP_CRAWL_BASE_URL], - # "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, - # "crawler_config": { - # "type": "CrawlerRunConfig", - # "params": { - # "stream": False, - # "cache_mode": "BYPASS", - # "deep_crawl_strategy": { - # "type": "BFSDeepCrawlStrategy", - # "params": { - # "max_depth": max_depth, - # "max_pages": max_pages, - # "filter_chain": { - # "type": "FilterChain", - # "params": { - # "filters": [ - # { - # "type": "DomainFilter", - # "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]} - # }, - # { - # "type": "ContentTypeFilter", - # "params": {"allowed_types": ["text/html"]} - # }, - # # Example: Exclude specific paths using regex - # { - # "type": "URLPatternFilter", - # "params": { - # "patterns": ["*/category-3/*"], # Block category 3 - # "reverse": True # Block if match - # } - # } - # ] - # } - # } - # } - # } - # } - # } - # } - # response = await async_client.post("/crawl", json=payload) - # response.raise_for_status() - # data = response.json() + # 2. Deep Crawl with Filtering + async def test_deep_crawl_with_filters(self, async_client: httpx.AsyncClient): + """Test BFS deep crawl with content type and domain filters.""" + max_depth = 1 + max_pages = 5 + payload = { + "urls": [DEEP_CRAWL_BASE_URL], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "stream": False, + "cache_mode": "BYPASS", + "deep_crawl_strategy": { + "type": "BFSDeepCrawlStrategy", + "params": { + "max_depth": max_depth, + "max_pages": max_pages, + "filter_chain": { + "type": "FilterChain", + "params": { + "filters": [ + { + "type": "DomainFilter", + "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]} + }, + { + "type": "ContentTypeFilter", + "params": {"allowed_types": ["text/html"]} + }, + # Example: Exclude specific paths using regex + { + "type": "URLPatternFilter", + "params": { + "patterns": ["*/category-3/*"], # Block category 3 + "reverse": True # Block if match + } + } + ] + } + } + } + } + } + } + } + response = await async_client.post("/crawl", json=payload) + response.raise_for_status() + data = response.json() - # assert data["success"] is True - # assert len(data["results"]) > 0 - # assert len(data["results"]) <= max_pages + assert data["success"] is True + assert len(data["results"]) > 0 + assert len(data["results"]) <= max_pages - # for result in data["results"]: - # await assert_crawl_result_structure(result) - # assert result["success"] is True - # assert DEEP_CRAWL_DOMAIN in result["url"] - # assert "category-3" not in result["url"] # Check if filter worked - # assert result["metadata"]["depth"] <= max_depth + for result in data["results"]: + await assert_crawl_result_structure(result) + assert result["success"] is True + assert DEEP_CRAWL_DOMAIN in result["url"] + assert "category-3" not in result["url"] # Check if filter worked + assert result["metadata"]["depth"] <= max_depth - # # 3. 
Deep Crawl with Scoring - # async def test_deep_crawl_with_scoring(self, async_client: httpx.AsyncClient): - # """Test BFS deep crawl with URL scoring.""" - # max_depth = 1 - # max_pages = 4 - # payload = { - # "urls": [DEEP_CRAWL_BASE_URL], - # "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, - # "crawler_config": { - # "type": "CrawlerRunConfig", - # "params": { - # "stream": False, - # "cache_mode": "BYPASS", - # "deep_crawl_strategy": { - # "type": "BFSDeepCrawlStrategy", - # "params": { - # "max_depth": max_depth, - # "max_pages": max_pages, - # "filter_chain": { # Keep basic domain filter - # "type": "FilterChain", - # "params": { "filters": [{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}]} - # }, - # "url_scorer": { # Add scorer - # "type": "CompositeScorer", - # "params": { - # "scorers": [ - # { # Favor pages with 'product' in the URL - # "type": "KeywordRelevanceScorer", - # "params": {"keywords": ["product"], "weight": 1.0} - # }, - # { # Penalize deep paths slightly - # "type": "PathDepthScorer", - # "params": {"optimal_depth": 2, "weight": -0.2} - # } - # ] - # } - # }, - # # Set a threshold if needed: "score_threshold": 0.1 - # } - # } - # } - # } - # } - # response = await async_client.post("/crawl", json=payload) - # response.raise_for_status() - # data = response.json() + # 3. Deep Crawl with Scoring + async def test_deep_crawl_with_scoring(self, async_client: httpx.AsyncClient): + """Test BFS deep crawl with URL scoring.""" + max_depth = 1 + max_pages = 4 + payload = { + "urls": [DEEP_CRAWL_BASE_URL], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "stream": False, + "cache_mode": "BYPASS", + "deep_crawl_strategy": { + "type": "BFSDeepCrawlStrategy", + "params": { + "max_depth": max_depth, + "max_pages": max_pages, + "filter_chain": { # Keep basic domain filter + "type": "FilterChain", + "params": { "filters": [{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}]} + }, + "url_scorer": { # Add scorer + "type": "CompositeScorer", + "params": { + "scorers": [ + { # Favor pages with 'product' in the URL + "type": "KeywordRelevanceScorer", + "params": {"keywords": ["product"], "weight": 1.0} + }, + { # Penalize deep paths slightly + "type": "PathDepthScorer", + "params": {"optimal_depth": 2, "weight": -0.2} + } + ] + } + }, + # Set a threshold if needed: "score_threshold": 0.1 + } + } + } + } + } + response = await async_client.post("/crawl", json=payload) + response.raise_for_status() + data = response.json() - # assert data["success"] is True - # assert len(data["results"]) > 0 - # assert len(data["results"]) <= max_pages + assert data["success"] is True + assert len(data["results"]) > 0 + assert len(data["results"]) <= max_pages - # # Check if results seem biased towards products (harder to assert strictly without knowing exact scores) - # product_urls_found = any("product_" in result["url"] for result in data["results"] if result["metadata"]["depth"] > 0) - # print(f"Product URLs found among depth > 0 results: {product_urls_found}") - # # We expect scoring to prioritize product pages if available within limits - # # assert product_urls_found # This might be too strict depending on site structure and limits + # Check if results seem biased towards products (harder to assert strictly without knowing exact scores) + product_urls_found = any("product_" in result["url"] for result in data["results"] if 
result["metadata"]["depth"] > 0) + print(f"Product URLs found among depth > 0 results: {product_urls_found}") + # We expect scoring to prioritize product pages if available within limits + # assert product_urls_found # This might be too strict depending on site structure and limits - # for result in data["results"]: - # await assert_crawl_result_structure(result) - # assert result["success"] is True - # assert result["metadata"]["depth"] <= max_depth + for result in data["results"]: + await assert_crawl_result_structure(result) + assert result["success"] is True + assert result["metadata"]["depth"] <= max_depth - # # 4. Deep Crawl with CSS Extraction - # async def test_deep_crawl_with_css_extraction(self, async_client: httpx.AsyncClient): - # """Test BFS deep crawl combined with JsonCssExtractionStrategy.""" - # max_depth = 6 # Go deep enough to reach product pages - # max_pages = 20 - # # Schema to extract product details - # product_schema = { - # "name": "ProductDetails", - # "baseSelector": "div.container", # Base for product page - # "fields": [ - # {"name": "product_title", "selector": "h1", "type": "text"}, - # {"name": "price", "selector": ".product-price", "type": "text"}, - # {"name": "description", "selector": ".product-description p", "type": "text"}, - # {"name": "specs", "selector": ".product-specs li", "type": "list", "fields":[ - # {"name": "spec_name", "selector": ".spec-name", "type": "text"}, - # {"name": "spec_value", "selector": ".spec-value", "type": "text"} - # ]} - # ] - # } - # payload = { - # "urls": [DEEP_CRAWL_BASE_URL], - # "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, - # "crawler_config": { - # "type": "CrawlerRunConfig", - # "params": { - # "stream": False, - # "cache_mode": "BYPASS", - # "extraction_strategy": { # Apply extraction to ALL crawled pages - # "type": "JsonCssExtractionStrategy", - # "params": {"schema": {"type": "dict", "value": product_schema}} - # }, - # "deep_crawl_strategy": { - # "type": "BFSDeepCrawlStrategy", - # "params": { - # "max_depth": max_depth, - # "max_pages": max_pages, - # "filter_chain": { # Only crawl HTML on our domain - # "type": "FilterChain", - # "params": { - # "filters": [ - # {"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}, - # {"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}} - # ] - # } - # } - # # Optional: Add scoring to prioritize product pages for extraction - # } - # } - # } - # } - # } - # response = await async_client.post("/crawl", json=payload) - # response.raise_for_status() - # data = response.json() + # 4. 
Deep Crawl with CSS Extraction + async def test_deep_crawl_with_css_extraction(self, async_client: httpx.AsyncClient): + """Test BFS deep crawl combined with JsonCssExtractionStrategy.""" + max_depth = 6 # Go deep enough to reach product pages + max_pages = 20 + # Schema to extract product details + product_schema = { + "name": "ProductDetails", + "baseSelector": "div.container", # Base for product page + "fields": [ + {"name": "product_title", "selector": "h1", "type": "text"}, + {"name": "price", "selector": ".product-price", "type": "text"}, + {"name": "description", "selector": ".product-description p", "type": "text"}, + {"name": "specs", "selector": ".product-specs li", "type": "list", "fields":[ + {"name": "spec_name", "selector": ".spec-name", "type": "text"}, + {"name": "spec_value", "selector": ".spec-value", "type": "text"} + ]} + ] + } + payload = { + "urls": [DEEP_CRAWL_BASE_URL], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "stream": False, + "cache_mode": "BYPASS", + "extraction_strategy": { # Apply extraction to ALL crawled pages + "type": "JsonCssExtractionStrategy", + "params": {"schema": {"type": "dict", "value": product_schema}} + }, + "deep_crawl_strategy": { + "type": "BFSDeepCrawlStrategy", + "params": { + "max_depth": max_depth, + "max_pages": max_pages, + "filter_chain": { # Only crawl HTML on our domain + "type": "FilterChain", + "params": { + "filters": [ + {"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}, + {"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}} + ] + } + } + # Optional: Add scoring to prioritize product pages for extraction + } + } + } + } + } + response = await async_client.post("/crawl", json=payload) + response.raise_for_status() + data = response.json() - # assert data["success"] is True - # assert len(data["results"]) > 0 - # # assert len(data["results"]) <= max_pages + assert data["success"] is True + assert len(data["results"]) > 0 + # assert len(data["results"]) <= max_pages - # found_extracted_product = False - # for result in data["results"]: - # await assert_crawl_result_structure(result) - # assert result["success"] is True - # assert "extracted_content" in result - # if "product_" in result["url"]: # Check product pages specifically - # assert result["extracted_content"] is not None - # try: - # extracted = json.loads(result["extracted_content"]) - # # Schema returns list even if one base match - # assert isinstance(extracted, list) - # if extracted: - # item = extracted[0] - # assert "product_title" in item and item["product_title"] - # assert "price" in item and item["price"] - # # Specs might be empty list if not found - # assert "specs" in item and isinstance(item["specs"], list) - # found_extracted_product = True - # print(f"Extracted product: {item.get('product_title')}") - # except (json.JSONDecodeError, AssertionError, IndexError) as e: - # pytest.fail(f"Extraction validation failed for {result['url']}: {e}\nContent: {result['extracted_content']}") - # # else: - # # # Non-product pages might have None or empty list depending on schema match - # # assert result["extracted_content"] is None or json.loads(result["extracted_content"]) == [] + found_extracted_product = False + for result in data["results"]: + await assert_crawl_result_structure(result) + assert result["success"] is True + assert "extracted_content" in result + if "product_" in result["url"]: # Check product pages 
specifically + assert result["extracted_content"] is not None + try: + extracted = json.loads(result["extracted_content"]) + # Schema returns list even if one base match + assert isinstance(extracted, list) + if extracted: + item = extracted[0] + assert "product_title" in item and item["product_title"] + assert "price" in item and item["price"] + # Specs might be empty list if not found + assert "specs" in item and isinstance(item["specs"], list) + found_extracted_product = True + print(f"Extracted product: {item.get('product_title')}") + except (json.JSONDecodeError, AssertionError, IndexError) as e: + pytest.fail(f"Extraction validation failed for {result['url']}: {e}\nContent: {result['extracted_content']}") + # else: + # # Non-product pages might have None or empty list depending on schema match + # assert result["extracted_content"] is None or json.loads(result["extracted_content"]) == [] - # assert found_extracted_product, "Did not find any pages where product data was successfully extracted." + assert found_extracted_product, "Did not find any pages where product data was successfully extracted." - # # 5. Deep Crawl with LLM Extraction (Requires Server LLM Setup) - # async def test_deep_crawl_with_llm_extraction(self, async_client: httpx.AsyncClient): - # """Test BFS deep crawl combined with LLMExtractionStrategy.""" - # max_depth = 1 # Limit depth to keep LLM calls manageable - # max_pages = 3 - # payload = { - # "urls": [DEEP_CRAWL_BASE_URL], - # "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, - # "crawler_config": { - # "type": "CrawlerRunConfig", - # "params": { - # "stream": False, - # "cache_mode": "BYPASS", - # "extraction_strategy": { # Apply LLM extraction to crawled pages - # "type": "LLMExtractionStrategy", - # "params": { - # "instruction": "Extract the main H1 title and the text content of the first paragraph.", - # "llm_config": { # Example override, rely on server default if possible - # "type": "LLMConfig", - # "params": {"provider": "openai/gpt-4.1-mini"} # Use a cheaper model for testing - # }, - # "schema": { # Expected JSON output - # "type": "dict", - # "value": { - # "title": "PageContent", "type": "object", - # "properties": { - # "h1_title": {"type": "string"}, - # "first_paragraph": {"type": "string"} - # } - # } - # } - # } - # }, - # "deep_crawl_strategy": { - # "type": "BFSDeepCrawlStrategy", - # "params": { - # "max_depth": max_depth, - # "max_pages": max_pages, - # "filter_chain": { - # "type": "FilterChain", - # "params": { - # "filters": [ - # {"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}, - # {"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}} - # ] - # } - # } - # } - # } - # } - # } - # } + # 5. 
Deep Crawl with LLM Extraction (Requires Server LLM Setup) + async def test_deep_crawl_with_llm_extraction(self, async_client: httpx.AsyncClient): + """Test BFS deep crawl combined with LLMExtractionStrategy.""" + max_depth = 1 # Limit depth to keep LLM calls manageable + max_pages = 3 + payload = { + "urls": [DEEP_CRAWL_BASE_URL], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "stream": False, + "cache_mode": "BYPASS", + "extraction_strategy": { # Apply LLM extraction to crawled pages + "type": "LLMExtractionStrategy", + "params": { + "instruction": "Extract the main H1 title and the text content of the first paragraph.", + "llm_config": { # Example override, rely on server default if possible + "type": "LLMConfig", + "params": {"provider": "openai/gpt-4.1-mini"} # Use a cheaper model for testing + }, + "schema": { # Expected JSON output + "type": "dict", + "value": { + "title": "PageContent", "type": "object", + "properties": { + "h1_title": {"type": "string"}, + "first_paragraph": {"type": "string"} + } + } + } + } + }, + "deep_crawl_strategy": { + "type": "BFSDeepCrawlStrategy", + "params": { + "max_depth": max_depth, + "max_pages": max_pages, + "filter_chain": { + "type": "FilterChain", + "params": { + "filters": [ + {"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}, + {"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}} + ] + } + } + } + } + } + } + } - # try: - # response = await async_client.post("/crawl", json=payload) - # response.raise_for_status() - # data = response.json() - # except httpx.HTTPStatusError as e: - # pytest.fail(f"Deep Crawl + LLM extraction request failed: {e}. Response: {e.response.text}. Check server logs and LLM API key setup.") - # except httpx.RequestError as e: - # pytest.fail(f"Deep Crawl + LLM extraction request failed: {e}.") + try: + response = await async_client.post("/crawl", json=payload) + response.raise_for_status() + data = response.json() + except httpx.HTTPStatusError as e: + pytest.fail(f"Deep Crawl + LLM extraction request failed: {e}. Response: {e.response.text}. 
Check server logs and LLM API key setup.") + except httpx.RequestError as e: + pytest.fail(f"Deep Crawl + LLM extraction request failed: {e}.") - # assert data["success"] is True - # assert len(data["results"]) > 0 - # assert len(data["results"]) <= max_pages + assert data["success"] is True + assert len(data["results"]) > 0 + assert len(data["results"]) <= max_pages - # found_llm_extraction = False - # for result in data["results"]: - # await assert_crawl_result_structure(result) - # assert result["success"] is True - # assert "extracted_content" in result - # assert result["extracted_content"] is not None - # try: - # extracted = json.loads(result["extracted_content"]) - # if isinstance(extracted, list): extracted = extracted[0] # Handle list output - # assert isinstance(extracted, dict) - # assert "h1_title" in extracted # Check keys based on schema - # assert "first_paragraph" in extracted - # found_llm_extraction = True - # print(f"LLM extracted from {result['url']}: Title='{extracted.get('h1_title')}'") - # except (json.JSONDecodeError, AssertionError, IndexError, TypeError) as e: - # pytest.fail(f"LLM extraction validation failed for {result['url']}: {e}\nContent: {result['extracted_content']}") + found_llm_extraction = False + for result in data["results"]: + await assert_crawl_result_structure(result) + assert result["success"] is True + assert "extracted_content" in result + assert result["extracted_content"] is not None + try: + extracted = json.loads(result["extracted_content"]) + if isinstance(extracted, list): extracted = extracted[0] # Handle list output + assert isinstance(extracted, dict) + assert "h1_title" in extracted # Check keys based on schema + assert "first_paragraph" in extracted + found_llm_extraction = True + print(f"LLM extracted from {result['url']}: Title='{extracted.get('h1_title')}'") + except (json.JSONDecodeError, AssertionError, IndexError, TypeError) as e: + pytest.fail(f"LLM extraction validation failed for {result['url']}: {e}\nContent: {result['extracted_content']}") - # assert found_llm_extraction, "LLM extraction did not yield expected data on any crawled page." + assert found_llm_extraction, "LLM extraction did not yield expected data on any crawled page." - # # 6. Deep Crawl with SSL Certificate Fetching - # async def test_deep_crawl_with_ssl(self, async_client: httpx.AsyncClient): - # """Test BFS deep crawl with fetch_ssl_certificate enabled.""" - # max_depth = 0 # Only fetch for start URL to keep test fast - # max_pages = 1 - # payload = { - # "urls": [DEEP_CRAWL_BASE_URL], - # "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, - # "crawler_config": { - # "type": "CrawlerRunConfig", - # "params": { - # "stream": False, - # "cache_mode": "BYPASS", - # "fetch_ssl_certificate": True, # <-- Enable SSL fetching - # "deep_crawl_strategy": { - # "type": "BFSDeepCrawlStrategy", - # "params": { - # "max_depth": max_depth, - # "max_pages": max_pages, - # } - # } - # } - # } - # } - # response = await async_client.post("/crawl", json=payload) - # response.raise_for_status() - # data = response.json() + # 6. 
Deep Crawl with SSL Certificate Fetching + async def test_deep_crawl_with_ssl(self, async_client: httpx.AsyncClient): + """Test BFS deep crawl with fetch_ssl_certificate enabled.""" + max_depth = 0 # Only fetch for start URL to keep test fast + max_pages = 1 + payload = { + "urls": [DEEP_CRAWL_BASE_URL], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "stream": False, + "cache_mode": "BYPASS", + "fetch_ssl_certificate": True, # <-- Enable SSL fetching + "deep_crawl_strategy": { + "type": "BFSDeepCrawlStrategy", + "params": { + "max_depth": max_depth, + "max_pages": max_pages, + } + } + } + } + } + response = await async_client.post("/crawl", json=payload) + response.raise_for_status() + data = response.json() - # assert data["success"] is True - # assert len(data["results"]) == 1 - # result = data["results"][0] + assert data["success"] is True + assert len(data["results"]) == 1 + result = data["results"][0] - # await assert_crawl_result_structure(result, check_ssl=True) # <-- Tell helper to check SSL field - # assert result["success"] is True - # # Check if SSL info was actually retrieved - # if result["ssl_certificate"]: - # # Assert directly using dictionary keys - # assert isinstance(result["ssl_certificate"], dict) # Verify it's a dict - # assert "issuer" in result["ssl_certificate"] - # assert "subject" in result["ssl_certificate"] - # # --- MODIFIED ASSERTIONS --- - # assert "not_before" in result["ssl_certificate"] # Check for the actual key - # assert "not_after" in result["ssl_certificate"] # Check for the actual key - # # --- END MODIFICATIONS --- - # assert "fingerprint" in result["ssl_certificate"] # Check another key + await assert_crawl_result_structure(result, check_ssl=True) # <-- Tell helper to check SSL field + assert result["success"] is True + # Check if SSL info was actually retrieved + if result["ssl_certificate"]: + # Assert directly using dictionary keys + assert isinstance(result["ssl_certificate"], dict) # Verify it's a dict + assert "issuer" in result["ssl_certificate"] + assert "subject" in result["ssl_certificate"] + # --- MODIFIED ASSERTIONS --- + assert "not_before" in result["ssl_certificate"] # Check for the actual key + assert "not_after" in result["ssl_certificate"] # Check for the actual key + # --- END MODIFICATIONS --- + assert "fingerprint" in result["ssl_certificate"] # Check another key - # # This print statement using .get() already works correctly with dictionaries - # print(f"SSL Issuer Org: {result['ssl_certificate'].get('issuer', {}).get('O', 'N/A')}") - # print(f"SSL Valid From: {result['ssl_certificate'].get('not_before', 'N/A')}") - # else: - # # This part remains the same - # print("SSL Certificate was null in the result.") + # This print statement using .get() already works correctly with dictionaries + print(f"SSL Issuer Org: {result['ssl_certificate'].get('issuer', {}).get('O', 'N/A')}") + print(f"SSL Valid From: {result['ssl_certificate'].get('not_before', 'N/A')}") + else: + # This part remains the same + print("SSL Certificate was null in the result.") # 7. 
Deep Crawl with Proxy Rotation (Requires PROXIES env var) From 94d486579c0c1a2b43ba159eb817a962ef7e9bdc Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 15 Apr 2025 22:32:27 +0800 Subject: [PATCH 58/78] docs(tests): clarify server URL comments in deep crawl tests Improve documentation of test configuration URLs by adding clearer comments explaining when to use each URL configuration - Docker vs development mode. No functional changes, only comment improvements. --- tests/docker/test_rest_api_deep_crawl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/docker/test_rest_api_deep_crawl.py b/tests/docker/test_rest_api_deep_crawl.py index 8995881d..c535727f 100644 --- a/tests/docker/test_rest_api_deep_crawl.py +++ b/tests/docker/test_rest_api_deep_crawl.py @@ -12,8 +12,8 @@ from dotenv import load_dotenv load_dotenv() # Load environment variables from .env file if present # --- Test Configuration --- -BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235") # Ensure this points to your running server -BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") # Ensure this points to your running server +BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235") # If server is running in Docker, use the host's IP +BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") # If server is running in dev debug mode DEEP_CRAWL_BASE_URL = "https://docs.crawl4ai.com/samples/deepcrawl/" DEEP_CRAWL_DOMAIN = "docs.crawl4ai.com" # Used for domain filter From 7db6b468d9b8b1d8b2051901e4009270852a0674 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 17 Apr 2025 20:13:53 +0800 Subject: [PATCH 59/78] feat(markdown): add content source selection for markdown generation Adds a new content_source parameter to MarkdownGenerationStrategy that allows selecting which HTML content to use for markdown generation: - cleaned_html (default): uses post-processed HTML - raw_html: uses original webpage HTML - fit_html: uses preprocessed HTML for schema extraction Changes include: - Added content_source parameter to MarkdownGenerationStrategy - Updated AsyncWebCrawler to handle HTML source selection - Added examples and tests for the new feature - Updated documentation with new parameter details BREAKING CHANGE: Renamed cleaned_html parameter to input_html in generate_markdown() method signature to better reflect its generalized purpose --- CHANGELOG.md | 7 ++ JOURNAL.md | 41 +++++++ crawl4ai/async_webcrawler.py | 38 ++++++- crawl4ai/markdown_generation_strategy.py | 30 ++--- .../markdown/content_source_example.py | 64 +++++++++++ .../markdown/content_source_short_example.py | 42 +++++++ docs/md_v2/api/parameters.md | 2 +- docs/md_v2/core/markdown-generation.md | 77 +++++++++++-- .../general/test_content_source_parameter.py | 106 ++++++++++++++++++ 9 files changed, 383 insertions(+), 24 deletions(-) create mode 100644 docs/examples/markdown/content_source_example.py create mode 100644 docs/examples/markdown/content_source_short_example.py create mode 100644 tests/general/test_content_source_parameter.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 61161f92..6ef49dd3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,13 @@ All notable changes to Crawl4AI will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+### [Added] 2025-04-17 +- Added content source selection feature for markdown generation + - New `content_source` parameter allows choosing between `cleaned_html`, `raw_html`, and `fit_html` + - Provides flexibility in how HTML content is processed before markdown conversion + - Added examples and documentation for the new feature + - Includes backward compatibility with default `cleaned_html` behavior + ## Version 0.5.0.post5 (2025-03-14) ### Added diff --git a/JOURNAL.md b/JOURNAL.md index ac00e890..0451b425 100644 --- a/JOURNAL.md +++ b/JOURNAL.md @@ -2,6 +2,47 @@ This journal tracks significant feature additions, bug fixes, and architectural decisions in the crawl4ai project. It serves as both documentation and a historical record of the project's evolution. +## [2025-04-17] Added Content Source Selection for Markdown Generation + +**Feature:** Configurable content source for markdown generation + +**Changes Made:** +1. Added `content_source: str = "cleaned_html"` parameter to `MarkdownGenerationStrategy` class +2. Updated `DefaultMarkdownGenerator` to accept and pass the content source parameter +3. Renamed the `cleaned_html` parameter to `input_html` in the `generate_markdown` method +4. Modified `AsyncWebCrawler.aprocess_html` to select the appropriate HTML source based on the generator's config +5. Added `preprocess_html_for_schema` import in `async_webcrawler.py` + +**Implementation Details:** +- Added a new `content_source` parameter to specify which HTML input to use for markdown generation +- Options include: "cleaned_html" (default), "raw_html", and "fit_html" +- Used a dictionary dispatch pattern in `aprocess_html` to select the appropriate HTML source +- Added proper error handling with fallback to cleaned_html if content source selection fails +- Ensured backward compatibility by defaulting to "cleaned_html" option + +**Files Modified:** +- `crawl4ai/markdown_generation_strategy.py`: Added content_source parameter and updated the method signature +- `crawl4ai/async_webcrawler.py`: Added HTML source selection logic and updated imports + +**Examples:** +- Created `docs/examples/content_source_example.py` demonstrating how to use the new parameter + +**Challenges:** +- Maintaining backward compatibility while reorganizing the parameter flow +- Ensuring proper error handling for all content source options +- Making the change with minimal code modifications + +**Why This Feature:** +The content source selection feature allows users to choose which HTML content to use as input for markdown generation: +1. "cleaned_html" - Uses the post-processed HTML after scraping strategy (original behavior) +2. "raw_html" - Uses the original raw HTML directly from the web page +3. 
"fit_html" - Uses the preprocessed HTML optimized for schema extraction + +This feature provides greater flexibility in how users generate markdown, enabling them to: +- Capture more detailed content from the original HTML when needed +- Use schema-optimized HTML when working with structured data +- Choose the approach that best suits their specific use case + ## [2025-04-09] Added MHTML Capture Feature **Feature:** MHTML snapshot capture of crawled pages diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 9ba508b2..afdcefdb 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -47,6 +47,7 @@ from .utils import ( create_box_message, get_error_context, RobotsParser, + preprocess_html_for_schema, ) @@ -512,13 +513,48 @@ class AsyncWebCrawler: config.markdown_generator or DefaultMarkdownGenerator() ) + # --- SELECT HTML SOURCE BASED ON CONTENT_SOURCE --- + # Get the desired source from the generator config, default to 'cleaned_html' + selected_html_source = getattr(markdown_generator, 'content_source', 'cleaned_html') + + # Define the source selection logic using dict dispatch + html_source_selector = { + "raw_html": lambda: html, # The original raw HTML + "cleaned_html": lambda: cleaned_html, # The HTML after scraping strategy + "fit_html": lambda: preprocess_html_for_schema(html_content=html), # Preprocessed raw HTML + } + + markdown_input_html = cleaned_html # Default to cleaned_html + + try: + # Get the appropriate lambda function, default to returning cleaned_html if key not found + source_lambda = html_source_selector.get(selected_html_source, lambda: cleaned_html) + # Execute the lambda to get the selected HTML + markdown_input_html = source_lambda() + + # Log which source is being used (optional, but helpful for debugging) + if self.logger and verbose: + actual_source_used = selected_html_source if selected_html_source in html_source_selector else 'cleaned_html (default)' + self.logger.debug(f"Using '{actual_source_used}' as source for Markdown generation for {url}", tag="MARKDOWN_SRC") + + except Exception as e: + # Handle potential errors, especially from preprocess_html_for_schema + if self.logger: + self.logger.warning( + f"Error getting/processing '{selected_html_source}' for markdown source: {e}. 
Falling back to cleaned_html.", + tag="MARKDOWN_SRC" + ) + # Ensure markdown_input_html is still the default cleaned_html in case of error + markdown_input_html = cleaned_html + # --- END: HTML SOURCE SELECTION --- + # Uncomment if by default we want to use PruningContentFilter # if not config.content_filter and not markdown_generator.content_filter: # markdown_generator.content_filter = PruningContentFilter() markdown_result: MarkdownGenerationResult = ( markdown_generator.generate_markdown( - cleaned_html=cleaned_html, + input_html=markdown_input_html, base_url=url, # html2text_options=kwargs.get('html2text', {}) ) diff --git a/crawl4ai/markdown_generation_strategy.py b/crawl4ai/markdown_generation_strategy.py index e89239f3..622cc8da 100644 --- a/crawl4ai/markdown_generation_strategy.py +++ b/crawl4ai/markdown_generation_strategy.py @@ -31,22 +31,24 @@ class MarkdownGenerationStrategy(ABC): content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None, verbose: bool = False, + content_source: str = "cleaned_html", ): self.content_filter = content_filter self.options = options or {} self.verbose = verbose + self.content_source = content_source @abstractmethod def generate_markdown( self, - cleaned_html: str, + input_html: str, base_url: str = "", html2text_options: Optional[Dict[str, Any]] = None, content_filter: Optional[RelevantContentFilter] = None, citations: bool = True, **kwargs, ) -> MarkdownGenerationResult: - """Generate markdown from cleaned HTML.""" + """Generate markdown from the selected input HTML.""" pass @@ -63,6 +65,7 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy): Args: content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown. options (Optional[Dict[str, Any]]): Additional options for markdown generation. Defaults to None. + content_source (str): Source of content to generate markdown from. Options: "cleaned_html", "raw_html", "fit_html". Defaults to "cleaned_html". Returns: MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown. @@ -72,8 +75,9 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy): self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None, + content_source: str = "cleaned_html", ): - super().__init__(content_filter, options) + super().__init__(content_filter, options, verbose=False, content_source=content_source) def convert_links_to_citations( self, markdown: str, base_url: str = "" @@ -143,7 +147,7 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy): def generate_markdown( self, - cleaned_html: str, + input_html: str, base_url: str = "", html2text_options: Optional[Dict[str, Any]] = None, options: Optional[Dict[str, Any]] = None, @@ -152,16 +156,16 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy): **kwargs, ) -> MarkdownGenerationResult: """ - Generate markdown with citations from cleaned HTML. + Generate markdown with citations from the provided input HTML. How it works: - 1. Generate raw markdown from cleaned HTML. + 1. Generate raw markdown from the input HTML. 2. Convert links to citations. 3. Generate fit markdown if content filter is provided. 4. Return MarkdownGenerationResult. Args: - cleaned_html (str): Cleaned HTML content. + input_html (str): The HTML content to process (selected based on content_source). base_url (str): Base URL for URL joins. html2text_options (Optional[Dict[str, Any]]): HTML2Text options. 
options (Optional[Dict[str, Any]]): Additional options for markdown generation. @@ -196,14 +200,14 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy): h.update_params(**default_options) # Ensure we have valid input - if not cleaned_html: - cleaned_html = "" - elif not isinstance(cleaned_html, str): - cleaned_html = str(cleaned_html) + if not input_html: + input_html = "" + elif not isinstance(input_html, str): + input_html = str(input_html) # Generate raw markdown try: - raw_markdown = h.handle(cleaned_html) + raw_markdown = h.handle(input_html) except Exception as e: raw_markdown = f"Error converting HTML to markdown: {str(e)}" @@ -228,7 +232,7 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy): if content_filter or self.content_filter: try: content_filter = content_filter or self.content_filter - filtered_html = content_filter.filter_content(cleaned_html) + filtered_html = content_filter.filter_content(input_html) filtered_html = "\n".join( "
    <div>{}</div>
    ".format(s) for s in filtered_html ) diff --git a/docs/examples/markdown/content_source_example.py b/docs/examples/markdown/content_source_example.py new file mode 100644 index 00000000..5d836765 --- /dev/null +++ b/docs/examples/markdown/content_source_example.py @@ -0,0 +1,64 @@ +""" +Example showing how to use the content_source parameter to control HTML input for markdown generation. +""" +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator + +async def demo_content_source(): + """Demonstrates different content_source options for markdown generation.""" + url = "https://example.com" # Simple demo site + + print("Crawling with different content_source options...") + + # --- Example 1: Default Behavior (cleaned_html) --- + # This uses the HTML after it has been processed by the scraping strategy + # The HTML is cleaned, simplified, and optimized for readability + default_generator = DefaultMarkdownGenerator() # content_source="cleaned_html" is default + default_config = CrawlerRunConfig(markdown_generator=default_generator) + + # --- Example 2: Raw HTML --- + # This uses the original HTML directly from the webpage + # Preserves more original content but may include navigation, ads, etc. + raw_generator = DefaultMarkdownGenerator(content_source="raw_html") + raw_config = CrawlerRunConfig(markdown_generator=raw_generator) + + # --- Example 3: Fit HTML --- + # This uses preprocessed HTML optimized for schema extraction + # Better for structured data extraction but may lose some formatting + fit_generator = DefaultMarkdownGenerator(content_source="fit_html") + fit_config = CrawlerRunConfig(markdown_generator=fit_generator) + + # Execute all three crawlers in sequence + async with AsyncWebCrawler() as crawler: + # Default (cleaned_html) + result_default = await crawler.arun(url=url, config=default_config) + + # Raw HTML + result_raw = await crawler.arun(url=url, config=raw_config) + + # Fit HTML + result_fit = await crawler.arun(url=url, config=fit_config) + + # Print a summary of the results + print("\nMarkdown Generation Results:\n") + + print("1. Default (cleaned_html):") + print(f" Length: {len(result_default.markdown.raw_markdown)} chars") + print(f" First 80 chars: {result_default.markdown.raw_markdown[:80]}...\n") + + print("2. Raw HTML:") + print(f" Length: {len(result_raw.markdown.raw_markdown)} chars") + print(f" First 80 chars: {result_raw.markdown.raw_markdown[:80]}...\n") + + print("3. 
Fit HTML:") + print(f" Length: {len(result_fit.markdown.raw_markdown)} chars") + print(f" First 80 chars: {result_fit.markdown.raw_markdown[:80]}...\n") + + # Demonstrate differences in output + print("\nKey Takeaways:") + print("- cleaned_html: Best for readable, focused content") + print("- raw_html: Preserves more original content, but may include noise") + print("- fit_html: Optimized for schema extraction and structured data") + +if __name__ == "__main__": + asyncio.run(demo_content_source()) \ No newline at end of file diff --git a/docs/examples/markdown/content_source_short_example.py b/docs/examples/markdown/content_source_short_example.py new file mode 100644 index 00000000..83c3ecb4 --- /dev/null +++ b/docs/examples/markdown/content_source_short_example.py @@ -0,0 +1,42 @@ +""" +Example demonstrating how to use the content_source parameter in MarkdownGenerationStrategy +""" + +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator + +async def demo_markdown_source_config(): + print("\n=== Demo: Configuring Markdown Source ===") + + # Example 1: Generate markdown from cleaned HTML (default behavior) + cleaned_md_generator = DefaultMarkdownGenerator(content_source="cleaned_html") + config_cleaned = CrawlerRunConfig(markdown_generator=cleaned_md_generator) + + async with AsyncWebCrawler() as crawler: + result_cleaned = await crawler.arun(url="https://example.com", config=config_cleaned) + print("Markdown from Cleaned HTML (default):") + print(f" Length: {len(result_cleaned.markdown.raw_markdown)}") + print(f" Start: {result_cleaned.markdown.raw_markdown[:100]}...") + + # Example 2: Generate markdown directly from raw HTML + raw_md_generator = DefaultMarkdownGenerator(content_source="raw_html") + config_raw = CrawlerRunConfig(markdown_generator=raw_md_generator) + + async with AsyncWebCrawler() as crawler: + result_raw = await crawler.arun(url="https://example.com", config=config_raw) + print("\nMarkdown from Raw HTML:") + print(f" Length: {len(result_raw.markdown.raw_markdown)}") + print(f" Start: {result_raw.markdown.raw_markdown[:100]}...") + + # Example 3: Generate markdown from preprocessed 'fit' HTML + fit_md_generator = DefaultMarkdownGenerator(content_source="fit_html") + config_fit = CrawlerRunConfig(markdown_generator=fit_md_generator) + + async with AsyncWebCrawler() as crawler: + result_fit = await crawler.arun(url="https://example.com", config=config_fit) + print("\nMarkdown from Fit HTML:") + print(f" Length: {len(result_fit.markdown.raw_markdown)}") + print(f" Start: {result_fit.markdown.raw_markdown[:100]}...") + +if __name__ == "__main__": + asyncio.run(demo_markdown_source_config()) \ No newline at end of file diff --git a/docs/md_v2/api/parameters.md b/docs/md_v2/api/parameters.md index de4ba467..6cf771c1 100644 --- a/docs/md_v2/api/parameters.md +++ b/docs/md_v2/api/parameters.md @@ -70,7 +70,7 @@ We group them by category. |------------------------------|--------------------------------------|-------------------------------------------------------------------------------------------------| | **`word_count_threshold`** | `int` (default: ~200) | Skips text blocks below X words. Helps ignore trivial sections. | | **`extraction_strategy`** | `ExtractionStrategy` (default: None) | If set, extracts structured data (CSS-based, LLM-based, etc.). | -| **`markdown_generator`** | `MarkdownGenerationStrategy` (None) | If you want specialized markdown output (citations, filtering, chunking, etc.). 
| +| **`markdown_generator`** | `MarkdownGenerationStrategy` (None) | If you want specialized markdown output (citations, filtering, chunking, etc.). Can be customized with options such as `content_source` parameter to select the HTML input source ('cleaned_html', 'raw_html', or 'fit_html'). | | **`css_selector`** | `str` (None) | Retains only the part of the page matching this selector. Affects the entire extraction process. | | **`target_elements`** | `List[str]` (None) | List of CSS selectors for elements to focus on for markdown generation and data extraction, while still processing the entire page for links, media, etc. Provides more flexibility than `css_selector`. | | **`excluded_tags`** | `list` (None) | Removes entire tags (e.g. `["script", "style"]`). | diff --git a/docs/md_v2/core/markdown-generation.md b/docs/md_v2/core/markdown-generation.md index ac27e5b2..e6f5e12a 100644 --- a/docs/md_v2/core/markdown-generation.md +++ b/docs/md_v2/core/markdown-generation.md @@ -111,13 +111,71 @@ Some commonly used `options`: - **`skip_internal_links`** (bool): If `True`, omit `#localAnchors` or internal links referencing the same page. - **`include_sup_sub`** (bool): Attempt to handle `` / `` in a more readable way. +## 4. Selecting the HTML Source for Markdown Generation + +The `content_source` parameter allows you to control which HTML content is used as input for markdown generation. This gives you flexibility in how the HTML is processed before conversion to markdown. + +```python +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig + +async def main(): + # Option 1: Use the raw HTML directly from the webpage (before any processing) + raw_md_generator = DefaultMarkdownGenerator( + content_source="raw_html", + options={"ignore_links": True} + ) + + # Option 2: Use the cleaned HTML (after scraping strategy processing - default) + cleaned_md_generator = DefaultMarkdownGenerator( + content_source="cleaned_html", # This is the default + options={"ignore_links": True} + ) + + # Option 3: Use preprocessed HTML optimized for schema extraction + fit_md_generator = DefaultMarkdownGenerator( + content_source="fit_html", + options={"ignore_links": True} + ) + + # Use one of the generators in your crawler config + config = CrawlerRunConfig( + markdown_generator=raw_md_generator # Try each of the generators + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com", config=config) + if result.success: + print("Markdown:\n", result.markdown.raw_markdown[:500]) + else: + print("Crawl failed:", result.error_message) + +if __name__ == "__main__": + import asyncio + asyncio.run(main()) +``` + +### HTML Source Options + +- **`"cleaned_html"`** (default): Uses the HTML after it has been processed by the scraping strategy. This HTML is typically cleaner and more focused on content, with some boilerplate removed. + +- **`"raw_html"`**: Uses the original HTML directly from the webpage, before any cleaning or processing. This preserves more of the original content, but may include navigation bars, ads, footers, and other elements that might not be relevant to the main content. + +- **`"fit_html"`**: Uses HTML preprocessed for schema extraction. This HTML is optimized for structured data extraction and may have certain elements simplified or removed. 
+ +### When to Use Each Option + +- Use **`"cleaned_html"`** (default) for most cases where you want a balance of content preservation and noise removal. +- Use **`"raw_html"`** when you need to preserve all original content, or when the cleaning process is removing content you actually want to keep. +- Use **`"fit_html"`** when working with structured data or when you need HTML that's optimized for schema extraction. + --- -## 4. Content Filters +## 5. Content Filters **Content filters** selectively remove or rank sections of text before turning them into Markdown. This is especially helpful if your page has ads, nav bars, or other clutter you don’t want. -### 4.1 BM25ContentFilter +### 5.1 BM25ContentFilter If you have a **search query**, BM25 is a good choice: @@ -146,7 +204,7 @@ config = CrawlerRunConfig(markdown_generator=md_generator) **No query provided?** BM25 tries to glean a context from page metadata, or you can simply treat it as a scorched-earth approach that discards text with low generic score. Realistically, you want to supply a query for best results. -### 4.2 PruningContentFilter +### 5.2 PruningContentFilter If you **don’t** have a specific query, or if you just want a robust “junk remover,” use `PruningContentFilter`. It analyzes text density, link density, HTML structure, and known patterns (like “nav,” “footer”) to systematically prune extraneous or repetitive sections. @@ -170,7 +228,7 @@ prune_filter = PruningContentFilter( - You want a broad cleanup without a user query. - The page has lots of repeated sidebars, footers, or disclaimers that hamper text extraction. -### 4.3 LLMContentFilter +### 5.3 LLMContentFilter For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure: @@ -247,7 +305,7 @@ filter = LLMContentFilter( --- -## 5. Using Fit Markdown +## 6. Using Fit Markdown When a content filter is active, the library produces two forms of markdown inside `result.markdown`: @@ -284,7 +342,7 @@ if __name__ == "__main__": --- -## 6. The `MarkdownGenerationResult` Object +## 7. The `MarkdownGenerationResult` Object If your library stores detailed markdown output in an object like `MarkdownGenerationResult`, you’ll see fields such as: @@ -315,7 +373,7 @@ Below is a **revised section** under “Combining Filters (BM25 + Pruning)” th --- -## 7. Combining Filters (BM25 + Pruning) in Two Passes +## 8. Combining Filters (BM25 + Pruning) in Two Passes You might want to **prune out** noisy boilerplate first (with `PruningContentFilter`), and then **rank what’s left** against a user query (with `BM25ContentFilter`). You don’t have to crawl the page twice. Instead: @@ -407,7 +465,7 @@ If your codebase or pipeline design allows applying multiple filters in one pass --- -## 8. Common Pitfalls & Tips +## 9. Common Pitfalls & Tips 1. **No Markdown Output?** - Make sure the crawler actually retrieved HTML. If the site is heavily JS-based, you may need to enable dynamic rendering or wait for elements. @@ -427,11 +485,12 @@ If your codebase or pipeline design allows applying multiple filters in one pass --- -## 9. Summary & Next Steps +## 10. Summary & Next Steps In this **Markdown Generation Basics** tutorial, you learned to: - Configure the **DefaultMarkdownGenerator** with HTML-to-text options. +- Select different HTML sources using the `content_source` parameter. 
- Use **BM25ContentFilter** for query-specific extraction or **PruningContentFilter** for general noise removal. - Distinguish between raw and filtered markdown (`fit_markdown`). - Leverage the `MarkdownGenerationResult` object to handle different forms of output (citations, references, etc.). diff --git a/tests/general/test_content_source_parameter.py b/tests/general/test_content_source_parameter.py new file mode 100644 index 00000000..e686eaf8 --- /dev/null +++ b/tests/general/test_content_source_parameter.py @@ -0,0 +1,106 @@ +""" +Tests for the content_source parameter in markdown generation. +""" +import unittest +import asyncio +from unittest.mock import patch, MagicMock + +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator, MarkdownGenerationStrategy +from crawl4ai.async_webcrawler import AsyncWebCrawler +from crawl4ai.async_configs import CrawlerRunConfig +from crawl4ai.models import MarkdownGenerationResult + +HTML_SAMPLE = """ + +Test Page + +

+<h1>Test Content</h1>
+<p>This is a test paragraph.</p>
+<div class="container">
+<p>This is content within a container.</p>
+</div>
    + + +""" + + +class TestContentSourceParameter(unittest.TestCase): + """Test cases for the content_source parameter in markdown generation.""" + + def setUp(self): + """Set up test fixtures.""" + self.loop = asyncio.new_event_loop() + asyncio.set_event_loop(self.loop) + + def tearDown(self): + """Tear down test fixtures.""" + self.loop.close() + + def test_default_content_source(self): + """Test that the default content_source is 'cleaned_html'.""" + # Can't directly instantiate abstract class, so just test DefaultMarkdownGenerator + generator = DefaultMarkdownGenerator() + self.assertEqual(generator.content_source, "cleaned_html") + + def test_custom_content_source(self): + """Test that content_source can be customized.""" + generator = DefaultMarkdownGenerator(content_source="fit_html") + self.assertEqual(generator.content_source, "fit_html") + + @patch('crawl4ai.markdown_generation_strategy.CustomHTML2Text') + def test_html_processing_using_input_html(self, mock_html2text): + """Test that generate_markdown uses input_html parameter.""" + # Setup mock + mock_instance = MagicMock() + mock_instance.handle.return_value = "# Test Content\n\nThis is a test paragraph." + mock_html2text.return_value = mock_instance + + # Create generator and call generate_markdown + generator = DefaultMarkdownGenerator() + result = generator.generate_markdown(input_html="

<h1>Test Content</h1><p>This is a test paragraph.</p>
    ") + + # Verify input_html was passed to HTML2Text handler + mock_instance.handle.assert_called_once() + # Get the first positional argument + args, _ = mock_instance.handle.call_args + self.assertEqual(args[0], "

<h1>Test Content</h1><p>This is a test paragraph.</p>
    ") + + # Check result + self.assertIsInstance(result, MarkdownGenerationResult) + self.assertEqual(result.raw_markdown, "# Test Content\n\nThis is a test paragraph.") + + def test_html_source_selection_logic(self): + """Test that the HTML source selection logic works correctly.""" + # We'll test the dispatch pattern directly to avoid async complexities + + # Create test data + raw_html = "

<div>Raw HTML</div>
    " + cleaned_html = "

<div>Cleaned HTML</div>
    " + fit_html = "

<div>Preprocessed HTML</div>
    " + + # Test the dispatch pattern + html_source_selector = { + "raw_html": lambda: raw_html, + "cleaned_html": lambda: cleaned_html, + "fit_html": lambda: fit_html, + } + + # Test Case 1: content_source="cleaned_html" + source_lambda = html_source_selector.get("cleaned_html") + self.assertEqual(source_lambda(), cleaned_html) + + # Test Case 2: content_source="raw_html" + source_lambda = html_source_selector.get("raw_html") + self.assertEqual(source_lambda(), raw_html) + + # Test Case 3: content_source="fit_html" + source_lambda = html_source_selector.get("fit_html") + self.assertEqual(source_lambda(), fit_html) + + # Test Case 4: Invalid content_source falls back to cleaned_html + source_lambda = html_source_selector.get("invalid_source", lambda: cleaned_html) + self.assertEqual(source_lambda(), cleaned_html) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file From 30ec4f571fbfcac9d0744ad9f33f63049fbb03de Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 17 Apr 2025 20:16:11 +0800 Subject: [PATCH 60/78] feat(docs): add comprehensive Docker API demo script Add a new example script demonstrating Docker API usage with extensive features: - Basic crawling with single/multi URL support - Markdown generation with various filters - Parameter demonstrations (CSS, JS, screenshots, SSL, proxies) - Extraction strategies using CSS and LLM - Deep crawling capabilities with streaming - Integration examples with proxy rotation and SSL certificate fetching Also includes minor formatting improvements in async_webcrawler.py --- crawl4ai/async_webcrawler.py | 31 +- docs/examples/docker/demo_docker_api.py | 883 ++++++++++++++++++++++++ 2 files changed, 903 insertions(+), 11 deletions(-) create mode 100644 docs/examples/docker/demo_docker_api.py diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 9ba508b2..5cdc95b9 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -111,7 +111,8 @@ class AsyncWebCrawler: self, crawler_strategy: AsyncCrawlerStrategy = None, config: BrowserConfig = None, - base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), + base_directory: str = str( + os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), thread_safe: bool = False, logger: AsyncLoggerBase = None, **kwargs, @@ -139,7 +140,8 @@ class AsyncWebCrawler: ) # Initialize crawler strategy - params = {k: v for k, v in kwargs.items() if k in ["browser_config", "logger"]} + params = {k: v for k, v in kwargs.items() if k in [ + "browser_config", "logger"]} self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy( browser_config=browser_config, logger=self.logger, @@ -237,7 +239,8 @@ class AsyncWebCrawler: config = config or CrawlerRunConfig() if not isinstance(url, str) or not url: - raise ValueError("Invalid URL, make sure the URL is a non-empty string") + raise ValueError( + "Invalid URL, make sure the URL is a non-empty string") async with self._lock or self.nullcontext(): try: @@ -291,12 +294,12 @@ class AsyncWebCrawler: # Update proxy configuration from rotation strategy if available if config and config.proxy_rotation_strategy: - next_proxy : ProxyConfig = await config.proxy_rotation_strategy.get_next_proxy() + next_proxy: ProxyConfig = await config.proxy_rotation_strategy.get_next_proxy() if next_proxy: self.logger.info( message="Switch proxy: {proxy}", tag="PROXY", - params={"proxy": next_proxy.server} + params={"proxy": next_proxy.server} ) config.proxy_config = next_proxy # config = 
config.clone(proxy_config=next_proxy) @@ -306,7 +309,8 @@ class AsyncWebCrawler: t1 = time.perf_counter() if config.user_agent: - self.crawler_strategy.update_user_agent(config.user_agent) + self.crawler_strategy.update_user_agent( + config.user_agent) # Check robots.txt if enabled if config and config.check_robots_txt: @@ -372,7 +376,8 @@ class AsyncWebCrawler: crawl_result.console_messages = async_response.console_messages crawl_result.success = bool(html) - crawl_result.session_id = getattr(config, "session_id", None) + crawl_result.session_id = getattr( + config, "session_id", None) self.logger.success( message="{url:.50}... | Status: {status} | Total: {timing}", @@ -407,7 +412,8 @@ class AsyncWebCrawler: ) cached_result.success = bool(html) - cached_result.session_id = getattr(config, "session_id", None) + cached_result.session_id = getattr( + config, "session_id", None) cached_result.redirected_url = cached_result.redirected_url or url return CrawlResultContainer(cached_result) @@ -474,12 +480,14 @@ class AsyncWebCrawler: params = config.__dict__.copy() params.pop("url", None) # add keys from kwargs to params that doesn't exist in params - params.update({k: v for k, v in kwargs.items() if k not in params.keys()}) + params.update({k: v for k, v in kwargs.items() + if k not in params.keys()}) ################################ # Scraping Strategy Execution # ################################ - result: ScrapingResult = scraping_strategy.scrap(url, html, **params) + result: ScrapingResult = scraping_strategy.scrap( + url, html, **params) if result is None: raise ValueError( @@ -495,7 +503,8 @@ class AsyncWebCrawler: # Extract results - handle both dict and ScrapingResult if isinstance(result, dict): - cleaned_html = sanitize_input_encode(result.get("cleaned_html", "")) + cleaned_html = sanitize_input_encode( + result.get("cleaned_html", "")) media = result.get("media", {}) links = result.get("links", {}) metadata = result.get("metadata", {}) diff --git a/docs/examples/docker/demo_docker_api.py b/docs/examples/docker/demo_docker_api.py new file mode 100644 index 00000000..56d0173c --- /dev/null +++ b/docs/examples/docker/demo_docker_api.py @@ -0,0 +1,883 @@ +import asyncio +import httpx +import json +import os +import time +from typing import List, Dict, Any, AsyncGenerator, Optional +from dotenv import load_dotenv +from rich.console import Console +from rich.syntax import Syntax +from rich.panel import Panel +from rich.table import Table + +# --- Setup & Configuration --- +load_dotenv() # Load environment variables from .env file + +console = Console() + +# --- Configuration --- +BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235") +BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") +# Target URLs +SIMPLE_URL = "https://httpbin.org/html" +LINKS_URL = "https://httpbin.org/links/10/0" +FORMS_URL = "https://httpbin.org/forms/post" # For JS demo +BOOKS_URL = "http://books.toscrape.com/" # For CSS extraction +PYTHON_URL = "https://python.org" # For deeper crawl +# Use the same sample site as deep crawl tests for consistency +DEEP_CRAWL_BASE_URL = os.getenv("DEEP_CRAWL_TEST_SITE", "https://docs.crawl4ai.com/samples/deepcrawl/") +DEEP_CRAWL_DOMAIN = "docs.crawl4ai.com" + +# --- Helper Functions --- + +async def check_server_health(client: httpx.AsyncClient): + """Check if the server is healthy before running tests.""" + console.print("[bold cyan]Checking server health...[/]", end="") + try: + response = await client.get("/health", timeout=10.0) + 
response.raise_for_status() + health_data = response.json() + console.print(f"[bold green] Server OK! Version: {health_data.get('version', 'N/A')}[/]") + return True + except (httpx.RequestError, httpx.HTTPStatusError) as e: + console.print(f"\n[bold red]Server health check FAILED:[/]") + console.print(f"Error: {e}") + console.print(f"Is the server running at {BASE_URL}?") + return False + except Exception as e: + console.print(f"\n[bold red]An unexpected error occurred during health check:[/]") + console.print(e) + return False + +def print_payload(payload: Dict[str, Any]): + """Prints the JSON payload nicely.""" + syntax = Syntax(json.dumps(payload, indent=2), "json", theme="default", line_numbers=False) + console.print(Panel(syntax, title="Request Payload", border_style="blue", expand=False)) + +def print_result_summary(results: List[Dict[str, Any]], title: str = "Crawl Results Summary", max_items: int = 3): + """Prints a concise summary of crawl results.""" + if not results: + console.print(f"[yellow]{title}: No results received.[/]") + return + + console.print(Panel(f"[bold]{title}[/]", border_style="green", expand=False)) + count = 0 + for result in results: + if count >= max_items: + console.print(f"... (showing first {max_items} of {len(results)} results)") + break + count += 1 + success_icon = "[green]✔[/]" if result.get('success') else "[red]✘[/]" + url = result.get('url', 'N/A') + status = result.get('status_code', 'N/A') + content_info = "" + if result.get('extracted_content'): + content_str = json.dumps(result['extracted_content']) + snippet = (content_str[:70] + '...') if len(content_str) > 70 else content_str + content_info = f" | Extracted: [cyan]{snippet}[/]" + elif result.get('markdown'): + content_info = f" | Markdown: [cyan]Present[/]" + elif result.get('html'): + content_info = f" | HTML Size: [cyan]{len(result['html'])}[/]" + + console.print(f"{success_icon} URL: [link={url}]{url}[/link] (Status: {status}){content_info}") + if "metadata" in result and "depth" in result["metadata"]: + console.print(f" Depth: {result['metadata']['depth']}") + if not result.get('success') and result.get('error_message'): + console.print(f" [red]Error: {result['error_message']}[/]") + + +async def make_request(client: httpx.AsyncClient, endpoint: str, payload: Dict[str, Any], title: str) -> Optional[List[Dict[str, Any]]]: + """Handles non-streaming POST requests.""" + console.rule(f"[bold blue]{title}[/]", style="blue") + print_payload(payload) + console.print(f"Sending POST request to {client.base_url}{endpoint}...") + try: + start_time = time.time() + response = await client.post(endpoint, json=payload) + duration = time.time() - start_time + console.print(f"Response Status: [bold {'green' if response.is_success else 'red'}]{response.status_code}[/] (took {duration:.2f}s)") + response.raise_for_status() + data = response.json() + if data.get("success"): + results = data.get("results", []) + print_result_summary(results, title=f"{title} Results") + return results + else: + console.print("[bold red]Request reported failure:[/]") + console.print(data) + return None + except httpx.HTTPStatusError as e: + console.print(f"[bold red]HTTP Error:[/]") + console.print(f"Status: {e.response.status_code}") + try: + console.print(Panel(Syntax(json.dumps(e.response.json(), indent=2), "json", theme="default"), title="Error Response")) + except json.JSONDecodeError: + console.print(f"Response Body: {e.response.text}") + except httpx.RequestError as e: + console.print(f"[bold red]Request Error: {e}[/]") 
+ except Exception as e: + console.print(f"[bold red]Unexpected Error: {e}[/]") + return None + +async def stream_request(client: httpx.AsyncClient, endpoint: str, payload: Dict[str, Any], title: str): + """Handles streaming POST requests.""" + console.rule(f"[bold magenta]{title}[/]", style="magenta") + print_payload(payload) + console.print(f"Sending POST stream request to {client.base_url}{endpoint}...") + all_results = [] + try: + start_time = time.time() + async with client.stream("POST", endpoint, json=payload) as response: + duration = time.time() - start_time # Time to first byte potentially + console.print(f"Initial Response Status: [bold {'green' if response.status_code == 200 else 'red'}]{response.status_code}[/] (first byte ~{duration:.2f}s)") + response.raise_for_status() + + console.print("[magenta]--- Streaming Results ---[/]") + completed = False + async for line in response.aiter_lines(): + if line: + try: + data = json.loads(line) + if data.get("status") == "completed": + completed = True + console.print("[bold green]--- Stream Completed ---[/]") + break + elif data.get("url"): # Looks like a result + all_results.append(data) + success_icon = "[green]✔[/]" if data.get('success') else "[red]✘[/]" + url = data.get('url', 'N/A') + console.print(f" {success_icon} Received: [link={url}]{url}[/link]") + else: + console.print(f" [yellow]Stream meta-data:[/yellow] {data}") + except json.JSONDecodeError: + console.print(f" [red]Stream decode error for line:[/red] {line}") + if not completed: + console.print("[bold yellow]Warning: Stream ended without 'completed' marker.[/]") + + except httpx.HTTPStatusError as e: + console.print(f"[bold red]HTTP Error:[/]") + console.print(f"Status: {e.response.status_code}") + try: + console.print(Panel(Syntax(json.dumps(e.response.json(), indent=2), "json", theme="default"), title="Error Response")) + except json.JSONDecodeError: + console.print(f"Response Body: {e.response.text}") + except httpx.RequestError as e: + console.print(f"[bold red]Request Error: {e}[/]") + except Exception as e: + console.print(f"[bold red]Unexpected Error: {e}[/]") + + print_result_summary(all_results, title=f"{title} Collected Results") + + +def load_proxies_from_env() -> List[Dict]: + """ + Load proxies from the PROXIES environment variable. + Expected format: IP:PORT:USER:PASS,IP:PORT,IP2:PORT2:USER2:PASS2,... + Returns a list of dictionaries suitable for the 'params' of ProxyConfig. 
+ """ + proxies_params_list = [] + proxies_str = os.getenv("PROXIES", "") + if not proxies_str: + # console.print("[yellow]PROXIES environment variable not set or empty.[/]") + return proxies_params_list # Return empty list if not set + + try: + proxy_entries = proxies_str.split(",") + for entry in proxy_entries: + entry = entry.strip() + if not entry: + continue + + parts = entry.split(":") + proxy_dict = {} + + if len(parts) == 4: # Format: IP:PORT:USER:PASS + ip, port, username, password = parts + proxy_dict = { + "server": f"http://{ip}:{port}", # Assuming http protocol + "username": username, + "password": password, + # "ip": ip # 'ip' is not a standard ProxyConfig param, 'server' contains it + } + elif len(parts) == 2: # Format: IP:PORT + ip, port = parts + proxy_dict = { + "server": f"http://{ip}:{port}", + # "ip": ip + } + else: + console.print(f"[yellow]Skipping invalid proxy string format:[/yellow] {entry}") + continue + + proxies_params_list.append(proxy_dict) + + except Exception as e: + console.print(f"[red]Error loading proxies from environment:[/red] {e}") + + if proxies_params_list: + console.print(f"[cyan]Loaded {len(proxies_params_list)} proxies from environment.[/]") + # else: + # console.print("[yellow]No valid proxies loaded from environment.[/]") + + return proxies_params_list + + + +# --- Demo Functions --- + +# 1. Basic Crawling +async def demo_basic_single_url(client: httpx.AsyncClient): + payload = { + "urls": [SIMPLE_URL], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "BYPASS"}} + } + result = await make_request(client, "/crawl", payload, "Demo 1a: Basic Single URL Crawl") + return result + +async def demo_basic_multi_url(client: httpx.AsyncClient): + payload = { + "urls": [SIMPLE_URL, LINKS_URL], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "BYPASS"}} + } + result = await make_request(client, "/crawl", payload, "Demo 1b: Basic Multi URL Crawl") + return result + +async def demo_streaming_multi_url(client: httpx.AsyncClient): + payload = { + "urls": [SIMPLE_URL, LINKS_URL, FORMS_URL], # Add another URL + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": {"type": "CrawlerRunConfig", "params": {"stream": True, "cache_mode": "BYPASS"}} + } + result = stream_request(client, "/crawl/stream", payload, "Demo 1c: Streaming Multi URL Crawl") + return result + +# 2. 
Markdown Generation & Content Filtering +async def demo_markdown_default(client: httpx.AsyncClient): + payload = { + "urls": [SIMPLE_URL], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "cache_mode": "BYPASS", + "markdown_generator": {"type": "DefaultMarkdownGenerator", "params": {}} # Explicitly default + } + } + } + result = await make_request(client, "/crawl", payload, "Demo 2a: Default Markdown Generation") + return result + +async def demo_markdown_pruning(client: httpx.AsyncClient): + payload = { + "urls": [PYTHON_URL], # Use a more complex page + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "cache_mode": "BYPASS", + "markdown_generator": { + "type": "DefaultMarkdownGenerator", + "params": { + "content_filter": { + "type": "PruningContentFilter", + "params": {"threshold": 0.6, "threshold_type": "relative"} + } + } + } + } + } + } + result = await make_request(client, "/crawl", payload, "Demo 2b: Markdown with Pruning Filter") + return result + +async def demo_markdown_bm25(client: httpx.AsyncClient): + payload = { + "urls": [PYTHON_URL], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "cache_mode": "BYPASS", + "markdown_generator": { + "type": "DefaultMarkdownGenerator", + "params": { + "content_filter": { + "type": "BM25ContentFilter", + "params": {"user_query": "Python documentation language reference"} + } + } + } + } + } + } + result = await make_request(client, "/crawl", payload, "Demo 2c: Markdown with BM25 Filter") + return result + +# 3. Specific Parameters +# Corrected Demo Function: demo_param_css_selector +async def demo_param_css_selector(client: httpx.AsyncClient): + target_selector = ".main-content" # Using the suggested correct selector + payload = { + "urls": [PYTHON_URL], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "cache_mode": "BYPASS", + "css_selector": target_selector # Target specific div + # No extraction strategy is needed to demo this parameter's effect on input HTML + } + } + } + results = await make_request(client, "/crawl", payload, f"Demo 3a: Using css_selector ('{target_selector}')") + + if results: + result = results[0] + if result['success'] and result.get('html'): + # Check if the returned HTML is likely constrained + # A simple check: does it contain expected content from within the selector, + # and does it LACK content known to be outside (like footer links)? + html_content = result['html'] + content_present = 'Python Software Foundation' in html_content # Text likely within .main-content somewhere + footer_absent = 'Legal Statements' not in html_content # Text likely in the footer, outside .main-content + + console.print(f" Content Check: Text inside '{target_selector}' likely present? {'[green]Yes[/]' if content_present else '[red]No[/]'}") + console.print(f" Content Check: Text outside '{target_selector}' (footer) likely absent? {'[green]Yes[/]' if footer_absent else '[red]No[/]'}") + + if not content_present or not footer_absent: + console.print(f" [yellow]Note:[/yellow] HTML filtering might not be precise or page structure changed. 
Result HTML length: {len(html_content)}") + else: + console.print(f" [green]Verified:[/green] Returned HTML appears limited by css_selector. Result HTML length: {len(html_content)}") + + elif result['success']: + console.print("[yellow]HTML content was empty in the successful result.[/]") + # Error message is handled by print_result_summary called by make_request + +async def demo_param_js_execution(client: httpx.AsyncClient): + payload = { + "urls": [FORMS_URL], # Use a page with a form + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "cache_mode": "BYPASS", + # Simple JS to fill and maybe click (won't submit without more complex setup) + "js_code": """ + () => { + document.querySelector('[name="custname"]').value = 'Crawl4AI Demo'; + return { filled_name: document.querySelector('[name="custname"]').value }; + } + """, + "delay_before_return_html": 0.5 # Give JS time to potentially run + } + } + } + results = await make_request(client, "/crawl", payload, "Demo 3b: Using js_code Parameter") + if results and results[0].get("js_execution_result"): + console.print("[cyan]JS Execution Result:[/]", results[0]["js_execution_result"]) + elif results: + console.print("[yellow]JS Execution Result not found in response.[/]") + + +async def demo_param_screenshot(client: httpx.AsyncClient): + payload = { + "urls": [SIMPLE_URL], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": {"cache_mode": "BYPASS", "screenshot": True} + } + } + results = await make_request(client, "/crawl", payload, "Demo 3c: Taking a Screenshot") + if results and results[0].get("screenshot"): + console.print(f"[cyan]Screenshot data received (length):[/] {len(results[0]['screenshot'])}") + elif results: + console.print("[yellow]Screenshot data not found in response.[/]") + +async def demo_param_ssl_fetch(client: httpx.AsyncClient): + payload = { + "urls": [PYTHON_URL], # Needs HTTPS + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": {"cache_mode": "BYPASS", "fetch_ssl_certificate": True} + } + } + results = await make_request(client, "/crawl", payload, "Demo 3d: Fetching SSL Certificate") + if results and results[0].get("ssl_certificate"): + console.print("[cyan]SSL Certificate Info:[/]") + console.print(results[0]["ssl_certificate"]) + elif results: + console.print("[yellow]SSL Certificate data not found in response.[/]") + + + +async def demo_param_proxy(client: httpx.AsyncClient): + proxy_params_list = load_proxies_from_env() # Get the list of parameter dicts + if not proxy_params_list: + console.rule("[bold yellow]Demo 3e: Using Proxies (SKIPPED)[/]", style="yellow") + console.print("Set the PROXIES environment variable to run this demo.") + console.print("Format: IP:PORT:USR:PWD,IP:PORT,...") + return + + payload = { + "urls": ["https://httpbin.org/ip"], # URL that shows originating IP + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "cache_mode": "BYPASS", + "proxy_rotation_strategy": { + "type": "RoundRobinProxyStrategy", + "params": { + "proxies": [ + # Filter out the 'ip' key when sending to server, as it's not part of ProxyConfig + {"type": "ProxyConfig", "params": {k: v for k, v in p.items() if k != 'ip'}} + for p in proxy_params_list + ] + } + } + } + } 
+ } + results = await make_request(client, "/crawl", payload, "Demo 3e: Using Proxies") + + # --- Verification Logic --- + if results and results[0].get("success"): + result = results[0] + try: + # httpbin.org/ip returns JSON within the HTML body's
<pre> tag
    +            html_content = result.get('html', '')
    +            # Basic extraction - find JSON within 
<pre> tags or just the JSON itself
    +            json_str = None
+            if '<pre>' in html_content:
    Date: Thu, 17 Apr 2025 22:31:51 +0800
    Subject: [PATCH 61/78] feat(tests): implement high volume stress testing
     framework
    
    Add comprehensive stress testing solution for SDK using arun_many and dispatcher system:
    - Create test_stress_sdk.py for running high volume crawl tests
    - Add run_benchmark.py for orchestrating tests with predefined configs
    - Implement benchmark_report.py for generating performance reports
    - Add memory tracking and local test site generation
    - Support both streaming and batch processing modes
    - Add detailed documentation in README.md
    
    The framework enables testing SDK performance, concurrency handling,
    and memory behavior under high-volume scenarios.
    ---
     .gitignore                       |   6 +-
     JOURNAL.md                       | 191 +++++++
     tests/memory/README.md           | 315 +++++++++++
     tests/memory/benchmark_report.py | 887 +++++++++++++++++++++++++++++++
     tests/memory/requirements.txt    |   4 +
     tests/memory/run_benchmark.py    | 259 +++++++++
     tests/memory/test_stress_sdk.py  | 500 +++++++++++++++++
     7 files changed, 2161 insertions(+), 1 deletion(-)
     create mode 100644 tests/memory/README.md
     create mode 100755 tests/memory/benchmark_report.py
     create mode 100644 tests/memory/requirements.txt
     create mode 100755 tests/memory/run_benchmark.py
     create mode 100644 tests/memory/test_stress_sdk.py
    
    diff --git a/.gitignore b/.gitignore
    index a290ab7d..1658a987 100644
    --- a/.gitignore
    +++ b/.gitignore
    @@ -257,4 +257,8 @@ continue_config.json
     .private/
     
     CLAUDE_MONITOR.md
    -CLAUDE.md
    \ No newline at end of file
    +CLAUDE.md
    +
    +tests/**/test_site
    +tests/**/reports
    +tests/**/benchmark_reports
    \ No newline at end of file
    diff --git a/JOURNAL.md b/JOURNAL.md
    index ac00e890..df0c8564 100644
    --- a/JOURNAL.md
    +++ b/JOURNAL.md
    @@ -2,6 +2,197 @@
     
     This journal tracks significant feature additions, bug fixes, and architectural decisions in the crawl4ai project. It serves as both documentation and a historical record of the project's evolution.
     
    +## [2025-04-17] Implemented High Volume Stress Testing Solution for SDK
    +
    +**Feature:** Comprehensive stress testing framework using `arun_many` and the dispatcher system to evaluate performance, concurrency handling, and identify potential issues under high-volume crawling scenarios.
    +
    +**Changes Made:**
+1.  Created a dedicated stress testing framework in the `tests/memory/` directory.
    +2.  Implemented local test site generation (`SiteGenerator`) with configurable heavy HTML pages.
    +3.  Added basic memory usage tracking (`SimpleMemoryTracker`) using platform-specific commands (avoiding `psutil` dependency for this specific test).
    +4.  Utilized `CrawlerMonitor` from `crawl4ai` for rich terminal UI and real-time monitoring of test progress and dispatcher activity.
    +5.  Implemented detailed result summary saving (JSON) and memory sample logging (CSV).
    +6.  Developed `run_benchmark.py` to orchestrate tests with predefined configurations.
    +7.  Created `run_all.sh` as a simple wrapper for `run_benchmark.py`.
    +
    +**Implementation Details:**
    +-   Generates a local test site with configurable pages containing heavy text and image content.
    +-   Uses Python's built-in `http.server` for local serving, minimizing network variance.
    +-   Leverages `crawl4ai`'s `arun_many` method for processing URLs.
+-   Utilizes `MemoryAdaptiveDispatcher` to manage concurrency via the `max_sessions` parameter (note: memory adaptation features require `psutil`, not used by `SimpleMemoryTracker`). A minimal sketch of this flow appears after this list.
    +-   Tracks memory usage via `SimpleMemoryTracker`, recording samples throughout test execution to a CSV file.
    +-   Uses `CrawlerMonitor` (which uses the `rich` library) for clear terminal visualization and progress reporting directly from the dispatcher.
    +-   Stores detailed final metrics in a JSON summary file.
    +
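+A minimal sketch of the core crawl call, to make the flow above concrete (illustrative
+only; the helper name, placeholder URLs and import path are assumptions rather than a
+copy of the test script -- only `arun_many`, `MemoryAdaptiveDispatcher` and
+`max_session_permit` are taken from the SDK):
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher
+
+async def crawl_chunk(urls, max_sessions):
+    # The --max-sessions flag maps to max_session_permit on the dispatcher;
+    # the memory-adaptive throttling additionally needs psutil installed.
+    dispatcher = MemoryAdaptiveDispatcher(max_session_permit=max_sessions)
+    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=False)
+
+    async with AsyncWebCrawler() as crawler:
+        results = await crawler.arun_many(urls=urls, config=run_config, dispatcher=dispatcher)
+
+    ok = sum(1 for r in results if r.success)
+    print(f"chunk done: {ok}/{len(urls)} succeeded")
+
+if __name__ == "__main__":
+    local_urls = [f"http://localhost:8000/page_{i}.html" for i in range(20)]  # placeholder URLs
+    asyncio.run(crawl_chunk(local_urls, max_sessions=8))
+```
+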
    +**Files Created/Updated:**
    +-   `stress_test_sdk.py`: Main stress testing implementation using `arun_many`.
    +-   `benchmark_report.py`: (Assumed) Report generator for comparing test results.
    +-   `run_benchmark.py`: Test runner script with predefined configurations.
    +-   `run_all.sh`: Simple bash script wrapper for `run_benchmark.py`.
    +-   `USAGE.md`: Comprehensive documentation on usage and interpretation (updated).
    +
    +**Testing Approach:**
    +-   Creates a controlled, reproducible test environment with a local HTTP server.
    +-   Processes URLs using `arun_many`, allowing the dispatcher to manage concurrency up to `max_sessions`.
    +-   Optionally logs per-batch summaries (when not in streaming mode) after processing chunks.
    +-   Supports different test sizes via `run_benchmark.py` configurations.
    +-   Records memory samples via platform commands for basic trend analysis.
    +-   Includes cleanup functionality for the test environment.
    +
    +**Challenges:**
    +-   Ensuring proper cleanup of HTTP server processes.
    +-   Getting reliable memory tracking across platforms without adding heavy dependencies (`psutil`) to this specific test script.
    +-   Designing `run_benchmark.py` to correctly pass arguments to `stress_test_sdk.py`.
    +
    +**Why This Feature:**
    +The high volume stress testing solution addresses critical needs for ensuring Crawl4AI's `arun_many` reliability:
    +1.  Provides a reproducible way to evaluate performance under concurrent load.
    +2.  Allows testing the dispatcher's concurrency control (`max_session_permit`) and queue management.
    +3.  Enables performance tuning by observing throughput (`URLs/sec`) under different `max_sessions` settings.
    +4.  Creates a controlled environment for testing `arun_many` behavior.
    +5.  Supports continuous integration by providing deterministic test conditions for `arun_many`.
    +
    +**Design Decisions:**
    +-   Chose local site generation for reproducibility and isolation from network issues.
    +-   Utilized the built-in `CrawlerMonitor` for real-time feedback, leveraging its `rich` integration.
    +-   Implemented optional per-batch logging in `stress_test_sdk.py` (when not streaming) to provide chunk-level summaries alongside the continuous monitor.
    +-   Adopted `arun_many` with a `MemoryAdaptiveDispatcher` as the core mechanism for parallel execution, reflecting the intended SDK usage.
    +-   Created `run_benchmark.py` to simplify running standard test configurations.
+-   Used `SimpleMemoryTracker` to provide basic memory insights without requiring `psutil` for this particular test runner (a rough sketch of this approach follows this list).
    +
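+The tracker itself can be pictured roughly as follows (an illustrative sketch only --
+the class name comes from this entry, but the CSV columns and the `ps`-based sampling
+shown here are assumptions, not the real implementation):
+
+```python
+import csv, os, subprocess, time
+
+class SimpleMemoryTracker:
+    """Sample the current process RSS without psutil (Linux/macOS `ps`)."""
+
+    def __init__(self, csv_path="reports/memory_samples.csv"):
+        self.csv_path = csv_path
+        os.makedirs(os.path.dirname(csv_path) or ".", exist_ok=True)
+        with open(csv_path, "w", newline="") as f:
+            csv.writer(f).writerow(["timestamp", "rss_mb"])  # assumed column names
+
+    def sample(self):
+        # `ps -o rss=` reports the resident set size in KB on Linux and macOS.
+        out = subprocess.check_output(["ps", "-o", "rss=", "-p", str(os.getpid())])
+        rss_mb = int(out.decode().strip()) / 1024
+        with open(self.csv_path, "a", newline="") as f:
+            csv.writer(f).writerow([time.time(), f"{rss_mb:.1f}"])
+        return rss_mb
+```
+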
    +**Future Enhancements to Consider:**
    +-   Create a separate test variant that *does* use `psutil` to specifically stress the memory-adaptive features of the dispatcher.
    +-   Add support for generated JavaScript content.
    +-   Add support for Docker-based testing with explicit memory limits.
    +-   Enhance `benchmark_report.py` to provide more sophisticated analysis of performance and memory trends from the generated JSON/CSV files.
    +
    +---
    +
    +## [2025-04-17] Refined Stress Testing System Parameters and Execution
    +
    +**Changes Made:**
    +1.  Corrected `run_benchmark.py` and `stress_test_sdk.py` to use `--max-sessions` instead of the incorrect `--workers` parameter, accurately reflecting dispatcher configuration.
    +2.  Updated `run_benchmark.py` argument handling to correctly pass all relevant custom parameters (including `--stream`, `--monitor-mode`, etc.) to `stress_test_sdk.py`.
    +3.  (Assuming changes in `benchmark_report.py`) Applied dark theme to benchmark reports for better readability.
    +4.  (Assuming changes in `benchmark_report.py`) Improved visualization code to eliminate matplotlib warnings.
    +5.  Updated `run_benchmark.py` to provide clickable `file://` links to generated reports in the terminal output.
    +6.  Updated `USAGE.md` with comprehensive parameter descriptions reflecting the final script arguments.
    +7.  Updated `run_all.sh` wrapper to correctly invoke `run_benchmark.py` with flexible arguments.
    +
    +**Details of Changes:**
    +
    +1.  **Parameter Correction (`--max-sessions`)**:
    +    *   Identified the fundamental misunderstanding where `--workers` was used incorrectly.
    +    *   Refactored `stress_test_sdk.py` to accept `--max-sessions` and configure the `MemoryAdaptiveDispatcher`'s `max_session_permit` accordingly.
    +    *   Updated `run_benchmark.py` argument parsing and command construction to use `--max-sessions`.
    +    *   Updated `TEST_CONFIGS` in `run_benchmark.py` to use `max_sessions`.
    +
    +2.  **Argument Handling (`run_benchmark.py`)**:
    +    *   Improved logic to collect all command-line arguments provided to `run_benchmark.py`.
    +    *   Ensured all relevant arguments (like `--stream`, `--monitor-mode`, `--port`, `--use-rate-limiter`, etc.) are correctly forwarded when calling `stress_test_sdk.py` as a subprocess.
    +
    +3.  **Dark Theme & Visualization Fixes (Assumed in `benchmark_report.py`)**:
    +    *   (Describes changes assumed to be made in the separate reporting script).
    +
    +4.  **Clickable Links (`run_benchmark.py`)**:
    +    *   Added logic to find the latest HTML report and PNG chart in the `benchmark_reports` directory after `benchmark_report.py` runs.
    +    *   Used `pathlib` to generate correct `file://` URLs for terminal output.
    +
    +5.  **Documentation Improvements (`USAGE.md`)**:
    +    *   Rewrote sections to explain `arun_many`, dispatchers, and `--max-sessions`.
    +    *   Updated parameter tables for all scripts (`stress_test_sdk.py`, `run_benchmark.py`).
    +    *   Clarified the difference between batch and streaming modes and their effect on logging.
    +    *   Updated examples to use correct arguments.
    +
    +**Files Modified:**
    +-   `stress_test_sdk.py`: Changed `--workers` to `--max-sessions`, added new arguments, used `arun_many`.
    +-   `run_benchmark.py`: Changed argument handling, updated configs, calls `stress_test_sdk.py`.
    +-   `run_all.sh`: Updated to call `run_benchmark.py` correctly.
    +-   `USAGE.md`: Updated documentation extensively.
    +-   `benchmark_report.py`: (Assumed modifications for dark theme and viz fixes).
    +
    +**Testing:**
    +-   Verified that `--max-sessions` correctly limits concurrency via the `CrawlerMonitor` output.
    +-   Confirmed that custom arguments passed to `run_benchmark.py` are forwarded to `stress_test_sdk.py`.
    +-   Validated clickable links work in supporting terminals.
    +-   Ensured documentation matches the final script parameters and behavior.
    +
    +**Why These Changes:**
    +These refinements correct the fundamental approach of the stress test to align with `crawl4ai`'s actual architecture and intended usage:
    +1.  Ensures the test evaluates the correct components (`arun_many`, `MemoryAdaptiveDispatcher`).
    +2.  Makes test configurations more accurate and flexible.
    +3.  Improves the usability of the testing framework through better argument handling and documentation.
    +
    +
    +**Future Enhancements to Consider:**
    +- Add support for generated JavaScript content to test JS rendering performance
    +- Implement more sophisticated memory analysis like generational garbage collection tracking
    +- Add support for Docker-based testing with memory limits to force OOM conditions
    +- Create visualization tools for analyzing memory usage patterns across test runs
    +- Add benchmark comparisons between different crawler versions or configurations
    +
    +## [2025-04-17] Fixed Issues in Stress Testing System
    +
    +**Changes Made:**
    +1. Fixed custom parameter handling in run_benchmark.py
    +2. Applied dark theme to benchmark reports for better readability
    +3. Improved visualization code to eliminate matplotlib warnings
    +4. Added clickable links to generated reports in terminal output
    +5. Enhanced documentation with comprehensive parameter descriptions
    +
    +**Details of Changes:**
    +
    +1. **Custom Parameter Handling Fix**
    +   - Identified bug where custom URL count was being ignored in run_benchmark.py
    +   - Rewrote argument handling to use a custom args dictionary
    +   - Properly passed parameters to the test_simple_stress.py command
    +   - Added better UI indication of custom parameters in use
    +
    +2. **Dark Theme Implementation**
    +   - Added complete dark theme to HTML benchmark reports
    +   - Applied dark styling to all visualization components
    +   - Used Nord-inspired color palette for charts and graphs
    +   - Improved contrast and readability for data visualization
    +   - Updated text colors and backgrounds for better eye comfort
    +
    +3. **Matplotlib Warning Fixes**
    +   - Resolved warnings related to improper use of set_xticklabels()
+   - Implemented correct x-axis positioning for bar charts (see the sketch after this list)
    +   - Ensured proper alignment of bar labels and data points
    +   - Updated plotting code to use modern matplotlib practices
    +
    +4. **Documentation Improvements**
    +   - Created comprehensive USAGE.md with detailed instructions
    +   - Added parameter documentation for all scripts
    +   - Included examples for all common use cases
    +   - Provided detailed explanations for interpreting results
    +   - Added troubleshooting guide for common issues
    +
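+The matplotlib fix referenced above follows the standard pattern of pinning tick
+positions before labelling them (generic matplotlib usage, not the actual report code;
+the labels and values below are placeholders):
+
+```python
+import matplotlib.pyplot as plt
+
+labels = ["quick", "small", "medium", "large"]   # placeholder config names
+urls_per_sec = [23.8, 24.1, 22.5, 20.9]          # placeholder throughput values
+
+fig, ax = plt.subplots()
+positions = range(len(labels))
+ax.bar(positions, urls_per_sec, color="#88c0d0")  # Nord-inspired bar colour
+ax.set_xticks(list(positions))                    # fix tick locations first...
+ax.set_xticklabels(labels)                        # ...then label them, warning-free
+ax.set_ylabel("URLs/sec")
+fig.savefig("throughput.png", dpi=150)
+```
+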
    +**Files Modified:**
    +- `tests/memory/run_benchmark.py`: Fixed custom parameter handling
    +- `tests/memory/benchmark_report.py`: Added dark theme and fixed visualization warnings
    +- `tests/memory/run_all.sh`: Added clickable links to reports
    +- `tests/memory/USAGE.md`: Created comprehensive documentation
    +
    +**Testing:**
    +- Verified that custom URL counts are now correctly used
    +- Confirmed dark theme is properly applied to all report elements
    +- Checked that matplotlib warnings are no longer appearing
    +- Validated clickable links to reports work in terminals that support them
    +
    +**Why These Changes:**
    +These improvements address several usability issues with the stress testing system:
    +1. Better parameter handling ensures test configurations work as expected
    +2. Dark theme reduces eye strain during extended test review sessions
    +3. Fixing visualization warnings improves code quality and output clarity
    +4. Enhanced documentation makes the system more accessible for future use
    +
    +**Future Enhancements:**
    +- Add additional visualization options for different types of analysis
    +- Implement theme toggle to support both light and dark preferences
    +- Add export options for embedding reports in other documentation
    +- Create dedicated CI/CD integration templates for automated testing
    +
     ## [2025-04-09] Added MHTML Capture Feature
     
     **Feature:** MHTML snapshot capture of crawled pages
    diff --git a/tests/memory/README.md b/tests/memory/README.md
    new file mode 100644
    index 00000000..164ef095
    --- /dev/null
    +++ b/tests/memory/README.md
    @@ -0,0 +1,315 @@
    +# Crawl4AI Stress Testing and Benchmarking
    +
+This directory contains tools for stress testing Crawl4AI's `arun_many` method and dispatcher system with high volumes of URLs, in order to evaluate performance and concurrency handling and to detect potential memory issues. It also includes a benchmarking system to track performance over time.
    +
    +## Quick Start
    +
    +```bash
    +# Run a default stress test (small config) and generate a report
    +# (Assumes run_all.sh is updated to call run_benchmark.py)
    +./run_all.sh
    +```
    +*Note: `run_all.sh` might need to be updated if it directly called the old script.*
    +
    +## Overview
    +
    +The stress testing system works by:
    +
    +1.  Generating a local test site with heavy HTML pages (regenerated by default for each test).
    +2.  Starting a local HTTP server to serve these pages.
    +3.  Running Crawl4AI's `arun_many` method against this local site using the `MemoryAdaptiveDispatcher` with configurable concurrency (`max_sessions`).
    +4.  Monitoring performance metrics via the `CrawlerMonitor` and optionally logging memory usage.
    +5.  Optionally generating detailed benchmark reports with visualizations using `benchmark_report.py`.
    +
    +## Available Tools
    +
    +-   `test_stress_sdk.py` - Main stress testing script utilizing `arun_many` and dispatchers.
    +-   `benchmark_report.py` - Report generator for comparing test results (assumes compatibility with `test_stress_sdk.py` outputs).
    +-   `run_benchmark.py` - Python script with predefined test configurations that orchestrates tests using `test_stress_sdk.py`.
    +-   `run_all.sh` - Simple wrapper script (may need updating).
    +
    +## Usage Guide
    +
    +### Using Predefined Configurations (Recommended)
    +
    +The `run_benchmark.py` script offers the easiest way to run standardized tests:
    +
    +```bash
    +# Quick test (50 URLs, 4 max sessions)
    +python run_benchmark.py quick
    +
    +# Medium test (500 URLs, 16 max sessions)
    +python run_benchmark.py medium
    +
    +# Large test (1000 URLs, 32 max sessions)
    +python run_benchmark.py large
    +
    +# Extreme test (2000 URLs, 64 max sessions)
    +python run_benchmark.py extreme
    +
    +# Custom configuration
    +python run_benchmark.py custom --urls 300 --max-sessions 24 --chunk-size 50
    +
    +# Run 'small' test in streaming mode
    +python run_benchmark.py small --stream
    +
    +# Override max_sessions for the 'medium' config
    +python run_benchmark.py medium --max-sessions 20
    +
    +# Skip benchmark report generation after the test
    +python run_benchmark.py small --no-report
    +
    +# Clean up reports and site files before running
    +python run_benchmark.py medium --clean
    +```
    +
    +#### `run_benchmark.py` Parameters
    +
    +| Parameter            | Default         | Description                                                                 |
    +| -------------------- | --------------- | --------------------------------------------------------------------------- |
    +| `config`             | *required*      | Test configuration: `quick`, `small`, `medium`, `large`, `extreme`, `custom`|
    +| `--urls`             | config-specific | Number of URLs (required for `custom`)                                      |
    +| `--max-sessions`     | config-specific | Max concurrent sessions managed by dispatcher (required for `custom`)         |
    +| `--chunk-size`       | config-specific | URLs per batch for non-stream logging (required for `custom`)               |
    +| `--stream`           | False           | Enable streaming results (disables batch logging)                           |
    +| `--monitor-mode`     | DETAILED        | `DETAILED` or `AGGREGATED` display for the live monitor                     |
    +| `--use-rate-limiter` | False           | Enable basic rate limiter in the dispatcher                                 |
    +| `--port`             | 8000            | HTTP server port                                                            |
    +| `--no-report`        | False           | Skip generating comparison report via `benchmark_report.py`                 |
    +| `--clean`            | False           | Clean up reports and site files before running                              |
    +| `--keep-server-alive`| False           | Keep local HTTP server running after test                                   |
    +| `--use-existing-site`| False           | Use existing site on specified port (no local server start/site gen)        |
    +| `--skip-generation`  | False           | Use existing site files but start local server                              |
    +| `--keep-site`        | False           | Keep generated site files after test                                        |
    +
    +#### Predefined Configurations
    +
    +| Configuration | URLs   | Max Sessions | Chunk Size | Description                      |
    +| ------------- | ------ | ------------ | ---------- | -------------------------------- |
    +| `quick`       | 50     | 4            | 10         | Quick test for basic validation  |
    +| `small`       | 100    | 8            | 20         | Small test for routine checks    |
    +| `medium`      | 500    | 16           | 50         | Medium test for thorough checks  |
    +| `large`       | 1000   | 32           | 100        | Large test for stress testing    |
    +| `extreme`     | 2000   | 64           | 200        | Extreme test for limit testing   |
    +
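+For reference, the `TEST_CONFIGS` mapping inside `run_benchmark.py` can be pictured as
+something like the following (the field names are assumptions; the values mirror the
+table above):
+
+```python
+TEST_CONFIGS = {
+    "quick":   {"urls": 50,   "max_sessions": 4,  "chunk_size": 10},
+    "small":   {"urls": 100,  "max_sessions": 8,  "chunk_size": 20},
+    "medium":  {"urls": 500,  "max_sessions": 16, "chunk_size": 50},
+    "large":   {"urls": 1000, "max_sessions": 32, "chunk_size": 100},
+    "extreme": {"urls": 2000, "max_sessions": 64, "chunk_size": 200},
+}
+```
+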
    +### Direct Usage of `test_stress_sdk.py`
    +
    +For fine-grained control or debugging, you can run the stress test script directly:
    +
    +```bash
    +# Test with 200 URLs and 32 max concurrent sessions
    +python test_stress_sdk.py --urls 200 --max-sessions 32 --chunk-size 40
    +
    +# Clean up previous test data first
    +python test_stress_sdk.py --clean-reports --clean-site --urls 100 --max-sessions 16 --chunk-size 20
    +
    +# Change the HTTP server port and use aggregated monitor
    +python test_stress_sdk.py --port 8088 --urls 100 --max-sessions 16 --monitor-mode AGGREGATED
    +
    +# Enable streaming mode and use rate limiting
    +python test_stress_sdk.py --urls 50 --max-sessions 8 --stream --use-rate-limiter
    +
    +# Change report output location
    +python test_stress_sdk.py --report-path custom_reports --urls 100 --max-sessions 16
    +```
    +
    +#### `test_stress_sdk.py` Parameters
    +
    +| Parameter            | Default    | Description                                                          |
    +| -------------------- | ---------- | -------------------------------------------------------------------- |
    +| `--urls`             | 100        | Number of URLs to test                                               |
    +| `--max-sessions`     | 16         | Maximum concurrent crawling sessions managed by the dispatcher       |
    +| `--chunk-size`       | 10         | Number of URLs per batch (relevant for non-stream logging)           |
    +| `--stream`           | False      | Enable streaming results (disables batch logging)                    |
    +| `--monitor-mode`     | DETAILED   | `DETAILED` or `AGGREGATED` display for the live `CrawlerMonitor`     |
    +| `--use-rate-limiter` | False      | Enable a basic `RateLimiter` within the dispatcher                   |
    +| `--site-path`        | "test_site"| Path to store/use the generated test site                            |
    +| `--port`             | 8000       | Port for the local HTTP server                                       |
    +| `--report-path`      | "reports"  | Path to save test result summary (JSON) and memory samples (CSV)   |
    +| `--skip-generation`  | False      | Use existing test site files but still start local server            |
    +| `--use-existing-site`| False      | Use existing site on specified port (no local server/site gen)     |
    +| `--keep-server-alive`| False      | Keep local HTTP server running after test completion                 |
    +| `--keep-site`        | False      | Keep the generated test site files after test completion             |
    +| `--clean-reports`    | False      | Clean up report directory before running                             |
    +| `--clean-site`       | False      | Clean up site directory before/after running (see script logic)    |
    +
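+The flags above are thin wrappers around the SDK's `arun_many` call and its dispatcher. The sketch below illustrates roughly what the script drives internally; it is an illustration only, and constructor parameters such as `max_session_permit` and `display_mode` follow the Crawl4AI documentation and may differ between versions.
+
+```python
+import asyncio
+from crawl4ai import (
+    AsyncWebCrawler, CrawlerRunConfig, CacheMode,
+    MemoryAdaptiveDispatcher, CrawlerMonitor, DisplayMode,
+)
+
+async def main():
+    # Equivalent of --urls 100 against the local test site on --port 8000
+    urls = [f"http://localhost:8000/page_{i}.html" for i in range(100)]
+
+    dispatcher = MemoryAdaptiveDispatcher(
+        max_session_permit=16,  # roughly --max-sessions
+        monitor=CrawlerMonitor(display_mode=DisplayMode.DETAILED),  # --monitor-mode
+    )
+    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=False)  # --stream toggles this
+
+    async with AsyncWebCrawler() as crawler:
+        results = await crawler.arun_many(urls, config=config, dispatcher=dispatcher)
+        print(sum(1 for r in results if r.success), "of", len(urls), "succeeded")
+
+asyncio.run(main())
+```
+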
    +### Generating Reports Only
    +
    +If you only want to generate a benchmark report from existing test results (assuming `benchmark_report.py` is compatible):
    +
    +```bash
    +# Generate a report from existing test results in ./reports/
    +python benchmark_report.py
    +
    +# Limit to the most recent 5 test results
    +python benchmark_report.py --limit 5
    +
    +# Specify a custom source directory for test results
    +python benchmark_report.py --reports-dir alternate_results
    +```
    +
    +#### `benchmark_report.py` Parameters (Assumed)
    +
    +| Parameter       | Default              | Description                                                 |
    +| --------------- | -------------------- | ----------------------------------------------------------- |
    +| `--reports-dir` | "reports"            | Directory containing `test_stress_sdk.py` result files      |
    +| `--output-dir`  | "benchmark_reports"  | Directory to save generated HTML reports and charts         |
    +| `--limit`       | None (all results)   | Limit comparison to N most recent test results              |
    +| `--output-file` | Auto-generated       | Custom output filename for the HTML report                  |
    +
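+The reporter can also be driven from Python rather than the CLI. Here is a minimal sketch using the `BenchmarkReporter` class added in `tests/memory/benchmark_report.py` later in this patch:
+
+```python
+from benchmark_report import BenchmarkReporter
+
+# Compare the 5 most recent runs found in ./reports/ and write an HTML report
+reporter = BenchmarkReporter(reports_dir="reports", output_dir="benchmark_reports")
+report_file = reporter.run(limit=5)
+print(report_file)
+```
+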
    +## Understanding the Test Output
    +
    +### Real-time Progress Display (`CrawlerMonitor`)
    +
    +When running `test_stress_sdk.py`, the `CrawlerMonitor` provides a live view of the crawling process managed by the dispatcher.
    +
    +-   **DETAILED Mode (Default):** Shows individual task status (Queued, Active, Completed, Failed), timings, memory usage per task (if `psutil` is available), overall queue statistics, and memory pressure status (if `psutil` available).
    +-   **AGGREGATED Mode:** Shows summary counts (Queued, Active, Completed, Failed), overall progress percentage, estimated time remaining, average URLs/sec, and memory pressure status.
    +
    +### Batch Log Output (Non-Streaming Mode Only)
    +
    +If running `test_stress_sdk.py` **without** the `--stream` flag, you will *also* see per-batch summary lines printed to the console *after* the monitor display, once each chunk of URLs finishes processing:
    +
    +```
    + Batch | Progress | Start Mem | End Mem   | URLs/sec | Success/Fail | Time (s) | Status
    +───────────────────────────────────────────────────────────────────────────────────────────
    + 1     |  10.0%   |  50.1 MB  |  55.3 MB  |    23.8    |    10/0      |     0.42   | Success
    + 2     |  20.0%   |  55.3 MB  |  60.1 MB  |    24.1    |    10/0      |     0.41   | Success
    + ...
    +```
    +
    +This display provides chunk-specific metrics:
    +-   **Batch**: The batch number being reported.
    +-   **Progress**: Overall percentage of total URLs processed *after* this batch.
    +-   **Start Mem / End Mem**: Memory usage before and after processing this batch (if tracked).
    +-   **URLs/sec**: Processing speed *for this specific batch*.
    +-   **Success/Fail**: Number of successful and failed URLs *in this batch*.
    +-   **Time (s)**: Wall-clock time taken to process *this batch*.
    +-   **Status**: Color-coded status for the batch outcome.
    +
    +### Summary Output
    +
    +After test completion, a final summary is displayed:
    +
    +```
    +================================================================================
    +Test Completed
    +================================================================================
    +Test ID: 20250418_103015
    +Configuration: 100 URLs, 16 max sessions, Chunk: 10, Stream: False, Monitor: DETAILED
    +Results: 100 successful, 0 failed (100 processed, 100.0% success)
    +Performance: 5.85 seconds total, 17.09 URLs/second avg
    +Memory Usage: Start: 50.1 MB, End: 75.3 MB, Max: 78.1 MB, Growth: 25.2 MB
    +Results summary saved to reports/test_summary_20250418_103015.json
    +```
    +
    +### HTML Report Structure (Generated by `benchmark_report.py`)
    +
    +(This section remains the same, assuming `benchmark_report.py` generates these)
    +The benchmark report contains several sections:
    +1.  **Summary**: Overview of the latest test results and trends
    +2.  **Performance Comparison**: Charts showing throughput across tests
    +3.  **Memory Usage**: Detailed memory usage graphs for each test
    +4.  **Detailed Results**: Tabular data of all test metrics
    +5.  **Conclusion**: Automated analysis of performance and memory patterns
    +
    +### Memory Metrics
    +
    +(This section remains conceptually the same)
    +Memory growth is the key metric for detecting leaks...
    +
    +### Performance Metrics
    +
    +(This section remains conceptually the same, though "URLs per Worker" is less relevant - focus on overall URLs/sec)
    +Key performance indicators include:
    +-   **URLs per Second**: Higher is better (throughput)
    +-   **Success Rate**: Should be 100% in normal conditions
    +-   **Total Processing Time**: Lower is better
    +-   **Dispatcher Efficiency**: Observe queue lengths and wait times in the monitor (Detailed mode)
    +
    +### Raw Data Files
    +
    +Raw data is saved in the `--report-path` directory (default `./reports/`):
    +
    +-   **JSON files** (`test_summary_*.json`): Contains the final summary for each test run.
    +-   **CSV files** (`memory_samples_*.csv`): Contains time-series memory samples taken during the test run.
    +
    +Example of reading raw data:
    +```python
    +import json
    +import pandas as pd
    +
    +# Load test summary
    +test_id = "20250418_103015" # Example ID
    +with open(f'reports/test_summary_{test_id}.json', 'r') as f:
    +    results = json.load(f)
    +
    +# Load memory samples
    +memory_df = pd.read_csv(f'reports/memory_samples_{test_id}.csv')
    +
    +# Analyze memory_df (e.g., calculate growth, plot)
    +if not memory_df['memory_info_mb'].isnull().all():
    +    growth = memory_df['memory_info_mb'].iloc[-1] - memory_df['memory_info_mb'].iloc[0]
    +    print(f"Total Memory Growth: {growth:.1f} MB")
    +else:
    +    print("No valid memory samples found.")
    +
    +print(f"Avg URLs/sec: {results['urls_processed'] / results['total_time_seconds']:.2f}")
    +```
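+
+If the visualization dependencies from the next section are installed, the same samples can be plotted directly. Continuing the snippet above (assumes `matplotlib` is available):
+
+```python
+import matplotlib.pyplot as plt
+
+# Plot memory usage over time for the run loaded above
+plt.plot(memory_df["elapsed_seconds"], memory_df["memory_info_mb"])
+plt.xlabel("Elapsed time (s)")
+plt.ylabel("Memory (MB)")
+plt.title(f"Memory usage for test {test_id}")
+plt.savefig(f"memory_{test_id}.png", dpi=100)
+```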
    +
    +## Visualization Dependencies
    +
    +(This section remains the same)
    +For full visualization capabilities in the HTML reports generated by `benchmark_report.py`, install additional dependencies...
    +
    +## Directory Structure
    +
    +```
    +benchmarking/          # Or your top-level directory name
    +├── benchmark_reports/ # Generated HTML reports (by benchmark_report.py)
    +├── reports/           # Raw test result data (from test_stress_sdk.py)
    +├── test_site/         # Generated test content (temporary)
    +├── benchmark_report.py# Report generator
    +├── run_benchmark.py   # Test runner with predefined configs
    +├── test_stress_sdk.py # Main stress test implementation using arun_many
    +└── run_all.sh         # Simple wrapper script (may need updates)
    +#└── requirements.txt   # Optional: Visualization dependencies for benchmark_report.py
    +```
    +
    +## Cleanup
    +
    +To clean up after testing:
    +
    +```bash
    +# Remove the test site content (if not using --keep-site)
    +rm -rf test_site
    +
    +# Remove all raw reports and generated benchmark reports
    +rm -rf reports benchmark_reports
    +
    +# Or use the --clean flag with run_benchmark.py
    +python run_benchmark.py medium --clean
    +```
    +
    +## Use in CI/CD
    +
    +(This section remains conceptually the same, just update script names)
    +These tests can be integrated into CI/CD pipelines:
    +```bash
    +# Example CI script
    +python run_benchmark.py medium --no-report # Run test without interactive report gen
    +# Check exit code
    +if [ $? -ne 0 ]; then echo "Stress test failed!"; exit 1; fi
    +# Optionally, run report generator and check its output/metrics
    +# python benchmark_report.py
    +# check_report_metrics.py reports/test_summary_*.json || exit 1
    +exit 0
    +```
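+
+The `check_report_metrics.py` gate referenced above is not part of this patch. A minimal sketch of such a check, assuming the `test_summary_*.json` fields used earlier (`urls_processed`, `total_time_seconds`) and a throughput threshold chosen for your CI environment:
+
+```python
+#!/usr/bin/env python3
+"""Hypothetical CI gate: fail if measured throughput drops below a threshold."""
+import glob
+import json
+import sys
+
+THRESHOLD_URLS_PER_SEC = 5.0  # assumed minimum for the CI runner
+
+paths = sys.argv[1:] or glob.glob("reports/test_summary_*.json")
+failed = False
+for path in paths:
+    with open(path) as f:
+        summary = json.load(f)
+    rate = summary["urls_processed"] / summary["total_time_seconds"]
+    if rate < THRESHOLD_URLS_PER_SEC:
+        print(f"{path}: {rate:.2f} URLs/sec is below {THRESHOLD_URLS_PER_SEC}")
+        failed = True
+
+sys.exit(1 if failed else 0)
+```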
    +
    +## Troubleshooting
    +
    +-   **HTTP Server Port Conflict**: Use `--port` with `run_benchmark.py` or `test_stress_sdk.py`.
    +-   **Memory Tracking Issues**: The `SimpleMemoryTracker` uses platform commands (`ps`, `/proc`, `tasklist`). Ensure these are available and the script has permission. If it consistently fails, memory reporting will be limited.
    +-   **Visualization Missing**: Related to `benchmark_report.py` and its dependencies.
    +-   **Site Generation Issues**: Check permissions for creating `./test_site/`. Use `--skip-generation` if you want to manage the site manually.
+-   **Testing Against External Site**: Ensure the external site is running and use `--use-existing-site --port <port>`.
    diff --git a/tests/memory/benchmark_report.py b/tests/memory/benchmark_report.py
    new file mode 100755
    index 00000000..a634f997
    --- /dev/null
    +++ b/tests/memory/benchmark_report.py
    @@ -0,0 +1,887 @@
    +#!/usr/bin/env python3
    +"""
    +Benchmark reporting tool for Crawl4AI stress tests.
    +Generates visual reports and comparisons between test runs.
    +"""
    +
    +import os
    +import json
    +import glob
    +import argparse
    +import sys
    +from datetime import datetime
    +from pathlib import Path
    +from rich.console import Console
    +from rich.table import Table
    +from rich.panel import Panel
    +
    +# Initialize rich console
    +console = Console()
    +
    +# Try to import optional visualization dependencies
    +VISUALIZATION_AVAILABLE = True
    +try:
    +    import pandas as pd
    +    import matplotlib.pyplot as plt
    +    import matplotlib as mpl
    +    import numpy as np
    +    import seaborn as sns
    +except ImportError:
    +    VISUALIZATION_AVAILABLE = False
    +    console.print("[yellow]Warning: Visualization dependencies not found. Install with:[/yellow]")
    +    console.print("[yellow]pip install pandas matplotlib seaborn[/yellow]")
    +    console.print("[yellow]Only text-based reports will be generated.[/yellow]")
    +
    +# Configure plotting if available
    +if VISUALIZATION_AVAILABLE:
    +    # Set plot style for dark theme
    +    plt.style.use('dark_background')
    +    sns.set_theme(style="darkgrid")
    +    
    +    # Custom color palette based on Nord theme
    +    nord_palette = ["#88c0d0", "#81a1c1", "#a3be8c", "#ebcb8b", "#bf616a", "#b48ead", "#5e81ac"]
    +    sns.set_palette(nord_palette)
    +
    +class BenchmarkReporter:
    +    """Generates visual reports and comparisons for Crawl4AI stress tests."""
    +    
    +    def __init__(self, reports_dir="reports", output_dir="benchmark_reports"):
    +        """Initialize the benchmark reporter.
    +        
    +        Args:
    +            reports_dir: Directory containing test result files
    +            output_dir: Directory to save generated reports
    +        """
    +        self.reports_dir = Path(reports_dir)
    +        self.output_dir = Path(output_dir)
    +        self.output_dir.mkdir(parents=True, exist_ok=True)
    +        
    +        # Configure matplotlib if available
    +        if VISUALIZATION_AVAILABLE:
    +            # Ensure the matplotlib backend works in headless environments
    +            mpl.use('Agg')
    +            
    +            # Set up styling for plots with dark theme
    +            mpl.rcParams['figure.figsize'] = (12, 8)
    +            mpl.rcParams['font.size'] = 12
    +            mpl.rcParams['axes.labelsize'] = 14
    +            mpl.rcParams['axes.titlesize'] = 16
    +            mpl.rcParams['xtick.labelsize'] = 12
    +            mpl.rcParams['ytick.labelsize'] = 12
    +            mpl.rcParams['legend.fontsize'] = 12
    +            mpl.rcParams['figure.facecolor'] = '#1e1e1e'
    +            mpl.rcParams['axes.facecolor'] = '#2e3440'
    +            mpl.rcParams['savefig.facecolor'] = '#1e1e1e'
    +            mpl.rcParams['text.color'] = '#e0e0e0'
    +            mpl.rcParams['axes.labelcolor'] = '#e0e0e0'
    +            mpl.rcParams['xtick.color'] = '#e0e0e0'
    +            mpl.rcParams['ytick.color'] = '#e0e0e0'
    +            mpl.rcParams['grid.color'] = '#444444'
    +            mpl.rcParams['figure.edgecolor'] = '#444444'
    +        
    +    def load_test_results(self, limit=None):
    +        """Load all test results from the reports directory.
    +        
    +        Args:
    +            limit: Optional limit on number of most recent tests to load
    +            
    +        Returns:
    +            Dictionary mapping test IDs to result data
    +        """
    +        result_files = glob.glob(str(self.reports_dir / "test_results_*.json"))
    +        
    +        # Sort files by modification time (newest first)
    +        result_files.sort(key=os.path.getmtime, reverse=True)
    +        
    +        if limit:
    +            result_files = result_files[:limit]
    +        
    +        results = {}
    +        for file_path in result_files:
    +            try:
    +                with open(file_path, 'r') as f:
    +                    data = json.load(f)
    +                    test_id = data.get('test_id')
    +                    if test_id:
    +                        results[test_id] = data
    +                        
    +                        # Try to load the corresponding memory samples
    +                        csv_path = self.reports_dir / f"memory_samples_{test_id}.csv"
+                        if VISUALIZATION_AVAILABLE and csv_path.exists():
    +                            try:
    +                                memory_df = pd.read_csv(csv_path)
    +                                results[test_id]['memory_samples'] = memory_df
    +                            except Exception as e:
    +                                console.print(f"[yellow]Warning: Could not load memory samples for {test_id}: {e}[/yellow]")
    +            except Exception as e:
    +                console.print(f"[red]Error loading {file_path}: {e}[/red]")
    +        
    +        console.print(f"Loaded {len(results)} test results")
    +        return results
    +    
    +    def generate_summary_table(self, results):
    +        """Generate a summary table of test results.
    +        
    +        Args:
    +            results: Dictionary mapping test IDs to result data
    +            
    +        Returns:
    +            Rich Table object
    +        """
    +        table = Table(title="Crawl4AI Stress Test Summary", show_header=True)
    +        
    +        # Define columns
    +        table.add_column("Test ID", style="cyan")
    +        table.add_column("Date", style="bright_green")
    +        table.add_column("URLs", justify="right")
    +        table.add_column("Workers", justify="right")
    +        table.add_column("Success %", justify="right")
    +        table.add_column("Time (s)", justify="right")
    +        table.add_column("Mem Growth", justify="right")
    +        table.add_column("URLs/sec", justify="right")
    +        
    +        # Add rows
    +        for test_id, data in sorted(results.items(), key=lambda x: x[0], reverse=True):
    +            # Parse timestamp from test_id
    +            try:
    +                date_str = datetime.strptime(test_id, "%Y%m%d_%H%M%S").strftime("%Y-%m-%d %H:%M")
    +            except:
    +                date_str = "Unknown"
    +            
    +            # Calculate success percentage
    +            total_urls = data.get('url_count', 0)
    +            successful = data.get('successful_urls', 0)
    +            success_pct = (successful / total_urls * 100) if total_urls > 0 else 0
    +            
    +            # Calculate memory growth if available
    +            mem_growth = "N/A"
    +            if 'memory_samples' in data:
    +                samples = data['memory_samples']
    +                if len(samples) >= 2:
    +                    # Try to extract numeric values from memory_info strings
    +                    try:
    +                        first_mem = float(samples.iloc[0]['memory_info'].split()[0])
    +                        last_mem = float(samples.iloc[-1]['memory_info'].split()[0])
    +                        mem_growth = f"{last_mem - first_mem:.1f} MB"
    +                    except:
    +                        pass
    +            
    +            # Calculate URLs per second
    +            time_taken = data.get('total_time_seconds', 0)
    +            urls_per_sec = total_urls / time_taken if time_taken > 0 else 0
    +            
    +            table.add_row(
    +                test_id,
    +                date_str,
    +                str(total_urls),
    +                str(data.get('workers', 'N/A')),
    +                f"{success_pct:.1f}%",
    +                f"{data.get('total_time_seconds', 0):.2f}",
    +                mem_growth,
    +                f"{urls_per_sec:.1f}"
    +            )
    +        
    +        return table
    +    
    +    def generate_performance_chart(self, results, output_file=None):
    +        """Generate a performance comparison chart.
    +        
    +        Args:
    +            results: Dictionary mapping test IDs to result data
    +            output_file: File path to save the chart
    +            
    +        Returns:
    +            Path to the saved chart file or None if visualization is not available
    +        """
    +        if not VISUALIZATION_AVAILABLE:
    +            console.print("[yellow]Skipping performance chart - visualization dependencies not available[/yellow]")
    +            return None
    +            
    +        # Extract relevant data
    +        data = []
    +        for test_id, result in results.items():
    +            urls = result.get('url_count', 0)
    +            workers = result.get('workers', 0)
    +            time_taken = result.get('total_time_seconds', 0)
    +            urls_per_sec = urls / time_taken if time_taken > 0 else 0
    +            
    +            # Parse timestamp from test_id for sorting
    +            try:
    +                timestamp = datetime.strptime(test_id, "%Y%m%d_%H%M%S")
    +                data.append({
    +                    'test_id': test_id,
    +                    'timestamp': timestamp,
    +                    'urls': urls,
    +                    'workers': workers,
    +                    'time_seconds': time_taken,
    +                    'urls_per_sec': urls_per_sec
    +                })
    +            except:
    +                console.print(f"[yellow]Warning: Could not parse timestamp from {test_id}[/yellow]")
    +        
    +        if not data:
    +            console.print("[yellow]No valid data for performance chart[/yellow]")
    +            return None
    +        
    +        # Convert to DataFrame and sort by timestamp
    +        df = pd.DataFrame(data)
    +        df = df.sort_values('timestamp')
    +        
    +        # Create the plot
    +        fig, ax1 = plt.subplots(figsize=(12, 6))
    +        
    +        # Plot URLs per second as bars with properly set x-axis
    +        x_pos = range(len(df['test_id']))
    +        bars = ax1.bar(x_pos, df['urls_per_sec'], color='#88c0d0', alpha=0.8)
    +        ax1.set_ylabel('URLs per Second', color='#88c0d0')
    +        ax1.tick_params(axis='y', labelcolor='#88c0d0')
    +        
    +        # Properly set x-axis labels
    +        ax1.set_xticks(x_pos)
    +        ax1.set_xticklabels(df['test_id'].tolist(), rotation=45, ha='right')
    +        
    +        # Add worker count as text on each bar
    +        for i, bar in enumerate(bars):
    +            height = bar.get_height()
    +            workers = df.iloc[i]['workers']
    +            ax1.text(i, height + 0.1,
    +                    f'W: {workers}', ha='center', va='bottom', fontsize=9, color='#e0e0e0')
    +        
    +        # Add a second y-axis for total URLs
    +        ax2 = ax1.twinx()
    +        ax2.plot(x_pos, df['urls'], '-', color='#bf616a', alpha=0.8, markersize=6, marker='o')
    +        ax2.set_ylabel('Total URLs', color='#bf616a')
    +        ax2.tick_params(axis='y', labelcolor='#bf616a')
    +        
    +        # Set title and layout
    +        plt.title('Crawl4AI Performance Benchmarks')
    +        plt.tight_layout()
    +        
    +        # Save the figure
    +        if output_file is None:
    +            output_file = self.output_dir / "performance_comparison.png"
    +        plt.savefig(output_file, dpi=100, bbox_inches='tight')
    +        plt.close()
    +        
    +        return output_file
    +    
    +    def generate_memory_charts(self, results, output_prefix=None):
    +        """Generate memory usage charts for each test.
    +        
    +        Args:
    +            results: Dictionary mapping test IDs to result data
    +            output_prefix: Prefix for output file names
    +            
    +        Returns:
    +            List of paths to the saved chart files
    +        """
    +        if not VISUALIZATION_AVAILABLE:
    +            console.print("[yellow]Skipping memory charts - visualization dependencies not available[/yellow]")
    +            return []
    +            
    +        output_files = []
    +        
    +        for test_id, result in results.items():
    +            if 'memory_samples' not in result:
    +                continue
    +            
    +            memory_df = result['memory_samples']
    +            
    +            # Check if we have enough data points
    +            if len(memory_df) < 2:
    +                continue
    +            
    +            # Try to extract numeric values from memory_info strings
    +            try:
    +                memory_values = []
    +                for mem_str in memory_df['memory_info']:
    +                    # Extract the number from strings like "142.8 MB"
    +                    value = float(mem_str.split()[0])
    +                    memory_values.append(value)
    +                
    +                memory_df['memory_mb'] = memory_values
    +            except Exception as e:
    +                console.print(f"[yellow]Could not parse memory values for {test_id}: {e}[/yellow]")
    +                continue
    +            
    +            # Create the plot
    +            plt.figure(figsize=(10, 6))
    +            
    +            # Plot memory usage over time
    +            plt.plot(memory_df['elapsed_seconds'], memory_df['memory_mb'], 
    +                     color='#88c0d0', marker='o', linewidth=2, markersize=4)
    +            
    +            # Add annotations for chunk processing
    +            chunk_size = result.get('chunk_size', 0)
    +            url_count = result.get('url_count', 0)
    +            if chunk_size > 0 and url_count > 0:
    +                # Estimate chunk processing times
    +                num_chunks = (url_count + chunk_size - 1) // chunk_size  # Ceiling division
    +                total_time = result.get('total_time_seconds', memory_df['elapsed_seconds'].max())
    +                chunk_times = np.linspace(0, total_time, num_chunks + 1)[1:]
    +                
    +                for i, time_point in enumerate(chunk_times):
    +                    if time_point <= memory_df['elapsed_seconds'].max():
    +                        plt.axvline(x=time_point, color='#4c566a', linestyle='--', alpha=0.6)
    +                        plt.text(time_point, memory_df['memory_mb'].min(), f'Chunk {i+1}', 
    +                                rotation=90, verticalalignment='bottom', fontsize=8, color='#e0e0e0')
    +            
    +            # Set labels and title
    +            plt.xlabel('Elapsed Time (seconds)', color='#e0e0e0')
    +            plt.ylabel('Memory Usage (MB)', color='#e0e0e0')
    +            plt.title(f'Memory Usage During Test {test_id}\n({url_count} URLs, {result.get("workers", "?")} Workers)', 
    +                      color='#e0e0e0')
    +            
    +            # Add grid and set y-axis to start from zero
    +            plt.grid(True, alpha=0.3, color='#4c566a')
    +            
    +            # Add test metadata as text
    +            info_text = (
    +                f"URLs: {url_count}\n"
    +                f"Workers: {result.get('workers', 'N/A')}\n"
    +                f"Chunk Size: {result.get('chunk_size', 'N/A')}\n"
    +                f"Total Time: {result.get('total_time_seconds', 0):.2f}s\n"
    +            )
    +            
    +            # Calculate memory growth
    +            if len(memory_df) >= 2:
    +                first_mem = memory_df.iloc[0]['memory_mb']
    +                last_mem = memory_df.iloc[-1]['memory_mb']
    +                growth = last_mem - first_mem
    +                growth_rate = growth / result.get('total_time_seconds', 1)
    +                
    +                info_text += f"Memory Growth: {growth:.1f} MB\n"
    +                info_text += f"Growth Rate: {growth_rate:.2f} MB/s"
    +            
    +            plt.figtext(0.02, 0.02, info_text, fontsize=9, color='#e0e0e0',
    +                       bbox=dict(facecolor='#3b4252', alpha=0.8, edgecolor='#4c566a'))
    +            
    +            # Save the figure
    +            if output_prefix is None:
    +                output_file = self.output_dir / f"memory_chart_{test_id}.png"
    +            else:
    +                output_file = Path(f"{output_prefix}_memory_{test_id}.png")
    +                
    +            plt.tight_layout()
    +            plt.savefig(output_file, dpi=100, bbox_inches='tight')
    +            plt.close()
    +            
    +            output_files.append(output_file)
    +        
    +        return output_files
    +    
    +    def generate_comparison_report(self, results, title=None, output_file=None):
    +        """Generate a comprehensive comparison report of multiple test runs.
    +        
    +        Args:
    +            results: Dictionary mapping test IDs to result data
    +            title: Optional title for the report
    +            output_file: File path to save the report
    +            
    +        Returns:
    +            Path to the saved report file
    +        """
    +        if not results:
    +            console.print("[yellow]No results to generate comparison report[/yellow]")
    +            return None
    +        
    +        if output_file is None:
    +            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    +            output_file = self.output_dir / f"comparison_report_{timestamp}.html"
    +        
    +        # Create data for the report
    +        rows = []
    +        for test_id, data in results.items():
    +            # Calculate metrics
    +            urls = data.get('url_count', 0)
    +            workers = data.get('workers', 0)
    +            successful = data.get('successful_urls', 0)
    +            failed = data.get('failed_urls', 0)
    +            time_seconds = data.get('total_time_seconds', 0)
    +            
    +            # Calculate additional metrics
    +            success_rate = (successful / urls) * 100 if urls > 0 else 0
    +            urls_per_second = urls / time_seconds if time_seconds > 0 else 0
    +            urls_per_worker = urls / workers if workers > 0 else 0
    +            
    +            # Calculate memory growth if available
    +            mem_start = None
    +            mem_end = None
    +            mem_growth = None
    +            if 'memory_samples' in data:
    +                samples = data['memory_samples']
    +                if len(samples) >= 2:
    +                    try:
    +                        first_mem = float(samples.iloc[0]['memory_info'].split()[0])
    +                        last_mem = float(samples.iloc[-1]['memory_info'].split()[0])
    +                        mem_start = first_mem
    +                        mem_end = last_mem
    +                        mem_growth = last_mem - first_mem
    +                    except:
    +                        pass
    +            
    +            # Parse timestamp from test_id
    +            try:
    +                timestamp = datetime.strptime(test_id, "%Y%m%d_%H%M%S")
    +            except:
    +                timestamp = None
    +            
    +            rows.append({
    +                'test_id': test_id,
    +                'timestamp': timestamp,
    +                'date': timestamp.strftime("%Y-%m-%d %H:%M:%S") if timestamp else "Unknown",
    +                'urls': urls,
    +                'workers': workers,
    +                'chunk_size': data.get('chunk_size', 0),
    +                'successful': successful,
    +                'failed': failed,
    +                'success_rate': success_rate,
    +                'time_seconds': time_seconds,
    +                'urls_per_second': urls_per_second,
    +                'urls_per_worker': urls_per_worker,
    +                'memory_start': mem_start,
    +                'memory_end': mem_end,
    +                'memory_growth': mem_growth
    +            })
    +        
    +        # Sort data by timestamp if possible
    +        if VISUALIZATION_AVAILABLE:
    +            # Convert to DataFrame and sort by timestamp
    +            df = pd.DataFrame(rows)
    +            if 'timestamp' in df.columns and not df['timestamp'].isna().all():
    +                df = df.sort_values('timestamp', ascending=False)
    +        else:
    +            # Simple sorting without pandas
+            rows.sort(key=lambda x: x.get('timestamp') or datetime.min, reverse=True)
    +            df = None
    +        
    +        # Generate HTML report
    +        html = []
    +        html.append('')
    +        html.append('')
    +        html.append('')
    +        html.append('')
    +        html.append('')
    +        html.append(f'{title or "Crawl4AI Benchmark Comparison"}')
    +        html.append('')
    +        html.append('')
    +        html.append('')
    +        
    +        # Header
    +        html.append(f'

    {title or "Crawl4AI Benchmark Comparison"}

    ') + html.append(f'

    Report generated on {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}

    ') + + # Summary section + html.append('
    ') + html.append('

    Summary

    ') + html.append('

    This report compares the performance of Crawl4AI across multiple test runs.

    ') + + # Summary metrics + data_available = (VISUALIZATION_AVAILABLE and df is not None and not df.empty) or (not VISUALIZATION_AVAILABLE and len(rows) > 0) + if data_available: + # Get the latest test data + if VISUALIZATION_AVAILABLE and df is not None and not df.empty: + latest_test = df.iloc[0] + latest_id = latest_test['test_id'] + else: + latest_test = rows[0] # First row (already sorted by timestamp) + latest_id = latest_test['test_id'] + + html.append('

    Latest Test Results

    ') + html.append('
      ') + html.append(f'
    • Test ID: {latest_id}
    • ') + html.append(f'
    • Date: {latest_test["date"]}
    • ') + html.append(f'
    • URLs: {latest_test["urls"]}
    • ') + html.append(f'
    • Workers: {latest_test["workers"]}
    • ') + html.append(f'
    • Success Rate: {latest_test["success_rate"]:.1f}%
    • ') + html.append(f'
    • Time: {latest_test["time_seconds"]:.2f} seconds
    • ') + html.append(f'
    • Performance: {latest_test["urls_per_second"]:.1f} URLs/second
    • ') + + # Check memory growth (handle both pandas and dict mode) + memory_growth_available = False + if VISUALIZATION_AVAILABLE and df is not None: + if pd.notna(latest_test["memory_growth"]): + html.append(f'
    • Memory Growth: {latest_test["memory_growth"]:.1f} MB
    • ') + memory_growth_available = True + else: + if latest_test["memory_growth"] is not None: + html.append(f'
    • Memory Growth: {latest_test["memory_growth"]:.1f} MB
    • ') + memory_growth_available = True + + html.append('
    ') + + # If we have more than one test, show trend + if (VISUALIZATION_AVAILABLE and df is not None and len(df) > 1) or (not VISUALIZATION_AVAILABLE and len(rows) > 1): + if VISUALIZATION_AVAILABLE and df is not None: + prev_test = df.iloc[1] + else: + prev_test = rows[1] + + # Calculate performance change + perf_change = ((latest_test["urls_per_second"] / prev_test["urls_per_second"]) - 1) * 100 if prev_test["urls_per_second"] > 0 else 0 + + status_class = "" + if perf_change > 5: + status_class = "status-good" + elif perf_change < -5: + status_class = "status-bad" + + html.append('

    Performance Trend

    ') + html.append('
      ') + html.append(f'
    • Performance Change: {perf_change:+.1f}% compared to previous test
    • ') + + # Memory trend if available + memory_trend_available = False + if VISUALIZATION_AVAILABLE and df is not None: + if pd.notna(latest_test["memory_growth"]) and pd.notna(prev_test["memory_growth"]): + mem_change = latest_test["memory_growth"] - prev_test["memory_growth"] + memory_trend_available = True + else: + if latest_test["memory_growth"] is not None and prev_test["memory_growth"] is not None: + mem_change = latest_test["memory_growth"] - prev_test["memory_growth"] + memory_trend_available = True + + if memory_trend_available: + mem_status = "" + if mem_change < -1: # Improved (less growth) + mem_status = "status-good" + elif mem_change > 1: # Worse (more growth) + mem_status = "status-bad" + + html.append(f'
    • Memory Trend: {mem_change:+.1f} MB change in memory growth
    • ') + + html.append('
    ') + + html.append('
    ') + + # Generate performance chart if visualization is available + if VISUALIZATION_AVAILABLE: + perf_chart = self.generate_performance_chart(results) + if perf_chart: + html.append('
    ') + html.append('

    Performance Comparison

    ') + html.append(f'Performance Comparison Chart') + html.append('
    ') + else: + html.append('
    ') + html.append('

    Performance Comparison

    ') + html.append('

    Charts not available - install visualization dependencies (pandas, matplotlib, seaborn) to enable.

    ') + html.append('
    ') + + # Generate memory charts if visualization is available + if VISUALIZATION_AVAILABLE: + memory_charts = self.generate_memory_charts(results) + if memory_charts: + html.append('
    ') + html.append('

    Memory Usage

    ') + + for chart in memory_charts: + test_id = chart.stem.split('_')[-1] + html.append(f'

    Test {test_id}

    ') + html.append(f'Memory Chart for {test_id}') + + html.append('
    ') + else: + html.append('
    ') + html.append('

    Memory Usage

    ') + html.append('

    Charts not available - install visualization dependencies (pandas, matplotlib, seaborn) to enable.

    ') + html.append('
    ') + + # Detailed results table + html.append('

    Detailed Results

    ') + + # Add the results as an HTML table + html.append('') + + # Table headers + html.append('') + for col in ['Test ID', 'Date', 'URLs', 'Workers', 'Success %', 'Time (s)', 'URLs/sec', 'Mem Growth (MB)']: + html.append(f'') + html.append('') + + # Table rows - handle both pandas DataFrame and list of dicts + if VISUALIZATION_AVAILABLE and df is not None: + # Using pandas DataFrame + for _, row in df.iterrows(): + html.append('') + html.append(f'') + html.append(f'') + html.append(f'') + html.append(f'') + html.append(f'') + html.append(f'') + html.append(f'') + + # Memory growth cell + if pd.notna(row["memory_growth"]): + html.append(f'') + else: + html.append('') + + html.append('') + else: + # Using list of dicts (when pandas is not available) + for row in rows: + html.append('') + html.append(f'') + html.append(f'') + html.append(f'') + html.append(f'') + html.append(f'') + html.append(f'') + html.append(f'') + + # Memory growth cell + if row["memory_growth"] is not None: + html.append(f'') + else: + html.append('') + + html.append('') + + html.append('
    {col}
    {row["test_id"]}{row["date"]}{row["urls"]}{row["workers"]}{row["success_rate"]:.1f}%{row["time_seconds"]:.2f}{row["urls_per_second"]:.1f}{row["memory_growth"]:.1f}N/A
    {row["test_id"]}{row["date"]}{row["urls"]}{row["workers"]}{row["success_rate"]:.1f}%{row["time_seconds"]:.2f}{row["urls_per_second"]:.1f}{row["memory_growth"]:.1f}N/A
    ') + + # Conclusion section + html.append('
    ') + html.append('

    Conclusion

    ') + + if VISUALIZATION_AVAILABLE and df is not None and not df.empty: + # Using pandas for statistics (when available) + # Calculate some overall statistics + avg_urls_per_sec = df['urls_per_second'].mean() + max_urls_per_sec = df['urls_per_second'].max() + + # Determine if we have a trend + if len(df) > 1: + trend_data = df.sort_values('timestamp') + first_perf = trend_data.iloc[0]['urls_per_second'] + last_perf = trend_data.iloc[-1]['urls_per_second'] + + perf_change = ((last_perf / first_perf) - 1) * 100 if first_perf > 0 else 0 + + if perf_change > 10: + trend_desc = "significantly improved" + trend_class = "status-good" + elif perf_change > 5: + trend_desc = "improved" + trend_class = "status-good" + elif perf_change < -10: + trend_desc = "significantly decreased" + trend_class = "status-bad" + elif perf_change < -5: + trend_desc = "decreased" + trend_class = "status-bad" + else: + trend_desc = "remained stable" + trend_class = "" + + html.append(f'

    Overall performance has {trend_desc} over the test period.

    ') + + html.append(f'

    Average throughput: {avg_urls_per_sec:.1f} URLs/second

    ') + html.append(f'

    Maximum throughput: {max_urls_per_sec:.1f} URLs/second

    ') + + # Memory leak assessment + if 'memory_growth' in df.columns and not df['memory_growth'].isna().all(): + avg_growth = df['memory_growth'].mean() + max_growth = df['memory_growth'].max() + + if avg_growth < 5: + leak_assessment = "No significant memory leaks detected" + leak_class = "status-good" + elif avg_growth < 10: + leak_assessment = "Minor memory growth observed" + leak_class = "status-warning" + else: + leak_assessment = "Potential memory leak detected" + leak_class = "status-bad" + + html.append(f'

    {leak_assessment}. Average memory growth: {avg_growth:.1f} MB per test.

    ') + else: + # Manual calculations without pandas + if rows: + # Calculate average and max throughput + total_urls_per_sec = sum(row['urls_per_second'] for row in rows) + avg_urls_per_sec = total_urls_per_sec / len(rows) + max_urls_per_sec = max(row['urls_per_second'] for row in rows) + + html.append(f'

    Average throughput: {avg_urls_per_sec:.1f} URLs/second

    ') + html.append(f'

    Maximum throughput: {max_urls_per_sec:.1f} URLs/second

    ') + + # Memory assessment (simplified without pandas) + growth_values = [row['memory_growth'] for row in rows if row['memory_growth'] is not None] + if growth_values: + avg_growth = sum(growth_values) / len(growth_values) + + if avg_growth < 5: + leak_assessment = "No significant memory leaks detected" + leak_class = "status-good" + elif avg_growth < 10: + leak_assessment = "Minor memory growth observed" + leak_class = "status-warning" + else: + leak_assessment = "Potential memory leak detected" + leak_class = "status-bad" + + html.append(f'

    {leak_assessment}. Average memory growth: {avg_growth:.1f} MB per test.

    ') + else: + html.append('

    No test data available for analysis.

    ') + + html.append('
    ') + + # Footer + html.append('
    ') + html.append('

    Generated by Crawl4AI Benchmark Reporter

    ') + html.append('
    ') + + html.append('') + html.append('') + + # Write the HTML file + with open(output_file, 'w') as f: + f.write('\n'.join(html)) + + # Print a clickable link for terminals that support it (iTerm, VS Code, etc.) + file_url = f"file://{os.path.abspath(output_file)}" + console.print(f"[green]Comparison report saved to: {output_file}[/green]") + console.print(f"[blue underline]Click to open report: {file_url}[/blue underline]") + return output_file + + def run(self, limit=None, output_file=None): + """Generate a full benchmark report. + + Args: + limit: Optional limit on number of most recent tests to include + output_file: Optional output file path + + Returns: + Path to the generated report file + """ + # Load test results + results = self.load_test_results(limit=limit) + + if not results: + console.print("[yellow]No test results found. Run some tests first.[/yellow]") + return None + + # Generate and display summary table + summary_table = self.generate_summary_table(results) + console.print(summary_table) + + # Generate comparison report + title = f"Crawl4AI Benchmark Report ({len(results)} test runs)" + report_file = self.generate_comparison_report(results, title=title, output_file=output_file) + + if report_file: + console.print(f"[bold green]Report generated successfully: {report_file}[/bold green]") + return report_file + else: + console.print("[bold red]Failed to generate report[/bold red]") + return None + + +def main(): + """Main entry point for the benchmark reporter.""" + parser = argparse.ArgumentParser(description="Generate benchmark reports for Crawl4AI stress tests") + + parser.add_argument("--reports-dir", type=str, default="reports", + help="Directory containing test result files") + parser.add_argument("--output-dir", type=str, default="benchmark_reports", + help="Directory to save generated reports") + parser.add_argument("--limit", type=int, default=None, + help="Limit to most recent N test results") + parser.add_argument("--output-file", type=str, default=None, + help="Custom output file path for the report") + + args = parser.parse_args() + + # Create the benchmark reporter + reporter = BenchmarkReporter(reports_dir=args.reports_dir, output_dir=args.output_dir) + + # Generate the report + report_file = reporter.run(limit=args.limit, output_file=args.output_file) + + if report_file: + print(f"Report generated at: {report_file}") + return 0 + else: + print("Failed to generate report") + return 1 + + +if __name__ == "__main__": + import sys + sys.exit(main()) \ No newline at end of file diff --git a/tests/memory/requirements.txt b/tests/memory/requirements.txt new file mode 100644 index 00000000..230e0e1f --- /dev/null +++ b/tests/memory/requirements.txt @@ -0,0 +1,4 @@ +pandas>=1.5.0 +matplotlib>=3.5.0 +seaborn>=0.12.0 +rich>=12.0.0 \ No newline at end of file diff --git a/tests/memory/run_benchmark.py b/tests/memory/run_benchmark.py new file mode 100755 index 00000000..1e110ddf --- /dev/null +++ b/tests/memory/run_benchmark.py @@ -0,0 +1,259 @@ +#!/usr/bin/env python3 +""" +Run a complete Crawl4AI benchmark test using test_stress_sdk.py and generate a report. 
+""" + +import sys +import os +import glob +import argparse +import subprocess +import time +from datetime import datetime + +from rich.console import Console +from rich.text import Text + +console = Console() + +# Updated TEST_CONFIGS to use max_sessions +TEST_CONFIGS = { + "quick": {"urls": 50, "max_sessions": 4, "chunk_size": 10, "description": "Quick test (50 URLs, 4 sessions)"}, + "small": {"urls": 100, "max_sessions": 8, "chunk_size": 20, "description": "Small test (100 URLs, 8 sessions)"}, + "medium": {"urls": 500, "max_sessions": 16, "chunk_size": 50, "description": "Medium test (500 URLs, 16 sessions)"}, + "large": {"urls": 1000, "max_sessions": 32, "chunk_size": 100,"description": "Large test (1000 URLs, 32 sessions)"}, + "extreme": {"urls": 2000, "max_sessions": 64, "chunk_size": 200,"description": "Extreme test (2000 URLs, 64 sessions)"}, +} + +# Arguments to forward directly if present in custom_args +FORWARD_ARGS = { + "urls": "--urls", + "max_sessions": "--max-sessions", + "chunk_size": "--chunk-size", + "port": "--port", + "monitor_mode": "--monitor-mode", +} +# Boolean flags to forward if True +FORWARD_FLAGS = { + "stream": "--stream", + "use_rate_limiter": "--use-rate-limiter", + "keep_server_alive": "--keep-server-alive", + "use_existing_site": "--use-existing-site", + "skip_generation": "--skip-generation", + "keep_site": "--keep-site", + "clean_reports": "--clean-reports", # Note: clean behavior is handled here, but pass flag if needed + "clean_site": "--clean-site", # Note: clean behavior is handled here, but pass flag if needed +} + +def run_benchmark(config_name, custom_args=None, compare=True, clean=False): + """Runs the stress test and optionally the report generator.""" + if config_name not in TEST_CONFIGS and config_name != "custom": + console.print(f"[bold red]Unknown configuration: {config_name}[/bold red]") + return False + + # Print header + title = "Crawl4AI SDK Benchmark Test" + if config_name != "custom": + title += f" - {TEST_CONFIGS[config_name]['description']}" + else: + # Safely get custom args for title + urls = custom_args.get('urls', '?') if custom_args else '?' + sessions = custom_args.get('max_sessions', '?') if custom_args else '?' + title += f" - Custom ({urls} URLs, {sessions} sessions)" + + console.print(f"\n[bold blue]{title}[/bold blue]") + console.print("=" * (len(title) + 4)) # Adjust underline length + + console.print("\n[bold white]Preparing test...[/bold white]") + + # --- Command Construction --- + # Use the new script name + cmd = ["python", "test_stress_sdk.py"] + + # Apply config or custom args + args_to_use = {} + if config_name != "custom": + args_to_use = TEST_CONFIGS[config_name].copy() + # If custom args are provided (e.g., boolean flags), overlay them + if custom_args: + args_to_use.update(custom_args) + elif custom_args: # Custom config + args_to_use = custom_args.copy() + + # Add arguments with values + for key, arg_name in FORWARD_ARGS.items(): + if key in args_to_use: + cmd.extend([arg_name, str(args_to_use[key])]) + + # Add boolean flags + for key, flag_name in FORWARD_FLAGS.items(): + if args_to_use.get(key, False): # Check if key exists and is True + # Special handling for clean flags - apply locally, don't forward? + # Decide if test_stress_sdk.py also needs --clean flags or if run_benchmark handles it. + # For now, let's assume run_benchmark handles cleaning based on its own --clean flag. + # We'll forward other flags. 
+ if key not in ["clean_reports", "clean_site"]: + cmd.append(flag_name) + + # Handle the top-level --clean flag for run_benchmark + if clean: + # Pass clean flags to the stress test script as well, if needed + # This assumes test_stress_sdk.py also uses --clean-reports and --clean-site + cmd.append("--clean-reports") + cmd.append("--clean-site") + console.print("[yellow]Applying --clean: Cleaning reports and site before test.[/yellow]") + # Actual cleaning logic might reside here or be delegated entirely + + console.print(f"\n[bold white]Running stress test:[/bold white] {' '.join(cmd)}") + start = time.time() + + # Execute the stress test script + # Use Popen to stream output + try: + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, encoding='utf-8', errors='replace') + while True: + line = proc.stdout.readline() + if not line: + break + console.print(line.rstrip()) # Print line by line + proc.wait() # Wait for the process to complete + except FileNotFoundError: + console.print(f"[bold red]Error: Script 'test_stress_sdk.py' not found. Make sure it's in the correct directory.[/bold red]") + return False + except Exception as e: + console.print(f"[bold red]Error running stress test subprocess: {e}[/bold red]") + return False + + + if proc.returncode != 0: + console.print(f"[bold red]Stress test failed with exit code {proc.returncode}[/bold red]") + return False + + duration = time.time() - start + console.print(f"[bold green]Stress test completed in {duration:.1f} seconds[/bold green]") + + # --- Report Generation (Optional) --- + if compare: + # Assuming benchmark_report.py exists and works with the generated reports + report_script = "benchmark_report.py" # Keep configurable if needed + report_cmd = ["python", report_script] + console.print(f"\n[bold white]Generating benchmark report: {' '.join(report_cmd)}[/bold white]") + + # Run the report command and capture output + try: + report_proc = subprocess.run(report_cmd, capture_output=True, text=True, check=False, encoding='utf-8', errors='replace') # Use check=False to handle potential errors + + # Print the captured output from benchmark_report.py + if report_proc.stdout: + console.print("\n" + report_proc.stdout) + if report_proc.stderr: + console.print("[yellow]Report generator stderr:[/yellow]\n" + report_proc.stderr) + + if report_proc.returncode != 0: + console.print(f"[bold yellow]Benchmark report generation script '{report_script}' failed with exit code {report_proc.returncode}[/bold yellow]") + # Don't return False here, test itself succeeded + else: + console.print(f"[bold green]Benchmark report script '{report_script}' completed.[/bold green]") + + # Find and print clickable links to the reports + # Assuming reports are saved in 'benchmark_reports' by benchmark_report.py + report_dir = "benchmark_reports" + if os.path.isdir(report_dir): + report_files = glob.glob(os.path.join(report_dir, "comparison_report_*.html")) + if report_files: + try: + latest_report = max(report_files, key=os.path.getctime) + report_path = os.path.abspath(latest_report) + report_url = pathlib.Path(report_path).as_uri() # Better way to create file URI + console.print(f"[bold cyan]Click to open report: [link={report_url}]{report_url}[/link][/bold cyan]") + except Exception as e: + console.print(f"[yellow]Could not determine latest report: {e}[/yellow]") + + chart_files = glob.glob(os.path.join(report_dir, "memory_chart_*.png")) + if chart_files: + try: + latest_chart = max(chart_files, key=os.path.getctime) + 
chart_path = os.path.abspath(latest_chart) + chart_url = pathlib.Path(chart_path).as_uri() + console.print(f"[cyan]Memory chart: [link={chart_url}]{chart_url}[/link][/cyan]") + except Exception as e: + console.print(f"[yellow]Could not determine latest chart: {e}[/yellow]") + else: + console.print(f"[yellow]Benchmark report directory '{report_dir}' not found. Cannot link reports.[/yellow]") + + except FileNotFoundError: + console.print(f"[bold red]Error: Report script '{report_script}' not found.[/bold red]") + except Exception as e: + console.print(f"[bold red]Error running report generation subprocess: {e}[/bold red]") + + + # Prompt to exit + console.print("\n[bold green]Benchmark run finished. Press Enter to exit.[/bold green]") + try: + input() # Wait for user input + except EOFError: + pass # Handle case where input is piped or unavailable + + return True + +def main(): + parser = argparse.ArgumentParser(description="Run a Crawl4AI SDK benchmark test and generate a report") + + # --- Arguments --- + parser.add_argument("config", choices=list(TEST_CONFIGS) + ["custom"], + help="Test configuration: quick, small, medium, large, extreme, or custom") + + # Arguments for 'custom' config or to override presets + parser.add_argument("--urls", type=int, help="Number of URLs") + parser.add_argument("--max-sessions", type=int, help="Max concurrent sessions (replaces --workers)") + parser.add_argument("--chunk-size", type=int, help="URLs per batch (for non-stream logging)") + parser.add_argument("--port", type=int, help="HTTP server port") + parser.add_argument("--monitor-mode", type=str, choices=["DETAILED", "AGGREGATED"], help="Monitor display mode") + + # Boolean flags / options + parser.add_argument("--stream", action="store_true", help="Enable streaming results (disables batch logging)") + parser.add_argument("--use-rate-limiter", action="store_true", help="Enable basic rate limiter") + parser.add_argument("--no-report", action="store_true", help="Skip generating comparison report") + parser.add_argument("--clean", action="store_true", help="Clean up reports and site before running") + parser.add_argument("--keep-server-alive", action="store_true", help="Keep HTTP server running after test") + parser.add_argument("--use-existing-site", action="store_true", help="Use existing site on specified port") + parser.add_argument("--skip-generation", action="store_true", help="Use existing site files without regenerating") + parser.add_argument("--keep-site", action="store_true", help="Keep generated site files after test") + # Removed url_level_logging as it's implicitly handled by stream/batch mode now + + args = parser.parse_args() + + custom_args = {} + + # Populate custom_args from explicit command-line args + if args.urls is not None: custom_args["urls"] = args.urls + if args.max_sessions is not None: custom_args["max_sessions"] = args.max_sessions + if args.chunk_size is not None: custom_args["chunk_size"] = args.chunk_size + if args.port is not None: custom_args["port"] = args.port + if args.monitor_mode is not None: custom_args["monitor_mode"] = args.monitor_mode + if args.stream: custom_args["stream"] = True + if args.use_rate_limiter: custom_args["use_rate_limiter"] = True + if args.keep_server_alive: custom_args["keep_server_alive"] = True + if args.use_existing_site: custom_args["use_existing_site"] = True + if args.skip_generation: custom_args["skip_generation"] = True + if args.keep_site: custom_args["keep_site"] = True + # Clean flags are handled by the 'clean' argument passed to 
run_benchmark + + # Validate custom config requirements + if args.config == "custom": + required_custom = ["urls", "max_sessions", "chunk_size"] + missing = [f"--{arg}" for arg in required_custom if arg not in custom_args] + if missing: + console.print(f"[bold red]Error: 'custom' config requires: {', '.join(missing)}[/bold red]") + return 1 + + success = run_benchmark( + config_name=args.config, + custom_args=custom_args, # Pass all collected custom args + compare=not args.no_report, + clean=args.clean + ) + return 0 if success else 1 + +if __name__ == "__main__": + sys.exit(main()) \ No newline at end of file diff --git a/tests/memory/test_stress_sdk.py b/tests/memory/test_stress_sdk.py new file mode 100644 index 00000000..8000690c --- /dev/null +++ b/tests/memory/test_stress_sdk.py @@ -0,0 +1,500 @@ +#!/usr/bin/env python3 +""" +Stress test for Crawl4AI's arun_many and dispatcher system. +This version uses a local HTTP server and focuses on testing +the SDK's ability to handle multiple URLs concurrently, with per-batch logging. +""" + +import asyncio +import os +import time +import pathlib +import random +import secrets +import argparse +import json +import sys +import subprocess +import signal +from typing import List, Dict, Optional, Union, AsyncGenerator +import shutil +from rich.console import Console + +# Crawl4AI components +from crawl4ai import ( + AsyncWebCrawler, + CrawlerRunConfig, + BrowserConfig, + MemoryAdaptiveDispatcher, + CrawlerMonitor, + DisplayMode, + CrawlResult, + RateLimiter, + CacheMode, +) + +# Constants +DEFAULT_SITE_PATH = "test_site" +DEFAULT_PORT = 8000 +DEFAULT_MAX_SESSIONS = 16 +DEFAULT_URL_COUNT = 100 +DEFAULT_CHUNK_SIZE = 10 # Define chunk size for batch logging +DEFAULT_REPORT_PATH = "reports" +DEFAULT_STREAM_MODE = False +DEFAULT_MONITOR_MODE = "DETAILED" + +# Initialize Rich console +console = Console() + +# --- SiteGenerator Class (Unchanged) --- +class SiteGenerator: + """Generates a local test site with heavy pages for stress testing.""" + + def __init__(self, site_path: str = DEFAULT_SITE_PATH, page_count: int = DEFAULT_URL_COUNT): + self.site_path = pathlib.Path(site_path) + self.page_count = page_count + self.images_dir = self.site_path / "images" + self.lorem_words = " ".join("lorem ipsum dolor sit amet " * 100).split() + + self.html_template = """ + + + Test Page {page_num} + + + +

    Test Page {page_num}

    + {paragraphs} + {images} + + +""" + + def generate_site(self) -> None: + self.site_path.mkdir(parents=True, exist_ok=True) + self.images_dir.mkdir(exist_ok=True) + console.print(f"Generating {self.page_count} test pages...") + for i in range(self.page_count): + paragraphs = "\n".join(f"

    {' '.join(random.choices(self.lorem_words, k=200))}

    " for _ in range(5)) + images = "\n".join(f'Random image {j}' for j in range(3)) + page_path = self.site_path / f"page_{i}.html" + page_path.write_text(self.html_template.format(page_num=i, paragraphs=paragraphs, images=images), encoding="utf-8") + if (i + 1) % (self.page_count // 10 or 1) == 0 or i == self.page_count - 1: + console.print(f"Generated {i+1}/{self.page_count} pages") + self._create_index_page() + console.print(f"[bold green]Successfully generated {self.page_count} test pages in [cyan]{self.site_path}[/cyan][/bold green]") + + def _create_index_page(self) -> None: + index_content = """Test Site Index
</title></head><body><h1>Test Site Index</h1><p>This is an automatically generated site for testing Crawl4AI.</p></body></html>
    """ + (self.site_path / "index.html").write_text(index_content, encoding="utf-8") + +# --- LocalHttpServer Class (Unchanged) --- +class LocalHttpServer: + """Manages a local HTTP server for serving test pages.""" + def __init__(self, site_path: str = DEFAULT_SITE_PATH, port: int = DEFAULT_PORT): + self.site_path = pathlib.Path(site_path) + self.port = port + self.process = None + + def start(self) -> None: + if not self.site_path.exists(): raise FileNotFoundError(f"Site directory {self.site_path} does not exist") + console.print(f"Attempting to start HTTP server in [cyan]{self.site_path}[/cyan] on port {self.port}...") + try: + cmd = ["python", "-m", "http.server", str(self.port)] + creationflags = 0; preexec_fn = None + if sys.platform == 'win32': creationflags = subprocess.CREATE_NEW_PROCESS_GROUP + self.process = subprocess.Popen(cmd, cwd=str(self.site_path), stdout=subprocess.PIPE, stderr=subprocess.PIPE, creationflags=creationflags) + time.sleep(1.5) + if self.is_running(): console.print(f"[bold green]HTTP server started successfully (PID: {self.process.pid})[/bold green]") + else: + console.print("[bold red]Failed to start HTTP server. Checking logs...[/bold red]") + stdout, stderr = self.process.communicate(); print(stdout.decode(errors='ignore')); print(stderr.decode(errors='ignore')) + self.stop(); raise RuntimeError("HTTP server failed to start.") + except Exception as e: console.print(f"[bold red]Error starting HTTP server: {str(e)}[/bold red]"); self.stop(); raise + + def stop(self) -> None: + if self.process and self.is_running(): + console.print(f"Stopping HTTP server (PID: {self.process.pid})...") + try: + if sys.platform == 'win32': self.process.send_signal(signal.CTRL_BREAK_EVENT); time.sleep(0.5) + self.process.terminate() + try: stdout, stderr = self.process.communicate(timeout=5); console.print("[bold yellow]HTTP server stopped[/bold yellow]") + except subprocess.TimeoutExpired: console.print("[bold red]Server did not terminate gracefully, killing...[/bold red]"); self.process.kill(); stdout, stderr = self.process.communicate(); console.print("[bold yellow]HTTP server killed[/bold yellow]") + except Exception as e: console.print(f"[bold red]Error stopping HTTP server: {str(e)}[/bold red]"); self.process.kill() + finally: self.process = None + elif self.process: console.print("[dim]HTTP server process already stopped.[/dim]"); self.process = None + + def is_running(self) -> bool: + if not self.process: return False + return self.process.poll() is None + +# --- SimpleMemoryTracker Class (Unchanged) --- +class SimpleMemoryTracker: + """Basic memory tracker that doesn't rely on psutil.""" + def __init__(self, report_path: str = DEFAULT_REPORT_PATH, test_id: Optional[str] = None): + self.report_path = pathlib.Path(report_path); self.report_path.mkdir(parents=True, exist_ok=True) + self.test_id = test_id or time.strftime("%Y%m%d_%H%M%S") + self.start_time = time.time(); self.memory_samples = []; self.pid = os.getpid() + self.csv_path = self.report_path / f"memory_samples_{self.test_id}.csv" + with open(self.csv_path, 'w', encoding='utf-8') as f: f.write("timestamp,elapsed_seconds,memory_info_mb\n") + + def sample(self) -> Dict: + try: + memory_mb = self._get_memory_info_mb() + memory_str = f"{memory_mb:.1f} MB" if memory_mb is not None else "Unknown" + timestamp = time.time(); elapsed = timestamp - self.start_time + sample = {"timestamp": timestamp, "elapsed_seconds": elapsed, "memory_mb": memory_mb, "memory_str": memory_str} + self.memory_samples.append(sample) + with 
open(self.csv_path, 'a', encoding='utf-8') as f: f.write(f"{timestamp},{elapsed:.2f},{memory_mb if memory_mb is not None else ''}\n") + return sample + except Exception as e: return {"memory_mb": None, "memory_str": "Error"} + + def _get_memory_info_mb(self) -> Optional[float]: + pid_str = str(self.pid) + try: + if sys.platform == 'darwin': result = subprocess.run(["ps", "-o", "rss=", "-p", pid_str], capture_output=True, text=True, check=True, encoding='utf-8'); return int(result.stdout.strip()) / 1024.0 + elif sys.platform == 'linux': + with open(f"/proc/{pid_str}/status", encoding='utf-8') as f: + for line in f: + if line.startswith("VmRSS:"): return int(line.split()[1]) / 1024.0 + return None + elif sys.platform == 'win32': result = subprocess.run(["tasklist", "/fi", f"PID eq {pid_str}", "/fo", "csv", "/nh"], capture_output=True, text=True, check=True, encoding='cp850', errors='ignore'); parts = result.stdout.strip().split('","'); return int(parts[4].strip().replace('"', '').replace(' K', '').replace(',', '')) / 1024.0 if len(parts) >= 5 else None + else: return None + except: return None # Catch all exceptions for robustness + + def get_report(self) -> Dict: + if not self.memory_samples: return {"error": "No memory samples collected"} + total_time = time.time() - self.start_time; valid_samples = [s['memory_mb'] for s in self.memory_samples if s['memory_mb'] is not None] + start_mem = valid_samples[0] if valid_samples else None; end_mem = valid_samples[-1] if valid_samples else None + max_mem = max(valid_samples) if valid_samples else None; avg_mem = sum(valid_samples) / len(valid_samples) if valid_samples else None + growth = (end_mem - start_mem) if start_mem is not None and end_mem is not None else None + return {"test_id": self.test_id, "total_time_seconds": total_time, "sample_count": len(self.memory_samples), "valid_sample_count": len(valid_samples), "csv_path": str(self.csv_path), "platform": sys.platform, "start_memory_mb": start_mem, "end_memory_mb": end_mem, "max_memory_mb": max_mem, "average_memory_mb": avg_mem, "memory_growth_mb": growth} + + +# --- CrawlerStressTest Class (Refactored for Per-Batch Logging) --- +class CrawlerStressTest: + """Orchestrates the stress test using arun_many per chunk and a dispatcher.""" + + def __init__( + self, + url_count: int = DEFAULT_URL_COUNT, + port: int = DEFAULT_PORT, + max_sessions: int = DEFAULT_MAX_SESSIONS, + chunk_size: int = DEFAULT_CHUNK_SIZE, # Added chunk_size + report_path: str = DEFAULT_REPORT_PATH, + stream_mode: bool = DEFAULT_STREAM_MODE, + monitor_mode: str = DEFAULT_MONITOR_MODE, + use_rate_limiter: bool = False + ): + self.url_count = url_count + self.server_port = port + self.max_sessions = max_sessions + self.chunk_size = chunk_size # Store chunk size + self.report_path = pathlib.Path(report_path) + self.report_path.mkdir(parents=True, exist_ok=True) + self.stream_mode = stream_mode + self.monitor_mode = DisplayMode[monitor_mode.upper()] + self.use_rate_limiter = use_rate_limiter + + self.test_id = time.strftime("%Y%m%d_%H%M%S") + self.results_summary = { + "test_id": self.test_id, "url_count": url_count, "max_sessions": max_sessions, + "chunk_size": chunk_size, "stream_mode": stream_mode, "monitor_mode": monitor_mode, + "rate_limiter_used": use_rate_limiter, "start_time": "", "end_time": "", + "total_time_seconds": 0, "successful_urls": 0, "failed_urls": 0, + "urls_processed": 0, "chunks_processed": 0 + } + + async def run(self) -> Dict: + """Run the stress test and return results.""" + memory_tracker = 
SimpleMemoryTracker(report_path=self.report_path, test_id=self.test_id) + urls = [f"http://localhost:{self.server_port}/page_{i}.html" for i in range(self.url_count)] + # Split URLs into chunks based on self.chunk_size + url_chunks = [urls[i:i+self.chunk_size] for i in range(0, len(urls), self.chunk_size)] + + self.results_summary["start_time"] = time.strftime("%Y-%m-%d %H:%M:%S") + start_time = time.time() + + config = CrawlerRunConfig( + wait_for_images=False, verbose=False, + stream=self.stream_mode, # Still pass stream mode, affects arun_many return type + cache_mode=CacheMode.BYPASS + ) + + total_successful_urls = 0 + total_failed_urls = 0 + total_urls_processed = 0 + start_memory_sample = memory_tracker.sample() + start_memory_str = start_memory_sample.get("memory_str", "Unknown") + + # monitor = CrawlerMonitor(display_mode=self.monitor_mode, total_urls=self.url_count) + monitor = None + rate_limiter = RateLimiter(base_delay=(0.1, 0.3)) if self.use_rate_limiter else None + dispatcher = MemoryAdaptiveDispatcher(max_session_permit=self.max_sessions, monitor=monitor, rate_limiter=rate_limiter) + + console.print(f"\n[bold cyan]Crawl4AI Stress Test - {self.url_count} URLs, {self.max_sessions} max sessions[/bold cyan]") + console.print(f"[bold cyan]Mode:[/bold cyan] {'Streaming' if self.stream_mode else 'Batch'}, [bold cyan]Monitor:[/bold cyan] {self.monitor_mode.name}, [bold cyan]Chunk Size:[/bold cyan] {self.chunk_size}") + console.print(f"[bold cyan]Initial Memory:[/bold cyan] {start_memory_str}") + + # Print batch log header only if not streaming + if not self.stream_mode: + console.print("\n[bold]Batch Progress:[/bold] (Monitor below shows overall progress)") + console.print("[bold] Batch | Progress | Start Mem | End Mem | URLs/sec | Success/Fail | Time (s) | Status [/bold]") + console.print("─" * 90) + + monitor_task = asyncio.create_task(self._periodic_memory_sample(memory_tracker, 2.0)) + + try: + async with AsyncWebCrawler( + config=BrowserConfig( verbose = False) + ) as crawler: + # Process URLs chunk by chunk + for chunk_idx, url_chunk in enumerate(url_chunks): + batch_start_time = time.time() + chunk_success = 0 + chunk_failed = 0 + + # Sample memory before the chunk + start_mem_sample = memory_tracker.sample() + start_mem_str = start_mem_sample.get("memory_str", "Unknown") + + # --- Call arun_many for the current chunk --- + try: + # Note: dispatcher/monitor persist across calls + results_gen_or_list: Union[AsyncGenerator[CrawlResult, None], List[CrawlResult]] = \ + await crawler.arun_many( + urls=url_chunk, + config=config, + dispatcher=dispatcher # Reuse the same dispatcher + ) + + if self.stream_mode: + # Process stream results if needed, but batch logging is less relevant + async for result in results_gen_or_list: + total_urls_processed += 1 + if result.success: chunk_success += 1 + else: chunk_failed += 1 + # In stream mode, batch summary isn't as meaningful here + # We could potentially track completion per chunk async, but it's complex + + else: # Batch mode + # Process the list of results for this chunk + for result in results_gen_or_list: + total_urls_processed += 1 + if result.success: chunk_success += 1 + else: chunk_failed += 1 + + except Exception as e: + console.print(f"[bold red]Error processing chunk {chunk_idx+1}: {e}[/bold red]") + chunk_failed = len(url_chunk) # Assume all failed in the chunk on error + total_urls_processed += len(url_chunk) # Count them as processed (failed) + + # --- Log batch results (only if not streaming) --- + if not self.stream_mode: 
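+                        # Per-chunk summary row for the batch table: elapsed time, throughput, and memory sampled before/after this chunk.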
+ batch_time = time.time() - batch_start_time + urls_per_sec = len(url_chunk) / batch_time if batch_time > 0 else 0 + end_mem_sample = memory_tracker.sample() + end_mem_str = end_mem_sample.get("memory_str", "Unknown") + + progress_pct = (total_urls_processed / self.url_count) * 100 + + if chunk_failed == 0: status_color, status = "green", "Success" + elif chunk_success == 0: status_color, status = "red", "Failed" + else: status_color, status = "yellow", "Partial" + + console.print( + f" {chunk_idx+1:<5} | {progress_pct:6.1f}% | {start_mem_str:>9} | {end_mem_str:>9} | {urls_per_sec:8.1f} | " + f"{chunk_success:^7}/{chunk_failed:<6} | {batch_time:8.2f} | [{status_color}]{status:<7}[/{status_color}]" + ) + + # Accumulate totals + total_successful_urls += chunk_success + total_failed_urls += chunk_failed + self.results_summary["chunks_processed"] += 1 + + # Optional small delay between starting chunks if needed + # await asyncio.sleep(0.1) + + except Exception as e: + console.print(f"[bold red]An error occurred during the main crawl loop: {e}[/bold red]") + finally: + if 'monitor_task' in locals() and not monitor_task.done(): + monitor_task.cancel() + try: await monitor_task + except asyncio.CancelledError: pass + + end_time = time.time() + self.results_summary.update({ + "end_time": time.strftime("%Y-%m-%d %H:%M:%S"), + "total_time_seconds": end_time - start_time, + "successful_urls": total_successful_urls, + "failed_urls": total_failed_urls, + "urls_processed": total_urls_processed, + "memory": memory_tracker.get_report() + }) + self._save_results() + return self.results_summary + + async def _periodic_memory_sample(self, tracker: SimpleMemoryTracker, interval: float): + """Background task to sample memory periodically.""" + while True: + tracker.sample() + try: + await asyncio.sleep(interval) + except asyncio.CancelledError: + break # Exit loop on cancellation + + def _save_results(self) -> None: + results_path = self.report_path / f"test_summary_{self.test_id}.json" + try: + with open(results_path, 'w', encoding='utf-8') as f: json.dump(self.results_summary, f, indent=2, default=str) + # console.print(f"\n[bold green]Results summary saved to {results_path}[/bold green]") # Moved summary print to run_full_test + except Exception as e: console.print(f"[bold red]Failed to save results summary: {e}[/bold red]") + + +# --- run_full_test Function (Adjusted) --- +async def run_full_test(args): + """Run the complete test process from site generation to crawling.""" + server = None + site_generated = False + + # --- Site Generation --- (Same as before) + if not args.use_existing_site and not args.skip_generation: + if os.path.exists(args.site_path): console.print(f"[yellow]Removing existing site directory: {args.site_path}[/yellow]"); shutil.rmtree(args.site_path) + site_generator = SiteGenerator(site_path=args.site_path, page_count=args.urls); site_generator.generate_site(); site_generated = True + elif args.use_existing_site: console.print(f"[cyan]Using existing site assumed to be running on port {args.port}[/cyan]") + elif args.skip_generation: + console.print(f"[cyan]Skipping site generation, using existing directory: {args.site_path}[/cyan]") + if not os.path.exists(args.site_path) or not os.path.isdir(args.site_path): console.print(f"[bold red]Error: Site path '{args.site_path}' does not exist or is not a directory.[/bold red]"); return + + # --- Start Local Server --- (Same as before) + server_started = False + if not args.use_existing_site: + server = 
LocalHttpServer(site_path=args.site_path, port=args.port) + try: server.start(); server_started = True + except Exception as e: + console.print(f"[bold red]Failed to start local server. Aborting test.[/bold red]") + if site_generated and not args.keep_site: console.print(f"[yellow]Cleaning up generated site: {args.site_path}[/yellow]"); shutil.rmtree(args.site_path) + return + + try: + # --- Run the Stress Test --- + test = CrawlerStressTest( + url_count=args.urls, + port=args.port, + max_sessions=args.max_sessions, + chunk_size=args.chunk_size, # Pass chunk_size + report_path=args.report_path, + stream_mode=args.stream, + monitor_mode=args.monitor_mode, + use_rate_limiter=args.use_rate_limiter + ) + results = await test.run() # Run the test which now handles chunks internally + + # --- Print Summary --- + console.print("\n" + "=" * 80) + console.print("[bold green]Test Completed[/bold green]") + console.print("=" * 80) + + # (Summary printing logic remains largely the same) + success_rate = results["successful_urls"] / results["url_count"] * 100 if results["url_count"] > 0 else 0 + urls_per_second = results["urls_processed"] / results["total_time_seconds"] if results["total_time_seconds"] > 0 else 0 + + console.print(f"[bold cyan]Test ID:[/bold cyan] {results['test_id']}") + console.print(f"[bold cyan]Configuration:[/bold cyan] {results['url_count']} URLs, {results['max_sessions']} sessions, Chunk: {results['chunk_size']}, Stream: {results['stream_mode']}, Monitor: {results['monitor_mode']}") + console.print(f"[bold cyan]Results:[/bold cyan] {results['successful_urls']} successful, {results['failed_urls']} failed ({results['urls_processed']} processed, {success_rate:.1f}% success)") + console.print(f"[bold cyan]Performance:[/bold cyan] {results['total_time_seconds']:.2f} seconds total, {urls_per_second:.2f} URLs/second avg") + + mem_report = results.get("memory", {}) + mem_info_str = "Memory tracking data unavailable." 
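+        # Overwrite the placeholder above only when the memory tracker returned usable samples.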
+ if mem_report and not mem_report.get("error"): + start_mb = mem_report.get('start_memory_mb'); end_mb = mem_report.get('end_memory_mb'); max_mb = mem_report.get('max_memory_mb'); growth_mb = mem_report.get('memory_growth_mb') + mem_parts = [] + if start_mb is not None: mem_parts.append(f"Start: {start_mb:.1f} MB") + if end_mb is not None: mem_parts.append(f"End: {end_mb:.1f} MB") + if max_mb is not None: mem_parts.append(f"Max: {max_mb:.1f} MB") + if growth_mb is not None: mem_parts.append(f"Growth: {growth_mb:.1f} MB") + if mem_parts: mem_info_str = ", ".join(mem_parts) + csv_path = mem_report.get('csv_path') + if csv_path: console.print(f"[dim]Memory samples saved to: {csv_path}[/dim]") + + console.print(f"[bold cyan]Memory Usage:[/bold cyan] {mem_info_str}") + console.print(f"[bold green]Results summary saved to {results['memory']['csv_path'].replace('memory_samples', 'test_summary').replace('.csv', '.json')}[/bold green]") # Infer summary path + + + if results["failed_urls"] > 0: console.print(f"\n[bold yellow]Warning: {results['failed_urls']} URLs failed to process ({100-success_rate:.1f}% failure rate)[/bold yellow]") + if results["urls_processed"] < results["url_count"]: console.print(f"\n[bold red]Error: Only {results['urls_processed']} out of {results['url_count']} URLs were processed![/bold red]") + + + finally: + # --- Stop Server / Cleanup --- (Same as before) + if server_started and server and not args.keep_server_alive: server.stop() + elif server_started and server and args.keep_server_alive: + console.print(f"[bold cyan]Server is kept running on port {args.port}. Press Ctrl+C to stop it.[/bold cyan]") + try: await asyncio.Future() # Keep running indefinitely + except KeyboardInterrupt: console.print("\n[bold yellow]Stopping server due to user interrupt...[/bold yellow]"); server.stop() + + if site_generated and not args.keep_site: console.print(f"[yellow]Cleaning up generated site: {args.site_path}[/yellow]"); shutil.rmtree(args.site_path) + elif args.clean_site and os.path.exists(args.site_path): console.print(f"[yellow]Cleaning up site directory as requested: {args.site_path}[/yellow]"); shutil.rmtree(args.site_path) + + +# --- main Function (Added chunk_size argument) --- +def main(): + """Main entry point for the script.""" + parser = argparse.ArgumentParser(description="Crawl4AI SDK High Volume Stress Test using arun_many") + + # Test parameters + parser.add_argument("--urls", type=int, default=DEFAULT_URL_COUNT, help=f"Number of URLs to test (default: {DEFAULT_URL_COUNT})") + parser.add_argument("--max-sessions", type=int, default=DEFAULT_MAX_SESSIONS, help=f"Maximum concurrent crawling sessions (default: {DEFAULT_MAX_SESSIONS})") + parser.add_argument("--chunk-size", type=int, default=DEFAULT_CHUNK_SIZE, help=f"Number of URLs per batch for logging (default: {DEFAULT_CHUNK_SIZE})") # Added + parser.add_argument("--stream", action="store_true", default=DEFAULT_STREAM_MODE, help=f"Enable streaming mode (disables batch logging) (default: {DEFAULT_STREAM_MODE})") + parser.add_argument("--monitor-mode", type=str, default=DEFAULT_MONITOR_MODE, choices=["DETAILED", "AGGREGATED"], help=f"Display mode for the live monitor (default: {DEFAULT_MONITOR_MODE})") + parser.add_argument("--use-rate-limiter", action="store_true", default=False, help="Enable a basic rate limiter (default: False)") + + # Environment parameters + parser.add_argument("--site-path", type=str, default=DEFAULT_SITE_PATH, help=f"Path to generate/use the test site (default: {DEFAULT_SITE_PATH})") + 
parser.add_argument("--port", type=int, default=DEFAULT_PORT, help=f"Port for the local HTTP server (default: {DEFAULT_PORT})") + parser.add_argument("--report-path", type=str, default=DEFAULT_REPORT_PATH, help=f"Path to save reports and logs (default: {DEFAULT_REPORT_PATH})") + + # Site/Server management + parser.add_argument("--skip-generation", action="store_true", help="Use existing test site folder without regenerating") + parser.add_argument("--use-existing-site", action="store_true", help="Do not generate site or start local server; assume site exists on --port") + parser.add_argument("--keep-server-alive", action="store_true", help="Keep the local HTTP server running after test") + parser.add_argument("--keep-site", action="store_true", help="Keep the generated test site files after test") + parser.add_argument("--clean-reports", action="store_true", help="Clean up report directory before running") + parser.add_argument("--clean-site", action="store_true", help="Clean up site directory before running (if generating) or after") + + args = parser.parse_args() + + # Display config + console.print("[bold underline]Crawl4AI SDK Stress Test Configuration[/bold underline]") + console.print(f"URLs: {args.urls}, Max Sessions: {args.max_sessions}, Chunk Size: {args.chunk_size}") # Added chunk size + console.print(f"Mode: {'Streaming' if args.stream else 'Batch'}, Monitor: {args.monitor_mode}, Rate Limit: {args.use_rate_limiter}") + console.print(f"Site Path: {args.site_path}, Port: {args.port}, Report Path: {args.report_path}") + console.print("-" * 40) + # (Rest of config display and cleanup logic is the same) + if args.use_existing_site: console.print("[cyan]Mode: Using existing external site/server[/cyan]") + elif args.skip_generation: console.print("[cyan]Mode: Using existing site files, starting local server[/cyan]") + else: console.print("[cyan]Mode: Generating site files, starting local server[/cyan]") + if args.keep_server_alive: console.print("[cyan]Option: Keep server alive after test[/cyan]") + if args.keep_site: console.print("[cyan]Option: Keep site files after test[/cyan]") + if args.clean_reports: console.print("[cyan]Option: Clean reports before test[/cyan]") + if args.clean_site: console.print("[cyan]Option: Clean site directory[/cyan]") + console.print("-" * 40) + + if args.clean_reports: + if os.path.exists(args.report_path): console.print(f"[yellow]Cleaning up reports directory: {args.report_path}[/yellow]"); shutil.rmtree(args.report_path) + os.makedirs(args.report_path, exist_ok=True) + if args.clean_site and not args.use_existing_site: + if os.path.exists(args.site_path): console.print(f"[yellow]Cleaning up site directory as requested: {args.site_path}[/yellow]"); shutil.rmtree(args.site_path) + + # Run + try: asyncio.run(run_full_test(args)) + except KeyboardInterrupt: console.print("\n[bold yellow]Test interrupted by user.[/bold yellow]") + except Exception as e: console.print(f"\n[bold red]An unexpected error occurred:[/bold red] {e}"); import traceback; traceback.print_exc() + +if __name__ == "__main__": + main() \ No newline at end of file From 3bf78ff47a67c82a962dbc0d19da166b42229961 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 17 Apr 2025 22:32:58 +0800 Subject: [PATCH 62/78] refactor(docker-demo): enhance error handling and output formatting Improve the Docker API demo script with better error handling, more detailed output, and enhanced visualization: - Add detailed error messages and stack traces for debugging - Implement better status code handling 
and display - Enhance JSON output formatting with monokai theme and word wrap - Add depth information display for deep crawls - Improve proxy usage reporting - Fix port number inconsistency No breaking changes. --- docs/examples/docker/demo_docker_api.py | 194 ++++++++++++++++++++---- 1 file changed, 165 insertions(+), 29 deletions(-) diff --git a/docs/examples/docker/demo_docker_api.py b/docs/examples/docker/demo_docker_api.py index 56d0173c..77f3bf42 100644 --- a/docs/examples/docker/demo_docker_api.py +++ b/docs/examples/docker/demo_docker_api.py @@ -16,8 +16,8 @@ load_dotenv() # Load environment variables from .env file console = Console() # --- Configuration --- -BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235") BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") +BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235") # Target URLs SIMPLE_URL = "https://httpbin.org/html" LINKS_URL = "https://httpbin.org/links/10/0" @@ -50,8 +50,14 @@ async def check_server_health(client: httpx.AsyncClient): return False def print_payload(payload: Dict[str, Any]): - """Prints the JSON payload nicely.""" - syntax = Syntax(json.dumps(payload, indent=2), "json", theme="default", line_numbers=False) + """Prints the JSON payload nicely with a dark theme.""" + syntax = Syntax( + json.dumps(payload, indent=2), + "json", + theme="monokai", # <--- Changed theme here + line_numbers=False, + word_wrap=True # Added word wrap for potentially long payloads + ) console.print(Panel(syntax, title="Request Payload", border_style="blue", expand=False)) def print_result_summary(results: List[Dict[str, Any]], title: str = "Crawl Results Summary", max_items: int = 3): @@ -126,12 +132,15 @@ async def stream_request(client: httpx.AsyncClient, endpoint: str, payload: Dict print_payload(payload) console.print(f"Sending POST stream request to {client.base_url}{endpoint}...") all_results = [] + initial_status_code = None # Store initial status code + try: start_time = time.time() async with client.stream("POST", endpoint, json=payload) as response: + initial_status_code = response.status_code # Capture initial status duration = time.time() - start_time # Time to first byte potentially - console.print(f"Initial Response Status: [bold {'green' if response.status_code == 200 else 'red'}]{response.status_code}[/] (first byte ~{duration:.2f}s)") - response.raise_for_status() + console.print(f"Initial Response Status: [bold {'green' if response.is_success else 'red'}]{initial_status_code}[/] (first byte ~{duration:.2f}s)") + response.raise_for_status() # Raise exception for bad *initial* status codes console.print("[magenta]--- Streaming Results ---[/]") completed = False @@ -143,11 +152,16 @@ async def stream_request(client: httpx.AsyncClient, endpoint: str, payload: Dict completed = True console.print("[bold green]--- Stream Completed ---[/]") break - elif data.get("url"): # Looks like a result + elif data.get("url"): # Looks like a result dictionary all_results.append(data) + # Display summary info as it arrives success_icon = "[green]✔[/]" if data.get('success') else "[red]✘[/]" url = data.get('url', 'N/A') - console.print(f" {success_icon} Received: [link={url}]{url}[/link]") + # Display status code FROM THE RESULT DATA if available + result_status = data.get('status_code', 'N/A') + console.print(f" {success_icon} Received: [link={url}]{url}[/link] (Status: {result_status})") + if not data.get('success') and data.get('error_message'): + console.print(f" [red]Error: 
{data['error_message']}[/]") else: console.print(f" [yellow]Stream meta-data:[/yellow] {data}") except json.JSONDecodeError: @@ -156,8 +170,10 @@ async def stream_request(client: httpx.AsyncClient, endpoint: str, payload: Dict console.print("[bold yellow]Warning: Stream ended without 'completed' marker.[/]") except httpx.HTTPStatusError as e: - console.print(f"[bold red]HTTP Error:[/]") - console.print(f"Status: {e.response.status_code}") + # Use the captured initial status code if available, otherwise from the exception + status = initial_status_code if initial_status_code is not None else e.response.status_code + console.print(f"[bold red]HTTP Error (Initial Request):[/]") + console.print(f"Status: {status}") try: console.print(Panel(Syntax(json.dumps(e.response.json(), indent=2), "json", theme="default"), title="Error Response")) except json.JSONDecodeError: @@ -165,11 +181,12 @@ async def stream_request(client: httpx.AsyncClient, endpoint: str, payload: Dict except httpx.RequestError as e: console.print(f"[bold red]Request Error: {e}[/]") except Exception as e: - console.print(f"[bold red]Unexpected Error: {e}[/]") + console.print(f"[bold red]Unexpected Error during streaming: {e}[/]") + console.print_exception(show_locals=False) # Print stack trace for unexpected errors + # Call print_result_summary with the *collected* results AFTER the stream is done print_result_summary(all_results, title=f"{title} Collected Results") - def load_proxies_from_env() -> List[Dict]: """ Load proxies from the PROXIES environment variable. @@ -583,7 +600,7 @@ async def demo_extract_llm(client: httpx.AsyncClient): if isinstance(extracted_data, dict): console.print("[cyan]Extracted Data (LLM):[/]") - syntax = Syntax(json.dumps(extracted_data, indent=2), "json", theme="default", line_numbers=False) + syntax = Syntax(json.dumps(extracted_data, indent=2), "json", theme="monokai", line_numbers=False) console.print(Panel(syntax, border_style="cyan", expand=False)) else: console.print("[yellow]LLM extraction did not return expected dictionary.[/]") @@ -618,6 +635,12 @@ async def demo_deep_basic(client: httpx.AsyncClient): } results = await make_request(client, "/crawl", payload, "Demo 5a: Basic Deep Crawl") # print_result_summary is called by make_request, showing URLs and depths + for result in results: + if result.get("success") and result.get("metadata"): + depth = result["metadata"].get("depth", "N/A") + console.print(f" Depth: {depth}") + elif not result.get("success"): + console.print(f" [red]Error: {result['error_message']}[/]") # 5. Streaming Deep Crawl async def demo_deep_streaming(client: httpx.AsyncClient): @@ -646,6 +669,109 @@ async def demo_deep_streaming(client: httpx.AsyncClient): # stream_request handles printing results as they arrive await stream_request(client, "/crawl/stream", payload, "Demo 5b: Streaming Deep Crawl") +# 5a. 
Deep Crawl with Filtering & Scoring +async def demo_deep_filtering_scoring(client: httpx.AsyncClient): + """Demonstrates deep crawl with advanced URL filtering and scoring.""" + max_depth = 2 # Go a bit deeper to see scoring/filtering effects + max_pages = 6 + excluded_pattern = "*/category-1/*" # Example pattern to exclude + keyword_to_score = "product" # Example keyword to prioritize + + payload = { + "urls": [DEEP_CRAWL_BASE_URL], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "stream": False, + "cache_mode": "BYPASS", + "deep_crawl_strategy": { + "type": "BFSDeepCrawlStrategy", + "params": { + "max_depth": max_depth, + "max_pages": max_pages, + "filter_chain": { + "type": "FilterChain", + "params": { + "filters": [ + { # Stay on the allowed domain + "type": "DomainFilter", + "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]} + }, + { # Only crawl HTML pages + "type": "ContentTypeFilter", + "params": {"allowed_types": ["text/html"]} + }, + { # Exclude URLs matching the pattern + "type": "URLPatternFilter", + "params": { + "patterns": [excluded_pattern], + "reverse": True # Block if match + } + } + ] + } + }, + "url_scorer": { + "type": "CompositeScorer", + "params": { + "scorers": [ + { # Boost score for URLs containing the keyword + "type": "KeywordRelevanceScorer", + "params": {"keywords": [keyword_to_score], "weight": 1.5} # Higher weight + }, + { # Slightly penalize deeper pages + "type": "PathDepthScorer", + "params": {"optimal_depth": 1, "weight": -0.1} + } + ] + } + }, + # Optional: Only crawl URLs scoring above a threshold + # "score_threshold": 0.1 + } + } + } + } + } + results = await make_request(client, "/crawl", payload, "Demo 5c: Deep Crawl with Filtering & Scoring") + + # --- Verification/Analysis --- + if results: + console.print("[cyan]Deep Crawl Filtering/Scoring Analysis:[/]") + excluded_found = False + prioritized_found_at_depth1 = False + prioritized_found_overall = False + + for result in results: + url = result.get("url", "") + depth = result.get("metadata", {}).get("depth", -1) + + # Check Filtering + if excluded_pattern.strip('*') in url: # Check if the excluded part is present + console.print(f" [bold red]Filter FAILED:[/bold red] Excluded pattern part '{excluded_pattern.strip('*')}' found in URL: {url}") + excluded_found = True + + # Check Scoring (Observation) + if keyword_to_score in url: + prioritized_found_overall = True + if depth == 1: # Check if prioritized keywords appeared early (depth 1) + prioritized_found_at_depth1 = True + + if not excluded_found: + console.print(f" [green]Filter Check:[/green] No URLs matching excluded pattern '{excluded_pattern}' found.") + else: + console.print(f" [red]Filter Check:[/red] URLs matching excluded pattern '{excluded_pattern}' were found (unexpected).") + + if prioritized_found_at_depth1: + console.print(f" [green]Scoring Check:[/green] URLs with keyword '{keyword_to_score}' were found at depth 1 (scoring likely influenced).") + elif prioritized_found_overall: + console.print(f" [yellow]Scoring Check:[/yellow] URLs with keyword '{keyword_to_score}' found, but not necessarily prioritized early (check max_pages/depth limits).") + else: + console.print(f" [yellow]Scoring Check:[/yellow] No URLs with keyword '{keyword_to_score}' found within crawl limits.") + + # print_result_summary called by make_request already shows URLs and depths + # 6. 
Deep Crawl with Extraction async def demo_deep_with_css_extraction(client: httpx.AsyncClient): # Schema to extract H1 and first paragraph from any page @@ -782,16 +908,26 @@ async def demo_deep_with_proxy(client: httpx.AsyncClient): "deep_crawl_strategy": { "type": "BFSDeepCrawlStrategy", "params": { - "max_depth": 0, # Just crawl start URL via proxy - "max_pages": 1, + "max_depth": 1, # Just crawl start URL via proxy + "max_pages": 5, } } } } } # make_request calls print_result_summary, which shows URL and success status - await make_request(client, "/crawl", payload, "Demo 6c: Deep Crawl + Proxies") + results = await make_request(client, "/crawl", payload, "Demo 6c: Deep Crawl + Proxies") + if not results: + console.print("[red]No results returned from the crawl.[/]") + return + console.print("[cyan]Proxy Usage Summary from Deep Crawl:[/]") # Verification of specific proxy IP usage would require more complex setup or server logs. + for result in results: + if result.get("success") and result.get("metadata"): + proxy_ip = result["metadata"].get("proxy_ip", "N/A") + console.print(f" Proxy IP used: {proxy_ip}") + elif not result.get("success"): + console.print(f" [red]Error: {result['error_message']}[/]") # 6d. Deep Crawl with SSL Certificate Fetching @@ -844,26 +980,26 @@ async def main_demo(): return # --- Run Demos --- - # await demo_basic_single_url(client) - # await demo_basic_multi_url(client) - # await demo_streaming_multi_url(client) + await demo_basic_single_url(client) + await demo_basic_multi_url(client) + await demo_streaming_multi_url(client) - # await demo_markdown_default(client) - # await demo_markdown_pruning(client) - # await demo_markdown_bm25(client) + await demo_markdown_default(client) + await demo_markdown_pruning(client) + await demo_markdown_bm25(client) - # await demo_param_css_selector(client) - # await demo_param_js_execution(client) - # await demo_param_screenshot(client) - # await demo_param_ssl_fetch(client) - # await demo_param_proxy(client) # Skips if no PROXIES env var + await demo_param_css_selector(client) + await demo_param_js_execution(client) + await demo_param_screenshot(client) + await demo_param_ssl_fetch(client) + await demo_param_proxy(client) # Skips if no PROXIES env var - # await demo_extract_css(client) + await demo_extract_css(client) await demo_extract_llm(client) # Skips if no common LLM key env var await demo_deep_basic(client) - await demo_deep_streaming(client) - # demo_deep_filtering_scoring skipped for brevity, add if needed + await demo_deep_streaming(client) # This need extra work + await demo_deep_with_css_extraction(client) await demo_deep_with_llm_extraction(client) # Skips if no common LLM key env var From 16b231824295f561787d1473386473547b668510 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 18 Apr 2025 22:26:24 +0800 Subject: [PATCH 63/78] feat(api): implement crawler pool manager for improved resource handling Adds a new CrawlerManager class to handle browser instance pooling and failover: - Implements auto-scaling based on system resources - Adds primary/backup crawler management - Integrates memory monitoring and throttling - Adds streaming support with memory tracking - Updates API endpoints to use pooled crawlers BREAKING CHANGE: API endpoints now require CrawlerManager initialization --- crawl4ai/async_webcrawler.py | 6 +- deploy/docker/api copy.py | 503 ++++++++++++++++++++++ deploy/docker/api.py | 94 ++++- deploy/docker/config.yml | 34 ++ deploy/docker/crawler_manager.py | 556 +++++++++++++++++++++++++ 
deploy/docker/server.py | 299 +++++++++++-- tests/memory/test_stress_api.py | 516 +++++++++++++++++++++++ tests/memory/test_stress_docker_api.py | 129 ++++++ tests/memory/test_stress_sdk.py | 4 +- 9 files changed, 2082 insertions(+), 59 deletions(-) create mode 100644 deploy/docker/api copy.py create mode 100644 deploy/docker/crawler_manager.py create mode 100644 tests/memory/test_stress_api.py create mode 100644 tests/memory/test_stress_docker_api.py diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 1eaea156..8940b8ab 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -542,9 +542,9 @@ class AsyncWebCrawler: markdown_input_html = source_lambda() # Log which source is being used (optional, but helpful for debugging) - if self.logger and verbose: - actual_source_used = selected_html_source if selected_html_source in html_source_selector else 'cleaned_html (default)' - self.logger.debug(f"Using '{actual_source_used}' as source for Markdown generation for {url}", tag="MARKDOWN_SRC") + # if self.logger and verbose: + # actual_source_used = selected_html_source if selected_html_source in html_source_selector else 'cleaned_html (default)' + # self.logger.debug(f"Using '{actual_source_used}' as source for Markdown generation for {url}", tag="MARKDOWN_SRC") except Exception as e: # Handle potential errors, especially from preprocess_html_for_schema diff --git a/deploy/docker/api copy.py b/deploy/docker/api copy.py new file mode 100644 index 00000000..341e23e1 --- /dev/null +++ b/deploy/docker/api copy.py @@ -0,0 +1,503 @@ +import os +import json +import asyncio +from typing import List, Tuple +from functools import partial + +import logging +from typing import Optional, AsyncGenerator +from urllib.parse import unquote +from fastapi import HTTPException, Request, status +from fastapi.background import BackgroundTasks +from fastapi.responses import JSONResponse +from redis import asyncio as aioredis + +from crawl4ai import ( + AsyncWebCrawler, + CrawlerRunConfig, + LLMExtractionStrategy, + CacheMode, + BrowserConfig, + MemoryAdaptiveDispatcher, + RateLimiter, + LLMConfig +) +from crawl4ai.utils import perform_completion_with_backoff +from crawl4ai.content_filter_strategy import ( + PruningContentFilter, + BM25ContentFilter, + LLMContentFilter +) +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator +from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy + +from utils import ( + TaskStatus, + FilterType, + get_base_url, + is_task_id, + should_cleanup_task, + decode_redis_hash +) + +import psutil, time + +logger = logging.getLogger(__name__) + +# --- Helper to get memory --- +def _get_memory_mb(): + try: + return psutil.Process().memory_info().rss / (1024 * 1024) + except Exception as e: + logger.warning(f"Could not get memory info: {e}") + return None + + +async def handle_llm_qa( + url: str, + query: str, + config: dict +) -> str: + """Process QA using LLM with crawled content as context.""" + try: + # Extract base URL by finding last '?q=' occurrence + last_q_index = url.rfind('?q=') + if last_q_index != -1: + url = url[:last_q_index] + + # Get markdown content + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url) + if not result.success: + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=result.error_message + ) + content = result.markdown.fit_markdown + + # Create prompt and get LLM response + prompt = f"""Use the following content as context to 
answer the question. + Content: + {content} + + Question: {query} + + Answer:""" + + response = perform_completion_with_backoff( + provider=config["llm"]["provider"], + prompt_with_variables=prompt, + api_token=os.environ.get(config["llm"].get("api_key_env", "")) + ) + + return response.choices[0].message.content + except Exception as e: + logger.error(f"QA processing error: {str(e)}", exc_info=True) + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=str(e) + ) + +async def process_llm_extraction( + redis: aioredis.Redis, + config: dict, + task_id: str, + url: str, + instruction: str, + schema: Optional[str] = None, + cache: str = "0" +) -> None: + """Process LLM extraction in background.""" + try: + # If config['llm'] has api_key then ignore the api_key_env + api_key = "" + if "api_key" in config["llm"]: + api_key = config["llm"]["api_key"] + else: + api_key = os.environ.get(config["llm"].get("api_key_env", None), "") + llm_strategy = LLMExtractionStrategy( + llm_config=LLMConfig( + provider=config["llm"]["provider"], + api_token=api_key + ), + instruction=instruction, + schema=json.loads(schema) if schema else None, + ) + + cache_mode = CacheMode.ENABLED if cache == "1" else CacheMode.WRITE_ONLY + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url=url, + config=CrawlerRunConfig( + extraction_strategy=llm_strategy, + scraping_strategy=LXMLWebScrapingStrategy(), + cache_mode=cache_mode + ) + ) + + if not result.success: + await redis.hset(f"task:{task_id}", mapping={ + "status": TaskStatus.FAILED, + "error": result.error_message + }) + return + + try: + content = json.loads(result.extracted_content) + except json.JSONDecodeError: + content = result.extracted_content + await redis.hset(f"task:{task_id}", mapping={ + "status": TaskStatus.COMPLETED, + "result": json.dumps(content) + }) + + except Exception as e: + logger.error(f"LLM extraction error: {str(e)}", exc_info=True) + await redis.hset(f"task:{task_id}", mapping={ + "status": TaskStatus.FAILED, + "error": str(e) + }) + +async def handle_markdown_request( + url: str, + filter_type: FilterType, + query: Optional[str] = None, + cache: str = "0", + config: Optional[dict] = None +) -> str: + """Handle markdown generation requests.""" + try: + decoded_url = unquote(url) + if not decoded_url.startswith(('http://', 'https://')): + decoded_url = 'https://' + decoded_url + + if filter_type == FilterType.RAW: + md_generator = DefaultMarkdownGenerator() + else: + content_filter = { + FilterType.FIT: PruningContentFilter(), + FilterType.BM25: BM25ContentFilter(user_query=query or ""), + FilterType.LLM: LLMContentFilter( + llm_config=LLMConfig( + provider=config["llm"]["provider"], + api_token=os.environ.get(config["llm"].get("api_key_env", None), ""), + ), + instruction=query or "Extract main content" + ) + }[filter_type] + md_generator = DefaultMarkdownGenerator(content_filter=content_filter) + + cache_mode = CacheMode.ENABLED if cache == "1" else CacheMode.WRITE_ONLY + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url=decoded_url, + config=CrawlerRunConfig( + markdown_generator=md_generator, + scraping_strategy=LXMLWebScrapingStrategy(), + cache_mode=cache_mode + ) + ) + + if not result.success: + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=result.error_message + ) + + return (result.markdown.raw_markdown + if filter_type == FilterType.RAW + else result.markdown.fit_markdown) + + except Exception as e: + 
logger.error(f"Markdown error: {str(e)}", exc_info=True) + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=str(e) + ) + +async def handle_llm_request( + redis: aioredis.Redis, + background_tasks: BackgroundTasks, + request: Request, + input_path: str, + query: Optional[str] = None, + schema: Optional[str] = None, + cache: str = "0", + config: Optional[dict] = None +) -> JSONResponse: + """Handle LLM extraction requests.""" + base_url = get_base_url(request) + + try: + if is_task_id(input_path): + return await handle_task_status( + redis, input_path, base_url + ) + + if not query: + return JSONResponse({ + "message": "Please provide an instruction", + "_links": { + "example": { + "href": f"{base_url}/llm/{input_path}?q=Extract+main+content", + "title": "Try this example" + } + } + }) + + return await create_new_task( + redis, + background_tasks, + input_path, + query, + schema, + cache, + base_url, + config + ) + + except Exception as e: + logger.error(f"LLM endpoint error: {str(e)}", exc_info=True) + return JSONResponse({ + "error": str(e), + "_links": { + "retry": {"href": str(request.url)} + } + }, status_code=status.HTTP_500_INTERNAL_SERVER_ERROR) + +async def handle_task_status( + redis: aioredis.Redis, + task_id: str, + base_url: str +) -> JSONResponse: + """Handle task status check requests.""" + task = await redis.hgetall(f"task:{task_id}") + if not task: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail="Task not found" + ) + + task = decode_redis_hash(task) + response = create_task_response(task, task_id, base_url) + + if task["status"] in [TaskStatus.COMPLETED, TaskStatus.FAILED]: + if should_cleanup_task(task["created_at"]): + await redis.delete(f"task:{task_id}") + + return JSONResponse(response) + +async def create_new_task( + redis: aioredis.Redis, + background_tasks: BackgroundTasks, + input_path: str, + query: str, + schema: Optional[str], + cache: str, + base_url: str, + config: dict +) -> JSONResponse: + """Create and initialize a new task.""" + decoded_url = unquote(input_path) + if not decoded_url.startswith(('http://', 'https://')): + decoded_url = 'https://' + decoded_url + + from datetime import datetime + task_id = f"llm_{int(datetime.now().timestamp())}_{id(background_tasks)}" + + await redis.hset(f"task:{task_id}", mapping={ + "status": TaskStatus.PROCESSING, + "created_at": datetime.now().isoformat(), + "url": decoded_url + }) + + background_tasks.add_task( + process_llm_extraction, + redis, + config, + task_id, + decoded_url, + query, + schema, + cache + ) + + return JSONResponse({ + "task_id": task_id, + "status": TaskStatus.PROCESSING, + "url": decoded_url, + "_links": { + "self": {"href": f"{base_url}/llm/{task_id}"}, + "status": {"href": f"{base_url}/llm/{task_id}"} + } + }) + +def create_task_response(task: dict, task_id: str, base_url: str) -> dict: + """Create response for task status check.""" + response = { + "task_id": task_id, + "status": task["status"], + "created_at": task["created_at"], + "url": task["url"], + "_links": { + "self": {"href": f"{base_url}/llm/{task_id}"}, + "refresh": {"href": f"{base_url}/llm/{task_id}"} + } + } + + if task["status"] == TaskStatus.COMPLETED: + response["result"] = json.loads(task["result"]) + elif task["status"] == TaskStatus.FAILED: + response["error"] = task["error"] + + return response + +async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator) -> AsyncGenerator[bytes, None]: + """Stream results with heartbeats and completion 
markers.""" + import json + from utils import datetime_handler + + try: + async for result in results_gen: + try: + server_memory_mb = _get_memory_mb() + result_dict = result.model_dump() + result_dict['server_memory_mb'] = server_memory_mb + logger.info(f"Streaming result for {result_dict.get('url', 'unknown')}") + data = json.dumps(result_dict, default=datetime_handler) + "\n" + yield data.encode('utf-8') + except Exception as e: + logger.error(f"Serialization error: {e}") + error_response = {"error": str(e), "url": getattr(result, 'url', 'unknown')} + yield (json.dumps(error_response) + "\n").encode('utf-8') + + yield json.dumps({"status": "completed"}).encode('utf-8') + + except asyncio.CancelledError: + logger.warning("Client disconnected during streaming") + finally: + try: + await crawler.close() + except Exception as e: + logger.error(f"Crawler cleanup error: {e}") + +async def handle_crawl_request( + urls: List[str], + browser_config: dict, + crawler_config: dict, + config: dict +) -> dict: + """Handle non-streaming crawl requests.""" + start_mem_mb = _get_memory_mb() # <--- Get memory before + start_time = time.time() + mem_delta_mb = None + peak_mem_mb = start_mem_mb + + try: + browser_config = BrowserConfig.load(browser_config) + crawler_config = CrawlerRunConfig.load(crawler_config) + + dispatcher = MemoryAdaptiveDispatcher( + memory_threshold_percent=config["crawler"]["memory_threshold_percent"], + rate_limiter=RateLimiter( + base_delay=tuple(config["crawler"]["rate_limiter"]["base_delay"]) + ) + ) + + crawler: AsyncWebCrawler = AsyncWebCrawler(config=browser_config) + await crawler.start() + results = [] + func = getattr(crawler, "arun" if len(urls) == 1 else "arun_many") + partial_func = partial(func, + urls[0] if len(urls) == 1 else urls, + config=crawler_config, + dispatcher=dispatcher) + results = await partial_func() + await crawler.close() + + end_mem_mb = _get_memory_mb() # <--- Get memory after + end_time = time.time() + + if start_mem_mb is not None and end_mem_mb is not None: + mem_delta_mb = end_mem_mb - start_mem_mb # <--- Calculate delta + peak_mem_mb = max(peak_mem_mb if peak_mem_mb else 0, end_mem_mb) # <--- Get peak memory + logger.info(f"Memory usage: Start: {start_mem_mb} MB, End: {end_mem_mb} MB, Delta: {mem_delta_mb} MB, Peak: {peak_mem_mb} MB") + + return { + "success": True, + "results": [result.model_dump() for result in results], + "server_processing_time_s": end_time - start_time, + "server_memory_delta_mb": mem_delta_mb, + "server_peak_memory_mb": peak_mem_mb + } + + except Exception as e: + logger.error(f"Crawl error: {str(e)}", exc_info=True) + if 'crawler' in locals() and crawler.ready: # Check if crawler was initialized and started + try: + await crawler.close() + except Exception as close_e: + logger.error(f"Error closing crawler during exception handling: {close_e}") + + # Measure memory even on error if possible + end_mem_mb_error = _get_memory_mb() + if start_mem_mb is not None and end_mem_mb_error is not None: + mem_delta_mb = end_mem_mb_error - start_mem_mb + + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=json.dumps({ # Send structured error + "error": str(e), + "server_memory_delta_mb": mem_delta_mb, + "server_peak_memory_mb": max(peak_mem_mb if peak_mem_mb else 0, end_mem_mb_error or 0) + }) + ) + +async def handle_stream_crawl_request( + urls: List[str], + browser_config: dict, + crawler_config: dict, + config: dict +) -> Tuple[AsyncWebCrawler, AsyncGenerator]: + """Handle streaming crawl 
requests.""" + try: + browser_config = BrowserConfig.load(browser_config) + # browser_config.verbose = True # Set to False or remove for production stress testing + browser_config.verbose = False + crawler_config = CrawlerRunConfig.load(crawler_config) + crawler_config.scraping_strategy = LXMLWebScrapingStrategy() + crawler_config.stream = True + + dispatcher = MemoryAdaptiveDispatcher( + memory_threshold_percent=config["crawler"]["memory_threshold_percent"], + rate_limiter=RateLimiter( + base_delay=tuple(config["crawler"]["rate_limiter"]["base_delay"]) + ) + ) + + crawler = AsyncWebCrawler(config=browser_config) + await crawler.start() + + results_gen = await crawler.arun_many( + urls=urls, + config=crawler_config, + dispatcher=dispatcher + ) + + return crawler, results_gen + + except Exception as e: + # Make sure to close crawler if started during an error here + if 'crawler' in locals() and crawler.ready: + try: + await crawler.close() + except Exception as close_e: + logger.error(f"Error closing crawler during stream setup exception: {close_e}") + logger.error(f"Stream crawl error: {str(e)}", exc_info=True) + # Raising HTTPException here will prevent streaming response + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=str(e) + ) \ No newline at end of file diff --git a/deploy/docker/api.py b/deploy/docker/api.py index c01696b2..b226682f 100644 --- a/deploy/docker/api.py +++ b/deploy/docker/api.py @@ -40,8 +40,19 @@ from utils import ( decode_redis_hash ) +import psutil, time + logger = logging.getLogger(__name__) +# --- Helper to get memory --- +def _get_memory_mb(): + try: + return psutil.Process().memory_info().rss / (1024 * 1024) + except Exception as e: + logger.warning(f"Could not get memory info: {e}") + return None + + async def handle_llm_qa( url: str, query: str, @@ -351,7 +362,9 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator) try: async for result in results_gen: try: + server_memory_mb = _get_memory_mb() result_dict = result.model_dump() + result_dict['server_memory_mb'] = server_memory_mb logger.info(f"Streaming result for {result_dict.get('url', 'unknown')}") data = json.dumps(result_dict, default=datetime_handler) + "\n" yield data.encode('utf-8') @@ -364,19 +377,25 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator) except asyncio.CancelledError: logger.warning("Client disconnected during streaming") - finally: - try: - await crawler.close() - except Exception as e: - logger.error(f"Crawler cleanup error: {e}") + # finally: + # try: + # await crawler.close() + # except Exception as e: + # logger.error(f"Crawler cleanup error: {e}") async def handle_crawl_request( + crawler: AsyncWebCrawler, urls: List[str], browser_config: dict, crawler_config: dict, config: dict ) -> dict: """Handle non-streaming crawl requests.""" + start_mem_mb = _get_memory_mb() # <--- Get memory before + start_time = time.time() + mem_delta_mb = None + peak_mem_mb = start_mem_mb + try: browser_config = BrowserConfig.load(browser_config) crawler_config = CrawlerRunConfig.load(crawler_config) @@ -388,31 +407,63 @@ async def handle_crawl_request( ) ) - crawler: AsyncWebCrawler = AsyncWebCrawler(config=browser_config) - await crawler.start() + # crawler: AsyncWebCrawler = AsyncWebCrawler(config=browser_config) + # await crawler.start() results = [] func = getattr(crawler, "arun" if len(urls) == 1 else "arun_many") partial_func = partial(func, urls[0] if len(urls) == 1 else urls, config=crawler_config, 
dispatcher=dispatcher) + + # Simulate work being done by the crawler + # logger.debug(f"Request (URLs: {len(urls)}) starting simulated work...") # Add log + # await asyncio.sleep(2) # <--- ADD ARTIFICIAL DELAY (e.g., 0.5 seconds) + # logger.debug(f"Request (URLs: {len(urls)}) finished simulated work.") + results = await partial_func() - await crawler.close() + # await crawler.close() + + end_mem_mb = _get_memory_mb() # <--- Get memory after + end_time = time.time() + + if start_mem_mb is not None and end_mem_mb is not None: + mem_delta_mb = end_mem_mb - start_mem_mb # <--- Calculate delta + peak_mem_mb = max(peak_mem_mb if peak_mem_mb else 0, end_mem_mb) # <--- Get peak memory + logger.info(f"Memory usage: Start: {start_mem_mb} MB, End: {end_mem_mb} MB, Delta: {mem_delta_mb} MB, Peak: {peak_mem_mb} MB") + return { "success": True, - "results": [result.model_dump() for result in results] + "results": [result.model_dump() for result in results], + "server_processing_time_s": end_time - start_time, + "server_memory_delta_mb": mem_delta_mb, + "server_peak_memory_mb": peak_mem_mb } except Exception as e: logger.error(f"Crawl error: {str(e)}", exc_info=True) - if 'crawler' in locals(): - await crawler.close() + # if 'crawler' in locals() and crawler.ready: # Check if crawler was initialized and started + # try: + # await crawler.close() + # except Exception as close_e: + # logger.error(f"Error closing crawler during exception handling: {close_e}") + + # Measure memory even on error if possible + end_mem_mb_error = _get_memory_mb() + if start_mem_mb is not None and end_mem_mb_error is not None: + mem_delta_mb = end_mem_mb_error - start_mem_mb + raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=str(e) + detail=json.dumps({ # Send structured error + "error": str(e), + "server_memory_delta_mb": mem_delta_mb, + "server_peak_memory_mb": max(peak_mem_mb if peak_mem_mb else 0, end_mem_mb_error or 0) + }) ) async def handle_stream_crawl_request( + crawler: AsyncWebCrawler, urls: List[str], browser_config: dict, crawler_config: dict, @@ -421,9 +472,11 @@ async def handle_stream_crawl_request( """Handle streaming crawl requests.""" try: browser_config = BrowserConfig.load(browser_config) - browser_config.verbose = True + # browser_config.verbose = True # Set to False or remove for production stress testing + browser_config.verbose = False crawler_config = CrawlerRunConfig.load(crawler_config) crawler_config.scraping_strategy = LXMLWebScrapingStrategy() + crawler_config.stream = True dispatcher = MemoryAdaptiveDispatcher( memory_threshold_percent=config["crawler"]["memory_threshold_percent"], @@ -432,8 +485,8 @@ async def handle_stream_crawl_request( ) ) - crawler = AsyncWebCrawler(config=browser_config) - await crawler.start() + # crawler = AsyncWebCrawler(config=browser_config) + # await crawler.start() results_gen = await crawler.arun_many( urls=urls, @@ -441,12 +494,19 @@ async def handle_stream_crawl_request( dispatcher=dispatcher ) + # Return the *same* crawler instance and the generator + # The caller (server.py) manages the crawler lifecycle via the pool context return crawler, results_gen except Exception as e: - if 'crawler' in locals(): - await crawler.close() + # Make sure to close crawler if started during an error here + # if 'crawler' in locals() and crawler.ready: + # try: + # await crawler.close() + # except Exception as close_e: + # logger.error(f"Error closing crawler during stream setup exception: {close_e}") logger.error(f"Stream crawl error: {str(e)}", 
exc_info=True) + # Raising HTTPException here will prevent streaming response raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e) diff --git a/deploy/docker/config.yml b/deploy/docker/config.yml index 3b5fead6..17848e99 100644 --- a/deploy/docker/config.yml +++ b/deploy/docker/config.yml @@ -48,6 +48,38 @@ security: content_security_policy: "default-src 'self'" strict_transport_security: "max-age=63072000; includeSubDomains" +# Crawler Pool Configuration +crawler_pool: + enabled: true # Set to false to disable the pool + + # --- Option 1: Auto-calculate size --- + auto_calculate_size: true + calculation_params: + mem_headroom_mb: 512 # Memory reserved for OS/other apps + avg_page_mem_mb: 150 # Estimated MB per concurrent "tab"/page in browsers + fd_per_page: 20 # Estimated file descriptors per page + core_multiplier: 4 # Max crawlers per CPU core + min_pool_size: 2 # Minimum number of primary crawlers + max_pool_size: 16 # Maximum number of primary crawlers + + # --- Option 2: Manual size (ignored if auto_calculate_size is true) --- + # pool_size: 8 + + # --- Other Pool Settings --- + backup_pool_size: 1 # Number of backup crawlers + max_wait_time_s: 30.0 # Max seconds a request waits for a free crawler + throttle_threshold_percent: 70.0 # Start throttling delay above this % usage + throttle_delay_min_s: 0.1 # Min throttle delay + throttle_delay_max_s: 0.5 # Max throttle delay + + # --- Browser Config for Pooled Crawlers --- + browser_config: + # No need for "type": "BrowserConfig" here, just params + headless: true + verbose: false # Keep pool crawlers less verbose in production + # user_agent: "MyPooledCrawler/1.0" # Example + # Add other BrowserConfig params as needed (e.g., proxy, viewport) + # Crawler Configuration crawler: memory_threshold_percent: 95.0 @@ -61,6 +93,8 @@ crawler: logging: level: "INFO" format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + file: "logs/app.log" + verbose: true # Observability Configuration observability: diff --git a/deploy/docker/crawler_manager.py b/deploy/docker/crawler_manager.py new file mode 100644 index 00000000..b566e2d3 --- /dev/null +++ b/deploy/docker/crawler_manager.py @@ -0,0 +1,556 @@ +# crawler_manager.py +import asyncio +import time +import uuid +import psutil +import os +import resource # For FD limit +import random +import math +from typing import Optional, Tuple, Any, List, Dict, AsyncGenerator +from pydantic import BaseModel, Field, field_validator +from contextlib import asynccontextmanager +import logging + +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, AsyncLogger +# Assuming api.py handlers are accessible or refactored slightly if needed +# We might need to import the specific handler functions if we call them directly +# from api import handle_crawl_request, handle_stream_crawl_request, _get_memory_mb, stream_results + +# --- Custom Exceptions --- +class PoolTimeoutError(Exception): + """Raised when waiting for a crawler resource times out.""" + pass + +class PoolConfigurationError(Exception): + """Raised for configuration issues.""" + pass + +class NoHealthyCrawlerError(Exception): + """Raised when no healthy crawler is available.""" + pass + + +# --- Configuration Models --- +class CalculationParams(BaseModel): + mem_headroom_mb: int = 512 + avg_page_mem_mb: int = 150 + fd_per_page: int = 20 + core_multiplier: int = 4 + min_pool_size: int = 1 # Min safe pages should be at least 1 + max_pool_size: int = 16 + + # V2 validation for avg_page_mem_mb + 
@field_validator('avg_page_mem_mb')
+    @classmethod
+    def check_avg_page_mem(cls, v: int) -> int:
+        if v <= 0:
+            raise ValueError("avg_page_mem_mb must be positive")
+        return v
+
+    # V2 validation for fd_per_page
+    @field_validator('fd_per_page')
+    @classmethod
+    def check_fd_per_page(cls, v: int) -> int:
+        if v <= 0:
+            raise ValueError("fd_per_page must be positive")
+        return v
+
+class CrawlerManagerConfig(BaseModel):
+    enabled: bool = True
+    auto_calculate_size: bool = True
+    calculation_params: CalculationParams = Field(default_factory=CalculationParams)  # Use Field for default_factory
+    backup_pool_size: int = Field(1, ge=0)  # Allow 0 backups
+    max_wait_time_s: float = 30.0
+    throttle_threshold_percent: float = Field(70.0, ge=0, le=100)
+    throttle_delay_min_s: float = 0.1
+    throttle_delay_max_s: float = 0.5
+    browser_config: Dict[str, Any] = Field(default_factory=lambda: {"headless": True, "verbose": False})  # Use Field for default_factory
+    primary_reload_delay_s: float = 60.0
+
+# --- Crawler Manager ---
+class CrawlerManager:
+    """Manages shared AsyncWebCrawler instances, concurrency, and failover."""
+
+    def __init__(self, config: CrawlerManagerConfig, logger = None):
+        # Initialize the logger first so it is available even when the manager is disabled
+        if logger is None:
+            self.logger = logging.getLogger(__name__)
+            self.logger.setLevel(logging.INFO)
+        else:
+            self.logger = logger
+
+        if not config.enabled:
+            self.logger.warning("CrawlerManager is disabled by configuration.")
+            # Set defaults to allow server to run, but manager won't function
+            self.config = config
+            self._initialized = False
+            return
+
+        self.config = config
+        self._primary_crawler: Optional[AsyncWebCrawler] = None
+        self._secondary_crawlers: List[AsyncWebCrawler] = []
+        self._active_crawler_index: int = 0  # 0 for primary, 1+ for secondary index
+        self._primary_healthy: bool = False
+        self._secondary_healthy_flags: List[bool] = []
+
+        self._safe_pages: int = 1  # Default, calculated in initialize
+        self._semaphore: Optional[asyncio.Semaphore] = None
+        self._state_lock = asyncio.Lock()  # Protects active_crawler, health flags
+        self._reload_tasks: List[Optional[asyncio.Task]] = []  # Track reload background tasks
+
+        self._initialized = False
+        self._shutting_down = False
+
+        self.logger.info("CrawlerManager initialized with config.")
+        self.logger.debug(f"Config: {self.config.model_dump_json(indent=2)}")
+
+    def is_enabled(self) -> bool:
+        return self.config.enabled and self._initialized
+
+    def _get_system_resources(self) -> Tuple[int, int, int]:
+        """Gets RAM, CPU cores, and FD limit."""
+        total_ram_mb = 0
+        cpu_cores = 0
+        try:
+            mem_info = psutil.virtual_memory()
+            total_ram_mb = mem_info.total 
// (1024 * 1024) + cpu_cores = psutil.cpu_count(logical=False) or psutil.cpu_count(logical=True) # Prefer physical cores + except Exception as e: + self.logger.warning(f"Could not get RAM/CPU info via psutil: {e}") + total_ram_mb = 2048 # Default fallback + cpu_cores = 2 # Default fallback + + fd_limit = 1024 # Default fallback + try: + soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE) + fd_limit = soft_limit # Use the soft limit + except (ImportError, ValueError, OSError, AttributeError) as e: + self.logger.warning(f"Could not get file descriptor limit (common on Windows): {e}. Using default: {fd_limit}") + + self.logger.info(f"System Resources: RAM={total_ram_mb}MB, Cores={cpu_cores}, FD Limit={fd_limit}") + return total_ram_mb, cpu_cores, fd_limit + + def _calculate_safe_pages(self) -> int: + """Calculates the safe number of concurrent pages based on resources.""" + if not self.config.auto_calculate_size: + # If auto-calc is off, use max_pool_size as the hard limit + # This isn't ideal based on the prompt, but provides *some* manual override + # A dedicated `manual_safe_pages` might be better. Let's use max_pool_size for now. + self.logger.warning("Auto-calculation disabled. Using max_pool_size as safe_pages limit.") + return self.config.calculation_params.max_pool_size + + params = self.config.calculation_params + total_ram_mb, cpu_cores, fd_limit = self._get_system_resources() + + available_ram_mb = total_ram_mb - params.mem_headroom_mb + if available_ram_mb <= 0: + self.logger.error(f"Not enough RAM ({total_ram_mb}MB) after headroom ({params.mem_headroom_mb}MB). Cannot calculate safe pages.") + return params.min_pool_size # Fallback to minimum + + try: + # Calculate limits from each resource + mem_limit = available_ram_mb // params.avg_page_mem_mb if params.avg_page_mem_mb > 0 else float('inf') + fd_limit_pages = fd_limit // params.fd_per_page if params.fd_per_page > 0 else float('inf') + cpu_limit = cpu_cores * params.core_multiplier if cpu_cores > 0 else float('inf') + + # Determine the most constraining limit + calculated_limit = math.floor(min(mem_limit, fd_limit_pages, cpu_limit)) + + except ZeroDivisionError: + self.logger.error("Division by zero in safe_pages calculation (avg_page_mem_mb or fd_per_page is zero).") + calculated_limit = params.min_pool_size # Fallback + + # Clamp the result within min/max bounds + safe_pages = max(params.min_pool_size, min(calculated_limit, params.max_pool_size)) + + self.logger.info(f"Calculated safe pages: MemoryLimit={mem_limit}, FDLimit={fd_limit_pages}, CPULimit={cpu_limit} -> RawCalc={calculated_limit} -> Clamped={safe_pages}") + return safe_pages + + async def _create_and_start_crawler(self, crawler_id: str) -> Optional[AsyncWebCrawler]: + """Creates, starts, and returns a crawler instance.""" + try: + # Create BrowserConfig from the dictionary in manager config + browser_conf = BrowserConfig(**self.config.browser_config) + crawler = AsyncWebCrawler(config=browser_conf) + await crawler.start() + self.logger.info(f"Successfully started crawler instance: {crawler_id}") + return crawler + except Exception as e: + self.logger.error(f"Failed to start crawler instance {crawler_id}: {e}", exc_info=True) + return None + + async def initialize(self): + """Initializes crawlers and semaphore. 
Called at server startup.""" + if not self.config.enabled or self._initialized: + return + + self.logger.info("Initializing CrawlerManager...") + self._safe_pages = self._calculate_safe_pages() + self._semaphore = asyncio.Semaphore(self._safe_pages) + + self._primary_crawler = await self._create_and_start_crawler("Primary") + if self._primary_crawler: + self._primary_healthy = True + else: + self._primary_healthy = False + self.logger.critical("Primary crawler failed to initialize!") + + self._secondary_crawlers = [] + self._secondary_healthy_flags = [] + self._reload_tasks = [None] * (1 + self.config.backup_pool_size) # For primary + backups + + for i in range(self.config.backup_pool_size): + sec_id = f"Secondary-{i+1}" + crawler = await self._create_and_start_crawler(sec_id) + self._secondary_crawlers.append(crawler) # Add even if None + self._secondary_healthy_flags.append(crawler is not None) + if crawler is None: + self.logger.error(f"{sec_id} crawler failed to initialize!") + + # Set initial active crawler (prefer primary) + if self._primary_healthy: + self._active_crawler_index = 0 + self.logger.info("Primary crawler is active.") + else: + # Find the first healthy secondary + found_healthy_backup = False + for i, healthy in enumerate(self._secondary_healthy_flags): + if healthy: + self._active_crawler_index = i + 1 # 1-based index for secondaries + self.logger.warning(f"Primary failed, Secondary-{i+1} is active.") + found_healthy_backup = True + break + if not found_healthy_backup: + self.logger.critical("FATAL: No healthy crawlers available after initialization!") + # Server should probably refuse connections in this state + + self._initialized = True + self.logger.info(f"CrawlerManager initialized. Safe Pages: {self._safe_pages}. Active Crawler Index: {self._active_crawler_index}") + + async def shutdown(self): + """Shuts down all crawler instances. 
Called at server shutdown.""" + if not self._initialized or self._shutting_down: + return + + self._shutting_down = True + self.logger.info("Shutting down CrawlerManager...") + + # Cancel any ongoing reload tasks + for i, task in enumerate(self._reload_tasks): + if task and not task.done(): + try: + task.cancel() + await task # Wait for cancellation + self.logger.info(f"Cancelled reload task for crawler index {i}.") + except asyncio.CancelledError: + self.logger.info(f"Reload task for crawler index {i} was already cancelled.") + except Exception as e: + self.logger.warning(f"Error cancelling reload task for crawler index {i}: {e}") + self._reload_tasks = [] + + + # Close primary + if self._primary_crawler: + try: + self.logger.info("Closing primary crawler...") + await self._primary_crawler.close() + self._primary_crawler = None + except Exception as e: + self.logger.error(f"Error closing primary crawler: {e}", exc_info=True) + + # Close secondaries + for i, crawler in enumerate(self._secondary_crawlers): + if crawler: + try: + self.logger.info(f"Closing secondary crawler {i+1}...") + await crawler.close() + except Exception as e: + self.logger.error(f"Error closing secondary crawler {i+1}: {e}", exc_info=True) + self._secondary_crawlers = [] + + self._initialized = False + self.logger.info("CrawlerManager shut down complete.") + + @asynccontextmanager + async def get_crawler(self) -> AsyncGenerator[AsyncWebCrawler, None]: + """Acquires semaphore, yields active crawler, handles throttling & failover.""" + if not self.is_enabled(): + raise NoHealthyCrawlerError("CrawlerManager is disabled or not initialized.") + + if self._shutting_down: + raise NoHealthyCrawlerError("CrawlerManager is shutting down.") + + active_crawler: Optional[AsyncWebCrawler] = None + acquired = False + request_id = uuid.uuid4() + start_wait = time.time() + + # --- Throttling --- + try: + # Check semaphore value without acquiring + current_usage = self._safe_pages - self._semaphore._value + usage_percent = (current_usage / self._safe_pages) * 100 if self._safe_pages > 0 else 0 + + if usage_percent >= self.config.throttle_threshold_percent: + delay = random.uniform(self.config.throttle_delay_min_s, self.config.throttle_delay_max_s) + self.logger.debug(f"Throttling: Usage {usage_percent:.1f}% >= {self.config.throttle_threshold_percent}%. Delaying {delay:.3f}s") + await asyncio.sleep(delay) + except Exception as e: + self.logger.warning(f"Error during throttling check: {e}") # Continue attempt even if throttle check fails + + # --- Acquire Semaphore --- + try: + # self.logger.debug(f"Attempting to acquire semaphore (Available: {self._semaphore._value}/{self._safe_pages}). Wait Timeout: {self.config.max_wait_time_s}s") + + # --- Logging Before Acquire --- + sem_value = self._semaphore._value if self._semaphore else 'N/A' + sem_waiters = len(self._semaphore._waiters) if self._semaphore and self._semaphore._waiters else 0 + self.logger.debug(f"Req {request_id}: Attempting acquire. Available={sem_value}/{self._safe_pages}, Waiters={sem_waiters}, Timeout={self.config.max_wait_time_s}s") + + await asyncio.wait_for( + self._semaphore.acquire(), timeout=self.config.max_wait_time_s + ) + acquired = True + wait_duration = time.time() - start_wait + if wait_duration > 1: + self.logger.warning(f"Semaphore acquired after {wait_duration:.3f}s. (Available: {self._semaphore._value}/{self._safe_pages})") + + self.logger.debug(f"Semaphore acquired successfully after {wait_duration:.3f}s. 
(Available: {self._semaphore._value}/{self._safe_pages})") + + # --- Select Active Crawler (Critical Section) --- + async with self._state_lock: + current_active_index = self._active_crawler_index + is_primary_active = (current_active_index == 0) + + if is_primary_active: + if self._primary_healthy and self._primary_crawler: + active_crawler = self._primary_crawler + else: + # Primary is supposed to be active but isn't healthy + self.logger.warning("Primary crawler unhealthy, attempting immediate failover...") + if not await self._try_failover_sync(): # Try to switch active crawler NOW + raise NoHealthyCrawlerError("Primary unhealthy and no healthy backup available.") + # If failover succeeded, active_crawler_index is updated + current_active_index = self._active_crawler_index + # Fall through to select the new active secondary + + # Check if we need to use a secondary (either initially or after failover) + if current_active_index > 0: + secondary_idx = current_active_index - 1 + if secondary_idx < len(self._secondary_crawlers) and \ + self._secondary_healthy_flags[secondary_idx] and \ + self._secondary_crawlers[secondary_idx]: + active_crawler = self._secondary_crawlers[secondary_idx] + else: + self.logger.error(f"Selected Secondary-{current_active_index} is unhealthy or missing.") + # Attempt failover to *another* secondary if possible? (Adds complexity) + # For now, raise error if the selected one isn't good. + raise NoHealthyCrawlerError(f"Selected Secondary-{current_active_index} is unavailable.") + + if active_crawler is None: + # This shouldn't happen if logic above is correct, but safeguard + raise NoHealthyCrawlerError("Failed to select a healthy active crawler.") + + # --- Yield Crawler --- + try: + yield active_crawler + except Exception as crawl_error: + self.logger.error(f"Error during crawl execution using {active_crawler}: {crawl_error}", exc_info=True) + # Determine if this error warrants failover + # For now, let's assume any exception triggers a health check/failover attempt + await self._handle_crawler_failure(active_crawler) + raise # Re-raise the original error for the API handler + + except asyncio.TimeoutError: + self.logger.warning(f"Timeout waiting for semaphore after {self.config.max_wait_time_s}s.") + raise PoolTimeoutError(f"Timed out waiting for available crawler resource after {self.config.max_wait_time_s}s") + except NoHealthyCrawlerError: + # Logged within the selection logic + raise # Re-raise for API handler + except Exception as e: + self.logger.error(f"Unexpected error in get_crawler context manager: {e}", exc_info=True) + raise # Re-raise potentially unknown errors + finally: + if acquired: + self._semaphore.release() + self.logger.debug(f"Semaphore released. (Available: {self._semaphore._value}/{self._safe_pages})") + + + async def _try_failover_sync(self) -> bool: + """Synchronous part of failover logic (must be called under state_lock). 
Finds next healthy secondary.""" + if not self._primary_healthy: # Only failover if primary is already marked down + found_healthy_backup = False + start_idx = (self._active_crawler_index % (self.config.backup_pool_size +1)) # Start check after current + for i in range(self.config.backup_pool_size): + check_idx = (start_idx + i) % self.config.backup_pool_size # Circular check + if self._secondary_healthy_flags[check_idx] and self._secondary_crawlers[check_idx]: + self._active_crawler_index = check_idx + 1 + self.logger.warning(f"Failover successful: Switched active crawler to Secondary-{self._active_crawler_index}") + found_healthy_backup = True + break # Found one + if not found_healthy_backup: + # If primary is down AND no backups are healthy, mark primary as active index (0) but it's still unhealthy + self._active_crawler_index = 0 + self.logger.error("Failover failed: No healthy secondary crawlers available.") + return False + return True + return True # Primary is healthy, no failover needed + + async def _handle_crawler_failure(self, failed_crawler: AsyncWebCrawler): + """Handles marking a crawler as unhealthy and initiating recovery.""" + if self._shutting_down: return # Don't handle failures during shutdown + + async with self._state_lock: + crawler_index = -1 + is_primary = False + + if failed_crawler is self._primary_crawler and self._primary_healthy: + self.logger.warning("Primary crawler reported failure.") + self._primary_healthy = False + is_primary = True + crawler_index = 0 + # Try immediate failover within the lock + await self._try_failover_sync() + # Start reload task if not already running for primary + if self._reload_tasks[0] is None or self._reload_tasks[0].done(): + self.logger.info("Initiating primary crawler reload task.") + self._reload_tasks[0] = asyncio.create_task(self._reload_crawler(0)) + + else: + # Check if it was one of the secondaries + for i, crawler in enumerate(self._secondary_crawlers): + if failed_crawler is crawler and self._secondary_healthy_flags[i]: + self.logger.warning(f"Secondary-{i+1} crawler reported failure.") + self._secondary_healthy_flags[i] = False + is_primary = False + crawler_index = i + 1 + # If this *was* the active crawler, trigger failover check + if self._active_crawler_index == crawler_index: + self.logger.warning(f"Active secondary {crawler_index} failed, attempting failover...") + await self._try_failover_sync() + # Start reload task for this secondary + if self._reload_tasks[crawler_index] is None or self._reload_tasks[crawler_index].done(): + self.logger.info(f"Initiating Secondary-{i+1} crawler reload task.") + self._reload_tasks[crawler_index] = asyncio.create_task(self._reload_crawler(crawler_index)) + break # Found the failed secondary + + if crawler_index == -1: + self.logger.debug("Failure reported by an unknown or already unhealthy crawler instance. Ignoring.") + + + async def _reload_crawler(self, crawler_index_to_reload: int): + """Background task to close, recreate, and start a specific crawler.""" + is_primary = (crawler_index_to_reload == 0) + crawler_id = "Primary" if is_primary else f"Secondary-{crawler_index_to_reload}" + original_crawler = self._primary_crawler if is_primary else self._secondary_crawlers[crawler_index_to_reload - 1] + + self.logger.info(f"Starting reload process for {crawler_id}...") + + # 1. Delay before attempting reload (e.g., allow transient issues to clear) + if not is_primary: # Maybe shorter delay for backups? 
+ await asyncio.sleep(self.config.primary_reload_delay_s / 2) + else: + await asyncio.sleep(self.config.primary_reload_delay_s) + + + # 2. Attempt to close the old instance cleanly + if original_crawler: + try: + self.logger.info(f"Attempting to close existing {crawler_id} instance...") + await original_crawler.close() + self.logger.info(f"Successfully closed old {crawler_id} instance.") + except Exception as e: + self.logger.warning(f"Error closing old {crawler_id} instance during reload: {e}") + + # 3. Create and start a new instance + self.logger.info(f"Attempting to start new {crawler_id} instance...") + new_crawler = await self._create_and_start_crawler(crawler_id) + + # 4. Update state if successful + async with self._state_lock: + if new_crawler: + self.logger.info(f"Successfully reloaded {crawler_id}. Marking as healthy.") + if is_primary: + self._primary_crawler = new_crawler + self._primary_healthy = True + # Switch back to primary if no other failures occurred + # Check if ANY secondary is currently active + secondary_is_active = self._active_crawler_index > 0 + if not secondary_is_active or not self._secondary_healthy_flags[self._active_crawler_index - 1]: + self.logger.info("Switching active crawler back to primary.") + self._active_crawler_index = 0 + else: # Is secondary + secondary_idx = crawler_index_to_reload - 1 + self._secondary_crawlers[secondary_idx] = new_crawler + self._secondary_healthy_flags[secondary_idx] = True + # Potentially switch back if primary is still down and this was needed? + if not self._primary_healthy and self._active_crawler_index == 0: + self.logger.info(f"Primary still down, activating reloaded Secondary-{crawler_index_to_reload}.") + self._active_crawler_index = crawler_index_to_reload + + else: + self.logger.error(f"Failed to reload {crawler_id}. 
It remains unhealthy.") + # Keep the crawler marked as unhealthy + if is_primary: + self._primary_healthy = False # Ensure it stays false + else: + self._secondary_healthy_flags[crawler_index_to_reload - 1] = False + + + # Clear the reload task reference for this index + self._reload_tasks[crawler_index_to_reload] = None + + + async def get_status(self) -> Dict: + """Returns the current status of the manager.""" + if not self.is_enabled(): + return {"status": "disabled"} + + async with self._state_lock: + active_id = "Primary" if self._active_crawler_index == 0 else f"Secondary-{self._active_crawler_index}" + primary_status = "Healthy" if self._primary_healthy else "Unhealthy" + secondary_statuses = [f"Secondary-{i+1}: {'Healthy' if healthy else 'Unhealthy'}" + for i, healthy in enumerate(self._secondary_healthy_flags)] + semaphore_available = self._semaphore._value if self._semaphore else 'N/A' + semaphore_locked = len(self._semaphore._waiters) if self._semaphore and self._semaphore._waiters else 0 + + return { + "status": "enabled", + "safe_pages": self._safe_pages, + "semaphore_available": semaphore_available, + "semaphore_waiters": semaphore_locked, + "active_crawler": active_id, + "primary_status": primary_status, + "secondary_statuses": secondary_statuses, + "reloading_tasks": [i for i, t in enumerate(self._reload_tasks) if t and not t.done()] + } \ No newline at end of file diff --git a/deploy/docker/server.py b/deploy/docker/server.py index edb55130..f577348b 100644 --- a/deploy/docker/server.py +++ b/deploy/docker/server.py @@ -1,8 +1,20 @@ +# Import from auth.py +from auth import create_access_token, get_token_dependency, TokenRequest +from api import ( + handle_markdown_request, + handle_llm_qa, + handle_stream_crawl_request, + handle_crawl_request, + stream_results, + _get_memory_mb +) +from utils import FilterType, load_config, setup_logging, verify_email_domain import os import sys import time -from typing import List, Optional, Dict -from fastapi import FastAPI, HTTPException, Request, Query, Path, Depends +from typing import List, Optional, Dict, AsyncGenerator +from contextlib import asynccontextmanager +from fastapi import FastAPI, HTTPException, Request, Query, Path, Depends, status from fastapi.responses import StreamingResponse, RedirectResponse, PlainTextResponse, JSONResponse from fastapi.middleware.httpsredirect import HTTPSRedirectMiddleware from fastapi.middleware.trustedhost import TrustedHostMiddleware @@ -11,28 +23,39 @@ from slowapi import Limiter from slowapi.util import get_remote_address from prometheus_fastapi_instrumentator import Instrumentator from redis import asyncio as aioredis +from crawl4ai import ( + BrowserConfig, + CrawlerRunConfig, + AsyncLogger +) + +from crawler_manager import ( + CrawlerManager, + CrawlerManagerConfig, + PoolTimeoutError, + NoHealthyCrawlerError +) + sys.path.append(os.path.dirname(os.path.realpath(__file__))) -from utils import FilterType, load_config, setup_logging, verify_email_domain -from api import ( - handle_markdown_request, - handle_llm_qa, - handle_stream_crawl_request, - handle_crawl_request, - stream_results -) -from auth import create_access_token, get_token_dependency, TokenRequest # Import from auth.py __version__ = "0.2.6" + class CrawlRequest(BaseModel): urls: List[str] = Field(min_length=1, max_length=100) browser_config: Optional[Dict] = Field(default_factory=dict) crawler_config: Optional[Dict] = Field(default_factory=dict) + # Load configuration and setup config = load_config() setup_logging(config) 
+logger = AsyncLogger( + log_file=config["logging"].get("log_file", "app.log"), + verbose=config["logging"].get("verbose", False), + tag_width=10, +) # Initialize Redis redis = aioredis.from_url(config["redis"].get("uri", "redis://localhost")) @@ -44,9 +67,43 @@ limiter = Limiter( storage_uri=config["rate_limiting"]["storage_uri"] ) +# --- Initialize Manager (will be done in lifespan) --- +# Load manager config from the main config +manager_config_dict = config.get("crawler_pool", {}) +# Use Pydantic to parse and validate +manager_config = CrawlerManagerConfig(**manager_config_dict) +crawler_manager = CrawlerManager(config=manager_config, logger=logger) + +# --- FastAPI App and Lifespan --- + + +@asynccontextmanager +async def lifespan(app: FastAPI): + # Startup + logger.info("Starting up the server...") + if manager_config.enabled: + logger.info("Initializing Crawler Manager...") + await crawler_manager.initialize() + app.state.crawler_manager = crawler_manager # Store manager in app state + logger.info("Crawler Manager is enabled.") + else: + logger.warning("Crawler Manager is disabled.") + app.state.crawler_manager = None # Indicate disabled state + + yield # Server runs here + + # Shutdown + logger.info("Shutting down server...") + if app.state.crawler_manager: + logger.info("Shutting down Crawler Manager...") + await app.state.crawler_manager.shutdown() + logger.info("Crawler Manager shut down.") + logger.info("Server shut down.") + app = FastAPI( title=config["app"]["title"], - version=config["app"]["version"] + version=config["app"]["version"], + lifespan=lifespan, ) # Configure middleware @@ -56,7 +113,9 @@ def setup_security_middleware(app, config): if sec_config.get("https_redirect", False): app.add_middleware(HTTPSRedirectMiddleware) if sec_config.get("trusted_hosts", []) != ["*"]: - app.add_middleware(TrustedHostMiddleware, allowed_hosts=sec_config["trusted_hosts"]) + app.add_middleware(TrustedHostMiddleware, + allowed_hosts=sec_config["trusted_hosts"]) + setup_security_middleware(app, config) @@ -68,6 +127,8 @@ if config["observability"]["prometheus"]["enabled"]: token_dependency = get_token_dependency(config) # Middleware for security headers + + @app.middleware("http") async def add_security_headers(request: Request, call_next): response = await call_next(request) @@ -75,7 +136,24 @@ async def add_security_headers(request: Request, call_next): response.headers.update(config["security"]["headers"]) return response + +async def get_manager() -> CrawlerManager: + # Ensure manager exists and is enabled before yielding + if not hasattr(app.state, 'crawler_manager') or app.state.crawler_manager is None: + raise HTTPException( + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, + detail="Crawler service is disabled or not initialized" + ) + if not app.state.crawler_manager.is_enabled(): + raise HTTPException( + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, + detail="Crawler service is currently disabled" + ) + return app.state.crawler_manager + # Token endpoint (always available, but usage depends on config) + + @app.post("/token") async def get_token(request_data: TokenRequest): if not verify_email_domain(request_data.email): @@ -84,6 +162,8 @@ async def get_token(request_data: TokenRequest): return {"email": request_data.email, "access_token": token, "token_type": "bearer"} # Endpoints with conditional auth + + @app.get("/md/{url:path}") @limiter.limit(config["rate_limiting"]["default_limit"]) async def get_markdown( @@ -97,6 +177,7 @@ async def get_markdown( result = await 
handle_markdown_request(url, f, q, c, config)
     return PlainTextResponse(result)
 
+
 @app.get("/llm/{url:path}", description="URL should be without http/https prefix")
 async def llm_endpoint(
     request: Request,
@@ -105,7 +186,8 @@ async def llm_endpoint(
     token_data: Optional[Dict] = Depends(token_dependency)
 ):
     if not q:
-        raise HTTPException(status_code=400, detail="Query parameter 'q' is required")
+        raise HTTPException(
+            status_code=400, detail="Query parameter 'q' is required")
     if not url.startswith(('http://', 'https://')):
         url = 'https://' + url
     try:
@@ -114,37 +196,89 @@ async def llm_endpoint(
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 
+
 @app.get("/schema")
 async def get_schema():
     from crawl4ai import BrowserConfig, CrawlerRunConfig
     return {"browser": BrowserConfig().dump(), "crawler": CrawlerRunConfig().dump()}
 
+
 @app.get(config["observability"]["health_check"]["endpoint"])
 async def health():
     return {"status": "ok", "timestamp": time.time(), "version": __version__}
 
+
 @app.get(config["observability"]["prometheus"]["endpoint"])
 async def metrics():
     return RedirectResponse(url=config["observability"]["prometheus"]["endpoint"])
 
+
+@app.get("/browsers")
+# Optional dependency
+async def browsers_status(manager: Optional[CrawlerManager] = Depends(get_manager, use_cache=False)):
+    base_status = {"status": "ok", "timestamp": time.time(),
+                   "version": __version__}
+    if manager:
+        try:
+            manager_status = await manager.get_status()
+            base_status["crawler_manager"] = manager_status
+        except Exception as e:
+            base_status["crawler_manager"] = {
+                "status": "error", "detail": str(e)}
+    else:
+        base_status["crawler_manager"] = {"status": "disabled"}
+    return base_status
+
+
 @app.post("/crawl")
 @limiter.limit(config["rate_limiting"]["default_limit"])
 async def crawl(
     request: Request,
     crawl_request: CrawlRequest,
-    token_data: Optional[Dict] = Depends(token_dependency)
+    manager: CrawlerManager = Depends(get_manager),  # Use dependency
+    token_data: Optional[Dict] = Depends(token_dependency)  # Keep auth
 ):
     if not crawl_request.urls:
-        raise HTTPException(status_code=400, detail="At least one URL required")
-
-    results = await handle_crawl_request(
-        urls=crawl_request.urls,
-        browser_config=crawl_request.browser_config,
-        crawler_config=crawl_request.crawler_config,
-        config=config
-    )
+        raise HTTPException(
+            status_code=400, detail="At least one URL required")
 
-    return JSONResponse(results)
+    try:
+        # Use the manager's context to get a crawler instance
+        async with manager.get_crawler() as active_crawler:
+            # Call the actual handler from api.py, passing the acquired crawler
+            results_dict = await handle_crawl_request(
+                crawler=active_crawler,  # Pass the live crawler instance
+                urls=crawl_request.urls,
+                # Pass user-provided configs, these might override pool defaults if needed
+                # Or the manager/handler could decide how to merge them
+                browser_config=crawl_request.browser_config or {},  # Ensure dict
+                crawler_config=crawl_request.crawler_config or {},  # Ensure dict
+                config=config  # Pass the global server config
+            )
+            return JSONResponse(results_dict)
+
+    except PoolTimeoutError as e:
+        logger.warning(f"Request rejected due to pool timeout: {e}")
+        raise HTTPException(
+            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,  # Or 429
+            detail=f"Crawler resources busy. Please try again later. 
Timeout: {e}" + ) + except NoHealthyCrawlerError as e: + logger.error(f"Request failed as no healthy crawler available: {e}") + raise HTTPException( + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, + detail=f"Crawler service temporarily unavailable: {e}" + ) + except HTTPException: # Re-raise HTTP exceptions from handler + raise + except Exception as e: + logger.error( + f"Unexpected error during batch crawl processing: {e}", exc_info=True) + # Return generic error, details might be logged by handle_crawl_request + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"An unexpected error occurred: {e}" + ) @app.post("/crawl/stream") @@ -152,23 +286,114 @@ async def crawl( async def crawl_stream( request: Request, crawl_request: CrawlRequest, + manager: CrawlerManager = Depends(get_manager), token_data: Optional[Dict] = Depends(token_dependency) ): if not crawl_request.urls: - raise HTTPException(status_code=400, detail="At least one URL required") + raise HTTPException( + status_code=400, detail="At least one URL required") - crawler, results_gen = await handle_stream_crawl_request( - urls=crawl_request.urls, - browser_config=crawl_request.browser_config, - crawler_config=crawl_request.crawler_config, - config=config - ) + try: + # THIS IS A BIT WORK OF ART RATHER THAN ENGINEERING + # Acquire the crawler context from the manager + # IMPORTANT: The context needs to be active for the *duration* of the stream + # This structure might be tricky with FastAPI's StreamingResponse which consumes + # the generator *after* the endpoint function returns. - return StreamingResponse( - stream_results(crawler, results_gen), - media_type='application/x-ndjson', - headers={'Cache-Control': 'no-cache', 'Connection': 'keep-alive', 'X-Stream-Status': 'active'} - ) + # --- Option A: Acquire crawler, pass to handler, handler yields --- + # (Requires handler NOT to be async generator itself, but return one) + # async with manager.get_crawler() as active_crawler: + # # Handler returns the generator + # _, results_gen = await handle_stream_crawl_request( + # crawler=active_crawler, + # urls=crawl_request.urls, + # browser_config=crawl_request.browser_config or {}, + # crawler_config=crawl_request.crawler_config or {}, + # config=config + # ) + # # PROBLEM: `active_crawler` context exits before StreamingResponse uses results_gen + # # This releases the semaphore too early. + + # --- Option B: Pass manager to handler, handler uses context internally --- + # (Requires modifying handle_stream_crawl_request signature/logic) + # This seems cleaner. Let's assume api.py is adapted for this. + # We need a way for the generator yielded by stream_results to know when + # to release the semaphore. 
+ + # --- Option C: Create a wrapper generator that handles context --- + async def stream_wrapper(manager: CrawlerManager, crawl_request: CrawlRequest, config: dict) -> AsyncGenerator[bytes, None]: + active_crawler = None + try: + async with manager.get_crawler() as acquired_crawler: + active_crawler = acquired_crawler # Keep reference for cleanup + # Call the handler which returns the raw result generator + _crawler_ref, results_gen = await handle_stream_crawl_request( + crawler=acquired_crawler, + urls=crawl_request.urls, + browser_config=crawl_request.browser_config or {}, + crawler_config=crawl_request.crawler_config or {}, + config=config + ) + # Use the stream_results utility to format and yield + async for data_bytes in stream_results(_crawler_ref, results_gen): + yield data_bytes + except (PoolTimeoutError, NoHealthyCrawlerError) as e: + # Yield a final error message in the stream + error_payload = {"status": "error", "detail": str(e)} + yield (json.dumps(error_payload) + "\n").encode('utf-8') + logger.warning(f"Stream request failed: {e}") + # Re-raise might be better if StreamingResponse handles it? Test needed. + except HTTPException as e: # Catch HTTP exceptions from handler setup + error_payload = {"status": "error", + "detail": e.detail, "status_code": e.status_code} + yield (json.dumps(error_payload) + "\n").encode('utf-8') + logger.warning( + f"Stream request failed with HTTPException: {e.detail}") + except Exception as e: + error_payload = {"status": "error", + "detail": f"Unexpected stream error: {e}"} + yield (json.dumps(error_payload) + "\n").encode('utf-8') + logger.error( + f"Unexpected error during stream processing: {e}", exc_info=True) + # finally: + # Ensure crawler cleanup if stream_results doesn't handle it? + # stream_results *should* call crawler.close(), but only on the + # instance it received. If we pass the *manager* instead, this gets complex. + # Let's stick to passing the acquired_crawler and rely on stream_results. + + # Create the generator using the wrapper + streaming_generator = stream_wrapper(manager, crawl_request, config) + + return StreamingResponse( + streaming_generator, # Use the wrapper + media_type='application/x-ndjson', + headers={'Cache-Control': 'no-cache', + 'Connection': 'keep-alive', 'X-Stream-Status': 'active'} + ) + + except (PoolTimeoutError, NoHealthyCrawlerError) as e: + # These might occur if get_crawler fails *before* stream starts + # Or if the wrapper re-raises them. + logger.warning(f"Stream request rejected before starting: {e}") + status_code = status.HTTP_503_SERVICE_UNAVAILABLE # Or 429 for timeout + # Don't raise HTTPException here, let the wrapper yield the error message. + # If we want to return a non-200 initial status, need more complex handling. + # Return an *empty* stream with error headers? Or just let wrapper yield error. 
+ + async def _error_stream(): + error_payload = {"status": "error", "detail": str(e)} + yield (json.dumps(error_payload) + "\n").encode('utf-8') + return StreamingResponse(_error_stream(), status_code=status_code, media_type='application/x-ndjson') + + except HTTPException: # Re-raise HTTP exceptions from setup + raise + except Exception as e: + logger.error( + f"Unexpected error setting up stream crawl: {e}", exc_info=True) + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"An unexpected error occurred setting up the stream: {e}" + ) if __name__ == "__main__": import uvicorn @@ -178,4 +403,4 @@ if __name__ == "__main__": port=config["app"]["port"], reload=config["app"]["reload"], timeout_keep_alive=config["app"]["timeout_keep_alive"] - ) \ No newline at end of file + ) diff --git a/tests/memory/test_stress_api.py b/tests/memory/test_stress_api.py new file mode 100644 index 00000000..232964c1 --- /dev/null +++ b/tests/memory/test_stress_api.py @@ -0,0 +1,516 @@ +#!/usr/bin/env python3 +""" +Stress test for Crawl4AI's Docker API server (/crawl and /crawl/stream endpoints). + +This version targets a running Crawl4AI API server, sending concurrent requests +to test its ability to handle multiple crawl jobs simultaneously. +It uses httpx for async HTTP requests and logs results per batch of requests, +including server-side memory usage reported by the API. +""" + +import asyncio +import time +import uuid +import argparse +import json +import sys +import os +import shutil +from typing import List, Dict, Optional, Union, AsyncGenerator, Tuple +import httpx +import pathlib # Import pathlib explicitly +from rich.console import Console +from rich.panel import Panel +from rich.syntax import Syntax + +# --- Constants --- +# DEFAULT_API_URL = "http://localhost:11235" # Default port +DEFAULT_API_URL = "http://localhost:8020" # Default port +DEFAULT_URL_COUNT = 1000 +DEFAULT_MAX_CONCURRENT_REQUESTS = 5 +DEFAULT_CHUNK_SIZE = 10 +DEFAULT_REPORT_PATH = "reports_api" +DEFAULT_STREAM_MODE = False +REQUEST_TIMEOUT = 180.0 + +# Initialize Rich console +console = Console() + +# --- API Health Check (Unchanged) --- +async def check_server_health(client: httpx.AsyncClient, health_endpoint: str = "/health"): + """Check if the API server is healthy.""" + console.print(f"[bold cyan]Checking API server health at {client.base_url}{health_endpoint}...[/]", end="") + try: + response = await client.get(health_endpoint, timeout=10.0) + response.raise_for_status() + health_data = response.json() + version = health_data.get('version', 'N/A') + console.print(f"[bold green] Server OK! 
Version: {version}[/]") + return True + except (httpx.RequestError, httpx.HTTPStatusError) as e: + console.print(f"\n[bold red]Server health check FAILED:[/]") + console.print(f"Error: {e}") + console.print(f"Is the server running and accessible at {client.base_url}?") + return False + except Exception as e: + console.print(f"\n[bold red]An unexpected error occurred during health check:[/]") + console.print(e) + return False + +# --- API Stress Test Class --- +class ApiStressTest: + """Orchestrates the stress test by sending concurrent requests to the API.""" + + def __init__( + self, + api_url: str, + url_count: int, + max_concurrent_requests: int, + chunk_size: int, + report_path: str, + stream_mode: bool, + ): + self.api_base_url = api_url.rstrip('/') + self.url_count = url_count + self.max_concurrent_requests = max_concurrent_requests + self.chunk_size = chunk_size + self.report_path = pathlib.Path(report_path) + self.report_path.mkdir(parents=True, exist_ok=True) + self.stream_mode = stream_mode + + self.test_id = time.strftime("%Y%m%d_%H%M%S") + self.results_summary = { + "test_id": self.test_id, "api_url": api_url, "url_count": url_count, + "max_concurrent_requests": max_concurrent_requests, "chunk_size": chunk_size, + "stream_mode": stream_mode, "start_time": "", "end_time": "", + "total_time_seconds": 0, "successful_requests": 0, "failed_requests": 0, + "successful_urls": 0, "failed_urls": 0, "total_urls_processed": 0, + "total_api_calls": 0, + "server_memory_metrics": { # To store aggregated server memory info + "batch_mode_avg_delta_mb": None, + "batch_mode_max_delta_mb": None, + "stream_mode_avg_max_snapshot_mb": None, + "stream_mode_max_max_snapshot_mb": None, + "samples": [] # Store individual request memory results + } + } + self.http_client = httpx.AsyncClient(base_url=self.api_base_url, timeout=REQUEST_TIMEOUT, limits=httpx.Limits(max_connections=max_concurrent_requests + 5, max_keepalive_connections=max_concurrent_requests)) + + async def close_client(self): + """Close the httpx client.""" + await self.http_client.aclose() + + async def run(self) -> Dict: + """Run the API stress test.""" + # No client memory tracker needed + urls_to_process = [f"https://httpbin.org/anything/{uuid.uuid4()}" for _ in range(self.url_count)] + url_chunks = [urls_to_process[i:i+self.chunk_size] for i in range(0, len(urls_to_process), self.chunk_size)] + + self.results_summary["start_time"] = time.strftime("%Y-%m-%d %H:%M:%S") + start_time = time.time() + + console.print(f"\n[bold cyan]Crawl4AI API Stress Test - {self.url_count} URLs, {self.max_concurrent_requests} concurrent requests[/bold cyan]") + console.print(f"[bold cyan]Target API:[/bold cyan] {self.api_base_url}, [bold cyan]Mode:[/bold cyan] {'Streaming' if self.stream_mode else 'Batch'}, [bold cyan]URLs per Request:[/bold cyan] {self.chunk_size}") + # Removed client memory log + + semaphore = asyncio.Semaphore(self.max_concurrent_requests) + + # Updated Batch logging header + console.print("\n[bold]API Request Batch Progress:[/bold]") + # Adjusted spacing and added Peak + console.print("[bold] Batch | Progress | SrvMem Peak / Δ|Max (MB) | Reqs/sec | S/F URLs | Time (s) | Status [/bold]") + # Adjust separator length if needed, looks okay for now + console.print("─" * 95) + + # No client memory monitor task needed + + tasks = [] + total_api_calls = len(url_chunks) + self.results_summary["total_api_calls"] = total_api_calls + + try: + for i, chunk in enumerate(url_chunks): + task = asyncio.create_task(self._make_api_request( + 
chunk=chunk, + batch_idx=i + 1, + total_batches=total_api_calls, + semaphore=semaphore + # No memory tracker passed + )) + tasks.append(task) + + api_results = await asyncio.gather(*tasks) + + # Process aggregated results including server memory + total_successful_requests = sum(1 for r in api_results if r['request_success']) + total_failed_requests = total_api_calls - total_successful_requests + total_successful_urls = sum(r['success_urls'] for r in api_results) + total_failed_urls = sum(r['failed_urls'] for r in api_results) + total_urls_processed = total_successful_urls + total_failed_urls + + # Aggregate server memory metrics + valid_samples = [r for r in api_results if r.get('server_delta_or_max_mb') is not None] # Filter results with valid mem data + self.results_summary["server_memory_metrics"]["samples"] = valid_samples # Store raw samples with both peak and delta/max + + if valid_samples: + delta_or_max_values = [r['server_delta_or_max_mb'] for r in valid_samples] + if self.stream_mode: + # Stream mode: delta_or_max holds max snapshot + self.results_summary["server_memory_metrics"]["stream_mode_avg_max_snapshot_mb"] = sum(delta_or_max_values) / len(delta_or_max_values) + self.results_summary["server_memory_metrics"]["stream_mode_max_max_snapshot_mb"] = max(delta_or_max_values) + else: # Batch mode + # delta_or_max holds delta + self.results_summary["server_memory_metrics"]["batch_mode_avg_delta_mb"] = sum(delta_or_max_values) / len(delta_or_max_values) + self.results_summary["server_memory_metrics"]["batch_mode_max_delta_mb"] = max(delta_or_max_values) + + # Aggregate peak values for batch mode + peak_values = [r['server_peak_memory_mb'] for r in valid_samples if r.get('server_peak_memory_mb') is not None] + if peak_values: + self.results_summary["server_memory_metrics"]["batch_mode_avg_peak_mb"] = sum(peak_values) / len(peak_values) + self.results_summary["server_memory_metrics"]["batch_mode_max_peak_mb"] = max(peak_values) + + + self.results_summary.update({ + "successful_requests": total_successful_requests, + "failed_requests": total_failed_requests, + "successful_urls": total_successful_urls, + "failed_urls": total_failed_urls, + "total_urls_processed": total_urls_processed, + }) + + except Exception as e: + console.print(f"[bold red]An error occurred during task execution: {e}[/bold red]") + import traceback + traceback.print_exc() + # No finally block needed for monitor task + + end_time = time.time() + self.results_summary.update({ + "end_time": time.strftime("%Y-%m-%d %H:%M:%S"), + "total_time_seconds": end_time - start_time, + # No client memory report + }) + self._save_results() + return self.results_summary + + async def _make_api_request( + self, + chunk: List[str], + batch_idx: int, + total_batches: int, + semaphore: asyncio.Semaphore + # No memory tracker + ) -> Dict: + """Makes a single API request for a chunk of URLs, handling concurrency and logging server memory.""" + request_success = False + success_urls = 0 + failed_urls = 0 + status = "Pending" + status_color = "grey" + server_memory_metric = None # Store delta (batch) or max snapshot (stream) + api_call_start_time = time.time() + + async with semaphore: + try: + # No client memory sampling + + endpoint = "/crawl/stream" if self.stream_mode else "/crawl" + payload = { + "urls": chunk, + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": {"cache_mode": "BYPASS", "stream": self.stream_mode} + } + } + + if self.stream_mode: 
+ max_server_mem_snapshot = 0.0 # Track max memory seen in this stream + async with self.http_client.stream("POST", endpoint, json=payload) as response: + initial_status_code = response.status_code + response.raise_for_status() + + completed_marker_received = False + async for line in response.aiter_lines(): + if line: + try: + data = json.loads(line) + if data.get("status") == "completed": + completed_marker_received = True + break + elif data.get("url"): + if data.get("success"): success_urls += 1 + else: failed_urls += 1 + # Extract server memory snapshot per result + mem_snapshot = data.get('server_memory_mb') + if mem_snapshot is not None: + max_server_mem_snapshot = max(max_server_mem_snapshot, float(mem_snapshot)) + except json.JSONDecodeError: + console.print(f"[Batch {batch_idx}] [red]Stream decode error for line:[/red] {line}") + failed_urls = len(chunk) + break + request_success = completed_marker_received + if not request_success: + failed_urls = len(chunk) - success_urls + server_memory_metric = max_server_mem_snapshot # Use max snapshot for stream logging + + else: # Batch mode + response = await self.http_client.post(endpoint, json=payload) + response.raise_for_status() + data = response.json() + + # Extract server memory delta from the response + server_memory_metric = data.get('server_memory_delta_mb') + server_peak_mem_mb = data.get('server_peak_memory_mb') + + if data.get("success") and "results" in data: + request_success = True + results_list = data.get("results", []) + for result_item in results_list: + if result_item.get("success"): success_urls += 1 + else: failed_urls += 1 + if len(results_list) != len(chunk): + console.print(f"[Batch {batch_idx}] [yellow]Warning: Result count ({len(results_list)}) doesn't match URL count ({len(chunk)})[/yellow]") + failed_urls = len(chunk) - success_urls + else: + request_success = False + failed_urls = len(chunk) + # Try to get memory from error detail if available + detail = data.get('detail') + if isinstance(detail, str): + try: detail_json = json.loads(detail) + except: detail_json = {} + elif isinstance(detail, dict): + detail_json = detail + else: detail_json = {} + server_peak_mem_mb = detail_json.get('server_peak_memory_mb', None) + server_memory_metric = detail_json.get('server_memory_delta_mb', None) + console.print(f"[Batch {batch_idx}] [red]API request failed:[/red] {detail_json.get('error', 'No details')}") + + + except httpx.HTTPStatusError as e: + request_success = False + failed_urls = len(chunk) + console.print(f"[Batch {batch_idx}] [bold red]HTTP Error {e.response.status_code}:[/] {e.request.url}") + try: + error_detail = e.response.json() + # Attempt to extract memory info even from error responses + detail_content = error_detail.get('detail', {}) + if isinstance(detail_content, str): # Handle if detail is stringified JSON + try: detail_content = json.loads(detail_content) + except: detail_content = {} + server_memory_metric = detail_content.get('server_memory_delta_mb', None) + server_peak_mem_mb = detail_content.get('server_peak_memory_mb', None) + console.print(f"Response: {error_detail}") + except Exception: + console.print(f"Response Text: {e.response.text[:200]}...") + except httpx.RequestError as e: + request_success = False + failed_urls = len(chunk) + console.print(f"[Batch {batch_idx}] [bold red]Request Error:[/bold] {e.request.url} - {e}") + except Exception as e: + request_success = False + failed_urls = len(chunk) + console.print(f"[Batch {batch_idx}] [bold red]Unexpected Error:[/bold] {e}") + 
import traceback + traceback.print_exc() + + finally: + api_call_time = time.time() - api_call_start_time + total_processed_urls = success_urls + failed_urls + + if request_success and failed_urls == 0: status_color, status = "green", "Success" + elif request_success and success_urls > 0: status_color, status = "yellow", "Partial" + else: status_color, status = "red", "Failed" + + current_total_urls = batch_idx * self.chunk_size + progress_pct = min(100.0, (current_total_urls / self.url_count) * 100) + reqs_per_sec = 1.0 / api_call_time if api_call_time > 0 else float('inf') + + # --- New Memory Formatting --- + mem_display = " N/A " # Default + peak_mem_value = None + delta_or_max_value = None + + if self.stream_mode: + # server_memory_metric holds max snapshot for stream + if server_memory_metric is not None: + mem_display = f"{server_memory_metric:.1f} (Max)" + delta_or_max_value = server_memory_metric # Store for aggregation + else: # Batch mode - expect peak and delta + # We need to get peak and delta from the API response + peak_mem_value = locals().get('server_peak_mem_mb', None) # Get from response data if available + delta_value = server_memory_metric # server_memory_metric holds delta for batch + + if peak_mem_value is not None and delta_value is not None: + mem_display = f"{peak_mem_value:.1f} / {delta_value:+.1f}" + delta_or_max_value = delta_value # Store delta for aggregation + elif peak_mem_value is not None: + mem_display = f"{peak_mem_value:.1f} / N/A" + elif delta_value is not None: + mem_display = f"N/A / {delta_value:+.1f}" + delta_or_max_value = delta_value # Store delta for aggregation + + # --- Updated Print Statement with Adjusted Padding --- + console.print( + f" {batch_idx:<5} | {progress_pct:6.1f}% | {mem_display:>24} | {reqs_per_sec:8.1f} | " # Increased width for memory column + f"{success_urls:^7}/{failed_urls:<6} | {api_call_time:8.2f} | [{status_color}]{status:<7}[/{status_color}] " # Added trailing space + ) + + # --- Updated Return Dictionary --- + return_data = { + "batch_idx": batch_idx, + "request_success": request_success, + "success_urls": success_urls, + "failed_urls": failed_urls, + "time": api_call_time, + # Return both peak (if available) and delta/max + "server_peak_memory_mb": peak_mem_value, # Will be None for stream mode + "server_delta_or_max_mb": delta_or_max_value # Delta for batch, Max for stream + } + # Add back the specific batch mode delta if needed elsewhere, but delta_or_max covers it + # if not self.stream_mode: + # return_data["server_memory_delta_mb"] = delta_value + return return_data + + # No _periodic_memory_sample needed + + def _save_results(self) -> None: + """Saves the results summary to a JSON file.""" + results_path = self.report_path / f"api_test_summary_{self.test_id}.json" + try: + # No client memory path to convert + with open(results_path, 'w', encoding='utf-8') as f: + json.dump(self.results_summary, f, indent=2, default=str) + except Exception as e: + console.print(f"[bold red]Failed to save results summary: {e}[/bold red]") + + +# --- run_full_test Function --- +async def run_full_test(args): + """Runs the full API stress test process.""" + client = httpx.AsyncClient(base_url=args.api_url, timeout=REQUEST_TIMEOUT) + + if not await check_server_health(client): + console.print("[bold red]Aborting test due to server health check failure.[/]") + await client.aclose() + return + await client.aclose() + + test = ApiStressTest( + api_url=args.api_url, + url_count=args.urls, + 
max_concurrent_requests=args.max_concurrent_requests, + chunk_size=args.chunk_size, + report_path=args.report_path, + stream_mode=args.stream, + ) + results = {} + try: + results = await test.run() + finally: + await test.close_client() + + if not results: + console.print("[bold red]Test did not produce results.[/bold red]") + return + + console.print("\n" + "=" * 80) + console.print("[bold green]API Stress Test Completed[/bold green]") + console.print("=" * 80) + + success_rate_reqs = results["successful_requests"] / results["total_api_calls"] * 100 if results["total_api_calls"] > 0 else 0 + success_rate_urls = results["successful_urls"] / results["url_count"] * 100 if results["url_count"] > 0 else 0 + urls_per_second = results["total_urls_processed"] / results["total_time_seconds"] if results["total_time_seconds"] > 0 else 0 + reqs_per_second = results["total_api_calls"] / results["total_time_seconds"] if results["total_time_seconds"] > 0 else 0 + + + console.print(f"[bold cyan]Test ID:[/bold cyan] {results['test_id']}") + console.print(f"[bold cyan]Target API:[/bold cyan] {results['api_url']}") + console.print(f"[bold cyan]Configuration:[/bold cyan] {results['url_count']} URLs, {results['max_concurrent_requests']} concurrent client requests, URLs/Req: {results['chunk_size']}, Stream: {results['stream_mode']}") + console.print(f"[bold cyan]API Requests:[/bold cyan] {results['successful_requests']} successful, {results['failed_requests']} failed ({results['total_api_calls']} total, {success_rate_reqs:.1f}% success)") + console.print(f"[bold cyan]URL Processing:[/bold cyan] {results['successful_urls']} successful, {results['failed_urls']} failed ({results['total_urls_processed']} processed, {success_rate_urls:.1f}% success)") + console.print(f"[bold cyan]Performance:[/bold cyan] {results['total_time_seconds']:.2f}s total | Avg Reqs/sec: {reqs_per_second:.2f} | Avg URLs/sec: {urls_per_second:.2f}") + + # Report Server Memory + mem_metrics = results.get("server_memory_metrics", {}) + mem_samples = mem_metrics.get("samples", []) + if mem_samples: + num_samples = len(mem_samples) + if results['stream_mode']: + avg_mem = mem_metrics.get("stream_mode_avg_max_snapshot_mb") + max_mem = mem_metrics.get("stream_mode_max_max_snapshot_mb") + avg_str = f"{avg_mem:.1f}" if avg_mem is not None else "N/A" + max_str = f"{max_mem:.1f}" if max_mem is not None else "N/A" + console.print(f"[bold cyan]Server Memory (Stream):[/bold cyan] Avg Max Snapshot: {avg_str} MB | Max Max Snapshot: {max_str} MB (across {num_samples} requests)") + else: # Batch mode + avg_delta = mem_metrics.get("batch_mode_avg_delta_mb") + max_delta = mem_metrics.get("batch_mode_max_delta_mb") + avg_peak = mem_metrics.get("batch_mode_avg_peak_mb") + max_peak = mem_metrics.get("batch_mode_max_peak_mb") + + avg_delta_str = f"{avg_delta:.1f}" if avg_delta is not None else "N/A" + max_delta_str = f"{max_delta:.1f}" if max_delta is not None else "N/A" + avg_peak_str = f"{avg_peak:.1f}" if avg_peak is not None else "N/A" + max_peak_str = f"{max_peak:.1f}" if max_peak is not None else "N/A" + + console.print(f"[bold cyan]Server Memory (Batch):[/bold cyan] Avg Peak: {avg_peak_str} MB | Max Peak: {max_peak_str} MB | Avg Delta: {avg_delta_str} MB | Max Delta: {max_delta_str} MB (across {num_samples} requests)") + else: + console.print("[bold cyan]Server Memory:[/bold cyan] No memory data reported by server.") + + + # No client memory report + summary_path = pathlib.Path(args.report_path) / f"api_test_summary_{results['test_id']}.json" + 
console.print(f"[bold green]Results summary saved to {summary_path}[/bold green]") + + if results["failed_requests"] > 0: + console.print(f"\n[bold yellow]Warning: {results['failed_requests']} API requests failed ({100-success_rate_reqs:.1f}% failure rate)[/bold yellow]") + if results["failed_urls"] > 0: + console.print(f"[bold yellow]Warning: {results['failed_urls']} URLs failed to process ({100-success_rate_urls:.1f}% URL failure rate)[/bold yellow]") + if results["total_urls_processed"] < results["url_count"]: + console.print(f"\n[bold red]Error: Only {results['total_urls_processed']} out of {results['url_count']} target URLs were processed![/bold red]") + + +# --- main Function (Argument parsing mostly unchanged) --- +def main(): + """Main entry point for the script.""" + parser = argparse.ArgumentParser(description="Crawl4AI API Server Stress Test") + + parser.add_argument("--api-url", type=str, default=DEFAULT_API_URL, help=f"Base URL of the Crawl4AI API server (default: {DEFAULT_API_URL})") + parser.add_argument("--urls", type=int, default=DEFAULT_URL_COUNT, help=f"Total number of unique URLs to process via API calls (default: {DEFAULT_URL_COUNT})") + parser.add_argument("--max-concurrent-requests", type=int, default=DEFAULT_MAX_CONCURRENT_REQUESTS, help=f"Maximum concurrent API requests from this client (default: {DEFAULT_MAX_CONCURRENT_REQUESTS})") + parser.add_argument("--chunk-size", type=int, default=DEFAULT_CHUNK_SIZE, help=f"Number of URLs per API request payload (default: {DEFAULT_CHUNK_SIZE})") + parser.add_argument("--stream", action="store_true", default=DEFAULT_STREAM_MODE, help=f"Use the /crawl/stream endpoint instead of /crawl (default: {DEFAULT_STREAM_MODE})") + parser.add_argument("--report-path", type=str, default=DEFAULT_REPORT_PATH, help=f"Path to save reports and logs (default: {DEFAULT_REPORT_PATH})") + parser.add_argument("--clean-reports", action="store_true", help="Clean up report directory before running") + + args = parser.parse_args() + + console.print("[bold underline]Crawl4AI API Stress Test Configuration[/bold underline]") + console.print(f"API URL: {args.api_url}") + console.print(f"Total URLs: {args.urls}, Concurrent Client Requests: {args.max_concurrent_requests}, URLs per Request: {args.chunk_size}") + console.print(f"Mode: {'Streaming' if args.stream else 'Batch'}") + console.print(f"Report Path: {args.report_path}") + console.print("-" * 40) + if args.clean_reports: console.print("[cyan]Option: Clean reports before test[/cyan]") + console.print("-" * 40) + + if args.clean_reports: + report_dir = pathlib.Path(args.report_path) + if report_dir.exists(): + console.print(f"[yellow]Cleaning up reports directory: {args.report_path}[/yellow]") + shutil.rmtree(args.report_path) + report_dir.mkdir(parents=True, exist_ok=True) + + try: + asyncio.run(run_full_test(args)) + except KeyboardInterrupt: + console.print("\n[bold yellow]Test interrupted by user.[/bold yellow]") + except Exception as e: + console.print(f"\n[bold red]An unexpected error occurred:[/bold red] {e}") + import traceback + traceback.print_exc() + +if __name__ == "__main__": + # No need to modify sys.path for SimpleMemoryTracker as it's removed + main() \ No newline at end of file diff --git a/tests/memory/test_stress_docker_api.py b/tests/memory/test_stress_docker_api.py new file mode 100644 index 00000000..05b3bea8 --- /dev/null +++ b/tests/memory/test_stress_docker_api.py @@ -0,0 +1,129 @@ +""" +Crawl4AI Docker API stress tester. 
+ +Examples +-------- +python test_stress_docker_api.py --urls 1000 --concurrency 32 +python test_stress_docker_api.py --urls 1000 --concurrency 32 --stream +python test_stress_docker_api.py --base-url http://10.0.0.42:11235 --http2 +""" + +import argparse, asyncio, json, secrets, statistics, time +from typing import List, Tuple +import httpx +from rich.console import Console +from rich.progress import Progress, BarColumn, TimeElapsedColumn, TimeRemainingColumn +from rich.table import Table + +console = Console() + + +# ───────────────────────── helpers ───────────────────────── +def make_fake_urls(n: int) -> List[str]: + base = "https://httpbin.org/anything/" + return [f"{base}{secrets.token_hex(8)}" for _ in range(n)] + + +async def fire( + client: httpx.AsyncClient, endpoint: str, payload: dict, sem: asyncio.Semaphore +) -> Tuple[bool, float]: + async with sem: + print(f"POST {endpoint} with {len(payload['urls'])} URLs") + t0 = time.perf_counter() + try: + if endpoint.endswith("/stream"): + async with client.stream("POST", endpoint, json=payload) as r: + r.raise_for_status() + async for _ in r.aiter_lines(): + pass + else: + r = await client.post(endpoint, json=payload) + r.raise_for_status() + return True, time.perf_counter() - t0 + except Exception: + return False, time.perf_counter() - t0 + + +def pct(lat: List[float], p: float) -> str: + """Return percentile string even for tiny samples.""" + if not lat: + return "-" + if len(lat) == 1: + return f"{lat[0]:.2f}s" + lat_sorted = sorted(lat) + k = (p / 100) * (len(lat_sorted) - 1) + lo = int(k) + hi = min(lo + 1, len(lat_sorted) - 1) + frac = k - lo + val = lat_sorted[lo] * (1 - frac) + lat_sorted[hi] * frac + return f"{val:.2f}s" + + +# ───────────────────────── main ───────────────────────── +def parse_args() -> argparse.Namespace: + p = argparse.ArgumentParser(description="Stress test Crawl4AI Docker API") + p.add_argument("--urls", type=int, default=100, help="number of URLs") + p.add_argument("--concurrency", type=int, default=1, help="max POSTs in flight") + p.add_argument("--chunk-size", type=int, default=50, help="URLs per request") + p.add_argument("--base-url", default="http://localhost:11235", help="API root") + # p.add_argument("--base-url", default="http://localhost:8020", help="API root") + p.add_argument("--stream", action="store_true", help="use /crawl/stream") + p.add_argument("--http2", action="store_true", help="enable HTTP/2") + p.add_argument("--headless", action="store_true", default=True) + return p.parse_args() + + +async def main() -> None: + args = parse_args() + + urls = make_fake_urls(args.urls) + batches = [urls[i : i + args.chunk_size] for i in range(0, len(urls), args.chunk_size)] + endpoint = "/crawl/stream" if args.stream else "/crawl" + sem = asyncio.Semaphore(args.concurrency) + + async with httpx.AsyncClient(base_url=args.base_url, http2=args.http2, timeout=None) as client: + with Progress( + "[progress.description]{task.description}", + BarColumn(), + "[progress.percentage]{task.percentage:>3.0f}%", + TimeElapsedColumn(), + TimeRemainingColumn(), + ) as progress: + task_id = progress.add_task("[cyan]bombarding…", total=len(batches)) + tasks = [] + for chunk in batches: + payload = { + "urls": chunk, + "browser_config": {"type": "BrowserConfig", "params": {"headless": args.headless}}, + "crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "BYPASS", "stream": args.stream}}, + } + tasks.append(asyncio.create_task(fire(client, endpoint, payload, sem))) + progress.advance(task_id) + + 
results = await asyncio.gather(*tasks) + + ok_latencies = [dt for ok, dt in results if ok] + err_count = sum(1 for ok, _ in results if not ok) + + table = Table(title="Docker API Stress‑Test Summary") + table.add_column("total", justify="right") + table.add_column("errors", justify="right") + table.add_column("p50", justify="right") + table.add_column("p95", justify="right") + table.add_column("max", justify="right") + + table.add_row( + str(len(results)), + str(err_count), + pct(ok_latencies, 50), + pct(ok_latencies, 95), + f"{max(ok_latencies):.2f}s" if ok_latencies else "-", + ) + console.print(table) + + +if __name__ == "__main__": + try: + asyncio.run(main()) + except KeyboardInterrupt: + console.print("\n[yellow]aborted by user[/]") diff --git a/tests/memory/test_stress_sdk.py b/tests/memory/test_stress_sdk.py index 8000690c..14da94a4 100644 --- a/tests/memory/test_stress_sdk.py +++ b/tests/memory/test_stress_sdk.py @@ -37,8 +37,8 @@ from crawl4ai import ( DEFAULT_SITE_PATH = "test_site" DEFAULT_PORT = 8000 DEFAULT_MAX_SESSIONS = 16 -DEFAULT_URL_COUNT = 100 -DEFAULT_CHUNK_SIZE = 10 # Define chunk size for batch logging +DEFAULT_URL_COUNT = 1 +DEFAULT_CHUNK_SIZE = 1 # Define chunk size for batch logging DEFAULT_REPORT_PATH = "reports" DEFAULT_STREAM_MODE = False DEFAULT_MONITOR_MODE = "DETAILED" From c2902fd200fa5ad354da33d8528a12844b3c75be Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Sat, 19 Apr 2025 19:46:20 +0530 Subject: [PATCH 64/78] reverse:last change in order of execution for it introduced a new issue in content generated. https://github.com/unclecode/crawl4ai/issues/902 --- crawl4ai/content_scraping_strategy.py | 58 ++++++++++++++------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index aa69c5fb..814e4b2b 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -901,7 +901,22 @@ class WebScrapingStrategy(ContentScrapingStrategy): element.extract() else: for element in body.select(excluded_selector): - element.extract() + element.extract() + + content_element = None + if target_elements: + try: + for_content_targeted_element = [] + for target_element in target_elements: + for_content_targeted_element.extend(body.select(target_element)) + content_element = soup.new_tag("div") + for el in for_content_targeted_element: + content_element.append(el) + except Exception as e: + self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") + return None + else: + content_element = body kwargs["exclude_social_media_domains"] = set( kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS @@ -961,20 +976,6 @@ class WebScrapingStrategy(ContentScrapingStrategy): str_body = "" try: - content_element = None - if target_elements: - try: - for_content_targeted_element = [] - for target_element in target_elements: - for_content_targeted_element.extend(body.select(target_element)) - content_element = soup.new_tag("div") - for el in for_content_targeted_element: - content_element.append(el) - except Exception as e: - self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") - return None - else: - content_element = body str_body = content_element.encode_contents().decode("utf-8") except Exception: # Reset body to the original HTML @@ -1531,6 +1532,20 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): self._log("error", f"Error extracting metadata: {str(e)}", "SCRAPE") meta = {} + 
content_element = None + if target_elements: + try: + for_content_targeted_element = [] + for target_element in target_elements: + for_content_targeted_element.extend(body.cssselect(target_element)) + content_element = lhtml.Element("div") + content_element.extend(for_content_targeted_element) + except Exception as e: + self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") + return None + else: + content_element = body + # Remove script and style tags for tag in ["script", "style", "link", "meta", "noscript"]: for element in body.xpath(f".//{tag}"): @@ -1599,19 +1614,6 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): ) # Generate output HTML - content_element = None - if target_elements: - try: - for_content_targeted_element = [] - for target_element in target_elements: - for_content_targeted_element.extend(body.cssselect(target_element)) - content_element = lhtml.Element("div") - content_element.extend(for_content_targeted_element) - except Exception as e: - self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") - return None - else: - content_element = body cleaned_html = lhtml.tostring( # body, content_element, From d2648eaa39d4232b3de6a27a1170b5fef8ecc389 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Sat, 19 Apr 2025 20:08:36 +0530 Subject: [PATCH 65/78] fix: solved with deepcopy of elements https://github.com/unclecode/crawl4ai/issues/902 --- crawl4ai/content_scraping_strategy.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 814e4b2b..1dfbce84 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -28,6 +28,7 @@ from lxml import etree from lxml import html as lhtml from typing import List from .models import ScrapingResult, MediaItem, Link, Media, Links +import copy # Pre-compile regular expressions for Open Graph and Twitter metadata OG_REGEX = re.compile(r"^og:") @@ -911,7 +912,7 @@ class WebScrapingStrategy(ContentScrapingStrategy): for_content_targeted_element.extend(body.select(target_element)) content_element = soup.new_tag("div") for el in for_content_targeted_element: - content_element.append(el) + content_element.append(copy.deepcopy(el)) except Exception as e: self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") return None @@ -1539,7 +1540,7 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): for target_element in target_elements: for_content_targeted_element.extend(body.cssselect(target_element)) content_element = lhtml.Element("div") - content_element.extend(for_content_targeted_element) + content_element.extend(copy.deepcopy(for_content_targeted_element)) except Exception as e: self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") return None From a58c8000aab067d51db15a871a0c3fe377e73788 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 20 Apr 2025 20:14:26 +0800 Subject: [PATCH 66/78] refactor(server): migrate to pool-based crawler management Replace crawler_manager.py with simpler crawler_pool.py implementation: - Add global page semaphore for hard concurrency cap - Implement browser pool with idle cleanup - Add playground UI for testing and stress testing - Update API handlers to use pooled crawlers - Enhance logging levels and symbols BREAKING CHANGE: Removes CrawlerManager class in favor of simpler pool-based approach --- Dockerfile | 3 + crawl4ai/async_logger.py | 36 + crawl4ai/browser_manager.py | 3 
+ deploy/docker/api copy.py | 503 ------------- deploy/docker/api.py | 59 +- deploy/docker/config.yml | 56 +- deploy/docker/crawler_manager.py | 556 -------------- deploy/docker/crawler_pool.py | 60 ++ deploy/docker/server.py | 509 +++++-------- deploy/docker/static/playground/index.html | 813 +++++++++++++++++++++ tests/memory/cap_test.py | 34 + tests/memory/test_docker_congif_gen.py | 35 + tests/memory/test_stress_api.py | 12 +- tests/memory/test_stress_api_xs.py | 203 +++++ 14 files changed, 1447 insertions(+), 1435 deletions(-) delete mode 100644 deploy/docker/api copy.py delete mode 100644 deploy/docker/crawler_manager.py create mode 100644 deploy/docker/crawler_pool.py create mode 100644 deploy/docker/static/playground/index.html create mode 100644 tests/memory/cap_test.py create mode 100644 tests/memory/test_docker_congif_gen.py create mode 100644 tests/memory/test_stress_api_xs.py diff --git a/Dockerfile b/Dockerfile index a4ab56df..d32639a5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -162,6 +162,9 @@ RUN crawl4ai-doctor # Copy application code COPY deploy/docker/* ${APP_HOME}/ +# copy the playground + any future static assets +COPY deploy/docker/static ${APP_HOME}/static + # Change ownership of the application directory to the non-root user RUN chown -R appuser:appuser ${APP_HOME} diff --git a/crawl4ai/async_logger.py b/crawl4ai/async_logger.py index 273ef53b..541f755a 100644 --- a/crawl4ai/async_logger.py +++ b/crawl4ai/async_logger.py @@ -7,11 +7,18 @@ from datetime import datetime class LogLevel(Enum): + DEFAULT = 0 DEBUG = 1 INFO = 2 SUCCESS = 3 WARNING = 4 ERROR = 5 + CRITICAL = 6 + ALERT = 7 + NOTICE = 8 + EXCEPTION = 9 + FATAL = 10 + @@ -61,6 +68,13 @@ class AsyncLogger(AsyncLoggerBase): "DEBUG": "⋯", "INFO": "ℹ", "WARNING": "⚠", + "SUCCESS": "✔", + "CRITICAL": "‼", + "ALERT": "⚡", + "NOTICE": "ℹ", + "EXCEPTION": "❗", + "FATAL": "☠", + "DEFAULT": "•", } DEFAULT_COLORS = { @@ -69,6 +83,12 @@ class AsyncLogger(AsyncLoggerBase): LogLevel.SUCCESS: Fore.GREEN, LogLevel.WARNING: Fore.YELLOW, LogLevel.ERROR: Fore.RED, + LogLevel.CRITICAL: Fore.RED + Style.BRIGHT, + LogLevel.ALERT: Fore.RED + Style.BRIGHT, + LogLevel.NOTICE: Fore.BLUE, + LogLevel.EXCEPTION: Fore.RED + Style.BRIGHT, + LogLevel.FATAL: Fore.RED + Style.BRIGHT, + LogLevel.DEFAULT: Fore.WHITE, } def __init__( @@ -212,6 +232,22 @@ class AsyncLogger(AsyncLoggerBase): def warning(self, message: str, tag: str = "WARNING", **kwargs): """Log a warning message.""" self._log(LogLevel.WARNING, message, tag, **kwargs) + + def critical(self, message: str, tag: str = "CRITICAL", **kwargs): + """Log a critical message.""" + self._log(LogLevel.ERROR, message, tag, **kwargs) + def exception(self, message: str, tag: str = "EXCEPTION", **kwargs): + """Log an exception message.""" + self._log(LogLevel.ERROR, message, tag, **kwargs) + def fatal(self, message: str, tag: str = "FATAL", **kwargs): + """Log a fatal message.""" + self._log(LogLevel.ERROR, message, tag, **kwargs) + def alert(self, message: str, tag: str = "ALERT", **kwargs): + """Log an alert message.""" + self._log(LogLevel.ERROR, message, tag, **kwargs) + def notice(self, message: str, tag: str = "NOTICE", **kwargs): + """Log a notice message.""" + self._log(LogLevel.INFO, message, tag, **kwargs) def error(self, message: str, tag: str = "ERROR", **kwargs): """Log an error message.""" diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index a338d71d..642fd6c2 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -572,6 +572,9 @@ 
class BrowserManager: if self.config.extra_args: args.extend(self.config.extra_args) + # Deduplicate args + args = list(dict.fromkeys(args)) + browser_args = {"headless": self.config.headless, "args": args} if self.config.chrome_channel: diff --git a/deploy/docker/api copy.py b/deploy/docker/api copy.py deleted file mode 100644 index 341e23e1..00000000 --- a/deploy/docker/api copy.py +++ /dev/null @@ -1,503 +0,0 @@ -import os -import json -import asyncio -from typing import List, Tuple -from functools import partial - -import logging -from typing import Optional, AsyncGenerator -from urllib.parse import unquote -from fastapi import HTTPException, Request, status -from fastapi.background import BackgroundTasks -from fastapi.responses import JSONResponse -from redis import asyncio as aioredis - -from crawl4ai import ( - AsyncWebCrawler, - CrawlerRunConfig, - LLMExtractionStrategy, - CacheMode, - BrowserConfig, - MemoryAdaptiveDispatcher, - RateLimiter, - LLMConfig -) -from crawl4ai.utils import perform_completion_with_backoff -from crawl4ai.content_filter_strategy import ( - PruningContentFilter, - BM25ContentFilter, - LLMContentFilter -) -from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator -from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy - -from utils import ( - TaskStatus, - FilterType, - get_base_url, - is_task_id, - should_cleanup_task, - decode_redis_hash -) - -import psutil, time - -logger = logging.getLogger(__name__) - -# --- Helper to get memory --- -def _get_memory_mb(): - try: - return psutil.Process().memory_info().rss / (1024 * 1024) - except Exception as e: - logger.warning(f"Could not get memory info: {e}") - return None - - -async def handle_llm_qa( - url: str, - query: str, - config: dict -) -> str: - """Process QA using LLM with crawled content as context.""" - try: - # Extract base URL by finding last '?q=' occurrence - last_q_index = url.rfind('?q=') - if last_q_index != -1: - url = url[:last_q_index] - - # Get markdown content - async with AsyncWebCrawler() as crawler: - result = await crawler.arun(url) - if not result.success: - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=result.error_message - ) - content = result.markdown.fit_markdown - - # Create prompt and get LLM response - prompt = f"""Use the following content as context to answer the question. 
- Content: - {content} - - Question: {query} - - Answer:""" - - response = perform_completion_with_backoff( - provider=config["llm"]["provider"], - prompt_with_variables=prompt, - api_token=os.environ.get(config["llm"].get("api_key_env", "")) - ) - - return response.choices[0].message.content - except Exception as e: - logger.error(f"QA processing error: {str(e)}", exc_info=True) - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=str(e) - ) - -async def process_llm_extraction( - redis: aioredis.Redis, - config: dict, - task_id: str, - url: str, - instruction: str, - schema: Optional[str] = None, - cache: str = "0" -) -> None: - """Process LLM extraction in background.""" - try: - # If config['llm'] has api_key then ignore the api_key_env - api_key = "" - if "api_key" in config["llm"]: - api_key = config["llm"]["api_key"] - else: - api_key = os.environ.get(config["llm"].get("api_key_env", None), "") - llm_strategy = LLMExtractionStrategy( - llm_config=LLMConfig( - provider=config["llm"]["provider"], - api_token=api_key - ), - instruction=instruction, - schema=json.loads(schema) if schema else None, - ) - - cache_mode = CacheMode.ENABLED if cache == "1" else CacheMode.WRITE_ONLY - - async with AsyncWebCrawler() as crawler: - result = await crawler.arun( - url=url, - config=CrawlerRunConfig( - extraction_strategy=llm_strategy, - scraping_strategy=LXMLWebScrapingStrategy(), - cache_mode=cache_mode - ) - ) - - if not result.success: - await redis.hset(f"task:{task_id}", mapping={ - "status": TaskStatus.FAILED, - "error": result.error_message - }) - return - - try: - content = json.loads(result.extracted_content) - except json.JSONDecodeError: - content = result.extracted_content - await redis.hset(f"task:{task_id}", mapping={ - "status": TaskStatus.COMPLETED, - "result": json.dumps(content) - }) - - except Exception as e: - logger.error(f"LLM extraction error: {str(e)}", exc_info=True) - await redis.hset(f"task:{task_id}", mapping={ - "status": TaskStatus.FAILED, - "error": str(e) - }) - -async def handle_markdown_request( - url: str, - filter_type: FilterType, - query: Optional[str] = None, - cache: str = "0", - config: Optional[dict] = None -) -> str: - """Handle markdown generation requests.""" - try: - decoded_url = unquote(url) - if not decoded_url.startswith(('http://', 'https://')): - decoded_url = 'https://' + decoded_url - - if filter_type == FilterType.RAW: - md_generator = DefaultMarkdownGenerator() - else: - content_filter = { - FilterType.FIT: PruningContentFilter(), - FilterType.BM25: BM25ContentFilter(user_query=query or ""), - FilterType.LLM: LLMContentFilter( - llm_config=LLMConfig( - provider=config["llm"]["provider"], - api_token=os.environ.get(config["llm"].get("api_key_env", None), ""), - ), - instruction=query or "Extract main content" - ) - }[filter_type] - md_generator = DefaultMarkdownGenerator(content_filter=content_filter) - - cache_mode = CacheMode.ENABLED if cache == "1" else CacheMode.WRITE_ONLY - - async with AsyncWebCrawler() as crawler: - result = await crawler.arun( - url=decoded_url, - config=CrawlerRunConfig( - markdown_generator=md_generator, - scraping_strategy=LXMLWebScrapingStrategy(), - cache_mode=cache_mode - ) - ) - - if not result.success: - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=result.error_message - ) - - return (result.markdown.raw_markdown - if filter_type == FilterType.RAW - else result.markdown.fit_markdown) - - except Exception as e: - logger.error(f"Markdown error: 
{str(e)}", exc_info=True) - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=str(e) - ) - -async def handle_llm_request( - redis: aioredis.Redis, - background_tasks: BackgroundTasks, - request: Request, - input_path: str, - query: Optional[str] = None, - schema: Optional[str] = None, - cache: str = "0", - config: Optional[dict] = None -) -> JSONResponse: - """Handle LLM extraction requests.""" - base_url = get_base_url(request) - - try: - if is_task_id(input_path): - return await handle_task_status( - redis, input_path, base_url - ) - - if not query: - return JSONResponse({ - "message": "Please provide an instruction", - "_links": { - "example": { - "href": f"{base_url}/llm/{input_path}?q=Extract+main+content", - "title": "Try this example" - } - } - }) - - return await create_new_task( - redis, - background_tasks, - input_path, - query, - schema, - cache, - base_url, - config - ) - - except Exception as e: - logger.error(f"LLM endpoint error: {str(e)}", exc_info=True) - return JSONResponse({ - "error": str(e), - "_links": { - "retry": {"href": str(request.url)} - } - }, status_code=status.HTTP_500_INTERNAL_SERVER_ERROR) - -async def handle_task_status( - redis: aioredis.Redis, - task_id: str, - base_url: str -) -> JSONResponse: - """Handle task status check requests.""" - task = await redis.hgetall(f"task:{task_id}") - if not task: - raise HTTPException( - status_code=status.HTTP_404_NOT_FOUND, - detail="Task not found" - ) - - task = decode_redis_hash(task) - response = create_task_response(task, task_id, base_url) - - if task["status"] in [TaskStatus.COMPLETED, TaskStatus.FAILED]: - if should_cleanup_task(task["created_at"]): - await redis.delete(f"task:{task_id}") - - return JSONResponse(response) - -async def create_new_task( - redis: aioredis.Redis, - background_tasks: BackgroundTasks, - input_path: str, - query: str, - schema: Optional[str], - cache: str, - base_url: str, - config: dict -) -> JSONResponse: - """Create and initialize a new task.""" - decoded_url = unquote(input_path) - if not decoded_url.startswith(('http://', 'https://')): - decoded_url = 'https://' + decoded_url - - from datetime import datetime - task_id = f"llm_{int(datetime.now().timestamp())}_{id(background_tasks)}" - - await redis.hset(f"task:{task_id}", mapping={ - "status": TaskStatus.PROCESSING, - "created_at": datetime.now().isoformat(), - "url": decoded_url - }) - - background_tasks.add_task( - process_llm_extraction, - redis, - config, - task_id, - decoded_url, - query, - schema, - cache - ) - - return JSONResponse({ - "task_id": task_id, - "status": TaskStatus.PROCESSING, - "url": decoded_url, - "_links": { - "self": {"href": f"{base_url}/llm/{task_id}"}, - "status": {"href": f"{base_url}/llm/{task_id}"} - } - }) - -def create_task_response(task: dict, task_id: str, base_url: str) -> dict: - """Create response for task status check.""" - response = { - "task_id": task_id, - "status": task["status"], - "created_at": task["created_at"], - "url": task["url"], - "_links": { - "self": {"href": f"{base_url}/llm/{task_id}"}, - "refresh": {"href": f"{base_url}/llm/{task_id}"} - } - } - - if task["status"] == TaskStatus.COMPLETED: - response["result"] = json.loads(task["result"]) - elif task["status"] == TaskStatus.FAILED: - response["error"] = task["error"] - - return response - -async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator) -> AsyncGenerator[bytes, None]: - """Stream results with heartbeats and completion markers.""" - import json - from utils 
import datetime_handler - - try: - async for result in results_gen: - try: - server_memory_mb = _get_memory_mb() - result_dict = result.model_dump() - result_dict['server_memory_mb'] = server_memory_mb - logger.info(f"Streaming result for {result_dict.get('url', 'unknown')}") - data = json.dumps(result_dict, default=datetime_handler) + "\n" - yield data.encode('utf-8') - except Exception as e: - logger.error(f"Serialization error: {e}") - error_response = {"error": str(e), "url": getattr(result, 'url', 'unknown')} - yield (json.dumps(error_response) + "\n").encode('utf-8') - - yield json.dumps({"status": "completed"}).encode('utf-8') - - except asyncio.CancelledError: - logger.warning("Client disconnected during streaming") - finally: - try: - await crawler.close() - except Exception as e: - logger.error(f"Crawler cleanup error: {e}") - -async def handle_crawl_request( - urls: List[str], - browser_config: dict, - crawler_config: dict, - config: dict -) -> dict: - """Handle non-streaming crawl requests.""" - start_mem_mb = _get_memory_mb() # <--- Get memory before - start_time = time.time() - mem_delta_mb = None - peak_mem_mb = start_mem_mb - - try: - browser_config = BrowserConfig.load(browser_config) - crawler_config = CrawlerRunConfig.load(crawler_config) - - dispatcher = MemoryAdaptiveDispatcher( - memory_threshold_percent=config["crawler"]["memory_threshold_percent"], - rate_limiter=RateLimiter( - base_delay=tuple(config["crawler"]["rate_limiter"]["base_delay"]) - ) - ) - - crawler: AsyncWebCrawler = AsyncWebCrawler(config=browser_config) - await crawler.start() - results = [] - func = getattr(crawler, "arun" if len(urls) == 1 else "arun_many") - partial_func = partial(func, - urls[0] if len(urls) == 1 else urls, - config=crawler_config, - dispatcher=dispatcher) - results = await partial_func() - await crawler.close() - - end_mem_mb = _get_memory_mb() # <--- Get memory after - end_time = time.time() - - if start_mem_mb is not None and end_mem_mb is not None: - mem_delta_mb = end_mem_mb - start_mem_mb # <--- Calculate delta - peak_mem_mb = max(peak_mem_mb if peak_mem_mb else 0, end_mem_mb) # <--- Get peak memory - logger.info(f"Memory usage: Start: {start_mem_mb} MB, End: {end_mem_mb} MB, Delta: {mem_delta_mb} MB, Peak: {peak_mem_mb} MB") - - return { - "success": True, - "results": [result.model_dump() for result in results], - "server_processing_time_s": end_time - start_time, - "server_memory_delta_mb": mem_delta_mb, - "server_peak_memory_mb": peak_mem_mb - } - - except Exception as e: - logger.error(f"Crawl error: {str(e)}", exc_info=True) - if 'crawler' in locals() and crawler.ready: # Check if crawler was initialized and started - try: - await crawler.close() - except Exception as close_e: - logger.error(f"Error closing crawler during exception handling: {close_e}") - - # Measure memory even on error if possible - end_mem_mb_error = _get_memory_mb() - if start_mem_mb is not None and end_mem_mb_error is not None: - mem_delta_mb = end_mem_mb_error - start_mem_mb - - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=json.dumps({ # Send structured error - "error": str(e), - "server_memory_delta_mb": mem_delta_mb, - "server_peak_memory_mb": max(peak_mem_mb if peak_mem_mb else 0, end_mem_mb_error or 0) - }) - ) - -async def handle_stream_crawl_request( - urls: List[str], - browser_config: dict, - crawler_config: dict, - config: dict -) -> Tuple[AsyncWebCrawler, AsyncGenerator]: - """Handle streaming crawl requests.""" - try: - browser_config = 
BrowserConfig.load(browser_config) - # browser_config.verbose = True # Set to False or remove for production stress testing - browser_config.verbose = False - crawler_config = CrawlerRunConfig.load(crawler_config) - crawler_config.scraping_strategy = LXMLWebScrapingStrategy() - crawler_config.stream = True - - dispatcher = MemoryAdaptiveDispatcher( - memory_threshold_percent=config["crawler"]["memory_threshold_percent"], - rate_limiter=RateLimiter( - base_delay=tuple(config["crawler"]["rate_limiter"]["base_delay"]) - ) - ) - - crawler = AsyncWebCrawler(config=browser_config) - await crawler.start() - - results_gen = await crawler.arun_many( - urls=urls, - config=crawler_config, - dispatcher=dispatcher - ) - - return crawler, results_gen - - except Exception as e: - # Make sure to close crawler if started during an error here - if 'crawler' in locals() and crawler.ready: - try: - await crawler.close() - except Exception as close_e: - logger.error(f"Error closing crawler during stream setup exception: {close_e}") - logger.error(f"Stream crawl error: {str(e)}", exc_info=True) - # Raising HTTPException here will prevent streaming response - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=str(e) - ) \ No newline at end of file diff --git a/deploy/docker/api.py b/deploy/docker/api.py index b226682f..130b57d0 100644 --- a/deploy/docker/api.py +++ b/deploy/docker/api.py @@ -377,14 +377,14 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator) except asyncio.CancelledError: logger.warning("Client disconnected during streaming") - # finally: - # try: - # await crawler.close() - # except Exception as e: - # logger.error(f"Crawler cleanup error: {e}") + finally: + # try: + # await crawler.close() + # except Exception as e: + # logger.error(f"Crawler cleanup error: {e}") + pass async def handle_crawl_request( - crawler: AsyncWebCrawler, urls: List[str], browser_config: dict, crawler_config: dict, @@ -404,24 +404,29 @@ async def handle_crawl_request( memory_threshold_percent=config["crawler"]["memory_threshold_percent"], rate_limiter=RateLimiter( base_delay=tuple(config["crawler"]["rate_limiter"]["base_delay"]) - ) + ) if config["crawler"]["rate_limiter"]["enabled"] else None ) + + from crawler_pool import get_crawler + crawler = await get_crawler(browser_config) # crawler: AsyncWebCrawler = AsyncWebCrawler(config=browser_config) # await crawler.start() + + base_config = config["crawler"]["base_config"] + # Iterate on key-value pairs in global_config then use haseattr to set them + for key, value in base_config.items(): + if hasattr(crawler_config, key): + setattr(crawler_config, key, value) + results = [] func = getattr(crawler, "arun" if len(urls) == 1 else "arun_many") partial_func = partial(func, urls[0] if len(urls) == 1 else urls, config=crawler_config, dispatcher=dispatcher) - - # Simulate work being done by the crawler - # logger.debug(f"Request (URLs: {len(urls)}) starting simulated work...") # Add log - # await asyncio.sleep(2) # <--- ADD ARTIFICIAL DELAY (e.g., 0.5 seconds) - # logger.debug(f"Request (URLs: {len(urls)}) finished simulated work.") - results = await partial_func() + # await crawler.close() end_mem_mb = _get_memory_mb() # <--- Get memory after @@ -442,11 +447,12 @@ async def handle_crawl_request( except Exception as e: logger.error(f"Crawl error: {str(e)}", exc_info=True) - # if 'crawler' in locals() and crawler.ready: # Check if crawler was initialized and started - # try: - # await crawler.close() - # except 
Exception as close_e: - # logger.error(f"Error closing crawler during exception handling: {close_e}") + if 'crawler' in locals() and crawler.ready: # Check if crawler was initialized and started + # try: + # await crawler.close() + # except Exception as close_e: + # logger.error(f"Error closing crawler during exception handling: {close_e}") + logger.error(f"Error closing crawler during exception handling: {close_e}") # Measure memory even on error if possible end_mem_mb_error = _get_memory_mb() @@ -463,7 +469,6 @@ async def handle_crawl_request( ) async def handle_stream_crawl_request( - crawler: AsyncWebCrawler, urls: List[str], browser_config: dict, crawler_config: dict, @@ -485,6 +490,9 @@ async def handle_stream_crawl_request( ) ) + from crawler_pool import get_crawler + crawler = await get_crawler(browser_config) + # crawler = AsyncWebCrawler(config=browser_config) # await crawler.start() @@ -494,17 +502,16 @@ async def handle_stream_crawl_request( dispatcher=dispatcher ) - # Return the *same* crawler instance and the generator - # The caller (server.py) manages the crawler lifecycle via the pool context return crawler, results_gen except Exception as e: # Make sure to close crawler if started during an error here - # if 'crawler' in locals() and crawler.ready: - # try: - # await crawler.close() - # except Exception as close_e: - # logger.error(f"Error closing crawler during stream setup exception: {close_e}") + if 'crawler' in locals() and crawler.ready: + # try: + # await crawler.close() + # except Exception as close_e: + # logger.error(f"Error closing crawler during stream setup exception: {close_e}") + logger.error(f"Error closing crawler during stream setup exception: {close_e}") logger.error(f"Stream crawl error: {str(e)}", exc_info=True) # Raising HTTPException here will prevent streaming response raise HTTPException( diff --git a/deploy/docker/config.yml b/deploy/docker/config.yml index 17848e99..e93343c1 100644 --- a/deploy/docker/config.yml +++ b/deploy/docker/config.yml @@ -5,6 +5,7 @@ app: host: "0.0.0.0" port: 8020 reload: False + workers: 4 timeout_keep_alive: 300 # Default LLM Configuration @@ -48,53 +49,38 @@ security: content_security_policy: "default-src 'self'" strict_transport_security: "max-age=63072000; includeSubDomains" -# Crawler Pool Configuration -crawler_pool: - enabled: true # Set to false to disable the pool - - # --- Option 1: Auto-calculate size --- - auto_calculate_size: true - calculation_params: - mem_headroom_mb: 512 # Memory reserved for OS/other apps - avg_page_mem_mb: 150 # Estimated MB per concurrent "tab"/page in browsers - fd_per_page: 20 # Estimated file descriptors per page - core_multiplier: 4 # Max crawlers per CPU core - min_pool_size: 2 # Minimum number of primary crawlers - max_pool_size: 16 # Maximum number of primary crawlers - - # --- Option 2: Manual size (ignored if auto_calculate_size is true) --- - # pool_size: 8 - - # --- Other Pool Settings --- - backup_pool_size: 1 # Number of backup crawlers - max_wait_time_s: 30.0 # Max seconds a request waits for a free crawler - throttle_threshold_percent: 70.0 # Start throttling delay above this % usage - throttle_delay_min_s: 0.1 # Min throttle delay - throttle_delay_max_s: 0.5 # Max throttle delay - - # --- Browser Config for Pooled Crawlers --- - browser_config: - # No need for "type": "BrowserConfig" here, just params - headless: true - verbose: false # Keep pool crawlers less verbose in production - # user_agent: "MyPooledCrawler/1.0" # Example - # Add other BrowserConfig params as 
needed (e.g., proxy, viewport) - # Crawler Configuration crawler: + base_config: + simulate_user: true memory_threshold_percent: 95.0 rate_limiter: + enabled: true base_delay: [1.0, 2.0] timeouts: stream_init: 30.0 # Timeout for stream initialization batch_process: 300.0 # Timeout for batch processing + pool: + max_pages: 40 # ← GLOBAL_SEM permits + idle_ttl_sec: 1800 # ← 30 min janitor cutoff + browser: + kwargs: + headless: true + text_mode: true + extra_args: + # - "--single-process" + - "--no-sandbox" + - "--disable-dev-shm-usage" + - "--disable-gpu" + - "--disable-software-rasterizer" + - "--disable-web-security" + - "--allow-insecure-localhost" + - "--ignore-certificate-errors" # Logging Configuration logging: level: "INFO" format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" - file: "logs/app.log" - verbose: true # Observability Configuration observability: @@ -102,4 +88,4 @@ observability: enabled: True endpoint: "/metrics" health_check: - endpoint: "/health" + endpoint: "/health" \ No newline at end of file diff --git a/deploy/docker/crawler_manager.py b/deploy/docker/crawler_manager.py deleted file mode 100644 index b566e2d3..00000000 --- a/deploy/docker/crawler_manager.py +++ /dev/null @@ -1,556 +0,0 @@ -# crawler_manager.py -import asyncio -import time -import uuid -import psutil -import os -import resource # For FD limit -import random -import math -from typing import Optional, Tuple, Any, List, Dict, AsyncGenerator -from pydantic import BaseModel, Field, field_validator -from contextlib import asynccontextmanager -import logging - -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, AsyncLogger -# Assuming api.py handlers are accessible or refactored slightly if needed -# We might need to import the specific handler functions if we call them directly -# from api import handle_crawl_request, handle_stream_crawl_request, _get_memory_mb, stream_results - -# --- Custom Exceptions --- -class PoolTimeoutError(Exception): - """Raised when waiting for a crawler resource times out.""" - pass - -class PoolConfigurationError(Exception): - """Raised for configuration issues.""" - pass - -class NoHealthyCrawlerError(Exception): - """Raised when no healthy crawler is available.""" - pass - - -# --- Configuration Models --- -class CalculationParams(BaseModel): - mem_headroom_mb: int = 512 - avg_page_mem_mb: int = 150 - fd_per_page: int = 20 - core_multiplier: int = 4 - min_pool_size: int = 1 # Min safe pages should be at least 1 - max_pool_size: int = 16 - - # V2 validation for avg_page_mem_mb - @field_validator('avg_page_mem_mb') - @classmethod - def check_avg_page_mem(cls, v: int) -> int: - if v <= 0: - raise ValueError("avg_page_mem_mb must be positive") - return v - - # V2 validation for fd_per_page - @field_validator('fd_per_page') - @classmethod - def check_fd_per_page(cls, v: int) -> int: - if v <= 0: - raise ValueError("fd_per_page must be positive") - return v - -# crawler_manager.py -# ... (imports including BaseModel, Field from pydantic) ... 
-from pydantic import BaseModel, Field, field_validator # <-- Import field_validator - -# --- Configuration Models (Pydantic V2 Syntax) --- -class CalculationParams(BaseModel): - mem_headroom_mb: int = 512 - avg_page_mem_mb: int = 150 - fd_per_page: int = 20 - core_multiplier: int = 4 - min_pool_size: int = 1 # Min safe pages should be at least 1 - max_pool_size: int = 16 - - # V2 validation for avg_page_mem_mb - @field_validator('avg_page_mem_mb') - @classmethod - def check_avg_page_mem(cls, v: int) -> int: - if v <= 0: - raise ValueError("avg_page_mem_mb must be positive") - return v - - # V2 validation for fd_per_page - @field_validator('fd_per_page') - @classmethod - def check_fd_per_page(cls, v: int) -> int: - if v <= 0: - raise ValueError("fd_per_page must be positive") - return v - -class CrawlerManagerConfig(BaseModel): - enabled: bool = True - auto_calculate_size: bool = True - calculation_params: CalculationParams = Field(default_factory=CalculationParams) # Use Field for default_factory - backup_pool_size: int = Field(1, ge=0) # Allow 0 backups - max_wait_time_s: float = 30.0 - throttle_threshold_percent: float = Field(70.0, ge=0, le=100) - throttle_delay_min_s: float = 0.1 - throttle_delay_max_s: float = 0.5 - browser_config: Dict[str, Any] = Field(default_factory=lambda: {"headless": True, "verbose": False}) # Use Field for default_factory - primary_reload_delay_s: float = 60.0 - -# --- Crawler Manager --- -class CrawlerManager: - """Manages shared AsyncWebCrawler instances, concurrency, and failover.""" - - def __init__(self, config: CrawlerManagerConfig, logger = None): - if not config.enabled: - self.logger.warning("CrawlerManager is disabled by configuration.") - # Set defaults to allow server to run, but manager won't function - self.config = config - self._initialized = False, - return - - self.config = config - self._primary_crawler: Optional[AsyncWebCrawler] = None - self._secondary_crawlers: List[AsyncWebCrawler] = [] - self._active_crawler_index: int = 0 # 0 for primary, 1+ for secondary index - self._primary_healthy: bool = False - self._secondary_healthy_flags: List[bool] = [] - - self._safe_pages: int = 1 # Default, calculated in initialize - self._semaphore: Optional[asyncio.Semaphore] = None - self._state_lock = asyncio.Lock() # Protects active_crawler, health flags - self._reload_tasks: List[Optional[asyncio.Task]] = [] # Track reload background tasks - - self._initialized = False - self._shutting_down = False - - # Initialize logger if provided - if logger is None: - self.logger = logging.getLogger(__name__) - self.logger.setLevel(logging.INFO) - else: - self.logger = logger - - self.logger.info("CrawlerManager initialized with config.") - self.logger.debug(f"Config: {self.config.model_dump_json(indent=2)}") - - def is_enabled(self) -> bool: - return self.config.enabled and self._initialized - - def _get_system_resources(self) -> Tuple[int, int, int]: - """Gets RAM, CPU cores, and FD limit.""" - total_ram_mb = 0 - cpu_cores = 0 - try: - mem_info = psutil.virtual_memory() - total_ram_mb = mem_info.total // (1024 * 1024) - cpu_cores = psutil.cpu_count(logical=False) or psutil.cpu_count(logical=True) # Prefer physical cores - except Exception as e: - self.logger.warning(f"Could not get RAM/CPU info via psutil: {e}") - total_ram_mb = 2048 # Default fallback - cpu_cores = 2 # Default fallback - - fd_limit = 1024 # Default fallback - try: - soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE) - fd_limit = soft_limit # Use the soft limit - except 
(ImportError, ValueError, OSError, AttributeError) as e: - self.logger.warning(f"Could not get file descriptor limit (common on Windows): {e}. Using default: {fd_limit}") - - self.logger.info(f"System Resources: RAM={total_ram_mb}MB, Cores={cpu_cores}, FD Limit={fd_limit}") - return total_ram_mb, cpu_cores, fd_limit - - def _calculate_safe_pages(self) -> int: - """Calculates the safe number of concurrent pages based on resources.""" - if not self.config.auto_calculate_size: - # If auto-calc is off, use max_pool_size as the hard limit - # This isn't ideal based on the prompt, but provides *some* manual override - # A dedicated `manual_safe_pages` might be better. Let's use max_pool_size for now. - self.logger.warning("Auto-calculation disabled. Using max_pool_size as safe_pages limit.") - return self.config.calculation_params.max_pool_size - - params = self.config.calculation_params - total_ram_mb, cpu_cores, fd_limit = self._get_system_resources() - - available_ram_mb = total_ram_mb - params.mem_headroom_mb - if available_ram_mb <= 0: - self.logger.error(f"Not enough RAM ({total_ram_mb}MB) after headroom ({params.mem_headroom_mb}MB). Cannot calculate safe pages.") - return params.min_pool_size # Fallback to minimum - - try: - # Calculate limits from each resource - mem_limit = available_ram_mb // params.avg_page_mem_mb if params.avg_page_mem_mb > 0 else float('inf') - fd_limit_pages = fd_limit // params.fd_per_page if params.fd_per_page > 0 else float('inf') - cpu_limit = cpu_cores * params.core_multiplier if cpu_cores > 0 else float('inf') - - # Determine the most constraining limit - calculated_limit = math.floor(min(mem_limit, fd_limit_pages, cpu_limit)) - - except ZeroDivisionError: - self.logger.error("Division by zero in safe_pages calculation (avg_page_mem_mb or fd_per_page is zero).") - calculated_limit = params.min_pool_size # Fallback - - # Clamp the result within min/max bounds - safe_pages = max(params.min_pool_size, min(calculated_limit, params.max_pool_size)) - - self.logger.info(f"Calculated safe pages: MemoryLimit={mem_limit}, FDLimit={fd_limit_pages}, CPULimit={cpu_limit} -> RawCalc={calculated_limit} -> Clamped={safe_pages}") - return safe_pages - - async def _create_and_start_crawler(self, crawler_id: str) -> Optional[AsyncWebCrawler]: - """Creates, starts, and returns a crawler instance.""" - try: - # Create BrowserConfig from the dictionary in manager config - browser_conf = BrowserConfig(**self.config.browser_config) - crawler = AsyncWebCrawler(config=browser_conf) - await crawler.start() - self.logger.info(f"Successfully started crawler instance: {crawler_id}") - return crawler - except Exception as e: - self.logger.error(f"Failed to start crawler instance {crawler_id}: {e}", exc_info=True) - return None - - async def initialize(self): - """Initializes crawlers and semaphore. 
Called at server startup.""" - if not self.config.enabled or self._initialized: - return - - self.logger.info("Initializing CrawlerManager...") - self._safe_pages = self._calculate_safe_pages() - self._semaphore = asyncio.Semaphore(self._safe_pages) - - self._primary_crawler = await self._create_and_start_crawler("Primary") - if self._primary_crawler: - self._primary_healthy = True - else: - self._primary_healthy = False - self.logger.critical("Primary crawler failed to initialize!") - - self._secondary_crawlers = [] - self._secondary_healthy_flags = [] - self._reload_tasks = [None] * (1 + self.config.backup_pool_size) # For primary + backups - - for i in range(self.config.backup_pool_size): - sec_id = f"Secondary-{i+1}" - crawler = await self._create_and_start_crawler(sec_id) - self._secondary_crawlers.append(crawler) # Add even if None - self._secondary_healthy_flags.append(crawler is not None) - if crawler is None: - self.logger.error(f"{sec_id} crawler failed to initialize!") - - # Set initial active crawler (prefer primary) - if self._primary_healthy: - self._active_crawler_index = 0 - self.logger.info("Primary crawler is active.") - else: - # Find the first healthy secondary - found_healthy_backup = False - for i, healthy in enumerate(self._secondary_healthy_flags): - if healthy: - self._active_crawler_index = i + 1 # 1-based index for secondaries - self.logger.warning(f"Primary failed, Secondary-{i+1} is active.") - found_healthy_backup = True - break - if not found_healthy_backup: - self.logger.critical("FATAL: No healthy crawlers available after initialization!") - # Server should probably refuse connections in this state - - self._initialized = True - self.logger.info(f"CrawlerManager initialized. Safe Pages: {self._safe_pages}. Active Crawler Index: {self._active_crawler_index}") - - async def shutdown(self): - """Shuts down all crawler instances. 
Called at server shutdown.""" - if not self._initialized or self._shutting_down: - return - - self._shutting_down = True - self.logger.info("Shutting down CrawlerManager...") - - # Cancel any ongoing reload tasks - for i, task in enumerate(self._reload_tasks): - if task and not task.done(): - try: - task.cancel() - await task # Wait for cancellation - self.logger.info(f"Cancelled reload task for crawler index {i}.") - except asyncio.CancelledError: - self.logger.info(f"Reload task for crawler index {i} was already cancelled.") - except Exception as e: - self.logger.warning(f"Error cancelling reload task for crawler index {i}: {e}") - self._reload_tasks = [] - - - # Close primary - if self._primary_crawler: - try: - self.logger.info("Closing primary crawler...") - await self._primary_crawler.close() - self._primary_crawler = None - except Exception as e: - self.logger.error(f"Error closing primary crawler: {e}", exc_info=True) - - # Close secondaries - for i, crawler in enumerate(self._secondary_crawlers): - if crawler: - try: - self.logger.info(f"Closing secondary crawler {i+1}...") - await crawler.close() - except Exception as e: - self.logger.error(f"Error closing secondary crawler {i+1}: {e}", exc_info=True) - self._secondary_crawlers = [] - - self._initialized = False - self.logger.info("CrawlerManager shut down complete.") - - @asynccontextmanager - async def get_crawler(self) -> AsyncGenerator[AsyncWebCrawler, None]: - """Acquires semaphore, yields active crawler, handles throttling & failover.""" - if not self.is_enabled(): - raise NoHealthyCrawlerError("CrawlerManager is disabled or not initialized.") - - if self._shutting_down: - raise NoHealthyCrawlerError("CrawlerManager is shutting down.") - - active_crawler: Optional[AsyncWebCrawler] = None - acquired = False - request_id = uuid.uuid4() - start_wait = time.time() - - # --- Throttling --- - try: - # Check semaphore value without acquiring - current_usage = self._safe_pages - self._semaphore._value - usage_percent = (current_usage / self._safe_pages) * 100 if self._safe_pages > 0 else 0 - - if usage_percent >= self.config.throttle_threshold_percent: - delay = random.uniform(self.config.throttle_delay_min_s, self.config.throttle_delay_max_s) - self.logger.debug(f"Throttling: Usage {usage_percent:.1f}% >= {self.config.throttle_threshold_percent}%. Delaying {delay:.3f}s") - await asyncio.sleep(delay) - except Exception as e: - self.logger.warning(f"Error during throttling check: {e}") # Continue attempt even if throttle check fails - - # --- Acquire Semaphore --- - try: - # self.logger.debug(f"Attempting to acquire semaphore (Available: {self._semaphore._value}/{self._safe_pages}). Wait Timeout: {self.config.max_wait_time_s}s") - - # --- Logging Before Acquire --- - sem_value = self._semaphore._value if self._semaphore else 'N/A' - sem_waiters = len(self._semaphore._waiters) if self._semaphore and self._semaphore._waiters else 0 - self.logger.debug(f"Req {request_id}: Attempting acquire. Available={sem_value}/{self._safe_pages}, Waiters={sem_waiters}, Timeout={self.config.max_wait_time_s}s") - - await asyncio.wait_for( - self._semaphore.acquire(), timeout=self.config.max_wait_time_s - ) - acquired = True - wait_duration = time.time() - start_wait - if wait_duration > 1: - self.logger.warning(f"Semaphore acquired after {wait_duration:.3f}s. (Available: {self._semaphore._value}/{self._safe_pages})") - - self.logger.debug(f"Semaphore acquired successfully after {wait_duration:.3f}s. 
(Available: {self._semaphore._value}/{self._safe_pages})") - - # --- Select Active Crawler (Critical Section) --- - async with self._state_lock: - current_active_index = self._active_crawler_index - is_primary_active = (current_active_index == 0) - - if is_primary_active: - if self._primary_healthy and self._primary_crawler: - active_crawler = self._primary_crawler - else: - # Primary is supposed to be active but isn't healthy - self.logger.warning("Primary crawler unhealthy, attempting immediate failover...") - if not await self._try_failover_sync(): # Try to switch active crawler NOW - raise NoHealthyCrawlerError("Primary unhealthy and no healthy backup available.") - # If failover succeeded, active_crawler_index is updated - current_active_index = self._active_crawler_index - # Fall through to select the new active secondary - - # Check if we need to use a secondary (either initially or after failover) - if current_active_index > 0: - secondary_idx = current_active_index - 1 - if secondary_idx < len(self._secondary_crawlers) and \ - self._secondary_healthy_flags[secondary_idx] and \ - self._secondary_crawlers[secondary_idx]: - active_crawler = self._secondary_crawlers[secondary_idx] - else: - self.logger.error(f"Selected Secondary-{current_active_index} is unhealthy or missing.") - # Attempt failover to *another* secondary if possible? (Adds complexity) - # For now, raise error if the selected one isn't good. - raise NoHealthyCrawlerError(f"Selected Secondary-{current_active_index} is unavailable.") - - if active_crawler is None: - # This shouldn't happen if logic above is correct, but safeguard - raise NoHealthyCrawlerError("Failed to select a healthy active crawler.") - - # --- Yield Crawler --- - try: - yield active_crawler - except Exception as crawl_error: - self.logger.error(f"Error during crawl execution using {active_crawler}: {crawl_error}", exc_info=True) - # Determine if this error warrants failover - # For now, let's assume any exception triggers a health check/failover attempt - await self._handle_crawler_failure(active_crawler) - raise # Re-raise the original error for the API handler - - except asyncio.TimeoutError: - self.logger.warning(f"Timeout waiting for semaphore after {self.config.max_wait_time_s}s.") - raise PoolTimeoutError(f"Timed out waiting for available crawler resource after {self.config.max_wait_time_s}s") - except NoHealthyCrawlerError: - # Logged within the selection logic - raise # Re-raise for API handler - except Exception as e: - self.logger.error(f"Unexpected error in get_crawler context manager: {e}", exc_info=True) - raise # Re-raise potentially unknown errors - finally: - if acquired: - self._semaphore.release() - self.logger.debug(f"Semaphore released. (Available: {self._semaphore._value}/{self._safe_pages})") - - - async def _try_failover_sync(self) -> bool: - """Synchronous part of failover logic (must be called under state_lock). 
Finds next healthy secondary.""" - if not self._primary_healthy: # Only failover if primary is already marked down - found_healthy_backup = False - start_idx = (self._active_crawler_index % (self.config.backup_pool_size +1)) # Start check after current - for i in range(self.config.backup_pool_size): - check_idx = (start_idx + i) % self.config.backup_pool_size # Circular check - if self._secondary_healthy_flags[check_idx] and self._secondary_crawlers[check_idx]: - self._active_crawler_index = check_idx + 1 - self.logger.warning(f"Failover successful: Switched active crawler to Secondary-{self._active_crawler_index}") - found_healthy_backup = True - break # Found one - if not found_healthy_backup: - # If primary is down AND no backups are healthy, mark primary as active index (0) but it's still unhealthy - self._active_crawler_index = 0 - self.logger.error("Failover failed: No healthy secondary crawlers available.") - return False - return True - return True # Primary is healthy, no failover needed - - async def _handle_crawler_failure(self, failed_crawler: AsyncWebCrawler): - """Handles marking a crawler as unhealthy and initiating recovery.""" - if self._shutting_down: return # Don't handle failures during shutdown - - async with self._state_lock: - crawler_index = -1 - is_primary = False - - if failed_crawler is self._primary_crawler and self._primary_healthy: - self.logger.warning("Primary crawler reported failure.") - self._primary_healthy = False - is_primary = True - crawler_index = 0 - # Try immediate failover within the lock - await self._try_failover_sync() - # Start reload task if not already running for primary - if self._reload_tasks[0] is None or self._reload_tasks[0].done(): - self.logger.info("Initiating primary crawler reload task.") - self._reload_tasks[0] = asyncio.create_task(self._reload_crawler(0)) - - else: - # Check if it was one of the secondaries - for i, crawler in enumerate(self._secondary_crawlers): - if failed_crawler is crawler and self._secondary_healthy_flags[i]: - self.logger.warning(f"Secondary-{i+1} crawler reported failure.") - self._secondary_healthy_flags[i] = False - is_primary = False - crawler_index = i + 1 - # If this *was* the active crawler, trigger failover check - if self._active_crawler_index == crawler_index: - self.logger.warning(f"Active secondary {crawler_index} failed, attempting failover...") - await self._try_failover_sync() - # Start reload task for this secondary - if self._reload_tasks[crawler_index] is None or self._reload_tasks[crawler_index].done(): - self.logger.info(f"Initiating Secondary-{i+1} crawler reload task.") - self._reload_tasks[crawler_index] = asyncio.create_task(self._reload_crawler(crawler_index)) - break # Found the failed secondary - - if crawler_index == -1: - self.logger.debug("Failure reported by an unknown or already unhealthy crawler instance. Ignoring.") - - - async def _reload_crawler(self, crawler_index_to_reload: int): - """Background task to close, recreate, and start a specific crawler.""" - is_primary = (crawler_index_to_reload == 0) - crawler_id = "Primary" if is_primary else f"Secondary-{crawler_index_to_reload}" - original_crawler = self._primary_crawler if is_primary else self._secondary_crawlers[crawler_index_to_reload - 1] - - self.logger.info(f"Starting reload process for {crawler_id}...") - - # 1. Delay before attempting reload (e.g., allow transient issues to clear) - if not is_primary: # Maybe shorter delay for backups? 
- await asyncio.sleep(self.config.primary_reload_delay_s / 2) - else: - await asyncio.sleep(self.config.primary_reload_delay_s) - - - # 2. Attempt to close the old instance cleanly - if original_crawler: - try: - self.logger.info(f"Attempting to close existing {crawler_id} instance...") - await original_crawler.close() - self.logger.info(f"Successfully closed old {crawler_id} instance.") - except Exception as e: - self.logger.warning(f"Error closing old {crawler_id} instance during reload: {e}") - - # 3. Create and start a new instance - self.logger.info(f"Attempting to start new {crawler_id} instance...") - new_crawler = await self._create_and_start_crawler(crawler_id) - - # 4. Update state if successful - async with self._state_lock: - if new_crawler: - self.logger.info(f"Successfully reloaded {crawler_id}. Marking as healthy.") - if is_primary: - self._primary_crawler = new_crawler - self._primary_healthy = True - # Switch back to primary if no other failures occurred - # Check if ANY secondary is currently active - secondary_is_active = self._active_crawler_index > 0 - if not secondary_is_active or not self._secondary_healthy_flags[self._active_crawler_index - 1]: - self.logger.info("Switching active crawler back to primary.") - self._active_crawler_index = 0 - else: # Is secondary - secondary_idx = crawler_index_to_reload - 1 - self._secondary_crawlers[secondary_idx] = new_crawler - self._secondary_healthy_flags[secondary_idx] = True - # Potentially switch back if primary is still down and this was needed? - if not self._primary_healthy and self._active_crawler_index == 0: - self.logger.info(f"Primary still down, activating reloaded Secondary-{crawler_index_to_reload}.") - self._active_crawler_index = crawler_index_to_reload - - else: - self.logger.error(f"Failed to reload {crawler_id}. 
It remains unhealthy.") - # Keep the crawler marked as unhealthy - if is_primary: - self._primary_healthy = False # Ensure it stays false - else: - self._secondary_healthy_flags[crawler_index_to_reload - 1] = False - - - # Clear the reload task reference for this index - self._reload_tasks[crawler_index_to_reload] = None - - - async def get_status(self) -> Dict: - """Returns the current status of the manager.""" - if not self.is_enabled(): - return {"status": "disabled"} - - async with self._state_lock: - active_id = "Primary" if self._active_crawler_index == 0 else f"Secondary-{self._active_crawler_index}" - primary_status = "Healthy" if self._primary_healthy else "Unhealthy" - secondary_statuses = [f"Secondary-{i+1}: {'Healthy' if healthy else 'Unhealthy'}" - for i, healthy in enumerate(self._secondary_healthy_flags)] - semaphore_available = self._semaphore._value if self._semaphore else 'N/A' - semaphore_locked = len(self._semaphore._waiters) if self._semaphore and self._semaphore._waiters else 0 - - return { - "status": "enabled", - "safe_pages": self._safe_pages, - "semaphore_available": semaphore_available, - "semaphore_waiters": semaphore_locked, - "active_crawler": active_id, - "primary_status": primary_status, - "secondary_statuses": secondary_statuses, - "reloading_tasks": [i for i, t in enumerate(self._reload_tasks) if t and not t.done()] - } \ No newline at end of file diff --git a/deploy/docker/crawler_pool.py b/deploy/docker/crawler_pool.py new file mode 100644 index 00000000..d15102e4 --- /dev/null +++ b/deploy/docker/crawler_pool.py @@ -0,0 +1,60 @@ +# crawler_pool.py (new file) +import asyncio, json, hashlib, time, psutil +from contextlib import suppress +from typing import Dict +from crawl4ai import AsyncWebCrawler, BrowserConfig +from typing import Dict +from utils import load_config + +CONFIG = load_config() + +POOL: Dict[str, AsyncWebCrawler] = {} +LAST_USED: Dict[str, float] = {} +LOCK = asyncio.Lock() + +MEM_LIMIT = CONFIG.get("crawler", {}).get("memory_threshold_percent", 95.0) # % RAM – refuse new browsers above this +IDLE_TTL = CONFIG.get("crawler", {}).get("pool", {}).get("idle_ttl_sec", 1800) # close if unused for 30 min + +def _sig(cfg: BrowserConfig) -> str: + payload = json.dumps(cfg.to_dict(), sort_keys=True, separators=(",",":")) + return hashlib.sha1(payload.encode()).hexdigest() + +async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler: + try: + sig = _sig(cfg) + async with LOCK: + if sig in POOL: + LAST_USED[sig] = time.time(); + return POOL[sig] + if psutil.virtual_memory().percent >= MEM_LIMIT: + raise MemoryError("RAM pressure – new browser denied") + crawler = AsyncWebCrawler(config=cfg, thread_safe=False) + await crawler.start() + POOL[sig] = crawler; LAST_USED[sig] = time.time() + return crawler + except MemoryError as e: + raise MemoryError(f"RAM pressure – new browser denied: {e}") + except Exception as e: + raise RuntimeError(f"Failed to start browser: {e}") + finally: + if sig in POOL: + LAST_USED[sig] = time.time() + else: + # If we failed to start the browser, we should remove it from the pool + POOL.pop(sig, None) + LAST_USED.pop(sig, None) + # If we failed to start the browser, we should remove it from the pool +async def close_all(): + async with LOCK: + await asyncio.gather(*(c.close() for c in POOL.values()), return_exceptions=True) + POOL.clear(); LAST_USED.clear() + +async def janitor(): + while True: + await asyncio.sleep(60) + now = time.time() + async with LOCK: + for sig, crawler in list(POOL.items()): + if now - 
LAST_USED[sig] > IDLE_TTL: + with suppress(Exception): await crawler.close() + POOL.pop(sig, None); LAST_USED.pop(sig, None) diff --git a/deploy/docker/server.py b/deploy/docker/server.py index f577348b..ae60ffa2 100644 --- a/deploy/docker/server.py +++ b/deploy/docker/server.py @@ -1,167 +1,200 @@ -# Import from auth.py -from auth import create_access_token, get_token_dependency, TokenRequest -from api import ( - handle_markdown_request, - handle_llm_qa, - handle_stream_crawl_request, - handle_crawl_request, - stream_results, - _get_memory_mb -) -from utils import FilterType, load_config, setup_logging, verify_email_domain -import os -import sys -import time -from typing import List, Optional, Dict, AsyncGenerator +# ───────────────────────── server.py ───────────────────────── +""" +Crawl4AI FastAPI entry‑point +• Browser pool + global page cap +• Rate‑limiting, security, metrics +• /crawl, /crawl/stream, /md, /llm endpoints +""" + +# ── stdlib & 3rd‑party imports ─────────────────────────────── +import os, sys, time, asyncio +from typing import List, Optional, Dict from contextlib import asynccontextmanager -from fastapi import FastAPI, HTTPException, Request, Query, Path, Depends, status -from fastapi.responses import StreamingResponse, RedirectResponse, PlainTextResponse, JSONResponse +import pathlib + +from fastapi import ( + FastAPI, HTTPException, Request, Path, Query, Depends +) +from fastapi.responses import ( + StreamingResponse, RedirectResponse, PlainTextResponse, JSONResponse +) from fastapi.middleware.httpsredirect import HTTPSRedirectMiddleware from fastapi.middleware.trustedhost import TrustedHostMiddleware +from fastapi.staticfiles import StaticFiles + +import ast, crawl4ai as _c4 from pydantic import BaseModel, Field from slowapi import Limiter from slowapi.util import get_remote_address from prometheus_fastapi_instrumentator import Instrumentator from redis import asyncio as aioredis -from crawl4ai import ( - BrowserConfig, - CrawlerRunConfig, - AsyncLogger -) - -from crawler_manager import ( - CrawlerManager, - CrawlerManagerConfig, - PoolTimeoutError, - NoHealthyCrawlerError -) - +# ── internal imports (after sys.path append) ───────────────── sys.path.append(os.path.dirname(os.path.realpath(__file__))) +from utils import ( + FilterType, load_config, setup_logging, verify_email_domain +) +from api import ( + handle_markdown_request, handle_llm_qa, + handle_stream_crawl_request, handle_crawl_request, + stream_results +) +from auth import create_access_token, get_token_dependency, TokenRequest +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig +from crawler_pool import get_crawler, close_all, janitor -__version__ = "0.2.6" - - -class CrawlRequest(BaseModel): - urls: List[str] = Field(min_length=1, max_length=100) - browser_config: Optional[Dict] = Field(default_factory=dict) - crawler_config: Optional[Dict] = Field(default_factory=dict) - - -# Load configuration and setup +# ────────────────── configuration / logging ────────────────── config = load_config() setup_logging(config) -logger = AsyncLogger( - log_file=config["logging"].get("log_file", "app.log"), - verbose=config["logging"].get("verbose", False), - tag_width=10, -) -# Initialize Redis -redis = aioredis.from_url(config["redis"].get("uri", "redis://localhost")) +__version__ = "0.5.1-d1" -# Initialize rate limiter -limiter = Limiter( - key_func=get_remote_address, - default_limits=[config["rate_limiting"]["default_limit"]], - storage_uri=config["rate_limiting"]["storage_uri"] -) +# ── global 
page semaphore (hard cap) ───────────────────────── +MAX_PAGES = config["crawler"]["pool"].get("max_pages", 30) +GLOBAL_SEM = asyncio.Semaphore(MAX_PAGES) -# --- Initialize Manager (will be done in lifespan) --- -# Load manager config from the main config -manager_config_dict = config.get("crawler_pool", {}) -# Use Pydantic to parse and validate -manager_config = CrawlerManagerConfig(**manager_config_dict) -crawler_manager = CrawlerManager(config=manager_config, logger=logger) - -# --- FastAPI App and Lifespan --- +# import logging +# page_log = logging.getLogger("page_cap") +# orig_arun = AsyncWebCrawler.arun +# async def capped_arun(self, *a, **kw): +# await GLOBAL_SEM.acquire() # ← take slot +# try: +# in_flight = MAX_PAGES - GLOBAL_SEM._value # used permits +# page_log.info("🕸️ pages_in_flight=%s / %s", in_flight, MAX_PAGES) +# return await orig_arun(self, *a, **kw) +# finally: +# GLOBAL_SEM.release() # ← free slot +orig_arun = AsyncWebCrawler.arun +async def capped_arun(self, *a, **kw): + async with GLOBAL_SEM: + return await orig_arun(self, *a, **kw) +AsyncWebCrawler.arun = capped_arun +# ───────────────────── FastAPI lifespan ────────────────────── @asynccontextmanager -async def lifespan(app: FastAPI): - # Startup - logger.info("Starting up the server...") - if manager_config.enabled: - logger.info("Initializing Crawler Manager...") - await crawler_manager.initialize() - app.state.crawler_manager = crawler_manager # Store manager in app state - logger.info("Crawler Manager is enabled.") - else: - logger.warning("Crawler Manager is disabled.") - app.state.crawler_manager = None # Indicate disabled state - - yield # Server runs here - - # Shutdown - logger.info("Shutting down server...") - if app.state.crawler_manager: - logger.info("Shutting down Crawler Manager...") - await app.state.crawler_manager.shutdown() - logger.info("Crawler Manager shut down.") - logger.info("Server shut down.") +async def lifespan(_: FastAPI): + await get_crawler(BrowserConfig( + extra_args=config["crawler"]["browser"].get("extra_args", []), + **config["crawler"]["browser"].get("kwargs", {}), + )) # warm‑up + app.state.janitor = asyncio.create_task(janitor()) # idle GC + yield + app.state.janitor.cancel() + await close_all() +# ───────────────────── FastAPI instance ────────────────────── app = FastAPI( title=config["app"]["title"], version=config["app"]["version"], lifespan=lifespan, ) -# Configure middleware -def setup_security_middleware(app, config): - sec_config = config.get("security", {}) - if sec_config.get("enabled", False): - if sec_config.get("https_redirect", False): - app.add_middleware(HTTPSRedirectMiddleware) - if sec_config.get("trusted_hosts", []) != ["*"]: - app.add_middleware(TrustedHostMiddleware, - allowed_hosts=sec_config["trusted_hosts"]) +# ── static playground ────────────────────────────────────── +STATIC_DIR = pathlib.Path(__file__).parent / "static" / "playground" +if not STATIC_DIR.exists(): + raise RuntimeError(f"Playground assets not found at {STATIC_DIR}") +app.mount( + "/playground", + StaticFiles(directory=STATIC_DIR, html=True), + name="play", +) +# Optional nice‑to‑have: opening the root shows the playground +@app.get("/") +async def root(): + return RedirectResponse("/playground") -setup_security_middleware(app, config) +# ─────────────────── infra / middleware ───────────────────── +redis = aioredis.from_url(config["redis"].get("uri", "redis://localhost")) + +limiter = Limiter( + key_func=get_remote_address, + default_limits=[config["rate_limiting"]["default_limit"]], 
+ storage_uri=config["rate_limiting"]["storage_uri"], +) + +def _setup_security(app_: FastAPI): + sec = config["security"] + if not sec["enabled"]: + return + if sec.get("https_redirect"): + app_.add_middleware(HTTPSRedirectMiddleware) + if sec.get("trusted_hosts", []) != ["*"]: + app_.add_middleware( + TrustedHostMiddleware, allowed_hosts=sec["trusted_hosts"] + ) +_setup_security(app) -# Prometheus instrumentation if config["observability"]["prometheus"]["enabled"]: Instrumentator().instrument(app).expose(app) -# Get token dependency based on config -token_dependency = get_token_dependency(config) - -# Middleware for security headers - +token_dep = get_token_dependency(config) @app.middleware("http") async def add_security_headers(request: Request, call_next): - response = await call_next(request) + resp = await call_next(request) if config["security"]["enabled"]: - response.headers.update(config["security"]["headers"]) - return response + resp.headers.update(config["security"]["headers"]) + return resp + +# ───────────────── safe config‑dump helper ───────────────── +ALLOWED_TYPES = { + "CrawlerRunConfig": CrawlerRunConfig, + "BrowserConfig": BrowserConfig, +} + +def _safe_eval_config(expr: str) -> dict: + """ + Accept exactly one top‑level call to CrawlerRunConfig(...) or BrowserConfig(...). + Whatever is inside the parentheses is fine *except* further function calls + (so no __import__('os') stuff). All public names from crawl4ai are available + when we eval. + """ + tree = ast.parse(expr, mode="eval") + + # must be a single call + if not isinstance(tree.body, ast.Call): + raise ValueError("Expression must be a single constructor call") + + call = tree.body + if not (isinstance(call.func, ast.Name) and call.func.id in {"CrawlerRunConfig", "BrowserConfig"}): + raise ValueError("Only CrawlerRunConfig(...) or BrowserConfig(...) 
are allowed") + + # forbid nested calls to keep the surface tiny + for node in ast.walk(call): + if isinstance(node, ast.Call) and node is not call: + raise ValueError("Nested function calls are not permitted") + + # expose everything that crawl4ai exports, nothing else + safe_env = {name: getattr(_c4, name) for name in dir(_c4) if not name.startswith("_")} + obj = eval(compile(tree, "", "eval"), {"__builtins__": {}}, safe_env) + return obj.dump() -async def get_manager() -> CrawlerManager: - # Ensure manager exists and is enabled before yielding - if not hasattr(app.state, 'crawler_manager') or app.state.crawler_manager is None: - raise HTTPException( - status_code=status.HTTP_503_SERVICE_UNAVAILABLE, - detail="Crawler service is disabled or not initialized" - ) - if not app.state.crawler_manager.is_enabled(): - raise HTTPException( - status_code=status.HTTP_503_SERVICE_UNAVAILABLE, - detail="Crawler service is currently disabled" - ) - return app.state.crawler_manager - -# Token endpoint (always available, but usage depends on config) +# ───────────────────────── Schemas ─────────────────────────── +class CrawlRequest(BaseModel): + urls: List[str] = Field(min_length=1, max_length=100) + browser_config: Optional[Dict] = Field(default_factory=dict) + crawler_config: Optional[Dict] = Field(default_factory=dict) +class RawCode(BaseModel): + code: str +# ──────────────────────── Endpoints ────────────────────────── @app.post("/token") -async def get_token(request_data: TokenRequest): - if not verify_email_domain(request_data.email): - raise HTTPException(status_code=400, detail="Invalid email domain") - token = create_access_token({"sub": request_data.email}) - return {"email": request_data.email, "access_token": token, "token_type": "bearer"} +async def get_token(req: TokenRequest): + if not verify_email_domain(req.email): + raise HTTPException(400, "Invalid email domain") + token = create_access_token({"sub": req.email}) + return {"email": req.email, "access_token": token, "token_type": "bearer"} -# Endpoints with conditional auth +@app.post("/config/dump") +async def config_dump(raw: RawCode): + try: + return JSONResponse(_safe_eval_config(raw.code.strip())) + except Exception as e: + raise HTTPException(400, str(e)) @app.get("/md/{url:path}") @@ -171,230 +204,83 @@ async def get_markdown( url: str, f: FilterType = FilterType.FIT, q: Optional[str] = None, - c: Optional[str] = "0", - token_data: Optional[Dict] = Depends(token_dependency) + c: str = "0", + _td: Dict = Depends(token_dep), ): - result = await handle_markdown_request(url, f, q, c, config) - return PlainTextResponse(result) + md = await handle_markdown_request(url, f, q, c, config) + return PlainTextResponse(md) - -@app.get("/llm/{url:path}", description="URL should be without http/https prefix") +@app.get("/llm/{url:path}") async def llm_endpoint( request: Request, url: str = Path(...), q: Optional[str] = Query(None), - token_data: Optional[Dict] = Depends(token_dependency) + _td: Dict = Depends(token_dep), ): if not q: - raise HTTPException( - status_code=400, detail="Query parameter 'q' is required") - if not url.startswith(('http://', 'https://')): - url = 'https://' + url - try: - answer = await handle_llm_qa(url, q, config) - return JSONResponse({"answer": answer}) - except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) - + raise HTTPException(400, "Query parameter 'q' is required") + if not url.startswith(("http://", "https://")): + url = "https://" + url + answer = await handle_llm_qa(url, q, 
config) + return JSONResponse({"answer": answer}) @app.get("/schema") async def get_schema(): from crawl4ai import BrowserConfig, CrawlerRunConfig - return {"browser": BrowserConfig().dump(), "crawler": CrawlerRunConfig().dump()} - + return {"browser": BrowserConfig().dump(), + "crawler": CrawlerRunConfig().dump()} @app.get(config["observability"]["health_check"]["endpoint"]) async def health(): return {"status": "ok", "timestamp": time.time(), "version": __version__} - @app.get(config["observability"]["prometheus"]["endpoint"]) async def metrics(): - return RedirectResponse(url=config["observability"]["prometheus"]["endpoint"]) - - -@app.get("/browswers") -# Optional dependency -async def health(manager: Optional[CrawlerManager] = Depends(get_manager, use_cache=False)): - base_status = {"status": "ok", "timestamp": time.time(), - "version": __version__} - if manager: - try: - manager_status = await manager.get_status() - base_status["crawler_manager"] = manager_status - except Exception as e: - base_status["crawler_manager"] = { - "status": "error", "detail": str(e)} - else: - base_status["crawler_manager"] = {"status": "disabled"} - return base_status - + return RedirectResponse(config["observability"]["prometheus"]["endpoint"]) @app.post("/crawl") @limiter.limit(config["rate_limiting"]["default_limit"]) async def crawl( request: Request, crawl_request: CrawlRequest, - manager: CrawlerManager = Depends(get_manager), # Use dependency - token_data: Optional[Dict] = Depends(token_dependency) # Keep auth + _td: Dict = Depends(token_dep), ): if not crawl_request.urls: - raise HTTPException( - status_code=400, detail="At least one URL required") - - try: - # Use the manager's context to get a crawler instance - async with manager.get_crawler() as active_crawler: - # Call the actual handler from api.py, passing the acquired crawler - results_dict = await handle_crawl_request( - crawler=active_crawler, # Pass the live crawler instance - urls=crawl_request.urls, - # Pass user-provided configs, these might override pool defaults if needed - # Or the manager/handler could decide how to merge them - browser_config=crawl_request.browser_config or {}, # Ensure dict - crawler_config=crawl_request.crawler_config or {}, # Ensure dict - config=config # Pass the global server config - ) - return JSONResponse(results_dict) - - except PoolTimeoutError as e: - logger.warning(f"Request rejected due to pool timeout: {e}") - raise HTTPException( - status_code=status.HTTP_503_SERVICE_UNAVAILABLE, # Or 429 - detail=f"Crawler resources busy. Please try again later. 
Timeout: {e}" - ) - except NoHealthyCrawlerError as e: - logger.error(f"Request failed as no healthy crawler available: {e}") - raise HTTPException( - status_code=status.HTTP_503_SERVICE_UNAVAILABLE, - detail=f"Crawler service temporarily unavailable: {e}" - ) - except HTTPException: # Re-raise HTTP exceptions from handler - raise - except Exception as e: - logger.error( - f"Unexpected error during batch crawl processing: {e}", exc_info=True) - # Return generic error, details might be logged by handle_crawl_request - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=f"An unexpected error occurred: {e}" - ) - + raise HTTPException(400, "At least one URL required") + res = await handle_crawl_request( + urls=crawl_request.urls, + browser_config=crawl_request.browser_config, + crawler_config=crawl_request.crawler_config, + config=config, + ) + return JSONResponse(res) @app.post("/crawl/stream") @limiter.limit(config["rate_limiting"]["default_limit"]) async def crawl_stream( request: Request, crawl_request: CrawlRequest, - manager: CrawlerManager = Depends(get_manager), - token_data: Optional[Dict] = Depends(token_dependency) + _td: Dict = Depends(token_dep), ): if not crawl_request.urls: - raise HTTPException( - status_code=400, detail="At least one URL required") - - try: - # THIS IS A BIT WORK OF ART RATHER THAN ENGINEERING - # Acquire the crawler context from the manager - # IMPORTANT: The context needs to be active for the *duration* of the stream - # This structure might be tricky with FastAPI's StreamingResponse which consumes - # the generator *after* the endpoint function returns. - - # --- Option A: Acquire crawler, pass to handler, handler yields --- - # (Requires handler NOT to be async generator itself, but return one) - # async with manager.get_crawler() as active_crawler: - # # Handler returns the generator - # _, results_gen = await handle_stream_crawl_request( - # crawler=active_crawler, - # urls=crawl_request.urls, - # browser_config=crawl_request.browser_config or {}, - # crawler_config=crawl_request.crawler_config or {}, - # config=config - # ) - # # PROBLEM: `active_crawler` context exits before StreamingResponse uses results_gen - # # This releases the semaphore too early. - - # --- Option B: Pass manager to handler, handler uses context internally --- - # (Requires modifying handle_stream_crawl_request signature/logic) - # This seems cleaner. Let's assume api.py is adapted for this. - # We need a way for the generator yielded by stream_results to know when - # to release the semaphore. 
- - # --- Option C: Create a wrapper generator that handles context --- - async def stream_wrapper(manager: CrawlerManager, crawl_request: CrawlRequest, config: dict) -> AsyncGenerator[bytes, None]: - active_crawler = None - try: - async with manager.get_crawler() as acquired_crawler: - active_crawler = acquired_crawler # Keep reference for cleanup - # Call the handler which returns the raw result generator - _crawler_ref, results_gen = await handle_stream_crawl_request( - crawler=acquired_crawler, - urls=crawl_request.urls, - browser_config=crawl_request.browser_config or {}, - crawler_config=crawl_request.crawler_config or {}, - config=config - ) - # Use the stream_results utility to format and yield - async for data_bytes in stream_results(_crawler_ref, results_gen): - yield data_bytes - except (PoolTimeoutError, NoHealthyCrawlerError) as e: - # Yield a final error message in the stream - error_payload = {"status": "error", "detail": str(e)} - yield (json.dumps(error_payload) + "\n").encode('utf-8') - logger.warning(f"Stream request failed: {e}") - # Re-raise might be better if StreamingResponse handles it? Test needed. - except HTTPException as e: # Catch HTTP exceptions from handler setup - error_payload = {"status": "error", - "detail": e.detail, "status_code": e.status_code} - yield (json.dumps(error_payload) + "\n").encode('utf-8') - logger.warning( - f"Stream request failed with HTTPException: {e.detail}") - except Exception as e: - error_payload = {"status": "error", - "detail": f"Unexpected stream error: {e}"} - yield (json.dumps(error_payload) + "\n").encode('utf-8') - logger.error( - f"Unexpected error during stream processing: {e}", exc_info=True) - # finally: - # Ensure crawler cleanup if stream_results doesn't handle it? - # stream_results *should* call crawler.close(), but only on the - # instance it received. If we pass the *manager* instead, this gets complex. - # Let's stick to passing the acquired_crawler and rely on stream_results. - - # Create the generator using the wrapper - streaming_generator = stream_wrapper(manager, crawl_request, config) - - return StreamingResponse( - streaming_generator, # Use the wrapper - media_type='application/x-ndjson', - headers={'Cache-Control': 'no-cache', - 'Connection': 'keep-alive', 'X-Stream-Status': 'active'} - ) - - except (PoolTimeoutError, NoHealthyCrawlerError) as e: - # These might occur if get_crawler fails *before* stream starts - # Or if the wrapper re-raises them. - logger.warning(f"Stream request rejected before starting: {e}") - status_code = status.HTTP_503_SERVICE_UNAVAILABLE # Or 429 for timeout - # Don't raise HTTPException here, let the wrapper yield the error message. - # If we want to return a non-200 initial status, need more complex handling. - # Return an *empty* stream with error headers? Or just let wrapper yield error. 
- - async def _error_stream(): - error_payload = {"status": "error", "detail": str(e)} - yield (json.dumps(error_payload) + "\n").encode('utf-8') - return StreamingResponse(_error_stream(), status_code=status_code, media_type='application/x-ndjson') - - except HTTPException: # Re-raise HTTP exceptions from setup - raise - except Exception as e: - logger.error( - f"Unexpected error setting up stream crawl: {e}", exc_info=True) - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=f"An unexpected error occurred setting up the stream: {e}" - ) + raise HTTPException(400, "At least one URL required") + crawler, gen = await handle_stream_crawl_request( + urls=crawl_request.urls, + browser_config=crawl_request.browser_config, + crawler_config=crawl_request.crawler_config, + config=config, + ) + return StreamingResponse( + stream_results(crawler, gen), + media_type="application/x-ndjson", + headers={ + "Cache-Control": "no-cache", + "Connection": "keep-alive", + "X-Stream-Status": "active", + }, + ) +# ────────────────────────── cli ────────────────────────────── if __name__ == "__main__": import uvicorn uvicorn.run( @@ -402,5 +288,6 @@ if __name__ == "__main__": host=config["app"]["host"], port=config["app"]["port"], reload=config["app"]["reload"], - timeout_keep_alive=config["app"]["timeout_keep_alive"] + timeout_keep_alive=config["app"]["timeout_keep_alive"], ) +# ───────────────────────────────────────────────────────────── diff --git a/deploy/docker/static/playground/index.html b/deploy/docker/static/playground/index.html new file mode 100644 index 00000000..8c2b3fb9 --- /dev/null +++ b/deploy/docker/static/playground/index.html @@ -0,0 +1,813 @@ + + + + + + + Crawl4AI Playground + + + + + + + + + + + + + + + + + + + + +
+ [playground index.html markup not recoverable here: an 813-line page whose header carries the "🚀🤖 Crawl4AI Playground" title with GitHub star/fork badges, a Docs link and @unclecode, and whose body provides a Request Builder with an "Advanced Config (Python → auto-JSON)" editor and JSON result panes]
    + + + + + + + + \ No newline at end of file diff --git a/tests/memory/cap_test.py b/tests/memory/cap_test.py new file mode 100644 index 00000000..56d7b261 --- /dev/null +++ b/tests/memory/cap_test.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 +""" +Hammer /crawl with many concurrent requests to prove GLOBAL_SEM works. +""" + +import asyncio, httpx, json, uuid, argparse + +API = "http://localhost:8020/crawl" +URLS_PER_CALL = 1 # keep it minimal so each arun() == 1 page +CONCURRENT_CALLS = 20 # way above your cap + +payload_template = { + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": {"cache_mode": "BYPASS", "verbose": False}, + } +} + +async def one_call(client): + payload = payload_template.copy() + payload["urls"] = [f"https://httpbin.org/anything/{uuid.uuid4()}"] + r = await client.post(API, json=payload) + r.raise_for_status() + return r.json()["server_peak_memory_mb"] + +async def main(): + async with httpx.AsyncClient(timeout=60) as client: + tasks = [asyncio.create_task(one_call(client)) for _ in range(CONCURRENT_CALLS)] + mem_usages = await asyncio.gather(*tasks) + print("Calls finished OK, server peaks reported:", mem_usages) + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/tests/memory/test_docker_congif_gen.py b/tests/memory/test_docker_congif_gen.py new file mode 100644 index 00000000..2da26078 --- /dev/null +++ b/tests/memory/test_docker_congif_gen.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 +""" +Quick sanity‑check for /config/dump endpoint. + +Usage: + python test_config_dump.py [http://localhost:8020] + +If the server isn’t running, start it first: + uvicorn deploy.docker.server:app --port 8020 +""" + +import sys, json, textwrap, requests + +BASE = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:8020" +URL = f"{BASE.rstrip('/')}/config/dump" + +CASES = [ + # --- CrawlRunConfig variants --- + "CrawlerRunConfig()", + "CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS)", + "CrawlerRunConfig(js_only=True, wait_until='networkidle')", + + # --- BrowserConfig variants --- + "BrowserConfig()", + "BrowserConfig(headless=False, extra_args=['--disable-gpu'])", + "BrowserConfig(browser_mode='builtin', proxy='http://1.2.3.4:8080')", +] + +for code in CASES: + print("\n=== POST:", code) + resp = requests.post(URL, json={"code": code}, timeout=15) + if resp.ok: + print(json.dumps(resp.json(), indent=2)[:400] + "...") + else: + print("ERROR", resp.status_code, resp.text[:200]) diff --git a/tests/memory/test_stress_api.py b/tests/memory/test_stress_api.py index 232964c1..1b4f1a9c 100644 --- a/tests/memory/test_stress_api.py +++ b/tests/memory/test_stress_api.py @@ -24,13 +24,13 @@ from rich.panel import Panel from rich.syntax import Syntax # --- Constants --- -# DEFAULT_API_URL = "http://localhost:11235" # Default port +DEFAULT_API_URL = "http://localhost:11235" # Default port DEFAULT_API_URL = "http://localhost:8020" # Default port -DEFAULT_URL_COUNT = 1000 -DEFAULT_MAX_CONCURRENT_REQUESTS = 5 +DEFAULT_URL_COUNT = 100 +DEFAULT_MAX_CONCURRENT_REQUESTS = 1 DEFAULT_CHUNK_SIZE = 10 DEFAULT_REPORT_PATH = "reports_api" -DEFAULT_STREAM_MODE = False +DEFAULT_STREAM_MODE = True REQUEST_TIMEOUT = 180.0 # Initialize Rich console @@ -77,6 +77,10 @@ class ApiStressTest: self.report_path = pathlib.Path(report_path) self.report_path.mkdir(parents=True, exist_ok=True) self.stream_mode = stream_mode + + # Ignore repo path and set it to current file path + self.repo_path = 
pathlib.Path(__file__).parent.resolve() + self.test_id = time.strftime("%Y%m%d_%H%M%S") self.results_summary = { diff --git a/tests/memory/test_stress_api_xs.py b/tests/memory/test_stress_api_xs.py new file mode 100644 index 00000000..27248883 --- /dev/null +++ b/tests/memory/test_stress_api_xs.py @@ -0,0 +1,203 @@ +"""Lite Crawl4AI API stress‑tester. + +✔ batch or stream mode (single unified path) +✔ global stats + JSON summary +✔ rich table progress +✔ Typer CLI with presets (quick / soak) + +Usage examples: + python api_stress_test.py # uses quick preset + python api_stress_test.py soak # 5 K URLs stress run + python api_stress_test.py --urls 200 --concurrent 10 --chunk 20 +""" + +from __future__ import annotations + +import asyncio, json, time, uuid, pathlib, statistics +from typing import List, Dict, Optional + +import httpx, typer +from rich.console import Console +from rich.table import Table + +# ───────────────────────── defaults / presets ────────────────────────── +PRESETS = { + "quick": dict(urls=1, concurrent=1, chunk=1, stream=False), + "debug": dict(urls=10, concurrent=2, chunk=5, stream=False), + "soak": dict(urls=5000, concurrent=20, chunk=50, stream=True), +} + +API_HEALTH_ENDPOINT = "/health" +REQUEST_TIMEOUT = 180.0 + +console = Console() +app = typer.Typer(add_completion=False, rich_markup_mode="rich") + +# ───────────────────────── helpers ───────────────────────────────────── +async def _check_health(client: httpx.AsyncClient) -> None: + resp = await client.get(API_HEALTH_ENDPOINT, timeout=10) + resp.raise_for_status() + console.print(f"[green]Server healthy — version {resp.json().get('version','?')}[/]") + +async def _iter_results(resp: httpx.Response, stream: bool): + """Yield result dicts from batch JSON or ND‑JSON stream.""" + if stream: + async for line in resp.aiter_lines(): + if not line: + continue + rec = json.loads(line) + if rec.get("status") == "completed": + break + yield rec + else: + data = resp.json() + for rec in data.get("results", []): + yield rec, data # rec + whole payload for memory delta/peak + +async def _consume_stream(resp: httpx.Response) -> Dict: + stats = {"success_urls": 0, "failed_urls": 0, "mem_metric": 0.0} + async for line in resp.aiter_lines(): + if not line: + continue + rec = json.loads(line) + if rec.get("status") == "completed": + break + if rec.get("success"): + stats["success_urls"] += 1 + else: + stats["failed_urls"] += 1 + mem = rec.get("server_memory_mb") + if mem is not None: + stats["mem_metric"] = max(stats["mem_metric"], float(mem)) + return stats + +def _consume_batch(body: Dict) -> Dict: + stats = {"success_urls": 0, "failed_urls": 0} + for rec in body.get("results", []): + if rec.get("success"): + stats["success_urls"] += 1 + else: + stats["failed_urls"] += 1 + stats["mem_metric"] = body.get("server_memory_delta_mb") + stats["peak"] = body.get("server_peak_memory_mb") + return stats + +async def _fetch_chunk( + client: httpx.AsyncClient, + urls: List[str], + stream: bool, + semaphore: asyncio.Semaphore, +) -> Dict: + endpoint = "/crawl/stream" if stream else "/crawl" + payload = { + "urls": urls, + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": {"type": "CrawlerRunConfig", + "params": {"cache_mode": "BYPASS", "stream": stream}}, + } + + async with semaphore: + start = time.perf_counter() + + if stream: + # ---- streaming request ---- + async with client.stream("POST", endpoint, json=payload) as resp: + resp.raise_for_status() + stats = await _consume_stream(resp) + 
else: + # ---- batch request ---- + resp = await client.post(endpoint, json=payload) + resp.raise_for_status() + stats = _consume_batch(resp.json()) + + stats["elapsed"] = time.perf_counter() - start + return stats + + +# ───────────────────────── core runner ───────────────────────────────── +async def _run(api: str, urls: int, concurrent: int, chunk: int, stream: bool, report: pathlib.Path): + client = httpx.AsyncClient(base_url=api, timeout=REQUEST_TIMEOUT, limits=httpx.Limits(max_connections=concurrent+5)) + await _check_health(client) + + url_list = [f"https://httpbin.org/anything/{uuid.uuid4()}" for _ in range(urls)] + chunks = [url_list[i:i+chunk] for i in range(0, len(url_list), chunk)] + sem = asyncio.Semaphore(concurrent) + + table = Table(show_header=True, header_style="bold magenta") + table.add_column("Batch", style="dim", width=6) + table.add_column("Success/Fail", width=12) + table.add_column("Mem", width=14) + table.add_column("Time (s)") + + agg_success = agg_fail = 0 + deltas, peaks = [], [] + + start = time.perf_counter() + tasks = [asyncio.create_task(_fetch_chunk(client, c, stream, sem)) for c in chunks] + for idx, coro in enumerate(asyncio.as_completed(tasks), 1): + res = await coro + agg_success += res["success_urls"] + agg_fail += res["failed_urls"] + if res["mem_metric"] is not None: + deltas.append(res["mem_metric"]) + if res["peak"] is not None: + peaks.append(res["peak"]) + + mem_txt = f"{res['mem_metric']:.1f}" if res["mem_metric"] is not None else "‑" + if res["peak"] is not None: + mem_txt = f"{res['peak']:.1f}/{mem_txt}" + + table.add_row(str(idx), f"{res['success_urls']}/{res['failed_urls']}", mem_txt, f"{res['elapsed']:.2f}") + + console.print(table) + total_time = time.perf_counter() - start + + summary = { + "urls": urls, + "concurrent": concurrent, + "chunk": chunk, + "stream": stream, + "success_urls": agg_success, + "failed_urls": agg_fail, + "elapsed_sec": round(total_time, 2), + "avg_mem": round(statistics.mean(deltas), 2) if deltas else None, + "max_mem": max(deltas) if deltas else None, + "avg_peak": round(statistics.mean(peaks), 2) if peaks else None, + "max_peak": max(peaks) if peaks else None, + } + console.print("\n[bold green]Done:[/]" , summary) + + report.mkdir(parents=True, exist_ok=True) + path = report / f"api_test_{int(time.time())}.json" + path.write_text(json.dumps(summary, indent=2)) + console.print(f"[green]Summary → {path}") + + await client.aclose() + +# ───────────────────────── Typer CLI ────────────────────────────────── +@app.command() +def main( + preset: str = typer.Argument("quick", help="quick / debug / soak or custom"), + api_url: str = typer.Option("http://localhost:8020", show_default=True), + urls: int = typer.Option(None, help="Total URLs to crawl"), + concurrent: int = typer.Option(None, help="Concurrent API requests"), + chunk: int = typer.Option(None, help="URLs per request"), + stream: bool = typer.Option(None, help="Use /crawl/stream"), + report: pathlib.Path = typer.Option("reports_api", help="Where to save JSON summary"), +): + """Run a stress test against a running Crawl4AI API server.""" + if preset not in PRESETS and any(v is None for v in (urls, concurrent, chunk, stream)): + console.print(f"[red]Unknown preset '{preset}' and custom params missing[/]") + raise typer.Exit(1) + + cfg = PRESETS.get(preset, {}) + urls = urls or cfg.get("urls") + concurrent = concurrent or cfg.get("concurrent") + chunk = chunk or cfg.get("chunk") + stream = stream if stream is not None else cfg.get("stream", False) + + 
console.print(f"[cyan]API:[/] {api_url} | URLs: {urls} | Concurrency: {concurrent} | Chunk: {chunk} | Stream: {stream}") + asyncio.run(_run(api_url, urls, concurrent, chunk, stream, report)) + +if __name__ == "__main__": + app() From 5297e362f34b27f8d63b830f2a69bb6858a5009d Mon Sep 17 00:00:00 2001 From: UncleCode Date: Mon, 21 Apr 2025 22:22:02 +0800 Subject: [PATCH 67/78] feat(mcp): Implement MCP protocol and enhance server capabilities This commit introduces several significant enhancements to the Crawl4AI Docker deployment: 1. Add MCP Protocol Support: - Implement WebSocket and SSE transport layers for MCP server communication - Create mcp_bridge.py to expose existing API endpoints via MCP protocol - Add comprehensive tests for both socket and SSE transport methods 2. Enhance Docker Server Capabilities: - Add PDF generation endpoint with file saving functionality - Add screenshot capture endpoint with configurable wait time - Implement JavaScript execution endpoint for dynamic page interaction - Add intelligent file path handling for saving generated assets 3. Improve Search and Context Functionality: - Implement syntax-aware code function chunking using AST parsing - Add BM25-based intelligent document search with relevance scoring - Create separate code and documentation context endpoints - Enhance response format with structured results and scores 4. Rename and Fix File Organization: - Fix typo in test_docker_config_gen.py filename - Update import statements and dependencies - Add FileResponse for context endpoints This enhancement significantly improves the machine-to-machine communication capabilities of Crawl4AI, making it more suitable for integration with LLM agents and other automated systems. The CHANGELOG update has been applied successfully, highlighting the key features and improvements made in this release. The commit message provides a detailed explanation of all the changes, which will be helpful for tracking the project's evolution. --- CHANGELOG.md | 24 + deploy/docker/c4ai-code-context.md | 11631 ++++++++++++++++ deploy/docker/c4ai-doc-context.md | 8899 ++++++++++++ deploy/docker/mcp_bridge.py | 252 + deploy/docker/requirements.txt | 16 +- deploy/docker/server.py | 402 +- tests/mcp/test_mcp_socket.py | 119 + tests/mcp/test_mcp_sse.py | 11 + ...ongif_gen.py => test_docker_config_gen.py} | 3 +- 9 files changed, 21327 insertions(+), 30 deletions(-) create mode 100644 deploy/docker/c4ai-code-context.md create mode 100644 deploy/docker/c4ai-doc-context.md create mode 100644 deploy/docker/mcp_bridge.py create mode 100644 tests/mcp/test_mcp_socket.py create mode 100644 tests/mcp/test_mcp_sse.py rename tests/memory/{test_docker_congif_gen.py => test_docker_config_gen.py} (87%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6ef49dd3..fea79456 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,30 @@ All notable changes to Crawl4AI will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
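The commit message above describes the new /config/dump helper and the pooled /crawl endpoint; a minimal client-side sketch of how the two compose is shown below. The port (8020), the example URL, and the assumption that the dumped config can be passed straight back as crawler_config are taken from the bundled test scripts, not guaranteed by this patch.

```python
# Minimal sketch, assuming the server from this patch is running on localhost:8020
# (the port used by the bundled tests). Not an official client.
import requests

BASE = "http://localhost:8020"

# 1) Ask the server to serialize a config expression. Only a single top-level
#    CrawlerRunConfig(...) or BrowserConfig(...) call is accepted by /config/dump.
cfg = requests.post(
    f"{BASE}/config/dump",
    json={"code": "CrawlerRunConfig(cache_mode=CacheMode.BYPASS)"},
    timeout=15,
).json()

# 2) Reuse the serialized dict in a batch crawl request.
res = requests.post(
    f"{BASE}/crawl",
    json={"urls": ["https://httpbin.org/anything/demo"], "crawler_config": cfg},
    timeout=180,
).json()
for rec in res.get("results", []):
    print(rec.get("url"), rec.get("success"))
```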
+### [Feature] 2025-04-21 +- Implemented MCP protocol for machine-to-machine communication + - Added WebSocket and SSE transport for MCP server + - Exposed server endpoints via MCP protocol + - Created tests for MCP socket and SSE communication +- Enhanced Docker server with file handling and intelligent search + - Added PDF and screenshot endpoints with file saving capability + - Added JavaScript execution endpoint for page interaction + - Implemented advanced context search with BM25 and code chunking + - Added file path output support for generated assets +- Improved server endpoints and API surface + - Added intelligent context search with query filtering + - Added syntax-aware code function chunking + - Implemented efficient HTML processing pipeline + +### [Refactor] 2025-04-20 +- Replaced crawler_manager.py with simpler crawler_pool.py implementation +- Added global page semaphore for hard concurrency cap +- Implemented browser pool with idle cleanup +- Added playground UI for testing and stress testing +- Updated API handlers to use pooled crawlers +- Enhanced logging levels and symbols +- Added memory tests and stress test utilities + ### [Added] 2025-04-17 - Added content source selection feature for markdown generation - New `content_source` parameter allows choosing between `cleaned_html`, `raw_html`, and `fit_html` diff --git a/deploy/docker/c4ai-code-context.md b/deploy/docker/c4ai-code-context.md new file mode 100644 index 00000000..f2551c01 --- /dev/null +++ b/deploy/docker/c4ai-code-context.md @@ -0,0 +1,11631 @@ +# Crawl4AI Code Context + +Generated on 2025-04-21 + +## File: crawl4ai/async_configs.py + +```py +import os +from .config import ( + DEFAULT_PROVIDER, + DEFAULT_PROVIDER_API_KEY, + MIN_WORD_THRESHOLD, + IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, + PROVIDER_MODELS, + PROVIDER_MODELS_PREFIXES, + SCREENSHOT_HEIGHT_TRESHOLD, + PAGE_TIMEOUT, + IMAGE_SCORE_THRESHOLD, + SOCIAL_MEDIA_DOMAINS, +) + +from .user_agent_generator import UAGen, ValidUAGenerator # , OnlineUAGenerator +from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy +from .chunking_strategy import ChunkingStrategy, RegexChunking + +from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator +from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy +from .deep_crawling import DeepCrawlStrategy + +from .cache_context import CacheMode +from .proxy_strategy import ProxyRotationStrategy + +from typing import Union, List +import inspect +from typing import Any, Dict, Optional +from enum import Enum + +# from .proxy_strategy import ProxyConfig + + + +def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict: + """ + Recursively convert an object to a serializable dictionary using {type, params} structure + for complex objects. 
+ """ + if obj is None: + return None + + # Handle basic types + if isinstance(obj, (str, int, float, bool)): + return obj + + # Handle Enum + if isinstance(obj, Enum): + return {"type": obj.__class__.__name__, "params": obj.value} + + # Handle datetime objects + if hasattr(obj, "isoformat"): + return obj.isoformat() + + # Handle lists, tuples, and sets, and basically any iterable + if isinstance(obj, (list, tuple, set)) or hasattr(obj, '__iter__') and not isinstance(obj, dict): + return [to_serializable_dict(item) for item in obj] + + # Handle frozensets, which are not iterable + if isinstance(obj, frozenset): + return [to_serializable_dict(item) for item in list(obj)] + + # Handle dictionaries - preserve them as-is + if isinstance(obj, dict): + return { + "type": "dict", # Mark as plain dictionary + "value": {str(k): to_serializable_dict(v) for k, v in obj.items()}, + } + + _type = obj.__class__.__name__ + + # Handle class instances + if hasattr(obj, "__class__"): + # Get constructor signature + sig = inspect.signature(obj.__class__.__init__) + params = sig.parameters + + # Get current values + current_values = {} + for name, param in params.items(): + if name == "self": + continue + + value = getattr(obj, name, param.default) + + # Only include if different from default, considering empty values + if not (is_empty_value(value) and is_empty_value(param.default)): + if value != param.default and not ignore_default_value: + current_values[name] = to_serializable_dict(value) + + if hasattr(obj, '__slots__'): + for slot in obj.__slots__: + if slot.startswith('_'): # Handle private slots + attr_name = slot[1:] # Remove leading '_' + value = getattr(obj, slot, None) + if value is not None: + current_values[attr_name] = to_serializable_dict(value) + + + + return { + "type": obj.__class__.__name__, + "params": current_values + } + + return str(obj) + + +def from_serializable_dict(data: Any) -> Any: + """ + Recursively convert a serializable dictionary back to an object instance. + """ + if data is None: + return None + + # Handle basic types + if isinstance(data, (str, int, float, bool)): + return data + + # Handle typed data + if isinstance(data, dict) and "type" in data: + # Handle plain dictionaries + if data["type"] == "dict" and "value" in data: + return {k: from_serializable_dict(v) for k, v in data["value"].items()} + + # Import from crawl4ai for class instances + import crawl4ai + + if hasattr(crawl4ai, data["type"]): + cls = getattr(crawl4ai, data["type"]) + + # Handle Enum + if issubclass(cls, Enum): + return cls(data["params"]) + + if "params" in data: + # Handle class instances + constructor_args = { + k: from_serializable_dict(v) for k, v in data["params"].items() + } + return cls(**constructor_args) + + # Handle lists + if isinstance(data, list): + return [from_serializable_dict(item) for item in data] + + # Handle raw dictionaries (legacy support) + if isinstance(data, dict): + return {k: from_serializable_dict(v) for k, v in data.items()} + + return data + + +def is_empty_value(value: Any) -> bool: + """Check if a value is effectively empty/null.""" + if value is None: + return True + if isinstance(value, (list, tuple, set, dict, str)) and len(value) == 0: + return True + return False + +class ProxyConfig: + def __init__( + self, + server: str, + username: Optional[str] = None, + password: Optional[str] = None, + ip: Optional[str] = None, + ): + """Configuration class for a single proxy. 
+ + Args: + server: Proxy server URL (e.g., "http://127.0.0.1:8080") + username: Optional username for proxy authentication + password: Optional password for proxy authentication + ip: Optional IP address for verification purposes + """ + self.server = server + self.username = username + self.password = password + + # Extract IP from server if not explicitly provided + self.ip = ip or self._extract_ip_from_server() + + def _extract_ip_from_server(self) -> Optional[str]: + """Extract IP address from server URL.""" + try: + # Simple extraction assuming http://ip:port format + if "://" in self.server: + parts = self.server.split("://")[1].split(":") + return parts[0] + else: + parts = self.server.split(":") + return parts[0] + except Exception: + return None + + @staticmethod + def from_string(proxy_str: str) -> "ProxyConfig": + """Create a ProxyConfig from a string in the format 'ip:port:username:password'.""" + parts = proxy_str.split(":") + if len(parts) == 4: # ip:port:username:password + ip, port, username, password = parts + return ProxyConfig( + server=f"http://{ip}:{port}", + username=username, + password=password, + ip=ip + ) + elif len(parts) == 2: # ip:port only + ip, port = parts + return ProxyConfig( + server=f"http://{ip}:{port}", + ip=ip + ) + else: + raise ValueError(f"Invalid proxy string format: {proxy_str}") + + @staticmethod + def from_dict(proxy_dict: Dict) -> "ProxyConfig": + """Create a ProxyConfig from a dictionary.""" + return ProxyConfig( + server=proxy_dict.get("server"), + username=proxy_dict.get("username"), + password=proxy_dict.get("password"), + ip=proxy_dict.get("ip") + ) + + @staticmethod + def from_env(env_var: str = "PROXIES") -> List["ProxyConfig"]: + """Load proxies from environment variable. + + Args: + env_var: Name of environment variable containing comma-separated proxy strings + + Returns: + List of ProxyConfig objects + """ + proxies = [] + try: + proxy_list = os.getenv(env_var, "").split(",") + for proxy in proxy_list: + if not proxy: + continue + proxies.append(ProxyConfig.from_string(proxy)) + except Exception as e: + print(f"Error loading proxies from environment: {e}") + return proxies + + def to_dict(self) -> Dict: + """Convert to dictionary representation.""" + return { + "server": self.server, + "username": self.username, + "password": self.password, + "ip": self.ip + } + + def clone(self, **kwargs) -> "ProxyConfig": + """Create a copy of this configuration with updated values. + + Args: + **kwargs: Key-value pairs of configuration options to update + + Returns: + ProxyConfig: A new instance with the specified updates + """ + config_dict = self.to_dict() + config_dict.update(kwargs) + return ProxyConfig.from_dict(config_dict) + + + +class BrowserConfig: + """ + Configuration class for setting up a browser instance and its context in AsyncPlaywrightCrawlerStrategy. + + This class centralizes all parameters that affect browser and context creation. Instead of passing + scattered keyword arguments, users can instantiate and modify this configuration object. The crawler + code will then reference these settings to initialize the browser in a consistent, documented manner. + + Attributes: + browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit". + Default: "chromium". + headless (bool): Whether to run the browser in headless mode (no visible GUI). + Default: True. 
+ browser_mode (str): Determines how the browser should be initialized: + "builtin" - use the builtin CDP browser running in background + "dedicated" - create a new dedicated browser instance each time + "cdp" - use explicit CDP settings provided in cdp_url + "docker" - run browser in Docker container with isolation + Default: "dedicated" + use_managed_browser (bool): Launch the browser using a managed approach (e.g., via CDP), allowing + advanced manipulation. Default: False. + cdp_url (str): URL for the Chrome DevTools Protocol (CDP) endpoint. Default: "ws://localhost:9222/devtools/browser/". + debugging_port (int): Port for the browser debugging protocol. Default: 9222. + use_persistent_context (bool): Use a persistent browser context (like a persistent profile). + Automatically sets use_managed_browser=True. Default: False. + user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a + temporary directory may be used. Default: None. + chrome_channel (str): The Chrome channel to launch (e.g., "chrome", "msedge"). Only applies if browser_type + is "chromium". Default: "chromium". + channel (str): The channel to launch (e.g., "chromium", "chrome", "msedge"). Only applies if browser_type + is "chromium". Default: "chromium". + proxy (Optional[str]): Proxy server URL (e.g., "http://username:password@proxy:port"). If None, no proxy is used. + Default: None. + proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}. + If None, no additional proxy config. Default: None. + viewport_width (int): Default viewport width for pages. Default: 1080. + viewport_height (int): Default viewport height for pages. Default: 600. + viewport (dict): Default viewport dimensions for pages. If set, overrides viewport_width and viewport_height. + Default: None. + verbose (bool): Enable verbose logging. + Default: True. + accept_downloads (bool): Whether to allow file downloads. If True, requires a downloads_path. + Default: False. + downloads_path (str or None): Directory to store downloaded files. If None and accept_downloads is True, + a default path will be created. Default: None. + storage_state (str or dict or None): An in-memory storage state (cookies, localStorage). + Default: None. + ignore_https_errors (bool): Ignore HTTPS certificate errors. Default: True. + java_script_enabled (bool): Enable JavaScript execution in pages. Default: True. + cookies (list): List of cookies to add to the browser context. Each cookie is a dict with fields like + {"name": "...", "value": "...", "url": "..."}. + Default: []. + headers (dict): Extra HTTP headers to apply to all requests in this context. + Default: {}. + user_agent (str): Custom User-Agent string to use. Default: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36". + user_agent_mode (str or None): Mode for generating the user agent (e.g., "random"). If None, use the provided + user_agent as-is. Default: None. + user_agent_generator_config (dict or None): Configuration for user agent generation if user_agent_mode is set. + Default: None. + text_mode (bool): If True, disables images and other rich content for potentially faster load times. + Default: False. + light_mode (bool): Disables certain background features for performance gains. Default: False. + extra_args (list): Additional command-line arguments passed to the browser. + Default: []. 
+ """ + + def __init__( + self, + browser_type: str = "chromium", + headless: bool = True, + browser_mode: str = "dedicated", + use_managed_browser: bool = False, + cdp_url: str = None, + use_persistent_context: bool = False, + user_data_dir: str = None, + chrome_channel: str = "chromium", + channel: str = "chromium", + proxy: str = None, + proxy_config: Union[ProxyConfig, dict, None] = None, + viewport_width: int = 1080, + viewport_height: int = 600, + viewport: dict = None, + accept_downloads: bool = False, + downloads_path: str = None, + storage_state: Union[str, dict, None] = None, + ignore_https_errors: bool = True, + java_script_enabled: bool = True, + sleep_on_close: bool = False, + verbose: bool = True, + cookies: list = None, + headers: dict = None, + user_agent: str = ( + # "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 " + # "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " + # "(KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47" + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36" + ), + user_agent_mode: str = "", + user_agent_generator_config: dict = {}, + text_mode: bool = False, + light_mode: bool = False, + extra_args: list = None, + debugging_port: int = 9222, + host: str = "localhost", + ): + self.browser_type = browser_type + self.headless = headless or True + self.browser_mode = browser_mode + self.use_managed_browser = use_managed_browser + self.cdp_url = cdp_url + self.use_persistent_context = use_persistent_context + self.user_data_dir = user_data_dir + self.chrome_channel = chrome_channel or self.browser_type or "chromium" + self.channel = channel or self.browser_type or "chromium" + if self.browser_type in ["firefox", "webkit"]: + self.channel = "" + self.chrome_channel = "" + self.proxy = proxy + self.proxy_config = proxy_config + + + self.viewport_width = viewport_width + self.viewport_height = viewport_height + self.viewport = viewport + if self.viewport is not None: + self.viewport_width = self.viewport.get("width", 1080) + self.viewport_height = self.viewport.get("height", 600) + self.accept_downloads = accept_downloads + self.downloads_path = downloads_path + self.storage_state = storage_state + self.ignore_https_errors = ignore_https_errors + self.java_script_enabled = java_script_enabled + self.cookies = cookies if cookies is not None else [] + self.headers = headers if headers is not None else {} + self.user_agent = user_agent + self.user_agent_mode = user_agent_mode + self.user_agent_generator_config = user_agent_generator_config + self.text_mode = text_mode + self.light_mode = light_mode + self.extra_args = extra_args if extra_args is not None else [] + self.sleep_on_close = sleep_on_close + self.verbose = verbose + self.debugging_port = debugging_port + self.host = host + + fa_user_agenr_generator = ValidUAGenerator() + if self.user_agent_mode == "random": + self.user_agent = fa_user_agenr_generator.generate( + **(self.user_agent_generator_config or {}) + ) + else: + pass + + self.browser_hint = UAGen.generate_client_hints(self.user_agent) + self.headers.setdefault("sec-ch-ua", self.browser_hint) + + # Set appropriate browser management flags based on browser_mode + if self.browser_mode == "builtin": + # Builtin mode uses managed browser connecting to builtin CDP endpoint + self.use_managed_browser = True + # cdp_url will be set later by browser_manager + elif self.browser_mode == "docker": + # Docker mode uses managed browser with CDP to connect to browser in 
container + self.use_managed_browser = True + # cdp_url will be set later by docker browser strategy + elif self.browser_mode == "custom" and self.cdp_url: + # Custom mode with explicit CDP URL + self.use_managed_browser = True + elif self.browser_mode == "dedicated": + # Dedicated mode uses a new browser instance each time + pass + + # If persistent context is requested, ensure managed browser is enabled + if self.use_persistent_context: + self.use_managed_browser = True + + @staticmethod + def from_kwargs(kwargs: dict) -> "BrowserConfig": + return BrowserConfig( + browser_type=kwargs.get("browser_type", "chromium"), + headless=kwargs.get("headless", True), + browser_mode=kwargs.get("browser_mode", "dedicated"), + use_managed_browser=kwargs.get("use_managed_browser", False), + cdp_url=kwargs.get("cdp_url"), + use_persistent_context=kwargs.get("use_persistent_context", False), + user_data_dir=kwargs.get("user_data_dir"), + chrome_channel=kwargs.get("chrome_channel", "chromium"), + channel=kwargs.get("channel", "chromium"), + proxy=kwargs.get("proxy"), + proxy_config=kwargs.get("proxy_config", None), + viewport_width=kwargs.get("viewport_width", 1080), + viewport_height=kwargs.get("viewport_height", 600), + accept_downloads=kwargs.get("accept_downloads", False), + downloads_path=kwargs.get("downloads_path"), + storage_state=kwargs.get("storage_state"), + ignore_https_errors=kwargs.get("ignore_https_errors", True), + java_script_enabled=kwargs.get("java_script_enabled", True), + cookies=kwargs.get("cookies", []), + headers=kwargs.get("headers", {}), + user_agent=kwargs.get( + "user_agent", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36", + ), + user_agent_mode=kwargs.get("user_agent_mode"), + user_agent_generator_config=kwargs.get("user_agent_generator_config"), + text_mode=kwargs.get("text_mode", False), + light_mode=kwargs.get("light_mode", False), + extra_args=kwargs.get("extra_args", []), + debugging_port=kwargs.get("debugging_port", 9222), + host=kwargs.get("host", "localhost"), + ) + + def to_dict(self): + result = { + "browser_type": self.browser_type, + "headless": self.headless, + "browser_mode": self.browser_mode, + "use_managed_browser": self.use_managed_browser, + "cdp_url": self.cdp_url, + "use_persistent_context": self.use_persistent_context, + "user_data_dir": self.user_data_dir, + "chrome_channel": self.chrome_channel, + "channel": self.channel, + "proxy": self.proxy, + "proxy_config": self.proxy_config, + "viewport_width": self.viewport_width, + "viewport_height": self.viewport_height, + "accept_downloads": self.accept_downloads, + "downloads_path": self.downloads_path, + "storage_state": self.storage_state, + "ignore_https_errors": self.ignore_https_errors, + "java_script_enabled": self.java_script_enabled, + "cookies": self.cookies, + "headers": self.headers, + "user_agent": self.user_agent, + "user_agent_mode": self.user_agent_mode, + "user_agent_generator_config": self.user_agent_generator_config, + "text_mode": self.text_mode, + "light_mode": self.light_mode, + "extra_args": self.extra_args, + "sleep_on_close": self.sleep_on_close, + "verbose": self.verbose, + "debugging_port": self.debugging_port, + "host": self.host, + } + + + return result + + def clone(self, **kwargs): + """Create a copy of this configuration with updated values. 
+ + Args: + **kwargs: Key-value pairs of configuration options to update + + Returns: + BrowserConfig: A new instance with the specified updates + """ + config_dict = self.to_dict() + config_dict.update(kwargs) + return BrowserConfig.from_kwargs(config_dict) + + # Create a funciton returns dict of the object + def dump(self) -> dict: + # Serialize the object to a dictionary + return to_serializable_dict(self) + + @staticmethod + def load(data: dict) -> "BrowserConfig": + # Deserialize the object from a dictionary + config = from_serializable_dict(data) + if isinstance(config, BrowserConfig): + return config + return BrowserConfig.from_kwargs(config) + + +class HTTPCrawlerConfig: + """HTTP-specific crawler configuration""" + + method: str = "GET" + headers: Optional[Dict[str, str]] = None + data: Optional[Dict[str, Any]] = None + json: Optional[Dict[str, Any]] = None + follow_redirects: bool = True + verify_ssl: bool = True + + def __init__( + self, + method: str = "GET", + headers: Optional[Dict[str, str]] = None, + data: Optional[Dict[str, Any]] = None, + json: Optional[Dict[str, Any]] = None, + follow_redirects: bool = True, + verify_ssl: bool = True, + ): + self.method = method + self.headers = headers + self.data = data + self.json = json + self.follow_redirects = follow_redirects + self.verify_ssl = verify_ssl + + @staticmethod + def from_kwargs(kwargs: dict) -> "HTTPCrawlerConfig": + return HTTPCrawlerConfig( + method=kwargs.get("method", "GET"), + headers=kwargs.get("headers"), + data=kwargs.get("data"), + json=kwargs.get("json"), + follow_redirects=kwargs.get("follow_redirects", True), + verify_ssl=kwargs.get("verify_ssl", True), + ) + + def to_dict(self): + return { + "method": self.method, + "headers": self.headers, + "data": self.data, + "json": self.json, + "follow_redirects": self.follow_redirects, + "verify_ssl": self.verify_ssl, + } + + def clone(self, **kwargs): + """Create a copy of this configuration with updated values. + + Args: + **kwargs: Key-value pairs of configuration options to update + + Returns: + HTTPCrawlerConfig: A new instance with the specified updates + """ + config_dict = self.to_dict() + config_dict.update(kwargs) + return HTTPCrawlerConfig.from_kwargs(config_dict) + + def dump(self) -> dict: + return to_serializable_dict(self) + + @staticmethod + def load(data: dict) -> "HTTPCrawlerConfig": + config = from_serializable_dict(data) + if isinstance(config, HTTPCrawlerConfig): + return config + return HTTPCrawlerConfig.from_kwargs(config) + +class CrawlerRunConfig(): + _UNWANTED_PROPS = { + 'disable_cache' : 'Instead, use cache_mode=CacheMode.DISABLED', + 'bypass_cache' : 'Instead, use cache_mode=CacheMode.BYPASS', + 'no_cache_read' : 'Instead, use cache_mode=CacheMode.WRITE_ONLY', + 'no_cache_write' : 'Instead, use cache_mode=CacheMode.READ_ONLY', + } + + """ + Configuration class for controlling how the crawler runs each crawl operation. + This includes parameters for content extraction, page manipulation, waiting conditions, + caching, and other runtime behaviors. + + This centralizes parameters that were previously scattered as kwargs to `arun()` and related methods. + By using this class, you have a single place to understand and adjust the crawling options. + + Attributes: + # Deep Crawl Parameters + deep_crawl_strategy (DeepCrawlStrategy or None): Strategy to use for deep crawling. + + # Content Processing Parameters + word_count_threshold (int): Minimum word count threshold before processing content. + Default: MIN_WORD_THRESHOLD (typically 200). 
+ extraction_strategy (ExtractionStrategy or None): Strategy to extract structured data from crawled pages. + Default: None (NoExtractionStrategy is used if None). + chunking_strategy (ChunkingStrategy): Strategy to chunk content before extraction. + Default: RegexChunking(). + markdown_generator (MarkdownGenerationStrategy): Strategy for generating markdown. + Default: None. + only_text (bool): If True, attempt to extract text-only content where applicable. + Default: False. + css_selector (str or None): CSS selector to extract a specific portion of the page. + Default: None. + + target_elements (list of str or None): List of CSS selectors for specific elements for Markdown generation + and structured data extraction. When you set this, only the contents + of these elements are processed for extraction and Markdown generation. + If you do not set any value, the entire page is processed. + The difference between this and css_selector is that this will shrink + the initial raw HTML to the selected element, while this will only affect + the extraction and Markdown generation. + Default: None + excluded_tags (list of str or None): List of HTML tags to exclude from processing. + Default: None. + excluded_selector (str or None): CSS selector to exclude from processing. + Default: None. + keep_data_attributes (bool): If True, retain `data-*` attributes while removing unwanted attributes. + Default: False. + keep_attrs (list of str): List of HTML attributes to keep during processing. + Default: []. + remove_forms (bool): If True, remove all `
    ` elements from the HTML. + Default: False. + prettiify (bool): If True, apply `fast_format_html` to produce prettified HTML output. + Default: False. + parser_type (str): Type of parser to use for HTML parsing. + Default: "lxml". + scraping_strategy (ContentScrapingStrategy): Scraping strategy to use. + Default: WebScrapingStrategy. + proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}. + If None, no additional proxy config. Default: None. + + # SSL Parameters + fetch_ssl_certificate: bool = False, + # Caching Parameters + cache_mode (CacheMode or None): Defines how caching is handled. + If None, defaults to CacheMode.ENABLED internally. + Default: CacheMode.BYPASS. + session_id (str or None): Optional session ID to persist the browser context and the created + page instance. If the ID already exists, the crawler does not + create a new page and uses the current page to preserve the state. + bypass_cache (bool): Legacy parameter, if True acts like CacheMode.BYPASS. + Default: False. + disable_cache (bool): Legacy parameter, if True acts like CacheMode.DISABLED. + Default: False. + no_cache_read (bool): Legacy parameter, if True acts like CacheMode.WRITE_ONLY. + Default: False. + no_cache_write (bool): Legacy parameter, if True acts like CacheMode.READ_ONLY. + Default: False. + shared_data (dict or None): Shared data to be passed between hooks. + Default: None. + + # Page Navigation and Timing Parameters + wait_until (str): The condition to wait for when navigating, e.g. "domcontentloaded". + Default: "domcontentloaded". + page_timeout (int): Timeout in ms for page operations like navigation. + Default: 60000 (60 seconds). + wait_for (str or None): A CSS selector or JS condition to wait for before extracting content. + Default: None. + wait_for_images (bool): If True, wait for images to load before extracting content. + Default: False. + delay_before_return_html (float): Delay in seconds before retrieving final HTML. + Default: 0.1. + mean_delay (float): Mean base delay between requests when calling arun_many. + Default: 0.1. + max_range (float): Max random additional delay range for requests in arun_many. + Default: 0.3. + semaphore_count (int): Number of concurrent operations allowed. + Default: 5. + + # Page Interaction Parameters + js_code (str or list of str or None): JavaScript code/snippets to run on the page. + Default: None. + js_only (bool): If True, indicates subsequent calls are JS-driven updates, not full page loads. + Default: False. + ignore_body_visibility (bool): If True, ignore whether the body is visible before proceeding. + Default: True. + scan_full_page (bool): If True, scroll through the entire page to load all content. + Default: False. + scroll_delay (float): Delay in seconds between scroll steps if scan_full_page is True. + Default: 0.2. + process_iframes (bool): If True, attempts to process and inline iframe content. + Default: False. + remove_overlay_elements (bool): If True, remove overlays/popups before extracting HTML. + Default: False. + simulate_user (bool): If True, simulate user interactions (mouse moves, clicks) for anti-bot measures. + Default: False. + override_navigator (bool): If True, overrides navigator properties for more human-like behavior. + Default: False. + magic (bool): If True, attempts automatic handling of overlays/popups. + Default: False. + adjust_viewport_to_content (bool): If True, adjust viewport according to the page content dimensions. + Default: False. 
+ + # Media Handling Parameters + screenshot (bool): Whether to take a screenshot after crawling. + Default: False. + screenshot_wait_for (float or None): Additional wait time before taking a screenshot. + Default: None. + screenshot_height_threshold (int): Threshold for page height to decide screenshot strategy. + Default: SCREENSHOT_HEIGHT_TRESHOLD (from config, e.g. 20000). + pdf (bool): Whether to generate a PDF of the page. + Default: False. + image_description_min_word_threshold (int): Minimum words for image description extraction. + Default: IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD (e.g., 50). + image_score_threshold (int): Minimum score threshold for processing an image. + Default: IMAGE_SCORE_THRESHOLD (e.g., 3). + exclude_external_images (bool): If True, exclude all external images from processing. + Default: False. + table_score_threshold (int): Minimum score threshold for processing a table. + Default: 7. + + # Link and Domain Handling Parameters + exclude_social_media_domains (list of str): List of domains to exclude for social media links. + Default: SOCIAL_MEDIA_DOMAINS (from config). + exclude_external_links (bool): If True, exclude all external links from the results. + Default: False. + exclude_internal_links (bool): If True, exclude internal links from the results. + Default: False. + exclude_social_media_links (bool): If True, exclude links pointing to social media domains. + Default: False. + exclude_domains (list of str): List of specific domains to exclude from results. + Default: []. + exclude_internal_links (bool): If True, exclude internal links from the results. + Default: False. + + # Debugging and Logging Parameters + verbose (bool): Enable verbose logging. + Default: True. + log_console (bool): If True, log console messages from the page. + Default: False. + + # HTTP Crwler Strategy Parameters + method (str): HTTP method to use for the request, when using AsyncHTTPCrwalerStrategy. + Default: "GET". + data (dict): Data to send in the request body, when using AsyncHTTPCrwalerStrategy. + Default: None. + json (dict): JSON data to send in the request body, when using AsyncHTTPCrwalerStrategy. + + # Connection Parameters + stream (bool): If True, enables streaming of crawled URLs as they are processed when used with arun_many. + Default: False. + + check_robots_txt (bool): Whether to check robots.txt rules before crawling. Default: False + Default: False. + user_agent (str): Custom User-Agent string to use. + Default: None. + user_agent_mode (str or None): Mode for generating the user agent (e.g., "random"). If None, use the provided user_agent as-is. + Default: None. + user_agent_generator_config (dict or None): Configuration for user agent generation if user_agent_mode is set. + Default: None. + + # Experimental Parameters + experimental (dict): Dictionary containing experimental parameters that are in beta phase. + This allows passing temporary features that are not yet fully integrated + into the main parameter set. + Default: None. 
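+
+        Example (a minimal sketch added for illustration; the values and URL are
+        arbitrary and only use parameters documented above):
+            # illustrative values -- adjust per crawl
+            run_config = CrawlerRunConfig(
+                cache_mode=CacheMode.BYPASS,
+                css_selector="article.main",
+                screenshot=True,
+                exclude_external_links=True,
+            )
+            # result = await crawler.arun(url="https://example.com", config=run_config)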
+ + url: str = None # This is not a compulsory parameter + """ + + def __init__( + self, + # Content Processing Parameters + word_count_threshold: int = MIN_WORD_THRESHOLD, + extraction_strategy: ExtractionStrategy = None, + chunking_strategy: ChunkingStrategy = RegexChunking(), + markdown_generator: MarkdownGenerationStrategy = DefaultMarkdownGenerator(), + only_text: bool = False, + css_selector: str = None, + target_elements: List[str] = None, + excluded_tags: list = None, + excluded_selector: str = None, + keep_data_attributes: bool = False, + keep_attrs: list = None, + remove_forms: bool = False, + prettiify: bool = False, + parser_type: str = "lxml", + scraping_strategy: ContentScrapingStrategy = None, + proxy_config: Union[ProxyConfig, dict, None] = None, + proxy_rotation_strategy: Optional[ProxyRotationStrategy] = None, + # SSL Parameters + fetch_ssl_certificate: bool = False, + # Caching Parameters + cache_mode: CacheMode = CacheMode.BYPASS, + session_id: str = None, + bypass_cache: bool = False, + disable_cache: bool = False, + no_cache_read: bool = False, + no_cache_write: bool = False, + shared_data: dict = None, + # Page Navigation and Timing Parameters + wait_until: str = "domcontentloaded", + page_timeout: int = PAGE_TIMEOUT, + wait_for: str = None, + wait_for_images: bool = False, + delay_before_return_html: float = 0.1, + mean_delay: float = 0.1, + max_range: float = 0.3, + semaphore_count: int = 5, + # Page Interaction Parameters + js_code: Union[str, List[str]] = None, + js_only: bool = False, + ignore_body_visibility: bool = True, + scan_full_page: bool = False, + scroll_delay: float = 0.2, + process_iframes: bool = False, + remove_overlay_elements: bool = False, + simulate_user: bool = False, + override_navigator: bool = False, + magic: bool = False, + adjust_viewport_to_content: bool = False, + # Media Handling Parameters + screenshot: bool = False, + screenshot_wait_for: float = None, + screenshot_height_threshold: int = SCREENSHOT_HEIGHT_TRESHOLD, + pdf: bool = False, + capture_mhtml: bool = False, + image_description_min_word_threshold: int = IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, + image_score_threshold: int = IMAGE_SCORE_THRESHOLD, + table_score_threshold: int = 7, + exclude_external_images: bool = False, + exclude_all_images: bool = False, + # Link and Domain Handling Parameters + exclude_social_media_domains: list = None, + exclude_external_links: bool = False, + exclude_social_media_links: bool = False, + exclude_domains: list = None, + exclude_internal_links: bool = False, + # Debugging and Logging Parameters + verbose: bool = True, + log_console: bool = False, + # Network and Console Capturing Parameters + capture_network_requests: bool = False, + capture_console_messages: bool = False, + # Connection Parameters + method: str = "GET", + stream: bool = False, + url: str = None, + check_robots_txt: bool = False, + user_agent: str = None, + user_agent_mode: str = None, + user_agent_generator_config: dict = {}, + # Deep Crawl Parameters + deep_crawl_strategy: Optional[DeepCrawlStrategy] = None, + # Experimental Parameters + experimental: Dict[str, Any] = None, + ): + # TODO: Planning to set properties dynamically based on the __init__ signature + self.url = url + + # Content Processing Parameters + self.word_count_threshold = word_count_threshold + self.extraction_strategy = extraction_strategy + self.chunking_strategy = chunking_strategy + self.markdown_generator = markdown_generator + self.only_text = only_text + self.css_selector = css_selector + 
self.target_elements = target_elements or [] + self.excluded_tags = excluded_tags or [] + self.excluded_selector = excluded_selector or "" + self.keep_data_attributes = keep_data_attributes + self.keep_attrs = keep_attrs or [] + self.remove_forms = remove_forms + self.prettiify = prettiify + self.parser_type = parser_type + self.scraping_strategy = scraping_strategy or WebScrapingStrategy() + self.proxy_config = proxy_config + self.proxy_rotation_strategy = proxy_rotation_strategy + + # SSL Parameters + self.fetch_ssl_certificate = fetch_ssl_certificate + + # Caching Parameters + self.cache_mode = cache_mode + self.session_id = session_id + self.bypass_cache = bypass_cache + self.disable_cache = disable_cache + self.no_cache_read = no_cache_read + self.no_cache_write = no_cache_write + self.shared_data = shared_data + + # Page Navigation and Timing Parameters + self.wait_until = wait_until + self.page_timeout = page_timeout + self.wait_for = wait_for + self.wait_for_images = wait_for_images + self.delay_before_return_html = delay_before_return_html + self.mean_delay = mean_delay + self.max_range = max_range + self.semaphore_count = semaphore_count + + # Page Interaction Parameters + self.js_code = js_code + self.js_only = js_only + self.ignore_body_visibility = ignore_body_visibility + self.scan_full_page = scan_full_page + self.scroll_delay = scroll_delay + self.process_iframes = process_iframes + self.remove_overlay_elements = remove_overlay_elements + self.simulate_user = simulate_user + self.override_navigator = override_navigator + self.magic = magic + self.adjust_viewport_to_content = adjust_viewport_to_content + + # Media Handling Parameters + self.screenshot = screenshot + self.screenshot_wait_for = screenshot_wait_for + self.screenshot_height_threshold = screenshot_height_threshold + self.pdf = pdf + self.capture_mhtml = capture_mhtml + self.image_description_min_word_threshold = image_description_min_word_threshold + self.image_score_threshold = image_score_threshold + self.exclude_external_images = exclude_external_images + self.exclude_all_images = exclude_all_images + self.table_score_threshold = table_score_threshold + + # Link and Domain Handling Parameters + self.exclude_social_media_domains = ( + exclude_social_media_domains or SOCIAL_MEDIA_DOMAINS + ) + self.exclude_external_links = exclude_external_links + self.exclude_social_media_links = exclude_social_media_links + self.exclude_domains = exclude_domains or [] + self.exclude_internal_links = exclude_internal_links + + # Debugging and Logging Parameters + self.verbose = verbose + self.log_console = log_console + + # Network and Console Capturing Parameters + self.capture_network_requests = capture_network_requests + self.capture_console_messages = capture_console_messages + + # Connection Parameters + self.stream = stream + self.method = method + + # Robots.txt Handling Parameters + self.check_robots_txt = check_robots_txt + + # User Agent Parameters + self.user_agent = user_agent + self.user_agent_mode = user_agent_mode + self.user_agent_generator_config = user_agent_generator_config + + # Validate type of extraction strategy and chunking strategy if they are provided + if self.extraction_strategy is not None and not isinstance( + self.extraction_strategy, ExtractionStrategy + ): + raise ValueError( + "extraction_strategy must be an instance of ExtractionStrategy" + ) + if self.chunking_strategy is not None and not isinstance( + self.chunking_strategy, ChunkingStrategy + ): + raise ValueError( + "chunking_strategy must 
be an instance of ChunkingStrategy" + ) + + # Set default chunking strategy if None + if self.chunking_strategy is None: + self.chunking_strategy = RegexChunking() + + # Deep Crawl Parameters + self.deep_crawl_strategy = deep_crawl_strategy + + # Experimental Parameters + self.experimental = experimental or {} + + + def __getattr__(self, name): + """Handle attribute access.""" + if name in self._UNWANTED_PROPS: + raise AttributeError(f"Getting '{name}' is deprecated. {self._UNWANTED_PROPS[name]}") + raise AttributeError(f"'{self.__class__.__name__}' has no attribute '{name}'") + + def __setattr__(self, name, value): + """Handle attribute setting.""" + # TODO: Planning to set properties dynamically based on the __init__ signature + sig = inspect.signature(self.__init__) + all_params = sig.parameters # Dictionary of parameter names and their details + + if name in self._UNWANTED_PROPS and value is not all_params[name].default: + raise AttributeError(f"Setting '{name}' is deprecated. {self._UNWANTED_PROPS[name]}") + + super().__setattr__(name, value) + + @staticmethod + def from_kwargs(kwargs: dict) -> "CrawlerRunConfig": + return CrawlerRunConfig( + # Content Processing Parameters + word_count_threshold=kwargs.get("word_count_threshold", 200), + extraction_strategy=kwargs.get("extraction_strategy"), + chunking_strategy=kwargs.get("chunking_strategy", RegexChunking()), + markdown_generator=kwargs.get("markdown_generator"), + only_text=kwargs.get("only_text", False), + css_selector=kwargs.get("css_selector"), + target_elements=kwargs.get("target_elements", []), + excluded_tags=kwargs.get("excluded_tags", []), + excluded_selector=kwargs.get("excluded_selector", ""), + keep_data_attributes=kwargs.get("keep_data_attributes", False), + keep_attrs=kwargs.get("keep_attrs", []), + remove_forms=kwargs.get("remove_forms", False), + prettiify=kwargs.get("prettiify", False), + parser_type=kwargs.get("parser_type", "lxml"), + scraping_strategy=kwargs.get("scraping_strategy"), + proxy_config=kwargs.get("proxy_config"), + proxy_rotation_strategy=kwargs.get("proxy_rotation_strategy"), + # SSL Parameters + fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False), + # Caching Parameters + cache_mode=kwargs.get("cache_mode", CacheMode.BYPASS), + session_id=kwargs.get("session_id"), + bypass_cache=kwargs.get("bypass_cache", False), + disable_cache=kwargs.get("disable_cache", False), + no_cache_read=kwargs.get("no_cache_read", False), + no_cache_write=kwargs.get("no_cache_write", False), + shared_data=kwargs.get("shared_data", None), + # Page Navigation and Timing Parameters + wait_until=kwargs.get("wait_until", "domcontentloaded"), + page_timeout=kwargs.get("page_timeout", 60000), + wait_for=kwargs.get("wait_for"), + wait_for_images=kwargs.get("wait_for_images", False), + delay_before_return_html=kwargs.get("delay_before_return_html", 0.1), + mean_delay=kwargs.get("mean_delay", 0.1), + max_range=kwargs.get("max_range", 0.3), + semaphore_count=kwargs.get("semaphore_count", 5), + # Page Interaction Parameters + js_code=kwargs.get("js_code"), + js_only=kwargs.get("js_only", False), + ignore_body_visibility=kwargs.get("ignore_body_visibility", True), + scan_full_page=kwargs.get("scan_full_page", False), + scroll_delay=kwargs.get("scroll_delay", 0.2), + process_iframes=kwargs.get("process_iframes", False), + remove_overlay_elements=kwargs.get("remove_overlay_elements", False), + simulate_user=kwargs.get("simulate_user", False), + override_navigator=kwargs.get("override_navigator", False), + 
magic=kwargs.get("magic", False), + adjust_viewport_to_content=kwargs.get("adjust_viewport_to_content", False), + # Media Handling Parameters + screenshot=kwargs.get("screenshot", False), + screenshot_wait_for=kwargs.get("screenshot_wait_for"), + screenshot_height_threshold=kwargs.get( + "screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD + ), + pdf=kwargs.get("pdf", False), + capture_mhtml=kwargs.get("capture_mhtml", False), + image_description_min_word_threshold=kwargs.get( + "image_description_min_word_threshold", + IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, + ), + image_score_threshold=kwargs.get( + "image_score_threshold", IMAGE_SCORE_THRESHOLD + ), + table_score_threshold=kwargs.get("table_score_threshold", 7), + exclude_all_images=kwargs.get("exclude_all_images", False), + exclude_external_images=kwargs.get("exclude_external_images", False), + # Link and Domain Handling Parameters + exclude_social_media_domains=kwargs.get( + "exclude_social_media_domains", SOCIAL_MEDIA_DOMAINS + ), + exclude_external_links=kwargs.get("exclude_external_links", False), + exclude_social_media_links=kwargs.get("exclude_social_media_links", False), + exclude_domains=kwargs.get("exclude_domains", []), + exclude_internal_links=kwargs.get("exclude_internal_links", False), + # Debugging and Logging Parameters + verbose=kwargs.get("verbose", True), + log_console=kwargs.get("log_console", False), + # Network and Console Capturing Parameters + capture_network_requests=kwargs.get("capture_network_requests", False), + capture_console_messages=kwargs.get("capture_console_messages", False), + # Connection Parameters + method=kwargs.get("method", "GET"), + stream=kwargs.get("stream", False), + check_robots_txt=kwargs.get("check_robots_txt", False), + user_agent=kwargs.get("user_agent"), + user_agent_mode=kwargs.get("user_agent_mode"), + user_agent_generator_config=kwargs.get("user_agent_generator_config", {}), + # Deep Crawl Parameters + deep_crawl_strategy=kwargs.get("deep_crawl_strategy"), + url=kwargs.get("url"), + # Experimental Parameters + experimental=kwargs.get("experimental"), + ) + + # Create a funciton returns dict of the object + def dump(self) -> dict: + # Serialize the object to a dictionary + return to_serializable_dict(self) + + @staticmethod + def load(data: dict) -> "CrawlerRunConfig": + # Deserialize the object from a dictionary + config = from_serializable_dict(data) + if isinstance(config, CrawlerRunConfig): + return config + return CrawlerRunConfig.from_kwargs(config) + + def to_dict(self): + return { + "word_count_threshold": self.word_count_threshold, + "extraction_strategy": self.extraction_strategy, + "chunking_strategy": self.chunking_strategy, + "markdown_generator": self.markdown_generator, + "only_text": self.only_text, + "css_selector": self.css_selector, + "target_elements": self.target_elements, + "excluded_tags": self.excluded_tags, + "excluded_selector": self.excluded_selector, + "keep_data_attributes": self.keep_data_attributes, + "keep_attrs": self.keep_attrs, + "remove_forms": self.remove_forms, + "prettiify": self.prettiify, + "parser_type": self.parser_type, + "scraping_strategy": self.scraping_strategy, + "proxy_config": self.proxy_config, + "proxy_rotation_strategy": self.proxy_rotation_strategy, + "fetch_ssl_certificate": self.fetch_ssl_certificate, + "cache_mode": self.cache_mode, + "session_id": self.session_id, + "bypass_cache": self.bypass_cache, + "disable_cache": self.disable_cache, + "no_cache_read": self.no_cache_read, + "no_cache_write": self.no_cache_write, + 
"shared_data": self.shared_data, + "wait_until": self.wait_until, + "page_timeout": self.page_timeout, + "wait_for": self.wait_for, + "wait_for_images": self.wait_for_images, + "delay_before_return_html": self.delay_before_return_html, + "mean_delay": self.mean_delay, + "max_range": self.max_range, + "semaphore_count": self.semaphore_count, + "js_code": self.js_code, + "js_only": self.js_only, + "ignore_body_visibility": self.ignore_body_visibility, + "scan_full_page": self.scan_full_page, + "scroll_delay": self.scroll_delay, + "process_iframes": self.process_iframes, + "remove_overlay_elements": self.remove_overlay_elements, + "simulate_user": self.simulate_user, + "override_navigator": self.override_navigator, + "magic": self.magic, + "adjust_viewport_to_content": self.adjust_viewport_to_content, + "screenshot": self.screenshot, + "screenshot_wait_for": self.screenshot_wait_for, + "screenshot_height_threshold": self.screenshot_height_threshold, + "pdf": self.pdf, + "capture_mhtml": self.capture_mhtml, + "image_description_min_word_threshold": self.image_description_min_word_threshold, + "image_score_threshold": self.image_score_threshold, + "table_score_threshold": self.table_score_threshold, + "exclude_all_images": self.exclude_all_images, + "exclude_external_images": self.exclude_external_images, + "exclude_social_media_domains": self.exclude_social_media_domains, + "exclude_external_links": self.exclude_external_links, + "exclude_social_media_links": self.exclude_social_media_links, + "exclude_domains": self.exclude_domains, + "exclude_internal_links": self.exclude_internal_links, + "verbose": self.verbose, + "log_console": self.log_console, + "capture_network_requests": self.capture_network_requests, + "capture_console_messages": self.capture_console_messages, + "method": self.method, + "stream": self.stream, + "check_robots_txt": self.check_robots_txt, + "user_agent": self.user_agent, + "user_agent_mode": self.user_agent_mode, + "user_agent_generator_config": self.user_agent_generator_config, + "deep_crawl_strategy": self.deep_crawl_strategy, + "url": self.url, + "experimental": self.experimental, + } + + def clone(self, **kwargs): + """Create a copy of this configuration with updated values. 
+ + Args: + **kwargs: Key-value pairs of configuration options to update + + Returns: + CrawlerRunConfig: A new instance with the specified updates + + Example: + ```python + # Create a new config with streaming enabled + stream_config = config.clone(stream=True) + + # Create a new config with multiple updates + new_config = config.clone( + stream=True, + cache_mode=CacheMode.BYPASS, + verbose=True + ) + ``` + """ + config_dict = self.to_dict() + config_dict.update(kwargs) + return CrawlerRunConfig.from_kwargs(config_dict) + + +class LLMConfig: + def __init__( + self, + provider: str = DEFAULT_PROVIDER, + api_token: Optional[str] = None, + base_url: Optional[str] = None, + temprature: Optional[float] = None, + max_tokens: Optional[int] = None, + top_p: Optional[float] = None, + frequency_penalty: Optional[float] = None, + presence_penalty: Optional[float] = None, + stop: Optional[List[str]] = None, + n: Optional[int] = None, + ): + """Configuaration class for LLM provider and API token.""" + self.provider = provider + if api_token and not api_token.startswith("env:"): + self.api_token = api_token + elif api_token and api_token.startswith("env:"): + self.api_token = os.getenv(api_token[4:]) + else: + # Check if given provider starts with any of key in PROVIDER_MODELS_PREFIXES + # If not, check if it is in PROVIDER_MODELS + prefixes = PROVIDER_MODELS_PREFIXES.keys() + if any(provider.startswith(prefix) for prefix in prefixes): + selected_prefix = next( + (prefix for prefix in prefixes if provider.startswith(prefix)), + None, + ) + self.api_token = PROVIDER_MODELS_PREFIXES.get(selected_prefix) + else: + self.provider = DEFAULT_PROVIDER + self.api_token = os.getenv(DEFAULT_PROVIDER_API_KEY) + self.base_url = base_url + self.temprature = temprature + self.max_tokens = max_tokens + self.top_p = top_p + self.frequency_penalty = frequency_penalty + self.presence_penalty = presence_penalty + self.stop = stop + self.n = n + + @staticmethod + def from_kwargs(kwargs: dict) -> "LLMConfig": + return LLMConfig( + provider=kwargs.get("provider", DEFAULT_PROVIDER), + api_token=kwargs.get("api_token"), + base_url=kwargs.get("base_url"), + temprature=kwargs.get("temprature"), + max_tokens=kwargs.get("max_tokens"), + top_p=kwargs.get("top_p"), + frequency_penalty=kwargs.get("frequency_penalty"), + presence_penalty=kwargs.get("presence_penalty"), + stop=kwargs.get("stop"), + n=kwargs.get("n") + ) + + def to_dict(self): + return { + "provider": self.provider, + "api_token": self.api_token, + "base_url": self.base_url, + "temprature": self.temprature, + "max_tokens": self.max_tokens, + "top_p": self.top_p, + "frequency_penalty": self.frequency_penalty, + "presence_penalty": self.presence_penalty, + "stop": self.stop, + "n": self.n + } + + def clone(self, **kwargs): + """Create a copy of this configuration with updated values. 
+ + Args: + **kwargs: Key-value pairs of configuration options to update + + Returns: + llm_config: A new instance with the specified updates + """ + config_dict = self.to_dict() + config_dict.update(kwargs) + return LLMConfig.from_kwargs(config_dict) + + + +``` + + +## File: crawl4ai/async_webcrawler.py + +```py +from .__version__ import __version__ as crawl4ai_version +import os +import sys +import time +from colorama import Fore +from pathlib import Path +from typing import Optional, List +import json +import asyncio + +# from contextlib import nullcontext, asynccontextmanager +from contextlib import asynccontextmanager +from .models import ( + CrawlResult, + MarkdownGenerationResult, + DispatchResult, + ScrapingResult, + CrawlResultContainer, + RunManyReturn +) +from .async_database import async_db_manager +from .chunking_strategy import * # noqa: F403 +from .chunking_strategy import IdentityChunking +from .content_filter_strategy import * # noqa: F403 +from .extraction_strategy import * # noqa: F403 +from .extraction_strategy import NoExtractionStrategy +from .async_crawler_strategy import ( + AsyncCrawlerStrategy, + AsyncPlaywrightCrawlerStrategy, + AsyncCrawlResponse, +) +from .cache_context import CacheMode, CacheContext +from .markdown_generation_strategy import ( + DefaultMarkdownGenerator, + MarkdownGenerationStrategy, +) +from .deep_crawling import DeepCrawlDecorator +from .async_logger import AsyncLogger, AsyncLoggerBase +from .async_configs import BrowserConfig, CrawlerRunConfig, ProxyConfig +from .async_dispatcher import * # noqa: F403 +from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter + +from .utils import ( + sanitize_input_encode, + InvalidCSSSelectorError, + fast_format_html, + create_box_message, + get_error_context, + RobotsParser, + preprocess_html_for_schema, +) + + +class AsyncWebCrawler: + """ + Asynchronous web crawler with flexible caching capabilities. + + There are two ways to use the crawler: + + 1. Using context manager (recommended for simple cases): + ```python + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="https://example.com") + ``` + + 2. Using explicit lifecycle management (recommended for long-running applications): + ```python + crawler = AsyncWebCrawler() + await crawler.start() + + # Use the crawler multiple times + result1 = await crawler.arun(url="https://example.com") + result2 = await crawler.arun(url="https://another.com") + + await crawler.close() + ``` + + Attributes: + browser_config (BrowserConfig): Configuration object for browser settings. + crawler_strategy (AsyncCrawlerStrategy): Strategy for crawling web pages. + logger (AsyncLogger): Logger instance for recording events and errors. + crawl4ai_folder (str): Directory for storing cache. + base_directory (str): Base directory for storing cache. + ready (bool): Whether the crawler is ready for use. + + Methods: + start(): Start the crawler explicitly without using context manager. + close(): Close the crawler explicitly without using context manager. + arun(): Run the crawler for a single source: URL (web, local file, or raw HTML). + awarmup(): Perform warmup sequence. + arun_many(): Run the crawler for multiple sources. + aprocess_html(): Process HTML content. 
+ + Typical Usage: + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="https://example.com") + print(result.markdown) + + Using configuration: + browser_config = BrowserConfig(browser_type="chromium", headless=True) + async with AsyncWebCrawler(config=browser_config) as crawler: + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS + ) + result = await crawler.arun(url="https://example.com", config=crawler_config) + print(result.markdown) + """ + + _domain_last_hit = {} + + def __init__( + self, + crawler_strategy: AsyncCrawlerStrategy = None, + config: BrowserConfig = None, + base_directory: str = str( + os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), + thread_safe: bool = False, + logger: AsyncLoggerBase = None, + **kwargs, + ): + """ + Initialize the AsyncWebCrawler. + + Args: + crawler_strategy: Strategy for crawling web pages. Default AsyncPlaywrightCrawlerStrategy + config: Configuration object for browser settings. Default BrowserConfig() + base_directory: Base directory for storing cache + thread_safe: Whether to use thread-safe operations + **kwargs: Additional arguments for backwards compatibility + """ + # Handle browser configuration + browser_config = config or BrowserConfig() + + self.browser_config = browser_config + + # Initialize logger first since other components may need it + self.logger = logger or AsyncLogger( + log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"), + verbose=self.browser_config.verbose, + tag_width=10, + ) + + # Initialize crawler strategy + params = {k: v for k, v in kwargs.items() if k in [ + "browser_config", "logger"]} + self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy( + browser_config=browser_config, + logger=self.logger, + **params, # Pass remaining kwargs for backwards compatibility + ) + + # Thread safety setup + self._lock = asyncio.Lock() if thread_safe else None + + # Initialize directories + self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai") + os.makedirs(self.crawl4ai_folder, exist_ok=True) + os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True) + + # Initialize robots parser + self.robots_parser = RobotsParser() + + self.ready = False + + # Decorate arun method with deep crawling capabilities + self._deep_handler = DeepCrawlDecorator(self) + self.arun = self._deep_handler(self.arun) + + async def start(self): + """ + Start the crawler explicitly without using context manager. + This is equivalent to using 'async with' but gives more control over the lifecycle. + Returns: + AsyncWebCrawler: The initialized crawler instance + """ + await self.crawler_strategy.__aenter__() + self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT") + self.ready = True + return self + + async def close(self): + """ + Close the crawler explicitly without using context manager. + This should be called when you're done with the crawler if you used start(). + + This method will: + 1. Clean up browser resources + 2. Close any open pages and contexts + """ + await self.crawler_strategy.__aexit__(None, None, None) + + async def __aenter__(self): + return await self.start() + + async def __aexit__(self, exc_type, exc_val, exc_tb): + await self.close() + + @asynccontextmanager + async def nullcontext(self): + """异步空上下文管理器""" + yield + + async def arun( + self, + url: str, + config: CrawlerRunConfig = None, + **kwargs, + ) -> RunManyReturn: + """ + Runs the crawler for a single source: URL (web, local file, or raw HTML). 
+ + Migration Guide: + Old way (deprecated): + result = await crawler.arun( + url="https://example.com", + word_count_threshold=200, + screenshot=True, + ... + ) + + New way (recommended): + config = CrawlerRunConfig( + word_count_threshold=200, + screenshot=True, + ... + ) + result = await crawler.arun(url="https://example.com", crawler_config=config) + + Args: + url: The URL to crawl (http://, https://, file://, or raw:) + crawler_config: Configuration object controlling crawl behavior + [other parameters maintained for backwards compatibility] + + Returns: + CrawlResult: The result of crawling and processing + """ + # Auto-start if not ready + if not self.ready: + await self.start() + + config = config or CrawlerRunConfig() + if not isinstance(url, str) or not url: + raise ValueError( + "Invalid URL, make sure the URL is a non-empty string") + + async with self._lock or self.nullcontext(): + try: + self.logger.verbose = config.verbose + + # Default to ENABLED if no cache mode specified + if config.cache_mode is None: + config.cache_mode = CacheMode.ENABLED + + # Create cache context + cache_context = CacheContext(url, config.cache_mode, False) + + # Initialize processing variables + async_response: AsyncCrawlResponse = None + cached_result: CrawlResult = None + screenshot_data = None + pdf_data = None + extracted_content = None + start_time = time.perf_counter() + + # Try to get cached result if appropriate + if cache_context.should_read(): + cached_result = await async_db_manager.aget_cached_url(url) + + if cached_result: + html = sanitize_input_encode(cached_result.html) + extracted_content = sanitize_input_encode( + cached_result.extracted_content or "" + ) + extracted_content = ( + None + if not extracted_content or extracted_content == "[]" + else extracted_content + ) + # If screenshot is requested but its not in cache, then set cache_result to None + screenshot_data = cached_result.screenshot + pdf_data = cached_result.pdf + # if config.screenshot and not screenshot or config.pdf and not pdf: + if config.screenshot and not screenshot_data: + cached_result = None + + if config.pdf and not pdf_data: + cached_result = None + + self.logger.url_status( + url=cache_context.display_url, + success=bool(html), + timing=time.perf_counter() - start_time, + tag="FETCH", + ) + + # Update proxy configuration from rotation strategy if available + if config and config.proxy_rotation_strategy: + next_proxy: ProxyConfig = await config.proxy_rotation_strategy.get_next_proxy() + if next_proxy: + self.logger.info( + message="Switch proxy: {proxy}", + tag="PROXY", + params={"proxy": next_proxy.server} + ) + config.proxy_config = next_proxy + # config = config.clone(proxy_config=next_proxy) + + # Fetch fresh content if needed + if not cached_result or not html: + t1 = time.perf_counter() + + if config.user_agent: + self.crawler_strategy.update_user_agent( + config.user_agent) + + # Check robots.txt if enabled + if config and config.check_robots_txt: + if not await self.robots_parser.can_fetch( + url, self.browser_config.user_agent + ): + return CrawlResult( + url=url, + html="", + success=False, + status_code=403, + error_message="Access denied by robots.txt", + response_headers={ + "X-Robots-Status": "Blocked by robots.txt" + }, + ) + + ############################## + # Call CrawlerStrategy.crawl # + ############################## + async_response = await self.crawler_strategy.crawl( + url, + config=config, # Pass the entire config object + ) + + html = sanitize_input_encode(async_response.html) + 
screenshot_data = async_response.screenshot + pdf_data = async_response.pdf_data + js_execution_result = async_response.js_execution_result + + t2 = time.perf_counter() + self.logger.url_status( + url=cache_context.display_url, + success=bool(html), + timing=t2 - t1, + tag="FETCH", + ) + + ############################################################### + # Process the HTML content, Call CrawlerStrategy.process_html # + ############################################################### + crawl_result: CrawlResult = await self.aprocess_html( + url=url, + html=html, + extracted_content=extracted_content, + config=config, # Pass the config object instead of individual parameters + screenshot=screenshot_data, + pdf_data=pdf_data, + verbose=config.verbose, + is_raw_html=True if url.startswith("raw:") else False, + **kwargs, + ) + + crawl_result.status_code = async_response.status_code + crawl_result.redirected_url = async_response.redirected_url or url + crawl_result.response_headers = async_response.response_headers + crawl_result.downloaded_files = async_response.downloaded_files + crawl_result.js_execution_result = js_execution_result + crawl_result.mhtml = async_response.mhtml_data + crawl_result.ssl_certificate = async_response.ssl_certificate + # Add captured network and console data if available + crawl_result.network_requests = async_response.network_requests + crawl_result.console_messages = async_response.console_messages + + crawl_result.success = bool(html) + crawl_result.session_id = getattr( + config, "session_id", None) + + self.logger.success( + message="{url:.50}... | Status: {status} | Total: {timing}", + tag="COMPLETE", + params={ + "url": cache_context.display_url, + "status": crawl_result.success, + "timing": f"{time.perf_counter() - start_time:.2f}s", + }, + colors={ + "status": Fore.GREEN if crawl_result.success else Fore.RED, + "timing": Fore.YELLOW, + }, + ) + + # Update cache if appropriate + if cache_context.should_write() and not bool(cached_result): + await async_db_manager.acache_url(crawl_result) + + return CrawlResultContainer(crawl_result) + + else: + self.logger.success( + message="{url:.50}... | Status: {status} | Total: {timing}", + tag="COMPLETE", + params={ + "url": cache_context.display_url, + "status": True, + "timing": f"{time.perf_counter() - start_time:.2f}s", + }, + colors={"status": Fore.GREEN, "timing": Fore.YELLOW}, + ) + + cached_result.success = bool(html) + cached_result.session_id = getattr( + config, "session_id", None) + cached_result.redirected_url = cached_result.redirected_url or url + return CrawlResultContainer(cached_result) + + except Exception as e: + error_context = get_error_context(sys.exc_info()) + + error_message = ( + f"Unexpected error in _crawl_web at line {error_context['line_no']} " + f"in {error_context['function']} ({error_context['filename']}):\n" + f"Error: {str(e)}\n\n" + f"Code context:\n{error_context['code_context']}" + ) + + self.logger.error_status( + url=url, + error=create_box_message(error_message, type="error"), + tag="ERROR", + ) + + return CrawlResultContainer( + CrawlResult( + url=url, html="", success=False, error_message=error_message + ) + ) + + async def aprocess_html( + self, + url: str, + html: str, + extracted_content: str, + config: CrawlerRunConfig, + screenshot: str, + pdf_data: str, + verbose: bool, + **kwargs, + ) -> CrawlResult: + """ + Process HTML content using the provided configuration. 
+ + Args: + url: The URL being processed + html: Raw HTML content + extracted_content: Previously extracted content (if any) + config: Configuration object controlling processing behavior + screenshot: Screenshot data (if any) + pdf_data: PDF data (if any) + verbose: Whether to enable verbose logging + **kwargs: Additional parameters for backwards compatibility + + Returns: + CrawlResult: Processed result containing extracted and formatted content + """ + cleaned_html = "" + try: + _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML" + t1 = time.perf_counter() + + # Get scraping strategy and ensure it has a logger + scraping_strategy = config.scraping_strategy + if not scraping_strategy.logger: + scraping_strategy.logger = self.logger + + # Process HTML content + params = config.__dict__.copy() + params.pop("url", None) + # add keys from kwargs to params that doesn't exist in params + params.update({k: v for k, v in kwargs.items() + if k not in params.keys()}) + + ################################ + # Scraping Strategy Execution # + ################################ + result: ScrapingResult = scraping_strategy.scrap( + url, html, **params) + + if result is None: + raise ValueError( + f"Process HTML, Failed to extract content from the website: {url}" + ) + + except InvalidCSSSelectorError as e: + raise ValueError(str(e)) + except Exception as e: + raise ValueError( + f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}" + ) + + # Extract results - handle both dict and ScrapingResult + if isinstance(result, dict): + cleaned_html = sanitize_input_encode( + result.get("cleaned_html", "")) + media = result.get("media", {}) + links = result.get("links", {}) + metadata = result.get("metadata", {}) + else: + cleaned_html = sanitize_input_encode(result.cleaned_html) + media = result.media.model_dump() + links = result.links.model_dump() + metadata = result.metadata + + ################################ + # Generate Markdown # + ################################ + markdown_generator: Optional[MarkdownGenerationStrategy] = ( + config.markdown_generator or DefaultMarkdownGenerator() + ) + + # --- SELECT HTML SOURCE BASED ON CONTENT_SOURCE --- + # Get the desired source from the generator config, default to 'cleaned_html' + selected_html_source = getattr(markdown_generator, 'content_source', 'cleaned_html') + + # Define the source selection logic using dict dispatch + html_source_selector = { + "raw_html": lambda: html, # The original raw HTML + "cleaned_html": lambda: cleaned_html, # The HTML after scraping strategy + "fit_html": lambda: preprocess_html_for_schema(html_content=html), # Preprocessed raw HTML + } + + markdown_input_html = cleaned_html # Default to cleaned_html + + try: + # Get the appropriate lambda function, default to returning cleaned_html if key not found + source_lambda = html_source_selector.get(selected_html_source, lambda: cleaned_html) + # Execute the lambda to get the selected HTML + markdown_input_html = source_lambda() + + # Log which source is being used (optional, but helpful for debugging) + # if self.logger and verbose: + # actual_source_used = selected_html_source if selected_html_source in html_source_selector else 'cleaned_html (default)' + # self.logger.debug(f"Using '{actual_source_used}' as source for Markdown generation for {url}", tag="MARKDOWN_SRC") + + except Exception as e: + # Handle potential errors, especially from preprocess_html_for_schema + if self.logger: + self.logger.warning( + f"Error getting/processing 
'{selected_html_source}' for markdown source: {e}. Falling back to cleaned_html.", + tag="MARKDOWN_SRC" + ) + # Ensure markdown_input_html is still the default cleaned_html in case of error + markdown_input_html = cleaned_html + # --- END: HTML SOURCE SELECTION --- + + # Uncomment if by default we want to use PruningContentFilter + # if not config.content_filter and not markdown_generator.content_filter: + # markdown_generator.content_filter = PruningContentFilter() + + markdown_result: MarkdownGenerationResult = ( + markdown_generator.generate_markdown( + input_html=markdown_input_html, + base_url=url, + # html2text_options=kwargs.get('html2text', {}) + ) + ) + + # Log processing completion + self.logger.info( + message="{url:.50}... | Time: {timing}s", + tag="SCRAPE", + params={ + "url": _url, + "timing": int((time.perf_counter() - t1) * 1000) / 1000, + }, + ) + + ################################ + # Structured Content Extraction # + ################################ + if ( + not bool(extracted_content) + and config.extraction_strategy + and not isinstance(config.extraction_strategy, NoExtractionStrategy) + ): + t1 = time.perf_counter() + # Choose content based on input_format + content_format = config.extraction_strategy.input_format + if content_format == "fit_markdown" and not markdown_result.fit_markdown: + self.logger.warning( + message="Fit markdown requested but not available. Falling back to raw markdown.", + tag="EXTRACT", + params={"url": _url}, + ) + content_format = "markdown" + + content = { + "markdown": markdown_result.raw_markdown, + "html": html, + "cleaned_html": cleaned_html, + "fit_markdown": markdown_result.fit_markdown, + }.get(content_format, markdown_result.raw_markdown) + + # Use IdentityChunking for HTML input, otherwise use provided chunking strategy + chunking = ( + IdentityChunking() + if content_format in ["html", "cleaned_html"] + else config.chunking_strategy + ) + sections = chunking.chunk(content) + extracted_content = config.extraction_strategy.run(url, sections) + extracted_content = json.dumps( + extracted_content, indent=4, default=str, ensure_ascii=False + ) + + # Log extraction completion + self.logger.info( + message="Completed for {url:.50}... 
| Time: {timing}s", + tag="EXTRACT", + params={"url": _url, "timing": time.perf_counter() - t1}, + ) + + # Handle screenshot and PDF data + screenshot_data = None if not screenshot else screenshot + pdf_data = None if not pdf_data else pdf_data + + # Apply HTML formatting if requested + if config.prettiify: + cleaned_html = fast_format_html(cleaned_html) + + # Return complete crawl result + return CrawlResult( + url=url, + html=html, + cleaned_html=cleaned_html, + markdown=markdown_result, + media=media, + links=links, + metadata=metadata, + screenshot=screenshot_data, + pdf=pdf_data, + extracted_content=extracted_content, + success=True, + error_message="", + ) + + async def arun_many( + self, + urls: List[str], + config: Optional[CrawlerRunConfig] = None, + dispatcher: Optional[BaseDispatcher] = None, + # Legacy parameters maintained for backwards compatibility + # word_count_threshold=MIN_WORD_THRESHOLD, + # extraction_strategy: ExtractionStrategy = None, + # chunking_strategy: ChunkingStrategy = RegexChunking(), + # content_filter: RelevantContentFilter = None, + # cache_mode: Optional[CacheMode] = None, + # bypass_cache: bool = False, + # css_selector: str = None, + # screenshot: bool = False, + # pdf: bool = False, + # user_agent: str = None, + # verbose=True, + **kwargs, + ) -> RunManyReturn: + """ + Runs the crawler for multiple URLs concurrently using a configurable dispatcher strategy. + + Args: + urls: List of URLs to crawl + config: Configuration object controlling crawl behavior for all URLs + dispatcher: The dispatcher strategy instance to use. Defaults to MemoryAdaptiveDispatcher + [other parameters maintained for backwards compatibility] + + Returns: + Union[List[CrawlResult], AsyncGenerator[CrawlResult, None]]: + Either a list of all results or an async generator yielding results + + Examples: + + # Batch processing (default) + results = await crawler.arun_many( + urls=["https://example1.com", "https://example2.com"], + config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + ) + for result in results: + print(f"Processed {result.url}: {len(result.markdown)} chars") + + # Streaming results + async for result in await crawler.arun_many( + urls=["https://example1.com", "https://example2.com"], + config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=True), + ): + print(f"Processed {result.url}: {len(result.markdown)} chars") + """ + config = config or CrawlerRunConfig() + # if config is None: + # config = CrawlerRunConfig( + # word_count_threshold=word_count_threshold, + # extraction_strategy=extraction_strategy, + # chunking_strategy=chunking_strategy, + # content_filter=content_filter, + # cache_mode=cache_mode, + # bypass_cache=bypass_cache, + # css_selector=css_selector, + # screenshot=screenshot, + # pdf=pdf, + # verbose=verbose, + # **kwargs, + # ) + + if dispatcher is None: + dispatcher = MemoryAdaptiveDispatcher( + rate_limiter=RateLimiter( + base_delay=(1.0, 3.0), max_delay=60.0, max_retries=3 + ), + ) + + def transform_result(task_result): + return ( + setattr( + task_result.result, + "dispatch_result", + DispatchResult( + task_id=task_result.task_id, + memory_usage=task_result.memory_usage, + peak_memory=task_result.peak_memory, + start_time=task_result.start_time, + end_time=task_result.end_time, + error_message=task_result.error_message, + ), + ) + or task_result.result + ) + + stream = config.stream + + if stream: + + async def result_transformer(): + async for task_result in dispatcher.run_urls_stream( + crawler=self, urls=urls, config=config + ): + yield 
transform_result(task_result) + + return result_transformer() + else: + _results = await dispatcher.run_urls(crawler=self, urls=urls, config=config) + return [transform_result(res) for res in _results] + +``` + + +## File: crawl4ai/cli.py + +```py +import click +import os +import sys +import time + +import humanize +from typing import Dict, Any, Optional, List +import json +import yaml +import anyio +from rich.console import Console +from rich.table import Table +from rich.panel import Panel +from rich.prompt import Prompt, Confirm + +from crawl4ai import ( + CacheMode, + AsyncWebCrawler, + CrawlResult, + BrowserConfig, + CrawlerRunConfig, + LLMExtractionStrategy, + LXMLWebScrapingStrategy, + JsonCssExtractionStrategy, + JsonXPathExtractionStrategy, + BM25ContentFilter, + PruningContentFilter, + BrowserProfiler, + DefaultMarkdownGenerator, + LLMConfig +) +from crawl4ai.config import USER_SETTINGS +from litellm import completion +from pathlib import Path + + +# Initialize rich console +console = Console() + +def get_global_config() -> dict: + config_dir = Path.home() / ".crawl4ai" + config_file = config_dir / "global.yml" + + if not config_file.exists(): + config_dir.mkdir(parents=True, exist_ok=True) + return {} + + with open(config_file) as f: + return yaml.safe_load(f) or {} + +def save_global_config(config: dict): + config_file = Path.home() / ".crawl4ai" / "global.yml" + with open(config_file, "w") as f: + yaml.dump(config, f) + +def setup_llm_config() -> tuple[str, str]: + config = get_global_config() + provider = config.get("DEFAULT_LLM_PROVIDER") + token = config.get("DEFAULT_LLM_PROVIDER_TOKEN") + + if not provider: + click.echo("\nNo default LLM provider configured.") + click.echo("Provider format: 'company/model' (e.g., 'openai/gpt-4o', 'anthropic/claude-3-sonnet')") + click.echo("See available providers at: https://docs.litellm.ai/docs/providers") + provider = click.prompt("Enter provider") + + if not provider.startswith("ollama/"): + if not token: + token = click.prompt("Enter API token for " + provider, hide_input=True) + else: + token = "no-token" + + if not config.get("DEFAULT_LLM_PROVIDER") or not config.get("DEFAULT_LLM_PROVIDER_TOKEN"): + config["DEFAULT_LLM_PROVIDER"] = provider + config["DEFAULT_LLM_PROVIDER_TOKEN"] = token + save_global_config(config) + click.echo("\nConfiguration saved to ~/.crawl4ai/global.yml") + + return provider, token + +async def stream_llm_response(url: str, markdown: str, query: str, provider: str, token: str): + response = completion( + model=provider, + api_key=token, + messages=[ + { + "content": f"You are Crawl4ai assistant, answering user question based on the provided context which is crawled from {url}.", + "role": "system" + }, + { + "content": f"<|start of context|>\n{markdown}\n<|end of context|>\n\n{query}", + "role": "user" + }, + ], + stream=True, + ) + + for chunk in response: + if content := chunk["choices"][0]["delta"].get("content"): + print(content, end="", flush=True) + print() # New line at end + + + +def parse_key_values(ctx, param, value) -> Dict[str, Any]: + if not value: + return {} + result = {} + pairs = value.split(',') + for pair in pairs: + try: + k, v = pair.split('=', 1) + # Handle common value types + if v.lower() == 'true': v = True + elif v.lower() == 'false': v = False + elif v.isdigit(): v = int(v) + elif v.replace('.','',1).isdigit(): v = float(v) + elif v.startswith('[') and v.endswith(']'): + v = [x.strip() for x in v[1:-1].split(',') if x.strip()] + elif v.startswith('{') and v.endswith('}'): + try: + v 
= json.loads(v) + except json.JSONDecodeError: + raise click.BadParameter(f'Invalid JSON object: {v}') + result[k.strip()] = v + except ValueError: + raise click.BadParameter(f'Invalid key=value pair: {pair}') + return result + +def load_config_file(path: Optional[str]) -> dict: + if not path: + return {} + + try: + with open(path) as f: + if path.endswith((".yaml", ".yml")): + return yaml.safe_load(f) + return json.load(f) + except Exception as e: + raise click.BadParameter(f'Error loading config file {path}: {str(e)}') + +def load_schema_file(path: Optional[str]) -> dict: + if not path: + return None + return load_config_file(path) + +async def run_crawler(url: str, browser_cfg: BrowserConfig, crawler_cfg: CrawlerRunConfig, verbose: bool): + if verbose: + click.echo("Starting crawler with configurations:") + click.echo(f"Browser config: {browser_cfg.dump()}") + click.echo(f"Crawler config: {crawler_cfg.dump()}") + + async with AsyncWebCrawler(config=browser_cfg) as crawler: + try: + result = await crawler.arun(url=url, config=crawler_cfg) + return result + except Exception as e: + raise click.ClickException(f"Crawling failed: {str(e)}") + +def show_examples(): + examples = """ +🚀 Crawl4AI CLI Examples + +1️⃣ Basic Usage: + # Simple crawl with default settings + crwl https://example.com + + # Get markdown output + crwl https://example.com -o markdown + + # Verbose JSON output with cache bypass + crwl https://example.com -o json -v --bypass-cache + +2️⃣ Using Config Files: + # Using browser and crawler configs + crwl https://example.com -B browser.yml -C crawler.yml + + # CSS-based extraction + crwl https://example.com -e extract_css.yml -s css_schema.json -o json + + # LLM-based extraction with config file + crwl https://example.com -e extract_llm.yml -s llm_schema.json -o json + + # Quick LLM-based JSON extraction (prompts for LLM provider first time) + crwl https://example.com -j # Auto-extracts structured data + crwl https://example.com -j "Extract product details including name, price, and features" # With specific instructions + +3️⃣ Direct Parameters: + # Browser settings + crwl https://example.com -b "headless=true,viewport_width=1280,user_agent_mode=random" + + # Crawler settings + crwl https://example.com -c "css_selector=#main,delay_before_return_html=2,scan_full_page=true" + +4️⃣ Profile Management for Identity-Based Crawling: + # Launch interactive profile manager + crwl profiles + + # Create, list, and delete browser profiles for identity-based crawling + # Use a profile for crawling (keeps you logged in) + crwl https://example.com -p my-profile-name + + # Example: Crawl a site that requires login + # 1. First create a profile and log in: + crwl profiles + # 2. Then use that profile to crawl the authenticated site: + crwl https://site-requiring-login.com/dashboard -p my-profile-name + +5️⃣ CDP Mode for Browser Automation: + # Launch browser with CDP debugging on default port 9222 + crwl cdp + + # Use a specific profile and custom port + crwl cdp -p my-profile -P 9223 + + # Launch headless browser with CDP enabled + crwl cdp --headless + + # Launch in incognito mode (ignores profile) + crwl cdp --incognito + + # Use the CDP URL with other tools (Puppeteer, Playwright, etc.) 
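+    # Tip: when using the default port, Chromium also lists its CDP endpoints at http://localhost:9222/json/version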
+ # The URL will be displayed in the terminal when the browser starts + + +6️⃣ Sample Config Files: + +browser.yml: + headless: true + viewport_width: 1280 + user_agent_mode: "random" + verbose: true + ignore_https_errors: true + +extract_css.yml: + type: "json-css" + params: + verbose: true + +css_schema.json: + { + "name": "ArticleExtractor", + "baseSelector": ".article", + "fields": [ + { + "name": "title", + "selector": "h1.title", + "type": "text" + }, + { + "name": "link", + "selector": "a.read-more", + "type": "attribute", + "attribute": "href" + } + ] + } + +extract_llm.yml: + type: "llm" + provider: "openai/gpt-4" + instruction: "Extract all articles with their titles and links" + api_token: "your-token" + params: + temperature: 0.3 + max_tokens: 1000 + +llm_schema.json: + { + "title": "Article", + "type": "object", + "properties": { + "title": { + "type": "string", + "description": "The title of the article" + }, + "link": { + "type": "string", + "description": "URL to the full article" + } + } + } + +7️⃣ Advanced Usage: + # Combine configs with direct parameters + crwl https://example.com -B browser.yml -b "headless=false,viewport_width=1920" + + # Full extraction pipeline with config files + crwl https://example.com \\ + -B browser.yml \\ + -C crawler.yml \\ + -e extract_llm.yml \\ + -s llm_schema.json \\ + -o json \\ + -v + + # Quick LLM-based extraction with specific instructions + crwl https://amazon.com/dp/B01DFKC2SO \\ + -j "Extract product title, current price, original price, rating, and all product specifications" \\ + -b "headless=true,viewport_width=1280" \\ + -v + + # Content filtering with BM25 + crwl https://example.com \\ + -f filter_bm25.yml \\ + -o markdown-fit + + # Authenticated crawling with profile + crwl https://login-required-site.com \\ + -p my-authenticated-profile \\ + -c "css_selector=.dashboard-content" \\ + -o markdown + +For more documentation visit: https://github.com/unclecode/crawl4ai + +8️⃣ Q&A with LLM: + # Ask a question about the content + crwl https://example.com -q "What is the main topic discussed?" + + # First view content, then ask questions + crwl https://example.com -o markdown # See the crawled content first + crwl https://example.com -q "Summarize the key points" + crwl https://example.com -q "What are the conclusions?" + + # Advanced crawling with Q&A + crwl https://example.com \\ + -B browser.yml \\ + -c "css_selector=article,scan_full_page=true" \\ + -q "What are the pros and cons mentioned?" + + Note: First time using -q will prompt for LLM provider and API token. + These will be saved in ~/.crawl4ai/global.yml for future use. + + Supported provider format: 'company/model' + Examples: + - ollama/llama3.3 + - openai/gpt-4 + - anthropic/claude-3-sonnet + - cohere/command + - google/gemini-pro + + See full list of providers: https://docs.litellm.ai/docs/providers + + # Set default LLM provider and token in advance + crwl config set DEFAULT_LLM_PROVIDER "anthropic/claude-3-sonnet" + crwl config set DEFAULT_LLM_PROVIDER_TOKEN "your-api-token-here" + + # Set default browser behavior + crwl config set BROWSER_HEADLESS false # Always show browser window + crwl config set USER_AGENT_MODE random # Use random user agent + +9️⃣ Profile Management: + # Launch interactive profile manager + crwl profiles + + # Create a profile and use it for crawling + crwl profiles # Create and set up your profile interactively + crwl https://example.com -p my-profile-name # Use profile for crawling + + # Example workflow for authenticated site + # 1. 
First create a profile and log in to the site: + crwl profiles # Select "Create new profile" option + # 2. Then use that profile to crawl authenticated content: + crwl https://site-requiring-login.com/dashboard -p my-profile-name + +🔄 Builtin Browser Management: + # Start a builtin browser (runs in the background) + crwl browser start + + # Check builtin browser status + crwl browser status + + # Open a visible window to see the browser + crwl browser view --url https://example.com + + # Stop the builtin browser + crwl browser stop + + # Restart with different options + crwl browser restart --browser-type chromium --port 9223 --no-headless + + # Use the builtin browser in your code + # (Just set browser_mode="builtin" in your BrowserConfig) + browser_config = BrowserConfig( + browser_mode="builtin", + headless=True + ) + + # Usage via CLI: + crwl https://example.com -b "browser_mode=builtin" +""" + click.echo(examples) + +def get_directory_size(path: str) -> int: + """Calculate the total size of a directory in bytes""" + total_size = 0 + for dirpath, _, filenames in os.walk(path): + for f in filenames: + fp = os.path.join(dirpath, f) + if not os.path.islink(fp): + total_size += os.path.getsize(fp) + return total_size + +def display_profiles_table(profiles: List[Dict[str, Any]]): + """Display a rich table of browser profiles""" + if not profiles: + console.print(Panel("[yellow]No profiles found. Create one with the 'create' command.[/yellow]", + title="Browser Profiles", border_style="blue")) + return + + table = Table(title="Browser Profiles", show_header=True, header_style="bold cyan", border_style="blue") + table.add_column("#", style="dim", width=4) + table.add_column("Name", style="cyan", no_wrap=True) + table.add_column("Path", style="green") + table.add_column("Created", style="yellow") + table.add_column("Browser", style="magenta") + table.add_column("Size", style="blue", justify="right") + + for i, profile in enumerate(profiles): + # Calculate folder size + size = get_directory_size(profile["path"]) + human_size = humanize.naturalsize(size) + + # Format creation date + created = profile["created"].strftime("%Y-%m-%d %H:%M") + + # Add row to table + table.add_row( + str(i+1), + profile["name"], + profile["path"], + created, + profile["type"].capitalize(), + human_size + ) + + console.print(table) + +async def create_profile_interactive(profiler: BrowserProfiler): + """Interactive profile creation wizard""" + console.print(Panel("[bold cyan]Create Browser Profile[/bold cyan]\n" + "This will open a browser window for you to set up your identity.\n" + "Log in to sites, adjust settings, then press 'q' to save.", + border_style="cyan")) + + profile_name = Prompt.ask("[cyan]Enter profile name[/cyan]", default=f"profile_{int(time.time())}") + + console.print("[cyan]Creating profile...[/cyan]") + console.print("[yellow]A browser window will open. 
After logging in to sites, press 'q' in this terminal to save.[/yellow]") + + # Create the profile + try: + profile_path = await profiler.create_profile(profile_name) + + if profile_path: + console.print(f"[green]Profile successfully created at:[/green] {profile_path}") + else: + console.print("[red]Failed to create profile.[/red]") + except Exception as e: + console.print(f"[red]Error creating profile: {str(e)}[/red]") + +def delete_profile_interactive(profiler: BrowserProfiler): + """Interactive profile deletion""" + profiles = profiler.list_profiles() + + if not profiles: + console.print("[yellow]No profiles found to delete.[/yellow]") + return + + # Display profiles + display_profiles_table(profiles) + + # Get profile selection + idx = Prompt.ask( + "[red]Enter number of profile to delete[/red]", + console=console, + choices=[str(i+1) for i in range(len(profiles))], + show_choices=False + ) + + try: + idx = int(idx) - 1 + profile = profiles[idx] + + # Confirm deletion + if Confirm.ask(f"[red]Are you sure you want to delete profile '{profile['name']}'?[/red]"): + success = profiler.delete_profile(profile["path"]) + + if success: + console.print(f"[green]Profile '{profile['name']}' deleted successfully.[/green]") + else: + console.print(f"[red]Failed to delete profile '{profile['name']}'.[/red]") + except (ValueError, IndexError): + console.print("[red]Invalid selection.[/red]") + +async def crawl_with_profile_cli(profile_path, url): + """Use a profile to crawl a website via CLI""" + console.print(f"[cyan]Crawling [bold]{url}[/bold] using profile at [bold]{profile_path}[/bold][/cyan]") + + # Create browser config with the profile + browser_cfg = BrowserConfig( + headless=False, # Set to False to see the browser in action + use_managed_browser=True, + user_data_dir=profile_path + ) + + # Default crawler config + crawler_cfg = CrawlerRunConfig() + + # Ask for output format + output_format = Prompt.ask( + "[cyan]Output format[/cyan]", + choices=["all", "json", "markdown", "md", "title"], + default="markdown" + ) + + try: + # Run the crawler + result = await run_crawler(url, browser_cfg, crawler_cfg, True) + + # Handle output + if output_format == "all": + console.print(json.dumps(result.model_dump(), indent=2)) + elif output_format == "json": + console.print(json.dumps(json.loads(result.extracted_content), indent=2)) + elif output_format in ["markdown", "md"]: + console.print(result.markdown.raw_markdown) + elif output_format == "title": + console.print(result.metadata.get("title", "No title found")) + + console.print(f"[green]Successfully crawled[/green] {url}") + return result + except Exception as e: + console.print(f"[red]Error crawling:[/red] {str(e)}") + return None + +async def use_profile_to_crawl(): + """Interactive profile selection for crawling""" + profiler = BrowserProfiler() + profiles = profiler.list_profiles() + + if not profiles: + console.print("[yellow]No profiles found. 
Create one first.[/yellow]") + return + + # Display profiles + display_profiles_table(profiles) + + # Get profile selection + idx = Prompt.ask( + "[cyan]Enter number of profile to use[/cyan]", + console=console, + choices=[str(i+1) for i in range(len(profiles))], + show_choices=False + ) + + try: + idx = int(idx) - 1 + profile = profiles[idx] + + # Get URL + url = Prompt.ask("[cyan]Enter URL to crawl[/cyan]") + if url: + # Crawl with the selected profile + await crawl_with_profile_cli(profile["path"], url) + else: + console.print("[red]No URL provided[/red]") + except (ValueError, IndexError): + console.print("[red]Invalid selection[/red]") + +async def manage_profiles(): + """Interactive profile management menu""" + profiler = BrowserProfiler() + + options = { + "1": "List profiles", + "2": "Create new profile", + "3": "Delete profile", + "4": "Use a profile to crawl a website", + "5": "Exit", + } + + while True: + console.print(Panel("[bold cyan]Browser Profile Manager[/bold cyan]", border_style="cyan")) + + for key, value in options.items(): + color = "green" if key == "1" else "yellow" if key == "2" else "red" if key == "3" else "blue" if key == "4" else "cyan" + console.print(f"[{color}]{key}[/{color}]. {value}") + + choice = Prompt.ask("Enter choice", choices=list(options.keys()), default="1") + + if choice == "1": + # List profiles + profiles = profiler.list_profiles() + display_profiles_table(profiles) + + elif choice == "2": + # Create profile + await create_profile_interactive(profiler) + + elif choice == "3": + # Delete profile + delete_profile_interactive(profiler) + + elif choice == "4": + # Use profile to crawl + await use_profile_to_crawl() + + elif choice == "5": + # Exit + console.print("[cyan]Exiting profile manager.[/cyan]") + break + + # Add a separator between operations + console.print("\n") + + + +@click.group(context_settings={"help_option_names": ["-h", "--help"]}) +def cli(): + """Crawl4AI CLI - Web content extraction and browser profile management tool""" + pass + + +@cli.group("browser") +def browser_cmd(): + """Manage browser instances for Crawl4AI + + Commands to manage browser instances for Crawl4AI, including: + - status - Check status of the builtin browser + - start - Start a new builtin browser + - stop - Stop the running builtin browser + - restart - Restart the builtin browser + """ + pass + +@browser_cmd.command("status") +def browser_status_cmd(): + """Show status of the builtin browser""" + profiler = BrowserProfiler() + + try: + status = anyio.run(profiler.get_builtin_browser_status) + + if status["running"]: + info = status["info"] + console.print(Panel( + f"[green]Builtin browser is running[/green]\n\n" + f"CDP URL: [cyan]{info['cdp_url']}[/cyan]\n" + f"Process ID: [yellow]{info['pid']}[/yellow]\n" + f"Browser type: [blue]{info['browser_type']}[/blue]\n" + f"User data directory: [magenta]{info['user_data_dir']}[/magenta]\n" + f"Started: [cyan]{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(info['start_time']))}[/cyan]", + title="Builtin Browser Status", + border_style="green" + )) + else: + console.print(Panel( + "[yellow]Builtin browser is not running[/yellow]\n\n" + "Use 'crwl browser start' to start a builtin browser", + title="Builtin Browser Status", + border_style="yellow" + )) + + except Exception as e: + console.print(f"[red]Error checking browser status: {str(e)}[/red]") + sys.exit(1) + +@browser_cmd.command("start") +@click.option("--browser-type", "-b", type=click.Choice(["chromium", "firefox"]), default="chromium", + help="Browser 
type (default: chromium)") +@click.option("--port", "-p", type=int, default=9222, help="Debugging port (default: 9222)") +@click.option("--headless/--no-headless", default=True, help="Run browser in headless mode") +def browser_start_cmd(browser_type: str, port: int, headless: bool): + """Start a builtin browser instance + + This will start a persistent browser instance that can be used by Crawl4AI + by setting browser_mode="builtin" in BrowserConfig. + """ + profiler = BrowserProfiler() + + # First check if browser is already running + status = anyio.run(profiler.get_builtin_browser_status) + if status["running"]: + console.print(Panel( + "[yellow]Builtin browser is already running[/yellow]\n\n" + f"CDP URL: [cyan]{status['cdp_url']}[/cyan]\n\n" + "Use 'crwl browser restart' to restart the browser", + title="Builtin Browser Start", + border_style="yellow" + )) + return + + try: + console.print(Panel( + f"[cyan]Starting builtin browser[/cyan]\n\n" + f"Browser type: [green]{browser_type}[/green]\n" + f"Debugging port: [yellow]{port}[/yellow]\n" + f"Headless: [cyan]{'Yes' if headless else 'No'}[/cyan]", + title="Builtin Browser Start", + border_style="cyan" + )) + + cdp_url = anyio.run( + profiler.launch_builtin_browser, + browser_type, + port, + headless + ) + + if cdp_url: + console.print(Panel( + f"[green]Builtin browser started successfully[/green]\n\n" + f"CDP URL: [cyan]{cdp_url}[/cyan]\n\n" + "This browser will be used automatically when setting browser_mode='builtin'", + title="Builtin Browser Start", + border_style="green" + )) + else: + console.print(Panel( + "[red]Failed to start builtin browser[/red]", + title="Builtin Browser Start", + border_style="red" + )) + sys.exit(1) + + except Exception as e: + console.print(f"[red]Error starting builtin browser: {str(e)}[/red]") + sys.exit(1) + +@browser_cmd.command("stop") +def browser_stop_cmd(): + """Stop the running builtin browser""" + profiler = BrowserProfiler() + + try: + # First check if browser is running + status = anyio.run(profiler.get_builtin_browser_status) + if not status["running"]: + console.print(Panel( + "[yellow]No builtin browser is currently running[/yellow]", + title="Builtin Browser Stop", + border_style="yellow" + )) + return + + console.print(Panel( + "[cyan]Stopping builtin browser...[/cyan]", + title="Builtin Browser Stop", + border_style="cyan" + )) + + success = anyio.run(profiler.kill_builtin_browser) + + if success: + console.print(Panel( + "[green]Builtin browser stopped successfully[/green]", + title="Builtin Browser Stop", + border_style="green" + )) + else: + console.print(Panel( + "[red]Failed to stop builtin browser[/red]", + title="Builtin Browser Stop", + border_style="red" + )) + sys.exit(1) + + except Exception as e: + console.print(f"[red]Error stopping builtin browser: {str(e)}[/red]") + sys.exit(1) + +@browser_cmd.command("view") +@click.option("--url", "-u", help="URL to navigate to (defaults to about:blank)") +def browser_view_cmd(url: Optional[str]): + """ + Open a visible window of the builtin browser + + This command connects to the running builtin browser and opens a visible window, + allowing you to see what the browser is currently viewing or navigate to a URL. 
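+
+    Example: crwl browser view --url https://example.com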
+ """ + profiler = BrowserProfiler() + + try: + # First check if browser is running + status = anyio.run(profiler.get_builtin_browser_status) + if not status["running"]: + console.print(Panel( + "[yellow]No builtin browser is currently running[/yellow]\n\n" + "Use 'crwl browser start' to start a builtin browser first", + title="Builtin Browser View", + border_style="yellow" + )) + return + + info = status["info"] + cdp_url = info["cdp_url"] + + console.print(Panel( + f"[cyan]Opening visible window connected to builtin browser[/cyan]\n\n" + f"CDP URL: [green]{cdp_url}[/green]\n" + f"URL to load: [yellow]{url or 'about:blank'}[/yellow]", + title="Builtin Browser View", + border_style="cyan" + )) + + # Use the CDP URL to launch a new visible window + import subprocess + import os + + # Determine the browser command based on platform + if sys.platform == "darwin": # macOS + browser_cmd = ["/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"] + elif sys.platform == "win32": # Windows + browser_cmd = ["C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe"] + else: # Linux + browser_cmd = ["google-chrome"] + + # Add arguments + browser_args = [ + f"--remote-debugging-port={info['debugging_port']}", + "--remote-debugging-address=localhost", + "--no-first-run", + "--no-default-browser-check" + ] + + # Add URL if provided + if url: + browser_args.append(url) + + # Launch browser + try: + subprocess.Popen(browser_cmd + browser_args) + console.print("[green]Browser window opened. Close it when finished viewing.[/green]") + except Exception as e: + console.print(f"[red]Error launching browser: {str(e)}[/red]") + console.print(f"[yellow]Try connecting manually to {cdp_url} in Chrome or using the '--remote-debugging-port' flag.[/yellow]") + + except Exception as e: + console.print(f"[red]Error viewing builtin browser: {str(e)}[/red]") + sys.exit(1) + +@browser_cmd.command("restart") +@click.option("--browser-type", "-b", type=click.Choice(["chromium", "firefox"]), default=None, + help="Browser type (defaults to same as current)") +@click.option("--port", "-p", type=int, default=None, help="Debugging port (defaults to same as current)") +@click.option("--headless/--no-headless", default=None, help="Run browser in headless mode") +def browser_restart_cmd(browser_type: Optional[str], port: Optional[int], headless: Optional[bool]): + """Restart the builtin browser + + Stops the current builtin browser if running and starts a new one. + By default, uses the same configuration as the current browser. 
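+
+    Example: crwl browser restart --browser-type chromium --port 9223 --no-headless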
+ """ + profiler = BrowserProfiler() + + try: + # First check if browser is running and get its config + status = anyio.run(profiler.get_builtin_browser_status) + current_config = {} + + if status["running"]: + info = status["info"] + current_config = { + "browser_type": info["browser_type"], + "port": info["debugging_port"], + "headless": True # Default assumption + } + + # Stop the browser + console.print(Panel( + "[cyan]Stopping current builtin browser...[/cyan]", + title="Builtin Browser Restart", + border_style="cyan" + )) + + success = anyio.run(profiler.kill_builtin_browser) + if not success: + console.print(Panel( + "[red]Failed to stop current browser[/red]", + title="Builtin Browser Restart", + border_style="red" + )) + sys.exit(1) + + # Use provided options or defaults from current config + browser_type = browser_type or current_config.get("browser_type", "chromium") + port = port or current_config.get("port", 9222) + headless = headless if headless is not None else current_config.get("headless", True) + + # Start a new browser + console.print(Panel( + f"[cyan]Starting new builtin browser[/cyan]\n\n" + f"Browser type: [green]{browser_type}[/green]\n" + f"Debugging port: [yellow]{port}[/yellow]\n" + f"Headless: [cyan]{'Yes' if headless else 'No'}[/cyan]", + title="Builtin Browser Restart", + border_style="cyan" + )) + + cdp_url = anyio.run( + profiler.launch_builtin_browser, + browser_type, + port, + headless + ) + + if cdp_url: + console.print(Panel( + f"[green]Builtin browser restarted successfully[/green]\n\n" + f"CDP URL: [cyan]{cdp_url}[/cyan]", + title="Builtin Browser Restart", + border_style="green" + )) + else: + console.print(Panel( + "[red]Failed to restart builtin browser[/red]", + title="Builtin Browser Restart", + border_style="red" + )) + sys.exit(1) + + except Exception as e: + console.print(f"[red]Error restarting builtin browser: {str(e)}[/red]") + sys.exit(1) + +@cli.command("cdp") +@click.option("--user-data-dir", "-d", help="Directory to use for browser data (will be created if it doesn't exist)") +@click.option("--port", "-P", type=int, default=9222, help="Debugging port (default: 9222)") +@click.option("--browser-type", "-b", type=click.Choice(["chromium", "firefox"]), default="chromium", + help="Browser type (default: chromium)") +@click.option("--headless", is_flag=True, help="Run browser in headless mode") +@click.option("--incognito", is_flag=True, help="Run in incognito/private mode (ignores user-data-dir)") +def cdp_cmd(user_data_dir: Optional[str], port: int, browser_type: str, headless: bool, incognito: bool): + """Launch a standalone browser with CDP debugging enabled + + This command launches a browser with Chrome DevTools Protocol (CDP) debugging enabled, + prints the CDP URL, and keeps the browser running until you press 'q'. + + The CDP URL can be used for various automation and debugging tasks. 
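+    For example, Playwright can attach to the launched browser via chromium.connect_over_cdp(cdp_url).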
+ + Examples: + # Launch Chromium with CDP on default port 9222 + crwl cdp + + # Use a specific directory for browser data and custom port + crwl cdp --user-data-dir ~/browser-data --port 9223 + + # Launch in headless mode + crwl cdp --headless + + # Launch in incognito mode (ignores user-data-dir) + crwl cdp --incognito + """ + profiler = BrowserProfiler() + + try: + # Handle data directory + data_dir = None + if not incognito and user_data_dir: + # Expand user path (~/something) + expanded_path = os.path.expanduser(user_data_dir) + + # Create directory if it doesn't exist + if not os.path.exists(expanded_path): + console.print(f"[yellow]Directory '{expanded_path}' doesn't exist. Creating it.[/yellow]") + os.makedirs(expanded_path, exist_ok=True) + + data_dir = expanded_path + + # Print launch info + console.print(Panel( + f"[cyan]Launching browser with CDP debugging[/cyan]\n\n" + f"Browser type: [green]{browser_type}[/green]\n" + f"Debugging port: [yellow]{port}[/yellow]\n" + f"User data directory: [cyan]{data_dir or 'Temporary directory'}[/cyan]\n" + f"Headless: [cyan]{'Yes' if headless else 'No'}[/cyan]\n" + f"Incognito: [cyan]{'Yes' if incognito else 'No'}[/cyan]\n\n" + f"[yellow]Press 'q' to quit when done[/yellow]", + title="CDP Browser", + border_style="cyan" + )) + + # Run the browser + cdp_url = anyio.run( + profiler.launch_standalone_browser, + browser_type, + data_dir, + port, + headless + ) + + if not cdp_url: + console.print("[red]Failed to launch browser or get CDP URL[/red]") + sys.exit(1) + + except Exception as e: + console.print(f"[red]Error launching CDP browser: {str(e)}[/red]") + sys.exit(1) + + +@cli.command("crawl") +@click.argument("url", required=True) +@click.option("--browser-config", "-B", type=click.Path(exists=True), help="Browser config file (YAML/JSON)") +@click.option("--crawler-config", "-C", type=click.Path(exists=True), help="Crawler config file (YAML/JSON)") +@click.option("--filter-config", "-f", type=click.Path(exists=True), help="Content filter config file") +@click.option("--extraction-config", "-e", type=click.Path(exists=True), help="Extraction strategy config file") +@click.option("--json-extract", "-j", is_flag=False, flag_value="", default=None, help="Extract structured data using LLM with optional description") +@click.option("--schema", "-s", type=click.Path(exists=True), help="JSON schema for extraction") +@click.option("--browser", "-b", type=str, callback=parse_key_values, help="Browser parameters as key1=value1,key2=value2") +@click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2") +@click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "md", "markdown-fit", "md-fit"]), default="all") +@click.option("--output-file", "-O", type=click.Path(), help="Output file path (default: stdout)") +@click.option("--bypass-cache", "-b", is_flag=True, default=True, help="Bypass cache when crawling") +@click.option("--question", "-q", help="Ask a question about the crawled content") +@click.option("--verbose", "-v", is_flag=True) +@click.option("--profile", "-p", help="Use a specific browser profile (by name)") +def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config: str, + extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict, + output: str, output_file: str, bypass_cache: bool, question: str, verbose: bool, profile: str): + """Crawl a website and extract content + + Simple Usage: + crwl crawl 
https://example.com + """ + + # Handle profile option + if profile: + profiler = BrowserProfiler() + profile_path = profiler.get_profile_path(profile) + + if not profile_path: + profiles = profiler.list_profiles() + + if profiles: + console.print(f"[red]Profile '{profile}' not found. Available profiles:[/red]") + display_profiles_table(profiles) + else: + console.print("[red]No profiles found. Create one with 'crwl profiles'[/red]") + + return + + # Include the profile in browser config + if not browser: + browser = {} + browser["user_data_dir"] = profile_path + browser["use_managed_browser"] = True + + if verbose: + console.print(f"[green]Using browser profile:[/green] {profile}") + + try: + # Load base configurations + browser_cfg = BrowserConfig.load(load_config_file(browser_config)) + crawler_cfg = CrawlerRunConfig.load(load_config_file(crawler_config)) + + # Override with CLI params + if browser: + browser_cfg = browser_cfg.clone(**browser) + if crawler: + crawler_cfg = crawler_cfg.clone(**crawler) + + # Handle content filter config + if filter_config or output in ["markdown-fit", "md-fit"]: + if filter_config: + filter_conf = load_config_file(filter_config) + elif not filter_config and output in ["markdown-fit", "md-fit"]: + filter_conf = { + "type": "pruning", + "query": "", + "threshold": 0.48 + } + if filter_conf["type"] == "bm25": + crawler_cfg.markdown_generator = DefaultMarkdownGenerator( + content_filter = BM25ContentFilter( + user_query=filter_conf.get("query"), + bm25_threshold=filter_conf.get("threshold", 1.0) + ) + ) + elif filter_conf["type"] == "pruning": + crawler_cfg.markdown_generator = DefaultMarkdownGenerator( + content_filter = PruningContentFilter( + user_query=filter_conf.get("query"), + threshold=filter_conf.get("threshold", 0.48) + ) + ) + + # Handle json-extract option (takes precedence over extraction-config) + if json_extract is not None: + # Get LLM provider and token + provider, token = setup_llm_config() + + # Default sophisticated instruction for structured data extraction + default_instruction = """Analyze the web page content and extract structured data as JSON. +If the page contains a list of items with repeated patterns, extract all items in an array. +If the page is an article or contains unique content, extract a comprehensive JSON object with all relevant information. +Look at the content, intention of content, what it offers and find the data item(s) in the page. +Always return valid, properly formatted JSON.""" + + + default_instruction_with_user_query = """Analyze the web page content and extract structured data as JSON, following the below instruction and explanation of schema and always return valid, properly formatted JSON. 
\n\nInstruction:\n\n""" + json_extract + + # Determine instruction based on whether json_extract is empty or has content + instruction = default_instruction_with_user_query if json_extract else default_instruction + + # Create LLM extraction strategy + crawler_cfg.extraction_strategy = LLMExtractionStrategy( + llm_config=LLMConfig(provider=provider, api_token=token), + instruction=instruction, + schema=load_schema_file(schema), # Will be None if no schema is provided + extraction_type="schema", #if schema else "block", + apply_chunking=False, + force_json_response=True, + verbose=verbose, + ) + + # Set output to JSON if not explicitly specified + if output == "all": + output = "json" + + # Handle extraction strategy from config file (only if json-extract wasn't used) + elif extraction_config: + extract_conf = load_config_file(extraction_config) + schema_data = load_schema_file(schema) + + # Check if type does not exist show proper message + if not extract_conf.get("type"): + raise click.ClickException("Extraction type not specified") + if extract_conf["type"] not in ["llm", "json-css", "json-xpath"]: + raise click.ClickException(f"Invalid extraction type: {extract_conf['type']}") + + if extract_conf["type"] == "llm": + # if no provider show error emssage + if not extract_conf.get("provider") or not extract_conf.get("api_token"): + raise click.ClickException("LLM provider and API token are required for LLM extraction") + + crawler_cfg.extraction_strategy = LLMExtractionStrategy( + llm_config=LLMConfig(provider=extract_conf["provider"], api_token=extract_conf["api_token"]), + instruction=extract_conf["instruction"], + schema=schema_data, + **extract_conf.get("params", {}) + ) + elif extract_conf["type"] == "json-css": + crawler_cfg.extraction_strategy = JsonCssExtractionStrategy( + schema=schema_data + ) + elif extract_conf["type"] == "json-xpath": + crawler_cfg.extraction_strategy = JsonXPathExtractionStrategy( + schema=schema_data + ) + + + # No cache + if bypass_cache: + crawler_cfg.cache_mode = CacheMode.BYPASS + + crawler_cfg.scraping_strategy = LXMLWebScrapingStrategy() + + config = get_global_config() + + browser_cfg.verbose = config.get("VERBOSE", False) + crawler_cfg.verbose = config.get("VERBOSE", False) + + # Run crawler + result : CrawlResult = anyio.run( + run_crawler, + url, + browser_cfg, + crawler_cfg, + verbose + ) + + # Handle question + if question: + provider, token = setup_llm_config() + markdown = result.markdown.raw_markdown + anyio.run(stream_llm_response, url, markdown, question, provider, token) + return + + # Handle output + if not output_file: + if output == "all": + click.echo(json.dumps(result.model_dump(), indent=2)) + elif output == "json": + print(result.extracted_content) + extracted_items = json.loads(result.extracted_content) + click.echo(json.dumps(extracted_items, indent=2)) + + elif output in ["markdown", "md"]: + click.echo(result.markdown.raw_markdown) + elif output in ["markdown-fit", "md-fit"]: + click.echo(result.markdown.fit_markdown) + else: + if output == "all": + with open(output_file, "w") as f: + f.write(json.dumps(result.model_dump(), indent=2)) + elif output == "json": + with open(output_file, "w") as f: + f.write(result.extracted_content) + elif output in ["markdown", "md"]: + with open(output_file, "w") as f: + f.write(result.markdown.raw_markdown) + elif output in ["markdown-fit", "md-fit"]: + with open(output_file, "w") as f: + f.write(result.markdown.fit_markdown) + + except Exception as e: + raise click.ClickException(str(e)) + 
+@cli.command("examples") +def examples_cmd(): + """Show usage examples""" + show_examples() + +@cli.group("config") +def config_cmd(): + """Manage global configuration settings + + Commands to view and update global configuration settings: + - list: Display all current configuration settings + - get: Get the value of a specific setting + - set: Set the value of a specific setting + """ + pass + +@config_cmd.command("list") +def config_list_cmd(): + """List all configuration settings""" + config = get_global_config() + + table = Table(title="Crawl4AI Configuration", show_header=True, header_style="bold cyan", border_style="blue") + table.add_column("Setting", style="cyan") + table.add_column("Value", style="green") + table.add_column("Default", style="yellow") + table.add_column("Description", style="white") + + for key, setting in USER_SETTINGS.items(): + value = config.get(key, setting["default"]) + + # Handle secret values + display_value = value + if setting.get("secret", False) and value: + display_value = "********" + + # Handle boolean values + if setting["type"] == "boolean": + display_value = str(value).lower() + default_value = str(setting["default"]).lower() + else: + default_value = str(setting["default"]) + + table.add_row( + key, + str(display_value), + default_value, + setting["description"] + ) + + console.print(table) + +@config_cmd.command("get") +@click.argument("key", required=True) +def config_get_cmd(key: str): + """Get a specific configuration setting""" + config = get_global_config() + + # Normalize key to uppercase + key = key.upper() + + if key not in USER_SETTINGS: + console.print(f"[red]Error: Unknown setting '{key}'[/red]") + return + + value = config.get(key, USER_SETTINGS[key]["default"]) + + # Handle secret values + display_value = value + if USER_SETTINGS[key].get("secret", False) and value: + display_value = "********" + + console.print(f"[cyan]{key}[/cyan] = [green]{display_value}[/green]") + console.print(f"[dim]Description: {USER_SETTINGS[key]['description']}[/dim]") + +@config_cmd.command("set") +@click.argument("key", required=True) +@click.argument("value", required=True) +def config_set_cmd(key: str, value: str): + """Set a configuration setting""" + config = get_global_config() + + # Normalize key to uppercase + key = key.upper() + + if key not in USER_SETTINGS: + console.print(f"[red]Error: Unknown setting '{key}'[/red]") + console.print(f"[yellow]Available settings: {', '.join(USER_SETTINGS.keys())}[/yellow]") + return + + setting = USER_SETTINGS[key] + + # Type conversion and validation + if setting["type"] == "boolean": + if value.lower() in ["true", "yes", "1", "y"]: + typed_value = True + elif value.lower() in ["false", "no", "0", "n"]: + typed_value = False + else: + console.print(f"[red]Error: Invalid boolean value. 
Use 'true' or 'false'.[/red]") + return + elif setting["type"] == "string": + typed_value = value + + # Check if the value should be one of the allowed options + if "options" in setting and value not in setting["options"]: + console.print(f"[red]Error: Value must be one of: {', '.join(setting['options'])}[/red]") + return + + # Update config + config[key] = typed_value + save_global_config(config) + + # Handle secret values for display + display_value = typed_value + if setting.get("secret", False) and typed_value: + display_value = "********" + + console.print(f"[green]Successfully set[/green] [cyan]{key}[/cyan] = [green]{display_value}[/green]") + +@cli.command("profiles") +def profiles_cmd(): + """Manage browser profiles interactively + + Launch an interactive browser profile manager where you can: + - List all existing profiles + - Create new profiles for authenticated browsing + - Delete unused profiles + """ + # Run interactive profile manager + anyio.run(manage_profiles) + +@cli.command(name="") +@click.argument("url", required=False) +@click.option("--example", is_flag=True, help="Show usage examples") +@click.option("--browser-config", "-B", type=click.Path(exists=True), help="Browser config file (YAML/JSON)") +@click.option("--crawler-config", "-C", type=click.Path(exists=True), help="Crawler config file (YAML/JSON)") +@click.option("--filter-config", "-f", type=click.Path(exists=True), help="Content filter config file") +@click.option("--extraction-config", "-e", type=click.Path(exists=True), help="Extraction strategy config file") +@click.option("--json-extract", "-j", is_flag=False, flag_value="", default=None, help="Extract structured data using LLM with optional description") +@click.option("--schema", "-s", type=click.Path(exists=True), help="JSON schema for extraction") +@click.option("--browser", "-b", type=str, callback=parse_key_values, help="Browser parameters as key1=value1,key2=value2") +@click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2") +@click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "md", "markdown-fit", "md-fit"]), default="all") +@click.option("--bypass-cache", is_flag=True, default=True, help="Bypass cache when crawling") +@click.option("--question", "-q", help="Ask a question about the crawled content") +@click.option("--verbose", "-v", is_flag=True) +@click.option("--profile", "-p", help="Use a specific browser profile (by name)") +def default(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str, + extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict, + output: str, bypass_cache: bool, question: str, verbose: bool, profile: str): + """Crawl4AI CLI - Web content extraction tool + + Simple Usage: + crwl https://example.com + + Run with --example to see detailed usage examples. 
+ + Other commands: + crwl profiles - Manage browser profiles for identity-based crawling + crwl crawl - Crawl a website with advanced options + crwl cdp - Launch browser with CDP debugging enabled + crwl browser - Manage builtin browser (start, stop, status, restart) + crwl config - Manage global configuration settings + crwl examples - Show more usage examples + + Configuration Examples: + crwl config list - List all configuration settings + crwl config get DEFAULT_LLM_PROVIDER - Show current LLM provider + crwl config set VERBOSE true - Enable verbose mode globally + crwl config set BROWSER_HEADLESS false - Default to visible browser + """ + + if example: + show_examples() + return + + if not url: + # Show help without error message + ctx = click.get_current_context() + click.echo(ctx.get_help()) + return + + # Forward to crawl command + ctx = click.get_current_context() + ctx.invoke( + crawl_cmd, + url=url, + browser_config=browser_config, + crawler_config=crawler_config, + filter_config=filter_config, + extraction_config=extraction_config, + json_extract=json_extract, + schema=schema, + browser=browser, + crawler=crawler, + output=output, + bypass_cache=bypass_cache, + question=question, + verbose=verbose, + profile=profile + ) + +def main(): + import sys + if len(sys.argv) < 2 or sys.argv[1] not in cli.commands: + sys.argv.insert(1, "crawl") + cli() + +if __name__ == "__main__": + main() +``` + + +## File: crawl4ai/extraction_strategy.py + +```py +from abc import ABC, abstractmethod +import inspect +from typing import Any, List, Dict, Optional +from concurrent.futures import ThreadPoolExecutor, as_completed +import json +import time + +from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH, PROMPT_EXTRACT_INFERRED_SCHEMA +from .config import ( + DEFAULT_PROVIDER, + DEFAULT_PROVIDER_API_KEY, + CHUNK_TOKEN_THRESHOLD, + OVERLAP_RATE, + WORD_TOKEN_RATE, +) +from .utils import * # noqa: F403 + +from .utils import ( + sanitize_html, + escape_json_string, + perform_completion_with_backoff, + extract_xml_data, + split_and_parse_json_objects, + sanitize_input_encode, + merge_chunks, +) +from .models import * # noqa: F403 + +from .models import TokenUsage + +from .model_loader import * # noqa: F403 +from .model_loader import ( + get_device, + load_HF_embedding_model, + load_text_multilabel_classifier, + calculate_batch_size +) + +from .types import LLMConfig, create_llm_config + +from functools import partial +import numpy as np +import re +from bs4 import BeautifulSoup +from lxml import html, etree + + +class ExtractionStrategy(ABC): + """ + Abstract base class for all extraction strategies. + """ + + def __init__(self, input_format: str = "markdown", **kwargs): + """ + Initialize the extraction strategy. + + Args: + input_format: Content format to use for extraction. + Options: "markdown" (default), "html", "fit_markdown" + **kwargs: Additional keyword arguments + """ + self.input_format = input_format + self.DEL = "<|DEL|>" + self.name = self.__class__.__name__ + self.verbose = kwargs.get("verbose", False) + + @abstractmethod + def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]: + """ + Extract meaningful blocks or chunks from the given HTML. + + :param url: The URL of the webpage. + :param html: The HTML content of the webpage. + :return: A list of extracted blocks or chunks. 
+ """ + pass + + def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: + """ + Process sections of text in parallel by default. + + :param url: The URL of the webpage. + :param sections: List of sections (strings) to process. + :return: A list of processed JSON blocks. + """ + extracted_content = [] + with ThreadPoolExecutor() as executor: + futures = [ + executor.submit(self.extract, url, section, **kwargs) + for section in sections + ] + for future in as_completed(futures): + extracted_content.extend(future.result()) + return extracted_content + + +class NoExtractionStrategy(ExtractionStrategy): + """ + A strategy that does not extract any meaningful content from the HTML. It simply returns the entire HTML as a single block. + """ + + def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]: + """ + Extract meaningful blocks or chunks from the given HTML. + """ + return [{"index": 0, "content": html}] + + def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: + return [ + {"index": i, "tags": [], "content": section} + for i, section in enumerate(sections) + ] + + +####################################################### +# Strategies using clustering for text data extraction # +####################################################### + + +class CosineStrategy(ExtractionStrategy): + """ + Extract meaningful blocks or chunks from the given HTML using cosine similarity. + + How it works: + 1. Pre-filter documents using embeddings and semantic_filter. + 2. Perform clustering using cosine similarity. + 3. Organize texts by their cluster labels, retaining order. + 4. Filter clusters by word count. + 5. Extract meaningful blocks or chunks from the filtered clusters. + + Attributes: + semantic_filter (str): A keyword filter for document filtering. + word_count_threshold (int): Minimum number of words per cluster. + max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters. + linkage_method (str): The linkage method for hierarchical clustering. + top_k (int): Number of top categories to extract. + model_name (str): The name of the sentence-transformers model. + sim_threshold (float): The similarity threshold for clustering. + """ + + def __init__( + self, + semantic_filter=None, + word_count_threshold=10, + max_dist=0.2, + linkage_method="ward", + top_k=3, + model_name="sentence-transformers/all-MiniLM-L6-v2", + sim_threshold=0.3, + **kwargs, + ): + """ + Initialize the strategy with clustering parameters. + + Args: + semantic_filter (str): A keyword filter for document filtering. + word_count_threshold (int): Minimum number of words per cluster. + max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters. + linkage_method (str): The linkage method for hierarchical clustering. + top_k (int): Number of top categories to extract. 
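+            model_name (str): The name of the sentence-transformers model to load.
+            sim_threshold (float): The similarity threshold used when pre-filtering documents with semantic_filter.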
+ """ + super().__init__(**kwargs) + + import numpy as np + + self.semantic_filter = semantic_filter + self.word_count_threshold = word_count_threshold + self.max_dist = max_dist + self.linkage_method = linkage_method + self.top_k = top_k + self.sim_threshold = sim_threshold + self.timer = time.time() + self.verbose = kwargs.get("verbose", False) + + self.buffer_embeddings = np.array([]) + self.get_embedding_method = "direct" + + self.device = get_device() + # import torch + # self.device = torch.device('cpu') + + self.default_batch_size = calculate_batch_size(self.device) + + if self.verbose: + print(f"[LOG] Loading Extraction Model for {self.device.type} device.") + + # if False and self.device.type == "cpu": + # self.model = load_onnx_all_MiniLM_l6_v2() + # self.tokenizer = self.model.tokenizer + # self.get_embedding_method = "direct" + # else: + + self.tokenizer, self.model = load_HF_embedding_model(model_name) + self.model.to(self.device) + self.model.eval() + + self.get_embedding_method = "batch" + + self.buffer_embeddings = np.array([]) + + # if model_name == "bert-base-uncased": + # self.tokenizer, self.model = load_bert_base_uncased() + # self.model.eval() # Ensure the model is in evaluation mode + # self.get_embedding_method = "batch" + # elif model_name == "BAAI/bge-small-en-v1.5": + # self.tokenizer, self.model = load_bge_small_en_v1_5() + # self.model.eval() # Ensure the model is in evaluation mode + # self.get_embedding_method = "batch" + # elif model_name == "sentence-transformers/all-MiniLM-L6-v2": + # self.model = load_onnx_all_MiniLM_l6_v2() + # self.tokenizer = self.model.tokenizer + # self.get_embedding_method = "direct" + + if self.verbose: + print(f"[LOG] Loading Multilabel Classifier for {self.device.type} device.") + + self.nlp, _ = load_text_multilabel_classifier() + # self.default_batch_size = 16 if self.device.type == 'cpu' else 64 + + if self.verbose: + print( + f"[LOG] Model loaded {model_name}, models/reuters, took " + + str(time.time() - self.timer) + + " seconds" + ) + + def filter_documents_embeddings( + self, documents: List[str], semantic_filter: str, at_least_k: int = 20 + ) -> List[str]: + """ + Filter and sort documents based on the cosine similarity of their embeddings with the semantic_filter embedding. + + Args: + documents (List[str]): A list of document texts. + semantic_filter (str): A keyword filter for document filtering. + at_least_k (int): The minimum number of documents to return. + + Returns: + List[str]: A list of filtered and sorted document texts. 
+ """ + + if not semantic_filter: + return documents + + if len(documents) < at_least_k: + at_least_k = len(documents) // 2 + + from sklearn.metrics.pairwise import cosine_similarity + + # Compute embedding for the keyword filter + query_embedding = self.get_embeddings([semantic_filter])[0] + + # Compute embeddings for the documents + document_embeddings = self.get_embeddings(documents) + + # Calculate cosine similarity between the query embedding and document embeddings + similarities = cosine_similarity( + [query_embedding], document_embeddings + ).flatten() + + # Filter documents based on the similarity threshold + filtered_docs = [ + (doc, sim) + for doc, sim in zip(documents, similarities) + if sim >= self.sim_threshold + ] + + # If the number of filtered documents is less than at_least_k, sort remaining documents by similarity + if len(filtered_docs) < at_least_k: + remaining_docs = [ + (doc, sim) + for doc, sim in zip(documents, similarities) + if sim < self.sim_threshold + ] + remaining_docs.sort(key=lambda x: x[1], reverse=True) + filtered_docs.extend(remaining_docs[: at_least_k - len(filtered_docs)]) + + # Extract the document texts from the tuples + filtered_docs = [doc for doc, _ in filtered_docs] + + return filtered_docs[:at_least_k] + + def get_embeddings( + self, sentences: List[str], batch_size=None, bypass_buffer=False + ): + """ + Get BERT embeddings for a list of sentences. + + Args: + sentences (List[str]): A list of text chunks (sentences). + + Returns: + NumPy array of embeddings. + """ + # if self.buffer_embeddings.any() and not bypass_buffer: + # return self.buffer_embeddings + + if self.device.type in ["cpu", "gpu", "cuda", "mps"]: + import torch + + # Tokenize sentences and convert to tensor + if batch_size is None: + batch_size = self.default_batch_size + + all_embeddings = [] + for i in range(0, len(sentences), batch_size): + batch_sentences = sentences[i : i + batch_size] + encoded_input = self.tokenizer( + batch_sentences, padding=True, truncation=True, return_tensors="pt" + ) + encoded_input = { + key: tensor.to(self.device) for key, tensor in encoded_input.items() + } + + # Ensure no gradients are calculated + with torch.no_grad(): + model_output = self.model(**encoded_input) + + # Get embeddings from the last hidden state (mean pooling) + embeddings = model_output.last_hidden_state.mean(dim=1).cpu().numpy() + all_embeddings.append(embeddings) + + self.buffer_embeddings = np.vstack(all_embeddings) + elif self.device.type == "cpu": + # self.buffer_embeddings = self.model(sentences) + if batch_size is None: + batch_size = self.default_batch_size + + all_embeddings = [] + for i in range(0, len(sentences), batch_size): + batch_sentences = sentences[i : i + batch_size] + embeddings = self.model(batch_sentences) + all_embeddings.append(embeddings) + + self.buffer_embeddings = np.vstack(all_embeddings) + return self.buffer_embeddings + + def hierarchical_clustering(self, sentences: List[str], embeddings=None): + """ + Perform hierarchical clustering on sentences and return cluster labels. + + Args: + sentences (List[str]): A list of text chunks (sentences). + + Returns: + NumPy array of cluster labels. 
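+
+        Note: any precomputed `embeddings` passed in are currently ignored; embeddings are recomputed from `sentences`.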
+ """ + # Get embeddings + from scipy.cluster.hierarchy import linkage, fcluster + from scipy.spatial.distance import pdist + + self.timer = time.time() + embeddings = self.get_embeddings(sentences, bypass_buffer=True) + # print(f"[LOG] 🚀 Embeddings computed in {time.time() - self.timer:.2f} seconds") + # Compute pairwise cosine distances + distance_matrix = pdist(embeddings, "cosine") + # Perform agglomerative clustering respecting order + linked = linkage(distance_matrix, method=self.linkage_method) + # Form flat clusters + labels = fcluster(linked, self.max_dist, criterion="distance") + return labels + + def filter_clusters_by_word_count( + self, clusters: Dict[int, List[str]] + ) -> Dict[int, List[str]]: + """ + Filter clusters to remove those with a word count below the threshold. + + Args: + clusters (Dict[int, List[str]]): Dictionary of clusters. + + Returns: + Dict[int, List[str]]: Filtered dictionary of clusters. + """ + filtered_clusters = {} + for cluster_id, texts in clusters.items(): + # Concatenate texts for analysis + full_text = " ".join(texts) + # Count words + word_count = len(full_text.split()) + + # Keep clusters with word count above the threshold + if word_count >= self.word_count_threshold: + filtered_clusters[cluster_id] = texts + + return filtered_clusters + + def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]: + """ + Extract clusters from HTML content using hierarchical clustering. + + Args: + url (str): The URL of the webpage. + html (str): The HTML content of the webpage. + + Returns: + List[Dict[str, Any]]: A list of processed JSON blocks. + """ + # Assume `html` is a list of text chunks for this strategy + t = time.time() + text_chunks = html.split(self.DEL) # Split by lines or paragraphs as needed + + # Pre-filter documents using embeddings and semantic_filter + text_chunks = self.filter_documents_embeddings( + text_chunks, self.semantic_filter + ) + + if not text_chunks: + return [] + + # Perform clustering + labels = self.hierarchical_clustering(text_chunks) + # print(f"[LOG] 🚀 Clustering done in {time.time() - t:.2f} seconds") + + # Organize texts by their cluster labels, retaining order + t = time.time() + clusters = {} + for index, label in enumerate(labels): + clusters.setdefault(label, []).append(text_chunks[index]) + + # Filter clusters by word count + filtered_clusters = self.filter_clusters_by_word_count(clusters) + + # Convert filtered clusters to a sorted list of dictionaries + cluster_list = [ + {"index": int(idx), "tags": [], "content": " ".join(filtered_clusters[idx])} + for idx in sorted(filtered_clusters) + ] + + if self.verbose: + print(f"[LOG] 🚀 Assign tags using {self.device}") + + if self.device.type in ["gpu", "cuda", "mps", "cpu"]: + labels = self.nlp([cluster["content"] for cluster in cluster_list]) + + for cluster, label in zip(cluster_list, labels): + cluster["tags"] = label + # elif self.device.type == "cpu": + # # Process the text with the loaded model + # texts = [cluster['content'] for cluster in cluster_list] + # # Batch process texts + # docs = self.nlp.pipe(texts, disable=["tagger", "parser", "ner", "lemmatizer"]) + + # for doc, cluster in zip(docs, cluster_list): + # tok_k = self.top_k + # top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k] + # cluster['tags'] = [cat for cat, _ in top_categories] + + # for cluster in cluster_list: + # doc = self.nlp(cluster['content']) + # tok_k = self.top_k + # top_categories = sorted(doc.cats.items(), key=lambda x: x[1], 
reverse=True)[:tok_k] + # cluster['tags'] = [cat for cat, _ in top_categories] + + if self.verbose: + print(f"[LOG] 🚀 Categorization done in {time.time() - t:.2f} seconds") + + return cluster_list + + def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: + """ + Process sections using hierarchical clustering. + + Args: + url (str): The URL of the webpage. + sections (List[str]): List of sections (strings) to process. + + Returns: + """ + # This strategy processes all sections together + + return self.extract(url, self.DEL.join(sections), **kwargs) + + +####################################################### +# Strategies using LLM-based extraction for text data # +####################################################### +class LLMExtractionStrategy(ExtractionStrategy): + """ + A strategy that uses an LLM to extract meaningful content from the HTML. + + Attributes: + llm_config: The LLM configuration object. + instruction: The instruction to use for the LLM model. + schema: Pydantic model schema for structured data. + extraction_type: "block" or "schema". + chunk_token_threshold: Maximum tokens per chunk. + overlap_rate: Overlap between chunks. + word_token_rate: Word to token conversion rate. + apply_chunking: Whether to apply chunking. + verbose: Whether to print verbose output. + usages: List of individual token usages. + total_usage: Accumulated token usage. + """ + _UNWANTED_PROPS = { + 'provider' : 'Instead, use llm_config=LLMConfig(provider="...")', + 'api_token' : 'Instead, use llm_config=LlMConfig(api_token="...")', + 'base_url' : 'Instead, use llm_config=LLMConfig(base_url="...")', + 'api_base' : 'Instead, use llm_config=LLMConfig(base_url="...")', + } + def __init__( + self, + llm_config: 'LLMConfig' = None, + instruction: str = None, + schema: Dict = None, + extraction_type="block", + chunk_token_threshold=CHUNK_TOKEN_THRESHOLD, + overlap_rate=OVERLAP_RATE, + word_token_rate=WORD_TOKEN_RATE, + apply_chunking=True, + input_format: str = "markdown", + force_json_response=False, + verbose=False, + # Deprecated arguments + provider: str = DEFAULT_PROVIDER, + api_token: Optional[str] = None, + base_url: str = None, + api_base: str = None, + **kwargs, + ): + """ + Initialize the strategy with clustering parameters. + + Args: + llm_config: The LLM configuration object. + instruction: The instruction to use for the LLM model. + schema: Pydantic model schema for structured data. + extraction_type: "block" or "schema". + chunk_token_threshold: Maximum tokens per chunk. + overlap_rate: Overlap between chunks. + word_token_rate: Word to token conversion rate. + apply_chunking: Whether to apply chunking. + input_format: Content format to use for extraction. + Options: "markdown" (default), "html", "fit_markdown" + force_json_response: Whether to force a JSON response from the LLM. + verbose: Whether to print verbose output. + + # Deprecated arguments, will be removed very soon + provider: The provider to use for extraction. It follows the format /, e.g., "ollama/llama3.3". + api_token: The API token for the provider. + base_url: The base URL for the API request. + api_base: The base URL for the API request. + extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc. 
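        Example (illustrative sketch; the provider string, model name, and environment
        variable are assumptions, and LLMConfig / os are assumed to be imported):

            strategy = LLMExtractionStrategy(
                llm_config=LLMConfig(
                    provider="openai/gpt-4o-mini",
                    api_token=os.getenv("OPENAI_API_KEY"),
                ),
                instruction="Extract every product name and price",
                extraction_type="block",
                input_format="markdown",
            )
            # For structured output, pass schema=YourPydanticModel.model_json_schema()
            # and extraction_type="schema" instead of a free-form block instruction.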
+ """ + super().__init__( input_format=input_format, **kwargs) + self.llm_config = llm_config + if not self.llm_config: + self.llm_config = create_llm_config( + provider=DEFAULT_PROVIDER, + api_token=os.environ.get(DEFAULT_PROVIDER_API_KEY), + ) + self.instruction = instruction + self.extract_type = extraction_type + self.schema = schema + if schema: + self.extract_type = "schema" + self.force_json_response = force_json_response + self.chunk_token_threshold = chunk_token_threshold or CHUNK_TOKEN_THRESHOLD + self.overlap_rate = overlap_rate + self.word_token_rate = word_token_rate + self.apply_chunking = apply_chunking + self.extra_args = kwargs.get("extra_args", {}) + if not self.apply_chunking: + self.chunk_token_threshold = 1e9 + self.verbose = verbose + self.usages = [] # Store individual usages + self.total_usage = TokenUsage() # Accumulated usage + + self.provider = provider + self.api_token = api_token + self.base_url = base_url + self.api_base = api_base + + + def __setattr__(self, name, value): + """Handle attribute setting.""" + # TODO: Planning to set properties dynamically based on the __init__ signature + sig = inspect.signature(self.__init__) + all_params = sig.parameters # Dictionary of parameter names and their details + + if name in self._UNWANTED_PROPS and value is not all_params[name].default: + raise AttributeError(f"Setting '{name}' is deprecated. {self._UNWANTED_PROPS[name]}") + + super().__setattr__(name, value) + + def extract(self, url: str, ix: int, html: str) -> List[Dict[str, Any]]: + """ + Extract meaningful blocks or chunks from the given HTML using an LLM. + + How it works: + 1. Construct a prompt with variables. + 2. Make a request to the LLM using the prompt. + 3. Parse the response and extract blocks or chunks. + + Args: + url: The URL of the webpage. + ix: Index of the block. + html: The HTML content of the webpage. + + Returns: + A list of extracted blocks or chunks. 
+ """ + if self.verbose: + # print("[LOG] Extracting blocks from URL:", url) + print(f"[LOG] Call LLM for {url} - block index: {ix}") + + variable_values = { + "URL": url, + "HTML": escape_json_string(sanitize_html(html)), + } + + prompt_with_variables = PROMPT_EXTRACT_BLOCKS + if self.instruction: + variable_values["REQUEST"] = self.instruction + prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION + + if self.extract_type == "schema" and self.schema: + variable_values["SCHEMA"] = json.dumps(self.schema, indent=2) # if type of self.schema is dict else self.schema + prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION + + if self.extract_type == "schema" and not self.schema: + prompt_with_variables = PROMPT_EXTRACT_INFERRED_SCHEMA + + for variable in variable_values: + prompt_with_variables = prompt_with_variables.replace( + "{" + variable + "}", variable_values[variable] + ) + + try: + response = perform_completion_with_backoff( + self.llm_config.provider, + prompt_with_variables, + self.llm_config.api_token, + base_url=self.llm_config.base_url, + json_response=self.force_json_response, + extra_args=self.extra_args, + ) # , json_response=self.extract_type == "schema") + # Track usage + usage = TokenUsage( + completion_tokens=response.usage.completion_tokens, + prompt_tokens=response.usage.prompt_tokens, + total_tokens=response.usage.total_tokens, + completion_tokens_details=response.usage.completion_tokens_details.__dict__ + if response.usage.completion_tokens_details + else {}, + prompt_tokens_details=response.usage.prompt_tokens_details.__dict__ + if response.usage.prompt_tokens_details + else {}, + ) + self.usages.append(usage) + + # Update totals + self.total_usage.completion_tokens += usage.completion_tokens + self.total_usage.prompt_tokens += usage.prompt_tokens + self.total_usage.total_tokens += usage.total_tokens + + try: + response = response.choices[0].message.content + blocks = None + + if self.force_json_response: + blocks = json.loads(response) + if isinstance(blocks, dict): + # If it has only one key which calue is list then assign that to blocks, exampled: {"news": [..]} + if len(blocks) == 1 and isinstance(list(blocks.values())[0], list): + blocks = list(blocks.values())[0] + else: + # If it has only one key which value is not list then assign that to blocks, exampled: { "article_id": "1234", ... } + blocks = [blocks] + elif isinstance(blocks, list): + # If it is a list then assign that to blocks + blocks = blocks + else: + # blocks = extract_xml_data(["blocks"], response.choices[0].message.content)["blocks"] + blocks = extract_xml_data(["blocks"], response)["blocks"] + blocks = json.loads(blocks) + + for block in blocks: + block["error"] = False + except Exception: + parsed, unparsed = split_and_parse_json_objects( + response.choices[0].message.content + ) + blocks = parsed + if unparsed: + blocks.append( + {"index": 0, "error": True, "tags": ["error"], "content": unparsed} + ) + + if self.verbose: + print( + "[LOG] Extracted", + len(blocks), + "blocks from URL:", + url, + "block index:", + ix, + ) + return blocks + except Exception as e: + if self.verbose: + print(f"[LOG] Error in LLM extraction: {e}") + # Add error information to extracted_content + return [ + { + "index": ix, + "error": True, + "tags": ["error"], + "content": str(e), + } + ] + + def _merge(self, documents, chunk_token_threshold, overlap) -> List[str]: + """ + Merge documents into sections based on chunk_token_threshold and overlap. 
+ """ + sections = merge_chunks( + docs = documents, + target_size= chunk_token_threshold, + overlap=overlap, + word_token_ratio=self.word_token_rate + ) + return sections + + def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]: + """ + Process sections sequentially with a delay for rate limiting issues, specifically for LLMExtractionStrategy. + + Args: + url: The URL of the webpage. + sections: List of sections (strings) to process. + + Returns: + A list of extracted blocks or chunks. + """ + + merged_sections = self._merge( + sections, + self.chunk_token_threshold, + overlap=int(self.chunk_token_threshold * self.overlap_rate), + ) + extracted_content = [] + if self.llm_config.provider.startswith("groq/"): + # Sequential processing with a delay + for ix, section in enumerate(merged_sections): + extract_func = partial(self.extract, url) + extracted_content.extend( + extract_func(ix, sanitize_input_encode(section)) + ) + time.sleep(0.5) # 500 ms delay between each processing + else: + # Parallel processing using ThreadPoolExecutor + # extract_func = partial(self.extract, url) + # for ix, section in enumerate(merged_sections): + # extracted_content.append(extract_func(ix, section)) + + with ThreadPoolExecutor(max_workers=4) as executor: + extract_func = partial(self.extract, url) + futures = [ + executor.submit(extract_func, ix, sanitize_input_encode(section)) + for ix, section in enumerate(merged_sections) + ] + + for future in as_completed(futures): + try: + extracted_content.extend(future.result()) + except Exception as e: + if self.verbose: + print(f"Error in thread execution: {e}") + # Add error information to extracted_content + extracted_content.append( + { + "index": 0, + "error": True, + "tags": ["error"], + "content": str(e), + } + ) + + return extracted_content + + def show_usage(self) -> None: + """Print a detailed token usage report showing total and per-request usage.""" + print("\n=== Token Usage Summary ===") + print(f"{'Type':<15} {'Count':>12}") + print("-" * 30) + print(f"{'Completion':<15} {self.total_usage.completion_tokens:>12,}") + print(f"{'Prompt':<15} {self.total_usage.prompt_tokens:>12,}") + print(f"{'Total':<15} {self.total_usage.total_tokens:>12,}") + + print("\n=== Usage History ===") + print(f"{'Request #':<10} {'Completion':>12} {'Prompt':>12} {'Total':>12}") + print("-" * 48) + for i, usage in enumerate(self.usages, 1): + print( + f"{i:<10} {usage.completion_tokens:>12,} {usage.prompt_tokens:>12,} {usage.total_tokens:>12,}" + ) + + +####################################################### +# New extraction strategies for JSON-based extraction # +####################################################### +class JsonElementExtractionStrategy(ExtractionStrategy): + """ + Abstract base class for extracting structured JSON from HTML content. + + How it works: + 1. Parses HTML content using the `_parse_html` method. + 2. Uses a schema to define base selectors, fields, and transformations. + 3. Extracts data hierarchically, supporting nested fields and lists. + 4. Handles computed fields with expressions or functions. + + Attributes: + DEL (str): Delimiter used to combine HTML sections. Defaults to '\n'. + schema (Dict[str, Any]): The schema defining the extraction rules. + verbose (bool): Enables verbose logging for debugging purposes. + + Methods: + extract(url, html_content, *q, **kwargs): Extracts structured data from HTML content. + _extract_item(element, fields): Extracts fields from a single element. 
+ _extract_single_field(element, field): Extracts a single field based on its type. + _apply_transform(value, transform): Applies a transformation to a value. + _compute_field(item, field): Computes a field value using an expression or function. + run(url, sections, *q, **kwargs): Combines HTML sections and runs the extraction strategy. + + Abstract Methods: + _parse_html(html_content): Parses raw HTML into a structured format (e.g., BeautifulSoup or lxml). + _get_base_elements(parsed_html, selector): Retrieves base elements using a selector. + _get_elements(element, selector): Retrieves child elements using a selector. + _get_element_text(element): Extracts text content from an element. + _get_element_html(element): Extracts raw HTML from an element. + _get_element_attribute(element, attribute): Extracts an attribute's value from an element. + """ + + DEL = "\n" + + def __init__(self, schema: Dict[str, Any], **kwargs): + """ + Initialize the JSON element extraction strategy with a schema. + + Args: + schema (Dict[str, Any]): The schema defining the extraction rules. + """ + super().__init__(**kwargs) + self.schema = schema + self.verbose = kwargs.get("verbose", False) + + def extract( + self, url: str, html_content: str, *q, **kwargs + ) -> List[Dict[str, Any]]: + """ + Extract structured data from HTML content. + + How it works: + 1. Parses the HTML content using the `_parse_html` method. + 2. Identifies base elements using the schema's base selector. + 3. Extracts fields from each base element using `_extract_item`. + + Args: + url (str): The URL of the page being processed. + html_content (str): The raw HTML content to parse and extract. + *q: Additional positional arguments. + **kwargs: Additional keyword arguments for custom extraction. + + Returns: + List[Dict[str, Any]]: A list of extracted items, each represented as a dictionary. 
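        Example schema (illustrative sketch; selectors and field names are made up):

            {
                "name": "Products",
                "baseSelector": "div.product",
                "baseFields": [
                    {"name": "data_id", "type": "attribute", "attribute": "data-id"}
                ],
                "fields": [
                    {"name": "title", "type": "text", "selector": "h2"},
                    {"name": "price", "type": "text", "selector": ".price", "default": "N/A"},
                    {"name": "url", "type": "attribute", "selector": "a", "attribute": "href"}
                ]
            }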
+ """ + + parsed_html = self._parse_html(html_content) + base_elements = self._get_base_elements( + parsed_html, self.schema["baseSelector"] + ) + + results = [] + for element in base_elements: + # Extract base element attributes + item = {} + if "baseFields" in self.schema: + for field in self.schema["baseFields"]: + value = self._extract_single_field(element, field) + if value is not None: + item[field["name"]] = value + + # Extract child fields + field_data = self._extract_item(element, self.schema["fields"]) + item.update(field_data) + + if item: + results.append(item) + + return results + + @abstractmethod + def _parse_html(self, html_content: str): + """Parse HTML content into appropriate format""" + pass + + @abstractmethod + def _get_base_elements(self, parsed_html, selector: str): + """Get all base elements using the selector""" + pass + + @abstractmethod + def _get_elements(self, element, selector: str): + """Get child elements using the selector""" + pass + + def _extract_field(self, element, field): + try: + if field["type"] == "nested": + nested_elements = self._get_elements(element, field["selector"]) + nested_element = nested_elements[0] if nested_elements else None + return ( + self._extract_item(nested_element, field["fields"]) + if nested_element + else {} + ) + + if field["type"] == "list": + elements = self._get_elements(element, field["selector"]) + return [self._extract_list_item(el, field["fields"]) for el in elements] + + if field["type"] == "nested_list": + elements = self._get_elements(element, field["selector"]) + return [self._extract_item(el, field["fields"]) for el in elements] + + return self._extract_single_field(element, field) + except Exception as e: + if self.verbose: + print(f"Error extracting field {field['name']}: {str(e)}") + return field.get("default") + + def _extract_single_field(self, element, field): + """ + Extract a single field based on its type. + + How it works: + 1. Selects the target element using the field's selector. + 2. Extracts the field value based on its type (e.g., text, attribute, regex). + 3. Applies transformations if defined in the schema. + + Args: + element: The base element to extract the field from. + field (Dict[str, Any]): The field definition in the schema. + + Returns: + Any: The extracted field value. + """ + + if "selector" in field: + selected = self._get_elements(element, field["selector"]) + if not selected: + return field.get("default") + selected = selected[0] + else: + selected = element + + value = None + if field["type"] == "text": + value = self._get_element_text(selected) + elif field["type"] == "attribute": + value = self._get_element_attribute(selected, field["attribute"]) + elif field["type"] == "html": + value = self._get_element_html(selected) + elif field["type"] == "regex": + text = self._get_element_text(selected) + match = re.search(field["pattern"], text) + value = match.group(1) if match else None + + if "transform" in field: + value = self._apply_transform(value, field["transform"]) + + return value if value is not None else field.get("default") + + def _extract_list_item(self, element, fields): + item = {} + for field in fields: + value = self._extract_single_field(element, field) + if value is not None: + item[field["name"]] = value + return item + + def _extract_item(self, element, fields): + """ + Extracts fields from a given element. + + How it works: + 1. Iterates through the fields defined in the schema. + 2. Handles computed, single, and nested field types. + 3. 
Updates the item dictionary with extracted field values. + + Args: + element: The base element to extract fields from. + fields (List[Dict[str, Any]]): The list of fields to extract. + + Returns: + Dict[str, Any]: A dictionary representing the extracted item. + """ + + item = {} + for field in fields: + if field["type"] == "computed": + value = self._compute_field(item, field) + else: + value = self._extract_field(element, field) + if value is not None: + item[field["name"]] = value + return item + + def _apply_transform(self, value, transform): + """ + Apply a transformation to a value. + + How it works: + 1. Checks the transformation type (e.g., `lowercase`, `strip`). + 2. Applies the transformation to the value. + 3. Returns the transformed value. + + Args: + value (str): The value to transform. + transform (str): The type of transformation to apply. + + Returns: + str: The transformed value. + """ + + if transform == "lowercase": + return value.lower() + elif transform == "uppercase": + return value.upper() + elif transform == "strip": + return value.strip() + return value + + def _compute_field(self, item, field): + try: + if "expression" in field: + return eval(field["expression"], {}, item) + elif "function" in field: + return field["function"](item) + except Exception as e: + if self.verbose: + print(f"Error computing field {field['name']}: {str(e)}") + return field.get("default") + + def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: + """ + Run the extraction strategy on a combined HTML content. + + How it works: + 1. Combines multiple HTML sections using the `DEL` delimiter. + 2. Calls the `extract` method with the combined HTML. + + Args: + url (str): The URL of the page being processed. + sections (List[str]): A list of HTML sections. + *q: Additional positional arguments. + **kwargs: Additional keyword arguments for custom extraction. + + Returns: + List[Dict[str, Any]]: A list of extracted items. + """ + + combined_html = self.DEL.join(sections) + return self.extract(url, combined_html, **kwargs) + + @abstractmethod + def _get_element_text(self, element) -> str: + """Get text content from element""" + pass + + @abstractmethod + def _get_element_html(self, element) -> str: + """Get HTML content from element""" + pass + + @abstractmethod + def _get_element_attribute(self, element, attribute: str): + """Get attribute value from element""" + pass + + _GENERATE_SCHEMA_UNWANTED_PROPS = { + 'provider': 'Instead, use llm_config=LLMConfig(provider="...")', + 'api_token': 'Instead, use llm_config=LlMConfig(api_token="...")', + } + + @staticmethod + def generate_schema( + html: str, + schema_type: str = "CSS", # or XPATH + query: str = None, + target_json_example: str = None, + llm_config: 'LLMConfig' = create_llm_config(), + provider: str = None, + api_token: str = None, + **kwargs + ) -> dict: + """ + Generate extraction schema from HTML content and optional query. + + Args: + html (str): The HTML content to analyze + query (str, optional): Natural language description of what data to extract + provider (str): Legacy Parameter. LLM provider to use + api_token (str): Legacy Parameter. 
API token for LLM provider + llm_config (LLMConfig): LLM configuration object + prompt (str, optional): Custom prompt template to use + **kwargs: Additional args passed to LLM processor + + Returns: + dict: Generated schema following the JsonElementExtractionStrategy format + """ + from .prompts import JSON_SCHEMA_BUILDER + from .utils import perform_completion_with_backoff + for name, message in JsonElementExtractionStrategy._GENERATE_SCHEMA_UNWANTED_PROPS.items(): + if locals()[name] is not None: + raise AttributeError(f"Setting '{name}' is deprecated. {message}") + + # Use default or custom prompt + prompt_template = JSON_SCHEMA_BUILDER if schema_type == "CSS" else JSON_SCHEMA_BUILDER_XPATH + + # Build the prompt + system_message = { + "role": "system", + "content": f"""You specialize in generating special JSON schemas for web scraping. This schema uses CSS or XPATH selectors to present a repetitive pattern in crawled HTML, such as a product in a product list or a search result item in a list of search results. We use this JSON schema to pass to a language model along with the HTML content to extract structured data from the HTML. The language model uses the JSON schema to extract data from the HTML and retrieve values for fields in the JSON schema, following the schema. + +Generating this HTML manually is not feasible, so you need to generate the JSON schema using the HTML content. The HTML copied from the crawled website is provided below, which we believe contains the repetitive pattern. + +# Schema main keys: +- name: This is the name of the schema. +- baseSelector: This is the CSS or XPATH selector that identifies the base element that contains all the repetitive patterns. +- baseFields: This is a list of fields that you extract from the base element itself. +- fields: This is a list of fields that you extract from the children of the base element. {{name, selector, type}} based on the type, you may have extra keys such as "attribute" when the type is "attribute". + +# Extra Context: +In this context, the following items may or may not be present: +- Example of target JSON object: This is a sample of the final JSON object that we hope to extract from the HTML using the schema you are generating. +- Extra Instructions: This is optional instructions to consider when generating the schema provided by the user. +- Query or explanation of target/goal data item: This is a description of what data we are trying to extract from the HTML. This explanation means we're not sure about the rigid schema of the structures we want, so we leave it to you to use your expertise to create the best and most comprehensive structures aimed at maximizing data extraction from this page. You must ensure that you do not pick up nuances that may exist on a particular page. The focus should be on the data we are extracting, and it must be valid, safe, and robust based on the given HTML. + +# What if there is no example of target JSON object and also no extra instructions or even no explanation of target/goal data item? +In this scenario, use your best judgment to generate the schema. You need to examine the content of the page and understand the data it provides. If the page contains repetitive data, such as lists of items, products, jobs, places, books, or movies, focus on one single item that repeats. If the page is a detailed page about one product or item, create a schema to extract the entire structured data. At this stage, you must think and decide for yourself. 
Try to maximize the number of fields that you can extract from the HTML. + +# What are the instructions and details for this schema generation? +{prompt_template}""" + } + + user_message = { + "role": "user", + "content": f""" + HTML to analyze: + ```html + {html} + ``` + """ + } + + if query: + user_message["content"] += f"\n\n## Query or explanation of target/goal data item:\n{query}" + if target_json_example: + user_message["content"] += f"\n\n## Example of target JSON object:\n```json\n{target_json_example}\n```" + + if query and not target_json_example: + user_message["content"] += """IMPORTANT: To remind you, in this process, we are not providing a rigid example of the adjacent objects we seek. We rely on your understanding of the explanation provided in the above section. Make sure to grasp what we are looking for and, based on that, create the best schema..""" + elif not query and target_json_example: + user_message["content"] += """IMPORTANT: Please remember that in this process, we provided a proper example of a target JSON object. Make sure to adhere to the structure and create a schema that exactly fits this example. If you find that some elements on the page do not match completely, vote for the majority.""" + elif not query and not target_json_example: + user_message["content"] += """IMPORTANT: Since we neither have a query nor an example, it is crucial to rely solely on the HTML content provided. Leverage your expertise to determine the schema based on the repetitive patterns observed in the content.""" + + user_message["content"] += """IMPORTANT: Ensure your schema remains reliable by avoiding selectors that appear to generate dynamically and are not dependable. You want a reliable schema, as it consistently returns the same data even after many page reloads. + + Analyze the HTML and generate a JSON schema that follows the specified format. Only output valid JSON schema, nothing else. + """ + + try: + # Call LLM with backoff handling + response = perform_completion_with_backoff( + provider=llm_config.provider, + prompt_with_variables="\n\n".join([system_message["content"], user_message["content"]]), + json_response = True, + api_token=llm_config.api_token, + base_url=llm_config.base_url, + extra_args=kwargs + ) + + # Extract and return schema + return json.loads(response.choices[0].message.content) + + except Exception as e: + raise Exception(f"Failed to generate schema: {str(e)}") + +class JsonCssExtractionStrategy(JsonElementExtractionStrategy): + """ + Concrete implementation of `JsonElementExtractionStrategy` using CSS selectors. + + How it works: + 1. Parses HTML content with BeautifulSoup. + 2. Selects elements using CSS selectors defined in the schema. + 3. Extracts field data and applies transformations as defined. + + Attributes: + schema (Dict[str, Any]): The schema defining the extraction rules. + verbose (bool): Enables verbose logging for debugging purposes. + + Methods: + _parse_html(html_content): Parses HTML content into a BeautifulSoup object. + _get_base_elements(parsed_html, selector): Selects base elements using a CSS selector. + _get_elements(element, selector): Selects child elements using a CSS selector. + _get_element_text(element): Extracts text content from a BeautifulSoup element. + _get_element_html(element): Extracts the raw HTML content of a BeautifulSoup element. + _get_element_attribute(element, attribute): Retrieves an attribute value from a BeautifulSoup element. 
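    Example (illustrative sketch; the schema, URL, and html_section variable are made up):

        strategy = JsonCssExtractionStrategy({
            "name": "Articles",
            "baseSelector": "article.post",
            "fields": [
                {"name": "headline", "type": "text", "selector": "h2"},
                {"name": "link", "type": "attribute", "selector": "a", "attribute": "href"},
            ],
        })
        items = strategy.run("https://example.com", [html_section])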
+ """ + + def __init__(self, schema: Dict[str, Any], **kwargs): + kwargs["input_format"] = "html" # Force HTML input + super().__init__(schema, **kwargs) + + def _parse_html(self, html_content: str): + # return BeautifulSoup(html_content, "html.parser") + return BeautifulSoup(html_content, "lxml") + + def _get_base_elements(self, parsed_html, selector: str): + return parsed_html.select(selector) + + def _get_elements(self, element, selector: str): + # Return all matching elements using select() instead of select_one() + # This ensures that we get all elements that match the selector, not just the first one + return element.select(selector) + + def _get_element_text(self, element) -> str: + return element.get_text(strip=True) + + def _get_element_html(self, element) -> str: + return str(element) + + def _get_element_attribute(self, element, attribute: str): + return element.get(attribute) + +class JsonLxmlExtractionStrategy(JsonElementExtractionStrategy): + def __init__(self, schema: Dict[str, Any], **kwargs): + kwargs["input_format"] = "html" + super().__init__(schema, **kwargs) + self._selector_cache = {} + self._xpath_cache = {} + self._result_cache = {} + + # Control selector optimization strategy + self.use_caching = kwargs.get("use_caching", True) + self.optimize_common_patterns = kwargs.get("optimize_common_patterns", True) + + # Load lxml dependencies once + from lxml import etree, html + from lxml.cssselect import CSSSelector + self.etree = etree + self.html_parser = html + self.CSSSelector = CSSSelector + + def _parse_html(self, html_content: str): + """Parse HTML content with error recovery""" + try: + parser = self.etree.HTMLParser(recover=True, remove_blank_text=True) + return self.etree.fromstring(html_content, parser) + except Exception as e: + if self.verbose: + print(f"Error parsing HTML, falling back to alternative method: {e}") + try: + return self.html_parser.fromstring(html_content) + except Exception as e2: + if self.verbose: + print(f"Critical error parsing HTML: {e2}") + # Create minimal document as fallback + return self.etree.Element("html") + + def _optimize_selector(self, selector_str): + """Optimize common selector patterns for better performance""" + if not self.optimize_common_patterns: + return selector_str + + # Handle td:nth-child(N) pattern which is very common in table scraping + import re + if re.search(r'td:nth-child\(\d+\)', selector_str): + return selector_str # Already handled specially in _apply_selector + + # Split complex selectors into parts for optimization + parts = selector_str.split() + if len(parts) <= 1: + return selector_str + + # For very long selectors, consider using just the last specific part + if len(parts) > 3 and any(p.startswith('.') or p.startswith('#') for p in parts): + specific_parts = [p for p in parts if p.startswith('.') or p.startswith('#')] + if specific_parts: + return specific_parts[-1] # Use most specific class/id selector + + return selector_str + + def _create_selector_function(self, selector_str): + """Create a selector function that handles all edge cases""" + original_selector = selector_str + + # Try to optimize the selector if appropriate + if self.optimize_common_patterns: + selector_str = self._optimize_selector(selector_str) + + try: + # Attempt to compile the CSS selector + compiled = self.CSSSelector(selector_str) + xpath = compiled.path + + # Store XPath for later use + self._xpath_cache[selector_str] = xpath + + # Create the wrapper function that implements the selection strategy + def 
selector_func(element, context_sensitive=True): + cache_key = None + + # Use result caching if enabled + if self.use_caching: + # Create a cache key based on element and selector + element_id = element.get('id', '') or str(hash(element)) + cache_key = f"{element_id}::{selector_str}" + + if cache_key in self._result_cache: + return self._result_cache[cache_key] + + results = [] + try: + # Strategy 1: Direct CSS selector application (fastest) + results = compiled(element) + + # If that fails and we need context sensitivity + if not results and context_sensitive: + # Strategy 2: Try XPath with context adjustment + context_xpath = self._make_context_sensitive_xpath(xpath, element) + if context_xpath: + results = element.xpath(context_xpath) + + # Strategy 3: Handle special case - nth-child + if not results and 'nth-child' in original_selector: + results = self._handle_nth_child_selector(element, original_selector) + + # Strategy 4: Direct descendant search for class/ID selectors + if not results: + results = self._fallback_class_id_search(element, original_selector) + + # Strategy 5: Last resort - tag name search for the final part + if not results: + parts = original_selector.split() + if parts: + last_part = parts[-1] + # Extract tag name from the selector + tag_match = re.match(r'^(\w+)', last_part) + if tag_match: + tag_name = tag_match.group(1) + results = element.xpath(f".//{tag_name}") + + # Cache results if caching is enabled + if self.use_caching and cache_key: + self._result_cache[cache_key] = results + + except Exception as e: + if self.verbose: + print(f"Error applying selector '{selector_str}': {e}") + + return results + + return selector_func + + except Exception as e: + if self.verbose: + print(f"Error compiling selector '{selector_str}': {e}") + + # Fallback function for invalid selectors + return lambda element, context_sensitive=True: [] + + def _make_context_sensitive_xpath(self, xpath, element): + """Convert absolute XPath to context-sensitive XPath""" + try: + # If starts with descendant-or-self, it's already context-sensitive + if xpath.startswith('descendant-or-self::'): + return xpath + + # Remove leading slash if present + if xpath.startswith('/'): + context_xpath = f".{xpath}" + else: + context_xpath = f".//{xpath}" + + # Validate the XPath by trying it + try: + element.xpath(context_xpath) + return context_xpath + except: + # If that fails, try a simpler descendant search + return f".//{xpath.split('/')[-1]}" + except: + return None + + def _handle_nth_child_selector(self, element, selector_str): + """Special handling for nth-child selectors in tables""" + import re + results = [] + + try: + # Extract the column number from td:nth-child(N) + match = re.search(r'td:nth-child\((\d+)\)', selector_str) + if match: + col_num = match.group(1) + + # Check if there's content after the nth-child part + remaining_selector = selector_str.split(f"td:nth-child({col_num})", 1)[-1].strip() + + if remaining_selector: + # If there's a specific element we're looking for after the column + # Extract any tag names from the remaining selector + tag_match = re.search(r'(\w+)', remaining_selector) + tag_name = tag_match.group(1) if tag_match else '*' + results = element.xpath(f".//td[{col_num}]//{tag_name}") + else: + # Just get the column cell + results = element.xpath(f".//td[{col_num}]") + except Exception as e: + if self.verbose: + print(f"Error handling nth-child selector: {e}") + + return results + + def _fallback_class_id_search(self, element, selector_str): + """Fallback to search 
by class or ID""" + results = [] + + try: + # Extract class selectors (.classname) + import re + class_matches = re.findall(r'\.([a-zA-Z0-9_-]+)', selector_str) + + # Extract ID selectors (#idname) + id_matches = re.findall(r'#([a-zA-Z0-9_-]+)', selector_str) + + # Try each class + for class_name in class_matches: + class_results = element.xpath(f".//*[contains(@class, '{class_name}')]") + results.extend(class_results) + + # Try each ID (usually more specific) + for id_name in id_matches: + id_results = element.xpath(f".//*[@id='{id_name}']") + results.extend(id_results) + except Exception as e: + if self.verbose: + print(f"Error in fallback class/id search: {e}") + + return results + + def _get_selector(self, selector_str): + """Get or create a selector function with caching""" + if selector_str not in self._selector_cache: + self._selector_cache[selector_str] = self._create_selector_function(selector_str) + return self._selector_cache[selector_str] + + def _get_base_elements(self, parsed_html, selector: str): + """Get all base elements using the selector""" + selector_func = self._get_selector(selector) + # For base elements, we don't need context sensitivity + return selector_func(parsed_html, context_sensitive=False) + + def _get_elements(self, element, selector: str): + """Get child elements using the selector with context sensitivity""" + selector_func = self._get_selector(selector) + return selector_func(element, context_sensitive=True) + + def _get_element_text(self, element) -> str: + """Extract normalized text from element""" + try: + # Get all text nodes and normalize + text = " ".join(t.strip() for t in element.xpath(".//text()") if t.strip()) + return text + except Exception as e: + if self.verbose: + print(f"Error extracting text: {e}") + # Fallback + try: + return element.text_content().strip() + except: + return "" + + def _get_element_html(self, element) -> str: + """Get HTML string representation of element""" + try: + return self.etree.tostring(element, encoding='unicode', method='html') + except Exception as e: + if self.verbose: + print(f"Error serializing HTML: {e}") + return "" + + def _get_element_attribute(self, element, attribute: str): + """Get attribute value safely""" + try: + return element.get(attribute) + except Exception as e: + if self.verbose: + print(f"Error getting attribute '{attribute}': {e}") + return None + + def _clear_caches(self): + """Clear caches to free memory""" + if self.use_caching: + self._result_cache.clear() + +class JsonLxmlExtractionStrategy_naive(JsonElementExtractionStrategy): + def __init__(self, schema: Dict[str, Any], **kwargs): + kwargs["input_format"] = "html" # Force HTML input + super().__init__(schema, **kwargs) + self._selector_cache = {} + + def _parse_html(self, html_content: str): + from lxml import etree + parser = etree.HTMLParser(recover=True) + return etree.fromstring(html_content, parser) + + def _get_selector(self, selector_str): + """Get a selector function that works within the context of an element""" + if selector_str not in self._selector_cache: + from lxml.cssselect import CSSSelector + try: + # Store both the compiled selector and its xpath translation + compiled = CSSSelector(selector_str) + + # Create a function that will apply this selector appropriately + def select_func(element): + try: + # First attempt: direct CSS selector application + results = compiled(element) + if results: + return results + + # Second attempt: contextual XPath selection + # Convert the root-based XPath to a context-based XPath + 
xpath = compiled.path + + # If the XPath already starts with descendant-or-self, handle it specially + if xpath.startswith('descendant-or-self::'): + context_xpath = xpath + else: + # For normal XPath expressions, make them relative to current context + context_xpath = f"./{xpath.lstrip('/')}" + + results = element.xpath(context_xpath) + if results: + return results + + # Final fallback: simple descendant search for common patterns + if 'nth-child' in selector_str: + # Handle td:nth-child(N) pattern + import re + match = re.search(r'td:nth-child\((\d+)\)', selector_str) + if match: + col_num = match.group(1) + sub_selector = selector_str.split(')', 1)[-1].strip() + if sub_selector: + return element.xpath(f".//td[{col_num}]//{sub_selector}") + else: + return element.xpath(f".//td[{col_num}]") + + # Last resort: try each part of the selector separately + parts = selector_str.split() + if len(parts) > 1 and parts[-1]: + return element.xpath(f".//{parts[-1]}") + + return [] + except Exception as e: + if self.verbose: + print(f"Error applying selector '{selector_str}': {e}") + return [] + + self._selector_cache[selector_str] = select_func + except Exception as e: + if self.verbose: + print(f"Error compiling selector '{selector_str}': {e}") + + # Fallback function for invalid selectors + def fallback_func(element): + return [] + + self._selector_cache[selector_str] = fallback_func + + return self._selector_cache[selector_str] + + def _get_base_elements(self, parsed_html, selector: str): + selector_func = self._get_selector(selector) + return selector_func(parsed_html) + + def _get_elements(self, element, selector: str): + selector_func = self._get_selector(selector) + return selector_func(element) + + def _get_element_text(self, element) -> str: + return "".join(element.xpath(".//text()")).strip() + + def _get_element_html(self, element) -> str: + from lxml import etree + return etree.tostring(element, encoding='unicode') + + def _get_element_attribute(self, element, attribute: str): + return element.get(attribute) + +class JsonXPathExtractionStrategy(JsonElementExtractionStrategy): + """ + Concrete implementation of `JsonElementExtractionStrategy` using XPath selectors. + + How it works: + 1. Parses HTML content into an lxml tree. + 2. Selects elements using XPath expressions. + 3. Converts CSS selectors to XPath when needed. + + Attributes: + schema (Dict[str, Any]): The schema defining the extraction rules. + verbose (bool): Enables verbose logging for debugging purposes. + + Methods: + _parse_html(html_content): Parses HTML content into an lxml tree. + _get_base_elements(parsed_html, selector): Selects base elements using an XPath selector. + _css_to_xpath(css_selector): Converts a CSS selector to an XPath expression. + _get_elements(element, selector): Selects child elements using an XPath selector. + _get_element_text(element): Extracts text content from an lxml element. + _get_element_html(element): Extracts the raw HTML content of an lxml element. + _get_element_attribute(element, attribute): Retrieves an attribute value from an lxml element. 
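    Example (illustrative sketch; the XPath expressions, URL, and html_content variable are made up):

        strategy = JsonXPathExtractionStrategy({
            "name": "Rows",
            "baseSelector": "//table[@id='results']//tr",
            "fields": [
                {"name": "rank", "type": "text", "selector": "td[1]"},
                {"name": "team", "type": "text", "selector": "td[2]"},
            ],
        })
        rows = strategy.extract("https://example.com", html_content)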
+ """ + + def __init__(self, schema: Dict[str, Any], **kwargs): + kwargs["input_format"] = "html" # Force HTML input + super().__init__(schema, **kwargs) + + def _parse_html(self, html_content: str): + return html.fromstring(html_content) + + def _get_base_elements(self, parsed_html, selector: str): + return parsed_html.xpath(selector) + + def _css_to_xpath(self, css_selector: str) -> str: + """Convert CSS selector to XPath if needed""" + if "/" in css_selector: # Already an XPath + return css_selector + return self._basic_css_to_xpath(css_selector) + + def _basic_css_to_xpath(self, css_selector: str) -> str: + """Basic CSS to XPath conversion for common cases""" + if " > " in css_selector: + parts = css_selector.split(" > ") + return "//" + "/".join(parts) + if " " in css_selector: + parts = css_selector.split(" ") + return "//" + "//".join(parts) + return "//" + css_selector + + def _get_elements(self, element, selector: str): + xpath = self._css_to_xpath(selector) + if not xpath.startswith("."): + xpath = "." + xpath + return element.xpath(xpath) + + def _get_element_text(self, element) -> str: + return "".join(element.xpath(".//text()")).strip() + + def _get_element_html(self, element) -> str: + return etree.tostring(element, encoding="unicode") + + def _get_element_attribute(self, element, attribute: str): + return element.get(attribute) + + +``` + + +## File: crawl4ai/models.py + +```py +from pydantic import BaseModel, HttpUrl, PrivateAttr +from typing import List, Dict, Optional, Callable, Awaitable, Union, Any +from typing import AsyncGenerator +from typing import Generic, TypeVar +from enum import Enum +from dataclasses import dataclass +from .ssl_certificate import SSLCertificate +from datetime import datetime +from datetime import timedelta + + +############################### +# Dispatcher Models +############################### +@dataclass +class DomainState: + last_request_time: float = 0 + current_delay: float = 0 + fail_count: int = 0 + + +@dataclass +class CrawlerTaskResult: + task_id: str + url: str + result: "CrawlResult" + memory_usage: float + peak_memory: float + start_time: Union[datetime, float] + end_time: Union[datetime, float] + error_message: str = "" + retry_count: int = 0 + wait_time: float = 0.0 + + @property + def success(self) -> bool: + return self.result.success + +class CrawlStatus(Enum): + QUEUED = "QUEUED" + IN_PROGRESS = "IN_PROGRESS" + COMPLETED = "COMPLETED" + FAILED = "FAILED" + +@dataclass +class CrawlStats: + task_id: str + url: str + status: CrawlStatus + start_time: Optional[Union[datetime, float]] = None + end_time: Optional[Union[datetime, float]] = None + memory_usage: float = 0.0 + peak_memory: float = 0.0 + error_message: str = "" + wait_time: float = 0.0 + retry_count: int = 0 + counted_requeue: bool = False + + @property + def duration(self) -> str: + if not self.start_time: + return "0:00" + + # Convert start_time to datetime if it's a float + start = self.start_time + if isinstance(start, float): + start = datetime.fromtimestamp(start) + + # Get end time or use current time + end = self.end_time or datetime.now() + # Convert end_time to datetime if it's a float + if isinstance(end, float): + end = datetime.fromtimestamp(end) + + duration = end - start + return str(timedelta(seconds=int(duration.total_seconds()))) + +class DisplayMode(Enum): + DETAILED = "DETAILED" + AGGREGATED = "AGGREGATED" + + +############################### +# Crawler Models +############################### +@dataclass +class TokenUsage: + completion_tokens: int = 
0 + prompt_tokens: int = 0 + total_tokens: int = 0 + completion_tokens_details: Optional[dict] = None + prompt_tokens_details: Optional[dict] = None + +class UrlModel(BaseModel): + url: HttpUrl + forced: bool = False + + + +@dataclass +class TraversalStats: + """Statistics for the traversal process""" + + start_time: datetime = datetime.now() + urls_processed: int = 0 + urls_failed: int = 0 + urls_skipped: int = 0 + total_depth_reached: int = 0 + current_depth: int = 0 + +class DispatchResult(BaseModel): + task_id: str + memory_usage: float + peak_memory: float + start_time: Union[datetime, float] + end_time: Union[datetime, float] + error_message: str = "" + +class MarkdownGenerationResult(BaseModel): + raw_markdown: str + markdown_with_citations: str + references_markdown: str + fit_markdown: Optional[str] = None + fit_html: Optional[str] = None + + def __str__(self): + return self.raw_markdown + +class CrawlResult(BaseModel): + url: str + html: str + success: bool + cleaned_html: Optional[str] = None + media: Dict[str, List[Dict]] = {} + links: Dict[str, List[Dict]] = {} + downloaded_files: Optional[List[str]] = None + js_execution_result: Optional[Dict[str, Any]] = None + screenshot: Optional[str] = None + pdf: Optional[bytes] = None + mhtml: Optional[str] = None + _markdown: Optional[MarkdownGenerationResult] = PrivateAttr(default=None) + extracted_content: Optional[str] = None + metadata: Optional[dict] = None + error_message: Optional[str] = None + session_id: Optional[str] = None + response_headers: Optional[dict] = None + status_code: Optional[int] = None + ssl_certificate: Optional[SSLCertificate] = None + dispatch_result: Optional[DispatchResult] = None + redirected_url: Optional[str] = None + network_requests: Optional[List[Dict[str, Any]]] = None + console_messages: Optional[List[Dict[str, Any]]] = None + + class Config: + arbitrary_types_allowed = True + +# NOTE: The StringCompatibleMarkdown class, custom __init__ method, property getters/setters, +# and model_dump override all exist to support a smooth transition from markdown as a string +# to markdown as a MarkdownGenerationResult object, while maintaining backward compatibility. +# +# This allows code that expects markdown to be a string to continue working, while also +# providing access to the full MarkdownGenerationResult object's properties. +# +# The markdown_v2 property is deprecated and raises an error directing users to use markdown. +# +# When backward compatibility is no longer needed in future versions, this entire mechanism +# can be simplified to a standard field with no custom accessors or serialization logic. + + def __init__(self, **data): + markdown_result = data.pop('markdown', None) + super().__init__(**data) + if markdown_result is not None: + self._markdown = ( + MarkdownGenerationResult(**markdown_result) + if isinstance(markdown_result, dict) + else markdown_result + ) + + @property + def markdown(self): + """ + Property that returns a StringCompatibleMarkdown object that behaves like + a string but also provides access to MarkdownGenerationResult attributes. + + This approach allows backward compatibility with code that expects 'markdown' + to be a string, while providing access to the full MarkdownGenerationResult. + """ + if self._markdown is None: + return None + return StringCompatibleMarkdown(self._markdown) + + @markdown.setter + def markdown(self, value): + """ + Setter for the markdown property. 
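        Accepts a MarkdownGenerationResult (or None) and stores it on the private
        _markdown attribute; reading the property back returns a string-compatible
        wrapper whose attributes (raw_markdown, fit_markdown, ...) remain accessible.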
+ """ + self._markdown = value + + @property + def markdown_v2(self): + """ + Deprecated property that raises an AttributeError when accessed. + + This property exists to inform users that 'markdown_v2' has been + deprecated and they should use 'markdown' instead. + """ + raise AttributeError( + "The 'markdown_v2' attribute is deprecated and has been removed. " + """Please use 'markdown' instead, which now returns a MarkdownGenerationResult, with + following properties: + - raw_markdown: The raw markdown string + - markdown_with_citations: The markdown string with citations + - references_markdown: The markdown string with references + - fit_markdown: The markdown string with fit text + """ + ) + + @property + def fit_markdown(self): + """ + Deprecated property that raises an AttributeError when accessed. + """ + raise AttributeError( + "The 'fit_markdown' attribute is deprecated and has been removed. " + "Please use 'markdown.fit_markdown' instead." + ) + + @property + def fit_html(self): + """ + Deprecated property that raises an AttributeError when accessed. + """ + raise AttributeError( + "The 'fit_html' attribute is deprecated and has been removed. " + "Please use 'markdown.fit_html' instead." + ) + + def model_dump(self, *args, **kwargs): + """ + Override model_dump to include the _markdown private attribute in serialization. + + This override is necessary because: + 1. PrivateAttr fields are excluded from serialization by default + 2. We need to maintain backward compatibility by including the 'markdown' field + in the serialized output + 3. We're transitioning from 'markdown_v2' to enhancing 'markdown' to hold + the same type of data + + Future developers: This method ensures that the markdown content is properly + serialized despite being stored in a private attribute. If the serialization + requirements change, this is where you would update the logic. + """ + result = super().model_dump(*args, **kwargs) + if self._markdown is not None: + result["markdown"] = self._markdown.model_dump() + return result + +class StringCompatibleMarkdown(str): + """A string subclass that also provides access to MarkdownGenerationResult attributes""" + def __new__(cls, markdown_result): + return super().__new__(cls, markdown_result.raw_markdown) + + def __init__(self, markdown_result): + self._markdown_result = markdown_result + + def __getattr__(self, name): + return getattr(self._markdown_result, name) + +CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult) + +class CrawlResultContainer(Generic[CrawlResultT]): + def __init__(self, results: Union[CrawlResultT, List[CrawlResultT]]): + # Normalize to a list + if isinstance(results, list): + self._results = results + else: + self._results = [results] + + def __iter__(self): + return iter(self._results) + + def __getitem__(self, index): + return self._results[index] + + def __len__(self): + return len(self._results) + + def __getattr__(self, attr): + # Delegate attribute access to the first element. + if self._results: + return getattr(self._results[0], attr) + raise AttributeError(f"{self.__class__.__name__} object has no attribute '{attr}'") + + def __repr__(self): + return f"{self.__class__.__name__}({self._results!r})" + +RunManyReturn = Union[ + CrawlResultContainer[CrawlResultT], + AsyncGenerator[CrawlResultT, None] +] + + +# END of backward compatibility code for markdown/markdown_v2. +# When removing this code in the future, make sure to: +# 1. Replace the private attribute and property with a standard field +# 2. 
Update any serialization logic that might depend on the current behavior + +class AsyncCrawlResponse(BaseModel): + html: str + response_headers: Dict[str, str] + js_execution_result: Optional[Dict[str, Any]] = None + status_code: int + screenshot: Optional[str] = None + pdf_data: Optional[bytes] = None + mhtml_data: Optional[str] = None + get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None + downloaded_files: Optional[List[str]] = None + ssl_certificate: Optional[SSLCertificate] = None + redirected_url: Optional[str] = None + network_requests: Optional[List[Dict[str, Any]]] = None + console_messages: Optional[List[Dict[str, Any]]] = None + + class Config: + arbitrary_types_allowed = True + +############################### +# Scraping Models +############################### +class MediaItem(BaseModel): + src: Optional[str] = "" + data: Optional[str] = "" + alt: Optional[str] = "" + desc: Optional[str] = "" + score: Optional[int] = 0 + type: str = "image" + group_id: Optional[int] = 0 + format: Optional[str] = None + width: Optional[int] = None + + +class Link(BaseModel): + href: Optional[str] = "" + text: Optional[str] = "" + title: Optional[str] = "" + base_domain: Optional[str] = "" + + +class Media(BaseModel): + images: List[MediaItem] = [] + videos: List[ + MediaItem + ] = [] # Using MediaItem model for now, can be extended with Video model if needed + audios: List[ + MediaItem + ] = [] # Using MediaItem model for now, can be extended with Audio model if needed + tables: List[Dict] = [] # Table data extracted from HTML tables + + +class Links(BaseModel): + internal: List[Link] = [] + external: List[Link] = [] + + +class ScrapingResult(BaseModel): + cleaned_html: str + success: bool + media: Media = Media() + links: Links = Links() + metadata: Dict[str, Any] = {} + +``` + + +## File: crawl4ai/content_filter_strategy.py + +```py +import inspect +import re +import time +from bs4 import BeautifulSoup, Tag +from typing import List, Tuple, Dict, Optional +from rank_bm25 import BM25Okapi +from collections import deque +from bs4 import NavigableString, Comment + +from .utils import ( + clean_tokens, + perform_completion_with_backoff, + escape_json_string, + sanitize_html, + get_home_folder, + extract_xml_data, + merge_chunks, +) +from .types import LLMConfig +from .config import DEFAULT_PROVIDER, OVERLAP_RATE, WORD_TOKEN_RATE +from abc import ABC, abstractmethod +import math +from snowballstemmer import stemmer +from .models import TokenUsage +from .prompts import PROMPT_FILTER_CONTENT +import json +import hashlib +from pathlib import Path +from concurrent.futures import ThreadPoolExecutor +from .async_logger import AsyncLogger, LogLevel +from colorama import Fore, Style + + +class RelevantContentFilter(ABC): + """Abstract base class for content filtering strategies""" + + def __init__( + self, + user_query: str = None, + verbose: bool = False, + logger: Optional[AsyncLogger] = None, + ): + """ + Initializes the RelevantContentFilter class with optional user query. + + Args: + user_query (str): User query for filtering (optional). + verbose (bool): Enable verbose logging (default: False). 
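            logger (Optional[AsyncLogger]): Logger used for structured output (optional).

        Example of a minimal concrete subclass (illustrative sketch, not part of the
        library; KeywordFilter is a made-up name):

            class KeywordFilter(RelevantContentFilter):
                def filter_content(self, html: str) -> List[str]:
                    soup = BeautifulSoup(html, "lxml")
                    body = soup.body or soup
                    query = self.extract_page_query(soup, body).lower()
                    return [
                        text
                        for _, text, _, _ in self.extract_text_chunks(body)
                        if any(word in text.lower() for word in query.split())
                    ]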
+ """ + self.user_query = user_query + self.included_tags = { + # Primary structure + "article", + "main", + "section", + "div", + # List structures + "ul", + "ol", + "li", + "dl", + "dt", + "dd", + # Text content + "p", + "span", + "blockquote", + "pre", + "code", + # Headers + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + # Tables + "table", + "thead", + "tbody", + "tr", + "td", + "th", + # Other semantic elements + "figure", + "figcaption", + "details", + "summary", + # Text formatting + "em", + "strong", + "b", + "i", + "mark", + "small", + # Rich content + "time", + "address", + "cite", + "q", + } + self.excluded_tags = { + "nav", + "footer", + "header", + "aside", + "script", + "style", + "form", + "iframe", + "noscript", + } + self.header_tags = {"h1", "h2", "h3", "h4", "h5", "h6"} + self.negative_patterns = re.compile( + r"nav|footer|header|sidebar|ads|comment|promo|advert|social|share", re.I + ) + self.min_word_count = 2 + self.verbose = False + self.logger = logger + + @abstractmethod + def filter_content(self, html: str) -> List[str]: + """Abstract method to be implemented by specific filtering strategies""" + pass + + def extract_page_query(self, soup: BeautifulSoup, body: Tag) -> str: + """Common method to extract page metadata with fallbacks""" + if self.user_query: + return self.user_query + + query_parts = [] + + # Title + try: + title = soup.title.string + if title: + query_parts.append(title) + except Exception: + pass + + if soup.find("h1"): + query_parts.append(soup.find("h1").get_text()) + + # Meta tags + temp = "" + for meta_name in ["keywords", "description"]: + meta = soup.find("meta", attrs={"name": meta_name}) + if meta and meta.get("content"): + query_parts.append(meta["content"]) + temp += meta["content"] + + # If still empty, grab first significant paragraph + if not temp: + # Find the first tag P thatits text contains more than 50 characters + for p in body.find_all("p"): + if len(p.get_text()) > 150: + query_parts.append(p.get_text()[:150]) + break + + return " ".join(filter(None, query_parts)) + + def extract_text_chunks( + self, body: Tag, min_word_threshold: int = None + ) -> List[Tuple[str, str]]: + """ + Extracts text chunks from a BeautifulSoup body element while preserving order. + Returns list of tuples (text, tag_name) for classification. 
+ + Args: + body: BeautifulSoup Tag object representing the body element + + Returns: + List of (text, tag_name) tuples + """ + # Tags to ignore - inline elements that shouldn't break text flow + INLINE_TAGS = { + "a", + "abbr", + "acronym", + "b", + "bdo", + "big", + "br", + "button", + "cite", + "code", + "dfn", + "em", + "i", + "img", + "input", + "kbd", + "label", + "map", + "object", + "q", + "samp", + "script", + "select", + "small", + "span", + "strong", + "sub", + "sup", + "textarea", + "time", + "tt", + "var", + } + + # Tags that typically contain meaningful headers + HEADER_TAGS = {"h1", "h2", "h3", "h4", "h5", "h6", "header"} + + chunks = [] + current_text = [] + chunk_index = 0 + + def should_break_chunk(tag: Tag) -> bool: + """Determine if a tag should cause a break in the current text chunk""" + return tag.name not in INLINE_TAGS and not ( + tag.name == "p" and len(current_text) == 0 + ) + + # Use deque for efficient push/pop operations + stack = deque([(body, False)]) + + while stack: + element, visited = stack.pop() + + if visited: + # End of block element - flush accumulated text + if current_text and should_break_chunk(element): + text = " ".join("".join(current_text).split()) + if text: + tag_type = ( + "header" if element.name in HEADER_TAGS else "content" + ) + chunks.append((chunk_index, text, tag_type, element)) + chunk_index += 1 + current_text = [] + continue + + if isinstance(element, NavigableString): + if str(element).strip(): + current_text.append(str(element).strip()) + continue + + # Pre-allocate children to avoid multiple list operations + children = list(element.children) + if not children: + continue + + # Mark block for revisit after processing children + stack.append((element, True)) + + # Add children in reverse order for correct processing + for child in reversed(children): + if isinstance(child, (Tag, NavigableString)): + stack.append((child, False)) + + # Handle any remaining text + if current_text: + text = " ".join("".join(current_text).split()) + if text: + chunks.append((chunk_index, text, "content", body)) + + if min_word_threshold: + chunks = [ + chunk for chunk in chunks if len(chunk[1].split()) >= min_word_threshold + ] + + return chunks + + def _deprecated_extract_text_chunks( + self, soup: BeautifulSoup + ) -> List[Tuple[int, str, Tag]]: + """Common method for extracting text chunks""" + _text_cache = {} + + def fast_text(element: Tag) -> str: + elem_id = id(element) + if elem_id in _text_cache: + return _text_cache[elem_id] + texts = [] + for content in element.contents: + if isinstance(content, str): + text = content.strip() + if text: + texts.append(text) + result = " ".join(texts) + _text_cache[elem_id] = result + return result + + candidates = [] + index = 0 + + def dfs(element): + nonlocal index + if isinstance(element, Tag): + if element.name in self.included_tags: + if not self.is_excluded(element): + text = fast_text(element) + word_count = len(text.split()) + + # Headers pass through with adjusted minimum + if element.name in self.header_tags: + if word_count >= 3: # Minimal sanity check for headers + candidates.append((index, text, element)) + index += 1 + # Regular content uses standard minimum + elif word_count >= self.min_word_count: + candidates.append((index, text, element)) + index += 1 + + for child in element.children: + dfs(child) + + dfs(soup.body if soup.body else soup) + return candidates + + def is_excluded(self, tag: Tag) -> bool: + """Common method for exclusion logic""" + if tag.name in self.excluded_tags: + 
return True + class_id = " ".join( + filter(None, [" ".join(tag.get("class", [])), tag.get("id", "")]) + ) + return bool(self.negative_patterns.search(class_id)) + + def clean_element(self, tag: Tag) -> str: + """Common method for cleaning HTML elements with minimal overhead""" + if not tag or not isinstance(tag, Tag): + return "" + + unwanted_tags = {"script", "style", "aside", "form", "iframe", "noscript"} + unwanted_attrs = { + "style", + "onclick", + "onmouseover", + "align", + "bgcolor", + "class", + "id", + } + + # Use string builder pattern for better performance + builder = [] + + def render_tag(elem): + if not isinstance(elem, Tag): + if isinstance(elem, str): + builder.append(elem.strip()) + return + + if elem.name in unwanted_tags: + return + + # Start tag + builder.append(f"<{elem.name}") + + # Add cleaned attributes + attrs = {k: v for k, v in elem.attrs.items() if k not in unwanted_attrs} + for key, value in attrs.items(): + builder.append(f' {key}="{value}"') + + builder.append(">") + + # Process children + for child in elem.children: + render_tag(child) + + # Close tag + builder.append(f"") + + try: + render_tag(tag) + return "".join(builder) + except Exception: + return str(tag) # Fallback to original if anything fails + + +class BM25ContentFilter(RelevantContentFilter): + """ + Content filtering using BM25 algorithm with priority tag handling. + + How it works: + 1. Extracts page metadata with fallbacks. + 2. Extracts text chunks from the body element. + 3. Tokenizes the corpus and query. + 4. Applies BM25 algorithm to calculate scores for each chunk. + 5. Filters out chunks below the threshold. + 6. Sorts chunks by score in descending order. + 7. Returns the top N chunks. + + Attributes: + user_query (str): User query for filtering (optional). + bm25_threshold (float): BM25 threshold for filtering (default: 1.0). + language (str): Language for stemming (default: 'english'). + + Methods: + filter_content(self, html: str, min_word_threshold: int = None) + """ + + def __init__( + self, + user_query: str = None, + bm25_threshold: float = 1.0, + language: str = "english", + ): + """ + Initializes the BM25ContentFilter class, if not provided, falls back to page metadata. + + Note: + If no query is given and no page metadata is available, then it tries to pick up the first significant paragraph. + + Args: + user_query (str): User query for filtering (optional). + bm25_threshold (float): BM25 threshold for filtering (default: 1.0). + language (str): Language for stemming (default: 'english'). + """ + super().__init__(user_query=user_query) + self.bm25_threshold = bm25_threshold + self.priority_tags = { + "h1": 5.0, + "h2": 4.0, + "h3": 3.0, + "title": 4.0, + "strong": 2.0, + "b": 1.5, + "em": 1.5, + "blockquote": 2.0, + "code": 2.0, + "pre": 1.5, + "th": 1.5, # Table headers + } + self.stemmer = stemmer(language) + + def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]: + """ + Implements content filtering using BM25 algorithm with priority tag handling. + + Note: + This method implements the filtering logic for the BM25ContentFilter class. + It takes HTML content as input and returns a list of filtered text chunks. + + Args: + html (str): HTML content to be filtered. + min_word_threshold (int): Minimum word threshold for filtering (optional). + + Returns: + List[str]: List of filtered text chunks. 
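+
+        Example (illustrative usage; assumes `html` holds the page HTML):
+            >>> bm25_filter = BM25ContentFilter(user_query="pricing and plans", bm25_threshold=1.2)
+            >>> blocks = bm25_filter.filter_content(html, min_word_threshold=5)
+            >>> len(blocks)  # number of HTML fragments that passed the BM25 threshold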
+ """ + if not html or not isinstance(html, str): + return [] + + soup = BeautifulSoup(html, "lxml") + + # Check if body is present + if not soup.body: + # Wrap in body tag if missing + soup = BeautifulSoup(f"{html}", "lxml") + body = soup.find("body") + + query = self.extract_page_query(soup, body) + + if not query: + return [] + # return [self.clean_element(soup)] + + candidates = self.extract_text_chunks(body, min_word_threshold) + + if not candidates: + return [] + + # Tokenize corpus + # tokenized_corpus = [chunk.lower().split() for _, chunk, _, _ in candidates] + # tokenized_query = query.lower().split() + + # tokenized_corpus = [[ps.stem(word) for word in chunk.lower().split()] + # for _, chunk, _, _ in candidates] + # tokenized_query = [ps.stem(word) for word in query.lower().split()] + + tokenized_corpus = [ + [self.stemmer.stemWord(word) for word in chunk.lower().split()] + for _, chunk, _, _ in candidates + ] + tokenized_query = [ + self.stemmer.stemWord(word) for word in query.lower().split() + ] + + # tokenized_corpus = [[self.stemmer.stemWord(word) for word in tokenize_text(chunk.lower())] + # for _, chunk, _, _ in candidates] + # tokenized_query = [self.stemmer.stemWord(word) for word in tokenize_text(query.lower())] + + # Clean from stop words and noise + tokenized_corpus = [clean_tokens(tokens) for tokens in tokenized_corpus] + tokenized_query = clean_tokens(tokenized_query) + + bm25 = BM25Okapi(tokenized_corpus) + scores = bm25.get_scores(tokenized_query) + + # Adjust scores with tag weights + adjusted_candidates = [] + for score, (index, chunk, tag_type, tag) in zip(scores, candidates): + tag_weight = self.priority_tags.get(tag.name, 1.0) + adjusted_score = score * tag_weight + adjusted_candidates.append((adjusted_score, index, chunk, tag)) + + # Filter candidates by threshold + selected_candidates = [ + (index, chunk, tag) + for adjusted_score, index, chunk, tag in adjusted_candidates + if adjusted_score >= self.bm25_threshold + ] + + if not selected_candidates: + return [] + + # Sort selected candidates by original document order + selected_candidates.sort(key=lambda x: x[0]) + + return [self.clean_element(tag) for _, _, tag in selected_candidates] + + +class PruningContentFilter(RelevantContentFilter): + """ + Content filtering using pruning algorithm with dynamic threshold. + + How it works: + 1. Extracts page metadata with fallbacks. + 2. Extracts text chunks from the body element. + 3. Applies pruning algorithm to calculate scores for each chunk. + 4. Filters out chunks below the threshold. + 5. Sorts chunks by score in descending order. + 6. Returns the top N chunks. + + Attributes: + user_query (str): User query for filtering (optional), if not provided, falls back to page metadata. + min_word_threshold (int): Minimum word threshold for filtering (optional). + threshold_type (str): Threshold type for dynamic threshold (default: 'fixed'). + threshold (float): Fixed threshold value (default: 0.48). + + Methods: + filter_content(self, html: str, min_word_threshold: int = None): + """ + + def __init__( + self, + user_query: str = None, + min_word_threshold: int = None, + threshold_type: str = "fixed", + threshold: float = 0.48, + ): + """ + Initializes the PruningContentFilter class, if not provided, falls back to page metadata. + + Note: + If no query is given and no page metadata is available, then it tries to pick up the first significant paragraph. + + Args: + user_query (str): User query for filtering (optional). 
+ min_word_threshold (int): Minimum word threshold for filtering (optional). + threshold_type (str): Threshold type for dynamic threshold (default: 'fixed'). + threshold (float): Fixed threshold value (default: 0.48). + """ + super().__init__(None) + self.min_word_threshold = min_word_threshold + self.threshold_type = threshold_type + self.threshold = threshold + + # Add tag importance for dynamic threshold + self.tag_importance = { + "article": 1.5, + "main": 1.4, + "section": 1.3, + "p": 1.2, + "h1": 1.4, + "h2": 1.3, + "h3": 1.2, + "div": 0.7, + "span": 0.6, + } + + # Metric configuration + self.metric_config = { + "text_density": True, + "link_density": True, + "tag_weight": True, + "class_id_weight": True, + "text_length": True, + } + + self.metric_weights = { + "text_density": 0.4, + "link_density": 0.2, + "tag_weight": 0.2, + "class_id_weight": 0.1, + "text_length": 0.1, + } + + self.tag_weights = { + "div": 0.5, + "p": 1.0, + "article": 1.5, + "section": 1.0, + "span": 0.3, + "li": 0.5, + "ul": 0.5, + "ol": 0.5, + "h1": 1.2, + "h2": 1.1, + "h3": 1.0, + "h4": 0.9, + "h5": 0.8, + "h6": 0.7, + } + + def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]: + """ + Implements content filtering using pruning algorithm with dynamic threshold. + + Note: + This method implements the filtering logic for the PruningContentFilter class. + It takes HTML content as input and returns a list of filtered text chunks. + + Args: + html (str): HTML content to be filtered. + min_word_threshold (int): Minimum word threshold for filtering (optional). + + Returns: + List[str]: List of filtered text chunks. + """ + if not html or not isinstance(html, str): + return [] + + soup = BeautifulSoup(html, "lxml") + if not soup.body: + soup = BeautifulSoup(f"{html}", "lxml") + + # Remove comments and unwanted tags + self._remove_comments(soup) + self._remove_unwanted_tags(soup) + + # Prune tree starting from body + body = soup.find("body") + self._prune_tree(body) + + # Extract remaining content as list of HTML strings + content_blocks = [] + for element in body.children: + if isinstance(element, str) or not hasattr(element, "name"): + continue + if len(element.get_text(strip=True)) > 0: + content_blocks.append(str(element)) + + return content_blocks + + def _remove_comments(self, soup): + """Removes HTML comments""" + for element in soup(text=lambda text: isinstance(text, Comment)): + element.extract() + + def _remove_unwanted_tags(self, soup): + """Removes unwanted tags""" + for tag in self.excluded_tags: + for element in soup.find_all(tag): + element.decompose() + + def _prune_tree(self, node): + """ + Prunes the tree starting from the given node. + + Args: + node (Tag): The node from which the pruning starts. 
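+
+        Note:
+            Each node receives a composite score built from text density, link density,
+            tag weight, class/id weight, and text length. With threshold_type="fixed" the
+            node is removed when its score falls below `threshold`; in dynamic mode the
+            threshold is first adjusted per node using tag importance, text ratio, and
+            link ratio. Surviving nodes have their children pruned recursively.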
+ """ + if not node or not hasattr(node, "name") or node.name is None: + return + + text_len = len(node.get_text(strip=True)) + tag_len = len(node.encode_contents().decode("utf-8")) + link_text_len = sum( + len(s.strip()) + for s in (a.string for a in node.find_all("a", recursive=False)) + if s + ) + + metrics = { + "node": node, + "tag_name": node.name, + "text_len": text_len, + "tag_len": tag_len, + "link_text_len": link_text_len, + } + + score = self._compute_composite_score(metrics, text_len, tag_len, link_text_len) + + if self.threshold_type == "fixed": + should_remove = score < self.threshold + else: # dynamic + tag_importance = self.tag_importance.get(node.name, 0.7) + text_ratio = text_len / tag_len if tag_len > 0 else 0 + link_ratio = link_text_len / text_len if text_len > 0 else 1 + + threshold = self.threshold # base threshold + if tag_importance > 1: + threshold *= 0.8 + if text_ratio > 0.4: + threshold *= 0.9 + if link_ratio > 0.6: + threshold *= 1.2 + + should_remove = score < threshold + + if should_remove: + node.decompose() + else: + children = [child for child in node.children if hasattr(child, "name")] + for child in children: + self._prune_tree(child) + + def _compute_composite_score(self, metrics, text_len, tag_len, link_text_len): + """Computes the composite score""" + if self.min_word_threshold: + # Get raw text from metrics node - avoid extra processing + text = metrics["node"].get_text(strip=True) + word_count = text.count(" ") + 1 + if word_count < self.min_word_threshold: + return -1.0 # Guaranteed removal + score = 0.0 + total_weight = 0.0 + + if self.metric_config["text_density"]: + density = text_len / tag_len if tag_len > 0 else 0 + score += self.metric_weights["text_density"] * density + total_weight += self.metric_weights["text_density"] + + if self.metric_config["link_density"]: + density = 1 - (link_text_len / text_len if text_len > 0 else 0) + score += self.metric_weights["link_density"] * density + total_weight += self.metric_weights["link_density"] + + if self.metric_config["tag_weight"]: + tag_score = self.tag_weights.get(metrics["tag_name"], 0.5) + score += self.metric_weights["tag_weight"] * tag_score + total_weight += self.metric_weights["tag_weight"] + + if self.metric_config["class_id_weight"]: + class_score = self._compute_class_id_weight(metrics["node"]) + score += self.metric_weights["class_id_weight"] * max(0, class_score) + total_weight += self.metric_weights["class_id_weight"] + + if self.metric_config["text_length"]: + score += self.metric_weights["text_length"] * math.log(text_len + 1) + total_weight += self.metric_weights["text_length"] + + return score / total_weight if total_weight > 0 else 0 + + def _compute_class_id_weight(self, node): + """Computes the class ID weight""" + class_id_score = 0 + if "class" in node.attrs: + classes = " ".join(node["class"]) + if self.negative_patterns.match(classes): + class_id_score -= 0.5 + if "id" in node.attrs: + element_id = node["id"] + if self.negative_patterns.match(element_id): + class_id_score -= 0.5 + return class_id_score + + +class LLMContentFilter(RelevantContentFilter): + """Content filtering using LLMs to generate relevant markdown. + + How it works: + 1. Extracts page metadata with fallbacks. + 2. Extracts text chunks from the body element. + 3. Applies LLMs to generate markdown for each chunk. + 4. Filters out chunks below the threshold. + 5. Sorts chunks by score in descending order. + 6. Returns the top N chunks. + + Attributes: + llm_config (LLMConfig): LLM configuration object. 
+ instruction (str): Instruction for LLM markdown generation + chunk_token_threshold (int): Chunk token threshold for splitting (default: 1e9). + overlap_rate (float): Overlap rate for chunking (default: 0.5). + word_token_rate (float): Word token rate for chunking (default: 0.2). + verbose (bool): Enable verbose logging (default: False). + logger (AsyncLogger): Custom logger for LLM operations (optional). + """ + _UNWANTED_PROPS = { + 'provider' : 'Instead, use llm_config=LLMConfig(provider="...")', + 'api_token' : 'Instead, use llm_config=LlMConfig(api_token="...")', + 'base_url' : 'Instead, use llm_config=LLMConfig(base_url="...")', + 'api_base' : 'Instead, use llm_config=LLMConfig(base_url="...")', + } + + def __init__( + self, + llm_config: "LLMConfig" = None, + instruction: str = None, + chunk_token_threshold: int = int(1e9), + overlap_rate: float = OVERLAP_RATE, + word_token_rate: float = WORD_TOKEN_RATE, + # char_token_rate: float = WORD_TOKEN_RATE * 5, + # chunk_mode: str = "char", + verbose: bool = False, + logger: Optional[AsyncLogger] = None, + ignore_cache: bool = True, + # Deprecated properties + provider: str = DEFAULT_PROVIDER, + api_token: Optional[str] = None, + base_url: Optional[str] = None, + api_base: Optional[str] = None, + extra_args: Dict = None, + ): + super().__init__(None) + self.provider = provider + self.api_token = api_token + self.base_url = base_url or api_base + self.llm_config = llm_config + self.instruction = instruction + self.chunk_token_threshold = chunk_token_threshold + self.overlap_rate = overlap_rate + self.word_token_rate = word_token_rate or WORD_TOKEN_RATE + # self.chunk_mode: str = chunk_mode + # self.char_token_rate = char_token_rate or word_token_rate / 5 + # self.token_rate = word_token_rate if chunk_mode == "word" else self.char_token_rate + self.token_rate = word_token_rate or WORD_TOKEN_RATE + self.extra_args = extra_args or {} + self.ignore_cache = ignore_cache + self.verbose = verbose + + # Setup logger with custom styling for LLM operations + if logger: + self.logger = logger + elif verbose: + self.logger = AsyncLogger( + verbose=verbose, + icons={ + **AsyncLogger.DEFAULT_ICONS, + "LLM": "★", # Star for LLM operations + "CHUNK": "◈", # Diamond for chunks + "CACHE": "⚡", # Lightning for cache operations + }, + colors={ + **AsyncLogger.DEFAULT_COLORS, + LogLevel.INFO: Fore.MAGENTA + + Style.DIM, # Dimmed purple for LLM ops + }, + ) + else: + self.logger = None + + self.usages = [] + self.total_usage = TokenUsage() + + def __setattr__(self, name, value): + """Handle attribute setting.""" + # TODO: Planning to set properties dynamically based on the __init__ signature + sig = inspect.signature(self.__init__) + all_params = sig.parameters # Dictionary of parameter names and their details + + if name in self._UNWANTED_PROPS and value is not all_params[name].default: + raise AttributeError(f"Setting '{name}' is deprecated. 
{self._UNWANTED_PROPS[name]}") + + super().__setattr__(name, value) + + def _get_cache_key(self, html: str, instruction: str) -> str: + """Generate a unique cache key based on HTML and instruction""" + content = f"{html}{instruction}" + return hashlib.md5(content.encode()).hexdigest() + + def _merge_chunks(self, text: str) -> List[str]: + """Split text into chunks with overlap using char or word mode.""" + ov = int(self.chunk_token_threshold * self.overlap_rate) + sections = merge_chunks( + docs=[text], + target_size=self.chunk_token_threshold, + overlap=ov, + word_token_ratio=self.word_token_rate, + ) + return sections + + def filter_content(self, html: str, ignore_cache: bool = True) -> List[str]: + if not html or not isinstance(html, str): + return [] + + if self.logger: + self.logger.info( + "Starting LLM markdown content filtering process", + tag="LLM", + params={"provider": self.llm_config.provider}, + colors={"provider": Fore.CYAN}, + ) + + # Cache handling + cache_dir = Path(get_home_folder()) / "llm_cache" / "content_filter" + cache_dir.mkdir(parents=True, exist_ok=True) + cache_key = self._get_cache_key(html, self.instruction or "") + cache_file = cache_dir / f"{cache_key}.json" + + # if ignore_cache == None: + ignore_cache = self.ignore_cache + + if not ignore_cache and cache_file.exists(): + if self.logger: + self.logger.info("Found cached markdown result", tag="CACHE") + try: + with cache_file.open("r") as f: + cached_data = json.load(f) + usage = TokenUsage(**cached_data["usage"]) + self.usages.append(usage) + self.total_usage.completion_tokens += usage.completion_tokens + self.total_usage.prompt_tokens += usage.prompt_tokens + self.total_usage.total_tokens += usage.total_tokens + return cached_data["blocks"] + except Exception as e: + if self.logger: + self.logger.error( + f"LLM markdown: Cache read error: {str(e)}", tag="CACHE" + ) + + # Split into chunks + html_chunks = self._merge_chunks(html) + if self.logger: + self.logger.info( + "LLM markdown: Split content into {chunk_count} chunks", + tag="CHUNK", + params={"chunk_count": len(html_chunks)}, + colors={"chunk_count": Fore.YELLOW}, + ) + + start_time = time.time() + + # Process chunks in parallel + with ThreadPoolExecutor(max_workers=4) as executor: + futures = [] + for i, chunk in enumerate(html_chunks): + if self.logger: + self.logger.debug( + "LLM markdown: Processing chunk {chunk_num}/{total_chunks}", + tag="CHUNK", + params={"chunk_num": i + 1, "total_chunks": len(html_chunks)}, + ) + + prompt_variables = { + "HTML": escape_json_string(sanitize_html(chunk)), + "REQUEST": self.instruction + or "Convert this HTML into clean, relevant markdown, removing any noise or irrelevant content.", + } + + prompt = PROMPT_FILTER_CONTENT + for var, value in prompt_variables.items(): + prompt = prompt.replace("{" + var + "}", value) + + def _proceed_with_chunk( + provider: str, + prompt: str, + api_token: str, + base_url: Optional[str] = None, + extra_args: Dict = {}, + ) -> List[str]: + if self.logger: + self.logger.info( + "LLM Markdown: Processing chunk {chunk_num}", + tag="CHUNK", + params={"chunk_num": i + 1}, + ) + return perform_completion_with_backoff( + provider, + prompt, + api_token, + base_url=base_url, + extra_args=extra_args, + ) + + future = executor.submit( + _proceed_with_chunk, + self.llm_config.provider, + prompt, + self.llm_config.api_token, + self.llm_config.base_url, + self.extra_args, + ) + futures.append((i, future)) + + # Collect results in order + ordered_results = [] + for i, future in sorted(futures): 
+ try: + response = future.result() + + # Track usage + usage = TokenUsage( + completion_tokens=response.usage.completion_tokens, + prompt_tokens=response.usage.prompt_tokens, + total_tokens=response.usage.total_tokens, + completion_tokens_details=( + response.usage.completion_tokens_details.__dict__ + if response.usage.completion_tokens_details + else {} + ), + prompt_tokens_details=( + response.usage.prompt_tokens_details.__dict__ + if response.usage.prompt_tokens_details + else {} + ), + ) + self.usages.append(usage) + self.total_usage.completion_tokens += usage.completion_tokens + self.total_usage.prompt_tokens += usage.prompt_tokens + self.total_usage.total_tokens += usage.total_tokens + + blocks = extract_xml_data( + ["content"], response.choices[0].message.content + )["content"] + if blocks: + ordered_results.append(blocks) + if self.logger: + self.logger.success( + "LLM markdown: Successfully processed chunk {chunk_num}", + tag="CHUNK", + params={"chunk_num": i + 1}, + ) + except Exception as e: + if self.logger: + self.logger.error( + "LLM markdown: Error processing chunk {chunk_num}: {error}", + tag="CHUNK", + params={"chunk_num": i + 1, "error": str(e)}, + ) + + end_time = time.time() + if self.logger: + self.logger.success( + "LLM markdown: Completed processing in {time:.2f}s", + tag="LLM", + params={"time": end_time - start_time}, + colors={"time": Fore.YELLOW}, + ) + + result = ordered_results if ordered_results else [] + + # Cache the final result + cache_data = {"blocks": result, "usage": self.total_usage.__dict__} + with cache_file.open("w") as f: + json.dump(cache_data, f) + if self.logger: + self.logger.info("Cached results for future use", tag="CACHE") + + return result + + def show_usage(self) -> None: + """Print usage statistics""" + print("\n=== Token Usage Summary ===") + print(f"{'Type':<15} {'Count':>12}") + print("-" * 30) + print(f"{'Completion':<15} {self.total_usage.completion_tokens:>12,}") + print(f"{'Prompt':<15} {self.total_usage.prompt_tokens:>12,}") + print(f"{'Total':<15} {self.total_usage.total_tokens:>12,}") + + if self.usages: + print("\n=== Usage History ===") + print(f"{'Request #':<10} {'Completion':>12} {'Prompt':>12} {'Total':>12}") + print("-" * 48) + for i, usage in enumerate(self.usages, 1): + print( + f"{i:<10} {usage.completion_tokens:>12,} " + f"{usage.prompt_tokens:>12,} {usage.total_tokens:>12,}" + ) + +``` + + +## File: crawl4ai/markdown_generation_strategy.py + +```py +from abc import ABC, abstractmethod +from typing import Optional, Dict, Any, Tuple +from .models import MarkdownGenerationResult +from .html2text import CustomHTML2Text +# from .types import RelevantContentFilter +from .content_filter_strategy import RelevantContentFilter +import re +from urllib.parse import urljoin + +# Pre-compile the regex pattern +LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)') + + +def fast_urljoin(base: str, url: str) -> str: + """Fast URL joining for common cases.""" + if url.startswith(("http://", "https://", "mailto:", "//")): + return url + if url.startswith("/"): + # Handle absolute paths + if base.endswith("/"): + return base[:-1] + url + return base + url + return urljoin(base, url) + + +class MarkdownGenerationStrategy(ABC): + """Abstract base class for markdown generation strategies.""" + + def __init__( + self, + content_filter: Optional[RelevantContentFilter] = None, + options: Optional[Dict[str, Any]] = None, + verbose: bool = False, + content_source: str = "cleaned_html", + ): + self.content_filter = 
content_filter + self.options = options or {} + self.verbose = verbose + self.content_source = content_source + + @abstractmethod + def generate_markdown( + self, + input_html: str, + base_url: str = "", + html2text_options: Optional[Dict[str, Any]] = None, + content_filter: Optional[RelevantContentFilter] = None, + citations: bool = True, + **kwargs, + ) -> MarkdownGenerationResult: + """Generate markdown from the selected input HTML.""" + pass + + +class DefaultMarkdownGenerator(MarkdownGenerationStrategy): + """ + Default implementation of markdown generation strategy. + + How it works: + 1. Generate raw markdown from cleaned HTML. + 2. Convert links to citations. + 3. Generate fit markdown if content filter is provided. + 4. Return MarkdownGenerationResult. + + Args: + content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown. + options (Optional[Dict[str, Any]]): Additional options for markdown generation. Defaults to None. + content_source (str): Source of content to generate markdown from. Options: "cleaned_html", "raw_html", "fit_html". Defaults to "cleaned_html". + + Returns: + MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown. + """ + + def __init__( + self, + content_filter: Optional[RelevantContentFilter] = None, + options: Optional[Dict[str, Any]] = None, + content_source: str = "cleaned_html", + ): + super().__init__(content_filter, options, verbose=False, content_source=content_source) + + def convert_links_to_citations( + self, markdown: str, base_url: str = "" + ) -> Tuple[str, str]: + """ + Convert links in markdown to citations. + + How it works: + 1. Find all links in the markdown. + 2. Convert links to citations. + 3. Return converted markdown and references markdown. + + Note: + This function uses a regex pattern to find links in markdown. + + Args: + markdown (str): Markdown text. + base_url (str): Base URL for URL joins. + + Returns: + Tuple[str, str]: Converted markdown and references markdown. 
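+
+        Example (illustrative):
+            >>> gen = DefaultMarkdownGenerator()
+            >>> text, refs = gen.convert_links_to_citations(
+            ...     "Read [the guide](/guide) first.", base_url="https://example.com"
+            ... )
+            >>> # text -> "Read the guide⟨1⟩ first."
+            >>> # refs  -> "\n\n## References\n\n⟨1⟩ https://example.com/guide: the guide\n"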
+ """ + link_map = {} + url_cache = {} # Cache for URL joins + parts = [] + last_end = 0 + counter = 1 + + for match in LINK_PATTERN.finditer(markdown): + parts.append(markdown[last_end : match.start()]) + text, url, title = match.groups() + + # Use cached URL if available, otherwise compute and cache + if base_url and not url.startswith(("http://", "https://", "mailto:")): + if url not in url_cache: + url_cache[url] = fast_urljoin(base_url, url) + url = url_cache[url] + + if url not in link_map: + desc = [] + if title: + desc.append(title) + if text and text != title: + desc.append(text) + link_map[url] = (counter, ": " + " - ".join(desc) if desc else "") + counter += 1 + + num = link_map[url][0] + parts.append( + f"{text}⟨{num}⟩" + if not match.group(0).startswith("!") + else f"![{text}⟨{num}⟩]" + ) + last_end = match.end() + + parts.append(markdown[last_end:]) + converted_text = "".join(parts) + + # Pre-build reference strings + references = ["\n\n## References\n\n"] + references.extend( + f"⟨{num}⟩ {url}{desc}\n" + for url, (num, desc) in sorted(link_map.items(), key=lambda x: x[1][0]) + ) + + return converted_text, "".join(references) + + def generate_markdown( + self, + input_html: str, + base_url: str = "", + html2text_options: Optional[Dict[str, Any]] = None, + options: Optional[Dict[str, Any]] = None, + content_filter: Optional[RelevantContentFilter] = None, + citations: bool = True, + **kwargs, + ) -> MarkdownGenerationResult: + """ + Generate markdown with citations from the provided input HTML. + + How it works: + 1. Generate raw markdown from the input HTML. + 2. Convert links to citations. + 3. Generate fit markdown if content filter is provided. + 4. Return MarkdownGenerationResult. + + Args: + input_html (str): The HTML content to process (selected based on content_source). + base_url (str): Base URL for URL joins. + html2text_options (Optional[Dict[str, Any]]): HTML2Text options. + options (Optional[Dict[str, Any]]): Additional options for markdown generation. + content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown. + citations (bool): Whether to generate citations. + + Returns: + MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown. 
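+
+        Example (illustrative usage):
+            >>> gen = DefaultMarkdownGenerator()
+            >>> result = gen.generate_markdown(
+            ...     "<article><h1>Title</h1><p>See <a href='/docs'>the docs</a>.</p></article>",
+            ...     base_url="https://example.com",
+            ... )
+            >>> print(result.markdown_with_citations)
+            >>> print(result.references_markdown)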
+ """ + try: + # Initialize HTML2Text with default options for better conversion + h = CustomHTML2Text(baseurl=base_url) + default_options = { + "body_width": 0, # Disable text wrapping + "ignore_emphasis": False, + "ignore_links": False, + "ignore_images": False, + "protect_links": False, + "single_line_break": True, + "mark_code": True, + "escape_snob": False, + } + + # Update with custom options if provided + if html2text_options: + default_options.update(html2text_options) + elif options: + default_options.update(options) + elif self.options: + default_options.update(self.options) + + h.update_params(**default_options) + + # Ensure we have valid input + if not input_html: + input_html = "" + elif not isinstance(input_html, str): + input_html = str(input_html) + + # Generate raw markdown + try: + raw_markdown = h.handle(input_html) + except Exception as e: + raw_markdown = f"Error converting HTML to markdown: {str(e)}" + + raw_markdown = raw_markdown.replace(" ```", "```") + + # Convert links to citations + markdown_with_citations: str = raw_markdown + references_markdown: str = "" + if citations: + try: + ( + markdown_with_citations, + references_markdown, + ) = self.convert_links_to_citations(raw_markdown, base_url) + except Exception as e: + markdown_with_citations = raw_markdown + references_markdown = f"Error generating citations: {str(e)}" + + # Generate fit markdown if content filter is provided + fit_markdown: Optional[str] = "" + filtered_html: Optional[str] = "" + if content_filter or self.content_filter: + try: + content_filter = content_filter or self.content_filter + filtered_html = content_filter.filter_content(input_html) + filtered_html = "\n".join( + "
    <div>{}</div>
    ".format(s) for s in filtered_html + ) + fit_markdown = h.handle(filtered_html) + except Exception as e: + fit_markdown = f"Error generating fit markdown: {str(e)}" + filtered_html = "" + + return MarkdownGenerationResult( + raw_markdown=raw_markdown or "", + markdown_with_citations=markdown_with_citations or "", + references_markdown=references_markdown or "", + fit_markdown=fit_markdown or "", + fit_html=filtered_html or "", + ) + except Exception as e: + # If anything fails, return empty strings with error message + error_msg = f"Error in markdown generation: {str(e)}" + return MarkdownGenerationResult( + raw_markdown=error_msg, + markdown_with_citations=error_msg, + references_markdown="", + fit_markdown="", + fit_html="", + ) + +``` + + +## File: crawl4ai/browser_manager.py + +```py +import asyncio +import time +from typing import List, Optional +import os +import sys +import shutil +import tempfile +import subprocess +from playwright.async_api import BrowserContext +import hashlib +from .js_snippet import load_js_script +from .config import DOWNLOAD_PAGE_TIMEOUT +from .async_configs import BrowserConfig, CrawlerRunConfig +from playwright_stealth import StealthConfig +from .utils import get_chromium_path + +stealth_config = StealthConfig( + webdriver=True, + chrome_app=True, + chrome_csi=True, + chrome_load_times=True, + chrome_runtime=True, + navigator_languages=True, + navigator_plugins=True, + navigator_permissions=True, + webgl_vendor=True, + outerdimensions=True, + navigator_hardware_concurrency=True, + media_codecs=True, +) + +BROWSER_DISABLE_OPTIONS = [ + "--disable-background-networking", + "--disable-background-timer-throttling", + "--disable-backgrounding-occluded-windows", + "--disable-breakpad", + "--disable-client-side-phishing-detection", + "--disable-component-extensions-with-background-pages", + "--disable-default-apps", + "--disable-extensions", + "--disable-features=TranslateUI", + "--disable-hang-monitor", + "--disable-ipc-flooding-protection", + "--disable-popup-blocking", + "--disable-prompt-on-repost", + "--disable-sync", + "--force-color-profile=srgb", + "--metrics-recording-only", + "--no-first-run", + "--password-store=basic", + "--use-mock-keychain", +] + + +class ManagedBrowser: + """ + Manages the browser process and context. This class allows to connect to the browser using CDP protocol. + + Attributes: + browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit". + Default: "chromium". + user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a + temporary directory may be used. Default: None. + headless (bool): Whether to run the browser in headless mode (no visible GUI). + Default: True. + browser_process (subprocess.Popen): The process object for the browser. + temp_dir (str): Temporary directory for user data if not provided. + debugging_port (int): Port for debugging the browser. + host (str): Host for debugging the browser. + + Methods: + start(): Starts the browser process and returns the CDP endpoint URL. + _get_browser_path(): Returns the browser executable path based on OS and browser type. + _get_browser_args(): Returns browser-specific command line arguments. + _get_user_data_dir(): Returns the user data directory path. + _cleanup(): Terminates the browser process and removes the temporary directory. + create_profile(): Static method to create a user profile by launching a browser for user interaction. 
+ """ + + browser_type: str + user_data_dir: str + headless: bool + browser_process: subprocess.Popen + temp_dir: str + debugging_port: int + host: str + + def __init__( + self, + browser_type: str = "chromium", + user_data_dir: Optional[str] = None, + headless: bool = False, + logger=None, + host: str = "localhost", + debugging_port: int = 9222, + cdp_url: Optional[str] = None, + browser_config: Optional[BrowserConfig] = None, + ): + """ + Initialize the ManagedBrowser instance. + + Args: + browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit". + Default: "chromium". + user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a + temporary directory may be used. Default: None. + headless (bool): Whether to run the browser in headless mode (no visible GUI). + Default: True. + logger (logging.Logger): Logger instance for logging messages. Default: None. + host (str): Host for debugging the browser. Default: "localhost". + debugging_port (int): Port for debugging the browser. Default: 9222. + cdp_url (str or None): CDP URL to connect to the browser. Default: None. + browser_config (BrowserConfig): Configuration object containing all browser settings. Default: None. + """ + self.browser_type = browser_config.browser_type + self.user_data_dir = browser_config.user_data_dir + self.headless = browser_config.headless + self.browser_process = None + self.temp_dir = None + self.debugging_port = browser_config.debugging_port + self.host = browser_config.host + self.logger = logger + self.shutting_down = False + self.cdp_url = browser_config.cdp_url + self.browser_config = browser_config + + async def start(self) -> str: + """ + Starts the browser process or returns CDP endpoint URL. + If cdp_url is provided, returns it directly. + If user_data_dir is not provided for local browser, creates a temporary directory. + + Returns: + str: CDP endpoint URL + """ + # If CDP URL provided, just return it + if self.cdp_url: + return self.cdp_url + + # Create temp dir if needed + if not self.user_data_dir: + self.temp_dir = tempfile.mkdtemp(prefix="browser-profile-") + self.user_data_dir = self.temp_dir + + # Get browser path and args based on OS and browser type + # browser_path = self._get_browser_path() + args = await self._get_browser_args() + + if self.browser_config.extra_args: + args.extend(self.browser_config.extra_args) + + # Start browser process + try: + # Use DETACHED_PROCESS flag on Windows to fully detach the process + # On Unix, we'll use preexec_fn=os.setpgrp to start the process in a new process group + if sys.platform == "win32": + self.browser_process = subprocess.Popen( + args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + creationflags=subprocess.DETACHED_PROCESS | subprocess.CREATE_NEW_PROCESS_GROUP + ) + else: + self.browser_process = subprocess.Popen( + args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + preexec_fn=os.setpgrp # Start in a new process group + ) + + # We'll monitor for a short time to make sure it starts properly, but won't keep monitoring + await asyncio.sleep(0.5) # Give browser time to start + await self._initial_startup_check() + await asyncio.sleep(2) # Give browser time to start + return f"http://{self.host}:{self.debugging_port}" + except Exception as e: + await self.cleanup() + raise Exception(f"Failed to start browser: {e}") + + async def _initial_startup_check(self): + """ + Perform a quick check to make sure the browser started successfully. 
+ This only runs once at startup rather than continuously monitoring. + """ + if not self.browser_process: + return + + # Check that process started without immediate termination + await asyncio.sleep(0.5) + if self.browser_process.poll() is not None: + # Process already terminated + stdout, stderr = b"", b"" + try: + stdout, stderr = self.browser_process.communicate(timeout=0.5) + except subprocess.TimeoutExpired: + pass + + self.logger.error( + message="Browser process terminated during startup | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}", + tag="ERROR", + params={ + "code": self.browser_process.returncode, + "stdout": stdout.decode() if stdout else "", + "stderr": stderr.decode() if stderr else "", + }, + ) + + async def _monitor_browser_process(self): + """ + Monitor the browser process for unexpected termination. + + How it works: + 1. Read stdout and stderr from the browser process. + 2. If the process has terminated, log the error message and terminate the browser. + 3. If the shutting_down flag is set, log the normal termination message. + 4. If any other error occurs, log the error message. + + Note: This method should be called in a separate task to avoid blocking the main event loop. + This is DEPRECATED and should not be used for builtin browsers that need to outlive the Python process. + """ + if self.browser_process: + try: + stdout, stderr = await asyncio.gather( + asyncio.to_thread(self.browser_process.stdout.read), + asyncio.to_thread(self.browser_process.stderr.read), + ) + + # Check shutting_down flag BEFORE logging anything + if self.browser_process.poll() is not None: + if not self.shutting_down: + self.logger.error( + message="Browser process terminated unexpectedly | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}", + tag="ERROR", + params={ + "code": self.browser_process.returncode, + "stdout": stdout.decode(), + "stderr": stderr.decode(), + }, + ) + await self.cleanup() + else: + self.logger.info( + message="Browser process terminated normally | Code: {code}", + tag="INFO", + params={"code": self.browser_process.returncode}, + ) + except Exception as e: + if not self.shutting_down: + self.logger.error( + message="Error monitoring browser process: {error}", + tag="ERROR", + params={"error": str(e)}, + ) + + def _get_browser_path_WIP(self) -> str: + """Returns the browser executable path based on OS and browser type""" + if sys.platform == "darwin": # macOS + paths = { + "chromium": "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", + "firefox": "/Applications/Firefox.app/Contents/MacOS/firefox", + "webkit": "/Applications/Safari.app/Contents/MacOS/Safari", + } + elif sys.platform == "win32": # Windows + paths = { + "chromium": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", + "firefox": "C:\\Program Files\\Mozilla Firefox\\firefox.exe", + "webkit": None, # WebKit not supported on Windows + } + else: # Linux + paths = { + "chromium": "google-chrome", + "firefox": "firefox", + "webkit": None, # WebKit not supported on Linux + } + + return paths.get(self.browser_type) + + async def _get_browser_path(self) -> str: + browser_path = await get_chromium_path(self.browser_type) + return browser_path + + async def _get_browser_args(self) -> List[str]: + """Returns browser-specific command line arguments""" + base_args = [await self._get_browser_path()] + + if self.browser_type == "chromium": + args = [ + f"--remote-debugging-port={self.debugging_port}", + f"--user-data-dir={self.user_data_dir}", + ] + if self.headless: + 
args.append("--headless=new") + elif self.browser_type == "firefox": + args = [ + "--remote-debugging-port", + str(self.debugging_port), + "--profile", + self.user_data_dir, + ] + if self.headless: + args.append("--headless") + else: + raise NotImplementedError(f"Browser type {self.browser_type} not supported") + + return base_args + args + + async def cleanup(self): + """Cleanup browser process and temporary directory""" + # Set shutting_down flag BEFORE any termination actions + self.shutting_down = True + + if self.browser_process: + try: + # For builtin browsers that should persist, we should check if it's a detached process + # Only terminate if we have proper control over the process + if not self.browser_process.poll(): + # Process is still running + self.browser_process.terminate() + # Wait for process to end gracefully + for _ in range(10): # 10 attempts, 100ms each + if self.browser_process.poll() is not None: + break + await asyncio.sleep(0.1) + + # Force kill if still running + if self.browser_process.poll() is None: + if sys.platform == "win32": + # On Windows we might need taskkill for detached processes + try: + subprocess.run(["taskkill", "/F", "/PID", str(self.browser_process.pid)]) + except Exception: + self.browser_process.kill() + else: + self.browser_process.kill() + await asyncio.sleep(0.1) # Brief wait for kill to take effect + + except Exception as e: + self.logger.error( + message="Error terminating browser: {error}", + tag="ERROR", + params={"error": str(e)}, + ) + + if self.temp_dir and os.path.exists(self.temp_dir): + try: + shutil.rmtree(self.temp_dir) + except Exception as e: + self.logger.error( + message="Error removing temporary directory: {error}", + tag="ERROR", + params={"error": str(e)}, + ) + + # These methods have been moved to BrowserProfiler class + @staticmethod + async def create_profile(browser_config=None, profile_name=None, logger=None): + """ + This method has been moved to the BrowserProfiler class. + + Creates a browser profile by launching a browser for interactive user setup + and waits until the user closes it. The profile is stored in a directory that + can be used later with BrowserConfig.user_data_dir. + + Please use BrowserProfiler.create_profile() instead. + + Example: + ```python + from crawl4ai.browser_profiler import BrowserProfiler + + profiler = BrowserProfiler() + profile_path = await profiler.create_profile(profile_name="my-login-profile") + ``` + """ + from .browser_profiler import BrowserProfiler + + # Create a BrowserProfiler instance and delegate to it + profiler = BrowserProfiler(logger=logger) + return await profiler.create_profile(profile_name=profile_name, browser_config=browser_config) + + @staticmethod + def list_profiles(): + """ + This method has been moved to the BrowserProfiler class. + + Lists all available browser profiles in the Crawl4AI profiles directory. + + Please use BrowserProfiler.list_profiles() instead. + + Example: + ```python + from crawl4ai.browser_profiler import BrowserProfiler + + profiler = BrowserProfiler() + profiles = profiler.list_profiles() + ``` + """ + from .browser_profiler import BrowserProfiler + + # Create a BrowserProfiler instance and delegate to it + profiler = BrowserProfiler() + return profiler.list_profiles() + + @staticmethod + def delete_profile(profile_name_or_path): + """ + This method has been moved to the BrowserProfiler class. + + Delete a browser profile by name or path. + + Please use BrowserProfiler.delete_profile() instead. 
+ + Example: + ```python + from crawl4ai.browser_profiler import BrowserProfiler + + profiler = BrowserProfiler() + success = profiler.delete_profile("my-profile") + ``` + """ + from .browser_profiler import BrowserProfiler + + # Create a BrowserProfiler instance and delegate to it + profiler = BrowserProfiler() + return profiler.delete_profile(profile_name_or_path) + + + + +class BrowserManager: + """ + Manages the browser instance and context. + + Attributes: + config (BrowserConfig): Configuration object containing all browser settings + logger: Logger instance for recording events and errors + browser (Browser): The browser instance + default_context (BrowserContext): The default browser context + managed_browser (ManagedBrowser): The managed browser instance + playwright (Playwright): The Playwright instance + sessions (dict): Dictionary to store session information + session_ttl (int): Session timeout in seconds + """ + + _playwright_instance = None + + @classmethod + async def get_playwright(cls): + from playwright.async_api import async_playwright + cls._playwright_instance = await async_playwright().start() + return cls._playwright_instance + + def __init__(self, browser_config: BrowserConfig, logger=None): + """ + Initialize the BrowserManager with a browser configuration. + + Args: + browser_config (BrowserConfig): Configuration object containing all browser settings + logger: Logger instance for recording events and errors + """ + self.config: BrowserConfig = browser_config + self.logger = logger + + # Browser state + self.browser = None + self.default_context = None + self.managed_browser = None + self.playwright = None + + # Session management + self.sessions = {} + self.session_ttl = 1800 # 30 minutes + + # Keep track of contexts by a "config signature," so each unique config reuses a single context + self.contexts_by_config = {} + self._contexts_lock = asyncio.Lock() + + # Initialize ManagedBrowser if needed + if self.config.use_managed_browser: + self.managed_browser = ManagedBrowser( + browser_type=self.config.browser_type, + user_data_dir=self.config.user_data_dir, + headless=self.config.headless, + logger=self.logger, + debugging_port=self.config.debugging_port, + cdp_url=self.config.cdp_url, + browser_config=self.config, + ) + + async def start(self): + """ + Start the browser instance and set up the default context. + + How it works: + 1. Check if Playwright is already initialized. + 2. If not, initialize Playwright. + 3. If managed browser is used, start it and connect to the CDP endpoint. + 4. If managed browser is not used, launch the browser and set up the default context. + + Note: This method should be called in a separate task to avoid blocking the main event loop. 
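+
+        Example (illustrative sketch):
+            >>> manager = BrowserManager(browser_config=BrowserConfig(headless=True))
+            >>> await manager.start()
+            >>> # pages are then obtained via manager.get_page(crawler_run_config)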
+ """ + if self.playwright is not None: + await self.close() + + from playwright.async_api import async_playwright + + self.playwright = await async_playwright().start() + + if self.config.cdp_url or self.config.use_managed_browser: + self.config.use_managed_browser = True + cdp_url = await self.managed_browser.start() if not self.config.cdp_url else self.config.cdp_url + self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) + contexts = self.browser.contexts + if contexts: + self.default_context = contexts[0] + else: + self.default_context = await self.create_browser_context() + await self.setup_context(self.default_context) + else: + browser_args = self._build_browser_args() + + # Launch appropriate browser type + if self.config.browser_type == "firefox": + self.browser = await self.playwright.firefox.launch(**browser_args) + elif self.config.browser_type == "webkit": + self.browser = await self.playwright.webkit.launch(**browser_args) + else: + self.browser = await self.playwright.chromium.launch(**browser_args) + + self.default_context = self.browser + + + def _build_browser_args(self) -> dict: + """Build browser launch arguments from config.""" + args = [ + "--disable-gpu", + "--disable-gpu-compositing", + "--disable-software-rasterizer", + "--no-sandbox", + "--disable-dev-shm-usage", + "--no-first-run", + "--no-default-browser-check", + "--disable-infobars", + "--window-position=0,0", + "--ignore-certificate-errors", + "--ignore-certificate-errors-spki-list", + "--disable-blink-features=AutomationControlled", + "--window-position=400,0", + "--disable-renderer-backgrounding", + "--disable-ipc-flooding-protection", + "--force-color-profile=srgb", + "--mute-audio", + "--disable-background-timer-throttling", + # "--single-process", + f"--window-size={self.config.viewport_width},{self.config.viewport_height}", + ] + + if self.config.light_mode: + args.extend(BROWSER_DISABLE_OPTIONS) + + if self.config.text_mode: + args.extend( + [ + "--blink-settings=imagesEnabled=false", + "--disable-remote-fonts", + "--disable-images", + "--disable-javascript", + "--disable-software-rasterizer", + "--disable-dev-shm-usage", + ] + ) + + if self.config.extra_args: + args.extend(self.config.extra_args) + + # Deduplicate args + args = list(dict.fromkeys(args)) + + browser_args = {"headless": self.config.headless, "args": args} + + if self.config.chrome_channel: + browser_args["channel"] = self.config.chrome_channel + + if self.config.accept_downloads: + browser_args["downloads_path"] = self.config.downloads_path or os.path.join( + os.getcwd(), "downloads" + ) + os.makedirs(browser_args["downloads_path"], exist_ok=True) + + if self.config.proxy or self.config.proxy_config: + from playwright.async_api import ProxySettings + + proxy_settings = ( + ProxySettings(server=self.config.proxy) + if self.config.proxy + else ProxySettings( + server=self.config.proxy_config.server, + username=self.config.proxy_config.username, + password=self.config.proxy_config.password, + ) + ) + browser_args["proxy"] = proxy_settings + + return browser_args + + async def setup_context( + self, + context: BrowserContext, + crawlerRunConfig: CrawlerRunConfig = None, + is_default=False, + ): + """ + Set up a browser context with the configured options. + + How it works: + 1. Set extra HTTP headers if provided. + 2. Add cookies if provided. + 3. Load storage state if provided. + 4. Accept downloads if enabled. + 5. Set default timeouts for navigation and download. + 6. Set user agent if provided. + 7. 
Set browser hints if provided. + 8. Set proxy if provided. + 9. Set downloads path if provided. + 10. Set storage state if provided. + 11. Set cache if provided. + 12. Set extra HTTP headers if provided. + 13. Add cookies if provided. + 14. Set default timeouts for navigation and download if enabled. + 15. Set user agent if provided. + 16. Set browser hints if provided. + + Args: + context (BrowserContext): The browser context to set up + crawlerRunConfig (CrawlerRunConfig): Configuration object containing all browser settings + is_default (bool): Flag indicating if this is the default context + Returns: + None + """ + if self.config.headers: + await context.set_extra_http_headers(self.config.headers) + + if self.config.cookies: + await context.add_cookies(self.config.cookies) + + if self.config.storage_state: + await context.storage_state(path=None) + + if self.config.accept_downloads: + context.set_default_timeout(DOWNLOAD_PAGE_TIMEOUT) + context.set_default_navigation_timeout(DOWNLOAD_PAGE_TIMEOUT) + if self.config.downloads_path: + context._impl_obj._options["accept_downloads"] = True + context._impl_obj._options[ + "downloads_path" + ] = self.config.downloads_path + + # Handle user agent and browser hints + if self.config.user_agent: + combined_headers = { + "User-Agent": self.config.user_agent, + "sec-ch-ua": self.config.browser_hint, + } + combined_headers.update(self.config.headers) + await context.set_extra_http_headers(combined_headers) + + # Add default cookie + await context.add_cookies( + [ + { + "name": "cookiesEnabled", + "value": "true", + "url": crawlerRunConfig.url + if crawlerRunConfig and crawlerRunConfig.url + else "https://crawl4ai.com/", + } + ] + ) + + # Handle navigator overrides + if crawlerRunConfig: + if ( + crawlerRunConfig.override_navigator + or crawlerRunConfig.simulate_user + or crawlerRunConfig.magic + ): + await context.add_init_script(load_js_script("navigator_overrider")) + + async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None): + """ + Creates and returns a new browser context with configured settings. + Applies text-only mode settings if text_mode is enabled in config. 
+ + Returns: + Context: Browser context object with the specified configurations + """ + # Base settings + user_agent = self.config.headers.get("User-Agent", self.config.user_agent) + viewport_settings = { + "width": self.config.viewport_width, + "height": self.config.viewport_height, + } + proxy_settings = {"server": self.config.proxy} if self.config.proxy else None + + blocked_extensions = [ + # Images + "jpg", + "jpeg", + "png", + "gif", + "webp", + "svg", + "ico", + "bmp", + "tiff", + "psd", + # Fonts + "woff", + "woff2", + "ttf", + "otf", + "eot", + # Styles + # 'css', 'less', 'scss', 'sass', + # Media + "mp4", + "webm", + "ogg", + "avi", + "mov", + "wmv", + "flv", + "m4v", + "mp3", + "wav", + "aac", + "m4a", + "opus", + "flac", + # Documents + "pdf", + "doc", + "docx", + "xls", + "xlsx", + "ppt", + "pptx", + # Archives + "zip", + "rar", + "7z", + "tar", + "gz", + # Scripts and data + "xml", + "swf", + "wasm", + ] + + # Common context settings + context_settings = { + "user_agent": user_agent, + "viewport": viewport_settings, + "proxy": proxy_settings, + "accept_downloads": self.config.accept_downloads, + "storage_state": self.config.storage_state, + "ignore_https_errors": self.config.ignore_https_errors, + "device_scale_factor": 1.0, + "java_script_enabled": self.config.java_script_enabled, + } + + if crawlerRunConfig: + # Check if there is value for crawlerRunConfig.proxy_config set add that to context + if crawlerRunConfig.proxy_config: + proxy_settings = { + "server": crawlerRunConfig.proxy_config.server, + } + if crawlerRunConfig.proxy_config.username: + proxy_settings.update({ + "username": crawlerRunConfig.proxy_config.username, + "password": crawlerRunConfig.proxy_config.password, + }) + context_settings["proxy"] = proxy_settings + + if self.config.text_mode: + text_mode_settings = { + "has_touch": False, + "is_mobile": False, + } + # Update context settings with text mode settings + context_settings.update(text_mode_settings) + + # Create and return the context with all settings + context = await self.browser.new_context(**context_settings) + + # Apply text mode settings if enabled + if self.config.text_mode: + # Create and apply route patterns for each extension + for ext in blocked_extensions: + await context.route(f"**/*.{ext}", lambda route: route.abort()) + return context + + def _make_config_signature(self, crawlerRunConfig: CrawlerRunConfig) -> str: + """ + Converts the crawlerRunConfig into a dict, excludes ephemeral fields, + then returns a hash of the sorted JSON. This yields a stable signature + that identifies configurations requiring a unique browser context. + """ + import json + + config_dict = crawlerRunConfig.__dict__.copy() + # Exclude items that do not affect browser-level setup. + # Expand or adjust as needed, e.g. chunking_strategy is purely for data extraction, not for browser config. + ephemeral_keys = [ + "session_id", + "js_code", + "scraping_strategy", + "extraction_strategy", + "chunking_strategy", + "cache_mode", + "content_filter", + "semaphore_count", + "url" + ] + for key in ephemeral_keys: + if key in config_dict: + del config_dict[key] + # Convert to canonical JSON string + signature_json = json.dumps(config_dict, sort_keys=True, default=str) + + # Hash the JSON so we get a compact, unique string + signature_hash = hashlib.sha256(signature_json.encode("utf-8")).hexdigest() + return signature_hash + + async def get_page(self, crawlerRunConfig: CrawlerRunConfig): + """ + Get a page for the given session ID, creating a new one if needed. 
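+        Lookup order: reuse an existing session matched by session_id, else use the
+        shared default context when a managed browser is in use, else create or reuse
+        a context cached under the run config's signature.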
+ + Args: + crawlerRunConfig (CrawlerRunConfig): Configuration object containing all browser settings + + Returns: + (page, context): The Page and its BrowserContext + """ + self._cleanup_expired_sessions() + + # If a session_id is provided and we already have it, reuse that page + context + if crawlerRunConfig.session_id and crawlerRunConfig.session_id in self.sessions: + context, page, _ = self.sessions[crawlerRunConfig.session_id] + # Update last-used timestamp + self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) + return page, context + + # If using a managed browser, just grab the shared default_context + if self.config.use_managed_browser: + context = self.default_context + pages = context.pages + page = next((p for p in pages if p.url == crawlerRunConfig.url), None) + if not page: + page = await context.new_page() + else: + # Otherwise, check if we have an existing context for this config + config_signature = self._make_config_signature(crawlerRunConfig) + + async with self._contexts_lock: + if config_signature in self.contexts_by_config: + context = self.contexts_by_config[config_signature] + else: + # Create and setup a new context + context = await self.create_browser_context(crawlerRunConfig) + await self.setup_context(context, crawlerRunConfig) + self.contexts_by_config[config_signature] = context + + # Create a new page from the chosen context + page = await context.new_page() + + # If a session_id is specified, store this session so we can reuse later + if crawlerRunConfig.session_id: + self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) + + return page, context + + async def kill_session(self, session_id: str): + """ + Kill a browser session and clean up resources. + + Args: + session_id (str): The session ID to kill. + """ + if session_id in self.sessions: + context, page, _ = self.sessions[session_id] + await page.close() + if not self.config.use_managed_browser: + await context.close() + del self.sessions[session_id] + + def _cleanup_expired_sessions(self): + """Clean up expired sessions based on TTL.""" + current_time = time.time() + expired_sessions = [ + sid + for sid, (_, _, last_used) in self.sessions.items() + if current_time - last_used > self.session_ttl + ] + for sid in expired_sessions: + asyncio.create_task(self.kill_session(sid)) + + async def close(self): + """Close all browser resources and clean up.""" + if self.config.cdp_url: + return + + if self.config.sleep_on_close: + await asyncio.sleep(0.5) + + session_ids = list(self.sessions.keys()) + for session_id in session_ids: + await self.kill_session(session_id) + + # Now close all contexts we created. This reclaims memory from ephemeral contexts. 
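+        # contexts_by_config was filled lazily in get_page(), one context per unique
+        # config signature, so every entry here was created (and is owned) by this manager.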
+ for ctx in self.contexts_by_config.values(): + try: + await ctx.close() + except Exception as e: + self.logger.error( + message="Error closing context: {error}", + tag="ERROR", + params={"error": str(e)} + ) + self.contexts_by_config.clear() + + if self.browser: + await self.browser.close() + self.browser = None + + if self.managed_browser: + await asyncio.sleep(0.5) + await self.managed_browser.cleanup() + self.managed_browser = None + + if self.playwright: + await self.playwright.stop() + self.playwright = None + +``` + + + + +## File: docs/examples/quickstart.py + +```py +import os, sys + +from crawl4ai import LLMConfig + +sys.path.append( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +) + +import asyncio +import time +import json +import re +from typing import Dict +from bs4 import BeautifulSoup +from pydantic import BaseModel, Field +from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator +from crawl4ai.content_filter_strategy import PruningContentFilter +from crawl4ai.extraction_strategy import ( + JsonCssExtractionStrategy, + LLMExtractionStrategy, +) + +__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) + +print("Crawl4AI: Advanced Web Crawling and Data Extraction") +print("GitHub Repository: https://github.com/unclecode/crawl4ai") +print("Twitter: @unclecode") +print("Website: https://crawl4ai.com") + + +# Basic Example - Simple Crawl +async def simple_crawl(): + print("\n--- Basic Usage ---") + browser_config = BrowserConfig(headless=True) + crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", config=crawler_config + ) + print(result.markdown[:500]) + + +async def clean_content(): + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + excluded_tags=["nav", "footer", "aside"], + remove_overlay_elements=True, + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter( + threshold=0.48, threshold_type="fixed", min_word_threshold=0 + ), + options={"ignore_links": True}, + ), + ) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://en.wikipedia.org/wiki/Apple", + config=crawler_config, + ) + full_markdown_length = len(result.markdown.raw_markdown) + fit_markdown_length = len(result.markdown.fit_markdown) + print(f"Full Markdown Length: {full_markdown_length}") + print(f"Fit Markdown Length: {fit_markdown_length}") + + +async def link_analysis(): + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.ENABLED, + exclude_external_links=True, + exclude_social_media_links=True, + ) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", + config=crawler_config, + ) + print(f"Found {len(result.links['internal'])} internal links") + print(f"Found {len(result.links['external'])} external links") + + for link in result.links["internal"][:5]: + print(f"Href: {link['href']}\nText: {link['text']}\n") + + +# JavaScript Execution Example +async def simple_example_with_running_js_code(): + print("\n--- Executing JavaScript and Using CSS Selectors ---") + + browser_config = BrowserConfig(headless=True, java_script_enabled=True) + + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + js_code="const loadMoreButton = 
Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();", + # wait_for="() => { return Array.from(document.querySelectorAll('article.tease-card')).length > 10; }" + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", config=crawler_config + ) + print(result.markdown[:500]) + + +# CSS Selector Example +async def simple_example_with_css_selector(): + print("\n--- Using CSS Selectors ---") + browser_config = BrowserConfig(headless=True) + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, css_selector=".wide-tease-item__description" + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", config=crawler_config + ) + print(result.markdown[:500]) + + +async def media_handling(): + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, exclude_external_images=True, screenshot=True + ) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", config=crawler_config + ) + for img in result.media["images"][:5]: + print(f"Image URL: {img['src']}, Alt: {img['alt']}, Score: {img['score']}") + + +async def custom_hook_workflow(verbose=True): + async with AsyncWebCrawler() as crawler: + # Set a 'before_goto' hook to run custom code just before navigation + crawler.crawler_strategy.set_hook( + "before_goto", + lambda page, context: print("[Hook] Preparing to navigate..."), + ) + + # Perform the crawl operation + result = await crawler.arun(url="https://crawl4ai.com") + print(result.markdown.raw_markdown[:500].replace("\n", " -- ")) + + +# Proxy Example +async def use_proxy(): + print("\n--- Using a Proxy ---") + browser_config = BrowserConfig( + headless=True, + proxy_config={ + "server": "http://proxy.example.com:8080", + "username": "username", + "password": "password", + }, + ) + crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", config=crawler_config + ) + if result.success: + print(result.markdown[:500]) + + +# Screenshot Example +async def capture_and_save_screenshot(url: str, output_path: str): + browser_config = BrowserConfig(headless=True) + crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, screenshot=True) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url=url, config=crawler_config) + + if result.success and result.screenshot: + import base64 + + screenshot_data = base64.b64decode(result.screenshot) + with open(output_path, "wb") as f: + f.write(screenshot_data) + print(f"Screenshot saved successfully to {output_path}") + else: + print("Failed to capture screenshot") + + +# LLM Extraction Example +class OpenAIModelFee(BaseModel): + model_name: str = Field(..., description="Name of the OpenAI model.") + input_fee: str = Field(..., description="Fee for input token for the OpenAI model.") + output_fee: str = Field( + ..., description="Fee for output token for the OpenAI model." 
+ ) + + +async def extract_structured_data_using_llm( + provider: str, api_token: str = None, extra_headers: Dict[str, str] = None +): + print(f"\n--- Extracting Structured Data with {provider} ---") + + if api_token is None and provider != "ollama": + print(f"API token is required for {provider}. Skipping this example.") + return + + browser_config = BrowserConfig(headless=True) + + extra_args = {"temperature": 0, "top_p": 0.9, "max_tokens": 2000} + if extra_headers: + extra_args["extra_headers"] = extra_headers + + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + word_count_threshold=1, + page_timeout=80000, + extraction_strategy=LLMExtractionStrategy( + llm_config=LLMConfig(provider=provider,api_token=api_token), + schema=OpenAIModelFee.model_json_schema(), + extraction_type="schema", + instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. + Do not miss any models in the entire content.""", + extra_args=extra_args, + ), + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://openai.com/api/pricing/", config=crawler_config + ) + print(result.extracted_content) + + +# CSS Extraction Example +async def extract_structured_data_using_css_extractor(): + print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---") + schema = { + "name": "KidoCode Courses", + "baseSelector": "section.charge-methodology .framework-collection-item.w-dyn-item", + "fields": [ + { + "name": "section_title", + "selector": "h3.heading-50", + "type": "text", + }, + { + "name": "section_description", + "selector": ".charge-content", + "type": "text", + }, + { + "name": "course_name", + "selector": ".text-block-93", + "type": "text", + }, + { + "name": "course_description", + "selector": ".course-content-text", + "type": "text", + }, + { + "name": "course_icon", + "selector": ".image-92", + "type": "attribute", + "attribute": "src", + }, + ], + } + + browser_config = BrowserConfig(headless=True, java_script_enabled=True) + + js_click_tabs = """ + (async () => { + const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div"); + for(let tab of tabs) { + tab.scrollIntoView(); + tab.click(); + await new Promise(r => setTimeout(r, 500)); + } + })(); + """ + + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + extraction_strategy=JsonCssExtractionStrategy(schema), + js_code=[js_click_tabs], + delay_before_return_html=1 + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://www.kidocode.com/degrees/technology", config=crawler_config + ) + + companies = json.loads(result.extracted_content) + print(f"Successfully extracted {len(companies)} companies") + print(json.dumps(companies[0], indent=2)) + + +# Dynamic Content Examples - Method 1 +async def crawl_dynamic_content_pages_method_1(): + print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---") + first_commit = "" + + async def on_execution_started(page, **kwargs): + nonlocal first_commit + try: + while True: + await page.wait_for_selector("li.Box-sc-g0xbh4-0 h4") + commit = await page.query_selector("li.Box-sc-g0xbh4-0 h4") + commit = await commit.evaluate("(element) => element.textContent") + commit = re.sub(r"\s+", "", commit) + if commit and commit != first_commit: + first_commit = commit + break + await asyncio.sleep(0.5) + except Exception as e: + print(f"Warning: New content didn't appear 
after JavaScript execution: {e}") + + browser_config = BrowserConfig(headless=False, java_script_enabled=True) + + async with AsyncWebCrawler(config=browser_config) as crawler: + crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started) + + url = "https://github.com/microsoft/TypeScript/commits/main" + session_id = "typescript_commits_session" + all_commits = [] + + js_next_page = """ + const button = document.querySelector('a[data-testid="pagination-next-button"]'); + if (button) button.click(); + """ + + for page in range(3): + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + css_selector="li.Box-sc-g0xbh4-0", + js_code=js_next_page if page > 0 else None, + js_only=page > 0, + session_id=session_id, + ) + + result = await crawler.arun(url=url, config=crawler_config) + assert result.success, f"Failed to crawl page {page + 1}" + + soup = BeautifulSoup(result.cleaned_html, "html.parser") + commits = soup.select("li") + all_commits.extend(commits) + + print(f"Page {page + 1}: Found {len(commits)} commits") + + print(f"Successfully crawled {len(all_commits)} commits across 3 pages") + + +# Dynamic Content Examples - Method 2 +async def crawl_dynamic_content_pages_method_2(): + print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---") + + browser_config = BrowserConfig(headless=False, java_script_enabled=True) + + js_next_page_and_wait = """ + (async () => { + const getCurrentCommit = () => { + const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4'); + return commits.length > 0 ? commits[0].textContent.trim() : null; + }; + + const initialCommit = getCurrentCommit(); + const button = document.querySelector('a[data-testid="pagination-next-button"]'); + if (button) button.click(); + + while (true) { + await new Promise(resolve => setTimeout(resolve, 100)); + const newCommit = getCurrentCommit(); + if (newCommit && newCommit !== initialCommit) { + break; + } + } + })(); + """ + + schema = { + "name": "Commit Extractor", + "baseSelector": "li.Box-sc-g0xbh4-0", + "fields": [ + { + "name": "title", + "selector": "h4.markdown-title", + "type": "text", + "transform": "strip", + }, + ], + } + + async with AsyncWebCrawler(config=browser_config) as crawler: + url = "https://github.com/microsoft/TypeScript/commits/main" + session_id = "typescript_commits_session" + all_commits = [] + + extraction_strategy = JsonCssExtractionStrategy(schema) + + for page in range(3): + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + css_selector="li.Box-sc-g0xbh4-0", + extraction_strategy=extraction_strategy, + js_code=js_next_page_and_wait if page > 0 else None, + js_only=page > 0, + session_id=session_id, + ) + + result = await crawler.arun(url=url, config=crawler_config) + assert result.success, f"Failed to crawl page {page + 1}" + + commits = json.loads(result.extracted_content) + all_commits.extend(commits) + print(f"Page {page + 1}: Found {len(commits)} commits") + + print(f"Successfully crawled {len(all_commits)} commits across 3 pages") + + +async def cosine_similarity_extraction(): + from crawl4ai.extraction_strategy import CosineStrategy + crawl_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + extraction_strategy=CosineStrategy( + word_count_threshold=10, + max_dist=0.2, # Maximum distance between two words + linkage_method="ward", # Linkage method for hierarchical clustering (ward, complete, average, single) + top_k=3, # Number of top keywords to extract + sim_threshold=0.3, # Similarity threshold for clustering + 
semantic_filter="McDonald's economic impact, American consumer trends", # Keywords to filter the content semantically using embeddings + verbose=True, + ), + ) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business/consumer/how-mcdonalds-e-coli-crisis-inflation-politics-reflect-american-story-rcna177156", + config=crawl_config, + ) + print(json.loads(result.extracted_content)[:5]) + + +# Browser Comparison +async def crawl_custom_browser_type(): + print("\n--- Browser Comparison ---") + + # Firefox + browser_config_firefox = BrowserConfig(browser_type="firefox", headless=True) + start = time.time() + async with AsyncWebCrawler(config=browser_config_firefox) as crawler: + result = await crawler.arun( + url="https://www.example.com", + config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS), + ) + print("Firefox:", time.time() - start) + print(result.markdown[:500]) + + # WebKit + browser_config_webkit = BrowserConfig(browser_type="webkit", headless=True) + start = time.time() + async with AsyncWebCrawler(config=browser_config_webkit) as crawler: + result = await crawler.arun( + url="https://www.example.com", + config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS), + ) + print("WebKit:", time.time() - start) + print(result.markdown[:500]) + + # Chromium (default) + browser_config_chromium = BrowserConfig(browser_type="chromium", headless=True) + start = time.time() + async with AsyncWebCrawler(config=browser_config_chromium) as crawler: + result = await crawler.arun( + url="https://www.example.com", + config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS), + ) + print("Chromium:", time.time() - start) + print(result.markdown[:500]) + + +# Anti-Bot and User Simulation +async def crawl_with_user_simulation(): + browser_config = BrowserConfig( + headless=True, + user_agent_mode="random", + user_agent_generator_config={"device_type": "mobile", "os_type": "android"}, + ) + + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + magic=True, + simulate_user=True, + override_navigator=True, + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url="YOUR-URL-HERE", config=crawler_config) + print(result.markdown) + + +async def ssl_certification(): + # Configure crawler to fetch SSL certificate + config = CrawlerRunConfig( + fetch_ssl_certificate=True, + cache_mode=CacheMode.BYPASS, # Bypass cache to always get fresh certificates + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="https://example.com", config=config) + + if result.success and result.ssl_certificate: + cert = result.ssl_certificate + + tmp_dir = os.path.join(__location__, "tmp") + os.makedirs(tmp_dir, exist_ok=True) + + # 1. Access certificate properties directly + print("\nCertificate Information:") + print(f"Issuer: {cert.issuer.get('CN', '')}") + print(f"Valid until: {cert.valid_until}") + print(f"Fingerprint: {cert.fingerprint}") + + # 2. 
Export certificate in different formats + cert.to_json(os.path.join(tmp_dir, "certificate.json")) # For analysis + print("\nCertificate exported to:") + print(f"- JSON: {os.path.join(tmp_dir, 'certificate.json')}") + + pem_data = cert.to_pem( + os.path.join(tmp_dir, "certificate.pem") + ) # For web servers + print(f"- PEM: {os.path.join(tmp_dir, 'certificate.pem')}") + + der_data = cert.to_der( + os.path.join(tmp_dir, "certificate.der") + ) # For Java apps + print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}") + + +# Main execution +async def main(): + # Basic examples + await simple_crawl() + await simple_example_with_running_js_code() + await simple_example_with_css_selector() + + # Advanced examples + await extract_structured_data_using_css_extractor() + await extract_structured_data_using_llm( + "openai/gpt-4o", os.getenv("OPENAI_API_KEY") + ) + await crawl_dynamic_content_pages_method_1() + await crawl_dynamic_content_pages_method_2() + + # Browser comparisons + await crawl_custom_browser_type() + + # Screenshot example + await capture_and_save_screenshot( + "https://www.example.com", + os.path.join(__location__, "tmp/example_screenshot.jpg") + ) + + +if __name__ == "__main__": + asyncio.run(main()) + +``` + + +## File: docs/examples/quickstart_examples_set_1.py + +```py +import asyncio +import os +import json +import base64 +from pathlib import Path +from typing import List +from crawl4ai import ProxyConfig + +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, CrawlResult +from crawl4ai import RoundRobinProxyStrategy +from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy +from crawl4ai import LLMConfig +from crawl4ai import PruningContentFilter, BM25ContentFilter +from crawl4ai import DefaultMarkdownGenerator +from crawl4ai import BFSDeepCrawlStrategy, DomainFilter, FilterChain +from crawl4ai import BrowserConfig + +__cur_dir__ = Path(__file__).parent + +async def demo_basic_crawl(): + """Basic web crawling with markdown generation""" + print("\n=== 1. Basic Web Crawling ===") + async with AsyncWebCrawler(config = BrowserConfig( + viewport_height=800, + viewport_width=1200, + headless=True, + verbose=True, + )) as crawler: + results: List[CrawlResult] = await crawler.arun( + url="https://news.ycombinator.com/" + ) + + for i, result in enumerate(results): + print(f"Result {i + 1}:") + print(f"Success: {result.success}") + if result.success: + print(f"Markdown length: {len(result.markdown.raw_markdown)} chars") + print(f"First 100 chars: {result.markdown.raw_markdown[:100]}...") + else: + print("Failed to crawl the URL") + +async def demo_parallel_crawl(): + """Crawl multiple URLs in parallel""" + print("\n=== 2. Parallel Crawling ===") + + urls = [ + "https://news.ycombinator.com/", + "https://example.com/", + "https://httpbin.org/html", + ] + + async with AsyncWebCrawler() as crawler: + results: List[CrawlResult] = await crawler.arun_many( + urls=urls, + ) + + print(f"Crawled {len(results)} URLs in parallel:") + for i, result in enumerate(results): + print( + f" {i + 1}. {result.url} - {'Success' if result.success else 'Failed'}" + ) + +async def demo_fit_markdown(): + """Generate focused markdown with LLM content filter""" + print("\n=== 3. 
Fit Markdown with LLM Content Filter ===") + + async with AsyncWebCrawler() as crawler: + result: CrawlResult = await crawler.arun( + url = "https://en.wikipedia.org/wiki/Python_(programming_language)", + config=CrawlerRunConfig( + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter() + ) + ), + ) + + # Print stats and save the fit markdown + print(f"Raw: {len(result.markdown.raw_markdown)} chars") + print(f"Fit: {len(result.markdown.fit_markdown)} chars") + +async def demo_llm_structured_extraction_no_schema(): + # Create a simple LLM extraction strategy (no schema required) + extraction_strategy = LLMExtractionStrategy( + llm_config=LLMConfig( + provider="groq/qwen-2.5-32b", + api_token="env:GROQ_API_KEY", + ), + instruction="This is news.ycombinator.com, extract all news, and for each, I want title, source url, number of comments.", + extract_type="schema", + schema="{title: string, url: string, comments: int}", + extra_args={ + "temperature": 0.0, + "max_tokens": 4096, + }, + verbose=True, + ) + + config = CrawlerRunConfig(extraction_strategy=extraction_strategy) + + async with AsyncWebCrawler() as crawler: + results: List[CrawlResult] = await crawler.arun( + "https://news.ycombinator.com/", config=config + ) + + for result in results: + print(f"URL: {result.url}") + print(f"Success: {result.success}") + if result.success: + data = json.loads(result.extracted_content) + print(json.dumps(data, indent=2)) + else: + print("Failed to extract structured data") + +async def demo_css_structured_extraction_no_schema(): + """Extract structured data using CSS selectors""" + print("\n=== 5. CSS-Based Structured Extraction ===") + # Sample HTML for schema generation (one-time cost) + sample_html = """ +
+    <div class="body-post clear">
+        <a class="story-link" href="...">
+            <div class="clear home-post-box cf">
+                <div class="home-img clear">
+                    <img src="..." alt="...">
+                </div>
+                <div class="clear home-right">
+                    <h2 class="home-title">Malicious Python Packages on PyPI Downloaded 39,000+ Times, Steal Sensitive Data</h2>
+                    <div class="item-label">
+                        <span class="h-datetime">Apr 05, 2025</span>
+                        <span class="h-tags">Malware / Supply Chain Attack</span>
+                    </div>
+                    <div class="home-desc">Cybersecurity researchers have...</div>
+                </div>
+            </div>
+        </a>
+    </div>
    + """ + + # Check if schema file exists + schema_file_path = f"{__cur_dir__}/tmp/schema.json" + if os.path.exists(schema_file_path): + with open(schema_file_path, "r") as f: + schema = json.load(f) + else: + # Generate schema using LLM (one-time setup) + schema = JsonCssExtractionStrategy.generate_schema( + html=sample_html, + llm_config=LLMConfig( + provider="groq/qwen-2.5-32b", + api_token="env:GROQ_API_KEY", + ), + query="From https://thehackernews.com/, I have shared a sample of one news div with a title, date, and description. Please generate a schema for this news div.", + ) + + print(f"Generated schema: {json.dumps(schema, indent=2)}") + # Save the schema to a file , and use it for future extractions, in result for such extraction you will call LLM once + with open(f"{__cur_dir__}/tmp/schema.json", "w") as f: + json.dump(schema, f, indent=2) + + # Create no-LLM extraction strategy with the generated schema + extraction_strategy = JsonCssExtractionStrategy(schema) + config = CrawlerRunConfig(extraction_strategy=extraction_strategy) + + # Use the fast CSS extraction (no LLM calls during extraction) + async with AsyncWebCrawler() as crawler: + results: List[CrawlResult] = await crawler.arun( + "https://thehackernews.com", config=config + ) + + for result in results: + print(f"URL: {result.url}") + print(f"Success: {result.success}") + if result.success: + data = json.loads(result.extracted_content) + print(json.dumps(data, indent=2)) + else: + print("Failed to extract structured data") + +async def demo_deep_crawl(): + """Deep crawling with BFS strategy""" + print("\n=== 6. Deep Crawling ===") + + filter_chain = FilterChain([DomainFilter(allowed_domains=["crawl4ai.com"])]) + + deep_crawl_strategy = BFSDeepCrawlStrategy( + max_depth=1, max_pages=5, filter_chain=filter_chain + ) + + async with AsyncWebCrawler() as crawler: + results: List[CrawlResult] = await crawler.arun( + url="https://docs.crawl4ai.com", + config=CrawlerRunConfig(deep_crawl_strategy=deep_crawl_strategy), + ) + + print(f"Deep crawl returned {len(results)} pages:") + for i, result in enumerate(results): + depth = result.metadata.get("depth", "unknown") + print(f" {i + 1}. {result.url} (Depth: {depth})") + +async def demo_js_interaction(): + """Execute JavaScript to load more content""" + print("\n=== 7. 
JavaScript Interaction ===") + + # A simple page that needs JS to reveal content + async with AsyncWebCrawler(config=BrowserConfig(headless=False)) as crawler: + # Initial load + + news_schema = { + "name": "news", + "baseSelector": "tr.athing", + "fields": [ + { + "name": "title", + "selector": "span.titleline", + "type": "text", + } + ], + } + results: List[CrawlResult] = await crawler.arun( + url="https://news.ycombinator.com", + config=CrawlerRunConfig( + session_id="hn_session", # Keep session + extraction_strategy=JsonCssExtractionStrategy(schema=news_schema), + ), + ) + + news = [] + for result in results: + if result.success: + data = json.loads(result.extracted_content) + news.extend(data) + print(json.dumps(data, indent=2)) + else: + print("Failed to extract structured data") + + print(f"Initial items: {len(news)}") + + # Click "More" link + more_config = CrawlerRunConfig( + js_code="document.querySelector('a.morelink').click();", + js_only=True, # Continue in same page + session_id="hn_session", # Keep session + extraction_strategy=JsonCssExtractionStrategy( + schema=news_schema, + ), + ) + + result: List[CrawlResult] = await crawler.arun( + url="https://news.ycombinator.com", config=more_config + ) + + # Extract new items + for result in results: + if result.success: + data = json.loads(result.extracted_content) + news.extend(data) + print(json.dumps(data, indent=2)) + else: + print("Failed to extract structured data") + print(f"Total items: {len(news)}") + +async def demo_media_and_links(): + """Extract media and links from a page""" + print("\n=== 8. Media and Links Extraction ===") + + async with AsyncWebCrawler() as crawler: + result: List[CrawlResult] = await crawler.arun("https://en.wikipedia.org/wiki/Main_Page") + + for i, result in enumerate(result): + # Extract and save all images + images = result.media.get("images", []) + print(f"Found {len(images)} images") + + # Extract and save all links (internal and external) + internal_links = result.links.get("internal", []) + external_links = result.links.get("external", []) + print(f"Found {len(internal_links)} internal links") + print(f"Found {len(external_links)} external links") + + # Print some of the images and links + for image in images[:3]: + print(f"Image: {image['src']}") + for link in internal_links[:3]: + print(f"Internal link: {link['href']}") + for link in external_links[:3]: + print(f"External link: {link['href']}") + + # # Save everything to files + with open(f"{__cur_dir__}/tmp/images.json", "w") as f: + json.dump(images, f, indent=2) + + with open(f"{__cur_dir__}/tmp/links.json", "w") as f: + json.dump( + {"internal": internal_links, "external": external_links}, + f, + indent=2, + ) + +async def demo_screenshot_and_pdf(): + """Capture screenshot and PDF of a page""" + print("\n=== 9. 
Screenshot and PDF Capture ===") + + async with AsyncWebCrawler() as crawler: + result: List[CrawlResult] = await crawler.arun( + # url="https://example.com", + url="https://en.wikipedia.org/wiki/Giant_anteater", + config=CrawlerRunConfig(screenshot=True, pdf=True), + ) + + for i, result in enumerate(result): + # if result.screenshot_data: + if result.screenshot: + # Save screenshot + screenshot_path = f"{__cur_dir__}/tmp/example_screenshot.png" + with open(screenshot_path, "wb") as f: + f.write(base64.b64decode(result.screenshot)) + print(f"Screenshot saved to {screenshot_path}") + + # if result.pdf_data: + if result.pdf: + # Save PDF + pdf_path = f"{__cur_dir__}/tmp/example.pdf" + with open(pdf_path, "wb") as f: + f.write(result.pdf) + print(f"PDF saved to {pdf_path}") + +async def demo_proxy_rotation(): + """Proxy rotation for multiple requests""" + print("\n=== 10. Proxy Rotation ===") + + # Example proxies (replace with real ones) + proxies = [ + ProxyConfig(server="http://proxy1.example.com:8080"), + ProxyConfig(server="http://proxy2.example.com:8080"), + ] + + proxy_strategy = RoundRobinProxyStrategy(proxies) + + print(f"Using {len(proxies)} proxies in rotation") + print( + "Note: This example uses placeholder proxies - replace with real ones to test" + ) + + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig( + proxy_rotation_strategy=proxy_strategy + ) + + # In a real scenario, these would be run and the proxies would rotate + print("In a real scenario, requests would rotate through the available proxies") + +async def demo_raw_html_and_file(): + """Process raw HTML and local files""" + print("\n=== 11. Raw HTML and Local Files ===") + + raw_html = """ + +

+    <html>
+    <body>
+        <h1>Sample Article</h1>
+        <p>This is sample content for testing Crawl4AI's raw HTML processing.</p>
+    </body>
+    </html>
    + + """ + + # Save to file + file_path = Path("docs/examples/tmp/sample.html").absolute() + with open(file_path, "w") as f: + f.write(raw_html) + + async with AsyncWebCrawler() as crawler: + # Crawl raw HTML + raw_result = await crawler.arun( + url="raw:" + raw_html, config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + ) + print("Raw HTML processing:") + print(f" Markdown: {raw_result.markdown.raw_markdown[:50]}...") + + # Crawl local file + file_result = await crawler.arun( + url=f"file://{file_path}", + config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS), + ) + print("\nLocal file processing:") + print(f" Markdown: {file_result.markdown.raw_markdown[:50]}...") + + # Clean up + os.remove(file_path) + print(f"Processed both raw HTML and local file ({file_path})") + +async def main(): + """Run all demo functions sequentially""" + print("=== Comprehensive Crawl4AI Demo ===") + print("Note: Some examples require API keys or other configurations") + + # Run all demos + await demo_basic_crawl() + await demo_parallel_crawl() + await demo_fit_markdown() + await demo_llm_structured_extraction_no_schema() + await demo_css_structured_extraction_no_schema() + await demo_deep_crawl() + await demo_js_interaction() + await demo_media_and_links() + await demo_screenshot_and_pdf() + # # await demo_proxy_rotation() + await demo_raw_html_and_file() + + # Clean up any temp files that may have been created + print("\n=== Demo Complete ===") + print("Check for any generated files (screenshots, PDFs) in the current directory") + +if __name__ == "__main__": + asyncio.run(main()) + +``` + + + + +## File: docs/examples/dispatcher_example.py + +```py +import asyncio +import time +from rich import print +from rich.table import Table +from crawl4ai import ( + AsyncWebCrawler, + BrowserConfig, + CrawlerRunConfig, + MemoryAdaptiveDispatcher, + SemaphoreDispatcher, + RateLimiter, + CrawlerMonitor, + DisplayMode, + CacheMode, + LXMLWebScrapingStrategy, +) + + +async def memory_adaptive(urls, browser_config, run_config): + """Memory adaptive crawler with monitoring""" + start = time.perf_counter() + async with AsyncWebCrawler(config=browser_config) as crawler: + dispatcher = MemoryAdaptiveDispatcher( + memory_threshold_percent=70.0, + max_session_permit=10, + monitor=CrawlerMonitor( + max_visible_rows=15, display_mode=DisplayMode.DETAILED + ), + ) + results = await crawler.arun_many( + urls, config=run_config, dispatcher=dispatcher + ) + duration = time.perf_counter() - start + return len(results), duration + + +async def memory_adaptive_with_rate_limit(urls, browser_config, run_config): + """Memory adaptive crawler with rate limiting""" + start = time.perf_counter() + async with AsyncWebCrawler(config=browser_config) as crawler: + dispatcher = MemoryAdaptiveDispatcher( + memory_threshold_percent=95.0, + max_session_permit=10, + rate_limiter=RateLimiter( + base_delay=(1.0, 2.0), max_delay=30.0, max_retries=2 + ), + monitor=CrawlerMonitor( + max_visible_rows=15, display_mode=DisplayMode.DETAILED + ), + ) + results = await crawler.arun_many( + urls, config=run_config, dispatcher=dispatcher + ) + duration = time.perf_counter() - start + return len(results), duration + + +async def semaphore(urls, browser_config, run_config): + """Basic semaphore crawler""" + start = time.perf_counter() + async with AsyncWebCrawler(config=browser_config) as crawler: + dispatcher = SemaphoreDispatcher( + semaphore_count=5, + monitor=CrawlerMonitor( + max_visible_rows=15, display_mode=DisplayMode.DETAILED + ), + ) + results = await 
crawler.arun_many( + urls, config=run_config, dispatcher=dispatcher + ) + duration = time.perf_counter() - start + return len(results), duration + + +async def semaphore_with_rate_limit(urls, browser_config, run_config): + """Semaphore crawler with rate limiting""" + start = time.perf_counter() + async with AsyncWebCrawler(config=browser_config) as crawler: + dispatcher = SemaphoreDispatcher( + semaphore_count=5, + rate_limiter=RateLimiter( + base_delay=(1.0, 2.0), max_delay=30.0, max_retries=2 + ), + monitor=CrawlerMonitor( + max_visible_rows=15, display_mode=DisplayMode.DETAILED + ), + ) + results = await crawler.arun_many( + urls, config=run_config, dispatcher=dispatcher + ) + duration = time.perf_counter() - start + return len(results), duration + + +def create_performance_table(results): + """Creates a rich table showing performance results""" + table = Table(title="Crawler Strategy Performance Comparison") + table.add_column("Strategy", style="cyan") + table.add_column("URLs Crawled", justify="right", style="green") + table.add_column("Time (seconds)", justify="right", style="yellow") + table.add_column("URLs/second", justify="right", style="magenta") + + sorted_results = sorted(results.items(), key=lambda x: x[1][1]) + + for strategy, (urls_crawled, duration) in sorted_results: + urls_per_second = urls_crawled / duration + table.add_row( + strategy, str(urls_crawled), f"{duration:.2f}", f"{urls_per_second:.2f}" + ) + + return table + + +async def main(): + urls = [f"https://example.com/page{i}" for i in range(1, 40)] + browser_config = BrowserConfig(headless=True, verbose=False) + run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, scraping_strategy=LXMLWebScrapingStrategy()) + + results = { + "Memory Adaptive": await memory_adaptive(urls, browser_config, run_config), + # "Memory Adaptive + Rate Limit": await memory_adaptive_with_rate_limit( + # urls, browser_config, run_config + # ), + # "Semaphore": await semaphore(urls, browser_config, run_config), + # "Semaphore + Rate Limit": await semaphore_with_rate_limit( + # urls, browser_config, run_config + # ), + } + + table = create_performance_table(results) + print("\nPerformance Summary:") + print(table) + + +if __name__ == "__main__": + asyncio.run(main()) + +``` + + +## File: docs/examples/hello_world.py + +```py +import asyncio +from crawl4ai import ( + AsyncWebCrawler, + BrowserConfig, + CrawlerRunConfig, + CacheMode, + DefaultMarkdownGenerator, + PruningContentFilter, + CrawlResult +) + +async def example_cdp(): + browser_conf = BrowserConfig( + headless=False, + cdp_url="http://localhost:9223" + ) + crawler_config = CrawlerRunConfig( + session_id="test", + js_code = """(() => { return {"result": "Hello World!"} })()""", + js_only=True + ) + async with AsyncWebCrawler( + config=browser_conf, + verbose=True, + ) as crawler: + result : CrawlResult = await crawler.arun( + url="https://www.helloworld.org", + config=crawler_config, + ) + print(result.js_execution_result) + + +async def main(): + browser_config = BrowserConfig(headless=True, verbose=True) + async with AsyncWebCrawler(config=browser_config) as crawler: + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter( + threshold=0.48, threshold_type="fixed", min_word_threshold=0 + ) + ), + ) + result : CrawlResult = await crawler.arun( + url="https://www.helloworld.org", config=crawler_config + ) + print(result.markdown.raw_markdown[:500]) + +if __name__ == "__main__": + 
asyncio.run(main()) + +``` + + +## File: docs/examples/hooks_example.py + +```py +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode +from playwright.async_api import Page, BrowserContext + + +async def main(): + print("🔗 Hooks Example: Demonstrating different hook use cases") + + # Configure browser settings + browser_config = BrowserConfig(headless=True) + + # Configure crawler settings + crawler_run_config = CrawlerRunConfig( + js_code="window.scrollTo(0, document.body.scrollHeight);", + wait_for="body", + cache_mode=CacheMode.BYPASS, + ) + + # Create crawler instance + crawler = AsyncWebCrawler(config=browser_config) + + # Define and set hook functions + async def on_browser_created(browser, context: BrowserContext, **kwargs): + """Hook called after the browser is created""" + print("[HOOK] on_browser_created - Browser is ready!") + # Example: Set a cookie that will be used for all requests + return browser + + async def on_page_context_created(page: Page, context: BrowserContext, **kwargs): + """Hook called after a new page and context are created""" + print("[HOOK] on_page_context_created - New page created!") + # Example: Set default viewport size + await context.add_cookies( + [ + { + "name": "session_id", + "value": "example_session", + "domain": ".example.com", + "path": "/", + } + ] + ) + await page.set_viewport_size({"width": 1080, "height": 800}) + return page + + async def on_user_agent_updated( + page: Page, context: BrowserContext, user_agent: str, **kwargs + ): + """Hook called when the user agent is updated""" + print(f"[HOOK] on_user_agent_updated - New user agent: {user_agent}") + return page + + async def on_execution_started(page: Page, context: BrowserContext, **kwargs): + """Hook called after custom JavaScript execution""" + print("[HOOK] on_execution_started - Custom JS executed!") + return page + + async def before_goto(page: Page, context: BrowserContext, url: str, **kwargs): + """Hook called before navigating to each URL""" + print(f"[HOOK] before_goto - About to visit: {url}") + # Example: Add custom headers for the request + await page.set_extra_http_headers({"Custom-Header": "my-value"}) + return page + + async def after_goto( + page: Page, context: BrowserContext, url: str, response: dict, **kwargs + ): + """Hook called after navigating to each URL""" + print(f"[HOOK] after_goto - Successfully loaded: {url}") + # Example: Wait for a specific element to be loaded + try: + await page.wait_for_selector(".content", timeout=1000) + print("Content element found!") + except: + print("Content element not found, continuing anyway") + return page + + async def before_retrieve_html(page: Page, context: BrowserContext, **kwargs): + """Hook called before retrieving the HTML content""" + print("[HOOK] before_retrieve_html - About to get HTML content") + # Example: Scroll to bottom to trigger lazy loading + await page.evaluate("window.scrollTo(0, document.body.scrollHeight);") + return page + + async def before_return_html( + page: Page, context: BrowserContext, html: str, **kwargs + ): + """Hook called before returning the HTML content""" + print(f"[HOOK] before_return_html - Got HTML content (length: {len(html)})") + # Example: You could modify the HTML content here if needed + return page + + # Set all the hooks + crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created) + crawler.crawler_strategy.set_hook( + "on_page_context_created", on_page_context_created + ) + crawler.crawler_strategy.set_hook("on_user_agent_updated", 
on_user_agent_updated) + crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started) + crawler.crawler_strategy.set_hook("before_goto", before_goto) + crawler.crawler_strategy.set_hook("after_goto", after_goto) + crawler.crawler_strategy.set_hook("before_retrieve_html", before_retrieve_html) + crawler.crawler_strategy.set_hook("before_return_html", before_return_html) + + await crawler.start() + + # Example usage: crawl a simple website + url = "https://example.com" + result = await crawler.arun(url, config=crawler_run_config) + print(f"\nCrawled URL: {result.url}") + print(f"HTML length: {len(result.html)}") + + await crawler.close() + + +if __name__ == "__main__": + import asyncio + + asyncio.run(main()) + +``` + + + +## File: crawl4ai/deep_crawling/__init__.py + +```py +# deep_crawling/__init__.py +from .base_strategy import DeepCrawlDecorator, DeepCrawlStrategy +from .bfs_strategy import BFSDeepCrawlStrategy +from .bff_strategy import BestFirstCrawlingStrategy +from .dfs_strategy import DFSDeepCrawlStrategy +from .filters import ( + FilterChain, + ContentTypeFilter, + DomainFilter, + URLFilter, + URLPatternFilter, + FilterStats, + ContentRelevanceFilter, + SEOFilter +) +from .scorers import ( + KeywordRelevanceScorer, + URLScorer, + CompositeScorer, + DomainAuthorityScorer, + FreshnessScorer, + PathDepthScorer, + ContentTypeScorer +) + +__all__ = [ + "DeepCrawlDecorator", + "DeepCrawlStrategy", + "BFSDeepCrawlStrategy", + "BestFirstCrawlingStrategy", + "DFSDeepCrawlStrategy", + "FilterChain", + "ContentTypeFilter", + "DomainFilter", + "URLFilter", + "URLPatternFilter", + "FilterStats", + "ContentRelevanceFilter", + "SEOFilter", + "KeywordRelevanceScorer", + "URLScorer", + "CompositeScorer", + "DomainAuthorityScorer", + "FreshnessScorer", + "PathDepthScorer", + "ContentTypeScorer", +] + +``` + + +## File: crawl4ai/deep_crawling/base_strategy.py + +```py +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import AsyncGenerator, Optional, Set, List, Dict +from functools import wraps +from contextvars import ContextVar +from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult, RunManyReturn + + +class DeepCrawlDecorator: + """Decorator that adds deep crawling capability to arun method.""" + deep_crawl_active = ContextVar("deep_crawl_active", default=False) + + def __init__(self, crawler: AsyncWebCrawler): + self.crawler = crawler + + def __call__(self, original_arun): + @wraps(original_arun) + async def wrapped_arun(url: str, config: CrawlerRunConfig = None, **kwargs): + # If deep crawling is already active, call the original method to avoid recursion. + if config and config.deep_crawl_strategy and not self.deep_crawl_active.get(): + token = self.deep_crawl_active.set(True) + # Await the arun call to get the actual result object. + result_obj = await config.deep_crawl_strategy.arun( + crawler=self.crawler, + start_url=url, + config=config + ) + if config.stream: + async def result_wrapper(): + try: + async for result in result_obj: + yield result + finally: + self.deep_crawl_active.reset(token) + return result_wrapper() + else: + try: + return result_obj + finally: + self.deep_crawl_active.reset(token) + return await original_arun(url, config=config, **kwargs) + return wrapped_arun + +class DeepCrawlStrategy(ABC): + """ + Abstract base class for deep crawling strategies. + + Core functions: + - arun: Main entry point that returns an async generator of CrawlResults. + - shutdown: Clean up resources. 
+ - can_process_url: Validate a URL and decide whether to process it. + - _process_links: Extract and process links from a CrawlResult. + """ + + @abstractmethod + async def _arun_batch( + self, + start_url: str, + crawler: AsyncWebCrawler, + config: CrawlerRunConfig, + ) -> List[CrawlResult]: + """ + Batch (non-streaming) mode: + Processes one BFS level at a time, then yields all the results. + """ + pass + + @abstractmethod + async def _arun_stream( + self, + start_url: str, + crawler: AsyncWebCrawler, + config: CrawlerRunConfig, + ) -> AsyncGenerator[CrawlResult, None]: + """ + Streaming mode: + Processes one BFS level at a time and yields results immediately as they arrive. + """ + pass + + async def arun( + self, + start_url: str, + crawler: AsyncWebCrawler, + config: Optional[CrawlerRunConfig] = None, + ) -> RunManyReturn: + """ + Traverse the given URL using the specified crawler. + + Args: + start_url (str): The URL from which to start crawling. + crawler (AsyncWebCrawler): The crawler instance to use. + crawler_run_config (Optional[CrawlerRunConfig]): Crawler configuration. + + Returns: + Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]] + """ + if config is None: + raise ValueError("CrawlerRunConfig must be provided") + + if config.stream: + return self._arun_stream(start_url, crawler, config) + else: + return await self._arun_batch(start_url, crawler, config) + + def __call__(self, start_url: str, crawler: AsyncWebCrawler, config: CrawlerRunConfig): + return self.arun(start_url, crawler, config) + + @abstractmethod + async def shutdown(self) -> None: + """ + Clean up resources used by the deep crawl strategy. + """ + pass + + @abstractmethod + async def can_process_url(self, url: str, depth: int) -> bool: + """ + Validate the URL format and apply custom filtering logic. + + Args: + url (str): The URL to validate. + depth (int): The current depth in the crawl. + + Returns: + bool: True if the URL should be processed, False otherwise. + """ + pass + + @abstractmethod + async def link_discovery( + self, + result: CrawlResult, + source_url: str, + current_depth: int, + visited: Set[str], + next_level: List[tuple], + depths: Dict[str, int], + ) -> None: + """ + Extract and process links from the given crawl result. + + This method should: + - Validate each extracted URL using can_process_url. + - Optionally score URLs. + - Append valid URLs (and their parent references) to the next_level list. + - Update the depths dictionary with the new depth for each URL. + + Args: + result (CrawlResult): The result from a crawl operation. + source_url (str): The URL from which this result was obtained. + current_depth (int): The depth at which the source URL was processed. + visited (Set[str]): Set of already visited URLs. + next_level (List[tuple]): List of tuples (url, parent_url) for the next BFS level. + depths (Dict[str, int]): Mapping of URLs to their current depth. + """ + pass + + +``` + + +## File: crawl4ai/deep_crawling/bff_strategy.py + +```py +# best_first_crawling_strategy.py +import asyncio +import logging +from datetime import datetime +from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple +from urllib.parse import urlparse + +from ..models import TraversalStats +from .filters import FilterChain +from .scorers import URLScorer +from . 
import DeepCrawlStrategy + +from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult, RunManyReturn + +from math import inf as infinity + +# Configurable batch size for processing items from the priority queue +BATCH_SIZE = 10 + + +class BestFirstCrawlingStrategy(DeepCrawlStrategy): + """ + Best-First Crawling Strategy using a priority queue. + + This strategy prioritizes URLs based on their score, ensuring that higher-value + pages are crawled first. It reimplements the core traversal loop to use a priority + queue while keeping URL validation and link discovery consistent with our design. + + Core methods: + - arun: Returns either a list (batch mode) or an async generator (stream mode). + - _arun_best_first: Core generator that uses a priority queue to yield CrawlResults. + - can_process_url: Validates URLs and applies filtering (inherited behavior). + - link_discovery: Extracts and validates links from a CrawlResult. + """ + def __init__( + self, + max_depth: int, + filter_chain: FilterChain = FilterChain(), + url_scorer: Optional[URLScorer] = None, + include_external: bool = False, + max_pages: int = infinity, + logger: Optional[logging.Logger] = None, + ): + self.max_depth = max_depth + self.filter_chain = filter_chain + self.url_scorer = url_scorer + self.include_external = include_external + self.max_pages = max_pages + self.logger = logger or logging.getLogger(__name__) + self.stats = TraversalStats(start_time=datetime.now()) + self._cancel_event = asyncio.Event() + self._pages_crawled = 0 + + async def can_process_url(self, url: str, depth: int) -> bool: + """ + Validate the URL format and apply filtering. + For the starting URL (depth 0), filtering is bypassed. + """ + try: + parsed = urlparse(url) + if not parsed.scheme or not parsed.netloc: + raise ValueError("Missing scheme or netloc") + if parsed.scheme not in ("http", "https"): + raise ValueError("Invalid scheme") + if "." not in parsed.netloc: + raise ValueError("Invalid domain") + except Exception as e: + self.logger.warning(f"Invalid URL: {url}, error: {e}") + return False + + if depth != 0 and not await self.filter_chain.apply(url): + return False + + return True + + async def link_discovery( + self, + result: CrawlResult, + source_url: str, + current_depth: int, + visited: Set[str], + next_links: List[Tuple[str, Optional[str]]], + depths: Dict[str, int], + ) -> None: + """ + Extract links from the crawl result, validate them, and append new URLs + (with their parent references) to next_links. + Also updates the depths dictionary. + """ + new_depth = current_depth + 1 + if new_depth > self.max_depth: + return + + # If we've reached the max pages limit, don't discover new links + remaining_capacity = self.max_pages - self._pages_crawled + if remaining_capacity <= 0: + self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping link discovery") + return + + # Retrieve internal links; include external links if enabled. 
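+        # result.links maps "internal"/"external" to lists of link dicts; each
+        # dict's "href" is checked against visited and can_process_url below.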
+ links = result.links.get("internal", []) + if self.include_external: + links += result.links.get("external", []) + + # If we have more links than remaining capacity, limit how many we'll process + valid_links = [] + for link in links: + url = link.get("href") + if url in visited: + continue + if not await self.can_process_url(url, new_depth): + self.stats.urls_skipped += 1 + continue + + valid_links.append(url) + + # If we have more valid links than capacity, limit them + if len(valid_links) > remaining_capacity: + valid_links = valid_links[:remaining_capacity] + self.logger.info(f"Limiting to {remaining_capacity} URLs due to max_pages limit") + + # Record the new depths and add to next_links + for url in valid_links: + depths[url] = new_depth + next_links.append((url, source_url)) + + async def _arun_best_first( + self, + start_url: str, + crawler: AsyncWebCrawler, + config: CrawlerRunConfig, + ) -> AsyncGenerator[CrawlResult, None]: + """ + Core best-first crawl method using a priority queue. + + The queue items are tuples of (score, depth, url, parent_url). Lower scores + are treated as higher priority. URLs are processed in batches for efficiency. + """ + queue: asyncio.PriorityQueue = asyncio.PriorityQueue() + # Push the initial URL with score 0 and depth 0. + await queue.put((0, 0, start_url, None)) + visited: Set[str] = set() + depths: Dict[str, int] = {start_url: 0} + + while not queue.empty() and not self._cancel_event.is_set(): + # Stop if we've reached the max pages limit + if self._pages_crawled >= self.max_pages: + self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl") + break + + batch: List[Tuple[float, int, str, Optional[str]]] = [] + # Retrieve up to BATCH_SIZE items from the priority queue. + for _ in range(BATCH_SIZE): + if queue.empty(): + break + item = await queue.get() + score, depth, url, parent_url = item + if url in visited: + continue + visited.add(url) + batch.append(item) + + if not batch: + continue + + # Process the current batch of URLs. + urls = [item[2] for item in batch] + batch_config = config.clone(deep_crawl_strategy=None, stream=True) + stream_gen = await crawler.arun_many(urls=urls, config=batch_config) + async for result in stream_gen: + result_url = result.url + # Find the corresponding tuple from the batch. + corresponding = next((item for item in batch if item[2] == result_url), None) + if not corresponding: + continue + score, depth, url, parent_url = corresponding + result.metadata = result.metadata or {} + result.metadata["depth"] = depth + result.metadata["parent_url"] = parent_url + result.metadata["score"] = score + + # Count only successful crawls toward max_pages limit + if result.success: + self._pages_crawled += 1 + + yield result + + # Only discover links from successful crawls + if result.success: + # Discover new links from this result + new_links: List[Tuple[str, Optional[str]]] = [] + await self.link_discovery(result, result_url, depth, visited, new_links, depths) + + for new_url, new_parent in new_links: + new_depth = depths.get(new_url, depth + 1) + new_score = self.url_scorer.score(new_url) if self.url_scorer else 0 + await queue.put((new_score, new_depth, new_url, new_parent)) + + # End of crawl. + + async def _arun_batch( + self, + start_url: str, + crawler: AsyncWebCrawler, + config: CrawlerRunConfig, + ) -> List[CrawlResult]: + """ + Best-first crawl in batch mode. + + Aggregates all CrawlResults into a list. 
+ """ + results: List[CrawlResult] = [] + async for result in self._arun_best_first(start_url, crawler, config): + results.append(result) + return results + + async def _arun_stream( + self, + start_url: str, + crawler: AsyncWebCrawler, + config: CrawlerRunConfig, + ) -> AsyncGenerator[CrawlResult, None]: + """ + Best-first crawl in streaming mode. + + Yields CrawlResults as they become available. + """ + async for result in self._arun_best_first(start_url, crawler, config): + yield result + + async def arun( + self, + start_url: str, + crawler: AsyncWebCrawler, + config: Optional[CrawlerRunConfig] = None, + ) -> "RunManyReturn": + """ + Main entry point for best-first crawling. + + Returns either a list (batch mode) or an async generator (stream mode) + of CrawlResults. + """ + if config is None: + raise ValueError("CrawlerRunConfig must be provided") + if config.stream: + return self._arun_stream(start_url, crawler, config) + else: + return await self._arun_batch(start_url, crawler, config) + + async def shutdown(self) -> None: + """ + Signal cancellation and clean up resources. + """ + self._cancel_event.set() + self.stats.end_time = datetime.now() + +``` + + +## File: crawl4ai/deep_crawling/bfs_strategy.py + +```py +# bfs_deep_crawl_strategy.py +import asyncio +import logging +from datetime import datetime +from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple +from urllib.parse import urlparse + +from ..models import TraversalStats +from .filters import FilterChain +from .scorers import URLScorer +from . import DeepCrawlStrategy +from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult +from ..utils import normalize_url_for_deep_crawl, efficient_normalize_url_for_deep_crawl +from math import inf as infinity + +class BFSDeepCrawlStrategy(DeepCrawlStrategy): + """ + Breadth-First Search deep crawling strategy. + + Core functions: + - arun: Main entry point; splits execution into batch or stream modes. + - link_discovery: Extracts, filters, and (if needed) scores the outgoing URLs. + - can_process_url: Validates URL format and applies the filter chain. + """ + def __init__( + self, + max_depth: int, + filter_chain: FilterChain = FilterChain(), + url_scorer: Optional[URLScorer] = None, + include_external: bool = False, + score_threshold: float = -infinity, + max_pages: int = infinity, + logger: Optional[logging.Logger] = None, + ): + self.max_depth = max_depth + self.filter_chain = filter_chain + self.url_scorer = url_scorer + self.include_external = include_external + self.score_threshold = score_threshold + self.max_pages = max_pages + self.logger = logger or logging.getLogger(__name__) + self.stats = TraversalStats(start_time=datetime.now()) + self._cancel_event = asyncio.Event() + self._pages_crawled = 0 + + async def can_process_url(self, url: str, depth: int) -> bool: + """ + Validates the URL and applies the filter chain. + For the start URL (depth 0) filtering is bypassed. + """ + try: + parsed = urlparse(url) + if not parsed.scheme or not parsed.netloc: + raise ValueError("Missing scheme or netloc") + if parsed.scheme not in ("http", "https"): + raise ValueError("Invalid scheme") + if "." 
not in parsed.netloc: + raise ValueError("Invalid domain") + except Exception as e: + self.logger.warning(f"Invalid URL: {url}, error: {e}") + return False + + if depth != 0 and not await self.filter_chain.apply(url): + return False + + return True + + async def link_discovery( + self, + result: CrawlResult, + source_url: str, + current_depth: int, + visited: Set[str], + next_level: List[Tuple[str, Optional[str]]], + depths: Dict[str, int], + ) -> None: + """ + Extracts links from the crawl result, validates and scores them, and + prepares the next level of URLs. + Each valid URL is appended to next_level as a tuple (url, parent_url) + and its depth is tracked. + """ + next_depth = current_depth + 1 + if next_depth > self.max_depth: + return + + # If we've reached the max pages limit, don't discover new links + remaining_capacity = self.max_pages - self._pages_crawled + if remaining_capacity <= 0: + self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping link discovery") + return + + # Get internal links and, if enabled, external links. + links = result.links.get("internal", []) + if self.include_external: + links += result.links.get("external", []) + + valid_links = [] + + # First collect all valid links + for link in links: + url = link.get("href") + # Strip URL fragments to avoid duplicate crawling + # base_url = url.split('#')[0] if url else url + base_url = normalize_url_for_deep_crawl(url, source_url) + if base_url in visited: + continue + if not await self.can_process_url(url, next_depth): + self.stats.urls_skipped += 1 + continue + + # Score the URL if a scorer is provided + score = self.url_scorer.score(base_url) if self.url_scorer else 0 + + # Skip URLs with scores below the threshold + if score < self.score_threshold: + self.logger.debug(f"URL {url} skipped: score {score} below threshold {self.score_threshold}") + self.stats.urls_skipped += 1 + continue + + valid_links.append((base_url, score)) + + # If we have more valid links than capacity, sort by score and take the top ones + if len(valid_links) > remaining_capacity: + if self.url_scorer: + # Sort by score in descending order + valid_links.sort(key=lambda x: x[1], reverse=True) + # Take only as many as we have capacity for + valid_links = valid_links[:remaining_capacity] + self.logger.info(f"Limiting to {remaining_capacity} URLs due to max_pages limit") + + # Process the final selected links + for url, score in valid_links: + # attach the score to metadata if needed + if score: + result.metadata = result.metadata or {} + result.metadata["score"] = score + next_level.append((url, source_url)) + depths[url] = next_depth + + async def _arun_batch( + self, + start_url: str, + crawler: AsyncWebCrawler, + config: CrawlerRunConfig, + ) -> List[CrawlResult]: + """ + Batch (non-streaming) mode: + Processes one BFS level at a time, then yields all the results. + """ + visited: Set[str] = set() + # current_level holds tuples: (url, parent_url) + current_level: List[Tuple[str, Optional[str]]] = [(start_url, None)] + depths: Dict[str, int] = {start_url: 0} + + results: List[CrawlResult] = [] + + while current_level and not self._cancel_event.is_set(): + next_level: List[Tuple[str, Optional[str]]] = [] + urls = [url for url, _ in current_level] + visited.update(urls) + + # Clone the config to disable deep crawling recursion and enforce batch mode. 
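+            # Passing deep_crawl_strategy=None is what prevents each arun_many() call below
+            # from triggering its own nested deep crawl for every URL in the batch.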
+ batch_config = config.clone(deep_crawl_strategy=None, stream=False) + batch_results = await crawler.arun_many(urls=urls, config=batch_config) + + # Update pages crawled counter - count only successful crawls + successful_results = [r for r in batch_results if r.success] + self._pages_crawled += len(successful_results) + + for result in batch_results: + url = result.url + depth = depths.get(url, 0) + result.metadata = result.metadata or {} + result.metadata["depth"] = depth + parent_url = next((parent for (u, parent) in current_level if u == url), None) + result.metadata["parent_url"] = parent_url + results.append(result) + + # Only discover links from successful crawls + if result.success: + # Link discovery will handle the max pages limit internally + await self.link_discovery(result, url, depth, visited, next_level, depths) + + current_level = next_level + + return results + + async def _arun_stream( + self, + start_url: str, + crawler: AsyncWebCrawler, + config: CrawlerRunConfig, + ) -> AsyncGenerator[CrawlResult, None]: + """ + Streaming mode: + Processes one BFS level at a time and yields results immediately as they arrive. + """ + visited: Set[str] = set() + current_level: List[Tuple[str, Optional[str]]] = [(start_url, None)] + depths: Dict[str, int] = {start_url: 0} + + while current_level and not self._cancel_event.is_set(): + next_level: List[Tuple[str, Optional[str]]] = [] + urls = [url for url, _ in current_level] + visited.update(urls) + + stream_config = config.clone(deep_crawl_strategy=None, stream=True) + stream_gen = await crawler.arun_many(urls=urls, config=stream_config) + + # Keep track of processed results for this batch + results_count = 0 + async for result in stream_gen: + url = result.url + depth = depths.get(url, 0) + result.metadata = result.metadata or {} + result.metadata["depth"] = depth + parent_url = next((parent for (u, parent) in current_level if u == url), None) + result.metadata["parent_url"] = parent_url + + # Count only successful crawls + if result.success: + self._pages_crawled += 1 + + results_count += 1 + yield result + + # Only discover links from successful crawls + if result.success: + # Link discovery will handle the max pages limit internally + await self.link_discovery(result, url, depth, visited, next_level, depths) + + # If we didn't get results back (e.g. due to errors), avoid getting stuck in an infinite loop + # by considering these URLs as visited but not counting them toward the max_pages limit + if results_count == 0 and urls: + self.logger.warning(f"No results returned for {len(urls)} URLs, marking as visited") + + current_level = next_level + + async def shutdown(self) -> None: + """ + Clean up resources and signal cancellation of the crawl. 
+ """ + self._cancel_event.set() + self.stats.end_time = datetime.now() + +``` + + +## File: crawl4ai/deep_crawling/filters.py + +```py +from abc import ABC, abstractmethod +from typing import List, Pattern, Set, Union +from urllib.parse import urlparse +from array import array +import re +import logging +from functools import lru_cache +import fnmatch +from dataclasses import dataclass +import weakref +import math +from collections import defaultdict +from typing import Dict +from ..utils import HeadPeekr +import asyncio +import inspect + + +@dataclass +class FilterStats: + __slots__ = ("_counters",) + + def __init__(self): + # Use array of unsigned ints for atomic operations + self._counters = array("I", [0, 0, 0]) # total, passed, rejected + + @property + def total_urls(self): + return self._counters[0] + + @property + def passed_urls(self): + return self._counters[1] + + @property + def rejected_urls(self): + return self._counters[2] + + +class URLFilter(ABC): + """Optimized base filter class""" + + __slots__ = ("name", "stats", "_logger_ref") + + def __init__(self, name: str = None): + self.name = name or self.__class__.__name__ + self.stats = FilterStats() + # Lazy logger initialization using weakref + self._logger_ref = None + + @property + def logger(self): + if self._logger_ref is None or self._logger_ref() is None: + logger = logging.getLogger(f"urlfilter.{self.name}") + self._logger_ref = weakref.ref(logger) + return self._logger_ref() + + @abstractmethod + def apply(self, url: str) -> bool: + pass + + def _update_stats(self, passed: bool): + # Use direct array index for speed + self.stats._counters[0] += 1 # total + self.stats._counters[1] += passed # passed + self.stats._counters[2] += not passed # rejected + + +class FilterChain: + """Optimized filter chain""" + + __slots__ = ("filters", "stats", "_logger_ref") + + def __init__(self, filters: List[URLFilter] = None): + self.filters = tuple(filters or []) # Immutable tuple for speed + self.stats = FilterStats() + self._logger_ref = None + + @property + def logger(self): + if self._logger_ref is None or self._logger_ref() is None: + logger = logging.getLogger("urlfilter.chain") + self._logger_ref = weakref.ref(logger) + return self._logger_ref() + + def add_filter(self, filter_: URLFilter) -> "FilterChain": + """Add a filter to the chain""" + self.filters.append(filter_) + return self # Enable method chaining + + async def apply(self, url: str) -> bool: + """Apply all filters concurrently when possible""" + self.stats._counters[0] += 1 # Total processed URLs + + tasks = [] + for f in self.filters: + result = f.apply(url) + + if inspect.isawaitable(result): + tasks.append(result) # Collect async tasks + elif not result: # Sync rejection + self.stats._counters[2] += 1 # Sync rejected + return False + + if tasks: + results = await asyncio.gather(*tasks) + + # Count how many filters rejected + rejections = results.count(False) + self.stats._counters[2] += rejections + + if not all(results): + return False # Stop early if any filter rejected + + self.stats._counters[1] += 1 # Passed + return True + + +class URLPatternFilter(URLFilter): + """Pattern filter balancing speed and completeness""" + + __slots__ = ( + "_simple_suffixes", + "_simple_prefixes", + "_domain_patterns", + "_path_patterns", + "_reverse", + ) + + PATTERN_TYPES = { + "SUFFIX": 1, # *.html + "PREFIX": 2, # /foo/* + "DOMAIN": 3, # *.example.com + "PATH": 4, # Everything else + "REGEX": 5, + } + + def __init__( + self, + patterns: Union[str, Pattern, List[Union[str, 
Pattern]]], + use_glob: bool = True, + reverse: bool = False, + ): + super().__init__() + self._reverse = reverse + patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns + + self._simple_suffixes = set() + self._simple_prefixes = set() + self._domain_patterns = [] + self._path_patterns = [] + + for pattern in patterns: + pattern_type = self._categorize_pattern(pattern) + self._add_pattern(pattern, pattern_type) + + def _categorize_pattern(self, pattern: str) -> int: + """Categorize pattern for specialized handling""" + if not isinstance(pattern, str): + return self.PATTERN_TYPES["PATH"] + + # Check if it's a regex pattern + if pattern.startswith("^") or pattern.endswith("$") or "\\d" in pattern: + return self.PATTERN_TYPES["REGEX"] + + if pattern.count("*") == 1: + if pattern.startswith("*."): + return self.PATTERN_TYPES["SUFFIX"] + if pattern.endswith("/*"): + return self.PATTERN_TYPES["PREFIX"] + + if "://" in pattern and pattern.startswith("*."): + return self.PATTERN_TYPES["DOMAIN"] + + return self.PATTERN_TYPES["PATH"] + + def _add_pattern(self, pattern: str, pattern_type: int): + """Add pattern to appropriate matcher""" + if pattern_type == self.PATTERN_TYPES["REGEX"]: + # For regex patterns, compile directly without glob translation + if isinstance(pattern, str) and ( + pattern.startswith("^") or pattern.endswith("$") or "\\d" in pattern + ): + self._path_patterns.append(re.compile(pattern)) + return + elif pattern_type == self.PATTERN_TYPES["SUFFIX"]: + self._simple_suffixes.add(pattern[2:]) + elif pattern_type == self.PATTERN_TYPES["PREFIX"]: + self._simple_prefixes.add(pattern[:-2]) + elif pattern_type == self.PATTERN_TYPES["DOMAIN"]: + self._domain_patterns.append(re.compile(pattern.replace("*.", r"[^/]+\."))) + else: + if isinstance(pattern, str): + # Handle complex glob patterns + if "**" in pattern: + pattern = pattern.replace("**", ".*") + if "{" in pattern: + # Convert {a,b} to (a|b) + pattern = re.sub( + r"\{([^}]+)\}", + lambda m: f'({"|".join(m.group(1).split(","))})', + pattern, + ) + pattern = fnmatch.translate(pattern) + self._path_patterns.append( + pattern if isinstance(pattern, Pattern) else re.compile(pattern) + ) + + @lru_cache(maxsize=10000) + def apply(self, url: str) -> bool: + # Quick suffix check (*.html) + if self._simple_suffixes: + path = url.split("?")[0] + if path.split("/")[-1].split(".")[-1] in self._simple_suffixes: + result = True + self._update_stats(result) + return not result if self._reverse else result + + # Domain check + if self._domain_patterns: + for pattern in self._domain_patterns: + if pattern.match(url): + result = True + self._update_stats(result) + return not result if self._reverse else result + + # Prefix check (/foo/*) + if self._simple_prefixes: + path = url.split("?")[0] + if any(path.startswith(p) for p in self._simple_prefixes): + result = True + self._update_stats(result) + return not result if self._reverse else result + + # Complex patterns + if self._path_patterns: + if any(p.search(url) for p in self._path_patterns): + result = True + self._update_stats(result) + return not result if self._reverse else result + + result = False + self._update_stats(result) + return not result if self._reverse else result + + +class ContentTypeFilter(URLFilter): + """Optimized content type filter using fast lookups""" + + __slots__ = ("allowed_types", "_ext_map", "_check_extension") + + # Fast extension to mime type mapping + _MIME_MAP = { + # Text Formats + "txt": "text/plain", + "html": "text/html", + "htm": 
"text/html", + "xhtml": "application/xhtml+xml", + "css": "text/css", + "csv": "text/csv", + "ics": "text/calendar", + "js": "application/javascript", + # Images + "bmp": "image/bmp", + "gif": "image/gif", + "jpeg": "image/jpeg", + "jpg": "image/jpeg", + "png": "image/png", + "svg": "image/svg+xml", + "tiff": "image/tiff", + "ico": "image/x-icon", + "webp": "image/webp", + # Audio + "mp3": "audio/mpeg", + "wav": "audio/wav", + "ogg": "audio/ogg", + "m4a": "audio/mp4", + "aac": "audio/aac", + # Video + "mp4": "video/mp4", + "mpeg": "video/mpeg", + "webm": "video/webm", + "avi": "video/x-msvideo", + "mov": "video/quicktime", + "flv": "video/x-flv", + "wmv": "video/x-ms-wmv", + "mkv": "video/x-matroska", + # Applications + "json": "application/json", + "xml": "application/xml", + "pdf": "application/pdf", + "zip": "application/zip", + "gz": "application/gzip", + "tar": "application/x-tar", + "rar": "application/vnd.rar", + "7z": "application/x-7z-compressed", + "exe": "application/vnd.microsoft.portable-executable", + "msi": "application/x-msdownload", + # Fonts + "woff": "font/woff", + "woff2": "font/woff2", + "ttf": "font/ttf", + "otf": "font/otf", + # Microsoft Office + "doc": "application/msword", + "dot": "application/msword", + "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "xls": "application/vnd.ms-excel", + "ppt": "application/vnd.ms-powerpoint", + "pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation", + # OpenDocument Formats + "odt": "application/vnd.oasis.opendocument.text", + "ods": "application/vnd.oasis.opendocument.spreadsheet", + "odp": "application/vnd.oasis.opendocument.presentation", + # Archives + "tar.gz": "application/gzip", + "tgz": "application/gzip", + "bz2": "application/x-bzip2", + # Others + "rtf": "application/rtf", + "apk": "application/vnd.android.package-archive", + "epub": "application/epub+zip", + "jar": "application/java-archive", + "swf": "application/x-shockwave-flash", + "midi": "audio/midi", + "mid": "audio/midi", + "ps": "application/postscript", + "ai": "application/postscript", + "eps": "application/postscript", + # Custom or less common + "bin": "application/octet-stream", + "dmg": "application/x-apple-diskimage", + "iso": "application/x-iso9660-image", + "deb": "application/x-debian-package", + "rpm": "application/x-rpm", + "sqlite": "application/vnd.sqlite3", + # Placeholder + "unknown": "application/octet-stream", # Fallback for unknown file types + } + + @staticmethod + @lru_cache(maxsize=1000) + def _extract_extension(url: str) -> str: + """Extracts file extension from a URL.""" + # Remove scheme (http://, https://) if present + if "://" in url: + url = url.split("://", 1)[-1] # Get everything after '://' + + # Remove domain (everything up to the first '/') + path_start = url.find("/") + path = url[path_start:] if path_start != -1 else "" + + # Extract last filename in path + filename = path.rsplit("/", 1)[-1] if "/" in path else "" + + # Extract and validate extension + if "." 
not in filename: + return "" + + return filename.rpartition(".")[-1].lower() + + def __init__( + self, + allowed_types: Union[str, List[str]], + check_extension: bool = True, + ext_map: Dict[str, str] = _MIME_MAP, + ): + super().__init__() + # Normalize and store as frozenset for fast lookup + self.allowed_types = frozenset( + t.lower() + for t in ( + allowed_types if isinstance(allowed_types, list) else [allowed_types] + ) + ) + self._check_extension = check_extension + + # Pre-compute extension map for allowed types + self._ext_map = frozenset( + ext + for ext, mime in self._MIME_MAP.items() + if any(allowed in mime for allowed in self.allowed_types) + ) + + @lru_cache(maxsize=1000) + def _check_url_cached(self, url: str) -> bool: + """Cached URL checking""" + if not self._check_extension: + return True + ext = self._extract_extension(url) + if not ext: + return True + + return ext in self._ext_map + + def apply(self, url: str) -> bool: + """Fast extension check with caching""" + result = self._check_url_cached(url) + self._update_stats(result) + return result + + +class DomainFilter(URLFilter): + """Optimized domain filter with fast lookups and caching""" + + __slots__ = ("_allowed_domains", "_blocked_domains", "_domain_cache") + + # Regex for fast domain extraction + _DOMAIN_REGEX = re.compile(r"://([^/]+)") + + def __init__( + self, + allowed_domains: Union[str, List[str]] = None, + blocked_domains: Union[str, List[str]] = None, + ): + super().__init__() + + # Convert inputs to frozensets for immutable, fast lookups + self._allowed_domains = ( + frozenset(self._normalize_domains(allowed_domains)) + if allowed_domains + else None + ) + self._blocked_domains = ( + frozenset(self._normalize_domains(blocked_domains)) + if blocked_domains + else frozenset() + ) + + @staticmethod + def _normalize_domains(domains: Union[str, List[str]]) -> Set[str]: + """Fast domain normalization""" + if isinstance(domains, str): + return {domains.lower()} + return {d.lower() for d in domains} + + @staticmethod + def _is_subdomain(domain: str, parent_domain: str) -> bool: + """Check if domain is a subdomain of parent_domain""" + return domain == parent_domain or domain.endswith(f".{parent_domain}") + + @staticmethod + @lru_cache(maxsize=10000) + def _extract_domain(url: str) -> str: + """Ultra-fast domain extraction with regex and caching""" + match = DomainFilter._DOMAIN_REGEX.search(url) + return match.group(1).lower() if match else "" + + def apply(self, url: str) -> bool: + """Optimized domain checking with early returns""" + # Skip processing if no filters + if not self._blocked_domains and self._allowed_domains is None: + self._update_stats(True) + return True + + domain = self._extract_domain(url) + + # Check for blocked domains, including subdomains + for blocked in self._blocked_domains: + if self._is_subdomain(domain, blocked): + self._update_stats(False) + return False + + # If no allowed domains specified, accept all non-blocked + if self._allowed_domains is None: + self._update_stats(True) + return True + + # Check if domain matches any allowed domain (including subdomains) + for allowed in self._allowed_domains: + if self._is_subdomain(domain, allowed): + self._update_stats(True) + return True + + # No matches found + self._update_stats(False) + return False + + +class ContentRelevanceFilter(URLFilter): + """BM25-based relevance filter using head section content""" + + __slots__ = ("query_terms", "threshold", "k1", "b", "avgdl") + + def __init__( + self, + query: str, + threshold: float, + k1: 
float = 1.2, + b: float = 0.75, + avgdl: int = 1000, + ): + super().__init__(name="BM25RelevanceFilter") + self.query_terms = self._tokenize(query) + self.threshold = threshold + self.k1 = k1 # TF saturation parameter + self.b = b # Length normalization parameter + self.avgdl = avgdl # Average document length (empirical value) + + async def apply(self, url: str) -> bool: + head_content = await HeadPeekr.peek_html(url) + if not head_content: + self._update_stats(False) + return False + + # Field extraction with weighting + fields = { + "title": HeadPeekr.get_title(head_content) or "", + "meta": HeadPeekr.extract_meta_tags(head_content), + } + doc_text = self._build_document(fields) + + score = self._bm25(doc_text) + decision = score >= self.threshold + self._update_stats(decision) + return decision + + def _build_document(self, fields: Dict) -> str: + """Weighted document construction""" + return " ".join( + [ + fields["title"] * 3, # Title weight + fields["meta"].get("description", "") * 2, + fields["meta"].get("keywords", ""), + " ".join(fields["meta"].values()), + ] + ) + + def _tokenize(self, text: str) -> List[str]: + """Fast case-insensitive tokenization""" + return text.lower().split() + + def _bm25(self, document: str) -> float: + """Optimized BM25 implementation for head sections""" + doc_terms = self._tokenize(document) + doc_len = len(doc_terms) + tf = defaultdict(int) + + for term in doc_terms: + tf[term] += 1 + + score = 0.0 + for term in set(self.query_terms): + term_freq = tf[term] + idf = math.log((1 + 1) / (term_freq + 0.5) + 1) # Simplified IDF + numerator = term_freq * (self.k1 + 1) + denominator = term_freq + self.k1 * ( + 1 - self.b + self.b * (doc_len / self.avgdl) + ) + score += idf * (numerator / denominator) + + return score + + +class SEOFilter(URLFilter): + """Quantitative SEO quality assessment filter using head section analysis""" + + __slots__ = ("threshold", "_weights", "_kw_patterns") + + # Based on SEMrush/Google ranking factors research + DEFAULT_WEIGHTS = { + "title_length": 0.15, + "title_kw": 0.18, + "meta_description": 0.12, + "canonical": 0.10, + "robot_ok": 0.20, # Most critical factor + "schema_org": 0.10, + "url_quality": 0.15, + } + + def __init__( + self, + threshold: float = 0.65, + keywords: List[str] = None, + weights: Dict[str, float] = None, + ): + super().__init__(name="SEOFilter") + self.threshold = threshold + self._weights = weights or self.DEFAULT_WEIGHTS + self._kw_patterns = ( + re.compile( + r"\b({})\b".format("|".join(map(re.escape, keywords or []))), re.I + ) + if keywords + else None + ) + + async def apply(self, url: str) -> bool: + head_content = await HeadPeekr.peek_html(url) + if not head_content: + self._update_stats(False) + return False + + meta = HeadPeekr.extract_meta_tags(head_content) + title = HeadPeekr.get_title(head_content) or "" + parsed_url = urlparse(url) + + scores = { + "title_length": self._score_title_length(title), + "title_kw": self._score_keyword_presence(title), + "meta_description": self._score_meta_description( + meta.get("description", "") + ), + "canonical": self._score_canonical(meta.get("canonical"), url), + "robot_ok": 1.0 if "noindex" not in meta.get("robots", "") else 0.0, + "schema_org": self._score_schema_org(head_content), + "url_quality": self._score_url_quality(parsed_url), + } + + total_score = sum( + weight * scores[factor] for factor, weight in self._weights.items() + ) + + decision = total_score >= self.threshold + self._update_stats(decision) + return decision + + def 
_score_title_length(self, title: str) -> float: + length = len(title) + if 50 <= length <= 60: + return 1.0 + if 40 <= length < 50 or 60 < length <= 70: + return 0.7 + return 0.3 # Poor length + + def _score_keyword_presence(self, text: str) -> float: + if not self._kw_patterns: + return 0.0 + matches = len(self._kw_patterns.findall(text)) + return min(matches * 0.3, 1.0) # Max 3 matches + + def _score_meta_description(self, desc: str) -> float: + length = len(desc) + if 140 <= length <= 160: + return 1.0 + return 0.5 if 120 <= length <= 200 else 0.2 + + def _score_canonical(self, canonical: str, original: str) -> float: + if not canonical: + return 0.5 # Neutral score + return 1.0 if canonical == original else 0.2 + + def _score_schema_org(self, html: str) -> float: + # Detect any schema.org markup in head + return ( + 1.0 + if re.search(r']+type=["\']application/ld\+json', html) + else 0.0 + ) + + def _score_url_quality(self, parsed_url) -> float: + score = 1.0 + path = parsed_url.path.lower() + + # Penalty factors + if len(path) > 80: + score *= 0.7 + if re.search(r"\d{4}", path): + score *= 0.8 # Numbers in path + if parsed_url.query: + score *= 0.6 # URL parameters + if "_" in path: + score *= 0.9 # Underscores vs hyphens + + return score + +``` + + +## File: crawl4ai/deep_crawling/scorers.py + +```py +from abc import ABC, abstractmethod +from typing import List, Dict, Optional +from dataclasses import dataclass +from urllib.parse import urlparse, unquote +import re +import logging +from functools import lru_cache +from array import array +import ctypes +import platform +PLATFORM = platform.system() + +# Pre-computed scores for common year differences +_SCORE_LOOKUP = [1.0, 0.5, 0.3333333333333333, 0.25] + +# Pre-computed scores for common year differences +_FRESHNESS_SCORES = [ + 1.0, # Current year + 0.9, # Last year + 0.8, # 2 years ago + 0.7, # 3 years ago + 0.6, # 4 years ago + 0.5, # 5 years ago +] + +class ScoringStats: + __slots__ = ('_urls_scored', '_total_score', '_min_score', '_max_score') + + def __init__(self): + self._urls_scored = 0 + self._total_score = 0.0 + self._min_score = None # Lazy initialization + self._max_score = None + + def update(self, score: float) -> None: + """Optimized update with minimal operations""" + self._urls_scored += 1 + self._total_score += score + + # Lazy min/max tracking - only if actually accessed + if self._min_score is not None: + if score < self._min_score: + self._min_score = score + if self._max_score is not None: + if score > self._max_score: + self._max_score = score + + def get_average(self) -> float: + """Direct calculation instead of property""" + return self._total_score / self._urls_scored if self._urls_scored else 0.0 + + def get_min(self) -> float: + """Lazy min calculation""" + if self._min_score is None: + self._min_score = self._total_score / self._urls_scored if self._urls_scored else 0.0 + return self._min_score + + def get_max(self) -> float: + """Lazy max calculation""" + if self._max_score is None: + self._max_score = self._total_score / self._urls_scored if self._urls_scored else 0.0 + return self._max_score +class URLScorer(ABC): + __slots__ = ('_weight', '_stats') + + def __init__(self, weight: float = 1.0): + # Store weight directly as float32 for memory efficiency + self._weight = ctypes.c_float(weight).value + self._stats = ScoringStats() + + @abstractmethod + def _calculate_score(self, url: str) -> float: + """Calculate raw score for URL.""" + pass + + def score(self, url: str) -> float: + """Calculate weighted 
score with minimal overhead.""" + score = self._calculate_score(url) * self._weight + self._stats.update(score) + return score + + @property + def stats(self): + """Access to scoring statistics.""" + return self._stats + + @property + def weight(self): + return self._weight + +class CompositeScorer(URLScorer): + __slots__ = ('_scorers', '_normalize', '_weights_array', '_score_array') + + def __init__(self, scorers: List[URLScorer], normalize: bool = True): + """Initialize composite scorer combining multiple scoring strategies. + + Optimized for: + - Fast parallel scoring + - Memory efficient score aggregation + - Quick short-circuit conditions + - Pre-allocated arrays + + Args: + scorers: List of scoring strategies to combine + normalize: Whether to normalize final score by scorer count + """ + super().__init__(weight=1.0) + self._scorers = scorers + self._normalize = normalize + + # Pre-allocate arrays for scores and weights + self._weights_array = array('f', [s.weight for s in scorers]) + self._score_array = array('f', [0.0] * len(scorers)) + + @lru_cache(maxsize=10000) + def _calculate_score(self, url: str) -> float: + """Calculate combined score from all scoring strategies. + + Uses: + 1. Pre-allocated arrays for scores + 2. Short-circuit on zero scores + 3. Optimized normalization + 4. Vectorized operations where possible + + Args: + url: URL to score + + Returns: + Combined and optionally normalized score + """ + total_score = 0.0 + scores = self._score_array + + # Get scores from all scorers + for i, scorer in enumerate(self._scorers): + # Use public score() method which applies weight + scores[i] = scorer.score(url) + total_score += scores[i] + + # Normalize if requested + if self._normalize and self._scorers: + count = len(self._scorers) + return total_score / count + + return total_score + + def score(self, url: str) -> float: + """Public scoring interface with stats tracking. + + Args: + url: URL to score + + Returns: + Final combined score + """ + score = self._calculate_score(url) + self.stats.update(score) + return score + +class KeywordRelevanceScorer(URLScorer): + __slots__ = ('_weight', '_stats', '_keywords', '_case_sensitive') + + def __init__(self, keywords: List[str], weight: float = 1.0, case_sensitive: bool = False): + super().__init__(weight=weight) + self._case_sensitive = case_sensitive + # Pre-process keywords once + self._keywords = [k if case_sensitive else k.lower() for k in keywords] + + @lru_cache(maxsize=10000) + def _url_bytes(self, url: str) -> bytes: + """Cache decoded URL bytes""" + return url.encode('utf-8') if self._case_sensitive else url.lower().encode('utf-8') + + + def _calculate_score(self, url: str) -> float: + """Fast string matching without regex or byte conversion""" + if not self._case_sensitive: + url = url.lower() + + matches = sum(1 for k in self._keywords if k in url) + + # Fast return paths + if not matches: + return 0.0 + if matches == len(self._keywords): + return 1.0 + + return matches / len(self._keywords) + +class PathDepthScorer(URLScorer): + __slots__ = ('_weight', '_stats', '_optimal_depth') # Remove _url_cache + + def __init__(self, optimal_depth: int = 3, weight: float = 1.0): + super().__init__(weight=weight) + self._optimal_depth = optimal_depth + + @staticmethod + @lru_cache(maxsize=10000) + def _quick_depth(path: str) -> int: + """Ultra fast path depth calculation. 
+ + Examples: + - "http://example.com" -> 0 # No path segments + - "http://example.com/" -> 0 # Empty path + - "http://example.com/a" -> 1 + - "http://example.com/a/b" -> 2 + """ + if not path or path == '/': + return 0 + + if '/' not in path: + return 0 + + depth = 0 + last_was_slash = True + + for c in path: + if c == '/': + if not last_was_slash: + depth += 1 + last_was_slash = True + else: + last_was_slash = False + + if not last_was_slash: + depth += 1 + + return depth + + @lru_cache(maxsize=10000) # Cache the whole calculation + def _calculate_score(self, url: str) -> float: + pos = url.find('/', url.find('://') + 3) + if pos == -1: + depth = 0 + else: + depth = self._quick_depth(url[pos:]) + + # Use lookup table for common distances + distance = depth - self._optimal_depth + distance = distance if distance >= 0 else -distance # Faster than abs() + + if distance < 4: + return _SCORE_LOOKUP[distance] + + return 1.0 / (1.0 + distance) + +class ContentTypeScorer(URLScorer): + __slots__ = ('_weight', '_exact_types', '_regex_types') + + def __init__(self, type_weights: Dict[str, float], weight: float = 1.0): + """Initialize scorer with type weights map. + + Args: + type_weights: Dict mapping file extensions/patterns to scores (e.g. {'.html$': 1.0}) + weight: Overall weight multiplier for this scorer + """ + super().__init__(weight=weight) + self._exact_types = {} # Fast lookup for simple extensions + self._regex_types = [] # Fallback for complex patterns + + # Split into exact vs regex matchers for performance + for pattern, score in type_weights.items(): + if pattern.startswith('.') and pattern.endswith('$'): + ext = pattern[1:-1] + self._exact_types[ext] = score + else: + self._regex_types.append((re.compile(pattern), score)) + + # Sort complex patterns by score for early exit + self._regex_types.sort(key=lambda x: -x[1]) + + @staticmethod + @lru_cache(maxsize=10000) + def _quick_extension(url: str) -> str: + """Extract file extension ultra-fast without regex/splits. + + Handles: + - Basic extensions: "example.html" -> "html" + - Query strings: "page.php?id=1" -> "php" + - Fragments: "doc.pdf#page=1" -> "pdf" + - Path params: "file.jpg;width=100" -> "jpg" + + Args: + url: URL to extract extension from + + Returns: + Extension without dot, or empty string if none found + """ + pos = url.rfind('.') + if pos == -1: + return '' + + # Find first non-alphanumeric char after extension + end = len(url) + for i in range(pos + 1, len(url)): + c = url[i] + # Stop at query string, fragment, path param or any non-alphanumeric + if c in '?#;' or not c.isalnum(): + end = i + break + + return url[pos + 1:end].lower() + + @lru_cache(maxsize=10000) + def _calculate_score(self, url: str) -> float: + """Calculate content type score for URL. + + Uses staged approach: + 1. Try exact extension match (fast path) + 2. Fall back to regex patterns if needed + + Args: + url: URL to score + + Returns: + Score between 0.0 and 1.0 * weight + """ + # Fast path: direct extension lookup + ext = self._quick_extension(url) + if ext: + score = self._exact_types.get(ext, None) + if score is not None: + return score + + # Slow path: regex patterns + for pattern, score in self._regex_types: + if pattern.search(url): + return score + + return 0.0 + +class FreshnessScorer(URLScorer): + __slots__ = ('_weight', '_date_pattern', '_current_year') + + def __init__(self, weight: float = 1.0, current_year: int = 2024): + """Initialize freshness scorer. 
+ + Extracts and scores dates from URLs using format: + - YYYY/MM/DD + - YYYY-MM-DD + - YYYY_MM_DD + - YYYY (year only) + + Args: + weight: Score multiplier + current_year: Year to calculate freshness against (default 2024) + """ + super().__init__(weight=weight) + self._current_year = current_year + + # Combined pattern for all date formats + # Uses non-capturing groups (?:) and alternation + self._date_pattern = re.compile( + r'(?:/' # Path separator + r'|[-_])' # or date separators + r'((?:19|20)\d{2})' # Year group (1900-2099) + r'(?:' # Optional month/day group + r'(?:/|[-_])' # Date separator + r'(?:\d{2})' # Month + r'(?:' # Optional day + r'(?:/|[-_])' # Date separator + r'(?:\d{2})' # Day + r')?' # Day is optional + r')?' # Month/day group is optional + ) + + @lru_cache(maxsize=10000) + def _extract_year(self, url: str) -> Optional[int]: + """Extract the most recent year from URL. + + Args: + url: URL to extract year from + + Returns: + Year as int or None if no valid year found + """ + matches = self._date_pattern.finditer(url) + latest_year = None + + # Find most recent year + for match in matches: + year = int(match.group(1)) + if (year <= self._current_year and # Sanity check + (latest_year is None or year > latest_year)): + latest_year = year + + return latest_year + + @lru_cache(maxsize=10000) + def _calculate_score(self, url: str) -> float: + """Calculate freshness score based on URL date. + + More recent years score higher. Uses pre-computed scoring + table for common year differences. + + Args: + url: URL to score + + Returns: + Score between 0.0 and 1.0 * weight + """ + year = self._extract_year(url) + if year is None: + return 0.5 # Default score + + # Use lookup table for common year differences + year_diff = self._current_year - year + if year_diff < len(_FRESHNESS_SCORES): + return _FRESHNESS_SCORES[year_diff] + + # Fallback calculation for older content + return max(0.1, 1.0 - year_diff * 0.1) + +class DomainAuthorityScorer(URLScorer): + __slots__ = ('_weight', '_domain_weights', '_default_weight', '_top_domains') + + def __init__( + self, + domain_weights: Dict[str, float], + default_weight: float = 0.5, + weight: float = 1.0, + ): + """Initialize domain authority scorer. + + Args: + domain_weights: Dict mapping domains to authority scores + default_weight: Score for unknown domains + weight: Overall scorer weight multiplier + + Example: + { + 'python.org': 1.0, + 'github.com': 0.9, + 'medium.com': 0.7 + } + """ + super().__init__(weight=weight) + + # Pre-process domains for faster lookup + self._domain_weights = { + domain.lower(): score + for domain, score in domain_weights.items() + } + self._default_weight = default_weight + + # Cache top domains for fast path + self._top_domains = { + domain: score + for domain, score in sorted( + domain_weights.items(), + key=lambda x: -x[1] + )[:5] # Keep top 5 highest scoring domains + } + + @staticmethod + @lru_cache(maxsize=10000) + def _extract_domain(url: str) -> str: + """Extract domain from URL ultra-fast. 
+ + Handles: + - Basic domains: "example.com" + - Subdomains: "sub.example.com" + - Ports: "example.com:8080" + - IPv4: "192.168.1.1" + + Args: + url: Full URL to extract domain from + + Returns: + Lowercase domain without port + """ + # Find domain start + start = url.find('://') + if start == -1: + start = 0 + else: + start += 3 + + # Find domain end + end = url.find('/', start) + if end == -1: + end = url.find('?', start) + if end == -1: + end = url.find('#', start) + if end == -1: + end = len(url) + + # Extract domain and remove port + domain = url[start:end] + port_idx = domain.rfind(':') + if port_idx != -1: + domain = domain[:port_idx] + + return domain.lower() + + @lru_cache(maxsize=10000) + def _calculate_score(self, url: str) -> float: + """Calculate domain authority score. + + Uses staged approach: + 1. Check top domains (fastest) + 2. Check full domain weights + 3. Return default weight + + Args: + url: URL to score + + Returns: + Authority score between 0.0 and 1.0 * weight + """ + domain = self._extract_domain(url) + + # Fast path: check top domains first + score = self._top_domains.get(domain) + if score is not None: + return score + + # Regular path: check all domains + return self._domain_weights.get(domain, self._default_weight) +``` + + +## File: docs/examples/deepcrawl_example.py + +```py +import asyncio +import time + +from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode +from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy +from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy +from crawl4ai.deep_crawling.filters import ( + FilterChain, + URLPatternFilter, + DomainFilter, + ContentTypeFilter, + ContentRelevanceFilter, + SEOFilter, +) +from crawl4ai.deep_crawling.scorers import ( + KeywordRelevanceScorer, +) + + +# 1️⃣ Basic Deep Crawl Setup +async def basic_deep_crawl(): + """ + PART 1: Basic Deep Crawl setup - Demonstrates a simple two-level deep crawl. + + This function shows: + - How to set up BFSDeepCrawlStrategy (Breadth-First Search) + - Setting depth and domain parameters + - Processing the results to show the hierarchy + """ + print("\n===== BASIC DEEP CRAWL SETUP =====") + + # Configure a 2-level deep crawl using Breadth-First Search strategy + # max_depth=2 means: initial page (depth 0) + 2 more levels + # include_external=False means: only follow links within the same domain + config = CrawlerRunConfig( + deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=2, include_external=False), + scraping_strategy=LXMLWebScrapingStrategy(), + verbose=True, # Show progress during crawling + ) + + async with AsyncWebCrawler() as crawler: + start_time = time.perf_counter() + results = await crawler.arun(url="https://docs.crawl4ai.com", config=config) + + # Group results by depth to visualize the crawl tree + pages_by_depth = {} + for result in results: + depth = result.metadata.get("depth", 0) + if depth not in pages_by_depth: + pages_by_depth[depth] = [] + pages_by_depth[depth].append(result.url) + + print(f"✅ Crawled {len(results)} pages total") + + # Display crawl structure by depth + for depth, urls in sorted(pages_by_depth.items()): + print(f"\nDepth {depth}: {len(urls)} pages") + # Show first 3 URLs for each depth as examples + for url in urls[:3]: + print(f" → {url}") + if len(urls) > 3: + print(f" ... and {len(urls) - 3} more") + + print( + f"\n✅ Performance: {len(results)} pages in {time.perf_counter() - start_time:.2f} seconds" + ) + +# 2️⃣ Stream vs. 
Non-Stream Execution +async def stream_vs_nonstream(): + """ + PART 2: Demonstrates the difference between stream and non-stream execution. + + Non-stream: Waits for all results before processing + Stream: Processes results as they become available + """ + print("\n===== STREAM VS. NON-STREAM EXECUTION =====") + + # Common configuration for both examples + base_config = CrawlerRunConfig( + deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1, include_external=False), + scraping_strategy=LXMLWebScrapingStrategy(), + verbose=False, + ) + + async with AsyncWebCrawler() as crawler: + # NON-STREAMING MODE + print("\n📊 NON-STREAMING MODE:") + print(" In this mode, all results are collected before being returned.") + + non_stream_config = base_config.clone() + non_stream_config.stream = False + + start_time = time.perf_counter() + results = await crawler.arun( + url="https://docs.crawl4ai.com", config=non_stream_config + ) + + print(f" ✅ Received all {len(results)} results at once") + print(f" ✅ Total duration: {time.perf_counter() - start_time:.2f} seconds") + + # STREAMING MODE + print("\n📊 STREAMING MODE:") + print(" In this mode, results are processed as they become available.") + + stream_config = base_config.clone() + stream_config.stream = True + + start_time = time.perf_counter() + result_count = 0 + first_result_time = None + + async for result in await crawler.arun( + url="https://docs.crawl4ai.com", config=stream_config + ): + result_count += 1 + if result_count == 1: + first_result_time = time.perf_counter() - start_time + print( + f" ✅ First result received after {first_result_time:.2f} seconds: {result.url}" + ) + elif result_count % 5 == 0: # Show every 5th result for brevity + print(f" → Result #{result_count}: {result.url}") + + print(f" ✅ Total: {result_count} results") + print(f" ✅ First result: {first_result_time:.2f} seconds") + print(f" ✅ All results: {time.perf_counter() - start_time:.2f} seconds") + print("\n🔍 Key Takeaway: Streaming allows processing results immediately") + +# 3️⃣ Introduce Filters & Scorers +async def filters_and_scorers(): + """ + PART 3: Demonstrates the use of filters and scorers for more targeted crawling. + + This function progressively adds: + 1. A single URL pattern filter + 2. Multiple filters in a chain + 3. Scorers for prioritizing pages + """ + print("\n===== FILTERS AND SCORERS =====") + + async with AsyncWebCrawler() as crawler: + # SINGLE FILTER EXAMPLE + print("\n📊 EXAMPLE 1: SINGLE URL PATTERN FILTER") + print(" Only crawl pages containing 'core' in the URL") + + # Create a filter that only allows URLs with 'guide' in them + url_filter = URLPatternFilter(patterns=["*core*"]) + + config = CrawlerRunConfig( + deep_crawl_strategy=BFSDeepCrawlStrategy( + max_depth=1, + include_external=False, + filter_chain=FilterChain([url_filter]), # Single filter + ), + scraping_strategy=LXMLWebScrapingStrategy(), + cache_mode=CacheMode.BYPASS, + verbose=True, + ) + + results = await crawler.arun(url="https://docs.crawl4ai.com", config=config) + + print(f" ✅ Crawled {len(results)} pages matching '*core*'") + for result in results[:3]: # Show first 3 results + print(f" → {result.url}") + if len(results) > 3: + print(f" ... and {len(results) - 3} more") + + # MULTIPLE FILTERS EXAMPLE + print("\n📊 EXAMPLE 2: MULTIPLE FILTERS IN A CHAIN") + print(" Only crawl pages that:") + print(" 1. Contain '2024' in the URL") + print(" 2. Are from 'techcrunch.com'") + print(" 3. 
Are of text/html or application/javascript content type") + + # Create a chain of filters + filter_chain = FilterChain( + [ + URLPatternFilter(patterns=["*2024*"]), + DomainFilter( + allowed_domains=["techcrunch.com"], + blocked_domains=["guce.techcrunch.com", "oidc.techcrunch.com"], + ), + ContentTypeFilter( + allowed_types=["text/html", "application/javascript"] + ), + ] + ) + + config = CrawlerRunConfig( + deep_crawl_strategy=BFSDeepCrawlStrategy( + max_depth=1, include_external=False, filter_chain=filter_chain + ), + scraping_strategy=LXMLWebScrapingStrategy(), + verbose=True, + ) + + results = await crawler.arun(url="https://techcrunch.com", config=config) + + print(f" ✅ Crawled {len(results)} pages after applying all filters") + for result in results[:3]: + print(f" → {result.url}") + if len(results) > 3: + print(f" ... and {len(results) - 3} more") + + # SCORERS EXAMPLE + print("\n📊 EXAMPLE 3: USING A KEYWORD RELEVANCE SCORER") + print( + "Score pages based on relevance to keywords: 'crawl', 'example', 'async', 'configuration','javascript','css'" + ) + + # Create a keyword relevance scorer + keyword_scorer = KeywordRelevanceScorer( + keywords=["crawl", "example", "async", "configuration","javascript","css"], weight=1 + ) + + config = CrawlerRunConfig( + deep_crawl_strategy=BestFirstCrawlingStrategy( + max_depth=1, include_external=False, url_scorer=keyword_scorer + ), + scraping_strategy=LXMLWebScrapingStrategy(), + cache_mode=CacheMode.BYPASS, + verbose=True, + stream=True, + ) + + results = [] + async for result in await crawler.arun( + url="https://docs.crawl4ai.com", config=config + ): + results.append(result) + score = result.metadata.get("score") + print(f" → Score: {score:.2f} | {result.url}") + + print(f" ✅ Crawler prioritized {len(results)} pages by relevance score") + print(" 🔍 Note: BestFirstCrawlingStrategy visits highest-scoring pages first") + +# 4️⃣ Advanced Filters +async def advanced_filters(): + """ + PART 4: Demonstrates advanced filtering techniques for specialized crawling. 
+ + This function covers: + - SEO filters + - Text relevancy filtering + - Combining advanced filters + """ + print("\n===== ADVANCED FILTERS =====") + + async with AsyncWebCrawler() as crawler: + # SEO FILTER EXAMPLE + print("\n📊 EXAMPLE 1: SEO FILTERS") + print( + "Quantitative SEO quality assessment filter based searching keywords in the head section" + ) + + seo_filter = SEOFilter( + threshold=0.5, keywords=["dynamic", "interaction", "javascript"] + ) + + config = CrawlerRunConfig( + deep_crawl_strategy=BFSDeepCrawlStrategy( + max_depth=1, filter_chain=FilterChain([seo_filter]) + ), + scraping_strategy=LXMLWebScrapingStrategy(), + verbose=True, + cache_mode=CacheMode.BYPASS, + ) + + results = await crawler.arun(url="https://docs.crawl4ai.com", config=config) + + print(f" ✅ Found {len(results)} pages with relevant keywords") + for result in results: + print(f" → {result.url}") + + # ADVANCED TEXT RELEVANCY FILTER + print("\n📊 EXAMPLE 2: ADVANCED TEXT RELEVANCY FILTER") + + # More sophisticated content relevance filter + relevance_filter = ContentRelevanceFilter( + query="Interact with the web using your authentic digital identity", + threshold=0.7, + ) + + config = CrawlerRunConfig( + deep_crawl_strategy=BFSDeepCrawlStrategy( + max_depth=1, filter_chain=FilterChain([relevance_filter]) + ), + scraping_strategy=LXMLWebScrapingStrategy(), + verbose=True, + cache_mode=CacheMode.BYPASS, + ) + + results = await crawler.arun(url="https://docs.crawl4ai.com", config=config) + + print(f" ✅ Found {len(results)} pages") + for result in results: + relevance_score = result.metadata.get("relevance_score", 0) + print(f" → Score: {relevance_score:.2f} | {result.url}") + +# 5️⃣ Max Pages and Score Thresholds +async def max_pages_and_thresholds(): + """ + PART 5: Demonstrates using max_pages and score_threshold parameters with different strategies. 
+ + This function shows: + - How to limit the number of pages crawled + - How to set score thresholds for more targeted crawling + - Comparing BFS, DFS, and Best-First strategies with these parameters + """ + print("\n===== MAX PAGES AND SCORE THRESHOLDS =====") + + from crawl4ai.deep_crawling import DFSDeepCrawlStrategy + + async with AsyncWebCrawler() as crawler: + # Define a common keyword scorer for all examples + keyword_scorer = KeywordRelevanceScorer( + keywords=["browser", "crawler", "web", "automation"], + weight=1.0 + ) + + # EXAMPLE 1: BFS WITH MAX PAGES + print("\n📊 EXAMPLE 1: BFS STRATEGY WITH MAX PAGES LIMIT") + print(" Limit the crawler to a maximum of 5 pages") + + bfs_config = CrawlerRunConfig( + deep_crawl_strategy=BFSDeepCrawlStrategy( + max_depth=2, + include_external=False, + url_scorer=keyword_scorer, + max_pages=5 # Only crawl 5 pages + ), + scraping_strategy=LXMLWebScrapingStrategy(), + verbose=True, + cache_mode=CacheMode.BYPASS, + ) + + results = await crawler.arun(url="https://docs.crawl4ai.com", config=bfs_config) + + print(f" ✅ Crawled exactly {len(results)} pages as specified by max_pages") + for result in results: + depth = result.metadata.get("depth", 0) + print(f" → Depth: {depth} | {result.url}") + + # EXAMPLE 2: DFS WITH SCORE THRESHOLD + print("\n📊 EXAMPLE 2: DFS STRATEGY WITH SCORE THRESHOLD") + print(" Only crawl pages with a relevance score above 0.5") + + dfs_config = CrawlerRunConfig( + deep_crawl_strategy=DFSDeepCrawlStrategy( + max_depth=2, + include_external=False, + url_scorer=keyword_scorer, + score_threshold=0.7, # Only process URLs with scores above 0.5 + max_pages=10 + ), + scraping_strategy=LXMLWebScrapingStrategy(), + verbose=True, + cache_mode=CacheMode.BYPASS, + ) + + results = await crawler.arun(url="https://docs.crawl4ai.com", config=dfs_config) + + print(f" ✅ Crawled {len(results)} pages with scores above threshold") + for result in results: + score = result.metadata.get("score", 0) + depth = result.metadata.get("depth", 0) + print(f" → Depth: {depth} | Score: {score:.2f} | {result.url}") + + # EXAMPLE 3: BEST-FIRST WITH BOTH CONSTRAINTS + print("\n📊 EXAMPLE 3: BEST-FIRST STRATEGY WITH BOTH CONSTRAINTS") + print(" Limit to 7 pages with scores above 0.3, prioritizing highest scores") + + bf_config = CrawlerRunConfig( + deep_crawl_strategy=BestFirstCrawlingStrategy( + max_depth=2, + include_external=False, + url_scorer=keyword_scorer, + max_pages=7, # Limit to 7 pages total + ), + scraping_strategy=LXMLWebScrapingStrategy(), + verbose=True, + cache_mode=CacheMode.BYPASS, + stream=True, + ) + + results = [] + async for result in await crawler.arun(url="https://docs.crawl4ai.com", config=bf_config): + results.append(result) + score = result.metadata.get("score", 0) + depth = result.metadata.get("depth", 0) + print(f" → Depth: {depth} | Score: {score:.2f} | {result.url}") + + print(f" ✅ Crawled {len(results)} high-value pages with scores above 0.3") + if results: + avg_score = sum(r.metadata.get('score', 0) for r in results) / len(results) + print(f" ✅ Average score: {avg_score:.2f}") + print(" 🔍 Note: BestFirstCrawlingStrategy visited highest-scoring pages first") + +# 6️⃣ Wrap-Up and Key Takeaways +async def wrap_up(): + """ + PART 6: Wrap-Up and Key Takeaways + + Summarize the key concepts learned in this tutorial. 
+ """ + print("\n===== COMPLETE CRAWLER EXAMPLE =====") + print("Combining filters, scorers, and streaming for an optimized crawl") + + # Create a sophisticated filter chain + filter_chain = FilterChain( + [ + DomainFilter( + allowed_domains=["docs.crawl4ai.com"], + blocked_domains=["old.docs.crawl4ai.com"], + ), + URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]), + ContentTypeFilter(allowed_types=["text/html"]), + ] + ) + + # Create a composite scorer that combines multiple scoring strategies + keyword_scorer = KeywordRelevanceScorer( + keywords=["crawl", "example", "async", "configuration"], weight=0.7 + ) + # Set up the configuration + config = CrawlerRunConfig( + deep_crawl_strategy=BestFirstCrawlingStrategy( + max_depth=1, + include_external=False, + filter_chain=filter_chain, + url_scorer=keyword_scorer, + ), + scraping_strategy=LXMLWebScrapingStrategy(), + stream=True, + verbose=True, + ) + + # Execute the crawl + results = [] + start_time = time.perf_counter() + + async with AsyncWebCrawler() as crawler: + async for result in await crawler.arun( + url="https://docs.crawl4ai.com", config=config + ): + results.append(result) + score = result.metadata.get("score", 0) + depth = result.metadata.get("depth", 0) + print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}") + + duration = time.perf_counter() - start_time + + # Summarize the results + print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds") + print( + f"✅ Average score: {sum(r.metadata.get('score', 0) for r in results) / len(results):.2f}" + ) + + # Group by depth + depth_counts = {} + for result in results: + depth = result.metadata.get("depth", 0) + depth_counts[depth] = depth_counts.get(depth, 0) + 1 + + print("\n📊 Pages crawled by depth:") + for depth, count in sorted(depth_counts.items()): + print(f" Depth {depth}: {count} pages") + +async def run_tutorial(): + """ + Executes all tutorial sections in sequence. + """ + print("\n🚀 CRAWL4AI DEEP CRAWLING TUTORIAL 🚀") + print("======================================") + print("This tutorial will walk you through deep crawling techniques,") + print("from basic to advanced, using the Crawl4AI library.") + + # Define sections - uncomment to run specific parts during development + tutorial_sections = [ + basic_deep_crawl, + stream_vs_nonstream, + filters_and_scorers, + max_pages_and_thresholds, + advanced_filters, + wrap_up, + ] + + for section in tutorial_sections: + await section() + + print("\n🎉 TUTORIAL COMPLETE! 🎉") + print("You now have a comprehensive understanding of deep crawling with Crawl4AI.") + print("For more information, check out https://docs.crawl4ai.com") + +# Execute the tutorial when run directly +if __name__ == "__main__": + asyncio.run(run_tutorial()) +``` diff --git a/deploy/docker/c4ai-doc-context.md b/deploy/docker/c4ai-doc-context.md new file mode 100644 index 00000000..1642f85e --- /dev/null +++ b/deploy/docker/c4ai-doc-context.md @@ -0,0 +1,8899 @@ +# Crawl4AI Doc Context + +Generated on 2025-04-21 + +## File: docs/md_v2/core/ask-ai.md + +```md +
    + +
    + + + + + +``` + + +## File: docs/md_v2/core/browser-crawler-config.md + +```md +# Browser, Crawler & LLM Configuration (Quick Overview) + +Crawl4AI’s flexibility stems from two key classes: + +1. **`BrowserConfig`** – Dictates **how** the browser is launched and behaves (e.g., headless or visible, proxy, user agent). +2. **`CrawlerRunConfig`** – Dictates **how** each **crawl** operates (e.g., caching, extraction, timeouts, JavaScript code to run, etc.). +3. **`LLMConfig`** - Dictates **how** LLM providers are configured. (model, api token, base url, temperature etc.) + +In most examples, you create **one** `BrowserConfig` for the entire crawler session, then pass a **fresh** or re-used `CrawlerRunConfig` whenever you call `arun()`. This tutorial shows the most commonly used parameters. If you need advanced or rarely used fields, see the [Configuration Parameters](../api/parameters.md). + +--- + +## 1. BrowserConfig Essentials + +```python +class BrowserConfig: + def __init__( + browser_type="chromium", + headless=True, + proxy_config=None, + viewport_width=1080, + viewport_height=600, + verbose=True, + use_persistent_context=False, + user_data_dir=None, + cookies=None, + headers=None, + user_agent=None, + text_mode=False, + light_mode=False, + extra_args=None, + # ... other advanced parameters omitted here + ): + ... +``` + +### Key Fields to Note + + + +1. **`browser_type`** +- Options: `"chromium"`, `"firefox"`, or `"webkit"`. +- Defaults to `"chromium"`. +- If you need a different engine, specify it here. + +2. **`headless`** + - `True`: Runs the browser in headless mode (invisible browser). + - `False`: Runs the browser in visible mode, which helps with debugging. + +3. **`proxy_config`** + - A dictionary with fields like: +```json +{ + "server": "http://proxy.example.com:8080", + "username": "...", + "password": "..." +} +``` + - Leave as `None` if a proxy is not required. + +4. **`viewport_width` & `viewport_height`**: + - The initial window size. + - Some sites behave differently with smaller or bigger viewports. + +5. **`verbose`**: + - If `True`, prints extra logs. + - Handy for debugging. + +6. **`use_persistent_context`**: + - If `True`, uses a **persistent** browser profile, storing cookies/local storage across runs. + - Typically also set `user_data_dir` to point to a folder. + +7. **`cookies`** & **`headers`**: + - If you want to start with specific cookies or add universal HTTP headers, set them here. + - E.g. `cookies=[{"name": "session", "value": "abc123", "domain": "example.com"}]`. + +8. **`user_agent`**: + - Custom User-Agent string. If `None`, a default is used. + - You can also set `user_agent_mode="random"` for randomization (if you want to fight bot detection). + +9. **`text_mode`** & **`light_mode`**: + - `text_mode=True` disables images, possibly speeding up text-only crawls. + - `light_mode=True` turns off certain background features for performance. + +10. **`extra_args`**: + - Additional flags for the underlying browser. + - E.g. `["--disable-extensions"]`. 
+ +### Helper Methods + +Both configuration classes provide a `clone()` method to create modified copies: + +```python +# Create a base browser config +base_browser = BrowserConfig( + browser_type="chromium", + headless=True, + text_mode=True +) + +# Create a visible browser config for debugging +debug_browser = base_browser.clone( + headless=False, + verbose=True +) +``` + +**Minimal Example**: + +```python +from crawl4ai import AsyncWebCrawler, BrowserConfig + +browser_conf = BrowserConfig( + browser_type="firefox", + headless=False, + text_mode=True +) + +async with AsyncWebCrawler(config=browser_conf) as crawler: + result = await crawler.arun("https://example.com") + print(result.markdown[:300]) +``` + +--- + +## 2. CrawlerRunConfig Essentials + +```python +class CrawlerRunConfig: + def __init__( + word_count_threshold=200, + extraction_strategy=None, + markdown_generator=None, + cache_mode=None, + js_code=None, + wait_for=None, + screenshot=False, + pdf=False, + capture_mhtml=False, + enable_rate_limiting=False, + rate_limit_config=None, + memory_threshold_percent=70.0, + check_interval=1.0, + max_session_permit=20, + display_mode=None, + verbose=True, + stream=False, # Enable streaming for arun_many() + # ... other advanced parameters omitted + ): + ... +``` + +### Key Fields to Note + +1. **`word_count_threshold`**: + - The minimum word count before a block is considered. + - If your site has lots of short paragraphs or items, you can lower it. + +2. **`extraction_strategy`**: + - Where you plug in JSON-based extraction (CSS, LLM, etc.). + - If `None`, no structured extraction is done (only raw/cleaned HTML + markdown). + +3. **`markdown_generator`**: + - E.g., `DefaultMarkdownGenerator(...)`, controlling how HTML→Markdown conversion is done. + - If `None`, a default approach is used. + +4. **`cache_mode`**: + - Controls caching behavior (`ENABLED`, `BYPASS`, `DISABLED`, etc.). + - If `None`, defaults to some level of caching or you can specify `CacheMode.ENABLED`. + +5. **`js_code`**: + - A string or list of JS strings to execute. + - Great for “Load More” buttons or user interactions. + +6. **`wait_for`**: + - A CSS or JS expression to wait for before extracting content. + - Common usage: `wait_for="css:.main-loaded"` or `wait_for="js:() => window.loaded === true"`. + +7. **`screenshot`**, **`pdf`**, & **`capture_mhtml`**: + - If `True`, captures a screenshot, PDF, or MHTML snapshot after the page is fully loaded. + - The results go to `result.screenshot` (base64), `result.pdf` (bytes), or `result.mhtml` (string). +8. **`verbose`**: + - Logs additional runtime details. + - Overlaps with the browser’s verbosity if also set to `True` in `BrowserConfig`. + +9. **`enable_rate_limiting`**: + - If `True`, enables rate limiting for batch processing. + - Requires `rate_limit_config` to be set. + +10. **`memory_threshold_percent`**: + - The memory threshold (as a percentage) to monitor. + - If exceeded, the crawler will pause or slow down. + +11. **`check_interval`**: + - The interval (in seconds) to check system resources. + - Affects how often memory and CPU usage are monitored. + +12. **`max_session_permit`**: + - The maximum number of concurrent crawl sessions. + - Helps prevent overwhelming the system. + +13. **`display_mode`**: + - The display mode for progress information (`DETAILED`, `BRIEF`, etc.). + - Affects how much information is printed during the crawl. 
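+
+As a quick illustration of the run-level fields above, the sketch below combines a few of them in one configuration; the URL, JS snippet, and `wait_for` selector are placeholders you would adapt to your target site:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+
+# Hypothetical values for illustration only.
+run_conf = CrawlerRunConfig(
+    word_count_threshold=10,       # keep even short text blocks
+    cache_mode=CacheMode.BYPASS,   # always fetch fresh content
+    js_code="window.scrollTo(0, document.body.scrollHeight);",  # e.g. trigger lazy loading
+    wait_for="css:.main-loaded",   # wait for this element before extracting
+    screenshot=True,               # capture a screenshot into result.screenshot
+    verbose=True,
+)
+
+async def main():
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(url="https://example.com", config=run_conf)
+        print(len(result.markdown))
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```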
+
+### Helper Methods
+
+The `clone()` method is particularly useful for creating variations of your crawler configuration:
+
+```python
+# Create a base configuration
+base_config = CrawlerRunConfig(
+    cache_mode=CacheMode.ENABLED,
+    word_count_threshold=200,
+    wait_until="networkidle"
+)
+
+# Create variations for different use cases
+stream_config = base_config.clone(
+    stream=True,  # Enable streaming mode
+    cache_mode=CacheMode.BYPASS
+)
+
+debug_config = base_config.clone(
+    page_timeout=120000,  # Longer timeout for debugging
+    verbose=True
+)
+```
+
+The `clone()` method:
+- Creates a new instance with all the same settings
+- Updates only the specified parameters
+- Leaves the original configuration unchanged
+- Perfect for creating variations without repeating all parameters
+
+---
+
+## 3. LLMConfig Essentials
+
+### Key Fields to Note
+
+1. **`provider`**:
+    - Which LLM provider to use.
+    - Possible values are `"ollama/llama3","groq/llama3-70b-8192","groq/llama3-8b-8192","openai/gpt-4o-mini","openai/gpt-4o","openai/o1-mini","openai/o1-preview","openai/o3-mini","openai/o3-mini-high","anthropic/claude-3-haiku-20240307","anthropic/claude-3-opus-20240229","anthropic/claude-3-sonnet-20240229","anthropic/claude-3-5-sonnet-20240620","gemini/gemini-pro","gemini/gemini-1.5-pro","gemini/gemini-2.0-flash","gemini/gemini-2.0-flash-exp","gemini/gemini-2.0-flash-lite-preview-02-05","deepseek/deepseek-chat"`
+      *(default: `"openai/gpt-4o-mini"`)*
+
+2. **`api_token`**:
+    - The API token for your LLM provider.
+    - Optional. When not provided explicitly, the token is read from an environment variable based on the provider. For example, if a Gemini model is passed as the provider, `"GEMINI_API_KEY"` is read from the environment.
+    - Pass it directly, e.g. `api_token = "gsk_1ClHGGJ7Lpn4WGybR7vNWGdyb3FY7zXEw3SCiy0BAVM9lL8CQv"`
+    - Or reference an environment variable with the `"env:"` prefix, e.g. `api_token = "env: GROQ_API_KEY"`
+
+3. **`base_url`**:
+    - If your provider has a custom endpoint.
+
+```python
+llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
+```
+
+## 4. Putting It All Together
+
+In a typical scenario, you define **one** `BrowserConfig` for your crawler session, then create **one or more** `CrawlerRunConfig` & `LLMConfig` depending on each call’s needs:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig, DefaultMarkdownGenerator
+from crawl4ai.content_filter_strategy import LLMContentFilter
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+
+async def main():
+    # 1) Browser config: headless, bigger viewport, no proxy
+    browser_conf = BrowserConfig(
+        headless=True,
+        viewport_width=1280,
+        viewport_height=720
+    )
+
+    # 2) Example extraction strategy
+    schema = {
+        "name": "Articles",
+        "baseSelector": "div.article",
+        "fields": [
+            {"name": "title", "selector": "h2", "type": "text"},
+            {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"}
+        ]
+    }
+    extraction = JsonCssExtractionStrategy(schema)
+
+    # 3) Example LLM content filtering
+
+    gemini_config = LLMConfig(
+        provider="gemini/gemini-1.5-pro",
+        api_token="env:GEMINI_API_TOKEN"
+    )
+
+    # Initialize LLM filter with specific instruction
+    filter = LLMContentFilter(
+        llm_config=gemini_config,  # or your preferred provider
+        instruction="""
+        Focus on extracting the core educational content.
+        Include:
+        - Key concepts and explanations
+        - Important code examples
+        - Essential technical details
+        Exclude:
+        - Navigation elements
+        - Sidebars
+        - Footer content
+        Format the output as clean markdown with proper code blocks and headers.
+        """,
+        chunk_token_threshold=500,  # Adjust based on your needs
+        verbose=True
+    )
+
+    md_generator = DefaultMarkdownGenerator(
+        content_filter=filter,
+        options={"ignore_links": True}
+    )
+
+    # 4) Crawler run config: skip cache, use extraction
+    run_conf = CrawlerRunConfig(
+        markdown_generator=md_generator,
+        extraction_strategy=extraction,
+        cache_mode=CacheMode.BYPASS,
+    )
+
+    async with AsyncWebCrawler(config=browser_conf) as crawler:
+        # 5) Execute the crawl
+        result = await crawler.arun(url="https://example.com/news", config=run_conf)
+
+        if result.success:
+            print("Extracted content:", result.extracted_content)
+        else:
+            print("Error:", result.error_message)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+---
+
+## 5. Next Steps
+
+For a **detailed list** of available parameters (including advanced ones), see:
+
+- [BrowserConfig, CrawlerRunConfig & LLMConfig Reference](../api/parameters.md)
+
+You can explore topics like:
+
+- **Custom Hooks & Auth** (Inject JavaScript or handle login forms).
+- **Session Management** (Re-use pages, preserve state across multiple calls).
+- **Magic Mode** or **Identity-based Crawling** (Fight bot detection by simulating user behavior).
+- **Advanced Caching** (Fine-tune read/write cache modes).
+
+---
+
+## 6. Conclusion
+
+**BrowserConfig**, **CrawlerRunConfig** and **LLMConfig** give you straightforward ways to define:
+
+- **Which** browser to launch, how it should run, and any proxy or user agent needs.
+- **How** each crawl should behave—caching, timeouts, JavaScript code, extraction strategies, etc.
+- **Which** LLM provider to use, api token, temperature and base url for custom endpoints + +Use them together for **clear, maintainable** code, and when you need more specialized behavior, check out the advanced parameters in the [reference docs](../api/parameters.md). Happy crawling! +``` + + +## File: docs/md_v2/core/cache-modes.md + +```md +# Crawl4AI Cache System and Migration Guide + +## Overview +Starting from version 0.5.0, Crawl4AI introduces a new caching system that replaces the old boolean flags with a more intuitive `CacheMode` enum. This change simplifies cache control and makes the behavior more predictable. + +## Old vs New Approach + +### Old Way (Deprecated) +The old system used multiple boolean flags: +- `bypass_cache`: Skip cache entirely +- `disable_cache`: Disable all caching +- `no_cache_read`: Don't read from cache +- `no_cache_write`: Don't write to cache + +### New Way (Recommended) +The new system uses a single `CacheMode` enum: +- `CacheMode.ENABLED`: Normal caching (read/write) +- `CacheMode.DISABLED`: No caching at all +- `CacheMode.READ_ONLY`: Only read from cache +- `CacheMode.WRITE_ONLY`: Only write to cache +- `CacheMode.BYPASS`: Skip cache for this operation + +## Migration Example + +### Old Code (Deprecated) +```python +import asyncio +from crawl4ai import AsyncWebCrawler + +async def use_proxy(): + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", + bypass_cache=True # Old way + ) + print(len(result.markdown)) + +async def main(): + await use_proxy() + +if __name__ == "__main__": + asyncio.run(main()) +``` + +### New Code (Recommended) +```python +import asyncio +from crawl4ai import AsyncWebCrawler, CacheMode +from crawl4ai.async_configs import CrawlerRunConfig + +async def use_proxy(): + # Use CacheMode in CrawlerRunConfig + config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", + config=config # Pass the configuration object + ) + print(len(result.markdown)) + +async def main(): + await use_proxy() + +if __name__ == "__main__": + asyncio.run(main()) +``` + +## Common Migration Patterns + +| Old Flag | New Mode | +|-----------------------|---------------------------------| +| `bypass_cache=True` | `cache_mode=CacheMode.BYPASS` | +| `disable_cache=True` | `cache_mode=CacheMode.DISABLED`| +| `no_cache_read=True` | `cache_mode=CacheMode.WRITE_ONLY` | +| `no_cache_write=True` | `cache_mode=CacheMode.READ_ONLY` | +``` + + +## File: docs/md_v2/core/cli.md + +```md +# Crawl4AI CLI Guide + +## Table of Contents +- [Installation](#installation) +- [Basic Usage](#basic-usage) +- [Configuration](#configuration) + - [Browser Configuration](#browser-configuration) + - [Crawler Configuration](#crawler-configuration) + - [Extraction Configuration](#extraction-configuration) + - [Content Filtering](#content-filtering) +- [Advanced Features](#advanced-features) + - [LLM Q&A](#llm-qa) + - [Structured Data Extraction](#structured-data-extraction) + - [Content Filtering](#content-filtering-1) +- [Output Formats](#output-formats) +- [Examples](#examples) +- [Configuration Reference](#configuration-reference) +- [Best Practices & Tips](#best-practices--tips) + +## Basic Usage + +The Crawl4AI CLI (`crwl`) provides a simple interface to the Crawl4AI library: + +```bash +# Basic crawling +crwl https://example.com + +# Get markdown output +crwl https://example.com -o 
markdown + +# Verbose JSON output with cache bypass +crwl https://example.com -o json -v --bypass-cache + +# See usage examples +crwl --example +``` + +## Quick Example of Advanced Usage + +If you clone the repository and run the following command, you will receive the content of the page in JSON format according to a JSON-CSS schema: + +```bash +crwl "https://www.infoq.com/ai-ml-data-eng/" -e docs/examples/cli/extract_css.yml -s docs/examples/cli/css_schema.json -o json; +``` + +## Configuration + +### Browser Configuration + +Browser settings can be configured via YAML file or command line parameters: + +```yaml +# browser.yml +headless: true +viewport_width: 1280 +user_agent_mode: "random" +verbose: true +ignore_https_errors: true +``` + +```bash +# Using config file +crwl https://example.com -B browser.yml + +# Using direct parameters +crwl https://example.com -b "headless=true,viewport_width=1280,user_agent_mode=random" +``` + +### Crawler Configuration + +Control crawling behavior: + +```yaml +# crawler.yml +cache_mode: "bypass" +wait_until: "networkidle" +page_timeout: 30000 +delay_before_return_html: 0.5 +word_count_threshold: 100 +scan_full_page: true +scroll_delay: 0.3 +process_iframes: false +remove_overlay_elements: true +magic: true +verbose: true +``` + +```bash +# Using config file +crwl https://example.com -C crawler.yml + +# Using direct parameters +crwl https://example.com -c "css_selector=#main,delay_before_return_html=2,scan_full_page=true" +``` + +### Extraction Configuration + +Two types of extraction are supported: + +1. CSS/XPath-based extraction: +```yaml +# extract_css.yml +type: "json-css" +params: + verbose: true +``` + +```json +// css_schema.json +{ + "name": "ArticleExtractor", + "baseSelector": ".article", + "fields": [ + { + "name": "title", + "selector": "h1.title", + "type": "text" + }, + { + "name": "link", + "selector": "a.read-more", + "type": "attribute", + "attribute": "href" + } + ] +} +``` + +2. LLM-based extraction: +```yaml +# extract_llm.yml +type: "llm" +provider: "openai/gpt-4" +instruction: "Extract all articles with their titles and links" +api_token: "your-token" +params: + temperature: 0.3 + max_tokens: 1000 +``` + +```json +// llm_schema.json +{ + "title": "Article", + "type": "object", + "properties": { + "title": { + "type": "string", + "description": "The title of the article" + }, + "link": { + "type": "string", + "description": "URL to the full article" + } + } +} +``` + +## Advanced Features + +### LLM Q&A + +Ask questions about crawled content: + +```bash +# Simple question +crwl https://example.com -q "What is the main topic discussed?" + +# View content then ask questions +crwl https://example.com -o markdown # See content first +crwl https://example.com -q "Summarize the key points" +crwl https://example.com -q "What are the conclusions?" + +# Combined with advanced crawling +crwl https://example.com \ + -B browser.yml \ + -c "css_selector=article,scan_full_page=true" \ + -q "What are the pros and cons mentioned?" +``` + +First-time setup: +- Prompts for LLM provider and API token +- Saves configuration in `~/.crawl4ai/global.yml` +- Supports various providers (openai/gpt-4, anthropic/claude-3-sonnet, etc.) +- For case of `ollama` you do not need to provide API token. 
+- See [LiteLLM Providers](https://docs.litellm.ai/docs/providers) for full list + +### Structured Data Extraction + +Extract structured data using CSS selectors: + +```bash +crwl https://example.com \ + -e extract_css.yml \ + -s css_schema.json \ + -o json +``` + +Or using LLM-based extraction: + +```bash +crwl https://example.com \ + -e extract_llm.yml \ + -s llm_schema.json \ + -o json +``` + +### Content Filtering + +Filter content for relevance: + +```yaml +# filter_bm25.yml +type: "bm25" +query: "target content" +threshold: 1.0 + +# filter_pruning.yml +type: "pruning" +query: "focus topic" +threshold: 0.48 +``` + +```bash +crwl https://example.com -f filter_bm25.yml -o markdown-fit +``` + +## Output Formats + +- `all` - Full crawl result including metadata +- `json` - Extracted structured data (when using extraction) +- `markdown` / `md` - Raw markdown output +- `markdown-fit` / `md-fit` - Filtered markdown for better readability + +## Complete Examples + +1. Basic Extraction: +```bash +crwl https://example.com \ + -B browser.yml \ + -C crawler.yml \ + -o json +``` + +2. Structured Data Extraction: +```bash +crwl https://example.com \ + -e extract_css.yml \ + -s css_schema.json \ + -o json \ + -v +``` + +3. LLM Extraction with Filtering: +```bash +crwl https://example.com \ + -B browser.yml \ + -e extract_llm.yml \ + -s llm_schema.json \ + -f filter_bm25.yml \ + -o json +``` + +4. Interactive Q&A: +```bash +# First crawl and view +crwl https://example.com -o markdown + +# Then ask questions +crwl https://example.com -q "What are the main points?" +crwl https://example.com -q "Summarize the conclusions" +``` + +## Best Practices & Tips + +1. **Configuration Management**: + - Keep common configurations in YAML files + - Use CLI parameters for quick overrides + - Store sensitive data (API tokens) in `~/.crawl4ai/global.yml` + +2. **Performance Optimization**: + - Use `--bypass-cache` for fresh content + - Enable `scan_full_page` for infinite scroll pages + - Adjust `delay_before_return_html` for dynamic content + +3. **Content Extraction**: + - Use CSS extraction for structured content + - Use LLM extraction for unstructured content + - Combine with filters for focused results + +4. **Q&A Workflow**: + - View content first with `-o markdown` + - Ask specific questions + - Use broader context with appropriate selectors + +## Recap + +The Crawl4AI CLI provides: +- Flexible configuration via files and parameters +- Multiple extraction strategies (CSS, XPath, LLM) +- Content filtering and optimization +- Interactive Q&A capabilities +- Various output formats + + +``` + + +## File: docs/md_v2/core/content-selection.md + +```md +# Content Selection + +Crawl4AI provides multiple ways to **select**, **filter**, and **refine** the content from your crawls. Whether you need to target a specific CSS region, exclude entire tags, filter out external links, or remove certain domains and images, **`CrawlerRunConfig`** offers a wide range of parameters. + +Below, we show how to configure these parameters and combine them for precise control. + +--- + +## 1. CSS-Based Selection + +There are two ways to select content from a page: using `css_selector` or the more flexible `target_elements`. 
+ +### 1.1 Using `css_selector` + +A straightforward way to **limit** your crawl results to a certain region of the page is **`css_selector`** in **`CrawlerRunConfig`**: + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig + +async def main(): + config = CrawlerRunConfig( + # e.g., first 30 items from Hacker News + css_selector=".athing:nth-child(-n+30)" + ) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://news.ycombinator.com/newest", + config=config + ) + print("Partial HTML length:", len(result.cleaned_html)) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**Result**: Only elements matching that selector remain in `result.cleaned_html`. + +### 1.2 Using `target_elements` + +The `target_elements` parameter provides more flexibility by allowing you to target **multiple elements** for content extraction while preserving the entire page context for other features: + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig + +async def main(): + config = CrawlerRunConfig( + # Target article body and sidebar, but not other content + target_elements=["article.main-content", "aside.sidebar"] + ) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/blog-post", + config=config + ) + print("Markdown focused on target elements") + print("Links from entire page still available:", len(result.links.get("internal", []))) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**Key difference**: With `target_elements`, the markdown generation and structural data extraction focus on those elements, but other page elements (like links, images, and tables) are still extracted from the entire page. This gives you fine-grained control over what appears in your markdown content while preserving full page context for link analysis and media collection. + +--- + +## 2. Content Filtering & Exclusions + +### 2.1 Basic Overview + +```python +config = CrawlerRunConfig( + # Content thresholds + word_count_threshold=10, # Minimum words per block + + # Tag exclusions + excluded_tags=['form', 'header', 'footer', 'nav'], + + # Link filtering + exclude_external_links=True, + exclude_social_media_links=True, + # Block entire domains + exclude_domains=["adtrackers.com", "spammynews.org"], + exclude_social_media_domains=["facebook.com", "twitter.com"], + + # Media filtering + exclude_external_images=True +) +``` + +**Explanation**: + +- **`word_count_threshold`**: Ignores text blocks under X words. Helps skip trivial blocks like short nav or disclaimers. +- **`excluded_tags`**: Removes entire tags (``, `
    `, `