From 7b9aabc64a8ee8e992cfe2eeea9f00785ca0e069 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Mon, 14 Apr 2025 12:11:22 +0200 Subject: [PATCH 01/37] fix(crawler): ensure max_pages limit is respected during batch processing in crawling strategies --- crawl4ai/deep_crawling/bff_strategy.py | 12 ++++++++++++ crawl4ai/deep_crawling/bfs_strategy.py | 9 +++++++++ crawl4ai/deep_crawling/dfs_strategy.py | 8 ++++++++ 3 files changed, 29 insertions(+) diff --git a/crawl4ai/deep_crawling/bff_strategy.py b/crawl4ai/deep_crawling/bff_strategy.py index 4811ba14..fd1b30bf 100644 --- a/crawl4ai/deep_crawling/bff_strategy.py +++ b/crawl4ai/deep_crawling/bff_strategy.py @@ -148,6 +148,14 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy): self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl") break + # Calculate how many more URLs we can process in this batch + remaining = self.max_pages - self._pages_crawled + batch_size = min(BATCH_SIZE, remaining) + if batch_size <= 0: + # No more pages to crawl + self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl") + break + batch: List[Tuple[float, int, str, Optional[str]]] = [] # Retrieve up to BATCH_SIZE items from the priority queue. 
for _ in range(BATCH_SIZE): @@ -182,6 +190,10 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy): # Count only successful crawls toward max_pages limit if result.success: self._pages_crawled += 1 + # Check if we've reached the limit during batch processing + if self._pages_crawled >= self.max_pages: + self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl") + break # Exit the generator yield result diff --git a/crawl4ai/deep_crawling/bfs_strategy.py b/crawl4ai/deep_crawling/bfs_strategy.py index 54b72ea3..84e00642 100644 --- a/crawl4ai/deep_crawling/bfs_strategy.py +++ b/crawl4ai/deep_crawling/bfs_strategy.py @@ -156,6 +156,11 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy): results: List[CrawlResult] = [] while current_level and not self._cancel_event.is_set(): + # Check if we've already reached max_pages before starting a new level + if self._pages_crawled >= self.max_pages: + self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl") + break + next_level: List[Tuple[str, Optional[str]]] = [] urls = [url for url, _ in current_level] visited.update(urls) @@ -221,6 +226,10 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy): # Count only successful crawls if result.success: self._pages_crawled += 1 + # Check if we've reached the limit during batch processing + if self._pages_crawled >= self.max_pages: + self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl") + break # Exit the generator results_count += 1 yield result diff --git a/crawl4ai/deep_crawling/dfs_strategy.py b/crawl4ai/deep_crawling/dfs_strategy.py index f79f9628..0eca58e3 100644 --- a/crawl4ai/deep_crawling/dfs_strategy.py +++ b/crawl4ai/deep_crawling/dfs_strategy.py @@ -49,6 +49,10 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy): # Count only successful crawls toward max_pages limit if result.success: self._pages_crawled += 1 + # Check if we've reached the limit during batch processing + if 
self._pages_crawled >= self.max_pages: + self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl") + break # Exit the generator # Only discover links from successful crawls new_links: List[Tuple[str, Optional[str]]] = [] @@ -94,6 +98,10 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy): # and only discover links from successful crawls if result.success: self._pages_crawled += 1 + # Check if we've reached the limit during batch processing + if self._pages_crawled >= self.max_pages: + self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl") + break # Exit the generator new_links: List[Tuple[str, Optional[str]]] = [] await self.link_discovery(result, url, depth, visited, new_links, depths) From 1f3b1251d0aa8639c2615f13add944766ecaafa8 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Mon, 14 Apr 2025 12:16:31 +0200 Subject: [PATCH 02/37] docs(cli): add Crawl4AI CLI installation instructions to the CLI guide --- docs/md_v2/core/cli.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/md_v2/core/cli.md b/docs/md_v2/core/cli.md index ff4bf658..ded35f2f 100644 --- a/docs/md_v2/core/cli.md +++ b/docs/md_v2/core/cli.md @@ -17,6 +17,9 @@ - [Configuration Reference](#configuration-reference) - [Best Practices & Tips](#best-practices--tips) +## Installation +The Crawl4AI CLI will be installed automatically when you install the library. 
+ ## Basic Usage The Crawl4AI CLI (`crwl`) provides a simple interface to the Crawl4AI library: From 05085b6e3d48f9b583aada02ccdc2f80db8b6cf8 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Tue, 15 Apr 2025 13:05:19 +0200 Subject: [PATCH 03/37] fix(requirements): add fake-useragent to requirements --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c1f36c56..8ad6bc41 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,4 +21,5 @@ psutil>=6.1.1 nltk>=3.9.1 rich>=13.9.4 cssselect>=1.2.0 -faust-cchardet>=2.1.19 \ No newline at end of file +faust-cchardet>=2.1.19 +fake-useragent>=2.2.0 \ No newline at end of file From 0ec3c4a7886a26e38a7467905f55072dc72737da Mon Sep 17 00:00:00 2001 From: ntohidi Date: Thu, 17 Apr 2025 12:11:12 +0200 Subject: [PATCH 04/37] fix(crawler): handle navigation aborts during file downloads in AsyncPlaywrightCrawlerStrategy --- crawl4ai/async_crawler_strategy.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 89b4df84..28325c84 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -582,7 +582,17 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): ) redirected_url = page.url except Error as e: - raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}") + # Allow navigation to be aborted when downloading files + # This is expected behavior for downloads in some browser engines + if 'net::ERR_ABORTED' in str(e) and self.browser_config.accept_downloads: + self.logger.info( + message=f"Navigation aborted, likely due to file download: {url}", + tag="GOTO", + params={"url": url}, + ) + response = None + else: + raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}") await self.execute_hook( "after_goto", page, context=context, url=url, response=response, config=config From 
0886153d6a4267bf6b1846b8601edc87055fa13e Mon Sep 17 00:00:00 2001 From: ntohidi Date: Thu, 17 Apr 2025 12:48:11 +0200 Subject: [PATCH 05/37] fix(async_playwright_crawler): improve segment handling and viewport adjustments during screenshot capture (Fixed bug: Capturing Screenshot Twice and Increasing Image Size) --- crawl4ai/async_crawler_strategy.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 28325c84..bda4897c 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -1162,12 +1162,32 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): num_segments = (page_height // viewport_height) + 1 for i in range(num_segments): y_offset = i * viewport_height + # Special handling for the last segment + if i == num_segments - 1: + last_part_height = page_height % viewport_height + + # If page_height is an exact multiple of viewport_height, + # we don't need an extra segment + if last_part_height == 0: + # Skip last segment if page height is exact multiple of viewport + break + + # Adjust viewport to exactly match the remaining content height + await page.set_viewport_size({"width": page_width, "height": last_part_height}) + await page.evaluate(f"window.scrollTo(0, {y_offset})") await asyncio.sleep(0.01) # wait for render - seg_shot = await page.screenshot(full_page=False) + + # Capture the current segment + # Note: Using compression options (format, quality) would go here + seg_shot = await page.screenshot(full_page=False, type="jpeg", quality=85) + # seg_shot = await page.screenshot(full_page=False) img = Image.open(BytesIO(seg_shot)).convert("RGB") segments.append(img) + # Reset viewport to original size after capturing segments + await page.set_viewport_size({"width": page_width, "height": viewport_height}) + total_height = sum(img.height for img in segments) stitched = Image.new("RGB", (segments[0].width, 
total_height)) offset = 0 From 14a31456ef249a32be1d971cad9ab056da1a24e7 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Mon, 21 Apr 2025 13:59:49 +0200 Subject: [PATCH 06/37] fix(docs): update browser-crawler-config example to include LLMContentFilter and DefaultMarkdownGenerator, fix syntax errors --- docs/md_v2/core/browser-crawler-config.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/md_v2/core/browser-crawler-config.md b/docs/md_v2/core/browser-crawler-config.md index 0d97e0fc..5f66b3ea 100644 --- a/docs/md_v2/core/browser-crawler-config.md +++ b/docs/md_v2/core/browser-crawler-config.md @@ -265,7 +265,7 @@ In a typical scenario, you define **one** `BrowserConfig` for your crawler sessi ```python import asyncio -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig, LLMContentFilter, DefaultMarkdownGenerator from crawl4ai.extraction_strategy import JsonCssExtractionStrategy async def main(): @@ -290,7 +290,7 @@ async def main(): # 3) Example LLM content filtering gemini_config = LLMConfig( - provider="gemini/gemini-1.5-pro" + provider="gemini/gemini-1.5-pro", api_token = "env:GEMINI_API_TOKEN" ) @@ -314,8 +314,9 @@ async def main(): ) md_generator = DefaultMarkdownGenerator( - content_filter=filter, - options={"ignore_links": True} + content_filter=filter, + options={"ignore_links": True} + ) # 4) Crawler run config: skip cache, use extraction run_conf = CrawlerRunConfig( From 53245e4e0e54dc4604f8b427105d820dba6c38a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sacrist=C3=A1n?= Date: Tue, 29 Apr 2025 16:26:35 +0200 Subject: [PATCH 07/37] Fix: README.md urls list --- README.md | 18 +++++++++++++----- tests/docker_example.py | 18 +++++++++--------- tests/test_docker.py | 16 ++++++++-------- tests/test_main.py | 16 ++++++++-------- 4 files changed, 38 insertions(+), 30 deletions(-) diff --git 
a/README.md b/README.md index 97787b2f..879baa51 100644 --- a/README.md +++ b/README.md @@ -291,12 +291,20 @@ import requests # Submit a crawl job response = requests.post( "http://localhost:11235/crawl", - json={"urls": "https://example.com", "priority": 10} + json={"urls": ["https://example.com"], "priority": 10} ) -task_id = response.json()["task_id"] - -# Continue polling until the task is complete (status="completed") -result = requests.get(f"http://localhost:11235/task/{task_id}") +if response.status_code == 200: + print("Crawl job submitted successfully.") + +if "results" in response.json(): + results = response.json()["results"] + print("Crawl job completed. Results:") + for result in results: + print(result) +else: + task_id = response.json()["task_id"] + print(f"Crawl job submitted. Task ID:: {task_id}") + result = requests.get(f"http://localhost:11235/task/{task_id}") ``` For more examples, see our [Docker Examples](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_example.py). For advanced configuration, environment variables, and usage examples, see our [Docker Deployment Guide](https://docs.crawl4ai.com/basic/docker-deployment/). 
diff --git a/tests/docker_example.py b/tests/docker_example.py index 336ca52f..03348d50 100644 --- a/tests/docker_example.py +++ b/tests/docker_example.py @@ -105,7 +105,7 @@ def test_docker_deployment(version="basic"): def test_basic_crawl(tester: Crawl4AiTester): print("\n=== Testing Basic Crawl ===") request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 10, "session_id": "test", } @@ -119,7 +119,7 @@ def test_basic_crawl(tester: Crawl4AiTester): def test_basic_crawl_sync(tester: Crawl4AiTester): print("\n=== Testing Basic Crawl (Sync) ===") request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 10, "session_id": "test", } @@ -134,7 +134,7 @@ def test_basic_crawl_sync(tester: Crawl4AiTester): def test_js_execution(tester: Crawl4AiTester): print("\n=== Testing JS Execution ===") request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 8, "js_code": [ "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();" @@ -151,7 +151,7 @@ def test_js_execution(tester: Crawl4AiTester): def test_css_selector(tester: Crawl4AiTester): print("\n=== Testing CSS Selector ===") request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 7, "css_selector": ".wide-tease-item__description", "crawler_params": {"headless": True}, @@ -188,7 +188,7 @@ def test_structured_extraction(tester: Crawl4AiTester): } request = { - "urls": "https://www.coinbase.com/explore", + "urls": ["https://www.coinbase.com/explore"], "priority": 9, "extraction_config": {"type": "json_css", "params": {"schema": schema}}, } @@ -223,7 +223,7 @@ def test_llm_extraction(tester: Crawl4AiTester): } request = { - "urls": "https://openai.com/api/pricing", + "urls": 
["https://openai.com/api/pricing"], "priority": 8, "extraction_config": { "type": "llm", @@ -270,7 +270,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester): } request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 8, "extraction_config": { "type": "llm", @@ -297,7 +297,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester): def test_cosine_extraction(tester: Crawl4AiTester): print("\n=== Testing Cosine Extraction ===") request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 8, "extraction_config": { "type": "cosine", @@ -323,7 +323,7 @@ def test_cosine_extraction(tester: Crawl4AiTester): def test_screenshot(tester: Crawl4AiTester): print("\n=== Testing Screenshot ===") request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 5, "screenshot": True, "crawler_params": {"headless": True}, diff --git a/tests/test_docker.py b/tests/test_docker.py index 3570d608..c507ae56 100644 --- a/tests/test_docker.py +++ b/tests/test_docker.py @@ -74,7 +74,7 @@ def test_docker_deployment(version="basic"): def test_basic_crawl(tester: Crawl4AiTester): print("\n=== Testing Basic Crawl ===") - request = {"urls": "https://www.nbcnews.com/business", "priority": 10} + request = {"urls": ["https://www.nbcnews.com/business"], "priority": 10} result = tester.submit_and_wait(request) print(f"Basic crawl result length: {len(result['result']['markdown'])}") @@ -85,7 +85,7 @@ def test_basic_crawl(tester: Crawl4AiTester): def test_js_execution(tester: Crawl4AiTester): print("\n=== Testing JS Execution ===") request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 8, "js_code": [ "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();" @@ 
-102,7 +102,7 @@ def test_js_execution(tester: Crawl4AiTester): def test_css_selector(tester: Crawl4AiTester): print("\n=== Testing CSS Selector ===") request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 7, "css_selector": ".wide-tease-item__description", "crawler_params": {"headless": True}, @@ -139,7 +139,7 @@ def test_structured_extraction(tester: Crawl4AiTester): } request = { - "urls": "https://www.coinbase.com/explore", + "urls": ["https://www.coinbase.com/explore"], "priority": 9, "extraction_config": {"type": "json_css", "params": {"schema": schema}}, } @@ -174,7 +174,7 @@ def test_llm_extraction(tester: Crawl4AiTester): } request = { - "urls": "https://openai.com/api/pricing", + "urls": ["https://openai.com/api/pricing"], "priority": 8, "extraction_config": { "type": "llm", @@ -221,7 +221,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester): } request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 8, "extraction_config": { "type": "llm", @@ -248,7 +248,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester): def test_cosine_extraction(tester: Crawl4AiTester): print("\n=== Testing Cosine Extraction ===") request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 8, "extraction_config": { "type": "cosine", @@ -274,7 +274,7 @@ def test_cosine_extraction(tester: Crawl4AiTester): def test_screenshot(tester: Crawl4AiTester): print("\n=== Testing Screenshot ===") request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 5, "screenshot": True, "crawler_params": {"headless": True}, diff --git a/tests/test_main.py b/tests/test_main.py index 0e938f59..b32b68f0 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -54,7 +54,7 @@ class NBCNewsAPITest: async def test_basic_crawl(): print("\n=== Testing Basic Crawl ===") 
async with NBCNewsAPITest() as api: - request = {"urls": "https://www.nbcnews.com/business", "priority": 10} + request = {"urls": ["https://www.nbcnews.com/business"], "priority": 10} task_id = await api.submit_crawl(request) result = await api.wait_for_task(task_id) print(f"Basic crawl result length: {len(result['result']['markdown'])}") @@ -67,7 +67,7 @@ async def test_js_execution(): print("\n=== Testing JS Execution ===") async with NBCNewsAPITest() as api: request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 8, "js_code": [ "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();" @@ -86,7 +86,7 @@ async def test_css_selector(): print("\n=== Testing CSS Selector ===") async with NBCNewsAPITest() as api: request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 7, "css_selector": ".wide-tease-item__description", } @@ -120,7 +120,7 @@ async def test_structured_extraction(): } request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 9, "extraction_config": {"type": "json_css", "params": {"schema": schema}}, } @@ -177,7 +177,7 @@ async def test_llm_extraction(): } request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 8, "extraction_config": { "type": "llm", @@ -209,7 +209,7 @@ async def test_screenshot(): print("\n=== Testing Screenshot ===") async with NBCNewsAPITest() as api: request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 5, "screenshot": True, "crawler_params": {"headless": True}, @@ -227,7 +227,7 @@ async def test_priority_handling(): async with NBCNewsAPITest() as api: # Submit low priority task first low_priority = { - "urls": 
"https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 1, "crawler_params": {"headless": True}, } @@ -235,7 +235,7 @@ async def test_priority_handling(): # Submit high priority task high_priority = { - "urls": "https://www.nbcnews.com/business/consumer", + "urls": ["https://www.nbcnews.com/business/consumer"], "priority": 10, "crawler_params": {"headless": True}, } From 039be1b1ce7e32d1186ce9d1b123605248f3fb26 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Wed, 30 Apr 2025 11:41:35 +0200 Subject: [PATCH 08/37] feat: add pdf2image dependency to requirements --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 4aa2dbff..b695f92c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,4 +23,5 @@ rich>=13.9.4 cssselect>=1.2.0 chardet>=5.2.0 brotli>=1.1.0 -fake-useragent>=2.2.0 \ No newline at end of file +fake-useragent>=2.2.0 +pdf2image>=1.17.0 \ No newline at end of file From 1d6a2b9979d530703ec76708a385a2d87a1b5f7d Mon Sep 17 00:00:00 2001 From: ntohidi Date: Wed, 30 Apr 2025 12:29:17 +0200 Subject: [PATCH 09/37] fix(crawler): surface real redirect status codes and keep redirect chain. the 30x response instead of always returning 200. Refs #660 --- crawl4ai/async_crawler_strategy.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 3162bd54..da5490b6 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -744,12 +744,33 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): "after_goto", page, context=context, url=url, response=response, config=config ) + # ────────────────────────────────────────────────────────────── + # Walk the redirect chain. 
Playwright returns only the last + # hop, so we trace the `request.redirected_from` links until the + # first response that differs from the final one and surface its + # status-code. + # ────────────────────────────────────────────────────────────── if response is None: status_code = 200 response_headers = {} else: - status_code = response.status - response_headers = response.headers + first_resp = response + req = response.request + while req and req.redirected_from: + prev_req = req.redirected_from + prev_resp = await prev_req.response() + if prev_resp: # keep earliest + first_resp = prev_resp + req = prev_req + + status_code = first_resp.status + response_headers = first_resp.headers + # if response is None: + # status_code = 200 + # response_headers = {} + # else: + # status_code = response.status + # response_headers = response.headers else: status_code = 200 From e0cd3e10de0b04079c2144c6febb54cd74139f50 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Fri, 2 May 2025 10:35:35 +0200 Subject: [PATCH 10/37] fix(crawler): initialize captured_console variable for local file processing --- crawl4ai/async_crawler_strategy.py | 1 + 1 file changed, 1 insertion(+) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index da5490b6..6c0b4115 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -445,6 +445,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): return await self._crawl_web(url, config) elif url.startswith("file://"): + captured_console = None # Process local file local_file_path = url[7:] # Remove 'file://' prefix if not os.path.exists(local_file_path): From 12783fabdab1cdea99e930392c572e83831897df Mon Sep 17 00:00:00 2001 From: ntohidi Date: Wed, 7 May 2025 11:18:13 +0200 Subject: [PATCH 11/37] fix(dependencies): update pillow version constraint to allow newer releases. 
ref #709 --- pyproject.toml | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index be44397e..8b5f0910 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ dependencies = [ "lxml~=5.3", "litellm>=1.53.1", "numpy>=1.26.0,<3", - "pillow~=10.4", + "pillow>=10.4", "playwright>=1.49.0", "python-dotenv~=1.0", "requests~=2.26", diff --git a/requirements.txt b/requirements.txt index b695f92c..10d7fd81 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ aiosqlite~=0.20 lxml~=5.3 litellm>=1.53.1 numpy>=1.26.0,<3 -pillow~=10.4 +pillow>=10.4 playwright>=1.49.0 python-dotenv~=1.0 requests~=2.26 From eebb8c84f0a434f6cec4173a82c8b4dceb510037 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Wed, 7 May 2025 11:18:44 +0200 Subject: [PATCH 12/37] fix(requirements): add PyPDF2 dependency for PDF processing --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 10d7fd81..b62575d8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,4 +24,5 @@ cssselect>=1.2.0 chardet>=5.2.0 brotli>=1.1.0 fake-useragent>=2.2.0 -pdf2image>=1.17.0 \ No newline at end of file +pdf2image>=1.17.0 +PyPDF2>=3.0.1 \ No newline at end of file From 2b17f234f8354dca893063b68aa3ec41431c5d3c Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Wed, 7 May 2025 15:20:36 +0530 Subject: [PATCH 13/37] docs: update direct passing of content_filter to CrawlerRunConfig and instead pass it via MarkdownGenerator. 
Ref: #603 --- deploy/docker/c4ai-doc-context.md | 11 ++++++++--- docs/md_v2/core/markdown-generation.md | 9 ++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/deploy/docker/c4ai-doc-context.md b/deploy/docker/c4ai-doc-context.md index 1642f85e..081f29b7 100644 --- a/deploy/docker/c4ai-doc-context.md +++ b/deploy/docker/c4ai-doc-context.md @@ -403,7 +403,7 @@ async def main(): md_generator = DefaultMarkdownGenerator( content_filter=filter, - options={"ignore_links": True} + options={"ignore_links": True}) # 4) Crawler run config: skip cache, use extraction run_conf = CrawlerRunConfig( @@ -4152,7 +4152,7 @@ prune_filter = PruningContentFilter( For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure: ```python -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator from crawl4ai.content_filter_strategy import LLMContentFilter async def main(): @@ -4175,8 +4175,13 @@ async def main(): verbose=True ) + md_generator = DefaultMarkdownGenerator( + content_filter=filter, + options={"ignore_links": True} + ) + config = CrawlerRunConfig( - content_filter=filter + markdown_generator=md_generator ) async with AsyncWebCrawler() as crawler: diff --git a/docs/md_v2/core/markdown-generation.md b/docs/md_v2/core/markdown-generation.md index e6f5e12a..4a6e9218 100644 --- a/docs/md_v2/core/markdown-generation.md +++ b/docs/md_v2/core/markdown-generation.md @@ -233,7 +233,7 @@ prune_filter = PruningContentFilter( For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. 
This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure: ```python -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator from crawl4ai.content_filter_strategy import LLMContentFilter async def main(): @@ -255,9 +255,12 @@ async def main(): chunk_token_threshold=4096, # Adjust based on your needs verbose=True ) - + md_generator = DefaultMarkdownGenerator( + content_filter=filter, + options={"ignore_links": True} + ) config = CrawlerRunConfig( - content_filter=filter + markdown_generator=md_generator, ) async with AsyncWebCrawler() as crawler: From ee93acbd06c49ce70e3905f267fd15711b39446b Mon Sep 17 00:00:00 2001 From: ntohidi Date: Wed, 7 May 2025 12:32:38 +0200 Subject: [PATCH 14/37] fix(async_playwright_crawler): use config directly instead of self.config for verbosity check --- crawl4ai/async_crawler_strategy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index a6aae4e7..85c3a15c 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -816,7 +816,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): except Error: visibility_info = await self.check_visibility(page) - if self.config.verbose: + if self.verbose: self.logger.debug( message="Body visibility info: {info}", tag="DEBUG", From f6e25e2a6bae8a1b774b6e71fc98edc460d04b53 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Wed, 7 May 2025 17:53:30 +0530 Subject: [PATCH 15/37] fix: check_robots_txt to support wildcard rules ref: #699 --- crawl4ai/utils.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index bfa8ce9d..4018d78c 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -42,6 +42,29 
@@ from itertools import chain from collections import deque from typing import Generator, Iterable +# Monkey patch to fix wildcard handling in urllib.robotparser +from urllib.robotparser import RuleLine +import re + +original_applies_to = RuleLine.applies_to + +def patched_applies_to(self, filename): + # Handle wildcards in paths + if '*' in self.path or '%2A' in self.path or self.path in ("*", "%2A"): + pattern = self.path.replace('%2A', '*') + pattern = re.escape(pattern).replace('\\*', '.*') + pattern = '^' + pattern + if pattern.endswith('\\$'): + pattern = pattern[:-2] + '$' + try: + return bool(re.match(pattern, filename)) + except re.error: + return original_applies_to(self, filename) + return original_applies_to(self, filename) + +RuleLine.applies_to = patched_applies_to +# Monkey patch ends + def chunk_documents( documents: Iterable[str], chunk_token_threshold: int, @@ -303,7 +326,7 @@ class RobotsParser: robots_url = f"{scheme}://{domain}/robots.txt" async with aiohttp.ClientSession() as session: - async with session.get(robots_url, timeout=2) as response: + async with session.get(robots_url, timeout=2, ssl=False) as response: if response.status == 200: rules = await response.text() self._cache_rules(domain, rules) From c1041b9bbee1338ec89997bdf62e76c6a5f3ada6 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Wed, 7 May 2025 18:43:29 +0530 Subject: [PATCH 16/37] fix: exclude_external_images flag simply discards elements ref:https://github.com/unclecode/crawl4ai/issues/345 --- crawl4ai/content_scraping_strategy.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 1dfbce84..d11e02d0 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -718,13 +718,18 @@ class WebScrapingStrategy(ContentScrapingStrategy): # Check flag if we should remove external images if 
kwargs.get("exclude_external_images", False): - element.decompose() - return False - # src_url_base = src.split('/')[2] - # url_base = url.split('/')[2] - # if url_base not in src_url_base: - # element.decompose() - # return False + # Handle relative URLs (which are always from the same domain) + if not src.startswith('http') and not src.startswith('//'): + return True # Keep relative URLs + + # For absolute URLs, compare the base domains using the existing function + src_base_domain = get_base_domain(src) + url_base_domain = get_base_domain(url) + + # If the domains don't match and both are valid, the image is external + if src_base_domain and url_base_domain and src_base_domain != url_base_domain: + element.decompose() + return False # if kwargs.get('exclude_social_media_links', False): # if image_src_base_domain in exclude_social_media_domains: From 25d97d56e4e3bbc74fa1de9423cc5ae3457b0baf Mon Sep 17 00:00:00 2001 From: ntohidi Date: Tue, 13 May 2025 13:56:12 +0200 Subject: [PATCH 17/37] fix(dependencies): remove duplicated aiofiles from project dependencies. REF #1045 --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8b5f0910..a208d5d8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,6 @@ dependencies = [ "psutil>=6.1.1", "nltk>=3.9.1", "playwright", - "aiofiles", "rich>=13.9.4", "cssselect>=1.2.0", "httpx>=0.27.2", From 260e2dc347e2d0b4463eec31f3eaa81e87ca109b Mon Sep 17 00:00:00 2001 From: ntohidi Date: Tue, 13 May 2025 14:03:20 +0200 Subject: [PATCH 18/37] fix(browser): create browser config before launching managed browser instance. 
REF: https://discord.com/channels/1278297938551902308/1278298697540567132/1371683009459392716 --- crawl4ai/browser_profiler.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/crawl4ai/browser_profiler.py b/crawl4ai/browser_profiler.py index 961ba740..41f917f5 100644 --- a/crawl4ai/browser_profiler.py +++ b/crawl4ai/browser_profiler.py @@ -615,9 +615,18 @@ class BrowserProfiler: self.logger.info(f"Debugging port: {debugging_port}", tag="CDP") self.logger.info(f"Headless mode: {headless}", tag="CDP") + # create browser config + browser_config = BrowserConfig( + browser_type=browser_type, + headless=headless, + user_data_dir=profile_path, + debugging_port=debugging_port, + verbose=True + ) + # Create managed browser instance managed_browser = ManagedBrowser( - browser_type=browser_type, + browser_config=browser_config, user_data_dir=profile_path, headless=headless, logger=self.logger, From 137556b3dce373bfd8af09e8bd5f9da0051ba463 Mon Sep 17 00:00:00 2001 From: medo94my Date: Wed, 14 May 2025 16:01:10 +0800 Subject: [PATCH 19/37] fix the EXTRACT to match the styling of the other methods --- crawl4ai/async_webcrawler.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 19b98522..9e42b824 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -360,7 +360,7 @@ class AsyncWebCrawler: pdf_data=pdf_data, verbose=config.verbose, is_raw_html=True if url.startswith("raw:") else False, - redirected_url=async_response.redirected_url, + redirected_url=async_response.redirected_url, **kwargs, ) @@ -503,7 +503,7 @@ class AsyncWebCrawler: tables = media.pop("tables", []) links = result.links.model_dump() metadata = result.metadata - + fit_html = preprocess_html_for_schema(html_content=html, text_threshold= 500, max_size= 300_000) ################################ @@ -585,11 +585,13 @@ class AsyncWebCrawler: # Choose content based on 
input_format content_format = config.extraction_strategy.input_format if content_format == "fit_markdown" and not markdown_result.fit_markdown: - self.logger.warning( - message="Fit markdown requested but not available. Falling back to raw markdown.", - tag="EXTRACT", - params={"url": _url}, - ) + + self.logger.url_status( + url=_url, + success=bool(html), + timing=time.perf_counter() - t1, + tag="EXTRACT", + ) content_format = "markdown" content = { From a3b0cab52a813f505db0f58e40079b4e8d817a6a Mon Sep 17 00:00:00 2001 From: Ahmed-Tawfik94 Date: Thu, 15 May 2025 11:25:06 +0800 Subject: [PATCH 20/37] #1088 is sloved flag -bc now if for --byPass-cache --- crawl4ai/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/cli.py b/crawl4ai/cli.py index 51477d6b..a02eff59 100644 --- a/crawl4ai/cli.py +++ b/crawl4ai/cli.py @@ -1010,7 +1010,7 @@ def cdp_cmd(user_data_dir: Optional[str], port: int, browser_type: str, headless @click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2") @click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "md", "markdown-fit", "md-fit"]), default="all") @click.option("--output-file", "-O", type=click.Path(), help="Output file path (default: stdout)") -@click.option("--bypass-cache", "-b", is_flag=True, default=True, help="Bypass cache when crawling") +@click.option("--bypass-cache", "-bc", is_flag=True, default=True, help="Bypass cache when crawling") @click.option("--question", "-q", help="Ask a question about the crawled content") @click.option("--verbose", "-v", is_flag=True) @click.option("--profile", "-p", help="Use a specific browser profile (by name)") From 32966bea11dc595d752f89502bbe7e0a2240ba28 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Thu, 15 May 2025 10:09:19 +0200 Subject: [PATCH 21/37] fix(extraction): resolve `'str' object has no attribute 'choices'` error in LLMExtractionStrategy. 
Refs: #979 This patch ensures consistent handling of `response.choices[0].message.content` by avoiding redefinition of the `response` variable, which caused downstream exceptions during error handling. --- crawl4ai/extraction_strategy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index 245abc54..6be084b3 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -680,7 +680,7 @@ class LLMExtractionStrategy(ExtractionStrategy): block["error"] = False except Exception: parsed, unparsed = split_and_parse_json_objects( - response.choices[0].message.content + response ) blocks = parsed if unparsed: From e0fbd2b0a0488569ac1d5e89a6363d11a00c2b25 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Thu, 15 May 2025 10:45:23 +0200 Subject: [PATCH 22/37] fix(schema): update `f` parameter description to use lowercase enum values. REF: #1070 Revised the description for the `f` parameter in the `/mcp/md` tool schema to use lowercase enum values (`raw`, `fit`, `bm25`, `llm`) for consistency with the actual `enum` definition. This change prevents LLM-based clients (e.g., Gemini via LibreChat) from generating uppercase values like `"FIT"`, which caused 422 validation errors due to strict case-sensitive matching. 
--- deploy/docker/schemas.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/deploy/docker/schemas.py b/deploy/docker/schemas.py index ea32b6c6..611e17e9 100644 --- a/deploy/docker/schemas.py +++ b/deploy/docker/schemas.py @@ -12,8 +12,7 @@ class CrawlRequest(BaseModel): class MarkdownRequest(BaseModel): """Request body for the /md endpoint.""" url: str = Field(..., description="Absolute http/https URL to fetch") - f: FilterType = Field(FilterType.FIT, - description="Content‑filter strategy: FIT, RAW, BM25, or LLM") + f: FilterType = Field(FilterType.FIT, description="Content‑filter strategy: fit, raw, bm25, or llm") q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters") c: Optional[str] = Field("0", description="Cache‑bust / revision counter") From 22725ca87b76107f5251e1ab97906ecfd61fac07 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Thu, 15 May 2025 11:29:36 +0200 Subject: [PATCH 23/37] fix(crawler): initialize `captured_console` to prevent unbound local error for local HTML files. REF: #1072 Resolved a bug where running the crawler on local HTML files with `capture_console_messages=False` (default) raised `UnboundLocalError` due to `captured_console` being accessed before assignment. 
--- crawl4ai/async_crawler_strategy.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 10d395ee..9a8d621c 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -445,6 +445,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): return await self._crawl_web(url, config) elif url.startswith("file://"): + # initialize empty lists for console messages + captured_console = [] + # Process local file local_file_path = url[7:] # Remove 'file://' prefix if not os.path.exists(local_file_path): From faa98eefbc4f3f87f8751bbb5c534cba4f8507c1 Mon Sep 17 00:00:00 2001 From: Ahmed-Tawfik94 Date: Mon, 19 May 2025 11:35:13 +0800 Subject: [PATCH 24/37] #1105 got fixed (metadata now matches with meta property article:* --- crawl4ai/utils.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index bfa8ce9d..ebf15f24 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -1456,6 +1456,12 @@ def extract_metadata_using_lxml(html, doc=None): content = tag.get("content", "").strip() if property_name and content: metadata[property_name] = content + # getting the article Values + metadata.update({ + tag['property'].strip():tag["content"].strip() + for tag in head.find_all("meta", attrs={"property": re.compile(r"^article:")}) + if tag.has_attr('property') and tag.has_attr('content') + }) return metadata @@ -1531,7 +1537,12 @@ def extract_metadata(html, soup=None): content = tag.get("content", "").strip() if property_name and content: metadata[property_name] = content - + # getting the article Values + metadata.update({ + tag['property'].strip():tag["content"].strip() + for tag in head.find_all("meta", attrs={"property": re.compile(r"^article:")}) + if tag.has_attr('property') and tag.has_attr('content') + }) return metadata From 137ac014fb986f7df4e3cd8d8598b6120e05a20c Mon Sep 17 00:00:00 2001 
From: Ahmed-Tawfik94 Date: Mon, 19 May 2025 13:48:02 +0800 Subject: [PATCH 25/37] #1105 :fix(metadata): optimize article metadata extraction using XPath for improved performance --- crawl4ai/utils.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index ebf15f24..64d4b210 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -1456,12 +1456,13 @@ def extract_metadata_using_lxml(html, doc=None): content = tag.get("content", "").strip() if property_name and content: metadata[property_name] = content - # getting the article Values - metadata.update({ - tag['property'].strip():tag["content"].strip() - for tag in head.find_all("meta", attrs={"property": re.compile(r"^article:")}) - if tag.has_attr('property') and tag.has_attr('content') - }) + # Article metadata - using starts-with() for performance + article_tags = head.xpath('.//meta[starts-with(@property, "article:")]') + for tag in article_tags: + property_name = tag.get("property", "").strip() + content = tag.get("content", "").strip() + if property_name and content: + metadata[property_name] = content return metadata From b4fc60a5552c4c89b6d6893ecd45910eda9219ae Mon Sep 17 00:00:00 2001 From: Ahmed-Tawfik94 Date: Mon, 19 May 2025 13:51:16 +0800 Subject: [PATCH 26/37] #1103 fix(url): enhance URL normalization to handle invalid schemes and trailing slashes --- crawl4ai/utils.py | 14 +++--- tests/test_normalize_url.py | 91 +++++++++++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+), 6 deletions(-) create mode 100644 tests/test_normalize_url.py diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 64d4b210..46207ca7 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -2012,14 +2012,16 @@ def normalize_url(href, base_url): parsed_base = urlparse(base_url) if not parsed_base.scheme or not parsed_base.netloc: raise ValueError(f"Invalid base URL format: {base_url}") - - # Ensure base_url ends with a trailing slash if it's a 
directory path - if not base_url.endswith('/'): - base_url = base_url + '/' + + if parsed_base.scheme.lower() not in ["http", "https"]: + # Handle special protocols + raise ValueError(f"Invalid base URL format: {base_url}") + cleaned_href = href.strip() # Use urljoin to handle all cases - normalized = urljoin(base_url, href.strip()) - return normalized + return urljoin(base_url, cleaned_href) + + def normalize_url_for_deep_crawl(href, base_url): diff --git a/tests/test_normalize_url.py b/tests/test_normalize_url.py new file mode 100644 index 00000000..b1f1cc7d --- /dev/null +++ b/tests/test_normalize_url.py @@ -0,0 +1,91 @@ +import unittest +from crawl4ai.utils import normalize_url + +class TestNormalizeUrl(unittest.TestCase): + + def test_basic_relative_path(self): + self.assertEqual(normalize_url("path/to/page.html", "http://example.com/base/"), "http://example.com/base/path/to/page.html") + + def test_base_url_with_trailing_slash(self): + self.assertEqual(normalize_url("page.html", "http://example.com/base/"), "http://example.com/base/page.html") + + def test_base_url_without_trailing_slash(self): + # If normalize_url correctly uses urljoin, "base" is treated as a file. + self.assertEqual(normalize_url("page.html", "http://example.com/base"), "http://example.com/page.html") + + def test_absolute_url_as_href(self): + self.assertEqual(normalize_url("http://another.com/page.html", "http://example.com/"), "http://another.com/page.html") + + def test_href_with_leading_trailing_spaces(self): + self.assertEqual(normalize_url(" page.html ", "http://example.com/"), "http://example.com/page.html") + + def test_empty_href(self): + # urljoin with an empty href and base ending in '/' returns the base. + self.assertEqual(normalize_url("", "http://example.com/base/"), "http://example.com/base/") + # urljoin with an empty href and base not ending in '/' also returns base. 
+ self.assertEqual(normalize_url("", "http://example.com/base"), "http://example.com/base") + + def test_href_with_query_parameters(self): + self.assertEqual(normalize_url("page.html?query=test", "http://example.com/"), "http://example.com/page.html?query=test") + + def test_href_with_fragment(self): + self.assertEqual(normalize_url("page.html#section", "http://example.com/"), "http://example.com/page.html#section") + + def test_different_scheme_in_href(self): + self.assertEqual(normalize_url("https://secure.example.com/page.html", "http://example.com/"), "https://secure.example.com/page.html") + + def test_parent_directory_in_href(self): + self.assertEqual(normalize_url("../otherpage.html", "http://example.com/base/current/"), "http://example.com/base/otherpage.html") + + def test_root_relative_href(self): + self.assertEqual(normalize_url("/otherpage.html", "http://example.com/base/current/"), "http://example.com/otherpage.html") + + def test_base_url_with_path_and_no_trailing_slash(self): + # If normalize_url correctly uses urljoin, "path" is treated as a file. + self.assertEqual(normalize_url("file.html", "http://example.com/path"), "http://example.com/file.html") + + def test_base_url_is_just_domain(self): + self.assertEqual(normalize_url("page.html", "http://example.com"), "http://example.com/page.html") + + def test_href_is_only_query(self): + self.assertEqual(normalize_url("?query=true", "http://example.com/page.html"), "http://example.com/page.html?query=true") + + def test_href_is_only_fragment(self): + self.assertEqual(normalize_url("#fragment", "http://example.com/page.html"), "http://example.com/page.html#fragment") + + def test_relative_link_from_base_file_url(self): + """ + Tests the specific bug report: relative links from a base URL that is a file. 
+ Example: + Page URL: http://example.com/path/to/document.html + Link on page: + Expected: http://example.com/path/to/file.xlsx + """ + base_url_file = "http://example.com/zwgk/fdzdgk/zdxx/spaq/t19360680.shtml" + href_relative_current_dir = "./P020241203375994691134.xlsx" + expected_url1 = "http://example.com/zwgk/fdzdgk/zdxx/spaq/P020241203375994691134.xlsx" + self.assertEqual(normalize_url(href_relative_current_dir, base_url_file), expected_url1) + + # Test with a relative link that doesn't start with "./" + href_relative_no_dot_slash = "another.doc" + expected_url2 = "http://example.com/zwgk/fdzdgk/zdxx/spaq/another.doc" + self.assertEqual(normalize_url(href_relative_no_dot_slash, base_url_file), expected_url2) + + def test_invalid_base_url_scheme(self): + with self.assertRaises(ValueError) as context: + normalize_url("page.html", "ftp://example.com/") + self.assertIn("Invalid base URL format", str(context.exception)) + + def test_invalid_base_url_netloc(self): + with self.assertRaises(ValueError) as context: + normalize_url("page.html", "http:///path/") + self.assertIn("Invalid base URL format", str(context.exception)) + + def test_base_url_with_port(self): + self.assertEqual(normalize_url("path/file.html", "http://example.com:8080/base/"), "http://example.com:8080/base/path/file.html") + + def test_href_with_special_characters(self): + self.assertEqual(normalize_url("path%20with%20spaces/file.html", "http://example.com/"), "http://example.com/path%20with%20spaces/file.html") + +if __name__ == '__main__': + unittest.main() \ No newline at end of file From a97654270b9b5ba89ed7d3a1bb616bf2f8417203 Mon Sep 17 00:00:00 2001 From: Ahmed-Tawfik94 Date: Mon, 19 May 2025 14:11:46 +0800 Subject: [PATCH 27/37] #1086 fix(markdown): update BM25 filter to use language parameter for stemming --- docs/md_v2/core/markdown-generation.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/md_v2/core/markdown-generation.md 
b/docs/md_v2/core/markdown-generation.md index e6f5e12a..e897b2bb 100644 --- a/docs/md_v2/core/markdown-generation.md +++ b/docs/md_v2/core/markdown-generation.md @@ -187,7 +187,7 @@ from crawl4ai import CrawlerRunConfig bm25_filter = BM25ContentFilter( user_query="machine learning", bm25_threshold=1.2, - use_stemming=True + language="english" ) md_generator = DefaultMarkdownGenerator( @@ -200,7 +200,7 @@ config = CrawlerRunConfig(markdown_generator=md_generator) - **`user_query`**: The term you want to focus on. BM25 tries to keep only content blocks relevant to that query. - **`bm25_threshold`**: Raise it to keep fewer blocks; lower it to keep more. -- **`use_stemming`**: If `True`, variations of words match (e.g., “learn,” “learning,” “learnt”). +- **`language (str)`**: Language for stemming (default: 'english'). **No query provided?** BM25 tries to glean a context from page metadata, or you can simply treat it as a scorched-earth approach that discards text with low generic score. Realistically, you want to supply a query for best results. From a55c2b3f88371570a5683be59e40f8ea609b0a19 Mon Sep 17 00:00:00 2001 From: Ahmed-Tawfik94 Date: Mon, 19 May 2025 16:32:22 +0800 Subject: [PATCH 28/37] refactor(logging): update extraction logging to use url_status method --- crawl4ai/async_webcrawler.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 9e42b824..cb221b72 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -615,11 +615,12 @@ class AsyncWebCrawler: ) # Log extraction completion - self.logger.info( - message="Completed for {url:.50}... 
| Time: {timing}s", - tag="EXTRACT", - params={"url": _url, "timing": time.perf_counter() - t1}, - ) + self.logger.url_status( + url=_url, + success=bool(html), + timing=time.perf_counter() - t1, + tag="EXTRACT", + ) # Apply HTML formatting if requested if config.prettiify: From cb8d581e477daf1a310f504847cbbcafb7e8e07e Mon Sep 17 00:00:00 2001 From: ntohidi Date: Mon, 19 May 2025 18:03:05 +0200 Subject: [PATCH 29/37] fix(docs): update CrawlerRunConfig to use CacheMode for bypassing cache. REF: #1125 --- deploy/docker/c4ai-doc-context.md | 20 ++++++++++---------- docs/md_v2/core/local-files.md | 16 ++++++++-------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/deploy/docker/c4ai-doc-context.md b/deploy/docker/c4ai-doc-context.md index 1642f85e..5b5a81bb 100644 --- a/deploy/docker/c4ai-doc-context.md +++ b/deploy/docker/c4ai-doc-context.md @@ -3760,11 +3760,11 @@ To crawl a live web page, provide the URL starting with `http://` or `https://`, ```python import asyncio -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, CacheMode from crawl4ai.async_configs import CrawlerRunConfig async def crawl_web(): - config = CrawlerRunConfig(bypass_cache=True) + config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) async with AsyncWebCrawler() as crawler: result = await crawler.arun( url="https://en.wikipedia.org/wiki/apple", @@ -3785,13 +3785,13 @@ To crawl a local HTML file, prefix the file path with `file://`. 
```python import asyncio -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, CacheMode from crawl4ai.async_configs import CrawlerRunConfig async def crawl_local_file(): local_file_path = "/path/to/apple.html" # Replace with your file path file_url = f"file://{local_file_path}" - config = CrawlerRunConfig(bypass_cache=True) + config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) async with AsyncWebCrawler() as crawler: result = await crawler.arun(url=file_url, config=config) @@ -3810,13 +3810,13 @@ To crawl raw HTML content, prefix the HTML string with `raw:`. ```python import asyncio -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, CacheMode from crawl4ai.async_configs import CrawlerRunConfig async def crawl_raw_html(): raw_html = "
<html><body><h1>Hello, World!</h1></body></html>
" raw_html_url = f"raw:{raw_html}" - config = CrawlerRunConfig(bypass_cache=True) + config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) async with AsyncWebCrawler() as crawler: result = await crawler.arun(url=raw_html_url, config=config) @@ -3845,7 +3845,7 @@ import os import sys import asyncio from pathlib import Path -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, CacheMode from crawl4ai.async_configs import CrawlerRunConfig async def main(): @@ -3856,7 +3856,7 @@ async def main(): async with AsyncWebCrawler() as crawler: # Step 1: Crawl the Web URL print("\n=== Step 1: Crawling the Wikipedia URL ===") - web_config = CrawlerRunConfig(bypass_cache=True) + web_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) result = await crawler.arun(url=wikipedia_url, config=web_config) if not result.success: @@ -3871,7 +3871,7 @@ async def main(): # Step 2: Crawl from the Local HTML File print("=== Step 2: Crawling from the Local HTML File ===") file_url = f"file://{html_file_path.resolve()}" - file_config = CrawlerRunConfig(bypass_cache=True) + file_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) local_result = await crawler.arun(url=file_url, config=file_config) if not local_result.success: @@ -3887,7 +3887,7 @@ async def main(): with open(html_file_path, 'r', encoding='utf-8') as f: raw_html_content = f.read() raw_html_url = f"raw:{raw_html_content}" - raw_config = CrawlerRunConfig(bypass_cache=True) + raw_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) raw_result = await crawler.arun(url=raw_html_url, config=raw_config) if not raw_result.success: diff --git a/docs/md_v2/core/local-files.md b/docs/md_v2/core/local-files.md index ddf27f8c..31fe7792 100644 --- a/docs/md_v2/core/local-files.md +++ b/docs/md_v2/core/local-files.md @@ -8,11 +8,11 @@ To crawl a live web page, provide the URL starting with `http://` or `https://`, ```python import asyncio -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, 
CacheMode from crawl4ai.async_configs import CrawlerRunConfig async def crawl_web(): - config = CrawlerRunConfig(bypass_cache=True) + config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) async with AsyncWebCrawler() as crawler: result = await crawler.arun( url="https://en.wikipedia.org/wiki/apple", @@ -33,13 +33,13 @@ To crawl a local HTML file, prefix the file path with `file://`. ```python import asyncio -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, CacheMode from crawl4ai.async_configs import CrawlerRunConfig async def crawl_local_file(): local_file_path = "/path/to/apple.html" # Replace with your file path file_url = f"file://{local_file_path}" - config = CrawlerRunConfig(bypass_cache=True) + config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) async with AsyncWebCrawler() as crawler: result = await crawler.arun(url=file_url, config=config) @@ -93,7 +93,7 @@ import os import sys import asyncio from pathlib import Path -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, CacheMode from crawl4ai.async_configs import CrawlerRunConfig async def main(): @@ -104,7 +104,7 @@ async def main(): async with AsyncWebCrawler() as crawler: # Step 1: Crawl the Web URL print("\n=== Step 1: Crawling the Wikipedia URL ===") - web_config = CrawlerRunConfig(bypass_cache=True) + web_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) result = await crawler.arun(url=wikipedia_url, config=web_config) if not result.success: @@ -119,7 +119,7 @@ async def main(): # Step 2: Crawl from the Local HTML File print("=== Step 2: Crawling from the Local HTML File ===") file_url = f"file://{html_file_path.resolve()}" - file_config = CrawlerRunConfig(bypass_cache=True) + file_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) local_result = await crawler.arun(url=file_url, config=file_config) if not local_result.success: @@ -135,7 +135,7 @@ async def main(): with open(html_file_path, 'r', encoding='utf-8') as f: raw_html_content = 
f.read() raw_html_url = f"raw:{raw_html_content}" - raw_config = CrawlerRunConfig(bypass_cache=True) + raw_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) raw_result = await crawler.arun(url=raw_html_url, config=raw_config) if not raw_result.success: From 984524ca1c4cb394a8a18d353ac1b45cdc1cca7d Mon Sep 17 00:00:00 2001 From: Ahmed-Tawfik94 Date: Wed, 21 May 2025 13:26:11 +0800 Subject: [PATCH 30/37] fix(auth): add token authorization header in request preparation to ensure authenticated requests are made --- crawl4ai/docker_client.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/crawl4ai/docker_client.py b/crawl4ai/docker_client.py index f4816eb5..4e33431f 100644 --- a/crawl4ai/docker_client.py +++ b/crawl4ai/docker_client.py @@ -73,6 +73,8 @@ class Crawl4aiDockerClient: def _prepare_request(self, urls: List[str], browser_config: Optional[BrowserConfig] = None, crawler_config: Optional[CrawlerRunConfig] = None) -> Dict[str, Any]: """Prepare request data from configs.""" + if self._token: + self._http_client.headers["Authorization"] = f"Bearer {self._token}" return { "urls": urls, "browser_config": browser_config.dump() if browser_config else {}, @@ -103,8 +105,6 @@ class Crawl4aiDockerClient: crawler_config: Optional[CrawlerRunConfig] = None ) -> Union[CrawlResult, List[CrawlResult], AsyncGenerator[CrawlResult, None]]: """Execute a crawl operation.""" - if not self._token: - raise Crawl4aiClientError("Authentication required. Call authenticate() first.") await self._check_server() data = self._prepare_request(urls, browser_config, crawler_config) @@ -140,8 +140,6 @@ class Crawl4aiDockerClient: async def get_schema(self) -> Dict[str, Any]: """Retrieve configuration schemas.""" - if not self._token: - raise Crawl4aiClientError("Authentication required. 
Call authenticate() first.") response = await self._request("GET", "/schema") return response.json() @@ -167,4 +165,4 @@ async def main(): print(schema) if __name__ == "__main__": - asyncio.run(main()) \ No newline at end of file + asyncio.run(main()) From 33a0c7a17adfcc5c023aba4183d0de63a4f2dffd Mon Sep 17 00:00:00 2001 From: ntohidi Date: Thu, 22 May 2025 11:17:28 +0200 Subject: [PATCH 31/37] fix(logger): add RED color to LogColor enum for enhanced logging options --- crawl4ai/async_logger.py | 1 + 1 file changed, 1 insertion(+) diff --git a/crawl4ai/async_logger.py b/crawl4ai/async_logger.py index 49c7ee6f..9fb1e8e7 100644 --- a/crawl4ai/async_logger.py +++ b/crawl4ai/async_logger.py @@ -39,6 +39,7 @@ class LogColor(str, Enum): YELLOW = "yellow" MAGENTA = "magenta" DIM_MAGENTA = "dim magenta" + RED = "red" def __str__(self): """Automatically convert rich color to string.""" From da8f0dbb931e7701bde807186bd9e9ae32cde114 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Thu, 22 May 2025 11:25:51 +0200 Subject: [PATCH 32/37] fix(browser_profiler): change logger print to info for consistent logging in interactive manager --- crawl4ai/browser_profiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/browser_profiler.py b/crawl4ai/browser_profiler.py index 961ba740..41efd4b0 100644 --- a/crawl4ai/browser_profiler.py +++ b/crawl4ai/browser_profiler.py @@ -458,7 +458,7 @@ class BrowserProfiler: self.logger.info("4. 
Exit", tag="MENU", base_color=LogColor.MAGENTA) exit_option = "4" - self.logger.print(f"\n[cyan]Enter your choice (1-{exit_option}): [/cyan]", end="") + self.logger.info(f"\n[cyan]Enter your choice (1-{exit_option}): [/cyan]", end="") choice = input() if choice == "1": From 3d46d89759da93702f2dbd2c7f931389298afbb1 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Thu, 22 May 2025 17:21:42 +0530 Subject: [PATCH 33/37] docs: fix https://github.com/unclecode/crawl4ai/issues/1109 --- docs/md_v2/advanced/proxy-security.md | 80 ++++++++++++++++++--------- 1 file changed, 53 insertions(+), 27 deletions(-) diff --git a/docs/md_v2/advanced/proxy-security.md b/docs/md_v2/advanced/proxy-security.md index 0e56572c..13191cd7 100644 --- a/docs/md_v2/advanced/proxy-security.md +++ b/docs/md_v2/advanced/proxy-security.md @@ -25,44 +25,70 @@ Use an authenticated proxy with `BrowserConfig`: ```python from crawl4ai.async_configs import BrowserConfig -proxy_config = { - "server": "http://proxy.example.com:8080", - "username": "user", - "password": "pass" -} - -browser_config = BrowserConfig(proxy_config=proxy_config) +browser_config = BrowserConfig(proxy="http://[username]:[password]@[host]:[port]") async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun(url="https://example.com") ``` -Here's the corrected documentation: ## Rotating Proxies Example using a proxy rotation service dynamically: ```python -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig - -async def get_next_proxy(): - # Your proxy rotation logic here - return {"server": "http://next.proxy.com:8080"} - +import re +from crawl4ai import ( + AsyncWebCrawler, + BrowserConfig, + CrawlerRunConfig, + CacheMode, + RoundRobinProxyStrategy, +) +import asyncio +from crawl4ai import ProxyConfig async def main(): - browser_config = BrowserConfig() - run_config = CrawlerRunConfig() - - async with AsyncWebCrawler(config=browser_config) as crawler: - # For each URL, create a new 
run config with different proxy - for url in urls: - proxy = await get_next_proxy() - # Clone the config and update proxy - this creates a new browser context - current_config = run_config.clone(proxy_config=proxy) - result = await crawler.arun(url=url, config=current_config) + # Load proxies and create rotation strategy + proxies = ProxyConfig.from_env() + #eg: export PROXIES="ip1:port1:username1:password1,ip2:port2:username2:password2" + if not proxies: + print("No proxies found in environment. Set PROXIES env variable!") + return + + proxy_strategy = RoundRobinProxyStrategy(proxies) + + # Create configs + browser_config = BrowserConfig(headless=True, verbose=False) + run_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + proxy_rotation_strategy=proxy_strategy + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + urls = ["https://httpbin.org/ip"] * (len(proxies) * 2) # Test each proxy twice + + print("\n📈 Initializing crawler with proxy rotation...") + async with AsyncWebCrawler(config=browser_config) as crawler: + print("\n🚀 Starting batch crawl with proxy rotation...") + results = await crawler.arun_many( + urls=urls, + config=run_config + ) + for result in results: + if result.success: + ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html) + current_proxy = run_config.proxy_config if run_config.proxy_config else None + + if current_proxy and ip_match: + print(f"URL {result.url}") + print(f"Proxy {current_proxy.server} -> Response IP: {ip_match.group(0)}") + verified = ip_match.group(0) == current_proxy.ip + if verified: + print(f"✅ Proxy working! 
IP matches: {current_proxy.ip}") + else: + print("❌ Proxy failed or IP mismatch!") + print("---") + +asyncio.run(main()) -if __name__ == "__main__": - import asyncio - asyncio.run(main()) ``` From b55e27d2ef2bedecae53359fc71f4d0a6771e455 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Mon, 26 May 2025 11:08:23 +0200 Subject: [PATCH 34/37] fix: chanegd error variable name handle_crawl_request, docker api --- deploy/docker/api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deploy/docker/api.py b/deploy/docker/api.py index 732371f7..b728acd1 100644 --- a/deploy/docker/api.py +++ b/deploy/docker/api.py @@ -459,7 +459,7 @@ async def handle_crawl_request( # await crawler.close() # except Exception as close_e: # logger.error(f"Error closing crawler during exception handling: {close_e}") - logger.error(f"Error closing crawler during exception handling: {close_e}") + logger.error(f"Error closing crawler during exception handling: {str(e)}") # Measure memory even on error if possible end_mem_mb_error = _get_memory_mb() @@ -518,7 +518,7 @@ async def handle_stream_crawl_request( # await crawler.close() # except Exception as close_e: # logger.error(f"Error closing crawler during stream setup exception: {close_e}") - logger.error(f"Error closing crawler during stream setup exception: {close_e}") + logger.error(f"Error closing crawler during stream setup exception: {str(e)}") logger.error(f"Stream crawl error: {str(e)}", exc_info=True) # Raising HTTPException here will prevent streaming response raise HTTPException( From 871d4f1158c9b45e3bd869c4f192ec4420fcd932 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Fri, 13 Jun 2025 11:26:05 +0200 Subject: [PATCH 35/37] fix(extraction_strategy): rename response variable to content for clarity in LLMExtractionStrategy. 
ref #1146 --- crawl4ai/extraction_strategy.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index 6be084b3..25ebbd5f 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -656,11 +656,11 @@ class LLMExtractionStrategy(ExtractionStrategy): self.total_usage.total_tokens += usage.total_tokens try: - response = response.choices[0].message.content + content = response.choices[0].message.content blocks = None if self.force_json_response: - blocks = json.loads(response) + blocks = json.loads(content) if isinstance(blocks, dict): # If it has only one key which calue is list then assign that to blocks, exampled: {"news": [..]} if len(blocks) == 1 and isinstance(list(blocks.values())[0], list): @@ -673,14 +673,14 @@ class LLMExtractionStrategy(ExtractionStrategy): blocks = blocks else: # blocks = extract_xml_data(["blocks"], response.choices[0].message.content)["blocks"] - blocks = extract_xml_data(["blocks"], response)["blocks"] + blocks = extract_xml_data(["blocks"], content)["blocks"] blocks = json.loads(blocks) for block in blocks: block["error"] = False except Exception: parsed, unparsed = split_and_parse_json_objects( - response + response.choices[0].message.content ) blocks = parsed if unparsed: From b7a6e02236f9da30c1bb21b8a5bb3dab86d97233 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Wed, 18 Jun 2025 19:04:32 +0200 Subject: [PATCH 36/37] fix: Update pdf and screenshot usage documentation. 
ref #1230 --- deploy/docker/c4ai-doc-context.md | 29 ++++++++++++++++-------- docs/md_v2/advanced/advanced-features.md | 29 ++++++++++++++++-------- 2 files changed, 38 insertions(+), 20 deletions(-) diff --git a/deploy/docker/c4ai-doc-context.md b/deploy/docker/c4ai-doc-context.md index 6591c265..f8b83088 100644 --- a/deploy/docker/c4ai-doc-context.md +++ b/deploy/docker/c4ai-doc-context.md @@ -5433,29 +5433,38 @@ Sometimes you need a visual record of a page or a PDF “printout.” Crawl4AI c ```python import os, asyncio from base64 import b64decode -from crawl4ai import AsyncWebCrawler, CacheMode +from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig async def main(): + run_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + screenshot=True, + pdf=True + ) + async with AsyncWebCrawler() as crawler: result = await crawler.arun( url="https://en.wikipedia.org/wiki/List_of_common_misconceptions", - cache_mode=CacheMode.BYPASS, - pdf=True, - screenshot=True + config=run_config ) - if result.success: - # Save screenshot + print(f"Screenshot data present: {result.screenshot is not None}") + print(f"PDF data present: {result.pdf is not None}") + if result.screenshot: + print(f"[OK] Screenshot captured, size: {len(result.screenshot)} bytes") with open("wikipedia_screenshot.png", "wb") as f: f.write(b64decode(result.screenshot)) - - # Save PDF + else: + print("[WARN] Screenshot data is None.") + if result.pdf: + print(f"[OK] PDF captured, size: {len(result.pdf)} bytes") with open("wikipedia_page.pdf", "wb") as f: f.write(result.pdf) - - print("[OK] PDF & screenshot captured.") + else: + print("[WARN] PDF data is None.") + else: print("[ERROR]", result.error_message) diff --git a/docs/md_v2/advanced/advanced-features.md b/docs/md_v2/advanced/advanced-features.md index b56f216e..3563fd40 100644 --- a/docs/md_v2/advanced/advanced-features.md +++ b/docs/md_v2/advanced/advanced-features.md @@ -66,29 +66,38 @@ Sometimes you need a visual record of a page or a 
PDF “printout.” Crawl4AI c ```python import os, asyncio from base64 import b64decode -from crawl4ai import AsyncWebCrawler, CacheMode +from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig async def main(): + run_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + screenshot=True, + pdf=True + ) + async with AsyncWebCrawler() as crawler: result = await crawler.arun( url="https://en.wikipedia.org/wiki/List_of_common_misconceptions", - cache_mode=CacheMode.BYPASS, - pdf=True, - screenshot=True + config=run_config ) - if result.success: - # Save screenshot + print(f"Screenshot data present: {result.screenshot is not None}") + print(f"PDF data present: {result.pdf is not None}") + if result.screenshot: + print(f"[OK] Screenshot captured, size: {len(result.screenshot)} bytes") with open("wikipedia_screenshot.png", "wb") as f: f.write(b64decode(result.screenshot)) - - # Save PDF + else: + print("[WARN] Screenshot data is None.") + if result.pdf: + print(f"[OK] PDF captured, size: {len(result.pdf)} bytes") with open("wikipedia_page.pdf", "wb") as f: f.write(result.pdf) - - print("[OK] PDF & screenshot captured.") + else: + print("[WARN] PDF data is None.") + else: print("[ERROR]", result.error_message) From 414f16e975cc2ca29abe3531d5ab91a4b17a4163 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Wed, 18 Jun 2025 19:05:44 +0200 Subject: [PATCH 37/37] fix: Update pdf and screenshot usage documentation. 
ref #1230 --- .../crawl4ai_all_reasoning_content.llm.txt | 29 ++++++++++++------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/docs/md_v2/assets/llmtxt/crawl4ai_all_reasoning_content.llm.txt b/docs/md_v2/assets/llmtxt/crawl4ai_all_reasoning_content.llm.txt index 850c1237..c3350fb5 100644 --- a/docs/md_v2/assets/llmtxt/crawl4ai_all_reasoning_content.llm.txt +++ b/docs/md_v2/assets/llmtxt/crawl4ai_all_reasoning_content.llm.txt @@ -5359,29 +5359,38 @@ Sometimes you need a visual record of a page or a PDF “printout.” Crawl4AI c ```python import os, asyncio from base64 import b64decode -from crawl4ai import AsyncWebCrawler, CacheMode +from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig async def main(): + run_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + screenshot=True, + pdf=True + ) + async with AsyncWebCrawler() as crawler: result = await crawler.arun( url="https://en.wikipedia.org/wiki/List_of_common_misconceptions", - cache_mode=CacheMode.BYPASS, - pdf=True, - screenshot=True + config=run_config ) - if result.success: - # Save screenshot + print(f"Screenshot data present: {result.screenshot is not None}") + print(f"PDF data present: {result.pdf is not None}") + if result.screenshot: + print(f"[OK] Screenshot captured, size: {len(result.screenshot)} bytes") with open("wikipedia_screenshot.png", "wb") as f: f.write(b64decode(result.screenshot)) - - # Save PDF + else: + print("[WARN] Screenshot data is None.") + if result.pdf: + print(f"[OK] PDF captured, size: {len(result.pdf)} bytes") with open("wikipedia_page.pdf", "wb") as f: f.write(result.pdf) - - print("[OK] PDF & screenshot captured.") + else: + print("[WARN] PDF data is None.") + else: print("[ERROR]", result.error_message)