From 8b215e17afcd15015ee5a9b47d5e60e7c0c62c35 Mon Sep 17 00:00:00 2001
From: UncleCode <unclecode@kidocode.com>
Date: Sun, 8 Jun 2025 06:57:37 +0200
Subject: [PATCH 1/3] Add use_stemming option to BM25ContentFilter (#1192)

---
 crawl4ai/cli.py                        |  3 ++-
 crawl4ai/content_filter_strategy.py    | 25 +++++++++++++++++--------
 docs/md_v2/core/markdown-generation.md |  2 +-
 3 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/crawl4ai/cli.py b/crawl4ai/cli.py
index 51477d6b..33b313bc 100644
--- a/crawl4ai/cli.py
+++ b/crawl4ai/cli.py
@@ -1073,7 +1073,8 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config:
                 crawler_cfg.markdown_generator = DefaultMarkdownGenerator(
                     content_filter = BM25ContentFilter(
                         user_query=filter_conf.get("query"),
-                        bm25_threshold=filter_conf.get("threshold", 1.0)
+                        bm25_threshold=filter_conf.get("threshold", 1.0),
+                        use_stemming=filter_conf.get("use_stemming", True),
                     )
                 )
             elif filter_conf["type"] == "pruning":
diff --git a/crawl4ai/content_filter_strategy.py b/crawl4ai/content_filter_strategy.py
index 4102cbad..1e764f74 100644
--- a/crawl4ai/content_filter_strategy.py
+++ b/crawl4ai/content_filter_strategy.py
@@ -405,6 +405,7 @@ class BM25ContentFilter(RelevantContentFilter):
         user_query: str = None,
         bm25_threshold: float = 1.0,
         language: str = "english",
+        use_stemming: bool = True,
     ):
         """
         Initializes the BM25ContentFilter class, if not provided, falls back to page metadata.
@@ -416,9 +417,11 @@ class BM25ContentFilter(RelevantContentFilter):
             user_query (str): User query for filtering (optional).
             bm25_threshold (float): BM25 threshold for filtering (default: 1.0).
             language (str): Language for stemming (default: 'english').
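+            use_stemming (bool): Stem query and corpus tokens before BM25 scoring (default: True). If False, no stemmer is created and tokens are compared as lowercased, whitespace-split words.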
         """
         super().__init__(user_query=user_query)
         self.bm25_threshold = bm25_threshold
+        self.use_stemming = use_stemming
         self.priority_tags = {
             "h1": 5.0,
             "h2": 4.0,
@@ -432,7 +435,7 @@ class BM25ContentFilter(RelevantContentFilter):
             "pre": 1.5,
             "th": 1.5,  # Table headers
         }
-        self.stemmer = stemmer(language)
+        self.stemmer = stemmer(language) if use_stemming else None
 
     def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
         """
@@ -479,13 +482,19 @@ class BM25ContentFilter(RelevantContentFilter):
         #                 for _, chunk, _, _ in candidates]
         # tokenized_query = [ps.stem(word) for word in query.lower().split()]
 
-        tokenized_corpus = [
-            [self.stemmer.stemWord(word) for word in chunk.lower().split()]
-            for _, chunk, _, _ in candidates
-        ]
-        tokenized_query = [
-            self.stemmer.stemWord(word) for word in query.lower().split()
-        ]
+        if self.use_stemming:
+            tokenized_corpus = [
+                [self.stemmer.stemWord(word) for word in chunk.lower().split()]
+                for _, chunk, _, _ in candidates
+            ]
+            tokenized_query = [
+                self.stemmer.stemWord(word) for word in query.lower().split()
+            ]
+        else:
+            tokenized_corpus = [
+                chunk.lower().split() for _, chunk, _, _ in candidates
+            ]
+            tokenized_query = query.lower().split()
 
         # tokenized_corpus = [[self.stemmer.stemWord(word) for word in tokenize_text(chunk.lower())]
         #            for _, chunk, _, _ in candidates]
diff --git a/docs/md_v2/core/markdown-generation.md b/docs/md_v2/core/markdown-generation.md
index d4cad79b..eccb115a 100644
--- a/docs/md_v2/core/markdown-generation.md
+++ b/docs/md_v2/core/markdown-generation.md
@@ -200,7 +200,7 @@ config = CrawlerRunConfig(markdown_generator=md_generator)
 
 - **`user_query`**: The term you want to focus on. BM25 tries to keep only content blocks relevant to that query.  
 - **`bm25_threshold`**: Raise it to keep fewer blocks; lower it to keep more.  
-- **`use_stemming`**: If `True`, variations of words match (e.g., “learn,” “learning,” “learnt”).
+- **`use_stemming`** *(default `True`)*: If enabled, variations of words match (e.g., “learn,” “learning,” “learnt”).
 
 **No query provided?** BM25 tries to glean a context from page metadata, or you can simply treat it as a scorched-earth approach that discards text with low generic score. Realistically, you want to supply a query for best results.
 

From 451b0d6c9a85b3013d1fd8fa857cfd4d9a5cba94 Mon Sep 17 00:00:00 2001
From: UncleCode <unclecode@kidocode.com>
Date: Sun, 8 Jun 2025 07:53:09 +0200
Subject: [PATCH 2/3] Set memory_wait_timeout default to 10 minutes (#1193)

---
 crawl4ai/async_dispatcher.py                  | 53 +++++++++++++++----
 deploy/docker/c4ai-doc-context.md             |  2 +-
 docs/md_v2/advanced/multi-url-crawling.md     |  2 +-
 .../crawl4ai_all_reasoning_content.llm.txt    |  2 +-
 4 files changed, 46 insertions(+), 13 deletions(-)

diff --git a/crawl4ai/async_dispatcher.py b/crawl4ai/async_dispatcher.py
index b97d59a7..1558efc0 100644
--- a/crawl4ai/async_dispatcher.py
+++ b/crawl4ai/async_dispatcher.py
@@ -126,6 +126,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
         check_interval: float = 1.0,
         max_session_permit: int = 20,
         fairness_timeout: float = 600.0,  # 10 minutes before prioritizing long-waiting URLs
+        memory_wait_timeout: Optional[float] = 600.0,
         rate_limiter: Optional[RateLimiter] = None,
         monitor: Optional[CrawlerMonitor] = None,
     ):
@@ -136,27 +137,46 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
         self.check_interval = check_interval
         self.max_session_permit = max_session_permit
         self.fairness_timeout = fairness_timeout
+        self.memory_wait_timeout = memory_wait_timeout
         self.result_queue = asyncio.Queue()
         self.task_queue = asyncio.PriorityQueue()  # Priority queue for better management
         self.memory_pressure_mode = False  # Flag to indicate when we're in memory pressure mode
         self.current_memory_percent = 0.0  # Track current memory usage
+        self._high_memory_start_time: Optional[float] = None
         
     async def _memory_monitor_task(self):
         """Background task to continuously monitor memory usage and update state"""
         while True:
             self.current_memory_percent = psutil.virtual_memory().percent
-            
+
             # Enter memory pressure mode if we cross the threshold
-            if not self.memory_pressure_mode and self.current_memory_percent >= self.memory_threshold_percent:
-                self.memory_pressure_mode = True
-                if self.monitor:
-                    self.monitor.update_memory_status("PRESSURE")
-            
+            if self.current_memory_percent >= self.memory_threshold_percent:
+                if not self.memory_pressure_mode:
+                    self.memory_pressure_mode = True
+                    self._high_memory_start_time = time.time()
+                    if self.monitor:
+                        self.monitor.update_memory_status("PRESSURE")
+                else:
+                    if self._high_memory_start_time is None:
+                        self._high_memory_start_time = time.time()
+                    if (
+                        self.memory_wait_timeout is not None
+                        and self._high_memory_start_time is not None
+                        and time.time() - self._high_memory_start_time >= self.memory_wait_timeout
+                    ):
+                        raise MemoryError(
+                            "Memory usage exceeded threshold for"
+                            f" {self.memory_wait_timeout} seconds"
+                        )
+
             # Exit memory pressure mode if we go below recovery threshold
             elif self.memory_pressure_mode and self.current_memory_percent <= self.recovery_threshold_percent:
                 self.memory_pressure_mode = False
+                self._high_memory_start_time = None
                 if self.monitor:
                     self.monitor.update_memory_status("NORMAL")
+            elif self.current_memory_percent < self.memory_threshold_percent:
+                self._high_memory_start_time = None
             
             # In critical mode, we might need to take more drastic action
             if self.current_memory_percent >= self.critical_threshold_percent:
@@ -307,7 +327,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
             self.monitor.start()
             
         results = []
-        
+
         try:
             # Initialize task queue
             for url in urls:
@@ -316,11 +336,18 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
                     self.monitor.add_task(task_id, url)
                 # Add to queue with initial priority 0, retry count 0, and current time
                 await self.task_queue.put((0, (url, task_id, 0, time.time())))
-                
+
             active_tasks = []
-            
+
             # Process until both queues are empty
             while not self.task_queue.empty() or active_tasks:
+                if memory_monitor.done():
+                    exc = memory_monitor.exception()
+                    if exc:
+                        for t in active_tasks:
+                            t.cancel()
+                        raise exc
+
                 # If memory pressure is low, start new tasks
                 if not self.memory_pressure_mode and len(active_tasks) < self.max_session_permit:
                     try:
@@ -465,8 +492,14 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
             active_tasks = []
             completed_count = 0
             total_urls = len(urls)
-            
+
             while completed_count < total_urls:
+                if memory_monitor.done():
+                    exc = memory_monitor.exception()
+                    if exc:
+                        for t in active_tasks:
+                            t.cancel()
+                        raise exc
                 # If memory pressure is low, start new tasks
                 if not self.memory_pressure_mode and len(active_tasks) < self.max_session_permit:
                     try:
diff --git a/deploy/docker/c4ai-doc-context.md b/deploy/docker/c4ai-doc-context.md
index 1642f85e..77785cec 100644
--- a/deploy/docker/c4ai-doc-context.md
+++ b/deploy/docker/c4ai-doc-context.md
@@ -6705,7 +6705,7 @@ dispatcher = MemoryAdaptiveDispatcher(
 3. **`max_session_permit`** (`int`, default: `10`)  
   The maximum number of concurrent crawling tasks allowed. This ensures resource limits are respected while maintaining concurrency.
 
-4. **`memory_wait_timeout`** (`float`, default: `300.0`)  
+4. **`memory_wait_timeout`** (`Optional[float]`, default: `600.0`)  
   Optional timeout (in seconds). If memory usage exceeds `memory_threshold_percent` for longer than this duration, a `MemoryError` is raised.
 
 5. **`rate_limiter`** (`RateLimiter`, default: `None`)  
diff --git a/docs/md_v2/advanced/multi-url-crawling.md b/docs/md_v2/advanced/multi-url-crawling.md
index f6d944d6..40493c21 100644
--- a/docs/md_v2/advanced/multi-url-crawling.md
+++ b/docs/md_v2/advanced/multi-url-crawling.md
@@ -172,7 +172,7 @@ dispatcher = MemoryAdaptiveDispatcher(
 3. **`max_session_permit`** (`int`, default: `10`)  
   The maximum number of concurrent crawling tasks allowed. This ensures resource limits are respected while maintaining concurrency.
 
-4. **`memory_wait_timeout`** (`float`, default: `300.0`)  
+4. **`memory_wait_timeout`** (`Optional[float]`, default: `600.0`)  
   Optional timeout (in seconds). If memory usage exceeds `memory_threshold_percent` for longer than this duration, a `MemoryError` is raised.
 
 5. **`rate_limiter`** (`RateLimiter`, default: `None`)  
diff --git a/docs/md_v2/assets/llmtxt/crawl4ai_all_reasoning_content.llm.txt b/docs/md_v2/assets/llmtxt/crawl4ai_all_reasoning_content.llm.txt
index 850c1237..846b6914 100644
--- a/docs/md_v2/assets/llmtxt/crawl4ai_all_reasoning_content.llm.txt
+++ b/docs/md_v2/assets/llmtxt/crawl4ai_all_reasoning_content.llm.txt
@@ -6732,7 +6732,7 @@ dispatcher = MemoryAdaptiveDispatcher(
 3. **`max_session_permit`** (`int`, default: `10`)  
   The maximum number of concurrent crawling tasks allowed. This ensures resource limits are respected while maintaining concurrency.
 
-4. **`memory_wait_timeout`** (`float`, default: `300.0`)  
+4. **`memory_wait_timeout`** (`Optional[float]`, default: `600.0`)  
   Optional timeout (in seconds). If memory usage exceeds `memory_threshold_percent` for longer than this duration, a `MemoryError` is raised.
 
 5. **`rate_limiter`** (`RateLimiter`, default: `None`)  

From b870bfdb6cc9c7a84ae333a6210e1af901c63093 Mon Sep 17 00:00:00 2001
From: UncleCode <unclecode@kidocode.com>
Date: Sun, 8 Jun 2025 10:06:38 +0200
Subject: [PATCH 3/3] chore(deps): add httpx extras (#1195)

---
 deploy/docker/requirements.txt | 1 +
 pyproject.toml                 | 1 +
 requirements.txt               | 3 ++-
 3 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/deploy/docker/requirements.txt b/deploy/docker/requirements.txt
index dd489e28..d463c641 100644
--- a/deploy/docker/requirements.txt
+++ b/deploy/docker/requirements.txt
@@ -14,3 +14,4 @@ anyio==4.9.0
 PyJWT==2.10.1
 mcp>=1.6.0
 websockets>=15.0.1
+httpx[http2]>=0.27.2
diff --git a/pyproject.toml b/pyproject.toml
index 5abfb460..fc961d2b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,6 +36,7 @@ dependencies = [
     "rich>=13.9.4",
     "cssselect>=1.2.0",
     "httpx>=0.27.2",
+    "httpx[http2]>=0.27.2",
     "fake-useragent>=2.0.3",
     "click>=8.1.7",
     "pyperclip>=1.8.2",
diff --git a/requirements.txt b/requirements.txt
index 0bb596d1..3a93e6a6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -22,4 +22,5 @@ nltk>=3.9.1
 rich>=13.9.4
 cssselect>=1.2.0
 chardet>=5.2.0
-brotli>=1.1.0
\ No newline at end of file
+brotli>=1.1.0
+httpx[http2]>=0.27.2