chore(deps): add httpx extras

Set memory_wait_timeout default to 10 minutes (#1193 )
Add use_stemming option to BM25ContentFilter (#1192 )
2025-06-08 10:06:13 +02:00 · 2025-06-08 13:53:09 +08:00 · 2025-06-08 12:57:37 +08:00
10 changed files with 70 additions and 24 deletions
--- a/crawl4ai/async_dispatcher.py
+++ b/crawl4ai/async_dispatcher.py
@@ -126,6 +126,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
        check_interval: float = 1.0,
        max_session_permit: int = 20,
        fairness_timeout: float = 600.0,  # 10 minutes before prioritizing long-waiting URLs
        memory_wait_timeout: Optional[float] = 600.0,
        rate_limiter: Optional[RateLimiter] = None,
        monitor: Optional[CrawlerMonitor] = None,
    ):
@@ -136,27 +137,46 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
        self.check_interval = check_interval
        self.max_session_permit = max_session_permit
        self.fairness_timeout = fairness_timeout
        self.memory_wait_timeout = memory_wait_timeout
        self.result_queue = asyncio.Queue()
        self.task_queue = asyncio.PriorityQueue()  # Priority queue for better management
        self.memory_pressure_mode = False  # Flag to indicate when we're in memory pressure mode
        self.current_memory_percent = 0.0  # Track current memory usage
        self._high_memory_start_time: Optional[float] = None
    async def _memory_monitor_task(self):
        """Background task to continuously monitor memory usage and update state"""
        while True:
            self.current_memory_percent = psutil.virtual_memory().percent
-            
+
            # Enter memory pressure mode if we cross the threshold
-            if not self.memory_pressure_mode and self.current_memory_percent >= self.memory_threshold_percent:
+            if self.current_memory_percent >= self.memory_threshold_percent:
-                self.memory_pressure_mode = True
+                if not self.memory_pressure_mode:
-                if self.monitor:
+                    self.memory_pressure_mode = True
-                    self.monitor.update_memory_status("PRESSURE")
+                    self._high_memory_start_time = time.time()
-            
+                    if self.monitor:
                        self.monitor.update_memory_status("PRESSURE")
                else:
                    if self._high_memory_start_time is None:
                        self._high_memory_start_time = time.time()
                    if (
                        self.memory_wait_timeout is not None
                        and self._high_memory_start_time is not None
                        and time.time() - self._high_memory_start_time >= self.memory_wait_timeout
                    ):
                        raise MemoryError(
                            "Memory usage exceeded threshold for"
                            f" {self.memory_wait_timeout} seconds"
                        )
            # Exit memory pressure mode if we go below recovery threshold
            elif self.memory_pressure_mode and self.current_memory_percent <= self.recovery_threshold_percent:
                self.memory_pressure_mode = False
                self._high_memory_start_time = None
                if self.monitor:
                    self.monitor.update_memory_status("NORMAL")
            elif self.current_memory_percent < self.memory_threshold_percent:
                self._high_memory_start_time = None
            # In critical mode, we might need to take more drastic action
            if self.current_memory_percent >= self.critical_threshold_percent:
@@ -307,7 +327,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
            self.monitor.start()
        results = []
-        
+
        try:
            # Initialize task queue
            for url in urls:
@@ -316,11 +336,18 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
                    self.monitor.add_task(task_id, url)
                # Add to queue with initial priority 0, retry count 0, and current time
                await self.task_queue.put((0, (url, task_id, 0, time.time())))
-                
+
            active_tasks = []
-            
+
            # Process until both queues are empty
            while not self.task_queue.empty() or active_tasks:
                if memory_monitor.done():
                    exc = memory_monitor.exception()
                    if exc:
                        for t in active_tasks:
                            t.cancel()
                        raise exc
                # If memory pressure is low, start new tasks
                if not self.memory_pressure_mode and len(active_tasks) < self.max_session_permit:
                    try:
@@ -465,8 +492,14 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
            active_tasks = []
            completed_count = 0
            total_urls = len(urls)
-            
+
            while completed_count < total_urls:
                if memory_monitor.done():
                    exc = memory_monitor.exception()
                    if exc:
                        for t in active_tasks:
                            t.cancel()
                        raise exc
                # If memory pressure is low, start new tasks
                if not self.memory_pressure_mode and len(active_tasks) < self.max_session_permit:
                    try:
--- a/crawl4ai/cli.py
+++ b/crawl4ai/cli.py
@@ -1073,7 +1073,8 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config:
                crawler_cfg.markdown_generator = DefaultMarkdownGenerator(
                    content_filter = BM25ContentFilter(
                        user_query=filter_conf.get("query"),
-                        bm25_threshold=filter_conf.get("threshold", 1.0)
+                        bm25_threshold=filter_conf.get("threshold", 1.0),
                        use_stemming=filter_conf.get("use_stemming", True),
                    )
                )
            elif filter_conf["type"] == "pruning":
--- a/crawl4ai/content_filter_strategy.py
+++ b/crawl4ai/content_filter_strategy.py
@@ -405,6 +405,7 @@ class BM25ContentFilter(RelevantContentFilter):
        user_query: str = None,
        bm25_threshold: float = 1.0,
        language: str = "english",
        use_stemming: bool = True,
    ):
        """
        Initializes the BM25ContentFilter class, if not provided, falls back to page metadata.
@@ -416,9 +417,11 @@ class BM25ContentFilter(RelevantContentFilter):
            user_query (str): User query for filtering (optional).
            bm25_threshold (float): BM25 threshold for filtering (default: 1.0).
            language (str): Language for stemming (default: 'english').
            use_stemming (bool): Whether to apply stemming (default: True).
        """
        super().__init__(user_query=user_query)
        self.bm25_threshold = bm25_threshold
        self.use_stemming = use_stemming
        self.priority_tags = {
            "h1": 5.0,
            "h2": 4.0,
@@ -432,7 +435,7 @@ class BM25ContentFilter(RelevantContentFilter):
            "pre": 1.5,
            "th": 1.5,  # Table headers
        }
-        self.stemmer = stemmer(language)
+        self.stemmer = stemmer(language) if use_stemming else None
    def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
        """
@@ -479,13 +482,19 @@ class BM25ContentFilter(RelevantContentFilter):
        #                 for _, chunk, _, _ in candidates]
        # tokenized_query = [ps.stem(word) for word in query.lower().split()]
-        tokenized_corpus = [
+        if self.use_stemming:
-            [self.stemmer.stemWord(word) for word in chunk.lower().split()]
+            tokenized_corpus = [
-            for _, chunk, _, _ in candidates
+                [self.stemmer.stemWord(word) for word in chunk.lower().split()]
-        ]
+                for _, chunk, _, _ in candidates
-        tokenized_query = [
+            ]
-            self.stemmer.stemWord(word) for word in query.lower().split()
+            tokenized_query = [
-        ]
+                self.stemmer.stemWord(word) for word in query.lower().split()
            ]
        else:
            tokenized_corpus = [
                chunk.lower().split() for _, chunk, _, _ in candidates
            ]
            tokenized_query = query.lower().split()
        # tokenized_corpus = [[self.stemmer.stemWord(word) for word in tokenize_text(chunk.lower())]
        #            for _, chunk, _, _ in candidates]
--- a/deploy/docker/c4ai-doc-context.md
+++ b/deploy/docker/c4ai-doc-context.md
@@ -6705,7 +6705,7 @@ dispatcher = MemoryAdaptiveDispatcher(
 3. **`max_session_permit`** (`int`, default: `10`)  
   The maximum number of concurrent crawling tasks allowed. This ensures resource limits are respected while maintaining concurrency.
-4. **`memory_wait_timeout`** (`float`, default: `300.0`)  
+4. **`memory_wait_timeout`** (`float`, default: `600.0`)
   Optional timeout (in seconds). If memory usage exceeds `memory_threshold_percent` for longer than this duration, a `MemoryError` is raised.
 5. **`rate_limiter`** (`RateLimiter`, default: `None`)  
--- a/deploy/docker/requirements.txt
+++ b/deploy/docker/requirements.txt
@@ -14,3 +14,4 @@ anyio==4.9.0
 PyJWT==2.10.1
 mcp>=1.6.0
 websockets>=15.0.1
 httpx[http2]>=0.27.2
--- a/docs/md_v2/advanced/multi-url-crawling.md
+++ b/docs/md_v2/advanced/multi-url-crawling.md
@@ -172,7 +172,7 @@ dispatcher = MemoryAdaptiveDispatcher(
 3. **`max_session_permit`** (`int`, default: `10`)  
   The maximum number of concurrent crawling tasks allowed. This ensures resource limits are respected while maintaining concurrency.
-4. **`memory_wait_timeout`** (`float`, default: `300.0`)  
+4. **`memory_wait_timeout`** (`float`, default: `600.0`)
   Optional timeout (in seconds). If memory usage exceeds `memory_threshold_percent` for longer than this duration, a `MemoryError` is raised.
 5. **`rate_limiter`** (`RateLimiter`, default: `None`)  
--- a/docs/md_v2/assets/llmtxt/crawl4ai_all_reasoning_content.llm.txt
+++ b/docs/md_v2/assets/llmtxt/crawl4ai_all_reasoning_content.llm.txt
@@ -6732,7 +6732,7 @@ dispatcher = MemoryAdaptiveDispatcher(
 3. **`max_session_permit`** (`int`, default: `10`)  
   The maximum number of concurrent crawling tasks allowed. This ensures resource limits are respected while maintaining concurrency.
-4. **`memory_wait_timeout`** (`float`, default: `300.0`)  
+4. **`memory_wait_timeout`** (`float`, default: `600.0`)
   Optional timeout (in seconds). If memory usage exceeds `memory_threshold_percent` for longer than this duration, a `MemoryError` is raised.
 5. **`rate_limiter`** (`RateLimiter`, default: `None`)  
--- a/docs/md_v2/core/markdown-generation.md
+++ b/docs/md_v2/core/markdown-generation.md
@@ -200,7 +200,7 @@ config = CrawlerRunConfig(markdown_generator=md_generator)
 - **`user_query`**: The term you want to focus on. BM25 tries to keep only content blocks relevant to that query.  
 - **`bm25_threshold`**: Raise it to keep fewer blocks; lower it to keep more.  
- **`use_stemming`**: If `True`, variations of words match (e.g., “learn,” “learning,” “learnt”).
+- **`use_stemming`** *(default `True`)*: If enabled, variations of words match (e.g., “learn,” “learning,” “learnt”).
 **No query provided?** BM25 tries to glean a context from page metadata, or you can simply treat it as a scorched-earth approach that discards text with low generic score. Realistically, you want to supply a query for best results.
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,6 +36,7 @@ dependencies = [
    "rich>=13.9.4",
    "cssselect>=1.2.0",
    "httpx>=0.27.2",
    "httpx[http2]>=0.27.2",
    "fake-useragent>=2.0.3",
    "click>=8.1.7",
    "pyperclip>=1.8.2",
--- a/requirements.txt
+++ b/requirements.txt
@@ -22,4 +22,5 @@ nltk>=3.9.1
 rich>=13.9.4
 cssselect>=1.2.0
 chardet>=5.2.0
-brotli>=1.1.0
+brotli>=1.1.0
 httpx[http2]>=0.27.2
Author	SHA1	Message	Date
UncleCode	f54db649c5	chore(deps): add httpx extras	2025-06-08 10:06:13 +02:00
UncleCode	451b0d6c9a	Set memory_wait_timeout default to 10 minutes (#1193 )	2025-06-08 13:53:09 +08:00
UncleCode	8b215e17af	Add use_stemming option to BM25ContentFilter (#1192 )	2025-06-08 12:57:37 +08:00