From 8b215e17afcd15015ee5a9b47d5e60e7c0c62c35 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 8 Jun 2025 06:57:37 +0200 Subject: [PATCH 1/3] Add use_stemming option to BM25ContentFilter (#1192) --- crawl4ai/cli.py | 3 ++- crawl4ai/content_filter_strategy.py | 25 +++++++++++++++++-------- docs/md_v2/core/markdown-generation.md | 2 +- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/crawl4ai/cli.py b/crawl4ai/cli.py index 51477d6b..33b313bc 100644 --- a/crawl4ai/cli.py +++ b/crawl4ai/cli.py @@ -1073,7 +1073,8 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config: crawler_cfg.markdown_generator = DefaultMarkdownGenerator( content_filter = BM25ContentFilter( user_query=filter_conf.get("query"), - bm25_threshold=filter_conf.get("threshold", 1.0) + bm25_threshold=filter_conf.get("threshold", 1.0), + use_stemming=filter_conf.get("use_stemming", True), ) ) elif filter_conf["type"] == "pruning": diff --git a/crawl4ai/content_filter_strategy.py b/crawl4ai/content_filter_strategy.py index 4102cbad..1e764f74 100644 --- a/crawl4ai/content_filter_strategy.py +++ b/crawl4ai/content_filter_strategy.py @@ -405,6 +405,7 @@ class BM25ContentFilter(RelevantContentFilter): user_query: str = None, bm25_threshold: float = 1.0, language: str = "english", + use_stemming: bool = True, ): """ Initializes the BM25ContentFilter class, if not provided, falls back to page metadata. @@ -416,9 +417,11 @@ class BM25ContentFilter(RelevantContentFilter): user_query (str): User query for filtering (optional). bm25_threshold (float): BM25 threshold for filtering (default: 1.0). language (str): Language for stemming (default: 'english'). + use_stemming (bool): Whether to apply stemming (default: True). """ super().__init__(user_query=user_query) self.bm25_threshold = bm25_threshold + self.use_stemming = use_stemming self.priority_tags = { "h1": 5.0, "h2": 4.0, @@ -432,7 +435,7 @@ class BM25ContentFilter(RelevantContentFilter): "pre": 1.5, "th": 1.5, # Table headers } - self.stemmer = stemmer(language) + self.stemmer = stemmer(language) if use_stemming else None def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]: """ @@ -479,13 +482,19 @@ class BM25ContentFilter(RelevantContentFilter): # for _, chunk, _, _ in candidates] # tokenized_query = [ps.stem(word) for word in query.lower().split()] - tokenized_corpus = [ - [self.stemmer.stemWord(word) for word in chunk.lower().split()] - for _, chunk, _, _ in candidates - ] - tokenized_query = [ - self.stemmer.stemWord(word) for word in query.lower().split() - ] + if self.use_stemming: + tokenized_corpus = [ + [self.stemmer.stemWord(word) for word in chunk.lower().split()] + for _, chunk, _, _ in candidates + ] + tokenized_query = [ + self.stemmer.stemWord(word) for word in query.lower().split() + ] + else: + tokenized_corpus = [ + chunk.lower().split() for _, chunk, _, _ in candidates + ] + tokenized_query = query.lower().split() # tokenized_corpus = [[self.stemmer.stemWord(word) for word in tokenize_text(chunk.lower())] # for _, chunk, _, _ in candidates] diff --git a/docs/md_v2/core/markdown-generation.md b/docs/md_v2/core/markdown-generation.md index d4cad79b..eccb115a 100644 --- a/docs/md_v2/core/markdown-generation.md +++ b/docs/md_v2/core/markdown-generation.md @@ -200,7 +200,7 @@ config = CrawlerRunConfig(markdown_generator=md_generator) - **`user_query`**: The term you want to focus on. BM25 tries to keep only content blocks relevant to that query. - **`bm25_threshold`**: Raise it to keep fewer blocks; lower it to keep more. -- **`use_stemming`**: If `True`, variations of words match (e.g., “learn,” “learning,” “learnt”). +- **`use_stemming`** *(default `True`)*: If enabled, variations of words match (e.g., “learn,” “learning,” “learnt”). **No query provided?** BM25 tries to glean a context from page metadata, or you can simply treat it as a scorched-earth approach that discards text with low generic score. Realistically, you want to supply a query for best results. From 451b0d6c9a85b3013d1fd8fa857cfd4d9a5cba94 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 8 Jun 2025 07:53:09 +0200 Subject: [PATCH 2/3] Set memory_wait_timeout default to 10 minutes (#1193) --- crawl4ai/async_dispatcher.py | 53 +++++++++++++++---- deploy/docker/c4ai-doc-context.md | 2 +- docs/md_v2/advanced/multi-url-crawling.md | 2 +- .../crawl4ai_all_reasoning_content.llm.txt | 2 +- 4 files changed, 46 insertions(+), 13 deletions(-) diff --git a/crawl4ai/async_dispatcher.py b/crawl4ai/async_dispatcher.py index b97d59a7..1558efc0 100644 --- a/crawl4ai/async_dispatcher.py +++ b/crawl4ai/async_dispatcher.py @@ -126,6 +126,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher): check_interval: float = 1.0, max_session_permit: int = 20, fairness_timeout: float = 600.0, # 10 minutes before prioritizing long-waiting URLs + memory_wait_timeout: Optional[float] = 600.0, rate_limiter: Optional[RateLimiter] = None, monitor: Optional[CrawlerMonitor] = None, ): @@ -136,27 +137,46 @@ class MemoryAdaptiveDispatcher(BaseDispatcher): self.check_interval = check_interval self.max_session_permit = max_session_permit self.fairness_timeout = fairness_timeout + self.memory_wait_timeout = memory_wait_timeout self.result_queue = asyncio.Queue() self.task_queue = asyncio.PriorityQueue() # Priority queue for better management self.memory_pressure_mode = False # Flag to indicate when we're in memory pressure mode self.current_memory_percent = 0.0 # Track current memory usage + self._high_memory_start_time: Optional[float] = None async def _memory_monitor_task(self): """Background task to continuously monitor memory usage and update state""" while True: self.current_memory_percent = psutil.virtual_memory().percent - + # Enter memory pressure mode if we cross the threshold - if not self.memory_pressure_mode and self.current_memory_percent >= self.memory_threshold_percent: - self.memory_pressure_mode = True - if self.monitor: - self.monitor.update_memory_status("PRESSURE") - + if self.current_memory_percent >= self.memory_threshold_percent: + if not self.memory_pressure_mode: + self.memory_pressure_mode = True + self._high_memory_start_time = time.time() + if self.monitor: + self.monitor.update_memory_status("PRESSURE") + else: + if self._high_memory_start_time is None: + self._high_memory_start_time = time.time() + if ( + self.memory_wait_timeout is not None + and self._high_memory_start_time is not None + and time.time() - self._high_memory_start_time >= self.memory_wait_timeout + ): + raise MemoryError( + "Memory usage exceeded threshold for" + f" {self.memory_wait_timeout} seconds" + ) + # Exit memory pressure mode if we go below recovery threshold elif self.memory_pressure_mode and self.current_memory_percent <= self.recovery_threshold_percent: self.memory_pressure_mode = False + self._high_memory_start_time = None if self.monitor: self.monitor.update_memory_status("NORMAL") + elif self.current_memory_percent < self.memory_threshold_percent: + self._high_memory_start_time = None # In critical mode, we might need to take more drastic action if self.current_memory_percent >= self.critical_threshold_percent: @@ -307,7 +327,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher): self.monitor.start() results = [] - + try: # Initialize task queue for url in urls: @@ -316,11 +336,18 @@ class MemoryAdaptiveDispatcher(BaseDispatcher): self.monitor.add_task(task_id, url) # Add to queue with initial priority 0, retry count 0, and current time await self.task_queue.put((0, (url, task_id, 0, time.time()))) - + active_tasks = [] - + # Process until both queues are empty while not self.task_queue.empty() or active_tasks: + if memory_monitor.done(): + exc = memory_monitor.exception() + if exc: + for t in active_tasks: + t.cancel() + raise exc + # If memory pressure is low, start new tasks if not self.memory_pressure_mode and len(active_tasks) < self.max_session_permit: try: @@ -465,8 +492,14 @@ class MemoryAdaptiveDispatcher(BaseDispatcher): active_tasks = [] completed_count = 0 total_urls = len(urls) - + while completed_count < total_urls: + if memory_monitor.done(): + exc = memory_monitor.exception() + if exc: + for t in active_tasks: + t.cancel() + raise exc # If memory pressure is low, start new tasks if not self.memory_pressure_mode and len(active_tasks) < self.max_session_permit: try: diff --git a/deploy/docker/c4ai-doc-context.md b/deploy/docker/c4ai-doc-context.md index 1642f85e..77785cec 100644 --- a/deploy/docker/c4ai-doc-context.md +++ b/deploy/docker/c4ai-doc-context.md @@ -6705,7 +6705,7 @@ dispatcher = MemoryAdaptiveDispatcher( 3. **`max_session_permit`** (`int`, default: `10`)   The maximum number of concurrent crawling tasks allowed. This ensures resource limits are respected while maintaining concurrency. -4. **`memory_wait_timeout`** (`float`, default: `300.0`) +4. **`memory_wait_timeout`** (`float`, default: `600.0`)   Optional timeout (in seconds). If memory usage exceeds `memory_threshold_percent` for longer than this duration, a `MemoryError` is raised. 5. **`rate_limiter`** (`RateLimiter`, default: `None`) diff --git a/docs/md_v2/advanced/multi-url-crawling.md b/docs/md_v2/advanced/multi-url-crawling.md index f6d944d6..40493c21 100644 --- a/docs/md_v2/advanced/multi-url-crawling.md +++ b/docs/md_v2/advanced/multi-url-crawling.md @@ -172,7 +172,7 @@ dispatcher = MemoryAdaptiveDispatcher( 3. **`max_session_permit`** (`int`, default: `10`)   The maximum number of concurrent crawling tasks allowed. This ensures resource limits are respected while maintaining concurrency. -4. **`memory_wait_timeout`** (`float`, default: `300.0`) +4. **`memory_wait_timeout`** (`float`, default: `600.0`)   Optional timeout (in seconds). If memory usage exceeds `memory_threshold_percent` for longer than this duration, a `MemoryError` is raised. 5. **`rate_limiter`** (`RateLimiter`, default: `None`) diff --git a/docs/md_v2/assets/llmtxt/crawl4ai_all_reasoning_content.llm.txt b/docs/md_v2/assets/llmtxt/crawl4ai_all_reasoning_content.llm.txt index 850c1237..846b6914 100644 --- a/docs/md_v2/assets/llmtxt/crawl4ai_all_reasoning_content.llm.txt +++ b/docs/md_v2/assets/llmtxt/crawl4ai_all_reasoning_content.llm.txt @@ -6732,7 +6732,7 @@ dispatcher = MemoryAdaptiveDispatcher( 3. **`max_session_permit`** (`int`, default: `10`)   The maximum number of concurrent crawling tasks allowed. This ensures resource limits are respected while maintaining concurrency. -4. **`memory_wait_timeout`** (`float`, default: `300.0`) +4. **`memory_wait_timeout`** (`float`, default: `600.0`)   Optional timeout (in seconds). If memory usage exceeds `memory_threshold_percent` for longer than this duration, a `MemoryError` is raised. 5. **`rate_limiter`** (`RateLimiter`, default: `None`) From b870bfdb6cc9c7a84ae333a6210e1af901c63093 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 8 Jun 2025 10:06:38 +0200 Subject: [PATCH 3/3] chore(deps): add httpx extras (#1195) --- deploy/docker/requirements.txt | 1 + pyproject.toml | 1 + requirements.txt | 3 ++- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/deploy/docker/requirements.txt b/deploy/docker/requirements.txt index dd489e28..d463c641 100644 --- a/deploy/docker/requirements.txt +++ b/deploy/docker/requirements.txt @@ -14,3 +14,4 @@ anyio==4.9.0 PyJWT==2.10.1 mcp>=1.6.0 websockets>=15.0.1 +httpx[http2]>=0.27.2 diff --git a/pyproject.toml b/pyproject.toml index 5abfb460..fc961d2b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ dependencies = [ "rich>=13.9.4", "cssselect>=1.2.0", "httpx>=0.27.2", + "httpx[http2]>=0.27.2", "fake-useragent>=2.0.3", "click>=8.1.7", "pyperclip>=1.8.2", diff --git a/requirements.txt b/requirements.txt index 0bb596d1..3a93e6a6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,4 +22,5 @@ nltk>=3.9.1 rich>=13.9.4 cssselect>=1.2.0 chardet>=5.2.0 -brotli>=1.1.0 \ No newline at end of file +brotli>=1.1.0 +httpx[http2]>=0.27.2