From 7b9aabc64a8ee8e992cfe2eeea9f00785ca0e069 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Mon, 14 Apr 2025 12:11:22 +0200 Subject: [PATCH 01/19] fix(crawler): ensure max_pages limit is respected during batch processing in crawling strategies --- crawl4ai/deep_crawling/bff_strategy.py | 12 ++++++++++++ crawl4ai/deep_crawling/bfs_strategy.py | 9 +++++++++ crawl4ai/deep_crawling/dfs_strategy.py | 8 ++++++++ 3 files changed, 29 insertions(+) diff --git a/crawl4ai/deep_crawling/bff_strategy.py b/crawl4ai/deep_crawling/bff_strategy.py index 4811ba14..fd1b30bf 100644 --- a/crawl4ai/deep_crawling/bff_strategy.py +++ b/crawl4ai/deep_crawling/bff_strategy.py @@ -148,6 +148,14 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy): self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl") break + # Calculate how many more URLs we can process in this batch + remaining = self.max_pages - self._pages_crawled + batch_size = min(BATCH_SIZE, remaining) + if batch_size <= 0: + # No more pages to crawl + self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl") + break + batch: List[Tuple[float, int, str, Optional[str]]] = [] # Retrieve up to BATCH_SIZE items from the priority queue. for _ in range(BATCH_SIZE): @@ -182,6 +190,10 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy): # Count only successful crawls toward max_pages limit if result.success: self._pages_crawled += 1 + # Check if we've reached the limit during batch processing + if self._pages_crawled >= self.max_pages: + self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl") + break # Exit the generator yield result diff --git a/crawl4ai/deep_crawling/bfs_strategy.py b/crawl4ai/deep_crawling/bfs_strategy.py index 54b72ea3..84e00642 100644 --- a/crawl4ai/deep_crawling/bfs_strategy.py +++ b/crawl4ai/deep_crawling/bfs_strategy.py @@ -156,6 +156,11 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy): results: List[CrawlResult] = [] while current_level and not self._cancel_event.is_set(): + # Check if we've already reached max_pages before starting a new level + if self._pages_crawled >= self.max_pages: + self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl") + break + next_level: List[Tuple[str, Optional[str]]] = [] urls = [url for url, _ in current_level] visited.update(urls) @@ -221,6 +226,10 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy): # Count only successful crawls if result.success: self._pages_crawled += 1 + # Check if we've reached the limit during batch processing + if self._pages_crawled >= self.max_pages: + self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl") + break # Exit the generator results_count += 1 yield result diff --git a/crawl4ai/deep_crawling/dfs_strategy.py b/crawl4ai/deep_crawling/dfs_strategy.py index f79f9628..0eca58e3 100644 --- a/crawl4ai/deep_crawling/dfs_strategy.py +++ b/crawl4ai/deep_crawling/dfs_strategy.py @@ -49,6 +49,10 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy): # Count only successful crawls toward max_pages limit if result.success: self._pages_crawled += 1 + # Check if we've reached the limit during batch processing + if self._pages_crawled >= self.max_pages: + self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl") + break # Exit the generator # Only discover links from successful crawls new_links: List[Tuple[str, Optional[str]]] = [] @@ -94,6 +98,10 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy): # and only discover links from successful crawls if result.success: self._pages_crawled += 1 + # Check if we've reached the limit during batch processing + if self._pages_crawled >= self.max_pages: + self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl") + break # Exit the generator new_links: List[Tuple[str, Optional[str]]] = [] await self.link_discovery(result, url, depth, visited, new_links, depths) From 1f3b1251d0aa8639c2615f13add944766ecaafa8 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Mon, 14 Apr 2025 12:16:31 +0200 Subject: [PATCH 02/19] docs(cli): add Crawl4AI CLI installation instructions to the CLI guide --- docs/md_v2/core/cli.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/md_v2/core/cli.md b/docs/md_v2/core/cli.md index ff4bf658..ded35f2f 100644 --- a/docs/md_v2/core/cli.md +++ b/docs/md_v2/core/cli.md @@ -17,6 +17,9 @@ - [Configuration Reference](#configuration-reference) - [Best Practices & Tips](#best-practices--tips) +## Installation +The Crawl4AI CLI will be installed automatically when you install the library. + ## Basic Usage The Crawl4AI CLI (`crwl`) provides a simple interface to the Crawl4AI library: From 05085b6e3d48f9b583aada02ccdc2f80db8b6cf8 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Tue, 15 Apr 2025 13:05:19 +0200 Subject: [PATCH 03/19] fix(requirements): add fake-useragent to requirements --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c1f36c56..8ad6bc41 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,4 +21,5 @@ psutil>=6.1.1 nltk>=3.9.1 rich>=13.9.4 cssselect>=1.2.0 -faust-cchardet>=2.1.19 \ No newline at end of file +faust-cchardet>=2.1.19 +fake-useragent>=2.2.0 \ No newline at end of file From 0ec3c4a7886a26e38a7467905f55072dc72737da Mon Sep 17 00:00:00 2001 From: ntohidi Date: Thu, 17 Apr 2025 12:11:12 +0200 Subject: [PATCH 04/19] fix(crawler): handle navigation aborts during file downloads in AsyncPlaywrightCrawlerStrategy --- crawl4ai/async_crawler_strategy.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 89b4df84..28325c84 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -582,7 +582,17 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): ) redirected_url = page.url except Error as e: - raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}") + # Allow navigation to be aborted when downloading files + # This is expected behavior for downloads in some browser engines + if 'net::ERR_ABORTED' in str(e) and self.browser_config.accept_downloads: + self.logger.info( + message=f"Navigation aborted, likely due to file download: {url}", + tag="GOTO", + params={"url": url}, + ) + response = None + else: + raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}") await self.execute_hook( "after_goto", page, context=context, url=url, response=response, config=config From 0886153d6a4267bf6b1846b8601edc87055fa13e Mon Sep 17 00:00:00 2001 From: ntohidi Date: Thu, 17 Apr 2025 12:48:11 +0200 Subject: [PATCH 05/19] fix(async_playwright_crawler): improve segment handling and viewport adjustments during screenshot capture (Fixed bug: Capturing Screenshot Twice and Increasing Image Size) --- crawl4ai/async_crawler_strategy.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 28325c84..bda4897c 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -1162,12 +1162,32 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): num_segments = (page_height // viewport_height) + 1 for i in range(num_segments): y_offset = i * viewport_height + # Special handling for the last segment + if i == num_segments - 1: + last_part_height = page_height % viewport_height + + # If page_height is an exact multiple of viewport_height, + # we don't need an extra segment + if last_part_height == 0: + # Skip last segment if page height is exact multiple of viewport + break + + # Adjust viewport to exactly match the remaining content height + await page.set_viewport_size({"width": page_width, "height": last_part_height}) + await page.evaluate(f"window.scrollTo(0, {y_offset})") await asyncio.sleep(0.01) # wait for render - seg_shot = await page.screenshot(full_page=False) + + # Capture the current segment + # Note: Using compression options (format, quality) would go here + seg_shot = await page.screenshot(full_page=False, type="jpeg", quality=85) + # seg_shot = await page.screenshot(full_page=False) img = Image.open(BytesIO(seg_shot)).convert("RGB") segments.append(img) + # Reset viewport to original size after capturing segments + await page.set_viewport_size({"width": page_width, "height": viewport_height}) + total_height = sum(img.height for img in segments) stitched = Image.new("RGB", (segments[0].width, total_height)) offset = 0 From 14a31456ef249a32be1d971cad9ab056da1a24e7 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Mon, 21 Apr 2025 13:59:49 +0200 Subject: [PATCH 06/19] fix(docs): update browser-crawler-config example to include LLMContentFilter and DefaultMarkdownGenerator, fix syntax errors --- docs/md_v2/core/browser-crawler-config.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/md_v2/core/browser-crawler-config.md b/docs/md_v2/core/browser-crawler-config.md index 0d97e0fc..5f66b3ea 100644 --- a/docs/md_v2/core/browser-crawler-config.md +++ b/docs/md_v2/core/browser-crawler-config.md @@ -265,7 +265,7 @@ In a typical scenario, you define **one** `BrowserConfig` for your crawler sessi ```python import asyncio -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig, LLMContentFilter, DefaultMarkdownGenerator from crawl4ai.extraction_strategy import JsonCssExtractionStrategy async def main(): @@ -290,7 +290,7 @@ async def main(): # 3) Example LLM content filtering gemini_config = LLMConfig( - provider="gemini/gemini-1.5-pro" + provider="gemini/gemini-1.5-pro", api_token = "env:GEMINI_API_TOKEN" ) @@ -314,8 +314,9 @@ async def main(): ) md_generator = DefaultMarkdownGenerator( - content_filter=filter, - options={"ignore_links": True} + content_filter=filter, + options={"ignore_links": True} + ) # 4) Crawler run config: skip cache, use extraction run_conf = CrawlerRunConfig( From 039be1b1ce7e32d1186ce9d1b123605248f3fb26 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Wed, 30 Apr 2025 11:41:35 +0200 Subject: [PATCH 07/19] feat: add pdf2image dependency to requirements --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 4aa2dbff..b695f92c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,4 +23,5 @@ rich>=13.9.4 cssselect>=1.2.0 chardet>=5.2.0 brotli>=1.1.0 -fake-useragent>=2.2.0 \ No newline at end of file +fake-useragent>=2.2.0 +pdf2image>=1.17.0 \ No newline at end of file From 1d6a2b9979d530703ec76708a385a2d87a1b5f7d Mon Sep 17 00:00:00 2001 From: ntohidi Date: Wed, 30 Apr 2025 12:29:17 +0200 Subject: [PATCH 08/19] fix(crawler): surface real redirect status codes and keep redirect chain. the 30x response instead of always returning 200. Refs #660 --- crawl4ai/async_crawler_strategy.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 3162bd54..da5490b6 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -744,12 +744,33 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): "after_goto", page, context=context, url=url, response=response, config=config ) + # ────────────────────────────────────────────────────────────── + # Walk the redirect chain. Playwright returns only the last + # hop, so we trace the `request.redirected_from` links until the + # first response that differs from the final one and surface its + # status-code. + # ────────────────────────────────────────────────────────────── if response is None: status_code = 200 response_headers = {} else: - status_code = response.status - response_headers = response.headers + first_resp = response + req = response.request + while req and req.redirected_from: + prev_req = req.redirected_from + prev_resp = await prev_req.response() + if prev_resp: # keep earliest + first_resp = prev_resp + req = prev_req + + status_code = first_resp.status + response_headers = first_resp.headers + # if response is None: + # status_code = 200 + # response_headers = {} + # else: + # status_code = response.status + # response_headers = response.headers else: status_code = 200 From e0cd3e10de0b04079c2144c6febb54cd74139f50 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Fri, 2 May 2025 10:35:35 +0200 Subject: [PATCH 09/19] fix(crawler): initialize captured_console variable for local file processing --- crawl4ai/async_crawler_strategy.py | 1 + 1 file changed, 1 insertion(+) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index da5490b6..6c0b4115 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -445,6 +445,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): return await self._crawl_web(url, config) elif url.startswith("file://"): + captured_console = None # Process local file local_file_path = url[7:] # Remove 'file://' prefix if not os.path.exists(local_file_path): From 12783fabdab1cdea99e930392c572e83831897df Mon Sep 17 00:00:00 2001 From: ntohidi Date: Wed, 7 May 2025 11:18:13 +0200 Subject: [PATCH 10/19] fix(dependencies): update pillow version constraint to allow newer releases. ref #709 --- pyproject.toml | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index be44397e..8b5f0910 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ dependencies = [ "lxml~=5.3", "litellm>=1.53.1", "numpy>=1.26.0,<3", - "pillow~=10.4", + "pillow>=10.4", "playwright>=1.49.0", "python-dotenv~=1.0", "requests~=2.26", diff --git a/requirements.txt b/requirements.txt index b695f92c..10d7fd81 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ aiosqlite~=0.20 lxml~=5.3 litellm>=1.53.1 numpy>=1.26.0,<3 -pillow~=10.4 +pillow>=10.4 playwright>=1.49.0 python-dotenv~=1.0 requests~=2.26 From eebb8c84f0a434f6cec4173a82c8b4dceb510037 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Wed, 7 May 2025 11:18:44 +0200 Subject: [PATCH 11/19] fix(requirements): add PyPDF2 dependency for PDF processing --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 10d7fd81..b62575d8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,4 +24,5 @@ cssselect>=1.2.0 chardet>=5.2.0 brotli>=1.1.0 fake-useragent>=2.2.0 -pdf2image>=1.17.0 \ No newline at end of file +pdf2image>=1.17.0 +PyPDF2>=3.0.1 \ No newline at end of file From 2b17f234f8354dca893063b68aa3ec41431c5d3c Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Wed, 7 May 2025 15:20:36 +0530 Subject: [PATCH 12/19] docs: update direct passing of content_filter to CrawlerRunConfig and instead pass it via MarkdownGenerator. Ref: #603 --- deploy/docker/c4ai-doc-context.md | 11 ++++++++--- docs/md_v2/core/markdown-generation.md | 9 ++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/deploy/docker/c4ai-doc-context.md b/deploy/docker/c4ai-doc-context.md index 1642f85e..081f29b7 100644 --- a/deploy/docker/c4ai-doc-context.md +++ b/deploy/docker/c4ai-doc-context.md @@ -403,7 +403,7 @@ async def main(): md_generator = DefaultMarkdownGenerator( content_filter=filter, - options={"ignore_links": True} + options={"ignore_links": True}) # 4) Crawler run config: skip cache, use extraction run_conf = CrawlerRunConfig( @@ -4152,7 +4152,7 @@ prune_filter = PruningContentFilter( For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure: ```python -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator from crawl4ai.content_filter_strategy import LLMContentFilter async def main(): @@ -4175,8 +4175,13 @@ async def main(): verbose=True ) + md_generator = DefaultMarkdownGenerator( + content_filter=filter, + options={"ignore_links": True} + ) + config = CrawlerRunConfig( - content_filter=filter + markdown_generator=md_generator ) async with AsyncWebCrawler() as crawler: diff --git a/docs/md_v2/core/markdown-generation.md b/docs/md_v2/core/markdown-generation.md index e6f5e12a..4a6e9218 100644 --- a/docs/md_v2/core/markdown-generation.md +++ b/docs/md_v2/core/markdown-generation.md @@ -233,7 +233,7 @@ prune_filter = PruningContentFilter( For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure: ```python -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator from crawl4ai.content_filter_strategy import LLMContentFilter async def main(): @@ -255,9 +255,12 @@ async def main(): chunk_token_threshold=4096, # Adjust based on your needs verbose=True ) - + md_generator = DefaultMarkdownGenerator( + content_filter=filter, + options={"ignore_links": True} + ) config = CrawlerRunConfig( - content_filter=filter + markdown_generator=md_generator, ) async with AsyncWebCrawler() as crawler: From ee93acbd06c49ce70e3905f267fd15711b39446b Mon Sep 17 00:00:00 2001 From: ntohidi Date: Wed, 7 May 2025 12:32:38 +0200 Subject: [PATCH 13/19] fix(async_playwright_crawler): use config directly instead of self.config for verbosity check --- crawl4ai/async_crawler_strategy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index a6aae4e7..85c3a15c 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -816,7 +816,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): except Error: visibility_info = await self.check_visibility(page) - if self.config.verbose: + if self.verbose: self.logger.debug( message="Body visibility info: {info}", tag="DEBUG", From f6e25e2a6bae8a1b774b6e71fc98edc460d04b53 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Wed, 7 May 2025 17:53:30 +0530 Subject: [PATCH 14/19] fix: check_robots_txt to support wildcard rules ref: #699 --- crawl4ai/utils.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index bfa8ce9d..4018d78c 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -42,6 +42,29 @@ from itertools import chain from collections import deque from typing import Generator, Iterable +# Monkey patch to fix wildcard handling in urllib.robotparser +from urllib.robotparser import RuleLine +import re + +original_applies_to = RuleLine.applies_to + +def patched_applies_to(self, filename): + # Handle wildcards in paths + if '*' in self.path or '%2A' in self.path or self.path in ("*", "%2A"): + pattern = self.path.replace('%2A', '*') + pattern = re.escape(pattern).replace('\\*', '.*') + pattern = '^' + pattern + if pattern.endswith('\\$'): + pattern = pattern[:-2] + '$' + try: + return bool(re.match(pattern, filename)) + except re.error: + return original_applies_to(self, filename) + return original_applies_to(self, filename) + +RuleLine.applies_to = patched_applies_to +# Monkey patch ends + def chunk_documents( documents: Iterable[str], chunk_token_threshold: int, @@ -303,7 +326,7 @@ class RobotsParser: robots_url = f"{scheme}://{domain}/robots.txt" async with aiohttp.ClientSession() as session: - async with session.get(robots_url, timeout=2) as response: + async with session.get(robots_url, timeout=2, ssl=False) as response: if response.status == 200: rules = await response.text() self._cache_rules(domain, rules) From c1041b9bbee1338ec89997bdf62e76c6a5f3ada6 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Wed, 7 May 2025 18:43:29 +0530 Subject: [PATCH 15/19] fix: exclude_external_images flag simply discards elements ref:https://github.com/unclecode/crawl4ai/issues/345 --- crawl4ai/content_scraping_strategy.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 1dfbce84..d11e02d0 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -718,13 +718,18 @@ class WebScrapingStrategy(ContentScrapingStrategy): # Check flag if we should remove external images if kwargs.get("exclude_external_images", False): - element.decompose() - return False - # src_url_base = src.split('/')[2] - # url_base = url.split('/')[2] - # if url_base not in src_url_base: - # element.decompose() - # return False + # Handle relative URLs (which are always from the same domain) + if not src.startswith('http') and not src.startswith('//'): + return True # Keep relative URLs + + # For absolute URLs, compare the base domains using the existing function + src_base_domain = get_base_domain(src) + url_base_domain = get_base_domain(url) + + # If the domains don't match and both are valid, the image is external + if src_base_domain and url_base_domain and src_base_domain != url_base_domain: + element.decompose() + return False # if kwargs.get('exclude_social_media_links', False): # if image_src_base_domain in exclude_social_media_domains: From 25d97d56e4e3bbc74fa1de9423cc5ae3457b0baf Mon Sep 17 00:00:00 2001 From: ntohidi Date: Tue, 13 May 2025 13:56:12 +0200 Subject: [PATCH 16/19] fix(dependencies): remove duplicated aiofiles from project dependencies. REF #1045 --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8b5f0910..a208d5d8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,6 @@ dependencies = [ "psutil>=6.1.1", "nltk>=3.9.1", "playwright", - "aiofiles", "rich>=13.9.4", "cssselect>=1.2.0", "httpx>=0.27.2", From 260e2dc347e2d0b4463eec31f3eaa81e87ca109b Mon Sep 17 00:00:00 2001 From: ntohidi Date: Tue, 13 May 2025 14:03:20 +0200 Subject: [PATCH 17/19] fix(browser): create browser config before launching managed browser instance. REF: https://discord.com/channels/1278297938551902308/1278298697540567132/1371683009459392716 --- crawl4ai/browser_profiler.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/crawl4ai/browser_profiler.py b/crawl4ai/browser_profiler.py index 961ba740..41f917f5 100644 --- a/crawl4ai/browser_profiler.py +++ b/crawl4ai/browser_profiler.py @@ -615,9 +615,18 @@ class BrowserProfiler: self.logger.info(f"Debugging port: {debugging_port}", tag="CDP") self.logger.info(f"Headless mode: {headless}", tag="CDP") + # create browser config + browser_config = BrowserConfig( + browser_type=browser_type, + headless=headless, + user_data_dir=profile_path, + debugging_port=debugging_port, + verbose=True + ) + # Create managed browser instance managed_browser = ManagedBrowser( - browser_type=browser_type, + browser_config=browser_config, user_data_dir=profile_path, headless=headless, logger=self.logger, From 137556b3dce373bfd8af09e8bd5f9da0051ba463 Mon Sep 17 00:00:00 2001 From: medo94my Date: Wed, 14 May 2025 16:01:10 +0800 Subject: [PATCH 18/19] fix the EXTRACT to match the styling of the other methods --- crawl4ai/async_webcrawler.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 19b98522..9e42b824 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -360,7 +360,7 @@ class AsyncWebCrawler: pdf_data=pdf_data, verbose=config.verbose, is_raw_html=True if url.startswith("raw:") else False, - redirected_url=async_response.redirected_url, + redirected_url=async_response.redirected_url, **kwargs, ) @@ -503,7 +503,7 @@ class AsyncWebCrawler: tables = media.pop("tables", []) links = result.links.model_dump() metadata = result.metadata - + fit_html = preprocess_html_for_schema(html_content=html, text_threshold= 500, max_size= 300_000) ################################ @@ -585,11 +585,13 @@ class AsyncWebCrawler: # Choose content based on input_format content_format = config.extraction_strategy.input_format if content_format == "fit_markdown" and not markdown_result.fit_markdown: - self.logger.warning( - message="Fit markdown requested but not available. Falling back to raw markdown.", - tag="EXTRACT", - params={"url": _url}, - ) + + self.logger.url_status( + url=_url, + success=bool(html), + timing=time.perf_counter() - t1, + tag="EXTRACT", + ) content_format = "markdown" content = { From a55c2b3f88371570a5683be59e40f8ea609b0a19 Mon Sep 17 00:00:00 2001 From: Ahmed-Tawfik94 Date: Mon, 19 May 2025 16:32:22 +0800 Subject: [PATCH 19/19] refactor(logging): update extraction logging to use url_status method --- crawl4ai/async_webcrawler.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 9e42b824..cb221b72 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -615,11 +615,12 @@ class AsyncWebCrawler: ) # Log extraction completion - self.logger.info( - message="Completed for {url:.50}... | Time: {timing}s", - tag="EXTRACT", - params={"url": _url, "timing": time.perf_counter() - t1}, - ) + self.logger.url_status( + url=_url, + success=bool(html), + timing=time.perf_counter() - t1, + tag="EXTRACT", + ) # Apply HTML formatting if requested if config.prettiify: