From 7b9aabc64a8ee8e992cfe2eeea9f00785ca0e069 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Mon, 14 Apr 2025 12:11:22 +0200 Subject: [PATCH 01/37] fix(crawler): ensure max_pages limit is respected during batch processing in crawling strategies --- crawl4ai/deep_crawling/bff_strategy.py | 12 ++++++++++++ crawl4ai/deep_crawling/bfs_strategy.py | 9 +++++++++ crawl4ai/deep_crawling/dfs_strategy.py | 8 ++++++++ 3 files changed, 29 insertions(+) diff --git a/crawl4ai/deep_crawling/bff_strategy.py b/crawl4ai/deep_crawling/bff_strategy.py index 4811ba14..fd1b30bf 100644 --- a/crawl4ai/deep_crawling/bff_strategy.py +++ b/crawl4ai/deep_crawling/bff_strategy.py @@ -148,6 +148,14 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy): self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl") break + # Calculate how many more URLs we can process in this batch + remaining = self.max_pages - self._pages_crawled + batch_size = min(BATCH_SIZE, remaining) + if batch_size <= 0: + # No more pages to crawl + self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl") + break + batch: List[Tuple[float, int, str, Optional[str]]] = [] # Retrieve up to BATCH_SIZE items from the priority queue. 
for _ in range(BATCH_SIZE): @@ -182,6 +190,10 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy): # Count only successful crawls toward max_pages limit if result.success: self._pages_crawled += 1 + # Check if we've reached the limit during batch processing + if self._pages_crawled >= self.max_pages: + self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl") + break # Exit the generator yield result diff --git a/crawl4ai/deep_crawling/bfs_strategy.py b/crawl4ai/deep_crawling/bfs_strategy.py index 54b72ea3..84e00642 100644 --- a/crawl4ai/deep_crawling/bfs_strategy.py +++ b/crawl4ai/deep_crawling/bfs_strategy.py @@ -156,6 +156,11 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy): results: List[CrawlResult] = [] while current_level and not self._cancel_event.is_set(): + # Check if we've already reached max_pages before starting a new level + if self._pages_crawled >= self.max_pages: + self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl") + break + next_level: List[Tuple[str, Optional[str]]] = [] urls = [url for url, _ in current_level] visited.update(urls) @@ -221,6 +226,10 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy): # Count only successful crawls if result.success: self._pages_crawled += 1 + # Check if we've reached the limit during batch processing + if self._pages_crawled >= self.max_pages: + self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl") + break # Exit the generator results_count += 1 yield result diff --git a/crawl4ai/deep_crawling/dfs_strategy.py b/crawl4ai/deep_crawling/dfs_strategy.py index f79f9628..0eca58e3 100644 --- a/crawl4ai/deep_crawling/dfs_strategy.py +++ b/crawl4ai/deep_crawling/dfs_strategy.py @@ -49,6 +49,10 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy): # Count only successful crawls toward max_pages limit if result.success: self._pages_crawled += 1 + # Check if we've reached the limit during batch processing + if 
self._pages_crawled >= self.max_pages: + self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl") + break # Exit the generator # Only discover links from successful crawls new_links: List[Tuple[str, Optional[str]]] = [] @@ -94,6 +98,10 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy): # and only discover links from successful crawls if result.success: self._pages_crawled += 1 + # Check if we've reached the limit during batch processing + if self._pages_crawled >= self.max_pages: + self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl") + break # Exit the generator new_links: List[Tuple[str, Optional[str]]] = [] await self.link_discovery(result, url, depth, visited, new_links, depths) From 1f3b1251d0aa8639c2615f13add944766ecaafa8 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Mon, 14 Apr 2025 12:16:31 +0200 Subject: [PATCH 02/37] docs(cli): add Crawl4AI CLI installation instructions to the CLI guide --- docs/md_v2/core/cli.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/md_v2/core/cli.md b/docs/md_v2/core/cli.md index ff4bf658..ded35f2f 100644 --- a/docs/md_v2/core/cli.md +++ b/docs/md_v2/core/cli.md @@ -17,6 +17,9 @@ - [Configuration Reference](#configuration-reference) - [Best Practices & Tips](#best-practices--tips) +## Installation +The Crawl4AI CLI will be installed automatically when you install the library. 
+ ## Basic Usage The Crawl4AI CLI (`crwl`) provides a simple interface to the Crawl4AI library: From 05085b6e3d48f9b583aada02ccdc2f80db8b6cf8 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Tue, 15 Apr 2025 13:05:19 +0200 Subject: [PATCH 03/37] fix(requirements): add fake-useragent to requirements --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c1f36c56..8ad6bc41 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,4 +21,5 @@ psutil>=6.1.1 nltk>=3.9.1 rich>=13.9.4 cssselect>=1.2.0 -faust-cchardet>=2.1.19 \ No newline at end of file +faust-cchardet>=2.1.19 +fake-useragent>=2.2.0 \ No newline at end of file From 0ec3c4a7886a26e38a7467905f55072dc72737da Mon Sep 17 00:00:00 2001 From: ntohidi Date: Thu, 17 Apr 2025 12:11:12 +0200 Subject: [PATCH 04/37] fix(crawler): handle navigation aborts during file downloads in AsyncPlaywrightCrawlerStrategy --- crawl4ai/async_crawler_strategy.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 89b4df84..28325c84 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -582,7 +582,17 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): ) redirected_url = page.url except Error as e: - raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}") + # Allow navigation to be aborted when downloading files + # This is expected behavior for downloads in some browser engines + if 'net::ERR_ABORTED' in str(e) and self.browser_config.accept_downloads: + self.logger.info( + message=f"Navigation aborted, likely due to file download: {url}", + tag="GOTO", + params={"url": url}, + ) + response = None + else: + raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}") await self.execute_hook( "after_goto", page, context=context, url=url, response=response, config=config From 
0886153d6a4267bf6b1846b8601edc87055fa13e Mon Sep 17 00:00:00 2001 From: ntohidi Date: Thu, 17 Apr 2025 12:48:11 +0200 Subject: [PATCH 05/37] fix(async_playwright_crawler): improve segment handling and viewport adjustments during screenshot capture (Fixed bug: Capturing Screenshot Twice and Increasing Image Size) --- crawl4ai/async_crawler_strategy.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 28325c84..bda4897c 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -1162,12 +1162,32 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): num_segments = (page_height // viewport_height) + 1 for i in range(num_segments): y_offset = i * viewport_height + # Special handling for the last segment + if i == num_segments - 1: + last_part_height = page_height % viewport_height + + # If page_height is an exact multiple of viewport_height, + # we don't need an extra segment + if last_part_height == 0: + # Skip last segment if page height is exact multiple of viewport + break + + # Adjust viewport to exactly match the remaining content height + await page.set_viewport_size({"width": page_width, "height": last_part_height}) + await page.evaluate(f"window.scrollTo(0, {y_offset})") await asyncio.sleep(0.01) # wait for render - seg_shot = await page.screenshot(full_page=False) + + # Capture the current segment + # Note: Using compression options (format, quality) would go here + seg_shot = await page.screenshot(full_page=False, type="jpeg", quality=85) + # seg_shot = await page.screenshot(full_page=False) img = Image.open(BytesIO(seg_shot)).convert("RGB") segments.append(img) + # Reset viewport to original size after capturing segments + await page.set_viewport_size({"width": page_width, "height": viewport_height}) + total_height = sum(img.height for img in segments) stitched = Image.new("RGB", (segments[0].width, 
total_height)) offset = 0 From 14a31456ef249a32be1d971cad9ab056da1a24e7 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Mon, 21 Apr 2025 13:59:49 +0200 Subject: [PATCH 06/37] fix(docs): update browser-crawler-config example to include LLMContentFilter and DefaultMarkdownGenerator, fix syntax errors --- docs/md_v2/core/browser-crawler-config.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/md_v2/core/browser-crawler-config.md b/docs/md_v2/core/browser-crawler-config.md index 0d97e0fc..5f66b3ea 100644 --- a/docs/md_v2/core/browser-crawler-config.md +++ b/docs/md_v2/core/browser-crawler-config.md @@ -265,7 +265,7 @@ In a typical scenario, you define **one** `BrowserConfig` for your crawler sessi ```python import asyncio -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig, LLMContentFilter, DefaultMarkdownGenerator from crawl4ai.extraction_strategy import JsonCssExtractionStrategy async def main(): @@ -290,7 +290,7 @@ async def main(): # 3) Example LLM content filtering gemini_config = LLMConfig( - provider="gemini/gemini-1.5-pro" + provider="gemini/gemini-1.5-pro", api_token = "env:GEMINI_API_TOKEN" ) @@ -314,8 +314,9 @@ async def main(): ) md_generator = DefaultMarkdownGenerator( - content_filter=filter, - options={"ignore_links": True} + content_filter=filter, + options={"ignore_links": True} + ) # 4) Crawler run config: skip cache, use extraction run_conf = CrawlerRunConfig( From 53245e4e0e54dc4604f8b427105d820dba6c38a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sacrist=C3=A1n?= Date: Tue, 29 Apr 2025 16:26:35 +0200 Subject: [PATCH 07/37] Fix: README.md urls list --- README.md | 18 +++++++++++++----- tests/docker_example.py | 18 +++++++++--------- tests/test_docker.py | 16 ++++++++-------- tests/test_main.py | 16 ++++++++-------- 4 files changed, 38 insertions(+), 30 deletions(-) diff --git 
a/README.md b/README.md index 97787b2f..879baa51 100644 --- a/README.md +++ b/README.md @@ -291,12 +291,20 @@ import requests # Submit a crawl job response = requests.post( "http://localhost:11235/crawl", - json={"urls": "https://example.com", "priority": 10} + json={"urls": ["https://example.com"], "priority": 10} ) -task_id = response.json()["task_id"] - -# Continue polling until the task is complete (status="completed") -result = requests.get(f"http://localhost:11235/task/{task_id}") +if response.status_code == 200: + print("Crawl job submitted successfully.") + +if "results" in response.json(): + results = response.json()["results"] + print("Crawl job completed. Results:") + for result in results: + print(result) +else: + task_id = response.json()["task_id"] + print(f"Crawl job submitted. Task ID:: {task_id}") + result = requests.get(f"http://localhost:11235/task/{task_id}") ``` For more examples, see our [Docker Examples](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_example.py). For advanced configuration, environment variables, and usage examples, see our [Docker Deployment Guide](https://docs.crawl4ai.com/basic/docker-deployment/). 
diff --git a/tests/docker_example.py b/tests/docker_example.py index 336ca52f..03348d50 100644 --- a/tests/docker_example.py +++ b/tests/docker_example.py @@ -105,7 +105,7 @@ def test_docker_deployment(version="basic"): def test_basic_crawl(tester: Crawl4AiTester): print("\n=== Testing Basic Crawl ===") request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 10, "session_id": "test", } @@ -119,7 +119,7 @@ def test_basic_crawl(tester: Crawl4AiTester): def test_basic_crawl_sync(tester: Crawl4AiTester): print("\n=== Testing Basic Crawl (Sync) ===") request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 10, "session_id": "test", } @@ -134,7 +134,7 @@ def test_basic_crawl_sync(tester: Crawl4AiTester): def test_js_execution(tester: Crawl4AiTester): print("\n=== Testing JS Execution ===") request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 8, "js_code": [ "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();" @@ -151,7 +151,7 @@ def test_js_execution(tester: Crawl4AiTester): def test_css_selector(tester: Crawl4AiTester): print("\n=== Testing CSS Selector ===") request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 7, "css_selector": ".wide-tease-item__description", "crawler_params": {"headless": True}, @@ -188,7 +188,7 @@ def test_structured_extraction(tester: Crawl4AiTester): } request = { - "urls": "https://www.coinbase.com/explore", + "urls": ["https://www.coinbase.com/explore"], "priority": 9, "extraction_config": {"type": "json_css", "params": {"schema": schema}}, } @@ -223,7 +223,7 @@ def test_llm_extraction(tester: Crawl4AiTester): } request = { - "urls": "https://openai.com/api/pricing", + "urls": 
["https://openai.com/api/pricing"], "priority": 8, "extraction_config": { "type": "llm", @@ -270,7 +270,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester): } request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 8, "extraction_config": { "type": "llm", @@ -297,7 +297,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester): def test_cosine_extraction(tester: Crawl4AiTester): print("\n=== Testing Cosine Extraction ===") request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 8, "extraction_config": { "type": "cosine", @@ -323,7 +323,7 @@ def test_cosine_extraction(tester: Crawl4AiTester): def test_screenshot(tester: Crawl4AiTester): print("\n=== Testing Screenshot ===") request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 5, "screenshot": True, "crawler_params": {"headless": True}, diff --git a/tests/test_docker.py b/tests/test_docker.py index 3570d608..c507ae56 100644 --- a/tests/test_docker.py +++ b/tests/test_docker.py @@ -74,7 +74,7 @@ def test_docker_deployment(version="basic"): def test_basic_crawl(tester: Crawl4AiTester): print("\n=== Testing Basic Crawl ===") - request = {"urls": "https://www.nbcnews.com/business", "priority": 10} + request = {"urls": ["https://www.nbcnews.com/business"], "priority": 10} result = tester.submit_and_wait(request) print(f"Basic crawl result length: {len(result['result']['markdown'])}") @@ -85,7 +85,7 @@ def test_basic_crawl(tester: Crawl4AiTester): def test_js_execution(tester: Crawl4AiTester): print("\n=== Testing JS Execution ===") request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 8, "js_code": [ "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();" @@ 
-102,7 +102,7 @@ def test_js_execution(tester: Crawl4AiTester): def test_css_selector(tester: Crawl4AiTester): print("\n=== Testing CSS Selector ===") request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 7, "css_selector": ".wide-tease-item__description", "crawler_params": {"headless": True}, @@ -139,7 +139,7 @@ def test_structured_extraction(tester: Crawl4AiTester): } request = { - "urls": "https://www.coinbase.com/explore", + "urls": ["https://www.coinbase.com/explore"], "priority": 9, "extraction_config": {"type": "json_css", "params": {"schema": schema}}, } @@ -174,7 +174,7 @@ def test_llm_extraction(tester: Crawl4AiTester): } request = { - "urls": "https://openai.com/api/pricing", + "urls": ["https://openai.com/api/pricing"], "priority": 8, "extraction_config": { "type": "llm", @@ -221,7 +221,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester): } request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 8, "extraction_config": { "type": "llm", @@ -248,7 +248,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester): def test_cosine_extraction(tester: Crawl4AiTester): print("\n=== Testing Cosine Extraction ===") request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 8, "extraction_config": { "type": "cosine", @@ -274,7 +274,7 @@ def test_cosine_extraction(tester: Crawl4AiTester): def test_screenshot(tester: Crawl4AiTester): print("\n=== Testing Screenshot ===") request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 5, "screenshot": True, "crawler_params": {"headless": True}, diff --git a/tests/test_main.py b/tests/test_main.py index 0e938f59..b32b68f0 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -54,7 +54,7 @@ class NBCNewsAPITest: async def test_basic_crawl(): print("\n=== Testing Basic Crawl ===") 
async with NBCNewsAPITest() as api: - request = {"urls": "https://www.nbcnews.com/business", "priority": 10} + request = {"urls": ["https://www.nbcnews.com/business"], "priority": 10} task_id = await api.submit_crawl(request) result = await api.wait_for_task(task_id) print(f"Basic crawl result length: {len(result['result']['markdown'])}") @@ -67,7 +67,7 @@ async def test_js_execution(): print("\n=== Testing JS Execution ===") async with NBCNewsAPITest() as api: request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 8, "js_code": [ "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();" @@ -86,7 +86,7 @@ async def test_css_selector(): print("\n=== Testing CSS Selector ===") async with NBCNewsAPITest() as api: request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 7, "css_selector": ".wide-tease-item__description", } @@ -120,7 +120,7 @@ async def test_structured_extraction(): } request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 9, "extraction_config": {"type": "json_css", "params": {"schema": schema}}, } @@ -177,7 +177,7 @@ async def test_llm_extraction(): } request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 8, "extraction_config": { "type": "llm", @@ -209,7 +209,7 @@ async def test_screenshot(): print("\n=== Testing Screenshot ===") async with NBCNewsAPITest() as api: request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 5, "screenshot": True, "crawler_params": {"headless": True}, @@ -227,7 +227,7 @@ async def test_priority_handling(): async with NBCNewsAPITest() as api: # Submit low priority task first low_priority = { - "urls": 
"https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 1, "crawler_params": {"headless": True}, } @@ -235,7 +235,7 @@ async def test_priority_handling(): # Submit high priority task high_priority = { - "urls": "https://www.nbcnews.com/business/consumer", + "urls": ["https://www.nbcnews.com/business/consumer"], "priority": 10, "crawler_params": {"headless": True}, } From 039be1b1ce7e32d1186ce9d1b123605248f3fb26 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Wed, 30 Apr 2025 11:41:35 +0200 Subject: [PATCH 08/37] feat: add pdf2image dependency to requirements --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 4aa2dbff..b695f92c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,4 +23,5 @@ rich>=13.9.4 cssselect>=1.2.0 chardet>=5.2.0 brotli>=1.1.0 -fake-useragent>=2.2.0 \ No newline at end of file +fake-useragent>=2.2.0 +pdf2image>=1.17.0 \ No newline at end of file From 1d6a2b9979d530703ec76708a385a2d87a1b5f7d Mon Sep 17 00:00:00 2001 From: ntohidi Date: Wed, 30 Apr 2025 12:29:17 +0200 Subject: [PATCH 09/37] fix(crawler): surface real redirect status codes and keep redirect chain. the 30x response instead of always returning 200. Refs #660 --- crawl4ai/async_crawler_strategy.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 3162bd54..da5490b6 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -744,12 +744,33 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): "after_goto", page, context=context, url=url, response=response, config=config ) + # ────────────────────────────────────────────────────────────── + # Walk the redirect chain. 
Playwright returns only the last + # hop, so we trace the `request.redirected_from` links until the + # first response that differs from the final one and surface its + # status-code. + # ────────────────────────────────────────────────────────────── if response is None: status_code = 200 response_headers = {} else: - status_code = response.status - response_headers = response.headers + first_resp = response + req = response.request + while req and req.redirected_from: + prev_req = req.redirected_from + prev_resp = await prev_req.response() + if prev_resp: # keep earliest + first_resp = prev_resp + req = prev_req + + status_code = first_resp.status + response_headers = first_resp.headers + # if response is None: + # status_code = 200 + # response_headers = {} + # else: + # status_code = response.status + # response_headers = response.headers else: status_code = 200 From e0cd3e10de0b04079c2144c6febb54cd74139f50 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Fri, 2 May 2025 10:35:35 +0200 Subject: [PATCH 10/37] fix(crawler): initialize captured_console variable for local file processing --- crawl4ai/async_crawler_strategy.py | 1 + 1 file changed, 1 insertion(+) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index da5490b6..6c0b4115 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -445,6 +445,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): return await self._crawl_web(url, config) elif url.startswith("file://"): + captured_console = None # Process local file local_file_path = url[7:] # Remove 'file://' prefix if not os.path.exists(local_file_path): From 12783fabdab1cdea99e930392c572e83831897df Mon Sep 17 00:00:00 2001 From: ntohidi Date: Wed, 7 May 2025 11:18:13 +0200 Subject: [PATCH 11/37] fix(dependencies): update pillow version constraint to allow newer releases. 
ref #709 --- pyproject.toml | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index be44397e..8b5f0910 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ dependencies = [ "lxml~=5.3", "litellm>=1.53.1", "numpy>=1.26.0,<3", - "pillow~=10.4", + "pillow>=10.4", "playwright>=1.49.0", "python-dotenv~=1.0", "requests~=2.26", diff --git a/requirements.txt b/requirements.txt index b695f92c..10d7fd81 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ aiosqlite~=0.20 lxml~=5.3 litellm>=1.53.1 numpy>=1.26.0,<3 -pillow~=10.4 +pillow>=10.4 playwright>=1.49.0 python-dotenv~=1.0 requests~=2.26 From eebb8c84f0a434f6cec4173a82c8b4dceb510037 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Wed, 7 May 2025 11:18:44 +0200 Subject: [PATCH 12/37] fix(requirements): add PyPDF2 dependency for PDF processing --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 10d7fd81..b62575d8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,4 +24,5 @@ cssselect>=1.2.0 chardet>=5.2.0 brotli>=1.1.0 fake-useragent>=2.2.0 -pdf2image>=1.17.0 \ No newline at end of file +pdf2image>=1.17.0 +PyPDF2>=3.0.1 \ No newline at end of file From 2b17f234f8354dca893063b68aa3ec41431c5d3c Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Wed, 7 May 2025 15:20:36 +0530 Subject: [PATCH 13/37] docs: update direct passing of content_filter to CrawlerRunConfig and instead pass it via MarkdownGenerator. 
Ref: #603 --- deploy/docker/c4ai-doc-context.md | 11 ++++++++--- docs/md_v2/core/markdown-generation.md | 9 ++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/deploy/docker/c4ai-doc-context.md b/deploy/docker/c4ai-doc-context.md index 1642f85e..081f29b7 100644 --- a/deploy/docker/c4ai-doc-context.md +++ b/deploy/docker/c4ai-doc-context.md @@ -403,7 +403,7 @@ async def main(): md_generator = DefaultMarkdownGenerator( content_filter=filter, - options={"ignore_links": True} + options={"ignore_links": True}) # 4) Crawler run config: skip cache, use extraction run_conf = CrawlerRunConfig( @@ -4152,7 +4152,7 @@ prune_filter = PruningContentFilter( For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure: ```python -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator from crawl4ai.content_filter_strategy import LLMContentFilter async def main(): @@ -4175,8 +4175,13 @@ async def main(): verbose=True ) + md_generator = DefaultMarkdownGenerator( + content_filter=filter, + options={"ignore_links": True} + ) + config = CrawlerRunConfig( - content_filter=filter + markdown_generator=md_generator ) async with AsyncWebCrawler() as crawler: diff --git a/docs/md_v2/core/markdown-generation.md b/docs/md_v2/core/markdown-generation.md index e6f5e12a..4a6e9218 100644 --- a/docs/md_v2/core/markdown-generation.md +++ b/docs/md_v2/core/markdown-generation.md @@ -233,7 +233,7 @@ prune_filter = PruningContentFilter( For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. 
This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure: ```python -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator from crawl4ai.content_filter_strategy import LLMContentFilter async def main(): @@ -255,9 +255,12 @@ async def main(): chunk_token_threshold=4096, # Adjust based on your needs verbose=True ) - + md_generator = DefaultMarkdownGenerator( + content_filter=filter, + options={"ignore_links": True} + ) config = CrawlerRunConfig( - content_filter=filter + markdown_generator=md_generator, ) async with AsyncWebCrawler() as crawler: From ee93acbd06c49ce70e3905f267fd15711b39446b Mon Sep 17 00:00:00 2001 From: ntohidi Date: Wed, 7 May 2025 12:32:38 +0200 Subject: [PATCH 14/37] fix(async_playwright_crawler): use config directly instead of self.config for verbosity check --- crawl4ai/async_crawler_strategy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index a6aae4e7..85c3a15c 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -816,7 +816,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): except Error: visibility_info = await self.check_visibility(page) - if self.config.verbose: + if self.verbose: self.logger.debug( message="Body visibility info: {info}", tag="DEBUG", From f6e25e2a6bae8a1b774b6e71fc98edc460d04b53 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Wed, 7 May 2025 17:53:30 +0530 Subject: [PATCH 15/37] fix: check_robots_txt to support wildcard rules ref: #699 --- crawl4ai/utils.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index bfa8ce9d..4018d78c 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -42,6 +42,29 
@@ from itertools import chain from collections import deque from typing import Generator, Iterable +# Monkey patch to fix wildcard handling in urllib.robotparser +from urllib.robotparser import RuleLine +import re + +original_applies_to = RuleLine.applies_to + +def patched_applies_to(self, filename): + # Handle wildcards in paths + if '*' in self.path or '%2A' in self.path or self.path in ("*", "%2A"): + pattern = self.path.replace('%2A', '*') + pattern = re.escape(pattern).replace('\\*', '.*') + pattern = '^' + pattern + if pattern.endswith('\\$'): + pattern = pattern[:-2] + '$' + try: + return bool(re.match(pattern, filename)) + except re.error: + return original_applies_to(self, filename) + return original_applies_to(self, filename) + +RuleLine.applies_to = patched_applies_to +# Monkey patch ends + def chunk_documents( documents: Iterable[str], chunk_token_threshold: int, @@ -303,7 +326,7 @@ class RobotsParser: robots_url = f"{scheme}://{domain}/robots.txt" async with aiohttp.ClientSession() as session: - async with session.get(robots_url, timeout=2) as response: + async with session.get(robots_url, timeout=2, ssl=False) as response: if response.status == 200: rules = await response.text() self._cache_rules(domain, rules) From c1041b9bbee1338ec89997bdf62e76c6a5f3ada6 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Wed, 7 May 2025 18:43:29 +0530 Subject: [PATCH 16/37] fix: exclude_external_images flag simply discards elements ref:https://github.com/unclecode/crawl4ai/issues/345 --- crawl4ai/content_scraping_strategy.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 1dfbce84..d11e02d0 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -718,13 +718,18 @@ class WebScrapingStrategy(ContentScrapingStrategy): # Check flag if we should remove external images if 
kwargs.get("exclude_external_images", False): - element.decompose() - return False - # src_url_base = src.split('/')[2] - # url_base = url.split('/')[2] - # if url_base not in src_url_base: - # element.decompose() - # return False + # Handle relative URLs (which are always from the same domain) + if not src.startswith('http') and not src.startswith('//'): + return True # Keep relative URLs + + # For absolute URLs, compare the base domains using the existing function + src_base_domain = get_base_domain(src) + url_base_domain = get_base_domain(url) + + # If the domains don't match and both are valid, the image is external + if src_base_domain and url_base_domain and src_base_domain != url_base_domain: + element.decompose() + return False # if kwargs.get('exclude_social_media_links', False): # if image_src_base_domain in exclude_social_media_domains: From 25d97d56e4e3bbc74fa1de9423cc5ae3457b0baf Mon Sep 17 00:00:00 2001 From: ntohidi Date: Tue, 13 May 2025 13:56:12 +0200 Subject: [PATCH 17/37] fix(dependencies): remove duplicated aiofiles from project dependencies. REF #1045 --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8b5f0910..a208d5d8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,6 @@ dependencies = [ "psutil>=6.1.1", "nltk>=3.9.1", "playwright", - "aiofiles", "rich>=13.9.4", "cssselect>=1.2.0", "httpx>=0.27.2", From 260e2dc347e2d0b4463eec31f3eaa81e87ca109b Mon Sep 17 00:00:00 2001 From: ntohidi Date: Tue, 13 May 2025 14:03:20 +0200 Subject: [PATCH 18/37] fix(browser): create browser config before launching managed browser instance. 
REF: https://discord.com/channels/1278297938551902308/1278298697540567132/1371683009459392716 --- crawl4ai/browser_profiler.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/crawl4ai/browser_profiler.py b/crawl4ai/browser_profiler.py index 961ba740..41f917f5 100644 --- a/crawl4ai/browser_profiler.py +++ b/crawl4ai/browser_profiler.py @@ -615,9 +615,18 @@ class BrowserProfiler: self.logger.info(f"Debugging port: {debugging_port}", tag="CDP") self.logger.info(f"Headless mode: {headless}", tag="CDP") + # create browser config + browser_config = BrowserConfig( + browser_type=browser_type, + headless=headless, + user_data_dir=profile_path, + debugging_port=debugging_port, + verbose=True + ) + # Create managed browser instance managed_browser = ManagedBrowser( - browser_type=browser_type, + browser_config=browser_config, user_data_dir=profile_path, headless=headless, logger=self.logger, From 137556b3dce373bfd8af09e8bd5f9da0051ba463 Mon Sep 17 00:00:00 2001 From: medo94my Date: Wed, 14 May 2025 16:01:10 +0800 Subject: [PATCH 19/37] fix the EXTRACT to match the styling of the other methods --- crawl4ai/async_webcrawler.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 19b98522..9e42b824 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -360,7 +360,7 @@ class AsyncWebCrawler: pdf_data=pdf_data, verbose=config.verbose, is_raw_html=True if url.startswith("raw:") else False, - redirected_url=async_response.redirected_url, + redirected_url=async_response.redirected_url, **kwargs, ) @@ -503,7 +503,7 @@ class AsyncWebCrawler: tables = media.pop("tables", []) links = result.links.model_dump() metadata = result.metadata - + fit_html = preprocess_html_for_schema(html_content=html, text_threshold= 500, max_size= 300_000) ################################ @@ -585,11 +585,13 @@ class AsyncWebCrawler: # Choose content based on 
input_format content_format = config.extraction_strategy.input_format if content_format == "fit_markdown" and not markdown_result.fit_markdown: - self.logger.warning( - message="Fit markdown requested but not available. Falling back to raw markdown.", - tag="EXTRACT", - params={"url": _url}, - ) + + self.logger.url_status( + url=_url, + success=bool(html), + timing=time.perf_counter() - t1, + tag="EXTRACT", + ) content_format = "markdown" content = { From a3b0cab52a813f505db0f58e40079b4e8d817a6a Mon Sep 17 00:00:00 2001 From: Ahmed-Tawfik94 Date: Thu, 15 May 2025 11:25:06 +0800 Subject: [PATCH 20/37] #1088 is sloved flag -bc now if for --byPass-cache --- crawl4ai/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/cli.py b/crawl4ai/cli.py index 51477d6b..a02eff59 100644 --- a/crawl4ai/cli.py +++ b/crawl4ai/cli.py @@ -1010,7 +1010,7 @@ def cdp_cmd(user_data_dir: Optional[str], port: int, browser_type: str, headless @click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2") @click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "md", "markdown-fit", "md-fit"]), default="all") @click.option("--output-file", "-O", type=click.Path(), help="Output file path (default: stdout)") -@click.option("--bypass-cache", "-b", is_flag=True, default=True, help="Bypass cache when crawling") +@click.option("--bypass-cache", "-bc", is_flag=True, default=True, help="Bypass cache when crawling") @click.option("--question", "-q", help="Ask a question about the crawled content") @click.option("--verbose", "-v", is_flag=True) @click.option("--profile", "-p", help="Use a specific browser profile (by name)") From 32966bea11dc595d752f89502bbe7e0a2240ba28 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Thu, 15 May 2025 10:09:19 +0200 Subject: [PATCH 21/37] fix(extraction): resolve `'str' object has no attribute 'choices'` error in LLMExtractionStrategy. 
Refs: #979 This patch ensures consistent handling of `response.choices[0].message.content` by avoiding redefinition of the `response` variable, which caused downstream exceptions during error handling. --- crawl4ai/extraction_strategy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index 245abc54..6be084b3 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -680,7 +680,7 @@ class LLMExtractionStrategy(ExtractionStrategy): block["error"] = False except Exception: parsed, unparsed = split_and_parse_json_objects( - response.choices[0].message.content + response ) blocks = parsed if unparsed: From e0fbd2b0a0488569ac1d5e89a6363d11a00c2b25 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Thu, 15 May 2025 10:45:23 +0200 Subject: [PATCH 22/37] fix(schema): update `f` parameter description to use lowercase enum values. REF: #1070 Revised the description for the `f` parameter in the `/mcp/md` tool schema to use lowercase enum values (`raw`, `fit`, `bm25`, `llm`) for consistency with the actual `enum` definition. This change prevents LLM-based clients (e.g., Gemini via LibreChat) from generating uppercase values like `"FIT"`, which caused 422 validation errors due to strict case-sensitive matching. 
--- deploy/docker/schemas.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/deploy/docker/schemas.py b/deploy/docker/schemas.py index ea32b6c6..611e17e9 100644 --- a/deploy/docker/schemas.py +++ b/deploy/docker/schemas.py @@ -12,8 +12,7 @@ class CrawlRequest(BaseModel): class MarkdownRequest(BaseModel): """Request body for the /md endpoint.""" url: str = Field(..., description="Absolute http/https URL to fetch") - f: FilterType = Field(FilterType.FIT, - description="Content‑filter strategy: FIT, RAW, BM25, or LLM") + f: FilterType = Field(FilterType.FIT, description="Content‑filter strategy: fit, raw, bm25, or llm") q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters") c: Optional[str] = Field("0", description="Cache‑bust / revision counter") From 22725ca87b76107f5251e1ab97906ecfd61fac07 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Thu, 15 May 2025 11:29:36 +0200 Subject: [PATCH 23/37] fix(crawler): initialize `captured_console` to prevent unbound local error for local HTML files. REF: #1072 Resolved a bug where running the crawler on local HTML files with `capture_console_messages=False` (default) raised `UnboundLocalError` due to `captured_console` being accessed before assignment. 
--- crawl4ai/async_crawler_strategy.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 10d395ee..9a8d621c 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -445,6 +445,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): return await self._crawl_web(url, config) elif url.startswith("file://"): + # initialize empty lists for console messages + captured_console = [] + # Process local file local_file_path = url[7:] # Remove 'file://' prefix if not os.path.exists(local_file_path): From faa98eefbc4f3f87f8751bbb5c534cba4f8507c1 Mon Sep 17 00:00:00 2001 From: Ahmed-Tawfik94 Date: Mon, 19 May 2025 11:35:13 +0800 Subject: [PATCH 24/37] #1105 got fixed (metadata now matches with meta property article:* --- crawl4ai/utils.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index bfa8ce9d..ebf15f24 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -1456,6 +1456,12 @@ def extract_metadata_using_lxml(html, doc=None): content = tag.get("content", "").strip() if property_name and content: metadata[property_name] = content + # getting the article Values + metadata.update({ + tag['property'].strip():tag["content"].strip() + for tag in head.find_all("meta", attrs={"property": re.compile(r"^article:")}) + if tag.has_attr('property') and tag.has_attr('content') + }) return metadata @@ -1531,7 +1537,12 @@ def extract_metadata(html, soup=None): content = tag.get("content", "").strip() if property_name and content: metadata[property_name] = content - + # getting the article Values + metadata.update({ + tag['property'].strip():tag["content"].strip() + for tag in head.find_all("meta", attrs={"property": re.compile(r"^article:")}) + if tag.has_attr('property') and tag.has_attr('content') + }) return metadata From 137ac014fb986f7df4e3cd8d8598b6120e05a20c Mon Sep 17 00:00:00 2001 
From: Ahmed-Tawfik94 Date: Mon, 19 May 2025 13:48:02 +0800 Subject: [PATCH 25/37] #1105 :fix(metadata): optimize article metadata extraction using XPath for improved performance --- crawl4ai/utils.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index ebf15f24..64d4b210 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -1456,12 +1456,13 @@ def extract_metadata_using_lxml(html, doc=None): content = tag.get("content", "").strip() if property_name and content: metadata[property_name] = content - # getting the article Values - metadata.update({ - tag['property'].strip():tag["content"].strip() - for tag in head.find_all("meta", attrs={"property": re.compile(r"^article:")}) - if tag.has_attr('property') and tag.has_attr('content') - }) + # Article metadata - using starts-with() for performance + article_tags = head.xpath('.//meta[starts-with(@property, "article:")]') + for tag in article_tags: + property_name = tag.get("property", "").strip() + content = tag.get("content", "").strip() + if property_name and content: + metadata[property_name] = content return metadata From b4fc60a5552c4c89b6d6893ecd45910eda9219ae Mon Sep 17 00:00:00 2001 From: Ahmed-Tawfik94 Date: Mon, 19 May 2025 13:51:16 +0800 Subject: [PATCH 26/37] #1103 fix(url): enhance URL normalization to handle invalid schemes and trailing slashes --- crawl4ai/utils.py | 14 +++--- tests/test_normalize_url.py | 91 +++++++++++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+), 6 deletions(-) create mode 100644 tests/test_normalize_url.py diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 64d4b210..46207ca7 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -2012,14 +2012,16 @@ def normalize_url(href, base_url): parsed_base = urlparse(base_url) if not parsed_base.scheme or not parsed_base.netloc: raise ValueError(f"Invalid base URL format: {base_url}") - - # Ensure base_url ends with a trailing slash if it's a 
directory path - if not base_url.endswith('/'): - base_url = base_url + '/' + + if parsed_base.scheme.lower() not in ["http", "https"]: + # Handle special protocols + raise ValueError(f"Invalid base URL format: {base_url}") + cleaned_href = href.strip() # Use urljoin to handle all cases - normalized = urljoin(base_url, href.strip()) - return normalized + return urljoin(base_url, cleaned_href) + + def normalize_url_for_deep_crawl(href, base_url): diff --git a/tests/test_normalize_url.py b/tests/test_normalize_url.py new file mode 100644 index 00000000..b1f1cc7d --- /dev/null +++ b/tests/test_normalize_url.py @@ -0,0 +1,91 @@ +import unittest +from crawl4ai.utils import normalize_url + +class TestNormalizeUrl(unittest.TestCase): + + def test_basic_relative_path(self): + self.assertEqual(normalize_url("path/to/page.html", "http://example.com/base/"), "http://example.com/base/path/to/page.html") + + def test_base_url_with_trailing_slash(self): + self.assertEqual(normalize_url("page.html", "http://example.com/base/"), "http://example.com/base/page.html") + + def test_base_url_without_trailing_slash(self): + # If normalize_url correctly uses urljoin, "base" is treated as a file. + self.assertEqual(normalize_url("page.html", "http://example.com/base"), "http://example.com/page.html") + + def test_absolute_url_as_href(self): + self.assertEqual(normalize_url("http://another.com/page.html", "http://example.com/"), "http://another.com/page.html") + + def test_href_with_leading_trailing_spaces(self): + self.assertEqual(normalize_url(" page.html ", "http://example.com/"), "http://example.com/page.html") + + def test_empty_href(self): + # urljoin with an empty href and base ending in '/' returns the base. + self.assertEqual(normalize_url("", "http://example.com/base/"), "http://example.com/base/") + # urljoin with an empty href and base not ending in '/' also returns base. 
+ self.assertEqual(normalize_url("", "http://example.com/base"), "http://example.com/base") + + def test_href_with_query_parameters(self): + self.assertEqual(normalize_url("page.html?query=test", "http://example.com/"), "http://example.com/page.html?query=test") + + def test_href_with_fragment(self): + self.assertEqual(normalize_url("page.html#section", "http://example.com/"), "http://example.com/page.html#section") + + def test_different_scheme_in_href(self): + self.assertEqual(normalize_url("https://secure.example.com/page.html", "http://example.com/"), "https://secure.example.com/page.html") + + def test_parent_directory_in_href(self): + self.assertEqual(normalize_url("../otherpage.html", "http://example.com/base/current/"), "http://example.com/base/otherpage.html") + + def test_root_relative_href(self): + self.assertEqual(normalize_url("/otherpage.html", "http://example.com/base/current/"), "http://example.com/otherpage.html") + + def test_base_url_with_path_and_no_trailing_slash(self): + # If normalize_url correctly uses urljoin, "path" is treated as a file. + self.assertEqual(normalize_url("file.html", "http://example.com/path"), "http://example.com/file.html") + + def test_base_url_is_just_domain(self): + self.assertEqual(normalize_url("page.html", "http://example.com"), "http://example.com/page.html") + + def test_href_is_only_query(self): + self.assertEqual(normalize_url("?query=true", "http://example.com/page.html"), "http://example.com/page.html?query=true") + + def test_href_is_only_fragment(self): + self.assertEqual(normalize_url("#fragment", "http://example.com/page.html"), "http://example.com/page.html#fragment") + + def test_relative_link_from_base_file_url(self): + """ + Tests the specific bug report: relative links from a base URL that is a file. 
+ Example: + Page URL: http://example.com/path/to/document.html + Link on page: + Expected: http://example.com/path/to/file.xlsx + """ + base_url_file = "http://example.com/zwgk/fdzdgk/zdxx/spaq/t19360680.shtml" + href_relative_current_dir = "./P020241203375994691134.xlsx" + expected_url1 = "http://example.com/zwgk/fdzdgk/zdxx/spaq/P020241203375994691134.xlsx" + self.assertEqual(normalize_url(href_relative_current_dir, base_url_file), expected_url1) + + # Test with a relative link that doesn't start with "./" + href_relative_no_dot_slash = "another.doc" + expected_url2 = "http://example.com/zwgk/fdzdgk/zdxx/spaq/another.doc" + self.assertEqual(normalize_url(href_relative_no_dot_slash, base_url_file), expected_url2) + + def test_invalid_base_url_scheme(self): + with self.assertRaises(ValueError) as context: + normalize_url("page.html", "ftp://example.com/") + self.assertIn("Invalid base URL format", str(context.exception)) + + def test_invalid_base_url_netloc(self): + with self.assertRaises(ValueError) as context: + normalize_url("page.html", "http:///path/") + self.assertIn("Invalid base URL format", str(context.exception)) + + def test_base_url_with_port(self): + self.assertEqual(normalize_url("path/file.html", "http://example.com:8080/base/"), "http://example.com:8080/base/path/file.html") + + def test_href_with_special_characters(self): + self.assertEqual(normalize_url("path%20with%20spaces/file.html", "http://example.com/"), "http://example.com/path%20with%20spaces/file.html") + +if __name__ == '__main__': + unittest.main() \ No newline at end of file From a97654270b9b5ba89ed7d3a1bb616bf2f8417203 Mon Sep 17 00:00:00 2001 From: Ahmed-Tawfik94 Date: Mon, 19 May 2025 14:11:46 +0800 Subject: [PATCH 27/37] #1086 fix(markdown): update BM25 filter to use language parameter for stemming --- docs/md_v2/core/markdown-generation.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/md_v2/core/markdown-generation.md 
b/docs/md_v2/core/markdown-generation.md index e6f5e12a..e897b2bb 100644 --- a/docs/md_v2/core/markdown-generation.md +++ b/docs/md_v2/core/markdown-generation.md @@ -187,7 +187,7 @@ from crawl4ai import CrawlerRunConfig bm25_filter = BM25ContentFilter( user_query="machine learning", bm25_threshold=1.2, - use_stemming=True + language="english" ) md_generator = DefaultMarkdownGenerator( @@ -200,7 +200,7 @@ config = CrawlerRunConfig(markdown_generator=md_generator) - **`user_query`**: The term you want to focus on. BM25 tries to keep only content blocks relevant to that query. - **`bm25_threshold`**: Raise it to keep fewer blocks; lower it to keep more. -- **`use_stemming`**: If `True`, variations of words match (e.g., “learn,” “learning,” “learnt”). +- **`language (str)`**: Language for stemming (default: 'english'). **No query provided?** BM25 tries to glean a context from page metadata, or you can simply treat it as a scorched-earth approach that discards text with low generic score. Realistically, you want to supply a query for best results. From a55c2b3f88371570a5683be59e40f8ea609b0a19 Mon Sep 17 00:00:00 2001 From: Ahmed-Tawfik94 Date: Mon, 19 May 2025 16:32:22 +0800 Subject: [PATCH 28/37] refactor(logging): update extraction logging to use url_status method --- crawl4ai/async_webcrawler.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 9e42b824..cb221b72 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -615,11 +615,12 @@ class AsyncWebCrawler: ) # Log extraction completion - self.logger.info( - message="Completed for {url:.50}... 
| Time: {timing}s", - tag="EXTRACT", - params={"url": _url, "timing": time.perf_counter() - t1}, - ) + self.logger.url_status( + url=_url, + success=bool(html), + timing=time.perf_counter() - t1, + tag="EXTRACT", + ) # Apply HTML formatting if requested if config.prettiify: From cb8d581e477daf1a310f504847cbbcafb7e8e07e Mon Sep 17 00:00:00 2001 From: ntohidi Date: Mon, 19 May 2025 18:03:05 +0200 Subject: [PATCH 29/37] fix(docs): update CrawlerRunConfig to use CacheMode for bypassing cache. REF: #1125 --- deploy/docker/c4ai-doc-context.md | 20 ++++++++++---------- docs/md_v2/core/local-files.md | 16 ++++++++-------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/deploy/docker/c4ai-doc-context.md b/deploy/docker/c4ai-doc-context.md index 1642f85e..5b5a81bb 100644 --- a/deploy/docker/c4ai-doc-context.md +++ b/deploy/docker/c4ai-doc-context.md @@ -3760,11 +3760,11 @@ To crawl a live web page, provide the URL starting with `http://` or `https://`, ```python import asyncio -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, CacheMode from crawl4ai.async_configs import CrawlerRunConfig async def crawl_web(): - config = CrawlerRunConfig(bypass_cache=True) + config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) async with AsyncWebCrawler() as crawler: result = await crawler.arun( url="https://en.wikipedia.org/wiki/apple", @@ -3785,13 +3785,13 @@ To crawl a local HTML file, prefix the file path with `file://`. 
```python import asyncio -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, CacheMode from crawl4ai.async_configs import CrawlerRunConfig async def crawl_local_file(): local_file_path = "/path/to/apple.html" # Replace with your file path file_url = f"file://{local_file_path}" - config = CrawlerRunConfig(bypass_cache=True) + config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) async with AsyncWebCrawler() as crawler: result = await crawler.arun(url=file_url, config=config) @@ -3810,13 +3810,13 @@ To crawl raw HTML content, prefix the HTML string with `raw:`. ```python import asyncio -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, CacheMode from crawl4ai.async_configs import CrawlerRunConfig async def crawl_raw_html(): raw_html = "
<html><body><h1>Hello, World!</h1></body></html>
" raw_html_url = f"raw:{raw_html}" - config = CrawlerRunConfig(bypass_cache=True) + config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) async with AsyncWebCrawler() as crawler: result = await crawler.arun(url=raw_html_url, config=config) @@ -3845,7 +3845,7 @@ import os import sys import asyncio from pathlib import Path -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, CacheMode from crawl4ai.async_configs import CrawlerRunConfig async def main(): @@ -3856,7 +3856,7 @@ async def main(): async with AsyncWebCrawler() as crawler: # Step 1: Crawl the Web URL print("\n=== Step 1: Crawling the Wikipedia URL ===") - web_config = CrawlerRunConfig(bypass_cache=True) + web_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) result = await crawler.arun(url=wikipedia_url, config=web_config) if not result.success: @@ -3871,7 +3871,7 @@ async def main(): # Step 2: Crawl from the Local HTML File print("=== Step 2: Crawling from the Local HTML File ===") file_url = f"file://{html_file_path.resolve()}" - file_config = CrawlerRunConfig(bypass_cache=True) + file_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) local_result = await crawler.arun(url=file_url, config=file_config) if not local_result.success: @@ -3887,7 +3887,7 @@ async def main(): with open(html_file_path, 'r', encoding='utf-8') as f: raw_html_content = f.read() raw_html_url = f"raw:{raw_html_content}" - raw_config = CrawlerRunConfig(bypass_cache=True) + raw_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) raw_result = await crawler.arun(url=raw_html_url, config=raw_config) if not raw_result.success: diff --git a/docs/md_v2/core/local-files.md b/docs/md_v2/core/local-files.md index ddf27f8c..31fe7792 100644 --- a/docs/md_v2/core/local-files.md +++ b/docs/md_v2/core/local-files.md @@ -8,11 +8,11 @@ To crawl a live web page, provide the URL starting with `http://` or `https://`, ```python import asyncio -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, 
CacheMode from crawl4ai.async_configs import CrawlerRunConfig async def crawl_web(): - config = CrawlerRunConfig(bypass_cache=True) + config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) async with AsyncWebCrawler() as crawler: result = await crawler.arun( url="https://en.wikipedia.org/wiki/apple", @@ -33,13 +33,13 @@ To crawl a local HTML file, prefix the file path with `file://`. ```python import asyncio -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, CacheMode from crawl4ai.async_configs import CrawlerRunConfig async def crawl_local_file(): local_file_path = "/path/to/apple.html" # Replace with your file path file_url = f"file://{local_file_path}" - config = CrawlerRunConfig(bypass_cache=True) + config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) async with AsyncWebCrawler() as crawler: result = await crawler.arun(url=file_url, config=config) @@ -93,7 +93,7 @@ import os import sys import asyncio from pathlib import Path -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, CacheMode from crawl4ai.async_configs import CrawlerRunConfig async def main(): @@ -104,7 +104,7 @@ async def main(): async with AsyncWebCrawler() as crawler: # Step 1: Crawl the Web URL print("\n=== Step 1: Crawling the Wikipedia URL ===") - web_config = CrawlerRunConfig(bypass_cache=True) + web_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) result = await crawler.arun(url=wikipedia_url, config=web_config) if not result.success: @@ -119,7 +119,7 @@ async def main(): # Step 2: Crawl from the Local HTML File print("=== Step 2: Crawling from the Local HTML File ===") file_url = f"file://{html_file_path.resolve()}" - file_config = CrawlerRunConfig(bypass_cache=True) + file_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) local_result = await crawler.arun(url=file_url, config=file_config) if not local_result.success: @@ -135,7 +135,7 @@ async def main(): with open(html_file_path, 'r', encoding='utf-8') as f: raw_html_content = 
f.read() raw_html_url = f"raw:{raw_html_content}" - raw_config = CrawlerRunConfig(bypass_cache=True) + raw_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) raw_result = await crawler.arun(url=raw_html_url, config=raw_config) if not raw_result.success: From 984524ca1c4cb394a8a18d353ac1b45cdc1cca7d Mon Sep 17 00:00:00 2001 From: Ahmed-Tawfik94 Date: Wed, 21 May 2025 13:26:11 +0800 Subject: [PATCH 30/37] fix(auth): add token authorization header in request preparation to ensure authenticated requests are made --- crawl4ai/docker_client.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/crawl4ai/docker_client.py b/crawl4ai/docker_client.py index f4816eb5..4e33431f 100644 --- a/crawl4ai/docker_client.py +++ b/crawl4ai/docker_client.py @@ -73,6 +73,8 @@ class Crawl4aiDockerClient: def _prepare_request(self, urls: List[str], browser_config: Optional[BrowserConfig] = None, crawler_config: Optional[CrawlerRunConfig] = None) -> Dict[str, Any]: """Prepare request data from configs.""" + if self._token: + self._http_client.headers["Authorization"] = f"Bearer {self._token}" return { "urls": urls, "browser_config": browser_config.dump() if browser_config else {}, @@ -103,8 +105,6 @@ class Crawl4aiDockerClient: crawler_config: Optional[CrawlerRunConfig] = None ) -> Union[CrawlResult, List[CrawlResult], AsyncGenerator[CrawlResult, None]]: """Execute a crawl operation.""" - if not self._token: - raise Crawl4aiClientError("Authentication required. Call authenticate() first.") await self._check_server() data = self._prepare_request(urls, browser_config, crawler_config) @@ -140,8 +140,6 @@ class Crawl4aiDockerClient: async def get_schema(self) -> Dict[str, Any]: """Retrieve configuration schemas.""" - if not self._token: - raise Crawl4aiClientError("Authentication required. 
Call authenticate() first.") response = await self._request("GET", "/schema") return response.json() @@ -167,4 +165,4 @@ async def main(): print(schema) if __name__ == "__main__": - asyncio.run(main()) \ No newline at end of file + asyncio.run(main()) From 33a0c7a17adfcc5c023aba4183d0de63a4f2dffd Mon Sep 17 00:00:00 2001 From: ntohidi Date: Thu, 22 May 2025 11:17:28 +0200 Subject: [PATCH 31/37] fix(logger): add RED color to LogColor enum for enhanced logging options --- crawl4ai/async_logger.py | 1 + 1 file changed, 1 insertion(+) diff --git a/crawl4ai/async_logger.py b/crawl4ai/async_logger.py index 49c7ee6f..9fb1e8e7 100644 --- a/crawl4ai/async_logger.py +++ b/crawl4ai/async_logger.py @@ -39,6 +39,7 @@ class LogColor(str, Enum): YELLOW = "yellow" MAGENTA = "magenta" DIM_MAGENTA = "dim magenta" + RED = "red" def __str__(self): """Automatically convert rich color to string.""" From da8f0dbb931e7701bde807186bd9e9ae32cde114 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Thu, 22 May 2025 11:25:51 +0200 Subject: [PATCH 32/37] fix(browser_profiler): change logger print to info for consistent logging in interactive manager --- crawl4ai/browser_profiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/browser_profiler.py b/crawl4ai/browser_profiler.py index 961ba740..41efd4b0 100644 --- a/crawl4ai/browser_profiler.py +++ b/crawl4ai/browser_profiler.py @@ -458,7 +458,7 @@ class BrowserProfiler: self.logger.info("4. 
Exit", tag="MENU", base_color=LogColor.MAGENTA) exit_option = "4" - self.logger.print(f"\n[cyan]Enter your choice (1-{exit_option}): [/cyan]", end="") + self.logger.info(f"\n[cyan]Enter your choice (1-{exit_option}): [/cyan]", end="") choice = input() if choice == "1": From 3d46d89759da93702f2dbd2c7f931389298afbb1 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Thu, 22 May 2025 17:21:42 +0530 Subject: [PATCH 33/37] docs: fix https://github.com/unclecode/crawl4ai/issues/1109 --- docs/md_v2/advanced/proxy-security.md | 80 ++++++++++++++++++--------- 1 file changed, 53 insertions(+), 27 deletions(-) diff --git a/docs/md_v2/advanced/proxy-security.md b/docs/md_v2/advanced/proxy-security.md index 0e56572c..13191cd7 100644 --- a/docs/md_v2/advanced/proxy-security.md +++ b/docs/md_v2/advanced/proxy-security.md @@ -25,44 +25,70 @@ Use an authenticated proxy with `BrowserConfig`: ```python from crawl4ai.async_configs import BrowserConfig -proxy_config = { - "server": "http://proxy.example.com:8080", - "username": "user", - "password": "pass" -} - -browser_config = BrowserConfig(proxy_config=proxy_config) +browser_config = BrowserConfig(proxy="http://[username]:[password]@[host]:[port]") async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun(url="https://example.com") ``` -Here's the corrected documentation: ## Rotating Proxies Example using a proxy rotation service dynamically: ```python -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig - -async def get_next_proxy(): - # Your proxy rotation logic here - return {"server": "http://next.proxy.com:8080"} - +import re +from crawl4ai import ( + AsyncWebCrawler, + BrowserConfig, + CrawlerRunConfig, + CacheMode, + RoundRobinProxyStrategy, +) +import asyncio +from crawl4ai import ProxyConfig async def main(): - browser_config = BrowserConfig() - run_config = CrawlerRunConfig() - - async with AsyncWebCrawler(config=browser_config) as crawler: - # For each URL, create a new 
run config with different proxy - for url in urls: - proxy = await get_next_proxy() - # Clone the config and update proxy - this creates a new browser context - current_config = run_config.clone(proxy_config=proxy) - result = await crawler.arun(url=url, config=current_config) + # Load proxies and create rotation strategy + proxies = ProxyConfig.from_env() + #eg: export PROXIES="ip1:port1:username1:password1,ip2:port2:username2:password2" + if not proxies: + print("No proxies found in environment. Set PROXIES env variable!") + return + + proxy_strategy = RoundRobinProxyStrategy(proxies) + + # Create configs + browser_config = BrowserConfig(headless=True, verbose=False) + run_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + proxy_rotation_strategy=proxy_strategy + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + urls = ["https://httpbin.org/ip"] * (len(proxies) * 2) # Test each proxy twice + + print("\n📈 Initializing crawler with proxy rotation...") + async with AsyncWebCrawler(config=browser_config) as crawler: + print("\n🚀 Starting batch crawl with proxy rotation...") + results = await crawler.arun_many( + urls=urls, + config=run_config + ) + for result in results: + if result.success: + ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html) + current_proxy = run_config.proxy_config if run_config.proxy_config else None + + if current_proxy and ip_match: + print(f"URL {result.url}") + print(f"Proxy {current_proxy.server} -> Response IP: {ip_match.group(0)}") + verified = ip_match.group(0) == current_proxy.ip + if verified: + print(f"✅ Proxy working! 
IP matches: {current_proxy.ip}") + else: + print("❌ Proxy failed or IP mismatch!") + print("---") + +asyncio.run(main()) -if __name__ == "__main__": - import asyncio - asyncio.run(main()) ``` From b55e27d2ef2bedecae53359fc71f4d0a6771e455 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Mon, 26 May 2025 11:08:23 +0200 Subject: [PATCH 34/37] fix: chanegd error variable name handle_crawl_request, docker api --- deploy/docker/api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deploy/docker/api.py b/deploy/docker/api.py index 732371f7..b728acd1 100644 --- a/deploy/docker/api.py +++ b/deploy/docker/api.py @@ -459,7 +459,7 @@ async def handle_crawl_request( # await crawler.close() # except Exception as close_e: # logger.error(f"Error closing crawler during exception handling: {close_e}") - logger.error(f"Error closing crawler during exception handling: {close_e}") + logger.error(f"Error closing crawler during exception handling: {str(e)}") # Measure memory even on error if possible end_mem_mb_error = _get_memory_mb() @@ -518,7 +518,7 @@ async def handle_stream_crawl_request( # await crawler.close() # except Exception as close_e: # logger.error(f"Error closing crawler during stream setup exception: {close_e}") - logger.error(f"Error closing crawler during stream setup exception: {close_e}") + logger.error(f"Error closing crawler during stream setup exception: {str(e)}") logger.error(f"Stream crawl error: {str(e)}", exc_info=True) # Raising HTTPException here will prevent streaming response raise HTTPException( From 871d4f1158c9b45e3bd869c4f192ec4420fcd932 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Fri, 13 Jun 2025 11:26:05 +0200 Subject: [PATCH 35/37] fix(extraction_strategy): rename response variable to content for clarity in LLMExtractionStrategy. 
ref #1146 --- crawl4ai/extraction_strategy.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index 6be084b3..25ebbd5f 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -656,11 +656,11 @@ class LLMExtractionStrategy(ExtractionStrategy): self.total_usage.total_tokens += usage.total_tokens try: - response = response.choices[0].message.content + content = response.choices[0].message.content blocks = None if self.force_json_response: - blocks = json.loads(response) + blocks = json.loads(content) if isinstance(blocks, dict): # If it has only one key which calue is list then assign that to blocks, exampled: {"news": [..]} if len(blocks) == 1 and isinstance(list(blocks.values())[0], list): @@ -673,14 +673,14 @@ class LLMExtractionStrategy(ExtractionStrategy): blocks = blocks else: # blocks = extract_xml_data(["blocks"], response.choices[0].message.content)["blocks"] - blocks = extract_xml_data(["blocks"], response)["blocks"] + blocks = extract_xml_data(["blocks"], content)["blocks"] blocks = json.loads(blocks) for block in blocks: block["error"] = False except Exception: parsed, unparsed = split_and_parse_json_objects( - response + response.choices[0].message.content ) blocks = parsed if unparsed: From b7a6e02236f9da30c1bb21b8a5bb3dab86d97233 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Wed, 18 Jun 2025 19:04:32 +0200 Subject: [PATCH 36/37] fix: Update pdf and screenshot usage documentation. 
ref #1230 --- deploy/docker/c4ai-doc-context.md | 29 ++++++++++++++++-------- docs/md_v2/advanced/advanced-features.md | 29 ++++++++++++++++-------- 2 files changed, 38 insertions(+), 20 deletions(-) diff --git a/deploy/docker/c4ai-doc-context.md b/deploy/docker/c4ai-doc-context.md index 6591c265..f8b83088 100644 --- a/deploy/docker/c4ai-doc-context.md +++ b/deploy/docker/c4ai-doc-context.md @@ -5433,29 +5433,38 @@ Sometimes you need a visual record of a page or a PDF “printout.” Crawl4AI c ```python import os, asyncio from base64 import b64decode -from crawl4ai import AsyncWebCrawler, CacheMode +from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig async def main(): + run_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + screenshot=True, + pdf=True + ) + async with AsyncWebCrawler() as crawler: result = await crawler.arun( url="https://en.wikipedia.org/wiki/List_of_common_misconceptions", - cache_mode=CacheMode.BYPASS, - pdf=True, - screenshot=True + config=run_config ) - if result.success: - # Save screenshot + print(f"Screenshot data present: {result.screenshot is not None}") + print(f"PDF data present: {result.pdf is not None}") + if result.screenshot: + print(f"[OK] Screenshot captured, size: {len(result.screenshot)} bytes") with open("wikipedia_screenshot.png", "wb") as f: f.write(b64decode(result.screenshot)) - - # Save PDF + else: + print("[WARN] Screenshot data is None.") + if result.pdf: + print(f"[OK] PDF captured, size: {len(result.pdf)} bytes") with open("wikipedia_page.pdf", "wb") as f: f.write(result.pdf) - - print("[OK] PDF & screenshot captured.") + else: + print("[WARN] PDF data is None.") + else: print("[ERROR]", result.error_message) diff --git a/docs/md_v2/advanced/advanced-features.md b/docs/md_v2/advanced/advanced-features.md index b56f216e..3563fd40 100644 --- a/docs/md_v2/advanced/advanced-features.md +++ b/docs/md_v2/advanced/advanced-features.md @@ -66,29 +66,38 @@ Sometimes you need a visual record of a page or a 
PDF “printout.” Crawl4AI c ```python import os, asyncio from base64 import b64decode -from crawl4ai import AsyncWebCrawler, CacheMode +from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig async def main(): + run_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + screenshot=True, + pdf=True + ) + async with AsyncWebCrawler() as crawler: result = await crawler.arun( url="https://en.wikipedia.org/wiki/List_of_common_misconceptions", - cache_mode=CacheMode.BYPASS, - pdf=True, - screenshot=True + config=run_config ) - if result.success: - # Save screenshot + print(f"Screenshot data present: {result.screenshot is not None}") + print(f"PDF data present: {result.pdf is not None}") + if result.screenshot: + print(f"[OK] Screenshot captured, size: {len(result.screenshot)} bytes") with open("wikipedia_screenshot.png", "wb") as f: f.write(b64decode(result.screenshot)) - - # Save PDF + else: + print("[WARN] Screenshot data is None.") + if result.pdf: + print(f"[OK] PDF captured, size: {len(result.pdf)} bytes") with open("wikipedia_page.pdf", "wb") as f: f.write(result.pdf) - - print("[OK] PDF & screenshot captured.") + else: + print("[WARN] PDF data is None.") + else: print("[ERROR]", result.error_message) From 414f16e975cc2ca29abe3531d5ab91a4b17a4163 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Wed, 18 Jun 2025 19:05:44 +0200 Subject: [PATCH 37/37] fix: Update pdf and screenshot usage documentation. 
ref #1230 --- .../crawl4ai_all_reasoning_content.llm.txt | 29 ++++++++++++------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/docs/md_v2/assets/llmtxt/crawl4ai_all_reasoning_content.llm.txt b/docs/md_v2/assets/llmtxt/crawl4ai_all_reasoning_content.llm.txt index 850c1237..c3350fb5 100644 --- a/docs/md_v2/assets/llmtxt/crawl4ai_all_reasoning_content.llm.txt +++ b/docs/md_v2/assets/llmtxt/crawl4ai_all_reasoning_content.llm.txt @@ -5359,29 +5359,38 @@ Sometimes you need a visual record of a page or a PDF “printout.” Crawl4AI c ```python import os, asyncio from base64 import b64decode -from crawl4ai import AsyncWebCrawler, CacheMode +from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig async def main(): + run_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + screenshot=True, + pdf=True + ) + async with AsyncWebCrawler() as crawler: result = await crawler.arun( url="https://en.wikipedia.org/wiki/List_of_common_misconceptions", - cache_mode=CacheMode.BYPASS, - pdf=True, - screenshot=True + config=run_config ) - if result.success: - # Save screenshot + print(f"Screenshot data present: {result.screenshot is not None}") + print(f"PDF data present: {result.pdf is not None}") + if result.screenshot: + print(f"[OK] Screenshot captured, size: {len(result.screenshot)} bytes") with open("wikipedia_screenshot.png", "wb") as f: f.write(b64decode(result.screenshot)) - - # Save PDF + else: + print("[WARN] Screenshot data is None.") + if result.pdf: + print(f"[OK] PDF captured, size: {len(result.pdf)} bytes") with open("wikipedia_page.pdf", "wb") as f: f.write(result.pdf) - - print("[OK] PDF & screenshot captured.") + else: + print("[WARN] PDF data is None.") + else: print("[ERROR]", result.error_message)