Basic HTML document for testing purposes.
+diff --git a/README.md b/README.md index 02f01d03..8e6980d8 100644 --- a/README.md +++ b/README.md @@ -11,12 +11,17 @@ [](https://pypi.org/project/crawl4ai/) [](https://pepy.tech/project/crawl4ai) - -[](https://github.com/unclecode/crawl4ai/blob/main/LICENSE) -[](https://github.com/psf/black) -[](https://github.com/PyCQA/bandit) -[](code_of_conduct.md) - +
Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease. @@ -291,12 +296,20 @@ import requests # Submit a crawl job response = requests.post( "http://localhost:11235/crawl", - json={"urls": "https://example.com", "priority": 10} + json={"urls": ["https://example.com"], "priority": 10} ) -task_id = response.json()["task_id"] - -# Continue polling until the task is complete (status="completed") -result = requests.get(f"http://localhost:11235/task/{task_id}") +if response.status_code == 200: + print("Crawl job submitted successfully.") + +if "results" in response.json(): + results = response.json()["results"] + print("Crawl job completed. Results:") + for result in results: + print(result) +else: + task_id = response.json()["task_id"] + print(f"Crawl job submitted. Task ID:: {task_id}") + result = requests.get(f"http://localhost:11235/task/{task_id}") ``` For more examples, see our [Docker Examples](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_example.py). For advanced configuration, environment variables, and usage examples, see our [Docker Deployment Guide](https://docs.crawl4ai.com/basic/docker-deployment/). diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index 4f9da890..57b3fc4b 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -926,6 +926,8 @@ class CrawlerRunConfig(): Default: False. scroll_delay (float): Delay in seconds between scroll steps if scan_full_page is True. Default: 0.2. + max_scroll_steps (Optional[int]): Maximum number of scroll steps to perform during full page scan. + If None, scrolls until the entire page is loaded. Default: None. process_iframes (bool): If True, attempts to process and inline iframe content. Default: False. remove_overlay_elements (bool): If True, remove overlays/popups before extracting HTML. @@ -1066,6 +1068,7 @@ class CrawlerRunConfig(): ignore_body_visibility: bool = True, scan_full_page: bool = False, scroll_delay: float = 0.2, + max_scroll_steps: Optional[int] = None, process_iframes: bool = False, remove_overlay_elements: bool = False, simulate_user: bool = False, @@ -1170,6 +1173,7 @@ class CrawlerRunConfig(): self.ignore_body_visibility = ignore_body_visibility self.scan_full_page = scan_full_page self.scroll_delay = scroll_delay + self.max_scroll_steps = max_scroll_steps self.process_iframes = process_iframes self.remove_overlay_elements = remove_overlay_elements self.simulate_user = simulate_user @@ -1387,6 +1391,7 @@ class CrawlerRunConfig(): ignore_body_visibility=kwargs.get("ignore_body_visibility", True), scan_full_page=kwargs.get("scan_full_page", False), scroll_delay=kwargs.get("scroll_delay", 0.2), + max_scroll_steps=kwargs.get("max_scroll_steps"), process_iframes=kwargs.get("process_iframes", False), remove_overlay_elements=kwargs.get("remove_overlay_elements", False), simulate_user=kwargs.get("simulate_user", False), @@ -1499,6 +1504,7 @@ class CrawlerRunConfig(): "ignore_body_visibility": self.ignore_body_visibility, "scan_full_page": self.scan_full_page, "scroll_delay": self.scroll_delay, + "max_scroll_steps": self.max_scroll_steps, "process_iframes": self.process_iframes, "remove_overlay_elements": self.remove_overlay_elements, "simulate_user": self.simulate_user, diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 817b980c..9fdb0fe2 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -445,6 +445,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): return await self._crawl_web(url, config) elif url.startswith("file://"): + # initialize empty lists for console messages + captured_console = [] + # Process local file local_file_path = url[7:] # Remove 'file://' prefix if not os.path.exists(local_file_path): @@ -466,9 +469,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): console_messages=captured_console, ) - elif url.startswith("raw:") or url.startswith("raw://"): + ##### + # Since both "raw:" and "raw://" start with "raw:", the first condition is always true for both, so "raw://" will be sliced as "//...", which is incorrect. + # Fix: Check for "raw://" first, then "raw:" + # Also, the prefix "raw://" is actually 6 characters long, not 7, so it should be sliced accordingly: url[6:] + ##### + elif url.startswith("raw://") or url.startswith("raw:"): # Process raw HTML content - raw_html = url[4:] if url[:4] == "raw:" else url[7:] + # raw_html = url[4:] if url[:4] == "raw:" else url[7:] + raw_html = url[6:] if url.startswith("raw://") else url[4:] html = raw_html if config.screenshot: screenshot_data = await self._generate_screenshot_from_html(html) @@ -741,18 +750,49 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): ) redirected_url = page.url except Error as e: - raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}") + # Allow navigation to be aborted when downloading files + # This is expected behavior for downloads in some browser engines + if 'net::ERR_ABORTED' in str(e) and self.browser_config.accept_downloads: + self.logger.info( + message=f"Navigation aborted, likely due to file download: {url}", + tag="GOTO", + params={"url": url}, + ) + response = None + else: + raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}") await self.execute_hook( "after_goto", page, context=context, url=url, response=response, config=config ) + # ────────────────────────────────────────────────────────────── + # Walk the redirect chain. Playwright returns only the last + # hop, so we trace the `request.redirected_from` links until the + # first response that differs from the final one and surface its + # status-code. + # ────────────────────────────────────────────────────────────── if response is None: status_code = 200 response_headers = {} else: - status_code = response.status - response_headers = response.headers + first_resp = response + req = response.request + while req and req.redirected_from: + prev_req = req.redirected_from + prev_resp = await prev_req.response() + if prev_resp: # keep earliest + first_resp = prev_resp + req = prev_req + + status_code = first_resp.status + response_headers = first_resp.headers + # if response is None: + # status_code = 200 + # response_headers = {} + # else: + # status_code = response.status + # response_headers = response.headers else: status_code = 200 @@ -896,7 +936,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Handle full page scanning if config.scan_full_page: - await self._handle_full_page_scan(page, config.scroll_delay) + # await self._handle_full_page_scan(page, config.scroll_delay) + await self._handle_full_page_scan(page, config.scroll_delay, config.max_scroll_steps) # Handle virtual scroll if configured if config.virtual_scroll_config: @@ -1088,7 +1129,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Close the page await page.close() - async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1): + # async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1): + async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1, max_scroll_steps: Optional[int] = None): """ Helper method to handle full page scanning. @@ -1103,6 +1145,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): Args: page (Page): The Playwright page object scroll_delay (float): The delay between page scrolls + max_scroll_steps (Optional[int]): Maximum number of scroll steps to perform. If None, scrolls until end. """ try: @@ -1127,9 +1170,21 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): dimensions = await self.get_page_dimensions(page) total_height = dimensions["height"] + scroll_step_count = 0 while current_position < total_height: + #### + # NEW FEATURE: Check if we've reached the maximum allowed scroll steps + # This prevents infinite scrolling on very long pages or infinite scroll scenarios + # If max_scroll_steps is None, this check is skipped (unlimited scrolling - original behavior) + #### + if max_scroll_steps is not None and scroll_step_count >= max_scroll_steps: + break current_position = min(current_position + viewport_height, total_height) await self.safe_scroll(page, 0, current_position, delay=scroll_delay) + + # Increment the step counter for max_scroll_steps tracking + scroll_step_count += 1 + # await page.evaluate(f"window.scrollTo(0, {current_position})") # await asyncio.sleep(scroll_delay) @@ -1616,12 +1671,32 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): num_segments = (page_height // viewport_height) + 1 for i in range(num_segments): y_offset = i * viewport_height + # Special handling for the last segment + if i == num_segments - 1: + last_part_height = page_height % viewport_height + + # If page_height is an exact multiple of viewport_height, + # we don't need an extra segment + if last_part_height == 0: + # Skip last segment if page height is exact multiple of viewport + break + + # Adjust viewport to exactly match the remaining content height + await page.set_viewport_size({"width": page_width, "height": last_part_height}) + await page.evaluate(f"window.scrollTo(0, {y_offset})") await asyncio.sleep(0.01) # wait for render - seg_shot = await page.screenshot(full_page=False) + + # Capture the current segment + # Note: Using compression options (format, quality) would go here + seg_shot = await page.screenshot(full_page=False, type="jpeg", quality=85) + # seg_shot = await page.screenshot(full_page=False) img = Image.open(BytesIO(seg_shot)).convert("RGB") segments.append(img) + # Reset viewport to original size after capturing segments + await page.set_viewport_size({"width": page_width, "height": viewport_height}) + total_height = sum(img.height for img in segments) stitched = Image.new("RGB", (segments[0].width, total_height)) offset = 0 @@ -1750,12 +1825,31 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # then wait for the new page to load before continuing result = None try: + # OLD VERSION: + # result = await page.evaluate( + # f""" + # (async () => {{ + # try {{ + # const script_result = {script}; + # return {{ success: true, result: script_result }}; + # }} catch (err) {{ + # return {{ success: false, error: err.toString(), stack: err.stack }}; + # }} + # }})(); + # """ + # ) + + # """ NEW VERSION: + # When {script} contains statements (e.g., const link = …; link.click();), + # this forms invalid JavaScript, causing Playwright execution error: SyntaxError: Unexpected token 'const'. + # """ result = await page.evaluate( f""" (async () => {{ try {{ - const script_result = {script}; - return {{ success: true, result: script_result }}; + return await (async () => {{ + {script} + }})(); }} catch (err) {{ return {{ success: false, error: err.toString(), stack: err.stack }}; }} diff --git a/crawl4ai/async_logger.py b/crawl4ai/async_logger.py index e203b6c9..b0857d66 100644 --- a/crawl4ai/async_logger.py +++ b/crawl4ai/async_logger.py @@ -39,6 +39,7 @@ class LogColor(str, Enum): YELLOW = "yellow" MAGENTA = "magenta" DIM_MAGENTA = "dim magenta" + RED = "red" def __str__(self): """Automatically convert rich color to string.""" diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index ce9a0633..a2c6cf9f 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -363,7 +363,7 @@ class AsyncWebCrawler: pdf_data=pdf_data, verbose=config.verbose, is_raw_html=True if url.startswith("raw:") else False, - redirected_url=async_response.redirected_url, + redirected_url=async_response.redirected_url, **kwargs, ) @@ -506,7 +506,7 @@ class AsyncWebCrawler: tables = media.pop("tables", []) links = result.links.model_dump() metadata = result.metadata - + fit_html = preprocess_html_for_schema(html_content=html, text_threshold= 500, max_size= 300_000) ################################ @@ -588,11 +588,13 @@ class AsyncWebCrawler: # Choose content based on input_format content_format = config.extraction_strategy.input_format if content_format == "fit_markdown" and not markdown_result.fit_markdown: - self.logger.warning( - message="Fit markdown requested but not available. Falling back to raw markdown.", - tag="EXTRACT", - params={"url": _url}, - ) + + self.logger.url_status( + url=_url, + success=bool(html), + timing=time.perf_counter() - t1, + tag="EXTRACT", + ) content_format = "markdown" content = { @@ -616,11 +618,12 @@ class AsyncWebCrawler: ) # Log extraction completion - self.logger.info( - message="Completed for {url:.50}... | Time: {timing}s", - tag="EXTRACT", - params={"url": _url, "timing": time.perf_counter() - t1}, - ) + self.logger.url_status( + url=_url, + success=bool(html), + timing=time.perf_counter() - t1, + tag="EXTRACT", + ) # Apply HTML formatting if requested if config.prettiify: diff --git a/crawl4ai/browser_profiler.py b/crawl4ai/browser_profiler.py index 4a5be13c..bc902f61 100644 --- a/crawl4ai/browser_profiler.py +++ b/crawl4ai/browser_profiler.py @@ -480,7 +480,7 @@ class BrowserProfiler: self.logger.info("4. Exit", tag="MENU", base_color=LogColor.MAGENTA) exit_option = "4" - self.logger.print(f"\n[cyan]Enter your choice (1-{exit_option}): [/cyan]", end="") + self.logger.info(f"\n[cyan]Enter your choice (1-{exit_option}): [/cyan]", end="") choice = input() if choice == "1": @@ -637,9 +637,18 @@ class BrowserProfiler: self.logger.info(f"Debugging port: {debugging_port}", tag="CDP") self.logger.info(f"Headless mode: {headless}", tag="CDP") + # create browser config + browser_config = BrowserConfig( + browser_type=browser_type, + headless=headless, + user_data_dir=profile_path, + debugging_port=debugging_port, + verbose=True + ) + # Create managed browser instance managed_browser = ManagedBrowser( - browser_type=browser_type, + browser_config=browser_config, user_data_dir=profile_path, headless=headless, logger=self.logger, diff --git a/crawl4ai/cli.py b/crawl4ai/cli.py index 33b313bc..a5fb7dbb 100644 --- a/crawl4ai/cli.py +++ b/crawl4ai/cli.py @@ -1010,7 +1010,7 @@ def cdp_cmd(user_data_dir: Optional[str], port: int, browser_type: str, headless @click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2") @click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "md", "markdown-fit", "md-fit"]), default="all") @click.option("--output-file", "-O", type=click.Path(), help="Output file path (default: stdout)") -@click.option("--bypass-cache", "-b", is_flag=True, default=True, help="Bypass cache when crawling") +@click.option("--bypass-cache", "-bc", is_flag=True, default=True, help="Bypass cache when crawling") @click.option("--question", "-q", help="Ask a question about the crawled content") @click.option("--verbose", "-v", is_flag=True) @click.option("--profile", "-p", help="Use a specific browser profile (by name)") diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 268b599a..f1ea5fa5 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -720,13 +720,18 @@ class WebScrapingStrategy(ContentScrapingStrategy): # Check flag if we should remove external images if kwargs.get("exclude_external_images", False): - element.decompose() - return False - # src_url_base = src.split('/')[2] - # url_base = url.split('/')[2] - # if url_base not in src_url_base: - # element.decompose() - # return False + # Handle relative URLs (which are always from the same domain) + if not src.startswith('http') and not src.startswith('//'): + return True # Keep relative URLs + + # For absolute URLs, compare the base domains using the existing function + src_base_domain = get_base_domain(src) + url_base_domain = get_base_domain(url) + + # If the domains don't match and both are valid, the image is external + if src_base_domain and url_base_domain and src_base_domain != url_base_domain: + element.decompose() + return False # if kwargs.get('exclude_social_media_links', False): # if image_src_base_domain in exclude_social_media_domains: diff --git a/crawl4ai/deep_crawling/bff_strategy.py b/crawl4ai/deep_crawling/bff_strategy.py index 65d4e819..7779c9f4 100644 --- a/crawl4ai/deep_crawling/bff_strategy.py +++ b/crawl4ai/deep_crawling/bff_strategy.py @@ -150,6 +150,14 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy): self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl") break + # Calculate how many more URLs we can process in this batch + remaining = self.max_pages - self._pages_crawled + batch_size = min(BATCH_SIZE, remaining) + if batch_size <= 0: + # No more pages to crawl + self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl") + break + batch: List[Tuple[float, int, str, Optional[str]]] = [] # Retrieve up to BATCH_SIZE items from the priority queue. for _ in range(BATCH_SIZE): @@ -184,6 +192,10 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy): # Count only successful crawls toward max_pages limit if result.success: self._pages_crawled += 1 + # Check if we've reached the limit during batch processing + if self._pages_crawled >= self.max_pages: + self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl") + break # Exit the generator yield result diff --git a/crawl4ai/deep_crawling/bfs_strategy.py b/crawl4ai/deep_crawling/bfs_strategy.py index 48c116dd..950c3980 100644 --- a/crawl4ai/deep_crawling/bfs_strategy.py +++ b/crawl4ai/deep_crawling/bfs_strategy.py @@ -157,6 +157,11 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy): results: List[CrawlResult] = [] while current_level and not self._cancel_event.is_set(): + # Check if we've already reached max_pages before starting a new level + if self._pages_crawled >= self.max_pages: + self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl") + break + next_level: List[Tuple[str, Optional[str]]] = [] urls = [url for url, _ in current_level] @@ -221,6 +226,10 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy): # Count only successful crawls if result.success: self._pages_crawled += 1 + # Check if we've reached the limit during batch processing + if self._pages_crawled >= self.max_pages: + self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl") + break # Exit the generator results_count += 1 yield result diff --git a/crawl4ai/deep_crawling/dfs_strategy.py b/crawl4ai/deep_crawling/dfs_strategy.py index f79f9628..0eca58e3 100644 --- a/crawl4ai/deep_crawling/dfs_strategy.py +++ b/crawl4ai/deep_crawling/dfs_strategy.py @@ -49,6 +49,10 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy): # Count only successful crawls toward max_pages limit if result.success: self._pages_crawled += 1 + # Check if we've reached the limit during batch processing + if self._pages_crawled >= self.max_pages: + self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl") + break # Exit the generator # Only discover links from successful crawls new_links: List[Tuple[str, Optional[str]]] = [] @@ -94,6 +98,10 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy): # and only discover links from successful crawls if result.success: self._pages_crawled += 1 + # Check if we've reached the limit during batch processing + if self._pages_crawled >= self.max_pages: + self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl") + break # Exit the generator new_links: List[Tuple[str, Optional[str]]] = [] await self.link_discovery(result, url, depth, visited, new_links, depths) diff --git a/crawl4ai/deep_crawling/filters.py b/crawl4ai/deep_crawling/filters.py index 122be482..b65112e2 100644 --- a/crawl4ai/deep_crawling/filters.py +++ b/crawl4ai/deep_crawling/filters.py @@ -227,10 +227,21 @@ class URLPatternFilter(URLFilter): # Prefix check (/foo/*) if self._simple_prefixes: path = url.split("?")[0] - if any(path.startswith(p) for p in self._simple_prefixes): - result = True - self._update_stats(result) - return not result if self._reverse else result + # if any(path.startswith(p) for p in self._simple_prefixes): + # result = True + # self._update_stats(result) + # return not result if self._reverse else result + #### + # Modified the prefix matching logic to ensure path boundary checking: + # - Check if the matched prefix is followed by a path separator (`/`), query parameter (`?`), fragment (`#`), or is at the end of the path + # - This ensures `/api/` only matches complete path segments, not substrings like `/apiv2/` + #### + for prefix in self._simple_prefixes: + if path.startswith(prefix): + if len(path) == len(prefix) or path[len(prefix)] in ['/', '?', '#']: + result = True + self._update_stats(result) + return not result if self._reverse else result # Complex patterns if self._path_patterns: @@ -337,6 +348,15 @@ class ContentTypeFilter(URLFilter): "sqlite": "application/vnd.sqlite3", # Placeholder "unknown": "application/octet-stream", # Fallback for unknown file types + # php + "php": "application/x-httpd-php", + "php3": "application/x-httpd-php", + "php4": "application/x-httpd-php", + "php5": "application/x-httpd-php", + "php7": "application/x-httpd-php", + "phtml": "application/x-httpd-php", + "phps": "application/x-httpd-php-source", + } @staticmethod diff --git a/crawl4ai/docker_client.py b/crawl4ai/docker_client.py index f4816eb5..4e33431f 100644 --- a/crawl4ai/docker_client.py +++ b/crawl4ai/docker_client.py @@ -73,6 +73,8 @@ class Crawl4aiDockerClient: def _prepare_request(self, urls: List[str], browser_config: Optional[BrowserConfig] = None, crawler_config: Optional[CrawlerRunConfig] = None) -> Dict[str, Any]: """Prepare request data from configs.""" + if self._token: + self._http_client.headers["Authorization"] = f"Bearer {self._token}" return { "urls": urls, "browser_config": browser_config.dump() if browser_config else {}, @@ -103,8 +105,6 @@ class Crawl4aiDockerClient: crawler_config: Optional[CrawlerRunConfig] = None ) -> Union[CrawlResult, List[CrawlResult], AsyncGenerator[CrawlResult, None]]: """Execute a crawl operation.""" - if not self._token: - raise Crawl4aiClientError("Authentication required. Call authenticate() first.") await self._check_server() data = self._prepare_request(urls, browser_config, crawler_config) @@ -140,8 +140,6 @@ class Crawl4aiDockerClient: async def get_schema(self) -> Dict[str, Any]: """Retrieve configuration schemas.""" - if not self._token: - raise Crawl4aiClientError("Authentication required. Call authenticate() first.") response = await self._request("GET", "/schema") return response.json() @@ -167,4 +165,4 @@ async def main(): print(schema) if __name__ == "__main__": - asyncio.run(main()) \ No newline at end of file + asyncio.run(main()) diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index 966f333e..380f83b4 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -656,11 +656,11 @@ class LLMExtractionStrategy(ExtractionStrategy): self.total_usage.total_tokens += usage.total_tokens try: - response = response.choices[0].message.content + content = response.choices[0].message.content blocks = None if self.force_json_response: - blocks = json.loads(response) + blocks = json.loads(content) if isinstance(blocks, dict): # If it has only one key which calue is list then assign that to blocks, exampled: {"news": [..]} if len(blocks) == 1 and isinstance(list(blocks.values())[0], list): @@ -673,7 +673,7 @@ class LLMExtractionStrategy(ExtractionStrategy): blocks = blocks else: # blocks = extract_xml_data(["blocks"], response.choices[0].message.content)["blocks"] - blocks = extract_xml_data(["blocks"], response)["blocks"] + blocks = extract_xml_data(["blocks"], content)["blocks"] blocks = json.loads(blocks) for block in blocks: diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 14073ef1..8735dee0 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -50,6 +50,29 @@ from urllib.parse import ( ) +# Monkey patch to fix wildcard handling in urllib.robotparser +from urllib.robotparser import RuleLine +import re + +original_applies_to = RuleLine.applies_to + +def patched_applies_to(self, filename): + # Handle wildcards in paths + if '*' in self.path or '%2A' in self.path or self.path in ("*", "%2A"): + pattern = self.path.replace('%2A', '*') + pattern = re.escape(pattern).replace('\\*', '.*') + pattern = '^' + pattern + if pattern.endswith('\\$'): + pattern = pattern[:-2] + '$' + try: + return bool(re.match(pattern, filename)) + except re.error: + return original_applies_to(self, filename) + return original_applies_to(self, filename) + +RuleLine.applies_to = patched_applies_to +# Monkey patch ends + def chunk_documents( documents: Iterable[str], chunk_token_threshold: int, @@ -318,7 +341,7 @@ class RobotsParser: robots_url = f"{scheme}://{domain}/robots.txt" async with aiohttp.ClientSession() as session: - async with session.get(robots_url, timeout=2) as response: + async with session.get(robots_url, timeout=2, ssl=False) as response: if response.status == 200: rules = await response.text() self._cache_rules(domain, rules) @@ -1524,6 +1547,14 @@ def extract_metadata_using_lxml(html, doc=None): content = tag.get("content", "").strip() if property_name and content: metadata[property_name] = content + + # Article metadata + article_tags = head.xpath('.//meta[starts-with(@property, "article:")]') + for tag in article_tags: + property_name = tag.get("property", "").strip() + content = tag.get("content", "").strip() + if property_name and content: + metadata[property_name] = content return metadata @@ -1599,7 +1630,15 @@ def extract_metadata(html, soup=None): content = tag.get("content", "").strip() if property_name and content: metadata[property_name] = content - + + # Article metadata + article_tags = head.find_all("meta", attrs={"property": re.compile(r"^article:")}) + for tag in article_tags: + property_name = tag.get("property", "").strip() + content = tag.get("content", "").strip() + if property_name and content: + metadata[property_name] = content + return metadata @@ -2068,14 +2107,16 @@ def normalize_url(href, base_url): parsed_base = urlparse(base_url) if not parsed_base.scheme or not parsed_base.netloc: raise ValueError(f"Invalid base URL format: {base_url}") - - # Ensure base_url ends with a trailing slash if it's a directory path - if not base_url.endswith('/'): - base_url = base_url + '/' + + if parsed_base.scheme.lower() not in ["http", "https"]: + # Handle special protocols + raise ValueError(f"Invalid base URL format: {base_url}") + cleaned_href = href.strip() # Use urljoin to handle all cases - normalized = urljoin(base_url, href.strip()) - return normalized + return urljoin(base_url, cleaned_href) + + def normalize_url( diff --git a/deploy/docker/api.py b/deploy/docker/api.py index 732371f7..b728acd1 100644 --- a/deploy/docker/api.py +++ b/deploy/docker/api.py @@ -459,7 +459,7 @@ async def handle_crawl_request( # await crawler.close() # except Exception as close_e: # logger.error(f"Error closing crawler during exception handling: {close_e}") - logger.error(f"Error closing crawler during exception handling: {close_e}") + logger.error(f"Error closing crawler during exception handling: {str(e)}") # Measure memory even on error if possible end_mem_mb_error = _get_memory_mb() @@ -518,7 +518,7 @@ async def handle_stream_crawl_request( # await crawler.close() # except Exception as close_e: # logger.error(f"Error closing crawler during stream setup exception: {close_e}") - logger.error(f"Error closing crawler during stream setup exception: {close_e}") + logger.error(f"Error closing crawler during stream setup exception: {str(e)}") logger.error(f"Stream crawl error: {str(e)}", exc_info=True) # Raising HTTPException here will prevent streaming response raise HTTPException( diff --git a/deploy/docker/c4ai-doc-context.md b/deploy/docker/c4ai-doc-context.md index 5137159e..74ad794f 100644 --- a/deploy/docker/c4ai-doc-context.md +++ b/deploy/docker/c4ai-doc-context.md @@ -332,7 +332,7 @@ The `clone()` method: ### Key fields to note 1. **`provider`**: -- Which LLM provoder to use. +- Which LLM provider to use. - Possible values are `"ollama/llama3","groq/llama3-70b-8192","groq/llama3-8b-8192", "openai/gpt-4o-mini" ,"openai/gpt-4o","openai/o1-mini","openai/o1-preview","openai/o3-mini","openai/o3-mini-high","anthropic/claude-3-haiku-20240307","anthropic/claude-3-opus-20240229","anthropic/claude-3-sonnet-20240229","anthropic/claude-3-5-sonnet-20240620","gemini/gemini-pro","gemini/gemini-1.5-pro","gemini/gemini-2.0-flash","gemini/gemini-2.0-flash-exp","gemini/gemini-2.0-flash-lite-preview-02-05","deepseek/deepseek-chat"`Basic HTML document for testing purposes.
+