From 24b3da717ae2e81345127b1431902b49b95d475e Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 2 Jan 2025 17:53:30 +0800 Subject: [PATCH 1/6] refactor(): - Update hello world example --- docs/examples/hello_world.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/examples/hello_world.py b/docs/examples/hello_world.py index bcec9a9a..18534d0e 100644 --- a/docs/examples/hello_world.py +++ b/docs/examples/hello_world.py @@ -2,7 +2,8 @@ import asyncio from crawl4ai import * async def main(): - async with AsyncWebCrawler() as crawler: + browser_config = BrowserConfig(headless=True, verbose=True) + async with AsyncWebCrawler(config=browser_config) as crawler: crawler_config = CrawlerRunConfig( cache_mode=CacheMode.BYPASS, markdown_generator=DefaultMarkdownGenerator( @@ -10,7 +11,7 @@ async def main(): ) ) result = await crawler.arun( - url="https://crawl4ai.com", + url="https://www.helloworld.org", config=crawler_config ) print(result.markdown_v2.raw_markdown[:500]) From 196dc79ec7005a1cabf22af621f7b6b029288e47 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 3 Jan 2025 21:17:23 +0800 Subject: [PATCH 2/6] fix: prevent memory leaks by ensuring proper closure of Playwright pages - Fixes critical memory leak issue where browser pages remained open - Ensures proper cleanup of Playwright resources after page operations - Improves resource management in browser farm implementation This is an urgent fix to address resource leakage that could impact system stability. 
--- .gitignore | 1 + crawl4ai/async_crawler_strategy.py | 15 ++++++++++----- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 6a3b65f0..7ce3ee0c 100644 --- a/.gitignore +++ b/.gitignore @@ -225,3 +225,4 @@ tree.md .scripts .local .do +/plans \ No newline at end of file diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 32bd14b8..82e445e1 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -1475,8 +1475,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): except Exception as e: raise e + + finally: + # If no session_id is given we should close the page + if not config.session_id: + await page.close() - async def _handle_full_page_scan(self, page: Page, scroll_delay: float): + async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1): """ Helper method to handle full page scanning. @@ -1500,7 +1505,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): current_position = viewport_height # await page.evaluate(f"window.scrollTo(0, {current_position})") - await self.safe_scroll(page, 0, current_position) + await self.safe_scroll(page, 0, current_position, delay=scroll_delay) # await self.csp_scroll_to(page, 0, current_position) # await asyncio.sleep(scroll_delay) @@ -1510,7 +1515,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): while current_position < total_height: current_position = min(current_position + viewport_height, total_height) - await self.safe_scroll(page, 0, current_position) + await self.safe_scroll(page, 0, current_position, delay=scroll_delay) # await page.evaluate(f"window.scrollTo(0, {current_position})") # await asyncio.sleep(scroll_delay) @@ -2066,7 +2071,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): } """) - async def safe_scroll(self, page: Page, x: int, y: int): + async def safe_scroll(self, page: Page, x: int, y: int, delay: float = 0.1): """ 
Safely scroll the page with rendering time. @@ -2077,7 +2082,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): """ result = await self.csp_scroll_to(page, x, y) if result['success']: - await page.wait_for_timeout(100) # Allow for rendering + await page.wait_for_timeout(delay * 1000) return result async def csp_scroll_to(self, page: Page, x: int, y: int) -> Dict[str, Any]: From 72fbdac467b8e0a3aba511e93353cb42d45b1842 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 5 Jan 2025 19:26:46 +0800 Subject: [PATCH 3/6] fix(extraction): JsonCss selector and crawler improvements - Fix JsonCssExtractionStrategy._get_elements to return all matching elements instead of just one - Add robust error handling to page_need_scroll with default fallback - Improve JSON extraction strategies documentation - Refactor content scraping strategy - Update version to 0.4.247 --- crawl4ai/__version__.py | 2 +- crawl4ai/async_crawler_strategy.py | 17 +++- crawl4ai/content_scraping_strategy.py | 93 ------------------- crawl4ai/extraction_strategy.py | 4 +- crawl4ai/utils.py | 21 +++++ .../tutorials/async-webcrawler-basics.md | 21 ++++- 6 files changed, 56 insertions(+), 102 deletions(-) diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 3f798c0c..8ec3d053 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.4.246" +__version__ = "0.4.247" diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 0cdaffd5..b879413c 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -2163,7 +2163,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): } """) - async def page_need_scroll(self, page: Page): + async def page_need_scroll(self, page: Page) -> bool: """ Determine whether the page need to scroll @@ -2171,12 +2171,21 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): page: Playwright page object Returns: - 
page should scroll or not + bool: True if page needs scrolling """ - return await page.evaluate(""" + try: + need_scroll = await page.evaluate(""" () => { const scrollHeight = document.documentElement.scrollHeight; const viewportHeight = window.innerHeight; return scrollHeight > viewportHeight; } - """) \ No newline at end of file + """) + return need_scroll + except Exception as e: + self.logger.warning( + message="Failed to check scroll need: {error}. Defaulting to True for safety.", + tag="SCROLL", + params={"error": str(e)} + ) + return True # Default to scrolling if check fails \ No newline at end of file diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 985ff592..f3a96cf3 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -122,92 +122,6 @@ class WebScrapingStrategy(ContentScrapingStrategy): """ return await asyncio.to_thread(self._scrap, url, html, **kwargs) - def _generate_markdown_content(self, cleaned_html: str,html: str,url: str, success: bool, **kwargs) -> Dict[str, Any]: - """ - Generate markdown content from cleaned HTML. - - Args: - cleaned_html (str): The cleaned HTML content. - html (str): The original HTML content. - url (str): The URL of the page. - success (bool): Whether the content was successfully cleaned. - **kwargs: Additional keyword arguments. - - Returns: - Dict[str, Any]: A dictionary containing the generated markdown content. 
- """ - markdown_generator: Optional[MarkdownGenerationStrategy] = kwargs.get('markdown_generator', DefaultMarkdownGenerator()) - - if markdown_generator: - try: - if kwargs.get('fit_markdown', False) and not markdown_generator.content_filter: - markdown_generator.content_filter = BM25ContentFilter( - user_query=kwargs.get('fit_markdown_user_query', None), - bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0) - ) - - markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown( - cleaned_html=cleaned_html, - base_url=url, - html2text_options=kwargs.get('html2text', {}) - ) - - return { - 'markdown': markdown_result.raw_markdown, - 'fit_markdown': markdown_result.fit_markdown, - 'fit_html': markdown_result.fit_html, - 'markdown_v2': markdown_result - } - except Exception as e: - self._log('error', - message="Error using new markdown generation strategy: {error}", - tag="SCRAPE", - params={"error": str(e)} - ) - markdown_generator = None - return { - 'markdown': f"Error using new markdown generation strategy: {str(e)}", - 'fit_markdown': "Set flag 'fit_markdown' to True to get cleaned HTML content.", - 'fit_html': "Set flag 'fit_markdown' to True to get cleaned HTML content.", - 'markdown_v2': None - } - - # Legacy method - """ - # h = CustomHTML2Text() - # h.update_params(**kwargs.get('html2text', {})) - # markdown = h.handle(cleaned_html) - # markdown = markdown.replace(' ```', '```') - - # fit_markdown = "Set flag 'fit_markdown' to True to get cleaned HTML content." - # fit_html = "Set flag 'fit_markdown' to True to get cleaned HTML content." 
- - # if kwargs.get('content_filter', None) or kwargs.get('fit_markdown', False): - # content_filter = kwargs.get('content_filter', None) - # if not content_filter: - # content_filter = BM25ContentFilter( - # user_query=kwargs.get('fit_markdown_user_query', None), - # bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0) - # ) - # fit_html = content_filter.filter_content(html) - # fit_html = '\n'.join('
{}
'.format(s) for s in fit_html) - # fit_markdown = h.handle(fit_html) - - # markdown_v2 = MarkdownGenerationResult( - # raw_markdown=markdown, - # markdown_with_citations=markdown, - # references_markdown=markdown, - # fit_markdown=fit_markdown - # ) - - # return { - # 'markdown': markdown, - # 'fit_markdown': fit_markdown, - # 'fit_html': fit_html, - # 'markdown_v2' : markdown_v2 - # } - """ - def flatten_nested_elements(self, node): """ Flatten nested elements in a HTML tree. @@ -798,13 +712,6 @@ class WebScrapingStrategy(ContentScrapingStrategy): cleaned_html = str_body.replace('\n\n', '\n').replace(' ', ' ') - # markdown_content = self._generate_markdown_content( - # cleaned_html=cleaned_html, - # html=html, - # url=url, - # success=success, - # **kwargs - # ) return { # **markdown_content, diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index 7441e32d..1e9d9c79 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -974,8 +974,7 @@ class JsonCssExtractionStrategy(JsonElementExtractionStrategy): return parsed_html.select(selector) def _get_elements(self, element, selector: str): - selected = element.select_one(selector) - return [selected] if selected else [] + return element.select(selector) def _get_element_text(self, element) -> str: return element.get_text(strip=True) @@ -1050,3 +1049,4 @@ class JsonXPathExtractionStrategy(JsonElementExtractionStrategy): def _get_element_attribute(self, element, attribute: str): return element.get(attribute) + diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 214ebbc6..6fd7429f 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -21,6 +21,8 @@ import textwrap import cProfile import pstats from functools import wraps +import asyncio + class InvalidCSSSelectorError(Exception): pass @@ -1579,6 +1581,25 @@ def ensure_content_dirs(base_path: str) -> Dict[str, str]: return content_paths +def configure_windows_event_loop(): + """ + Configure the Windows 
event loop to use ProactorEventLoop. + This resolves the NotImplementedError that occurs on Windows when using asyncio subprocesses. + + This function should only be called on Windows systems and before any async operations. + On non-Windows systems, this function does nothing. + + Example: + ```python + from crawl4ai.async_configs import configure_windows_event_loop + + # Call this before any async operations if you're on Windows + configure_windows_event_loop() + ``` + """ + if platform.system() == 'Windows': + asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy()) + def get_error_context(exc_info, context_lines: int = 5): """ Extract error context with more reliable line number tracking. diff --git a/docs/md_v3/tutorials/async-webcrawler-basics.md b/docs/md_v3/tutorials/async-webcrawler-basics.md index 46256eaa..6236d899 100644 --- a/docs/md_v3/tutorials/async-webcrawler-basics.md +++ b/docs/md_v3/tutorials/async-webcrawler-basics.md @@ -148,7 +148,24 @@ Below are a few `BrowserConfig` and `CrawlerRunConfig` parameters you might twea --- -## 5. Putting It All Together +## 5. Windows-Specific Configuration + +When using AsyncWebCrawler on Windows, you might encounter a `NotImplementedError` related to `asyncio.create_subprocess_exec`. This is a known Windows-specific issue that occurs because Windows' default event loop doesn't support subprocess operations. + +To resolve this, Crawl4AI provides a utility function to configure Windows to use the ProactorEventLoop. Call this function before running any async operations: + +```python +from crawl4ai.utils import configure_windows_event_loop + +# Call this before any async operations if you're on Windows +configure_windows_event_loop() + +# Your AsyncWebCrawler code here +``` + +--- + +## 6. Putting It All Together Here’s a slightly more in-depth example that shows off a few key config parameters at once: @@ -193,7 +210,7 @@ if __name__ == "__main__": --- -## 6. Next Steps +## 7. 
Next Steps - **Smart Crawling Techniques**: Learn to handle iframes, advanced caching, and selective extraction in the [next tutorial](./smart-crawling.md). - **Hooks & Custom Code**: See how to inject custom logic before and after navigation in a dedicated [Hooks Tutorial](./hooks-custom.md). From ae376f15fb8b92701ea1a0b167f9a0e9c2d6804c Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 5 Jan 2025 19:39:15 +0800 Subject: [PATCH 4/6] docs(extraction): add clarifying comments for CSS selector behavior Add explanatory comments to JsonCssExtractionStrategy._get_elements() method to clarify that it returns all matching elements using select() instead of select_one(). This helps developers understand the method's behavior and its difference from single element selection. Removed trailing whitespace at end of file. --- crawl4ai/extraction_strategy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index 1e9d9c79..3e688f13 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -974,6 +974,8 @@ class JsonCssExtractionStrategy(JsonElementExtractionStrategy): return parsed_html.select(selector) def _get_elements(self, element, selector: str): + # Return all matching elements using select() instead of select_one() + # This ensures that we get all elements that match the selector, not just the first one return element.select(selector) def _get_element_text(self, element) -> str: @@ -1048,5 +1050,3 @@ class JsonXPathExtractionStrategy(JsonElementExtractionStrategy): def _get_element_attribute(self, element, attribute: str): return element.get(attribute) - - From 3427ead8b8854f70aef2b8fd485648ba22623e21 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Mon, 6 Jan 2025 15:13:43 +0800 Subject: [PATCH 5/6] Update CHANGELOG --- CHANGELOG.md | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 
b654953f..afa841c9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,43 @@ All notable changes to Crawl4AI will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +--- + +## [0.4.247] - 2025-01-06 + +### Added +- **Windows Event Loop Configuration**: Introduced a utility function `configure_windows_event_loop` to resolve `NotImplementedError` for asyncio subprocesses on Windows. ([#utils.py](crawl4ai/utils.py), [#tutorials/async-webcrawler-basics.md](docs/md_v3/tutorials/async-webcrawler-basics.md)) +- **`page_need_scroll` Method**: Added a method to determine if a page requires scrolling before taking actions in `AsyncPlaywrightCrawlerStrategy`. ([#async_crawler_strategy.py](crawl4ai/async_crawler_strategy.py)) + +### Changed +- **Version Bump**: Updated the version from `0.4.246` to `0.4.247`. ([#__version__.py](crawl4ai/__version__.py)) +- **Improved Scrolling Logic**: Enhanced scrolling methods in `AsyncPlaywrightCrawlerStrategy` by adding a `scroll_delay` parameter for better control. ([#async_crawler_strategy.py](crawl4ai/async_crawler_strategy.py)) +- **Markdown Generation Example**: Updated the `hello_world.py` example to reflect the latest API changes and better illustrate features. ([#examples/hello_world.py](docs/examples/hello_world.py)) +- **Documentation Update**: + - Added Windows-specific instructions for handling asyncio event loops. ([#async-webcrawler-basics.md](docs/md_v3/tutorials/async-webcrawler-basics.md)) + +### Removed +- **Legacy Markdown Generation Code**: Removed outdated and unused code for markdown generation in `content_scraping_strategy.py`. ([#content_scraping_strategy.py](crawl4ai/content_scraping_strategy.py)) + +### Fixed +- **Page Closing to Prevent Memory Leaks**: + - **Description**: Added a `finally` block to ensure pages are closed when no `session_id` is provided. 
+ - **Impact**: Prevents memory leaks caused by lingering pages after a crawl. + - **File**: [`async_crawler_strategy.py`](crawl4ai/async_crawler_strategy.py) + - **Code**: + ```python + finally: + # If no session_id is given we should close the page + if not config.session_id: + await page.close() + ``` +- **Multiple Element Selection**: Modified `_get_elements` in `JsonCssExtractionStrategy` to return all matching elements instead of just the first one, ensuring comprehensive extraction. ([#extraction_strategy.py](crawl4ai/extraction_strategy.py)) +- **Error Handling in Scrolling**: Added robust error handling to the `page_need_scroll` check, defaulting to scrolling if the check fails so crawling proceeds safely. ([#async_crawler_strategy.py](crawl4ai/async_crawler_strategy.py)) + +### Other +- **Git Ignore Update**: Added `/plans` to `.gitignore` for better development environment consistency. ([#.gitignore](.gitignore)) + + ## [0.4.24] - 2024-12-31 ### Added From 12880f1ffad9702aad6adbca3e0f16e391c081ba Mon Sep 17 00:00:00 2001 From: UncleCode Date: Mon, 6 Jan 2025 15:19:01 +0800 Subject: [PATCH 6/6] Update gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 7ce3ee0c..943c059c 100644 --- a/.gitignore +++ b/.gitignore @@ -225,4 +225,5 @@ tree.md .scripts .local .do -/plans \ No newline at end of file +/plans +plans/ \ No newline at end of file