diff --git a/.gitignore b/.gitignore index f022c9ef..943c059c 100644 --- a/.gitignore +++ b/.gitignore @@ -225,4 +225,5 @@ tree.md .scripts .local .do +/plans plans/ \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index b654953f..afa841c9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,43 @@ All notable changes to Crawl4AI will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +--- + +## [0.4.247] - 2025-01-06 + +### Added +- **Windows Event Loop Configuration**: Introduced a utility function `configure_windows_event_loop` to resolve `NotImplementedError` for asyncio subprocesses on Windows. ([#utils.py](crawl4ai/utils.py), [#tutorials/async-webcrawler-basics.md](docs/md_v3/tutorials/async-webcrawler-basics.md)) +- **`page_need_scroll` Method**: Added a method to determine if a page requires scrolling before taking actions in `AsyncPlaywrightCrawlerStrategy`. ([#async_crawler_strategy.py](crawl4ai/async_crawler_strategy.py)) + +### Changed +- **Version Bump**: Updated the version from `0.4.246` to `0.4.247`. ([#__version__.py](crawl4ai/__version__.py)) +- **Improved Scrolling Logic**: Enhanced scrolling methods in `AsyncPlaywrightCrawlerStrategy` by adding a `scroll_delay` parameter for better control. ([#async_crawler_strategy.py](crawl4ai/async_crawler_strategy.py)) +- **Markdown Generation Example**: Updated the `hello_world.py` example to reflect the latest API changes and better illustrate features. ([#examples/hello_world.py](docs/examples/hello_world.py)) +- **Documentation Update**: + - Added Windows-specific instructions for handling asyncio event loops. ([#async-webcrawler-basics.md](docs/md_v3/tutorials/async-webcrawler-basics.md)) + +### Removed +- **Legacy Markdown Generation Code**: Removed outdated and unused code for markdown generation in `content_scraping_strategy.py`. 
([#content_scraping_strategy.py](crawl4ai/content_scraping_strategy.py)) + +### Fixed +- **Page Closing to Prevent Memory Leaks**: + - **Description**: Added a `finally` block to ensure pages are closed when no `session_id` is provided. + - **Impact**: Prevents memory leaks caused by lingering pages after a crawl. + - **File**: [`async_crawler_strategy.py`](crawl4ai/async_crawler_strategy.py) + - **Code**: + ```python + finally: + # If no session_id is given we should close the page + if not config.session_id: + await page.close() + ``` +- **Multiple Element Selection**: Modified `_get_elements` in `JsonCssExtractionStrategy` to return all matching elements instead of just the first one, ensuring comprehensive extraction. ([#extraction_strategy.py](crawl4ai/extraction_strategy.py)) +- **Error Handling in Scrolling**: Added robust error handling to ensure scrolling proceeds safely even if a configuration is missing. ([#async_crawler_strategy.py](crawl4ai/async_crawler_strategy.py)) + +### Other +- **Git Ignore Update**: Added `/plans` to `.gitignore` for better development environment consistency. 
([#.gitignore](.gitignore)) + + ## [0.4.24] - 2024-12-31 ### Added diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 3f798c0c..8ec3d053 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.4.246" +__version__ = "0.4.247" diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 4723a836..b879413c 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -1475,8 +1475,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): except Exception as e: raise e + + finally: + # If no session_id is given we should close the page + if not config.session_id: + await page.close() - async def _handle_full_page_scan(self, page: Page, scroll_delay: float): + async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1): """ Helper method to handle full page scanning. @@ -1500,7 +1505,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): current_position = viewport_height # await page.evaluate(f"window.scrollTo(0, {current_position})") - await self.safe_scroll(page, 0, current_position) + await self.safe_scroll(page, 0, current_position, delay=scroll_delay) # await self.csp_scroll_to(page, 0, current_position) # await asyncio.sleep(scroll_delay) @@ -1510,7 +1515,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): while current_position < total_height: current_position = min(current_position + viewport_height, total_height) - await self.safe_scroll(page, 0, current_position) + await self.safe_scroll(page, 0, current_position, delay=scroll_delay) # await page.evaluate(f"window.scrollTo(0, {current_position})") # await asyncio.sleep(scroll_delay) @@ -2064,7 +2069,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): } """) - async def safe_scroll(self, page: Page, x: int, y: int): + async def safe_scroll(self, page: Page, x: int, y: int, delay: float = 0.1): """ Safely 
scroll the page with rendering time. @@ -2075,7 +2080,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): """ result = await self.csp_scroll_to(page, x, y) if result['success']: - await page.wait_for_timeout(100) # Allow for rendering + await page.wait_for_timeout(delay * 1000) return result async def csp_scroll_to(self, page: Page, x: int, y: int) -> Dict[str, Any]: @@ -2158,7 +2163,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): } """) - async def page_need_scroll(self, page: Page): + async def page_need_scroll(self, page: Page) -> bool: """ Determine whether the page need to scroll @@ -2166,12 +2171,21 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): page: Playwright page object Returns: - page should scroll or not + bool: True if page needs scrolling """ - return await page.evaluate(""" + try: + need_scroll = await page.evaluate(""" () => { const scrollHeight = document.documentElement.scrollHeight; const viewportHeight = window.innerHeight; return scrollHeight > viewportHeight; } - """) \ No newline at end of file + """) + return need_scroll + except Exception as e: + self.logger.warning( + message="Failed to check scroll need: {error}. Defaulting to True for safety.", + tag="SCROLL", + params={"error": str(e)} + ) + return True # Default to scrolling if check fails \ No newline at end of file diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 985ff592..f3a96cf3 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -122,92 +122,6 @@ class WebScrapingStrategy(ContentScrapingStrategy): """ return await asyncio.to_thread(self._scrap, url, html, **kwargs) - def _generate_markdown_content(self, cleaned_html: str,html: str,url: str, success: bool, **kwargs) -> Dict[str, Any]: - """ - Generate markdown content from cleaned HTML. - - Args: - cleaned_html (str): The cleaned HTML content. - html (str): The original HTML content. 
- url (str): The URL of the page. - success (bool): Whether the content was successfully cleaned. - **kwargs: Additional keyword arguments. - - Returns: - Dict[str, Any]: A dictionary containing the generated markdown content. - """ - markdown_generator: Optional[MarkdownGenerationStrategy] = kwargs.get('markdown_generator', DefaultMarkdownGenerator()) - - if markdown_generator: - try: - if kwargs.get('fit_markdown', False) and not markdown_generator.content_filter: - markdown_generator.content_filter = BM25ContentFilter( - user_query=kwargs.get('fit_markdown_user_query', None), - bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0) - ) - - markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown( - cleaned_html=cleaned_html, - base_url=url, - html2text_options=kwargs.get('html2text', {}) - ) - - return { - 'markdown': markdown_result.raw_markdown, - 'fit_markdown': markdown_result.fit_markdown, - 'fit_html': markdown_result.fit_html, - 'markdown_v2': markdown_result - } - except Exception as e: - self._log('error', - message="Error using new markdown generation strategy: {error}", - tag="SCRAPE", - params={"error": str(e)} - ) - markdown_generator = None - return { - 'markdown': f"Error using new markdown generation strategy: {str(e)}", - 'fit_markdown': "Set flag 'fit_markdown' to True to get cleaned HTML content.", - 'fit_html': "Set flag 'fit_markdown' to True to get cleaned HTML content.", - 'markdown_v2': None - } - - # Legacy method - """ - # h = CustomHTML2Text() - # h.update_params(**kwargs.get('html2text', {})) - # markdown = h.handle(cleaned_html) - # markdown = markdown.replace(' ```', '```') - - # fit_markdown = "Set flag 'fit_markdown' to True to get cleaned HTML content." - # fit_html = "Set flag 'fit_markdown' to True to get cleaned HTML content." 
- - # if kwargs.get('content_filter', None) or kwargs.get('fit_markdown', False): - # content_filter = kwargs.get('content_filter', None) - # if not content_filter: - # content_filter = BM25ContentFilter( - # user_query=kwargs.get('fit_markdown_user_query', None), - # bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0) - # ) - # fit_html = content_filter.filter_content(html) - # fit_html = '\n'.join('