diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 3f798c0c..8ec3d053 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.4.246" +__version__ = "0.4.247" diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 0cdaffd5..b879413c 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -2163,7 +2163,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): } """) - async def page_need_scroll(self, page: Page): + async def page_need_scroll(self, page: Page) -> bool: """ Determine whether the page need to scroll @@ -2171,12 +2171,21 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): page: Playwright page object Returns: - page should scroll or not + bool: True if page needs scrolling """ - return await page.evaluate(""" + try: + need_scroll = await page.evaluate(""" () => { const scrollHeight = document.documentElement.scrollHeight; const viewportHeight = window.innerHeight; return scrollHeight > viewportHeight; } - """) \ No newline at end of file + """) + return need_scroll + except Exception as e: + self.logger.warning( + message="Failed to check scroll need: {error}. Defaulting to True for safety.", + tag="SCROLL", + params={"error": str(e)} + ) + return True # Default to scrolling if check fails \ No newline at end of file diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 985ff592..f3a96cf3 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -122,92 +122,6 @@ class WebScrapingStrategy(ContentScrapingStrategy): """ return await asyncio.to_thread(self._scrap, url, html, **kwargs) - def _generate_markdown_content(self, cleaned_html: str,html: str,url: str, success: bool, **kwargs) -> Dict[str, Any]: - """ - Generate markdown content from cleaned HTML. - - Args: - cleaned_html (str): The cleaned HTML content. - html (str): The original HTML content. - url (str): The URL of the page. - success (bool): Whether the content was successfully cleaned. - **kwargs: Additional keyword arguments. - - Returns: - Dict[str, Any]: A dictionary containing the generated markdown content. - """ - markdown_generator: Optional[MarkdownGenerationStrategy] = kwargs.get('markdown_generator', DefaultMarkdownGenerator()) - - if markdown_generator: - try: - if kwargs.get('fit_markdown', False) and not markdown_generator.content_filter: - markdown_generator.content_filter = BM25ContentFilter( - user_query=kwargs.get('fit_markdown_user_query', None), - bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0) - ) - - markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown( - cleaned_html=cleaned_html, - base_url=url, - html2text_options=kwargs.get('html2text', {}) - ) - - return { - 'markdown': markdown_result.raw_markdown, - 'fit_markdown': markdown_result.fit_markdown, - 'fit_html': markdown_result.fit_html, - 'markdown_v2': markdown_result - } - except Exception as e: - self._log('error', - message="Error using new markdown generation strategy: {error}", - tag="SCRAPE", - params={"error": str(e)} - ) - markdown_generator = None - return { - 'markdown': f"Error using new markdown generation strategy: {str(e)}", - 'fit_markdown': "Set flag 'fit_markdown' to True to get cleaned HTML content.", - 'fit_html': "Set flag 'fit_markdown' to True to get cleaned HTML content.", - 'markdown_v2': None - } - - # Legacy method - """ - # h = CustomHTML2Text() - # h.update_params(**kwargs.get('html2text', {})) - # markdown = h.handle(cleaned_html) - # markdown = markdown.replace(' ```', '```') - - # fit_markdown = "Set flag 'fit_markdown' to True to get cleaned HTML content." - # fit_html = "Set flag 'fit_markdown' to True to get cleaned HTML content." - - # if kwargs.get('content_filter', None) or kwargs.get('fit_markdown', False): - # content_filter = kwargs.get('content_filter', None) - # if not content_filter: - # content_filter = BM25ContentFilter( - # user_query=kwargs.get('fit_markdown_user_query', None), - # bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0) - # ) - # fit_html = content_filter.filter_content(html) - # fit_html = '\n'.join('