From b8147b64e00629d2effaf025020c2fe5c9842a43 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 18 Oct 2024 13:31:12 +0800 Subject: [PATCH] chore: Bump version to 0.3.71 and improve error handling - Update version number to 0.3.71 - Add sleep_on_close option to AsyncPlaywrightCrawlerStrategy - Enhance context creation with additional options - Improve error message formatting and visibility - Update quickstart documentation --- CHANGELOG.md | 27 +++++++++++++++++++++++++++ crawl4ai/__init__.py | 2 +- crawl4ai/async_crawler_strategy.py | 20 ++++++++++++-------- crawl4ai/async_webcrawler.py | 4 ++-- docs/examples/quickstart.ipynb | 5 ++--- 5 files changed, 44 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8b0513d4..07ca786c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,32 @@ # Changelog +## [v0.3.71] - 2024-10-18 + +### Changes +1. **Version Update**: + - Updated version number from 0.3.8 to 0.3.71. + +2. **Crawler Enhancements**: + - Added `sleep_on_close` option to AsyncPlaywrightCrawlerStrategy for delayed browser closure. + - Improved context creation with additional options: + - Enabled `accept_downloads` and `java_script_enabled`. + - Added a cookie to enable cookies by default. + +3. **Error Handling Improvements**: + - Enhanced error messages in AsyncWebCrawler's `arun` method. + - Updated error reporting format for better visibility and consistency. + +4. **Performance Optimization**: + - Commented out automatic page and context closure in `crawl` method to potentially improve performance in certain scenarios. + +### Documentation +- Updated quickstart notebook: + - Changed installation command to use the released package instead of GitHub repository. + - Updated kernel display name. + +### Developer Notes +- Minor code refactoring and cleanup. 
+ ## [v0.3.7] - 2024-10-17 ### New Features diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index f349fb94..1ecff4b0 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -3,7 +3,7 @@ from .async_webcrawler import AsyncWebCrawler from .models import CrawlResult -__version__ = "0.3.8" +__version__ = "0.3.71" __all__ = [ "AsyncWebCrawler", diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 307dee20..f87b6243 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -80,6 +80,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.verbose = kwargs.get("verbose", False) self.playwright = None self.browser = None + self.sleep_on_close = kwargs.get("sleep_on_close", False) self.hooks = { 'on_browser_created': None, 'on_user_agent_updated': None, @@ -132,6 +133,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await self.execute_hook('on_browser_created', self.browser) async def close(self): + if self.sleep_on_close: + await asyncio.sleep(500) if self.browser: await self.browser.close() self.browser = None @@ -296,8 +299,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): context = await self.browser.new_context( user_agent=self.user_agent, viewport={"width": 1920, "height": 1080}, - proxy={"server": self.proxy} if self.proxy else None + proxy={"server": self.proxy} if self.proxy else None, + accept_downloads=True, + java_script_enabled=True ) + await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}]) await context.set_extra_http_headers(self.headers) page = await context.new_page() self.sessions[session_id] = (context, page, time.time()) @@ -419,8 +425,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): except Exception as e: raise RuntimeError(f"Wait condition failed: {str(e)}") - - # Update image dimensions update_image_dimensions_js = """ () => { @@ -531,11 +535,11 @@ class 
AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): ) return response except Error as e: - raise Error(f"Failed to crawl {url}: {str(e)}") - finally: - if not session_id: - await page.close() - await context.close() + raise Error(f"[ERROR] 🚫 crawl(): Failed to crawl {url}: {str(e)}") + # finally: + # if not session_id: + # await page.close() + # await context.close() async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 76846fe9..9a57048d 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -133,8 +133,8 @@ class AsyncWebCrawler: except Exception as e: if not hasattr(e, "msg"): e.msg = str(e) - print(f"[ERROR] 🚫 Failed to crawl {url}, error: {e.msg}") - return CrawlResult(url=url, html="", success=False, error_message=e.msg) + print(f"[ERROR] 🚫 arun(): Failed to crawl {url}, error: {e.msg}") + return CrawlResult(url=url, html="", markdown = f"[ERROR] 🚫 arun(): Failed to crawl {url}, error: {e.msg}", success=False, error_message=e.msg) async def arun_many( self, diff --git a/docs/examples/quickstart.ipynb b/docs/examples/quickstart.ipynb index 09ad623b..71f23acb 100644 --- a/docs/examples/quickstart.ipynb +++ b/docs/examples/quickstart.ipynb @@ -47,8 +47,7 @@ }, "outputs": [], "source": [ - "# !pip install \"crawl4ai @ git+https://github.com/unclecode/crawl4ai.git\"\n", - "!pip install \"crawl4ai @ git+https://github.com/unclecode/crawl4ai.git@staging\"\n", + "!pip install crawl4ai\n", "!pip install nest-asyncio\n", "!playwright install" ] @@ -714,7 +713,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3", + "display_name": "venv", "language": "python", "name": "python3" },