From b8147b64e00629d2effaf025020c2fe5c9842a43 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 18 Oct 2024 13:31:12 +0800 Subject: [PATCH] chore: Bump version to 0.3.71 and improve error handling - Update version number to 0.3.71 - Add sleep_on_close option to AsyncPlaywrightCrawlerStrategy - Enhance context creation with additional options - Improve error message formatting and visibility - Update quickstart documentation --- CHANGELOG.md | 27 +++++++++++++++++++++++++++ crawl4ai/__init__.py | 2 +- crawl4ai/async_crawler_strategy.py | 20 ++++++++++++-------- crawl4ai/async_webcrawler.py | 4 ++-- docs/examples/quickstart.ipynb | 5 ++--- 5 files changed, 44 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8b0513d4..07ca786c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,32 @@ # Changelog +## [v0.3.71] - 2024-10-18 + +### Changes +1. **Version Update**: + - Updated version number from 0.3.8 to 0.3.71. + +2. **Crawler Enhancements**: + - Added `sleep_on_close` option to AsyncPlaywrightCrawlerStrategy for delayed browser closure. + - Improved context creation with additional options: + - Enabled `accept_downloads` and `java_script_enabled`. + - Added a cookie to enable cookies by default. + +3. **Error Handling Improvements**: + - Enhanced error messages in AsyncWebCrawler's `arun` method. + - Updated error reporting format for better visibility and consistency. + +4. **Performance Optimization**: + - Commented out automatic page and context closure in `crawl` method to potentially improve performance in certain scenarios. + +### Documentation +- Updated quickstart notebook: + - Changed installation command to use the released package instead of GitHub repository. + - Updated kernel display name. + +### Developer Notes +- Minor code refactoring and cleanup. 
+ ## [v0.3.7] - 2024-10-17 ### New Features diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index f349fb94..1ecff4b0 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -3,7 +3,7 @@ from .async_webcrawler import AsyncWebCrawler from .models import CrawlResult -__version__ = "0.3.8" +__version__ = "0.3.71" __all__ = [ "AsyncWebCrawler", diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 307dee20..f87b6243 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -80,6 +80,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.verbose = kwargs.get("verbose", False) self.playwright = None self.browser = None + self.sleep_on_close = kwargs.get("sleep_on_close", False) self.hooks = { 'on_browser_created': None, 'on_user_agent_updated': None, @@ -132,6 +133,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await self.execute_hook('on_browser_created', self.browser) async def close(self): + if self.sleep_on_close: + await asyncio.sleep(500) if self.browser: await self.browser.close() self.browser = None @@ -296,8 +299,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): context = await self.browser.new_context( user_agent=self.user_agent, viewport={"width": 1920, "height": 1080}, - proxy={"server": self.proxy} if self.proxy else None + proxy={"server": self.proxy} if self.proxy else None, + accept_downloads=True, + java_script_enabled=True ) + await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}]) await context.set_extra_http_headers(self.headers) page = await context.new_page() self.sessions[session_id] = (context, page, time.time()) @@ -419,8 +425,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): except Exception as e: raise RuntimeError(f"Wait condition failed: {str(e)}") - - # Update image dimensions update_image_dimensions_js = """ () => { @@ -531,11 +535,11 @@ class 
AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): ) return response except Error as e: - raise Error(f"Failed to crawl {url}: {str(e)}") - finally: - if not session_id: - await page.close() - await context.close() + raise Error(f"[ERROR] 🚫 crawl(): Failed to crawl {url}: {str(e)}") + # finally: + # if not session_id: + # await page.close() + # await context.close() async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 76846fe9..9a57048d 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -133,8 +133,8 @@ class AsyncWebCrawler: except Exception as e: if not hasattr(e, "msg"): e.msg = str(e) - print(f"[ERROR] 🚫 Failed to crawl {url}, error: {e.msg}") - return CrawlResult(url=url, html="", success=False, error_message=e.msg) + print(f"[ERROR] 🚫 arun(): Failed to crawl {url}, error: {e.msg}") + return CrawlResult(url=url, html="", markdown = f"[ERROR] 🚫 arun(): Failed to crawl {url}, error: {e.msg}", success=False, error_message=e.msg) async def arun_many( self, diff --git a/docs/examples/quickstart.ipynb b/docs/examples/quickstart.ipynb index 09ad623b..71f23acb 100644 --- a/docs/examples/quickstart.ipynb +++ b/docs/examples/quickstart.ipynb @@ -47,8 +47,7 @@ }, "outputs": [], "source": [ - "# !pip install \"crawl4ai @ git+https://github.com/unclecode/crawl4ai.git\"\n", - "!pip install \"crawl4ai @ git+https://github.com/unclecode/crawl4ai.git@staging\"\n", + "!pip install crawl4ai\n", "!pip install nest-asyncio\n", "!playwright install" ] @@ -714,7 +713,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3", + "display_name": "venv", "language": "python", "name": "python3" },