From 2d31915f0a0b8f1e5cecfaff0514423c20b6daeb Mon Sep 17 00:00:00 2001
From: UncleCode <unclecode@kidocode.com>
Date: Mon, 9 Dec 2024 20:04:59 +0800
Subject: [PATCH] Enhance Async Crawler with storage state handling
 - Updated Async Crawler to support storage state management.
 - Added error handling for URL validation in Async Web Crawler.
 - Modified README logo and improved .gitignore entries.
 - Fixed issues in multiple files for better code robustness.

---
 .gitignore                         |  5 ++-
 README.md                          |  2 +-
 crawl4ai/async_crawler_strategy.py | 54 ++++++++++++++++++++++++------
 crawl4ai/async_webcrawler.py       |  7 +++-
 crawl4ai/extraction_strategy.py    |  2 +-
 crawl4ai/utils.py                  |  1 +
 main.py                            |  2 +-
 7 files changed, 58 insertions(+), 15 deletions(-)
diff --git a/.gitignore b/.gitignore
index 52e25a2a..02c75b3f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -214,4 +214,7 @@ git_issues.md
 todo_executor.md
 protect-all-except-feature.sh
 manage-collab.sh
-publish.sh
\ No newline at end of file
+publish.sh
+
+combine.sh
+combined_output.txt
\ No newline at end of file
diff --git a/README.md b/README.md
index dede4a03..095c595c 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# 🔥🕷️ Crawl4AI: Crawl Smarter, Faster, Freely. For AI.
+# 🚀🤖 Crawl4AI: Crawl Smarter, Faster, Freely. For AI.
 
 <a href="https://trendshift.io/repositories/11716" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11716" alt="unclecode%2Fcrawl4ai | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
 
diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py
index 5c706239..fca0c0ec 100644
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -238,8 +238,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
              
         self.user_agent = kwargs.get(
             "user_agent",
-            # "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47"
-            "Mozilla/5.0 (Linux; Android 11; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.120 Mobile Safari/537.36"
+            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47"
+            # "Mozilla/5.0 (Linux; Android 11; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.120 Mobile Safari/537.36"
         )
         user_agenr_generator = UserAgentGenerator()
         if kwargs.get("user_agent_mode") == "random":
@@ -254,6 +254,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
         self.browser_hint = user_agenr_generator.generate_client_hints(self.user_agent)
         self.headers.setdefault("sec-ch-ua", self.browser_hint)
         self.cookies = kwargs.get("cookies", [])
+        self.storage_state = kwargs.get("storage_state", None)
         self.sessions = {}
         self.session_ttl = 1800 
         self.js_code = js_code
@@ -315,7 +316,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                     # If no default context exists, create one
                     self.default_context = await self.browser.new_context(
                         # viewport={"width": 1920, "height": 1080}
-                        viewport={"width": self.viewport_width, "height": self.viewport_height}
+                        viewport={"width": self.viewport_width, "height": self.viewport_height},
+                        storage_state=self.storage_state,
                     )
                 
                 # Set up the default context
@@ -323,6 +325,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                     await self.default_context.set_extra_http_headers(self.headers)
                     if self.cookies:
                         await self.default_context.add_cookies(self.cookies)                    
+                    if self.storage_state:
+                        # NOTE(review): storage_state() only reads the context's current state; it does not apply self.storage_state to it.
+                        await self.default_context.storage_state(path=None)  # Just ensuring default_context is ready
                     if self.accept_downloads:
                         await self.default_context.set_default_timeout(60000)
                         await self.default_context.set_default_navigation_timeout(60000)
@@ -426,6 +431,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                             self.default_context = self.browser
                         else:
                             self.browser = await self.playwright.chromium.launch(**browser_args)
+                            self.default_context = self.browser
                                 
                 except Exception as e:
                     # Fallback to chromium if Chrome channel fails
@@ -643,6 +649,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                     viewport={"width": self.viewport_width, "height": self.viewport_height},
                     proxy={"server": self.proxy} if self.proxy else None,
                     accept_downloads=self.accept_downloads,
+                    storage_state=self.storage_state,
                     ignore_https_errors=True
                 )
                 
@@ -771,6 +778,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                             proxy={"server": self.proxy} if self.proxy else None,
                             java_script_enabled=True,
                             accept_downloads=self.accept_downloads,
+                            storage_state=self.storage_state,
                             # downloads_path=self.downloads_path if self.accept_downloads else None
                         )
                         await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}])
@@ -792,6 +800,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                         viewport={"width": self.viewport_width, "height": self.viewport_height},
                         proxy={"server": self.proxy} if self.proxy else None,
                         accept_downloads=self.accept_downloads,
+                        storage_state=self.storage_state,
                         ignore_https_errors=True  # Add this line
                     )
                     if self.cookies:
@@ -862,7 +871,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                     return response
 
             if not kwargs.get("js_only", False):
-                await self.execute_hook('before_goto', page, context = context)
+                await self.execute_hook('before_goto', page, context = context, **kwargs)
 
                 try:
                     response = await page.goto(
@@ -874,7 +883,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                 except Error as e:
                     raise RuntimeError(f"Failed on navigating ACS-GOTO :\n{str(e)}")
                 
-                await self.execute_hook('after_goto', page, context = context)
+                await self.execute_hook('after_goto', page, context = context, **kwargs)
                 
                 # Get status code and headers
                 status_code = response.status
@@ -929,9 +938,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
             # CONTENT LOADING ASSURANCE
             if not self.text_only and (kwargs.get("wait_for_images", True) or kwargs.get("adjust_viewport_to_content", False)):
                 # Wait for network idle after initial load and images to load
-                await page.wait_for_load_state("networkidle")
+                # await page.wait_for_load_state("networkidle")
+                await page.wait_for_load_state("domcontentloaded")
                 await asyncio.sleep(0.1)
-                await page.wait_for_function("Array.from(document.images).every(img => img.complete)")
+                from playwright.async_api import TimeoutError as PlaywrightTimeoutError
+                try:
+                    await page.wait_for_function("Array.from(document.images).every(img => img.complete)", timeout=1000)
+                # Check for TimeoutError and ignore it
+                except PlaywrightTimeoutError:
+                    pass
             
             # After initial load, adjust viewport to content size
             if not self.text_only and kwargs.get("adjust_viewport_to_content", False):
@@ -1015,7 +1030,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                 # await page.wait_for_timeout(100)
                 
                 # Check for on execution event
-                await self.execute_hook('on_execution_started', page, context = context)
+                await self.execute_hook('on_execution_started', page, context = context, **kwargs)
                 
             if kwargs.get("simulate_user", False) or kwargs.get("magic", False):
                 # Simulate user interactions
@@ -1119,7 +1134,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
             if kwargs.get("process_iframes", False):
                 page = await self.process_iframes(page)
             
-            await self.execute_hook('before_retrieve_html', page, context = context)
+            await self.execute_hook('before_retrieve_html', page, context = context, **kwargs)
             # Check if delay_before_return_html is set then wait for that time
             delay_before_return_html = kwargs.get("delay_before_return_html", 0.1)
             if delay_before_return_html:
@@ -1130,7 +1145,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                 await self.remove_overlay_elements(page)
             
             html = await page.content()
-            await self.execute_hook('before_return_html', page, html, context = context)
+            await self.execute_hook('before_return_html', page, html, context = context, **kwargs)
             
             # Check if kwargs has screenshot=True then take screenshot
             screenshot_data = None
@@ -1394,6 +1409,25 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
             return base64.b64encode(buffered.getvalue()).decode('utf-8')
         finally:
             await page.close()
+     
+    async def export_storage_state(self, path: str = None) -> dict:
+        """
+        Exports the current storage state (cookies and localStorage) of the default
+        context and returns it; if a path is given, it is also saved there as JSON.
+        """
+        if self.default_context:
+            state = await self.default_context.storage_state(path=path)
+            self.logger.info(
+                message="Exported storage state to {path}",
+                tag="INFO",
+                params={"path": path}
+            )
+            return state
+        else:
+            self.logger.warning(
+                message="No default_context available to export storage state.",
+                tag="WARNING"
+            )
             
     async def _generate_screenshot_from_html(self, html: str) -> Optional[str]:
         """
diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index 2c17602d..b872c20c 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -182,6 +182,10 @@ class AsyncWebCrawler:
         Returns:
             CrawlResult: The result of crawling and processing
         """
+        # Reject anything that is not a non-empty string before acquiring the lock
+        if not isinstance(url, str) or not url:
+            raise ValueError("Invalid URL, make sure the URL is a non-empty string")
+        
         async with self._lock or nullcontext():
             try:
                 # Handle deprecated parameters
@@ -335,7 +339,8 @@ class AsyncWebCrawler:
                 # print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | {e.msg}{Style.RESET_ALL}")
                 
                 self.logger.error_status(
-                    url=cache_context.display_url,
+                    # url=cache_context.display_url,
+                    url=url,
                     error=create_box_message(e.msg, type = "error"),
                     tag="ERROR"
                 )            
diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py
index b79e0c43..a778bf4d 100644
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -632,7 +632,7 @@ class ContentSummarizationStrategy(ExtractionStrategy):
         # Sort summaries by the original section index to maintain order
         summaries.sort(key=lambda x: x[0])
         return [summary for _, summary in summaries]
-  
+ 
 class JsonCssExtractionStrategy(ExtractionStrategy):
     def __init__(self, schema: Dict[str, Any], **kwargs):
         super().__init__(**kwargs)
diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index 0a9e6f56..879ba562 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -147,6 +147,7 @@ class CustomHTML2Text(HTML2Text):
     #         self.preserved_content.append(data)
     #         return
     #     super().handle_data(data, entity_char)
+
 class InvalidCSSSelectorError(Exception):
     pass
 
diff --git a/main.py b/main.py
index d6c792e8..21d3de16 100644
--- a/main.py
+++ b/main.py
@@ -342,7 +342,7 @@ app.add_middleware(
 
 # API token security
 security = HTTPBearer()
-CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN") or "test_api_code"
+CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN")
 
 async def verify_token(credentials: HTTPAuthorizationCredentials = Security(security)):
     if not CRAWL4AI_API_TOKEN: