From 206a9dfabdcd4a292f49bcd818569497c78d1886 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 8 May 2025 17:13:35 +0800 Subject: [PATCH] feat(crawler): add session management and view-source support Add session_id feature to allow reusing browser pages across multiple crawls. Add support for view-source: protocol in URL handling. Fix browser config reference and string formatting issues. Update examples to demonstrate new session management features. BREAKING CHANGE: Browser page handling now persists when using session_id --- crawl4ai/async_crawler_strategy.py | 4 +-- crawl4ai/browser_profiler.py | 6 ++--- docs/examples/hello_world.py | 10 +++++--- docs/examples/session_id_example.py | 38 +++++++++++++++++++++++++++++ 4 files changed, 49 insertions(+), 9 deletions(-) create mode 100644 docs/examples/session_id_example.py diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 0d62c7b5..153205db 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -441,7 +441,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): status_code = 200 # Default for local/raw HTML screenshot_data = None - if url.startswith(("http://", "https://")): + if url.startswith(("http://", "https://", "view-source:")): return await self._crawl_web(url, config) elif url.startswith("file://"): @@ -784,7 +784,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): except Error: visibility_info = await self.check_visibility(page) - if self.config.verbose: + if self.browser_config.config.verbose: self.logger.debug( message="Body visibility info: {info}", tag="DEBUG", diff --git a/crawl4ai/browser_profiler.py b/crawl4ai/browser_profiler.py index 5a95b25d..961ba740 100644 --- a/crawl4ai/browser_profiler.py +++ b/crawl4ai/browser_profiler.py @@ -128,7 +128,7 @@ class BrowserProfiler: os.makedirs(profile_path, exist_ok=True) # Print instructions for the user with rich formatting - border = "{'='*80}" + border 
= f"{'='*80}" self.logger.info("{border}", tag="PROFILE", params={"border": f"\n{border}"}, colors={"border": LogColor.CYAN}) self.logger.info("Creating browser profile: {profile_name}", tag="PROFILE", params={"profile_name": profile_name}, colors={"profile_name": LogColor.GREEN}) self.logger.info("Profile directory: {profile_path}", tag="PROFILE", params={"profile_path": profile_path}, colors={"profile_path": LogColor.YELLOW}) @@ -607,8 +607,8 @@ class BrowserProfiler: os.makedirs(profile_path, exist_ok=True) # Print initial information - border = f"{Fore.CYAN}{'='*80}{Style.RESET_ALL}" - self.logger.info(f"\n{border}", tag="CDP") + border = f"{'='*80}" + self.logger.info("{border}", tag="CDP", params={"border": border}, colors={"border": LogColor.CYAN}) self.logger.info("Launching standalone browser with CDP debugging", tag="CDP") self.logger.info("Browser type: {browser_type}", tag="CDP", params={"browser_type": browser_type}, colors={"browser_type": LogColor.CYAN}) self.logger.info("Profile path: {profile_path}", tag="CDP", params={"profile_path": profile_path}, colors={"profile_path": LogColor.YELLOW}) diff --git a/docs/examples/hello_world.py b/docs/examples/hello_world.py index b9f1b328..2ba2e852 100644 --- a/docs/examples/hello_world.py +++ b/docs/examples/hello_world.py @@ -8,17 +8,19 @@ from crawl4ai import ( CrawlResult ) - -async def main(): - browser_config = BrowserConfig(headless=True, verbose=True) +async def main(): + browser_config = BrowserConfig( + headless=False, + verbose=True, + ) async with AsyncWebCrawler(config=browser_config) as crawler: crawler_config = CrawlerRunConfig( markdown_generator=DefaultMarkdownGenerator( content_filter=PruningContentFilter() ), ) - result : CrawlResult = await crawler.arun( + result: CrawlResult = await crawler.arun( url="https://www.helloworld.org", config=crawler_config ) print(result.markdown.raw_markdown[:500]) diff --git a/docs/examples/session_id_example.py b/docs/examples/session_id_example.py new file 
mode 100644 index 00000000..e49b7819 --- /dev/null +++ b/docs/examples/session_id_example.py @@ -0,0 +1,38 @@ +import asyncio +from crawl4ai import ( + AsyncWebCrawler, + BrowserConfig, + CrawlerRunConfig, + DefaultMarkdownGenerator, + PruningContentFilter, + CrawlResult +) + + + +async def main(): + browser_config = BrowserConfig( + headless=False, + verbose=True, + ) + async with AsyncWebCrawler(config=browser_config) as crawler: + crawler_config = CrawlerRunConfig( + session_id= "hello_world", # This helps us reuse the same page across crawls + ) + result : CrawlResult = await crawler.arun( + url="https://www.helloworld.org", config=crawler_config + ) + # Add a breakpoint here, then you will see the page is open and the browser is not closed + print(result.markdown.raw_markdown[:500]) + + new_config = crawler_config.clone(js_code=["(() => ({'data':'hello'}))()"], js_only=True) + result : CrawlResult = await crawler.arun( # This time there is no fetch and this only executes JS in the same opened page + url="https://www.helloworld.org", config= new_config + ) + print(result.js_execution_result) # You should see {'data':'hello'} in the console + + # Get direct access to the Playwright page object. This works only if you use the same session_id and pass the same config + page, context = crawler.crawler_strategy.get_page(new_config) + +if __name__ == "__main__": + asyncio.run(main())