From 206a9dfabdcd4a292f49bcd818569497c78d1886 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 8 May 2025 17:13:35 +0800 Subject: [PATCH] feat(crawler): add session management and view-source support Add session_id feature to allow reusing browser pages across multiple crawls. Add support for view-source: protocol in URL handling. Fix browser config reference and string formatting issues. Update examples to demonstrate new session management features. BREAKING CHANGE: Browser page handling now persists when using session_id --- crawl4ai/async_crawler_strategy.py | 4 +-- crawl4ai/browser_profiler.py | 6 ++--- docs/examples/hello_world.py | 10 +++++--- docs/examples/session_id_example.py | 38 +++++++++++++++++++++++++++++ 4 files changed, 49 insertions(+), 9 deletions(-) create mode 100644 docs/examples/session_id_example.py diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 0d62c7b5..153205db 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -441,7 +441,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): status_code = 200 # Default for local/raw HTML screenshot_data = None - if url.startswith(("http://", "https://")): + if url.startswith(("http://", "https://", "view-source:")): return await self._crawl_web(url, config) elif url.startswith("file://"): @@ -784,7 +784,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): except Error: visibility_info = await self.check_visibility(page) - if self.config.verbose: + if self.browser_config.config.verbose: self.logger.debug( message="Body visibility info: {info}", tag="DEBUG", diff --git a/crawl4ai/browser_profiler.py b/crawl4ai/browser_profiler.py index 5a95b25d..961ba740 100644 --- a/crawl4ai/browser_profiler.py +++ b/crawl4ai/browser_profiler.py @@ -128,7 +128,7 @@ class BrowserProfiler: os.makedirs(profile_path, exist_ok=True) # Print instructions for the user with rich formatting - border = "{'='*80}" + border 
= f"{'='*80}" self.logger.info("{border}", tag="PROFILE", params={"border": f"\n{border}"}, colors={"border": LogColor.CYAN}) self.logger.info("Creating browser profile: {profile_name}", tag="PROFILE", params={"profile_name": profile_name}, colors={"profile_name": LogColor.GREEN}) self.logger.info("Profile directory: {profile_path}", tag="PROFILE", params={"profile_path": profile_path}, colors={"profile_path": LogColor.YELLOW}) @@ -607,8 +607,8 @@ class BrowserProfiler: os.makedirs(profile_path, exist_ok=True) # Print initial information - border = f"{Fore.CYAN}{'='*80}{Style.RESET_ALL}" - self.logger.info(f"\n{border}", tag="CDP") + border = f"{'='*80}" + self.logger.info("{border}", tag="CDP", params={"border": border}, colors={"border": LogColor.CYAN}) self.logger.info("Launching standalone browser with CDP debugging", tag="CDP") self.logger.info("Browser type: {browser_type}", tag="CDP", params={"browser_type": browser_type}, colors={"browser_type": LogColor.CYAN}) self.logger.info("Profile path: {profile_path}", tag="CDP", params={"profile_path": profile_path}, colors={"profile_path": LogColor.YELLOW}) diff --git a/docs/examples/hello_world.py b/docs/examples/hello_world.py index b9f1b328..2ba2e852 100644 --- a/docs/examples/hello_world.py +++ b/docs/examples/hello_world.py @@ -8,17 +8,19 @@ from crawl4ai import ( CrawlResult ) - -async def main(): - browser_config = BrowserConfig(headless=True, verbose=True) +async def main(): + browser_config = BrowserConfig( + headless=False, + verbose=True, + ) async with AsyncWebCrawler(config=browser_config) as crawler: crawler_config = CrawlerRunConfig( markdown_generator=DefaultMarkdownGenerator( content_filter=PruningContentFilter() ), ) - result : CrawlResult = await crawler.arun( + result: CrawlResult = await crawler.arun( url="https://www.helloworld.org", config=crawler_config ) print(result.markdown.raw_markdown[:500]) diff --git a/docs/examples/session_id_example.py b/docs/examples/session_id_example.py new file 
mode 100644 index 00000000..e49b7819 --- /dev/null +++ b/docs/examples/session_id_example.py @@ -0,0 +1,38 @@ +import asyncio +from crawl4ai import ( + AsyncWebCrawler, + BrowserConfig, + CrawlerRunConfig, + DefaultMarkdownGenerator, + PruningContentFilter, + CrawlResult +) + + + +async def main(): + browser_config = BrowserConfig( + headless=False, + verbose=True, + ) + async with AsyncWebCrawler(config=browser_config) as crawler: + crawler_config = CrawlerRunConfig( + session_id= "hello_world", # This helps us reuse the same page across crawls + ) + result : CrawlResult = await crawler.arun( + url="https://www.helloworld.org", config=crawler_config + ) + # Add a breakpoint here, then you will see the page is open and the browser is not closed + print(result.markdown.raw_markdown[:500]) + + new_config = crawler_config.clone(js_code=["(() => ({'data':'hello'}))()"], js_only=True) + result : CrawlResult = await crawler.arun( # This time there is no fetch and this only executes JS in the same opened page + url="https://www.helloworld.org", config= new_config + ) + print(result.js_execution_result) # You should see {'data':'hello'} in the console + + # Get direct access to the Playwright page object. This works only if you use the same session_id and pass the same config + page, context = crawler.crawler_strategy.get_page(new_config) + +if __name__ == "__main__": + asyncio.run(main())