feat(crawler): add session management and view-source support
Add session_id feature to allow reusing browser pages across multiple crawls. Add support for view-source: protocol in URL handling. Fix browser config reference and string formatting issues. Update examples to demonstrate new session management features. BREAKING CHANGE: Browser page handling now persists when using session_id
This commit is contained in:
@@ -441,7 +441,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
status_code = 200 # Default for local/raw HTML
|
status_code = 200 # Default for local/raw HTML
|
||||||
screenshot_data = None
|
screenshot_data = None
|
||||||
|
|
||||||
if url.startswith(("http://", "https://")):
|
if url.startswith(("http://", "https://", "view-source:")):
|
||||||
return await self._crawl_web(url, config)
|
return await self._crawl_web(url, config)
|
||||||
|
|
||||||
elif url.startswith("file://"):
|
elif url.startswith("file://"):
|
||||||
@@ -784,7 +784,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
except Error:
|
except Error:
|
||||||
visibility_info = await self.check_visibility(page)
|
visibility_info = await self.check_visibility(page)
|
||||||
|
|
||||||
if self.config.verbose:
|
if self.browser_config.config.verbose:
|
||||||
self.logger.debug(
|
self.logger.debug(
|
||||||
message="Body visibility info: {info}",
|
message="Body visibility info: {info}",
|
||||||
tag="DEBUG",
|
tag="DEBUG",
|
||||||
|
|||||||
@@ -128,7 +128,7 @@ class BrowserProfiler:
|
|||||||
os.makedirs(profile_path, exist_ok=True)
|
os.makedirs(profile_path, exist_ok=True)
|
||||||
|
|
||||||
# Print instructions for the user with rich formatting
|
# Print instructions for the user with rich formatting
|
||||||
border = "{'='*80}"
|
border = f"{'='*80}"
|
||||||
self.logger.info("{border}", tag="PROFILE", params={"border": f"\n{border}"}, colors={"border": LogColor.CYAN})
|
self.logger.info("{border}", tag="PROFILE", params={"border": f"\n{border}"}, colors={"border": LogColor.CYAN})
|
||||||
self.logger.info("Creating browser profile: {profile_name}", tag="PROFILE", params={"profile_name": profile_name}, colors={"profile_name": LogColor.GREEN})
|
self.logger.info("Creating browser profile: {profile_name}", tag="PROFILE", params={"profile_name": profile_name}, colors={"profile_name": LogColor.GREEN})
|
||||||
self.logger.info("Profile directory: {profile_path}", tag="PROFILE", params={"profile_path": profile_path}, colors={"profile_path": LogColor.YELLOW})
|
self.logger.info("Profile directory: {profile_path}", tag="PROFILE", params={"profile_path": profile_path}, colors={"profile_path": LogColor.YELLOW})
|
||||||
@@ -607,8 +607,8 @@ class BrowserProfiler:
|
|||||||
os.makedirs(profile_path, exist_ok=True)
|
os.makedirs(profile_path, exist_ok=True)
|
||||||
|
|
||||||
# Print initial information
|
# Print initial information
|
||||||
border = f"{Fore.CYAN}{'='*80}{Style.RESET_ALL}"
|
border = f"{'='*80}"
|
||||||
self.logger.info(f"\n{border}", tag="CDP")
|
self.logger.info("{border}", tag="CDP", params={"border": border}, colors={"border": LogColor.CYAN})
|
||||||
self.logger.info("Launching standalone browser with CDP debugging", tag="CDP")
|
self.logger.info("Launching standalone browser with CDP debugging", tag="CDP")
|
||||||
self.logger.info("Browser type: {browser_type}", tag="CDP", params={"browser_type": browser_type}, colors={"browser_type": LogColor.CYAN})
|
self.logger.info("Browser type: {browser_type}", tag="CDP", params={"browser_type": browser_type}, colors={"browser_type": LogColor.CYAN})
|
||||||
self.logger.info("Profile path: {profile_path}", tag="CDP", params={"profile_path": profile_path}, colors={"profile_path": LogColor.YELLOW})
|
self.logger.info("Profile path: {profile_path}", tag="CDP", params={"profile_path": profile_path}, colors={"profile_path": LogColor.YELLOW})
|
||||||
|
|||||||
@@ -9,16 +9,18 @@ from crawl4ai import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
browser_config = BrowserConfig(headless=True, verbose=True)
|
browser_config = BrowserConfig(
|
||||||
|
headless=False,
|
||||||
|
verbose=True,
|
||||||
|
)
|
||||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
crawler_config = CrawlerRunConfig(
|
crawler_config = CrawlerRunConfig(
|
||||||
markdown_generator=DefaultMarkdownGenerator(
|
markdown_generator=DefaultMarkdownGenerator(
|
||||||
content_filter=PruningContentFilter()
|
content_filter=PruningContentFilter()
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
result : CrawlResult = await crawler.arun(
|
result: CrawlResult = await crawler.arun(
|
||||||
url="https://www.helloworld.org", config=crawler_config
|
url="https://www.helloworld.org", config=crawler_config
|
||||||
)
|
)
|
||||||
print(result.markdown.raw_markdown[:500])
|
print(result.markdown.raw_markdown[:500])
|
||||||
|
|||||||
38
docs/examples/session_id_example.py
Normal file
38
docs/examples/session_id_example.py
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
import asyncio

from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CrawlerRunConfig,
    DefaultMarkdownGenerator,
    PruningContentFilter,
    CrawlResult,
)


async def main():
    """Demonstrate session reuse via ``session_id``.

    With a ``session_id`` set on the run config, the browser page persists
    across crawls: a later ``arun`` with ``js_only=True`` executes JS in the
    already-open page instead of fetching again.
    """
    browser_config = BrowserConfig(
        headless=False,  # keep the browser visible so the persistent page can be observed
        verbose=True,
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        crawler_config = CrawlerRunConfig(
            session_id="hello_world",  # this lets us reuse the same page across crawls
        )
        result: CrawlResult = await crawler.arun(
            url="https://www.helloworld.org", config=crawler_config
        )
        # Add a breakpoint here and you will see the page stays open —
        # the browser is not closed because the session is kept alive.
        print(result.markdown.raw_markdown[:500])

        new_config = crawler_config.clone(
            js_code=["(() => ({'data':'hello'}))()"], js_only=True
        )
        # This time there is no fetch; the JS runs in the same opened page.
        result: CrawlResult = await crawler.arun(
            url="https://www.helloworld.org", config=new_config
        )
        print(result.js_execution_result)  # You should see {'data':'hello'} in the console

        # Get direct access to the Playwright page object. This works only if
        # you use the same session_id and pass the same config.
        page, context = crawler.crawler_strategy.get_page(new_config)


if __name__ == "__main__":
    asyncio.run(main())
|
||||||
Reference in New Issue
Block a user