feat(crawler): add network request and console message capturing
Implement comprehensive network request and console message capturing functionality: - Add capture_network_requests and capture_console_messages config parameters - Add network_requests and console_messages fields to models - Implement Playwright event listeners to capture requests, responses, and console output - Create detailed documentation and examples - Add comprehensive tests This feature enables deep visibility into web page activity for debugging, security analysis, performance profiling, and API discovery in web applications.
This commit is contained in:
85
tests/general/test_cache_context.py
Normal file
85
tests/general/test_cache_context.py
Normal file
@@ -0,0 +1,85 @@
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
from playwright.async_api import Page, BrowserContext
|
||||
|
||||
async def test_reuse_context_by_config():
|
||||
# We will store each context ID in these maps to confirm reuse
|
||||
context_ids_for_A = []
|
||||
context_ids_for_B = []
|
||||
|
||||
# Create a small hook to track context creation
|
||||
async def on_page_context_created(page: Page, context: BrowserContext, config: CrawlerRunConfig, **kwargs):
|
||||
c_id = id(context)
|
||||
print(f"[HOOK] on_page_context_created - Context ID: {c_id}")
|
||||
# Distinguish which config we used by checking a custom hook param
|
||||
config_label = config.shared_data.get("config_label", "unknown")
|
||||
if config_label == "A":
|
||||
context_ids_for_A.append(c_id)
|
||||
elif config_label == "B":
|
||||
context_ids_for_B.append(c_id)
|
||||
return page
|
||||
|
||||
# Browser config - Headless, verbose so we see logs
|
||||
browser_config = BrowserConfig(headless=True, verbose=True)
|
||||
|
||||
# Two crawler run configs that differ (for example, text_mode):
|
||||
configA = CrawlerRunConfig(
|
||||
only_text=True,
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
wait_until="domcontentloaded",
|
||||
shared_data = {
|
||||
"config_label" : "A"
|
||||
}
|
||||
)
|
||||
configB = CrawlerRunConfig(
|
||||
only_text=False,
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
wait_until="domcontentloaded",
|
||||
shared_data = {
|
||||
"config_label" : "B"
|
||||
}
|
||||
)
|
||||
|
||||
# Create the crawler
|
||||
crawler = AsyncWebCrawler(config=browser_config)
|
||||
|
||||
# Attach our custom hook
|
||||
# Note: "on_page_context_created" will be called each time a new context+page is generated
|
||||
crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created)
|
||||
|
||||
# Start the crawler (launches the browser)
|
||||
await crawler.start()
|
||||
|
||||
# For demonstration, we’ll crawl a benign site multiple times with each config
|
||||
test_url = "https://example.com"
|
||||
print("\n--- Crawling with config A (text_mode=True) ---")
|
||||
for _ in range(2):
|
||||
# Pass an extra kwarg to the hook so we know which config is being used
|
||||
await crawler.arun(test_url, config=configA)
|
||||
|
||||
print("\n--- Crawling with config B (text_mode=False) ---")
|
||||
for _ in range(2):
|
||||
await crawler.arun(test_url, config=configB)
|
||||
|
||||
# Close the crawler (shuts down the browser, closes contexts)
|
||||
await crawler.close()
|
||||
|
||||
# Validate and show the results
|
||||
print("\n=== RESULTS ===")
|
||||
print(f"Config A context IDs: {context_ids_for_A}")
|
||||
print(f"Config B context IDs: {context_ids_for_B}")
|
||||
if len(set(context_ids_for_A)) == 1:
|
||||
print("✅ All config A crawls used the SAME BrowserContext.")
|
||||
else:
|
||||
print("❌ Config A crawls created multiple contexts unexpectedly.")
|
||||
if len(set(context_ids_for_B)) == 1:
|
||||
print("✅ All config B crawls used the SAME BrowserContext.")
|
||||
else:
|
||||
print("❌ Config B crawls created multiple contexts unexpectedly.")
|
||||
if set(context_ids_for_A).isdisjoint(context_ids_for_B):
|
||||
print("✅ Config A context is different from Config B context.")
|
||||
else:
|
||||
print("❌ A and B ended up sharing the same context somehow!")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(test_reuse_context_by_config())
|
||||
Reference in New Issue
Block a user