feat(crawler): add network request and console message capturing
Implement comprehensive network request and console message capturing functionality:

- Add `capture_network_requests` and `capture_console_messages` config parameters
- Add `network_requests` and `console_messages` fields to models
- Implement Playwright event listeners to capture requests, responses, and console output
- Create detailed documentation and examples
- Add comprehensive tests

This feature enables deep visibility into web page activity for debugging, security analysis, performance profiling, and API discovery in web applications.
This commit is contained in:
46
tests/general/test_deep_crawl.py
Normal file
46
tests/general/test_deep_crawl.py
Normal file
@@ -0,0 +1,46 @@
|
||||
import asyncio
|
||||
import time
|
||||
|
||||
|
||||
from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
|
||||
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
|
||||
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
|
||||
# from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
|
||||
|
||||
|
||||
async def main():
    """Run an example BFS deep crawl of the crawl4ai documentation site.

    Exercises the same configuration in two modes: batch (all results
    collected before reporting) and streaming (each result printed as it
    arrives), timing each pass with ``time.perf_counter``.
    """
    run_config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=2,             # follow links at most two levels below the seed URL
            include_external=False,  # do not leave the seed's domain
        ),
        stream=False,                # batch mode first; flipped to streaming below
        verbose=True,
        cache_mode=CacheMode.BYPASS,
        scraping_strategy=LXMLWebScrapingStrategy(),
    )

    async with AsyncWebCrawler() as crawler:
        # --- Batch mode: arun() resolves to the complete result list. ---
        t0 = time.perf_counter()
        print("\nStarting deep crawl in batch mode:")
        results = await crawler.arun(
            url="https://docs.crawl4ai.com",
            config=run_config,
        )
        print(f"Crawled {len(results)} pages")
        print(f"Example page: {results[0].url}")
        print(f"Duration: {time.perf_counter() - t0:.2f} seconds\n")

        # --- Streaming mode: same crawl, results yielded incrementally. ---
        print("Starting deep crawl in streaming mode:")
        run_config.stream = True
        t0 = time.perf_counter()
        async for result in await crawler.arun(
            url="https://docs.crawl4ai.com",
            config=run_config,
        ):
            print(f"→ {result.url} (Depth: {result.metadata.get('depth', 0)})")
        print(f"Duration: {time.perf_counter() - t0:.2f} seconds")


if __name__ == "__main__":
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user