feat(mcp): Implement MCP protocol and enhance server capabilities

This commit introduces several significant enhancements to the Crawl4AI Docker deployment: 1. Add MCP Protocol Support: - Implement WebSocket and SSE transport layers for MCP server communication - Create mcp_bridge.py to expose existing API endpoints via MCP protocol - Add comprehensive tests for both socket and SSE transport methods 2. Enhance Docker Server Capabilities: - Add PDF generation endpoint with file saving functionality - Add screenshot capture endpoint with configurable wait time - Implement JavaScript execution endpoint for dynamic page interaction - Add intelligent file path handling for saving generated assets 3. Improve Search and Context Functionality: - Implement syntax-aware code function chunking using AST parsing - Add BM25-based intelligent document search with relevance scoring - Create separate code and documentation context endpoints - Enhance response format with structured results and scores 4. Rename and Fix File Organization: - Fix typo in test_docker_config_gen.py filename - Update import statements and dependencies - Add FileResponse for context endpoints This enhancement significantly improves the machine-to-machine communication capabilities of Crawl4AI, making it more suitable for integration with LLM agents and other automated systems. The CHANGELOG update has been applied successfully, highlighting the key features and improvements made in this release. The commit message provides a detailed explanation of all the changes, which will be helpful for tracking the project's evolution.
2025-04-21 22:22:02 +08:00
parent a58c8000aa
commit 5297e362f3
9 changed files with 21327 additions and 30 deletions
--- a/tests/mcp/test_mcp_socket.py
+++ b/tests/mcp/test_mcp_socket.py
@@ -0,0 +1,119 @@
+# pip install "mcp-sdk[ws]" anyio
+import anyio, json
+from mcp.client.websocket import websocket_client
+from mcp.client.session import ClientSession
+
+async def test_list():
+    async with websocket_client("ws://localhost:8020/mcp/ws") as (r, w):
+        async with ClientSession(r, w) as s:
+            await s.initialize()
+
+            print("tools      :", [t.name for t in (await s.list_tools()).tools])
+            print("resources  :", [r.name for r in (await s.list_resources()).resources])
+            print("templates  :", [t.name for t in (await s.list_resource_templates()).resource_templates])
+
+
+async def test_crawl(s: ClientSession) -> None:
+    """Hit the @mcp_tool('crawl') endpoint."""
+    res = await s.call_tool(
+        "crawl",
+        {
+            "urls": ["https://example.com"],
+            "browser_config": {},
+            "crawler_config": {},
+        },
+    )
+    print("crawl →", json.loads(res.content[0].text))
+
+
+async def test_md(s: ClientSession) -> None:
+    """Hit the @mcp_tool('md') endpoint."""
+    res = await s.call_tool(
+        "md",
+        {
+            "url": "https://example.com",
+            "f": "fit",   # or RAW, BM25, LLM
+            "q": None,
+            "c": "0",
+        },
+    )
+    result = json.loads(res.content[0].text)
+    print("md →", result['markdown'][:100], "...")
+
+async def test_screenshot(s: ClientSession):
+    res = await s.call_tool(
+        "screenshot",
+        {
+            "url": "https://example.com",
+            "screenshot_wait_for": 1.0,
+        },
+    )
+    png_b64 = json.loads(res.content[0].text)["screenshot"]
+    print("screenshot →", png_b64[:60], "… (base64)")
+
+
+async def test_pdf(s: ClientSession):
+    res = await s.call_tool(
+        "pdf",
+        {
+            "url": "https://example.com",
+        },
+    )
+    pdf_b64 = json.loads(res.content[0].text)["pdf"]
+    print("pdf →", pdf_b64[:60], "… (base64)")
+
+async def test_execute_js(s: ClientSession):
+    # click the “More” link on Hacker News front page and wait 1 s
+    res = await s.call_tool(
+        "execute_js",
+        {
+            "url": "https://news.ycombinator.com/news",
+            "js_code": [
+                "await page.click('a.morelink')",
+                "await page.waitForTimeout(1000)",
+            ],
+        },
+    )
+    crawl_result = json.loads(res.content[0].text)
+    print("execute_js → status", crawl_result["success"], "| html len:", len(crawl_result["html"]))
+    
+async def test_html(s: ClientSession):
+    # click the “More” link on Hacker News front page and wait 1 s
+    res = await s.call_tool(
+        "html",
+        {
+            "url": "https://news.ycombinator.com/news",
+        },
+    )
+    crawl_result = json.loads(res.content[0].text)
+    print("execute_js → status", crawl_result["success"], "| html len:", len(crawl_result["html"]))    
+    
+async def test_context(s: ClientSession):
+    # click the “More” link on Hacker News front page and wait 1 s
+    res = await s.call_tool(
+        "ask",
+        {
+            "query": "I hv a question about Crawl4ai library, how to extract internal links when crawling a page?"
+        },
+    )
+    crawl_result = json.loads(res.content[0].text)
+    print("execute_js → status", crawl_result["success"], "| html len:", len(crawl_result["html"]))    
+
+
+async def main() -> None:
+    async with websocket_client("ws://localhost:8020/mcp/ws") as (r, w):
+        async with ClientSession(r, w) as s:
+            await s.initialize()                       # handshake
+            tools = (await s.list_tools()).tools
+            print("tools:", [t.name for t in tools])
+
+            # await test_list()
+            # await test_crawl(s)
+            # await test_md(s)
+            # await test_screenshot(s)
+            # await test_pdf(s)
+            # await test_execute_js(s)
+            # await test_html(s)
+            await test_context(s)
+
+anyio.run(main)
--- a/tests/mcp/test_mcp_sse.py
+++ b/tests/mcp/test_mcp_sse.py
@@ -0,0 +1,11 @@
+from mcp.client.sse import sse_client
+from mcp.client.session import ClientSession
+
+async def main():
+    async with sse_client("http://127.0.0.1:8020/mcp") as (r, w):
+        async with ClientSession(r, w) as sess:
+            print(await sess.list_tools())      # now works
+            
+if __name__ == "__main__":
+    import asyncio
+    asyncio.run(main())
--- a/tests/memory/test_docker_config_gen.py
+++ b/tests/memory/test_docker_config_gen.py
@@ -11,7 +11,8 @@ If the server isn’t running, start it first:

 import sys, json, textwrap, requests

-BASE = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:8020"
+# BASE = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:8020"
+BASE = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:11235"
 URL  = f"{BASE.rstrip('/')}/config/dump"

 CASES = [