docs: remove CRAWL4AI_API_TOKEN references and use correct endpoints in Docker example scripts (#1015)

- Remove deprecated API token authentication from all Docker examples
- Fix async job endpoints: /crawl -> /crawl/job for submission, /task/{id} -> /crawl/job/{id} for polling
- Fix sync endpoint: /crawl_sync -> /crawl (synchronous)
- Remove non-existent /crawl_direct endpoint
- Update request format to use new structure with browser_config and crawler_config
- Fix response handling for both async and sync calls
- Update extraction strategy format to use proper nested structure
- Add Ollama connectivity check before running tests
- Update test schemas and selectors for current website structures

This makes the Docker examples work out-of-the-box with the current API structure.
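For reference, a minimal sketch of the corrected flow against a local container (the target URL and config values here are illustrative placeholders, not copied from the example scripts):

    import time
    import requests

    BASE_URL = "http://localhost:11235"  # default Docker port; no API token required anymore

    # New request shape: top-level "urls" plus separate browser_config / crawler_config
    request_data = {
        "urls": ["https://example.com"],  # placeholder URL
        "browser_config": {"headless": True},
        "crawler_config": {"type": "CrawlerRunConfig", "params": {}},
    }

    # Async flow: submit to /crawl/job, then poll /crawl/job/{task_id}
    task_id = requests.post(f"{BASE_URL}/crawl/job", json=request_data).json()["task_id"]
    while True:
        status = requests.get(f"{BASE_URL}/crawl/job/{task_id}").json()
        if status["status"] in ("completed", "failed"):
            break
        time.sleep(2)

    # Sync flow: POST the same payload to /crawl and read the results list directly
    sync_result = requests.post(f"{BASE_URL}/crawl", json=request_data, timeout=60).json()
    print(sync_result["results"][0]["markdown"][:200])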
Author: Soham Kukreti
Date:   2025-08-09 19:15:11 +05:30
Parent: a5bcac4c9d
Commit: cd2dd68e4c
2 files changed, 354 insertions(+), 270 deletions(-)


@@ -6,28 +6,22 @@ import base64
 import os
 from typing import Dict, Any
 
 class Crawl4AiTester:
-    def __init__(self, base_url: str = "http://localhost:11235", api_token: str = None):
+    def __init__(self, base_url: str = "http://localhost:11235"):
         self.base_url = base_url
-        self.api_token = api_token or os.getenv(
-            "CRAWL4AI_API_TOKEN"
-        )  # Check environment variable as fallback
-        self.headers = (
-            {"Authorization": f"Bearer {self.api_token}"} if self.api_token else {}
-        )
 
     def submit_and_wait(
         self, request_data: Dict[str, Any], timeout: int = 300
     ) -> Dict[str, Any]:
-        # Submit crawl job
+        # Submit crawl job using async endpoint
         response = requests.post(
-            f"{self.base_url}/crawl", json=request_data, headers=self.headers
+            f"{self.base_url}/crawl/job", json=request_data
         )
-        if response.status_code == 403:
-            raise Exception("API token is invalid or missing")
-        task_id = response.json()["task_id"]
-        print(f"Task ID: {task_id}")
+        response.raise_for_status()
+        job_response = response.json()
+        task_id = job_response["task_id"]
+        print(f"Submitted job with task_id: {task_id}")
 
         # Poll for result
         start_time = time.time()
@@ -38,8 +32,9 @@ class Crawl4AiTester:
             )
             result = requests.get(
-                f"{self.base_url}/task/{task_id}", headers=self.headers
+                f"{self.base_url}/crawl/job/{task_id}"
             )
+            result.raise_for_status()
             status = result.json()
 
             if status["status"] == "failed":
@@ -52,10 +47,10 @@ class Crawl4AiTester:
             time.sleep(2)
 
     def submit_sync(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
+        # Use synchronous crawl endpoint
         response = requests.post(
-            f"{self.base_url}/crawl_sync",
+            f"{self.base_url}/crawl",
             json=request_data,
-            headers=self.headers,
             timeout=60,
         )
         if response.status_code == 408:
@@ -66,9 +61,8 @@ class Crawl4AiTester:
 
 def test_docker_deployment(version="basic"):
     tester = Crawl4AiTester(
-        # base_url="http://localhost:11235" ,
-        base_url="https://crawl4ai-sby74.ondigitalocean.app",
-        api_token="test",
+        base_url="http://localhost:11235",
+        #base_url="https://crawl4ai-sby74.ondigitalocean.app",
    )
     print(f"Testing Crawl4AI Docker {version} version")
@@ -88,63 +82,60 @@ def test_docker_deployment(version="basic"):
 
     # Test cases based on version
     test_basic_crawl(tester)
-    test_basic_crawl(tester)
+    test_basic_crawl_sync(tester)
 
-    # if version in ["full", "transformer"]:
-    #     test_cosine_extraction(tester)
+    if version in ["full", "transformer"]:
+        test_cosine_extraction(tester)
 
-    # test_js_execution(tester)
-    # test_css_selector(tester)
-    # test_structured_extraction(tester)
-    # test_llm_extraction(tester)
-    # test_llm_with_ollama(tester)
-    # test_screenshot(tester)
+    test_js_execution(tester)
+    test_css_selector(tester)
+    test_structured_extraction(tester)
+    test_llm_extraction(tester)
+    test_llm_with_ollama(tester)
+    test_screenshot(tester)
 
 def test_basic_crawl(tester: Crawl4AiTester):
-    print("\n=== Testing Basic Crawl ===")
+    print("\n=== Testing Basic Crawl (Async) ===")
     request = {
         "urls": ["https://www.nbcnews.com/business"],
         "priority": 10,
         "session_id": "test",
     }
     result = tester.submit_and_wait(request)
-    print(f"Basic crawl result length: {len(result['result']['markdown'])}")
+    print(f"Basic crawl result count: {len(result['result']['results'])}")
     assert result["result"]["success"]
-    assert len(result["result"]["markdown"]) > 0
+    assert len(result["result"]["results"]) > 0
+    assert len(result["result"]["results"][0]["markdown"]) > 0
 
 def test_basic_crawl_sync(tester: Crawl4AiTester):
     print("\n=== Testing Basic Crawl (Sync) ===")
     request = {
         "urls": ["https://www.nbcnews.com/business"],
         "priority": 10,
         "session_id": "test",
     }
     result = tester.submit_sync(request)
-    print(f"Basic crawl result length: {len(result['result']['markdown'])}")
-    assert result["status"] == "completed"
-    assert result["result"]["success"]
-    assert len(result["result"]["markdown"]) > 0
+    print(f"Basic crawl result count: {len(result['results'])}")
+    assert result["success"]
+    assert len(result["results"]) > 0
+    assert len(result["results"][0]["markdown"]) > 0
 
 def test_js_execution(tester: Crawl4AiTester):
     print("\n=== Testing JS Execution ===")
     request = {
         "urls": ["https://www.nbcnews.com/business"],
         "priority": 8,
-        "js_code": [
-            "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
-        ],
-        "wait_for": "article.tease-card:nth-child(10)",
-        "crawler_params": {"headless": True},
+        "browser_config": {"headless": True},
+        "crawler_config": {
+            "js_code": [
+                "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); if(loadMoreButton) loadMoreButton.click();"
+            ],
+            "wait_for": "wide-tease-item__wrapper df flex-column flex-row-m flex-nowrap-m enable-new-sports-feed-mobile-design(10)"
+        }
     }
     result = tester.submit_and_wait(request)
-    print(f"JS execution result length: {len(result['result']['markdown'])}")
+    print(f"JS execution result count: {len(result['result']['results'])}")
     assert result["result"]["success"]
@@ -152,51 +143,78 @@ def test_css_selector(tester: Crawl4AiTester):
     print("\n=== Testing CSS Selector ===")
     request = {
         "urls": ["https://www.nbcnews.com/business"],
         "priority": 7,
-        "css_selector": ".wide-tease-item__description",
-        "crawler_params": {"headless": True},
-        "extra": {"word_count_threshold": 10},
+        "browser_config": {"headless": True},
+        "crawler_config": {
+            "css_selector": ".wide-tease-item__description",
+            "word_count_threshold": 10
+        }
     }
     result = tester.submit_and_wait(request)
-    print(f"CSS selector result length: {len(result['result']['markdown'])}")
+    print(f"CSS selector result count: {len(result['result']['results'])}")
     assert result["result"]["success"]
 
 def test_structured_extraction(tester: Crawl4AiTester):
     print("\n=== Testing Structured Extraction ===")
     schema = {
-        "name": "Coinbase Crypto Prices",
-        "baseSelector": ".cds-tableRow-t45thuk",
-        "fields": [
-            {
-                "name": "crypto",
-                "selector": "td:nth-child(1) h2",
-                "type": "text",
-            },
-            {
-                "name": "symbol",
-                "selector": "td:nth-child(1) p",
-                "type": "text",
-            },
-            {
-                "name": "price",
-                "selector": "td:nth-child(2)",
-                "type": "text",
-            },
-        ],
+        "name": "Cryptocurrency Prices",
+        "baseSelector": "table[data-testid=\"prices-table\"] tbody tr",
+        "fields": [
+            {
+                "name": "asset_name",
+                "selector": "td:nth-child(2) p.cds-headline-h4steop",
+                "type": "text"
+            },
+            {
+                "name": "asset_symbol",
+                "selector": "td:nth-child(2) p.cds-label2-l1sm09ec",
+                "type": "text"
+            },
+            {
+                "name": "asset_image_url",
+                "selector": "td:nth-child(2) img[alt=\"Asset Symbol\"]",
+                "type": "attribute",
+                "attribute": "src"
+            },
+            {
+                "name": "asset_url",
+                "selector": "td:nth-child(2) a[aria-label^=\"Asset page for\"]",
+                "type": "attribute",
+                "attribute": "href"
+            },
+            {
+                "name": "price",
+                "selector": "td:nth-child(3) div.cds-typographyResets-t6muwls.cds-body-bwup3gq",
+                "type": "text"
+            },
+            {
+                "name": "change",
+                "selector": "td:nth-child(7) p.cds-body-bwup3gq",
+                "type": "text"
+            }
+        ]
     }
     request = {
         "urls": ["https://www.coinbase.com/explore"],
         "priority": 9,
-        "extraction_config": {"type": "json_css", "params": {"schema": schema}},
+        "crawler_config": {
+            "type": "CrawlerRunConfig",
+            "params": {
+                "extraction_strategy": {
+                    "type": "JsonCssExtractionStrategy",
+                    "params": {"schema": schema}
+                }
+            }
+        }
     }
     result = tester.submit_and_wait(request)
-    extracted = json.loads(result["result"]["extracted_content"])
+    extracted = json.loads(result["result"]["results"][0]["extracted_content"])
     print(f"Extracted {len(extracted)} items")
-    print("Sample item:", json.dumps(extracted[0], indent=2))
+    if extracted:
+        print("Sample item:", json.dumps(extracted[0], indent=2))
     assert result["result"]["success"]
     assert len(extracted) > 0
@@ -206,43 +224,54 @@ def test_llm_extraction(tester: Crawl4AiTester):
     schema = {
         "type": "object",
         "properties": {
-            "model_name": {
+            "asset_name": {
                 "type": "string",
-                "description": "Name of the OpenAI model.",
+                "description": "Name of the asset.",
             },
-            "input_fee": {
+            "price": {
                 "type": "string",
-                "description": "Fee for input token for the OpenAI model.",
+                "description": "Price of the asset.",
             },
-            "output_fee": {
+            "change": {
                 "type": "string",
-                "description": "Fee for output token for the OpenAI model.",
+                "description": "Change in price of the asset.",
             },
         },
-        "required": ["model_name", "input_fee", "output_fee"],
+        "required": ["asset_name", "price", "change"],
     }
     request = {
-        "urls": ["https://openai.com/api/pricing"],
-        "priority": 8,
-        "extraction_config": {
-            "type": "llm",
-            "params": {
-                "provider": "openai/gpt-4o-mini",
-                "api_token": os.getenv("OPENAI_API_KEY"),
-                "schema": schema,
-                "extraction_type": "schema",
-                "instruction": """From the crawled content, extract all mentioned model names along with their fees for input and output tokens.""",
-            },
-        },
-        "crawler_params": {"word_count_threshold": 1},
+        "urls": ["https://www.coinbase.com/en-in/explore"],
+        "browser_config": {},
+        "crawler_config": {
+            "type": "CrawlerRunConfig",
+            "params": {
+                "extraction_strategy": {
+                    "type": "LLMExtractionStrategy",
+                    "params": {
+                        "llm_config": {
+                            "type": "LLMConfig",
+                            "params": {
+                                "provider": "gemini/gemini-2.5-flash",
+                                "api_token": os.getenv("GEMINI_API_KEY")
+                            }
+                        },
+                        "schema": schema,
+                        "extraction_type": "schema",
+                        "instruction": "From the crawled content, extract all mentioned asset names along with their prices and change in price.",
+                    }
+                },
+                "word_count_threshold": 1
+            }
+        }
     }
     try:
         result = tester.submit_and_wait(request)
-        extracted = json.loads(result["result"]["extracted_content"])
+        extracted = json.loads(result["result"]["results"][0]["extracted_content"])
         print(f"Extracted {len(extracted)} model pricing entries")
-        print("Sample entry:", json.dumps(extracted[0], indent=2))
+        if extracted:
+            print("Sample entry:", json.dumps(extracted[0], indent=2))
         assert result["result"]["success"]
     except Exception as e:
         print(f"LLM extraction test failed (might be due to missing API key): {str(e)}")
@@ -271,23 +300,32 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
     request = {
         "urls": ["https://www.nbcnews.com/business"],
         "priority": 8,
-        "extraction_config": {
-            "type": "llm",
-            "params": {
-                "provider": "ollama/llama2",
-                "schema": schema,
-                "extraction_type": "schema",
-                "instruction": "Extract the main article information including title, summary, and main topics.",
-            },
-        },
-        "extra": {"word_count_threshold": 1},
-        "crawler_params": {"verbose": True},
+        "browser_config": {"verbose": True},
+        "crawler_config": {
+            "type": "CrawlerRunConfig",
+            "params": {
+                "extraction_strategy": {
+                    "type": "LLMExtractionStrategy",
+                    "params": {
+                        "llm_config": {
+                            "type": "LLMConfig",
+                            "params": {
+                                "provider": "ollama/llama3.2:latest",
+                            }
+                        },
+                        "schema": schema,
+                        "extraction_type": "schema",
+                        "instruction": "Extract the main article information including title, summary, and main topics.",
+                    }
+                },
+                "word_count_threshold": 1
+            }
+        }
     }
     try:
         result = tester.submit_and_wait(request)
-        extracted = json.loads(result["result"]["extracted_content"])
+        extracted = json.loads(result["result"]["results"][0]["extracted_content"])
         print("Extracted content:", json.dumps(extracted, indent=2))
         assert result["result"]["success"]
     except Exception as e:
@@ -298,23 +336,29 @@ def test_cosine_extraction(tester: Crawl4AiTester):
     print("\n=== Testing Cosine Extraction ===")
     request = {
         "urls": ["https://www.nbcnews.com/business"],
         "priority": 8,
-        "extraction_config": {
-            "type": "cosine",
-            "params": {
-                "semantic_filter": "business finance economy",
-                "word_count_threshold": 10,
-                "max_dist": 0.2,
-                "top_k": 3,
-            },
-        },
+        "browser_config": {},
+        "crawler_config": {
+            "type": "CrawlerRunConfig",
+            "params": {
+                "extraction_strategy": {
+                    "type": "CosineStrategy",
+                    "params": {
+                        "semantic_filter": "business finance economy",
+                        "word_count_threshold": 10,
+                        "max_dist": 0.2,
+                        "top_k": 3,
+                    }
+                }
+            }
+        }
     }
     try:
         result = tester.submit_and_wait(request)
-        extracted = json.loads(result["result"]["extracted_content"])
+        extracted = json.loads(result["result"]["results"][0]["extracted_content"])
         print(f"Extracted {len(extracted)} text clusters")
-        print("First cluster tags:", extracted[0]["tags"])
+        if extracted:
+            print("First cluster tags:", extracted[0]["tags"])
         assert result["result"]["success"]
     except Exception as e:
         print(f"Cosine extraction test failed: {str(e)}")
@@ -324,19 +368,24 @@ def test_screenshot(tester: Crawl4AiTester):
     print("\n=== Testing Screenshot ===")
     request = {
         "urls": ["https://www.nbcnews.com/business"],
         "priority": 5,
-        "screenshot": True,
-        "crawler_params": {"headless": True},
+        "browser_config": {"headless": True},
+        "crawler_config": {
+            "type": "CrawlerRunConfig",
+            "params": {
+                "screenshot": True
+            }
+        }
     }
     result = tester.submit_and_wait(request)
-    print("Screenshot captured:", bool(result["result"]["screenshot"]))
+    screenshot_data = result["result"]["results"][0]["screenshot"]
+    print("Screenshot captured:", bool(screenshot_data))
 
-    if result["result"]["screenshot"]:
+    if screenshot_data:
         # Save screenshot
-        screenshot_data = base64.b64decode(result["result"]["screenshot"])
+        screenshot_bytes = base64.b64decode(screenshot_data)
         with open("test_screenshot.jpg", "wb") as f:
-            f.write(screenshot_data)
+            f.write(screenshot_bytes)
         print("Screenshot saved as test_screenshot.jpg")
     assert result["result"]["success"]