From 53245e4e0e54dc4604f8b427105d820dba6c38a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sacrist=C3=A1n?= Date: Tue, 29 Apr 2025 16:26:35 +0200 Subject: [PATCH] Fix: README.md urls list --- README.md | 18 +++++++++++++----- tests/docker_example.py | 18 +++++++++--------- tests/test_docker.py | 16 ++++++++-------- tests/test_main.py | 16 ++++++++-------- 4 files changed, 38 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index 97787b2f..879baa51 100644 --- a/README.md +++ b/README.md @@ -291,12 +291,20 @@ import requests # Submit a crawl job response = requests.post( "http://localhost:11235/crawl", - json={"urls": "https://example.com", "priority": 10} + json={"urls": ["https://example.com"], "priority": 10} ) -task_id = response.json()["task_id"] - -# Continue polling until the task is complete (status="completed") -result = requests.get(f"http://localhost:11235/task/{task_id}") +if response.status_code == 200: + print("Crawl job submitted successfully.") + +if "results" in response.json(): + results = response.json()["results"] + print("Crawl job completed. Results:") + for result in results: + print(result) +else: + task_id = response.json()["task_id"] + print(f"Crawl job submitted. Task ID: {task_id}") + result = requests.get(f"http://localhost:11235/task/{task_id}") ``` For more examples, see our [Docker Examples](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_example.py). For advanced configuration, environment variables, and usage examples, see our [Docker Deployment Guide](https://docs.crawl4ai.com/basic/docker-deployment/). 
diff --git a/tests/docker_example.py b/tests/docker_example.py index 336ca52f..03348d50 100644 --- a/tests/docker_example.py +++ b/tests/docker_example.py @@ -105,7 +105,7 @@ def test_docker_deployment(version="basic"): def test_basic_crawl(tester: Crawl4AiTester): print("\n=== Testing Basic Crawl ===") request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 10, "session_id": "test", } @@ -119,7 +119,7 @@ def test_basic_crawl(tester: Crawl4AiTester): def test_basic_crawl_sync(tester: Crawl4AiTester): print("\n=== Testing Basic Crawl (Sync) ===") request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 10, "session_id": "test", } @@ -134,7 +134,7 @@ def test_basic_crawl_sync(tester: Crawl4AiTester): def test_js_execution(tester: Crawl4AiTester): print("\n=== Testing JS Execution ===") request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 8, "js_code": [ "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();" @@ -151,7 +151,7 @@ def test_js_execution(tester: Crawl4AiTester): def test_css_selector(tester: Crawl4AiTester): print("\n=== Testing CSS Selector ===") request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 7, "css_selector": ".wide-tease-item__description", "crawler_params": {"headless": True}, @@ -188,7 +188,7 @@ def test_structured_extraction(tester: Crawl4AiTester): } request = { - "urls": "https://www.coinbase.com/explore", + "urls": ["https://www.coinbase.com/explore"], "priority": 9, "extraction_config": {"type": "json_css", "params": {"schema": schema}}, } @@ -223,7 +223,7 @@ def test_llm_extraction(tester: Crawl4AiTester): } request = { - "urls": "https://openai.com/api/pricing", + "urls": 
["https://openai.com/api/pricing"], "priority": 8, "extraction_config": { "type": "llm", @@ -270,7 +270,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester): } request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 8, "extraction_config": { "type": "llm", @@ -297,7 +297,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester): def test_cosine_extraction(tester: Crawl4AiTester): print("\n=== Testing Cosine Extraction ===") request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 8, "extraction_config": { "type": "cosine", @@ -323,7 +323,7 @@ def test_cosine_extraction(tester: Crawl4AiTester): def test_screenshot(tester: Crawl4AiTester): print("\n=== Testing Screenshot ===") request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 5, "screenshot": True, "crawler_params": {"headless": True}, diff --git a/tests/test_docker.py b/tests/test_docker.py index 3570d608..c507ae56 100644 --- a/tests/test_docker.py +++ b/tests/test_docker.py @@ -74,7 +74,7 @@ def test_docker_deployment(version="basic"): def test_basic_crawl(tester: Crawl4AiTester): print("\n=== Testing Basic Crawl ===") - request = {"urls": "https://www.nbcnews.com/business", "priority": 10} + request = {"urls": ["https://www.nbcnews.com/business"], "priority": 10} result = tester.submit_and_wait(request) print(f"Basic crawl result length: {len(result['result']['markdown'])}") @@ -85,7 +85,7 @@ def test_basic_crawl(tester: Crawl4AiTester): def test_js_execution(tester: Crawl4AiTester): print("\n=== Testing JS Execution ===") request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 8, "js_code": [ "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();" @@ 
-102,7 +102,7 @@ def test_js_execution(tester: Crawl4AiTester): def test_css_selector(tester: Crawl4AiTester): print("\n=== Testing CSS Selector ===") request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 7, "css_selector": ".wide-tease-item__description", "crawler_params": {"headless": True}, @@ -139,7 +139,7 @@ def test_structured_extraction(tester: Crawl4AiTester): } request = { - "urls": "https://www.coinbase.com/explore", + "urls": ["https://www.coinbase.com/explore"], "priority": 9, "extraction_config": {"type": "json_css", "params": {"schema": schema}}, } @@ -174,7 +174,7 @@ def test_llm_extraction(tester: Crawl4AiTester): } request = { - "urls": "https://openai.com/api/pricing", + "urls": ["https://openai.com/api/pricing"], "priority": 8, "extraction_config": { "type": "llm", @@ -221,7 +221,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester): } request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 8, "extraction_config": { "type": "llm", @@ -248,7 +248,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester): def test_cosine_extraction(tester: Crawl4AiTester): print("\n=== Testing Cosine Extraction ===") request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 8, "extraction_config": { "type": "cosine", @@ -274,7 +274,7 @@ def test_cosine_extraction(tester: Crawl4AiTester): def test_screenshot(tester: Crawl4AiTester): print("\n=== Testing Screenshot ===") request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 5, "screenshot": True, "crawler_params": {"headless": True}, diff --git a/tests/test_main.py b/tests/test_main.py index 0e938f59..b32b68f0 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -54,7 +54,7 @@ class NBCNewsAPITest: async def test_basic_crawl(): print("\n=== Testing Basic Crawl ===") 
async with NBCNewsAPITest() as api: - request = {"urls": "https://www.nbcnews.com/business", "priority": 10} + request = {"urls": ["https://www.nbcnews.com/business"], "priority": 10} task_id = await api.submit_crawl(request) result = await api.wait_for_task(task_id) print(f"Basic crawl result length: {len(result['result']['markdown'])}") @@ -67,7 +67,7 @@ async def test_js_execution(): print("\n=== Testing JS Execution ===") async with NBCNewsAPITest() as api: request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 8, "js_code": [ "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();" @@ -86,7 +86,7 @@ async def test_css_selector(): print("\n=== Testing CSS Selector ===") async with NBCNewsAPITest() as api: request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 7, "css_selector": ".wide-tease-item__description", } @@ -120,7 +120,7 @@ async def test_structured_extraction(): } request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 9, "extraction_config": {"type": "json_css", "params": {"schema": schema}}, } @@ -177,7 +177,7 @@ async def test_llm_extraction(): } request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 8, "extraction_config": { "type": "llm", @@ -209,7 +209,7 @@ async def test_screenshot(): print("\n=== Testing Screenshot ===") async with NBCNewsAPITest() as api: request = { - "urls": "https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 5, "screenshot": True, "crawler_params": {"headless": True}, @@ -227,7 +227,7 @@ async def test_priority_handling(): async with NBCNewsAPITest() as api: # Submit low priority task first low_priority = { - "urls": 
"https://www.nbcnews.com/business", + "urls": ["https://www.nbcnews.com/business"], "priority": 1, "crawler_params": {"headless": True}, } @@ -235,7 +235,7 @@ async def test_priority_handling(): # Submit high priority task high_priority = { - "urls": "https://www.nbcnews.com/business/consumer", + "urls": ["https://www.nbcnews.com/business/consumer"], "priority": 10, "crawler_params": {"headless": True}, }