Fix: README.md urls list
This commit is contained in:
18
README.md
18
README.md
@@ -291,12 +291,20 @@ import requests
|
|||||||
# Submit a crawl job
|
# Submit a crawl job
|
||||||
response = requests.post(
|
response = requests.post(
|
||||||
"http://localhost:11235/crawl",
|
"http://localhost:11235/crawl",
|
||||||
json={"urls": "https://example.com", "priority": 10}
|
json={"urls": ["https://example.com"], "priority": 10}
|
||||||
)
|
)
|
||||||
task_id = response.json()["task_id"]
|
if response.status_code == 200:
|
||||||
|
print("Crawl job submitted successfully.")
|
||||||
# Continue polling until the task is complete (status="completed")
|
|
||||||
result = requests.get(f"http://localhost:11235/task/{task_id}")
|
if "results" in response.json():
|
||||||
|
results = response.json()["results"]
|
||||||
|
print("Crawl job completed. Results:")
|
||||||
|
for result in results:
|
||||||
|
print(result)
|
||||||
|
else:
|
||||||
|
task_id = response.json()["task_id"]
|
||||||
|
print(f"Crawl job submitted. Task ID:: {task_id}")
|
||||||
|
result = requests.get(f"http://localhost:11235/task/{task_id}")
|
||||||
```
|
```
|
||||||
|
|
||||||
For more examples, see our [Docker Examples](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_example.py). For advanced configuration, environment variables, and usage examples, see our [Docker Deployment Guide](https://docs.crawl4ai.com/basic/docker-deployment/).
|
For more examples, see our [Docker Examples](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_example.py). For advanced configuration, environment variables, and usage examples, see our [Docker Deployment Guide](https://docs.crawl4ai.com/basic/docker-deployment/).
|
||||||
|
|||||||
@@ -105,7 +105,7 @@ def test_docker_deployment(version="basic"):
|
|||||||
def test_basic_crawl(tester: Crawl4AiTester):
|
def test_basic_crawl(tester: Crawl4AiTester):
|
||||||
print("\n=== Testing Basic Crawl ===")
|
print("\n=== Testing Basic Crawl ===")
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.nbcnews.com/business",
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"priority": 10,
|
"priority": 10,
|
||||||
"session_id": "test",
|
"session_id": "test",
|
||||||
}
|
}
|
||||||
@@ -119,7 +119,7 @@ def test_basic_crawl(tester: Crawl4AiTester):
|
|||||||
def test_basic_crawl_sync(tester: Crawl4AiTester):
|
def test_basic_crawl_sync(tester: Crawl4AiTester):
|
||||||
print("\n=== Testing Basic Crawl (Sync) ===")
|
print("\n=== Testing Basic Crawl (Sync) ===")
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.nbcnews.com/business",
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"priority": 10,
|
"priority": 10,
|
||||||
"session_id": "test",
|
"session_id": "test",
|
||||||
}
|
}
|
||||||
@@ -134,7 +134,7 @@ def test_basic_crawl_sync(tester: Crawl4AiTester):
|
|||||||
def test_js_execution(tester: Crawl4AiTester):
|
def test_js_execution(tester: Crawl4AiTester):
|
||||||
print("\n=== Testing JS Execution ===")
|
print("\n=== Testing JS Execution ===")
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.nbcnews.com/business",
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"priority": 8,
|
"priority": 8,
|
||||||
"js_code": [
|
"js_code": [
|
||||||
"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
|
"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
|
||||||
@@ -151,7 +151,7 @@ def test_js_execution(tester: Crawl4AiTester):
|
|||||||
def test_css_selector(tester: Crawl4AiTester):
|
def test_css_selector(tester: Crawl4AiTester):
|
||||||
print("\n=== Testing CSS Selector ===")
|
print("\n=== Testing CSS Selector ===")
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.nbcnews.com/business",
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"priority": 7,
|
"priority": 7,
|
||||||
"css_selector": ".wide-tease-item__description",
|
"css_selector": ".wide-tease-item__description",
|
||||||
"crawler_params": {"headless": True},
|
"crawler_params": {"headless": True},
|
||||||
@@ -188,7 +188,7 @@ def test_structured_extraction(tester: Crawl4AiTester):
|
|||||||
}
|
}
|
||||||
|
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.coinbase.com/explore",
|
"urls": ["https://www.coinbase.com/explore"],
|
||||||
"priority": 9,
|
"priority": 9,
|
||||||
"extraction_config": {"type": "json_css", "params": {"schema": schema}},
|
"extraction_config": {"type": "json_css", "params": {"schema": schema}},
|
||||||
}
|
}
|
||||||
@@ -223,7 +223,7 @@ def test_llm_extraction(tester: Crawl4AiTester):
|
|||||||
}
|
}
|
||||||
|
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://openai.com/api/pricing",
|
"urls": ["https://openai.com/api/pricing"],
|
||||||
"priority": 8,
|
"priority": 8,
|
||||||
"extraction_config": {
|
"extraction_config": {
|
||||||
"type": "llm",
|
"type": "llm",
|
||||||
@@ -270,7 +270,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
|
|||||||
}
|
}
|
||||||
|
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.nbcnews.com/business",
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"priority": 8,
|
"priority": 8,
|
||||||
"extraction_config": {
|
"extraction_config": {
|
||||||
"type": "llm",
|
"type": "llm",
|
||||||
@@ -297,7 +297,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
|
|||||||
def test_cosine_extraction(tester: Crawl4AiTester):
|
def test_cosine_extraction(tester: Crawl4AiTester):
|
||||||
print("\n=== Testing Cosine Extraction ===")
|
print("\n=== Testing Cosine Extraction ===")
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.nbcnews.com/business",
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"priority": 8,
|
"priority": 8,
|
||||||
"extraction_config": {
|
"extraction_config": {
|
||||||
"type": "cosine",
|
"type": "cosine",
|
||||||
@@ -323,7 +323,7 @@ def test_cosine_extraction(tester: Crawl4AiTester):
|
|||||||
def test_screenshot(tester: Crawl4AiTester):
|
def test_screenshot(tester: Crawl4AiTester):
|
||||||
print("\n=== Testing Screenshot ===")
|
print("\n=== Testing Screenshot ===")
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.nbcnews.com/business",
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"priority": 5,
|
"priority": 5,
|
||||||
"screenshot": True,
|
"screenshot": True,
|
||||||
"crawler_params": {"headless": True},
|
"crawler_params": {"headless": True},
|
||||||
|
|||||||
@@ -74,7 +74,7 @@ def test_docker_deployment(version="basic"):
|
|||||||
|
|
||||||
def test_basic_crawl(tester: Crawl4AiTester):
|
def test_basic_crawl(tester: Crawl4AiTester):
|
||||||
print("\n=== Testing Basic Crawl ===")
|
print("\n=== Testing Basic Crawl ===")
|
||||||
request = {"urls": "https://www.nbcnews.com/business", "priority": 10}
|
request = {"urls": ["https://www.nbcnews.com/business"], "priority": 10}
|
||||||
|
|
||||||
result = tester.submit_and_wait(request)
|
result = tester.submit_and_wait(request)
|
||||||
print(f"Basic crawl result length: {len(result['result']['markdown'])}")
|
print(f"Basic crawl result length: {len(result['result']['markdown'])}")
|
||||||
@@ -85,7 +85,7 @@ def test_basic_crawl(tester: Crawl4AiTester):
|
|||||||
def test_js_execution(tester: Crawl4AiTester):
|
def test_js_execution(tester: Crawl4AiTester):
|
||||||
print("\n=== Testing JS Execution ===")
|
print("\n=== Testing JS Execution ===")
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.nbcnews.com/business",
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"priority": 8,
|
"priority": 8,
|
||||||
"js_code": [
|
"js_code": [
|
||||||
"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
|
"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
|
||||||
@@ -102,7 +102,7 @@ def test_js_execution(tester: Crawl4AiTester):
|
|||||||
def test_css_selector(tester: Crawl4AiTester):
|
def test_css_selector(tester: Crawl4AiTester):
|
||||||
print("\n=== Testing CSS Selector ===")
|
print("\n=== Testing CSS Selector ===")
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.nbcnews.com/business",
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"priority": 7,
|
"priority": 7,
|
||||||
"css_selector": ".wide-tease-item__description",
|
"css_selector": ".wide-tease-item__description",
|
||||||
"crawler_params": {"headless": True},
|
"crawler_params": {"headless": True},
|
||||||
@@ -139,7 +139,7 @@ def test_structured_extraction(tester: Crawl4AiTester):
|
|||||||
}
|
}
|
||||||
|
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.coinbase.com/explore",
|
"urls": ["https://www.coinbase.com/explore"],
|
||||||
"priority": 9,
|
"priority": 9,
|
||||||
"extraction_config": {"type": "json_css", "params": {"schema": schema}},
|
"extraction_config": {"type": "json_css", "params": {"schema": schema}},
|
||||||
}
|
}
|
||||||
@@ -174,7 +174,7 @@ def test_llm_extraction(tester: Crawl4AiTester):
|
|||||||
}
|
}
|
||||||
|
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://openai.com/api/pricing",
|
"urls": ["https://openai.com/api/pricing"],
|
||||||
"priority": 8,
|
"priority": 8,
|
||||||
"extraction_config": {
|
"extraction_config": {
|
||||||
"type": "llm",
|
"type": "llm",
|
||||||
@@ -221,7 +221,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
|
|||||||
}
|
}
|
||||||
|
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.nbcnews.com/business",
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"priority": 8,
|
"priority": 8,
|
||||||
"extraction_config": {
|
"extraction_config": {
|
||||||
"type": "llm",
|
"type": "llm",
|
||||||
@@ -248,7 +248,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
|
|||||||
def test_cosine_extraction(tester: Crawl4AiTester):
|
def test_cosine_extraction(tester: Crawl4AiTester):
|
||||||
print("\n=== Testing Cosine Extraction ===")
|
print("\n=== Testing Cosine Extraction ===")
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.nbcnews.com/business",
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"priority": 8,
|
"priority": 8,
|
||||||
"extraction_config": {
|
"extraction_config": {
|
||||||
"type": "cosine",
|
"type": "cosine",
|
||||||
@@ -274,7 +274,7 @@ def test_cosine_extraction(tester: Crawl4AiTester):
|
|||||||
def test_screenshot(tester: Crawl4AiTester):
|
def test_screenshot(tester: Crawl4AiTester):
|
||||||
print("\n=== Testing Screenshot ===")
|
print("\n=== Testing Screenshot ===")
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.nbcnews.com/business",
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"priority": 5,
|
"priority": 5,
|
||||||
"screenshot": True,
|
"screenshot": True,
|
||||||
"crawler_params": {"headless": True},
|
"crawler_params": {"headless": True},
|
||||||
|
|||||||
@@ -54,7 +54,7 @@ class NBCNewsAPITest:
|
|||||||
async def test_basic_crawl():
|
async def test_basic_crawl():
|
||||||
print("\n=== Testing Basic Crawl ===")
|
print("\n=== Testing Basic Crawl ===")
|
||||||
async with NBCNewsAPITest() as api:
|
async with NBCNewsAPITest() as api:
|
||||||
request = {"urls": "https://www.nbcnews.com/business", "priority": 10}
|
request = {"urls": ["https://www.nbcnews.com/business"], "priority": 10}
|
||||||
task_id = await api.submit_crawl(request)
|
task_id = await api.submit_crawl(request)
|
||||||
result = await api.wait_for_task(task_id)
|
result = await api.wait_for_task(task_id)
|
||||||
print(f"Basic crawl result length: {len(result['result']['markdown'])}")
|
print(f"Basic crawl result length: {len(result['result']['markdown'])}")
|
||||||
@@ -67,7 +67,7 @@ async def test_js_execution():
|
|||||||
print("\n=== Testing JS Execution ===")
|
print("\n=== Testing JS Execution ===")
|
||||||
async with NBCNewsAPITest() as api:
|
async with NBCNewsAPITest() as api:
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.nbcnews.com/business",
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"priority": 8,
|
"priority": 8,
|
||||||
"js_code": [
|
"js_code": [
|
||||||
"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
|
"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
|
||||||
@@ -86,7 +86,7 @@ async def test_css_selector():
|
|||||||
print("\n=== Testing CSS Selector ===")
|
print("\n=== Testing CSS Selector ===")
|
||||||
async with NBCNewsAPITest() as api:
|
async with NBCNewsAPITest() as api:
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.nbcnews.com/business",
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"priority": 7,
|
"priority": 7,
|
||||||
"css_selector": ".wide-tease-item__description",
|
"css_selector": ".wide-tease-item__description",
|
||||||
}
|
}
|
||||||
@@ -120,7 +120,7 @@ async def test_structured_extraction():
|
|||||||
}
|
}
|
||||||
|
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.nbcnews.com/business",
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"priority": 9,
|
"priority": 9,
|
||||||
"extraction_config": {"type": "json_css", "params": {"schema": schema}},
|
"extraction_config": {"type": "json_css", "params": {"schema": schema}},
|
||||||
}
|
}
|
||||||
@@ -177,7 +177,7 @@ async def test_llm_extraction():
|
|||||||
}
|
}
|
||||||
|
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.nbcnews.com/business",
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"priority": 8,
|
"priority": 8,
|
||||||
"extraction_config": {
|
"extraction_config": {
|
||||||
"type": "llm",
|
"type": "llm",
|
||||||
@@ -209,7 +209,7 @@ async def test_screenshot():
|
|||||||
print("\n=== Testing Screenshot ===")
|
print("\n=== Testing Screenshot ===")
|
||||||
async with NBCNewsAPITest() as api:
|
async with NBCNewsAPITest() as api:
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.nbcnews.com/business",
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"priority": 5,
|
"priority": 5,
|
||||||
"screenshot": True,
|
"screenshot": True,
|
||||||
"crawler_params": {"headless": True},
|
"crawler_params": {"headless": True},
|
||||||
@@ -227,7 +227,7 @@ async def test_priority_handling():
|
|||||||
async with NBCNewsAPITest() as api:
|
async with NBCNewsAPITest() as api:
|
||||||
# Submit low priority task first
|
# Submit low priority task first
|
||||||
low_priority = {
|
low_priority = {
|
||||||
"urls": "https://www.nbcnews.com/business",
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"priority": 1,
|
"priority": 1,
|
||||||
"crawler_params": {"headless": True},
|
"crawler_params": {"headless": True},
|
||||||
}
|
}
|
||||||
@@ -235,7 +235,7 @@ async def test_priority_handling():
|
|||||||
|
|
||||||
# Submit high priority task
|
# Submit high priority task
|
||||||
high_priority = {
|
high_priority = {
|
||||||
"urls": "https://www.nbcnews.com/business/consumer",
|
"urls": ["https://www.nbcnews.com/business/consumer"],
|
||||||
"priority": 10,
|
"priority": 10,
|
||||||
"crawler_params": {"headless": True},
|
"crawler_params": {"headless": True},
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user