Merge branch '2025-MAY-2' into next-MAY
This commit is contained in:
@@ -105,7 +105,7 @@ def test_docker_deployment(version="basic"):
|
||||
def test_basic_crawl(tester: Crawl4AiTester):
|
||||
print("\n=== Testing Basic Crawl ===")
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"priority": 10,
|
||||
"session_id": "test",
|
||||
}
|
||||
@@ -119,7 +119,7 @@ def test_basic_crawl(tester: Crawl4AiTester):
|
||||
def test_basic_crawl_sync(tester: Crawl4AiTester):
|
||||
print("\n=== Testing Basic Crawl (Sync) ===")
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"priority": 10,
|
||||
"session_id": "test",
|
||||
}
|
||||
@@ -134,7 +134,7 @@ def test_basic_crawl_sync(tester: Crawl4AiTester):
|
||||
def test_js_execution(tester: Crawl4AiTester):
|
||||
print("\n=== Testing JS Execution ===")
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"priority": 8,
|
||||
"js_code": [
|
||||
"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
|
||||
@@ -151,7 +151,7 @@ def test_js_execution(tester: Crawl4AiTester):
|
||||
def test_css_selector(tester: Crawl4AiTester):
|
||||
print("\n=== Testing CSS Selector ===")
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"priority": 7,
|
||||
"css_selector": ".wide-tease-item__description",
|
||||
"crawler_params": {"headless": True},
|
||||
@@ -188,7 +188,7 @@ def test_structured_extraction(tester: Crawl4AiTester):
|
||||
}
|
||||
|
||||
request = {
|
||||
"urls": "https://www.coinbase.com/explore",
|
||||
"urls": ["https://www.coinbase.com/explore"],
|
||||
"priority": 9,
|
||||
"extraction_config": {"type": "json_css", "params": {"schema": schema}},
|
||||
}
|
||||
@@ -223,7 +223,7 @@ def test_llm_extraction(tester: Crawl4AiTester):
|
||||
}
|
||||
|
||||
request = {
|
||||
"urls": "https://openai.com/api/pricing",
|
||||
"urls": ["https://openai.com/api/pricing"],
|
||||
"priority": 8,
|
||||
"extraction_config": {
|
||||
"type": "llm",
|
||||
@@ -270,7 +270,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
|
||||
}
|
||||
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"priority": 8,
|
||||
"extraction_config": {
|
||||
"type": "llm",
|
||||
@@ -297,7 +297,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
|
||||
def test_cosine_extraction(tester: Crawl4AiTester):
|
||||
print("\n=== Testing Cosine Extraction ===")
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"priority": 8,
|
||||
"extraction_config": {
|
||||
"type": "cosine",
|
||||
@@ -323,7 +323,7 @@ def test_cosine_extraction(tester: Crawl4AiTester):
|
||||
def test_screenshot(tester: Crawl4AiTester):
|
||||
print("\n=== Testing Screenshot ===")
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"priority": 5,
|
||||
"screenshot": True,
|
||||
"crawler_params": {"headless": True},
|
||||
|
||||
@@ -74,7 +74,7 @@ def test_docker_deployment(version="basic"):
|
||||
|
||||
def test_basic_crawl(tester: Crawl4AiTester):
|
||||
print("\n=== Testing Basic Crawl ===")
|
||||
request = {"urls": "https://www.nbcnews.com/business", "priority": 10}
|
||||
request = {"urls": ["https://www.nbcnews.com/business"], "priority": 10}
|
||||
|
||||
result = tester.submit_and_wait(request)
|
||||
print(f"Basic crawl result length: {len(result['result']['markdown'])}")
|
||||
@@ -85,7 +85,7 @@ def test_basic_crawl(tester: Crawl4AiTester):
|
||||
def test_js_execution(tester: Crawl4AiTester):
|
||||
print("\n=== Testing JS Execution ===")
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"priority": 8,
|
||||
"js_code": [
|
||||
"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
|
||||
@@ -102,7 +102,7 @@ def test_js_execution(tester: Crawl4AiTester):
|
||||
def test_css_selector(tester: Crawl4AiTester):
|
||||
print("\n=== Testing CSS Selector ===")
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"priority": 7,
|
||||
"css_selector": ".wide-tease-item__description",
|
||||
"crawler_params": {"headless": True},
|
||||
@@ -139,7 +139,7 @@ def test_structured_extraction(tester: Crawl4AiTester):
|
||||
}
|
||||
|
||||
request = {
|
||||
"urls": "https://www.coinbase.com/explore",
|
||||
"urls": ["https://www.coinbase.com/explore"],
|
||||
"priority": 9,
|
||||
"extraction_config": {"type": "json_css", "params": {"schema": schema}},
|
||||
}
|
||||
@@ -174,7 +174,7 @@ def test_llm_extraction(tester: Crawl4AiTester):
|
||||
}
|
||||
|
||||
request = {
|
||||
"urls": "https://openai.com/api/pricing",
|
||||
"urls": ["https://openai.com/api/pricing"],
|
||||
"priority": 8,
|
||||
"extraction_config": {
|
||||
"type": "llm",
|
||||
@@ -221,7 +221,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
|
||||
}
|
||||
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"priority": 8,
|
||||
"extraction_config": {
|
||||
"type": "llm",
|
||||
@@ -248,7 +248,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
|
||||
def test_cosine_extraction(tester: Crawl4AiTester):
|
||||
print("\n=== Testing Cosine Extraction ===")
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"priority": 8,
|
||||
"extraction_config": {
|
||||
"type": "cosine",
|
||||
@@ -274,7 +274,7 @@ def test_cosine_extraction(tester: Crawl4AiTester):
|
||||
def test_screenshot(tester: Crawl4AiTester):
|
||||
print("\n=== Testing Screenshot ===")
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"priority": 5,
|
||||
"screenshot": True,
|
||||
"crawler_params": {"headless": True},
|
||||
|
||||
@@ -54,7 +54,7 @@ class NBCNewsAPITest:
|
||||
async def test_basic_crawl():
|
||||
print("\n=== Testing Basic Crawl ===")
|
||||
async with NBCNewsAPITest() as api:
|
||||
request = {"urls": "https://www.nbcnews.com/business", "priority": 10}
|
||||
request = {"urls": ["https://www.nbcnews.com/business"], "priority": 10}
|
||||
task_id = await api.submit_crawl(request)
|
||||
result = await api.wait_for_task(task_id)
|
||||
print(f"Basic crawl result length: {len(result['result']['markdown'])}")
|
||||
@@ -67,7 +67,7 @@ async def test_js_execution():
|
||||
print("\n=== Testing JS Execution ===")
|
||||
async with NBCNewsAPITest() as api:
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"priority": 8,
|
||||
"js_code": [
|
||||
"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
|
||||
@@ -86,7 +86,7 @@ async def test_css_selector():
|
||||
print("\n=== Testing CSS Selector ===")
|
||||
async with NBCNewsAPITest() as api:
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"priority": 7,
|
||||
"css_selector": ".wide-tease-item__description",
|
||||
}
|
||||
@@ -120,7 +120,7 @@ async def test_structured_extraction():
|
||||
}
|
||||
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"priority": 9,
|
||||
"extraction_config": {"type": "json_css", "params": {"schema": schema}},
|
||||
}
|
||||
@@ -177,7 +177,7 @@ async def test_llm_extraction():
|
||||
}
|
||||
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"priority": 8,
|
||||
"extraction_config": {
|
||||
"type": "llm",
|
||||
@@ -209,7 +209,7 @@ async def test_screenshot():
|
||||
print("\n=== Testing Screenshot ===")
|
||||
async with NBCNewsAPITest() as api:
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"priority": 5,
|
||||
"screenshot": True,
|
||||
"crawler_params": {"headless": True},
|
||||
@@ -227,7 +227,7 @@ async def test_priority_handling():
|
||||
async with NBCNewsAPITest() as api:
|
||||
# Submit low priority task first
|
||||
low_priority = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"priority": 1,
|
||||
"crawler_params": {"headless": True},
|
||||
}
|
||||
@@ -235,7 +235,7 @@ async def test_priority_handling():
|
||||
|
||||
# Submit high priority task
|
||||
high_priority = {
|
||||
"urls": "https://www.nbcnews.com/business/consumer",
|
||||
"urls": ["https://www.nbcnews.com/business/consumer"],
|
||||
"priority": 10,
|
||||
"crawler_params": {"headless": True},
|
||||
}
|
||||
|
||||
91
tests/test_normalize_url.py
Normal file
91
tests/test_normalize_url.py
Normal file
@@ -0,0 +1,91 @@
|
||||
import unittest
|
||||
from crawl4ai.utils import normalize_url
|
||||
|
||||
class TestNormalizeUrl(unittest.TestCase):
    """Checks for ``normalize_url(href, base_url)``.

    Each test resolves an href against a base URL and compares the result
    with the absolute URL that is expected, or verifies that an unusable
    base URL is rejected with ``ValueError``.
    """

    # ------------------------------------------------------------------
    # Shared assertion helpers (not collected as tests: no ``test`` prefix)
    # ------------------------------------------------------------------

    def _assert_normalizes(self, href, base_url, expected):
        """Assert that ``normalize_url(href, base_url)`` equals *expected*."""
        self.assertEqual(normalize_url(href, base_url), expected)

    def _assert_rejects_base(self, base_url):
        """Assert that *base_url* triggers the "Invalid base URL format" error."""
        with self.assertRaises(ValueError) as caught:
            normalize_url("page.html", base_url)
        self.assertIn("Invalid base URL format", str(caught.exception))

    # ------------------------------------------------------------------
    # Relative hrefs against directory-like and file-like bases
    # ------------------------------------------------------------------

    def test_basic_relative_path(self):
        """A multi-segment relative path is appended under a directory base."""
        self._assert_normalizes(
            "path/to/page.html",
            "http://example.com/base/",
            "http://example.com/base/path/to/page.html",
        )

    def test_base_url_with_trailing_slash(self):
        """A base ending in '/' keeps its last segment as a directory."""
        self._assert_normalizes(
            "page.html",
            "http://example.com/base/",
            "http://example.com/base/page.html",
        )

    def test_base_url_without_trailing_slash(self):
        """Without a trailing slash the last base segment is replaced."""
        # Expected value follows urljoin semantics, where "base" counts as a
        # file name rather than a directory.
        self._assert_normalizes(
            "page.html",
            "http://example.com/base",
            "http://example.com/page.html",
        )

    def test_absolute_url_as_href(self):
        """An absolute href is returned as-is, ignoring the base."""
        self._assert_normalizes(
            "http://another.com/page.html",
            "http://example.com/",
            "http://another.com/page.html",
        )

    def test_href_with_leading_trailing_spaces(self):
        """Whitespace surrounding the href does not end up in the result."""
        self._assert_normalizes(
            " page.html ",
            "http://example.com/",
            "http://example.com/page.html",
        )

    def test_empty_href(self):
        """An empty href yields the base URL, with or without trailing slash."""
        # Base ending in '/': the base itself is expected back.
        self._assert_normalizes(
            "",
            "http://example.com/base/",
            "http://example.com/base/",
        )
        # Base not ending in '/': likewise the unchanged base is expected.
        self._assert_normalizes(
            "",
            "http://example.com/base",
            "http://example.com/base",
        )

    def test_href_with_query_parameters(self):
        """The query string of the href is carried into the result."""
        self._assert_normalizes(
            "page.html?query=test",
            "http://example.com/",
            "http://example.com/page.html?query=test",
        )

    def test_href_with_fragment(self):
        """The fragment of the href is carried into the result."""
        self._assert_normalizes(
            "page.html#section",
            "http://example.com/",
            "http://example.com/page.html#section",
        )

    def test_different_scheme_in_href(self):
        """An absolute https href wins over an http base."""
        self._assert_normalizes(
            "https://secure.example.com/page.html",
            "http://example.com/",
            "https://secure.example.com/page.html",
        )

    def test_parent_directory_in_href(self):
        """A leading '../' climbs one directory above the base path."""
        self._assert_normalizes(
            "../otherpage.html",
            "http://example.com/base/current/",
            "http://example.com/base/otherpage.html",
        )

    def test_root_relative_href(self):
        """A leading '/' resolves from the host root, dropping the base path."""
        self._assert_normalizes(
            "/otherpage.html",
            "http://example.com/base/current/",
            "http://example.com/otherpage.html",
        )

    def test_base_url_with_path_and_no_trailing_slash(self):
        """A slash-less single-segment base path is swapped for the href."""
        # Expected value follows urljoin semantics, where "path" counts as a
        # file name rather than a directory.
        self._assert_normalizes(
            "file.html",
            "http://example.com/path",
            "http://example.com/file.html",
        )

    def test_base_url_is_just_domain(self):
        """A bare-domain base gets a '/' inserted before the href."""
        self._assert_normalizes(
            "page.html",
            "http://example.com",
            "http://example.com/page.html",
        )

    def test_href_is_only_query(self):
        """A query-only href is attached to the base document."""
        self._assert_normalizes(
            "?query=true",
            "http://example.com/page.html",
            "http://example.com/page.html?query=true",
        )

    def test_href_is_only_fragment(self):
        """A fragment-only href is attached to the base document."""
        self._assert_normalizes(
            "#fragment",
            "http://example.com/page.html",
            "http://example.com/page.html#fragment",
        )

    def test_relative_link_from_base_file_url(self):
        """Regression case: relative links on a page whose URL names a file.

        For a page at ``http://example.com/path/to/document.html`` containing
        ``<a href="./file.xlsx">``, the expected target is
        ``http://example.com/path/to/file.xlsx`` - the link is resolved in the
        document's directory, not beneath the document name.
        """
        page_url = "http://example.com/zwgk/fdzdgk/zdxx/spaq/t19360680.shtml"

        # Link written with an explicit "./" prefix.
        self._assert_normalizes(
            "./P020241203375994691134.xlsx",
            page_url,
            "http://example.com/zwgk/fdzdgk/zdxx/spaq/P020241203375994691134.xlsx",
        )

        # Same directory, but the link has no "./" prefix.
        self._assert_normalizes(
            "another.doc",
            page_url,
            "http://example.com/zwgk/fdzdgk/zdxx/spaq/another.doc",
        )

    # ------------------------------------------------------------------
    # Rejected base URLs
    # ------------------------------------------------------------------

    def test_invalid_base_url_scheme(self):
        """An ftp:// base is expected to raise ValueError."""
        self._assert_rejects_base("ftp://example.com/")

    def test_invalid_base_url_netloc(self):
        """A base with an empty host part is expected to raise ValueError."""
        self._assert_rejects_base("http:///path/")

    # ------------------------------------------------------------------
    # Ports and percent-encoding
    # ------------------------------------------------------------------

    def test_base_url_with_port(self):
        """The explicit port of the base URL is kept in the result."""
        self._assert_normalizes(
            "path/file.html",
            "http://example.com:8080/base/",
            "http://example.com:8080/base/path/file.html",
        )

    def test_href_with_special_characters(self):
        """Percent-encoded characters in the href are left untouched."""
        self._assert_normalizes(
            "path%20with%20spaces/file.html",
            "http://example.com/",
            "http://example.com/path%20with%20spaces/file.html",
        )
|
||||
|
||||
# Run the suite when this file is executed directly rather than imported.
if __name__ == "__main__":
    unittest.main()
|
||||
Reference in New Issue
Block a user