Apply Ruff Corrections

Author: UncleCode
Date: 2025-01-13 19:19:58 +08:00
Parent: c3370ec5da
Commit: 8ec12d7d68
84 changed files with 6861 additions and 5076 deletions
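
The diff below is formatter output: single quotes become double quotes, trailing ("magic") commas are added, short argument lists are collapsed onto one line, and top-level definitions are separated by two blank lines. As a minimal sketch, such a sweep is typically produced with the standard `ruff check --fix` and `ruff format` commands; the exact invocation is an assumption here, since the commit does not record it:

    import subprocess

    # Assumed invocation, not recorded in this commit: lint autofixes first,
    # then the formatter pass that normalizes quotes, commas, and wrapping.
    subprocess.run(["ruff", "check", ".", "--fix"], check=True)
    subprocess.run(["ruff", "format", "."], check=True)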


@@ -1,15 +1,16 @@
 import os, sys
 parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.append(parent_dir)
 __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

 import asyncio
 from crawl4ai import AsyncWebCrawler, CacheMode
-from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
 from crawl4ai.content_filter_strategy import PruningContentFilter
 from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
 from crawl4ai.chunking_strategy import RegexChunking
 from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

+
 # Category 1: Browser Configuration Tests
 async def test_browser_config_object():
@@ -21,29 +22,31 @@ async def test_browser_config_object():
         viewport_height=1080,
         use_managed_browser=True,
         user_agent_mode="random",
-        user_agent_generator_config={"device_type": "desktop", "os_type": "windows"}
+        user_agent_generator_config={"device_type": "desktop", "os_type": "windows"},
     )

     async with AsyncWebCrawler(config=browser_config, verbose=True) as crawler:
-        result = await crawler.arun('https://example.com', cache_mode=CacheMode.BYPASS)
+        result = await crawler.arun("https://example.com", cache_mode=CacheMode.BYPASS)
         assert result.success, "Browser config crawl failed"
         assert len(result.html) > 0, "No HTML content retrieved"

+
 async def test_browser_performance_config():
     """Test browser configurations focused on performance"""
     browser_config = BrowserConfig(
         text_mode=True,
         light_mode=True,
-        extra_args=['--disable-gpu', '--disable-software-rasterizer'],
+        extra_args=["--disable-gpu", "--disable-software-rasterizer"],
         ignore_https_errors=True,
-        java_script_enabled=False
+        java_script_enabled=False,
     )

     async with AsyncWebCrawler(config=browser_config) as crawler:
-        result = await crawler.arun('https://example.com')
+        result = await crawler.arun("https://example.com")
         assert result.success, "Performance optimized crawl failed"
         assert result.status_code == 200, "Unexpected status code"

+
 # Category 2: Content Processing Tests
 async def test_content_extraction_config():
     """Test content extraction with various strategies"""
@@ -53,24 +56,20 @@ async def test_content_extraction_config():
             schema={
                 "name": "article",
                 "baseSelector": "div",
-                "fields": [{
-                    "name": "title",
-                    "selector": "h1",
-                    "type": "text"
-                }]
+                "fields": [{"name": "title", "selector": "h1", "type": "text"}],
             }
         ),
         chunking_strategy=RegexChunking(),
-        content_filter=PruningContentFilter()
+        content_filter=PruningContentFilter(),
     )

     async with AsyncWebCrawler() as crawler:
         result = await crawler.arun(
-            'https://example.com/article',
-            config=crawler_config
+            "https://example.com/article", config=crawler_config
         )
         assert result.extracted_content is not None, "Content extraction failed"
-        assert 'title' in result.extracted_content, "Missing expected content field"
+        assert "title" in result.extracted_content, "Missing expected content field"

+
 # Category 3: Cache and Session Management Tests
 async def test_cache_and_session_management():
@@ -79,25 +78,20 @@ async def test_cache_and_session_management():
     crawler_config = CrawlerRunConfig(
         cache_mode=CacheMode.WRITE_ONLY,
         process_iframes=True,
-        remove_overlay_elements=True
+        remove_overlay_elements=True,
     )

     async with AsyncWebCrawler(config=browser_config) as crawler:
         # First request - should write to cache
-        result1 = await crawler.arun(
-            'https://example.com',
-            config=crawler_config
-        )
+        result1 = await crawler.arun("https://example.com", config=crawler_config)

         # Second request - should use fresh fetch due to WRITE_ONLY mode
-        result2 = await crawler.arun(
-            'https://example.com',
-            config=crawler_config
-        )
+        result2 = await crawler.arun("https://example.com", config=crawler_config)

         assert result1.success and result2.success, "Cache mode crawl failed"
         assert result1.html == result2.html, "Inconsistent results between requests"

+
 # Category 4: Media Handling Tests
 async def test_media_handling_config():
     """Test configurations related to media handling"""
@@ -107,24 +101,22 @@ async def test_media_handling_config():
         viewport_width=1920,
         viewport_height=1080,
         accept_downloads=True,
-        downloads_path= os.path.expanduser("~/.crawl4ai/downloads")
+        downloads_path=os.path.expanduser("~/.crawl4ai/downloads"),
     )
     crawler_config = CrawlerRunConfig(
         screenshot=True,
         pdf=True,
         adjust_viewport_to_content=True,
         wait_for_images=True,
-        screenshot_height_threshold=20000
+        screenshot_height_threshold=20000,
     )

     async with AsyncWebCrawler(config=browser_config) as crawler:
-        result = await crawler.arun(
-            'https://example.com',
-            config=crawler_config
-        )
+        result = await crawler.arun("https://example.com", config=crawler_config)
         assert result.screenshot is not None, "Screenshot capture failed"
         assert result.pdf is not None, "PDF generation failed"

+
 # Category 5: Anti-Bot and Site Interaction Tests
 async def test_antibot_config():
     """Test configurations for handling anti-bot measures"""
@@ -135,76 +127,64 @@ async def test_antibot_config():
wait_for="js:()=>document.querySelector('body')",
delay_before_return_html=1.0,
log_console=True,
cache_mode=CacheMode.BYPASS
cache_mode=CacheMode.BYPASS,
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
'https://example.com',
config=crawler_config
)
result = await crawler.arun("https://example.com", config=crawler_config)
assert result.success, "Anti-bot measure handling failed"
# Category 6: Parallel Processing Tests
async def test_parallel_processing():
"""Test parallel processing capabilities"""
crawler_config = CrawlerRunConfig(
mean_delay=0.5,
max_range=1.0,
semaphore_count=5
)
urls = [
'https://example.com/1',
'https://example.com/2',
'https://example.com/3'
]
crawler_config = CrawlerRunConfig(mean_delay=0.5, max_range=1.0, semaphore_count=5)
urls = ["https://example.com/1", "https://example.com/2", "https://example.com/3"]
async with AsyncWebCrawler() as crawler:
results = await crawler.arun_many(
urls,
config=crawler_config
)
results = await crawler.arun_many(urls, config=crawler_config)
assert len(results) == len(urls), "Not all URLs were processed"
assert all(r.success for r in results), "Some parallel requests failed"
# Category 7: Backwards Compatibility Tests
async def test_legacy_parameter_support():
"""Test that legacy parameters still work"""
async with AsyncWebCrawler(
headless=True,
browser_type="chromium",
viewport_width=1024,
viewport_height=768
headless=True, browser_type="chromium", viewport_width=1024, viewport_height=768
) as crawler:
result = await crawler.arun(
'https://example.com',
"https://example.com",
screenshot=True,
word_count_threshold=200,
bypass_cache=True,
css_selector=".main-content"
css_selector=".main-content",
)
assert result.success, "Legacy parameter support failed"
# Category 8: Mixed Configuration Tests
async def test_mixed_config_usage():
"""Test mixing new config objects with legacy parameters"""
browser_config = BrowserConfig(headless=True)
crawler_config = CrawlerRunConfig(screenshot=True)
async with AsyncWebCrawler(
config=browser_config,
verbose=True # legacy parameter
verbose=True, # legacy parameter
) as crawler:
result = await crawler.arun(
'https://example.com',
"https://example.com",
config=crawler_config,
cache_mode=CacheMode.BYPASS, # legacy parameter
css_selector="body" # legacy parameter
css_selector="body", # legacy parameter
)
assert result.success, "Mixed configuration usage failed"
if __name__ == "__main__":
async def run_tests():
test_functions = [
test_browser_config_object,
@@ -217,7 +197,7 @@ if __name__ == "__main__":
             # test_legacy_parameter_support,
             # test_mixed_config_usage
         ]

         for test in test_functions:
             print(f"\nRunning {test.__name__}...")
             try:
@@ -227,5 +207,5 @@ if __name__ == "__main__":
print(f"{test.__name__} failed: {str(e)}")
except Exception as e:
print(f"{test.__name__} error: {str(e)}")
asyncio.run(run_tests())
asyncio.run(run_tests())