feat: add comprehensive type definitions and improve test coverage
Add a new type-definitions file with extensive Union type aliases for all core components, including AsyncUrlSeeder, SeedingConfig, and the various crawler strategies. Enhance test coverage with improved bot-detection tests, Docker-based testing, and extended-features validation. These changes provide better type safety and a more robust testing infrastructure for the crawling framework.
This commit is contained in:
@@ -174,6 +174,31 @@ class SeedRequest(BaseModel):
|
||||
config: Dict[str, Any] = Field(default_factory=dict)
|
||||
|
||||
|
||||
class URLDiscoveryRequest(BaseModel):
    """Request model for URL discovery endpoint."""

    # Required field: `...` as the default means the caller must supply it.
    domain: str = Field(
        ...,
        example="docs.crawl4ai.com",
        description="Domain to discover URLs from",
    )

    # Free-form option mapping; a fresh empty dict is created per instance
    # when the caller omits it. The keys are not validated by this model.
    # NOTE(review): the `example` payload below is sample data attached to the
    # field, not a set of defaults — presumably it mirrors the seeder's
    # configuration options; confirm against the seeder's config class.
    seeding_config: Dict[str, Any] = Field(
        default_factory=dict,
        description="Configuration for URL discovery using AsyncUrlSeeder",
        example={
            "source": "sitemap+cc",
            "pattern": "*",
            "live_check": False,
            "extract_head": False,
            "max_urls": -1,
            "concurrency": 1000,
            "hits_per_sec": 5,
            "force": False,
            "verbose": False,
            "query": None,
            "score_threshold": None,
            "scoring_method": "bm25",
            "filter_nonsense_urls": True,
        },
    )
|
||||
|
||||
|
||||
# --- C4A Script Schemas ---
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user