Enhance crawler capabilities and documentation
- Add llm.txt generator - Added SSL certificate extraction in AsyncWebCrawler. - Introduced new content filters and chunking strategies for more robust data extraction. - Updated documentation.
This commit is contained in:
@@ -1,7 +1,14 @@
|
||||
from pydantic import BaseModel, HttpUrl
|
||||
from typing import List, Dict, Optional, Callable, Awaitable, Union
|
||||
|
||||
|
||||
from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
|
||||
from dataclasses import dataclass
|
||||
@dataclass
|
||||
class TokenUsage:
|
||||
completion_tokens: int = 0
|
||||
prompt_tokens: int = 0
|
||||
total_tokens: int = 0
|
||||
completion_tokens_details: Optional[dict] = None
|
||||
prompt_tokens_details: Optional[dict] = None
|
||||
|
||||
|
||||
class UrlModel(BaseModel):
|
||||
url: HttpUrl
|
||||
@@ -34,7 +41,8 @@ class CrawlResult(BaseModel):
|
||||
session_id: Optional[str] = None
|
||||
response_headers: Optional[dict] = None
|
||||
status_code: Optional[int] = None
|
||||
|
||||
ssl_certificate: Optional[Dict[str, Any]] = None
|
||||
|
||||
class AsyncCrawlResponse(BaseModel):
|
||||
html: str
|
||||
response_headers: Dict[str, str]
|
||||
@@ -43,8 +51,7 @@ class AsyncCrawlResponse(BaseModel):
|
||||
pdf_data: Optional[bytes] = None
|
||||
get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None
|
||||
downloaded_files: Optional[List[str]] = None
|
||||
ssl_certificate: Optional[Dict[str, Any]] = None
|
||||
|
||||
class Config:
|
||||
arbitrary_types_allowed = True
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user