Enhance crawler capabilities and documentation

- Add llm.txt generator
  - Added SSL certificate extraction in AsyncWebCrawler.
  - Introduced new content filters and chunking strategies for more robust data extraction.
  - Updated documentation.
This commit is contained in:
UncleCode
2024-12-25 21:34:31 +08:00
parent 84b311760f
commit d5ed451299
59 changed files with 2208 additions and 1763 deletions

View File

@@ -1,7 +1,14 @@
from pydantic import BaseModel, HttpUrl
from typing import List, Dict, Optional, Callable, Awaitable, Union
from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
from dataclasses import dataclass
@dataclass
class TokenUsage:
completion_tokens: int = 0
prompt_tokens: int = 0
total_tokens: int = 0
completion_tokens_details: Optional[dict] = None
prompt_tokens_details: Optional[dict] = None
class UrlModel(BaseModel):
url: HttpUrl
@@ -34,7 +41,8 @@ class CrawlResult(BaseModel):
session_id: Optional[str] = None
response_headers: Optional[dict] = None
status_code: Optional[int] = None
ssl_certificate: Optional[Dict[str, Any]] = None
class AsyncCrawlResponse(BaseModel):
html: str
response_headers: Dict[str, str]
@@ -43,8 +51,7 @@ class AsyncCrawlResponse(BaseModel):
pdf_data: Optional[bytes] = None
get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None
downloaded_files: Optional[List[str]] = None
ssl_certificate: Optional[Dict[str, Any]] = None
class Config:
arbitrary_types_allowed = True