Enhance crawler capabilities and documentation

- Add llm.txt generator.
- Add SSL certificate extraction in AsyncWebCrawler.
- Introduce new content filters and chunking strategies for more robust data extraction.
- Update documentation.
This commit is contained in:
UncleCode
2024-12-25 21:34:31 +08:00
parent 84b311760f
commit d5ed451299
59 changed files with 2208 additions and 1763 deletions

View File

@@ -14,7 +14,12 @@ class ChunkingStrategy(ABC):
Abstract method to chunk the given text.
"""
pass
# Identity chunking strategy: f(x) = [x] (the whole text is returned as a single chunk)
class IdentityChunking(ChunkingStrategy):
    """Chunking strategy that leaves the text unsplit.

    The whole input is returned as the sole element of a list,
    i.e. f(x) = [x].
    """

    def chunk(self, text: str) -> list:
        """Return a one-element list containing ``text`` unchanged."""
        whole_text_as_single_chunk = [text]
        return whole_text_as_single_chunk
# Regex-based chunking
class RegexChunking(ChunkingStrategy):
def __init__(self, patterns=None, **kwargs):
@@ -127,7 +132,6 @@ class SlidingWindowChunking(ChunkingStrategy):
return chunks
class OverlappingWindowChunking(ChunkingStrategy):
def __init__(self, window_size=1000, overlap=100, **kwargs):
"""