Enhance crawler capabilities and documentation
- Added llm.txt generator. - Added SSL certificate extraction in AsyncWebCrawler. - Introduced new content filters and chunking strategies for more robust data extraction. - Updated documentation.
This commit is contained in:
49
tests/test_llmtxt.py
Normal file
49
tests/test_llmtxt.py
Normal file
@@ -0,0 +1,49 @@
|
||||
from crawl4ai.llmtxt import AsyncLLMTextManager # Changed to AsyncLLMTextManager
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
from pathlib import Path
|
||||
import asyncio
|
||||
|
||||
async def main():
    """Build the llm.txt index files and run a sample search against them.

    The docs directory is ``local/_docs/llm.txt`` resolved two directories
    above this file; it is created if missing. The markdown files found
    there are listed, index generation is awaited, and each sample query
    is then searched with ``top_k=2`` and a short preview is printed.
    """
    # Two levels up from this file, then into the llm.txt docs folder.
    # (An alternative location used previously: local/_docs/llm.txt/test_docs.)
    docs_dir = Path(__file__).resolve().parent.parent / "local/_docs/llm.txt"
    docs_dir.mkdir(parents=True, exist_ok=True)

    # Only batch_size is passed explicitly; every other option is left at
    # whatever the manager defaults to.
    manager = AsyncLLMTextManager(docs_dir, AsyncLogger(), batch_size=2)

    # Show which markdown sources are present before indexing.
    print("\nAvailable files:")
    for md_path in docs_dir.glob("*.md"):
        print(f"- {md_path.name}")

    print("\nGenerating index files...")
    await manager.generate_index_files(
        force_generate_facts=False,
        clear_bm25_cache=False,
    )

    # Sample questions about Crawl4AI used to exercise the search.
    sample_queries = (
        "How is using the `arun_many` method?",
    )

    print("\nTesting search functionality:")
    for question in sample_queries:
        print(f"\nQuery: {question}")
        # search() is called without await here, unlike index generation.
        hits = manager.search(question, top_k=2)
        print(f"Results length: {len(hits)} characters")
        if not hits:
            print("No results found")
            continue
        # Flatten newlines so the preview stays on a single console line.
        preview = hits[:200].replace('\n', ' ')
        print("First 200 chars of results:", preview, "...")
|
||||
|
||||
if __name__ == "__main__":
    # Executed as a script: run the async entry point to completion.
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user