Enhance crawler capabilities and documentation

- Add llm.txt generator.
- Add SSL certificate extraction in AsyncWebCrawler.
- Introduce new content filters and chunking strategies for more robust data extraction.
- Update documentation.
UncleCode
2024-12-25 21:34:31 +08:00
parent 84b311760f
commit d5ed451299
59 changed files with 2208 additions and 1763 deletions
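The SSL certificate extraction called out in the commit message is not exercised by the tests in this diff. A minimal sketch of how it might be used, assuming the run config exposes a fetch_ssl_certificate flag and the crawl result carries an ssl_certificate object (names inferred from the commit summary, not confirmed by this diff):

import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def check_cert():
    # Assumed flag: ask the crawler to capture the server's SSL certificate.
    config = CrawlerRunConfig(fetch_ssl_certificate=True)
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://example.com", config=config)
        # Assumed attributes on the result's certificate object.
        if result.ssl_certificate:
            print(result.ssl_certificate.issuer)
            print(result.ssl_certificate.valid_until)

if __name__ == "__main__":
    asyncio.run(check_cert())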

tests/test_cli_docs.py Normal file, 43 additions

@@ -0,0 +1,43 @@
import asyncio
from pathlib import Path
from crawl4ai.docs_manager import DocsManager
from click.testing import CliRunner
from crawl4ai.cli import cli


def test_cli():
    """Test all CLI commands"""
    runner = CliRunner()

    print("\n1. Testing docs update...")
    # Use sync version for testing
    docs_manager = DocsManager()
    loop = asyncio.get_event_loop()
    loop.run_until_complete(docs_manager.fetch_docs())

    # print("\n2. Testing listing...")
    # result = runner.invoke(cli, ['docs', 'list'])
    # print(f"Status: {'✅' if result.exit_code == 0 else '❌'}")
    # print(result.output)

    # print("\n2. Testing index building...")
    # result = runner.invoke(cli, ['docs', 'index'])
    # print(f"Status: {'✅' if result.exit_code == 0 else '❌'}")
    # print(f"Output: {result.output}")

    # print("\n3. Testing search...")
    # result = runner.invoke(cli, ['docs', 'search', 'how to use crawler', '--build-index'])
    # print(f"Status: {'✅' if result.exit_code == 0 else '❌'}")
    # print(f"First 200 chars: {result.output[:200]}...")

    # print("\n4. Testing combine with sections...")
    # result = runner.invoke(cli, ['docs', 'combine', 'chunking_strategies', 'extraction_strategies', '--mode', 'extended'])
    # print(f"Status: {'✅' if result.exit_code == 0 else '❌'}")
    # print(f"First 200 chars: {result.output[:200]}...")

    print("\n5. Testing combine all sections...")
    result = runner.invoke(cli, ['docs', 'combine', '--mode', 'condensed'])
    print(f"Status: {'✅' if result.exit_code == 0 else '❌'}")
    print(f"First 200 chars: {result.output[:200]}...")


if __name__ == "__main__":
    test_cli()

tests/test_llmtxt.py Normal file, 49 additions

@@ -0,0 +1,49 @@
from crawl4ai.llmtxt import AsyncLLMTextManager  # Changed to AsyncLLMTextManager
from crawl4ai.async_logger import AsyncLogger
from pathlib import Path
import asyncio


async def main():
    current_file = Path(__file__).resolve()
    # base_dir = current_file.parent.parent / "local/_docs/llm.txt/test_docs"
    base_dir = current_file.parent.parent / "local/_docs/llm.txt"
    docs_dir = base_dir

    # Create directory if it doesn't exist
    docs_dir.mkdir(parents=True, exist_ok=True)

    # Initialize logger
    logger = AsyncLogger()

    # Updated initialization with default batching params
    # manager = AsyncLLMTextManager(docs_dir, logger, max_concurrent_calls=3, batch_size=2)
    manager = AsyncLLMTextManager(docs_dir, logger, batch_size=2)

    # Let's first check what files we have
    print("\nAvailable files:")
    for f in docs_dir.glob("*.md"):
        print(f"- {f.name}")

    # Generate index files
    print("\nGenerating index files...")
    await manager.generate_index_files(
        force_generate_facts=False,
        clear_bm25_cache=False
    )

    # Test some relevant queries about Crawl4AI
    test_queries = [
        "How is using the `arun_many` method?",
    ]

    print("\nTesting search functionality:")
    for query in test_queries:
        print(f"\nQuery: {query}")
        results = manager.search(query, top_k=2)
        print(f"Results length: {len(results)} characters")
        if results:
            print("First 200 chars of results:", results[:200].replace('\n', ' '), "...")
        else:
            print("No results found")


if __name__ == "__main__":
    asyncio.run(main())