Enhance crawler capabilities and documentation

- Add llm.txt generator.
- Add SSL certificate extraction in AsyncWebCrawler.
- Introduce new content filters and chunking strategies for more robust data extraction.
- Update documentation.
UncleCode
2024-12-25 21:34:31 +08:00
parent 84b311760f
commit d5ed451299
59 changed files with 2208 additions and 1763 deletions
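The SSL certificate extraction called out in the commit message is not exercised by the tests in this diff. A minimal sketch of how it might be used, assuming the run config exposes a fetch_ssl_certificate flag and the crawl result carries an ssl_certificate object (names inferred from the commit summary, not confirmed by this diff):

import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def check_cert():
    # Assumed flag: ask the crawler to capture the server's SSL certificate.
    config = CrawlerRunConfig(fetch_ssl_certificate=True)
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://example.com", config=config)
        # Assumed attributes on the result's certificate object.
        if result.ssl_certificate:
            print(result.ssl_certificate.issuer)
            print(result.ssl_certificate.valid_until)

if __name__ == "__main__":
    asyncio.run(check_cert())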

tests/test_cli_docs.py Normal file, 43 additions

@@ -0,0 +1,43 @@
import asyncio
from pathlib import Path
from crawl4ai.docs_manager import DocsManager
from click.testing import CliRunner
from crawl4ai.cli import cli


def test_cli():
    """Test all CLI commands"""
    runner = CliRunner()

    print("\n1. Testing docs update...")
    # Use sync version for testing
    docs_manager = DocsManager()
    loop = asyncio.get_event_loop()
    loop.run_until_complete(docs_manager.fetch_docs())

    # print("\n2. Testing listing...")
    # result = runner.invoke(cli, ['docs', 'list'])
    # print(f"Status: {'✅' if result.exit_code == 0 else '❌'}")
    # print(result.output)

    # print("\n2. Testing index building...")
    # result = runner.invoke(cli, ['docs', 'index'])
    # print(f"Status: {'✅' if result.exit_code == 0 else '❌'}")
    # print(f"Output: {result.output}")

    # print("\n3. Testing search...")
    # result = runner.invoke(cli, ['docs', 'search', 'how to use crawler', '--build-index'])
    # print(f"Status: {'✅' if result.exit_code == 0 else '❌'}")
    # print(f"First 200 chars: {result.output[:200]}...")

    # print("\n4. Testing combine with sections...")
    # result = runner.invoke(cli, ['docs', 'combine', 'chunking_strategies', 'extraction_strategies', '--mode', 'extended'])
    # print(f"Status: {'✅' if result.exit_code == 0 else '❌'}")
    # print(f"First 200 chars: {result.output[:200]}...")

    print("\n5. Testing combine all sections...")
    result = runner.invoke(cli, ['docs', 'combine', '--mode', 'condensed'])
    print(f"Status: {'✅' if result.exit_code == 0 else '❌'}")
    print(f"First 200 chars: {result.output[:200]}...")


if __name__ == "__main__":
    test_cli()

tests/test_llmtxt.py Normal file, 49 additions

@@ -0,0 +1,49 @@
from crawl4ai.llmtxt import AsyncLLMTextManager  # Changed to AsyncLLMTextManager
from crawl4ai.async_logger import AsyncLogger
from pathlib import Path
import asyncio


async def main():
    current_file = Path(__file__).resolve()
    # base_dir = current_file.parent.parent / "local/_docs/llm.txt/test_docs"
    base_dir = current_file.parent.parent / "local/_docs/llm.txt"
    docs_dir = base_dir

    # Create directory if it doesn't exist
    docs_dir.mkdir(parents=True, exist_ok=True)

    # Initialize logger
    logger = AsyncLogger()

    # Updated initialization with default batching params
    # manager = AsyncLLMTextManager(docs_dir, logger, max_concurrent_calls=3, batch_size=2)
    manager = AsyncLLMTextManager(docs_dir, logger, batch_size=2)

    # Let's first check what files we have
    print("\nAvailable files:")
    for f in docs_dir.glob("*.md"):
        print(f"- {f.name}")

    # Generate index files
    print("\nGenerating index files...")
    await manager.generate_index_files(
        force_generate_facts=False,
        clear_bm25_cache=False
    )

    # Test some relevant queries about Crawl4AI
    test_queries = [
        "How is using the `arun_many` method?",
    ]

    print("\nTesting search functionality:")
    for query in test_queries:
        print(f"\nQuery: {query}")
        results = manager.search(query, top_k=2)
        print(f"Results length: {len(results)} characters")
        if results:
            print("First 200 chars of results:", results[:200].replace('\n', ' '), "...")
        else:
            print("No results found")


if __name__ == "__main__":
    asyncio.run(main())