Enhance crawler capabilities and documentation

- Add llm.txt generator.
- Add SSL certificate extraction in AsyncWebCrawler.
- Introduce new content filters and chunking strategies for more robust data extraction.
- Update documentation.
This commit is contained in:
UncleCode
2024-12-25 21:34:31 +08:00
parent 84b311760f
commit d5ed451299
59 changed files with 2208 additions and 1763 deletions

View File

@@ -1,8 +1,7 @@
import click
import sys
import asyncio
from pathlib import Path
from typing import List, Optional
from typing import List
from .docs_manager import DocsManager
from .async_logger import AsyncLogger
@@ -10,20 +9,19 @@ logger = AsyncLogger(verbose=True)
docs_manager = DocsManager(logger)
def print_table(headers: List[str], rows: List[List[str]], padding: int = 2):
"""Helper function to print formatted tables"""
col_widths = [max(len(str(cell)) for cell in col) for col in zip(headers, *rows)]
border = '+' + '+'.join('-' * (width + 2 * padding) for width in col_widths) + '+'
"""Print formatted table with headers and rows"""
widths = [max(len(str(cell)) for cell in col) for col in zip(headers, *rows)]
border = '+' + '+'.join('-' * (w + 2 * padding) for w in widths) + '+'
def format_row(row):
return '|' + '|'.join(f"{' ' * padding}{str(cell):<{w}}{' ' * padding}"
for cell, w in zip(row, widths)) + '|'
def print_row(row):
return '|' + '|'.join(
f"{str(cell):{' '}<{width}}" for cell, width in zip(row, col_widths)
) + '|'
click.echo(border)
click.echo(print_row(headers))
click.echo(format_row(headers))
click.echo(border)
for row in rows:
click.echo(print_row(row))
click.echo(format_row(row))
click.echo(border)
@click.group()
@@ -33,63 +31,75 @@ def cli():
@cli.group()
def docs():
"""Documentation and LLM text operations"""
"""Documentation operations"""
pass
@docs.command()
@click.argument('sections', nargs=-1)
@click.option('--mode', type=click.Choice(['extended', 'condensed']), default='extended',
help='Documentation detail level')
@click.option('--mode', type=click.Choice(['extended', 'condensed']), default='extended')
def combine(sections: tuple, mode: str):
"""Combine documentation sections.
If no sections are specified, combines all available sections.
"""
"""Combine documentation sections"""
try:
asyncio.run(docs_manager.ensure_docs_exist())
result = docs_manager.concatenate_docs(sections, mode)
click.echo(result)
click.echo(docs_manager.generate(sections, mode))
except Exception as e:
logger.error(str(e), tag="ERROR")
sys.exit(1)
@docs.command()
@click.argument('query')
@click.option('--top-k', '-k', default=5, help='Number of top results to return')
def search(query: str, top_k: int):
"""Search through documentation questions"""
@click.option('--top-k', '-k', default=5)
@click.option('--build-index', is_flag=True, help='Build index if missing')
def search(query: str, top_k: int, build_index: bool):
"""Search documentation"""
try:
results = docs_manager.search_questions(query, top_k)
click.echo(results)
except Exception as e:
click.echo(f"Error: {str(e)}", err=True)
sys.exit(1)
@docs.command()
def list():
"""List available documentation sections"""
try:
file_map = docs_manager.get_file_map()
rows = [[num, name] for name, num in file_map.items()]
rows.sort(key=lambda x: int(x[0]))
print_table(['Number', 'Section Name'], rows)
result = docs_manager.search(query, top_k)
if result == "No search index available. Call build_search_index() first.":
if build_index or click.confirm('No search index found. Build it now?'):
asyncio.run(docs_manager.llm_text.generate_index_files())
result = docs_manager.search(query, top_k)
click.echo(result)
except Exception as e:
click.echo(f"Error: {str(e)}", err=True)
sys.exit(1)
@docs.command()
def update():
"""Update local documentation cache from GitHub"""
"""Update docs from GitHub"""
try:
docs_manager = DocsManager()
docs_manager.update_docs()
asyncio.run(docs_manager.fetch_docs())
click.echo("Documentation updated successfully")
except Exception as e:
click.echo(f"Error: {str(e)}", err=True)
sys.exit(1)
@docs.command()
@click.option('--force-facts', is_flag=True, help='Force regenerate fact files')
@click.option('--clear-cache', is_flag=True, help='Clear BM25 cache')
def index(force_facts: bool, clear_cache: bool):
    """Build or rebuild search indexes"""
    # NOTE: click shows the docstring above as this command's --help text,
    # so longer explanations live in comments instead.
    #
    # force_facts -> forwarded as `force_generate_facts`
    # clear_cache -> forwarded as `clear_bm25_cache`
    generation_options = {
        "force_generate_facts": force_facts,
        "clear_bm25_cache": clear_cache,
    }
    try:
        # Step 1: make sure the documentation is available before indexing.
        asyncio.run(docs_manager.ensure_docs_exist())
        # Step 2: generate the index files with the requested options.
        asyncio.run(
            docs_manager.llm_text.generate_index_files(**generation_options)
        )
        click.echo("Search indexes built successfully")
    except Exception as exc:
        # Any failure is reported on stderr and turned into exit status 1.
        click.echo(f"Error: {str(exc)}", err=True)
        sys.exit(1)
# `docs list` command: prints the available sections as a one-column table.
# NOTE: defining a module-level function called `list` shadows the built-in
# `list` for any code that follows it in this module.
@docs.command()
def list():
    """List available documentation sections"""
    # The docstring above is what click prints for `--help`; keep it short.
    try:
        # One table row per section name returned by the docs manager.
        table_rows = [[name] for name in docs_manager.list()]
        print_table(["Sections"], table_rows)
    except Exception as exc:
        # Report the failure on stderr and exit with a non-zero status.
        click.echo(f"Error: {str(exc)}", err=True)
        sys.exit(1)
if __name__ == '__main__':
cli()
cli()