Enhance crawler capabilities and documentation
- Add llm.txt generator.
- Add SSL certificate extraction in AsyncWebCrawler.
- Introduce new content filters and chunking strategies for more robust data extraction.
- Update documentation.
This commit is contained in:
@@ -1,8 +1,7 @@
|
||||
import click
|
||||
import sys
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
from typing import List
|
||||
from .docs_manager import DocsManager
|
||||
from .async_logger import AsyncLogger
|
||||
|
||||
# Module-level singletons shared by every CLI command below.
# NOTE(review): reconstructed from diff residue — the original span was a
# garbled hunk header containing both context lines.
logger = AsyncLogger(verbose=True)
docs_manager = DocsManager(logger)
||||
def print_table(headers: List[str], rows: List[List[str]], padding: int = 2):
    """Print a formatted ASCII table with headers and rows.

    Column widths are computed from the widest cell in each column
    (headers included); `padding` spaces are added on each side of a cell.

    Args:
        headers: Column titles, one per column.
        rows: Table body; each inner list must match `headers` in length.
        padding: Spaces added on each side of every cell (default 2).
    """
    # Widest cell per column, considering the header and every row.
    widths = [max(len(str(cell)) for cell in col) for col in zip(headers, *rows)]
    border = '+' + '+'.join('-' * (w + 2 * padding) for w in widths) + '+'

    def format_row(row):
        # Left-align each cell to its column width, padded on both sides.
        return '|' + '|'.join(f"{' ' * padding}{str(cell):<{w}}{' ' * padding}"
                              for cell, w in zip(row, widths)) + '|'

    click.echo(border)
    click.echo(format_row(headers))
    click.echo(border)
    for row in rows:
        click.echo(format_row(row))
    click.echo(border)
||||
@click.group()
|
||||
@@ -33,63 +31,75 @@ def cli():
|
||||
|
||||
@cli.group()
def docs():
    """Documentation operations"""
    # Pure command group: subcommands (combine, search, update, index, list)
    # carry all the behavior.
    pass
|
||||
@docs.command()
@click.argument('sections', nargs=-1)
@click.option('--mode', type=click.Choice(['extended', 'condensed']), default='extended',
              help='Documentation detail level')
def combine(sections: tuple, mode: str):
    """Combine documentation sections.

    If no sections are specified, combines all available sections.
    """
    try:
        # Make sure the local docs cache exists before generating output.
        asyncio.run(docs_manager.ensure_docs_exist())
        click.echo(docs_manager.generate(sections, mode))
    except Exception as e:
        # Log through the async logger (not click.echo) and exit non-zero
        # so shell pipelines can detect failure.
        logger.error(str(e), tag="ERROR")
        sys.exit(1)
|
||||
@docs.command()
@click.argument('query')
@click.option('--top-k', '-k', default=5, help='Number of top results to return')
@click.option('--build-index', is_flag=True, help='Build index if missing')
def search(query: str, top_k: int, build_index: bool):
    """Search documentation"""
    try:
        result = docs_manager.search(query, top_k)
        # docs_manager.search signals a missing index via this sentinel
        # string rather than raising — compare against it verbatim.
        if result == "No search index available. Call build_search_index() first.":
            # Build the index automatically with --build-index, otherwise
            # ask interactively, then retry the search once.
            if build_index or click.confirm('No search index found. Build it now?'):
                asyncio.run(docs_manager.llm_text.generate_index_files())
                result = docs_manager.search(query, top_k)
        click.echo(result)
    except Exception as e:
        click.echo(f"Error: {str(e)}", err=True)
        sys.exit(1)
||||
|
||||
@docs.command()
def update():
    """Update docs from GitHub"""
    try:
        # Uses the module-level docs_manager; fetch_docs is async, so drive
        # it to completion with asyncio.run.
        asyncio.run(docs_manager.fetch_docs())
        click.echo("Documentation updated successfully")
    except Exception as e:
        click.echo(f"Error: {str(e)}", err=True)
        sys.exit(1)
|
||||
|
||||
@docs.command()
@click.option('--force-facts', is_flag=True, help='Force regenerate fact files')
@click.option('--clear-cache', is_flag=True, help='Clear BM25 cache')
def index(force_facts: bool, clear_cache: bool):
    """Build or rebuild search indexes"""
    try:
        # Docs must exist locally before indexes can be generated over them.
        asyncio.run(docs_manager.ensure_docs_exist())
        asyncio.run(docs_manager.llm_text.generate_index_files(
            force_generate_facts=force_facts,
            clear_bm25_cache=clear_cache,
        ))
        click.echo("Search indexes built successfully")
    except Exception as e:
        click.echo(f"Error: {str(e)}", err=True)
        sys.exit(1)
|
||||
|
||||
@docs.command()
def list():
    """List available documentation sections"""
    # NOTE(review): the function name shadows the builtin `list`, but Click
    # derives the CLI command name from it, so renaming would change the
    # user-facing command. Safe here: the body never uses the builtin.
    try:
        sections = docs_manager.list()
        print_table(["Sections"], [[section] for section in sections])
    except Exception as e:
        click.echo(f"Error: {str(e)}", err=True)
        sys.exit(1)
|
||||
|
||||
|
||||
|
||||
# Script entry point: dispatch to the Click command group.
if __name__ == '__main__':
    cli()
|
||||
Reference in New Issue
Block a user