Commit Message:

Enhance Crawl4AI with CLI and documentation updates - Implemented Command-Line Interface (CLI) in `crawl4ai/cli.py` - Added chunking strategies and their documentation in `llm.txt`
2024-12-21 14:26:56 +08:00
parent 8fbc2e0463
commit 84b311760f
47 changed files with 6510 additions and 2 deletions
--- a/docs/llm.txt/llmtxt.py
+++ b/docs/llm.txt/llmtxt.py
@@ -0,0 +1,187 @@
+import os
+from pathlib import Path
+from rank_bm25 import BM25Okapi
+import re
+from typing import List, Literal
+
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+import nltk
+
+
+BASE_PATH = Path(__file__).resolve().parent
+
+def get_file_map() -> dict:
+    """Cache file mappings to avoid repeated directory scans"""
+    files = os.listdir(BASE_PATH)
+    file_map = {}
+    
+    for file in files:
+        if file.endswith('.md'):
+            # Extract number and name: "6_chunking_strategies.md" -> ("chunking_strategies", "6")
+            match = re.match(r'(\d+)_(.+?)(?:\.(?:ex|xs|sm|q)?\.md)?$', file)
+            if match:
+                num, name = match.groups()
+                if name not in file_map:
+                    file_map[name] = num
+    return file_map
+
+def concatenate_docs(file_names: List[str], mode: Literal["extended", "condensed"]) -> str:
+    """Concatenate documentation files based on names and mode."""
+    file_map = get_file_map()
+    result = []
+    suffix_map = {
+        "extended": ".ex.md",
+        "condensed": [".xs.md", ".sm.md"]
+    }
+    
+    for name in file_names:
+        if name not in file_map:
+            continue
+            
+        num = file_map[name]
+        base_path = BASE_PATH
+        
+        if mode == "extended":
+            file_path = base_path / f"{num}_{name}{suffix_map[mode]}"
+            if not file_path.exists():
+                file_path = base_path / f"{num}_{name}.md"
+        else:
+            file_path = None
+            for suffix in suffix_map["condensed"]:
+                temp_path = base_path / f"{num}_{name}{suffix}"
+                if temp_path.exists():
+                    file_path = temp_path
+                    break
+            if not file_path:
+                file_path = base_path / f"{num}_{name}.md"
+        
+        if file_path.exists():
+            with open(file_path, 'r', encoding='utf-8') as f:
+                result.append(f.read())
+    
+    return "\n\n---\n\n".join(result)
+
+def extract_questions(content: str) -> List[tuple[str, str, str]]:
+    """
+    Extract questions from Q files, returning list of (category, question, full_section).
+    """
+    # Split into main sections (### Questions or ### Hypothetical Questions)
+    sections = re.split(r'^###\s+.*Questions\s*$', content, flags=re.MULTILINE)[1:]
+    
+    results = []
+    for section in sections:
+        # Find all numbered categories (1. **Category Name**)
+        categories = re.split(r'^\d+\.\s+\*\*([^*]+)\*\*\s*$', section.strip(), flags=re.MULTILINE)
+        
+        # Process each category
+        for i in range(1, len(categories), 2):
+            category = categories[i].strip()
+            category_content = categories[i+1].strip()
+            
+            # Extract questions (lines starting with dash and wrapped in italics)
+            questions = re.findall(r'^\s*-\s*\*"([^"]+)"\*\s*$', category_content, flags=re.MULTILINE)
+            
+            # Add each question with its category and full context
+            for q in questions:
+                results.append((category, q, f"Category: {category}\nQuestion: {q}"))
+    
+    return results
+
+def preprocess_text(text: str) -> List[str]:
+    """Preprocess text for better semantic matching"""
+    # Lowercase and tokenize
+    tokens = word_tokenize(text.lower())
+    
+    # Remove stopwords but keep question words
+    stop_words = set(stopwords.words('english')) - {'how', 'what', 'when', 'where', 'why', 'which'}
+    lemmatizer = WordNetLemmatizer()
+    
+    # Lemmatize but preserve original form for technical terms
+    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
+    
+    return tokens
+
+def search_questions(query: str, top_k: int = 5) -> str:
+    """Search through Q files using BM25 ranking and return top K matches."""
+    q_files = [f for f in os.listdir(BASE_PATH) if f.endswith(".q.md")]
+    # Prepare base path for file reading
+    q_files = [BASE_PATH / f for f in q_files] # Convert to full path
+    
+    documents = []
+    file_contents = {}
+    
+    for file in q_files:
+        with open(file, 'r', encoding='utf-8') as f:
+            content = f.read()
+            questions = extract_questions(content)
+            for category, question, full_section in questions:
+                documents.append(question)
+                file_contents[question] = (file, category, full_section)
+
+    if not documents:
+        return "No questions found in documentation."
+
+    tokenized_docs = [preprocess_text(doc) for doc in documents]
+    tokenized_query = preprocess_text(query)
+    
+    bm25 = BM25Okapi(tokenized_docs)
+    doc_scores = bm25.get_scores(tokenized_query)
+    
+    score_threshold = max(doc_scores) * 0.4
+    
+    # Aggregate scores by file
+    file_data = {}
+    for idx, score in enumerate(doc_scores):
+        if score > score_threshold:
+            question = documents[idx]
+            file, category, _ = file_contents[question]
+            
+            if file not in file_data:
+                file_data[file] = {
+                    'total_score': 0,
+                    'match_count': 0,
+                    'questions': []
+                }
+            
+            file_data[file]['total_score'] += score
+            file_data[file]['match_count'] += 1
+            file_data[file]['questions'].append({
+                'category': category,
+                'question': question,
+                'score': score
+            })
+    
+    # Sort files by match count and total score
+    ranked_files = sorted(
+        file_data.items(),
+        key=lambda x: (x[1]['match_count'], x[1]['total_score']),
+        reverse=True
+    )[:top_k]
+    
+    # Format results by file
+    results = []
+    for file, data in ranked_files:
+        questions_summary = "\n".join(
+            f"- [{q['category']}] {q['question']} (score: {q['score']:.2f})"
+            for q in sorted(data['questions'], key=lambda x: x['score'], reverse=True)
+        )
+        
+        results.append(
+            f"File: {file}\n"
+            f"Match Count: {data['match_count']}\n"
+            f"Total Score: {data['total_score']:.2f}\n\n"
+            f"Matching Questions:\n{questions_summary}"
+        )
+    
+    return "\n\n---\n\n".join(results) if results else "No relevant matches found."
+
+if __name__ == "__main__":
+    # Example 1: Concatenate docs
+    docs = concatenate_docs(["chunking_strategies", "content_selection"], "extended")
+    print("Concatenated docs:", docs[:200], "...\n")
+    
+    # Example 2: Search questions
+    results = search_questions("How do I execute JS script on the page?", 3)
+    print("Search results:", results[:200], "...")