Enhance Markdown generation and external content control

- Integrate customized html2text library for flexible Markdown output
- Add options to exclude external links and images
- Improve content scraping efficiency and error handling
- Update AsyncPlaywrightCrawlerStrategy for faster closing
- Enhance CosineStrategy with generic embedding model loading
This commit is contained in:
UncleCode
2024-10-20 18:56:58 +08:00
parent e7cd8a1c2d
commit 6ec4cb33ca
14 changed files with 1981 additions and 21 deletions

View File

@@ -4,22 +4,21 @@ from dotenv import load_dotenv
load_dotenv() # Load environment variables from .env file
# Default provider, ONLY used when the extraction strategy is LLMExtractionStrategy
DEFAULT_PROVIDER = "openai/gpt-4-turbo"
DEFAULT_PROVIDER = "openai/gpt-4o-mini"
MODEL_REPO_BRANCH = "new-release-0.0.2"
# Provider-model dictionary, ONLY used when the extraction strategy is LLMExtractionStrategy
PROVIDER_MODELS = {
"ollama/llama3": "no-token-needed", # Any model from Ollama no need for API token
"groq/llama3-70b-8192": os.getenv("GROQ_API_KEY"),
"groq/llama3-8b-8192": os.getenv("GROQ_API_KEY"),
"openai/gpt-3.5-turbo": os.getenv("OPENAI_API_KEY"),
"openai/gpt-4-turbo": os.getenv("OPENAI_API_KEY"),
"openai/gpt-4o-mini": os.getenv("OPENAI_API_KEY"),
"openai/gpt-4o": os.getenv("OPENAI_API_KEY"),
"anthropic/claude-3-haiku-20240307": os.getenv("ANTHROPIC_API_KEY"),
"anthropic/claude-3-opus-20240229": os.getenv("ANTHROPIC_API_KEY"),
"anthropic/claude-3-sonnet-20240229": os.getenv("ANTHROPIC_API_KEY"),
"anthropic/claude-3-5-sonnet-20240620": os.getenv("ANTHROPIC_API_KEY"),
}
# Chunk token threshold
CHUNK_TOKEN_THRESHOLD = 2 ** 11 # 2048 tokens
OVERLAP_RATE = 0.1
@@ -29,6 +28,27 @@ WORD_TOKEN_RATE = 1.3
MIN_WORD_THRESHOLD = 1
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD = 1
IMPORTANT_ATTRS = ['src', 'href', 'alt', 'title', 'width', 'height']
ONLY_TEXT_ELIGIBLE_TAGS = ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark']
SOCIAL_MEDIA_DOMAINS = [
'facebook.com',
'twitter.com',
'x.com',
'linkedin.com',
'instagram.com',
'pinterest.com',
'youtube.com',
'tiktok.com',
'snapchat.com',
'whatsapp.com',
'messenger.com',
'reddit.com',
'tumblr.com',
'buffer.com',
'xing.com',
'flipboard.com',
]
# Threshold for the Image extraction - Range is 1 to 6
# Images are scored based on point based system, to filter based on usefulness. Points are assigned
# to each image based on the following aspects.