Files
crawl4ai/crawl4ai/config.py
UncleCode bcfe83f702 feat: enhance crawler with overlay removal and improved screenshot capabilities
• Add smart overlay removal system for handling popups and modals
• Improve screenshot functionality with configurable timing controls
• Implement URL normalization and enhanced link processing
• Add custom base directory support for cache storage
• Refine external content filtering and social media domain handling

This commit significantly improves the crawler's ability to handle modern
websites by automatically removing intrusive overlays and providing better
screenshot capabilities. URL handling is now more robust with proper
normalization and duplicate detection. The cache system is more flexible
with customizable base directory support.

Breaking changes: None
Issue numbers: None
2024-10-24 20:22:47 +08:00

54 lines
2.3 KiB
Python

import os
from dotenv import load_dotenv
load_dotenv() # Load environment variables from .env file
# Default provider, ONLY used when the extraction strategy is LLMExtractionStrategy
DEFAULT_PROVIDER = "openai/gpt-4o-mini"
MODEL_REPO_BRANCH = "new-release-0.0.2"
# Provider-model dictionary, ONLY used when the extraction strategy is LLMExtractionStrategy
PROVIDER_MODELS = {
"ollama/llama3": "no-token-needed", # Any model from Ollama no need for API token
"groq/llama3-70b-8192": os.getenv("GROQ_API_KEY"),
"groq/llama3-8b-8192": os.getenv("GROQ_API_KEY"),
"openai/gpt-4o-mini": os.getenv("OPENAI_API_KEY"),
"openai/gpt-4o": os.getenv("OPENAI_API_KEY"),
"anthropic/claude-3-haiku-20240307": os.getenv("ANTHROPIC_API_KEY"),
"anthropic/claude-3-opus-20240229": os.getenv("ANTHROPIC_API_KEY"),
"anthropic/claude-3-sonnet-20240229": os.getenv("ANTHROPIC_API_KEY"),
"anthropic/claude-3-5-sonnet-20240620": os.getenv("ANTHROPIC_API_KEY"),
}
# Chunk token threshold
CHUNK_TOKEN_THRESHOLD = 2 ** 11 # 2048 tokens
OVERLAP_RATE = 0.1
WORD_TOKEN_RATE = 1.3
# Threshold for the minimum number of word in a HTML tag to be considered
MIN_WORD_THRESHOLD = 1
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD = 1
IMPORTANT_ATTRS = ['src', 'href', 'alt', 'title', 'width', 'height']
ONLY_TEXT_ELIGIBLE_TAGS = ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark']
SOCIAL_MEDIA_DOMAINS = [
'facebook.com',
'twitter.com',
'x.com',
'linkedin.com',
'instagram.com',
'pinterest.com',
'tiktok.com',
'snapchat.com',
'reddit.com',
]
# Threshold for the Image extraction - Range is 1 to 6
# Images are scored based on point based system, to filter based on usefulness. Points are assigned
# to each image based on the following aspects.
# If either height or width exceeds 150px
# If image size is greater than 10Kb
# If alt property is set
# If image format is in jpg, png or webp
# If image is in the first half of the total images extracted from the page
IMAGE_SCORE_THRESHOLD = 2