refactor(core): improve type hints and remove unused file

- Add RelevantContentFilter to __init__.py exports
- Update version to 0.4.3b3
- Enhance type hints in async_configs.py
- Remove empty utils.scraping.py file
- Update mkdocs configuration with version info and GitHub integration

No breaking changes.
This commit is contained in:
UncleCode
2025-01-23 18:53:22 +08:00
parent 357414c345
commit 6dc01eae3a
5 changed files with 20 additions and 7 deletions

View File

@@ -16,7 +16,7 @@ from .extraction_strategy import (
)
from .chunking_strategy import ChunkingStrategy, RegexChunking
from .markdown_generation_strategy import DefaultMarkdownGenerator
from .content_filter_strategy import PruningContentFilter, BM25ContentFilter, LLMContentFilter
from .content_filter_strategy import PruningContentFilter, BM25ContentFilter, LLMContentFilter, RelevantContentFilter
from .models import CrawlResult, MarkdownGenerationResult
from .async_dispatcher import (
MemoryAdaptiveDispatcher,
@@ -44,6 +44,7 @@ __all__ = [
"ChunkingStrategy",
"RegexChunking",
"DefaultMarkdownGenerator",
"RelevantContentFilter",
"PruningContentFilter",
"BM25ContentFilter",
"LLMContentFilter",

View File

@@ -1,2 +1,2 @@
# crawl4ai/_version.py
__version__ = "0.4.3b2"
__version__ = "0.4.3b3"

View File

@@ -6,12 +6,15 @@ from .config import (
IMAGE_SCORE_THRESHOLD,
SOCIAL_MEDIA_DOMAINS,
)
from .user_agent_generator import UserAgentGenerator
from .extraction_strategy import ExtractionStrategy
from .chunking_strategy import ChunkingStrategy, RegexChunking
from .markdown_generation_strategy import MarkdownGenerationStrategy
from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter, LLMContentFilter, PruningContentFilter
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
from typing import Optional, Union, List
from .cache_context import CacheMode
class BrowserConfig:
@@ -81,13 +84,13 @@ class BrowserConfig:
user_data_dir: str = None,
chrome_channel: str = "chromium",
channel: str = "chromium",
proxy: Optional[str] = None,
proxy: str = None,
proxy_config: dict = None,
viewport_width: int = 1080,
viewport_height: int = 600,
accept_downloads: bool = False,
downloads_path: str = None,
storage_state=None,
storage_state : Union[str, dict, None]=None,
ignore_https_errors: bool = True,
java_script_enabled: bool = True,
sleep_on_close: bool = False,
@@ -382,7 +385,7 @@ class CrawlerRunConfig:
extraction_strategy: ExtractionStrategy = None,
chunking_strategy: ChunkingStrategy = RegexChunking(),
markdown_generator: MarkdownGenerationStrategy = None,
content_filter=None,
content_filter : RelevantContentFilter = None,
only_text: bool = False,
css_selector: str = None,
excluded_tags: list = None,
@@ -396,7 +399,7 @@ class CrawlerRunConfig:
# SSL Parameters
fetch_ssl_certificate: bool = False,
# Caching Parameters
cache_mode=None,
cache_mode: CacheMode =None,
session_id: str = None,
bypass_cache: bool = False,
disable_cache: bool = False,

View File

@@ -1,4 +1,4 @@
site_name: Crawl4AI Documentation
site_name: Crawl4AI Documentation (v0.4.3b3)
site_description: 🚀🤖 Crawl4AI, Open-source LLM-Friendly Web Crawler & Scraper
site_url: https://docs.crawl4ai.com
repo_url: https://github.com/unclecode/crawl4ai
@@ -52,6 +52,11 @@ nav:
theme:
name: 'terminal'
palette: 'dark'
icon:
repo: fontawesome/brands/github
plugins:
- search
markdown_extensions:
- pymdownx.highlight:
@@ -64,6 +69,9 @@ markdown_extensions:
- attr_list
- tables
extra:
version: !ENV [CRAWL4AI_VERSION, 'development']
extra_css:
- assets/styles.css
- assets/highlight.css
@@ -72,3 +80,4 @@ extra_css:
extra_javascript:
- assets/highlight.min.js
- assets/highlight_init.js
- https://buttons.github.io/buttons.js