refactor(core): improve type hints and remove unused file

- Add RelevantContentFilter to __init__.py exports
- Update version to 0.4.3b3
- Enhance type hints in async_configs.py
- Remove empty utils.scraping.py file
- Update mkdocs configuration with version info and GitHub integration

BREAKING CHANGE: None
This commit is contained in:
UncleCode
2025-01-23 18:53:22 +08:00
parent 357414c345
commit 6dc01eae3a
5 changed files with 20 additions and 7 deletions

View File

@@ -16,7 +16,7 @@ from .extraction_strategy import (
) )
from .chunking_strategy import ChunkingStrategy, RegexChunking from .chunking_strategy import ChunkingStrategy, RegexChunking
from .markdown_generation_strategy import DefaultMarkdownGenerator from .markdown_generation_strategy import DefaultMarkdownGenerator
from .content_filter_strategy import PruningContentFilter, BM25ContentFilter, LLMContentFilter from .content_filter_strategy import PruningContentFilter, BM25ContentFilter, LLMContentFilter, RelevantContentFilter
from .models import CrawlResult, MarkdownGenerationResult from .models import CrawlResult, MarkdownGenerationResult
from .async_dispatcher import ( from .async_dispatcher import (
MemoryAdaptiveDispatcher, MemoryAdaptiveDispatcher,
@@ -44,6 +44,7 @@ __all__ = [
"ChunkingStrategy", "ChunkingStrategy",
"RegexChunking", "RegexChunking",
"DefaultMarkdownGenerator", "DefaultMarkdownGenerator",
"RelevantContentFilter",
"PruningContentFilter", "PruningContentFilter",
"BM25ContentFilter", "BM25ContentFilter",
"LLMContentFilter", "LLMContentFilter",

View File

@@ -1,2 +1,2 @@
# crawl4ai/_version.py # crawl4ai/_version.py
__version__ = "0.4.3b2" __version__ = "0.4.3b3"

View File

@@ -6,12 +6,15 @@ from .config import (
IMAGE_SCORE_THRESHOLD, IMAGE_SCORE_THRESHOLD,
SOCIAL_MEDIA_DOMAINS, SOCIAL_MEDIA_DOMAINS,
) )
from .user_agent_generator import UserAgentGenerator from .user_agent_generator import UserAgentGenerator
from .extraction_strategy import ExtractionStrategy from .extraction_strategy import ExtractionStrategy
from .chunking_strategy import ChunkingStrategy, RegexChunking from .chunking_strategy import ChunkingStrategy, RegexChunking
from .markdown_generation_strategy import MarkdownGenerationStrategy from .markdown_generation_strategy import MarkdownGenerationStrategy
from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter, LLMContentFilter, PruningContentFilter
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
from typing import Optional, Union, List from typing import Optional, Union, List
from .cache_context import CacheMode
class BrowserConfig: class BrowserConfig:
@@ -81,13 +84,13 @@ class BrowserConfig:
user_data_dir: str = None, user_data_dir: str = None,
chrome_channel: str = "chromium", chrome_channel: str = "chromium",
channel: str = "chromium", channel: str = "chromium",
proxy: Optional[str] = None, proxy: str = None,
proxy_config: dict = None, proxy_config: dict = None,
viewport_width: int = 1080, viewport_width: int = 1080,
viewport_height: int = 600, viewport_height: int = 600,
accept_downloads: bool = False, accept_downloads: bool = False,
downloads_path: str = None, downloads_path: str = None,
storage_state=None, storage_state : Union[str, dict, None]=None,
ignore_https_errors: bool = True, ignore_https_errors: bool = True,
java_script_enabled: bool = True, java_script_enabled: bool = True,
sleep_on_close: bool = False, sleep_on_close: bool = False,
@@ -382,7 +385,7 @@ class CrawlerRunConfig:
extraction_strategy: ExtractionStrategy = None, extraction_strategy: ExtractionStrategy = None,
chunking_strategy: ChunkingStrategy = RegexChunking(), chunking_strategy: ChunkingStrategy = RegexChunking(),
markdown_generator: MarkdownGenerationStrategy = None, markdown_generator: MarkdownGenerationStrategy = None,
content_filter=None, content_filter : RelevantContentFilter = None,
only_text: bool = False, only_text: bool = False,
css_selector: str = None, css_selector: str = None,
excluded_tags: list = None, excluded_tags: list = None,
@@ -396,7 +399,7 @@ class CrawlerRunConfig:
# SSL Parameters # SSL Parameters
fetch_ssl_certificate: bool = False, fetch_ssl_certificate: bool = False,
# Caching Parameters # Caching Parameters
cache_mode=None, cache_mode: CacheMode =None,
session_id: str = None, session_id: str = None,
bypass_cache: bool = False, bypass_cache: bool = False,
disable_cache: bool = False, disable_cache: bool = False,

View File

@@ -1,4 +1,4 @@
site_name: Crawl4AI Documentation site_name: Crawl4AI Documentation (v0.4.3b3)
site_description: 🚀🤖 Crawl4AI, Open-source LLM-Friendly Web Crawler & Scraper site_description: 🚀🤖 Crawl4AI, Open-source LLM-Friendly Web Crawler & Scraper
site_url: https://docs.crawl4ai.com site_url: https://docs.crawl4ai.com
repo_url: https://github.com/unclecode/crawl4ai repo_url: https://github.com/unclecode/crawl4ai
@@ -52,6 +52,11 @@ nav:
theme: theme:
name: 'terminal' name: 'terminal'
palette: 'dark' palette: 'dark'
icon:
repo: fontawesome/brands/github
plugins:
- search
markdown_extensions: markdown_extensions:
- pymdownx.highlight: - pymdownx.highlight:
@@ -64,6 +69,9 @@ markdown_extensions:
- attr_list - attr_list
- tables - tables
extra:
version: !ENV [CRAWL4AI_VERSION, 'development']
extra_css: extra_css:
- assets/styles.css - assets/styles.css
- assets/highlight.css - assets/highlight.css
@@ -72,3 +80,4 @@ extra_css:
extra_javascript: extra_javascript:
- assets/highlight.min.js - assets/highlight.min.js
- assets/highlight_init.js - assets/highlight_init.js
- https://buttons.github.io/buttons.js