From 6dc01eae3ac77092d6fe3e9f6730cb6afb1ae8d2 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 23 Jan 2025 18:53:22 +0800 Subject: [PATCH] refactor(core): improve type hints and remove unused file - Add RelevantContentFilter to __init__.py exports - Update version to 0.4.3b3 - Enhance type hints in async_configs.py - Remove empty utils.scraping.py file - Update mkdocs configuration with version info and GitHub integration BREAKING CHANGE: None --- crawl4ai/__init__.py | 3 ++- crawl4ai/__version__.py | 2 +- crawl4ai/async_configs.py | 11 +++++++---- crawl4ai/utils.scraping.py | 0 mkdocs.yml | 11 ++++++++++- 5 files changed, 20 insertions(+), 7 deletions(-) delete mode 100644 crawl4ai/utils.scraping.py diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 482afdd7..7f284323 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -16,7 +16,7 @@ from .extraction_strategy import ( ) from .chunking_strategy import ChunkingStrategy, RegexChunking from .markdown_generation_strategy import DefaultMarkdownGenerator -from .content_filter_strategy import PruningContentFilter, BM25ContentFilter, LLMContentFilter +from .content_filter_strategy import PruningContentFilter, BM25ContentFilter, LLMContentFilter, RelevantContentFilter from .models import CrawlResult, MarkdownGenerationResult from .async_dispatcher import ( MemoryAdaptiveDispatcher, @@ -44,6 +44,7 @@ __all__ = [ "ChunkingStrategy", "RegexChunking", "DefaultMarkdownGenerator", + "RelevantContentFilter", "PruningContentFilter", "BM25ContentFilter", "LLMContentFilter", diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index a0acc761..3274435a 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.4.3b2" +__version__ = "0.4.3b3" diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index b0813abe..c1404026 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -6,12 +6,15 @@ from .config 
import ( IMAGE_SCORE_THRESHOLD, SOCIAL_MEDIA_DOMAINS, ) + from .user_agent_generator import UserAgentGenerator from .extraction_strategy import ExtractionStrategy from .chunking_strategy import ChunkingStrategy, RegexChunking from .markdown_generation_strategy import MarkdownGenerationStrategy +from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter, LLMContentFilter, PruningContentFilter from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy from typing import Optional, Union, List +from .cache_context import CacheMode class BrowserConfig: @@ -81,13 +84,13 @@ class BrowserConfig: user_data_dir: str = None, chrome_channel: str = "chromium", channel: str = "chromium", - proxy: Optional[str] = None, + proxy: str = None, proxy_config: dict = None, viewport_width: int = 1080, viewport_height: int = 600, accept_downloads: bool = False, downloads_path: str = None, - storage_state=None, + storage_state : Union[str, dict, None]=None, ignore_https_errors: bool = True, java_script_enabled: bool = True, sleep_on_close: bool = False, @@ -382,7 +385,7 @@ class CrawlerRunConfig: extraction_strategy: ExtractionStrategy = None, chunking_strategy: ChunkingStrategy = RegexChunking(), markdown_generator: MarkdownGenerationStrategy = None, - content_filter=None, + content_filter : RelevantContentFilter = None, only_text: bool = False, css_selector: str = None, excluded_tags: list = None, @@ -396,7 +399,7 @@ class CrawlerRunConfig: # SSL Parameters fetch_ssl_certificate: bool = False, # Caching Parameters - cache_mode=None, + cache_mode: CacheMode =None, session_id: str = None, bypass_cache: bool = False, disable_cache: bool = False, diff --git a/crawl4ai/utils.scraping.py b/crawl4ai/utils.scraping.py deleted file mode 100644 index e69de29b..00000000 diff --git a/mkdocs.yml b/mkdocs.yml index 255492e3..16f44b05 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,4 +1,4 @@ -site_name: Crawl4AI Documentation +site_name: Crawl4AI Documentation 
(v0.4.3b3) site_description: 🚀🤖 Crawl4AI, Open-source LLM-Friendly Web Crawler & Scraper site_url: https://docs.crawl4ai.com repo_url: https://github.com/unclecode/crawl4ai @@ -52,6 +52,11 @@ nav: theme: name: 'terminal' palette: 'dark' + icon: + repo: fontawesome/brands/github + +plugins: + - search markdown_extensions: - pymdownx.highlight: @@ -64,6 +69,9 @@ markdown_extensions: - attr_list - tables +extra: + version: !ENV [CRAWL4AI_VERSION, 'development'] + extra_css: - assets/styles.css - assets/highlight.css @@ -72,3 +80,4 @@ extra_css: extra_javascript: - assets/highlight.min.js - assets/highlight_init.js + - https://buttons.github.io/buttons.js \ No newline at end of file