diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 482afdd7..7f284323 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -16,7 +16,7 @@ from .extraction_strategy import ( ) from .chunking_strategy import ChunkingStrategy, RegexChunking from .markdown_generation_strategy import DefaultMarkdownGenerator -from .content_filter_strategy import PruningContentFilter, BM25ContentFilter, LLMContentFilter +from .content_filter_strategy import PruningContentFilter, BM25ContentFilter, LLMContentFilter, RelevantContentFilter from .models import CrawlResult, MarkdownGenerationResult from .async_dispatcher import ( MemoryAdaptiveDispatcher, @@ -44,6 +44,7 @@ __all__ = [ "ChunkingStrategy", "RegexChunking", "DefaultMarkdownGenerator", + "RelevantContentFilter", "PruningContentFilter", "BM25ContentFilter", "LLMContentFilter", diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index a0acc761..3274435a 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.4.3b2" +__version__ = "0.4.3b3" diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index b0813abe..c1404026 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -6,12 +6,15 @@ from .config import ( IMAGE_SCORE_THRESHOLD, SOCIAL_MEDIA_DOMAINS, ) + from .user_agent_generator import UserAgentGenerator from .extraction_strategy import ExtractionStrategy from .chunking_strategy import ChunkingStrategy, RegexChunking from .markdown_generation_strategy import MarkdownGenerationStrategy +from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter, LLMContentFilter, PruningContentFilter from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy from typing import Optional, Union, List +from .cache_context import CacheMode class BrowserConfig: @@ -81,13 +84,13 @@ class BrowserConfig: user_data_dir: str = None, chrome_channel: str = "chromium", channel: str = 
"chromium", - proxy: Optional[str] = None, + proxy: Optional[str] = None, proxy_config: dict = None, viewport_width: int = 1080, viewport_height: int = 600, accept_downloads: bool = False, downloads_path: str = None, - storage_state=None, + storage_state: Union[str, dict, None] = None, ignore_https_errors: bool = True, java_script_enabled: bool = True, sleep_on_close: bool = False, @@ -382,7 +385,7 @@ class CrawlerRunConfig: extraction_strategy: ExtractionStrategy = None, chunking_strategy: ChunkingStrategy = RegexChunking(), markdown_generator: MarkdownGenerationStrategy = None, - content_filter=None, + content_filter: RelevantContentFilter = None, only_text: bool = False, css_selector: str = None, excluded_tags: list = None, @@ -396,7 +399,7 @@ class CrawlerRunConfig: # SSL Parameters fetch_ssl_certificate: bool = False, # Caching Parameters - cache_mode=None, + cache_mode: CacheMode = None, session_id: str = None, bypass_cache: bool = False, disable_cache: bool = False, diff --git a/crawl4ai/utils.scraping.py b/crawl4ai/utils.scraping.py deleted file mode 100644 index e69de29b..00000000 diff --git a/mkdocs.yml b/mkdocs.yml index 255492e3..16f44b05 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,4 +1,4 @@ -site_name: Crawl4AI Documentation +site_name: Crawl4AI Documentation (v0.4.3b3) site_description: 🚀🤖 Crawl4AI, Open-source LLM-Friendly Web Crawler & Scraper site_url: https://docs.crawl4ai.com repo_url: https://github.com/unclecode/crawl4ai @@ -52,6 +52,11 @@ nav: theme: name: 'terminal' palette: 'dark' + icon: repo: fontawesome/brands/github + +plugins: + - search markdown_extensions: - pymdownx.highlight: @@ -64,6 +69,9 @@ markdown_extensions: - attr_list - tables +extra: + version: !ENV [CRAWL4AI_VERSION, 'development'] + extra_css: - assets/styles.css - assets/highlight.css @@ -72,3 +80,4 @@ extra_css: extra_javascript: - assets/highlight.min.js - assets/highlight_init.js + - https://buttons.github.io/buttons.js \ No newline at end of file