refactor(core): improve type hints and remove unused file
- Add RelevantContentFilter to __init__.py exports
- Update version to 0.4.3b3
- Enhance type hints in async_configs.py
- Remove empty utils.scraping.py file
- Update mkdocs configuration with version info and GitHub integration

BREAKING CHANGE: None
This commit is contained in:
@@ -16,7 +16,7 @@ from .extraction_strategy import (
|
|||||||
)
|
)
|
||||||
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
||||||
from .markdown_generation_strategy import DefaultMarkdownGenerator
|
from .markdown_generation_strategy import DefaultMarkdownGenerator
|
||||||
from .content_filter_strategy import PruningContentFilter, BM25ContentFilter, LLMContentFilter
|
from .content_filter_strategy import PruningContentFilter, BM25ContentFilter, LLMContentFilter, RelevantContentFilter
|
||||||
from .models import CrawlResult, MarkdownGenerationResult
|
from .models import CrawlResult, MarkdownGenerationResult
|
||||||
from .async_dispatcher import (
|
from .async_dispatcher import (
|
||||||
MemoryAdaptiveDispatcher,
|
MemoryAdaptiveDispatcher,
|
||||||
@@ -44,6 +44,7 @@ __all__ = [
|
|||||||
"ChunkingStrategy",
|
"ChunkingStrategy",
|
||||||
"RegexChunking",
|
"RegexChunking",
|
||||||
"DefaultMarkdownGenerator",
|
"DefaultMarkdownGenerator",
|
||||||
|
"RelevantContentFilter",
|
||||||
"PruningContentFilter",
|
"PruningContentFilter",
|
||||||
"BM25ContentFilter",
|
"BM25ContentFilter",
|
||||||
"LLMContentFilter",
|
"LLMContentFilter",
|
||||||
|
|||||||
@@ -1,2 +1,2 @@
|
|||||||
# crawl4ai/_version.py
|
# crawl4ai/_version.py
|
||||||
__version__ = "0.4.3b2"
|
__version__ = "0.4.3b3"
|
||||||
|
|||||||
@@ -6,12 +6,15 @@ from .config import (
|
|||||||
IMAGE_SCORE_THRESHOLD,
|
IMAGE_SCORE_THRESHOLD,
|
||||||
SOCIAL_MEDIA_DOMAINS,
|
SOCIAL_MEDIA_DOMAINS,
|
||||||
)
|
)
|
||||||
|
|
||||||
from .user_agent_generator import UserAgentGenerator
|
from .user_agent_generator import UserAgentGenerator
|
||||||
from .extraction_strategy import ExtractionStrategy
|
from .extraction_strategy import ExtractionStrategy
|
||||||
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
||||||
from .markdown_generation_strategy import MarkdownGenerationStrategy
|
from .markdown_generation_strategy import MarkdownGenerationStrategy
|
||||||
|
from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter, LLMContentFilter, PruningContentFilter
|
||||||
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
|
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
|
||||||
from typing import Optional, Union, List
|
from typing import Optional, Union, List
|
||||||
|
from .cache_context import CacheMode
|
||||||
|
|
||||||
|
|
||||||
class BrowserConfig:
|
class BrowserConfig:
|
||||||
@@ -81,13 +84,13 @@ class BrowserConfig:
|
|||||||
user_data_dir: str = None,
|
user_data_dir: str = None,
|
||||||
chrome_channel: str = "chromium",
|
chrome_channel: str = "chromium",
|
||||||
channel: str = "chromium",
|
channel: str = "chromium",
|
||||||
proxy: Optional[str] = None,
|
proxy: str = None,
|
||||||
proxy_config: dict = None,
|
proxy_config: dict = None,
|
||||||
viewport_width: int = 1080,
|
viewport_width: int = 1080,
|
||||||
viewport_height: int = 600,
|
viewport_height: int = 600,
|
||||||
accept_downloads: bool = False,
|
accept_downloads: bool = False,
|
||||||
downloads_path: str = None,
|
downloads_path: str = None,
|
||||||
storage_state=None,
|
storage_state : Union[str, dict, None]=None,
|
||||||
ignore_https_errors: bool = True,
|
ignore_https_errors: bool = True,
|
||||||
java_script_enabled: bool = True,
|
java_script_enabled: bool = True,
|
||||||
sleep_on_close: bool = False,
|
sleep_on_close: bool = False,
|
||||||
@@ -382,7 +385,7 @@ class CrawlerRunConfig:
|
|||||||
extraction_strategy: ExtractionStrategy = None,
|
extraction_strategy: ExtractionStrategy = None,
|
||||||
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
||||||
markdown_generator: MarkdownGenerationStrategy = None,
|
markdown_generator: MarkdownGenerationStrategy = None,
|
||||||
content_filter=None,
|
content_filter : RelevantContentFilter = None,
|
||||||
only_text: bool = False,
|
only_text: bool = False,
|
||||||
css_selector: str = None,
|
css_selector: str = None,
|
||||||
excluded_tags: list = None,
|
excluded_tags: list = None,
|
||||||
@@ -396,7 +399,7 @@ class CrawlerRunConfig:
|
|||||||
# SSL Parameters
|
# SSL Parameters
|
||||||
fetch_ssl_certificate: bool = False,
|
fetch_ssl_certificate: bool = False,
|
||||||
# Caching Parameters
|
# Caching Parameters
|
||||||
cache_mode=None,
|
cache_mode: CacheMode =None,
|
||||||
session_id: str = None,
|
session_id: str = None,
|
||||||
bypass_cache: bool = False,
|
bypass_cache: bool = False,
|
||||||
disable_cache: bool = False,
|
disable_cache: bool = False,
|
||||||
|
|||||||
11
mkdocs.yml
11
mkdocs.yml
@@ -1,4 +1,4 @@
|
|||||||
site_name: Crawl4AI Documentation
|
site_name: Crawl4AI Documentation (v0.4.3b2)
|
||||||
site_description: 🚀🤖 Crawl4AI, Open-source LLM-Friendly Web Crawler & Scraper
|
site_description: 🚀🤖 Crawl4AI, Open-source LLM-Friendly Web Crawler & Scraper
|
||||||
site_url: https://docs.crawl4ai.com
|
site_url: https://docs.crawl4ai.com
|
||||||
repo_url: https://github.com/unclecode/crawl4ai
|
repo_url: https://github.com/unclecode/crawl4ai
|
||||||
@@ -52,6 +52,11 @@ nav:
|
|||||||
theme:
|
theme:
|
||||||
name: 'terminal'
|
name: 'terminal'
|
||||||
palette: 'dark'
|
palette: 'dark'
|
||||||
|
icon:
|
||||||
|
repo: fontawesome/brands/github
|
||||||
|
|
||||||
|
plugins:
|
||||||
|
- search
|
||||||
|
|
||||||
markdown_extensions:
|
markdown_extensions:
|
||||||
- pymdownx.highlight:
|
- pymdownx.highlight:
|
||||||
@@ -64,6 +69,9 @@ markdown_extensions:
|
|||||||
- attr_list
|
- attr_list
|
||||||
- tables
|
- tables
|
||||||
|
|
||||||
|
extra:
|
||||||
|
version: !ENV [CRAWL4AI_VERSION, 'development']
|
||||||
|
|
||||||
extra_css:
|
extra_css:
|
||||||
- assets/styles.css
|
- assets/styles.css
|
||||||
- assets/highlight.css
|
- assets/highlight.css
|
||||||
@@ -72,3 +80,4 @@ extra_css:
|
|||||||
extra_javascript:
|
extra_javascript:
|
||||||
- assets/highlight.min.js
|
- assets/highlight.min.js
|
||||||
- assets/highlight_init.js
|
- assets/highlight_init.js
|
||||||
|
- https://buttons.github.io/buttons.js
|
||||||
Reference in New Issue
Block a user