chore(cleanup): remove unused files and improve type hints
- Remove .pre-commit-config.yaml and duplicate mkdocs configuration files - Add Optional type hint for proxy parameter in BrowserConfig - Fix type annotation for results list in AsyncWebCrawler - Move calculate_batch_size function import to model_loader - Update prompt imports in extraction_strategy.py No breaking changes.
This commit is contained in:
@@ -1,8 +0,0 @@
|
||||
# .pre-commit-config.yaml
|
||||
repos:
|
||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||
rev: v0.1.11
|
||||
hooks:
|
||||
- id: ruff
|
||||
args: [--fix]
|
||||
- id: ruff-format
|
||||
@@ -11,7 +11,7 @@ from .extraction_strategy import ExtractionStrategy
|
||||
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
||||
from .markdown_generation_strategy import MarkdownGenerationStrategy
|
||||
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
|
||||
from typing import Union, List
|
||||
from typing import Optional, Union, List
|
||||
|
||||
|
||||
class BrowserConfig:
|
||||
@@ -38,7 +38,7 @@ class BrowserConfig:
|
||||
is "chromium". Default: "chromium".
|
||||
channel (str): The channel to launch (e.g., "chromium", "chrome", "msedge"). Only applies if browser_type
|
||||
is "chromium". Default: "chromium".
|
||||
proxy (str or None): Proxy server URL (e.g., "http://username:password@proxy:port"). If None, no proxy is used.
|
||||
proxy (Optional[str]): Proxy server URL (e.g., "http://username:password@proxy:port"). If None, no proxy is used.
|
||||
Default: None.
|
||||
proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
|
||||
If None, no additional proxy config. Default: None.
|
||||
@@ -81,7 +81,7 @@ class BrowserConfig:
|
||||
user_data_dir: str = None,
|
||||
chrome_channel: str = "chromium",
|
||||
channel: str = "chromium",
|
||||
proxy: str = None,
|
||||
proxy: Optional[str] = None,
|
||||
proxy_config: dict = None,
|
||||
viewport_width: int = 1080,
|
||||
viewport_height: int = 600,
|
||||
|
||||
@@ -10,7 +10,7 @@ import asyncio
|
||||
|
||||
# from contextlib import nullcontext, asynccontextmanager
|
||||
from contextlib import asynccontextmanager
|
||||
from .models import CrawlResult, MarkdownGenerationResult, CrawlerTaskResult, DispatchResult, RateLimiter
|
||||
from .models import CrawlResult, MarkdownGenerationResult, CrawlerTaskResult, DispatchResult
|
||||
from .async_database import async_db_manager
|
||||
from .chunking_strategy import * # noqa: F403
|
||||
from .chunking_strategy import RegexChunking, ChunkingStrategy, IdentityChunking
|
||||
@@ -31,7 +31,7 @@ from .markdown_generation_strategy import (
|
||||
from .async_logger import AsyncLogger
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from .async_dispatcher import * # noqa: F403
|
||||
from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher
|
||||
from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter
|
||||
|
||||
from .config import MIN_WORD_THRESHOLD
|
||||
from .utils import (
|
||||
@@ -783,7 +783,7 @@ class AsyncWebCrawler:
|
||||
crawler=self, urls=urls, config=config
|
||||
)
|
||||
|
||||
results: CrawlResult = []
|
||||
results: List[CrawlResult] = []
|
||||
for res in _results:
|
||||
_res: CrawlResult = res.result
|
||||
dispatch_result: DispatchResult = DispatchResult(
|
||||
|
||||
@@ -5,20 +5,17 @@ import json
|
||||
import time
|
||||
import os
|
||||
|
||||
from .prompts import PROMPT_EXTRACT_BLOCKS
|
||||
from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION
|
||||
from .config import (
|
||||
DEFAULT_PROVIDER, PROVIDER_MODELS,
|
||||
CHUNK_TOKEN_THRESHOLD,
|
||||
OVERLAP_RATE,
|
||||
WORD_TOKEN_RATE,
|
||||
PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION,
|
||||
PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION
|
||||
)
|
||||
from .utils import * # noqa: F403
|
||||
|
||||
from .utils import (
|
||||
sanitize_html,
|
||||
calculate_batch_size,
|
||||
escape_json_string,
|
||||
perform_completion_with_backoff,
|
||||
extract_xml_data,
|
||||
@@ -34,6 +31,7 @@ from .model_loader import (
|
||||
get_device,
|
||||
load_HF_embedding_model,
|
||||
load_text_multilabel_classifier,
|
||||
calculate_batch_size
|
||||
)
|
||||
|
||||
from functools import partial
|
||||
|
||||
@@ -1,94 +0,0 @@
|
||||
site_name: Crawl4AI Documentation
|
||||
site_description: 🔥🕷️ Crawl4AI, Open-source LLM Friendly Web Crawler & Scraper
|
||||
site_url: https://docs.crawl4ai.com
|
||||
repo_url: https://github.com/unclecode/crawl4ai
|
||||
repo_name: unclecode/crawl4ai
|
||||
docs_dir: docs/md_v2
|
||||
|
||||
nav:
|
||||
- Home: 'index.md'
|
||||
- 'Installation': 'basic/installation.md'
|
||||
- 'Docker Deployment': 'basic/docker-deploymeny.md'
|
||||
- 'Quick Start': 'basic/quickstart.md'
|
||||
- Changelog & Blog:
|
||||
- 'Blog Home': 'blog/index.md'
|
||||
- 'Latest (0.4.1)': 'blog/releases/0.4.1.md'
|
||||
- 'Changelog': 'https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md'
|
||||
|
||||
- Core:
|
||||
- 'Simple Crawling': 'basic/simple-crawling.md'
|
||||
- 'Crawler Result': 'basic/crawler-result.md'
|
||||
- 'Crawler & Browser Params': 'basic/browser-crawler-config.md'
|
||||
- 'Markdown Generation': 'basic/markdown-generation.md'
|
||||
- 'Fit Markdown': 'basic/fit-markdown.md'
|
||||
- 'Page Interaction': 'basic/page-interaction.md'
|
||||
- 'Content Selection': 'basic/content-selection.md'
|
||||
- 'Cache Modes': 'basic/cache-modes.md'
|
||||
- 'Local files & Raw HTML': 'basic/local-files.md'
|
||||
- 'File Downloading': 'basic/file-downloading.md'
|
||||
|
||||
- Advanced:
|
||||
- 'Link & Media Handling': 'advanced/link-media.md'
|
||||
- 'Hooks & Auth': 'advanced/hooks-auth.md'
|
||||
- 'Lazy Loading': 'advanced/lazy-loading.md'
|
||||
- 'Proxy & Security': 'advanced/proxy-security.md'
|
||||
- 'Session Management': 'advanced/session-management.md'
|
||||
- 'Session Management (Advanced)': 'advanced/session-management-advanced.md'
|
||||
|
||||
- Extraction:
|
||||
- 'Overview': 'extraction/overview.md'
|
||||
- 'LLM Strategy': 'extraction/llm.md'
|
||||
- 'Json-CSS Extractor Basic': 'extraction/css.md'
|
||||
- 'Json-CSS Extractor Advanced': 'extraction/css-advanced.md'
|
||||
- 'Cosine Strategy': 'extraction/cosine.md'
|
||||
- 'Chunking': 'extraction/chunking.md'
|
||||
|
||||
- API Reference:
|
||||
- 'Parameters Table': 'api/parameters.md'
|
||||
- 'AsyncWebCrawler': 'api/async-webcrawler.md'
|
||||
- 'AsyncWebCrawler.arun()': 'api/arun.md'
|
||||
- 'CrawlResult': 'api/crawl-result.md'
|
||||
- 'Strategies': 'api/strategies.md'
|
||||
|
||||
- Tutorial:
|
||||
- '1. Getting Started': 'tutorial/episode_01_Introduction_to_Crawl4AI_and_Basic_Installation.md'
|
||||
- '2. Advanced Features': 'tutorial/episode_02_Overview_of_Advanced_Features.md'
|
||||
- '3. Browser Setup': 'tutorial/episode_03_Browser_Configurations_&_Headless_Crawling.md'
|
||||
- '4. Proxy Settings': 'tutorial/episode_04_Advanced_Proxy_and_Security_Settings.md'
|
||||
- '5. Dynamic Content': 'tutorial/episode_05_JavaScript_Execution_and_Dynamic_Content_Handling.md'
|
||||
- '6. Magic Mode': 'tutorial/episode_06_Magic_Mode_and_Anti-Bot_Protection.md'
|
||||
- '7. Content Cleaning': 'tutorial/episode_07_Content_Cleaning_and_Fit_Markdown.md'
|
||||
- '8. Media Handling': 'tutorial/episode_08_Media_Handling_Images_Videos_and_Audio.md'
|
||||
- '9. Link Analysis': 'tutorial/episode_09_Link_Analysis_and_Smart_Filtering.md'
|
||||
- '10. User Simulation': 'tutorial/episode_10_Custom_Headers,_Identity,_and_User_Simulation.md'
|
||||
- '11.1. JSON CSS': 'tutorial/episode_11_1_Extraction_Strategies_JSON_CSS.md'
|
||||
- '11.2. LLM Strategy': 'tutorial/episode_11_2_Extraction_Strategies_LLM.md'
|
||||
- '11.3. Cosine Strategy': 'tutorial/episode_11_3_Extraction_Strategies_Cosine.md'
|
||||
- '12. Session Crawling': 'tutorial/episode_12_Session-Based_Crawling_for_Dynamic_Websites.md'
|
||||
- '13. Text Chunking': 'tutorial/episode_13_Chunking_Strategies_for_Large_Text_Processing.md'
|
||||
- '14. Custom Workflows': 'tutorial/episode_14_Hooks_and_Custom_Workflow_with_AsyncWebCrawler.md'
|
||||
|
||||
|
||||
theme:
|
||||
name: terminal
|
||||
palette: dark
|
||||
|
||||
markdown_extensions:
|
||||
- pymdownx.highlight:
|
||||
anchor_linenums: true
|
||||
- pymdownx.inlinehilite
|
||||
- pymdownx.snippets
|
||||
- pymdownx.superfences
|
||||
- admonition
|
||||
- pymdownx.details
|
||||
- attr_list
|
||||
- tables
|
||||
|
||||
extra_css:
|
||||
- assets/styles.css
|
||||
- assets/highlight.css
|
||||
- assets/dmvendor.css
|
||||
|
||||
extra_javascript:
|
||||
- assets/highlight.min.js
|
||||
- assets/highlight_init.js
|
||||
@@ -1,96 +0,0 @@
|
||||
site_name: Crawl4AI Documentation
|
||||
site_description: 🔥🕷️ Crawl4AI, Open-source LLM Friendly Web Crawler & Scraper
|
||||
site_url: https://docs.crawl4ai.com
|
||||
repo_url: https://github.com/unclecode/crawl4ai
|
||||
repo_name: unclecode/crawl4ai
|
||||
docs_dir: docs/md_v3
|
||||
|
||||
|
||||
nav:
|
||||
- Home: index.md
|
||||
|
||||
- Tutorials:
|
||||
- "Getting Started": tutorials/getting-started.md
|
||||
- "AsyncWebCrawler Basics": tutorials/async-webcrawler-basics.md
|
||||
- "Targeted Crawling Techniques": tutorials/targeted-crawling.md
|
||||
- "Link & Media Analysis": tutorials/link-media-analysis.md
|
||||
- "Advanced Features (Proxy, PDF, Screenshots)": tutorials/advanced-features.md
|
||||
- "Hooks & Custom Code": tutorials/hooks-custom.md
|
||||
- "Markdown Generation Basics": tutorials/markdown-basics.md
|
||||
- "Extracting JSON (No LLM)": tutorials/json-extraction-basic.md
|
||||
- "Extracting JSON (LLM)": tutorials/json-extraction-llm.md
|
||||
- "Deploying with Docker (Quickstart)": tutorials/docker-quickstart.md
|
||||
|
||||
- How-To Guides:
|
||||
- "Advanced Browser Configuration": how-to/advanced-browser-config.md
|
||||
- "Managing Browser Contexts & Remote Browsers": how-to/browser-contexts-remote.md
|
||||
- "Identity-Based Crawling (Anti-Bot)": how-to/identity-anti-bot.md
|
||||
- "Link & Media Analysis": how-to/link-media-analysis.md
|
||||
- "Markdown Generation Customization": how-to/markdown-custom.md
|
||||
- "Structured Data Extraction (Advanced)": how-to/structured-data-advanced.md
|
||||
- "Deployment Options": how-to/deployment-options.md
|
||||
- "Performance & Caching": how-to/performance-caching.md
|
||||
|
||||
- Explanations:
|
||||
- "AsyncWebCrawler & Internal Flow": explanations/async-webcrawler-flow.md
|
||||
- "Configuration Objects Explained": explanations/configuration-objects.md
|
||||
- "Browser Context & Managed Browser": explanations/browser-management.md
|
||||
- "Markdown Generation Architecture": explanations/markdown-architecture.md
|
||||
- "Extraction & Chunking Strategies": explanations/extraction-chunking.md
|
||||
- "Identity-Based Crawling & Anti-Bot": explanations/identity-anti-bot.md
|
||||
- "Deployment Architectures": explanations/deployment-architectures.md
|
||||
|
||||
- Reference:
|
||||
- "Configuration": reference/configuration.md
|
||||
- "Core Crawler": reference/core-crawler.md
|
||||
- "Browser Strategies": reference/browser-strategies.md
|
||||
- "Markdown Generation": reference/markdown-generation.md
|
||||
- "Content Filters": reference/content-filters.md
|
||||
- "Extraction Strategies": reference/extraction-strategies.md
|
||||
- "Chunking Strategies": reference/chunking-strategies.md
|
||||
- "Identity & Utility": reference/identity-utilities.md
|
||||
- "Models": reference/models.md
|
||||
|
||||
- Blog:
|
||||
- "Blog Overview": blog/index.md
|
||||
# You can add real-life application posts here in the future
|
||||
# - "Cool Real-World E-Commerce Scraping": blog/ecommerce-case-study.md
|
||||
# - "Dealing with Complex Anti-Bot Systems": blog/anti-bot-tricks.md
|
||||
|
||||
|
||||
theme:
|
||||
name: terminal
|
||||
palette: dark
|
||||
|
||||
plugins:
|
||||
- search
|
||||
- mkdocstrings:
|
||||
handlers:
|
||||
python:
|
||||
analysis:
|
||||
follow_imports: true
|
||||
rendering:
|
||||
show_root_full_path: false
|
||||
|
||||
markdown_extensions:
|
||||
- codehilite
|
||||
- toc:
|
||||
permalink: true
|
||||
- pymdownx.highlight:
|
||||
anchor_linenums: true
|
||||
- pymdownx.inlinehilite
|
||||
- pymdownx.snippets
|
||||
- pymdownx.superfences
|
||||
- admonition
|
||||
- pymdownx.details
|
||||
- attr_list
|
||||
- tables
|
||||
|
||||
extra_css:
|
||||
- assets/styles.css
|
||||
- assets/highlight.css
|
||||
- assets/dmvendor.css
|
||||
|
||||
extra_javascript:
|
||||
- assets/highlight.min.js
|
||||
- assets/highlight_init.js
|
||||
Reference in New Issue
Block a user