diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml deleted file mode 100644 index 92f3c1a7..00000000 --- a/.pre-commit-config.yaml +++ /dev/null @@ -1,8 +0,0 @@ -# .pre-commit-config.yaml -repos: -- repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.1.11 - hooks: - - id: ruff - args: [--fix] - - id: ruff-format \ No newline at end of file diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index c6f25994..a5250455 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -11,7 +11,7 @@ from .extraction_strategy import ExtractionStrategy from .chunking_strategy import ChunkingStrategy, RegexChunking from .markdown_generation_strategy import MarkdownGenerationStrategy from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy -from typing import Union, List +from typing import Optional, Union, List class BrowserConfig: @@ -38,7 +38,7 @@ class BrowserConfig: is "chromium". Default: "chromium". channel (str): The channel to launch (e.g., "chromium", "chrome", "msedge"). Only applies if browser_type is "chromium". Default: "chromium". - proxy (str or None): Proxy server URL (e.g., "http://username:password@proxy:port"). If None, no proxy is used. + proxy (Optional[str]): Proxy server URL (e.g., "http://username:password@proxy:port"). If None, no proxy is used. Default: None. proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}. If None, no additional proxy config. Default: None. @@ -81,7 +81,7 @@ class BrowserConfig: user_data_dir: str = None, chrome_channel: str = "chromium", channel: str = "chromium", - proxy: str = None, + proxy: Optional[str] = None, proxy_config: dict = None, viewport_width: int = 1080, viewport_height: int = 600, diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index a7596a55..6b919c11 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -10,7 +10,7 @@ import asyncio # from contextlib import nullcontext, asynccontextmanager from contextlib import asynccontextmanager -from .models import CrawlResult, MarkdownGenerationResult, CrawlerTaskResult, DispatchResult, RateLimiter +from .models import CrawlResult, MarkdownGenerationResult, CrawlerTaskResult, DispatchResult from .async_database import async_db_manager from .chunking_strategy import * # noqa: F403 from .chunking_strategy import RegexChunking, ChunkingStrategy, IdentityChunking @@ -31,7 +31,7 @@ from .markdown_generation_strategy import ( from .async_logger import AsyncLogger from .async_configs import BrowserConfig, CrawlerRunConfig from .async_dispatcher import * # noqa: F403 -from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher +from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter from .config import MIN_WORD_THRESHOLD from .utils import ( @@ -783,7 +783,7 @@ class AsyncWebCrawler: crawler=self, urls=urls, config=config ) - results: CrawlResult = [] + results: List[CrawlResult] = [] for res in _results: _res: CrawlResult = res.result dispatch_result: DispatchResult = DispatchResult( diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index 65cc005d..1e31a5cd 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -5,20 +5,17 @@ import json import time import os -from .prompts import PROMPT_EXTRACT_BLOCKS +from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION from .config import ( DEFAULT_PROVIDER, PROVIDER_MODELS, CHUNK_TOKEN_THRESHOLD, OVERLAP_RATE, WORD_TOKEN_RATE, - PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, - PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION ) from .utils import * # noqa: F403 from .utils import ( sanitize_html, - calculate_batch_size, escape_json_string, perform_completion_with_backoff, extract_xml_data, @@ -34,6 +31,7 @@ from .model_loader import ( get_device, load_HF_embedding_model, load_text_multilabel_classifier, + calculate_batch_size ) from functools import partial diff --git a/mkdocs copy.yml b/mkdocs copy.yml deleted file mode 100644 index 89fe7e46..00000000 --- a/mkdocs copy.yml +++ /dev/null @@ -1,94 +0,0 @@ -site_name: Crawl4AI Documentation -site_description: 🔥🕷️ Crawl4AI, Open-source LLM Friendly Web Crawler & Scrapper -site_url: https://docs.crawl4ai.com -repo_url: https://github.com/unclecode/crawl4ai -repo_name: unclecode/crawl4ai -docs_dir: docs/md_v2 - -nav: - - Home: 'index.md' - - 'Installation': 'basic/installation.md' - - 'Docker Deplotment': 'basic/docker-deploymeny.md' - - 'Quick Start': 'basic/quickstart.md' - - Changelog & Blog: - - 'Blog Home': 'blog/index.md' - - 'Latest (0.4.1)': 'blog/releases/0.4.1.md' - - 'Changelog': 'https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md' - - - Core: - - 'Simple Crawling': 'basic/simple-crawling.md' - - 'Crawler Result': 'basic/crawler-result.md' - - 'Crawler & Browser Params': 'basic/browser-crawler-config.md' - - 'Markdown Generation': 'basic/markdown-generation.md' - - 'Fit Markdown': 'basic/fit-markdown.md' - - 'Page Interaction': 'basic/page-interaction.md' - - 'Content Selection': 'basic/content-selection.md' - - 'Cache Modes': 'basic/cache-modes.md' - - 'Local files & Raw HTML': 'basic/local-files.md' - - 'File Downloading': 'basic/file-downloading.md' - - - Advanced: - - 'Link & Media Handling': 'advanced/link-media.md' - - 'Hooks & Auth': 'advanced/hooks-auth.md' - - 'Lazy Loading': 'advanced/lazy-loading.md' - - 'Proxy & Security': 'advanced/proxy-security.md' - - 'Session Management': 'advanced/session-management.md' - - 'Session Management (Advanced)': 'advanced/session-management-advanced.md' - - - Extraction: - - 'Overview': 'extraction/overview.md' - - 'LLM Strategy': 'extraction/llm.md' - - 'Json-CSS Extractor Basic': 'extraction/css.md' - - 'Json-CSS Extractor Advanced': 'extraction/css-advanced.md' - - 'Cosine Strategy': 'extraction/cosine.md' - - 'Chunking': 'extraction/chunking.md' - - - API Reference: - - 'Parameters Table': 'api/parameters.md' - - 'AsyncWebCrawler': 'api/async-webcrawler.md' - - 'AsyncWebCrawler.arun()': 'api/arun.md' - - 'CrawlResult': 'api/crawl-result.md' - - 'Strategies': 'api/strategies.md' - - - Tutorial: - - '1. Getting Started': 'tutorial/episode_01_Introduction_to_Crawl4AI_and_Basic_Installation.md' - - '2. Advanced Features': 'tutorial/episode_02_Overview_of_Advanced_Features.md' - - '3. Browser Setup': 'tutorial/episode_03_Browser_Configurations_&_Headless_Crawling.md' - - '4. Proxy Settings': 'tutorial/episode_04_Advanced_Proxy_and_Security_Settings.md' - - '5. Dynamic Content': 'tutorial/episode_05_JavaScript_Execution_and_Dynamic_Content_Handling.md' - - '6. Magic Mode': 'tutorial/episode_06_Magic_Mode_and_Anti-Bot_Protection.md' - - '7. Content Cleaning': 'tutorial/episode_07_Content_Cleaning_and_Fit_Markdown.md' - - '8. Media Handling': 'tutorial/episode_08_Media_Handling_Images_Videos_and_Audio.md' - - '9. Link Analysis': 'tutorial/episode_09_Link_Analysis_and_Smart_Filtering.md' - - '10. User Simulation': 'tutorial/episode_10_Custom_Headers,_Identity,_and_User_Simulation.md' - - '11.1. JSON CSS': 'tutorial/episode_11_1_Extraction_Strategies_JSON_CSS.md' - - '11.2. LLM Strategy': 'tutorial/episode_11_2_Extraction_Strategies_LLM.md' - - '11.3. Cosine Strategy': 'tutorial/episode_11_3_Extraction_Strategies_Cosine.md' - - '12. Session Crawling': 'tutorial/episode_12_Session-Based_Crawling_for_Dynamic_Websites.md' - - '13. Text Chunking': 'tutorial/episode_13_Chunking_Strategies_for_Large_Text_Processing.md' - - '14. Custom Workflows': 'tutorial/episode_14_Hooks_and_Custom_Workflow_with_AsyncWebCrawler.md' - - -theme: - name: terminal - palette: dark - -markdown_extensions: - - pymdownx.highlight: - anchor_linenums: true - - pymdownx.inlinehilite - - pymdownx.snippets - - pymdownx.superfences - - admonition - - pymdownx.details - - attr_list - - tables - -extra_css: - - assets/styles.css - - assets/highlight.css - - assets/dmvendor.css - -extra_javascript: - - assets/highlight.min.js - - assets/highlight_init.js \ No newline at end of file diff --git a/mkdocs_v2.yml b/mkdocs_v2.yml deleted file mode 100644 index ff8c93b9..00000000 --- a/mkdocs_v2.yml +++ /dev/null @@ -1,96 +0,0 @@ -site_name: Crawl4AI Documentation -site_description: 🔥🕷️ Crawl4AI, Open-source LLM Friendly Web Crawler & Scrapper -site_url: https://docs.crawl4ai.com -repo_url: https://github.com/unclecode/crawl4ai -repo_name: unclecode/crawl4ai -docs_dir: docs/md_v3 - - -nav: - - Home: index.md - - - Tutorials: - - "Getting Started": tutorials/getting-started.md - - "AsyncWebCrawler Basics": tutorials/async-webcrawler-basics.md - - "Targeted Crawling Techniques": tutorials/targeted-crawling.md - - "Link & Media Analysis": tutorials/link-media-analysis.md - - "Advanced Features (Proxy, PDF, Screenshots)": tutorials/advanced-features.md - - "Hooks & Custom Code": tutorials/hooks-custom.md - - "Markdown Generation Basics": tutorials/markdown-basics.md - - "Extracting JSON (No LLM)": tutorials/json-extraction-basic.md - - "Extracting JSON (LLM)": tutorials/json-extraction-llm.md - - "Deploying with Docker (Quickstart)": tutorials/docker-quickstart.md - - - How-To Guides: - - "Advanced Browser Configuration": how-to/advanced-browser-config.md - - "Managing Browser Contexts & Remote Browsers": how-to/browser-contexts-remote.md - - "Identity-Based Crawling (Anti-Bot)": how-to/identity-anti-bot.md - - "Link & Media Analysis": how-to/link-media-analysis.md - - "Markdown Generation Customization": how-to/markdown-custom.md - - "Structured Data Extraction (Advanced)": how-to/structured-data-advanced.md - - "Deployment Options": how-to/deployment-options.md - - "Performance & Caching": how-to/performance-caching.md - - - Explanations: - - "AsyncWebCrawler & Internal Flow": explanations/async-webcrawler-flow.md - - "Configuration Objects Explained": explanations/configuration-objects.md - - "Browser Context & Managed Browser": explanations/browser-management.md - - "Markdown Generation Architecture": explanations/markdown-architecture.md - - "Extraction & Chunking Strategies": explanations/extraction-chunking.md - - "Identity-Based Crawling & Anti-Bot": explanations/identity-anti-bot.md - - "Deployment Architectures": explanations/deployment-architectures.md - - - Reference: - - "Configuration": reference/configuration.md - - "Core Crawler": reference/core-crawler.md - - "Browser Strategies": reference/browser-strategies.md - - "Markdown Generation": reference/markdown-generation.md - - "Content Filters": reference/content-filters.md - - "Extraction Strategies": reference/extraction-strategies.md - - "Chunking Strategies": reference/chunking-strategies.md - - "Identity & Utility": reference/identity-utilities.md - - "Models": reference/models.md - - - Blog: - - "Blog Overview": blog/index.md - # You can add real-life application posts here in the future - # - "Cool Real-World E-Commerce Scraping": blog/ecommerce-case-study.md - # - "Dealing with Complex Anti-Bot Systems": blog/anti-bot-tricks.md - - -theme: - name: terminal - palette: dark - -plugins: - - search - - mkdocstrings: - handlers: - python: - analysis: - follow_imports: true - rendering: - show_root_full_path: false - -markdown_extensions: - - codehilite - - toc: - permalink: true - - pymdownx.highlight: - anchor_linenums: true - - pymdownx.inlinehilite - - pymdownx.snippets - - pymdownx.superfences - - admonition - - pymdownx.details - - attr_list - - tables - -extra_css: - - assets/styles.css - - assets/highlight.css - - assets/dmvendor.css - -extra_javascript: - - assets/highlight.min.js - - assets/highlight_init.js \ No newline at end of file