Merge branch 'main' into next

# Conflicts: # .gitignore
2025-02-19 13:26:42 +08:00
parent 392c923980 3b1025abbb
commit c171891999
9 changed files with 495 additions and 5 deletions
--- a/.github/DISCUSSION_TEMPLATE/feature-requests.yml
+++ b/.github/DISCUSSION_TEMPLATE/feature-requests.yml
@@ -0,0 +1,59 @@
+title: "[Feature Request]: "
+labels: ["⚙️ New"]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Thank you for your interest in suggesting a new feature! Before you submit, please take a moment to check if it already exists in
+        this discussions category to avoid duplicates. 😊
+
+  - type: textarea
+    id: needs_to_be_done
+    attributes:
+      label: What needs to be done?
+      description: Please describe the feature or functionality you'd like to see.
+      placeholder: "e.g., Return alt text along with images scraped from a webpage in Result"
+    validations:
+      required: true
+
+  - type: textarea
+    id: problem_to_solve
+    attributes:
+      label: What problem does this solve?
+      description: Explain the pain point or issue this feature will help address.
+      placeholder: "e.g., Bypass CAPTCHAs added by Cloudflare"
+    validations:
+      required: true
+
+  - type: textarea
+    id: target_users
+    attributes:
+      label: Target users/beneficiaries
+      description: Who would benefit from this feature? (e.g., specific teams, developers, users, etc.)
+      placeholder: "e.g., Marketing teams, developers"
+    validations:
+      required: false
+
+  - type: textarea
+    id: current_workarounds
+    attributes:
+      label: Current alternatives/workarounds
+      description: Are there any existing solutions or workarounds? How does this feature improve upon them?
+      placeholder: "e.g., Users manually select the css classes mapped to data fields to extract them"
+    validations:
+      required: false
+
+  - type: markdown
+    attributes:
+      value: |
+        ### 💡 Implementation Ideas
+
+  - type: textarea
+    id: proposed_approach
+    attributes:
+      label: Proposed approach
+      description: Share any ideas you have for how this feature could be implemented. Point out any challenges you foresee
+       and the success metrics for this feature.
+      placeholder: "e.g., Implement a breadth first traversal algorithm for scraper"
+    validations:
+      required: false
--- a/.github/ISSUE_TEMPLATE/bug_report.yml
+++ b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -0,0 +1,127 @@
+name: Bug Report
+description: Report a bug with Crawl4AI.
+title: "[Bug]: "
+labels: ["🐞 Bug","🩺 Needs Triage"]
+body:
+  - type: input
+    id: crawl4ai_version
+    attributes:
+      label: crawl4ai version
+      description: Specify the version of crawl4ai you are using.
+      placeholder: "e.g., 2.0.0"
+    validations:
+      required: true
+
+  - type: textarea
+    id: expected_behavior
+    attributes:
+      label: Expected Behavior
+      description: Describe what you expected to happen.
+      placeholder: "Provide a detailed explanation of the expected outcome."
+    validations:
+      required: true
+
+  - type: textarea
+    id: current_behavior
+    attributes:
+      label: Current Behavior
+      description: Describe what is happening instead of the expected behavior.
+      placeholder: "Describe the actual result or issue you encountered."
+    validations:
+      required: true
+
+  - type: dropdown
+    id: reproducible
+    attributes:
+      label: Is this reproducible?
+      description: Indicate whether this bug can be reproduced consistently.
+      options:
+        - "Yes"
+        - "No"
+    validations:
+      required: true
+
+  - type: textarea
+    id: inputs
+    attributes:
+      label: Inputs Causing the Bug
+      description: Provide details about the inputs causing the issue.
+      placeholder: |
+        - URL(s): 
+        - Settings used: 
+        - Input data (if applicable): 
+      render: bash
+  
+  - type: textarea
+    id: steps_to_reproduce
+    attributes:
+      label: Steps to Reproduce
+      description: Provide step-by-step instructions to reproduce the issue.
+      placeholder: |
+        1. Go to...
+        2. Click on...
+        3. Observe the issue...
+      render: bash
+  
+  - type: textarea
+    id: code_snippets
+    attributes:
+      label: Code snippets
+      description: Provide code snippets (if any). Add comments as necessary.
+      placeholder: print("Hello world")
+      render: python
+
+  # Header Section with Title
+  - type: markdown
+    attributes:
+      value: |
+          ## Supporting Information
+          Please provide the following details to help us understand and resolve your issue. This will assist us in reproducing and diagnosing the problem.
+
+  - type: input
+    id: os
+    attributes:
+      label: OS
+      description: Please provide the operating system & distro where the issue occurs.
+      placeholder: "e.g., Windows, macOS, Linux"
+    validations:
+      required: true
+
+  - type: input
+    id: python_version
+    attributes:
+      label: Python version
+      description: Specify the Python version being used.
+      placeholder: "e.g., 3.8.5"
+    validations:
+      required: true
+
+  # Browser Field
+  - type: input
+    id: browser
+    attributes:
+      label: Browser
+      description: Provide the name of the browser you are using.
+      placeholder: "e.g., Chrome, Firefox, Safari"
+    validations:
+      required: false
+
+  # Browser Version Field
+  - type: input
+    id: browser_version
+    attributes:
+      label: Browser version
+      description: Provide the version of the browser you are using.
+      placeholder: "e.g., 91.0.4472.124"
+    validations:
+      required: false
+
+  # Error Logs Field (Text Area)
+  - type: textarea
+    id: error_logs
+    attributes:
+      label: Error logs & Screenshots (if applicable)
+      description: If you encountered any errors, please provide the error logs. Attach any relevant screenshots to help us understand the issue.
+      placeholder: "Paste error logs here and attach your screenshots"
+    validations:
+      required: false
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,8 @@
+blank_issues_enabled: false
+contact_links:
+  - name: Feature Requests
+    url: https://github.com/unclecode/crawl4ai/discussions/categories/feature-requests
+    about: "Suggest new features or enhancements for Crawl4AI"
+  - name: Forums - Q&A
+    url: https://github.com/unclecode/crawl4ai/discussions/categories/forums-q-a
+    about: "Ask questions or engage in general discussions about Crawl4AI"
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -0,0 +1,19 @@
+## Summary
+Please include a summary of the change and/or which issues are fixed.
+
+e.g., `Fixes #123` (Tag GitHub issue numbers in this format, so it automatically links the issues with your PR)
+
+## List of files changed and why
+e.g., quickstart.py - To update the example as per new changes
+
+## How Has This Been Tested?
+Please describe the tests that you ran to verify your changes.
+
+## Checklist:
+
+- [ ] My code follows the style guidelines of this project
+- [ ] I have performed a self-review of my own code
+- [ ] I have commented my code, particularly in hard-to-understand areas
+- [ ] I have made corresponding changes to the documentation
+- [ ] I have added/updated unit tests that prove my fix is effective or that my feature works
+- [ ] New and existing unit tests pass locally with my changes
--- a/.gitignore
+++ b/.gitignore
@@ -226,6 +226,9 @@ tree.md
 .local
 .do
 /plans
+plans/
+
+# Codeium
 .codeiumignore
 todo/

@@ -251,4 +254,4 @@ continue_config.json
 .prompts/

 .llm.env
-.private/
+.private/
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,9 +5,12 @@ All notable changes to Crawl4AI will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+---
+
+### Changed
 Okay, here's a detailed changelog in Markdown format, generated from the provided git diff and commit history. I've focused on user-facing changes, fixes, and features, and grouped them as requested:

-## Version 0.4.3 (2025-01-21)
+## Version 0.4.3b2 (2025-01-21)

 This release introduces several powerful new features, including robots.txt compliance, dynamic proxy support, LLM-powered schema generation, and improved documentation.

@@ -135,9 +138,11 @@ This release introduces several powerful new features, including robots.txt comp
 - **Multiple Element Selection**: Modified `_get_elements` in `JsonCssExtractionStrategy` to return all matching elements instead of just the first one, ensuring comprehensive extraction. ([#extraction_strategy.py](crawl4ai/extraction_strategy.py))
 - **Error Handling in Scrolling**: Added robust error handling to ensure scrolling proceeds safely even if a configuration is missing. ([#async_crawler_strategy.py](crawl4ai/async_crawler_strategy.py))

-#### Other
- **Git Ignore Update**: Added `/plans` to `.gitignore` for better development environment consistency. ([#.gitignore](.gitignore))
+## [0.4.267] - 2025-01-06

+### Added
+- **Windows Event Loop Configuration**: Introduced a utility function `configure_windows_event_loop` to resolve `NotImplementedError` for asyncio subprocesses on Windows. ([#utils.py](crawl4ai/utils.py), [#tutorials/async-webcrawler-basics.md](docs/md_v3/tutorials/async-webcrawler-basics.md))
+- **`page_need_scroll` Method**: Added a method to determine if a page requires scrolling before taking actions in `AsyncPlaywrightCrawlerStrategy`. ([#async_crawler_strategy.py](crawl4ai/async_crawler_strategy.py))

 ## [0.4.24] - 2024-12-31

--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,131 @@
+# Crawl4AI Code of Conduct
+
+## Our Pledge
+
+We as members, contributors, and leaders pledge to make participation in our
+community a harassment-free experience for everyone, regardless of age, body
+size, visible or invisible disability, ethnicity, sex characteristics, gender
+identity and expression, level of experience, education, socio-economic status,
+nationality, personal appearance, race, caste, color, religion, or sexual
+identity and orientation.
+
+We pledge to act and interact in ways that contribute to an open, welcoming,
+diverse, inclusive, and healthy community.
+
+## Our Standards
+
+Examples of behavior that contributes to a positive environment for our
+community include:
+
+* Demonstrating empathy and kindness toward other people
+* Being respectful of differing opinions, viewpoints, and experiences
+* Giving and gracefully accepting constructive feedback
+* Accepting responsibility and apologizing to those affected by our mistakes,
+  and learning from the experience
+* Focusing on what is best not just for us as individuals, but for the overall
+  community
+
+Examples of unacceptable behavior include:
+
+* The use of sexualized language or imagery, and sexual attention or advances of
+  any kind
+* Trolling, insulting or derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or email address,
+  without their explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+  professional setting
+
+## Enforcement Responsibilities
+
+Community leaders are responsible for clarifying and enforcing our standards of
+acceptable behavior and will take appropriate and fair corrective action in
+response to any behavior that they deem inappropriate, threatening, offensive,
+or harmful.
+
+Community leaders have the right and responsibility to remove, edit, or reject
+comments, commits, code, wiki edits, issues, and other contributions that are
+not aligned to this Code of Conduct, and will communicate reasons for moderation
+decisions when appropriate.
+
+## Scope
+
+This Code of Conduct applies within all community spaces, and also applies when
+an individual is officially representing the community in public spaces.
+Examples of representing our community include using an official email address,
+posting via an official social media account, or acting as an appointed
+representative at an online or offline event.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported to the community leaders responsible for enforcement at
+unclecode@crawl4ai.com. All complaints will be reviewed and investigated promptly and fairly.
+
+All community leaders are obligated to respect the privacy and security of the
+reporter of any incident.
+
+## Enforcement Guidelines
+
+Community leaders will follow these Community Impact Guidelines in determining
+the consequences for any action they deem in violation of this Code of Conduct:
+
+### 1. Correction
+
+**Community Impact**: Use of inappropriate language or other behavior deemed
+unprofessional or unwelcome in the community.
+
+**Consequence**: A private, written warning from community leaders, providing
+clarity around the nature of the violation and an explanation of why the
+behavior was inappropriate. A public apology may be requested.
+
+### 2. Warning
+
+**Community Impact**: A violation through a single incident or series of
+actions.
+
+**Consequence**: A warning with consequences for continued behavior. No
+interaction with the people involved, including unsolicited interaction with
+those enforcing the Code of Conduct, for a specified period of time. This
+includes avoiding interactions in community spaces as well as external channels
+like social media. Violating these terms may lead to a temporary or permanent
+ban.
+
+### 3. Temporary Ban
+
+**Community Impact**: A serious violation of community standards, including
+sustained inappropriate behavior.
+
+**Consequence**: A temporary ban from any sort of interaction or public
+communication with the community for a specified period of time. No public or
+private interaction with the people involved, including unsolicited interaction
+with those enforcing the Code of Conduct, is allowed during this period.
+Violating these terms may lead to a permanent ban.
+
+### 4. Permanent Ban
+
+**Community Impact**: Demonstrating a pattern of violation of community
+standards, including sustained inappropriate behavior, harassment of an
+individual, or aggression toward or disparagement of classes of individuals.
+
+**Consequence**: A permanent ban from any sort of public interaction within the
+community.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage],
+version 2.1, available at
+[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
+
+Community Impact Guidelines were inspired by
+[Mozilla's code of conduct enforcement ladder][Mozilla CoC].
+
+For answers to common questions about this code of conduct, see the FAQ at
+[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
+[https://www.contributor-covenant.org/translations][translations].
+
+[homepage]: https://www.contributor-covenant.org
+[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
+[Mozilla CoC]: https://github.com/mozilla/diversity
+[FAQ]: https://www.contributor-covenant.org/faq
+[translations]: https://www.contributor-covenant.org/translations
--- a/README.md
+++ b/README.md
@@ -15,6 +15,7 @@
 [![License](https://img.shields.io/github/license/unclecode/crawl4ai)](https://github.com/unclecode/crawl4ai/blob/main/LICENSE)
 [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
 [![Security: bandit](https://img.shields.io/badge/security-bandit-yellow.svg)](https://github.com/PyCQA/bandit)
+[![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.1-4baaaa.svg)](CODE_OF_CONDUCT.md)

 </div>

@@ -446,7 +447,7 @@ if __name__ == "__main__":
 </details>

 <details>
-<summary>🤖 <strong>Using You own Browswer with Custome User Profile</strong></summary>
+<summary>🤖 <strong>Using Your Own Browser with a Custom User Profile</strong></summary>

 ```python
 import os, sys
--- a/docs/md_v2/basic/installation.md
+++ b/docs/md_v2/basic/installation.md
@@ -0,0 +1,137 @@
+# Installation 💻
+
+Crawl4AI offers flexible installation options to suit various use cases. You can install it as a Python package, use it with Docker, or run it as a local server.
+
+## Option 1: Python Package Installation (Recommended)
+
+Crawl4AI is now available on PyPI, making installation easier than ever. Choose the option that best fits your needs:
+
+### Basic Installation
+
+For basic web crawling and scraping tasks:
+
+```bash
+pip install crawl4ai
+playwright install # Install Playwright dependencies
+```
+
+### Installation with PyTorch
+
+For advanced text clustering (includes CosineSimilarity cluster strategy):
+
+```bash
+pip install crawl4ai[torch]
+```
+
+### Installation with Transformers
+
+For text summarization and Hugging Face models:
+
+```bash
+pip install crawl4ai[transformer]
+```
+
+### Full Installation
+
+For all features:
+
+```bash
+pip install crawl4ai[all]
+```
+
+### Development Installation
+
+For contributors who plan to modify the source code:
+
+```bash
+git clone https://github.com/unclecode/crawl4ai.git
+cd crawl4ai
+pip install -e ".[all]"
+playwright install # Install Playwright dependencies
+```
+
+💡 After installation with "torch", "transformer", or "all" options, it's recommended to run the following CLI command to load the required models:
+
+```bash
+crawl4ai-download-models
+```
+
+This is optional but will boost the performance and speed of the crawler. You only need to do this once after installation.
+
+## Playwright Installation Note for Ubuntu
+
+If you encounter issues with Playwright installation on Ubuntu, you may need to install additional dependencies:
+
+```bash
+sudo apt-get install -y \
+    libwoff1 \
+    libopus0 \
+    libwebp7 \
+    libwebpdemux2 \
+    libenchant-2-2 \
+    libgudev-1.0-0 \
+    libsecret-1-0 \
+    libhyphen0 \
+    libgdk-pixbuf2.0-0 \
+    libegl1 \
+    libnotify4 \
+    libxslt1.1 \
+    libevent-2.1-7 \
+    libgles2 \
+    libxcomposite1 \
+    libatk1.0-0 \
+    libatk-bridge2.0-0 \
+    libepoxy0 \
+    libgtk-3-0 \
+    libharfbuzz-icu0 \
+    libgstreamer-gl1.0-0 \
+    libgstreamer-plugins-bad1.0-0 \
+    gstreamer1.0-plugins-good \
+    gstreamer1.0-plugins-bad \
+    libxt6 \
+    libxaw7 \
+    xvfb \
+    fonts-noto-color-emoji \
+    libfontconfig \
+    libfreetype6 \
+    xfonts-cyrillic \
+    xfonts-scalable \
+    fonts-liberation \
+    fonts-ipafont-gothic \
+    fonts-wqy-zenhei \
+    fonts-tlwg-loma-otf \
+    fonts-freefont-ttf
+```
+
+## Option 2: Using Docker (Coming Soon)
+
+Docker support for Crawl4AI is currently in progress and will be available soon. This will allow you to run Crawl4AI in a containerized environment, ensuring consistency across different systems.
+
+## Option 3: Local Server Installation
+
+For those who prefer to run Crawl4AI as a local server, instructions will be provided once the Docker implementation is complete.
+
+## Verifying Your Installation
+
+After installation, you can verify that Crawl4AI is working correctly by running a simple Python script:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+
+async def main():
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        result = await crawler.arun(url="https://www.example.com")
+        print(result.markdown[:500])  # Print first 500 characters
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+This script should successfully crawl the example website and print the first 500 characters of the extracted content.
+
+## Getting Help
+
+If you encounter any issues during installation or usage, please check the [documentation](https://docs.crawl4ai.com/) or raise an issue on the [GitHub repository](https://github.com/unclecode/crawl4ai/issues).
+
+Happy crawling! 🕷️🤖