Initial Commit
This commit is contained in:
165
.gitignore
vendored
Normal file
165
.gitignore
vendored
Normal file
@@ -0,0 +1,165 @@
|
|||||||
|
# Byte-compiled / optimized / DLL files
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
|
||||||
|
# C extensions
|
||||||
|
*.so
|
||||||
|
|
||||||
|
# Distribution / packaging
|
||||||
|
.Python
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
wheels/
|
||||||
|
share/python-wheels/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
MANIFEST
|
||||||
|
|
||||||
|
# PyInstaller
|
||||||
|
# Usually these files are written by a python script from a template
|
||||||
|
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||||
|
*.manifest
|
||||||
|
*.spec
|
||||||
|
|
||||||
|
# Installer logs
|
||||||
|
pip-log.txt
|
||||||
|
pip-delete-this-directory.txt
|
||||||
|
|
||||||
|
# Unit test / coverage reports
|
||||||
|
htmlcov/
|
||||||
|
.tox/
|
||||||
|
.nox/
|
||||||
|
.coverage
|
||||||
|
.coverage.*
|
||||||
|
.cache
|
||||||
|
nosetests.xml
|
||||||
|
coverage.xml
|
||||||
|
*.cover
|
||||||
|
*.py,cover
|
||||||
|
.hypothesis/
|
||||||
|
.pytest_cache/
|
||||||
|
cover/
|
||||||
|
|
||||||
|
# Translations
|
||||||
|
*.mo
|
||||||
|
*.pot
|
||||||
|
|
||||||
|
# Django stuff:
|
||||||
|
*.log
|
||||||
|
local_settings.py
|
||||||
|
db.sqlite3
|
||||||
|
db.sqlite3-journal
|
||||||
|
|
||||||
|
# Flask stuff:
|
||||||
|
instance/
|
||||||
|
.webassets-cache
|
||||||
|
|
||||||
|
# Scrapy stuff:
|
||||||
|
.scrapy
|
||||||
|
|
||||||
|
# Sphinx documentation
|
||||||
|
docs/_build/
|
||||||
|
|
||||||
|
# PyBuilder
|
||||||
|
.pybuilder/
|
||||||
|
target/
|
||||||
|
|
||||||
|
# Jupyter Notebook
|
||||||
|
.ipynb_checkpoints
|
||||||
|
|
||||||
|
# IPython
|
||||||
|
profile_default/
|
||||||
|
ipython_config.py
|
||||||
|
|
||||||
|
# pyenv
|
||||||
|
# For a library or package, you might want to ignore these files since the code is
|
||||||
|
# intended to run in multiple environments; otherwise, check them in:
|
||||||
|
# .python-version
|
||||||
|
|
||||||
|
# pipenv
|
||||||
|
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||||
|
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||||
|
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||||
|
# install all needed dependencies.
|
||||||
|
#Pipfile.lock
|
||||||
|
|
||||||
|
# poetry
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||||
|
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||||
|
# commonly ignored for libraries.
|
||||||
|
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||||
|
#poetry.lock
|
||||||
|
|
||||||
|
# pdm
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||||
|
#pdm.lock
|
||||||
|
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||||
|
# in version control.
|
||||||
|
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
||||||
|
.pdm.toml
|
||||||
|
.pdm-python
|
||||||
|
.pdm-build/
|
||||||
|
|
||||||
|
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||||
|
__pypackages__/
|
||||||
|
|
||||||
|
# Celery stuff
|
||||||
|
celerybeat-schedule
|
||||||
|
celerybeat.pid
|
||||||
|
|
||||||
|
# SageMath parsed files
|
||||||
|
*.sage.py
|
||||||
|
|
||||||
|
# Environments
|
||||||
|
.env
|
||||||
|
.venv
|
||||||
|
env/
|
||||||
|
venv/
|
||||||
|
ENV/
|
||||||
|
env.bak/
|
||||||
|
venv.bak/
|
||||||
|
|
||||||
|
# Spyder project settings
|
||||||
|
.spyderproject
|
||||||
|
.spyproject
|
||||||
|
|
||||||
|
# Rope project settings
|
||||||
|
.ropeproject
|
||||||
|
|
||||||
|
# mkdocs documentation
|
||||||
|
/site
|
||||||
|
|
||||||
|
# mypy
|
||||||
|
.mypy_cache/
|
||||||
|
.dmypy.json
|
||||||
|
dmypy.json
|
||||||
|
|
||||||
|
# Pyre type checker
|
||||||
|
.pyre/
|
||||||
|
|
||||||
|
# pytype static type analyzer
|
||||||
|
.pytype/
|
||||||
|
|
||||||
|
# Cython debug symbols
|
||||||
|
cython_debug/
|
||||||
|
|
||||||
|
# PyCharm
|
||||||
|
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||||
|
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||||
|
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||||
|
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||||
|
#.idea/
|
||||||
|
|
||||||
|
Crawl4AI.egg-info/
|
||||||
|
Crawl4AI.egg-info/*
|
||||||
40
Dockerfile
Normal file
40
Dockerfile
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
# Use an official Python runtime as a parent image
FROM python:3.10-slim

# Set the working directory in the container
WORKDIR /usr/src/app

# Install Python dependencies first so this layer stays cached until
# requirements.txt changes (copying the whole tree before `pip install`
# invalidates the cache on every source edit).
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Install dependencies for Chrome.
# NOTE: `apt-key add` is deprecated; store the Google signing key in a
# dedicated keyring and reference it with `signed-by`. Use `>` (not `>>`)
# so rebuilding the layer cannot accumulate duplicate source entries.
RUN apt-get update && apt-get install -y --no-install-recommends \
    wget \
    xvfb \
    unzip \
    curl \
    gnupg2 \
    ca-certificates \
    apt-transport-https \
    software-properties-common \
    && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /usr/share/keyrings/google-chrome.gpg \
    && echo "deb [arch=amd64 signed-by=/usr/share/keyrings/google-chrome.gpg] http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list \
    && apt-get update \
    && apt-get install -y google-chrome-stable \
    && rm -rf /var/lib/apt/lists/*

# Copy the application source into the container at /usr/src/app
COPY . .

# Set display port and dbus env to avoid hanging
ENV DISPLAY=:99
ENV DBUS_SESSION_BUS_ADDRESS=/dev/null

# Make port 80 available to the world outside this container
EXPOSE 80

# Disable stdout/stderr buffering (the legacy `ENV key value` form is deprecated)
ENV PYTHONUNBUFFERED=1

# Run uvicorn
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
|
||||||
51
LICENSE
Normal file
51
LICENSE
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
Apache License
|
||||||
|
Version 2.0, January 2004
|
||||||
|
http://www.apache.org/licenses/
|
||||||
|
|
||||||
|
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||||
|
|
||||||
|
1. Definitions.
|
||||||
|
|
||||||
|
"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.
|
||||||
|
|
||||||
|
"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.
|
||||||
|
|
||||||
|
"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
|
||||||
|
|
||||||
|
"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.
|
||||||
|
|
||||||
|
"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.
|
||||||
|
|
||||||
|
"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
|
||||||
|
|
||||||
|
"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).
|
||||||
|
|
||||||
|
"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.
|
||||||
|
|
||||||
|
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
|
||||||
|
|
||||||
|
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.
|
||||||
|
|
||||||
|
2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.
|
||||||
|
|
||||||
|
3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.
|
||||||
|
|
||||||
|
4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
|
||||||
|
|
||||||
|
You must give any other recipients of the Work or Derivative Works a copy of this License; and
|
||||||
|
You must cause any modified files to carry prominent notices stating that You changed the files; and
|
||||||
|
You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and
|
||||||
|
If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.
|
||||||
|
You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.
|
||||||
|
|
||||||
|
5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.
|
||||||
|
|
||||||
|
6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.
|
||||||
|
|
||||||
|
7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.
|
||||||
|
|
||||||
|
8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
|
||||||
|
|
||||||
|
9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
|
||||||
|
|
||||||
|
END OF TERMS AND CONDITIONS
|
||||||
199
README.md
Normal file
199
README.md
Normal file
@@ -0,0 +1,199 @@
|
|||||||
|
# Crawl4AI 🕷️🤖
|
||||||
|
|
||||||
|
[](https://github.com/unclecode/crawl4ai/stargazers)
|
||||||
|
[](https://github.com/unclecode/crawl4ai/network/members)
|
||||||
|
[](https://github.com/unclecode/crawl4ai/issues)
|
||||||
|
[](https://github.com/unclecode/crawl4ai/pulls)
|
||||||
|
[](https://github.com/unclecode/crawl4ai/blob/main/LICENSE)
|
||||||
|
|
||||||
|
Crawl4AI is a powerful, free web crawling service designed to extract useful information from web pages and make it accessible for large language models (LLMs) and AI applications. 🆓🌐
|
||||||
|
|
||||||
|
## Features ✨
|
||||||
|
|
||||||
|
- 🕷️ Efficient web crawling to extract valuable data from websites
|
||||||
|
- 🤖 LLM-friendly output formats (JSON, cleaned HTML, markdown)
|
||||||
|
- 🌍 Supports crawling multiple URLs simultaneously
|
||||||
|
- 🌃 Replace media tags with ALT.
|
||||||
|
- 🆓 Completely free to use and open-source
|
||||||
|
|
||||||
|
## Getting Started 🚀
|
||||||
|
|
||||||
|
To get started with Crawl4AI, simply visit our web application at [https://crawl4ai.uccode.io](https://crawl4ai.uccode.io) and enter the URL(s) you want to crawl. The application will process the URLs and provide you with the extracted data in various formats.
|
||||||
|
|
||||||
|
## Installation 💻
|
||||||
|
|
||||||
|
To install and run Crawl4AI locally or on your own server, follow these steps:
|
||||||
|
|
||||||
|
1. Clone the repository:
|
||||||
|
```
|
||||||
|
git clone https://github.com/your-username/crawl4ai.git
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Navigate to the project directory:
|
||||||
|
```
|
||||||
|
cd crawl4ai
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Create a `.env` file in the root folder and set your Groq API token:
|
||||||
|
```
|
||||||
|
GROQ_API_TOKEN=your_groq_api_token
|
||||||
|
```
|
||||||
|
|
||||||
|
4. Build the Docker image:
|
||||||
|
```
|
||||||
|
docker build -t crawl4ai .
|
||||||
|
```
|
||||||
|
For Mac users, use the following command instead:
|
||||||
|
```
|
||||||
|
docker build --platform linux/amd64 -t crawl4ai .
|
||||||
|
```
|
||||||
|
|
||||||
|
5. Run the Docker container:
|
||||||
|
```
|
||||||
|
docker run -p 8000:80 crawl4ai
|
||||||
|
```
|
||||||
|
|
||||||
|
6. Access the application at `http://localhost:8000`.
|
||||||
|
|
||||||
|
For more detailed instructions and advanced configuration options, please refer to the [installation guide](https://github.com/your-username/crawl4ai/blob/main/INSTALL.md).
|
||||||
|
|
||||||
|
## Usage with Python 🐍
|
||||||
|
|
||||||
|
Here's an example of how to use Crawl4AI with Python to crawl a webpage and retrieve the extracted data:
|
||||||
|
|
||||||
|
1. Make sure you have the `requests` library installed. You can install it using pip:
|
||||||
|
```
|
||||||
|
pip install requests
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Use the following Python code to send a request to the Crawl4AI server and retrieve the crawled data:
|
||||||
|
```python
|
||||||
|
import requests
|
||||||
|
import os
|
||||||
|
|
||||||
|
url = "http://localhost:8000/crawl" # Replace with the appropriate server URL
|
||||||
|
data = {
|
||||||
|
"urls": [
|
||||||
|
"https://example.com"
|
||||||
|
],
|
||||||
|
"provider_model": "groq/llama3-70b-8192",
|
||||||
|
"api_token": "your_api_token",
|
||||||
|
"include_raw_html": true,
|
||||||
|
"forced": false,
|
||||||
|
"extract_blocks": true,
|
||||||
|
"word_count_threshold": 5
|
||||||
|
}
|
||||||
|
|
||||||
|
response = requests.post(url, json=data)
|
||||||
|
|
||||||
|
if response.status_code == 200:
|
||||||
|
result = response.json()["results"][0]
|
||||||
|
print("Parsed JSON:")
|
||||||
|
print(result["parsed_json"])
|
||||||
|
print("\nCleaned HTML:")
|
||||||
|
print(result["cleaned_html"])
|
||||||
|
print("\nMarkdown:")
|
||||||
|
print(result["markdown"])
|
||||||
|
else:
|
||||||
|
print("Error:", response.status_code, response.text)
|
||||||
|
```
|
||||||
|
|
||||||
|
This code sends a POST request to the Crawl4AI server running on localhost, specifying the target URL (`https://example.com`) and the desired options (`provider_model`, `api_token`, `include_raw_html`, and `forced`). The server processes the request and returns the crawled data in JSON format.
|
||||||
|
|
||||||
|
The response from the server includes the parsed JSON, cleaned HTML, and markdown representations of the crawled webpage. You can access and use this data in your Python application as needed.
|
||||||
|
|
||||||
|
Make sure to replace `"http://localhost:8000/crawl"` with the appropriate server URL if your Crawl4AI server is running on a different host or port.
|
||||||
|
|
||||||
|
## Using Crawl4AI as a Python Library 📚
|
||||||
|
|
||||||
|
You can also use Crawl4AI as a Python library in your own projects. Here's an example of how to use the Crawl4AI library:
|
||||||
|
|
||||||
|
1. Install the required dependencies:
|
||||||
|
```
|
||||||
|
pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Import the necessary modules and initialize the `WebCrawler`:
|
||||||
|
```python
|
||||||
|
from crawler.web_crawler import WebCrawler
|
||||||
|
from crawler.models import UrlModel
|
||||||
|
import os
|
||||||
|
|
||||||
|
crawler = WebCrawler(db_path='crawler_data.db')
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Fetch a single page:
|
||||||
|
```python
|
||||||
|
single_url = UrlModel(url='https://kidocode.com', forced=True)
|
||||||
|
result = crawler.fetch_page(
|
||||||
|
single_url,
|
||||||
|
provider= "openai/gpt-3.5-turbo",
|
||||||
|
api_token = os.getenv('OPENAI_API_KEY'),
|
||||||
|
extract_blocks_flag=True,
|
||||||
|
word_count_threshold=5 # Minimum word count for a HTML tag to be considered as a worthy block
|
||||||
|
)
|
||||||
|
print(result.model_dump())
|
||||||
|
```
|
||||||
|
|
||||||
|
4. Fetch multiple pages:
|
||||||
|
```python
|
||||||
|
urls = [
|
||||||
|
UrlModel(url='http://example.com', forced=False),
|
||||||
|
UrlModel(url='http://example.org', forced=False)
|
||||||
|
]
|
||||||
|
results = crawler.fetch_pages(
|
||||||
|
urls,
|
||||||
|
provider= "openai/gpt-3.5-turbo",
|
||||||
|
api_token = os.getenv('OPENAI_API_KEY'),
|
||||||
|
extract_blocks_flag=True,
|
||||||
|
word_count_threshold=5
|
||||||
|
)
|
||||||
|
|
||||||
|
for res in results:
|
||||||
|
print(res.json())
|
||||||
|
```
|
||||||
|
|
||||||
|
This code demonstrates how to use the Crawl4AI library to fetch a single page or multiple pages. The `WebCrawler` is initialized with the path to the database, and the `fetch_page` and `fetch_pages` methods are used to crawl the specified URLs.
|
||||||
|
|
||||||
|
Make sure to set the `GROQ_API_TOKEN` environment variable with your Groq API token when using the library.
|
||||||
|
|
||||||
|
That's it! You can now integrate Crawl4AI into your Python projects and leverage its web crawling capabilities. 🎉
|
||||||
|
|
||||||
|
## 📖 Parameters
|
||||||
|
|
||||||
|
| Parameter | Description | Required | Default Value |
|
||||||
|
|----------------------|-------------------------------------------------------------------------------------------------|----------|---------------|
|
||||||
|
| `urls` | A list of URLs to crawl and extract data from. | Yes | - |
|
||||||
|
| `provider_model` | The provider and model to use for extracting relevant information (e.g., "groq/llama3-70b-8192"). | Yes | - |
|
||||||
|
| `api_token` | Your API token for the specified provider. | Yes | - |
|
||||||
|
| `include_raw_html` | Whether to include the raw HTML content in the response. | No | `false` |
|
||||||
|
| `forced` | Whether to force a fresh crawl even if the URL has been previously crawled. | No | `false` |
|
||||||
|
| `extract_blocks` | Whether to extract meaningful blocks of text from the HTML. | No | `false` |
|
||||||
|
| `word_count_threshold` | The minimum number of words a block must contain to be considered meaningful (minimum value is 5). | No | `5` |
|
||||||
|
|
||||||
|
|
||||||
|
## Contributing 🤝
|
||||||
|
|
||||||
|
We welcome contributions from the open-source community to help improve Crawl4AI and make it even more valuable for AI enthusiasts and developers. To contribute, please follow these steps:
|
||||||
|
|
||||||
|
1. Fork the repository.
|
||||||
|
2. Create a new branch for your feature or bug fix.
|
||||||
|
3. Make your changes and commit them with descriptive messages.
|
||||||
|
4. Push your changes to your forked repository.
|
||||||
|
5. Submit a pull request to the main repository.
|
||||||
|
|
||||||
|
For more information on contributing, please see our [contribution guidelines](https://github.com/unclecode/crawl4ai/blob/main/CONTRIBUTING.md).
|
||||||
|
|
||||||
|
## License 📄
|
||||||
|
|
||||||
|
Crawl4AI is released under the [Apache License 2.0](https://github.com/unclecode/crawl4ai/blob/main/LICENSE).
|
||||||
|
|
||||||
|
## Contact 📧
|
||||||
|
|
||||||
|
If you have any questions, suggestions, or feedback, please feel free to reach out to us:
|
||||||
|
|
||||||
|
- GitHub: [unclecode](https://github.com/unclecode)
|
||||||
|
- Twitter: [@unclecode](https://twitter.com/unclecode)
|
||||||
|
- Discord: [your-invite-link](https://discord.gg/your-invite-link)
|
||||||
|
|
||||||
|
Let's work together to make the web more accessible and useful for AI applications! 💪🌐🤖
|
||||||
1
crawler/__init__.py
Normal file
1
crawler/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
from .web_crawler import WebCrawler
|
||||||
24
crawler/config.py
Normal file
24
crawler/config.py
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
import os

# python-dotenv is a development convenience; the config module must still be
# importable when it is not installed (e.g. in a minimal production image).
try:
    from dotenv import load_dotenv

    load_dotenv()  # Load environment variables from a local .env file.
except ImportError:
    pass

# Default provider/model used when the caller does not specify one.
DEFAULT_PROVIDER = "openai/gpt-4-turbo"

# Provider-model dictionary: maps each supported provider/model to the API
# token read from the environment (a placeholder string when unset).
PROVIDER_MODELS = {
    "groq/llama3-70b-8192": os.getenv("GROQ_API_KEY", "YOUR_GROQ_TOKEN"),
    "groq/llama3-8b-8192": os.getenv("GROQ_API_KEY", "YOUR_GROQ_TOKEN"),
    "openai/gpt-3.5-turbo": os.getenv("OPENAI_API_KEY", "YOUR_OPENAI_TOKEN"),
    "openai/gpt-4-turbo": os.getenv("OPENAI_API_KEY", "YOUR_OPENAI_TOKEN"),
    "anthropic/claude-3-haiku-20240307": os.getenv("ANTHROPIC_API_KEY", "YOUR_ANTHROPIC_TOKEN"),
    "anthropic/claude-3-opus-20240229": os.getenv("ANTHROPIC_API_KEY", "YOUR_ANTHROPIC_TOKEN"),
    "anthropic/claude-3-sonnet-20240229": os.getenv("ANTHROPIC_API_KEY", "YOUR_ANTHROPIC_TOKEN"),
}

# Chunk token threshold (presumably the chunk size used when splitting
# content for the LLM — confirm against the extraction code).
CHUNK_TOKEN_THRESHOLD = 1000

# Minimum number of words an HTML tag must contain to be considered.
MIN_WORD_THRESHOLD = 5
|
||||||
53
crawler/database.py
Normal file
53
crawler/database.py
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
import sqlite3
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
def init_db(db_path: str) -> None:
    """Create the ``crawled_data`` cache table at *db_path* if it is missing.

    The table stores one row per crawled URL (raw HTML, cleaned HTML,
    markdown, parsed JSON, and a success flag). Idempotent: safe to call
    on every startup.
    """
    conn = sqlite3.connect(db_path)
    try:
        # Connection.execute creates the cursor implicitly; the explicit
        # cursor object added nothing.
        conn.execute('''
            CREATE TABLE IF NOT EXISTS crawled_data (
                url TEXT PRIMARY KEY,
                html TEXT,
                cleaned_html TEXT,
                markdown TEXT,
                parsed_json TEXT,
                success BOOLEAN
            )
        ''')
        conn.commit()
    finally:
        # The original leaked the connection if execute() raised.
        conn.close()
|
||||||
|
|
||||||
|
def get_cached_url(db_path: str, url: str) -> Optional[tuple]:
    """Return the cached row for *url*, or ``None`` when it was never crawled.

    The row is ``(url, html, cleaned_html, markdown, parsed_json, success)``
    in the column order created by ``init_db``.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.execute(
            'SELECT url, html, cleaned_html, markdown, parsed_json, success '
            'FROM crawled_data WHERE url = ?',
            (url,),
        )
        return cursor.fetchone()
    finally:
        # The original leaked the connection if the query raised.
        conn.close()
|
||||||
|
|
||||||
|
def cache_url(db_path: str, url: str, html: str, cleaned_html: str, markdown: str, parsed_json: str, success: bool) -> None:
    """Insert or update the cached crawl result for *url* (upsert).

    An existing row for the same URL is overwritten with the new values.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute('''
            INSERT INTO crawled_data (url, html, cleaned_html, markdown, parsed_json, success)
            VALUES (?, ?, ?, ?, ?, ?)
            ON CONFLICT(url) DO UPDATE SET
                html = excluded.html,
                cleaned_html = excluded.cleaned_html,
                markdown = excluded.markdown,
                parsed_json = excluded.parsed_json,
                success = excluded.success
        ''', (str(url), html, cleaned_html, markdown, parsed_json, success))
        # str(url): callers may pass a pydantic HttpUrl rather than a plain str.
        conn.commit()
    finally:
        # The original leaked the connection if the statement raised.
        conn.close()
|
||||||
|
|
||||||
|
def get_total_count(db_path: str) -> int:
    """Return the number of cached rows, or 0 on any database error.

    Best-effort by design (e.g. the table may not exist yet), but the
    fallback is limited to SQLite errors instead of the original bare
    ``except Exception`` that silently swallowed every failure.
    """
    try:
        conn = sqlite3.connect(db_path)
        try:
            row = conn.execute('SELECT COUNT(*) FROM crawled_data').fetchone()
            return row[0]
        finally:
            # The original leaked the connection when the query raised.
            conn.close()
    except sqlite3.Error:
        return 0
|
||||||
15
crawler/models.py
Normal file
15
crawler/models.py
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
from pydantic import BaseModel, HttpUrl
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
class UrlModel(BaseModel):
    """A single crawl target: the URL to fetch and its cache policy."""

    url: HttpUrl
    # When True, re-crawl the URL even if a cached result exists.
    forced: bool = False
|
||||||
|
|
||||||
|
class CrawlResult(BaseModel):
    """Outcome of crawling a single URL.

    ``url``, ``html`` and ``success`` are always present; the remaining
    fields are filled in only when the corresponding processing step ran.
    """

    url: str
    html: str
    success: bool
    # PEP 484 forbids implicit Optional (`str = None`), and pydantic v2
    # rejects it; `str | None` is valid on the project's Python 3.10 base.
    cleaned_html: str | None = None
    markdown: str | None = None
    parsed_json: str | None = None
    error_message: str | None = None
|
||||||
110
crawler/prompts.py
Normal file
110
crawler/prompts.py
Normal file
@@ -0,0 +1,110 @@
|
|||||||
|
# NOTE(review): this constant is immediately re-assigned further down in this
# module (a shorter variant without the "questions" key), so this first
# definition is dead as written.  Rename one of the two if both are needed.
# Fixed typos: "YHere" -> "Here", "Make sur" -> "Make sure".
PROMPT_EXTRACT_BLOCKS = """Here is the URL of the webpage:
<url>{URL}</url>

And here is the cleaned HTML content of that webpage:
<html>
{HTML}
</html>

Your task is to break down this HTML content into semantically relevant blocks, and for each block, generate a JSON object with the following keys:

- index: an integer representing the index of the block in the content
- tags: a list of semantic tags that are relevant to the content of the block
- content: a list of strings containing the text content of the block
- questions: a list of 3 questions that a user may ask about the content in this block

To generate the JSON objects:

1. Carefully read through the HTML content and identify logical breaks or shifts in the content that would warrant splitting it into separate blocks.

2. For each block:
   a. Assign it an index based on its order in the content.
   b. Analyze the content and generate a list of relevant semantic tags that describe what the block is about.
   c. Extract the text content, clean it up if needed, and store it as a list of strings in the "content" field.
   d. Come up with 3 questions that a user might ask about this specific block of content, based on the tags and content. The questions should be relevant and answerable by the content in the block.

3. Ensure that the order of the JSON objects matches the order of the blocks as they appear in the original HTML content.

4. Double-check that each JSON object includes all required keys (index, tags, content, questions) and that the values are in the expected format (integer, list of strings, etc.).

5. Make sure the generated JSON is complete and parsable, with no errors or omissions.

6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.

Please provide your output within <blocks> tags, like this:

<blocks>
[{
    "index": 0,
    "tags": ["introduction", "overview"],
    "content": ["This is the first paragraph of the article, which provides an introduction and overview of the main topic."],
    "questions": [
        "What is the main topic of this article?",
        "What can I expect to learn from reading this article?",
        "Is this article suitable for beginners or experts in the field?"
    ]
},
{
    "index": 1,
    "tags": ["history", "background"],
    "content": ["This is the second paragraph, which delves into the history and background of the topic.",
    "It provides context and sets the stage for the rest of the article."],
    "questions": [
        "What historical events led to the development of this topic?",
        "How has the understanding of this topic evolved over time?",
        "What are some key milestones in the history of this topic?"
    ]
}]
</blocks>

Remember, the output should be a complete, parsable JSON wrapped in <blocks> tags, with no omissions or errors. The JSON objects should semantically break down the content into relevant blocks, maintaining the original order."""
|
||||||
|
|
||||||
|
# NOTE(review): this re-assignment overrides the longer variant defined above;
# downstream code (crawler/utils.py imports PROMPT_EXTRACT_BLOCKS) therefore
# always gets this shorter, questions-free version.
# Fixed typos: "YHere" -> "Here", "Make sur" -> "Make sure",
# "EXACTLY SAME AS GIVE DATA" -> "EXACTLY THE SAME AS THE GIVEN DATA".
PROMPT_EXTRACT_BLOCKS = """Here is the URL of the webpage:
<url>{URL}</url>

And here is the cleaned HTML content of that webpage:
<html>
{HTML}
</html>

Your task is to break down this HTML content into semantically relevant blocks, and for each block, generate a JSON object with the following keys:

- index: an integer representing the index of the block in the content
- content: a list of strings containing the text content of the block

To generate the JSON objects:

1. Carefully read through the HTML content and identify logical breaks or shifts in the content that would warrant splitting it into separate blocks.

2. For each block:
   a. Assign it an index based on its order in the content.
   b. Analyze the content and generate ONE semantic tag that describes what the block is about.
   c. Extract the text content, EXACTLY THE SAME AS THE GIVEN DATA, clean it up if needed, and store it as a list of strings in the "content" field.

3. Ensure that the order of the JSON objects matches the order of the blocks as they appear in the original HTML content.

4. Double-check that each JSON object includes all required keys (index, tag, content) and that the values are in the expected format (integer, list of strings, etc.).

5. Make sure the generated JSON is complete and parsable, with no errors or omissions.

6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.

7. Never alter the extracted content, just copy and paste it as it is.

Please provide your output within <blocks> tags, like this:

<blocks>
[{
    "index": 0,
    "tags": ["introduction"],
    "content": ["This is the first paragraph of the article, which provides an introduction and overview of the main topic."]
},
{
    "index": 1,
    "tags": ["background"],
    "content": ["This is the second paragraph, which delves into the history and background of the topic.",
    "It provides context and sets the stage for the rest of the article."]
}]
</blocks>

Remember, the output should be a complete, parsable JSON wrapped in <blocks> tags, with no omissions or errors. The JSON objects should semantically break down the content into relevant blocks, maintaining the original order."""
|
||||||
400
crawler/utils.py
Normal file
400
crawler/utils.py
Normal file
@@ -0,0 +1,400 @@
|
|||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup, Comment, element, Tag, NavigableString
|
||||||
|
import html2text
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import os
|
||||||
|
import litellm
|
||||||
|
from litellm import completion, batch_completion
|
||||||
|
from .prompts import PROMPT_EXTRACT_BLOCKS
|
||||||
|
from .config import *
|
||||||
|
import re
|
||||||
|
import html
|
||||||
|
|
||||||
|
|
||||||
|
def beautify_html(escaped_html):
    """Unescape an HTML-escaped string and return it pretty-printed.

    Parameters:
        escaped_html (str): Markup that may contain entities such as
            ``&lt;`` / ``&gt;``.

    Returns:
        str: The same markup, unescaped and re-indented by BeautifulSoup.
    """
    # First undo entity escaping, then let the parser normalise the layout.
    raw_markup = html.unescape(escaped_html)
    return BeautifulSoup(raw_markup, 'html.parser').prettify()
|
||||||
|
|
||||||
|
def split_and_parse_json_objects(json_string):
    """Split a string holding a JSON array of objects and parse each object.

    Unlike ``json.loads`` on the whole string, this tolerates individually
    malformed objects: parseable ones are returned as Python objects, the
    rest are returned as raw text so the caller can report or retry them.

    Parameters:
        json_string (str): A string representation of a list of JSON
            objects, e.g., '[{...}, {...}, ...]'.

    Returns:
        tuple: Two lists — (successfully parsed objects, string
        representations of segments that couldn't be parsed).
    """
    # Trim the enclosing '[' and ']' of the array, if present.
    if json_string.startswith('[') and json_string.endswith(']'):
        json_string = json_string[1:-1].strip()

    # Scan for top-level {...} segments.  Track string/escape state so that
    # braces inside JSON string values (e.g. {"a": "}"}) do not unbalance
    # the depth counter — the original naive counter mis-split such input.
    segments = []
    depth = 0
    start_index = 0
    in_string = False
    escaped = False
    for i, char in enumerate(json_string):
        if in_string:
            if escaped:
                escaped = False
            elif char == '\\':
                escaped = True
            elif char == '"':
                in_string = False
            continue
        if char == '"':
            in_string = True
        elif char == '{':
            if depth == 0:
                start_index = i
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0:
                segments.append(json_string[start_index:i + 1])

    # Try parsing each candidate segment independently.
    parsed_objects = []
    unparsed_segments = []
    for segment in segments:
        try:
            parsed_objects.append(json.loads(segment))
        except json.JSONDecodeError:
            unparsed_segments.append(segment)

    return parsed_objects, unparsed_segments
|
||||||
|
|
||||||
|
def sanitize_html(html):
    """Strip unexpected characters from *html* and backslash-escape quotes.

    Keeps word characters, whitespace and a whitelist of punctuation/markup
    characters; everything else is dropped.  Double quotes that survive the
    filter are escaped.  (Single quotes never survive it — ``'`` is not in
    the allowed set — so the final replace is effectively a no-op, kept for
    behavioural parity.)
    """
    allowed_only = re.sub(r'[^\w\s.,;:!?=\[\]{}()<>\/\\\-"]', '', html)
    # Escape quotes so the result can be embedded in quoted contexts.
    escaped = allowed_only.replace('"', '\\"')
    return escaped.replace("'", "\\'")
|
||||||
|
|
||||||
|
def escape_json_string(s):
    """Escape characters in *s* so it is safe inside a JSON string literal.

    Parameters:
        s (str): The input string to be escaped.

    Returns:
        str: The escaped string, safe for JSON encoding.
    """
    # Order matters: the backslash substitution must run first, so that
    # backslashes introduced by the later escapes are not double-escaped.
    replacements = (
        ('\\', '\\\\'),
        ('"', '\\"'),
        ('\b', '\\b'),
        ('\f', '\\f'),
        ('\n', '\\n'),
        ('\r', '\\r'),
        ('\t', '\\t'),
    )
    for raw, replacement in replacements:
        s = s.replace(raw, replacement)

    # Any remaining C0/C1 control characters become \uXXXX escapes.
    return re.sub(
        r'[\x00-\x1f\x7f-\x9f]',
        lambda x: '\\u{:04x}'.format(ord(x.group())),
        s,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD):
    """Clean raw page HTML and convert it to Markdown.

    Strips non-content tags and attributes, drops low-word-count and empty
    elements, flattens redundant nesting, then renders the result with
    html2text (links ignored).

    Returns:
        dict: {'markdown': str, 'cleaned_html': str, 'success': True} on
        success, or None if any step raises (the error is printed, not
        re-raised — callers must handle the None).
    """
    try:
        # Parse HTML content with BeautifulSoup
        soup = BeautifulSoup(html, 'html.parser')

        # Get the content within the <body> tag
        # NOTE(review): soup.body is None for fragment input without a
        # <body>; the subsequent attribute access then raises and is
        # swallowed by the except below — confirm this is intended.
        body = soup.body

        # Remove script, style, and other tags that don't carry useful content from body
        for tag in body.find_all(['script', 'style', 'link', 'meta', 'noscript']):
            tag.decompose()

        # Remove all attributes from remaining tags in body, except for img tags
        for tag in body.find_all():
            if tag.name != 'img':
                tag.attrs = {}

        # Replace images with their alt text or remove them if no alt text is available
        for img in body.find_all('img'):
            alt_text = img.get('alt')
            if alt_text:
                img.replace_with(soup.new_string(alt_text))
            else:
                img.decompose()

        # Recursively remove empty elements, their parent elements, and elements with word count below threshold
        # NOTE(review): this mutates node.contents while iterating it
        # (decompose inside the loop) — bs4 tolerates it here, but some
        # siblings may be skipped; verify against representative pages.
        def remove_empty_and_low_word_count_elements(node):
            for child in node.contents:
                if isinstance(child, element.Tag):
                    remove_empty_and_low_word_count_elements(child)
                    word_count = len(child.get_text(strip=True).split())
                    if (len(child.contents) == 0 and not child.get_text(strip=True)) or word_count < word_count_threshold:
                        child.decompose()
            return node

        body = remove_empty_and_low_word_count_elements(body)

        # Drop tags whose direct string content is below the word threshold.
        def remove_small_text_tags(body: Tag, word_count_threshold: int = MIN_WORD_THRESHOLD):
            # We'll use a list to collect all tags that don't meet the word count requirement
            tags_to_remove = []

            # Traverse all tags in the body
            for tag in body.find_all(True):  # True here means all tags
                # Check if the tag contains text and if it's not just whitespace
                if tag.string and tag.string.strip():
                    # Split the text by spaces and count the words
                    word_count = len(tag.string.strip().split())
                    # If the word count is less than the threshold, mark the tag for removal
                    if word_count < word_count_threshold:
                        tags_to_remove.append(tag)

            # Remove all marked tags from the tree
            for tag in tags_to_remove:
                tag.decompose()  # or tag.extract() to remove and get the element

            return body

        # Remove small text tags
        body = remove_small_text_tags(body, word_count_threshold)

        # True when the node (tag or text) contains nothing but whitespace.
        def is_empty_or_whitespace(tag: Tag):
            if isinstance(tag, NavigableString):
                return not tag.strip()
            # Check if the tag itself is empty or all its children are empty/whitespace
            if not tag.contents:
                return True
            return all(is_empty_or_whitespace(child) for child in tag.contents)

        # Repeatedly strip empty tags until the tree reaches a fixed point
        # (removing a child can make its parent empty in turn).
        def remove_empty_tags(body: Tag):
            # Continue processing until no more changes are made
            changes = True
            while changes:
                changes = False
                # Collect all tags that are empty or contain only whitespace
                empty_tags = [tag for tag in body.find_all(True) if is_empty_or_whitespace(tag)]
                for tag in empty_tags:
                    # If a tag is empty, decompose it
                    tag.decompose()
                    changes = True  # Mark that a change was made

            return body

        # Remove empty tags
        body = remove_empty_tags(body)

        # Flatten nested elements with only one child of the same type
        # (e.g. <div><div>x</div></div> -> <div>x</div>).
        def flatten_nested_elements(node):
            for child in node.contents:
                if isinstance(child, element.Tag):
                    flatten_nested_elements(child)
                    if len(child.contents) == 1 and child.contents[0].name == child.name:
                        # print('Flattening:', child.name)
                        child_content = child.contents[0]
                        child.replace_with(child_content)

            return node

        body = flatten_nested_elements(body)

        # Remove comments
        for comment in soup.find_all(text=lambda text: isinstance(text, Comment)):
            comment.extract()

        # Remove consecutive empty newlines and replace multiple spaces with a single space
        cleaned_html = str(body).replace('\n\n', '\n').replace('  ', ' ')

        # Sanitize the cleaned HTML content
        cleaned_html = sanitize_html(cleaned_html)
        # sanitized_html = escape_json_string(cleaned_html)

        # Convert cleaned HTML to Markdown
        h = html2text.HTML2Text()
        h.ignore_links = True
        markdown = h.handle(cleaned_html)

        # Return the Markdown content
        return{
            'markdown': markdown,
            'cleaned_html': cleaned_html,
            'success': True
        }

    except Exception as e:
        # Best-effort: report the failure and signal it with None.
        print('Error processing HTML content:', str(e))
        return None
|
||||||
|
|
||||||
|
# Example usage
|
||||||
|
# word_count_threshold = 5 # Adjust this value according to your desired threshold
|
||||||
|
# markdown_content = get_content_of_website(word_count_threshold)
|
||||||
|
# print(markdown_content)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_xml_tags(string):
    """Return the unique opening-tag names found in *string* (order unspecified)."""
    # <(\w+)> matches opening tags only; closing tags are skipped because
    # of the leading slash in </tag>.
    return list({name for name in re.findall(r'<(\w+)>', string)})
|
||||||
|
|
||||||
|
def extract_xml_data(tags, string):
    """Map each tag name to the stripped body of its first ``<tag>...</tag>``
    occurrence in *string*, or the empty string when the tag is absent."""
    def first_occurrence(tag):
        # DOTALL lets the body span newlines; the non-greedy .*? stops at
        # the first matching close tag.
        found = re.search(f"<{tag}>(.*?)</{tag}>", string, re.DOTALL)
        return found.group(1).strip() if found else ""

    return {tag: first_occurrence(tag) for tag in tags}
|
||||||
|
|
||||||
|
import time
|
||||||
|
import litellm
|
||||||
|
|
||||||
|
# Function to perform the completion with exponential backoff
|
||||||
|
def perform_completion_with_backoff(provider, prompt_with_variables, api_token):
    """Run a single-message LLM completion, retrying on rate limits.

    Retries up to 3 times with exponential backoff (2s, 4s, 8s).  On
    success returns the provider's response object.  If every attempt is
    rate-limited, returns a list containing one error "block" dict.

    NOTE(review): the two return types (response object vs. list of dicts)
    are inconsistent — callers such as extract_blocks access
    ``response.choices`` on the result and would fail on the error list;
    consider unifying before relying on the fallback path.
    """
    max_attempts = 3
    base_delay = 2  # Base delay in seconds, you can adjust this based on your needs

    for attempt in range(max_attempts):
        try:
            response = completion(
                model=provider,
                messages=[
                    {"role": "user", "content": prompt_with_variables}
                ],
                temperature=0.01,
                api_key=api_token
            )
            return response  # Return the successful response
        except litellm.exceptions.RateLimitError as e:
            # Only rate-limit errors are retried; anything else propagates.
            print("Rate limit error:", str(e))

            # Check if we have exhausted our max attempts
            if attempt < max_attempts - 1:
                # Calculate the delay and wait
                delay = base_delay * (2 ** attempt)  # Exponential backoff formula
                print(f"Waiting for {delay} seconds before retrying...")
                time.sleep(delay)
            else:
                # Return an error response after exhausting all retries
                return [{
                    "index": 0,
                    "tags": ["error"],
                    "content": ["Rate limit error. Please try again later."]
                }]
|
||||||
|
|
||||||
|
def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None):
    """Ask an LLM to split *html* into semantic JSON blocks.

    Fills PROMPT_EXTRACT_BLOCKS with the URL and the sanitized/escaped
    HTML, sends it via perform_completion_with_backoff, and parses the
    JSON found inside <blocks>...</blocks> in the reply.  When that parse
    fails, falls back to salvaging whatever objects can be parsed
    individually; any unparsable fragments are reported as a single
    error block.

    Returns:
        list[dict]: Block dicts, each annotated with 'error': False
        (or 'error': True for the salvage error block).
    """
    # api_token = os.getenv('GROQ_API_KEY', None) if not api_token else api_token
    # NOTE(review): PROVIDER_MODELS is used here as a provider -> token
    # lookup; confirm against crawler/config.py that its values really are
    # API keys and not model names.
    api_token = PROVIDER_MODELS.get(provider, None) if not api_token else api_token

    # Prompt template variables; HTML is sanitized and JSON-escaped first.
    variable_values = {
        "URL": url,
        "HTML": escape_json_string(sanitize_html(html)),
    }

    prompt_with_variables = PROMPT_EXTRACT_BLOCKS
    for variable in variable_values:
        prompt_with_variables = prompt_with_variables.replace(
            "{" + variable + "}", variable_values[variable]
        )

    response = perform_completion_with_backoff(provider, prompt_with_variables, api_token)

    # try:
    #     response = completion(
    #         model = provider,
    #         messages = [
    #             {"role": "user", "content": prompt_with_variables}
    #         ],
    #         temperature = 0.01,
    #         api_key = api_token
    #     )
    # except litellm.exceptions.RateLimitError as e:
    #     print("Rate limit error:", str(e))
    #     return [{
    #         "index": 0,
    #         "tags": ["error"],
    #         "content": ["Rate limit error. Please try again later."]
    #     }]

    try:
        blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks']
        blocks = json.loads(blocks)
        ## Add error: False to the blocks
        for block in blocks:
            block['error'] = False
    except Exception as e:
        print("Error extracting blocks:", str(e))
        # Fallback: salvage individually-parseable objects from the raw reply.
        parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content)
        blocks = parsed
        # Append all unparsed segments as one error block whose content is the list of unparsed segments
        if unparsed:
            blocks.append({
                "index": 0,
                "error": True,
                "tags": ["error"],
                "content": unparsed
            })
    return blocks
|
||||||
|
|
||||||
|
def extract_blocks_batch(batch_data, provider = "groq/llama3-70b-8192", api_token = None):
    """Batch variant of extract_blocks for ``[(url, html), ...]`` pairs.

    Builds one prompt per page and sends them together via litellm's
    batch_completion; a JSON failure in one response is replaced with a
    single error block rather than aborting the whole batch.

    NOTE(review): unlike extract_blocks, the HTML here is NOT sanitized or
    JSON-escaped before templating, and blocks are not annotated with
    'error': False — confirm whether this asymmetry is intentional.
    """
    api_token = os.getenv('GROQ_API_KEY', None) if not api_token else api_token

    messages = []

    # One single-message conversation per (url, html) pair.
    for url, html in batch_data:
        variable_values = {
            "URL": url,
            "HTML": html,
        }

        prompt_with_variables = PROMPT_EXTRACT_BLOCKS
        for variable in variable_values:
            prompt_with_variables = prompt_with_variables.replace(
                "{" + variable + "}", variable_values[variable]
            )

        messages.append([{"role": "user", "content": prompt_with_variables}])

    responses = batch_completion(
        model = provider,
        messages = messages,
        temperature = 0.01
    )

    all_blocks = []
    for response in responses:
        try:
            blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks']
            blocks = json.loads(blocks)

        except Exception as e:
            # Replace this page's result with a single error block.
            print("Error extracting blocks:", str(e))
            blocks = [{
                "index": 0,
                "tags": ["error"],
                "content": ["Error extracting blocks from the HTML content. Choose another provider/model or try again."],
                "questions": ["What went wrong during the block extraction process?"]
            }]
        all_blocks.append(blocks)

    # Flatten the per-response lists into one flat list of blocks.
    return sum(all_blocks, [])
|
||||||
133
crawler/web_crawler.py
Normal file
133
crawler/web_crawler.py
Normal file
@@ -0,0 +1,133 @@
|
|||||||
|
import asyncio
|
||||||
|
import os, time
|
||||||
|
import json
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.chrome.service import Service
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
|
from selenium.webdriver.chrome.options import Options
|
||||||
|
import chromedriver_autoinstaller
|
||||||
|
from pydantic import parse_obj_as
|
||||||
|
from .models import UrlModel, CrawlResult
|
||||||
|
from .database import init_db, get_cached_url, cache_url
|
||||||
|
from .utils import *
|
||||||
|
from typing import List
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
from .config import *
|
||||||
|
|
||||||
|
class WebCrawler:
    """Selenium-based page crawler with a SQLite result cache.

    Fetches pages headlessly with Chrome, cleans the HTML to Markdown,
    optionally runs LLM block extraction on the Markdown, and caches every
    result keyed by URL.
    """

    def __init__(self, db_path: str):
        """Prepare the cache database and the (headless) Chrome options."""
        self.db_path = db_path
        init_db(self.db_path)
        self.options = Options()
        self.options.headless = True
        self.options.add_argument("--no-sandbox")
        self.options.add_argument("--disable-dev-shm-usage")
        # make it headless
        self.options.add_argument("--headless")

        # Automatically install or update chromedriver
        chromedriver_autoinstaller.install()

    def fetch_page(self, url_model: UrlModel, provider: str = DEFAULT_PROVIDER, api_token: str = None, extract_blocks_flag: bool = True, word_count_threshold = MIN_WORD_THRESHOLD) -> CrawlResult:
        """Crawl one URL (or serve it from cache) and return a CrawlResult.

        Parameters:
            url_model: Target URL plus the ``forced`` cache-bypass flag.
            provider: litellm provider/model string for block extraction.
            api_token: API key for the provider (optional).
            extract_blocks_flag: When True, run LLM block extraction.
            word_count_threshold: Minimum words per kept HTML element
                (clamped to MIN_WORD_THRESHOLD).
        """
        # make sure word_count_threshold is not lesser than MIN_WORD_THRESHOLD
        if word_count_threshold < MIN_WORD_THRESHOLD:
            word_count_threshold = MIN_WORD_THRESHOLD

        # Check cache first
        cached = get_cached_url(self.db_path, str(url_model.url))
        if cached and not url_model.forced:
            return CrawlResult(**{
                "url": cached[0],
                "html": cached[1],
                "cleaned_html": cached[2],
                "markdown": cached[3],
                "parsed_json": cached[4],
                "success": cached[5],
                "error_message": ""
            })

        # Initialize WebDriver for crawling
        service = Service(chromedriver_autoinstaller.install())
        driver = webdriver.Chrome(service=service, options=self.options)

        try:
            driver.get(str(url_model.url))
            WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.TAG_NAME, "html"))
            )
            html = driver.page_source
            success = True
            error_message = ""
        except Exception as e:
            html = ""
            success = False
            error_message = str(e)
        finally:
            driver.quit()

        # Extract content from HTML.
        # BUGFIX: get_content_of_website returns None when cleaning fails
        # (e.g. empty html after a failed crawl); the original then crashed
        # with AttributeError on .get — fall back to an empty dict instead.
        result = get_content_of_website(html, word_count_threshold) or {}
        cleaned_html = result.get('cleaned_html', html)
        markdown = result.get('markdown', "")

        print("Crawling is done 🚀")

        parsed_json = []
        if extract_blocks_flag:
            # Split markdown into sections of roughly CHUNK_TOKEN_THRESHOLD
            # tokens (estimated at ~1.3 tokens per whitespace-separated word).
            paragraphs = markdown.split('\n\n')
            sections = []
            chunks = []
            total_token_so_far = 0

            for paragraph in paragraphs:
                if total_token_so_far < CHUNK_TOKEN_THRESHOLD:
                    chunk = paragraph.split(' ')
                    total_token_so_far += len(chunk) * 1.3
                    chunks.append(paragraph)
                else:
                    sections.append('\n\n'.join(chunks))
                    chunks = [paragraph]
                    total_token_so_far = len(paragraph.split(' ')) * 1.3

            if chunks:
                sections.append('\n\n'.join(chunks))

            # Process sections to extract blocks
            parsed_json = []
            if provider.startswith("groq/"):
                # Sequential processing with a delay (Groq rate limits).
                for section in sections:
                    parsed_json.extend(extract_blocks(str(url_model.url), section, provider, api_token))
                    time.sleep(0.5)  # 500 ms delay between each processing
            else:
                # Parallel processing using ThreadPoolExecutor
                with ThreadPoolExecutor() as executor:
                    futures = [executor.submit(extract_blocks, str(url_model.url), section, provider, api_token) for section in sections]
                    for future in as_completed(futures):
                        parsed_json.extend(future.result())

        parsed_json = json.dumps(parsed_json)

        # Cache the result
        cleaned_html = beautify_html(cleaned_html)
        cache_url(self.db_path, str(url_model.url), html, cleaned_html, markdown, parsed_json, success)

        return CrawlResult(
            url=str(url_model.url),
            html=html,
            cleaned_html=cleaned_html,
            markdown=markdown,
            parsed_json=parsed_json,
            success=success,
            error_message=error_message
        )

    def fetch_pages(self, url_models: List[UrlModel], provider: str = DEFAULT_PROVIDER, api_token: str = None) -> List[CrawlResult]:
        """Crawl several URLs concurrently via fetch_page and return all results."""
        with ThreadPoolExecutor() as executor:
            results = list(executor.map(self.fetch_page, url_models, [provider] * len(url_models), [api_token] * len(url_models)))
        return results
|
||||||
10
docker-compose.yml
Normal file
10
docker-compose.yml
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
# Compose file for the crawler API: one uvicorn web service on port 80.
version: '3.8'

services:
  web:
    build: .
    # NOTE(review): $(nproc) is shell command substitution — it only expands
    # if this command line is run through a shell in the container; confirm
    # the image's entrypoint, otherwise uvicorn receives the literal string
    # "$(nproc)" as its --workers value.
    command: uvicorn main:app --host 0.0.0.0 --port 80 --workers $(nproc)
    ports:
      - "80:80"
    environment:
      # Unbuffered stdout/stderr so logs appear immediately in `docker logs`.
      - PYTHONUNBUFFERED=1
|
||||||
31
examples/test.py
Normal file
31
examples/test.py
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
from crawler.web_crawler import WebCrawler
|
||||||
|
from crawler.models import UrlModel
|
||||||
|
from crawler.utils import get_content_of_website
|
||||||
|
import os
|
||||||
|
|
||||||
|
def main():
    """Example: crawl a single page and print the structured result.

    Requires OPENAI_API_KEY in the environment; writes the cache to
    crawler_data.db in the working directory.
    """
    # Initialize the WebCrawler with just the database path
    crawler = WebCrawler(db_path='crawler_data.db')

    # Fetch a single page (forced=True bypasses the cache)
    single_url = UrlModel(url='https://kidocode.com', forced=True)
    result = crawler.fetch_page(
        single_url,
        provider= "openai/gpt-3.5-turbo",
        api_token = os.getenv('OPENAI_API_KEY'),
        extract_blocks_flag=True,
        word_count_threshold=5
    )
    print(result.model_dump())

    # Fetch multiple pages
    # urls = [
    #     UrlModel(url='http://example.com', forced=False),
    #     UrlModel(url='http://example.org', forced=False)
    # ]
    # results = crawler.fetch_pages(urls, provider= "openai/gpt-4-turbo", api_token = os.getenv('OPENAI_API_KEY'))
    # for res in results:
    #     print(res.model_copy())


if __name__ == '__main__':
    main()
|
||||||
154
main.py
Normal file
154
main.py
Normal file
@@ -0,0 +1,154 @@
|
|||||||
|
from fastapi import FastAPI, HTTPException, Request
|
||||||
|
from fastapi.responses import HTMLResponse
|
||||||
|
from fastapi.staticfiles import StaticFiles
|
||||||
|
from fastapi.responses import JSONResponse
|
||||||
|
from pydantic import BaseModel, HttpUrl
|
||||||
|
from typing import List, Optional
|
||||||
|
from crawler.web_crawler import WebCrawler
|
||||||
|
from crawler.models import UrlModel
|
||||||
|
import asyncio
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
import chromedriver_autoinstaller
|
||||||
|
from functools import lru_cache
|
||||||
|
from crawler.database import get_total_count
|
||||||
|
import os
|
||||||
|
import uuid
|
||||||
|
|
||||||
|
# Task management
# In-memory store for async crawl jobs: task_id -> {"status", "results"}.
# NOTE(review): per-process only and never evicted — entries accumulate.
tasks = {}

# Configuration
# Absolute directory of this file, used for locating static assets.
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
MAX_CONCURRENT_REQUESTS = 10  # Adjust this to change the maximum concurrent requests
# Number of crawl requests currently in flight; guarded by `lock`.
current_requests = 0
lock = asyncio.Lock()

app = FastAPI()

# Mount the pages directory as a static directory
app.mount("/pages", StaticFiles(directory=__location__ + "/pages"), name="pages")


chromedriver_autoinstaller.install()  # Ensure chromedriver is installed
|
||||||
|
|
||||||
|
class UrlsInput(BaseModel):
    """Request body for the /crawl and /crawl_async endpoints."""

    # Pages to crawl; each entry is validated as an HTTP(S) URL.
    urls: List[HttpUrl]
    # litellm provider/model string, e.g. "openai/gpt-3.5-turbo".
    provider_model: str
    # API key for the chosen provider (required; checked in the handlers).
    api_token: str
    # When False (default) the raw HTML is stripped from the response.
    include_raw_html: Optional[bool] = False
    # When True, bypass the SQLite cache and re-crawl.
    forced: bool = False
    # When True, run LLM block extraction on the crawled markdown.
    extract_blocks: bool = True
    # Minimum words per kept HTML element (clamped by the crawler).
    word_count_threshold: Optional[int] = 5
|
||||||
|
|
||||||
|
@lru_cache()
def get_crawler():
    """Return the process-wide WebCrawler instance (memoized via lru_cache)."""
    crawler = WebCrawler(db_path='crawler_data.db')
    return crawler
|
||||||
|
|
||||||
|
@app.get("/", response_class=HTMLResponse)
async def read_index():
    """Serve the static landing page (pages/index.html)."""
    # Explicit encoding avoids depending on the host's locale default
    # (the original used the implicit platform encoding).
    with open(f"{__location__}/pages/index.html", "r", encoding="utf-8") as file:
        html_content = file.read()
    return HTMLResponse(content=html_content, status_code=200)
|
||||||
|
|
||||||
|
@app.get("/total-count")
async def get_total_url_count():
    """Return the total number of cached crawl records as {"count": N}."""
    # get_total_count is best-effort and returns 0 on database errors.
    count = get_total_count(db_path='crawler_data.db')
    return JSONResponse(content={"count": count})
|
||||||
|
|
||||||
|
@app.post("/crawl")
async def crawl_urls(urls_input: UrlsInput, request: Request):
    """Crawl the requested URLs synchronously and return their results.

    Enforces a simple process-wide concurrency cap (MAX_CONCURRENT_REQUESTS)
    and runs the blocking WebCrawler in a thread pool so the event loop
    stays responsive.  Raw HTML is stripped from the response unless
    ``include_raw_html`` is set.
    """
    global current_requests
    # Raise error if api_token is not provided
    if not urls_input.api_token:
        raise HTTPException(status_code=401, detail="API token is required.")
    async with lock:
        if current_requests >= MAX_CONCURRENT_REQUESTS:
            raise HTTPException(status_code=429, detail="Too many requests - please try again later.")
        current_requests += 1

    try:
        # Prepare URL models for crawling
        url_models = [UrlModel(url=url, forced=urls_input.forced) for url in urls_input.urls]

        # Use ThreadPoolExecutor to run the synchronous WebCrawler in async manner
        with ThreadPoolExecutor() as executor:
            loop = asyncio.get_event_loop()
            futures = [
                loop.run_in_executor(executor, get_crawler().fetch_page, url_model, urls_input.provider_model, urls_input.api_token, urls_input.extract_blocks, urls_input.word_count_threshold)
                for url_model in url_models
            ]
            results = await asyncio.gather(*futures)

        # if include_raw_html is False, remove the raw HTML content from the results
        if not urls_input.include_raw_html:
            for result in results:
                result.html = None

        # model_dump() is the pydantic v2 spelling; .dict() is deprecated
        # and the rest of the project (examples/test.py) already uses v2.
        return {"results": [result.model_dump() for result in results]}
    finally:
        # Always release our concurrency slot, even when an exception is raised.
        async with lock:
            current_requests -= 1
|
||||||
|
|
||||||
|
@app.post("/crawl_async")
|
||||||
|
async def crawl_urls(urls_input: UrlsInput, request: Request):
|
||||||
|
global current_requests
|
||||||
|
if not urls_input.api_token:
|
||||||
|
raise HTTPException(status_code=401, detail="API token is required.")
|
||||||
|
|
||||||
|
async with lock:
|
||||||
|
if current_requests >= MAX_CONCURRENT_REQUESTS:
|
||||||
|
raise HTTPException(status_code=429, detail="Too many requests - please try again later.")
|
||||||
|
current_requests += 1
|
||||||
|
|
||||||
|
task_id = str(uuid.uuid4())
|
||||||
|
tasks[task_id] = {"status": "pending", "results": None}
|
||||||
|
|
||||||
|
try:
|
||||||
|
url_models = [UrlModel(url=url, forced=urls_input.forced) for url in urls_input.urls]
|
||||||
|
|
||||||
|
loop = asyncio.get_running_loop()
|
||||||
|
loop.create_task(
|
||||||
|
process_crawl_task(url_models, urls_input.provider_model, urls_input.api_token, task_id, urls_input.extract_blocks)
|
||||||
|
)
|
||||||
|
return {"task_id": task_id}
|
||||||
|
finally:
|
||||||
|
async with lock:
|
||||||
|
current_requests -= 1
|
||||||
|
|
||||||
|
async def process_crawl_task(url_models, provider, api_token, task_id, extract_blocks_flag):
|
||||||
|
try:
|
||||||
|
with ThreadPoolExecutor() as executor:
|
||||||
|
loop = asyncio.get_running_loop()
|
||||||
|
futures = [
|
||||||
|
loop.run_in_executor(executor, get_crawler().fetch_page, url_model, provider, api_token, extract_blocks_flag)
|
||||||
|
for url_model in url_models
|
||||||
|
]
|
||||||
|
results = await asyncio.gather(*futures)
|
||||||
|
|
||||||
|
tasks[task_id] = {"status": "done", "results": results}
|
||||||
|
except Exception as e:
|
||||||
|
tasks[task_id] = {"status": "failed", "error": str(e)}
|
||||||
|
|
||||||
|
@app.get("/task/{task_id}")
|
||||||
|
async def get_task_status(task_id: str):
|
||||||
|
task = tasks.get(task_id)
|
||||||
|
if not task:
|
||||||
|
raise HTTPException(status_code=404, detail="Task not found")
|
||||||
|
|
||||||
|
if task['status'] == 'done':
|
||||||
|
return {
|
||||||
|
"status": task['status'],
|
||||||
|
"results": [result.dict() for result in task['results']]
|
||||||
|
}
|
||||||
|
elif task['status'] == 'failed':
|
||||||
|
return {
|
||||||
|
"status": task['status'],
|
||||||
|
"error": task['error']
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
return {"status": task['status']}
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
import uvicorn
|
||||||
|
uvicorn.run(app, host="0.0.0.0", port=8000)
|
||||||
411
pages/index.html
Normal file
411
pages/index.html
Normal file
@@ -0,0 +1,411 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8" />
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||||
|
<title>Crawl4AI</title>
|
||||||
|
|
||||||
|
<link rel="preconnect" href="https://fonts.googleapis.com" />
|
||||||
|
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
|
||||||
|
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@100..900&display=swap" rel="stylesheet" />
|
||||||
|
|
||||||
|
<link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet" />
|
||||||
|
<script src="https://cdn.jsdelivr.net/npm/axios/dist/axios.min.js"></script>
|
||||||
|
<link
|
||||||
|
rel="stylesheet"
|
||||||
|
href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/styles/vs2015.min.css"
|
||||||
|
/>
|
||||||
|
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/highlight.min.js"></script>
|
||||||
|
<style>
|
||||||
|
:root {
|
||||||
|
--ifm-font-size-base: 100%;
|
||||||
|
--ifm-line-height-base: 1.65;
|
||||||
|
--ifm-font-family-base: system-ui, -apple-system, Segoe UI, Roboto, Ubuntu, Cantarell, Noto Sans,
|
||||||
|
sans-serif, BlinkMacSystemFont, "Segoe UI", Helvetica, Arial, sans-serif, "Apple Color Emoji",
|
||||||
|
"Segoe UI Emoji", "Segoe UI Symbol";
|
||||||
|
}
|
||||||
|
html {
|
||||||
|
-webkit-font-smoothing: antialiased;
|
||||||
|
-webkit-text-size-adjust: 100%;
|
||||||
|
text-size-adjust: 100%;
|
||||||
|
font: var(--ifm-font-size-base) / var(--ifm-line-height-base) var(--ifm-font-family-base);
|
||||||
|
}
|
||||||
|
body {
|
||||||
|
background-color: #1a202c;
|
||||||
|
color: #fff;
|
||||||
|
}
|
||||||
|
.tab-content {
|
||||||
|
max-height: 400px;
|
||||||
|
overflow: auto;
|
||||||
|
}
|
||||||
|
pre {
|
||||||
|
white-space: pre-wrap;
|
||||||
|
font-size: 14px;
|
||||||
|
}
|
||||||
|
pre code {
|
||||||
|
width: 100%;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<header class="bg-gray-900 text-white py-4">
|
||||||
|
<div class="container mx-auto px-4">
|
||||||
|
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler &amp; Scraper</h1>
|
||||||
|
</div>
|
||||||
|
</header>
|
||||||
|
|
||||||
|
<!-- Add a section to show the total count of websites already crawled -->
|
||||||
|
<section class="bg-gray-600 py-8">
|
||||||
|
<div class="container mx-auto px-4 flex font-bold text-xl gap-2">
|
||||||
|
<span>📊 Total Websites Processed</span>
|
||||||
|
<span id="total-count" class="text-blue-400">0</span>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section class="try-it py-8 pb-20">
|
||||||
|
<div class="container mx-auto px-4">
|
||||||
|
<h2 class="text-2xl font-bold mb-4">Try It Now</h2>
|
||||||
|
<div class="mb-4 flex w-full gap-2">
|
||||||
|
<div class="flex items-center gap-2 flex-col flex-grow">
|
||||||
|
<label for="url-input" class="text-white">URL(s)</label>
|
||||||
|
<input
|
||||||
|
type="text"
|
||||||
|
id="url-input"
|
||||||
|
value="https://kidocode.com"
|
||||||
|
class="border border-gray-600 rounded px-4 py-2 flex-grow bg-gray-800 text-white w-full"
|
||||||
|
placeholder="Enter URL(s) separated by commas"
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
<!-- Add a number set if 5 with a label word threshold -->
|
||||||
|
<div class="flex items-center gap-2 flex-col">
|
||||||
|
<label for="threshold" class="text-white">Min Words Threshold</label>
|
||||||
|
<select id="threshold" class="border border-gray-600 rounded px-4 py-3 bg-gray-800 text-white w-full">
|
||||||
|
<option value="5">5</option>
|
||||||
|
<option value="10">10</option>
|
||||||
|
<option value="15">15</option>
|
||||||
|
<option value="20">20</option>
|
||||||
|
<option value="25">25</option>
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="flex items-center gap-2 flex-col">
|
||||||
|
<label for="provider-model-select" class="text-white">Provider Model</label>
|
||||||
|
|
||||||
|
<select
|
||||||
|
id="provider-model-select"
|
||||||
|
class="border border-gray-600 rounded px-4 py-3 bg-gray-800 text-white w-full"
|
||||||
|
>
|
||||||
|
<!-- Add your option values here -->
|
||||||
|
<option value="groq/llama3-70b-8192">groq/llama3-70b-8192</option>
|
||||||
|
<option value="groq/llama3-8b-8192">groq/llama3-8b-8192</option>
|
||||||
|
<option value="openai/gpt-4-turbo">gpt-4-turbo</option>
|
||||||
|
<option value="openai/gpt-3.5-turbo">gpt-3.5-turbo</option>
|
||||||
|
<option value="anthropic/claude-3-haiku-20240307">claude-3-haiku</option>
|
||||||
|
<option value="anthropic/claude-3-opus-20240229">claude-3-opus</option>
|
||||||
|
<option value="anthropic/claude-3-sonnet-20240229">claude-3-sonnet</option>
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
|
<div class="flex items-center gap-2 flex-col">
|
||||||
|
<label for="token-input" class="text-white">API Token</label>
|
||||||
|
|
||||||
|
<input
|
||||||
|
type="password"
|
||||||
|
id="token-input"
|
||||||
|
class="border border-gray-600 rounded px-4 py-2 flex-grow bg-gray-800 text-white w-full"
|
||||||
|
placeholder="Enter Groq API token"
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
<div class="flex items-center justify-center gap-2 flex-col">
|
||||||
|
<label for="extract-blocks-checkbox" class="text-white">Extract Blocks</label>
|
||||||
|
<input type="checkbox" id="extract-blocks-checkbox" class="mr-2" checked />
|
||||||
|
</div>
|
||||||
|
<button id="crawl-btn" class="bg-blue-600 text-white px-4 py-2 rounded">Crawl</button>
|
||||||
|
</div>
|
||||||
|
<div class="grid grid-cols-1 md:grid-cols-2 gap-8">
|
||||||
|
<div id="loading" class="hidden mt-4">
|
||||||
|
<p>
|
||||||
|
Depending on the selected model, it may take up to 1 or 2 minutes to process the request.
|
||||||
|
Loading...
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
<div id="result" class="tab-container flex-1 h-full flex-col">
|
||||||
|
<div class="tab-buttons flex gap-2">
|
||||||
|
<button class="tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="json">JSON</button>
|
||||||
|
<button class="tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="cleaned-html">
|
||||||
|
Cleaned HTML
|
||||||
|
</button>
|
||||||
|
<button class="tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="markdown">
|
||||||
|
Markdown
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
<div class="tab-content code bg-gray-800 p-2 rounded h-full flex-1 border border-gray-600">
|
||||||
|
<pre class="h-full flex"><code id="json-result" class="language-json "></code></pre>
|
||||||
|
<pre
|
||||||
|
class="hidden h-full flex"
|
||||||
|
><code id="cleaned-html-result" class="language-html "></code></pre>
|
||||||
|
<pre
|
||||||
|
class="hidden h-full flex"
|
||||||
|
><code id="markdown-result" class="language-markdown "></code></pre>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div id="code_help" class="tab-container flex-1 h-full">
|
||||||
|
<div class="tab-buttons flex gap-2">
|
||||||
|
<button class="code-tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="curl">cURL</button>
|
||||||
|
<button class="code-tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="python">
|
||||||
|
Python
|
||||||
|
</button>
|
||||||
|
<button class="code-tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="nodejs">
|
||||||
|
Node.js
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
<div class="tab-content result bg-gray-800 p-2 rounded h-full flex-1 border border-gray-600">
|
||||||
|
<pre class="h-full flex relative">
|
||||||
|
<code id="curl-code" class="language-bash"></code>
|
||||||
|
<button class="absolute top-2 right-2 bg-gray-700 text-white px-2 py-1 rounded copy-btn" data-target="curl-code">Copy</button>
|
||||||
|
</pre>
|
||||||
|
<pre class="hidden h-full flex relative">
|
||||||
|
<code id="python-code" class="language-python"></code>
|
||||||
|
<button class="absolute top-2 right-2 bg-gray-700 text-white px-2 py-1 rounded copy-btn" data-target="python-code">Copy</button>
|
||||||
|
</pre>
|
||||||
|
<pre class="hidden h-full flex relative">
|
||||||
|
<code id="nodejs-code" class="language-javascript"></code>
|
||||||
|
<button class="absolute top-2 right-2 bg-gray-700 text-white px-2 py-1 rounded copy-btn" data-target="nodejs-code">Copy</button>
|
||||||
|
</pre>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section class="hero bg-gray-900 py-8">
|
||||||
|
<div class="container mx-auto px-4">
|
||||||
|
<h2 class="text-3xl font-bold mb-4">🤔 Why building this?</h2>
|
||||||
|
<p class="text-lg mb-4">
|
||||||
|
In recent times, we've witnessed a surge of startups emerging, riding the AI hype wave and charging
|
||||||
|
for services that should rightfully be accessible to everyone. 🌍💸 One such example is scraping and
|
||||||
|
crawling web pages and transforming them into a format suitable for Large Language Models (LLMs).
|
||||||
|
🕸️🤖 We believe that building a business around this is not the right approach; instead, it should
|
||||||
|
definitely be open-source. 🆓🌟 So, if you possess the skills to build such tools and share our
|
||||||
|
philosophy, we invite you to join our "Robinhood" band and help set these products free for the
|
||||||
|
benefit of all. 🤝💪
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section class="installation py-8">
|
||||||
|
<div class="container mx-auto px-4">
|
||||||
|
<h2 class="text-2xl font-bold mb-4">⚙️ Installation</h2>
|
||||||
|
<p class="mb-4">
|
||||||
|
To install and run Crawl4AI locally or on your own service, the best way is to use Docker. 🐳 Follow
|
||||||
|
these steps:
|
||||||
|
</p>
|
||||||
|
<ol class="list-decimal list-inside mb-4">
|
||||||
|
<li>
|
||||||
|
Clone the GitHub repository: 📥
|
||||||
|
<code>git clone https://github.com/unclecode/crawl4ai.git</code>
|
||||||
|
</li>
|
||||||
|
<li>Navigate to the project directory: 📂 <code>cd crawl4ai</code></li>
|
||||||
|
<li>
|
||||||
|
Build the Docker image: 🛠️ <code>docker build -t crawl4ai .</code> On Mac, follow: 🍎
|
||||||
|
<code>docker build --platform linux/amd64 -t crawl4ai .</code>
|
||||||
|
</li>
|
||||||
|
<li>Run the Docker container: ▶️ <code>docker run -p 8000:80 crawl4ai</code></li>
|
||||||
|
</ol>
|
||||||
|
<p>
|
||||||
|
For more detailed instructions and advanced configuration options, please refer to the 📚
|
||||||
|
<a href="https://github.com/unclecode/crawl4ai" class="text-blue-400">GitHub repository</a>.
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<footer class="bg-gray-900 text-white py-4">
|
||||||
|
<div class="container mx-auto px-4">
|
||||||
|
<div class="flex justify-between items-center">
|
||||||
|
<p>© 2024 Crawl4AI. All rights reserved.</p>
|
||||||
|
<div class="social-links">
|
||||||
|
<a
|
||||||
|
href="https://github.com/unclecode/crawl4ai"
|
||||||
|
class="text-white hover:text-gray-300 mx-2"
|
||||||
|
target="_blank"
|
||||||
|
>😺 GitHub</a
|
||||||
|
>
|
||||||
|
<a
|
||||||
|
href="https://twitter.com/unclecode"
|
||||||
|
class="text-white hover:text-gray-300 mx-2"
|
||||||
|
target="_blank"
|
||||||
|
>🐦 Twitter</a
|
||||||
|
>
|
||||||
|
<a
|
||||||
|
href="https://discord.gg/your-invite-link"
|
||||||
|
class="text-white hover:text-gray-300 mx-2"
|
||||||
|
target="_blank"
|
||||||
|
>💬 Discord</a
|
||||||
|
>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</footer>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
// Get the selected provider model and token from local storage
|
||||||
|
const storedProviderModel = localStorage.getItem("provider_model");
|
||||||
|
const storedToken = localStorage.getItem(storedProviderModel);
|
||||||
|
|
||||||
|
if (storedProviderModel) {
|
||||||
|
document.getElementById("provider-model-select").value = storedProviderModel;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (storedToken) {
|
||||||
|
document.getElementById("token-input").value = storedToken;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Handle provider model dropdown change
|
||||||
|
document.getElementById("provider-model-select").addEventListener("change", () => {
|
||||||
|
const selectedProviderModel = document.getElementById("provider-model-select").value;
|
||||||
|
const storedToken = localStorage.getItem(selectedProviderModel);
|
||||||
|
|
||||||
|
if (storedToken) {
|
||||||
|
document.getElementById("token-input").value = storedToken;
|
||||||
|
} else {
|
||||||
|
document.getElementById("token-input").value = "";
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// Fetch total count from the database
|
||||||
|
axios
|
||||||
|
.get("/total-count")
|
||||||
|
.then((response) => {
|
||||||
|
document.getElementById("total-count").textContent = response.data.count;
|
||||||
|
})
|
||||||
|
.catch((error) => console.error(error));
|
||||||
|
|
||||||
|
// Handle crawl button click
|
||||||
|
document.getElementById("crawl-btn").addEventListener("click", () => {
|
||||||
|
// validate input to have both URL and API token
|
||||||
|
if (!document.getElementById("url-input").value || !document.getElementById("token-input").value) {
|
||||||
|
alert("Please enter both URL(s) and API token.");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const selectedProviderModel = document.getElementById("provider-model-select").value;
|
||||||
|
const apiToken = document.getElementById("token-input").value;
|
||||||
|
const extractBlocks = document.getElementById("extract-blocks-checkbox").checked;
|
||||||
|
|
||||||
|
// Save the selected provider model and token to local storage
|
||||||
|
localStorage.setItem("provider_model", selectedProviderModel);
|
||||||
|
localStorage.setItem(selectedProviderModel, apiToken);
|
||||||
|
|
||||||
|
const urlsInput = document.getElementById("url-input").value;
|
||||||
|
const urls = urlsInput.split(",").map((url) => url.trim());
|
||||||
|
const data = {
|
||||||
|
urls: urls,
|
||||||
|
provider_model: selectedProviderModel,
|
||||||
|
api_token: apiToken,
|
||||||
|
include_raw_html: true,
|
||||||
|
forced: false,
|
||||||
|
extract_blocks: extractBlocks,
|
||||||
|
word_count_threshold: parseInt(document.getElementById("threshold").value),
|
||||||
|
};
|
||||||
|
|
||||||
|
// save api token to local storage
|
||||||
|
localStorage.setItem("api_token", document.getElementById("token-input").value);
|
||||||
|
|
||||||
|
document.getElementById("loading").classList.remove("hidden");
|
||||||
|
document.getElementById("result").classList.add("hidden");
|
||||||
|
document.getElementById("code_help").classList.add("hidden");
|
||||||
|
|
||||||
|
axios
|
||||||
|
.post("/crawl", data)
|
||||||
|
.then((response) => {
|
||||||
|
const result = response.data.results[0];
|
||||||
|
const parsedJson = JSON.parse(result.parsed_json);
|
||||||
|
document.getElementById("json-result").textContent = JSON.stringify(parsedJson, null, 2);
|
||||||
|
document.getElementById("cleaned-html-result").textContent = result.cleaned_html;
|
||||||
|
document.getElementById("markdown-result").textContent = result.markdown;
|
||||||
|
|
||||||
|
// Update code examples dynamically
|
||||||
|
// Update code examples dynamically
|
||||||
|
document.getElementById(
|
||||||
|
"curl-code"
|
||||||
|
).textContent = `curl -X POST -H "Content-Type: application/json" -d '${JSON.stringify({
|
||||||
|
...data,
|
||||||
|
api_token: "your_api_token",
|
||||||
|
})}' http://localhost:8000/crawl`;
|
||||||
|
|
||||||
|
document.getElementById(
|
||||||
|
"python-code"
|
||||||
|
).textContent = `import requests\n\ndata = ${JSON.stringify(
|
||||||
|
{ ...data, api_token: "your_api_token" },
|
||||||
|
null,
|
||||||
|
2
|
||||||
|
)}\n\nresponse = requests.post("http://localhost:8000/crawl", json=data)\nprint(response.json())`;
|
||||||
|
|
||||||
|
document.getElementById(
|
||||||
|
"nodejs-code"
|
||||||
|
).textContent = `const axios = require('axios');\n\nconst data = ${JSON.stringify(
|
||||||
|
{ ...data, api_token: "your_api_token" },
|
||||||
|
null,
|
||||||
|
2
|
||||||
|
)};\n\naxios.post("http://localhost:8000/crawl", data)\n .then(response => console.log(response.data))\n .catch(error => console.error(error));`;
|
||||||
|
// Highlight code syntax
|
||||||
|
hljs.highlightAll();
|
||||||
|
|
||||||
|
// Select JSON tab by default
|
||||||
|
document.querySelector('.tab-btn[data-tab="json"]').click();
|
||||||
|
|
||||||
|
document.getElementById("loading").classList.add("hidden");
|
||||||
|
document.getElementById("result").classList.remove("hidden");
|
||||||
|
document.getElementById("code_help").classList.remove("hidden");
|
||||||
|
|
||||||
|
// increment the total count
|
||||||
|
document.getElementById("total-count").textContent =
|
||||||
|
parseInt(document.getElementById("total-count").textContent) + 1;
|
||||||
|
})
|
||||||
|
.catch((error) => {
|
||||||
|
console.error(error);
|
||||||
|
document.getElementById("loading").classList.add("hidden");
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// Handle tab clicks
|
||||||
|
document.querySelectorAll(".tab-btn").forEach((btn) => {
|
||||||
|
btn.addEventListener("click", () => {
|
||||||
|
const tab = btn.dataset.tab;
|
||||||
|
document
|
||||||
|
.querySelectorAll(".tab-btn")
|
||||||
|
.forEach((b) => b.classList.remove("bg-blue-600", "text-white"));
|
||||||
|
btn.classList.add("bg-blue-600", "text-white");
|
||||||
|
document.querySelectorAll(".tab-content.code pre").forEach((el) => el.classList.add("hidden"));
|
||||||
|
document.getElementById(`${tab}-result`).parentElement.classList.remove("hidden");
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// Handle code tab clicks
|
||||||
|
document.querySelectorAll(".code-tab-btn").forEach((btn) => {
|
||||||
|
btn.addEventListener("click", () => {
|
||||||
|
const tab = btn.dataset.tab;
|
||||||
|
document
|
||||||
|
.querySelectorAll(".code-tab-btn")
|
||||||
|
.forEach((b) => b.classList.remove("bg-blue-600", "text-white"));
|
||||||
|
btn.classList.add("bg-blue-600", "text-white");
|
||||||
|
document.querySelectorAll(".tab-content.result pre").forEach((el) => el.classList.add("hidden"));
|
||||||
|
document.getElementById(`${tab}-code`).parentElement.classList.remove("hidden");
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// Handle copy to clipboard button clicks
|
||||||
|
document.querySelectorAll(".copy-btn").forEach((btn) => {
|
||||||
|
btn.addEventListener("click", () => {
|
||||||
|
const target = btn.dataset.target;
|
||||||
|
const code = document.getElementById(target).textContent;
|
||||||
|
navigator.clipboard.writeText(code).then(() => {
|
||||||
|
btn.textContent = "Copied!";
|
||||||
|
setTimeout(() => {
|
||||||
|
btn.textContent = "Copy";
|
||||||
|
}, 2000);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
});
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
425
pages/index_pooling.html
Normal file
425
pages/index_pooling.html
Normal file
@@ -0,0 +1,425 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8" />
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||||
|
<title>Crawl4AI</title>
|
||||||
|
|
||||||
|
<link rel="preconnect" href="https://fonts.googleapis.com" />
|
||||||
|
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
|
||||||
|
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@100..900&display=swap" rel="stylesheet" />
|
||||||
|
|
||||||
|
<link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet" />
|
||||||
|
<script src="https://cdn.jsdelivr.net/npm/axios/dist/axios.min.js"></script>
|
||||||
|
<link
|
||||||
|
rel="stylesheet"
|
||||||
|
href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/styles/vs2015.min.css"
|
||||||
|
/>
|
||||||
|
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/highlight.min.js"></script>
|
||||||
|
<style>
|
||||||
|
:root {
|
||||||
|
--ifm-font-size-base: 100%;
|
||||||
|
--ifm-line-height-base: 1.65;
|
||||||
|
--ifm-font-family-base: system-ui, -apple-system, Segoe UI, Roboto, Ubuntu, Cantarell, Noto Sans,
|
||||||
|
sans-serif, BlinkMacSystemFont, "Segoe UI", Helvetica, Arial, sans-serif, "Apple Color Emoji",
|
||||||
|
"Segoe UI Emoji", "Segoe UI Symbol";
|
||||||
|
}
|
||||||
|
html {
|
||||||
|
-webkit-font-smoothing: antialiased;
|
||||||
|
-webkit-text-size-adjust: 100%;
|
||||||
|
text-size-adjust: 100%;
|
||||||
|
font: var(--ifm-font-size-base) / var(--ifm-line-height-base) var(--ifm-font-family-base);
|
||||||
|
}
|
||||||
|
body {
|
||||||
|
background-color: #1a202c;
|
||||||
|
color: #fff;
|
||||||
|
}
|
||||||
|
.tab-content {
|
||||||
|
max-height: 400px;
|
||||||
|
overflow: auto;
|
||||||
|
}
|
||||||
|
pre {
|
||||||
|
white-space: pre-wrap;
|
||||||
|
font-size: 14px;
|
||||||
|
}
|
||||||
|
pre code {
|
||||||
|
width: 100%;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<header class="bg-gray-900 text-white py-4">
|
||||||
|
<div class="container mx-auto px-4">
|
||||||
|
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Scraper</h1>
|
||||||
|
</div>
|
||||||
|
</header>
|
||||||
|
|
||||||
|
<section class="try-it py-8 pb-20">
|
||||||
|
<div class="container mx-auto px-4">
|
||||||
|
<h2 class="text-2xl font-bold mb-4">Try It Now</h2>
|
||||||
|
<div class="mb-4 flex w-full gap-2">
|
||||||
|
<input
|
||||||
|
type="text"
|
||||||
|
id="url-input"
|
||||||
|
value="https://kidocode.com"
|
||||||
|
class="border border-gray-600 rounded px-4 py-2 flex-grow bg-gray-800 text-white"
|
||||||
|
placeholder="Enter URL(s) separated by commas"
|
||||||
|
/>
|
||||||
|
<select
|
||||||
|
id="provider-model-select"
|
||||||
|
class="border border-gray-600 rounded px-4 py-2 bg-gray-800 text-white"
|
||||||
|
>
|
||||||
|
<!-- Add your option values here -->
|
||||||
|
<option value="groq/llama3-70b-8192">groq/llama3-70b-8192</option>
|
||||||
|
<option value="groq/llama3-8b-8192">groq/llama3-8b-8192</option>
|
||||||
|
<option value="openai/gpt-4-turbo">gpt-4-turbo</option>
|
||||||
|
<option value="openai/gpt-3.5-turbo">gpt-3.5-turbo</option>
|
||||||
|
<option value="anthropic/claude-3-haiku-20240307">claude-3-haiku</option>
|
||||||
|
<option value="anthropic/claude-3-opus-20240229">claude-3-opus</option>
|
||||||
|
<option value="anthropic/claude-3-sonnet-20240229">claude-3-sonnet</option>
|
||||||
|
</select>
|
||||||
|
<input
|
||||||
|
type="password"
|
||||||
|
id="token-input"
|
||||||
|
class="border border-gray-600 rounded px-4 py-2 flex-grow bg-gray-800 text-white"
|
||||||
|
placeholder="Enter Groq API token"
|
||||||
|
/>
|
||||||
|
<div class="flex items-center justify-center">
|
||||||
|
<input type="checkbox" id="extract-blocks-checkbox" class="mr-2" checked>
|
||||||
|
<label for="extract-blocks-checkbox" class="text-white">Extract Blocks</label>
|
||||||
|
</div>
|
||||||
|
<button id="crawl-btn" class="bg-blue-600 text-white px-4 py-2 rounded">Crawl</button>
|
||||||
|
</div>
|
||||||
|
<div class="grid grid-cols-1 md:grid-cols-2 gap-8">
|
||||||
|
<div id="loading" class="hidden mt-4">
|
||||||
|
<p>Loading...</p>
|
||||||
|
</div>
|
||||||
|
<div id="result" class="tab-container flex-1 h-full flex-col">
|
||||||
|
<div class="tab-buttons flex gap-2">
|
||||||
|
<button class="tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="json">JSON</button>
|
||||||
|
<button class="tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="cleaned-html">
|
||||||
|
Cleaned HTML
|
||||||
|
</button>
|
||||||
|
<button class="tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="markdown">
|
||||||
|
Markdown
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
<div class="tab-content code bg-gray-800 p-2 rounded h-full flex-1 border border-gray-600">
|
||||||
|
<pre class="h-full flex"><code id="json-result" class="language-json "></code></pre>
|
||||||
|
<pre
|
||||||
|
class="hidden h-full flex"
|
||||||
|
><code id="cleaned-html-result" class="language-html "></code></pre>
|
||||||
|
<pre
|
||||||
|
class="hidden h-full flex"
|
||||||
|
><code id="markdown-result" class="language-markdown "></code></pre>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div id="code_help" class="tab-container flex-1 h-full">
|
||||||
|
<div class="tab-buttons flex gap-2">
|
||||||
|
<button class="code-tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="curl">cURL</button>
|
||||||
|
<button class="code-tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="python">
|
||||||
|
Python
|
||||||
|
</button>
|
||||||
|
<button class="code-tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="nodejs">
|
||||||
|
Node.js
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
<div class="tab-content result bg-gray-800 p-2 rounded h-full flex-1 border border-gray-600">
|
||||||
|
<pre class="h-full flex relative">
|
||||||
|
<code id="curl-code" class="language-bash"></code>
|
||||||
|
<button class="absolute top-2 right-2 bg-gray-700 text-white px-2 py-1 rounded copy-btn" data-target="curl-code">Copy</button>
|
||||||
|
</pre>
|
||||||
|
<pre class="hidden h-full flex relative">
|
||||||
|
<code id="python-code" class="language-python"></code>
|
||||||
|
<button class="absolute top-2 right-2 bg-gray-700 text-white px-2 py-1 rounded copy-btn" data-target="python-code">Copy</button>
|
||||||
|
</pre>
|
||||||
|
<pre class="hidden h-full flex relative">
|
||||||
|
<code id="nodejs-code" class="language-javascript"></code>
|
||||||
|
<button class="absolute top-2 right-2 bg-gray-700 text-white px-2 py-1 rounded copy-btn" data-target="nodejs-code">Copy</button>
|
||||||
|
</pre>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section class="hero bg-gray-900 py-8">
|
||||||
|
<div class="container mx-auto px-4">
|
||||||
|
<h2 class="text-3xl font-bold mb-4">🤔 Why building this?</h2>
|
||||||
|
<p class="text-lg mb-4">
|
||||||
|
In recent times, we've seen numerous startups emerging, riding the AI hype wave and charging for
services that should rightfully be accessible to everyone. 🌍💸 One example is scraping and crawling
a web page and transforming it into a form suitable for an LLM. We don't think one should build a
business out of this; it should definitely be open source. So if you possess the skills to build
such things and share this philosophy, you should join our "Robinhood" band and help set
these products free. 🆓🤝
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section class="installation py-8">
|
||||||
|
<div class="container mx-auto px-4">
|
||||||
|
<h2 class="text-2xl font-bold mb-4">⚙️ Installation</h2>
|
||||||
|
<p class="mb-4">
|
||||||
|
To install and run Crawl4AI locally or on your own service, the best way is to use Docker. 🐳 Follow
|
||||||
|
these steps:
|
||||||
|
</p>
|
||||||
|
<ol class="list-decimal list-inside mb-4">
|
||||||
|
<li>
|
||||||
|
Clone the GitHub repository: 📥
|
||||||
|
<code>git clone https://github.com/unclecode/crawl4ai.git</code>
|
||||||
|
</li>
|
||||||
|
<li>Navigate to the project directory: 📂 <code>cd crawl4ai</code></li>
|
||||||
|
<li>
|
||||||
|
Build the Docker image: 🛠️ <code>docker build -t crawl4ai .</code> On Mac, follow: 🍎
|
||||||
|
<code>docker build --platform linux/amd64 -t crawl4ai .</code>
|
||||||
|
</li>
|
||||||
|
<li>Run the Docker container: ▶️ <code>docker run -p 8000:80 crawl4ai</code></li>
|
||||||
|
</ol>
|
||||||
|
<p>
|
||||||
|
For more detailed instructions and advanced configuration options, please refer to the 📚
|
||||||
|
<a href="https://github.com/unclecode/crawl4ai" class="text-blue-400">GitHub repository</a>.
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<footer class="bg-gray-900 text-white py-4">
|
||||||
|
<div class="container mx-auto px-4">
|
||||||
|
<div class="flex justify-between items-center">
|
||||||
|
<p>© 2024 Crawl4AI. All rights reserved.</p>
|
||||||
|
<div class="social-links">
|
||||||
|
<a
|
||||||
|
href="https://github.com/unclecode/crawl4ai"
|
||||||
|
class="text-white hover:text-gray-300 mx-2"
|
||||||
|
target="_blank"
|
||||||
|
>😺 GitHub</a
|
||||||
|
>
|
||||||
|
<a
|
||||||
|
href="https://twitter.com/unclecode"
|
||||||
|
class="text-white hover:text-gray-300 mx-2"
|
||||||
|
target="_blank"
|
||||||
|
>🐦 Twitter</a
|
||||||
|
>
|
||||||
|
<a
|
||||||
|
href="https://discord.gg/your-invite-link"
|
||||||
|
class="text-white hover:text-gray-300 mx-2"
|
||||||
|
target="_blank"
|
||||||
|
>💬 Discord</a
|
||||||
|
>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</footer>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
// On page load, pre-fill the form with the last-used provider model and
// the API token previously saved for that model.
const savedModel = localStorage.getItem("provider_model");
const savedToken = localStorage.getItem(savedModel);

if (savedModel) {
  document.getElementById("provider-model-select").value = savedModel;
}
if (savedToken) {
  document.getElementById("token-input").value = savedToken;
}
|
||||||
|
|
||||||
|
// When the user switches provider model, swap the token field to whatever
// token (if any) was previously saved for that model; clear it otherwise.
document.getElementById("provider-model-select").addEventListener("change", () => {
  const model = document.getElementById("provider-model-select").value;
  const saved = localStorage.getItem(model);
  document.getElementById("token-input").value = saved ? saved : "";
});
|
||||||
|
|
||||||
|
// Load the running total of crawled pages from the backend and show it.
axios
  .get("/total-count")
  .then(({ data }) => {
    document.getElementById("total-count").textContent = data.count;
  })
  .catch((err) => console.error(err));
|
||||||
|
|
||||||
|
// Main "Crawl" button handler: validate the form, persist the user's
// provider/token choices, POST the crawl request, then render the results
// and refresh the copy-paste code examples.
document.getElementById("crawl-btn").addEventListener("click", () => {
  // Both URL(s) and an API token are required before submitting.
  if (!document.getElementById("url-input").value || !document.getElementById("token-input").value) {
    alert("Please enter both URL(s) and API token.");
    return;
  }

  const selectedProviderModel = document.getElementById("provider-model-select").value;
  const apiToken = document.getElementById("token-input").value;
  const extractBlocks = document.getElementById("extract-blocks-checkbox").checked;

  // Remember the selected model, and the token keyed by model name so that
  // switching models restores the matching token (see the change handler).
  localStorage.setItem("provider_model", selectedProviderModel);
  localStorage.setItem(selectedProviderModel, apiToken);

  // The URL field accepts a comma-separated list.
  const urlsInput = document.getElementById("url-input").value;
  const urls = urlsInput.split(",").map((url) => url.trim());
  const data = {
    urls: urls,
    provider_model: selectedProviderModel,
    api_token: apiToken,
    include_raw_html: true,
    forced: false,
    extract_blocks: extractBlocks,
  };

  // Legacy key kept for backward compatibility with older page code that
  // reads "api_token" directly.
  localStorage.setItem("api_token", apiToken);

  document.getElementById("loading").classList.remove("hidden");
  document.getElementById("result").classList.add("hidden");
  document.getElementById("code_help").classList.add("hidden");

  axios
    .post("/crawl", data)
    .then((response) => {
      const result = response.data.results[0];

      // parsed_json arrives as a JSON string; fall back to the raw text if
      // it is ever unparseable instead of aborting the whole handler.
      let prettyJson;
      try {
        prettyJson = JSON.stringify(JSON.parse(result.parsed_json), null, 2);
      } catch (e) {
        console.error("Could not parse parsed_json:", e);
        prettyJson = result.parsed_json;
      }
      document.getElementById("json-result").textContent = prettyJson;
      document.getElementById("cleaned-html-result").textContent = result.cleaned_html;
      document.getElementById("markdown-result").textContent = result.markdown;

      // Update the code examples with the request just made, masking the
      // real API token. (Previously this comment was duplicated.)
      const sampleData = { ...data, api_token: "your_api_token" };

      document.getElementById(
        "curl-code"
      ).textContent = `curl -X POST -H "Content-Type: application/json" -d '${JSON.stringify(sampleData)}' http://localhost:8000/crawl`;

      document.getElementById(
        "python-code"
      ).textContent = `import requests\n\ndata = ${JSON.stringify(
        sampleData,
        null,
        2
      )}\n\nresponse = requests.post("http://localhost:8000/crawl", json=data)\nprint(response.json())`;

      document.getElementById(
        "nodejs-code"
      ).textContent = `const axios = require('axios');\n\nconst data = ${JSON.stringify(
        sampleData,
        null,
        2
      )};\n\naxios.post("http://localhost:8000/crawl", data)\n    .then(response => console.log(response.data))\n    .catch(error => console.error(error));`;

      // Highlight code syntax in the refreshed examples.
      hljs.highlightAll();

      // Select the JSON tab by default.
      document.querySelector('.tab-btn[data-tab="json"]').click();

      document.getElementById("loading").classList.add("hidden");
      document.getElementById("result").classList.remove("hidden");
      document.getElementById("code_help").classList.remove("hidden");
    })
    .catch((error) => {
      console.error(error);
      document.getElementById("loading").classList.add("hidden");
      // Surface the failure instead of silently hiding the spinner.
      alert("Crawl request failed. See the browser console for details.");
    });
});
|
||||||
|
|
||||||
|
// Result-view tabs (JSON / cleaned HTML / markdown): highlight the clicked
// button and show only the matching *result* pane.
document.querySelectorAll(".tab-btn").forEach((btn) => {
  btn.addEventListener("click", () => {
    const tab = btn.dataset.tab;
    document
      .querySelectorAll(".tab-btn")
      .forEach((b) => b.classList.remove("bg-blue-600", "text-white"));
    btn.classList.add("bg-blue-600", "text-white");
    // BUG FIX: this handler previously hid ".tab-content.code pre" (the
    // code-example panes) while showing a "-result" pane, leaving other
    // result panes visible. Hide the result panes it actually switches.
    document.querySelectorAll(".tab-content.result pre").forEach((el) => el.classList.add("hidden"));
    document.getElementById(`${tab}-result`).parentElement.classList.remove("hidden");
  });
});
|
||||||
|
|
||||||
|
// Code-example tabs (curl / python / nodejs): highlight the clicked button
// and show only the matching *code* pane.
document.querySelectorAll(".code-tab-btn").forEach((btn) => {
  btn.addEventListener("click", () => {
    const tab = btn.dataset.tab;
    document
      .querySelectorAll(".code-tab-btn")
      .forEach((b) => b.classList.remove("bg-blue-600", "text-white"));
    btn.classList.add("bg-blue-600", "text-white");
    // BUG FIX: this handler previously hid ".tab-content.result pre" (the
    // result panes) while showing a "-code" pane — the mirror of the swap
    // in the .tab-btn handler. Hide the code panes it actually switches.
    document.querySelectorAll(".tab-content.code pre").forEach((el) => el.classList.add("hidden"));
    document.getElementById(`${tab}-code`).parentElement.classList.remove("hidden");
  });
});
|
||||||
|
|
||||||
|
// Copy-to-clipboard buttons: copy the target element's text and flash a
// brief "Copied!" confirmation on the button label.
document.querySelectorAll(".copy-btn").forEach((btn) => {
  btn.addEventListener("click", () => {
    const sourceEl = document.getElementById(btn.dataset.target);
    navigator.clipboard.writeText(sourceEl.textContent).then(() => {
      btn.textContent = "Copied!";
      setTimeout(() => {
        btn.textContent = "Copy";
      }, 2000);
    });
  });
});
|
||||||
|
|
||||||
|
|
||||||
|
// NOTE(review): this registers a SECOND "click" listener on #crawl-btn, on
// top of the handler registered earlier in this script. Both listeners fire
// on every click, so two POST /crawl requests are sent per click — and this
// one expects an async task-queue response ({task_id} + polling) while the
// earlier one expects the results inline. One of the two should be deleted;
// TODO: confirm which backend contract is current and remove the stale one.
// Also note this handler performs no URL/token validation, unlike the first.
document.getElementById("crawl-btn").addEventListener("click", () => {
  // Comma-separated URL list, same parsing as the first handler.
  const urlsInput = document.getElementById("url-input").value;
  const urls = urlsInput.split(",").map(url => url.trim());
  const apiToken = document.getElementById("token-input").value;
  const selectedProviderModel = document.getElementById("provider-model-select").value;
  const extractBlocks = document.getElementById("extract-blocks-checkbox").checked;

  // Request payload for POST /crawl.
  const data = {
    urls: urls,
    provider_model: selectedProviderModel,
    api_token: apiToken,
    include_raw_html: true,
    forced: false,
    extract_blocks: extractBlocks
  };

  // Persist the token for the next visit.
  localStorage.setItem("api_token", apiToken);

  // Switch the page into its loading state.
  document.getElementById("loading").classList.remove("hidden");
  document.getElementById("result").classList.add("hidden");
  document.getElementById("code_help").classList.add("hidden");

  // Async flow: the server replies with a task id which is then polled.
  axios.post("/crawl", data)
    .then(response => {
      const taskId = response.data.task_id;
      pollTaskStatus(taskId);
    })
    .catch(error => {
      console.error('Error during fetch:', error);
      document.getElementById("loading").classList.add("hidden");
    });
});
|
||||||
|
|
||||||
|
// Poll the backend for the status of an async crawl task until it either
// finishes, fails, or the retry budget runs out.
//
// @param {string} taskId  - id returned by POST /crawl
// @param {number} attempt - internal poll counter; callers omit it
//                           (added with a default, so existing callers
//                           `pollTaskStatus(taskId)` are unchanged)
function pollTaskStatus(taskId, attempt = 0) {
  // FIX: previously a task stuck in "pending" was polled forever, leaving
  // the loading spinner up indefinitely. Cap at 150 polls * 2s = 5 minutes.
  const MAX_POLL_ATTEMPTS = 150;
  axios.get(`/task/${taskId}`)
    .then(response => {
      const task = response.data;
      if (task.status === 'done') {
        displayResults(task.results[0]);
      } else if (task.status === 'pending') {
        if (attempt >= MAX_POLL_ATTEMPTS) {
          console.error('Task polling timed out:', taskId);
          document.getElementById("loading").classList.add("hidden");
          return;
        }
        setTimeout(() => pollTaskStatus(taskId, attempt + 1), 2000); // Poll every 2 seconds
      } else {
        // Any status other than done/pending is treated as a failure.
        console.error('Task failed:', task.error);
        document.getElementById("loading").classList.add("hidden");
      }
    })
    .catch(error => {
      console.error('Error polling task status:', error);
      document.getElementById("loading").classList.add("hidden");
    });
}
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
13
requirements.txt
Normal file
13
requirements.txt
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
fastapi
|
||||||
|
uvicorn
|
||||||
|
selenium
|
||||||
|
pydantic
|
||||||
|
aiohttp
|
||||||
|
aiosqlite
|
||||||
|
chromedriver_autoinstaller
|
||||||
|
httpx
|
||||||
|
requests
|
||||||
|
bs4
|
||||||
|
html2text
|
||||||
|
litellm
|
||||||
|
python-dotenv
|
||||||
30
setup.py
Normal file
30
setup.py
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
from setuptools import setup, find_packages

# Mirror the runtime dependencies from requirements.txt so pip installs the
# same packages; skip blank lines and comments, and close the file properly
# (the original leaked both file handles via bare open(...).read()).
with open("requirements.txt", encoding="utf-8") as f:
    requirements = [
        line.strip()
        for line in f
        if line.strip() and not line.strip().startswith("#")
    ]

# README is used as the long description rendered on PyPI.
with open("README.md", encoding="utf-8") as f:
    long_description = f.read()

setup(
    name="Crawl4AI",
    version="0.1.0",
    description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/unclecode/crawl4ai",
    author="Unclecode",
    author_email="unclecode@kidocode.com",
    # NOTE(review): license="MIT" disagrees with the "Apache Software
    # License" classifier below — confirm the intended license and make
    # the two declarations match.
    license="MIT",
    packages=find_packages(),
    install_requires=requirements,
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: Apache Software License",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
    ],
    python_requires=">=3.7",
)
|
||||||
31
test.py
Normal file
31
test.py
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
from crawler.web_crawler import WebCrawler
|
||||||
|
from crawler.models import UrlModel
|
||||||
|
from crawler.utils import get_content_of_website
|
||||||
|
import os
|
||||||
|
|
||||||
|
def main():
    """Smoke-test the WebCrawler: crawl a single page and dump the result."""
    # The crawler only needs a database path to be constructed.
    crawler = WebCrawler(db_path='crawler_data.db')

    # Crawl one page, forcing a fresh fetch (forced=True bypasses any cache).
    page = UrlModel(url='https://kidocode.com', forced=True)
    result = crawler.fetch_page(
        page,
        provider="openai/gpt-3.5-turbo",
        api_token=os.getenv('OPENAI_API_KEY'),
        extract_blocks_flag=True,
        word_count_threshold=5,
    )
    print(result.model_dump())

    # Example of crawling several pages in one call:
    # urls = [
    #     UrlModel(url='http://example.com', forced=False),
    #     UrlModel(url='http://example.org', forced=False),
    # ]
    # results = crawler.fetch_pages(urls, provider="openai/gpt-4-turbo", api_token=os.getenv('OPENAI_API_KEY'))
    # for res in results:
    #     print(res.model_copy())


if __name__ == '__main__':
    main()
|
||||||
Reference in New Issue
Block a user