Initial Commit
This commit is contained in:
165
.gitignore
vendored
Normal file
165
.gitignore
vendored
Normal file
@@ -0,0 +1,165 @@
|
|||||||
|
# Byte-compiled / optimized / DLL files
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
|
||||||
|
# C extensions
|
||||||
|
*.so
|
||||||
|
|
||||||
|
# Distribution / packaging
|
||||||
|
.Python
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
wheels/
|
||||||
|
share/python-wheels/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
MANIFEST
|
||||||
|
|
||||||
|
# PyInstaller
|
||||||
|
# Usually these files are written by a python script from a template
|
||||||
|
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||||
|
*.manifest
|
||||||
|
*.spec
|
||||||
|
|
||||||
|
# Installer logs
|
||||||
|
pip-log.txt
|
||||||
|
pip-delete-this-directory.txt
|
||||||
|
|
||||||
|
# Unit test / coverage reports
|
||||||
|
htmlcov/
|
||||||
|
.tox/
|
||||||
|
.nox/
|
||||||
|
.coverage
|
||||||
|
.coverage.*
|
||||||
|
.cache
|
||||||
|
nosetests.xml
|
||||||
|
coverage.xml
|
||||||
|
*.cover
|
||||||
|
*.py,cover
|
||||||
|
.hypothesis/
|
||||||
|
.pytest_cache/
|
||||||
|
cover/
|
||||||
|
|
||||||
|
# Translations
|
||||||
|
*.mo
|
||||||
|
*.pot
|
||||||
|
|
||||||
|
# Django stuff:
|
||||||
|
*.log
|
||||||
|
local_settings.py
|
||||||
|
db.sqlite3
|
||||||
|
db.sqlite3-journal
|
||||||
|
|
||||||
|
# Flask stuff:
|
||||||
|
instance/
|
||||||
|
.webassets-cache
|
||||||
|
|
||||||
|
# Scrapy stuff:
|
||||||
|
.scrapy
|
||||||
|
|
||||||
|
# Sphinx documentation
|
||||||
|
docs/_build/
|
||||||
|
|
||||||
|
# PyBuilder
|
||||||
|
.pybuilder/
|
||||||
|
target/
|
||||||
|
|
||||||
|
# Jupyter Notebook
|
||||||
|
.ipynb_checkpoints
|
||||||
|
|
||||||
|
# IPython
|
||||||
|
profile_default/
|
||||||
|
ipython_config.py
|
||||||
|
|
||||||
|
# pyenv
|
||||||
|
# For a library or package, you might want to ignore these files since the code is
|
||||||
|
# intended to run in multiple environments; otherwise, check them in:
|
||||||
|
# .python-version
|
||||||
|
|
||||||
|
# pipenv
|
||||||
|
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||||
|
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||||
|
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||||
|
# install all needed dependencies.
|
||||||
|
#Pipfile.lock
|
||||||
|
|
||||||
|
# poetry
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||||
|
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||||
|
# commonly ignored for libraries.
|
||||||
|
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||||
|
#poetry.lock
|
||||||
|
|
||||||
|
# pdm
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||||
|
#pdm.lock
|
||||||
|
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||||
|
# in version control.
|
||||||
|
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
||||||
|
.pdm.toml
|
||||||
|
.pdm-python
|
||||||
|
.pdm-build/
|
||||||
|
|
||||||
|
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||||
|
__pypackages__/
|
||||||
|
|
||||||
|
# Celery stuff
|
||||||
|
celerybeat-schedule
|
||||||
|
celerybeat.pid
|
||||||
|
|
||||||
|
# SageMath parsed files
|
||||||
|
*.sage.py
|
||||||
|
|
||||||
|
# Environments
|
||||||
|
.env
|
||||||
|
.venv
|
||||||
|
env/
|
||||||
|
venv/
|
||||||
|
ENV/
|
||||||
|
env.bak/
|
||||||
|
venv.bak/
|
||||||
|
|
||||||
|
# Spyder project settings
|
||||||
|
.spyderproject
|
||||||
|
.spyproject
|
||||||
|
|
||||||
|
# Rope project settings
|
||||||
|
.ropeproject
|
||||||
|
|
||||||
|
# mkdocs documentation
|
||||||
|
/site
|
||||||
|
|
||||||
|
# mypy
|
||||||
|
.mypy_cache/
|
||||||
|
.dmypy.json
|
||||||
|
dmypy.json
|
||||||
|
|
||||||
|
# Pyre type checker
|
||||||
|
.pyre/
|
||||||
|
|
||||||
|
# pytype static type analyzer
|
||||||
|
.pytype/
|
||||||
|
|
||||||
|
# Cython debug symbols
|
||||||
|
cython_debug/
|
||||||
|
|
||||||
|
# PyCharm
|
||||||
|
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||||
|
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||||
|
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||||
|
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||||
|
#.idea/
|
||||||
|
|
||||||
|
Crawl4AI.egg-info/
|
||||||
|
Crawl4AI.egg-info/*
|
||||||
40
Dockerfile
Normal file
40
Dockerfile
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
# Use an official Python runtime as a parent image
FROM python:3.10-slim

# Set the working directory in the container
WORKDIR /usr/src/app

# Install Python dependencies first so this layer stays cached until
# requirements.txt changes (copying the whole tree before `pip install`
# invalidates the cache on every source edit).
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Install dependencies for Chrome.
# NOTE: `apt-key add` is deprecated; store the Google signing key in a
# dedicated keyring and reference it with `signed-by`. Use `>` (not `>>`)
# so rebuilding the layer cannot accumulate duplicate source entries.
RUN apt-get update && apt-get install -y --no-install-recommends \
    wget \
    xvfb \
    unzip \
    curl \
    gnupg2 \
    ca-certificates \
    apt-transport-https \
    software-properties-common \
    && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /usr/share/keyrings/google-chrome.gpg \
    && echo "deb [arch=amd64 signed-by=/usr/share/keyrings/google-chrome.gpg] http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list \
    && apt-get update \
    && apt-get install -y google-chrome-stable \
    && rm -rf /var/lib/apt/lists/*

# Copy the application source into the container at /usr/src/app
COPY . .

# Set display port and dbus env to avoid hanging
ENV DISPLAY=:99
ENV DBUS_SESSION_BUS_ADDRESS=/dev/null

# Make port 80 available to the world outside this container
EXPOSE 80

# Disable stdout/stderr buffering (the legacy `ENV key value` form is deprecated)
ENV PYTHONUNBUFFERED=1

# Run uvicorn
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
|
||||||
51
LICENSE
Normal file
51
LICENSE
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
Apache License
|
||||||
|
Version 2.0, January 2004
|
||||||
|
http://www.apache.org/licenses/
|
||||||
|
|
||||||
|
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||||
|
|
||||||
|
1. Definitions.
|
||||||
|
|
||||||
|
"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.
|
||||||
|
|
||||||
|
"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.
|
||||||
|
|
||||||
|
"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
|
||||||
|
|
||||||
|
"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.
|
||||||
|
|
||||||
|
"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.
|
||||||
|
|
||||||
|
"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
|
||||||
|
|
||||||
|
"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).
|
||||||
|
|
||||||
|
"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.
|
||||||
|
|
||||||
|
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
|
||||||
|
|
||||||
|
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.
|
||||||
|
|
||||||
|
2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.
|
||||||
|
|
||||||
|
3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.
|
||||||
|
|
||||||
|
4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
|
||||||
|
|
||||||
|
You must give any other recipients of the Work or Derivative Works a copy of this License; and
|
||||||
|
You must cause any modified files to carry prominent notices stating that You changed the files; and
|
||||||
|
You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and
|
||||||
|
If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.
|
||||||
|
You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.
|
||||||
|
|
||||||
|
5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.
|
||||||
|
|
||||||
|
6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.
|
||||||
|
|
||||||
|
7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.
|
||||||
|
|
||||||
|
8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
|
||||||
|
|
||||||
|
9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
|
||||||
|
|
||||||
|
END OF TERMS AND CONDITIONS
|
||||||
199
README.md
Normal file
199
README.md
Normal file
@@ -0,0 +1,199 @@
|
|||||||
|
# Crawl4AI 🕷️🤖
|
||||||
|
|
||||||
|
[](https://github.com/unclecode/crawl4ai/stargazers)
|
||||||
|
[](https://github.com/unclecode/crawl4ai/network/members)
|
||||||
|
[](https://github.com/unclecode/crawl4ai/issues)
|
||||||
|
[](https://github.com/unclecode/crawl4ai/pulls)
|
||||||
|
[](https://github.com/unclecode/crawl4ai/blob/main/LICENSE)
|
||||||
|
|
||||||
|
Crawl4AI is a powerful, free web crawling service designed to extract useful information from web pages and make it accessible for large language models (LLMs) and AI applications. 🆓🌐
|
||||||
|
|
||||||
|
## Features ✨
|
||||||
|
|
||||||
|
- 🕷️ Efficient web crawling to extract valuable data from websites
|
||||||
|
- 🤖 LLM-friendly output formats (JSON, cleaned HTML, markdown)
|
||||||
|
- 🌍 Supports crawling multiple URLs simultaneously
|
||||||
|
- 🌃 Replace media tags with ALT.
|
||||||
|
- 🆓 Completely free to use and open-source
|
||||||
|
|
||||||
|
## Getting Started 🚀
|
||||||
|
|
||||||
|
To get started with Crawl4AI, simply visit our web application at [https://crawl4ai.uccode.io](https://crawl4ai.uccode.io) and enter the URL(s) you want to crawl. The application will process the URLs and provide you with the extracted data in various formats.
|
||||||
|
|
||||||
|
## Installation 💻
|
||||||
|
|
||||||
|
To install and run Crawl4AI locally or on your own server, follow these steps:
|
||||||
|
|
||||||
|
1. Clone the repository:
|
||||||
|
```
|
||||||
|
git clone https://github.com/your-username/crawl4ai.git
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Navigate to the project directory:
|
||||||
|
```
|
||||||
|
cd crawl4ai
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Create a `.env` file in the root folder and set your Groq API token:
|
||||||
|
```
|
||||||
|
GROQ_API_TOKEN=your_groq_api_token
|
||||||
|
```
|
||||||
|
|
||||||
|
4. Build the Docker image:
|
||||||
|
```
|
||||||
|
docker build -t crawl4ai .
|
||||||
|
```
|
||||||
|
For Mac users, use the following command instead:
|
||||||
|
```
|
||||||
|
docker build --platform linux/amd64 -t crawl4ai .
|
||||||
|
```
|
||||||
|
|
||||||
|
5. Run the Docker container:
|
||||||
|
```
|
||||||
|
docker run -p 8000:80 crawl4ai
|
||||||
|
```
|
||||||
|
|
||||||
|
6. Access the application at `http://localhost:8000`.
|
||||||
|
|
||||||
|
For more detailed instructions and advanced configuration options, please refer to the [installation guide](https://github.com/your-username/crawl4ai/blob/main/INSTALL.md).
|
||||||
|
|
||||||
|
## Usage with Python 🐍
|
||||||
|
|
||||||
|
Here's an example of how to use Crawl4AI with Python to crawl a webpage and retrieve the extracted data:
|
||||||
|
|
||||||
|
1. Make sure you have the `requests` library installed. You can install it using pip:
|
||||||
|
```
|
||||||
|
pip install requests
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Use the following Python code to send a request to the Crawl4AI server and retrieve the crawled data:
|
||||||
|
```python
|
||||||
|
import requests
|
||||||
|
import os
|
||||||
|
|
||||||
|
url = "http://localhost:8000/crawl" # Replace with the appropriate server URL
|
||||||
|
data = {
|
||||||
|
"urls": [
|
||||||
|
"https://example.com"
|
||||||
|
],
|
||||||
|
"provider_model": "groq/llama3-70b-8192",
|
||||||
|
"api_token": "your_api_token",
|
||||||
|
"include_raw_html": true,
|
||||||
|
"forced": false,
|
||||||
|
"extract_blocks": true,
|
||||||
|
"word_count_threshold": 5
|
||||||
|
}
|
||||||
|
|
||||||
|
response = requests.post(url, json=data)
|
||||||
|
|
||||||
|
if response.status_code == 200:
|
||||||
|
result = response.json()["results"][0]
|
||||||
|
print("Parsed JSON:")
|
||||||
|
print(result["parsed_json"])
|
||||||
|
print("\nCleaned HTML:")
|
||||||
|
print(result["cleaned_html"])
|
||||||
|
print("\nMarkdown:")
|
||||||
|
print(result["markdown"])
|
||||||
|
else:
|
||||||
|
print("Error:", response.status_code, response.text)
|
||||||
|
```
|
||||||
|
|
||||||
|
This code sends a POST request to the Crawl4AI server running on localhost, specifying the target URL (`https://example.com`) and the desired options (`provider_model`, `api_token`, `include_raw_html`, and `forced`). The server processes the request and returns the crawled data in JSON format.
|
||||||
|
|
||||||
|
The response from the server includes the parsed JSON, cleaned HTML, and markdown representations of the crawled webpage. You can access and use this data in your Python application as needed.
|
||||||
|
|
||||||
|
Make sure to replace `"http://localhost:8000/crawl"` with the appropriate server URL if your Crawl4AI server is running on a different host or port.
|
||||||
|
|
||||||
|
## Using Crawl4AI as a Python Library 📚
|
||||||
|
|
||||||
|
You can also use Crawl4AI as a Python library in your own projects. Here's an example of how to use the Crawl4AI library:
|
||||||
|
|
||||||
|
1. Install the required dependencies:
|
||||||
|
```
|
||||||
|
pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Import the necessary modules and initialize the `WebCrawler`:
|
||||||
|
```python
|
||||||
|
from crawler.web_crawler import WebCrawler
|
||||||
|
from crawler.models import UrlModel
|
||||||
|
import os
|
||||||
|
|
||||||
|
crawler = WebCrawler(db_path='crawler_data.db')
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Fetch a single page:
|
||||||
|
```python
|
||||||
|
single_url = UrlModel(url='https://kidocode.com', forced=True)
|
||||||
|
result = crawler.fetch_page(
|
||||||
|
single_url,
|
||||||
|
provider= "openai/gpt-3.5-turbo",
|
||||||
|
api_token = os.getenv('OPENAI_API_KEY'),
|
||||||
|
extract_blocks_flag=True,
|
||||||
|
word_count_threshold=5 # Minimum word count for a HTML tag to be considered as a worthy block
|
||||||
|
)
|
||||||
|
print(result.model_dump())
|
||||||
|
```
|
||||||
|
|
||||||
|
4. Fetch multiple pages:
|
||||||
|
```python
|
||||||
|
urls = [
|
||||||
|
UrlModel(url='http://example.com', forced=False),
|
||||||
|
UrlModel(url='http://example.org', forced=False)
|
||||||
|
]
|
||||||
|
results = crawler.fetch_pages(
|
||||||
|
urls,
|
||||||
|
provider= "openai/gpt-3.5-turbo",
|
||||||
|
api_token = os.getenv('OPENAI_API_KEY'),
|
||||||
|
extract_blocks_flag=True,
|
||||||
|
word_count_threshold=5
|
||||||
|
)
|
||||||
|
|
||||||
|
for res in results:
|
||||||
|
print(res.json())
|
||||||
|
```
|
||||||
|
|
||||||
|
This code demonstrates how to use the Crawl4AI library to fetch a single page or multiple pages. The `WebCrawler` is initialized with the path to the database, and the `fetch_page` and `fetch_pages` methods are used to crawl the specified URLs.
|
||||||
|
|
||||||
|
Make sure to set the `GROQ_API_TOKEN` environment variable with your Groq API token when using the library.
|
||||||
|
|
||||||
|
That's it! You can now integrate Crawl4AI into your Python projects and leverage its web crawling capabilities. 🎉
|
||||||
|
|
||||||
|
## 📖 Parameters
|
||||||
|
|
||||||
|
| Parameter | Description | Required | Default Value |
|
||||||
|
|----------------------|-------------------------------------------------------------------------------------------------|----------|---------------|
|
||||||
|
| `urls` | A list of URLs to crawl and extract data from. | Yes | - |
|
||||||
|
| `provider_model` | The provider and model to use for extracting relevant information (e.g., "groq/llama3-70b-8192"). | Yes | - |
|
||||||
|
| `api_token` | Your API token for the specified provider. | Yes | - |
|
||||||
|
| `include_raw_html` | Whether to include the raw HTML content in the response. | No | `false` |
|
||||||
|
| `forced` | Whether to force a fresh crawl even if the URL has been previously crawled. | No | `false` |
|
||||||
|
| `extract_blocks` | Whether to extract meaningful blocks of text from the HTML. | No | `false` |
|
||||||
|
| `word_count_threshold` | The minimum number of words a block must contain to be considered meaningful (minimum value is 5). | No | `5` |
|
||||||
|
|
||||||
|
|
||||||
|
## Contributing 🤝
|
||||||
|
|
||||||
|
We welcome contributions from the open-source community to help improve Crawl4AI and make it even more valuable for AI enthusiasts and developers. To contribute, please follow these steps:
|
||||||
|
|
||||||
|
1. Fork the repository.
|
||||||
|
2. Create a new branch for your feature or bug fix.
|
||||||
|
3. Make your changes and commit them with descriptive messages.
|
||||||
|
4. Push your changes to your forked repository.
|
||||||
|
5. Submit a pull request to the main repository.
|
||||||
|
|
||||||
|
For more information on contributing, please see our [contribution guidelines](https://github.com/unclecode/crawl4ai/blob/main/CONTRIBUTING.md).
|
||||||
|
|
||||||
|
## License 📄
|
||||||
|
|
||||||
|
Crawl4AI is released under the [Apache License 2.0](https://github.com/unclecode/crawl4ai/blob/main/LICENSE).
|
||||||
|
|
||||||
|
## Contact 📧
|
||||||
|
|
||||||
|
If you have any questions, suggestions, or feedback, please feel free to reach out to us:
|
||||||
|
|
||||||
|
- GitHub: [unclecode](https://github.com/unclecode)
|
||||||
|
- Twitter: [@unclecode](https://twitter.com/unclecode)
|
||||||
|
- Discord: [your-invite-link](https://discord.gg/your-invite-link)
|
||||||
|
|
||||||
|
Let's work together to make the web more accessible and useful for AI applications! 💪🌐🤖
|
||||||
1
crawler/__init__.py
Normal file
1
crawler/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
from .web_crawler import WebCrawler
|
||||||
24
crawler/config.py
Normal file
24
crawler/config.py
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
import os

# python-dotenv is a development convenience; the config module must still be
# importable when it is not installed (e.g. in a minimal production image).
try:
    from dotenv import load_dotenv

    load_dotenv()  # Load environment variables from a local .env file.
except ImportError:
    pass

# Default provider/model used when the caller does not specify one.
DEFAULT_PROVIDER = "openai/gpt-4-turbo"

# Provider-model dictionary: maps each supported provider/model to the API
# token read from the environment (a placeholder string when unset).
PROVIDER_MODELS = {
    "groq/llama3-70b-8192": os.getenv("GROQ_API_KEY", "YOUR_GROQ_TOKEN"),
    "groq/llama3-8b-8192": os.getenv("GROQ_API_KEY", "YOUR_GROQ_TOKEN"),
    "openai/gpt-3.5-turbo": os.getenv("OPENAI_API_KEY", "YOUR_OPENAI_TOKEN"),
    "openai/gpt-4-turbo": os.getenv("OPENAI_API_KEY", "YOUR_OPENAI_TOKEN"),
    "anthropic/claude-3-haiku-20240307": os.getenv("ANTHROPIC_API_KEY", "YOUR_ANTHROPIC_TOKEN"),
    "anthropic/claude-3-opus-20240229": os.getenv("ANTHROPIC_API_KEY", "YOUR_ANTHROPIC_TOKEN"),
    "anthropic/claude-3-sonnet-20240229": os.getenv("ANTHROPIC_API_KEY", "YOUR_ANTHROPIC_TOKEN"),
}

# Chunk token threshold (presumably the chunk size used when splitting
# content for the LLM — confirm against the extraction code).
CHUNK_TOKEN_THRESHOLD = 1000

# Minimum number of words an HTML tag must contain to be considered.
MIN_WORD_THRESHOLD = 5
|
||||||
53
crawler/database.py
Normal file
53
crawler/database.py
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
import sqlite3
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
def init_db(db_path: str) -> None:
    """Create the ``crawled_data`` cache table at *db_path* if it is missing.

    The table stores one row per crawled URL (raw HTML, cleaned HTML,
    markdown, parsed JSON, and a success flag). Idempotent: safe to call
    on every startup.
    """
    conn = sqlite3.connect(db_path)
    try:
        # Connection.execute creates the cursor implicitly; the explicit
        # cursor object added nothing.
        conn.execute('''
            CREATE TABLE IF NOT EXISTS crawled_data (
                url TEXT PRIMARY KEY,
                html TEXT,
                cleaned_html TEXT,
                markdown TEXT,
                parsed_json TEXT,
                success BOOLEAN
            )
        ''')
        conn.commit()
    finally:
        # The original leaked the connection if execute() raised.
        conn.close()
|
||||||
|
|
||||||
|
def get_cached_url(db_path: str, url: str) -> Optional[tuple]:
    """Return the cached row for *url*, or ``None`` when it was never crawled.

    The row is ``(url, html, cleaned_html, markdown, parsed_json, success)``
    in the column order created by ``init_db``.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.execute(
            'SELECT url, html, cleaned_html, markdown, parsed_json, success '
            'FROM crawled_data WHERE url = ?',
            (url,),
        )
        return cursor.fetchone()
    finally:
        # The original leaked the connection if the query raised.
        conn.close()
|
||||||
|
|
||||||
|
def cache_url(db_path: str, url: str, html: str, cleaned_html: str, markdown: str, parsed_json: str, success: bool) -> None:
    """Insert or update the cached crawl result for *url* (upsert).

    An existing row for the same URL is overwritten with the new values.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute('''
            INSERT INTO crawled_data (url, html, cleaned_html, markdown, parsed_json, success)
            VALUES (?, ?, ?, ?, ?, ?)
            ON CONFLICT(url) DO UPDATE SET
                html = excluded.html,
                cleaned_html = excluded.cleaned_html,
                markdown = excluded.markdown,
                parsed_json = excluded.parsed_json,
                success = excluded.success
        ''', (str(url), html, cleaned_html, markdown, parsed_json, success))
        # str(url): callers may pass a pydantic HttpUrl rather than a plain str.
        conn.commit()
    finally:
        # The original leaked the connection if the statement raised.
        conn.close()
|
||||||
|
|
||||||
|
def get_total_count(db_path: str) -> int:
    """Return the number of cached rows, or 0 on any database error.

    Best-effort by design (e.g. the table may not exist yet), but the
    fallback is limited to SQLite errors instead of the original bare
    ``except Exception`` that silently swallowed every failure.
    """
    try:
        conn = sqlite3.connect(db_path)
        try:
            row = conn.execute('SELECT COUNT(*) FROM crawled_data').fetchone()
            return row[0]
        finally:
            # The original leaked the connection when the query raised.
            conn.close()
    except sqlite3.Error:
        return 0
|
||||||
15
crawler/models.py
Normal file
15
crawler/models.py
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
from pydantic import BaseModel, HttpUrl
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
class UrlModel(BaseModel):
    """A single crawl target: the URL to fetch and its cache policy."""

    url: HttpUrl
    # When True, re-crawl the URL even if a cached result exists.
    forced: bool = False
|
||||||
|
|
||||||
|
class CrawlResult(BaseModel):
    """Outcome of crawling a single URL.

    ``url``, ``html`` and ``success`` are always present; the remaining
    fields are filled in only when the corresponding processing step ran.
    """

    url: str
    html: str
    success: bool
    # PEP 484 forbids implicit Optional (`str = None`), and pydantic v2
    # rejects it; `str | None` is valid on the project's Python 3.10 base.
    cleaned_html: str | None = None
    markdown: str | None = None
    parsed_json: str | None = None
    error_message: str | None = None
|
||||||
110
crawler/prompts.py
Normal file
110
crawler/prompts.py
Normal file
@@ -0,0 +1,110 @@
|
|||||||
|
# NOTE(review): this constant is immediately re-assigned further down in this
# module (a shorter variant without the "questions" key), so this first
# definition is dead as written.  Rename one of the two if both are needed.
# Fixed typos: "YHere" -> "Here", "Make sur" -> "Make sure".
PROMPT_EXTRACT_BLOCKS = """Here is the URL of the webpage:
<url>{URL}</url>

And here is the cleaned HTML content of that webpage:
<html>
{HTML}
</html>

Your task is to break down this HTML content into semantically relevant blocks, and for each block, generate a JSON object with the following keys:

- index: an integer representing the index of the block in the content
- tags: a list of semantic tags that are relevant to the content of the block
- content: a list of strings containing the text content of the block
- questions: a list of 3 questions that a user may ask about the content in this block

To generate the JSON objects:

1. Carefully read through the HTML content and identify logical breaks or shifts in the content that would warrant splitting it into separate blocks.

2. For each block:
   a. Assign it an index based on its order in the content.
   b. Analyze the content and generate a list of relevant semantic tags that describe what the block is about.
   c. Extract the text content, clean it up if needed, and store it as a list of strings in the "content" field.
   d. Come up with 3 questions that a user might ask about this specific block of content, based on the tags and content. The questions should be relevant and answerable by the content in the block.

3. Ensure that the order of the JSON objects matches the order of the blocks as they appear in the original HTML content.

4. Double-check that each JSON object includes all required keys (index, tags, content, questions) and that the values are in the expected format (integer, list of strings, etc.).

5. Make sure the generated JSON is complete and parsable, with no errors or omissions.

6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.

Please provide your output within <blocks> tags, like this:

<blocks>
[{
    "index": 0,
    "tags": ["introduction", "overview"],
    "content": ["This is the first paragraph of the article, which provides an introduction and overview of the main topic."],
    "questions": [
        "What is the main topic of this article?",
        "What can I expect to learn from reading this article?",
        "Is this article suitable for beginners or experts in the field?"
    ]
},
{
    "index": 1,
    "tags": ["history", "background"],
    "content": ["This is the second paragraph, which delves into the history and background of the topic.",
    "It provides context and sets the stage for the rest of the article."],
    "questions": [
        "What historical events led to the development of this topic?",
        "How has the understanding of this topic evolved over time?",
        "What are some key milestones in the history of this topic?"
    ]
}]
</blocks>

Remember, the output should be a complete, parsable JSON wrapped in <blocks> tags, with no omissions or errors. The JSON objects should semantically break down the content into relevant blocks, maintaining the original order."""
|
||||||
|
|
||||||
|
# NOTE(review): this re-assignment overrides the longer variant defined above;
# downstream code (crawler/utils.py imports PROMPT_EXTRACT_BLOCKS) therefore
# always gets this shorter, questions-free version.
# Fixed typos: "YHere" -> "Here", "Make sur" -> "Make sure",
# "EXACTLY SAME AS GIVE DATA" -> "EXACTLY THE SAME AS THE GIVEN DATA".
PROMPT_EXTRACT_BLOCKS = """Here is the URL of the webpage:
<url>{URL}</url>

And here is the cleaned HTML content of that webpage:
<html>
{HTML}
</html>

Your task is to break down this HTML content into semantically relevant blocks, and for each block, generate a JSON object with the following keys:

- index: an integer representing the index of the block in the content
- content: a list of strings containing the text content of the block

To generate the JSON objects:

1. Carefully read through the HTML content and identify logical breaks or shifts in the content that would warrant splitting it into separate blocks.

2. For each block:
   a. Assign it an index based on its order in the content.
   b. Analyze the content and generate ONE semantic tag that describes what the block is about.
   c. Extract the text content, EXACTLY THE SAME AS THE GIVEN DATA, clean it up if needed, and store it as a list of strings in the "content" field.

3. Ensure that the order of the JSON objects matches the order of the blocks as they appear in the original HTML content.

4. Double-check that each JSON object includes all required keys (index, tag, content) and that the values are in the expected format (integer, list of strings, etc.).

5. Make sure the generated JSON is complete and parsable, with no errors or omissions.

6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.

7. Never alter the extracted content, just copy and paste it as it is.

Please provide your output within <blocks> tags, like this:

<blocks>
[{
    "index": 0,
    "tags": ["introduction"],
    "content": ["This is the first paragraph of the article, which provides an introduction and overview of the main topic."]
},
{
    "index": 1,
    "tags": ["background"],
    "content": ["This is the second paragraph, which delves into the history and background of the topic.",
    "It provides context and sets the stage for the rest of the article."]
}]
</blocks>

Remember, the output should be a complete, parsable JSON wrapped in <blocks> tags, with no omissions or errors. The JSON objects should semantically break down the content into relevant blocks, maintaining the original order."""
|
||||||
400
crawler/utils.py
Normal file
400
crawler/utils.py
Normal file
@@ -0,0 +1,400 @@
|
|||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup, Comment, element, Tag, NavigableString
|
||||||
|
import html2text
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import os
|
||||||
|
import litellm
|
||||||
|
from litellm import completion, batch_completion
|
||||||
|
from .prompts import PROMPT_EXTRACT_BLOCKS
|
||||||
|
from .config import *
|
||||||
|
import re
|
||||||
|
import html
|
||||||
|
|
||||||
|
|
||||||
|
def beautify_html(escaped_html):
    """Unescape an HTML-escaped string and return it pretty-printed.

    Parameters:
        escaped_html (str): Markup that may contain entities such as
            ``&lt;`` / ``&gt;``.

    Returns:
        str: The same markup, unescaped and re-indented by BeautifulSoup.
    """
    # First undo entity escaping, then let the parser normalise the layout.
    raw_markup = html.unescape(escaped_html)
    return BeautifulSoup(raw_markup, 'html.parser').prettify()
|
||||||
|
|
||||||
|
def split_and_parse_json_objects(json_string):
    """Split a string holding a JSON array of objects and parse each object.

    Unlike ``json.loads`` on the whole string, this tolerates individually
    malformed objects: parseable ones are returned as Python objects, the
    rest are returned as raw text so the caller can report or retry them.

    Parameters:
        json_string (str): A string representation of a list of JSON
            objects, e.g., '[{...}, {...}, ...]'.

    Returns:
        tuple: Two lists — (successfully parsed objects, string
        representations of segments that couldn't be parsed).
    """
    # Trim the enclosing '[' and ']' of the array, if present.
    if json_string.startswith('[') and json_string.endswith(']'):
        json_string = json_string[1:-1].strip()

    # Scan for top-level {...} segments.  Track string/escape state so that
    # braces inside JSON string values (e.g. {"a": "}"}) do not unbalance
    # the depth counter — the original naive counter mis-split such input.
    segments = []
    depth = 0
    start_index = 0
    in_string = False
    escaped = False
    for i, char in enumerate(json_string):
        if in_string:
            if escaped:
                escaped = False
            elif char == '\\':
                escaped = True
            elif char == '"':
                in_string = False
            continue
        if char == '"':
            in_string = True
        elif char == '{':
            if depth == 0:
                start_index = i
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0:
                segments.append(json_string[start_index:i + 1])

    # Try parsing each candidate segment independently.
    parsed_objects = []
    unparsed_segments = []
    for segment in segments:
        try:
            parsed_objects.append(json.loads(segment))
        except json.JSONDecodeError:
            unparsed_segments.append(segment)

    return parsed_objects, unparsed_segments
|
||||||
|
|
||||||
|
def sanitize_html(html):
    """Strip unexpected characters from *html* and backslash-escape quotes.

    Keeps word characters, whitespace and a whitelist of punctuation/markup
    characters; everything else is dropped.  Double quotes that survive the
    filter are escaped.  (Single quotes never survive it — ``'`` is not in
    the allowed set — so the final replace is effectively a no-op, kept for
    behavioural parity.)
    """
    allowed_only = re.sub(r'[^\w\s.,;:!?=\[\]{}()<>\/\\\-"]', '', html)
    # Escape quotes so the result can be embedded in quoted contexts.
    escaped = allowed_only.replace('"', '\\"')
    return escaped.replace("'", "\\'")
|
||||||
|
|
||||||
|
def escape_json_string(s):
    """Escape characters in *s* so it is safe inside a JSON string literal.

    Parameters:
        s (str): The input string to be escaped.

    Returns:
        str: The escaped string, safe for JSON encoding.
    """
    # Order matters: the backslash substitution must run first, so that
    # backslashes introduced by the later escapes are not double-escaped.
    replacements = (
        ('\\', '\\\\'),
        ('"', '\\"'),
        ('\b', '\\b'),
        ('\f', '\\f'),
        ('\n', '\\n'),
        ('\r', '\\r'),
        ('\t', '\\t'),
    )
    for raw, replacement in replacements:
        s = s.replace(raw, replacement)

    # Any remaining C0/C1 control characters become \uXXXX escapes.
    return re.sub(
        r'[\x00-\x1f\x7f-\x9f]',
        lambda x: '\\u{:04x}'.format(ord(x.group())),
        s,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD):
    """Clean raw page HTML and convert it to Markdown.

    Strips non-content tags and attributes, drops low-word-count and empty
    elements, flattens redundant nesting, then renders the result with
    html2text (links ignored).

    Returns:
        dict: {'markdown': str, 'cleaned_html': str, 'success': True} on
        success, or None if any step raises (the error is printed, not
        re-raised — callers must handle the None).
    """
    try:
        # Parse HTML content with BeautifulSoup
        soup = BeautifulSoup(html, 'html.parser')

        # Get the content within the <body> tag
        # NOTE(review): soup.body is None for fragment input without a
        # <body>; the subsequent attribute access then raises and is
        # swallowed by the except below — confirm this is intended.
        body = soup.body

        # Remove script, style, and other tags that don't carry useful content from body
        for tag in body.find_all(['script', 'style', 'link', 'meta', 'noscript']):
            tag.decompose()

        # Remove all attributes from remaining tags in body, except for img tags
        for tag in body.find_all():
            if tag.name != 'img':
                tag.attrs = {}

        # Replace images with their alt text or remove them if no alt text is available
        for img in body.find_all('img'):
            alt_text = img.get('alt')
            if alt_text:
                img.replace_with(soup.new_string(alt_text))
            else:
                img.decompose()

        # Recursively remove empty elements, their parent elements, and elements with word count below threshold
        # NOTE(review): this mutates node.contents while iterating it
        # (decompose inside the loop) — bs4 tolerates it here, but some
        # siblings may be skipped; verify against representative pages.
        def remove_empty_and_low_word_count_elements(node):
            for child in node.contents:
                if isinstance(child, element.Tag):
                    remove_empty_and_low_word_count_elements(child)
                    word_count = len(child.get_text(strip=True).split())
                    if (len(child.contents) == 0 and not child.get_text(strip=True)) or word_count < word_count_threshold:
                        child.decompose()
            return node

        body = remove_empty_and_low_word_count_elements(body)

        # Drop tags whose direct string content is below the word threshold.
        def remove_small_text_tags(body: Tag, word_count_threshold: int = MIN_WORD_THRESHOLD):
            # We'll use a list to collect all tags that don't meet the word count requirement
            tags_to_remove = []

            # Traverse all tags in the body
            for tag in body.find_all(True):  # True here means all tags
                # Check if the tag contains text and if it's not just whitespace
                if tag.string and tag.string.strip():
                    # Split the text by spaces and count the words
                    word_count = len(tag.string.strip().split())
                    # If the word count is less than the threshold, mark the tag for removal
                    if word_count < word_count_threshold:
                        tags_to_remove.append(tag)

            # Remove all marked tags from the tree
            for tag in tags_to_remove:
                tag.decompose()  # or tag.extract() to remove and get the element

            return body

        # Remove small text tags
        body = remove_small_text_tags(body, word_count_threshold)

        # True when the node (tag or text) contains nothing but whitespace.
        def is_empty_or_whitespace(tag: Tag):
            if isinstance(tag, NavigableString):
                return not tag.strip()
            # Check if the tag itself is empty or all its children are empty/whitespace
            if not tag.contents:
                return True
            return all(is_empty_or_whitespace(child) for child in tag.contents)

        # Repeatedly strip empty tags until the tree reaches a fixed point
        # (removing a child can make its parent empty in turn).
        def remove_empty_tags(body: Tag):
            # Continue processing until no more changes are made
            changes = True
            while changes:
                changes = False
                # Collect all tags that are empty or contain only whitespace
                empty_tags = [tag for tag in body.find_all(True) if is_empty_or_whitespace(tag)]
                for tag in empty_tags:
                    # If a tag is empty, decompose it
                    tag.decompose()
                    changes = True  # Mark that a change was made

            return body

        # Remove empty tags
        body = remove_empty_tags(body)

        # Flatten nested elements with only one child of the same type
        # (e.g. <div><div>x</div></div> -> <div>x</div>).
        def flatten_nested_elements(node):
            for child in node.contents:
                if isinstance(child, element.Tag):
                    flatten_nested_elements(child)
                    if len(child.contents) == 1 and child.contents[0].name == child.name:
                        # print('Flattening:', child.name)
                        child_content = child.contents[0]
                        child.replace_with(child_content)

            return node

        body = flatten_nested_elements(body)

        # Remove comments
        for comment in soup.find_all(text=lambda text: isinstance(text, Comment)):
            comment.extract()

        # Remove consecutive empty newlines and replace multiple spaces with a single space
        cleaned_html = str(body).replace('\n\n', '\n').replace('  ', ' ')

        # Sanitize the cleaned HTML content
        cleaned_html = sanitize_html(cleaned_html)
        # sanitized_html = escape_json_string(cleaned_html)

        # Convert cleaned HTML to Markdown
        h = html2text.HTML2Text()
        h.ignore_links = True
        markdown = h.handle(cleaned_html)

        # Return the Markdown content
        return{
            'markdown': markdown,
            'cleaned_html': cleaned_html,
            'success': True
        }

    except Exception as e:
        # Best-effort: report the failure and signal it with None.
        print('Error processing HTML content:', str(e))
        return None
|
||||||
|
|
||||||
|
# Example usage
|
||||||
|
# word_count_threshold = 5 # Adjust this value according to your desired threshold
|
||||||
|
# markdown_content = get_content_of_website(word_count_threshold)
|
||||||
|
# print(markdown_content)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_xml_tags(string):
    """Return the unique opening-tag names found in *string* (order unspecified)."""
    # <(\w+)> matches opening tags only; closing tags are skipped because
    # of the leading slash in </tag>.
    return list({name for name in re.findall(r'<(\w+)>', string)})
|
||||||
|
|
||||||
|
def extract_xml_data(tags, string):
    """Map each tag name to the stripped body of its first ``<tag>...</tag>``
    occurrence in *string*, or the empty string when the tag is absent."""
    def first_occurrence(tag):
        # DOTALL lets the body span newlines; the non-greedy .*? stops at
        # the first matching close tag.
        found = re.search(f"<{tag}>(.*?)</{tag}>", string, re.DOTALL)
        return found.group(1).strip() if found else ""

    return {tag: first_occurrence(tag) for tag in tags}
|
||||||
|
|
||||||
|
import time
|
||||||
|
import litellm
|
||||||
|
|
||||||
|
# Function to perform the completion with exponential backoff
|
||||||
|
def perform_completion_with_backoff(provider, prompt_with_variables, api_token):
    """Run a single-message LLM completion, retrying on rate limits.

    Retries up to 3 times with exponential backoff (2s, 4s, 8s).  On
    success returns the provider's response object.  If every attempt is
    rate-limited, returns a list containing one error "block" dict.

    NOTE(review): the two return types (response object vs. list of dicts)
    are inconsistent — callers such as extract_blocks access
    ``response.choices`` on the result and would fail on the error list;
    consider unifying before relying on the fallback path.
    """
    max_attempts = 3
    base_delay = 2  # Base delay in seconds, you can adjust this based on your needs

    for attempt in range(max_attempts):
        try:
            response = completion(
                model=provider,
                messages=[
                    {"role": "user", "content": prompt_with_variables}
                ],
                temperature=0.01,
                api_key=api_token
            )
            return response  # Return the successful response
        except litellm.exceptions.RateLimitError as e:
            # Only rate-limit errors are retried; anything else propagates.
            print("Rate limit error:", str(e))

            # Check if we have exhausted our max attempts
            if attempt < max_attempts - 1:
                # Calculate the delay and wait
                delay = base_delay * (2 ** attempt)  # Exponential backoff formula
                print(f"Waiting for {delay} seconds before retrying...")
                time.sleep(delay)
            else:
                # Return an error response after exhausting all retries
                return [{
                    "index": 0,
                    "tags": ["error"],
                    "content": ["Rate limit error. Please try again later."]
                }]
|
||||||
|
|
||||||
|
def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None):
    """Ask an LLM to split *html* into semantic JSON blocks.

    Fills PROMPT_EXTRACT_BLOCKS with the URL and the sanitized/escaped
    HTML, sends it via perform_completion_with_backoff, and parses the
    JSON found inside <blocks>...</blocks> in the reply.  When that parse
    fails, falls back to salvaging whatever objects can be parsed
    individually; any unparsable fragments are reported as a single
    error block.

    Returns:
        list[dict]: Block dicts, each annotated with 'error': False
        (or 'error': True for the salvage error block).
    """
    # api_token = os.getenv('GROQ_API_KEY', None) if not api_token else api_token
    # NOTE(review): PROVIDER_MODELS is used here as a provider -> token
    # lookup; confirm against crawler/config.py that its values really are
    # API keys and not model names.
    api_token = PROVIDER_MODELS.get(provider, None) if not api_token else api_token

    # Prompt template variables; HTML is sanitized and JSON-escaped first.
    variable_values = {
        "URL": url,
        "HTML": escape_json_string(sanitize_html(html)),
    }

    prompt_with_variables = PROMPT_EXTRACT_BLOCKS
    for variable in variable_values:
        prompt_with_variables = prompt_with_variables.replace(
            "{" + variable + "}", variable_values[variable]
        )

    response = perform_completion_with_backoff(provider, prompt_with_variables, api_token)

    # try:
    #     response = completion(
    #         model = provider,
    #         messages = [
    #             {"role": "user", "content": prompt_with_variables}
    #         ],
    #         temperature = 0.01,
    #         api_key = api_token
    #     )
    # except litellm.exceptions.RateLimitError as e:
    #     print("Rate limit error:", str(e))
    #     return [{
    #         "index": 0,
    #         "tags": ["error"],
    #         "content": ["Rate limit error. Please try again later."]
    #     }]

    try:
        blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks']
        blocks = json.loads(blocks)
        ## Add error: False to the blocks
        for block in blocks:
            block['error'] = False
    except Exception as e:
        print("Error extracting blocks:", str(e))
        # Fallback: salvage individually-parseable objects from the raw reply.
        parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content)
        blocks = parsed
        # Append all unparsed segments as one error block whose content is the list of unparsed segments
        if unparsed:
            blocks.append({
                "index": 0,
                "error": True,
                "tags": ["error"],
                "content": unparsed
            })
    return blocks
|
||||||
|
|
||||||
|
def extract_blocks_batch(batch_data, provider = "groq/llama3-70b-8192", api_token = None):
    """Batch variant of extract_blocks for ``[(url, html), ...]`` pairs.

    Builds one prompt per page and sends them together via litellm's
    batch_completion; a JSON failure in one response is replaced with a
    single error block rather than aborting the whole batch.

    NOTE(review): unlike extract_blocks, the HTML here is NOT sanitized or
    JSON-escaped before templating, and blocks are not annotated with
    'error': False — confirm whether this asymmetry is intentional.
    """
    api_token = os.getenv('GROQ_API_KEY', None) if not api_token else api_token

    messages = []

    # One single-message conversation per (url, html) pair.
    for url, html in batch_data:
        variable_values = {
            "URL": url,
            "HTML": html,
        }

        prompt_with_variables = PROMPT_EXTRACT_BLOCKS
        for variable in variable_values:
            prompt_with_variables = prompt_with_variables.replace(
                "{" + variable + "}", variable_values[variable]
            )

        messages.append([{"role": "user", "content": prompt_with_variables}])

    responses = batch_completion(
        model = provider,
        messages = messages,
        temperature = 0.01
    )

    all_blocks = []
    for response in responses:
        try:
            blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks']
            blocks = json.loads(blocks)

        except Exception as e:
            # Replace this page's result with a single error block.
            print("Error extracting blocks:", str(e))
            blocks = [{
                "index": 0,
                "tags": ["error"],
                "content": ["Error extracting blocks from the HTML content. Choose another provider/model or try again."],
                "questions": ["What went wrong during the block extraction process?"]
            }]
        all_blocks.append(blocks)

    # Flatten the per-response lists into one flat list of blocks.
    return sum(all_blocks, [])
|
||||||
133
crawler/web_crawler.py
Normal file
133
crawler/web_crawler.py
Normal file
@@ -0,0 +1,133 @@
|
|||||||
|
import asyncio
|
||||||
|
import os, time
|
||||||
|
import json
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.chrome.service import Service
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
|
from selenium.webdriver.chrome.options import Options
|
||||||
|
import chromedriver_autoinstaller
|
||||||
|
from pydantic import parse_obj_as
|
||||||
|
from .models import UrlModel, CrawlResult
|
||||||
|
from .database import init_db, get_cached_url, cache_url
|
||||||
|
from .utils import *
|
||||||
|
from typing import List
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
from .config import *
|
||||||
|
|
||||||
|
class WebCrawler:
    """Selenium-based page crawler with a SQLite result cache.

    Fetches pages headlessly with Chrome, cleans the HTML to Markdown,
    optionally runs LLM block extraction on the Markdown, and caches every
    result keyed by URL.
    """

    def __init__(self, db_path: str):
        """Prepare the cache database and the (headless) Chrome options."""
        self.db_path = db_path
        init_db(self.db_path)
        self.options = Options()
        self.options.headless = True
        self.options.add_argument("--no-sandbox")
        self.options.add_argument("--disable-dev-shm-usage")
        # make it headless
        self.options.add_argument("--headless")

        # Automatically install or update chromedriver
        chromedriver_autoinstaller.install()

    def fetch_page(self, url_model: UrlModel, provider: str = DEFAULT_PROVIDER, api_token: str = None, extract_blocks_flag: bool = True, word_count_threshold = MIN_WORD_THRESHOLD) -> CrawlResult:
        """Crawl one URL (or serve it from cache) and return a CrawlResult.

        Parameters:
            url_model: Target URL plus the ``forced`` cache-bypass flag.
            provider: litellm provider/model string for block extraction.
            api_token: API key for the provider (optional).
            extract_blocks_flag: When True, run LLM block extraction.
            word_count_threshold: Minimum words per kept HTML element
                (clamped to MIN_WORD_THRESHOLD).
        """
        # make sure word_count_threshold is not lesser than MIN_WORD_THRESHOLD
        if word_count_threshold < MIN_WORD_THRESHOLD:
            word_count_threshold = MIN_WORD_THRESHOLD

        # Check cache first
        cached = get_cached_url(self.db_path, str(url_model.url))
        if cached and not url_model.forced:
            return CrawlResult(**{
                "url": cached[0],
                "html": cached[1],
                "cleaned_html": cached[2],
                "markdown": cached[3],
                "parsed_json": cached[4],
                "success": cached[5],
                "error_message": ""
            })

        # Initialize WebDriver for crawling
        service = Service(chromedriver_autoinstaller.install())
        driver = webdriver.Chrome(service=service, options=self.options)

        try:
            driver.get(str(url_model.url))
            WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.TAG_NAME, "html"))
            )
            html = driver.page_source
            success = True
            error_message = ""
        except Exception as e:
            html = ""
            success = False
            error_message = str(e)
        finally:
            driver.quit()

        # Extract content from HTML.
        # BUGFIX: get_content_of_website returns None when cleaning fails
        # (e.g. empty html after a failed crawl); the original then crashed
        # with AttributeError on .get — fall back to an empty dict instead.
        result = get_content_of_website(html, word_count_threshold) or {}
        cleaned_html = result.get('cleaned_html', html)
        markdown = result.get('markdown', "")

        print("Crawling is done 🚀")

        parsed_json = []
        if extract_blocks_flag:
            # Split markdown into sections of roughly CHUNK_TOKEN_THRESHOLD
            # tokens (estimated at ~1.3 tokens per whitespace-separated word).
            paragraphs = markdown.split('\n\n')
            sections = []
            chunks = []
            total_token_so_far = 0

            for paragraph in paragraphs:
                if total_token_so_far < CHUNK_TOKEN_THRESHOLD:
                    chunk = paragraph.split(' ')
                    total_token_so_far += len(chunk) * 1.3
                    chunks.append(paragraph)
                else:
                    sections.append('\n\n'.join(chunks))
                    chunks = [paragraph]
                    total_token_so_far = len(paragraph.split(' ')) * 1.3

            if chunks:
                sections.append('\n\n'.join(chunks))

            # Process sections to extract blocks
            parsed_json = []
            if provider.startswith("groq/"):
                # Sequential processing with a delay (Groq rate limits).
                for section in sections:
                    parsed_json.extend(extract_blocks(str(url_model.url), section, provider, api_token))
                    time.sleep(0.5)  # 500 ms delay between each processing
            else:
                # Parallel processing using ThreadPoolExecutor
                with ThreadPoolExecutor() as executor:
                    futures = [executor.submit(extract_blocks, str(url_model.url), section, provider, api_token) for section in sections]
                    for future in as_completed(futures):
                        parsed_json.extend(future.result())

        parsed_json = json.dumps(parsed_json)

        # Cache the result
        cleaned_html = beautify_html(cleaned_html)
        cache_url(self.db_path, str(url_model.url), html, cleaned_html, markdown, parsed_json, success)

        return CrawlResult(
            url=str(url_model.url),
            html=html,
            cleaned_html=cleaned_html,
            markdown=markdown,
            parsed_json=parsed_json,
            success=success,
            error_message=error_message
        )

    def fetch_pages(self, url_models: List[UrlModel], provider: str = DEFAULT_PROVIDER, api_token: str = None) -> List[CrawlResult]:
        """Crawl several URLs concurrently via fetch_page and return all results."""
        with ThreadPoolExecutor() as executor:
            results = list(executor.map(self.fetch_page, url_models, [provider] * len(url_models), [api_token] * len(url_models)))
        return results
|
||||||
10
docker-compose.yml
Normal file
10
docker-compose.yml
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
# Compose file for the crawler API: one uvicorn web service on port 80.
version: '3.8'

services:
  web:
    build: .
    # NOTE(review): $(nproc) is shell command substitution — it only expands
    # if this command line is run through a shell in the container; confirm
    # the image's entrypoint, otherwise uvicorn receives the literal string
    # "$(nproc)" as its --workers value.
    command: uvicorn main:app --host 0.0.0.0 --port 80 --workers $(nproc)
    ports:
      - "80:80"
    environment:
      # Unbuffered stdout/stderr so logs appear immediately in `docker logs`.
      - PYTHONUNBUFFERED=1
|
||||||
31
examples/test.py
Normal file
31
examples/test.py
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
from crawler.web_crawler import WebCrawler
|
||||||
|
from crawler.models import UrlModel
|
||||||
|
from crawler.utils import get_content_of_website
|
||||||
|
import os
|
||||||
|
|
||||||
|
def main():
    """Example: crawl a single page and print the structured result.

    Requires OPENAI_API_KEY in the environment; writes the cache to
    crawler_data.db in the working directory.
    """
    # Initialize the WebCrawler with just the database path
    crawler = WebCrawler(db_path='crawler_data.db')

    # Fetch a single page (forced=True bypasses the cache)
    single_url = UrlModel(url='https://kidocode.com', forced=True)
    result = crawler.fetch_page(
        single_url,
        provider= "openai/gpt-3.5-turbo",
        api_token = os.getenv('OPENAI_API_KEY'),
        extract_blocks_flag=True,
        word_count_threshold=5
    )
    print(result.model_dump())

    # Fetch multiple pages
    # urls = [
    #     UrlModel(url='http://example.com', forced=False),
    #     UrlModel(url='http://example.org', forced=False)
    # ]
    # results = crawler.fetch_pages(urls, provider= "openai/gpt-4-turbo", api_token = os.getenv('OPENAI_API_KEY'))
    # for res in results:
    #     print(res.model_copy())


if __name__ == '__main__':
    main()
|
||||||
154
main.py
Normal file
154
main.py
Normal file
@@ -0,0 +1,154 @@
|
|||||||
|
from fastapi import FastAPI, HTTPException, Request
|
||||||
|
from fastapi.responses import HTMLResponse
|
||||||
|
from fastapi.staticfiles import StaticFiles
|
||||||
|
from fastapi.responses import JSONResponse
|
||||||
|
from pydantic import BaseModel, HttpUrl
|
||||||
|
from typing import List, Optional
|
||||||
|
from crawler.web_crawler import WebCrawler
|
||||||
|
from crawler.models import UrlModel
|
||||||
|
import asyncio
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
import chromedriver_autoinstaller
|
||||||
|
from functools import lru_cache
|
||||||
|
from crawler.database import get_total_count
|
||||||
|
import os
|
||||||
|
import uuid
|
||||||
|
|
||||||
|
# Task management
# In-memory store for async crawl jobs: task_id -> {"status", "results"}.
# NOTE(review): per-process only and never evicted — entries accumulate.
tasks = {}

# Configuration
# Absolute directory of this file, used for locating static assets.
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
MAX_CONCURRENT_REQUESTS = 10  # Adjust this to change the maximum concurrent requests
# Number of crawl requests currently in flight; guarded by `lock`.
current_requests = 0
lock = asyncio.Lock()

app = FastAPI()

# Mount the pages directory as a static directory
app.mount("/pages", StaticFiles(directory=__location__ + "/pages"), name="pages")


chromedriver_autoinstaller.install()  # Ensure chromedriver is installed
|
||||||
|
|
||||||
|
class UrlsInput(BaseModel):
    """Request body for the /crawl and /crawl_async endpoints."""

    # Pages to crawl; each entry is validated as an HTTP(S) URL.
    urls: List[HttpUrl]
    # litellm provider/model string, e.g. "openai/gpt-3.5-turbo".
    provider_model: str
    # API key for the chosen provider (required; checked in the handlers).
    api_token: str
    # When False (default) the raw HTML is stripped from the response.
    include_raw_html: Optional[bool] = False
    # When True, bypass the SQLite cache and re-crawl.
    forced: bool = False
    # When True, run LLM block extraction on the crawled markdown.
    extract_blocks: bool = True
    # Minimum words per kept HTML element (clamped by the crawler).
    word_count_threshold: Optional[int] = 5
|
||||||
|
|
||||||
|
@lru_cache()
def get_crawler():
    """Return the process-wide WebCrawler instance (memoized via lru_cache)."""
    crawler = WebCrawler(db_path='crawler_data.db')
    return crawler
|
||||||
|
|
||||||
|
@app.get("/", response_class=HTMLResponse)
async def read_index():
    """Serve the static landing page (pages/index.html)."""
    # Explicit encoding avoids depending on the host's locale default
    # (the original used the implicit platform encoding).
    with open(f"{__location__}/pages/index.html", "r", encoding="utf-8") as file:
        html_content = file.read()
    return HTMLResponse(content=html_content, status_code=200)
|
||||||
|
|
||||||
|
@app.get("/total-count")
async def get_total_url_count():
    """Return the total number of cached crawl records as {"count": N}."""
    # get_total_count is best-effort and returns 0 on database errors.
    count = get_total_count(db_path='crawler_data.db')
    return JSONResponse(content={"count": count})
|
||||||
|
|
||||||
|
@app.post("/crawl")
async def crawl_urls(urls_input: UrlsInput, request: Request):
    """Crawl the requested URLs synchronously and return their results.

    Enforces a simple process-wide concurrency cap (MAX_CONCURRENT_REQUESTS)
    and runs the blocking WebCrawler in a thread pool so the event loop
    stays responsive.  Raw HTML is stripped from the response unless
    ``include_raw_html`` is set.
    """
    global current_requests
    # Raise error if api_token is not provided
    if not urls_input.api_token:
        raise HTTPException(status_code=401, detail="API token is required.")
    async with lock:
        if current_requests >= MAX_CONCURRENT_REQUESTS:
            raise HTTPException(status_code=429, detail="Too many requests - please try again later.")
        current_requests += 1

    try:
        # Prepare URL models for crawling
        url_models = [UrlModel(url=url, forced=urls_input.forced) for url in urls_input.urls]

        # Use ThreadPoolExecutor to run the synchronous WebCrawler in async manner
        with ThreadPoolExecutor() as executor:
            loop = asyncio.get_event_loop()
            futures = [
                loop.run_in_executor(executor, get_crawler().fetch_page, url_model, urls_input.provider_model, urls_input.api_token, urls_input.extract_blocks, urls_input.word_count_threshold)
                for url_model in url_models
            ]
            results = await asyncio.gather(*futures)

        # if include_raw_html is False, remove the raw HTML content from the results
        if not urls_input.include_raw_html:
            for result in results:
                result.html = None

        # model_dump() is the pydantic v2 spelling; .dict() is deprecated
        # and the rest of the project (examples/test.py) already uses v2.
        return {"results": [result.model_dump() for result in results]}
    finally:
        # Always release our concurrency slot, even when an exception is raised.
        async with lock:
            current_requests -= 1
|
||||||
|
|
||||||
|
@app.post("/crawl_async")
|
||||||
|
async def crawl_urls(urls_input: UrlsInput, request: Request):
|
||||||
|
global current_requests
|
||||||
|
if not urls_input.api_token:
|
||||||
|
raise HTTPException(status_code=401, detail="API token is required.")
|
||||||
|
|
||||||
|
async with lock:
|
||||||
|
if current_requests >= MAX_CONCURRENT_REQUESTS:
|
||||||
|
raise HTTPException(status_code=429, detail="Too many requests - please try again later.")
|
||||||
|
current_requests += 1
|
||||||
|
|
||||||
|
task_id = str(uuid.uuid4())
|
||||||
|
tasks[task_id] = {"status": "pending", "results": None}
|
||||||
|
|
||||||
|
try:
|
||||||
|
url_models = [UrlModel(url=url, forced=urls_input.forced) for url in urls_input.urls]
|
||||||
|
|
||||||
|
loop = asyncio.get_running_loop()
|
||||||
|
loop.create_task(
|
||||||
|
process_crawl_task(url_models, urls_input.provider_model, urls_input.api_token, task_id, urls_input.extract_blocks)
|
||||||
|
)
|
||||||
|
return {"task_id": task_id}
|
||||||
|
finally:
|
||||||
|
async with lock:
|
||||||
|
current_requests -= 1
|
||||||
|
|
||||||
|
async def process_crawl_task(url_models, provider, api_token, task_id, extract_blocks_flag):
|
||||||
|
try:
|
||||||
|
with ThreadPoolExecutor() as executor:
|
||||||
|
loop = asyncio.get_running_loop()
|
||||||
|
futures = [
|
||||||
|
loop.run_in_executor(executor, get_crawler().fetch_page, url_model, provider, api_token, extract_blocks_flag)
|
||||||
|
for url_model in url_models
|
||||||
|
]
|
||||||
|
results = await asyncio.gather(*futures)
|
||||||
|
|
||||||
|
tasks[task_id] = {"status": "done", "results": results}
|
||||||
|
except Exception as e:
|
||||||
|
tasks[task_id] = {"status": "failed", "error": str(e)}
|
||||||
|
|
||||||
|
@app.get("/task/{task_id}")
|
||||||
|
async def get_task_status(task_id: str):
|
||||||
|
task = tasks.get(task_id)
|
||||||
|
if not task:
|
||||||
|
raise HTTPException(status_code=404, detail="Task not found")
|
||||||
|
|
||||||
|
if task['status'] == 'done':
|
||||||
|
return {
|
||||||
|
"status": task['status'],
|
||||||
|
"results": [result.dict() for result in task['results']]
|
||||||
|
}
|
||||||
|
elif task['status'] == 'failed':
|
||||||
|
return {
|
||||||
|
"status": task['status'],
|
||||||
|
"error": task['error']
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
return {"status": task['status']}
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
import uvicorn
|
||||||
|
uvicorn.run(app, host="0.0.0.0", port=8000)
|
||||||
411
pages/index.html
Normal file
411
pages/index.html
Normal file
@@ -0,0 +1,411 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8" />
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||||
|
<title>Crawl4AI</title>
|
||||||
|
|
||||||
|
<link rel="preconnect" href="https://fonts.googleapis.com" />
|
||||||
|
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
|
||||||
|
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@100..900&display=swap" rel="stylesheet" />
|
||||||
|
|
||||||
|
<link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet" />
|
||||||
|
<script src="https://cdn.jsdelivr.net/npm/axios/dist/axios.min.js"></script>
|
||||||
|
<link
|
||||||
|
rel="stylesheet"
|
||||||
|
href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/styles/vs2015.min.css"
|
||||||
|
/>
|
||||||
|
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/highlight.min.js"></script>
|
||||||
|
<style>
|
||||||
|
:root {
|
||||||
|
--ifm-font-size-base: 100%;
|
||||||
|
--ifm-line-height-base: 1.65;
|
||||||
|
--ifm-font-family-base: system-ui, -apple-system, Segoe UI, Roboto, Ubuntu, Cantarell, Noto Sans,
|
||||||
|
sans-serif, BlinkMacSystemFont, "Segoe UI", Helvetica, Arial, sans-serif, "Apple Color Emoji",
|
||||||
|
"Segoe UI Emoji", "Segoe UI Symbol";
|
||||||
|
}
|
||||||
|
html {
|
||||||
|
-webkit-font-smoothing: antialiased;
|
||||||
|
-webkit-text-size-adjust: 100%;
|
||||||
|
text-size-adjust: 100%;
|
||||||
|
font: var(--ifm-font-size-base) / var(--ifm-line-height-base) var(--ifm-font-family-base);
|
||||||
|
}
|
||||||
|
body {
|
||||||
|
background-color: #1a202c;
|
||||||
|
color: #fff;
|
||||||
|
}
|
||||||
|
.tab-content {
|
||||||
|
max-height: 400px;
|
||||||
|
overflow: auto;
|
||||||
|
}
|
||||||
|
pre {
|
||||||
|
white-space: pre-wrap;
|
||||||
|
font-size: 14px;
|
||||||
|
}
|
||||||
|
pre code {
|
||||||
|
width: 100%;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<header class="bg-gray-900 text-white py-4">
|
||||||
|
<div class="container mx-auto px-4">
|
||||||
|
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler &amp; Scraper</h1>
|
||||||
|
</div>
|
||||||
|
</header>
|
||||||
|
|
||||||
|
<!-- Add a section to show the total count of websites already crawled -->
|
||||||
|
<section class="bg-gray-600 py-8">
|
||||||
|
<div class="container mx-auto px-4 flex font-bold text-xl gap-2">
|
||||||
|
<span>📊 Total Websites Processed</span>
|
||||||
|
<span id="total-count" class="text-blue-400">0</span>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section class="try-it py-8 pb-20">
|
||||||
|
<div class="container mx-auto px-4">
|
||||||
|
<h2 class="text-2xl font-bold mb-4">Try It Now</h2>
|
||||||
|
<div class="mb-4 flex w-full gap-2">
|
||||||
|
<div class="flex items-center gap-2 flex-col flex-grow">
|
||||||
|
<label for="url-input" class="text-white">URL(s)</label>
|
||||||
|
<input
|
||||||
|
type="text"
|
||||||
|
id="url-input"
|
||||||
|
value="https://kidocode.com"
|
||||||
|
class="border border-gray-600 rounded px-4 py-2 flex-grow bg-gray-800 text-white w-full"
|
||||||
|
placeholder="Enter URL(s) separated by commas"
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
<!-- Add a number set if 5 with a label word threshold -->
|
||||||
|
<div class="flex items-center gap-2 flex-col">
|
||||||
|
<label for="threshold" class="text-white">Min Words Threshold</label>
|
||||||
|
<select id="threshold" class="border border-gray-600 rounded px-4 py-3 bg-gray-800 text-white w-full">
|
||||||
|
<option value="5">5</option>
|
||||||
|
<option value="10">10</option>
|
||||||
|
<option value="15">15</option>
|
||||||
|
<option value="20">20</option>
|
||||||
|
<option value="25">25</option>
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="flex items-center gap-2 flex-col">
|
||||||
|
<label for="provider-model-select" class="text-white">Provider Model</label>
|
||||||
|
|
||||||
|
<select
|
||||||
|
id="provider-model-select"
|
||||||
|
class="border border-gray-600 rounded px-4 py-3 bg-gray-800 text-white w-full"
|
||||||
|
>
|
||||||
|
<!-- Add your option values here -->
|
||||||
|
<option value="groq/llama3-70b-8192">groq/llama3-70b-8192</option>
|
||||||
|
<option value="groq/llama3-8b-8192">groq/llama3-8b-8192</option>
|
||||||
|
<option value="openai/gpt-4-turbo">gpt-4-turbo</option>
|
||||||
|
<option value="openai/gpt-3.5-turbo">gpt-3.5-turbo</option>
|
||||||
|
<option value="anthropic/claude-3-haiku-20240307">claude-3-haiku</option>
|
||||||
|
<option value="anthropic/claude-3-opus-20240229">claude-3-opus</option>
|
||||||
|
<option value="anthropic/claude-3-sonnet-20240229">claude-3-sonnet</option>
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
|
<div class="flex items-center gap-2 flex-col">
|
||||||
|
<label for="token-input" class="text-white">API Token</label>
|
||||||
|
|
||||||
|
<input
|
||||||
|
type="password"
|
||||||
|
id="token-input"
|
||||||
|
class="border border-gray-600 rounded px-4 py-2 flex-grow bg-gray-800 text-white w-full"
|
||||||
|
placeholder="Enter Groq API token"
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
<div class="flex items-center justify-center gap-2 flex-col">
|
||||||
|
<label for="extract-blocks-checkbox" class="text-white">Extract Blocks</label>
|
||||||
|
<input type="checkbox" id="extract-blocks-checkbox" class="mr-2" checked />
|
||||||
|
</div>
|
||||||
|
<button id="crawl-btn" class="bg-blue-600 text-white px-4 py-2 rounded">Crawl</button>
|
||||||
|
</div>
|
||||||
|
<div class="grid grid-cols-1 md:grid-cols-2 gap-8">
|
||||||
|
<div id="loading" class="hidden mt-4">
|
||||||
|
<p>
|
||||||
|
Depending on the selected model, it may take up to 1 or 2 minutes to process the request.
|
||||||
|
Loading...
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
<div id="result" class="tab-container flex-1 h-full flex-col">
|
||||||
|
<div class="tab-buttons flex gap-2">
|
||||||
|
<button class="tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="json">JSON</button>
|
||||||
|
<button class="tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="cleaned-html">
|
||||||
|
Cleaned HTML
|
||||||
|
</button>
|
||||||
|
<button class="tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="markdown">
|
||||||
|
Markdown
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
<div class="tab-content code bg-gray-800 p-2 rounded h-full flex-1 border border-gray-600">
|
||||||
|
<pre class="h-full flex"><code id="json-result" class="language-json "></code></pre>
|
||||||
|
<pre
|
||||||
|
class="hidden h-full flex"
|
||||||
|
><code id="cleaned-html-result" class="language-html "></code></pre>
|
||||||
|
<pre
|
||||||
|
class="hidden h-full flex"
|
||||||
|
><code id="markdown-result" class="language-markdown "></code></pre>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div id="code_help" class="tab-container flex-1 h-full">
|
||||||
|
<div class="tab-buttons flex gap-2">
|
||||||
|
<button class="code-tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="curl">cURL</button>
|
||||||
|
<button class="code-tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="python">
|
||||||
|
Python
|
||||||
|
</button>
|
||||||
|
<button class="code-tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="nodejs">
|
||||||
|
Node.js
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
<div class="tab-content result bg-gray-800 p-2 rounded h-full flex-1 border border-gray-600">
|
||||||
|
<pre class="h-full flex relative">
|
||||||
|
<code id="curl-code" class="language-bash"></code>
|
||||||
|
<button class="absolute top-2 right-2 bg-gray-700 text-white px-2 py-1 rounded copy-btn" data-target="curl-code">Copy</button>
|
||||||
|
</pre>
|
||||||
|
<pre class="hidden h-full flex relative">
|
||||||
|
<code id="python-code" class="language-python"></code>
|
||||||
|
<button class="absolute top-2 right-2 bg-gray-700 text-white px-2 py-1 rounded copy-btn" data-target="python-code">Copy</button>
|
||||||
|
</pre>
|
||||||
|
<pre class="hidden h-full flex relative">
|
||||||
|
<code id="nodejs-code" class="language-javascript"></code>
|
||||||
|
<button class="absolute top-2 right-2 bg-gray-700 text-white px-2 py-1 rounded copy-btn" data-target="nodejs-code">Copy</button>
|
||||||
|
</pre>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section class="hero bg-gray-900 py-8">
|
||||||
|
<div class="container mx-auto px-4">
|
||||||
|
<h2 class="text-3xl font-bold mb-4">🤔 Why building this?</h2>
|
||||||
|
<p class="text-lg mb-4">
|
||||||
|
In recent times, we've witnessed a surge of startups emerging, riding the AI hype wave and charging
|
||||||
|
for services that should rightfully be accessible to everyone. 🌍💸 One such example is scraping and
|
||||||
|
crawling web pages and transforming them into a format suitable for Large Language Models (LLMs).
|
||||||
|
🕸️🤖 We believe that building a business around this is not the right approach; instead, it should
|
||||||
|
definitely be open-source. 🆓🌟 So, if you possess the skills to build such tools and share our
|
||||||
|
philosophy, we invite you to join our "Robinhood" band and help set these products free for the
|
||||||
|
benefit of all. 🤝💪
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section class="installation py-8">
|
||||||
|
<div class="container mx-auto px-4">
|
||||||
|
<h2 class="text-2xl font-bold mb-4">⚙️ Installation</h2>
|
||||||
|
<p class="mb-4">
|
||||||
|
To install and run Crawl4AI locally or on your own service, the best way is to use Docker. 🐳 Follow
|
||||||
|
these steps:
|
||||||
|
</p>
|
||||||
|
<ol class="list-decimal list-inside mb-4">
|
||||||
|
<li>
|
||||||
|
Clone the GitHub repository: 📥
|
||||||
|
<code>git clone https://github.com/unclecode/crawl4ai.git</code>
|
||||||
|
</li>
|
||||||
|
<li>Navigate to the project directory: 📂 <code>cd crawl4ai</code></li>
|
||||||
|
<li>
|
||||||
|
Build the Docker image: 🛠️ <code>docker build -t crawl4ai .</code> On Mac, follow: 🍎
|
||||||
|
<code>docker build --platform linux/amd64 -t crawl4ai .</code>
|
||||||
|
</li>
|
||||||
|
<li>Run the Docker container: ▶️ <code>docker run -p 8000:80 crawl4ai</code></li>
|
||||||
|
</ol>
|
||||||
|
<p>
|
||||||
|
For more detailed instructions and advanced configuration options, please refer to the 📚
|
||||||
|
<a href="https://github.com/unclecode/crawl4ai" class="text-blue-400">GitHub repository</a>.
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<footer class="bg-gray-900 text-white py-4">
|
||||||
|
<div class="container mx-auto px-4">
|
||||||
|
<div class="flex justify-between items-center">
|
||||||
|
<p>© 2024 Crawl4AI. All rights reserved.</p>
|
||||||
|
<div class="social-links">
|
||||||
|
<a
|
||||||
|
href="https://github.com/unclecode/crawl4ai"
|
||||||
|
class="text-white hover:text-gray-300 mx-2"
|
||||||
|
target="_blank"
|
||||||
|
>😺 GitHub</a
|
||||||
|
>
|
||||||
|
<a
|
||||||
|
href="https://twitter.com/unclecode"
|
||||||
|
class="text-white hover:text-gray-300 mx-2"
|
||||||
|
target="_blank"
|
||||||
|
>🐦 Twitter</a
|
||||||
|
>
|
||||||
|
<a
|
||||||
|
href="https://discord.gg/your-invite-link"
|
||||||
|
class="text-white hover:text-gray-300 mx-2"
|
||||||
|
target="_blank"
|
||||||
|
>💬 Discord</a
|
||||||
|
>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</footer>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
// Get the selected provider model and token from local storage
|
||||||
|
const storedProviderModel = localStorage.getItem("provider_model");
|
||||||
|
const storedToken = localStorage.getItem(storedProviderModel);
|
||||||
|
|
||||||
|
if (storedProviderModel) {
|
||||||
|
document.getElementById("provider-model-select").value = storedProviderModel;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (storedToken) {
|
||||||
|
document.getElementById("token-input").value = storedToken;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Handle provider model dropdown change
|
||||||
|
document.getElementById("provider-model-select").addEventListener("change", () => {
|
||||||
|
const selectedProviderModel = document.getElementById("provider-model-select").value;
|
||||||
|
const storedToken = localStorage.getItem(selectedProviderModel);
|
||||||
|
|
||||||
|
if (storedToken) {
|
||||||
|
document.getElementById("token-input").value = storedToken;
|
||||||
|
} else {
|
||||||
|
document.getElementById("token-input").value = "";
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// Fetch total count from the database
|
||||||
|
axios
|
||||||
|
.get("/total-count")
|
||||||
|
.then((response) => {
|
||||||
|
document.getElementById("total-count").textContent = response.data.count;
|
||||||
|
})
|
||||||
|
.catch((error) => console.error(error));
|
||||||
|
|
||||||
|
// Handle crawl button click
|
||||||
|
document.getElementById("crawl-btn").addEventListener("click", () => {
|
||||||
|
// validate input to have both URL and API token
|
||||||
|
if (!document.getElementById("url-input").value || !document.getElementById("token-input").value) {
|
||||||
|
alert("Please enter both URL(s) and API token.");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const selectedProviderModel = document.getElementById("provider-model-select").value;
|
||||||
|
const apiToken = document.getElementById("token-input").value;
|
||||||
|
const extractBlocks = document.getElementById("extract-blocks-checkbox").checked;
|
||||||
|
|
||||||
|
// Save the selected provider model and token to local storage
|
||||||
|
localStorage.setItem("provider_model", selectedProviderModel);
|
||||||
|
localStorage.setItem(selectedProviderModel, apiToken);
|
||||||
|
|
||||||
|
const urlsInput = document.getElementById("url-input").value;
|
||||||
|
const urls = urlsInput.split(",").map((url) => url.trim());
|
||||||
|
const data = {
|
||||||
|
urls: urls,
|
||||||
|
provider_model: selectedProviderModel,
|
||||||
|
api_token: apiToken,
|
||||||
|
include_raw_html: true,
|
||||||
|
forced: false,
|
||||||
|
extract_blocks: extractBlocks,
|
||||||
|
word_count_threshold: parseInt(document.getElementById("threshold").value),
|
||||||
|
};
|
||||||
|
|
||||||
|
// save api token to local storage
|
||||||
|
localStorage.setItem("api_token", document.getElementById("token-input").value);
|
||||||
|
|
||||||
|
document.getElementById("loading").classList.remove("hidden");
|
||||||
|
document.getElementById("result").classList.add("hidden");
|
||||||
|
document.getElementById("code_help").classList.add("hidden");
|
||||||
|
|
||||||
|
axios
|
||||||
|
.post("/crawl", data)
|
||||||
|
.then((response) => {
|
||||||
|
const result = response.data.results[0];
|
||||||
|
const parsedJson = JSON.parse(result.parsed_json);
|
||||||
|
document.getElementById("json-result").textContent = JSON.stringify(parsedJson, null, 2);
|
||||||
|
document.getElementById("cleaned-html-result").textContent = result.cleaned_html;
|
||||||
|
document.getElementById("markdown-result").textContent = result.markdown;
|
||||||
|
|
||||||
|
// Update code examples dynamically
|
||||||
|
// Update code examples dynamically
|
||||||
|
document.getElementById(
|
||||||
|
"curl-code"
|
||||||
|
).textContent = `curl -X POST -H "Content-Type: application/json" -d '${JSON.stringify({
|
||||||
|
...data,
|
||||||
|
api_token: "your_api_token",
|
||||||
|
})}' http://localhost:8000/crawl`;
|
||||||
|
|
||||||
|
document.getElementById(
|
||||||
|
"python-code"
|
||||||
|
).textContent = `import requests\n\ndata = ${JSON.stringify(
|
||||||
|
{ ...data, api_token: "your_api_token" },
|
||||||
|
null,
|
||||||
|
2
|
||||||
|
)}\n\nresponse = requests.post("http://localhost:8000/crawl", json=data)\nprint(response.json())`;
|
||||||
|
|
||||||
|
document.getElementById(
|
||||||
|
"nodejs-code"
|
||||||
|
).textContent = `const axios = require('axios');\n\nconst data = ${JSON.stringify(
|
||||||
|
{ ...data, api_token: "your_api_token" },
|
||||||
|
null,
|
||||||
|
2
|
||||||
|
)};\n\naxios.post("http://localhost:8000/crawl", data)\n .then(response => console.log(response.data))\n .catch(error => console.error(error));`;
|
||||||
|
// Highlight code syntax
|
||||||
|
hljs.highlightAll();
|
||||||
|
|
||||||
|
// Select JSON tab by default
|
||||||
|
document.querySelector('.tab-btn[data-tab="json"]').click();
|
||||||
|
|
||||||
|
document.getElementById("loading").classList.add("hidden");
|
||||||
|
document.getElementById("result").classList.remove("hidden");
|
||||||
|
document.getElementById("code_help").classList.remove("hidden");
|
||||||
|
|
||||||
|
// increment the total count
|
||||||
|
document.getElementById("total-count").textContent =
|
||||||
|
parseInt(document.getElementById("total-count").textContent) + 1;
|
||||||
|
})
|
||||||
|
.catch((error) => {
|
||||||
|
console.error(error);
|
||||||
|
document.getElementById("loading").classList.add("hidden");
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// Handle tab clicks
|
||||||
|
document.querySelectorAll(".tab-btn").forEach((btn) => {
|
||||||
|
btn.addEventListener("click", () => {
|
||||||
|
const tab = btn.dataset.tab;
|
||||||
|
document
|
||||||
|
.querySelectorAll(".tab-btn")
|
||||||
|
.forEach((b) => b.classList.remove("bg-blue-600", "text-white"));
|
||||||
|
btn.classList.add("bg-blue-600", "text-white");
|
||||||
|
document.querySelectorAll(".tab-content.code pre").forEach((el) => el.classList.add("hidden"));
|
||||||
|
document.getElementById(`${tab}-result`).parentElement.classList.remove("hidden");
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// Handle code tab clicks
|
||||||
|
document.querySelectorAll(".code-tab-btn").forEach((btn) => {
|
||||||
|
btn.addEventListener("click", () => {
|
||||||
|
const tab = btn.dataset.tab;
|
||||||
|
document
|
||||||
|
.querySelectorAll(".code-tab-btn")
|
||||||
|
.forEach((b) => b.classList.remove("bg-blue-600", "text-white"));
|
||||||
|
btn.classList.add("bg-blue-600", "text-white");
|
||||||
|
document.querySelectorAll(".tab-content.result pre").forEach((el) => el.classList.add("hidden"));
|
||||||
|
document.getElementById(`${tab}-code`).parentElement.classList.remove("hidden");
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// Handle copy to clipboard button clicks
|
||||||
|
document.querySelectorAll(".copy-btn").forEach((btn) => {
|
||||||
|
btn.addEventListener("click", () => {
|
||||||
|
const target = btn.dataset.target;
|
||||||
|
const code = document.getElementById(target).textContent;
|
||||||
|
navigator.clipboard.writeText(code).then(() => {
|
||||||
|
btn.textContent = "Copied!";
|
||||||
|
setTimeout(() => {
|
||||||
|
btn.textContent = "Copy";
|
||||||
|
}, 2000);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
});
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
425
pages/index_pooling.html
Normal file
425
pages/index_pooling.html
Normal file
@@ -0,0 +1,425 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8" />
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||||
|
<title>Crawl4AI</title>
|
||||||
|
|
||||||
|
<link rel="preconnect" href="https://fonts.googleapis.com" />
|
||||||
|
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
|
||||||
|
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@100..900&display=swap" rel="stylesheet" />
|
||||||
|
|
||||||
|
<link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet" />
|
||||||
|
<script src="https://cdn.jsdelivr.net/npm/axios/dist/axios.min.js"></script>
|
||||||
|
<link
|
||||||
|
rel="stylesheet"
|
||||||
|
href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/styles/vs2015.min.css"
|
||||||
|
/>
|
||||||
|
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/highlight.min.js"></script>
|
||||||
|
<style>
|
||||||
|
:root {
|
||||||
|
--ifm-font-size-base: 100%;
|
||||||
|
--ifm-line-height-base: 1.65;
|
||||||
|
--ifm-font-family-base: system-ui, -apple-system, Segoe UI, Roboto, Ubuntu, Cantarell, Noto Sans,
|
||||||
|
sans-serif, BlinkMacSystemFont, "Segoe UI", Helvetica, Arial, sans-serif, "Apple Color Emoji",
|
||||||
|
"Segoe UI Emoji", "Segoe UI Symbol";
|
||||||
|
}
|
||||||
|
html {
|
||||||
|
-webkit-font-smoothing: antialiased;
|
||||||
|
-webkit-text-size-adjust: 100%;
|
||||||
|
text-size-adjust: 100%;
|
||||||
|
font: var(--ifm-font-size-base) / var(--ifm-line-height-base) var(--ifm-font-family-base);
|
||||||
|
}
|
||||||
|
body {
|
||||||
|
background-color: #1a202c;
|
||||||
|
color: #fff;
|
||||||
|
}
|
||||||
|
.tab-content {
|
||||||
|
max-height: 400px;
|
||||||
|
overflow: auto;
|
||||||
|
}
|
||||||
|
pre {
|
||||||
|
white-space: pre-wrap;
|
||||||
|
font-size: 14px;
|
||||||
|
}
|
||||||
|
pre code {
|
||||||
|
width: 100%;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<header class="bg-gray-900 text-white py-4">
|
||||||
|
<div class="container mx-auto px-4">
|
||||||
|
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Scraper</h1>
|
||||||
|
</div>
|
||||||
|
</header>
|
||||||
|
|
||||||
|
<section class="try-it py-8 pb-20">
|
||||||
|
<div class="container mx-auto px-4">
|
||||||
|
<h2 class="text-2xl font-bold mb-4">Try It Now</h2>
|
||||||
|
<div class="mb-4 flex w-full gap-2">
|
||||||
|
<input
|
||||||
|
type="text"
|
||||||
|
id="url-input"
|
||||||
|
value="https://kidocode.com"
|
||||||
|
class="border border-gray-600 rounded px-4 py-2 flex-grow bg-gray-800 text-white"
|
||||||
|
placeholder="Enter URL(s) separated by commas"
|
||||||
|
/>
|
||||||
|
<select
|
||||||
|
id="provider-model-select"
|
||||||
|
class="border border-gray-600 rounded px-4 py-2 bg-gray-800 text-white"
|
||||||
|
>
|
||||||
|
<!-- Add your option values here -->
|
||||||
|
<option value="groq/llama3-70b-8192">groq/llama3-70b-8192</option>
|
||||||
|
<option value="groq/llama3-8b-8192">groq/llama3-8b-8192</option>
|
||||||
|
<option value="openai/gpt-4-turbo">gpt-4-turbo</option>
|
||||||
|
<option value="openai/gpt-3.5-turbo">gpt-3.5-turbo</option>
|
||||||
|
<option value="anthropic/claude-3-haiku-20240307">claude-3-haiku</option>
|
||||||
|
<option value="anthropic/claude-3-opus-20240229">claude-3-opus</option>
|
||||||
|
<option value="anthropic/claude-3-sonnet-20240229">claude-3-sonnet</option>
|
||||||
|
</select>
|
||||||
|
<input
|
||||||
|
type="password"
|
||||||
|
id="token-input"
|
||||||
|
class="border border-gray-600 rounded px-4 py-2 flex-grow bg-gray-800 text-white"
|
||||||
|
placeholder="Enter Groq API token"
|
||||||
|
/>
|
||||||
|
<div class="flex items-center justify-center">
|
||||||
|
<input type="checkbox" id="extract-blocks-checkbox" class="mr-2" checked>
|
||||||
|
<label for="extract-blocks-checkbox" class="text-white">Extract Blocks</label>
|
||||||
|
</div>
|
||||||
|
<button id="crawl-btn" class="bg-blue-600 text-white px-4 py-2 rounded">Crawl</button>
|
||||||
|
</div>
|
||||||
|
<div class="grid grid-cols-1 md:grid-cols-2 gap-8">
|
||||||
|
<div id="loading" class="hidden mt-4">
|
||||||
|
<p>Loading...</p>
|
||||||
|
</div>
|
||||||
|
<div id="result" class="tab-container flex-1 h-full flex-col">
|
||||||
|
<div class="tab-buttons flex gap-2">
|
||||||
|
<button class="tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="json">JSON</button>
|
||||||
|
<button class="tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="cleaned-html">
|
||||||
|
Cleaned HTML
|
||||||
|
</button>
|
||||||
|
<button class="tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="markdown">
|
||||||
|
Markdown
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
<div class="tab-content code bg-gray-800 p-2 rounded h-full flex-1 border border-gray-600">
|
||||||
|
<pre class="h-full flex"><code id="json-result" class="language-json "></code></pre>
|
||||||
|
<pre
|
||||||
|
class="hidden h-full flex"
|
||||||
|
><code id="cleaned-html-result" class="language-html "></code></pre>
|
||||||
|
<pre
|
||||||
|
class="hidden h-full flex"
|
||||||
|
><code id="markdown-result" class="language-markdown "></code></pre>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div id="code_help" class="tab-container flex-1 h-full">
|
||||||
|
<div class="tab-buttons flex gap-2">
|
||||||
|
<button class="code-tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="curl">cURL</button>
|
||||||
|
<button class="code-tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="python">
|
||||||
|
Python
|
||||||
|
</button>
|
||||||
|
<button class="code-tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="nodejs">
|
||||||
|
Node.js
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
<div class="tab-content result bg-gray-800 p-2 rounded h-full flex-1 border border-gray-600">
|
||||||
|
<pre class="h-full flex relative">
|
||||||
|
<code id="curl-code" class="language-bash"></code>
|
||||||
|
<button class="absolute top-2 right-2 bg-gray-700 text-white px-2 py-1 rounded copy-btn" data-target="curl-code">Copy</button>
|
||||||
|
</pre>
|
||||||
|
<pre class="hidden h-full flex relative">
|
||||||
|
<code id="python-code" class="language-python"></code>
|
||||||
|
<button class="absolute top-2 right-2 bg-gray-700 text-white px-2 py-1 rounded copy-btn" data-target="python-code">Copy</button>
|
||||||
|
</pre>
|
||||||
|
<pre class="hidden h-full flex relative">
|
||||||
|
<code id="nodejs-code" class="language-javascript"></code>
|
||||||
|
<button class="absolute top-2 right-2 bg-gray-700 text-white px-2 py-1 rounded copy-btn" data-target="nodejs-code">Copy</button>
|
||||||
|
</pre>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section class="hero bg-gray-900 py-8">
|
||||||
|
<div class="container mx-auto px-4">
|
||||||
|
<h2 class="text-3xl font-bold mb-4">🤔 Why building this?</h2>
|
||||||
|
<p class="text-lg mb-4">
|
||||||
|
In recent times, we've seen numerous startups emerging, riding the AI hype wave and charging for
services that should rightfully be accessible to everyone. 🌍💸 One example is scraping and crawling
a web page and transforming it into a form suitable for an LLM. We don't think one should build a
business out of this; it should definitely be open source. So if you possess the skills to build
such things and share this philosophy, you should join our "Robinhood" band and help set
these products free. 🆓🤝
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section class="installation py-8">
|
||||||
|
<div class="container mx-auto px-4">
|
||||||
|
<h2 class="text-2xl font-bold mb-4">⚙️ Installation</h2>
|
||||||
|
<p class="mb-4">
|
||||||
|
To install and run Crawl4AI locally or on your own service, the best way is to use Docker. 🐳 Follow
|
||||||
|
these steps:
|
||||||
|
</p>
|
||||||
|
<ol class="list-decimal list-inside mb-4">
|
||||||
|
<li>
|
||||||
|
Clone the GitHub repository: 📥
|
||||||
|
<code>git clone https://github.com/unclecode/crawl4ai.git</code>
|
||||||
|
</li>
|
||||||
|
<li>Navigate to the project directory: 📂 <code>cd crawl4ai</code></li>
|
||||||
|
<li>
|
||||||
|
Build the Docker image: 🛠️ <code>docker build -t crawl4ai .</code> On Mac, follow: 🍎
|
||||||
|
<code>docker build --platform linux/amd64 -t crawl4ai .</code>
|
||||||
|
</li>
|
||||||
|
<li>Run the Docker container: ▶️ <code>docker run -p 8000:80 crawl4ai</code></li>
|
||||||
|
</ol>
|
||||||
|
<p>
|
||||||
|
For more detailed instructions and advanced configuration options, please refer to the 📚
|
||||||
|
<a href="https://github.com/unclecode/crawl4ai" class="text-blue-400">GitHub repository</a>.
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<footer class="bg-gray-900 text-white py-4">
|
||||||
|
<div class="container mx-auto px-4">
|
||||||
|
<div class="flex justify-between items-center">
|
||||||
|
<p>© 2024 Crawl4AI. All rights reserved.</p>
|
||||||
|
<div class="social-links">
|
||||||
|
<a
|
||||||
|
href="https://github.com/unclecode/crawl4ai"
|
||||||
|
class="text-white hover:text-gray-300 mx-2"
|
||||||
|
target="_blank"
|
||||||
|
>😺 GitHub</a
|
||||||
|
>
|
||||||
|
<a
|
||||||
|
href="https://twitter.com/unclecode"
|
||||||
|
class="text-white hover:text-gray-300 mx-2"
|
||||||
|
target="_blank"
|
||||||
|
>🐦 Twitter</a
|
||||||
|
>
|
||||||
|
<a
|
||||||
|
href="https://discord.gg/your-invite-link"
|
||||||
|
class="text-white hover:text-gray-300 mx-2"
|
||||||
|
target="_blank"
|
||||||
|
>💬 Discord</a
|
||||||
|
>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</footer>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
// On page load, pre-fill the form with the last-used provider model and
// the API token previously saved for that model.
const savedModel = localStorage.getItem("provider_model");
const savedToken = localStorage.getItem(savedModel);

if (savedModel) {
  document.getElementById("provider-model-select").value = savedModel;
}
if (savedToken) {
  document.getElementById("token-input").value = savedToken;
}
|
||||||
|
|
||||||
|
// When the user switches provider model, swap the token field to whatever
// token (if any) was previously saved for that model; clear it otherwise.
document.getElementById("provider-model-select").addEventListener("change", () => {
  const model = document.getElementById("provider-model-select").value;
  const saved = localStorage.getItem(model);
  document.getElementById("token-input").value = saved ? saved : "";
});
|
||||||
|
|
||||||
|
// Load the running total of crawled pages from the backend and show it.
axios
  .get("/total-count")
  .then(({ data }) => {
    document.getElementById("total-count").textContent = data.count;
  })
  .catch((err) => console.error(err));
|
||||||
|
|
||||||
|
// Main "Crawl" button handler: validate the form, persist the user's
// provider/token choices, POST the crawl request, then render the results
// and refresh the copy-paste code examples.
document.getElementById("crawl-btn").addEventListener("click", () => {
  // Both URL(s) and an API token are required before submitting.
  if (!document.getElementById("url-input").value || !document.getElementById("token-input").value) {
    alert("Please enter both URL(s) and API token.");
    return;
  }

  const selectedProviderModel = document.getElementById("provider-model-select").value;
  const apiToken = document.getElementById("token-input").value;
  const extractBlocks = document.getElementById("extract-blocks-checkbox").checked;

  // Remember the selected model, and the token keyed by model name so that
  // switching models restores the matching token (see the change handler).
  localStorage.setItem("provider_model", selectedProviderModel);
  localStorage.setItem(selectedProviderModel, apiToken);

  // The URL field accepts a comma-separated list.
  const urlsInput = document.getElementById("url-input").value;
  const urls = urlsInput.split(",").map((url) => url.trim());
  const data = {
    urls: urls,
    provider_model: selectedProviderModel,
    api_token: apiToken,
    include_raw_html: true,
    forced: false,
    extract_blocks: extractBlocks,
  };

  // Legacy key kept for backward compatibility with older page code that
  // reads "api_token" directly.
  localStorage.setItem("api_token", apiToken);

  document.getElementById("loading").classList.remove("hidden");
  document.getElementById("result").classList.add("hidden");
  document.getElementById("code_help").classList.add("hidden");

  axios
    .post("/crawl", data)
    .then((response) => {
      const result = response.data.results[0];

      // parsed_json arrives as a JSON string; fall back to the raw text if
      // it is ever unparseable instead of aborting the whole handler.
      let prettyJson;
      try {
        prettyJson = JSON.stringify(JSON.parse(result.parsed_json), null, 2);
      } catch (e) {
        console.error("Could not parse parsed_json:", e);
        prettyJson = result.parsed_json;
      }
      document.getElementById("json-result").textContent = prettyJson;
      document.getElementById("cleaned-html-result").textContent = result.cleaned_html;
      document.getElementById("markdown-result").textContent = result.markdown;

      // Update the code examples with the request just made, masking the
      // real API token. (Previously this comment was duplicated.)
      const sampleData = { ...data, api_token: "your_api_token" };

      document.getElementById(
        "curl-code"
      ).textContent = `curl -X POST -H "Content-Type: application/json" -d '${JSON.stringify(sampleData)}' http://localhost:8000/crawl`;

      document.getElementById(
        "python-code"
      ).textContent = `import requests\n\ndata = ${JSON.stringify(
        sampleData,
        null,
        2
      )}\n\nresponse = requests.post("http://localhost:8000/crawl", json=data)\nprint(response.json())`;

      document.getElementById(
        "nodejs-code"
      ).textContent = `const axios = require('axios');\n\nconst data = ${JSON.stringify(
        sampleData,
        null,
        2
      )};\n\naxios.post("http://localhost:8000/crawl", data)\n    .then(response => console.log(response.data))\n    .catch(error => console.error(error));`;

      // Highlight code syntax in the refreshed examples.
      hljs.highlightAll();

      // Select the JSON tab by default.
      document.querySelector('.tab-btn[data-tab="json"]').click();

      document.getElementById("loading").classList.add("hidden");
      document.getElementById("result").classList.remove("hidden");
      document.getElementById("code_help").classList.remove("hidden");
    })
    .catch((error) => {
      console.error(error);
      document.getElementById("loading").classList.add("hidden");
      // Surface the failure instead of silently hiding the spinner.
      alert("Crawl request failed. See the browser console for details.");
    });
});
|
||||||
|
|
||||||
|
// Result-view tabs (JSON / cleaned HTML / markdown): highlight the clicked
// button and show only the matching *result* pane.
document.querySelectorAll(".tab-btn").forEach((btn) => {
  btn.addEventListener("click", () => {
    const tab = btn.dataset.tab;
    document
      .querySelectorAll(".tab-btn")
      .forEach((b) => b.classList.remove("bg-blue-600", "text-white"));
    btn.classList.add("bg-blue-600", "text-white");
    // BUG FIX: this handler previously hid ".tab-content.code pre" (the
    // code-example panes) while showing a "-result" pane, leaving other
    // result panes visible. Hide the result panes it actually switches.
    document.querySelectorAll(".tab-content.result pre").forEach((el) => el.classList.add("hidden"));
    document.getElementById(`${tab}-result`).parentElement.classList.remove("hidden");
  });
});
|
||||||
|
|
||||||
|
// Code-example tabs (curl / python / nodejs): highlight the clicked button
// and show only the matching *code* pane.
document.querySelectorAll(".code-tab-btn").forEach((btn) => {
  btn.addEventListener("click", () => {
    const tab = btn.dataset.tab;
    document
      .querySelectorAll(".code-tab-btn")
      .forEach((b) => b.classList.remove("bg-blue-600", "text-white"));
    btn.classList.add("bg-blue-600", "text-white");
    // BUG FIX: this handler previously hid ".tab-content.result pre" (the
    // result panes) while showing a "-code" pane — the mirror of the swap
    // in the .tab-btn handler. Hide the code panes it actually switches.
    document.querySelectorAll(".tab-content.code pre").forEach((el) => el.classList.add("hidden"));
    document.getElementById(`${tab}-code`).parentElement.classList.remove("hidden");
  });
});
|
||||||
|
|
||||||
|
// Copy-to-clipboard buttons: copy the target element's text and flash a
// brief "Copied!" confirmation on the button label.
document.querySelectorAll(".copy-btn").forEach((btn) => {
  btn.addEventListener("click", () => {
    const sourceEl = document.getElementById(btn.dataset.target);
    navigator.clipboard.writeText(sourceEl.textContent).then(() => {
      btn.textContent = "Copied!";
      setTimeout(() => {
        btn.textContent = "Copy";
      }, 2000);
    });
  });
});
|
||||||
|
|
||||||
|
|
||||||
|
// NOTE(review): this registers a SECOND "click" listener on #crawl-btn, on
// top of the handler registered earlier in this script. Both listeners fire
// on every click, so two POST /crawl requests are sent per click — and this
// one expects an async task-queue response ({task_id} + polling) while the
// earlier one expects the results inline. One of the two should be deleted;
// TODO: confirm which backend contract is current and remove the stale one.
// Also note this handler performs no URL/token validation, unlike the first.
document.getElementById("crawl-btn").addEventListener("click", () => {
  // Comma-separated URL list, same parsing as the first handler.
  const urlsInput = document.getElementById("url-input").value;
  const urls = urlsInput.split(",").map(url => url.trim());
  const apiToken = document.getElementById("token-input").value;
  const selectedProviderModel = document.getElementById("provider-model-select").value;
  const extractBlocks = document.getElementById("extract-blocks-checkbox").checked;

  // Request payload for POST /crawl.
  const data = {
    urls: urls,
    provider_model: selectedProviderModel,
    api_token: apiToken,
    include_raw_html: true,
    forced: false,
    extract_blocks: extractBlocks
  };

  // Persist the token for the next visit.
  localStorage.setItem("api_token", apiToken);

  // Switch the page into its loading state.
  document.getElementById("loading").classList.remove("hidden");
  document.getElementById("result").classList.add("hidden");
  document.getElementById("code_help").classList.add("hidden");

  // Async flow: the server replies with a task id which is then polled.
  axios.post("/crawl", data)
    .then(response => {
      const taskId = response.data.task_id;
      pollTaskStatus(taskId);
    })
    .catch(error => {
      console.error('Error during fetch:', error);
      document.getElementById("loading").classList.add("hidden");
    });
});
|
||||||
|
|
||||||
|
// Poll the backend for the status of an async crawl task until it either
// finishes, fails, or the retry budget runs out.
//
// @param {string} taskId  - id returned by POST /crawl
// @param {number} attempt - internal poll counter; callers omit it
//                           (added with a default, so existing callers
//                           `pollTaskStatus(taskId)` are unchanged)
function pollTaskStatus(taskId, attempt = 0) {
  // FIX: previously a task stuck in "pending" was polled forever, leaving
  // the loading spinner up indefinitely. Cap at 150 polls * 2s = 5 minutes.
  const MAX_POLL_ATTEMPTS = 150;
  axios.get(`/task/${taskId}`)
    .then(response => {
      const task = response.data;
      if (task.status === 'done') {
        displayResults(task.results[0]);
      } else if (task.status === 'pending') {
        if (attempt >= MAX_POLL_ATTEMPTS) {
          console.error('Task polling timed out:', taskId);
          document.getElementById("loading").classList.add("hidden");
          return;
        }
        setTimeout(() => pollTaskStatus(taskId, attempt + 1), 2000); // Poll every 2 seconds
      } else {
        // Any status other than done/pending is treated as a failure.
        console.error('Task failed:', task.error);
        document.getElementById("loading").classList.add("hidden");
      }
    })
    .catch(error => {
      console.error('Error polling task status:', error);
      document.getElementById("loading").classList.add("hidden");
    });
}
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
13
requirements.txt
Normal file
13
requirements.txt
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
fastapi
|
||||||
|
uvicorn
|
||||||
|
selenium
|
||||||
|
pydantic
|
||||||
|
aiohttp
|
||||||
|
aiosqlite
|
||||||
|
chromedriver_autoinstaller
|
||||||
|
httpx
|
||||||
|
requests
|
||||||
|
bs4
|
||||||
|
html2text
|
||||||
|
litellm
|
||||||
|
python-dotenv
|
||||||
30
setup.py
Normal file
30
setup.py
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
from setuptools import setup, find_packages

# Mirror the runtime dependencies from requirements.txt so pip installs the
# same packages; skip blank lines and comments, and close the file properly
# (the original leaked both file handles via bare open(...).read()).
with open("requirements.txt", encoding="utf-8") as f:
    requirements = [
        line.strip()
        for line in f
        if line.strip() and not line.strip().startswith("#")
    ]

# README is used as the long description rendered on PyPI.
with open("README.md", encoding="utf-8") as f:
    long_description = f.read()

setup(
    name="Crawl4AI",
    version="0.1.0",
    description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/unclecode/crawl4ai",
    author="Unclecode",
    author_email="unclecode@kidocode.com",
    # NOTE(review): license="MIT" disagrees with the "Apache Software
    # License" classifier below — confirm the intended license and make
    # the two declarations match.
    license="MIT",
    packages=find_packages(),
    install_requires=requirements,
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: Apache Software License",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
    ],
    python_requires=">=3.7",
)
|
||||||
31
test.py
Normal file
31
test.py
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
from crawler.web_crawler import WebCrawler
|
||||||
|
from crawler.models import UrlModel
|
||||||
|
from crawler.utils import get_content_of_website
|
||||||
|
import os
|
||||||
|
|
||||||
|
def main():
    """Smoke-test the WebCrawler: crawl a single page and dump the result."""
    # The crawler only needs a database path to be constructed.
    crawler = WebCrawler(db_path='crawler_data.db')

    # Crawl one page, forcing a fresh fetch (forced=True bypasses any cache).
    page = UrlModel(url='https://kidocode.com', forced=True)
    result = crawler.fetch_page(
        page,
        provider="openai/gpt-3.5-turbo",
        api_token=os.getenv('OPENAI_API_KEY'),
        extract_blocks_flag=True,
        word_count_threshold=5,
    )
    print(result.model_dump())

    # Example of crawling several pages in one call:
    # urls = [
    #     UrlModel(url='http://example.com', forced=False),
    #     UrlModel(url='http://example.org', forced=False),
    # ]
    # results = crawler.fetch_pages(urls, provider="openai/gpt-4-turbo", api_token=os.getenv('OPENAI_API_KEY'))
    # for res in results:
    #     print(res.model_copy())


if __name__ == '__main__':
    main()
|
||||||
Reference in New Issue
Block a user