Compare commits
454 Commits
new-releas
...
vr0.4.246
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
da1bc0f7bf | ||
|
|
aa4f92f458 | ||
|
|
a96e05d4ae | ||
|
|
4cb2a62551 | ||
|
|
5b4fad9e25 | ||
|
|
ea0ac25f38 | ||
|
|
7688aca7d6 | ||
|
|
a7215ad972 | ||
|
|
318554e6bf | ||
|
|
c64979b8dd | ||
|
|
bfe21b29d4 | ||
|
|
e9d9a6ffe8 | ||
|
|
5313c71a0d | ||
|
|
d36ef3d424 | ||
|
|
4a4f613238 | ||
|
|
dc6a24618e | ||
|
|
74a7c6dbb6 | ||
|
|
67f65f958b | ||
|
|
78b6ba5cef | ||
|
|
3f019d34cc | ||
|
|
304260e484 | ||
|
|
704bd66b63 | ||
|
|
1acc162c18 | ||
|
|
553c97a0c1 | ||
|
|
bd66befcf0 | ||
|
|
3e769a9c6c | ||
|
|
19b0a5ae82 | ||
|
|
bd71f7f4ea | ||
|
|
171ce25ba6 | ||
|
|
6c5a44f774 | ||
|
|
5c3c05bf93 | ||
|
|
67d0999bc3 | ||
|
|
553a4622bf | ||
|
|
6f81ef006d | ||
|
|
a04870a662 | ||
|
|
f7d26390c5 | ||
|
|
141783fb2d | ||
|
|
2fedd4876e | ||
|
|
e187b0aaf0 | ||
|
|
e95374d7c6 | ||
|
|
8f2d0cda2f | ||
|
|
9d261d2b9c | ||
|
|
7792fe0e4c | ||
|
|
86259244e4 | ||
|
|
0ec593fa90 | ||
|
|
7391d6be73 | ||
|
|
e4e23065f1 | ||
|
|
fb33a24891 | ||
|
|
78768fd714 | ||
|
|
f2d9912697 | ||
|
|
9a4ed6bbd7 | ||
|
|
d5ed451299 | ||
|
|
bacbeb3ed4 | ||
|
|
84b311760f | ||
|
|
8fbc2e0463 | ||
|
|
849765712f | ||
|
|
393bb911c0 | ||
|
|
4a5f1aebee | ||
|
|
a11d9646e3 | ||
|
|
ed7bc1909c | ||
|
|
e9e5b5642d | ||
|
|
7524aa7b5e | ||
|
|
7af1d32ef6 | ||
|
|
399af801a1 | ||
|
|
4a72c5ea6e | ||
|
|
20d6f5fdf4 | ||
|
|
3d69715dba | ||
|
|
de1766d565 | ||
|
|
0982c639ae | ||
|
|
5188b7a6a0 | ||
|
|
759164831d | ||
|
|
5431fa2d0c | ||
|
|
e130fd8db9 | ||
|
|
ded554d334 | ||
|
|
2d31915f0a | ||
|
|
ba3e808802 | ||
|
|
e3488da194 | ||
|
|
740214e021 | ||
|
|
c51e901f68 | ||
|
|
8c611dcb4b | ||
|
|
a45b8b1eb1 | ||
|
|
56f82f3e7f | ||
|
|
486db3a771 | ||
|
|
b02544bc0b | ||
|
|
e9639ad189 | ||
|
|
95a4f74d2a | ||
|
|
293f299c08 | ||
|
|
80d58ad24c | ||
|
|
3e83893b3f | ||
|
|
8c76a8c7dc | ||
|
|
0780db55e1 | ||
|
|
1ed7c15118 | ||
|
|
569bdb6073 | ||
|
|
1def53b7fe | ||
|
|
f9c98a377d | ||
|
|
93bf3e8a1f | ||
|
|
d202f3539b | ||
|
|
12e73d4898 | ||
|
|
449dd7cc0b | ||
|
|
b0419edda6 | ||
|
|
c0e87abaee | ||
|
|
c8485776fe | ||
|
|
aa3e2d0fe6 | ||
|
|
98c64f9d5f | ||
|
|
7d81c17cca | ||
|
|
652d396a81 | ||
|
|
1d83c493af | ||
|
|
cf35cbe59e | ||
|
|
9221c08418 | ||
|
|
48d43c14b1 | ||
|
|
776efa74a4 | ||
|
|
b14e83f499 | ||
|
|
a9b6b65238 | ||
|
|
a036b7f122 | ||
|
|
0bccf23db3 | ||
|
|
0cbd594512 | ||
|
|
efe93a5f57 | ||
|
|
3fda66b85b | ||
|
|
ddfb6707b4 | ||
|
|
a69f7a9531 | ||
|
|
d583aa43ca | ||
|
|
3abb573142 | ||
|
|
d556dada9f | ||
|
|
ce7d49484f | ||
|
|
e4acd18429 | ||
|
|
c2d4784810 | ||
|
|
76bea6c577 | ||
|
|
3ff0b0b2c4 | ||
|
|
a1c7dc17ce | ||
|
|
24723b2f10 | ||
|
|
f998e9e949 | ||
|
|
73661f7d1f | ||
|
|
b5d4db07d1 | ||
|
|
c6a022132b | ||
|
|
195c0ccf8a | ||
|
|
b09a86c0c1 | ||
|
|
de43505ae4 | ||
|
|
d7c5b900b8 | ||
|
|
edad7b6a74 | ||
|
|
829a1f7992 | ||
|
|
d729aa7d5e | ||
|
|
0d0cef3438 | ||
|
|
d7a112fefe | ||
|
|
a5decaa7cf | ||
|
|
8dea3f470f | ||
|
|
e02935dc5b | ||
|
|
24ad2fe2dd | ||
|
|
571dda6549 | ||
|
|
006bee4a5a | ||
|
|
dbb751c8f0 | ||
|
|
3439f7886d | ||
|
|
d418a04602 | ||
|
|
7047422e48 | ||
|
|
2bdec1fa5a | ||
|
|
b654c49e55 | ||
|
|
f2cb7d506d | ||
|
|
a6dad3fc6d | ||
|
|
fbcff85ecb | ||
|
|
788c67c29a | ||
|
|
2f19d38693 | ||
|
|
3aae30ed2a | ||
|
|
593c7ad307 | ||
|
|
73658c758a | ||
|
|
b6af94cbbb | ||
|
|
852729ff38 | ||
|
|
152ac35bc2 | ||
|
|
df63a40606 | ||
|
|
a59c107b23 | ||
|
|
f9fe6f89fe | ||
|
|
2a82455b3d | ||
|
|
3a524a3bdd | ||
|
|
3a66aa8a60 | ||
|
|
4b45b28f25 | ||
|
|
9139ef3125 | ||
|
|
6360d0545a | ||
|
|
1961adb530 | ||
|
|
79feab89c4 | ||
|
|
5d0b13294c | ||
|
|
67edc2d641 | ||
|
|
6b569cceb5 | ||
|
|
6f2fe5954f | ||
|
|
fca1319b7d | ||
|
|
f77f06a3bd | ||
|
|
e62c807295 | ||
|
|
90df6921b7 | ||
|
|
5098442086 | ||
|
|
d0014c6793 | ||
|
|
ae7ebc0bd8 | ||
|
|
1f269f9834 | ||
|
|
7f1ae5adcf | ||
|
|
3d00fee6c2 | ||
|
|
17913f5acf | ||
|
|
c38ac29edb | ||
|
|
38044d4afe | ||
|
|
61b93ebf36 | ||
|
|
bf91adf3f8 | ||
|
|
00026b5f8b | ||
|
|
8c22396d8b | ||
|
|
b6d6631b12 | ||
|
|
a098483cbb | ||
|
|
f9a297e08d | ||
|
|
bcdd80911f | ||
|
|
b120965b6a | ||
|
|
16f918621f | ||
|
|
f7574230a1 | ||
|
|
2879344d9c | ||
|
|
9f5eef1f38 | ||
|
|
c5aa1bec18 | ||
|
|
b51263664e | ||
|
|
1e7db0d293 | ||
|
|
2a54f3c048 | ||
|
|
1c20b815b3 | ||
|
|
43a2b26f63 | ||
|
|
3cf19a1bc2 | ||
|
|
67a23c3182 | ||
|
|
796dbaf08c | ||
|
|
3a3c88a2d0 | ||
|
|
870296fa7e | ||
|
|
a28046c233 | ||
|
|
0bba0e074f | ||
|
|
c4c6227962 | ||
|
|
e6c914d2fa | ||
|
|
be8f4fc59a | ||
|
|
fbdf870fbf | ||
|
|
7b0cca41b4 | ||
|
|
33d0e9ec8c | ||
|
|
42f1c67ca8 | ||
|
|
e28c49a8fe | ||
|
|
54d5a3a259 | ||
|
|
de6b43f334 | ||
|
|
07f508bd0c | ||
|
|
62a86dbe8d | ||
|
|
492ada0ed4 | ||
|
|
d8eef02867 | ||
|
|
6c7235d6a7 | ||
|
|
0a09d78fa5 | ||
|
|
19c3f3efb2 | ||
|
|
e97e8df6ba | ||
|
|
cb6f5323ae | ||
|
|
47464cedec | ||
|
|
982d203d91 | ||
|
|
9307c19f35 | ||
|
|
605a82793b | ||
|
|
df9ee44d42 | ||
|
|
e9f7d5e73a | ||
|
|
3529c2e732 | ||
|
|
d9e0b7abab | ||
|
|
b2800fefc6 | ||
|
|
d913e20edc | ||
|
|
c2a71a5abe | ||
|
|
d61615e0b0 | ||
|
|
ac9d83c72f | ||
|
|
ff9149b5c9 | ||
|
|
4239654722 | ||
|
|
38474bd66a | ||
|
|
bcfe83f702 | ||
|
|
32f57c49d6 | ||
|
|
60ba131ac8 | ||
|
|
a5f627ba1a | ||
|
|
04d16e6d2b | ||
|
|
1dd36f9035 | ||
|
|
6ec4cb33ca | ||
|
|
e7cd8a1c2d | ||
|
|
4e2852d5ff | ||
|
|
b309bc34e1 | ||
|
|
b8147b64e0 | ||
|
|
aab6ea022e | ||
|
|
dd17ed0e63 | ||
|
|
dbb587d681 | ||
|
|
768aa06ceb | ||
|
|
9ffa34b697 | ||
|
|
740802c491 | ||
|
|
b9ac96c332 | ||
|
|
d06535388a | ||
|
|
2b73bdf6b0 | ||
|
|
6aa803d712 | ||
|
|
320afdea64 | ||
|
|
ccbe72cfc1 | ||
|
|
b9bbd42373 | ||
|
|
68e9144ce3 | ||
|
|
9b2b267820 | ||
|
|
ff3524d9b1 | ||
|
|
b99d20b725 | ||
|
|
768b93140f | ||
|
|
4750810a67 | ||
|
|
e0e0db4247 | ||
|
|
bccadec887 | ||
|
|
0759503e50 | ||
|
|
7f1c020746 | ||
|
|
5d4e92db7d | ||
|
|
8b6e88c85c | ||
|
|
64190dd0c4 | ||
|
|
7100bcdf04 | ||
|
|
10cdad039d | ||
|
|
f1eee09cf4 | ||
|
|
4d48bd31ca | ||
|
|
d628bc4034 | ||
|
|
b179aa9b6f | ||
|
|
30807f5535 | ||
|
|
396f430022 | ||
|
|
eb131bebdf | ||
|
|
5c15837677 | ||
|
|
2fada16abb | ||
|
|
c37614cbc8 | ||
|
|
3116f95c1a | ||
|
|
b0e8b66666 | ||
|
|
3caf48c9be | ||
|
|
3c6ebb73ae | ||
|
|
0d9b638636 | ||
|
|
2ba70b9501 | ||
|
|
16f98cebc0 | ||
|
|
fe9ff498ce | ||
|
|
eba831ca30 | ||
|
|
dec3d44224 | ||
|
|
9ed1551125 | ||
|
|
e5e6a34e80 | ||
|
|
897e766728 | ||
|
|
9200a6731d | ||
|
|
61c166ab19 | ||
|
|
659c8cd953 | ||
|
|
9ee988753d | ||
|
|
8ae6c43ca4 | ||
|
|
b6713870ef | ||
|
|
40477493d3 | ||
|
|
efcf3ac6eb | ||
|
|
9e43f7beda | ||
|
|
aa9412e1b4 | ||
|
|
cf6c835e18 | ||
|
|
e5ecf291f3 | ||
|
|
9d0cafcfa6 | ||
|
|
7715623430 | ||
|
|
f5a4e80e2c | ||
|
|
8463aabedf | ||
|
|
7f30144ef2 | ||
|
|
fa5516aad6 | ||
|
|
ca0336af9e | ||
|
|
65ed1aeade | ||
|
|
4d283ab386 | ||
|
|
3ff2a0d0e7 | ||
|
|
3cd1b3719f | ||
|
|
9926eb9f95 | ||
|
|
3abaa82501 | ||
|
|
88d8cd8650 | ||
|
|
a08f21d66c | ||
|
|
d58286989c | ||
|
|
b58af3349c | ||
|
|
940df4631f | ||
|
|
685706e0aa | ||
|
|
7b0979e134 | ||
|
|
61ae2de841 | ||
|
|
5b28eed2c0 | ||
|
|
f8a11779fe | ||
|
|
d11a83c232 | ||
|
|
3255c7a3fa | ||
|
|
4756d0a532 | ||
|
|
7ba2142363 | ||
|
|
96d1eb0d0d | ||
|
|
144cfa0eda | ||
|
|
a0dff192ae | ||
|
|
1fffeeedd2 | ||
|
|
f51b078042 | ||
|
|
b6023a51fb | ||
|
|
78cfad8b2f | ||
|
|
68b3dff74a | ||
|
|
bfc4abd6e8 | ||
|
|
8c77a760fc | ||
|
|
b9bf8ac9d7 | ||
|
|
d6182bedd7 | ||
|
|
2217904876 | ||
|
|
2c2362b4d3 | ||
|
|
612ed3fef2 | ||
|
|
fb2a6d0d04 | ||
|
|
19d3d39115 | ||
|
|
c1413e6916 | ||
|
|
e7705e661a | ||
|
|
21b110bfd7 | ||
|
|
1fcb573909 | ||
|
|
0f6c5f5453 | ||
|
|
350ca1511b | ||
|
|
539263a8ba | ||
|
|
3f0e265baf | ||
|
|
21e2538e57 | ||
|
|
480902bd66 | ||
|
|
853b9d59d8 | ||
|
|
6d04284c44 | ||
|
|
4a50781453 | ||
|
|
18561c55ce | ||
|
|
77da48050d | ||
|
|
9a97aacd85 | ||
|
|
52daf3936a | ||
|
|
2f246d19f4 | ||
|
|
413595542a | ||
|
|
42a5da854d | ||
|
|
d1d83a6ef7 | ||
|
|
194050705d | ||
|
|
989f8c91c8 | ||
|
|
edba5fb5e9 | ||
|
|
faa1defa5c | ||
|
|
f7e0cee1b0 | ||
|
|
b3a0edaa6d | ||
|
|
9c34b30723 | ||
|
|
36a5847df5 | ||
|
|
a19379aa58 | ||
|
|
768d048e1c | ||
|
|
94c11a0262 | ||
|
|
649b0bfd02 | ||
|
|
57a00ec677 | ||
|
|
aeb2114170 | ||
|
|
b8d405fddd | ||
|
|
b32013cb97 | ||
|
|
226a62a3c0 | ||
|
|
8e73a482a2 | ||
|
|
0533aeb814 | ||
|
|
aead6de888 | ||
|
|
8d82fd4cfe | ||
|
|
8f44db6499 | ||
|
|
c7553b1280 | ||
|
|
8b8683f22e | ||
|
|
774ace6e3b | ||
|
|
4a8f91a0fc | ||
|
|
18c9784b61 | ||
|
|
e5d401c67c | ||
|
|
ae77589a98 | ||
|
|
ad373c0e19 | ||
|
|
51f26d12fe | ||
|
|
f1b60b2016 | ||
|
|
8c2dc2b1e4 | ||
|
|
dc9a44c12a | ||
|
|
d9753b6349 | ||
|
|
a554c0b143 | ||
|
|
7381fa95e6 | ||
|
|
53d1176d53 | ||
|
|
52c4be0696 | ||
|
|
13a3b21d19 | ||
|
|
5cee084340 | ||
|
|
bf00c26a83 | ||
|
|
3846648c12 | ||
|
|
eb6423875f | ||
|
|
e3524a10a7 | ||
|
|
468dad6169 | ||
|
|
bc27982992 | ||
|
|
57e5decb55 | ||
|
|
b6319c6f6e | ||
|
|
0a902f562f | ||
|
|
454135856e | ||
|
|
33fddc27ad | ||
|
|
ce052a4eb5 | ||
|
|
b43d77a56b | ||
|
|
1635a92218 | ||
|
|
2a8a1b27e1 | ||
|
|
f5f3cce2c8 | ||
|
|
a085e6315b | ||
|
|
a8d600a3b4 | ||
|
|
4a2e17447b |
220
.codeiumignore
Normal file
@@ -0,0 +1,220 @@
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
cover/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
# For a library or package, you might want to ignore these files since the code is
|
||||
# intended to run in multiple environments; otherwise, check them in:
|
||||
# .python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# poetry
|
||||
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||
#poetry.lock
|
||||
|
||||
# pdm
|
||||
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||
#pdm.lock
|
||||
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||
# in version control.
|
||||
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
||||
.pdm.toml
|
||||
.pdm-python
|
||||
.pdm-build/
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# pytype static type analyzer
|
||||
.pytype/
|
||||
|
||||
# Cython debug symbols
|
||||
cython_debug/
|
||||
|
||||
# PyCharm
|
||||
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
#.idea/
|
||||
|
||||
Crawl4AI.egg-info/
|
||||
Crawl4AI.egg-info/*
|
||||
crawler_data.db
|
||||
.vscode/
|
||||
.tests/
|
||||
.test_pads/
|
||||
test_pad.py
|
||||
test_pad*.py
|
||||
.data/
|
||||
Crawl4AI.egg-info/
|
||||
|
||||
requirements0.txt
|
||||
a.txt
|
||||
|
||||
*.sh
|
||||
.idea
|
||||
docs/examples/.chainlit/
|
||||
docs/examples/.chainlit/*
|
||||
.chainlit/config.toml
|
||||
.chainlit/translations/en-US.json
|
||||
|
||||
local/
|
||||
.files/
|
||||
|
||||
a.txt
|
||||
.lambda_function.py
|
||||
ec2*
|
||||
|
||||
update_changelog.sh
|
||||
|
||||
.DS_Store
|
||||
docs/.DS_Store
|
||||
tmp/
|
||||
test_env/
|
||||
**/.DS_Store
|
||||
**/.DS_Store
|
||||
|
||||
todo.md
|
||||
todo_executor.md
|
||||
git_changes.py
|
||||
git_changes.md
|
||||
pypi_build.sh
|
||||
git_issues.py
|
||||
git_issues.md
|
||||
|
||||
.next/
|
||||
.tests/
|
||||
.docs/
|
||||
.gitboss/
|
||||
todo_executor.md
|
||||
protect-all-except-feature.sh
|
||||
manage-collab.sh
|
||||
publish.sh
|
||||
combine.sh
|
||||
combined_output.txt
|
||||
tree.md
|
||||
|
||||
53
.gitignore
vendored
@@ -165,6 +165,8 @@ Crawl4AI.egg-info/
|
||||
Crawl4AI.egg-info/*
|
||||
crawler_data.db
|
||||
.vscode/
|
||||
.tests/
|
||||
.test_pads/
|
||||
test_pad.py
|
||||
test_pad*.py
|
||||
.data/
|
||||
@@ -172,3 +174,54 @@ Crawl4AI.egg-info/
|
||||
|
||||
requirements0.txt
|
||||
a.txt
|
||||
|
||||
*.sh
|
||||
.idea
|
||||
docs/examples/.chainlit/
|
||||
docs/examples/.chainlit/*
|
||||
.chainlit/config.toml
|
||||
.chainlit/translations/en-US.json
|
||||
|
||||
local/
|
||||
.files/
|
||||
|
||||
a.txt
|
||||
.lambda_function.py
|
||||
ec2*
|
||||
|
||||
update_changelog.sh
|
||||
|
||||
.DS_Store
|
||||
docs/.DS_Store
|
||||
tmp/
|
||||
test_env/
|
||||
**/.DS_Store
|
||||
**/.DS_Store
|
||||
|
||||
todo.md
|
||||
todo_executor.md
|
||||
git_changes.py
|
||||
git_changes.md
|
||||
pypi_build.sh
|
||||
git_issues.py
|
||||
git_issues.md
|
||||
|
||||
.next/
|
||||
.tests/
|
||||
# .issues/
|
||||
.docs/
|
||||
.issues/
|
||||
.gitboss/
|
||||
todo_executor.md
|
||||
protect-all-except-feature.sh
|
||||
manage-collab.sh
|
||||
publish.sh
|
||||
combine.sh
|
||||
combined_output.txt
|
||||
.local
|
||||
.scripts
|
||||
tree.md
|
||||
tree.md
|
||||
.scripts
|
||||
.local
|
||||
.do
|
||||
|
||||
1051
CHANGELOG.md
42
CONTRIBUTORS.md
Normal file
@@ -0,0 +1,42 @@
|
||||
# Contributors to Crawl4AI
|
||||
|
||||
We would like to thank the following people for their contributions to Crawl4AI:
|
||||
|
||||
## Core Team
|
||||
|
||||
- [Unclecode](https://github.com/unclecode) - Project Creator and Main Developer
|
||||
- [Nasrin](https://github.com/ntohidi) - Project Manager and Developer
|
||||
- [Aravind Karnam](https://github.com/aravindkarnam) - Developer
|
||||
|
||||
## Community Contributors
|
||||
|
||||
- [aadityakanjolia4](https://github.com/aadityakanjolia4) - Fix for the "`CustomHTML2Text` is not defined" error.
|
||||
- [FractalMind](https://github.com/FractalMind) - Created the first official Docker Hub image and fixed Dockerfile errors
|
||||
- [ketonkss4](https://github.com/ketonkss4) - Identified Selenium's new capabilities, helping reduce dependencies
|
||||
- [jonymusky](https://github.com/jonymusky) - JavaScript execution documentation and `wait_for`
|
||||
- [datehoer](https://github.com/datehoer) - Add browser proxy support
|
||||
|
||||
## Pull Requests
|
||||
|
||||
- [dvschuyl](https://github.com/dvschuyl) - AsyncPlaywrightCrawlerStrategy page-evaluate context destroyed by navigation [#304](https://github.com/unclecode/crawl4ai/pull/304)
|
||||
- [nelzomal](https://github.com/nelzomal) - Enhance development installation instructions [#286](https://github.com/unclecode/crawl4ai/pull/286)
|
||||
- [HamzaFarhan](https://github.com/HamzaFarhan) - Handled the cases where markdown_with_citations, references_markdown, and filtered_html might not be defined [#293](https://github.com/unclecode/crawl4ai/pull/293)
|
||||
- [NanmiCoder](https://github.com/NanmiCoder) - fix: crawler strategy exception handling and fixes [#271](https://github.com/unclecode/crawl4ai/pull/271)
|
||||
- [paulokuong](https://github.com/paulokuong) - fix: CRAWL4_AI_BASE_DIRECTORY should be Path object instead of string [#298](https://github.com/unclecode/crawl4ai/pull/298)
|
||||
|
||||
|
||||
## Other Contributors
|
||||
|
||||
- [Gokhan](https://github.com/gkhngyk)
|
||||
- [Shiv Kumar](https://github.com/shivkumar0757)
|
||||
- [QIN2DIM](https://github.com/QIN2DIM)
|
||||
|
||||
## Acknowledgements
|
||||
|
||||
We also want to thank all the users who have reported bugs, suggested features, or helped in any other way to make Crawl4AI better.
|
||||
|
||||
---
|
||||
|
||||
If you've contributed to Crawl4AI and your name isn't on this list, please [open a pull request](https://github.com/unclecode/crawl4ai/pulls) with your name, link, and contribution, and we'll review it promptly.
|
||||
|
||||
Thank you all for your contributions!
|
||||
154
Dockerfile
@@ -1,40 +1,136 @@
|
||||
# Use an official Python runtime as a parent image
|
||||
FROM python:3.10-slim
|
||||
# syntax=docker/dockerfile:1.4
|
||||
|
||||
# Set the working directory in the container
|
||||
WORKDIR /usr/src/app
|
||||
ARG TARGETPLATFORM
|
||||
ARG BUILDPLATFORM
|
||||
|
||||
# Copy the current directory contents into the container at /usr/src/app
|
||||
COPY . .
|
||||
# Other build arguments
|
||||
ARG PYTHON_VERSION=3.10
|
||||
|
||||
# Install any needed packages specified in requirements.txt
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
# Base stage with system dependencies
|
||||
FROM python:${PYTHON_VERSION}-slim as base
|
||||
|
||||
# Install dependencies for Chrome and ChromeDriver
|
||||
# Declare ARG variables again within the build stage
|
||||
ARG INSTALL_TYPE=all
|
||||
ARG ENABLE_GPU=false
|
||||
|
||||
# Platform-specific labels
|
||||
LABEL maintainer="unclecode"
|
||||
LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
|
||||
LABEL version="1.0"
|
||||
|
||||
# Environment setup
|
||||
ENV PYTHONUNBUFFERED=1 \
|
||||
PYTHONDONTWRITEBYTECODE=1 \
|
||||
PIP_NO_CACHE_DIR=1 \
|
||||
PIP_DISABLE_PIP_VERSION_CHECK=1 \
|
||||
PIP_DEFAULT_TIMEOUT=100 \
|
||||
DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
# Install system dependencies
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
wget \
|
||||
xvfb \
|
||||
unzip \
|
||||
build-essential \
|
||||
curl \
|
||||
gnupg2 \
|
||||
ca-certificates \
|
||||
apt-transport-https \
|
||||
software-properties-common \
|
||||
&& wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
|
||||
&& echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list \
|
||||
&& apt-get update \
|
||||
&& apt-get install -y google-chrome-stable \
|
||||
wget \
|
||||
gnupg \
|
||||
git \
|
||||
cmake \
|
||||
pkg-config \
|
||||
python3-dev \
|
||||
libjpeg-dev \
|
||||
libpng-dev \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Set display port and dbus env to avoid hanging
|
||||
ENV DISPLAY=:99
|
||||
ENV DBUS_SESSION_BUS_ADDRESS=/dev/null
|
||||
# Playwright system dependencies for Linux
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
libglib2.0-0 \
|
||||
libnss3 \
|
||||
libnspr4 \
|
||||
libatk1.0-0 \
|
||||
libatk-bridge2.0-0 \
|
||||
libcups2 \
|
||||
libdrm2 \
|
||||
libdbus-1-3 \
|
||||
libxcb1 \
|
||||
libxkbcommon0 \
|
||||
libx11-6 \
|
||||
libxcomposite1 \
|
||||
libxdamage1 \
|
||||
libxext6 \
|
||||
libxfixes3 \
|
||||
libxrandr2 \
|
||||
libgbm1 \
|
||||
libpango-1.0-0 \
|
||||
libcairo2 \
|
||||
libasound2 \
|
||||
libatspi2.0-0 \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Make port 80 available to the world outside this container
|
||||
EXPOSE 80
|
||||
# GPU support if enabled and architecture is supported
|
||||
RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETPLATFORM" = "linux/amd64" ] ; then \
|
||||
apt-get update && apt-get install -y --no-install-recommends \
|
||||
nvidia-cuda-toolkit \
|
||||
&& rm -rf /var/lib/apt/lists/* ; \
|
||||
else \
|
||||
echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \
|
||||
fi
|
||||
|
||||
# Define environment variable
|
||||
ENV PYTHONUNBUFFERED 1
|
||||
# Create and set working directory
|
||||
WORKDIR /app
|
||||
|
||||
# Run uvicorn
|
||||
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
|
||||
# Copy the entire project
|
||||
COPY . .
|
||||
|
||||
# Install base requirements
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
# Install required library for FastAPI
|
||||
RUN pip install fastapi uvicorn psutil
|
||||
|
||||
# Install ML dependencies first for better layer caching
|
||||
RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
|
||||
pip install --no-cache-dir \
|
||||
torch \
|
||||
torchvision \
|
||||
torchaudio \
|
||||
scikit-learn \
|
||||
nltk \
|
||||
transformers \
|
||||
tokenizers && \
|
||||
python -m nltk.downloader punkt stopwords ; \
|
||||
fi
|
||||
|
||||
# Install the package
|
||||
RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
|
||||
pip install ".[all]" && \
|
||||
python -m crawl4ai.model_loader ; \
|
||||
elif [ "$INSTALL_TYPE" = "torch" ] ; then \
|
||||
pip install ".[torch]" ; \
|
||||
elif [ "$INSTALL_TYPE" = "transformer" ] ; then \
|
||||
pip install ".[transformer]" && \
|
||||
python -m crawl4ai.model_loader ; \
|
||||
else \
|
||||
pip install "." ; \
|
||||
fi
|
||||
|
||||
# Install MkDocs and required plugins
|
||||
RUN pip install --no-cache-dir \
|
||||
mkdocs \
|
||||
mkdocs-material \
|
||||
mkdocs-terminal \
|
||||
pymdown-extensions
|
||||
|
||||
# Build MkDocs documentation
|
||||
RUN mkdocs build
|
||||
|
||||
# Install Playwright and browsers
|
||||
RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
|
||||
playwright install chromium; \
|
||||
elif [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
|
||||
playwright install chromium; \
|
||||
fi
|
||||
|
||||
# Expose port
|
||||
EXPOSE 8000 11235 9222 8080
|
||||
|
||||
# Start the FastAPI server
|
||||
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "11235"]
|
||||
2
MANIFEST.in
Normal file
@@ -0,0 +1,2 @@
|
||||
include requirements.txt
|
||||
recursive-include crawl4ai/js_snippet *.js
|
||||
46
MISSION.md
Normal file
@@ -0,0 +1,46 @@
|
||||
# Mission
|
||||
|
||||

|
||||
|
||||
### 1. The Data Capitalization Opportunity
|
||||
|
||||
We live in an unprecedented era of digital wealth creation. Every day, individuals and enterprises generate massive amounts of valuable digital footprints across various platforms, social media channels, messenger apps, and cloud services. While people can interact with their data within these platforms, there's an immense untapped opportunity to transform this data into true capital assets. Just as physical property became a foundational element of wealth creation, personal and enterprise data has the potential to become a new form of capital on balance sheets.
|
||||
|
||||
For individuals, this represents an opportunity to transform their digital activities into valuable assets. For enterprises, their internal communications, team discussions, and collaborative documents contain rich insights that could be structured and valued as intellectual capital. This wealth of information represents an unprecedented opportunity for value creation in the digital age.
|
||||
|
||||
### 2. The Potential of Authentic Data
|
||||
|
||||
While synthetic data has played a crucial role in AI development, there's an enormous untapped potential in the authentic data generated by individuals and organizations. Every message, document, and interaction contains unique insights and patterns that could enhance AI development. The challenge isn't a lack of data - it's that most authentic human-generated data remains inaccessible for productive use.
|
||||
|
||||
By enabling willing participation in data sharing, we can unlock this vast reservoir of authentic human knowledge. This represents an opportunity to enhance AI development with diverse, real-world data that reflects the full spectrum of human experience and knowledge.
|
||||
|
||||
## Our Pathway to Data Democracy
|
||||
|
||||
### 1. Open-Source Foundation
|
||||
|
||||
Our first step is creating an open-source data extraction engine that empowers developers and innovators to build tools for data structuring and organization. This foundation ensures transparency, security, and community-driven development. By making these tools openly available, we enable the technical infrastructure needed for true data ownership and capitalization.
|
||||
|
||||
### 2. Data Capitalization Platform
|
||||
|
||||
Building on this open-source foundation, we're developing a platform that helps individuals and enterprises transform their digital footprints into structured, valuable assets. This platform will provide the tools and frameworks needed to organize, understand, and value personal and organizational data as true capital assets.
|
||||
|
||||
### 3. Creating a Data Marketplace
|
||||
|
||||
The final piece is establishing a marketplace where individuals and organizations can willingly share their data assets. This creates opportunities for:
|
||||
- Individuals to earn equity, revenue, or other forms of value from their data
|
||||
- Enterprises to access diverse, high-quality data for AI development
|
||||
- Researchers to work with authentic human-generated data
|
||||
- Startups to build innovative solutions using real-world data
|
||||
|
||||
## Economic Vision: A Shared Data Economy
|
||||
|
||||
We envision a future where data becomes a fundamental asset class in a thriving shared economy. This transformation will democratize AI development by enabling willing participation in data sharing, ensuring that the benefits of AI advancement flow back to data creators. Just as property rights revolutionized economic systems, establishing data as a capital asset will create new opportunities for wealth creation and economic participation.
|
||||
|
||||
This shared data economy will:
|
||||
- Enable individuals to capitalize on their digital footprints
|
||||
- Create new revenue streams for data creators
|
||||
- Provide AI developers with access to diverse, authentic data
|
||||
- Foster innovation through broader access to real-world data
|
||||
- Ensure more equitable distribution of AI's economic benefits
|
||||
|
||||
Our vision is to facilitate this transformation from the ground up - starting with open-source tools, progressing to data capitalization platforms, and ultimately creating a thriving marketplace where data becomes a true asset class in a shared economy. This approach ensures that the future of AI is built on a foundation of authentic human knowledge, with benefits flowing back to the individuals and organizations who create and share their valuable data.
|
||||
902
README.md
@@ -1,506 +1,556 @@
|
||||
# Crawl4AI 🕷️🤖
|
||||
# 🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & Scraper.
|
||||
|
||||
<div align="center">
|
||||
|
||||
<a href="https://trendshift.io/repositories/11716" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11716" alt="unclecode%2Fcrawl4ai | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
||||
|
||||
[](https://github.com/unclecode/crawl4ai/stargazers)
|
||||
[](https://github.com/unclecode/crawl4ai/network/members)
|
||||
[](https://github.com/unclecode/crawl4ai/issues)
|
||||
[](https://github.com/unclecode/crawl4ai/pulls)
|
||||
|
||||
[](https://badge.fury.io/py/crawl4ai)
|
||||
[](https://pypi.org/project/crawl4ai/)
|
||||
[](https://pepy.tech/project/crawl4ai)
|
||||
|
||||
<!-- [](https://crawl4ai.readthedocs.io/) -->
|
||||
[](https://github.com/unclecode/crawl4ai/blob/main/LICENSE)
|
||||
[](https://github.com/psf/black)
|
||||
[](https://github.com/PyCQA/bandit)
|
||||
|
||||
Crawl4AI has one clear task: to simplify crawling and extract useful information from web pages, making it accessible for large language models (LLMs) and AI applications. 🆓🌐
|
||||
</div>
|
||||
|
||||
[](https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk)
|
||||
Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease.
|
||||
|
||||
## Recent Changes
|
||||
[✨ Check out latest update v0.4.24x](#-recent-updates)
|
||||
|
||||
- 🚀 10x faster!!
|
||||
- 📜 Execute custom JavaScript before crawling!
|
||||
- 🤝 Colab friendly!
|
||||
- 📚 Chunking strategies: topic-based, regex, sentence, and more!
|
||||
- 🧠 Extraction strategies: cosine clustering, LLM, and more!
|
||||
- 🎯 CSS selector support
|
||||
- 📝 Pass instructions/keywords to refine extraction
|
||||
🎉 **Version 0.4.24x is out!** Major improvements in extraction strategies with enhanced JSON handling, SSL security, and Amazon product extraction. Plus, a completely revamped content filtering system! [Read the release notes →](https://crawl4ai.com/mkdocs/blog)
|
||||
|
||||
## Power and Simplicity of Crawl4AI 🚀
|
||||
## 🧐 Why Crawl4AI?
|
||||
|
||||
To see how simple it is, take a look at the first example:
|
||||
1. **Built for LLMs**: Creates smart, concise Markdown optimized for RAG and fine-tuning applications.
|
||||
2. **Lightning Fast**: Delivers results 6x faster with real-time, cost-efficient performance.
|
||||
3. **Flexible Browser Control**: Offers session management, proxies, and custom hooks for seamless data access.
|
||||
4. **Heuristic Intelligence**: Uses advanced algorithms for efficient extraction, reducing reliance on costly models.
|
||||
5. **Open Source & Deployable**: Fully open-source with no API keys—ready for Docker and cloud integration.
|
||||
6. **Thriving Community**: Actively maintained by a vibrant community and the #1 trending GitHub repository.
|
||||
|
||||
```python
|
||||
from crawl4ai import WebCrawler
|
||||
## 🚀 Quick Start
|
||||
|
||||
# Create the WebCrawler instance
|
||||
crawler = WebCrawler()
|
||||
1. Install Crawl4AI:
|
||||
```bash
|
||||
# Install the package
|
||||
pip install -U crawl4ai
|
||||
|
||||
# Run the crawler with keyword filtering and CSS selector
|
||||
result = crawler.run(url="https://www.nbcnews.com/business")
|
||||
print(result) # {url, html, markdown, extracted_content, metadata}
|
||||
# Run post-installation setup
|
||||
crawl4ai-setup
|
||||
|
||||
# Verify your installation
|
||||
crawl4ai-doctor
|
||||
```
|
||||
|
||||
Now let's try a complex task. Below is an example of how you can execute JavaScript, filter data using keywords, and use a CSS selector to extract specific content—all in one go!
|
||||
|
||||
1. Instantiate a WebCrawler object.
|
||||
2. Execute custom JavaScript to click a "Load More" button.
|
||||
3. Extract semantic chunks of content and filter the data to include only content related to technology.
|
||||
4. Use a CSS selector to extract only paragraphs (`<p>` tags).
|
||||
|
||||
```python
|
||||
# Import necessary modules
|
||||
from crawl4ai import WebCrawler
|
||||
from crawl4ai.chunking_strategy import *
|
||||
from crawl4ai.extraction_strategy import *
|
||||
from crawl4ai.crawler_strategy import *
|
||||
|
||||
# Define the JavaScript code to click the "Load More" button
|
||||
js_code = """
|
||||
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
|
||||
loadMoreButton && loadMoreButton.click();
|
||||
"""
|
||||
|
||||
# Define the crawling strategy
|
||||
crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
|
||||
|
||||
# Create the WebCrawler instance with the defined strategy
|
||||
crawler = WebCrawler(crawler_strategy=crawler_strategy)
|
||||
|
||||
# Run the crawler with keyword filtering and CSS selector
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=CosineStrategy(
|
||||
semantic_filter="technology",
|
||||
),
|
||||
)
|
||||
|
||||
# Run the crawler with LLM extraction strategy
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
provider="openai/gpt-4o",
|
||||
api_token=os.getenv('OPENAI_API_KEY'),
|
||||
instruction="Extract only content related to technology"
|
||||
),
|
||||
css_selector="p"
|
||||
)
|
||||
|
||||
# Display the extracted result
|
||||
print(result)
|
||||
If you encounter any browser-related issues, you can install them manually:
|
||||
```bash
|
||||
python -m playwright install --with-deps chromium
|
||||
```
|
||||
|
||||
With Crawl4AI, you can perform advanced web crawling and data extraction tasks with just a few lines of code. This example demonstrates how you can harness the power of Crawl4AI to simplify your workflow and get the data you need efficiently.
|
||||
2. Run a simple web crawl:
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import *
|
||||
|
||||
async def main():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://www.nbcnews.com/business",
|
||||
)
|
||||
print(result.markdown)
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
## ✨ Features
|
||||
|
||||
<details>
|
||||
<summary>📝 <strong>Markdown Generation</strong></summary>
|
||||
|
||||
- 🧹 **Clean Markdown**: Generates clean, structured Markdown with accurate formatting.
|
||||
- 🎯 **Fit Markdown**: Heuristic-based filtering to remove noise and irrelevant parts for AI-friendly processing.
|
||||
- 🔗 **Citations and References**: Converts page links into a numbered reference list with clean citations.
|
||||
- 🛠️ **Custom Strategies**: Users can create their own Markdown generation strategies tailored to specific needs.
|
||||
- 📚 **BM25 Algorithm**: Employs BM25-based filtering for extracting core information and removing irrelevant content.
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>📊 <strong>Structured Data Extraction</strong></summary>
|
||||
|
||||
- 🤖 **LLM-Driven Extraction**: Supports all LLMs (open-source and proprietary) for structured data extraction.
|
||||
- 🧱 **Chunking Strategies**: Implements chunking (topic-based, regex, sentence-level) for targeted content processing.
|
||||
- 🌌 **Cosine Similarity**: Find relevant content chunks based on user queries for semantic extraction.
|
||||
- 🔎 **CSS-Based Extraction**: Fast schema-based data extraction using XPath and CSS selectors.
|
||||
- 🔧 **Schema Definition**: Define custom schemas for extracting structured JSON from repetitive patterns.
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>🌐 <strong>Browser Integration</strong></summary>
|
||||
|
||||
- 🖥️ **Managed Browser**: Use user-owned browsers with full control, avoiding bot detection.
|
||||
- 🔄 **Remote Browser Control**: Connect to Chrome Developer Tools Protocol for remote, large-scale data extraction.
|
||||
- 🔒 **Session Management**: Preserve browser states and reuse them for multi-step crawling.
|
||||
- 🧩 **Proxy Support**: Seamlessly connect to proxies with authentication for secure access.
|
||||
- ⚙️ **Full Browser Control**: Modify headers, cookies, user agents, and more for tailored crawling setups.
|
||||
- 🌍 **Multi-Browser Support**: Compatible with Chromium, Firefox, and WebKit.
|
||||
- 📐 **Dynamic Viewport Adjustment**: Automatically adjusts the browser viewport to match page content, ensuring complete rendering and capturing of all elements.
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>🔎 <strong>Crawling & Scraping</strong></summary>
|
||||
|
||||
- 🖼️ **Media Support**: Extract images, audio, videos, and responsive image formats like `srcset` and `picture`.
|
||||
- 🚀 **Dynamic Crawling**: Execute JS and wait for async or sync for dynamic content extraction.
|
||||
- 📸 **Screenshots**: Capture page screenshots during crawling for debugging or analysis.
|
||||
- 📂 **Raw Data Crawling**: Directly process raw HTML (`raw:`) or local files (`file://`).
|
||||
- 🔗 **Comprehensive Link Extraction**: Extracts internal, external links, and embedded iframe content.
|
||||
- 🛠️ **Customizable Hooks**: Define hooks at every step to customize crawling behavior.
|
||||
- 💾 **Caching**: Cache data for improved speed and to avoid redundant fetches.
|
||||
- 📄 **Metadata Extraction**: Retrieve structured metadata from web pages.
|
||||
- 📡 **IFrame Content Extraction**: Seamless extraction from embedded iframe content.
|
||||
- 🕵️ **Lazy Load Handling**: Waits for images to fully load, ensuring no content is missed due to lazy loading.
|
||||
- 🔄 **Full-Page Scanning**: Simulates scrolling to load and capture all dynamic content, perfect for infinite scroll pages.
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>🚀 <strong>Deployment</strong></summary>
|
||||
|
||||
- 🐳 **Dockerized Setup**: Optimized Docker image with API server for easy deployment.
|
||||
- 🔄 **API Gateway**: One-click deployment with secure token authentication for API-based workflows.
|
||||
- 🌐 **Scalable Architecture**: Designed for mass-scale production and optimized server performance.
|
||||
- ⚙️ **DigitalOcean Deployment**: Ready-to-deploy configurations for DigitalOcean and similar platforms.
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>🎯 <strong>Additional Features</strong></summary>
|
||||
|
||||
- 🕶️ **Stealth Mode**: Avoid bot detection by mimicking real users.
|
||||
- 🏷️ **Tag-Based Content Extraction**: Refine crawling based on custom tags, headers, or metadata.
|
||||
- 🔗 **Link Analysis**: Extract and analyze all links for detailed data exploration.
|
||||
- 🛡️ **Error Handling**: Robust error management for seamless execution.
|
||||
- 🔐 **CORS & Static Serving**: Supports filesystem-based caching and cross-origin requests.
|
||||
- 📖 **Clear Documentation**: Simplified and updated guides for onboarding and advanced usage.
|
||||
- 🙌 **Community Recognition**: Acknowledges contributors and pull requests for transparency.
|
||||
|
||||
</details>
|
||||
|
||||
## Try it Now!
|
||||
|
||||
✨ Play around with this [Google Colab notebook](https://colab.research.google.com/drive/1SgRPrByQLzjRfwoRNq1wSGE9nYY_EE8C?usp=sharing)
|
||||
|
||||
✨ Visit our [Documentation Website](https://crawl4ai.com/mkdocs/)
|
||||
|
||||
## Installation 🛠️
|
||||
|
||||
Crawl4AI offers flexible installation options to suit various use cases. You can install it as a Python package or use Docker.
|
||||
|
||||
<details>
|
||||
<summary>🐍 <strong>Using pip</strong></summary>
|
||||
|
||||
Choose the installation option that best fits your needs:
|
||||
|
||||
### Basic Installation
|
||||
|
||||
For basic web crawling and scraping tasks:
|
||||
|
||||
```bash
|
||||
pip install crawl4ai
|
||||
crawl4ai-setup # Setup the browser
|
||||
```
|
||||
|
||||
By default, this will install the asynchronous version of Crawl4AI, using Playwright for web crawling.
|
||||
|
||||
👉 **Note**: When you install Crawl4AI, the `crawl4ai-setup` command should automatically install and set up Playwright. However, if you encounter any Playwright-related errors, you can manually install it using one of these methods:
|
||||
|
||||
1. Through the command line:
|
||||
|
||||
```bash
|
||||
playwright install
|
||||
```
|
||||
|
||||
2. If the above doesn't work, try this more specific command:
|
||||
|
||||
```bash
|
||||
python -m playwright install chromium
|
||||
```
|
||||
|
||||
This second method has proven to be more reliable in some cases.
|
||||
|
||||
---
|
||||
|
||||
*Continue reading to learn more about the features, installation process, usage, and more.*
|
||||
### Installation with Synchronous Version
|
||||
|
||||
The sync version is deprecated and will be removed in future versions. If you need the synchronous version using Selenium:
|
||||
|
||||
## Table of Contents
|
||||
|
||||
1. [Features](#features-)
|
||||
2. [Installation](#installation-)
|
||||
3. [REST API/Local Server](#using-the-local-server-ot-rest-api-)
|
||||
4. [Python Library Usage](#python-library-usage-)
|
||||
5. [Parameters](#parameters-)
|
||||
6. [Chunking Strategies](#chunking-strategies-)
|
||||
7. [Extraction Strategies](#extraction-strategies-)
|
||||
8. [Contributing](#contributing-)
|
||||
9. [License](#license-)
|
||||
10. [Contact](#contact-)
|
||||
|
||||
|
||||
## Features ✨
|
||||
|
||||
- 🕷️ Efficient web crawling to extract valuable data from websites
|
||||
- 🤖 LLM-friendly output formats (JSON, cleaned HTML, markdown)
|
||||
- 🌍 Supports crawling multiple URLs simultaneously
|
||||
- 🌃 Replace media tags with ALT.
|
||||
- 🆓 Completely free to use and open-source
|
||||
- 📜 Execute custom JavaScript before crawling
|
||||
- 📚 Chunking strategies: topic-based, regex, sentence, and more
|
||||
- 🧠 Extraction strategies: cosine clustering, LLM, and more
|
||||
- 🎯 CSS selector support
|
||||
- 📝 Pass instructions/keywords to refine extraction
|
||||
|
||||
## Installation 💻
|
||||
|
||||
There are three ways to use Crawl4AI:
|
||||
1. As a library (Recommended)
|
||||
2. As a local server (Docker) or using the REST API
|
||||
3. As a Google Colab notebook. [Open in Colab](https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk)
|
||||
|
||||
To install Crawl4AI as a library, follow these steps:
|
||||
|
||||
1. Install the package from GitHub:
|
||||
```bash
|
||||
virtualenv venv
|
||||
source venv/bin/activate
|
||||
pip install "crawl4ai[all] @ git+https://github.com/unclecode/crawl4ai.git"
|
||||
pip install crawl4ai[sync]
|
||||
```
|
||||
|
||||
💡 We recommend running the following CLI command to load the required models. This is optional, but it will boost the performance and speed of the crawler. You only need to do this once.
|
||||
---
|
||||
|
||||
crawl4ai-download-models
|
||||
### Development Installation
|
||||
|
||||
For contributors who plan to modify the source code:
|
||||
|
||||
2. Alternatively, you can clone the repository and install the package locally:
|
||||
```bash
|
||||
virtualenv venv
|
||||
source venv/bin/activate
|
||||
git clone https://github.com/unclecode/crawl4ai.git
|
||||
cd crawl4ai
|
||||
pip install -e .[all]
|
||||
pip install -e . # Basic installation in editable mode
|
||||
```
|
||||
|
||||
3. Use docker to run the local server:
|
||||
Install optional features:
|
||||
|
||||
```bash
|
||||
docker build -t crawl4ai .
|
||||
# For Mac users
|
||||
# docker build --platform linux/amd64 -t crawl4ai .
|
||||
docker run -d -p 8000:80 crawl4ai
|
||||
pip install -e ".[torch]" # With PyTorch features
|
||||
pip install -e ".[transformer]" # With Transformer features
|
||||
pip install -e ".[cosine]" # With cosine similarity features
|
||||
pip install -e ".[sync]" # With synchronous crawling (Selenium)
|
||||
pip install -e ".[all]" # Install all optional features
|
||||
```
|
||||
|
||||
For more information about how to run Crawl4AI as a local server, please refer to the [GitHub repository](https://github.com/unclecode/crawl4ai).
|
||||
</details>
|
||||
|
||||
## Using the Local server ot REST API 🌐
|
||||
<details>
|
||||
<summary>🐳 <strong>Docker Deployment</strong></summary>
|
||||
|
||||
You can also use Crawl4AI through the REST API. This method allows you to send HTTP requests to the Crawl4AI server and receive structured data in response. The base URL for the API is `https://crawl4ai.com/crawl`. If you run the local server, you can use `http://localhost:8000/crawl`. (Port is dependent on your docker configuration)
|
||||
> 🚀 **Major Changes Coming!** We're developing a completely new Docker implementation that will make deployment even more efficient and seamless. The current Docker setup is being deprecated in favor of this new solution.
|
||||
|
||||
### Example Usage
|
||||
### Current Docker Support
|
||||
|
||||
To use the REST API, send a POST request to `https://crawl4ai.com/crawl` with the following parameters in the request body.
|
||||
The existing Docker implementation is being deprecated and will be replaced soon. If you still need to use Docker with the current version:
|
||||
|
||||
**Example Request:**
|
||||
```json
|
||||
{
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"include_raw_html": false,
|
||||
"bypass_cache": true,
|
||||
"word_count_threshold": 5,
|
||||
"extraction_strategy": "CosineStrategy",
|
||||
"chunking_strategy": "RegexChunking",
|
||||
"css_selector": "p",
|
||||
"verbose": true,
|
||||
"extraction_strategy_args": {
|
||||
"semantic_filter": "finance economy and stock market",
|
||||
"word_count_threshold": 20,
|
||||
"max_dist": 0.2,
|
||||
"linkage_method": "ward",
|
||||
"top_k": 3
|
||||
},
|
||||
"chunking_strategy_args": {
|
||||
"patterns": ["\n\n"]
|
||||
- 📚 [Deprecated Docker Setup](./docs/deprecated/docker-deployment.md) - Instructions for the current Docker implementation
|
||||
- ⚠️ Note: This setup will be replaced in the next major release
|
||||
|
||||
### What's Coming Next?
|
||||
|
||||
Our new Docker implementation will bring:
|
||||
- Improved performance and resource efficiency
|
||||
- Streamlined deployment process
|
||||
- Better integration with Crawl4AI features
|
||||
- Enhanced scalability options
|
||||
|
||||
Stay connected with our [GitHub repository](https://github.com/unclecode/crawl4ai) for updates!
|
||||
|
||||
</details>
|
||||
|
||||
---
|
||||
|
||||
### Quick Test
|
||||
|
||||
Run a quick test (works for both Docker options):
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
# Submit a crawl job
|
||||
response = requests.post(
|
||||
"http://localhost:11235/crawl",
|
||||
json={"urls": "https://example.com", "priority": 10}
|
||||
)
|
||||
task_id = response.json()["task_id"]
|
||||
|
||||
# Continue polling until the task is complete (status="completed")
|
||||
result = requests.get(f"http://localhost:11235/task/{task_id}")
|
||||
```
|
||||
|
||||
For more examples, see our [Docker Examples](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_example.py). For advanced configuration, environment variables, and usage examples, see our [Docker Deployment Guide](https://crawl4ai.com/mkdocs/basic/docker-deployment/).
|
||||
|
||||
</details>
|
||||
|
||||
|
||||
## 🔬 Advanced Usage Examples 🔬
|
||||
|
||||
You can check the project structure in the directory [https://github.com/unclecode/crawl4ai/docs/examples](docs/examples). There you can find a variety of examples; a few popular ones are shown here.
|
||||
|
||||
<details>
|
||||
<summary>📝 <strong>Heuristic Markdown Generation with Clean and Fit Markdown</strong></summary>
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
from crawl4ai.content_filter_strategy import PruningContentFilter, BM25ContentFilter
|
||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
|
||||
async def main():
|
||||
browser_config = BrowserConfig(
|
||||
headless=True,
|
||||
verbose=True,
|
||||
)
|
||||
run_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.ENABLED,
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0)
|
||||
),
|
||||
# markdown_generator=DefaultMarkdownGenerator(
|
||||
# content_filter=BM25ContentFilter(user_query="WHEN_WE_FOCUS_BASED_ON_A_USER_QUERY", bm25_threshold=1.0)
|
||||
# ),
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://docs.micronaut.io/4.7.6/guide/",
|
||||
config=run_config
|
||||
)
|
||||
print(len(result.markdown))
|
||||
print(len(result.fit_markdown))
|
||||
print(len(result.markdown_v2.fit_markdown))
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>🖥️ <strong>Executing JavaScript & Extract Structured Data without LLMs</strong></summary>
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
import json
|
||||
|
||||
async def main():
|
||||
schema = {
|
||||
"name": "KidoCode Courses",
|
||||
"baseSelector": "section.charge-methodology .w-tab-content > div",
|
||||
"fields": [
|
||||
{
|
||||
"name": "section_title",
|
||||
"selector": "h3.heading-50",
|
||||
"type": "text",
|
||||
},
|
||||
{
|
||||
"name": "section_description",
|
||||
"selector": ".charge-content",
|
||||
"type": "text",
|
||||
},
|
||||
{
|
||||
"name": "course_name",
|
||||
"selector": ".text-block-93",
|
||||
"type": "text",
|
||||
},
|
||||
{
|
||||
"name": "course_description",
|
||||
"selector": ".course-content-text",
|
||||
"type": "text",
|
||||
},
|
||||
{
|
||||
"name": "course_icon",
|
||||
"selector": ".image-92",
|
||||
"type": "attribute",
|
||||
"attribute": "src"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Example Response:**
|
||||
```json
|
||||
{
|
||||
"status": "success",
|
||||
"data": [
|
||||
{
|
||||
"url": "https://www.nbcnews.com/business",
|
||||
"extracted_content": "...",
|
||||
"html": "...",
|
||||
"markdown": "...",
|
||||
"metadata": {...}
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
|
||||
|
||||
For more information about the available parameters and their descriptions, refer to the [Parameters](#parameters) section.
|
||||
|
||||
|
||||
## Python Library Usage 🚀
|
||||
|
||||
🔥 A great way to try out Crawl4AI is to run `quickstart.py` in the `docs/examples` directory. This script demonstrates how to use Crawl4AI to crawl a website and extract content from it.
|
||||
|
||||
### Quickstart Guide
|
||||
|
||||
Create an instance of WebCrawler and call the `warmup()` function.
|
||||
```python
|
||||
crawler = WebCrawler()
|
||||
crawler.warmup()
|
||||
```
|
||||
|
||||
### Understanding 'bypass_cache' and 'include_raw_html' parameters
|
||||
|
||||
First crawl (caches the result):
|
||||
```python
|
||||
result = crawler.run(url="https://www.nbcnews.com/business")
|
||||
```
|
||||
|
||||
Second crawl (Force to crawl again):
|
||||
```python
|
||||
result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)
|
||||
```
|
||||
💡 Don't forget to set `bypass_cache` to True if you want to try different strategies for the same URL. Otherwise, the cached result will be returned. You can also set `always_by_pass_cache` to True in the constructor to always bypass the cache.
|
||||
|
||||
Crawl result without raw HTML content:
|
||||
```python
|
||||
result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)
|
||||
```
|
||||
|
||||
### Adding a chunking strategy: RegexChunking
|
||||
|
||||
Using RegexChunking:
|
||||
```python
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
chunking_strategy=RegexChunking(patterns=["\n\n"])
|
||||
)
|
||||
```
|
||||
|
||||
Using NlpSentenceChunking:
|
||||
```python
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
chunking_strategy=NlpSentenceChunking()
|
||||
)
|
||||
```
|
||||
|
||||
### Extraction strategy: CosineStrategy
|
||||
|
||||
So far, the extracted content is just the result of chunking. To extract meaningful content, you can use extraction strategies. These strategies cluster consecutive chunks into meaningful blocks, keeping the same order as the text in the HTML. This approach is perfect for use in RAG applications and semantic search queries.
|
||||
|
||||
Using CosineStrategy:
|
||||
```python
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=CosineStrategy(
|
||||
semantic_filter="",
|
||||
word_count_threshold=10,
|
||||
max_dist=0.2,
|
||||
linkage_method="ward",
|
||||
top_k=3
|
||||
browser_config = BrowserConfig(
|
||||
headless=False,
|
||||
verbose=True
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
You can set `semantic_filter` to filter relevant documents before clustering. Documents are filtered based on their cosine similarity to the keyword filter embedding.
|
||||
|
||||
```python
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=CosineStrategy(
|
||||
semantic_filter="finance economy and stock market",
|
||||
word_count_threshold=10,
|
||||
max_dist=0.2,
|
||||
linkage_method="ward",
|
||||
top_k=3
|
||||
run_config = CrawlerRunConfig(
|
||||
extraction_strategy=extraction_strategy,
|
||||
js_code=["""(async () => {const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");for(let tab of tabs) {tab.scrollIntoView();tab.click();await new Promise(r => setTimeout(r, 500));}})();"""],
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
|
||||
result = await crawler.arun(
|
||||
url="https://www.kidocode.com/degrees/technology",
|
||||
config=run_config
|
||||
)
|
||||
|
||||
companies = json.loads(result.extracted_content)
|
||||
print(f"Successfully extracted {len(companies)} companies")
|
||||
print(json.dumps(companies[0], indent=2))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
### Using LLMExtractionStrategy
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>📚 <strong>Extracting Structured Data with LLMs</strong></summary>
|
||||
|
||||
Without instructions:
|
||||
```python
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
provider="openai/gpt-4o",
|
||||
api_token=os.getenv('OPENAI_API_KEY')
|
||||
import os
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
class OpenAIModelFee(BaseModel):
|
||||
model_name: str = Field(..., description="Name of the OpenAI model.")
|
||||
input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
|
||||
output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
|
||||
|
||||
async def main():
|
||||
browser_config = BrowserConfig(verbose=True)
|
||||
run_config = CrawlerRunConfig(
|
||||
word_count_threshold=1,
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
# Here you can use any provider that Litellm library supports, for instance: ollama/qwen2
|
||||
# provider="ollama/qwen2", api_token="no-token",
|
||||
provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'),
|
||||
schema=OpenAIModelFee.schema(),
|
||||
extraction_type="schema",
|
||||
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
|
||||
Do not miss any models in the entire content. One extracted model JSON format should look like this:
|
||||
{"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}."""
|
||||
),
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
)
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
url='https://openai.com/api/pricing/',
|
||||
config=run_config
|
||||
)
|
||||
print(result.extracted_content)
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
With instructions:
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>🤖 <strong>Using Your Own Browser with a Custom User Profile</strong></summary>
|
||||
|
||||
```python
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
provider="openai/gpt-4o",
|
||||
api_token=os.getenv('OPENAI_API_KEY'),
|
||||
instruction="I am interested in only financial news"
|
||||
import os, sys
|
||||
from pathlib import Path
|
||||
import asyncio, time
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
|
||||
async def test_news_crawl():
|
||||
# Create a persistent user data directory
|
||||
user_data_dir = os.path.join(Path.home(), ".crawl4ai", "browser_profile")
|
||||
os.makedirs(user_data_dir, exist_ok=True)
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
verbose=True,
|
||||
headless=True,
|
||||
user_data_dir=user_data_dir,
|
||||
use_persistent_context=True,
|
||||
)
|
||||
)
|
||||
run_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
url = "ADDRESS_OF_A_CHALLENGING_WEBSITE"
|
||||
|
||||
result = await crawler.arun(
|
||||
url,
|
||||
config=run_config,
|
||||
magic=True,
|
||||
)
|
||||
|
||||
print(f"Successfully crawled {url}")
|
||||
print(f"Content length: {len(result.markdown)}")
|
||||
```
|
||||
|
||||
### Targeted extraction using CSS selector
|
||||
</details>
|
||||
|
||||
Extract only H2 tags:
|
||||
```python
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
css_selector="h2"
|
||||
)
|
||||
```
|
||||
|
||||
### Passing JavaScript code to click 'Load More' button
|
||||
## ✨ Recent Updates
|
||||
|
||||
Using JavaScript to click 'Load More' button:
|
||||
```python
|
||||
js_code = """
|
||||
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
|
||||
loadMoreButton && loadMoreButton.click();
|
||||
"""
|
||||
crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
|
||||
crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
|
||||
result = crawler.run(url="https://www.nbcnews.com/business")
|
||||
```
|
||||
- 🔒 **Enhanced SSL & Security**: New SSL certificate handling with custom paths and validation options for secure crawling
|
||||
- 🔍 **Smart Content Filtering**: Advanced filtering system with regex support and efficient chunking strategies
|
||||
- 📦 **Improved JSON Extraction**: Support for complex JSONPath, JSON-CSS, and Microdata extraction
|
||||
- 🏗️ **New Field Types**: Added `computed`, `conditional`, `aggregate`, and `template` field types
|
||||
- ⚡ **Performance Boost**: Optimized caching, parallel processing, and memory management
|
||||
- 🐛 **Better Error Handling**: Enhanced debugging capabilities with detailed error tracking
|
||||
- 🔐 **Security Features**: Improved input validation and safe expression evaluation
|
||||
|
||||
## Parameters 📖
|
||||
Read the full details of this release in our [0.4.24 Release Notes](https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md).
|
||||
|
||||
| Parameter | Description | Required | Default Value |
|
||||
|-----------------------|-------------------------------------------------------------------------------------------------------|----------|---------------------|
|
||||
| `urls` | A list of URLs to crawl and extract data from. | Yes | - |
|
||||
| `include_raw_html` | Whether to include the raw HTML content in the response. | No | `false` |
|
||||
| `bypass_cache` | Whether to force a fresh crawl even if the URL has been previously crawled. | No | `false` |
|
||||
| `word_count_threshold`| The minimum number of words a block must contain to be considered meaningful (minimum value is 5). | No | `5` |
|
||||
| `extraction_strategy` | The strategy to use for extracting content from the HTML (e.g., "CosineStrategy"). | No | `NoExtractionStrategy` |
|
||||
| `chunking_strategy` | The strategy to use for chunking the text before processing (e.g., "RegexChunking"). | No | `RegexChunking` |
|
||||
| `css_selector` | The CSS selector to target specific parts of the HTML for extraction. | No | `None` |
|
||||
| `verbose` | Whether to enable verbose logging. | No | `true` |
|
||||
## 📖 Documentation & Roadmap
|
||||
|
||||
## Chunking Strategies 📚
|
||||
> 🚨 **Documentation Update Alert**: We're undertaking a major documentation overhaul next week to reflect recent updates and improvements. Stay tuned for a more comprehensive and up-to-date guide!
|
||||
|
||||
### RegexChunking
|
||||
For current documentation, including installation instructions, advanced features, and API reference, visit our [Documentation Website](https://crawl4ai.com/mkdocs/).
|
||||
|
||||
`RegexChunking` is a text chunking strategy that splits a given text into smaller parts using regular expressions. This is useful for preparing large texts for processing by language models, ensuring they are divided into manageable segments.
|
||||
To check our development plans and upcoming features, visit our [Roadmap](https://github.com/unclecode/crawl4ai/blob/main/ROADMAP.md).
|
||||
|
||||
**Constructor Parameters:**
|
||||
- `patterns` (list, optional): A list of regular expression patterns used to split the text. Default is to split by double newlines (`['\n\n']`).
|
||||
<details>
|
||||
<summary>📈 <strong>Development TODOs</strong></summary>
|
||||
|
||||
**Example usage:**
|
||||
```python
|
||||
chunker = RegexChunking(patterns=[r'\n\n', r'\. '])
|
||||
chunks = chunker.chunk("This is a sample text. It will be split into chunks.")
|
||||
```
|
||||
- [x] 0. Graph Crawler: Smart website traversal using graph search algorithms for comprehensive nested page extraction
|
||||
- [ ] 1. Question-Based Crawler: Natural language driven web discovery and content extraction
|
||||
- [ ] 2. Knowledge-Optimal Crawler: Smart crawling that maximizes knowledge while minimizing data extraction
|
||||
- [ ] 3. Agentic Crawler: Autonomous system for complex multi-step crawling operations
|
||||
- [ ] 4. Automated Schema Generator: Convert natural language to extraction schemas
|
||||
- [ ] 5. Domain-Specific Scrapers: Pre-configured extractors for common platforms (academic, e-commerce)
|
||||
- [ ] 6. Web Embedding Index: Semantic search infrastructure for crawled content
|
||||
- [ ] 7. Interactive Playground: Web UI for testing, comparing strategies with AI assistance
|
||||
- [ ] 8. Performance Monitor: Real-time insights into crawler operations
|
||||
- [ ] 9. Cloud Integration: One-click deployment solutions across cloud providers
|
||||
- [ ] 10. Sponsorship Program: Structured support system with tiered benefits
|
||||
- [ ] 11. Educational Content: "How to Crawl" video series and interactive tutorials
|
||||
|
||||
### NlpSentenceChunking
|
||||
</details>
|
||||
|
||||
`NlpSentenceChunking` uses a natural language processing model to chunk a given text into sentences. This approach leverages SpaCy to accurately split text based on sentence boundaries.
|
||||
## 🤝 Contributing
|
||||
|
||||
**Constructor Parameters:**
|
||||
- None.
|
||||
We welcome contributions from the open-source community. Check out our [contribution guidelines](https://github.com/unclecode/crawl4ai/blob/main/CONTRIBUTING.md) for more information.
|
||||
|
||||
**Example usage:**
|
||||
```python
|
||||
chunker = NlpSentenceChunking()
|
||||
chunks = chunker.chunk("This is a sample text. It will be split into sentences.")
|
||||
```
|
||||
|
||||
### TopicSegmentationChunking
|
||||
|
||||
`TopicSegmentationChunking` uses the TextTiling algorithm to segment a given text into topic-based chunks. This method identifies thematic boundaries in the text.
|
||||
|
||||
**Constructor Parameters:**
|
||||
- `num_keywords` (int, optional): The number of keywords to extract for each topic segment. Default is `3`.
|
||||
|
||||
**Example usage:**
|
||||
```python
|
||||
chunker = TopicSegmentationChunking(num_keywords=3)
|
||||
chunks = chunker.chunk("This is a sample text. It will be split into topic-based segments.")
|
||||
```
|
||||
|
||||
### FixedLengthWordChunking
|
||||
|
||||
`FixedLengthWordChunking` splits a given text into chunks of fixed length, based on the number of words.
|
||||
|
||||
**Constructor Parameters:**
|
||||
- `chunk_size` (int, optional): The number of words in each chunk. Default is `100`.
|
||||
|
||||
**Example usage:**
|
||||
```python
|
||||
chunker = FixedLengthWordChunking(chunk_size=100)
|
||||
chunks = chunker.chunk("This is a sample text. It will be split into fixed-length word chunks.")
|
||||
```
|
||||
|
||||
### SlidingWindowChunking
|
||||
|
||||
`SlidingWindowChunking` uses a sliding window approach to chunk a given text. Each chunk has a fixed length, and the window slides by a specified step size.
|
||||
|
||||
**Constructor Parameters:**
|
||||
- `window_size` (int, optional): The number of words in each chunk. Default is `100`.
|
||||
- `step` (int, optional): The number of words to slide the window. Default is `50`.
|
||||
|
||||
**Example usage:**
|
||||
```python
|
||||
chunker = SlidingWindowChunking(window_size=100, step=50)
|
||||
chunks = chunker.chunk("This is a sample text. It will be split using a sliding window approach.")
|
||||
```
|
||||
|
||||
## Extraction Strategies 🧠
|
||||
|
||||
### NoExtractionStrategy
|
||||
|
||||
`NoExtractionStrategy` is a basic extraction strategy that returns the entire HTML content without any modification. It is useful for cases where no specific extraction is required.
|
||||
|
||||
**Constructor Parameters:**
|
||||
None.
|
||||
|
||||
**Example usage:**
|
||||
```python
|
||||
extractor = NoExtractionStrategy()
|
||||
extracted_content = extractor.extract(url, html)
|
||||
```
|
||||
|
||||
### LLMExtractionStrategy
|
||||
|
||||
`LLMExtractionStrategy` uses a Language Model (LLM) to extract meaningful blocks or chunks from the given HTML content. This strategy leverages an external provider for language model completions.
|
||||
|
||||
**Constructor Parameters:**
|
||||
- `provider` (str, optional): The provider to use for the language model completions. Default is `DEFAULT_PROVIDER` (e.g., openai/gpt-4).
|
||||
- `api_token` (str, optional): The API token for the provider. If not provided, it will try to load from the environment variable `OPENAI_API_KEY`.
|
||||
- `instruction` (str, optional): An instruction to guide the LLM on how to perform the extraction. This allows users to specify the type of data they are interested in or set the tone of the response. Default is `None`.
|
||||
|
||||
**Example usage:**
|
||||
```python
|
||||
extractor = LLMExtractionStrategy(provider='openai', api_token='your_api_token', instruction='Extract only news about AI.')
|
||||
extracted_content = extractor.extract(url, html)
|
||||
```
|
||||
|
||||
### CosineStrategy
|
||||
|
||||
`CosineStrategy` uses hierarchical clustering based on cosine similarity to extract clusters of text from the given HTML content. This strategy is suitable for identifying related content sections.
|
||||
|
||||
**Constructor Parameters:**
|
||||
- `semantic_filter` (str, optional): A string containing keywords for filtering relevant documents before clustering. If provided, documents are filtered based on their cosine similarity to the keyword filter embedding. Default is `None`.
|
||||
- `word_count_threshold` (int, optional): Minimum number of words per cluster. Default is `20`.
|
||||
- `max_dist` (float, optional): The maximum cophenetic distance on the dendrogram to form clusters. Default is `0.2`.
|
||||
- `linkage_method` (str, optional): The linkage method for hierarchical clustering. Default is `'ward'`.
|
||||
- `top_k` (int, optional): Number of top categories to extract. Default is `3`.
|
||||
- `model_name` (str, optional): The model name for embedding generation. Default is `'BAAI/bge-small-en-v1.5'`.
|
||||
|
||||
**Example usage:**
|
||||
```python
|
||||
extractor = CosineStrategy(semantic_filter='finance rental prices', word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name='BAAI/bge-small-en-v1.5')
|
||||
extracted_content = extractor.extract(url, html)
|
||||
```
|
||||
|
||||
### TopicExtractionStrategy
|
||||
|
||||
`TopicExtractionStrategy` uses the TextTiling algorithm to segment the HTML content into topics and extracts keywords for each segment. This strategy is useful for identifying and summarizing thematic content.
|
||||
|
||||
**Constructor Parameters:**
|
||||
- `num_keywords` (int, optional): Number of keywords to represent each topic segment. Default is `3`.
|
||||
|
||||
**Example usage:**
|
||||
```python
|
||||
extractor = TopicExtractionStrategy(num_keywords=3)
|
||||
extracted_content = extractor.extract(url, html)
|
||||
```
|
||||
|
||||
## Contributing 🤝
|
||||
|
||||
We welcome contributions from the open-source community to help improve Crawl4AI and make it even more valuable for AI enthusiasts and developers. To contribute, please follow these steps:
|
||||
|
||||
1. Fork the repository.
|
||||
2. Create a new branch for your feature or bug fix.
|
||||
3. Make your changes and commit them with descriptive messages.
|
||||
4. Push your changes to your forked repository.
|
||||
5. Submit a pull request to the main repository.
|
||||
|
||||
For more information on contributing, please see our [contribution guidelines](https://github.com/unclecode/crawl4ai/blob/main/CONTRIBUTING.md).
|
||||
|
||||
## License 📄
|
||||
## 📄 License
|
||||
|
||||
Crawl4AI is released under the [Apache 2.0 License](https://github.com/unclecode/crawl4ai/blob/main/LICENSE).
|
||||
|
||||
## Contact 📧
|
||||
## 📧 Contact
|
||||
|
||||
If you have any questions, suggestions, or feedback, please feel free to reach out to us:
|
||||
For questions, suggestions, or feedback, feel free to reach out:
|
||||
|
||||
- GitHub: [unclecode](https://github.com/unclecode)
|
||||
- Twitter: [@unclecode](https://twitter.com/unclecode)
|
||||
- Website: [crawl4ai.com](https://crawl4ai.com)
|
||||
|
||||
Let's work together to make the web more accessible and useful for AI applications! 💪🌐🤖
|
||||
Happy Crawling! 🕸️🚀
|
||||
|
||||
## 🗾 Mission
|
||||
|
||||
Our mission is to unlock the value of personal and enterprise data by transforming digital footprints into structured, tradeable assets. Crawl4AI empowers individuals and organizations with open-source tools to extract and structure data, fostering a shared data economy.
|
||||
|
||||
We envision a future where AI is powered by real human knowledge, ensuring data creators directly benefit from their contributions. By democratizing data and enabling ethical sharing, we are laying the foundation for authentic AI advancement.
|
||||
|
||||
<details>
|
||||
<summary>🔑 <strong>Key Opportunities</strong></summary>
|
||||
|
||||
- **Data Capitalization**: Transform digital footprints into measurable, valuable assets.
|
||||
- **Authentic AI Data**: Provide AI systems with real human insights.
|
||||
- **Shared Economy**: Create a fair data marketplace that benefits data creators.
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>🚀 <strong>Development Pathway</strong></summary>
|
||||
|
||||
1. **Open-Source Tools**: Community-driven platforms for transparent data extraction.
|
||||
2. **Digital Asset Structuring**: Tools to organize and value digital knowledge.
|
||||
3. **Ethical Data Marketplace**: A secure, fair platform for exchanging structured data.
|
||||
|
||||
For more details, see our [full mission statement](./MISSION.md).
|
||||
</details>
|
||||
|
||||
## Star History
|
||||
|
||||
[Star History Chart](https://star-history.com/#unclecode/crawl4ai&Date)
|
||||
|
||||
503
ROADMAP.md
Normal file
@@ -0,0 +1,503 @@
|
||||
# Crawl4AI Strategic Roadmap
|
||||
|
||||
```mermaid
|
||||
%%{init: {'themeVariables': { 'fontSize': '14px'}}}%%
|
||||
graph TD
|
||||
subgraph A1[Advanced Crawling Systems 🔧]
|
||||
A["`
|
||||
• Graph Crawler ✓
|
||||
• Question-Based Crawler
|
||||
• Knowledge-Optimal Crawler
|
||||
• Agentic Crawler
|
||||
`"]
|
||||
end
|
||||
|
||||
subgraph A2[Specialized Features 🛠️]
|
||||
B["`
|
||||
• Automated Schema Generator
|
||||
• Domain-Specific Scrapers
|
||||
•
|
||||
•
|
||||
`"]
|
||||
end
|
||||
|
||||
subgraph A3[Development Tools 🔨]
|
||||
C["`
|
||||
• Interactive Playground
|
||||
• Performance Monitor
|
||||
• Cloud Integration
|
||||
•
|
||||
`"]
|
||||
end
|
||||
|
||||
subgraph A4[Community & Growth 🌱]
|
||||
D["`
|
||||
• Sponsorship Program
|
||||
• Educational Content
|
||||
•
|
||||
•
|
||||
`"]
|
||||
end
|
||||
|
||||
classDef default fill:#f9f9f9,stroke:#333,stroke-width:2px
|
||||
classDef section fill:#f0f0f0,stroke:#333,stroke-width:4px,rx:10
|
||||
class A1,A2,A3,A4 section
|
||||
|
||||
%% Layout hints
|
||||
A1 --> A2[" "]
|
||||
A3 --> A4[" "]
|
||||
linkStyle 0,1 stroke:none
|
||||
```
|
||||
|
||||
Crawl4AI is evolving to provide more intelligent, efficient, and versatile web crawling capabilities. This roadmap outlines the key developments and features planned for the project, organized into strategic sections that build upon our current foundation.
|
||||
|
||||
## 1. Advanced Crawling Systems 🔧
|
||||
|
||||
This section introduces three powerful crawling systems that extend Crawl4AI's capabilities from basic web crawling to intelligent, purpose-driven data extraction.
|
||||
|
||||
### 1.1 Question-Based Crawler
|
||||
The Question-Based Crawler enhances our core engine by enabling automatic discovery and extraction of relevant web content based on natural language questions.
|
||||
|
||||
Key Features:
|
||||
- SerpAPI integration for intelligent web search
|
||||
- Relevancy scoring for search results
|
||||
- Automatic URL discovery and prioritization
|
||||
- Cross-source validation
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.discovery import QuestionBasedDiscovery
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
discovery = QuestionBasedDiscovery(crawler)
|
||||
results = await discovery.arun(
|
||||
question="What are the system requirements for major cloud providers' GPU instances?",
|
||||
max_urls=5,
|
||||
relevance_threshold=0.7
|
||||
)
|
||||
|
||||
for result in results:
|
||||
print(f"Source: {result.url} (Relevance: {result.relevance_score})")
|
||||
print(f"Content: {result.markdown}\n")
|
||||
```
|
||||
|
||||
### 1.2 Knowledge-Optimal Crawler
|
||||
An intelligent crawling system that solves the optimization problem of minimizing data extraction while maximizing knowledge acquisition for specific objectives.
|
||||
|
||||
Key Features:
|
||||
- Smart content prioritization
|
||||
- Minimal data extraction for maximum knowledge
|
||||
- Probabilistic relevance assessment
|
||||
- Objective-driven crawling paths
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.optimization import KnowledgeOptimizer
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
optimizer = KnowledgeOptimizer(
|
||||
objective="Understand GPU instance pricing and limitations across cloud providers",
|
||||
required_knowledge=[
|
||||
"pricing structure",
|
||||
"GPU specifications",
|
||||
"usage limits",
|
||||
"availability zones"
|
||||
],
|
||||
confidence_threshold=0.85
|
||||
)
|
||||
|
||||
result = await crawler.arun(
|
||||
urls=[
|
||||
"https://aws.amazon.com/ec2/pricing/",
|
||||
"https://cloud.google.com/gpu",
|
||||
"https://azure.microsoft.com/pricing/"
|
||||
],
|
||||
optimizer=optimizer,
|
||||
optimization_mode="minimal_extraction"
|
||||
)
|
||||
|
||||
print(f"Knowledge Coverage: {result.knowledge_coverage}")
|
||||
print(f"Data Efficiency: {result.efficiency_ratio}")
|
||||
print(f"Extracted Content: {result.optimal_content}")
|
||||
```
|
||||
|
||||
### 1.3 Agentic Crawler
|
||||
An autonomous system capable of understanding complex goals and automatically planning and executing multi-step crawling operations.
|
||||
|
||||
Key Features:
|
||||
- Autonomous goal interpretation
|
||||
- Dynamic step planning
|
||||
- Interactive navigation capabilities
|
||||
- Visual recognition and interaction
|
||||
- Automatic error recovery
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.agents import CrawlerAgent
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
agent = CrawlerAgent(crawler)
|
||||
|
||||
# Automatic planning and execution
|
||||
result = await agent.arun(
|
||||
goal="Find research papers about quantum computing published in 2023 with more than 50 citations",
|
||||
auto_retry=True
|
||||
)
|
||||
print("Generated Plan:", result.executed_steps)
|
||||
print("Extracted Data:", result.data)
|
||||
|
||||
# Using custom steps with automatic execution
|
||||
result = await agent.arun(
|
||||
goal="Extract conference deadlines from ML conferences",
|
||||
custom_plan=[
|
||||
"Navigate to conference page",
|
||||
"Find important dates section",
|
||||
"Extract submission deadlines",
|
||||
"Verify dates are for 2024"
|
||||
]
|
||||
)
|
||||
|
||||
# Monitoring execution
|
||||
print("Step Completion:", result.step_status)
|
||||
print("Execution Time:", result.execution_time)
|
||||
print("Success Rate:", result.success_rate)
|
||||
```
|
||||
|
||||
# Section 2: Specialized Features 🛠️
|
||||
|
||||
This section introduces specialized tools and features that enhance Crawl4AI's capabilities for specific use cases and data extraction needs.
|
||||
|
||||
### 2.1 Automated Schema Generator
|
||||
A system that automatically generates JsonCssExtractionStrategy schemas from natural language descriptions, making structured data extraction accessible to all users.
|
||||
|
||||
Key Features:
|
||||
- Natural language schema generation
|
||||
- Automatic pattern detection
|
||||
- Predefined schema templates
|
||||
- Chrome extension for visual schema building
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.schema import SchemaGenerator
|
||||
|
||||
# Generate schema from natural language description
|
||||
generator = SchemaGenerator()
|
||||
schema = await generator.generate(
|
||||
url="https://news-website.com",
|
||||
description="For each news article on the page, I need the headline, publication date, and main image"
|
||||
)
|
||||
|
||||
# Use generated schema with crawler
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://news-website.com",
|
||||
extraction_strategy=schema
|
||||
)
|
||||
|
||||
# Example of generated schema:
|
||||
"""
|
||||
{
|
||||
"name": "News Article Extractor",
|
||||
"baseSelector": "article.news-item",
|
||||
"fields": [
|
||||
{
|
||||
"name": "headline",
|
||||
"selector": "h2.article-title",
|
||||
"type": "text"
|
||||
},
|
||||
{
|
||||
"name": "date",
|
||||
"selector": "span.publish-date",
|
||||
"type": "text"
|
||||
},
|
||||
{
|
||||
"name": "image",
|
||||
"selector": "img.article-image",
|
||||
"type": "attribute",
|
||||
"attribute": "src"
|
||||
}
|
||||
]
|
||||
}
|
||||
"""
|
||||
```
|
||||
|
||||
### 2.2 Domain-Specific Scrapers
|
||||
Specialized extraction strategies optimized for common website types and platforms, providing consistent and reliable data extraction without additional configuration.
|
||||
|
||||
Key Features:
|
||||
- Pre-configured extractors for popular platforms
|
||||
- Academic site specialization (arXiv, NCBI)
|
||||
- E-commerce standardization
|
||||
- Documentation site handling
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.extractors import AcademicExtractor, EcommerceExtractor
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Academic paper extraction
|
||||
papers = await crawler.arun(
|
||||
url="https://arxiv.org/list/cs.AI/recent",
|
||||
extractor="academic", # Built-in extractor type
|
||||
site_type="arxiv", # Specific site optimization
|
||||
extract_fields=[
|
||||
"title",
|
||||
"authors",
|
||||
"abstract",
|
||||
"citations"
|
||||
]
|
||||
)
|
||||
|
||||
# E-commerce product data
|
||||
products = await crawler.arun(
|
||||
url="https://store.example.com/products",
|
||||
extractor="ecommerce",
|
||||
extract_fields=[
|
||||
"name",
|
||||
"price",
|
||||
"availability",
|
||||
"reviews"
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 2.3 Web Embedding Index
|
||||
Creates and maintains a semantic search infrastructure for crawled content, enabling efficient retrieval and querying of web content through vector embeddings.
|
||||
|
||||
Key Features:
|
||||
- Automatic embedding generation
|
||||
- Intelligent content chunking
|
||||
- Efficient vector storage and indexing
|
||||
- Semantic search capabilities
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.indexing import WebIndex
|
||||
|
||||
# Initialize and build index
|
||||
index = WebIndex(model="efficient-mini")
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Crawl and index content
|
||||
await index.build(
|
||||
urls=["https://docs.example.com"],
|
||||
crawler=crawler,
|
||||
options={
|
||||
"chunk_method": "semantic",
|
||||
"update_policy": "incremental",
|
||||
"embedding_batch_size": 100
|
||||
}
|
||||
)
|
||||
|
||||
# Search through indexed content
|
||||
results = await index.search(
|
||||
query="How to implement OAuth authentication?",
|
||||
filters={
|
||||
"content_type": "technical",
|
||||
"recency": "6months"
|
||||
},
|
||||
top_k=5
|
||||
)
|
||||
|
||||
# Get similar content
|
||||
similar = await index.find_similar(
|
||||
url="https://docs.example.com/auth/oauth",
|
||||
threshold=0.85
|
||||
)
|
||||
```
|
||||
|
||||
Each of these specialized features builds upon Crawl4AI's core functionality while providing targeted solutions for specific use cases. They can be used independently or combined for more complex data extraction and processing needs.
|
||||
|
||||
# Section 3: Development Tools 🔧
|
||||
|
||||
This section covers tools designed to enhance the development experience, monitoring, and deployment of Crawl4AI applications.
|
||||
|
||||
### 3.1 Crawl4AI Playground 🎮
|
||||
|
||||
The Crawl4AI Playground is an interactive web-based development environment that simplifies web scraping experimentation, development, and deployment. With its intuitive interface and AI-powered assistance, users can quickly prototype, test, and deploy web scraping solutions.
|
||||
|
||||
#### Key Features 🌟
|
||||
|
||||
##### Visual Strategy Builder
|
||||
- Interactive point-and-click interface for building extraction strategies
|
||||
- Real-time preview of selected elements
|
||||
- Side-by-side comparison of different extraction approaches
|
||||
- Visual validation of CSS selectors and XPath queries
|
||||
|
||||
##### AI Assistant Integration
|
||||
- Strategy recommendations based on target website analysis
|
||||
- Parameter optimization suggestions
|
||||
- Best practices guidance for specific use cases
|
||||
- Automated error detection and resolution
|
||||
- Performance optimization tips
|
||||
|
||||
##### Real-Time Testing & Validation
|
||||
- Live preview of extraction results
|
||||
- Side-by-side comparison of multiple strategies
|
||||
- Performance metrics visualization
|
||||
- Automatic validation of extracted data
|
||||
- Error detection and debugging tools
|
||||
|
||||
##### Project Management
|
||||
- Save and organize multiple scraping projects
|
||||
- Version control for configurations
|
||||
- Export/import project settings
|
||||
- Share configurations with team members
|
||||
- Project templates for common use cases
|
||||
|
||||
##### Deployment Pipeline
|
||||
- One-click deployment to various environments
|
||||
- Docker container generation
|
||||
- Cloud deployment templates (AWS, GCP, Azure)
|
||||
- Scaling configuration management
|
||||
- Monitoring setup automation
|
||||
|
||||
|
||||
### 3.2 Performance Monitoring System
|
||||
A comprehensive monitoring solution providing real-time insights into crawler operations, resource usage, and system health through both CLI and GUI interfaces.
|
||||
|
||||
Key Features:
|
||||
- Real-time resource tracking
|
||||
- Active crawl monitoring
|
||||
- Performance statistics
|
||||
- Customizable alerting system
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.monitor import CrawlMonitor
|
||||
|
||||
# Initialize monitoring
|
||||
monitor = CrawlMonitor()
|
||||
|
||||
# Start monitoring with CLI interface
|
||||
await monitor.start(
|
||||
mode="cli", # or "gui"
|
||||
refresh_rate="1s",
|
||||
metrics={
|
||||
"resources": ["cpu", "memory", "network"],
|
||||
"crawls": ["active", "queued", "completed"],
|
||||
"performance": ["success_rate", "response_times"]
|
||||
}
|
||||
)
|
||||
|
||||
# Example CLI output:
|
||||
"""
|
||||
Crawl4AI Monitor (Live) - Press Q to exit
|
||||
────────────────────────────────────────
|
||||
System Usage:
|
||||
├─ CPU: ███████░░░ 70%
|
||||
└─ Memory: ████░░░░░ 2.1GB/8GB
|
||||
|
||||
Active Crawls:
|
||||
ID URL Status Progress
|
||||
001 docs.example.com 🟢 Active 75%
|
||||
002 api.service.com 🟡 Queue -
|
||||
|
||||
Metrics (Last 5min):
|
||||
├─ Success Rate: 98%
|
||||
├─ Avg Response: 0.6s
|
||||
└─ Pages/sec: 8.5
|
||||
"""
|
||||
```
|
||||
|
||||
### 3.3 Cloud Integration
|
||||
Streamlined deployment tools for setting up Crawl4AI in various cloud environments, with support for scaling and monitoring.
|
||||
|
||||
Key Features:
|
||||
- One-click deployment solutions
|
||||
- Auto-scaling configuration
|
||||
- Load balancing setup
|
||||
- Cloud-specific optimizations
|
||||
- Monitoring integration
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.deploy import CloudDeployer
|
||||
|
||||
# Initialize deployer
|
||||
deployer = CloudDeployer()
|
||||
|
||||
# Deploy crawler service
|
||||
deployment = await deployer.deploy(
|
||||
service_name="crawler-cluster",
|
||||
platform="aws", # or "gcp", "azure"
|
||||
config={
|
||||
"instance_type": "compute-optimized",
|
||||
"auto_scaling": {
|
||||
"min_instances": 2,
|
||||
"max_instances": 10,
|
||||
"scale_based_on": "cpu_usage"
|
||||
},
|
||||
"region": "us-east-1",
|
||||
"monitoring": True
|
||||
}
|
||||
)
|
||||
|
||||
# Get deployment status and endpoints
|
||||
print(f"Service Status: {deployment.status}")
|
||||
print(f"API Endpoint: {deployment.endpoint}")
|
||||
print(f"Monitor URL: {deployment.monitor_url}")
|
||||
```
|
||||
|
||||
These development tools work together to provide a comprehensive environment for developing, testing, monitoring, and deploying Crawl4AI applications. The Playground helps users experiment and generate optimal configurations, the Performance Monitor ensures smooth operation, and the Cloud Integration tools simplify deployment and scaling.
|
||||
|
||||
# Section 4: Community & Growth 🌱
|
||||
|
||||
This section outlines initiatives designed to build and support the Crawl4AI community, provide educational resources, and ensure sustainable project growth.
|
||||
|
||||
### 4.1 Sponsorship Program
|
||||
A structured program to support ongoing development and maintenance of Crawl4AI while providing valuable benefits to sponsors.
|
||||
|
||||
Key Features:
|
||||
- Multiple sponsorship tiers
|
||||
- Sponsor recognition system
|
||||
- Priority support for sponsors
|
||||
- Early access to new features
|
||||
- Custom feature development opportunities
|
||||
|
||||
Program Structure (not yet finalized):
|
||||
```
|
||||
Sponsorship Tiers:
|
||||
|
||||
🥉 Bronze Supporter
|
||||
- GitHub Sponsor badge
|
||||
- Priority issue response
|
||||
- Community Discord role
|
||||
|
||||
🥈 Silver Supporter
|
||||
- All Bronze benefits
|
||||
- Technical support channel
|
||||
- Vote on roadmap priorities
|
||||
- Early access to beta features
|
||||
|
||||
🥇 Gold Supporter
|
||||
- All Silver benefits
|
||||
- Custom feature requests
|
||||
- Direct developer access
|
||||
- Private support sessions
|
||||
|
||||
💎 Diamond Partner
|
||||
- All Gold benefits
|
||||
- Custom development
|
||||
- On-demand consulting
|
||||
- Integration support
|
||||
```
|
||||
|
||||
### 4.2 "How to Crawl" Video Series
|
||||
A comprehensive educational resource teaching users how to effectively use Crawl4AI for various web scraping and data extraction scenarios.
|
||||
|
||||
Key Features:
|
||||
- Step-by-step tutorials
|
||||
- Real-world use cases
|
||||
- Best practices
|
||||
- Integration guides
|
||||
- Advanced feature deep-dives
|
||||
|
||||
These community initiatives are designed to:
|
||||
- Provide comprehensive learning resources
|
||||
- Foster a supportive user community
|
||||
- Ensure sustainable project development
|
||||
- Share knowledge and best practices
|
||||
- Create opportunities for collaboration
|
||||
|
||||
The combination of structured support through sponsorship, educational content through video series, and interactive learning through the playground creates a robust ecosystem for both new and experienced users of Crawl4AI.
|
||||
@@ -1 +1,46 @@
|
||||
from .web_crawler import WebCrawler
|
||||
# __init__.py
|
||||
|
||||
from .async_webcrawler import AsyncWebCrawler, CacheMode
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy, CosineStrategy, JsonCssExtractionStrategy
|
||||
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
||||
from .markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
from .content_filter_strategy import PruningContentFilter, BM25ContentFilter
|
||||
from .models import CrawlResult
|
||||
from .__version__ import __version__
|
||||
|
||||
__all__ = [
|
||||
"AsyncWebCrawler",
|
||||
"CrawlResult",
|
||||
"CacheMode",
|
||||
'BrowserConfig',
|
||||
'CrawlerRunConfig',
|
||||
'ExtractionStrategy',
|
||||
'LLMExtractionStrategy',
|
||||
'CosineStrategy',
|
||||
'JsonCssExtractionStrategy',
|
||||
'ChunkingStrategy',
|
||||
'RegexChunking',
|
||||
'DefaultMarkdownGenerator',
|
||||
'PruningContentFilter',
|
||||
'BM25ContentFilter',
|
||||
]
|
||||
|
||||
def is_sync_version_installed():
    """
    Report whether the optional synchronous crawler dependency is importable.

    Returns:
        bool: True when `import selenium` succeeds, False when it raises ImportError.
    """
    try:
        import selenium  # noqa: F401  (imported only to probe availability)
    except ImportError:
        return False
    else:
        return True
|
||||
|
||||
# The synchronous WebCrawler is optional: it is only exported when selenium
# (and everything else web_crawler needs) can be imported.
if is_sync_version_installed():
    try:
        from .web_crawler import WebCrawler
    except ImportError:
        import warnings

        # Keep the name defined so `from crawl4ai import WebCrawler` behaves the
        # same as in the "selenium not installed" branch below.
        WebCrawler = None
        warnings.warn(
            "Failed to import WebCrawler even though selenium is installed. "
            "This might be due to other missing dependencies."
        )
    else:
        __all__.append("WebCrawler")
else:
    # Synchronous WebCrawler is not available. Install crawl4ai[sync] for
    # synchronous support (note: the synchronous version will be deprecated soon).
    WebCrawler = None
|
||||
2
crawl4ai/__version__.py
Normal file
@@ -0,0 +1,2 @@
|
||||
# crawl4ai/__version__.py
|
||||
__version__ = "0.4.246"
|
||||
603
crawl4ai/async_configs.py
Normal file
@@ -0,0 +1,603 @@
|
||||
from .config import (
|
||||
MIN_WORD_THRESHOLD,
|
||||
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
|
||||
SCREENSHOT_HEIGHT_TRESHOLD,
|
||||
PAGE_TIMEOUT,
|
||||
IMAGE_SCORE_THRESHOLD,
|
||||
SOCIAL_MEDIA_DOMAINS,
|
||||
|
||||
)
|
||||
from .user_agent_generator import UserAgentGenerator
|
||||
from .extraction_strategy import ExtractionStrategy
|
||||
from .chunking_strategy import ChunkingStrategy
|
||||
from .markdown_generation_strategy import MarkdownGenerationStrategy
|
||||
from typing import Union, List
|
||||
|
||||
|
||||
class BrowserConfig:
    """
    Configuration class for setting up a browser instance and its context in AsyncPlaywrightCrawlerStrategy.

    This class centralizes all parameters that affect browser and context creation. Instead of passing
    scattered keyword arguments, users can instantiate and modify this configuration object. The crawler
    code will then reference these settings to initialize the browser in a consistent, documented manner.

    Attributes:
        browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit".
                            Default: "chromium".
        headless (bool): Whether to run the browser in headless mode (no visible GUI).
                         Default: True.
        use_managed_browser (bool): Launch the browser using a managed approach (e.g., via CDP), allowing
                                    advanced manipulation. Default: False.
        debugging_port (int): Port for the browser debugging protocol. Default: 9222.
        use_persistent_context (bool): Use a persistent browser context (like a persistent profile).
                                       Automatically sets use_managed_browser=True. Default: False.
        user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a
                                     temporary directory may be used. Default: None.
        chrome_channel (str): The Chrome channel to launch (e.g., "chrome", "msedge"). Only applies if browser_type
                              is "chromium". A falsy value falls back to browser_type. Default: "chromium".
        channel (str): The channel to launch (e.g., "chromium", "chrome", "msedge"). Only applies if browser_type
                       is "chromium". A falsy value falls back to browser_type. Default: "chromium".
        proxy (str or None): Proxy server URL (e.g., "http://username:password@proxy:port"). If None, no proxy is used.
                             Default: None.
        proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
                                     If None, no additional proxy config. Default: None.
        viewport_width (int): Default viewport width for pages. Default: 1080.
        viewport_height (int): Default viewport height for pages. Default: 600.
        verbose (bool): Enable verbose logging.
                        Default: True.
        accept_downloads (bool): Whether to allow file downloads. If True, requires a downloads_path.
                                 Default: False.
        downloads_path (str or None): Directory to store downloaded files. If None and accept_downloads is True,
                                      a default path will be created. Default: None.
        storage_state (str or dict or None): Path or object describing storage state (cookies, localStorage).
                                             Default: None.
        ignore_https_errors (bool): Ignore HTTPS certificate errors. Default: True.
        java_script_enabled (bool): Enable JavaScript execution in pages. Default: True.
        sleep_on_close (bool): Stored as-is on the config; it is not interpreted by this class.
                               Default: False.
        cookies (list): List of cookies to add to the browser context. Each cookie is a dict with fields like
                        {"name": "...", "value": "...", "url": "..."}.
                        Default: [].
        headers (dict): Extra HTTP headers to apply to all requests in this context. A "sec-ch-ua" entry
                        is added (via setdefault) if the caller did not supply one.
                        Default: {}.
        user_agent (str): Custom User-Agent string to use. Constructor default:
                          "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47".
                          Note that from_kwargs() falls back to a different Chrome 116 string when the
                          "user_agent" key is absent.
        user_agent_mode (str or None): Mode for generating the user agent. When "random", user_agent is
                                       replaced by a generated one. Default: None.
        user_agent_generator_config (dict or None): Keyword arguments forwarded to
                                                    UserAgentGenerator.generate(). A non-empty config triggers
                                                    generation even when user_agent_mode is not "random".
                                                    Default: None.
        text_mode (bool): If True, disables images and other rich content for potentially faster load times.
                          Default: False.
        light_mode (bool): Disables certain background features for performance gains. Default: False.
        extra_args (list): Additional command-line arguments passed to the browser.
                           Default: [].
        browser_hint: Value returned by UserAgentGenerator.generate_client_hints() for the final
                      user_agent; also used as the default "sec-ch-ua" header.
    """

    def __init__(
        self,
        browser_type: str = "chromium",
        headless: bool = True,
        use_managed_browser: bool = False,
        use_persistent_context: bool = False,
        user_data_dir: str = None,
        chrome_channel: str = "chromium",
        channel: str = "chromium",
        proxy: str = None,
        proxy_config: dict = None,
        viewport_width: int = 1080,
        viewport_height: int = 600,
        accept_downloads: bool = False,
        downloads_path: str = None,
        storage_state=None,
        ignore_https_errors: bool = True,
        java_script_enabled: bool = True,
        sleep_on_close: bool = False,
        verbose: bool = True,
        cookies: list = None,
        headers: dict = None,
        user_agent: str = (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47"
        ),
        user_agent_mode: str = None,
        user_agent_generator_config: dict = None,
        text_mode: bool = False,
        light_mode: bool = False,
        extra_args: list = None,
        debugging_port: int = 9222,
    ):
        self.browser_type = browser_type
        self.headless = headless
        self.use_managed_browser = use_managed_browser
        self.use_persistent_context = use_persistent_context
        self.user_data_dir = user_data_dir
        # A falsy channel falls back to the browser type, then to "chromium".
        self.chrome_channel = chrome_channel or self.browser_type or "chromium"
        self.channel = channel or self.browser_type or "chromium"
        self.proxy = proxy
        self.proxy_config = proxy_config
        self.viewport_width = viewport_width
        self.viewport_height = viewport_height
        self.accept_downloads = accept_downloads
        self.downloads_path = downloads_path
        self.storage_state = storage_state
        self.ignore_https_errors = ignore_https_errors
        self.java_script_enabled = java_script_enabled
        # None means "not supplied"; a fresh container is created per instance.
        self.cookies = cookies if cookies is not None else []
        self.headers = headers if headers is not None else {}
        self.user_agent = user_agent
        self.user_agent_mode = user_agent_mode
        self.user_agent_generator_config = user_agent_generator_config
        self.text_mode = text_mode
        self.light_mode = light_mode
        self.extra_args = extra_args if extra_args is not None else []
        self.sleep_on_close = sleep_on_close
        self.verbose = verbose
        self.debugging_port = debugging_port

        user_agent_generator = UserAgentGenerator()
        # Generate a user agent when random mode is requested or a generator
        # config is supplied. The config is always forwarded, so it is no longer
        # ignored when user_agent_mode == "random".
        if self.user_agent_mode == "random" or self.user_agent_generator_config:
            self.user_agent = user_agent_generator.generate(
                **(self.user_agent_generator_config or {})
            )

        # Derive client hints from whichever user agent ended up selected and
        # advertise them unless the caller already set the header.
        self.browser_hint = user_agent_generator.generate_client_hints(self.user_agent)
        self.headers.setdefault("sec-ch-ua", self.browser_hint)

        # If persistent context is requested, ensure managed browser is enabled
        if self.use_persistent_context:
            self.use_managed_browser = True

    @staticmethod
    def from_kwargs(kwargs: dict) -> "BrowserConfig":
        """
        Build a BrowserConfig from a plain dict of (legacy) keyword arguments.

        Unknown keys are ignored; missing keys fall back to the defaults below.

        Args:
            kwargs (dict): Mapping of BrowserConfig parameter names to values.

        Returns:
            BrowserConfig: A new configuration object.
        """
        return BrowserConfig(
            browser_type=kwargs.get("browser_type", "chromium"),
            headless=kwargs.get("headless", True),
            use_managed_browser=kwargs.get("use_managed_browser", False),
            use_persistent_context=kwargs.get("use_persistent_context", False),
            user_data_dir=kwargs.get("user_data_dir"),
            chrome_channel=kwargs.get("chrome_channel", "chromium"),
            channel=kwargs.get("channel", "chromium"),
            proxy=kwargs.get("proxy"),
            proxy_config=kwargs.get("proxy_config"),
            viewport_width=kwargs.get("viewport_width", 1080),
            viewport_height=kwargs.get("viewport_height", 600),
            accept_downloads=kwargs.get("accept_downloads", False),
            downloads_path=kwargs.get("downloads_path"),
            storage_state=kwargs.get("storage_state"),
            ignore_https_errors=kwargs.get("ignore_https_errors", True),
            java_script_enabled=kwargs.get("java_script_enabled", True),
            # These three were previously dropped, so values supplied through
            # kwargs never reached the config.
            sleep_on_close=kwargs.get("sleep_on_close", False),
            verbose=kwargs.get("verbose", True),
            debugging_port=kwargs.get("debugging_port", 9222),
            cookies=kwargs.get("cookies", []),
            headers=kwargs.get("headers", {}),
            # NOTE(review): this fallback differs from the __init__ default
            # user agent; kept as-is to preserve existing behavior.
            user_agent=kwargs.get(
                "user_agent",
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
            ),
            user_agent_mode=kwargs.get("user_agent_mode"),
            user_agent_generator_config=kwargs.get("user_agent_generator_config"),
            text_mode=kwargs.get("text_mode", False),
            light_mode=kwargs.get("light_mode", False),
            extra_args=kwargs.get("extra_args", []),
        )
|
||||
|
||||
|
||||
class CrawlerRunConfig:
    """
    Configuration class for controlling how the crawler runs each crawl operation.
    This includes parameters for content extraction, page manipulation, waiting conditions,
    caching, and other runtime behaviors.

    This centralizes parameters that were previously scattered as kwargs to `arun()` and related methods.
    By using this class, you have a single place to understand and adjust the crawling options.

    Attributes:
        # Content Processing Parameters
        word_count_threshold (int): Minimum word count threshold before processing content.
                                    Default: MIN_WORD_THRESHOLD (from config).
        extraction_strategy (ExtractionStrategy or None): Strategy to extract structured data from crawled pages.
                                                          Default: None (NoExtractionStrategy is used if None).
        chunking_strategy (ChunkingStrategy): Strategy to chunk content before extraction.
                                              Default: RegexChunking() (created when None is passed).
        markdown_generator (MarkdownGenerationStrategy): Strategy for generating markdown.
                                                         Default: None.
        content_filter (RelevantContentFilter or None): Optional filter to prune irrelevant content.
                                                        Default: None.
        only_text (bool): If True, attempt to extract text-only content where applicable.
                          Default: False.
        css_selector (str or None): CSS selector to extract a specific portion of the page.
                                    Default: None.
        excluded_tags (list of str or None): List of HTML tags to exclude from processing.
                                             Stored as [] when None/empty. Default: None.
        excluded_selector (str or None): CSS selector to exclude from processing.
                                         Stored as "" when None/empty. Default: None.
        keep_data_attributes (bool): If True, retain `data-*` attributes while removing unwanted attributes.
                                     Default: False.
        remove_forms (bool): If True, remove all `<form>` elements from the HTML.
                             Default: False.
        prettiify (bool): If True, apply `fast_format_html` to produce prettified HTML output.
                          (The parameter name is spelled this way in the public API.)
                          Default: False.
        parser_type (str): Type of parser to use for HTML parsing.
                           Default: "lxml".

        # SSL Parameters
        fetch_ssl_certificate (bool): Stored as-is on the config; presumably asks the crawler to fetch the
                                      site's SSL certificate (confirm against the crawler strategy).
                                      Default: False.

        # Caching Parameters
        cache_mode (CacheMode or None): Defines how caching is handled.
                                        If None, defaults to CacheMode.ENABLED internally.
                                        Default: None.
        session_id (str or None): Optional session ID to persist the browser context and the created
                                  page instance. If the ID already exists, the crawler does not
                                  create a new page and uses the current page to preserve the state.
        bypass_cache (bool): Legacy parameter, if True acts like CacheMode.BYPASS.
                             Default: False.
        disable_cache (bool): Legacy parameter, if True acts like CacheMode.DISABLED.
                              Default: False.
        no_cache_read (bool): Legacy parameter, if True acts like CacheMode.WRITE_ONLY.
                              Default: False.
        no_cache_write (bool): Legacy parameter, if True acts like CacheMode.READ_ONLY.
                               Default: False.

        # Page Navigation and Timing Parameters
        wait_until (str): The condition to wait for when navigating, e.g. "domcontentloaded".
                          Default: "domcontentloaded".
        page_timeout (int): Timeout in ms for page operations like navigation.
                            Default: PAGE_TIMEOUT (from config).
        wait_for (str or None): A CSS selector or JS condition to wait for before extracting content.
                                Default: None.
        wait_for_images (bool): If True, wait for images to load before extracting content.
                                Default: False.
        delay_before_return_html (float): Delay in seconds before retrieving final HTML.
                                          Default: 0.1.
        mean_delay (float): Mean base delay between requests when calling arun_many.
                            Default: 0.1.
        max_range (float): Max random additional delay range for requests in arun_many.
                           Default: 0.3.
        semaphore_count (int): Number of concurrent operations allowed.
                               Default: 5.

        # Page Interaction Parameters
        js_code (str or list of str or None): JavaScript code/snippets to run on the page.
                                              Default: None.
        js_only (bool): If True, indicates subsequent calls are JS-driven updates, not full page loads.
                        Default: False.
        ignore_body_visibility (bool): If True, ignore whether the body is visible before proceeding.
                                       Default: True.
        scan_full_page (bool): If True, scroll through the entire page to load all content.
                               Default: False.
        scroll_delay (float): Delay in seconds between scroll steps if scan_full_page is True.
                              Default: 0.2.
        process_iframes (bool): If True, attempts to process and inline iframe content.
                                Default: False.
        remove_overlay_elements (bool): If True, remove overlays/popups before extracting HTML.
                                        Default: False.
        simulate_user (bool): If True, simulate user interactions (mouse moves, clicks) for anti-bot measures.
                              Default: False.
        override_navigator (bool): If True, overrides navigator properties for more human-like behavior.
                                   Default: False.
        magic (bool): If True, attempts automatic handling of overlays/popups.
                      Default: False.
        adjust_viewport_to_content (bool): If True, adjust viewport according to the page content dimensions.
                                           Default: False.

        # Media Handling Parameters
        screenshot (bool): Whether to take a screenshot after crawling.
                           Default: False.
        screenshot_wait_for (float or None): Additional wait time before taking a screenshot.
                                             Default: None.
        screenshot_height_threshold (int): Threshold for page height to decide screenshot strategy.
                                           Default: SCREENSHOT_HEIGHT_TRESHOLD (from config).
        pdf (bool): Whether to generate a PDF of the page.
                    Default: False.
        image_description_min_word_threshold (int): Minimum words for image description extraction.
                                                    Default: IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD (from config).
        image_score_threshold (int): Minimum score threshold for processing an image.
                                     Default: IMAGE_SCORE_THRESHOLD (from config).
        exclude_external_images (bool): If True, exclude all external images from processing.
                                        Default: False.

        # Link and Domain Handling Parameters
        exclude_social_media_domains (list of str): List of domains to exclude for social media links.
                                                    Default: a copy of SOCIAL_MEDIA_DOMAINS (from config).
        exclude_external_links (bool): If True, exclude all external links from the results.
                                       Default: False.
        exclude_social_media_links (bool): If True, exclude links pointing to social media domains.
                                           Default: False.
        exclude_domains (list of str): List of specific domains to exclude from results.
                                       Default: [].

        # Debugging and Logging Parameters
        verbose (bool): Enable verbose logging.
                        Default: True.
        log_console (bool): If True, log console messages from the page.
                            Default: False.

        # Target
        url (str or None): Stored as-is on the config. Default: None.
    """

    def __init__(
        self,
        # Content Processing Parameters
        word_count_threshold: int = MIN_WORD_THRESHOLD,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = None,
        markdown_generator: MarkdownGenerationStrategy = None,
        content_filter=None,
        only_text: bool = False,
        css_selector: str = None,
        excluded_tags: list = None,
        excluded_selector: str = None,
        keep_data_attributes: bool = False,
        remove_forms: bool = False,
        prettiify: bool = False,
        parser_type: str = "lxml",

        # SSL Parameters
        fetch_ssl_certificate: bool = False,

        # Caching Parameters
        cache_mode=None,
        session_id: str = None,
        bypass_cache: bool = False,
        disable_cache: bool = False,
        no_cache_read: bool = False,
        no_cache_write: bool = False,

        # Page Navigation and Timing Parameters
        wait_until: str = "domcontentloaded",
        page_timeout: int = PAGE_TIMEOUT,
        wait_for: str = None,
        wait_for_images: bool = False,
        delay_before_return_html: float = 0.1,
        mean_delay: float = 0.1,
        max_range: float = 0.3,
        semaphore_count: int = 5,

        # Page Interaction Parameters
        js_code: Union[str, List[str]] = None,
        js_only: bool = False,
        ignore_body_visibility: bool = True,
        scan_full_page: bool = False,
        scroll_delay: float = 0.2,
        process_iframes: bool = False,
        remove_overlay_elements: bool = False,
        simulate_user: bool = False,
        override_navigator: bool = False,
        magic: bool = False,
        adjust_viewport_to_content: bool = False,

        # Media Handling Parameters
        screenshot: bool = False,
        screenshot_wait_for: float = None,
        screenshot_height_threshold: int = SCREENSHOT_HEIGHT_TRESHOLD,
        pdf: bool = False,
        image_description_min_word_threshold: int = IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
        image_score_threshold: int = IMAGE_SCORE_THRESHOLD,
        exclude_external_images: bool = False,

        # Link and Domain Handling Parameters
        exclude_social_media_domains: list = None,
        exclude_external_links: bool = False,
        exclude_social_media_links: bool = False,
        exclude_domains: list = None,

        # Debugging and Logging Parameters
        verbose: bool = True,
        log_console: bool = False,

        url: str = None,
    ):
        self.url = url

        # Content Processing Parameters
        self.word_count_threshold = word_count_threshold
        self.extraction_strategy = extraction_strategy
        self.chunking_strategy = chunking_strategy
        self.markdown_generator = markdown_generator
        self.content_filter = content_filter
        self.only_text = only_text
        self.css_selector = css_selector
        self.excluded_tags = excluded_tags or []
        self.excluded_selector = excluded_selector or ""
        self.keep_data_attributes = keep_data_attributes
        self.remove_forms = remove_forms
        self.prettiify = prettiify
        self.parser_type = parser_type

        # SSL Parameters
        self.fetch_ssl_certificate = fetch_ssl_certificate

        # Caching Parameters
        self.cache_mode = cache_mode
        self.session_id = session_id
        self.bypass_cache = bypass_cache
        self.disable_cache = disable_cache
        self.no_cache_read = no_cache_read
        self.no_cache_write = no_cache_write

        # Page Navigation and Timing Parameters
        self.wait_until = wait_until
        self.page_timeout = page_timeout
        self.wait_for = wait_for
        self.wait_for_images = wait_for_images
        self.delay_before_return_html = delay_before_return_html
        self.mean_delay = mean_delay
        self.max_range = max_range
        self.semaphore_count = semaphore_count

        # Page Interaction Parameters
        self.js_code = js_code
        self.js_only = js_only
        self.ignore_body_visibility = ignore_body_visibility
        self.scan_full_page = scan_full_page
        self.scroll_delay = scroll_delay
        self.process_iframes = process_iframes
        self.remove_overlay_elements = remove_overlay_elements
        self.simulate_user = simulate_user
        self.override_navigator = override_navigator
        self.magic = magic
        self.adjust_viewport_to_content = adjust_viewport_to_content

        # Media Handling Parameters
        self.screenshot = screenshot
        self.screenshot_wait_for = screenshot_wait_for
        self.screenshot_height_threshold = screenshot_height_threshold
        self.pdf = pdf
        self.image_description_min_word_threshold = image_description_min_word_threshold
        self.image_score_threshold = image_score_threshold
        self.exclude_external_images = exclude_external_images

        # Link and Domain Handling Parameters
        # Copy the module-level default so that mutating one config's list can
        # never leak into SOCIAL_MEDIA_DOMAINS or into other configs.
        self.exclude_social_media_domains = exclude_social_media_domains or list(SOCIAL_MEDIA_DOMAINS)
        self.exclude_external_links = exclude_external_links
        self.exclude_social_media_links = exclude_social_media_links
        self.exclude_domains = exclude_domains or []

        # Debugging and Logging Parameters
        self.verbose = verbose
        self.log_console = log_console

        # Validate type of extraction strategy and chunking strategy if they are provided
        if self.extraction_strategy is not None and not isinstance(
            self.extraction_strategy, ExtractionStrategy
        ):
            raise ValueError("extraction_strategy must be an instance of ExtractionStrategy")
        if self.chunking_strategy is not None and not isinstance(
            self.chunking_strategy, ChunkingStrategy
        ):
            raise ValueError("chunking_strategy must be an instance of ChunkingStrategy")

        # Set default chunking strategy if None
        if self.chunking_strategy is None:
            from .chunking_strategy import RegexChunking

            self.chunking_strategy = RegexChunking()

    @staticmethod
    def from_kwargs(kwargs: dict) -> "CrawlerRunConfig":
        """
        Build a CrawlerRunConfig from a plain dict of (legacy) keyword arguments.

        Unknown keys are ignored; missing keys fall back to the defaults below.

        Args:
            kwargs (dict): Mapping of CrawlerRunConfig parameter names to values.

        Returns:
            CrawlerRunConfig: A new configuration object.

        Raises:
            ValueError: Propagated from __init__ when extraction_strategy or
                chunking_strategy has the wrong type.
        """
        return CrawlerRunConfig(
            # Content Processing Parameters
            # NOTE(review): literal 200 here vs MIN_WORD_THRESHOLD in __init__ -
            # confirm the two are meant to agree.
            word_count_threshold=kwargs.get("word_count_threshold", 200),
            extraction_strategy=kwargs.get("extraction_strategy"),
            chunking_strategy=kwargs.get("chunking_strategy"),
            markdown_generator=kwargs.get("markdown_generator"),
            content_filter=kwargs.get("content_filter"),
            only_text=kwargs.get("only_text", False),
            css_selector=kwargs.get("css_selector"),
            excluded_tags=kwargs.get("excluded_tags", []),
            excluded_selector=kwargs.get("excluded_selector", ""),
            keep_data_attributes=kwargs.get("keep_data_attributes", False),
            remove_forms=kwargs.get("remove_forms", False),
            prettiify=kwargs.get("prettiify", False),
            parser_type=kwargs.get("parser_type", "lxml"),

            # SSL Parameters
            fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),

            # Caching Parameters
            cache_mode=kwargs.get("cache_mode"),
            session_id=kwargs.get("session_id"),
            bypass_cache=kwargs.get("bypass_cache", False),
            disable_cache=kwargs.get("disable_cache", False),
            no_cache_read=kwargs.get("no_cache_read", False),
            no_cache_write=kwargs.get("no_cache_write", False),

            # Page Navigation and Timing Parameters
            wait_until=kwargs.get("wait_until", "domcontentloaded"),
            # NOTE(review): literal 60000 here vs PAGE_TIMEOUT in __init__ -
            # confirm the two are meant to agree.
            page_timeout=kwargs.get("page_timeout", 60000),
            wait_for=kwargs.get("wait_for"),
            wait_for_images=kwargs.get("wait_for_images", False),
            delay_before_return_html=kwargs.get("delay_before_return_html", 0.1),
            mean_delay=kwargs.get("mean_delay", 0.1),
            max_range=kwargs.get("max_range", 0.3),
            semaphore_count=kwargs.get("semaphore_count", 5),

            # Page Interaction Parameters
            js_code=kwargs.get("js_code"),
            js_only=kwargs.get("js_only", False),
            ignore_body_visibility=kwargs.get("ignore_body_visibility", True),
            scan_full_page=kwargs.get("scan_full_page", False),
            scroll_delay=kwargs.get("scroll_delay", 0.2),
            process_iframes=kwargs.get("process_iframes", False),
            remove_overlay_elements=kwargs.get("remove_overlay_elements", False),
            simulate_user=kwargs.get("simulate_user", False),
            override_navigator=kwargs.get("override_navigator", False),
            magic=kwargs.get("magic", False),
            adjust_viewport_to_content=kwargs.get("adjust_viewport_to_content", False),

            # Media Handling Parameters
            screenshot=kwargs.get("screenshot", False),
            screenshot_wait_for=kwargs.get("screenshot_wait_for"),
            screenshot_height_threshold=kwargs.get("screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD),
            pdf=kwargs.get("pdf", False),
            image_description_min_word_threshold=kwargs.get("image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD),
            image_score_threshold=kwargs.get("image_score_threshold", IMAGE_SCORE_THRESHOLD),
            exclude_external_images=kwargs.get("exclude_external_images", False),

            # Link and Domain Handling Parameters
            # No default here: __init__ substitutes a private copy of
            # SOCIAL_MEDIA_DOMAINS when the key is absent.
            exclude_social_media_domains=kwargs.get("exclude_social_media_domains"),
            exclude_external_links=kwargs.get("exclude_external_links", False),
            exclude_social_media_links=kwargs.get("exclude_social_media_links", False),
            exclude_domains=kwargs.get("exclude_domains", []),

            # Debugging and Logging Parameters
            verbose=kwargs.get("verbose", True),
            log_console=kwargs.get("log_console", False),

            url=kwargs.get("url"),
        )

    def to_dict(self):
        """
        Return the configuration as a plain dict keyed by parameter name.

        Values are the live attribute objects (no copies are made), so the
        result can be passed back to from_kwargs().

        Returns:
            dict: Mapping of every constructor parameter name to its current value.
        """
        return {
            "word_count_threshold": self.word_count_threshold,
            "extraction_strategy": self.extraction_strategy,
            "chunking_strategy": self.chunking_strategy,
            "markdown_generator": self.markdown_generator,
            "content_filter": self.content_filter,
            "only_text": self.only_text,
            "css_selector": self.css_selector,
            "excluded_tags": self.excluded_tags,
            "excluded_selector": self.excluded_selector,
            "keep_data_attributes": self.keep_data_attributes,
            "remove_forms": self.remove_forms,
            "prettiify": self.prettiify,
            "parser_type": self.parser_type,
            "fetch_ssl_certificate": self.fetch_ssl_certificate,
            "cache_mode": self.cache_mode,
            "session_id": self.session_id,
            "bypass_cache": self.bypass_cache,
            "disable_cache": self.disable_cache,
            "no_cache_read": self.no_cache_read,
            "no_cache_write": self.no_cache_write,
            "wait_until": self.wait_until,
            "page_timeout": self.page_timeout,
            "wait_for": self.wait_for,
            "wait_for_images": self.wait_for_images,
            "delay_before_return_html": self.delay_before_return_html,
            "mean_delay": self.mean_delay,
            "max_range": self.max_range,
            "semaphore_count": self.semaphore_count,
            "js_code": self.js_code,
            "js_only": self.js_only,
            "ignore_body_visibility": self.ignore_body_visibility,
            "scan_full_page": self.scan_full_page,
            "scroll_delay": self.scroll_delay,
            "process_iframes": self.process_iframes,
            "remove_overlay_elements": self.remove_overlay_elements,
            "simulate_user": self.simulate_user,
            "override_navigator": self.override_navigator,
            "magic": self.magic,
            "adjust_viewport_to_content": self.adjust_viewport_to_content,
            "screenshot": self.screenshot,
            "screenshot_wait_for": self.screenshot_wait_for,
            "screenshot_height_threshold": self.screenshot_height_threshold,
            "pdf": self.pdf,
            "image_description_min_word_threshold": self.image_description_min_word_threshold,
            "image_score_threshold": self.image_score_threshold,
            "exclude_external_images": self.exclude_external_images,
            "exclude_social_media_domains": self.exclude_social_media_domains,
            "exclude_external_links": self.exclude_external_links,
            "exclude_social_media_links": self.exclude_social_media_links,
            "exclude_domains": self.exclude_domains,
            "verbose": self.verbose,
            "log_console": self.log_console,
            "url": self.url,
        }
|
||||
2161
crawl4ai/async_crawler_strategy.py
Normal file
495
crawl4ai/async_database.py
Normal file
@@ -0,0 +1,495 @@
|
||||
import os, sys
|
||||
from pathlib import Path
|
||||
import aiosqlite
|
||||
import asyncio
|
||||
from typing import Optional, Tuple, Dict
|
||||
from contextlib import asynccontextmanager
|
||||
import logging
|
||||
import json # Added for serialization/deserialization
|
||||
from .utils import ensure_content_dirs, generate_content_hash
|
||||
from .models import CrawlResult, MarkdownGenerationResult
|
||||
import xxhash
|
||||
import aiofiles
|
||||
from .config import NEED_MIGRATION
|
||||
from .version_manager import VersionManager
|
||||
from .async_logger import AsyncLogger
|
||||
from .utils import get_error_context, create_box_message
|
||||
# Module-wide logging: INFO level on the root logger, plus a logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Everything lives under "<base>/.crawl4ai", where <base> is taken from the
# CRAWL4_AI_BASE_DIRECTORY environment variable or, failing that, the home directory.
base_directory = os.path.join(
    os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()),
    ".crawl4ai",
)
os.makedirs(base_directory, exist_ok=True)

# Location of the SQLite database file inside that directory.
DB_PATH = os.path.join(base_directory, "crawl4ai.db")
|
||||
|
||||
class AsyncDatabaseManager:
|
||||
def __init__(self, pool_size: int = 10, max_retries: int = 3):
    """
    Prepare paths, pool bookkeeping, version tracking and logging.

    Args:
        pool_size (int): Stored as `pool_size` and used as the initial value
            of the connection semaphore. Default: 10.
        max_retries (int): Stored as `max_retries`. Default: 3.
    """
    # Storage locations: the database file and the content directories next to it.
    database_file = DB_PATH
    self.db_path = database_file
    self.content_paths = ensure_content_dirs(os.path.dirname(database_file))

    # Pool settings and the (initially empty) pool keyed by int.
    self.pool_size = pool_size
    self.max_retries = max_retries
    self.connection_pool: Dict[int, aiosqlite.Connection] = {}

    # Synchronisation primitives.
    self.pool_lock = asyncio.Lock()
    self.init_lock = asyncio.Lock()
    self.connection_semaphore = asyncio.Semaphore(pool_size)

    self._initialized = False
    self.version_manager = VersionManager()

    # NOTE(review): base_directory already ends in ".crawl4ai" at module level,
    # so this resolves to ".../.crawl4ai/.crawl4ai/crawler_db.log" - confirm
    # that the doubled segment is intended.
    log_path = os.path.join(base_directory, ".crawl4ai", "crawler_db.log")
    self.logger = AsyncLogger(log_file=log_path, verbose=False, tag_width=10)
|
||||
|
||||
|
||||
async def initialize(self):
|
||||
"""Initialize the database and connection pool"""
|
||||
try:
|
||||
self.logger.info("Initializing database", tag="INIT")
|
||||
# Ensure the database file exists
|
||||
os.makedirs(os.path.dirname(self.db_path), exist_ok=True)
|
||||
|
||||
# Check if version update is needed
|
||||
needs_update = self.version_manager.needs_update()
|
||||
|
||||
# Always ensure base table exists
|
||||
await self.ainit_db()
|
||||
|
||||
# Verify the table exists
|
||||
async with aiosqlite.connect(self.db_path, timeout=30.0) as db:
|
||||
async with db.execute(
|
||||
"SELECT name FROM sqlite_master WHERE type='table' AND name='crawled_data'"
|
||||
) as cursor:
|
||||
result = await cursor.fetchone()
|
||||
if not result:
|
||||
raise Exception("crawled_data table was not created")
|
||||
|
||||
# If version changed or fresh install, run updates
|
||||
if needs_update:
|
||||
self.logger.info("New version detected, running updates", tag="INIT")
|
||||
await self.update_db_schema()
|
||||
from .migrations import run_migration # Import here to avoid circular imports
|
||||
await run_migration()
|
||||
self.version_manager.update_version() # Update stored version after successful migration
|
||||
self.logger.success("Version update completed successfully", tag="COMPLETE")
|
||||
else:
|
||||
self.logger.success("Database initialization completed successfully", tag="COMPLETE")
|
||||
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(
|
||||
message="Database initialization error: {error}",
|
||||
tag="ERROR",
|
||||
params={"error": str(e)}
|
||||
)
|
||||
self.logger.info(
|
||||
message="Database will be initialized on first use",
|
||||
tag="INIT"
|
||||
)
|
||||
|
||||
raise
|
||||
|
||||
|
||||
async def cleanup(self):
|
||||
"""Cleanup connections when shutting down"""
|
||||
async with self.pool_lock:
|
||||
for conn in self.connection_pool.values():
|
||||
await conn.close()
|
||||
self.connection_pool.clear()
|
||||
|
||||
@asynccontextmanager
|
||||
async def get_connection(self):
|
||||
"""Connection pool manager with enhanced error handling"""
|
||||
if not self._initialized:
|
||||
async with self.init_lock:
|
||||
if not self._initialized:
|
||||
try:
|
||||
await self.initialize()
|
||||
self._initialized = True
|
||||
except Exception as e:
|
||||
import sys
|
||||
error_context = get_error_context(sys.exc_info())
|
||||
self.logger.error(
|
||||
message="Database initialization failed:\n{error}\n\nContext:\n{context}\n\nTraceback:\n{traceback}",
|
||||
tag="ERROR",
|
||||
force_verbose=True,
|
||||
params={
|
||||
"error": str(e),
|
||||
"context": error_context["code_context"],
|
||||
"traceback": error_context["full_traceback"]
|
||||
}
|
||||
)
|
||||
raise
|
||||
|
||||
await self.connection_semaphore.acquire()
|
||||
task_id = id(asyncio.current_task())
|
||||
|
||||
try:
|
||||
async with self.pool_lock:
|
||||
if task_id not in self.connection_pool:
|
||||
try:
|
||||
conn = await aiosqlite.connect(
|
||||
self.db_path,
|
||||
timeout=30.0
|
||||
)
|
||||
await conn.execute('PRAGMA journal_mode = WAL')
|
||||
await conn.execute('PRAGMA busy_timeout = 5000')
|
||||
|
||||
# Verify database structure
|
||||
async with conn.execute("PRAGMA table_info(crawled_data)") as cursor:
|
||||
columns = await cursor.fetchall()
|
||||
column_names = [col[1] for col in columns]
|
||||
expected_columns = {
|
||||
'url', 'html', 'cleaned_html', 'markdown', 'extracted_content',
|
||||
'success', 'media', 'links', 'metadata', 'screenshot',
|
||||
'response_headers', 'downloaded_files'
|
||||
}
|
||||
missing_columns = expected_columns - set(column_names)
|
||||
if missing_columns:
|
||||
raise ValueError(f"Database missing columns: {missing_columns}")
|
||||
|
||||
self.connection_pool[task_id] = conn
|
||||
except Exception as e:
|
||||
import sys
|
||||
error_context = get_error_context(sys.exc_info())
|
||||
error_message = (
|
||||
f"Unexpected error in db get_connection at line {error_context['line_no']} "
|
||||
f"in {error_context['function']} ({error_context['filename']}):\n"
|
||||
f"Error: {str(e)}\n\n"
|
||||
f"Code context:\n{error_context['code_context']}"
|
||||
)
|
||||
self.logger.error(
|
||||
message=create_box_message(error_message, type= "error"),
|
||||
)
|
||||
|
||||
raise
|
||||
|
||||
yield self.connection_pool[task_id]
|
||||
|
||||
except Exception as e:
|
||||
import sys
|
||||
error_context = get_error_context(sys.exc_info())
|
||||
error_message = (
|
||||
f"Unexpected error in db get_connection at line {error_context['line_no']} "
|
||||
f"in {error_context['function']} ({error_context['filename']}):\n"
|
||||
f"Error: {str(e)}\n\n"
|
||||
f"Code context:\n{error_context['code_context']}"
|
||||
)
|
||||
self.logger.error(
|
||||
message=create_box_message(error_message, type= "error"),
|
||||
)
|
||||
raise
|
||||
finally:
|
||||
async with self.pool_lock:
|
||||
if task_id in self.connection_pool:
|
||||
await self.connection_pool[task_id].close()
|
||||
del self.connection_pool[task_id]
|
||||
self.connection_semaphore.release()
|
||||
|
||||
|
||||
async def execute_with_retry(self, operation, *args):
|
||||
"""Execute database operations with retry logic"""
|
||||
for attempt in range(self.max_retries):
|
||||
try:
|
||||
async with self.get_connection() as db:
|
||||
result = await operation(db, *args)
|
||||
await db.commit()
|
||||
return result
|
||||
except Exception as e:
|
||||
if attempt == self.max_retries - 1:
|
||||
self.logger.error(
|
||||
message="Operation failed after {retries} attempts: {error}",
|
||||
tag="ERROR",
|
||||
force_verbose=True,
|
||||
params={
|
||||
"retries": self.max_retries,
|
||||
"error": str(e)
|
||||
}
|
||||
)
|
||||
raise
|
||||
await asyncio.sleep(1 * (attempt + 1)) # Exponential backoff
|
||||
|
||||
async def ainit_db(self):
|
||||
"""Initialize database schema"""
|
||||
async with aiosqlite.connect(self.db_path, timeout=30.0) as db:
|
||||
await db.execute('''
|
||||
CREATE TABLE IF NOT EXISTS crawled_data (
|
||||
url TEXT PRIMARY KEY,
|
||||
html TEXT,
|
||||
cleaned_html TEXT,
|
||||
markdown TEXT,
|
||||
extracted_content TEXT,
|
||||
success BOOLEAN,
|
||||
media TEXT DEFAULT "{}",
|
||||
links TEXT DEFAULT "{}",
|
||||
metadata TEXT DEFAULT "{}",
|
||||
screenshot TEXT DEFAULT "",
|
||||
response_headers TEXT DEFAULT "{}",
|
||||
downloaded_files TEXT DEFAULT "{}" -- New column added
|
||||
)
|
||||
''')
|
||||
await db.commit()
|
||||
|
||||
|
||||
|
||||
async def update_db_schema(self):
|
||||
"""Update database schema if needed"""
|
||||
async with aiosqlite.connect(self.db_path, timeout=30.0) as db:
|
||||
cursor = await db.execute("PRAGMA table_info(crawled_data)")
|
||||
columns = await cursor.fetchall()
|
||||
column_names = [column[1] for column in columns]
|
||||
|
||||
# List of new columns to add
|
||||
new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers', 'downloaded_files']
|
||||
|
||||
for column in new_columns:
|
||||
if column not in column_names:
|
||||
await self.aalter_db_add_column(column, db)
|
||||
await db.commit()
|
||||
|
||||
async def aalter_db_add_column(self, new_column: str, db):
|
||||
"""Add new column to the database"""
|
||||
if new_column == 'response_headers':
|
||||
await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT "{{}}"')
|
||||
else:
|
||||
await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
|
||||
self.logger.info(
|
||||
message="Added column '{column}' to the database",
|
||||
tag="INIT",
|
||||
params={"column": new_column}
|
||||
)
|
||||
|
||||
|
||||
async def aget_cached_url(self, url: str) -> Optional[CrawlResult]:
|
||||
"""Retrieve cached URL data as CrawlResult"""
|
||||
async def _get(db):
|
||||
async with db.execute(
|
||||
'SELECT * FROM crawled_data WHERE url = ?', (url,)
|
||||
) as cursor:
|
||||
row = await cursor.fetchone()
|
||||
if not row:
|
||||
return None
|
||||
|
||||
# Get column names
|
||||
columns = [description[0] for description in cursor.description]
|
||||
# Create dict from row data
|
||||
row_dict = dict(zip(columns, row))
|
||||
|
||||
# Load content from files using stored hashes
|
||||
content_fields = {
|
||||
'html': row_dict['html'],
|
||||
'cleaned_html': row_dict['cleaned_html'],
|
||||
'markdown': row_dict['markdown'],
|
||||
'extracted_content': row_dict['extracted_content'],
|
||||
'screenshot': row_dict['screenshot'],
|
||||
'screenshots': row_dict['screenshot'],
|
||||
}
|
||||
|
||||
for field, hash_value in content_fields.items():
|
||||
if hash_value:
|
||||
content = await self._load_content(
|
||||
hash_value,
|
||||
field.split('_')[0] # Get content type from field name
|
||||
)
|
||||
row_dict[field] = content or ""
|
||||
else:
|
||||
row_dict[field] = ""
|
||||
|
||||
# Parse JSON fields
|
||||
json_fields = ['media', 'links', 'metadata', 'response_headers', 'markdown']
|
||||
for field in json_fields:
|
||||
try:
|
||||
row_dict[field] = json.loads(row_dict[field]) if row_dict[field] else {}
|
||||
except json.JSONDecodeError:
|
||||
row_dict[field] = {}
|
||||
|
||||
if isinstance(row_dict['markdown'], Dict):
|
||||
row_dict['markdown_v2'] = row_dict['markdown']
|
||||
if row_dict['markdown'].get('raw_markdown'):
|
||||
row_dict['markdown'] = row_dict['markdown']['raw_markdown']
|
||||
|
||||
# Parse downloaded_files
|
||||
try:
|
||||
row_dict['downloaded_files'] = json.loads(row_dict['downloaded_files']) if row_dict['downloaded_files'] else []
|
||||
except json.JSONDecodeError:
|
||||
row_dict['downloaded_files'] = []
|
||||
|
||||
# Remove any fields not in CrawlResult model
|
||||
valid_fields = CrawlResult.__annotations__.keys()
|
||||
filtered_dict = {k: v for k, v in row_dict.items() if k in valid_fields}
|
||||
|
||||
return CrawlResult(**filtered_dict)
|
||||
|
||||
try:
|
||||
return await self.execute_with_retry(_get)
|
||||
except Exception as e:
|
||||
self.logger.error(
|
||||
message="Error retrieving cached URL: {error}",
|
||||
tag="ERROR",
|
||||
force_verbose=True,
|
||||
params={"error": str(e)}
|
||||
)
|
||||
return None
|
||||
|
||||
async def acache_url(self, result: CrawlResult):
|
||||
"""Cache CrawlResult data"""
|
||||
# Store content files and get hashes
|
||||
content_map = {
|
||||
'html': (result.html, 'html'),
|
||||
'cleaned_html': (result.cleaned_html or "", 'cleaned'),
|
||||
'markdown': None,
|
||||
'extracted_content': (result.extracted_content or "", 'extracted'),
|
||||
'screenshot': (result.screenshot or "", 'screenshots')
|
||||
}
|
||||
|
||||
try:
|
||||
if isinstance(result.markdown, MarkdownGenerationResult):
|
||||
content_map['markdown'] = (result.markdown.model_dump_json(), 'markdown')
|
||||
elif hasattr(result, 'markdown_v2'):
|
||||
content_map['markdown'] = (result.markdown_v2.model_dump_json(), 'markdown')
|
||||
elif isinstance(result.markdown, str):
|
||||
markdown_result = MarkdownGenerationResult(raw_markdown=result.markdown)
|
||||
content_map['markdown'] = (markdown_result.model_dump_json(), 'markdown')
|
||||
else:
|
||||
content_map['markdown'] = (MarkdownGenerationResult().model_dump_json(), 'markdown')
|
||||
except Exception as e:
|
||||
self.logger.warning(
|
||||
message=f"Error processing markdown content: {str(e)}",
|
||||
tag="WARNING"
|
||||
)
|
||||
# Fallback to empty markdown result
|
||||
content_map['markdown'] = (MarkdownGenerationResult().model_dump_json(), 'markdown')
|
||||
|
||||
content_hashes = {}
|
||||
for field, (content, content_type) in content_map.items():
|
||||
content_hashes[field] = await self._store_content(content, content_type)
|
||||
|
||||
async def _cache(db):
|
||||
await db.execute('''
|
||||
INSERT INTO crawled_data (
|
||||
url, html, cleaned_html, markdown,
|
||||
extracted_content, success, media, links, metadata,
|
||||
screenshot, response_headers, downloaded_files
|
||||
)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
ON CONFLICT(url) DO UPDATE SET
|
||||
html = excluded.html,
|
||||
cleaned_html = excluded.cleaned_html,
|
||||
markdown = excluded.markdown,
|
||||
extracted_content = excluded.extracted_content,
|
||||
success = excluded.success,
|
||||
media = excluded.media,
|
||||
links = excluded.links,
|
||||
metadata = excluded.metadata,
|
||||
screenshot = excluded.screenshot,
|
||||
response_headers = excluded.response_headers,
|
||||
downloaded_files = excluded.downloaded_files
|
||||
''', (
|
||||
result.url,
|
||||
content_hashes['html'],
|
||||
content_hashes['cleaned_html'],
|
||||
content_hashes['markdown'],
|
||||
content_hashes['extracted_content'],
|
||||
result.success,
|
||||
json.dumps(result.media),
|
||||
json.dumps(result.links),
|
||||
json.dumps(result.metadata or {}),
|
||||
content_hashes['screenshot'],
|
||||
json.dumps(result.response_headers or {}),
|
||||
json.dumps(result.downloaded_files or [])
|
||||
))
|
||||
|
||||
try:
|
||||
await self.execute_with_retry(_cache)
|
||||
except Exception as e:
|
||||
self.logger.error(
|
||||
message="Error caching URL: {error}",
|
||||
tag="ERROR",
|
||||
force_verbose=True,
|
||||
params={"error": str(e)}
|
||||
)
|
||||
|
||||
|
||||
async def aget_total_count(self) -> int:
|
||||
"""Get total number of cached URLs"""
|
||||
async def _count(db):
|
||||
async with db.execute('SELECT COUNT(*) FROM crawled_data') as cursor:
|
||||
result = await cursor.fetchone()
|
||||
return result[0] if result else 0
|
||||
|
||||
try:
|
||||
return await self.execute_with_retry(_count)
|
||||
except Exception as e:
|
||||
self.logger.error(
|
||||
message="Error getting total count: {error}",
|
||||
tag="ERROR",
|
||||
force_verbose=True,
|
||||
params={"error": str(e)}
|
||||
)
|
||||
return 0
|
||||
|
||||
async def aclear_db(self):
|
||||
"""Clear all data from the database"""
|
||||
async def _clear(db):
|
||||
await db.execute('DELETE FROM crawled_data')
|
||||
|
||||
try:
|
||||
await self.execute_with_retry(_clear)
|
||||
except Exception as e:
|
||||
self.logger.error(
|
||||
message="Error clearing database: {error}",
|
||||
tag="ERROR",
|
||||
force_verbose=True,
|
||||
params={"error": str(e)}
|
||||
)
|
||||
|
||||
async def aflush_db(self):
|
||||
"""Drop the entire table"""
|
||||
async def _flush(db):
|
||||
await db.execute('DROP TABLE IF EXISTS crawled_data')
|
||||
|
||||
try:
|
||||
await self.execute_with_retry(_flush)
|
||||
except Exception as e:
|
||||
self.logger.error(
|
||||
message="Error flushing database: {error}",
|
||||
tag="ERROR",
|
||||
force_verbose=True,
|
||||
params={"error": str(e)}
|
||||
)
|
||||
|
||||
|
||||
async def _store_content(self, content: str, content_type: str) -> str:
|
||||
"""Store content in filesystem and return hash"""
|
||||
if not content:
|
||||
return ""
|
||||
|
||||
content_hash = generate_content_hash(content)
|
||||
file_path = os.path.join(self.content_paths[content_type], content_hash)
|
||||
|
||||
# Only write if file doesn't exist
|
||||
if not os.path.exists(file_path):
|
||||
async with aiofiles.open(file_path, 'w', encoding='utf-8') as f:
|
||||
await f.write(content)
|
||||
|
||||
return content_hash
|
||||
|
||||
async def _load_content(self, content_hash: str, content_type: str) -> Optional[str]:
|
||||
"""Load content from filesystem by hash"""
|
||||
if not content_hash:
|
||||
return None
|
||||
|
||||
file_path = os.path.join(self.content_paths[content_type], content_hash)
|
||||
try:
|
||||
async with aiofiles.open(file_path, 'r', encoding='utf-8') as f:
|
||||
return await f.read()
|
||||
except:
|
||||
self.logger.error(
|
||||
message="Failed to load content: {file_path}",
|
||||
tag="ERROR",
|
||||
force_verbose=True,
|
||||
params={"file_path": file_path}
|
||||
)
|
||||
return None
|
||||
|
||||
# Create a singleton instance
|
||||
# Built at import time, so importing this module runs AsyncDatabaseManager.__init__.
async_db_manager = AsyncDatabaseManager()
|
||||
231
crawl4ai/async_logger.py
Normal file
@@ -0,0 +1,231 @@
|
||||
from enum import Enum
|
||||
from typing import Optional, Dict, Any, Union
|
||||
from colorama import Fore, Back, Style, init
|
||||
import time
|
||||
import os
|
||||
from datetime import datetime
|
||||
|
||||
class LogLevel(Enum):
    """Log severities, numbered 1 (DEBUG) to 5 (ERROR) in increasing order."""

    # Members are compared by .value when filtering against a minimum level.
    DEBUG, INFO, SUCCESS, WARNING, ERROR = range(1, 6)
|
||||
|
||||
class AsyncLogger:
    """
    Logger with colored console output and optional plain-text file logging.

    Messages are ``str.format`` templates filled from a ``params`` dict;
    individual parameter values can be highlighted with their own color.
    All methods are synchronous.
    """

    DEFAULT_ICONS = {
        'INIT': '→',
        'READY': '✓',
        'FETCH': '↓',
        'SCRAPE': '◆',
        'EXTRACT': '■',
        'COMPLETE': '●',
        'ERROR': '×',
        'DEBUG': '⋯',
        'INFO': 'ℹ',
        'WARNING': '⚠',
    }

    DEFAULT_COLORS = {
        LogLevel.DEBUG: Fore.LIGHTBLACK_EX,
        LogLevel.INFO: Fore.CYAN,
        LogLevel.SUCCESS: Fore.GREEN,
        LogLevel.WARNING: Fore.YELLOW,
        LogLevel.ERROR: Fore.RED,
    }

    def __init__(
        self,
        log_file: Optional[str] = None,
        log_level: LogLevel = LogLevel.DEBUG,
        tag_width: int = 10,
        icons: Optional[Dict[str, str]] = None,
        colors: Optional[Dict[LogLevel, str]] = None,
        verbose: bool = True
    ):
        """
        Initialize the logger.

        Args:
            log_file: Optional file path for logging
            log_level: Minimum log level to display
            tag_width: Width for tag formatting
            icons: Custom icons for different tags
            colors: Custom colors for different log levels
            verbose: Whether to output to console
        """
        init()  # Initialize colorama
        self.log_file = log_file
        self.log_level = log_level
        self.tag_width = tag_width
        self.icons = icons or self.DEFAULT_ICONS
        self.colors = colors or self.DEFAULT_COLORS
        self.verbose = verbose

        # Create log file directory if needed
        if log_file:
            os.makedirs(os.path.dirname(os.path.abspath(log_file)), exist_ok=True)

    def _format_tag(self, tag: str) -> str:
        """Return ``[tag]`` right-padded with dots to ``tag_width`` characters."""
        return f"[{tag}]".ljust(self.tag_width, ".")

    def _get_icon(self, tag: str) -> str:
        """Return the icon for ``tag``, falling back to the info icon.

        A custom icon map that lacks an ``'INFO'`` entry falls back to the
        default info icon instead of raising ``KeyError``.
        """
        if tag in self.icons:
            return self.icons[tag]
        return self.icons.get('INFO', self.DEFAULT_ICONS['INFO'])

    def _write_to_file(self, message: str):
        """Append ``message``, timestamped and without color codes, to the log file."""
        if not self.log_file:
            return

        # Strip every colorama escape code (foreground, background, style)
        # so the file stays plain text.
        clean_message = message
        for palette in (Fore, Back, Style):
            for code in vars(palette).values():
                if isinstance(code, str):
                    clean_message = clean_message.replace(code, '')

        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
        with open(self.log_file, 'a', encoding='utf-8') as f:
            f.write(f"[{timestamp}] {clean_message}\n")

    def _log(
        self,
        level: LogLevel,
        message: str,
        tag: str,
        params: Optional[Dict[str, Any]] = None,
        colors: Optional[Dict[str, str]] = None,
        base_color: Optional[str] = None,
        **kwargs
    ):
        """
        Core logging method that handles message formatting and output.

        Args:
            level: Log level for this message
            message: Message template string
            tag: Tag for the message
            params: Parameters to format into the message
            colors: Color overrides for specific parameters
            base_color: Base color for the entire message
            **kwargs: ``force_verbose=True`` prints even when ``verbose`` is off
        """
        if level.value < self.log_level.value:
            return

        # Format the message with parameters if provided
        if params:
            try:
                # First format the message with raw parameters
                formatted_message = message.format(**params)
            except KeyError as e:
                formatted_message = f"LOGGING ERROR: Missing parameter {e} in message template"
                level = LogLevel.ERROR
            except (IndexError, ValueError) as e:
                # Malformed template (stray braces, bad format spec): report
                # it in the log rather than crashing the caller.
                formatted_message = f"LOGGING ERROR: Invalid message template: {e}"
                level = LogLevel.ERROR
            else:
                # Then apply colors if specified. The match is on str(value),
                # so a value rendered through a format spec (e.g. "{t:.2f}")
                # is only colored when both renderings coincide.
                if colors:
                    for key, color in colors.items():
                        if key in params:
                            value_str = str(params[key])
                            formatted_message = formatted_message.replace(
                                value_str,
                                f"{color}{value_str}{Style.RESET_ALL}"
                            )
        else:
            formatted_message = message

        # Construct the full log line; a partial custom color map falls back
        # to the default color for levels it does not define.
        color = base_color or self.colors.get(level, self.DEFAULT_COLORS[level])
        log_line = f"{color}{self._format_tag(tag)} {self._get_icon(tag)} {formatted_message}{Style.RESET_ALL}"

        # Output to console if verbose
        if self.verbose or kwargs.get("force_verbose", False):
            print(log_line)

        # Write to file if configured
        self._write_to_file(log_line)

    def debug(self, message: str, tag: str = "DEBUG", **kwargs):
        """Log a debug message."""
        self._log(LogLevel.DEBUG, message, tag, **kwargs)

    def info(self, message: str, tag: str = "INFO", **kwargs):
        """Log an info message."""
        self._log(LogLevel.INFO, message, tag, **kwargs)

    def success(self, message: str, tag: str = "SUCCESS", **kwargs):
        """Log a success message."""
        self._log(LogLevel.SUCCESS, message, tag, **kwargs)

    def warning(self, message: str, tag: str = "WARNING", **kwargs):
        """Log a warning message."""
        self._log(LogLevel.WARNING, message, tag, **kwargs)

    def error(self, message: str, tag: str = "ERROR", **kwargs):
        """Log an error message."""
        self._log(LogLevel.ERROR, message, tag, **kwargs)

    def url_status(
        self,
        url: str,
        success: bool,
        timing: float,
        tag: str = "FETCH",
        url_length: int = 50
    ):
        """
        Convenience method for logging URL fetch status.

        Args:
            url: The URL being processed
            success: Whether the operation was successful
            timing: Time taken for the operation
            tag: Tag for the message
            url_length: Maximum length for URL in log
        """
        self._log(
            level=LogLevel.SUCCESS if success else LogLevel.ERROR,
            message="{url:.{url_length}}... | Status: {status} | Time: {timing:.2f}s",
            tag=tag,
            params={
                "url": url,
                "url_length": url_length,
                "status": success,
                "timing": timing
            },
            colors={
                "status": Fore.GREEN if success else Fore.RED,
                "timing": Fore.YELLOW
            }
        )

    def error_status(
        self,
        url: str,
        error: str,
        tag: str = "ERROR",
        url_length: int = 50
    ):
        """
        Convenience method for logging error status.

        Args:
            url: The URL being processed
            error: Error message
            tag: Tag for the message
            url_length: Maximum length for URL in log
        """
        self._log(
            level=LogLevel.ERROR,
            message="{url:.{url_length}}... | Error: {error}",
            tag=tag,
            params={
                "url": url,
                "url_length": url_length,
                "error": error
            }
        )
|
||||
833
crawl4ai/async_webcrawler.py
Normal file
@@ -0,0 +1,833 @@
|
||||
import os, sys
|
||||
import time
|
||||
import warnings
|
||||
from enum import Enum
|
||||
from colorama import init, Fore, Back, Style
|
||||
from pathlib import Path
|
||||
from typing import Optional, List, Union
|
||||
import json
|
||||
import asyncio
|
||||
# from contextlib import nullcontext, asynccontextmanager
|
||||
from contextlib import asynccontextmanager
|
||||
from .models import CrawlResult, MarkdownGenerationResult
|
||||
from .async_database import async_db_manager
|
||||
from .chunking_strategy import *
|
||||
from .content_filter_strategy import *
|
||||
from .extraction_strategy import *
|
||||
from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
|
||||
from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
|
||||
from .markdown_generation_strategy import DefaultMarkdownGenerator, MarkdownGenerationStrategy
|
||||
from .content_scraping_strategy import WebScrapingStrategy
|
||||
from .async_logger import AsyncLogger
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from .config import (
|
||||
MIN_WORD_THRESHOLD,
|
||||
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
|
||||
URL_LOG_SHORTEN_LENGTH
|
||||
)
|
||||
from .utils import (
|
||||
sanitize_input_encode,
|
||||
InvalidCSSSelectorError,
|
||||
format_html,
|
||||
fast_format_html,
|
||||
create_box_message
|
||||
)
|
||||
|
||||
from urllib.parse import urlparse
|
||||
import random
|
||||
from .__version__ import __version__ as crawl4ai_version
|
||||
|
||||
|
||||
class AsyncWebCrawler:
|
||||
"""
|
||||
Asynchronous web crawler with flexible caching capabilities.
|
||||
|
||||
There are two ways to use the crawler:
|
||||
|
||||
1. Using context manager (recommended for simple cases):
|
||||
```python
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(url="https://example.com")
|
||||
```
|
||||
|
||||
2. Using explicit lifecycle management (recommended for long-running applications):
|
||||
```python
|
||||
crawler = AsyncWebCrawler()
|
||||
await crawler.start()
|
||||
|
||||
# Use the crawler multiple times
|
||||
result1 = await crawler.arun(url="https://example.com")
|
||||
result2 = await crawler.arun(url="https://another.com")
|
||||
|
||||
await crawler.close()
|
||||
```
|
||||
|
||||
Migration Guide:
|
||||
Old way (deprecated):
|
||||
crawler = AsyncWebCrawler(always_by_pass_cache=True, browser_type="chromium", headless=True)
|
||||
|
||||
New way (recommended):
|
||||
browser_config = BrowserConfig(browser_type="chromium", headless=True)
|
||||
crawler = AsyncWebCrawler(config=browser_config)
|
||||
|
||||
|
||||
Attributes:
|
||||
browser_config (BrowserConfig): Configuration object for browser settings.
|
||||
crawler_strategy (AsyncCrawlerStrategy): Strategy for crawling web pages.
|
||||
logger (AsyncLogger): Logger instance for recording events and errors.
|
||||
always_bypass_cache (bool): Whether to always bypass cache.
|
||||
crawl4ai_folder (str): Directory for storing cache.
|
||||
base_directory (str): Base directory for storing cache.
|
||||
ready (bool): Whether the crawler is ready for use.
|
||||
|
||||
Methods:
|
||||
start(): Start the crawler explicitly without using context manager.
|
||||
close(): Close the crawler explicitly without using context manager.
|
||||
arun(): Run the crawler for a single source: URL (web, local file, or raw HTML).
|
||||
awarmup(): Perform warmup sequence.
|
||||
arun_many(): Run the crawler for multiple sources.
|
||||
aprocess_html(): Process HTML content.
|
||||
|
||||
Typical Usage:
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(url="https://example.com")
|
||||
print(result.markdown)
|
||||
|
||||
Using configuration:
|
||||
browser_config = BrowserConfig(browser_type="chromium", headless=True)
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
crawler_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
result = await crawler.arun(url="https://example.com", config=crawler_config)
|
||||
print(result.markdown)
|
||||
"""
|
||||
_domain_last_hit = {}
|
||||
|
||||
def __init__(
    self,
    crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
    config: Optional[BrowserConfig] = None,
    always_bypass_cache: bool = False,
    always_by_pass_cache: Optional[bool] = None,  # Deprecated parameter
    base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
    thread_safe: bool = False,
    **kwargs,
):
    """
    Initialize the AsyncWebCrawler.

    Args:
        crawler_strategy: Strategy for crawling web pages. If None, will create AsyncPlaywrightCrawlerStrategy
        config: Configuration object for browser settings. If None, will be created from kwargs
        always_bypass_cache: Whether to always bypass cache (new parameter)
        always_by_pass_cache: Deprecated, use always_bypass_cache instead
        base_directory: Base directory for storing cache
        thread_safe: Whether to use thread-safe operations
        **kwargs: Additional arguments for backwards compatibility
    """
    # Handle browser configuration
    legacy_browser_keys = ("browser_type", "headless", "viewport_width", "viewport_height")
    if config is not None:
        browser_config = config
        config_conflict = any(key in kwargs for key in legacy_browser_keys)
    else:
        # Create browser config from kwargs for backwards compatibility
        browser_config = BrowserConfig.from_kwargs(kwargs)
        config_conflict = False

    self.browser_config = browser_config

    # Initialize logger first since other components may need it
    self.logger = AsyncLogger(
        log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"),
        verbose=self.browser_config.verbose,
        tag_width=10
    )

    # Emitted only now: the logger does not exist while the configuration is
    # being resolved above.
    if config_conflict:
        self.logger.warning(
            message="Both browser_config and legacy browser parameters provided. browser_config will take precedence.",
            tag="WARNING"
        )

    # Initialize crawler strategy
    if crawler_strategy:
        self.crawler_strategy = crawler_strategy
    else:
        # A caller-supplied `logger` kwarg replaces the default; it is merged
        # here because passing it next to an explicit logger= keyword raises
        # TypeError for a duplicate argument.
        strategy_kwargs = {"browser_config": browser_config, "logger": self.logger}
        if "logger" in kwargs:
            strategy_kwargs["logger"] = kwargs["logger"]
        self.crawler_strategy = AsyncPlaywrightCrawlerStrategy(**strategy_kwargs)

    # If the crawler strategy has no logger of its own, share this one
    if not getattr(self.crawler_strategy, "logger", None):
        self.crawler_strategy.logger = self.logger

    # Handle deprecated cache parameter
    if always_by_pass_cache is not None:
        if kwargs.get("warning", True):
            warnings.warn(
                "'always_by_pass_cache' is deprecated and will be removed in version 0.5.0. "
                "Use 'always_bypass_cache' instead. "
                "Pass warning=False to suppress this warning.",
                DeprecationWarning,
                stacklevel=2
            )
        self.always_bypass_cache = always_by_pass_cache
    else:
        self.always_bypass_cache = always_bypass_cache

    # Thread safety setup
    self._lock = asyncio.Lock() if thread_safe else None

    # Initialize directories
    self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai")
    os.makedirs(self.crawl4ai_folder, exist_ok=True)
    os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)

    self.ready = False
|
||||
|
||||
async def start(self):
    """
    Bring the crawler up without an ``async with`` block.

    Enters the crawler strategy's async context, then runs the warm-up
    step. Pair every call with ``close()``.

    Returns:
        AsyncWebCrawler: This instance, so calls can be chained.
    """
    strategy = self.crawler_strategy
    await strategy.__aenter__()
    await self.awarmup()
    return self
|
||||
|
||||
async def close(self):
    """
    Shut the crawler down without an ``async with`` block.

    Exits the crawler strategy's async context, reporting no exception to
    it. Call this once you are done if you used ``start()``.
    """
    no_exception = (None, None, None)
    await self.crawler_strategy.__aexit__(*no_exception)
|
||||
|
||||
async def __aenter__(self):
    """Enter ``async with``: start the crawler and return what ``start()`` returns."""
    started = await self.start()
    return started
|
||||
|
||||
async def __aexit__(self, exc_type, exc_val, exc_tb):
    """Leave ``async with``: close the crawler.

    The exception details are not inspected and nothing truthy is returned,
    so an exception raised in the block keeps propagating.
    """
    await self.close()
    return None
|
||||
|
||||
async def awarmup(self):
    """
    Run the crawler's warm-up sequence.

    This method:
        1. Logs the Crawl4AI version under the "INIT" tag
        2. Marks the crawler as ready by setting ``self.ready`` to True
    """
    self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT")
    self.ready = True
|
||||
|
||||
@asynccontextmanager
async def nullcontext(self):
    """Asynchronous no-op context manager.

    Yields once without acquiring anything; ``arun`` uses it in place of
    ``self._lock`` when no lock was created.
    """
    yield
|
||||
|
||||
async def arun(
    self,
    url: str,
    config: Optional[CrawlerRunConfig] = None,
    # Legacy parameters maintained for backwards compatibility
    word_count_threshold=MIN_WORD_THRESHOLD,
    extraction_strategy: ExtractionStrategy = None,
    chunking_strategy: ChunkingStrategy = RegexChunking(),
    content_filter: RelevantContentFilter = None,
    cache_mode: Optional[CacheMode] = None,
    # Deprecated cache parameters
    bypass_cache: bool = False,
    disable_cache: bool = False,
    no_cache_read: bool = False,
    no_cache_write: bool = False,
    # Other legacy parameters
    css_selector: str = None,
    screenshot: bool = False,
    pdf: bool = False,
    user_agent: str = None,
    verbose=True,
    **kwargs,
) -> CrawlResult:
    """
    Runs the crawler for a single source: URL (web, local file, or raw HTML).

    Migration Guide:
    Old way (deprecated):
        result = await crawler.arun(
            url="https://example.com",
            word_count_threshold=200,
            screenshot=True,
            ...
        )

    New way (recommended):
        config = CrawlerRunConfig(
            word_count_threshold=200,
            screenshot=True,
            ...
        )
        result = await crawler.arun(url="https://example.com", config=config)

    Args:
        url: The URL to crawl (http://, https://, file://, or raw:)
        config: Configuration object controlling crawl behavior. When it is
            given, the legacy keyword parameters are not used to build a config.
        crawler_config: Alias of ``config`` accepted through ``**kwargs`` for
            callers written against the older keyword name. Ignored when
            ``config`` is also given.
        user_agent: If set, applied to the crawler strategy before a fresh fetch.
        [other parameters maintained for backwards compatibility]

    Returns:
        CrawlResult: The result of crawling and processing. Any exception
        raised while crawling is logged and reported as a CrawlResult with
        ``success=False`` and ``error_message`` set, rather than propagated.

    Raises:
        ValueError: If ``url`` is not a non-empty string.
    """
    # The config used to be documented as `crawler_config=`; passed that way
    # it lands in **kwargs. Honour it as an alias and remove it from kwargs so
    # it is not forwarded as if it were a scraping option.
    aliased_config = kwargs.pop("crawler_config", None)
    crawler_config = config if config is not None else aliased_config

    if not isinstance(url, str) or not url:
        raise ValueError("Invalid URL, make sure the URL is a non-empty string")

    async with self._lock or self.nullcontext():
        try:
            # Handle configuration
            if crawler_config is not None:
                config = crawler_config
            else:
                # Merge all parameters into a single kwargs dict for config creation
                config_kwargs = {
                    "word_count_threshold": word_count_threshold,
                    "extraction_strategy": extraction_strategy,
                    "chunking_strategy": chunking_strategy,
                    "content_filter": content_filter,
                    "cache_mode": cache_mode,
                    "bypass_cache": bypass_cache,
                    "disable_cache": disable_cache,
                    "no_cache_read": no_cache_read,
                    "no_cache_write": no_cache_write,
                    "css_selector": css_selector,
                    "screenshot": screenshot,
                    "pdf": pdf,
                    "verbose": verbose,
                    **kwargs
                }
                config = CrawlerRunConfig.from_kwargs(config_kwargs)

            # Handle deprecated cache parameters
            if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]):
                if kwargs.get("warning", True):
                    warnings.warn(
                        "Cache control boolean flags are deprecated and will be removed in version 0.5.0. "
                        "Use 'cache_mode' parameter instead.",
                        DeprecationWarning,
                        stacklevel=2
                    )

                # Convert legacy parameters if cache_mode not provided
                if config.cache_mode is None:
                    config.cache_mode = _legacy_to_cache_mode(
                        disable_cache=disable_cache,
                        bypass_cache=bypass_cache,
                        no_cache_read=no_cache_read,
                        no_cache_write=no_cache_write
                    )

            # Default to ENABLED if no cache mode specified
            if config.cache_mode is None:
                config.cache_mode = CacheMode.ENABLED

            # Create cache context
            cache_context = CacheContext(url, config.cache_mode, self.always_bypass_cache)

            # Initialize processing variables
            async_response: AsyncCrawlResponse = None
            cached_result: CrawlResult = None
            screenshot_data = None
            pdf_data = None
            extracted_content = None
            start_time = time.perf_counter()

            # Try to get cached result if appropriate
            if cache_context.should_read():
                cached_result = await async_db_manager.aget_cached_url(url)

            if cached_result:
                html = sanitize_input_encode(cached_result.html)
                extracted_content = sanitize_input_encode(cached_result.extracted_content or "")
                extracted_content = None if not extracted_content or extracted_content == "[]" else extracted_content
                screenshot_data = cached_result.screenshot
                pdf_data = cached_result.pdf
                # A cached entry is only usable if it carries every artifact the
                # config asks for. Compare against the cached data, not the
                # legacy `screenshot` / `pdf` boolean arguments.
                missing_screenshot = config.screenshot and not screenshot_data
                missing_pdf = config.pdf and not pdf_data
                if missing_screenshot or missing_pdf:
                    cached_result = None

                self.logger.url_status(
                    url=cache_context.display_url,
                    success=bool(html),
                    timing=time.perf_counter() - start_time,
                    tag="FETCH"
                )

            # Fetch fresh content if needed
            # (`html` is only evaluated when cached_result is truthy, in which
            # case the branch above has assigned it.)
            if not cached_result or not html:
                t1 = time.perf_counter()

                if user_agent:
                    self.crawler_strategy.update_user_agent(user_agent)

                # Pass config to crawl method
                async_response = await self.crawler_strategy.crawl(
                    url,
                    config=config  # Pass the entire config object
                )

                html = sanitize_input_encode(async_response.html)
                screenshot_data = async_response.screenshot
                pdf_data = async_response.pdf_data

                t2 = time.perf_counter()
                self.logger.url_status(
                    url=cache_context.display_url,
                    success=bool(html),
                    timing=t2 - t1,
                    tag="FETCH"
                )

                # Process the HTML content
                crawl_result = await self.aprocess_html(
                    url=url,
                    html=html,
                    extracted_content=extracted_content,
                    config=config,  # Pass the config object instead of individual parameters
                    screenshot=screenshot_data,
                    pdf_data=pdf_data,
                    verbose=config.verbose,
                    is_raw_html=True if url.startswith("raw:") else False,
                    **kwargs
                )

                # Copy response-level details onto the processed result
                crawl_result.status_code = async_response.status_code
                crawl_result.response_headers = async_response.response_headers
                crawl_result.downloaded_files = async_response.downloaded_files
                crawl_result.ssl_certificate = async_response.ssl_certificate  # Add SSL certificate

                crawl_result.success = bool(html)
                crawl_result.session_id = getattr(config, 'session_id', None)

                self.logger.success(
                    message="{url:.50}... | Status: {status} | Total: {timing}",
                    tag="COMPLETE",
                    params={
                        "url": cache_context.display_url,
                        "status": crawl_result.success,
                        "timing": f"{time.perf_counter() - start_time:.2f}s"
                    },
                    colors={
                        "status": Fore.GREEN if crawl_result.success else Fore.RED,
                        "timing": Fore.YELLOW
                    }
                )

                # Update cache if appropriate
                if cache_context.should_write() and not bool(cached_result):
                    await async_db_manager.acache_url(crawl_result)

                return crawl_result

            else:
                # Served entirely from cache
                self.logger.success(
                    message="{url:.50}... | Status: {status} | Total: {timing}",
                    tag="COMPLETE",
                    params={
                        "url": cache_context.display_url,
                        "status": True,
                        "timing": f"{time.perf_counter() - start_time:.2f}s"
                    },
                    colors={
                        "status": Fore.GREEN,
                        "timing": Fore.YELLOW
                    }
                )

                cached_result.success = bool(html)
                cached_result.session_id = getattr(config, 'session_id', None)
                return cached_result

        except Exception as e:
            # Top-level boundary: report the failure as a CrawlResult instead
            # of raising, so callers always get a result object back.
            error_context = get_error_context(sys.exc_info())

            error_message = (
                f"Unexpected error in _crawl_web at line {error_context['line_no']} "
                f"in {error_context['function']} ({error_context['filename']}):\n"
                f"Error: {str(e)}\n\n"
                f"Code context:\n{error_context['code_context']}"
            )

            self.logger.error_status(
                url=url,
                error=create_box_message(error_message, type="error"),
                tag="ERROR"
            )

            return CrawlResult(
                url=url,
                html="",
                success=False,
                error_message=error_message
            )
|
||||
|
||||
async def aprocess_html(
    self,
    url: str,
    html: str,
    extracted_content: str,
    config: CrawlerRunConfig,
    screenshot: str,
    pdf_data: str,
    verbose: bool,
    **kwargs,
) -> CrawlResult:
    """
    Process HTML content using the provided configuration.

    The HTML is scraped with ``WebScrapingStrategy``, converted to markdown
    with the config's markdown generator (or ``DefaultMarkdownGenerator``),
    and, when an extraction strategy is configured and no previously
    extracted content was supplied, run through chunking and extraction.

    Args:
        url: The URL being processed
        html: Raw HTML content
        extracted_content: Previously extracted content (if any); when not
            None the extraction step is skipped
        config: Configuration object controlling processing behavior
        screenshot: Screenshot data (if any)
        pdf_data: PDF data (if any)
        verbose: Whether to enable verbose logging
        **kwargs: Additional parameters for backwards compatibility. Keys not
            already present in ``config.to_dict()`` are forwarded to the
            scraper; ``is_raw_html`` also switches the logged URL to
            "Raw HTML".

    Returns:
        CrawlResult: Processed result containing extracted and formatted content

    Raises:
        ValueError: If scraping fails, returns nothing, or the CSS selector
            is invalid. The original exception is attached as ``__cause__``.
    """
    try:
        _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
        t1 = time.perf_counter()

        # Initialize scraping strategy
        scrapping_strategy = WebScrapingStrategy(logger=self.logger)

        # Process HTML content
        params = {k: v for k, v in config.to_dict().items() if k not in ["url"]}
        # add keys from kwargs to params that doesn't exist in params
        params.update({k: v for k, v in kwargs.items() if k not in params.keys()})

        result = scrapping_strategy.scrap(
            url,
            html,
            **params,
        )

        if result is None:
            raise ValueError(f"Process HTML, Failed to extract content from the website: {url}")

    except InvalidCSSSelectorError as e:
        raise ValueError(str(e)) from e
    except Exception as e:
        raise ValueError(
            f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}"
        ) from e

    # Extract results
    cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
    fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
    fit_html = sanitize_input_encode(result.get("fit_html", ""))
    media = result.get("media", [])
    links = result.get("links", [])
    metadata = result.get("metadata", {})

    # Markdown Generation
    markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator()

    markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
        cleaned_html=cleaned_html,
        base_url=url,
    )
    markdown_v2 = markdown_result
    markdown = sanitize_input_encode(markdown_result.raw_markdown)

    # Log processing completion
    self.logger.info(
        message="Processed {url:.50}... | Time: {timing}ms",
        tag="SCRAPE",
        params={
            "url": _url,
            "timing": int((time.perf_counter() - t1) * 1000)
        }
    )

    # Handle content extraction if needed
    if (extracted_content is None and
            config.extraction_strategy and
            config.chunking_strategy and
            not isinstance(config.extraction_strategy, NoExtractionStrategy)):

        t1 = time.perf_counter()

        # Choose content based on input_format
        content_format = config.extraction_strategy.input_format
        if content_format == "fit_markdown" and not markdown_result.fit_markdown:
            self.logger.warning(
                message="Fit markdown requested but not available. Falling back to raw markdown.",
                tag="EXTRACT",
                params={"url": _url}
            )
            content_format = "markdown"

        # The unavailable case was redirected to "markdown" above, so reaching
        # the "fit_markdown" entry means fit markdown exists and must be the
        # text handed to extraction (previously this mapped to raw markdown).
        content = {
            "markdown": markdown,
            "html": html,
            "fit_markdown": markdown_result.fit_markdown
        }.get(content_format, markdown)

        # Use IdentityChunking for HTML input, otherwise use provided chunking strategy
        chunking = IdentityChunking() if content_format == "html" else config.chunking_strategy
        sections = chunking.chunk(content)
        extracted_content = config.extraction_strategy.run(url, sections)
        extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False)

        # Log extraction completion
        self.logger.info(
            message="Completed for {url:.50}... | Time: {timing}s",
            tag="EXTRACT",
            params={
                "url": _url,
                "timing": time.perf_counter() - t1
            }
        )

    # Handle screenshot and PDF data (normalise falsy values to None)
    screenshot_data = None if not screenshot else screenshot
    pdf_data = None if not pdf_data else pdf_data

    # Apply HTML formatting if requested
    # (`prettiify` is the attribute name this code reads from the config.)
    if config.prettiify:
        cleaned_html = fast_format_html(cleaned_html)

    # Return complete crawl result
    return CrawlResult(
        url=url,
        html=html,
        cleaned_html=cleaned_html,
        markdown_v2=markdown_v2,
        markdown=markdown,
        fit_markdown=fit_markdown,
        fit_html=fit_html,
        media=media,
        links=links,
        metadata=metadata,
        screenshot=screenshot_data,
        pdf=pdf_data,
        extracted_content=extracted_content,
        success=True,
        error_message="",
    )
|
||||
|
||||
async def arun_many(
    self,
    urls: List[str],
    config: Optional[CrawlerRunConfig] = None,
    # Legacy parameters maintained for backwards compatibility
    word_count_threshold=MIN_WORD_THRESHOLD,
    extraction_strategy: ExtractionStrategy = None,
    chunking_strategy: ChunkingStrategy = RegexChunking(),
    content_filter: RelevantContentFilter = None,
    cache_mode: Optional[CacheMode] = None,
    bypass_cache: bool = False,
    css_selector: str = None,
    screenshot: bool = False,
    pdf: bool = False,
    user_agent: str = None,
    verbose=True,
    **kwargs,
) -> List[CrawlResult]:
    """
    Runs the crawler for multiple URLs concurrently.

    Migration Guide:
    Old way (deprecated):
        results = await crawler.arun_many(
            urls,
            word_count_threshold=200,
            screenshot=True,
            ...
        )

    New way (recommended):
        config = CrawlerRunConfig(
            word_count_threshold=200,
            screenshot=True,
            ...
        )
        results = await crawler.arun_many(urls, config=config)

    Args:
        urls: List of URLs to crawl
        config: Configuration object controlling crawl behavior for all URLs.
            It also supplies ``semaphore_count`` (concurrency limit, 5 when
            unset), ``mean_delay`` and ``max_range`` (per-domain delay).
        user_agent: Forwarded to every ``arun`` call.
        [other parameters maintained for backwards compatibility]

    Returns:
        List[CrawlResult]: Results for each URL, in input order. If a task
        raises, its slot holds ``str(exception)`` instead of a CrawlResult.
    """
    crawler_config = config
    # Handle configuration
    if crawler_config is not None:
        # Warn only when a legacy parameter was actually overridden. Testing
        # "is not None" would always be true, because defaults such as
        # `screenshot=False` are not None. `chunking_strategy` is left out:
        # its default is an instance and cannot be told apart from an
        # explicitly passed RegexChunking.
        legacy_overridden = (
            word_count_threshold != MIN_WORD_THRESHOLD
            or extraction_strategy is not None
            or content_filter is not None
            or cache_mode is not None
            or css_selector is not None
            or bool(screenshot)
            or bool(pdf)
        )
        if legacy_overridden:
            self.logger.warning(
                message="Both crawler_config and legacy parameters provided. crawler_config will take precedence.",
                tag="WARNING"
            )
        config = crawler_config
    else:
        # Merge all parameters into a single kwargs dict for config creation
        config_kwargs = {
            "word_count_threshold": word_count_threshold,
            "extraction_strategy": extraction_strategy,
            "chunking_strategy": chunking_strategy,
            "content_filter": content_filter,
            "cache_mode": cache_mode,
            "bypass_cache": bypass_cache,
            "css_selector": css_selector,
            "screenshot": screenshot,
            "pdf": pdf,
            "verbose": verbose,
            **kwargs
        }
        config = CrawlerRunConfig.from_kwargs(config_kwargs)

    if bypass_cache:
        if kwargs.get("warning", True):
            warnings.warn(
                "'bypass_cache' is deprecated and will be removed in version 0.5.0. "
                "Use 'cache_mode=CacheMode.BYPASS' instead. "
                "Pass warning=False to suppress this warning.",
                DeprecationWarning,
                stacklevel=2
            )
        if config.cache_mode is None:
            config.cache_mode = CacheMode.BYPASS

    semaphore_count = config.semaphore_count or 5
    semaphore = asyncio.Semaphore(semaphore_count)

    async def crawl_with_semaphore(url):
        # Handle rate limiting per domain
        domain = urlparse(url).netloc
        current_time = time.time()

        self.logger.debug(
            message="Started task for {url:.50}...",
            tag="PARALLEL",
            params={"url": url}
        )

        # Get delay settings from config
        mean_delay = config.mean_delay
        max_range = config.max_range

        # Apply rate limiting: if this domain was hit less than `mean_delay`
        # seconds ago, wait mean_delay plus a random extra in [0, max_range].
        if domain in self._domain_last_hit:
            time_since_last = current_time - self._domain_last_hit[domain]
            if time_since_last < mean_delay:
                delay = mean_delay + random.uniform(0, max_range)
                await asyncio.sleep(delay)

        self._domain_last_hit[domain] = current_time

        async with semaphore:
            # `arun` takes the run configuration as `config`. Passing it under
            # any other keyword drops it into **kwargs and the shared
            # configuration would be silently ignored.
            return await self.arun(
                url,
                config=config,  # Pass the entire config object
                user_agent=user_agent  # Maintain user_agent override capability
            )

    # Log start of concurrent crawling
    self.logger.info(
        message="Starting concurrent crawling for {count} URLs...",
        tag="INIT",
        params={"count": len(urls)}
    )

    # Execute concurrent crawls
    start_time = time.perf_counter()
    tasks = [crawl_with_semaphore(url) for url in urls]
    results = await asyncio.gather(*tasks, return_exceptions=True)
    end_time = time.perf_counter()

    # Log completion
    self.logger.success(
        message="Concurrent crawling completed for {count} URLs | Total time: {timing}",
        tag="COMPLETE",
        params={
            "count": len(urls),
            "timing": f"{end_time - start_time:.2f}s"
        },
        colors={
            "timing": Fore.YELLOW
        }
    )

    # Exceptions captured by gather are reported as their string form.
    return [result if not isinstance(result, Exception) else str(result) for result in results]
|
||||
|
||||
async def aclear_cache(self):
    """Clear the cache database.

    Delegates to ``async_db_manager.cleanup()``.
    """
    await async_db_manager.cleanup()
|
||||
|
||||
async def aflush_cache(self):
    """Flush the cache database.

    Delegates to ``async_db_manager.aflush_db()``.
    """
    await async_db_manager.aflush_db()
|
||||
|
||||
async def aget_cache_size(self):
    """Get the total number of cached items.

    Returns:
        Whatever ``async_db_manager.aget_total_count()`` reports.
    """
    return await async_db_manager.aget_total_count()
|
||||
115
crawl4ai/cache_context.py
Normal file
@@ -0,0 +1,115 @@
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class CacheMode(Enum):
    """
    Caching behavior selectable for a web crawling operation.

    Members:
    - ENABLED: Normal caching behavior (read and write)
    - DISABLED: No caching at all
    - READ_ONLY: Only read from cache, don't write
    - WRITE_ONLY: Only write to cache, don't read
    - BYPASS: Bypass cache for this operation
    """

    # Each member's value is its lowercase name.
    ENABLED = "enabled"
    DISABLED = "disabled"
    READ_ONLY = "read_only"
    WRITE_ONLY = "write_only"
    BYPASS = "bypass"
|
||||
|
||||
|
||||
class CacheContext:
    """
    Bundles the cache decisions and URL classification for one crawl.

    Keeping the read/write rules and the URL-type checks in one place makes
    the caching behavior predictable for callers.

    Attributes:
        url (str): The URL being processed.
        cache_mode (CacheMode): The cache mode for the current operation.
        always_bypass (bool): If True, caching is neither read nor written.
        is_cacheable (bool): True for http://, https:// and file:// URLs.
        is_web_url (bool): True for http:// and https:// URLs.
        is_local_file (bool): True for file:// URLs.
        is_raw_html (bool): True when the URL starts with "raw:".
        _url_display (str): "Raw HTML" for raw input, otherwise the URL itself.
    """

    def __init__(self, url: str, cache_mode: CacheMode, always_bypass: bool = False):
        """
        Record the inputs and classify the URL by its prefix.

        Args:
            url (str): The URL being processed.
            cache_mode (CacheMode): The cache mode for the current operation.
            always_bypass (bool): If True, bypasses caching for this operation.
        """
        web_prefixes = ('http://', 'https://')

        self.url = url
        self.cache_mode = cache_mode
        self.always_bypass = always_bypass

        self.is_web_url = url.startswith(web_prefixes)
        self.is_local_file = url.startswith("file://")
        # Cacheable means "web URL or local file"; raw HTML never is.
        self.is_cacheable = self.is_web_url or self.is_local_file
        self.is_raw_html = url.startswith("raw:")
        self._url_display = "Raw HTML" if self.is_raw_html else url

    def _caching_possible(self) -> bool:
        """Return False when caching is ruled out regardless of cache mode."""
        return not (self.always_bypass or not self.is_cacheable)

    def should_read(self) -> bool:
        """
        Decide whether the cache should be consulted.

        Rules:
            1. Never when always_bypass is set or the URL is not cacheable.
            2. Otherwise only in ENABLED or READ_ONLY mode.

        Returns:
            bool: True if cache should be read, False otherwise.
        """
        if not self._caching_possible():
            return False
        readable_modes = (CacheMode.ENABLED, CacheMode.READ_ONLY)
        return self.cache_mode in readable_modes

    def should_write(self) -> bool:
        """
        Decide whether the result should be stored in the cache.

        Rules:
            1. Never when always_bypass is set or the URL is not cacheable.
            2. Otherwise only in ENABLED or WRITE_ONLY mode.

        Returns:
            bool: True if cache should be written, False otherwise.
        """
        if not self._caching_possible():
            return False
        writable_modes = (CacheMode.ENABLED, CacheMode.WRITE_ONLY)
        return self.cache_mode in writable_modes

    @property
    def display_url(self) -> str:
        """The URL as it should appear in logs ("Raw HTML" for raw input)."""
        return self._url_display
|
||||
|
||||
|
||||
def _legacy_to_cache_mode(
    disable_cache: bool = False,
    bypass_cache: bool = False,
    no_cache_read: bool = False,
    no_cache_write: bool = False
) -> CacheMode:
    """
    Translate the legacy boolean cache flags into a CacheMode member.

    Internal helper for the transition from the old boolean flags to the
    CacheMode system. Precedence, highest first: ``disable_cache``, then
    ``bypass_cache``, then the combination of the two ``no_cache_*`` flags.
    With no flag set the result is ``CacheMode.ENABLED``.
    """
    if disable_cache:
        return CacheMode.DISABLED
    if bypass_cache:
        return CacheMode.BYPASS

    # Remaining outcome depends only on (skip reads?, skip writes?).
    mode_by_flags = {
        (True, True): CacheMode.DISABLED,
        (True, False): CacheMode.WRITE_ONLY,
        (False, True): CacheMode.READ_ONLY,
        (False, False): CacheMode.ENABLED,
    }
    return mode_by_flags[(bool(no_cache_read), bool(no_cache_write))]
|
||||
@@ -3,20 +3,47 @@ import re
|
||||
from collections import Counter
|
||||
import string
|
||||
from .model_loader import load_nltk_punkt
|
||||
from .utils import *
|
||||
|
||||
# Define the abstract base class for chunking strategies
|
||||
class ChunkingStrategy(ABC):
    """
    Interface every chunking strategy implements.

    Subclasses must override :meth:`chunk`; the base class cannot be
    instantiated on its own.
    """

    @abstractmethod
    def chunk(self, text: str) -> list:
        """
        Split ``text`` into chunks.

        Args:
            text (str): The text to chunk.

        Returns:
            list: A list of chunks.
        """
        ...
|
||||
|
||||
|
||||
# Create an identity chunking strategy f(x) = [x]
|
||||
class IdentityChunking(ChunkingStrategy):
    """
    Identity strategy, f(x) = [x]: the whole input becomes the only chunk.
    """

    def chunk(self, text: str) -> list:
        """Return a one-element list holding ``text`` unchanged."""
        single_chunk = [text]
        return single_chunk
|
||||
|
||||
# Regex-based chunking
|
||||
class RegexChunking(ChunkingStrategy):
|
||||
def __init__(self, patterns=None):
|
||||
"""
|
||||
Chunking strategy that splits text based on regular expression patterns.
|
||||
"""
|
||||
def __init__(self, patterns=None, **kwargs):
|
||||
"""
|
||||
Initialize the RegexChunking object.
|
||||
|
||||
Args:
|
||||
patterns (list): A list of regular expression patterns to split text.
|
||||
"""
|
||||
if patterns is None:
|
||||
patterns = [r'\n\n'] # Default split pattern
|
||||
self.patterns = patterns
|
||||
@@ -32,9 +59,15 @@ class RegexChunking(ChunkingStrategy):
|
||||
|
||||
# NLP-based sentence chunking
|
||||
class NlpSentenceChunking(ChunkingStrategy):
|
||||
def __init__(self):
|
||||
"""
|
||||
Chunking strategy that splits text into sentences using NLTK's sentence tokenizer.
|
||||
"""
|
||||
def __init__(self, **kwargs):
|
||||
"""
|
||||
Initialize the NlpSentenceChunking object.
|
||||
"""
|
||||
load_nltk_punkt()
|
||||
pass
|
||||
|
||||
|
||||
def chunk(self, text: str) -> list:
|
||||
# Improved regex for sentence splitting
|
||||
@@ -51,10 +84,23 @@ class NlpSentenceChunking(ChunkingStrategy):
|
||||
|
||||
# Topic-based segmentation using TextTiling
|
||||
class TopicSegmentationChunking(ChunkingStrategy):
|
||||
"""
|
||||
Chunking strategy that segments text into topics using NLTK's TextTilingTokenizer.
|
||||
|
||||
def __init__(self, num_keywords=3):
|
||||
How it works:
|
||||
1. Segment the text into topics using TextTilingTokenizer
|
||||
2. Extract keywords for each topic segment
|
||||
"""
|
||||
|
||||
def __init__(self, num_keywords=3, **kwargs):
|
||||
"""
|
||||
Initialize the TopicSegmentationChunking object.
|
||||
|
||||
Args:
|
||||
num_keywords (int): The number of keywords to extract for each topic segment.
|
||||
"""
|
||||
import nltk as nl
|
||||
self.tokenizer = nl.toknize.TextTilingTokenizer()
|
||||
self.tokenizer = nl.tokenize.TextTilingTokenizer()
|
||||
self.num_keywords = num_keywords
|
||||
|
||||
def chunk(self, text: str) -> list:
|
||||
@@ -82,7 +128,21 @@ class TopicSegmentationChunking(ChunkingStrategy):
|
||||
|
||||
# Fixed-length word chunks
|
||||
class FixedLengthWordChunking(ChunkingStrategy):
|
||||
def __init__(self, chunk_size=100):
|
||||
"""
|
||||
Chunking strategy that splits text into fixed-length word chunks.
|
||||
|
||||
How it works:
|
||||
1. Split the text into words
|
||||
2. Create chunks of fixed length
|
||||
3. Return the list of chunks
|
||||
"""
|
||||
def __init__(self, chunk_size=100, **kwargs):
|
||||
"""
|
||||
Initialize the fixed-length word chunking strategy with the given chunk size.
|
||||
|
||||
Args:
|
||||
chunk_size (int): The size of each chunk in words.
|
||||
"""
|
||||
self.chunk_size = chunk_size
|
||||
|
||||
def chunk(self, text: str) -> list:
|
||||
@@ -91,15 +151,81 @@ class FixedLengthWordChunking(ChunkingStrategy):
|
||||
|
||||
# Sliding window chunking
|
||||
class SlidingWindowChunking(ChunkingStrategy):
    """
    Chunking strategy that splits text into overlapping word chunks.

    How it works:
    1. Split the text into words
    2. Create chunks of fixed length
    3. Return the list of chunks
    """

    def __init__(self, window_size=100, step=50, **kwargs):
        """
        Initialize the sliding window chunking strategy with the given window size and
        step size.

        Args:
            window_size (int): The size of the sliding window in words.
            step (int): The step size for sliding the window in words.
            **kwargs: Accepted and ignored.
        """
        self.window_size = window_size
        self.step = step

    def chunk(self, text: str) -> list:
        """
        Split ``text`` into windows of ``window_size`` words, advancing by ``step``.

        Text with at most ``window_size`` words is returned unchanged as a
        single chunk. Otherwise only full windows are emitted, and if the
        last full window stops short of the end of the text, one extra chunk
        holding the final ``window_size`` words is appended.

        Args:
            text (str): The text to chunk.

        Returns:
            list: The chunks, each a space-joined run of words.
        """
        words = text.split()
        chunks = []

        if len(words) <= self.window_size:
            return [text]

        # Only full windows: the last start index is len(words) - window_size.
        for i in range(0, len(words) - self.window_size + 1, self.step):
            chunk = ' '.join(words[i:i + self.window_size])
            chunks.append(chunk)

        # Handle the last chunk if it doesn't align perfectly
        if i + self.window_size < len(words):
            chunks.append(' '.join(words[-self.window_size:]))

        return chunks
|
||||
|
||||
class OverlappingWindowChunking(ChunkingStrategy):
    """
    Chunking strategy that splits text into overlapping word chunks.

    How it works:
    1. Split the text into words using whitespace
    2. Create chunks of fixed length equal to the window size
    3. Slide the window by the overlap size
    4. Return the list of chunks
    """

    def __init__(self, window_size=1000, overlap=100, **kwargs):
        """
        Initialize the overlapping window chunking strategy with the given window size and
        overlap size.

        Args:
            window_size (int): The size of the window in words.
            overlap (int): The size of the overlap between consecutive chunks in words.
            **kwargs: Accepted and ignored.
        """
        self.window_size = window_size
        self.overlap = overlap

    def chunk(self, text: str) -> list:
        """
        Split ``text`` into ``window_size``-word chunks that share ``overlap`` words.

        Text with at most ``window_size`` words is returned unchanged as a
        single chunk.

        Args:
            text (str): The text to chunk.

        Returns:
            list: The chunks, each a space-joined run of words.

        Raises:
            ValueError: If the text needs more than one chunk and ``overlap``
                is not smaller than ``window_size``.
        """
        words = text.split()
        chunks = []

        if len(words) <= self.window_size:
            return [text]

        # Each step moves the window start by window_size - overlap. If that
        # is not positive the start never advances and the loop below would
        # never terminate, so reject the configuration here.
        if self.overlap >= self.window_size:
            raise ValueError(
                "overlap must be smaller than window_size "
                f"(got overlap={self.overlap}, window_size={self.window_size})"
            )

        start = 0
        while start < len(words):
            end = start + self.window_size
            chunk = ' '.join(words[start:end])
            chunks.append(chunk)

            if end >= len(words):
                break

            start = end - self.overlap

        return chunks
|
||||
105
crawl4ai/cli.py
Normal file
@@ -0,0 +1,105 @@
|
||||
import click
|
||||
import sys
|
||||
import asyncio
|
||||
from typing import List
|
||||
from .docs_manager import DocsManager
|
||||
from .async_logger import AsyncLogger
|
||||
|
||||
logger = AsyncLogger(verbose=True)
|
||||
docs_manager = DocsManager(logger)
|
||||
|
||||
def print_table(headers: List[str], rows: List[List[str]], padding: int = 2):
    """Echo ``headers`` and ``rows`` as an ASCII table via ``click.echo``.

    Each column is as wide as its longest cell (header included) plus
    ``padding`` spaces on both sides; cells are left-aligned.
    """
    pad = ' ' * padding

    # One width per column: the longest stringified cell in that column.
    col_widths = []
    for column in zip(headers, *rows):
        col_widths.append(max(len(str(value)) for value in column))

    rule = '+' + '+'.join('-' * (width + 2 * padding) for width in col_widths) + '+'

    def render(cells):
        rendered = [
            pad + str(value).ljust(width) + pad
            for value, width in zip(cells, col_widths)
        ]
        return '|' + '|'.join(rendered) + '|'

    click.echo(rule)
    click.echo(render(headers))
    click.echo(rule)
    for row in rows:
        click.echo(render(row))
    click.echo(rule)
|
||||
|
||||
# Root command group; the docstring below is what click shows as help text.
@click.group()
def cli():
    """Crawl4AI Command Line Interface"""
|
||||
|
||||
# Sub-group registered on ``cli``; the documentation commands attach to it.
@cli.group()
def docs():
    """Documentation operations"""
|
||||
|
||||
@docs.command()
@click.argument('sections', nargs=-1)
@click.option('--mode', type=click.Choice(['extended', 'condensed']), default='extended')
def combine(sections: tuple, mode: str):
    """Combine documentation sections"""
    try:
        # Make sure the docs are available before generating the output.
        asyncio.run(docs_manager.ensure_docs_exist())
        combined = docs_manager.generate(sections, mode)
        click.echo(combined)
    except Exception as exc:
        # Any failure is reported through the logger and ends with exit code 1.
        logger.error(str(exc), tag="ERROR")
        sys.exit(1)
|
||||
|
||||
@docs.command()
@click.argument('query')
@click.option('--top-k', '-k', default=5)
@click.option('--build-index', is_flag=True, help='Build index if missing')
def search(query: str, top_k: int, build_index: bool):
    """Search documentation"""
    # Exact message compared against the search result to detect a missing index.
    missing_index = "No search index available. Call build_search_index() first."
    try:
        outcome = docs_manager.search(query, top_k)
        # Build the index when --build-index was given, otherwise ask first;
        # then run the same search again.
        if outcome == missing_index and (
            build_index or click.confirm('No search index found. Build it now?')
        ):
            asyncio.run(docs_manager.llm_text.generate_index_files())
            outcome = docs_manager.search(query, top_k)
        click.echo(outcome)
    except Exception as exc:
        click.echo("Error: " + str(exc), err=True)
        sys.exit(1)
|
||||
|
||||
@docs.command()
def update():
    """Update docs from GitHub"""
    try:
        fetch = docs_manager.fetch_docs()
        asyncio.run(fetch)
        click.echo("Documentation updated successfully")
    except Exception as exc:
        # Report on stderr and signal failure through the exit code.
        click.echo("Error: " + str(exc), err=True)
        sys.exit(1)
|
||||
|
||||
@docs.command()
@click.option('--force-facts', is_flag=True, help='Force regenerate fact files')
@click.option('--clear-cache', is_flag=True, help='Clear BM25 cache')
def index(force_facts: bool, clear_cache: bool):
    """Build or rebuild search indexes"""
    try:
        # Make sure the docs are available before indexing them.
        asyncio.run(docs_manager.ensure_docs_exist())
        rebuild = docs_manager.llm_text.generate_index_files(
            force_generate_facts=force_facts,
            clear_bm25_cache=clear_cache,
        )
        asyncio.run(rebuild)
        click.echo("Search indexes built successfully")
    except Exception as exc:
        click.echo("Error: " + str(exc), err=True)
        sys.exit(1)
|
||||
|
||||
# Add docs list command
# The function is called ``list_sections`` so that the module-level name does
# not shadow the built-in ``list``; the explicit ``name="list"`` keeps the CLI
# command name exactly as before.
@docs.command(name="list")
def list_sections():
    """List available documentation sections"""
    try:
        sections = docs_manager.list()
        # One single-column row per section.
        print_table(["Sections"], [[section] for section in sections])
    except Exception as e:
        click.echo(f"Error: {str(e)}", err=True)
        sys.exit(1)
|
||||
|
||||
if __name__ == '__main__':
|
||||
cli()
|
||||
@@ -4,24 +4,61 @@ from dotenv import load_dotenv
|
||||
load_dotenv() # Load environment variables from .env file
|
||||
|
||||
# Default provider, ONLY used when the extraction strategy is LLMExtractionStrategy
|
||||
DEFAULT_PROVIDER = "openai/gpt-4-turbo"
|
||||
DEFAULT_PROVIDER = "openai/gpt-4o-mini"
|
||||
MODEL_REPO_BRANCH = "new-release-0.0.2"
|
||||
# Provider-model dictionary, ONLY used when the extraction strategy is LLMExtractionStrategy
|
||||
PROVIDER_MODELS = {
|
||||
"ollama/llama3": "no-token-needed", # Any model from Ollama no need for API token
|
||||
"groq/llama3-70b-8192": os.getenv("GROQ_API_KEY"),
|
||||
"groq/llama3-8b-8192": os.getenv("GROQ_API_KEY"),
|
||||
"openai/gpt-3.5-turbo": os.getenv("OPENAI_API_KEY"),
|
||||
"openai/gpt-4-turbo": os.getenv("OPENAI_API_KEY"),
|
||||
"openai/gpt-4o-mini": os.getenv("OPENAI_API_KEY"),
|
||||
"openai/gpt-4o": os.getenv("OPENAI_API_KEY"),
|
||||
"openai/o1-mini": os.getenv("OPENAI_API_KEY"),
|
||||
"openai/o1-preview": os.getenv("OPENAI_API_KEY"),
|
||||
"anthropic/claude-3-haiku-20240307": os.getenv("ANTHROPIC_API_KEY"),
|
||||
"anthropic/claude-3-opus-20240229": os.getenv("ANTHROPIC_API_KEY"),
|
||||
"anthropic/claude-3-sonnet-20240229": os.getenv("ANTHROPIC_API_KEY"),
|
||||
"anthropic/claude-3-5-sonnet-20240620": os.getenv("ANTHROPIC_API_KEY"),
|
||||
}
|
||||
|
||||
|
||||
# Chunk token threshold
|
||||
CHUNK_TOKEN_THRESHOLD = 1000
|
||||
CHUNK_TOKEN_THRESHOLD = 2 ** 11 # 2048 tokens
|
||||
OVERLAP_RATE = 0.1
|
||||
WORD_TOKEN_RATE = 1.3
|
||||
|
||||
# Threshold for the minimum number of words in an HTML tag for it to be considered
|
||||
MIN_WORD_THRESHOLD = 5
|
||||
MIN_WORD_THRESHOLD = 1
|
||||
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD = 1
|
||||
|
||||
IMPORTANT_ATTRS = ['src', 'href', 'alt', 'title', 'width', 'height']
|
||||
ONLY_TEXT_ELIGIBLE_TAGS = ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark']
|
||||
SOCIAL_MEDIA_DOMAINS = [
|
||||
'facebook.com',
|
||||
'twitter.com',
|
||||
'x.com',
|
||||
'linkedin.com',
|
||||
'instagram.com',
|
||||
'pinterest.com',
|
||||
'tiktok.com',
|
||||
'snapchat.com',
|
||||
'reddit.com',
|
||||
]
|
||||
|
||||
# Threshold for the Image extraction - Range is 1 to 6
|
||||
# Images are scored with a point-based system to filter them by usefulness. Points are assigned
|
||||
# to each image based on the following aspects.
|
||||
# If either height or width exceeds 150px
|
||||
# If image size is greater than 10Kb
|
||||
# If alt property is set
|
||||
# If image format is in jpg, png or webp
|
||||
# If image is in the first half of the total images extracted from the page
|
||||
IMAGE_SCORE_THRESHOLD = 2
|
||||
|
||||
MAX_METRICS_HISTORY = 1000
|
||||
|
||||
NEED_MIGRATION = True
|
||||
URL_LOG_SHORTEN_LENGTH = 30
|
||||
SHOW_DEPRECATION_WARNINGS = True
|
||||
SCREENSHOT_HEIGHT_TRESHOLD = 10000
|
||||
PAGE_TIMEOUT=60000
|
||||
DOWNLOAD_PAGE_TIMEOUT=60000
|
||||
627
crawl4ai/content_filter_strategy.py
Normal file
@@ -0,0 +1,627 @@
|
||||
import re
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
from typing import List, Tuple, Dict
|
||||
from rank_bm25 import BM25Okapi
|
||||
from time import perf_counter
|
||||
from collections import deque
|
||||
from bs4 import BeautifulSoup, NavigableString, Tag, Comment
|
||||
from .utils import clean_tokens
|
||||
from abc import ABC, abstractmethod
|
||||
import math
|
||||
from snowballstemmer import stemmer
|
||||
class RelevantContentFilter(ABC):
    """Abstract base class for content filtering strategies.

    Provides the shared tag sets, the negative class/id pattern and helper
    methods (query extraction, text chunking, exclusion test, element
    cleaning) that concrete filters build on.
    """

    def __init__(self, user_query: str = None):
        """
        Args:
            user_query (str): Optional query. When set, ``extract_page_query``
                returns it instead of deriving a query from the page.
        """
        self.user_query = user_query
        self.included_tags = {
            # Primary structure
            'article', 'main', 'section', 'div',
            # List structures
            'ul', 'ol', 'li', 'dl', 'dt', 'dd',
            # Text content
            'p', 'span', 'blockquote', 'pre', 'code',
            # Headers
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
            # Tables
            'table', 'thead', 'tbody', 'tr', 'td', 'th',
            # Other semantic elements
            'figure', 'figcaption', 'details', 'summary',
            # Text formatting
            'em', 'strong', 'b', 'i', 'mark', 'small',
            # Rich content
            'time', 'address', 'cite', 'q'
        }
        self.excluded_tags = {
            'nav', 'footer', 'header', 'aside', 'script',
            'style', 'form', 'iframe', 'noscript'
        }
        self.header_tags = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
        self.negative_patterns = re.compile(
            r'nav|footer|header|sidebar|ads|comment|promo|advert|social|share',
            re.I
        )
        self.min_word_count = 2

    @abstractmethod
    def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
        """Abstract method to be implemented by specific filtering strategies"""
        pass

    def extract_page_query(self, soup: BeautifulSoup, body: Tag) -> str:
        """Build the query text used for relevance scoring.

        Returns ``self.user_query`` when it is set. Otherwise joins, in order:
        the page title, the first ``<h1>`` text and the ``keywords`` /
        ``description`` meta contents. When neither meta tag has content, the
        first 150 characters of the first ``<p>`` longer than 150 characters
        are appended as a fallback.

        Args:
            soup: Parsed document, searched for title, h1 and meta tags.
            body: Body element, searched for the fallback paragraph.

        Returns:
            The space-joined query string (empty when nothing was found).
        """
        if self.user_query:
            return self.user_query

        query_parts = []

        # Title (absent when the document has no <title>)
        try:
            title = soup.title.string
        except AttributeError:
            title = None
        if title:
            query_parts.append(title)

        first_h1 = soup.find('h1')
        if first_h1 is not None:
            query_parts.append(first_h1.get_text())

        # Meta tags
        found_meta = False
        for meta_name in ['keywords', 'description']:
            meta = soup.find('meta', attrs={'name': meta_name})
            if meta and meta.get('content'):
                query_parts.append(meta['content'])
                found_meta = True

        # Without meta content, fall back to the first paragraph whose text is
        # longer than 150 characters, truncated to 150 characters.
        if not found_meta:
            for p in body.find_all('p'):
                paragraph_text = p.get_text()
                if len(paragraph_text) > 150:
                    query_parts.append(paragraph_text[:150])
                    break

        return ' '.join(filter(None, query_parts))

    def extract_text_chunks(self, body: Tag, min_word_threshold: int = None) -> List[Tuple[int, str, str, Tag]]:
        """
        Extracts text chunks from a BeautifulSoup body element while preserving order.

        Text is accumulated across inline elements and flushed into a chunk
        when a non-inline element is closed.

        Args:
            body: BeautifulSoup Tag object representing the body element
            min_word_threshold: When truthy, chunks with fewer whitespace-separated
                words than this are dropped from the result.

        Returns:
            List of (index, text, tag_type, element) tuples, where ``tag_type``
            is 'header' or 'content' and ``element`` is the element whose end
            triggered the flush.
        """
        # Tags to ignore - inline elements that shouldn't break text flow
        INLINE_TAGS = {
            'a', 'abbr', 'acronym', 'b', 'bdo', 'big', 'br', 'button', 'cite', 'code',
            'dfn', 'em', 'i', 'img', 'input', 'kbd', 'label', 'map', 'object', 'q',
            'samp', 'script', 'select', 'small', 'span', 'strong', 'sub', 'sup',
            'textarea', 'time', 'tt', 'var'
        }

        # Tags that typically contain meaningful headers
        HEADER_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header'}

        chunks = []
        current_text = []
        chunk_index = 0

        def should_break_chunk(tag: Tag) -> bool:
            """Determine if a tag should cause a break in the current text chunk"""
            return (
                tag.name not in INLINE_TAGS
                and not (tag.name == 'p' and len(current_text) == 0)
            )

        # Explicit stack for an iterative depth-first walk; the boolean marks
        # the second visit of an element, after all its children were handled.
        stack = deque([(body, False)])

        while stack:
            element, visited = stack.pop()

            if visited:
                # End of block element - flush accumulated text
                if current_text and should_break_chunk(element):
                    text = ' '.join(''.join(current_text).split())
                    if text:
                        tag_type = 'header' if element.name in HEADER_TAGS else 'content'
                        chunks.append((chunk_index, text, tag_type, element))
                        chunk_index += 1
                    current_text = []
                continue

            if isinstance(element, NavigableString):
                # HTML comments are not page text.
                if not isinstance(element, Comment):
                    # Keep the node's own whitespace: the flush normalises it.
                    # Stripping each node first would glue neighbouring words
                    # together ("Hello <b>world</b>" -> "Helloworld").
                    current_text.append(str(element))
                continue

            # Snapshot the children once
            children = [child for child in element.children]
            if not children:
                continue

            # Mark block for revisit after processing children
            stack.append((element, True))

            # Add children in reverse order for correct processing
            for child in reversed(children):
                if isinstance(child, (Tag, NavigableString)):
                    stack.append((child, False))

        # Handle any remaining text
        if current_text:
            text = ' '.join(''.join(current_text).split())
            if text:
                chunks.append((chunk_index, text, 'content', body))

        if min_word_threshold:
            chunks = [chunk for chunk in chunks if len(chunk[1].split()) >= min_word_threshold]

        return chunks

    def _deprecated_extract_text_chunks(self, soup: BeautifulSoup) -> List[Tuple[int, str, Tag]]:
        """Deprecated chunk extractor kept for reference.

        Walks the tree depth-first and collects ``(index, text, element)`` for
        every element in ``self.included_tags`` that is not excluded, using
        only the element's direct text children. Headers need at least three
        words; other elements need at least ``self.min_word_count`` words.
        """
        def direct_text(element: Tag) -> str:
            # Only direct string children; nested tags are visited on their own.
            pieces = []
            for content in element.contents:
                if isinstance(content, str):
                    stripped = content.strip()
                    if stripped:
                        pieces.append(stripped)
            return ' '.join(pieces)

        candidates = []
        index = 0

        def dfs(element):
            nonlocal index
            if not isinstance(element, Tag):
                return
            if element.name in self.included_tags and not self.is_excluded(element):
                text = direct_text(element)
                word_count = len(text.split())

                # Headers pass through with their own minimum
                if element.name in self.header_tags:
                    if word_count >= 3:  # Minimal sanity check for headers
                        candidates.append((index, text, element))
                        index += 1
                # Regular content uses the standard minimum
                elif word_count >= self.min_word_count:
                    candidates.append((index, text, element))
                    index += 1

            for child in element.children:
                dfs(child)

        dfs(soup.body if soup.body else soup)
        return candidates

    def is_excluded(self, tag: Tag) -> bool:
        """Return True when the tag name is in ``excluded_tags`` or its
        class/id text matches ``negative_patterns`` anywhere."""
        if tag.name in self.excluded_tags:
            return True
        class_id = ' '.join(filter(None, [
            ' '.join(tag.get('class', [])),
            tag.get('id', '')
        ]))
        return bool(self.negative_patterns.search(class_id))

    def clean_element(self, tag: Tag) -> str:
        """Serialize ``tag`` to HTML without unwanted tags and attributes.

        Skips script/style/aside/form/iframe/noscript subtrees and HTML
        comments, drops presentational and event attributes, joins
        multi-valued attributes with spaces, and escapes text and attribute
        values so the result stays well-formed.

        Args:
            tag: Element to serialize.

        Returns:
            The cleaned HTML string; "" when ``tag`` is falsy or not a Tag;
            ``str(tag)`` if rendering fails.
        """
        if not tag or not isinstance(tag, Tag):
            return ""

        # Local import keeps this method self-contained.
        from html import escape

        unwanted_tags = {'script', 'style', 'aside', 'form', 'iframe', 'noscript'}
        unwanted_attrs = {'style', 'onclick', 'onmouseover', 'align', 'bgcolor', 'class', 'id'}

        # Collect fragments and join once at the end
        builder = []

        def render_tag(elem):
            if not isinstance(elem, Tag):
                # Comment is a str subclass too; without this check its text
                # would be written into the output as visible content.
                if isinstance(elem, str) and not isinstance(elem, Comment):
                    builder.append(escape(elem.strip(), quote=False))
                return

            if elem.name in unwanted_tags:
                return

            # Start tag
            builder.append(f'<{elem.name}')

            # Add cleaned attributes
            for key, value in elem.attrs.items():
                if key in unwanted_attrs:
                    continue
                # Multi-valued attributes arrive as lists; write them
                # space-separated instead of as a Python list repr.
                if isinstance(value, (list, tuple)):
                    value = ' '.join(str(item) for item in value)
                builder.append(f' {key}="{escape(str(value), quote=True)}"')

            builder.append('>')

            # Process children
            for child in elem.children:
                render_tag(child)

            # Close tag
            builder.append(f'</{elem.name}>')

        try:
            render_tag(tag)
            return ''.join(builder)
        except Exception:
            return str(tag)  # Fallback to original if anything fails
|
||||
|
||||
class BM25ContentFilter(RelevantContentFilter):
    """
    Content filtering using BM25 algorithm with priority tag handling.

    How it works:
    1. Builds a query from the user query or, failing that, page metadata.
    2. Extracts text chunks from the body element.
    3. Stems and cleans the tokens of every chunk and of the query.
    4. Scores each chunk with BM25 and multiplies the score by a per-tag weight.
    5. Drops chunks whose weighted score is below the threshold.
    6. Returns the cleaned HTML of the remaining chunks' elements in document order.

    Attributes:
        user_query (str): User query for filtering (optional).
        bm25_threshold (float): BM25 threshold for filtering (default: 1.0).
        priority_tags (dict): Score multiplier per tag name.
        stemmer: Snowball stemmer for the configured language.

    Methods:
        filter_content(self, html: str, min_word_threshold: int = None)
    """

    def __init__(self, user_query: str = None, bm25_threshold: float = 1.0, language: str = 'english'):
        """
        Set up the filter; without a user query the query comes from page metadata.

        Args:
            user_query (str): User query for filtering (optional).
            bm25_threshold (float): BM25 threshold for filtering (default: 1.0).
            language (str): Language for stemming (default: 'english').
        """
        super().__init__(user_query=user_query)
        self.bm25_threshold = bm25_threshold
        # Multipliers applied to a chunk's BM25 score, keyed by the name of
        # the element the chunk was attributed to.
        self.priority_tags = dict(
            h1=5.0,
            h2=4.0,
            h3=3.0,
            title=4.0,
            strong=2.0,
            b=1.5,
            em=1.5,
            blockquote=2.0,
            code=2.0,
            pre=1.5,
            th=1.5,  # Table headers
        )
        self.stemmer = stemmer(language)

    def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
        """
        Select the parts of ``html`` that score well against the query.

        Args:
            html (str): HTML content to be filtered.
            min_word_threshold (int): Passed to ``extract_text_chunks`` as the
                minimum number of words a chunk needs (optional).

        Returns:
            List[str]: Cleaned HTML strings, in document order. Empty when the
            input is empty or not a string, when no query or no chunks could
            be extracted, or when no chunk reaches the threshold.
        """
        if not (html and isinstance(html, str)):
            return []

        soup = BeautifulSoup(html, 'lxml')
        if not soup.body:
            # Fragment without a body: wrap it and parse again
            soup = BeautifulSoup(f'<body>{html}</body>', 'lxml')
        body = soup.find('body')

        query = self.extract_page_query(soup, body)
        if not query:
            return []

        candidates = self.extract_text_chunks(body, min_word_threshold)
        if not candidates:
            return []

        # Lower-case, whitespace-split and stem every chunk and the query
        stem = self.stemmer.stemWord
        stemmed_corpus = []
        for _, chunk_text, _, _ in candidates:
            stemmed_corpus.append([stem(word) for word in chunk_text.lower().split()])
        stemmed_query = [stem(word) for word in query.lower().split()]

        # Clean from stop words and noise
        tokenized_corpus = [clean_tokens(tokens) for tokens in stemmed_corpus]
        tokenized_query = clean_tokens(stemmed_query)

        scores = BM25Okapi(tokenized_corpus).get_scores(tokenized_query)

        # Weight each score by its element's tag and keep what passes
        kept = []
        for raw_score, (position, _text, _kind, element) in zip(scores, candidates):
            weighted = raw_score * self.priority_tags.get(element.name, 1.0)
            if weighted >= self.bm25_threshold:
                kept.append((position, element))

        if not kept:
            return []

        # Original document order
        kept.sort(key=lambda item: item[0])

        return [self.clean_element(element) for _, element in kept]
|
||||
|
||||
class PruningContentFilter(RelevantContentFilter):
    """
    Content filtering using pruning algorithm with dynamic threshold.

    How it works:
    1. Parses the HTML and removes comments and excluded tags.
    2. Walks the tree from the body, computing a composite score per node.
    3. Removes a node, with its whole subtree, when its score is below the
       threshold; otherwise recurses into its children.
    4. Returns the body's remaining child elements that still contain text.

    Attributes:
        user_query (str): User query (optional). Stored, but not read by the pruning logic.
        min_word_threshold (int): Minimum word threshold for filtering (optional).
        threshold_type (str): 'fixed' uses ``threshold`` as is; any other value
            adjusts it per node (default: 'fixed').
        threshold (float): Base threshold value (default: 0.48).

    Methods:
        filter_content(self, html: str, min_word_threshold: int = None):
    """

    def __init__(self, user_query: str = None, min_word_threshold: int = None,
                 threshold_type: str = 'fixed', threshold: float = 0.48):
        """
        Initializes the PruningContentFilter class.

        Args:
            user_query (str): User query for filtering (optional).
            min_word_threshold (int): Minimum word threshold for filtering (optional).
            threshold_type (str): Threshold type for dynamic threshold (default: 'fixed').
            threshold (float): Fixed threshold value (default: 0.48).
        """
        # Keep the caller's query on the instance instead of discarding it.
        super().__init__(user_query=user_query)
        self.min_word_threshold = min_word_threshold
        self.threshold_type = threshold_type
        self.threshold = threshold

        # Add tag importance for dynamic threshold
        self.tag_importance = {
            'article': 1.5,
            'main': 1.4,
            'section': 1.3,
            'p': 1.2,
            'h1': 1.4,
            'h2': 1.3,
            'h3': 1.2,
            'div': 0.7,
            'span': 0.6
        }

        # Metric configuration
        self.metric_config = {
            'text_density': True,
            'link_density': True,
            'tag_weight': True,
            'class_id_weight': True,
            'text_length': True,
        }

        self.metric_weights = {
            'text_density': 0.4,
            'link_density': 0.2,
            'tag_weight': 0.2,
            'class_id_weight': 0.1,
            'text_length': 0.1,
        }

        self.tag_weights = {
            'div': 0.5,
            'p': 1.0,
            'article': 1.5,
            'section': 1.0,
            'span': 0.3,
            'li': 0.5,
            'ul': 0.5,
            'ol': 0.5,
            'h1': 1.2,
            'h2': 1.1,
            'h3': 1.0,
            'h4': 0.9,
            'h5': 0.8,
            'h6': 0.7,
        }

    def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
        """
        Implements content filtering using pruning algorithm with dynamic threshold.

        Args:
            html (str): HTML content to be filtered.
            min_word_threshold (int): Accepted for interface compatibility; this
                method does not read it. The value given to the constructor
                is the one applied while scoring.

        Returns:
            List[str]: HTML strings of the body's direct child elements that
            survived pruning and contain text. Empty for empty or non-string input.
        """
        if not html or not isinstance(html, str):
            return []

        soup = BeautifulSoup(html, 'lxml')
        if not soup.body:
            soup = BeautifulSoup(f'<body>{html}</body>', 'lxml')

        # Remove comments and unwanted tags
        self._remove_comments(soup)
        self._remove_unwanted_tags(soup)

        # Prune tree starting from body
        body = soup.find('body')
        if body is None:
            return []
        self._prune_tree(body)

        # Extract remaining content as list of HTML strings
        content_blocks = []
        for element in body.children:
            # Bare strings directly under the body are not content blocks
            if isinstance(element, str) or not hasattr(element, 'name'):
                continue
            if len(element.get_text(strip=True)) > 0:
                content_blocks.append(str(element))

        return content_blocks

    def _remove_comments(self, soup):
        """Removes HTML comments"""
        # ``string=`` is the current keyword for filtering on text nodes.
        for element in soup.find_all(string=lambda text: isinstance(text, Comment)):
            element.extract()

    def _remove_unwanted_tags(self, soup):
        """Removes every element whose tag name is in ``self.excluded_tags``"""
        for tag in self.excluded_tags:
            for element in soup.find_all(tag):
                element.decompose()

    def _prune_tree(self, node):
        """
        Prunes the tree starting from the given node.

        The node is scored; when the score is below the threshold the node
        and its subtree are removed, otherwise each child is pruned in turn.

        Args:
            node (Tag): The node from which the pruning starts.
        """
        if not node or not hasattr(node, 'name') or node.name is None:
            return

        text_len = len(node.get_text(strip=True))
        tag_len = len(node.encode_contents().decode('utf-8'))
        # Only direct-child links that consist of a single string are counted
        link_text_len = sum(
            len(s.strip())
            for s in (a.string for a in node.find_all('a', recursive=False))
            if s
        )

        metrics = {
            'node': node,
            'tag_name': node.name,
            'text_len': text_len,
            'tag_len': tag_len,
            'link_text_len': link_text_len
        }

        score = self._compute_composite_score(metrics, text_len, tag_len, link_text_len)

        if self.threshold_type == 'fixed':
            should_remove = score < self.threshold
        else:  # dynamic
            tag_importance = self.tag_importance.get(node.name, 0.7)
            text_ratio = text_len / tag_len if tag_len > 0 else 0
            link_ratio = link_text_len / text_len if text_len > 0 else 1

            threshold = self.threshold  # base threshold
            if tag_importance > 1:
                threshold *= 0.8
            if text_ratio > 0.4:
                threshold *= 0.9
            if link_ratio > 0.6:
                threshold *= 1.2

            should_remove = score < threshold

        if should_remove:
            node.decompose()
        else:
            # Snapshot the children first: pruning a child changes node.children
            children = [child for child in node.children if hasattr(child, 'name')]
            for child in children:
                self._prune_tree(child)

    def _compute_composite_score(self, metrics, text_len, tag_len, link_text_len):
        """
        Computes the composite score of a node.

        Returns -1.0 when ``self.min_word_threshold`` is set and the node has
        fewer words than that. Otherwise returns the weighted sum of the
        enabled metrics divided by the sum of their weights (0 when no metric
        is enabled).
        """
        if self.min_word_threshold:
            # Use a space separator so words from adjacent nodes are not glued
            # together, and count real words (0 for an empty node).
            word_count = len(metrics['node'].get_text(' ', strip=True).split())
            if word_count < self.min_word_threshold:
                return -1.0  # Guaranteed removal

        score = 0.0
        total_weight = 0.0

        if self.metric_config['text_density']:
            density = text_len / tag_len if tag_len > 0 else 0
            score += self.metric_weights['text_density'] * density
            total_weight += self.metric_weights['text_density']

        if self.metric_config['link_density']:
            density = 1 - (link_text_len / text_len if text_len > 0 else 0)
            score += self.metric_weights['link_density'] * density
            total_weight += self.metric_weights['link_density']

        if self.metric_config['tag_weight']:
            tag_score = self.tag_weights.get(metrics['tag_name'], 0.5)
            score += self.metric_weights['tag_weight'] * tag_score
            total_weight += self.metric_weights['tag_weight']

        if self.metric_config['class_id_weight']:
            class_score = self._compute_class_id_weight(metrics['node'])
            # NOTE(review): the class/id weight is never positive, so this
            # clamp makes the term contribute 0 to the score while its weight
            # still counts in total_weight - confirm that is intended.
            score += self.metric_weights['class_id_weight'] * max(0, class_score)
            total_weight += self.metric_weights['class_id_weight']

        if self.metric_config['text_length']:
            score += self.metric_weights['text_length'] * math.log(text_len + 1)
            total_weight += self.metric_weights['text_length']

        return score / total_weight if total_weight > 0 else 0

    def _compute_class_id_weight(self, node):
        """
        Computes the class/id weight: 0, minus 0.5 when the class names match
        ``negative_patterns`` and minus 0.5 when the id does.
        """
        class_id_score = 0
        if 'class' in node.attrs:
            classes = ' '.join(node['class'])
            # search() matches anywhere in the value, as is_excluded() does
            if self.negative_patterns.search(classes):
                class_id_score -= 0.5
        if 'id' in node.attrs:
            element_id = node['id']
            if self.negative_patterns.search(element_id):
                class_id_score -= 0.5
        return class_id_score
|
||||
816
crawl4ai/content_scraping_strategy.py
Normal file
@@ -0,0 +1,816 @@
|
||||
import re # Point 1: Pre-Compile Regular Expressions
|
||||
import time
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict, Any, Optional
|
||||
from bs4 import BeautifulSoup
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
import asyncio, requests, re, os
|
||||
from .config import *
|
||||
from bs4 import element, NavigableString, Comment
|
||||
from bs4 import PageElement, Tag
|
||||
from urllib.parse import urljoin
|
||||
from requests.exceptions import InvalidSchema
|
||||
# from .content_cleaning_strategy import ContentCleaningStrategy
|
||||
from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter#, HeuristicContentFilter
|
||||
from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator
|
||||
from .models import MarkdownGenerationResult
|
||||
from .utils import (
|
||||
extract_metadata,
|
||||
normalize_url,
|
||||
is_external_url,
|
||||
get_base_domain,
|
||||
)
|
||||
|
||||
|
||||
# Pre-compile regular expressions for Open Graph and Twitter metadata
OG_REGEX = re.compile(r'^og:')
TWITTER_REGEX = re.compile(r'^twitter:')
# Group 1: integer part of the number. Group 2: the unit. A fractional part
# and whitespace between number and unit are consumed but not captured, so
# "12.5px" gives unit "px" rather than ".5px".
DIMENSION_REGEX = re.compile(r"(\d+)(?:\.\d+)?\s*(\D*)")


# Function to parse image height/width value and units
def parse_dimension(dimension):
    """Split an image dimension such as "150px" into number and unit.

    Args:
        dimension: The raw attribute value. Non-string values are converted
            with ``str``; surrounding whitespace is ignored.

    Returns:
        tuple: ``(number, unit)`` where ``number`` is the integer part of the
        leading number and ``unit`` defaults to 'px' when none is given, or
        ``(None, None)`` when the value is empty or does not start with a digit.
    """
    if dimension:
        match = DIMENSION_REGEX.match(str(dimension).strip())
        if match:
            number = int(match.group(1))
            unit = match.group(2).strip() or 'px'  # Default unit is 'px' if not specified
            return number, unit
    return None, None
|
||||
|
||||
# Fetch image file metadata to extract size and extension
|
||||
def fetch_image_file_size(img, base_url, timeout=10):
    """Fetch an image's size with an HTTP HEAD request.

    Args:
        img: Image element; its ``src`` is read with ``img.get('src')``.
        base_url: URL that a relative ``src`` is resolved against.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``Content-Length`` header value of a 200 response, or ``None``
        when the header is missing, the status is not 200, or the request fails.
    """
    # If src is a relative path construct the full URL; if not it may be a CDN URL
    img_url = urljoin(base_url, img.get('src'))
    try:
        response = requests.head(img_url, timeout=timeout)
    except requests.RequestException:
        # Covers InvalidSchema, connection errors and timeouts. The earlier
        # ``finally: return`` also hid these, but it discarded the successful
        # result as well, so the function could only ever return None.
        return None

    if response.status_code == 200:
        return response.headers.get('Content-Length', None)

    print(f"Failed to retrieve file size for {img_url}")
    return None
|
||||
|
||||
class ContentScrapingStrategy(ABC):
    """Interface for scraping strategies: a synchronous and an asynchronous entry point."""

    @abstractmethod
    def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
        """Scrape ``html`` fetched from ``url``; implemented by subclasses."""
        return None

    @abstractmethod
    async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
        """Asynchronous counterpart of ``scrap``; implemented by subclasses."""
        return None
|
||||
|
||||
class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
"""
|
||||
Class for web content scraping. Perhaps the most important class.
|
||||
|
||||
How it works:
|
||||
1. Extract content from HTML using BeautifulSoup.
|
||||
2. Clean the extracted content using a content cleaning strategy.
|
||||
3. Filter the cleaned content using a content filtering strategy.
|
||||
4. Generate markdown content from the filtered content.
|
||||
5. Return the markdown content.
|
||||
"""
|
||||
|
||||
def __init__(self, logger=None):
|
||||
self.logger = logger
|
||||
|
||||
def _log(self, level, message, tag="SCRAPE", **kwargs):
|
||||
"""Helper method to safely use logger."""
|
||||
if self.logger:
|
||||
log_method = getattr(self.logger, level)
|
||||
log_method(message=message, tag=tag, **kwargs)
|
||||
|
||||
def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
    """
    Synchronous entry point for content scraping.

    Delegates to ``_scrap`` with ``is_async=False`` added to the keyword
    arguments.

    Args:
        url (str): The URL of the page to scrape.
        html (str): The HTML content of the page.
        **kwargs: Options forwarded to ``_scrap``.

    Returns:
        Dict[str, Any]: Whatever ``_scrap`` returns. On the normal path that
        is a dictionary with the keys:

        - 'cleaned_html': The cleaned HTML as a string.
        - 'success': False when the cleaned body could not be serialized.
        - 'media': Dict with 'images', 'videos' and 'audios' lists.
        - 'links': Dict with 'internal' and 'external' lists.
        - 'metadata': Metadata extracted from the page.
    """
    scraped = self._scrap(url, html, is_async=False, **kwargs)
    return scraped
|
||||
|
||||
async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
    """
    Asynchronous entry point for content scraping.

    Runs the blocking ``_scrap`` in a worker thread via
    ``asyncio.to_thread`` and awaits its result.

    Args:
        url (str): The URL of the page to scrape.
        html (str): The HTML content of the page.
        **kwargs: Options forwarded to ``_scrap``.

    Returns:
        Dict[str, Any]: Whatever ``_scrap`` returns. On the normal path that
        is a dictionary with the keys:

        - 'cleaned_html': The cleaned HTML as a string.
        - 'success': False when the cleaned body could not be serialized.
        - 'media': Dict with 'images', 'videos' and 'audios' lists.
        - 'links': Dict with 'internal' and 'external' lists.
        - 'metadata': Metadata extracted from the page.
    """
    blocking_scrape = self._scrap
    scraped = await asyncio.to_thread(blocking_scrape, url, html, **kwargs)
    return scraped
|
||||
|
||||
def _generate_markdown_content(self, cleaned_html: str, html: str, url: str, success: bool, **kwargs) -> Dict[str, Any]:
    """
    Generate markdown content from cleaned HTML.

    Args:
        cleaned_html (str): The cleaned HTML content.
        html (str): The original HTML content (not used here).
        url (str): The URL of the page, passed to the generator as base URL.
        success (bool): Whether the content was successfully cleaned
            (not used here).
        **kwargs: Recognised keys:
            - 'markdown_generator': generator to use; a
              ``DefaultMarkdownGenerator`` is created when it is missing
              or None.
            - 'fit_markdown': when truthy and the generator has no
              ``content_filter``, a ``BM25ContentFilter`` is attached to it.
            - 'fit_markdown_user_query', 'fit_markdown_bm25_threshold':
              settings for that filter.
            - 'html2text': options forwarded as ``html2text_options``.

    Returns:
        Dict[str, Any]: Always a dictionary with the keys 'markdown',
        'fit_markdown', 'fit_html' and 'markdown_v2'. If generation raises,
        the error is logged, 'markdown' carries the error text and
        'markdown_v2' is None.
    """
    markdown_generator: Optional[MarkdownGenerationStrategy] = kwargs.get('markdown_generator')
    if markdown_generator is None:
        # Built only when needed. An explicit None used to skip generation
        # entirely and make this method return None instead of a dict.
        markdown_generator = DefaultMarkdownGenerator()

    try:
        if kwargs.get('fit_markdown', False) and not markdown_generator.content_filter:
            # NOTE: this sets the filter on the generator object itself, so a
            # caller-supplied generator keeps it after this call.
            markdown_generator.content_filter = BM25ContentFilter(
                user_query=kwargs.get('fit_markdown_user_query', None),
                bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0)
            )

        markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
            cleaned_html=cleaned_html,
            base_url=url,
            html2text_options=kwargs.get('html2text', {})
        )

        return {
            'markdown': markdown_result.raw_markdown,
            'fit_markdown': markdown_result.fit_markdown,
            'fit_html': markdown_result.fit_html,
            'markdown_v2': markdown_result
        }
    except Exception as e:
        # Best effort: report the failure in the result instead of raising.
        self._log('error',
            message="Error using new markdown generation strategy: {error}",
            tag="SCRAPE",
            params={"error": str(e)}
        )
        return {
            'markdown': f"Error using new markdown generation strategy: {str(e)}",
            'fit_markdown': "Set flag 'fit_markdown' to True to get cleaned HTML content.",
            'fit_html': "Set flag 'fit_markdown' to True to get cleaned HTML content.",
            'markdown_v2': None
        }
|
||||
|
||||
def flatten_nested_elements(self, node):
    """
    Collapse chains of same-named tags that only wrap one another.

    A tag whose single child is a tag with the same name is replaced by that
    child (recursively); every other tag keeps its place and has its children
    flattened in turn. Text nodes are returned untouched.

    Args:
        node: The root node (tag or text node) to flatten.

    Returns:
        The flattened node. This is the innermost tag when ``node`` itself
        was a redundant wrapper, otherwise ``node`` with its ``contents``
        list rebuilt.
    """
    if isinstance(node, NavigableString):
        return node

    children = node.contents
    if len(children) == 1:
        only_child = children[0]
        if isinstance(only_child, Tag) and only_child.name == node.name:
            # Redundant wrapper: continue with the inner tag instead.
            return self.flatten_nested_elements(only_child)

    flattened = []
    for child in children:
        flattened.append(self.flatten_nested_elements(child))
    node.contents = flattened
    return node
|
||||
|
||||
def find_closest_parent_with_useful_text(self, tag, **kwargs):
    """
    Return the text of the nearest ancestor that contains enough words.

    Args:
        tag: The element whose ancestors are inspected.
        **kwargs: 'image_description_min_word_threshold' overrides the
            minimum word count (default
            ``IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD``).

    Returns:
        str: The space-separated, stripped text of the first ancestor whose
        word count reaches the threshold, or None when no ancestor does.
    """
    min_words = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)

    ancestor = tag
    while ancestor:
        ancestor = ancestor.parent
        if not ancestor:
            # Ran past the top of the tree.
            break
        ancestor_text = ancestor.get_text(separator=' ', strip=True)
        if len(ancestor_text.split()) >= min_words:
            return ancestor_text
    return None
|
||||
|
||||
def remove_unwanted_attributes(self, element, important_attrs, keep_data_attributes=False):
    """
    Strip every attribute that is not worth keeping from an element.

    Args:
        element: The element to clean; modified in place.
        important_attrs: Attribute names that are always kept.
        keep_data_attributes (bool): When True, attributes whose name starts
            with ``data-`` are kept as well.

    Returns:
        None
    """
    def is_disposable(name):
        if name in important_attrs:
            return False
        return not (keep_data_attributes and name.startswith('data-'))

    # Collect first, delete afterwards: the attribute mapping must not change
    # size while it is being iterated.
    doomed = [name for name in element.attrs if is_disposable(name)]
    for name in doomed:
        del element[name]
|
||||
|
||||
def process_image(self, img, url, index, total_images, **kwargs):
    """
    Score an image element and list every source variant it offers.

    How it works:
        1. Reject images that are hidden (``display:none``), sit directly
           inside a button/input, or whose parent classes, ``src`` or ``alt``
           mention 'button', 'icon' or 'logo'.
        2. Score the image: one point each for width > 150, height > 150,
           a non-empty alt, appearing in the first half of the page's images,
           a recognisable image format in any source attribute, having a
           srcset, and living inside a ``<picture>``.
        3. Drop the image when the score does not exceed the threshold.
        4. Emit one dict per unique, non-``data:`` source URL found in
           ``src``, ``data-src``, the srcset attributes, ``<picture>``
           sources and other ``data-*src*`` attributes.

    Args:
        img: The image element to process.
        url (str): The URL of the page containing the image (not used here).
        index (int): Position of the image among the page's images; also
            used as the ``group_id`` shared by all of its variants.
        total_images (int): Total number of images on the page.
        **kwargs: 'image_score_threshold' overrides
            ``IMAGE_SCORE_THRESHOLD``; all kwargs are forwarded to
            ``find_closest_parent_with_useful_text``.

    Returns:
        list[dict] | None: One dict per variant with the keys 'alt', 'desc',
        'score', 'type', 'group_id', 'format', 'src' and 'width', or None
        when the image is rejected or has no usable source.
    """
    url_token = re.compile(r'[\s,]*([^\s,]\S*)')

    def parse_srcset(value):
        """Split a srcset string into ``{'url', 'width'}`` candidates."""
        # Candidates are "URL [descriptor]" separated by commas. A URL may
        # itself contain commas, so it is read as a whole run of
        # non-whitespace instead of splitting the string on ',' or 'http'
        # (the latter mangled relative URLs and left trailing commas behind).
        candidates = []
        cursor = 0
        while True:
            found = url_token.match(value, cursor)
            if found is None:
                break
            candidate_url = found.group(1)
            cursor = found.end()
            descriptor = None
            if candidate_url.endswith(','):
                # "a.jpg, b.jpg": the comma ends a descriptor-less candidate.
                candidate_url = candidate_url.rstrip(',')
            else:
                stop = value.find(',', cursor)
                if stop == -1:
                    stop = len(value)
                tokens = value[cursor:stop].split()
                if tokens:
                    # '480w' -> '480'; density descriptors ('2x') are kept.
                    descriptor = tokens[-1].rstrip('w')
                cursor = stop
            candidates.append({'url': candidate_url, 'width': descriptor})
        return candidates

    # Constants for checks
    ignored_keywords = ('button', 'icon', 'logo')
    tags_to_check = frozenset(['button', 'input'])
    # A tuple keeps format detection deterministic (first match wins).
    image_formats = ('jpg', 'jpeg', 'png', 'webp', 'avif', 'gif')

    # Pre-fetch commonly used attributes
    style = img.get('style', '')
    alt = img.get('alt', '')
    src = img.get('src', '')
    data_src = img.get('data-src', '')
    srcset = img.get('srcset', '')
    data_srcset = img.get('data-srcset', '')
    width = img.get('width')
    height = img.get('height')
    parent = img.parent
    parent_classes = parent.get('class', [])

    # Quick validation checks. The keyword must occur inside the parent's
    # class name (e.g. 'icon' in 'nav-icon'), not the other way round.
    if ('display:none' in style or
            parent.name in tags_to_check or
            any(word in cls for cls in parent_classes for word in ignored_keywords) or
            any(word in src for word in ignored_keywords) or
            any(word in alt for word in ignored_keywords)):
        return None

    # Quick score calculation
    score = 0
    if width and width.isdigit() and int(width) > 150:
        score += 1
    if height and height.isdigit() and int(height) > 150:
        score += 1
    if alt:
        score += 1
    # Images in the first half of the page count as more relevant.
    # The guard avoids a ZeroDivisionError for total_images == 0.
    if total_images and index / total_images < 0.5:
        score += 1

    # Detect the format from the first source attribute that names one.
    detected_format = None
    for candidate_source in (src, data_src, srcset, data_srcset):
        lowered = candidate_source.lower()
        detected_format = next((fmt for fmt in image_formats if fmt in lowered), None)
        if detected_format:
            break

    picture = img.find_parent('picture')

    if detected_format:
        score += 1
    if srcset or data_srcset:
        score += 1
    if picture:
        score += 1

    threshold = (kwargs['image_score_threshold']
                 if 'image_score_threshold' in kwargs
                 else IMAGE_SCORE_THRESHOLD)
    if score <= threshold:
        return None

    # Use set for deduplication
    unique_urls = set()
    image_variants = []

    # Fields shared by every variant of this image.
    base_info = {
        'alt': alt,
        'desc': self.find_closest_parent_with_useful_text(img, **kwargs),
        'score': score,
        'type': 'image',
        'group_id': index,  # Group ID for this set of variants
        'format': detected_format,
    }

    def add_variant(variant_src, variant_width=None):
        if variant_src and not variant_src.startswith('data:') and variant_src not in unique_urls:
            unique_urls.add(variant_src)
            image_variants.append({**base_info, 'src': variant_src, 'width': variant_width})

    def add_srcset(value):
        for candidate in parse_srcset(value):
            add_variant(candidate['url'], candidate['width'])

    # Process all sources
    add_variant(src)
    add_variant(data_src)

    for srcset_value in (srcset, data_srcset):
        if srcset_value:
            add_srcset(srcset_value)

    if picture:
        for source in picture.find_all('source'):
            source_srcset = source.get('srcset')
            if source_srcset:
                add_srcset(source_srcset)

    # Framework-specific lazy-loading attributes (data-lazy-src, ...).
    for attr, value in img.attrs.items():
        if not (isinstance(value, str) and attr.startswith('data-')
                and 'src' in attr and 'http' in value):
            continue
        if 'srcset' in attr:
            # A srcset holds several URLs and must be split, otherwise the
            # whole attribute value would be recorded as one URL.
            add_srcset(value)
        else:
            add_variant(value)

    return image_variants if image_variants else None
|
||||
|
||||
def process_element(self, url, element: PageElement, **kwargs) -> Dict[str, Any]:
    """
    Clean an element tree and collect the media and links found in it.

    Creates empty collectors, lets ``_process_element`` walk ``element`` and
    fill them, and returns the collectors.

    Args:
        url (str): The URL of the page containing the element.
        element: The root element to process; it is modified in place by
            ``_process_element``.
        **kwargs: Options forwarded to ``_process_element``.

    Returns:
        dict: A dictionary with the keys:

        - 'media': Dict with 'images', 'videos' and 'audios' lists.
        - 'internal_links_dict': Internal links keyed by normalized href.
        - 'external_links_dict': External links keyed by normalized href.
    """
    collected = {
        'media': {'images': [], 'videos': [], 'audios': []},
        'internal_links_dict': {},
        'external_links_dict': {},
    }
    self._process_element(
        url,
        element,
        collected['media'],
        collected['internal_links_dict'],
        collected['external_links_dict'],
        **kwargs
    )
    return collected
|
||||
|
||||
def _process_element(self, url, element: PageElement, media: Dict[str, Any], internal_links_dict: Dict[str, Any], external_links_dict: Dict[str, Any], **kwargs) -> bool:
    """
    Recursively clean one element and record the links and media it holds.

    The element is modified in place: comments, script/style/link/meta/
    noscript tags, excluded links and images, forms (on request) and elements
    without enough content are removed, and unwanted attributes are stripped.

    Args:
        url (str): The URL of the page containing the element.
        element: The element (tag or text node) to process.
        media: Collector with 'videos' and 'audios' lists; appended to.
        internal_links_dict: Collector for internal links, keyed by
            normalized href.
        external_links_dict: Collector for external links, keyed by
            normalized href.
        **kwargs: Recognised keys: 'base_domain', 'exclude_domains',
            'exclude_external_links', 'exclude_external_images',
            'remove_forms', 'only_text', 'keep_data_attributes',
            'word_count_threshold'; all kwargs are also forwarded to
            ``find_closest_parent_with_useful_text`` and to the recursive
            calls.

    Returns:
        bool: True when the element was kept, False when it was removed,
        skipped, or processing failed (failures are logged, not raised).
    """
    try:
        if isinstance(element, NavigableString):
            if isinstance(element, Comment):
                element.extract()
            return False

        # _scrap supplies base_domain; only derive it when it is missing so
        # it is not recomputed for every element of the tree.
        base_domain = kwargs['base_domain'] if 'base_domain' in kwargs else get_base_domain(url)

        if element.name in ['script', 'style', 'link', 'meta', 'noscript']:
            element.decompose()
            return False

        keep_element = False
        exclude_domains = kwargs.get('exclude_domains', [])

        try:
            if element.name == 'a' and element.get('href'):
                href = element.get('href', '').strip()
                if not href:  # Skip hrefs that are only whitespace
                    return False

                try:
                    normalized_href = normalize_url(href, url)
                except ValueError:
                    # Malformed URL: ignore the link.
                    return False

                link_data = {
                    'href': normalized_href,
                    'text': element.get_text().strip(),
                    'title': element.get('title', '').strip(),
                    'base_domain': base_domain
                }

                is_external = is_external_url(normalized_href, base_domain)
                keep_element = True

                # Handle external link exclusions
                if is_external:
                    link_base_domain = get_base_domain(normalized_href)
                    link_data['base_domain'] = link_base_domain
                    if kwargs.get('exclude_external_links', False) or link_base_domain in exclude_domains:
                        element.decompose()
                        return False

                # The first occurrence of each href wins.
                link_registry = external_links_dict if is_external else internal_links_dict
                link_registry.setdefault(normalized_href, link_data)
        except Exception as e:
            raise Exception(f"Error processing links: {str(e)}") from e

        try:
            if element.name == 'img':
                # Attributes that may carry the image URL, in priority order.
                potential_sources = ['src', 'data-src', 'srcset', 'data-lazy-src', 'data-original']
                src = ''
                for source_attr in potential_sources:
                    src = element.get(source_attr, '')
                    if src:
                        break
                if not src:
                    element.decompose()
                    return False

                # If it is srcset pick up the first image
                if 'srcset' in element.attrs:
                    src = element.attrs['srcset'].split(',')[0].split(' ')[0]

                # Internal images are always kept.
                if not is_external_url(src, base_domain):
                    return True

                image_src_base_domain = get_base_domain(src)

                if kwargs.get('exclude_external_images', False) or image_src_base_domain in exclude_domains:
                    element.decompose()
                    return False

                return True  # Keep every image that was not excluded
        except Exception as e:
            # A bare string cannot be raised; wrap the cause like the link
            # handling above does.
            raise Exception(f"Error processing images: {str(e)}") from e

        # Check if flag to remove all forms is set
        if kwargs.get('remove_forms', False) and element.name == 'form':
            element.decompose()
            return False

        if element.name in ['video', 'audio']:
            bucket = media[f"{element.name}s"]
            bucket.append({
                'src': element.get('src'),
                'alt': element.get('alt'),
                'type': element.name,
                'description': self.find_closest_parent_with_useful_text(element, **kwargs)
            })
            for source_tag in element.find_all('source'):
                bucket.append({
                    'src': source_tag.get('src'),
                    'alt': element.get('alt'),
                    'type': element.name,
                    'description': self.find_closest_parent_with_useful_text(element, **kwargs)
                })
            return True  # Always keep video and audio elements

        if element.name in ONLY_TEXT_ELIGIBLE_TAGS:
            if kwargs.get('only_text', False):
                element.replace_with(element.get_text())

        try:
            self.remove_unwanted_attributes(element, IMPORTANT_ATTRS, kwargs.get('keep_data_attributes', False))
        except Exception as e:
            self._log('error',
                message="Error removing unwanted attributes: {error}",
                tag="SCRAPE",
                params={"error": str(e)}
            )

        # Process children (iterate over a copy: children may be removed).
        for child in list(element.children):
            if isinstance(child, NavigableString) and not isinstance(child, Comment):
                if len(child.strip()) > 0:
                    keep_element = True
            else:
                if self._process_element(url, child, media, internal_links_dict, external_links_dict, **kwargs):
                    keep_element = True

        # Check word count
        word_count_threshold = kwargs.get('word_count_threshold', MIN_WORD_THRESHOLD)
        if not keep_element:
            word_count = len(element.get_text(strip=True).split())
            keep_element = word_count >= word_count_threshold

        if not keep_element:
            element.decompose()

        return keep_element
    except Exception as e:
        self._log('error',
            message="Error processing element: {error}",
            tag="SCRAPE",
            params={"error": str(e)}
        )
        return False
|
||||
|
||||
def _scrap(self, url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, **kwargs) -> Dict[str, Any]:
    """
    Extract cleaned HTML, media, links and metadata from a page.

    How it works:
        1. Parse the HTML with BeautifulSoup and extract page metadata.
        2. Remove elements matching 'excluded_tags' / 'excluded_selector'.
        3. Optionally narrow the content to ``css_selector``.
        4. Clean the tree and collect links and media via
           ``process_element``.
        5. Score the images, flatten redundant nesting, blank out inline
           base64 image data and serialize the result.

    Args:
        url (str): The URL of the page to scrape.
        html (str): The HTML content of the page to scrape.
        word_count_threshold (int): Minimum word count an element needs in
            order to be kept.
        css_selector (str): CSS selector restricting the content to extract.
        **kwargs: Recognised keys: 'parser' (default 'lxml'),
            'excluded_tags', 'excluded_selector',
            'exclude_social_media_domains', 'exclude_domains',
            'exclude_social_media_links'; all kwargs are forwarded to
            ``process_element`` and ``process_image``.

    Returns:
        dict: None when ``html`` is empty. Otherwise a dictionary with the
        keys 'cleaned_html', 'success', 'media', 'links' and 'metadata'.
        When ``css_selector`` matches nothing, an empty result carrying an
        explanatory 'message' is returned instead.
    """
    success = True
    if not html:
        return None

    parser_type = kwargs.get('parser', 'lxml')
    soup = BeautifulSoup(html, parser_type)
    # Some parsers do not synthesize <body> for fragments; fall back to the
    # whole document instead of failing on None.
    body = soup.body if soup.body is not None else soup
    base_domain = get_base_domain(url)

    try:
        meta = extract_metadata("", soup)
    except Exception as e:
        self._log('error',
            message="Error extracting metadata: {error}",
            tag="SCRAPE",
            params={"error": str(e)}
        )
        meta = {}

    # Handle tag-based removal first - faster than CSS selection
    excluded_tags = set(kwargs.get('excluded_tags', []) or [])
    if excluded_tags:
        for element in body.find_all(lambda tag: tag.name in excluded_tags):
            element.extract()

    # Handle CSS selector-based removal
    excluded_selector = kwargs.get('excluded_selector', '')
    if excluded_selector:
        is_single_selector = ',' not in excluded_selector and ' ' not in excluded_selector
        if is_single_selector:
            while element := body.select_one(excluded_selector):
                element.extract()
        else:
            for element in body.select(excluded_selector):
                element.extract()

    if css_selector:
        selected_elements = body.select(css_selector)
        if not selected_elements:
            return {
                'markdown': '',
                'cleaned_html': '',
                'success': True,
                'media': {'images': [], 'videos': [], 'audios': []},
                'links': {'internal': [], 'external': []},
                'metadata': {},
                'message': f"No elements found for CSS selector: {css_selector}"
            }
        # Only the selected elements form the content from here on.
        body = soup.new_tag('div')
        for el in selected_elements:
            body.append(el)

    # Accept any iterable (or None) for the domain lists, not just lists.
    social_media_domains = set(kwargs.get('exclude_social_media_domains') or []) | set(SOCIAL_MEDIA_DOMAINS)
    kwargs['exclude_social_media_domains'] = social_media_domains
    kwargs['exclude_domains'] = set(kwargs.get('exclude_domains') or [])
    if kwargs.get('exclude_social_media_links', False):
        kwargs['exclude_domains'] = kwargs['exclude_domains'].union(social_media_domains)

    result_obj = self.process_element(
        url,
        body,
        word_count_threshold=word_count_threshold,
        base_domain=base_domain,
        **kwargs
    )

    media = result_obj['media']
    # Each href was collected once; expose the collected entries as lists.
    links = {
        'internal': list(result_obj['internal_links_dict'].values()),
        'external': list(result_obj['external_links_dict'].values()),
    }

    imgs = body.find_all('img')
    total_images = len(imgs)
    images = []
    for position, img in enumerate(imgs):
        # kwargs must be forwarded: process_image reads options such as
        # 'image_score_threshold' from them.
        variants = self.process_image(img, url, position, total_images, **kwargs)
        if variants is not None:
            images.extend(variants)
    media['images'] = images

    body = self.flatten_nested_elements(body)
    base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)')
    for img in imgs:
        src = img.get('src', '')
        if base64_pattern.match(src):
            # Replace base64 data with empty string
            img['src'] = base64_pattern.sub('', src)

    str_body = ""
    try:
        str_body = body.encode_contents().decode('utf-8')
    except Exception:
        # Reset body to the original HTML
        success = False
        body = BeautifulSoup(html, 'html.parser')

        # Create a new div with a special ID
        error_div = body.new_tag('div', id='crawl4ai_error_message')
        error_div.string = '''
        Crawl4AI Error: This page is not fully supported.

        Possible reasons:
        1. The page may have restrictions that prevent crawling.
        2. The page might not be fully loaded.

        Suggestions:
        - Try calling the crawl function with these parameters:
        magic=True,
        - Set headless=False to visualize what's happening on the page.

        If the issue persists, please check the page's structure and any potential anti-crawling measures.
        '''

        # Append the error div to the body (html.parser creates no <body>
        # for fragments, so fall back to the document itself).
        error_host = body.body if body.body is not None else body
        error_host.append(error_div)
        str_body = body.encode_contents().decode('utf-8')

        print(f"[LOG] 😧 Error: After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.")
        self._log('error',
            message="After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.",
            tag="SCRAPE"
        )

    # Collapse blank lines and doubled spaces. The second replace was written
    # as single-space -> single-space, which does nothing.
    cleaned_html = str_body.replace('\n\n', '\n').replace('  ', ' ')

    return {
        'cleaned_html': cleaned_html,
        'success': success,
        'media': media,
        'links': links,
        'metadata': meta
    }
|
||||
@@ -5,17 +5,58 @@ from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.common.exceptions import InvalidArgumentException
|
||||
from selenium.common.exceptions import InvalidArgumentException, WebDriverException
|
||||
# from selenium.webdriver.chrome.service import Service as ChromeService
|
||||
# from webdriver_manager.chrome import ChromeDriverManager
|
||||
# from urllib3.exceptions import MaxRetryError
|
||||
|
||||
from typing import List
|
||||
from .config import *
|
||||
import logging, time
|
||||
import base64
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
from io import BytesIO
|
||||
from typing import List, Callable
|
||||
import requests
|
||||
import os
|
||||
from pathlib import Path
|
||||
from .utils import *
|
||||
|
||||
# Selenium, urllib3 and http.client log a lot below WARNING; look up each of
# their loggers and raise the threshold to WARNING.
logger = logging.getLogger('selenium.webdriver.remote.remote_connection')
logger_driver = logging.getLogger('selenium.webdriver.common.service')
urllib3_logger = logging.getLogger('urllib3.connectionpool')
http_client_logger = logging.getLogger('http.client')
driver_finder_logger = logging.getLogger('selenium.webdriver.common.driver_finder')

for _noisy_logger in (
    logger,
    logger_driver,
    urllib3_logger,
    http_client_logger,
    driver_finder_logger,
):
    _noisy_logger.setLevel(logging.WARNING)
del _noisy_logger
|
||||
|
||||
|
||||
|
||||
|
||||
class CrawlerStrategy(ABC):
    """Abstract interface every crawler backend has to implement."""

    @abstractmethod
    def crawl(self, url: str, **kwargs) -> str:
        """Fetch ``url`` and return the page content as a string."""

    @abstractmethod
    def take_screenshot(self, save_path: str):
        """Take a screenshot; ``save_path`` names the target location."""

    @abstractmethod
    def update_user_agent(self, user_agent: str):
        """Switch the crawler to the given user agent string."""

    @abstractmethod
    def set_hook(self, hook_type: str, hook: Callable):
        """Register ``hook`` under the name ``hook_type``."""
|
||||
|
||||
class CloudCrawlerStrategy(CrawlerStrategy):
|
||||
def __init__(self, use_cached_html = False):
|
||||
@@ -33,60 +74,287 @@ class CloudCrawlerStrategy(CrawlerStrategy):
|
||||
response = requests.post("http://crawl4ai.uccode.io/crawl", json=data)
|
||||
response = response.json()
|
||||
html = response["results"][0]["html"]
|
||||
return html
|
||||
return sanitize_input_encode(html)
|
||||
|
||||
class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
||||
def __init__(self, use_cached_html=False, js_code=None):
|
||||
def __init__(self, use_cached_html=False, js_code=None, **kwargs):
|
||||
super().__init__()
|
||||
print("[LOG] 🚀 Initializing LocalSeleniumCrawlerStrategy")
|
||||
self.options = Options()
|
||||
self.options.headless = True
|
||||
if kwargs.get("proxy"):
|
||||
self.options.add_argument("--proxy-server={}".format(kwargs.get("proxy")))
|
||||
if kwargs.get("user_agent"):
|
||||
self.options.add_argument("--user-agent=" + kwargs.get("user_agent"))
|
||||
else:
|
||||
user_agent = kwargs.get("user_agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
|
||||
self.options.add_argument(f"--user-agent={user_agent}")
|
||||
self.options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
|
||||
|
||||
self.options.headless = kwargs.get("headless", True)
|
||||
if self.options.headless:
|
||||
self.options.add_argument("--headless")
|
||||
|
||||
self.options.add_argument("--disable-gpu")
|
||||
self.options.add_argument("--window-size=1920,1080")
|
||||
self.options.add_argument("--no-sandbox")
|
||||
self.options.add_argument("--disable-dev-shm-usage")
|
||||
self.options.add_argument("--disable-blink-features=AutomationControlled")
|
||||
|
||||
# self.options.add_argument("--disable-dev-shm-usage")
|
||||
self.options.add_argument("--disable-gpu")
|
||||
self.options.add_argument("--disable-extensions")
|
||||
self.options.add_argument("--headless")
|
||||
# self.options.add_argument("--disable-extensions")
|
||||
# self.options.add_argument("--disable-infobars")
|
||||
# self.options.add_argument("--disable-logging")
|
||||
# self.options.add_argument("--disable-popup-blocking")
|
||||
# self.options.add_argument("--disable-translate")
|
||||
# self.options.add_argument("--disable-default-apps")
|
||||
# self.options.add_argument("--disable-background-networking")
|
||||
# self.options.add_argument("--disable-sync")
|
||||
# self.options.add_argument("--disable-features=NetworkService,NetworkServiceInProcess")
|
||||
# self.options.add_argument("--disable-browser-side-navigation")
|
||||
# self.options.add_argument("--dns-prefetch-disable")
|
||||
# self.options.add_argument("--disable-web-security")
|
||||
self.options.add_argument("--log-level=3")
|
||||
self.use_cached_html = use_cached_html
|
||||
self.use_cached_html = use_cached_html
|
||||
self.js_code = js_code
|
||||
self.verbose = kwargs.get("verbose", False)
|
||||
|
||||
# Hooks
|
||||
self.hooks = {
|
||||
'on_driver_created': None,
|
||||
'on_user_agent_updated': None,
|
||||
'before_get_url': None,
|
||||
'after_get_url': None,
|
||||
'before_return_html': None
|
||||
}
|
||||
|
||||
# chromedriver_autoinstaller.install()
|
||||
import chromedriver_autoinstaller
|
||||
self.service = Service(chromedriver_autoinstaller.install())
|
||||
self.driver = webdriver.Chrome(service=self.service, options=self.options)
|
||||
# import chromedriver_autoinstaller
|
||||
# crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
|
||||
# driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=self.options)
|
||||
# chromedriver_path = chromedriver_autoinstaller.install()
|
||||
# chromedriver_path = chromedriver_autoinstaller.utils.download_chromedriver()
|
||||
# self.service = Service(chromedriver_autoinstaller.install())
|
||||
|
||||
|
||||
# chromedriver_path = ChromeDriverManager().install()
|
||||
# self.service = Service(chromedriver_path)
|
||||
# self.service.log_path = "NUL"
|
||||
# self.driver = webdriver.Chrome(service=self.service, options=self.options)
|
||||
|
||||
# Use selenium-manager (built into Selenium 4.10.0+)
|
||||
self.service = Service()
|
||||
self.driver = webdriver.Chrome(options=self.options)
|
||||
|
||||
self.driver = self.execute_hook('on_driver_created', self.driver)
|
||||
|
||||
if kwargs.get("cookies"):
|
||||
for cookie in kwargs.get("cookies"):
|
||||
self.driver.add_cookie(cookie)
|
||||
|
||||
|
||||
|
||||
def crawl(self, url: str) -> str:
|
||||
def set_hook(self, hook_type: str, hook: Callable):
    """Register ``hook`` as the callback for the event named ``hook_type``.

    Args:
        hook_type: Event name; must already be a key of ``self.hooks``.
        hook: Callable stored under that key.

    Raises:
        ValueError: If ``hook_type`` is not a key of ``self.hooks``.
    """
    if hook_type not in self.hooks:
        raise ValueError(f"Invalid hook type: {hook_type}")
    self.hooks[hook_type] = hook
|
||||
|
||||
def execute_hook(self, hook_type: str, *args):
    """Run the hook registered for ``hook_type`` and return the driver to use.

    Args:
        hook_type: Key looked up in ``self.hooks``.
        *args: Positional arguments forwarded unchanged to the hook.

    Returns:
        The hook's return value when it is a ``webdriver.Chrome`` instance;
        otherwise ``self.driver`` (no hook registered, or the hook returned
        ``None``).

    Raises:
        TypeError: If the hook returns something that is neither ``None`` nor
            a ``webdriver.Chrome`` instance.
    """
    hook = self.hooks.get(hook_type)
    if hook:
        result = hook(*args)
        if result is not None:
            if not isinstance(result, webdriver.Chrome):
                raise TypeError(f"Hook {hook_type} must return an instance of webdriver.Chrome or None.")
            return result
    # If the hook returns None or there is no hook, return self.driver
    return self.driver
|
||||
|
||||
def update_user_agent(self, user_agent: str):
    """Restart the Chrome driver with ``user_agent`` added to its options.

    The current driver is quit, a new ``webdriver.Chrome`` is created from
    ``self.service`` and ``self.options``, and the ``on_user_agent_updated``
    hook is run on the new driver.

    Args:
        user_agent: Value appended to the options as ``user-agent=<value>``.
    """
    # NOTE(review): this appends another user-agent argument; any one added
    # earlier to self.options is not removed here.
    self.options.add_argument(f"user-agent={user_agent}")
    self.driver.quit()
    self.driver = webdriver.Chrome(service=self.service, options=self.options)
    self.driver = self.execute_hook('on_user_agent_updated', self.driver)
|
||||
|
||||
def set_custom_headers(self, headers: dict):
    """Send ``headers`` to the driver as extra HTTP headers via CDP commands.

    Args:
        headers: Mapping passed as the ``headers`` field of the
            ``Network.setExtraHTTPHeaders`` command.
    """
    # Enable Network domain for sending headers
    self.driver.execute_cdp_cmd('Network.enable', {})
    # Set extra HTTP headers
    self.driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': headers})
|
||||
|
||||
def _ensure_page_load(self, max_checks=6, check_interval=0.01):
|
||||
initial_length = len(self.driver.page_source)
|
||||
|
||||
for ix in range(max_checks):
|
||||
# print(f"Checking page load: {ix}")
|
||||
time.sleep(check_interval)
|
||||
current_length = len(self.driver.page_source)
|
||||
|
||||
if current_length != initial_length:
|
||||
break
|
||||
|
||||
return self.driver.page_source
|
||||
|
||||
def crawl(self, url: str, **kwargs) -> str:
    """Load ``url`` in the Selenium driver and return the resulting HTML.

    When ``self.use_cached_html`` is set and a cache file for the URL exists,
    its content is returned without touching the browser. Otherwise the page
    is loaded, optional JavaScript is executed, an optional wait condition is
    awaited, and the HTML is written to the cache file before being returned.

    Args:
        url: Address to load.
        **kwargs: Optional settings read by this method:
            ``bypass_headless`` - truthy forces the non-headless retry;
            ``js_code`` - a script string or list of script strings that
            replaces ``self.js_code``;
            ``wait_for`` - a callable handed to ``WebDriverWait.until`` or a
            CSS selector whose presence is awaited.

    Returns:
        The page HTML.

    Raises:
        InvalidArgumentException: Re-raised with the URL in the message.
        WebDriverException: Re-raised with the URL in the message.
        Exception: Any other failure, re-raised with the URL in the message.
    """
    import hashlib

    # The MD5 digest is only used as a filesystem-safe cache file name.
    url_hash = hashlib.md5(url.encode()).hexdigest()
    cache_file_path = os.path.join(
        os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", url_hash
    )

    if self.use_cached_html and os.path.exists(cache_file_path):
        # Read with the same encoding the cache is written with below.
        with open(cache_file_path, "r", encoding="utf-8") as f:
            return sanitize_input_encode(f.read())

    try:
        self.driver = self.execute_hook('before_get_url', self.driver)
        if self.verbose:
            print(f"[LOG] 🕸️ Crawling {url} using LocalSeleniumCrawlerStrategy...")
        self.driver.get(url)

        WebDriverWait(self.driver, 20).until(
            lambda d: d.execute_script('return document.readyState') == 'complete'
        )
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_all_elements_located((By.TAG_NAME, "body"))
        )

        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        self.driver = self.execute_hook('after_get_url', self.driver)
        html = sanitize_input_encode(self._ensure_page_load())
        can_not_be_done_headless = False

        # TODO: Very ugly approach, but promise to change it!
        # An empty document is treated as "headless mode could not load it".
        if kwargs.get('bypass_headless', False) or html == "<html><head></head><body></body></html>":
            print("[LOG] 🙌 Page could not be loaded in headless mode. Trying non-headless mode...")
            can_not_be_done_headless = True
            options = Options()
            options.headless = False
            # set window size very small
            options.add_argument("--window-size=5,5")
            driver = webdriver.Chrome(service=self.service, options=options)
            try:
                driver.get(url)
                # NOTE(review): if the hook returns the temporary driver it is
                # stored on self.driver and then quit below - confirm intent.
                self.driver = self.execute_hook('after_get_url', driver)
                html = sanitize_input_encode(driver.page_source)
            finally:
                # Always release the temporary browser, even when loading fails.
                driver.quit()

        # Execute JS code if provided (a per-call value replaces the stored one)
        self.js_code = kwargs.get("js_code", self.js_code)
        if self.js_code and isinstance(self.js_code, str):
            self.driver.execute_script(self.js_code)
            # Wait for the document to be complete again after the script ran
            WebDriverWait(self.driver, 10).until(
                lambda driver: driver.execute_script("return document.readyState") == "complete"
            )
        elif self.js_code and isinstance(self.js_code, list):
            for js in self.js_code:
                self.driver.execute_script(js)
                WebDriverWait(self.driver, 10).until(
                    lambda driver: driver.execute_script("return document.readyState") == "complete"
                )

        # Optionally, wait for some condition after executing the JS code : Contributed by (https://github.com/jonymusky)
        wait_for = kwargs.get('wait_for', False)
        if wait_for:
            print("[LOG] 🔄 Waiting for condition...")
            if callable(wait_for):
                WebDriverWait(self.driver, 20).until(wait_for)
            else:
                WebDriverWait(self.driver, 20).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, wait_for))
                )

        # Keep the HTML captured by the non-headless retry; otherwise re-read
        # the page so the effects of the JS / wait step are included.
        if not can_not_be_done_headless:
            html = sanitize_input_encode(self.driver.page_source)
        self.driver = self.execute_hook('before_return_html', self.driver, html)

        # Store in cache
        os.makedirs(os.path.dirname(cache_file_path), exist_ok=True)
        with open(cache_file_path, "w", encoding="utf-8") as f:
            f.write(html)

        if self.verbose:
            print(f"[LOG] ✅ Crawled {url} successfully!")

        return html
    except InvalidArgumentException as e:
        if not hasattr(e, 'msg'):
            e.msg = sanitize_input_encode(str(e))
        raise InvalidArgumentException(f"Failed to crawl {url}: {e.msg}") from e
    except WebDriverException as e:
        # If e does not have a msg attribute, create it and set it to str(e)
        if not hasattr(e, 'msg'):
            e.msg = sanitize_input_encode(str(e))
        raise WebDriverException(f"Failed to crawl {url}: {e.msg}") from e
    except Exception as e:
        if not hasattr(e, 'msg'):
            e.msg = sanitize_input_encode(str(e))
        raise Exception(f"Failed to crawl {url}: {e.msg}") from e
|
||||
|
||||
def take_screenshot(self) -> str:
    """Capture the current page and return it as a base64-encoded JPEG.

    The browser window is resized to the document body's scroll width and
    height before capturing. If anything fails, the error is printed and a
    black 800x600 JPEG with the error text drawn on it is returned instead.

    Returns:
        Base64 text of the JPEG bytes (the screenshot, or the error image).
    """
    try:
        # Get the dimensions of the page
        total_width = self.driver.execute_script("return document.body.scrollWidth")
        total_height = self.driver.execute_script("return document.body.scrollHeight")

        # Set the window size to the dimensions of the page
        self.driver.set_window_size(total_width, total_height)

        # Take screenshot
        screenshot = self.driver.get_screenshot_as_png()

        # Open the screenshot with PIL
        image = Image.open(BytesIO(screenshot))

        # Convert image to RGB mode (this will handle both RGB and RGBA images)
        rgb_image = image.convert('RGB')

        # Convert to JPEG and compress
        buffered = BytesIO()
        rgb_image.save(buffered, format="JPEG", quality=85)
        img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')

        if self.verbose:
            print("[LOG] 📸 Screenshot taken and converted to base64")

        return img_base64
    except Exception as e:
        # Best-effort fallback: callers always get an image back.
        error_message = sanitize_input_encode(f"Failed to take screenshot: {str(e)}")
        print(error_message)

        # Generate an image with black background
        img = Image.new('RGB', (800, 600), color='black')
        draw = ImageDraw.Draw(img)

        # Load a font, falling back to PIL's default when arial.ttf is missing
        try:
            font = ImageFont.truetype("arial.ttf", 40)
        except IOError:
            font = ImageFont.load_default()

        # Define text color and wrap the text
        text_color = (255, 255, 255)
        max_width = 780
        wrapped_text = wrap_text(draw, error_message, font, max_width)

        # Calculate text position
        text_position = (10, 10)

        # Draw the text on the image
        draw.text(text_position, wrapped_text, fill=text_color, font=font)

        # Convert to base64
        buffered = BytesIO()
        img.save(buffered, format="JPEG")
        img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')

        return img_base64
|
||||
|
||||
def quit(self):
    """Quit the underlying driver by calling ``self.driver.quit()`` once."""
    self.driver.quit()
|
||||
|
||||
@@ -1,13 +1,12 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
import sqlite3
|
||||
from typing import Optional
|
||||
from typing import Optional, Tuple
|
||||
|
||||
# Directory that holds the database: "<base>/.crawl4ai", where <base> is the
# CRAWL4_AI_BASE_DIRECTORY environment variable, or the home directory when
# the variable is unset. The directory is created at import time.
DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
os.makedirs(DB_PATH, exist_ok=True)
# From here on DB_PATH names the SQLite database file inside that directory.
DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")
|
||||
|
||||
|
||||
def init_db():
|
||||
global DB_PATH
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
@@ -19,22 +18,37 @@ def init_db():
|
||||
cleaned_html TEXT,
|
||||
markdown TEXT,
|
||||
extracted_content TEXT,
|
||||
success BOOLEAN
|
||||
success BOOLEAN,
|
||||
media TEXT DEFAULT "{}",
|
||||
links TEXT DEFAULT "{}",
|
||||
metadata TEXT DEFAULT "{}",
|
||||
screenshot TEXT DEFAULT ""
|
||||
)
|
||||
''')
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
def check_db_path():
|
||||
if not DB_PATH:
|
||||
raise ValueError("Database path is not set or is empty.")
|
||||
|
||||
def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool]]:
|
||||
def alter_db_add_screenshot(new_column: str = "media"):
    """Add a ``TEXT DEFAULT ""`` column named ``new_column`` to ``crawled_data``.

    Failures (including an invalid column name) are printed, not raised.

    Args:
        new_column: Name of the column to add.
    """
    check_db_path()
    try:
        # A column name cannot be bound as a SQL parameter, so it is validated
        # before being interpolated into the statement.
        if not new_column.isidentifier():
            raise ValueError(f"Invalid column name: {new_column!r}")
        conn = sqlite3.connect(DB_PATH)
        try:
            cursor = conn.cursor()
            cursor.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
            conn.commit()
        finally:
            # Release the connection even when the statement fails.
            conn.close()
    except Exception as e:
        print(f"Error altering database to add screenshot column: {e}")
|
||||
|
||||
def check_db_path():
    """Ensure the module-level ``DB_PATH`` is set.

    Raises:
        ValueError: If ``DB_PATH`` is empty or otherwise falsy.
    """
    if not DB_PATH:
        raise ValueError("Database path is not set or is empty.")
|
||||
|
||||
def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, str, str, bool, str]]:
|
||||
check_db_path()
|
||||
try:
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot FROM crawled_data WHERE url = ?', (url,))
|
||||
result = cursor.fetchone()
|
||||
conn.close()
|
||||
return result
|
||||
@@ -42,21 +56,25 @@ def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool]]:
|
||||
print(f"Error retrieving cached URL: {e}")
|
||||
return None
|
||||
|
||||
def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool):
|
||||
def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media : str = "{}", links : str = "{}", metadata : str = "{}", screenshot: str = ""):
|
||||
check_db_path()
|
||||
try:
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('''
|
||||
INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success)
|
||||
VALUES (?, ?, ?, ?, ?, ?)
|
||||
INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
ON CONFLICT(url) DO UPDATE SET
|
||||
html = excluded.html,
|
||||
cleaned_html = excluded.cleaned_html,
|
||||
markdown = excluded.markdown,
|
||||
extracted_content = excluded.extracted_content,
|
||||
success = excluded.success
|
||||
''', (url, html, cleaned_html, markdown, extracted_content, success))
|
||||
success = excluded.success,
|
||||
media = excluded.media,
|
||||
links = excluded.links,
|
||||
metadata = excluded.metadata,
|
||||
screenshot = excluded.screenshot
|
||||
''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot))
|
||||
conn.commit()
|
||||
conn.close()
|
||||
except Exception as e:
|
||||
@@ -95,4 +113,23 @@ def flush_db():
|
||||
conn.commit()
|
||||
conn.close()
|
||||
except Exception as e:
|
||||
print(f"Error flushing database: {e}")
|
||||
print(f"Error flushing database: {e}")
|
||||
|
||||
def update_existing_records(new_column: str = "media", default_value: str = "{}"):
    """Set ``new_column`` to ``default_value`` on rows whose screenshot is NULL.

    Failures (including an invalid column name) are printed, not raised.

    Args:
        new_column: Column of ``crawled_data`` to update.
        default_value: Value written into that column.
    """
    check_db_path()
    try:
        # A column name cannot be bound as a SQL parameter, so it is validated
        # before being interpolated; the value itself is bound with "?".
        if not new_column.isidentifier():
            raise ValueError(f"Invalid column name: {new_column!r}")
        conn = sqlite3.connect(DB_PATH)
        try:
            cursor = conn.cursor()
            # NOTE(review): the filter is on `screenshot` whatever `new_column`
            # is - possibly meant to be `{new_column} IS NULL`; confirm.
            cursor.execute(
                f'UPDATE crawled_data SET {new_column} = ? WHERE screenshot IS NULL',
                (default_value,),
            )
            conn.commit()
        finally:
            # Release the connection even when the statement fails.
            conn.close()
    except Exception as e:
        print(f"Error updating existing records: {e}")
|
||||
|
||||
if __name__ == "__main__":
    # Running this module as a script resets the cache: the existing database
    # file is deleted and an empty one is created by init_db().
    if os.path.exists(DB_PATH):
        os.remove(DB_PATH)
    init_db()
    # alter_db_add_screenshot("COL_NAME")
|
||||
|
||||
|
||||
67
crawl4ai/docs_manager.py
Normal file
@@ -0,0 +1,67 @@
|
||||
import requests
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
from crawl4ai.llmtxt import AsyncLLMTextManager
|
||||
|
||||
class DocsManager:
    """Keep a copy of the project's ``llm.txt`` markdown docs in the user's
    ``~/.crawl4ai/docs`` directory and expose listing, generation and search.

    Generation and search are delegated to an ``AsyncLLMTextManager`` built on
    that directory.
    """

    def __init__(self, logger=None):
        """Create the docs directory if needed and set up the helpers.

        Args:
            logger: Logger to use; when falsy an ``AsyncLogger(verbose=True)``
                is created.
        """
        self.docs_dir = Path.home() / ".crawl4ai" / "docs"
        # Docs shipped next to the package: <package parent>/docs/llm.txt
        self.local_docs = Path(__file__).parent.parent / "docs" / "llm.txt"
        self.docs_dir.mkdir(parents=True, exist_ok=True)
        self.logger = logger or AsyncLogger(verbose=True)
        self.llm_text = AsyncLLMTextManager(self.docs_dir, self.logger)

    async def ensure_docs_exist(self):
        """Fetch docs if the docs directory is empty."""
        if not any(self.docs_dir.iterdir()):
            await self.fetch_docs()

    async def fetch_docs(self) -> bool:
        """Copy docs from the local checkout, or download them from GitHub.

        Returns:
            ``True`` once the docs have been copied or downloaded.

        Raises:
            Exception: Any failure is logged through ``self.logger`` and then
                re-raised unchanged.
        """
        try:
            # Try local first
            if self.local_docs.exists() and (
                any(self.local_docs.glob("*.md")) or any(self.local_docs.glob("*.tokens"))
            ):
                # Replace the markdown files in the user's docs directory.
                # Only *.md files are removed and copied; *.tokens files are
                # left untouched.
                for file_path in self.docs_dir.glob("*.md"):
                    file_path.unlink()
                for file_path in self.local_docs.glob("*.md"):
                    shutil.copy2(file_path, self.docs_dir / file_path.name)
                return True

            # Fallback to GitHub
            # NOTE: requests is blocking, so this coroutine does not yield
            # while downloading.
            response = requests.get(
                "https://api.github.com/repos/unclecode/crawl4ai/contents/docs/llm.txt",
                headers={'Accept': 'application/vnd.github.v3+json'},
                timeout=30,
            )
            response.raise_for_status()

            for item in response.json():
                if item['type'] == 'file' and item['name'].endswith('.md'):
                    download = requests.get(item['download_url'], timeout=30)
                    # Do not store an HTTP error page as documentation.
                    download.raise_for_status()
                    with open(self.docs_dir / item['name'], 'w', encoding='utf-8') as f:
                        f.write(download.text)
            return True

        except Exception as e:
            self.logger.error(f"Failed to fetch docs: {str(e)}")
            raise

    # NOTE: this method is named ``list``; inside the class body, below this
    # definition, the bare name ``list`` refers to the method, not the builtin.
    def list(self) -> list[str]:
        """List available topics.

        Returns:
            Stems of the ``*.md`` files in the docs directory, with a leading
            numeric ``<prefix>_`` removed and the ``.xs`` / ``.q`` variants
            left out.
        """
        names = [file_path.stem for file_path in self.docs_dir.glob("*.md")]
        # Remove [0-9]+_ prefix; a digit-leading name without "_" is kept as is
        names = [
            name.split("_", 1)[1] if name[0].isdigit() and "_" in name else name
            for name in names
        ]
        # Exclude those end with .xs.md and .q.md
        names = [name for name in names if not name.endswith(".xs") and not name.endswith(".q")]
        return names

    def generate(self, sections, mode="extended"):
        """Delegate to ``self.llm_text.generate(sections, mode)``."""
        return self.llm_text.generate(sections, mode)

    def search(self, query: str, top_k: int = 5):
        """Delegate to ``self.llm_text.search(query, top_k)``."""
        return self.llm_text.search(query, top_k)
|
||||
1440
crawl4ai/extraction_strategy.bak.py
Normal file
1141
crawl4ai/html2text/__init__.py
Normal file
3
crawl4ai/html2text/__main__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from .cli import main
|
||||
|
||||
main()
|
||||
2
crawl4ai/html2text/_typing.py
Normal file
@@ -0,0 +1,2 @@
|
||||
class OutCallback:
    """Typing helper: a callable that takes one string and returns ``None``."""

    def __call__(self, s: str) -> None: ...
|
||||
330
crawl4ai/html2text/cli.py
Normal file
@@ -0,0 +1,330 @@
|
||||
import argparse
|
||||
import sys
|
||||
|
||||
from . import HTML2Text, __version__, config
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: convert an HTML document to text.

    Parses the command-line options, reads the input bytes from the given
    file (or from standard input when no file name or ``-`` is given), decodes
    them with the requested encoding and error policy, configures an
    ``HTML2Text`` instance from the options, and writes the converted text to
    standard output.

    Raises:
        UnicodeDecodeError: If decoding fails; a hint about
            ``--decode-errors=ignore`` is printed first.
    """
    baseurl = ""

    class bcolors:
        # ANSI escape sequences used to colour the decode-error hint.
        HEADER = "\033[95m"
        OKBLUE = "\033[94m"
        OKGREEN = "\033[92m"
        WARNING = "\033[93m"
        FAIL = "\033[91m"
        ENDC = "\033[0m"
        BOLD = "\033[1m"
        UNDERLINE = "\033[4m"

    p = argparse.ArgumentParser()
    p.add_argument(
        "--default-image-alt",
        dest="default_image_alt",
        default=config.DEFAULT_IMAGE_ALT,
        help="The default alt string for images with missing ones",
    )
    p.add_argument(
        "--pad-tables",
        dest="pad_tables",
        action="store_true",
        default=config.PAD_TABLES,
        help="pad the cells to equal column width in tables",
    )
    p.add_argument(
        "--no-wrap-links",
        dest="wrap_links",
        action="store_false",
        default=config.WRAP_LINKS,
        help="don't wrap links during conversion",
    )
    p.add_argument(
        "--wrap-list-items",
        dest="wrap_list_items",
        action="store_true",
        default=config.WRAP_LIST_ITEMS,
        help="wrap list items during conversion",
    )
    p.add_argument(
        "--wrap-tables",
        dest="wrap_tables",
        action="store_true",
        default=config.WRAP_TABLES,
        help="wrap tables",
    )
    p.add_argument(
        "--ignore-emphasis",
        dest="ignore_emphasis",
        action="store_true",
        default=config.IGNORE_EMPHASIS,
        help="don't include any formatting for emphasis",
    )
    p.add_argument(
        "--reference-links",
        dest="inline_links",
        action="store_false",
        default=config.INLINE_LINKS,
        help="use reference style links instead of inline links",
    )
    p.add_argument(
        "--ignore-links",
        dest="ignore_links",
        action="store_true",
        default=config.IGNORE_ANCHORS,
        help="don't include any formatting for links",
    )
    p.add_argument(
        "--ignore-mailto-links",
        action="store_true",
        dest="ignore_mailto_links",
        default=config.IGNORE_MAILTO_LINKS,
        help="don't include mailto: links",
    )
    p.add_argument(
        "--protect-links",
        dest="protect_links",
        action="store_true",
        default=config.PROTECT_LINKS,
        help="protect links from line breaks surrounding them with angle brackets",
    )
    p.add_argument(
        "--ignore-images",
        dest="ignore_images",
        action="store_true",
        default=config.IGNORE_IMAGES,
        help="don't include any formatting for images",
    )
    p.add_argument(
        "--images-as-html",
        dest="images_as_html",
        action="store_true",
        default=config.IMAGES_AS_HTML,
        help=(
            "Always write image tags as raw html; preserves `height`, `width` and "
            "`alt` if possible."
        ),
    )
    p.add_argument(
        "--images-to-alt",
        dest="images_to_alt",
        action="store_true",
        default=config.IMAGES_TO_ALT,
        help="Discard image data, only keep alt text",
    )
    p.add_argument(
        "--images-with-size",
        dest="images_with_size",
        action="store_true",
        default=config.IMAGES_WITH_SIZE,
        help=(
            "Write image tags with height and width attrs as raw html to retain "
            "dimensions"
        ),
    )
    p.add_argument(
        "-g",
        "--google-doc",
        action="store_true",
        dest="google_doc",
        default=False,
        help="convert an html-exported Google Document",
    )
    p.add_argument(
        "-d",
        "--dash-unordered-list",
        action="store_true",
        dest="ul_style_dash",
        default=False,
        help="use a dash rather than a star for unordered list items",
    )
    p.add_argument(
        "-e",
        "--asterisk-emphasis",
        action="store_true",
        dest="em_style_asterisk",
        default=False,
        help="use an asterisk rather than an underscore for emphasized text",
    )
    p.add_argument(
        "-b",
        "--body-width",
        dest="body_width",
        type=int,
        default=config.BODY_WIDTH,
        help="number of characters per output line, 0 for no wrap",
    )
    p.add_argument(
        "-i",
        "--google-list-indent",
        dest="list_indent",
        type=int,
        default=config.GOOGLE_LIST_INDENT,
        help="number of pixels Google indents nested lists",
    )
    p.add_argument(
        "-s",
        "--hide-strikethrough",
        action="store_true",
        dest="hide_strikethrough",
        default=False,
        help="hide strike-through text. only relevant when -g is specified as well",
    )
    p.add_argument(
        "--escape-all",
        action="store_true",
        dest="escape_snob",
        default=False,
        help=(
            "Escape all special characters. Output is less readable, but avoids "
            "corner case formatting issues."
        ),
    )
    p.add_argument(
        "--bypass-tables",
        action="store_true",
        dest="bypass_tables",
        default=config.BYPASS_TABLES,
        help="Format tables in HTML rather than Markdown syntax.",
    )
    p.add_argument(
        "--ignore-tables",
        action="store_true",
        dest="ignore_tables",
        default=config.IGNORE_TABLES,
        help="Ignore table-related tags (table, th, td, tr) while keeping rows.",
    )
    p.add_argument(
        "--single-line-break",
        action="store_true",
        dest="single_line_break",
        default=config.SINGLE_LINE_BREAK,
        help=(
            "Use a single line break after a block element rather than two line "
            "breaks. NOTE: Requires --body-width=0"
        ),
    )
    p.add_argument(
        "--unicode-snob",
        action="store_true",
        dest="unicode_snob",
        default=config.UNICODE_SNOB,
        help="Use unicode throughout document",
    )
    p.add_argument(
        "--no-automatic-links",
        action="store_false",
        dest="use_automatic_links",
        default=config.USE_AUTOMATIC_LINKS,
        help="Do not use automatic links wherever applicable",
    )
    p.add_argument(
        "--no-skip-internal-links",
        action="store_false",
        dest="skip_internal_links",
        default=config.SKIP_INTERNAL_LINKS,
        help="Do not skip internal links",
    )
    p.add_argument(
        "--links-after-para",
        action="store_true",
        dest="links_each_paragraph",
        default=config.LINKS_EACH_PARAGRAPH,
        help="Put links after each paragraph instead of document",
    )
    p.add_argument(
        "--mark-code",
        action="store_true",
        dest="mark_code",
        default=config.MARK_CODE,
        help="Mark program code blocks with [code]...[/code]",
    )
    p.add_argument(
        "--decode-errors",
        dest="decode_errors",
        default=config.DECODE_ERRORS,
        help=(
            "What to do in case of decode errors.'ignore', 'strict' and 'replace' are "
            "acceptable values"
        ),
    )
    p.add_argument(
        "--open-quote",
        dest="open_quote",
        default=config.OPEN_QUOTE,
        help="The character used to open quotes",
    )
    p.add_argument(
        "--close-quote",
        dest="close_quote",
        default=config.CLOSE_QUOTE,
        help="The character used to close quotes",
    )
    p.add_argument(
        "--version", action="version", version=".".join(map(str, __version__))
    )
    p.add_argument("filename", nargs="?")
    p.add_argument("encoding", nargs="?", default="utf-8")
    p.add_argument(
        "--include-sup-sub",
        dest="include_sup_sub",
        action="store_true",
        default=config.INCLUDE_SUP_SUB,
        help="Include the sup and sub tags",
    )
    args = p.parse_args()

    # Input is read as bytes so the requested encoding can be applied below.
    if args.filename and args.filename != "-":
        with open(args.filename, "rb") as fp:
            data = fp.read()
    else:
        data = sys.stdin.buffer.read()

    try:
        html = data.decode(args.encoding, args.decode_errors)
    except UnicodeDecodeError as err:
        warning = bcolors.WARNING + "Warning:" + bcolors.ENDC
        warning += " Use the " + bcolors.OKGREEN
        warning += "--decode-errors=ignore" + bcolors.ENDC + " flag."
        print(warning)
        raise err

    h = HTML2Text(baseurl=baseurl)
    # handle options
    if args.ul_style_dash:
        h.ul_item_mark = "-"
    if args.em_style_asterisk:
        h.emphasis_mark = "*"
        h.strong_mark = "__"

    h.body_width = args.body_width
    h.google_list_indent = args.list_indent
    h.ignore_emphasis = args.ignore_emphasis
    h.ignore_links = args.ignore_links
    h.ignore_mailto_links = args.ignore_mailto_links
    h.protect_links = args.protect_links
    h.ignore_images = args.ignore_images
    h.images_as_html = args.images_as_html
    h.images_to_alt = args.images_to_alt
    h.images_with_size = args.images_with_size
    h.google_doc = args.google_doc
    h.hide_strikethrough = args.hide_strikethrough
    h.escape_snob = args.escape_snob
    h.bypass_tables = args.bypass_tables
    h.ignore_tables = args.ignore_tables
    h.single_line_break = args.single_line_break
    h.inline_links = args.inline_links
    h.unicode_snob = args.unicode_snob
    h.use_automatic_links = args.use_automatic_links
    h.skip_internal_links = args.skip_internal_links
    h.links_each_paragraph = args.links_each_paragraph
    h.mark_code = args.mark_code
    h.wrap_links = args.wrap_links
    h.wrap_list_items = args.wrap_list_items
    h.wrap_tables = args.wrap_tables
    h.pad_tables = args.pad_tables
    h.default_image_alt = args.default_image_alt
    h.open_quote = args.open_quote
    h.close_quote = args.close_quote
    h.include_sup_sub = args.include_sup_sub

    sys.stdout.write(h.handle(html))
|
||||
172
crawl4ai/html2text/config.py
Normal file
@@ -0,0 +1,172 @@
|
||||
import re
|
||||
|
||||
# Use Unicode characters instead of their ascii pseudo-replacements
UNICODE_SNOB = False

# Marker to use for marking tables for padding post processing
TABLE_MARKER_FOR_PAD = "special_marker_for_table_padding"
# Escape all special characters. Output is less readable, but avoids
# corner case formatting issues.
ESCAPE_SNOB = False
ESCAPE_BACKSLASH = False
ESCAPE_DOT = False
ESCAPE_PLUS = False
ESCAPE_DASH = False

# Put the links after each paragraph instead of at the end.
LINKS_EACH_PARAGRAPH = False

# Wrap long lines at position. 0 for no wrapping.
BODY_WIDTH = 78

# Don't show internal links (href="#local-anchor") -- corresponding link
# targets won't be visible in the plain text file anyway.
SKIP_INTERNAL_LINKS = True

# Use inline, rather than reference, formatting for images and links
INLINE_LINKS = True

# Protect links from line breaks surrounding them with angle brackets (in
# addition to their square brackets)
PROTECT_LINKS = False
# Allow links to be wrapped during conversion
WRAP_LINKS = True

# Wrap list items.
WRAP_LIST_ITEMS = False

# Wrap tables
WRAP_TABLES = False

# Number of pixels Google indents nested lists
GOOGLE_LIST_INDENT = 36

# Values Google and others may use to indicate bold text
BOLD_TEXT_STYLE_VALUES = ("bold", "700", "800", "900")

IGNORE_ANCHORS = False
IGNORE_MAILTO_LINKS = False
IGNORE_IMAGES = False
IMAGES_AS_HTML = False
IMAGES_TO_ALT = False
IMAGES_WITH_SIZE = False
IGNORE_EMPHASIS = False
MARK_CODE = False
DECODE_ERRORS = "strict"
DEFAULT_IMAGE_ALT = ""
PAD_TABLES = False

# Convert links with same href and text to <href> format
# if they are absolute links
USE_AUTOMATIC_LINKS = True

# For checking space-only lines on line 771
# NOTE(review): as written this matches one whitespace character followed by
# a literal "+", not a run of whitespace. Left unchanged on purpose: callers
# may depend on the current matching behaviour - confirm before "fixing".
RE_SPACE = re.compile(r"\s\+")

RE_ORDERED_LIST_MATCHER = re.compile(r"\d+\.\s")
RE_UNORDERED_LIST_MATCHER = re.compile(r"[-\*\+]\s")
RE_MD_CHARS_MATCHER = re.compile(r"([\\\[\]\(\)])")
RE_MD_CHARS_MATCHER_ALL = re.compile(r"([`\*_{}\[\]\(\)#!])")

# to find links in the text
RE_LINK = re.compile(r"(\[.*?\] ?\(.*?\))|(\[.*?\]:.*?)")

# to find table separators
RE_TABLE = re.compile(r" \| ")

RE_MD_DOT_MATCHER = re.compile(
    r"""
    ^             # start of line
    (\s*\d+)      # optional whitespace and a number
    (\.)          # dot
    (?=\s)        # lookahead assert whitespace
    """,
    re.MULTILINE | re.VERBOSE,
)
RE_MD_PLUS_MATCHER = re.compile(
    r"""
    ^
    (\s*)
    (\+)
    (?=\s)
    """,
    flags=re.MULTILINE | re.VERBOSE,
)
RE_MD_DASH_MATCHER = re.compile(
    r"""
    ^
    (\s*)
    (-)
    (?=\s|\-)     # followed by whitespace (bullet list, or spaced out hr)
                  # or another dash (header or hr)
    """,
    flags=re.MULTILINE | re.VERBOSE,
)
RE_SLASH_CHARS = r"\`*_{}[]()#+-.!"
RE_MD_BACKSLASH_MATCHER = re.compile(
    r"""
    (\\)          # match one slash
    (?=[%s])      # followed by a char that requires escaping
    """
    % re.escape(RE_SLASH_CHARS),
    flags=re.VERBOSE,
)

# HTML entity name -> plain-text replacement
UNIFIABLE = {
    "rsquo": "'",
    "lsquo": "'",
    "rdquo": '"',
    "ldquo": '"',
    "copy": "(C)",
    "mdash": "--",
    "nbsp": " ",
    "rarr": "->",
    "larr": "<-",
    "middot": "*",
    "ndash": "-",
    "oelig": "oe",
    "aelig": "ae",
    "agrave": "a",
    "aacute": "a",
    "acirc": "a",
    "atilde": "a",
    "auml": "a",
    "aring": "a",
    "egrave": "e",
    "eacute": "e",
    "ecirc": "e",
    "euml": "e",
    "igrave": "i",
    "iacute": "i",
    "icirc": "i",
    "iuml": "i",
    "ograve": "o",
    "oacute": "o",
    "ocirc": "o",
    "otilde": "o",
    "ouml": "o",
    "ugrave": "u",
    "uacute": "u",
    "ucirc": "u",
    "uuml": "u",
    "lrm": "",
    "rlm": "",
}

# Format tables in HTML rather than Markdown syntax
BYPASS_TABLES = False
# Ignore table-related tags (table, th, td, tr) while keeping rows
IGNORE_TABLES = False


# Use a single line break after a block element rather than two line breaks.
# NOTE: Requires body width setting to be 0.
SINGLE_LINE_BREAK = False


# Use double quotation marks when converting the <q> tag.
OPEN_QUOTE = '"'
CLOSE_QUOTE = '"'

# Include the <sup> and <sub> tags
INCLUDE_SUP_SUB = False
|
||||
18
crawl4ai/html2text/elements.py
Normal file
@@ -0,0 +1,18 @@
|
||||
from typing import Dict, Optional
|
||||
|
||||
|
||||
class AnchorElement:
    """Record of an anchor tag: its attributes plus two integer counters."""

    __slots__ = ["attrs", "count", "outcount"]

    def __init__(self, attrs: Dict[str, Optional[str]], count: int, outcount: int):
        # Plain value holder; the three fields are stored unchanged.
        self.attrs, self.count, self.outcount = attrs, count, outcount
|
||||
|
||||
|
||||
class ListElement:
    """Record of a list: its tag name and a running item number."""

    __slots__ = ["name", "num"]

    def __init__(self, name: str, num: int):
        # Plain value holder; both fields are stored unchanged.
        self.name, self.num = name, num
|
||||
303
crawl4ai/html2text/utils.py
Normal file
@@ -0,0 +1,303 @@
|
||||
import html.entities
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from . import config
|
||||
|
||||
# Same replacements as config.UNIFIABLE, keyed by Unicode code point instead
# of entity name; the "nbsp" entry is left out.
unifiable_n = dict(
    (html.entities.name2codepoint[entity_name], replacement)
    for entity_name, replacement in config.UNIFIABLE.items()
    if entity_name != "nbsp"
)
|
||||
|
||||
|
||||
def hn(tag: str) -> int:
    """
    Return the heading level of an ``h1``..``h9`` tag name.

    :param tag: tag name, e.g. ``"h2"``
    :returns: the level (1-9), or 0 when *tag* is not a heading tag
    """
    # Check the length first so an empty tag name cannot raise IndexError.
    if len(tag) == 2 and tag[0] == "h":
        level = tag[1]
        if "0" < level <= "9":
            return int(level)
    return 0
|
||||
|
||||
|
||||
def dumb_property_dict(style: str) -> Dict[str, str]:
    """
    Parse a ``name: value; name: value`` declaration string.

    :returns: A hash of css attributes (names and values stripped and
        lower-cased); declarations without a ``:`` are ignored
    """
    properties: Dict[str, str] = {}
    for declaration in style.split(";"):
        if ":" not in declaration:
            continue
        # Split on the first colon only so values may contain colons.
        name, value = declaration.split(":", 1)
        properties[name.strip().lower()] = value.strip().lower()
    return properties
|
||||
|
||||
|
||||
def dumb_css_parser(data: str) -> Dict[str, Dict[str, str]]:
    """
    Parse a stylesheet into per-selector property dictionaries.

    :type data: str

    :returns: A hash of css selectors, each of which contains a hash of
        css attributes.  An empty hash is returned when a rule cannot be
        split into exactly one selector and one body.
    :rtype: dict
    """
    # Strip every "@import ...;" statement.  The appended ";" guarantees
    # that the terminator search below always succeeds.
    data += ";"
    while True:
        start = data.find("@import")
        if start == -1:
            break
        end = data.find(";", start)
        data = data[:start] + data[end + 1 :]

    # Each "selector { body" chunk becomes a [selector, body] pair.
    rules = [chunk.split("{") for chunk in data.split("}") if "{" in chunk.strip()]
    try:
        return {selector.strip(): dumb_property_dict(body) for selector, body in rules}
    except ValueError:
        # A chunk with more than one "{" cannot be unpacked; not that important.
        return {}
|
||||
|
||||
|
||||
def element_style(
    attrs: Dict[str, Optional[str]],
    style_def: Dict[str, Dict[str, str]],
    parent_style: Dict[str, str],
) -> Dict[str, str]:
    """
    Compute the effective style of an element.

    Starts from a copy of the parent style, then applies the rules of each
    CSS class (looked up as ``"." + class``) and finally the inline
    ``style`` attribute, later sources overriding earlier ones.

    :type attrs: dict
    :type style_def: dict
    :type parent_style: dict

    :returns: A hash of the 'final' style attributes of the element
    :rtype: dict
    """
    style = parent_style.copy()

    # A valueless attribute (e.g. ``<div class>``) is reported as None;
    # treat it as absent instead of failing an assertion.
    css_classes = attrs.get("class")
    if css_classes is not None:
        for css_class in css_classes.split():
            style.update(style_def.get("." + css_class, {}))

    inline_style = attrs.get("style")
    if inline_style is not None:
        style.update(dumb_property_dict(inline_style))

    return style
|
||||
|
||||
|
||||
def google_list_style(style: Dict[str, str]) -> str:
    """
    Classify a list as ordered or unordered from its ``list-style-type``.

    :type style: dict

    :returns: ``"ul"`` for disc/circle/square/none, ``"ol"`` otherwise
        (including when the property is missing)
    :rtype: str
    """
    unordered_markers = ("disc", "circle", "square", "none")
    is_unordered = style.get("list-style-type") in unordered_markers
    return "ul" if is_unordered else "ol"
|
||||
|
||||
|
||||
def google_has_height(style: Dict[str, str]) -> bool:
    """
    Tell whether the element style explicitly defines ``height``.

    :type style: dict

    :rtype: bool
    """
    height_defined = "height" in style
    return height_defined
|
||||
|
||||
|
||||
def google_text_emphasis(style: Dict[str, str]) -> List[str]:
    """
    Collect the emphasis-related values of a style.

    :type style: dict

    :returns: A list of all emphasis modifiers of the element: the values
        of text-decoration, font-style and font-weight, in that order,
        for the properties that are present
    :rtype: list
    """
    emphasis_properties = ("text-decoration", "font-style", "font-weight")
    return [style[prop] for prop in emphasis_properties if prop in style]
|
||||
|
||||
|
||||
def google_fixed_width_font(style: Dict[str, str]) -> bool:
    """
    Tell whether the style's ``font-family`` is a known fixed-width font.

    :type style: dict

    :returns: True when font-family is exactly "courier new" or "consolas"
    :rtype: bool
    """
    return style.get("font-family", "") in ("courier new", "consolas")
|
||||
|
||||
|
||||
def list_numbering_start(attrs: Dict[str, Optional[str]]) -> int:
    """
    Extract numbering from list element attributes

    :type attrs: dict

    :returns: the ``start`` attribute minus one, or 0 when the attribute
        is missing, has no value, or is not an integer
    :rtype: int
    """
    # A valueless attribute (``<ol start>``) is reported as None; treat it
    # like a missing attribute instead of failing an assertion.
    start = attrs.get("start")
    if start is None:
        return 0
    try:
        return int(start) - 1
    except ValueError:
        return 0
|
||||
|
||||
|
||||
def skipwrap(
    para: str, wrap_links: bool, wrap_list_items: bool, wrap_tables: bool
) -> bool:
    """
    Decide whether a paragraph must be left unwrapped.

    :param para: the paragraph text
    :param wrap_links: when False, paragraphs matching config.RE_LINK are
        not wrapped
    :param wrap_list_items: when False, paragraphs starting with ``-`` or
        ``*`` are not wrapped
    :param wrap_tables: when False, paragraphs matching config.RE_TABLE
        are not wrapped
    :returns: True when the paragraph should NOT be wrapped
    """
    # If it appears to contain a link
    # don't wrap
    if not wrap_links and config.RE_LINK.search(para):
        return True
    # If the text begins with four spaces or one tab, it's a code block;
    # don't wrap.  Slices are used so an empty paragraph cannot raise.
    if para[0:4] == "    " or para[0:1] == "\t":
        return True

    # If the text begins with only two "--", possibly preceded by
    # whitespace, that's an emdash; so wrap.
    stripped = para.lstrip()
    if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
        return False

    # I'm not sure what this is for; I thought it was to detect lists,
    # but there's a <br>-inside-<span> case in one of the tests that
    # also depends upon it.
    if stripped[0:1] in ("-", "*") and not stripped[0:2] == "**":
        return not wrap_list_items

    # If text contains a pipe character it is likely a table
    if not wrap_tables and config.RE_TABLE.search(para):
        return True

    # If the text begins with a single -, *, or +, followed by a space,
    # or an integer, followed by a ., followed by a space (in either
    # case optionally preceded by whitespace), it's a list; don't wrap.
    return bool(
        config.RE_ORDERED_LIST_MATCHER.match(stripped)
        or config.RE_UNORDERED_LIST_MATCHER.match(stripped)
    )
|
||||
|
||||
|
||||
def escape_md(text: str) -> str:
    """
    Backslash-escape every match of config.RE_MD_CHARS_MATCHER in *text*,
    for text placed within other markdown constructs.
    """
    matcher = config.RE_MD_CHARS_MATCHER
    escaped = matcher.sub(r"\\\1", text)
    return escaped
|
||||
|
||||
|
||||
def escape_md_section(
    text: str,
    escape_backslash: bool = True,
    snob: bool = False,
    escape_dot: bool = True,
    escape_plus: bool = True,
    escape_dash: bool = True
) -> str:
    """
    Escape markdown-sensitive characters across a whole document section.

    Each flag switches one substitution on or off; the enabled ones run in
    the order backslash, snob (all chars), dot, plus, dash.
    """
    # (enabled, name of the compiled pattern in config, replacement)
    steps = (
        (escape_backslash, "RE_MD_BACKSLASH_MATCHER", r"\\\1"),
        (snob, "RE_MD_CHARS_MATCHER_ALL", r"\\\1"),
        (escape_dot, "RE_MD_DOT_MATCHER", r"\1\\\2"),
        (escape_plus, "RE_MD_PLUS_MATCHER", r"\1\\\2"),
        (escape_dash, "RE_MD_DASH_MATCHER", r"\1\\\2"),
    )
    for enabled, matcher_name, replacement in steps:
        if enabled:
            text = getattr(config, matcher_name).sub(replacement, text)
    return text
|
||||
|
||||
def reformat_table(lines: List[str], right_margin: int) -> List[str]:
    """
    Pad the cells of a pipe-separated table so the columns line up.

    :param lines: the rows of the table, cells separated by ``|``
    :param right_margin: number of filler characters added after the
        widest cell of every column
    :returns: the padded rows; an empty list when *lines* is empty
    """
    # An empty buffer has no header row to measure.
    if not lines:
        return []

    # find the maximum width of the columns
    max_width = [len(x.rstrip()) + right_margin for x in lines[0].split("|")]
    max_cols = len(max_width)
    for line in lines:
        cols = [x.rstrip() for x in line.split("|")]
        num_cols = len(cols)

        # don't drop any data if colspan attributes result in unequal lengths
        if num_cols < max_cols:
            cols += [""] * (max_cols - num_cols)
        elif max_cols < num_cols:
            max_width += [len(x) + right_margin for x in cols[-(num_cols - max_cols) :]]
            max_cols = num_cols

        max_width = [
            max(len(x) + right_margin, old_len) for x, old_len in zip(cols, max_width)
        ]

    # reformat
    new_lines = []
    for line in lines:
        cols = [x.rstrip() for x in line.split("|")]
        # A row made only of "-" and "|" is the header separator: it is
        # filled with dashes, every other row with spaces.
        if set(line.strip()) == set("-|"):
            filler, prefix = "-", "|-"
        else:
            filler, prefix = " ", "| "
        new_cols = [x + filler * (width - len(x)) for x, width in zip(cols, max_width)]
        new_lines.append(prefix + "|".join(new_cols) + "|")
    return new_lines
|
||||
|
||||
|
||||
def pad_tables_in_text(text: str, right_margin: int = 1) -> str:
    """
    Pad every table found in *text*.

    Lines containing config.TABLE_MARKER_FOR_PAD toggle "inside a table"
    and are dropped; the lines collected between an opening and a closing
    marker are passed to reformat_table and followed by one empty line.
    """
    output = []
    pending_rows = []  # type: List[str]
    inside_table = False
    for line in text.split("\n"):
        if config.TABLE_MARKER_FOR_PAD in line:
            inside_table = not inside_table
            if not inside_table:
                # Closing marker: flush the collected rows.
                output.extend(reformat_table(pending_rows, right_margin))
                pending_rows = []
                output.append("")
            continue
        target = pending_rows if inside_table else output
        target.append(line)
    return "\n".join(output)
|
||||
83
crawl4ai/install.py
Normal file
@@ -0,0 +1,83 @@
|
||||
import subprocess
|
||||
import sys
|
||||
import asyncio
|
||||
from .async_logger import AsyncLogger, LogLevel
|
||||
|
||||
# Initialize logger
|
||||
logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True)
|
||||
|
||||
def post_install():
    """Run the post-installation steps: Playwright install, then DB setup."""
    logger.info("Running post-installation setup...", tag="INIT")
    for step in (install_playwright, run_migration):
        step()
    logger.success("Post-installation setup completed!", tag="COMPLETE")
|
||||
|
||||
def install_playwright():
    """
    Install the Playwright Chromium browser with its system dependencies.

    Runs ``python -m playwright install --with-deps --force chromium`` with
    the current interpreter.  Any failure is reported as a warning (with
    its reason) and never propagates to the caller.
    """
    logger.info("Installing Playwright browsers...", tag="INIT")
    try:
        subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "--force", "chromium"])
        logger.success("Playwright installation completed successfully.", tag="COMPLETE")
    except Exception as e:
        # Covers subprocess.CalledProcessError too: both cases get the same
        # best-effort handling, so one handler is enough.
        logger.warning(f"Playwright installation failed: {e}")
        logger.warning(f"Please run '{sys.executable} -m playwright install --with-deps' manually after the installation.")
|
||||
|
||||
def run_migration():
    """
    Initialize the database during installation.

    Failures are only logged as warnings; the messages state that the
    database will be initialized on first use.
    """
    try:
        logger.info("Starting database initialization...", tag="INIT")
        from crawl4ai.async_database import async_db_manager

        initialization = async_db_manager.initialize()
        asyncio.run(initialization)
        logger.success("Database initialization completed successfully.", tag="COMPLETE")
    except ImportError:
        logger.warning("Database module not found. Will initialize on first use.")
    except Exception as e:
        for message in (
            f"Database initialization failed: {e}",
            "Database will be initialized on first use",
        ):
            logger.warning(message)
|
||||
|
||||
async def run_doctor():
    """
    Health check: crawl https://crawl4ai.com with a headless Chromium.

    :returns: True when the crawl yields markdown, False on any failure
        (the failure is logged, not raised)
    """
    logger.info("Running Crawl4AI health check...", tag="INIT")
    try:
        from .async_webcrawler import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode

        browser_settings = dict(
            headless=True,
            browser_type="chromium",
            ignore_https_errors=True,
            light_mode=True,
            viewport_width=1280,
            viewport_height=720,
        )
        browser_config = BrowserConfig(**browser_settings)
        run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, screenshot=True)

        async with AsyncWebCrawler(config=browser_config) as crawler:
            logger.info("Testing crawling capabilities...", tag="TEST")
            result = await crawler.arun(url="https://crawl4ai.com", config=run_config)

            # Guard clause: anything without markdown counts as a failure
            # and is reported by the handler below.
            if not (result and result.markdown):
                raise Exception("Failed to get content")

            logger.success("✅ Crawling test passed!", tag="COMPLETE")
            return True

    except Exception as e:
        logger.error(f"❌ Test failed: {e}", tag="ERROR")
        return False
|
||||
|
||||
def doctor():
    """Entry point for the doctor command; returns run_doctor()'s result."""
    import asyncio

    outcome = asyncio.run(run_doctor())
    return outcome
|
||||
15
crawl4ai/js_snippet/__init__.py
Normal file
@@ -0,0 +1,15 @@
|
||||
import os, sys
|
||||
|
||||
def load_js_script(script_name):
    """
    Return the content of ``<script_name>.js`` located next to this module.

    :param script_name: file name without the ``.js`` extension
    :returns: the script source as a string
    :raises ValueError: when the script file does not exist
    """
    # Get the path of the current script
    current_script_path = os.path.dirname(os.path.realpath(__file__))
    # Get the path of the script to load
    script_path = os.path.join(current_script_path, script_name + '.js')
    # Check if the script exists
    if not os.path.exists(script_path):
        raise ValueError(f"Script {script_name} not found in the folder {current_script_path}")
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(script_path, 'r', encoding='utf-8') as f:
        script_content = f.read()
    return script_content
|
||||
25
crawl4ai/js_snippet/navigator_overrider.js
Normal file
@@ -0,0 +1,25 @@
|
||||
// Pass the Permissions Test.
// Keep the native query bound to its Permissions object: calling the
// detached function for non-notification permissions would otherwise fail
// with "Illegal invocation".
const originalQuery = window.navigator.permissions.query.bind(window.navigator.permissions);
window.navigator.permissions.query = (parameters) =>
    parameters.name === "notifications"
        ? Promise.resolve({ state: Notification.permission })
        : originalQuery(parameters);
// Report navigator.webdriver as undefined.
Object.defineProperty(navigator, "webdriver", {
    get: () => undefined,
});
window.navigator.chrome = {
    runtime: {},
    // Add other properties if necessary
};
// Report a non-empty plugin list and a fixed language list.
Object.defineProperty(navigator, "plugins", {
    get: () => [1, 2, 3, 4, 5],
});
Object.defineProperty(navigator, "languages", {
    get: () => ["en-US", "en"],
});
// Always report the document as visible.
Object.defineProperty(document, "hidden", {
    get: () => false,
});
Object.defineProperty(document, "visibilityState", {
    get: () => "visible",
});
|
||||
119
crawl4ai/js_snippet/remove_overlay_elements.js
Normal file
@@ -0,0 +1,119 @@
|
||||
async () => {
    // An element counts as visible unless it is display:none,
    // visibility:hidden or opacity "0".
    const isVisible = (elem) => {
        const { display, visibility, opacity } = window.getComputedStyle(elem);
        return display !== "none" && visibility !== "hidden" && opacity !== "0";
    };

    const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

    // Close buttons: these are clicked before anything is removed.
    const closeButtonSelectors = [
        'button[class*="close" i]',
        'button[class*="dismiss" i]',
        'button[aria-label*="close" i]',
        'button[title*="close" i]',
        'a[class*="close" i]',
        'span[class*="close" i]',
    ];

    const overlaySelectors = [
        // Cookie notices
        '[class*="cookie-banner" i]',
        '[id*="cookie-banner" i]',
        '[class*="cookie-consent" i]',
        '[id*="cookie-consent" i]',

        // Newsletter/subscription dialogs
        '[class*="newsletter" i]',
        '[class*="subscribe" i]',

        // Generic popups/modals
        '[class*="popup" i]',
        '[class*="modal" i]',
        '[class*="overlay" i]',
        '[class*="dialog" i]',
        '[role="dialog"]',
        '[role="alertdialog"]',
    ];

    // Common selectors for popups and overlays (close buttons first)
    const commonSelectors = [...closeButtonSelectors, ...overlaySelectors];

    // Try to click close buttons first, pausing 100 ms after each click
    for (const selector of closeButtonSelectors) {
        for (const button of document.querySelectorAll(selector)) {
            if (!isVisible(button)) continue;
            try {
                button.click();
                await pause(100);
            } catch (e) {
                console.log("Error clicking button:", e);
            }
        }
    }

    // Remove remaining overlay elements
    const removeOverlays = () => {
        // Visible elements that are layered (z-index above 999, fixed or
        // absolute) and also large, rgba-coloured or partly transparent.
        for (const elem of document.querySelectorAll("*")) {
            const style = window.getComputedStyle(elem);
            const zIndex = parseInt(style.zIndex);
            const position = style.position;

            const layered = zIndex > 999 || position === "fixed" || position === "absolute";
            const looksLikeOverlay = () =>
                elem.offsetWidth > window.innerWidth * 0.5 ||
                elem.offsetHeight > window.innerHeight * 0.5 ||
                style.backgroundColor.includes("rgba") ||
                parseFloat(style.opacity) < 1;

            if (isVisible(elem) && layered && looksLikeOverlay()) {
                elem.remove();
            }
        }

        // Remove elements matching common selectors
        for (const selector of commonSelectors) {
            for (const elem of document.querySelectorAll(selector)) {
                if (isVisible(elem)) elem.remove();
            }
        }
    };

    removeOverlays();

    // Remove any visible fixed/sticky position elements
    const removeFixedElements = () => {
        for (const elem of document.querySelectorAll("*")) {
            const { position } = window.getComputedStyle(elem);
            const pinned = position === "fixed" || position === "sticky";
            if (pinned && isVisible(elem)) elem.remove();
        }
    };

    removeFixedElements();

    // Remove empty block elements as: div, p, span, etc.
    // NOTE: defined but not invoked in this snippet.
    const removeEmptyBlockElements = () => {
        const blockElements = document.querySelectorAll(
            "div, p, span, section, article, header, footer, aside, nav, main, ul, ol, li, dl, dt, dd, h1, h2, h3, h4, h5, h6"
        );
        for (const elem of blockElements) {
            if (elem.innerText.trim() === "") elem.remove();
        }
    };

    // Remove margin-right and padding-right from body (often added by modal scripts)
    const bodyStyle = document.body.style;
    bodyStyle.marginRight = "0px";
    bodyStyle.paddingRight = "0px";
    bodyStyle.overflow = "auto";

    // Wait a bit for any animations to complete
    await pause(100);
};
|
||||
54
crawl4ai/js_snippet/update_image_dimensions.js
Normal file
@@ -0,0 +1,54 @@
|
||||
() => {
    return new Promise((resolve) => {
        // Keep only images worth measuring.
        const isCandidate = (img) => {
            // Filter out images that are too small
            if (img.width < 100 && img.height < 100) return false;

            // Filter out images that are not visible
            const { width, height } = img.getBoundingClientRect();
            if (width === 0 || height === 0) return false;

            // Filter out images with certain class names (e.g., icons, thumbnails)
            const hasExcludedClass = ["icon", "thumbnail"].some((name) => img.classList.contains(name));
            if (hasExcludedClass) return false;

            // Filter out images with certain patterns in their src (e.g., placeholder images)
            return !(img.src.includes("placeholder") || img.src.includes("icon"));
        };

        const images = [...document.querySelectorAll("img")].filter(isCandidate);
        let pending = images.length;

        if (pending === 0) {
            resolve();
            return;
        }

        // One image is done (measured or failed); resolve after the last.
        const settleOne = () => {
            pending -= 1;
            if (pending === 0) resolve();
        };

        // Copy the natural size of a fully loaded image into its
        // width/height attributes.
        const recordDimensions = (img) => {
            if (!(img.complete && img.naturalWidth !== 0)) return;
            img.setAttribute("width", img.naturalWidth);
            img.setAttribute("height", img.naturalHeight);
            settleOne();
        };

        for (const img of images) {
            recordDimensions(img);
            if (img.complete) continue;
            img.onload = () => {
                recordDimensions(img);
            };
            img.onerror = () => {
                settleOne();
            };
        }

        // Fallback timeout of 5 seconds
        // setTimeout(() => resolve(), 5000);
        // The promise is resolved right away; images still loading are
        // updated later by their onload handlers.
        resolve();
    });
};
|
||||
498
crawl4ai/llmtxt.py
Normal file
@@ -0,0 +1,498 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
import re
|
||||
from typing import Dict, List, Tuple, Optional, Any
|
||||
import json
|
||||
from tqdm import tqdm
|
||||
import time
|
||||
import psutil
|
||||
import numpy as np
|
||||
from rank_bm25 import BM25Okapi
|
||||
from nltk.tokenize import word_tokenize
|
||||
from nltk.corpus import stopwords
|
||||
from nltk.stem import WordNetLemmatizer
|
||||
from litellm import completion, batch_completion
|
||||
from .async_logger import AsyncLogger
|
||||
import litellm
|
||||
import pickle
|
||||
import hashlib # <--- ADDED for file-hash
|
||||
from fnmatch import fnmatch
|
||||
import glob
|
||||
|
||||
litellm.set_verbose = False
|
||||
|
||||
def _compute_file_hash(file_path: Path) -> str:
    """Return the hex MD5 digest of the file's bytes, read in 4096-byte chunks."""
    digest = hashlib.md5()
    with file_path.open("rb") as stream:
        while True:
            block_bytes = stream.read(4096)
            if not block_bytes:
                break
            digest.update(block_bytes)
    return digest.hexdigest()
|
||||
|
||||
class AsyncLLMTextManager:
|
||||
def __init__(
    self,
    docs_dir: Path,
    logger: Optional[AsyncLogger] = None,
    max_concurrent_calls: int = 5,
    batch_size: int = 3
) -> None:
    """
    Set up the manager for one documentation directory.

    Args:
        docs_dir: Directory holding the documentation files; a plain
            string is accepted and converted to a Path.
        logger: Logger stored as ``self.logger``.
        max_concurrent_calls: Stored as ``self.max_concurrent_calls``.
        batch_size: Number of documents handled per batch.
    """
    # Coerce so the "/" join below also works for a str argument.
    self.docs_dir = Path(docs_dir)
    self.logger = logger
    self.max_concurrent_calls = max_concurrent_calls
    self.batch_size = batch_size
    # Search state, filled in when the index is built or loaded.
    self.bm25_index = None
    self.document_map: Dict[str, Any] = {}
    self.tokenized_facts: List[str] = []
    self.bm25_index_file = self.docs_dir / "bm25_index.pkl"
|
||||
|
||||
async def _process_document_batch(self, doc_batch: List[Path]) -> None:
    """
    Generate a ``.q.md`` fact index for every readable document in a batch.

    Each non-empty document is sent to the model in one batch request; the
    ``<index>...</index>`` part of each response is written next to its
    source file with the suffix ``.q.md``.  Read errors, model errors and
    malformed responses are logged and skipped.

    Args:
        doc_batch: Paths of the documents to index.
    """
    # Keep (path, content) pairs together so that every response can be
    # matched to the document it was generated from, even when some files
    # are unreadable or empty and therefore not sent to the model.
    documents: List[Tuple[Path, str]] = []
    for file_path in doc_batch:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
        except Exception as e:
            self.logger.error(f"Error reading {file_path}: {str(e)}")
            continue
        if content:
            documents.append((file_path, content))

    if not documents:
        # Nothing to send to the model.
        return

    prompt = """Given a documentation file, generate a list of atomic facts where each fact:
1. Represents a single piece of knowledge
2. Contains variations in terminology for the same concept
3. References relevant code patterns if they exist
4. Is written in a way that would match natural language queries

Each fact should follow this format:
<main_concept>: <fact_statement> | <related_terms> | <code_reference>

Example Facts:
browser_config: Configure headless mode and browser type for AsyncWebCrawler | headless, browser_type, chromium, firefox | BrowserConfig(browser_type="chromium", headless=True)
redis_connection: Redis client connection requires host and port configuration | redis setup, redis client, connection params | Redis(host='localhost', port=6379, db=0)
pandas_filtering: Filter DataFrame rows using boolean conditions | dataframe filter, query, boolean indexing | df[df['column'] > 5]

Wrap your response in <index>...</index> tags.
"""

    # Prepare messages for batch processing (one conversation per document)
    messages_list = [
        [
            {"role": "user", "content": f"{prompt}\n\nGenerate index for this documentation:\n\n{content}"}
        ]
        for _, content in documents
    ]

    try:
        responses = batch_completion(
            model="anthropic/claude-3-5-sonnet-latest",
            messages=messages_list,
            logger_fn=None
        )

        # Process responses and save index files
        for response, (file_path, _) in zip(responses, documents):
            try:
                index_content_match = re.search(
                    r'<index>(.*?)</index>',
                    response.choices[0].message.content,
                    re.DOTALL
                )
                if not index_content_match:
                    self.logger.warning(f"No <index>...</index> content found for {file_path}")
                    continue

                # Collapse blank lines between facts.
                index_content = re.sub(
                    r"\n\s*\n", "\n", index_content_match.group(1)
                ).strip()
                if index_content:
                    index_file = file_path.with_suffix('.q.md')
                    with open(index_file, 'w', encoding='utf-8') as f:
                        f.write(index_content)
                    self.logger.info(f"Created index file: {index_file}")
                else:
                    self.logger.warning(f"No index content found in response for {file_path}")

            except Exception as e:
                self.logger.error(f"Error processing response for {file_path}: {str(e)}")

    except Exception as e:
        self.logger.error(f"Error in batch completion: {str(e)}")
|
||||
|
||||
def _validate_fact_line(self, line: str) -> Tuple[bool, Optional[str]]:
    """
    Check that a fact line has the shape ``concept: fact | terms | code``.

    Returns:
        ``(True, None)`` for a valid line, otherwise ``(False, reason)``.
    """
    fields = [field.strip() for field in line.split("|")]
    # A single field means the line contained no "|" at all.
    if len(fields) == 1:
        return False, "Missing separator '|'"
    if len(fields) != 3:
        return False, f"Expected 3 parts, got {len(fields)}"
    if ":" not in fields[0]:
        return False, "Missing ':' in concept definition"
    return True, None
|
||||
|
||||
def _load_or_create_token_cache(self, fact_file: Path) -> Dict:
    """
    Return the token cache stored beside *fact_file* when it is still valid.

    The cache is the JSON file obtained with
    ``fact_file.with_suffix(".q.tokens")``.  It is returned as-is when its
    ``content_hash`` equals the current file hash; in every other case
    (missing, unreadable, corrupt or outdated) a fresh, empty cache
    carrying the current hash is returned.
    """
    cache_path = fact_file.with_suffix(".q.tokens")
    file_hash = _compute_file_hash(fact_file)

    if cache_path.exists():
        try:
            with open(cache_path, "r") as f:
                stored = json.load(f)
            if stored.get("content_hash") == file_hash:
                return stored
            # Hash mismatch: fall through to the fresh cache below.
            self.logger.info(f"Hash changed for {fact_file}, reindex needed.")
        except json.JSONDecodeError:
            self.logger.warning(f"Corrupt token cache for {fact_file}, rebuilding.")
        except Exception as e:
            self.logger.warning(f"Error reading cache for {fact_file}: {str(e)}")

    fresh_cache = {"facts": {}, "content_hash": file_hash}
    return fresh_cache
|
||||
|
||||
def _save_token_cache(self, fact_file: Path, cache: Dict) -> None:
    """
    Write *cache* as JSON to ``fact_file.with_suffix(".q.tokens")``.

    The ``content_hash`` entry of *cache* is overwritten (in place) with
    the current hash of *fact_file* before saving.
    """
    cache["content_hash"] = _compute_file_hash(fact_file)
    target = fact_file.with_suffix(".q.tokens")
    with open(target, "w") as f:
        json.dump(cache, f)
|
||||
|
||||
def preprocess_text(self, text: str) -> List[str]:
    """
    Turn a fact line (or any text) into a list of search tokens.

    The text is split on ``|``.  For every part, identifiers used as call
    names or as keyword arguments with a quoted value are added verbatim
    when the part contains both parentheses; then the lower-cased words of
    the part are added, lemmatized and without English stop words (the
    question words how/what/when/where/why/which are kept).
    """
    if "|" in text:
        parts = [piece.strip() for piece in text.split("|")]
    else:
        parts = [text]
    # Remove the first ":" of the leading part (the concept separator)
    parts[0] = re.sub(r"^(.*?):", r"\1", parts[0])

    lemmatizer = WordNetLemmatizer()
    question_words = {"how", "what", "when", "where", "why", "which"}
    stop_words = set(stopwords.words("english")) - question_words

    tokens = []
    for part in parts:
        looks_like_code = "(" in part and ")" in part
        if looks_like_code:
            tokens.extend(
                re.findall(r'[\w_]+(?=\()|[\w_]+(?==[\'"]{1}[\w_]+[\'"]{1})', part)
            )

        for word in word_tokenize(part.lower()):
            if word in stop_words:
                continue
            tokens.append(lemmatizer.lemmatize(word))

    return tokens
|
||||
|
||||
def maybe_load_bm25_index(self, clear_cache=False) -> bool:
    """
    Load the pickled BM25 index from ``self.bm25_index_file`` if allowed.

    Returns:
        True when the file existed, *clear_cache* was False and
        ``tokenized_facts`` / ``bm25_index`` were restored from it;
        False otherwise.
    """
    if clear_cache or not os.path.exists(self.bm25_index_file):
        return False

    self.logger.info("Loading existing BM25 index from disk.")
    # NOTE(review): pickle.load executes whatever the file contains; this
    # assumes the index file is only ever written by this class.
    with open(self.bm25_index_file, "rb") as f:
        stored = pickle.load(f)
    self.tokenized_facts = stored["tokenized_facts"]
    self.bm25_index = stored["bm25_index"]
    return True
|
||||
|
||||
def build_search_index(self, clear_cache=False) -> None:
    """
    Build or refresh the BM25 index over all ``.q.md`` fact files.

    Checks for new or modified .q.md files by comparing file-hash.
    If none need reindexing and clear_cache is False, loads the existing
    index if available (or rebuilds it from the cached tokens).
    Otherwise, reindexes only changed/new files, merges them with the
    unchanged cached facts and saves the resulting index.

    Args:
        clear_cache: If True, delete the saved index and re-tokenize every
            fact file from scratch.
    """

    def _build_and_persist(facts, tokens) -> None:
        # Build the index in memory and pickle it next to the docs.
        self.bm25_index = BM25Okapi(tokens)
        self.tokenized_facts = facts
        with open(self.bm25_index_file, "wb") as f:
            pickle.dump({
                "bm25_index": self.bm25_index,
                "tokenized_facts": self.tokenized_facts
            }, f)

    # If clear_cache is True, we skip partial logic: rebuild everything from scratch
    if clear_cache:
        self.logger.info("Clearing cache and rebuilding full search index.")
        if self.bm25_index_file.exists():
            self.bm25_index_file.unlink()

    process = psutil.Process()
    self.logger.info("Checking which .q.md files need (re)indexing...")

    # Gather all .q.md files
    q_files = [self.docs_dir / f for f in os.listdir(self.docs_dir) if f.endswith(".q.md")]

    # We'll store known (unchanged) facts in these lists
    existing_facts: List[str] = []
    existing_tokens: List[List[str]] = []

    # Keep track of invalid lines for logging
    invalid_lines = []
    needSet = []  # files that must be (re)indexed

    for qf in q_files:
        # NOTE: with_suffix only replaces ".md", so the cache of
        # "x.q.md" is named "x.q.q.tokens".
        token_cache_file = qf.with_suffix(".q.tokens")

        # If no .q.tokens or clear_cache is True → definitely reindex
        if clear_cache or not token_cache_file.exists():
            needSet.append(qf)
            continue

        # Otherwise, load the existing cache and compare hash
        cache = self._load_or_create_token_cache(qf)
        # If the .q.tokens was out of date (i.e. changed hash), we reindex
        if len(cache["facts"]) == 0 or cache.get("content_hash") != _compute_file_hash(qf):
            needSet.append(qf)
        else:
            # File is unchanged → retrieve cached token data
            for line, cache_data in cache["facts"].items():
                existing_facts.append(line)
                existing_tokens.append(cache_data["tokens"])
                self.document_map[line] = qf  # track the doc for that fact

    if not needSet and not clear_cache:
        # If no file needs reindexing, try loading existing index
        if self.maybe_load_bm25_index(clear_cache=False):
            self.logger.info("No new/changed .q.md files found. Using existing BM25 index.")
            return

        # If there's no existing index, we must build a fresh index from the old caches
        self.logger.info("No existing BM25 index found. Building from cached facts.")
        if existing_facts:
            self.logger.info(f"Building BM25 index with {len(existing_facts)} cached facts.")
            _build_and_persist(existing_facts, existing_tokens)
        else:
            self.logger.warning("No facts found at all. Index remains empty.")
        return

    # -----------------------------------------------------
    # If we reach here, we have new or changed .q.md files
    # We'll parse them, reindex them, and then combine with existing_facts
    # -----------------------------------------------------

    self.logger.info(f"{len(needSet)} file(s) need reindexing. Parsing now...")

    # 1) Parse the new or changed .q.md files
    new_facts = []
    new_tokens = []
    with tqdm(total=len(needSet), desc="Indexing changed files") as file_pbar:
        for file in needSet:
            # We'll build up a fresh cache
            fresh_cache = {"facts": {}, "content_hash": _compute_file_hash(file)}
            try:
                with open(file, "r", encoding="utf-8") as f_obj:
                    content = f_obj.read().strip()
                    lines = [l.strip() for l in content.split("\n") if l.strip()]

                for line in lines:
                    is_valid, error = self._validate_fact_line(line)
                    if not is_valid:
                        invalid_lines.append((file, line, error))
                        continue

                    tokens = self.preprocess_text(line)
                    fresh_cache["facts"][line] = {
                        "tokens": tokens,
                        "added": time.time(),
                    }
                    new_facts.append(line)
                    new_tokens.append(tokens)
                    self.document_map[line] = file

                # Save the new .q.tokens with updated hash
                self._save_token_cache(file, fresh_cache)

                mem_usage = process.memory_info().rss / 1024 / 1024
                self.logger.debug(f"Memory usage after {file.name}: {mem_usage:.2f}MB")

            except Exception as e:
                self.logger.error(f"Error processing {file}: {str(e)}")

            file_pbar.update(1)

    if invalid_lines:
        self.logger.warning(f"Found {len(invalid_lines)} invalid fact lines:")
        for file, line, error in invalid_lines:
            self.logger.warning(f"{file}: {error} in line: {line[:50]}...")

    # 2) Merge newly tokenized facts with the existing ones
    all_facts = existing_facts + new_facts
    all_tokens = existing_tokens + new_tokens

    # BM25Okapi cannot be built from an empty corpus, so stop here when
    # there is nothing to index (e.g. no valid fact line anywhere).
    if not all_facts:
        self.logger.warning("No facts found at all. Index remains empty.")
        return

    # 3) Build BM25 index from combined facts and 4) save it to disk
    self.logger.info(f"Building BM25 index with {len(all_facts)} total facts (old + new).")
    _build_and_persist(all_facts, all_tokens)

    final_mem = process.memory_info().rss / 1024 / 1024
    self.logger.info(f"Search index updated. Final memory usage: {final_mem:.2f}MB")
|
||||
|
||||
async def generate_index_files(self, force_generate_facts: bool = False, clear_bm25_cache: bool = False) -> None:
    """
    Generate index (.q.md) files for the markdown documents in batches,
    then build/update the search index.

    Args:
        force_generate_facts (bool): If True, regenerate index files even if they exist.
        clear_bm25_cache (bool): Forwarded to build_search_index as ``clear_cache``.
    """
    self.logger.info("Starting index generation for documentation files.")

    # Source documents only: derived '.q.md' / '.xs.md' files are excluded.
    md_files = [
        self.docs_dir / f for f in os.listdir(self.docs_dir)
        if f.endswith('.md') and not f.endswith(('.q.md', '.xs.md'))
    ]

    # Filter out files that already have .q files unless force=True.
    # Only the trailing '.md' is swapped; str.replace would also rewrite a
    # '.md' that appears earlier in the file name.
    if not force_generate_facts:
        md_files = [
            f for f in md_files
            if not (self.docs_dir / (f.name[:-len('.md')] + '.q.md')).exists()
        ]

    if not md_files:
        self.logger.info("All index files exist. Use force=True to regenerate.")
    else:
        # Ceiling division: an exact multiple of batch_size must not report
        # one batch more than is actually processed.
        total_batches = (len(md_files) + self.batch_size - 1) // self.batch_size
        for batch_number, start in enumerate(range(0, len(md_files), self.batch_size), 1):
            batch = md_files[start:start + self.batch_size]
            self.logger.info(f"Processing batch {batch_number}/{total_batches}")
            await self._process_document_batch(batch)

    self.logger.info("Index generation complete, building/updating search index.")
    self.build_search_index(clear_cache=clear_bm25_cache)
|
||||
|
||||
def generate(self, sections: List[str], mode: str = "extended") -> str:
    """
    Concatenate the numbered documentation files into one string.

    Args:
        sections (List[str]): Case-insensitive substrings; when non-empty, only
            documents whose base name contains one of them are included.
        mode (str): "condensed" prefers the '.xs.md' variant of a document when
            that file exists; any other value uses the regular '.md' file.

    Returns:
        str: Each file rendered under a '#' banner carrying its file name, the
        files joined by a '---' separator. Empty string when nothing was read.
    """
    # Get all markdown files
    all_files = glob.glob(str(self.docs_dir / "[0-9]*.md")) + \
        glob.glob(str(self.docs_dir / "[0-9]*.xs.md"))

    # Extract base names without extensions
    base_docs = {Path(f).name.split('.')[0] for f in all_files
                 if not Path(f).name.endswith('.q.md')}

    # Filter by sections if provided
    if sections:
        wanted = [section.lower() for section in sections]
        base_docs = {doc for doc in base_docs
                     if any(section in doc.lower() for section in wanted)}

    def sort_key(doc):
        # Numeric prefix first; the name itself breaks ties so the output does
        # not depend on set iteration order. isdecimal() only accepts
        # characters that int() can parse.
        prefix = doc.split('_')[0]
        return (int(prefix) if prefix.isdecimal() else 999999, doc)

    # Get file paths based on mode
    files = []
    for doc in sorted(base_docs, key=sort_key):
        regular_file = self.docs_dir / f"{doc}.md"
        if mode == "condensed":
            xs_file = self.docs_dir / f"{doc}.xs.md"
            files.append(str(xs_file if xs_file.exists() else regular_file))
        else:
            files.append(str(regular_file))

    # Read and format content; unreadable files are logged and skipped.
    content = []
    for file in files:
        try:
            with open(file, 'r', encoding='utf-8') as f:
                fname = Path(file).name
                content.append(f"{'#'*20}\n# {fname}\n{'#'*20}\n\n{f.read()}")
        except Exception as e:
            self.logger.error(f"Error reading {file}: {str(e)}")

    return "\n\n---\n\n".join(content)
|
||||
|
||||
def search(self, query: str, top_k: int = 5) -> str:
    """
    Return the full text of the documents that best match ``query``.

    Facts scoring above mean + 0.25 * std of all BM25 scores are aggregated
    per file by _aggregate_search_scores; files are ranked by
    code_match_score * 2.0 + match_count * 1.5 + total_score.

    Args:
        query (str): Free-text query.
        top_k (int): Maximum number of ranked files considered.

    Returns:
        str: The matching documents, each under a '#' banner with its file
        name, joined by a '---' separator; a notice string when no index is
        loaded.
    """
    if not self.bm25_index:
        return "No search index available. Call build_search_index() first."

    query_tokens = self.preprocess_text(query)
    doc_scores = self.bm25_index.get_scores(query_tokens)

    mean_score = np.mean(doc_scores)
    std_score = np.std(doc_scores)
    score_threshold = mean_score + (0.25 * std_score)

    file_data = self._aggregate_search_scores(
        doc_scores=doc_scores,
        score_threshold=score_threshold,
        query_tokens=query_tokens,
    )

    ranked_files = sorted(
        file_data.items(),
        key=lambda x: (
            x[1]["code_match_score"] * 2.0
            + x[1]["match_count"] * 1.5
            + x[1]["total_score"]
        ),
        reverse=True,
    )[:top_k]

    results = []
    for file, _ in ranked_files:
        # Map the facts file back to its source document. Only a trailing
        # '.q.md' is rewritten so directory names are never altered.
        main_doc = str(file)
        if main_doc.endswith(".q.md"):
            main_doc = main_doc[:-len(".q.md")] + ".md"

        doc_path = self.docs_dir / main_doc
        if not os.path.exists(doc_path):
            continue

        with open(doc_path, "r", encoding='utf-8') as f:
            body = f.read()

        # Path.name is separator-agnostic, unlike splitting on "/".
        only_file_name = Path(main_doc).name
        results.append("\n".join(["#" * 20, f"# {only_file_name}", "#" * 20, "", body]))

    return "\n\n---\n\n".join(results)
|
||||
|
||||
def _aggregate_search_scores(
|
||||
self, doc_scores: List[float], score_threshold: float, query_tokens: List[str]
|
||||
) -> Dict:
|
||||
file_data = {}
|
||||
|
||||
for idx, score in enumerate(doc_scores):
|
||||
if score <= score_threshold:
|
||||
continue
|
||||
|
||||
fact = self.tokenized_facts[idx]
|
||||
file_path = self.document_map[fact]
|
||||
|
||||
if file_path not in file_data:
|
||||
file_data[file_path] = {
|
||||
"total_score": 0,
|
||||
"match_count": 0,
|
||||
"code_match_score": 0,
|
||||
"matched_facts": [],
|
||||
}
|
||||
|
||||
components = fact.split("|") if "|" in fact else [fact]
|
||||
|
||||
code_match_score = 0
|
||||
if len(components) == 3:
|
||||
code_ref = components[2].strip()
|
||||
code_tokens = self.preprocess_text(code_ref)
|
||||
code_match_score = len(set(query_tokens) & set(code_tokens)) / len(query_tokens)
|
||||
|
||||
file_data[file_path]["total_score"] += score
|
||||
file_data[file_path]["match_count"] += 1
|
||||
file_data[file_path]["code_match_score"] = max(
|
||||
file_data[file_path]["code_match_score"], code_match_score
|
||||
)
|
||||
file_data[file_path]["matched_facts"].append(fact)
|
||||
|
||||
return file_data
|
||||
|
||||
def refresh_index(self) -> None:
    """Shortcut for a full rebuild: runs build_search_index with clear_cache=True."""
    rebuild_options = {"clear_cache": True}
    self.build_search_index(**rebuild_options)
|
||||
225
crawl4ai/markdown_generation_strategy.py
Normal file
@@ -0,0 +1,225 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Optional, Dict, Any, Tuple
|
||||
from .models import MarkdownGenerationResult
|
||||
from .html2text import CustomHTML2Text
|
||||
from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter
|
||||
import re
|
||||
from urllib.parse import urljoin
|
||||
|
||||
# Pre-compile the regex pattern
|
||||
LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)')
|
||||
|
||||
def fast_urljoin(base: str, url: str) -> str:
    """Join ``url`` onto ``base``, short-circuiting the common cases.

    Args:
        base (str): URL (or bare host string) the link was found on.
        url (str): Link target as written in the document.

    Returns:
        str: ``url`` unchanged when it already starts with http://, https://,
        mailto: or // (protocol-relative); otherwise the resolved URL.
    """
    if url.startswith(('http://', 'https://', 'mailto:', '//')):
        return url
    if url.startswith('/') and '://' not in base:
        # Scheme-less base (e.g. "example.com"): urljoin would drop the host,
        # so keep plain concatenation without doubling the slash.
        if base.endswith('/'):
            return base[:-1] + url
        return base + url
    # Root-relative paths must resolve against the origin of ``base``, not be
    # appended to its path ("https://h/a/b" + "/x" -> "https://h/x");
    # urljoin implements exactly that.
    return urljoin(base, url)
|
||||
|
||||
class MarkdownGenerationStrategy(ABC):
    """Common interface for strategies that turn cleaned HTML into markdown."""

    def __init__(
        self,
        content_filter: Optional[RelevantContentFilter] = None,
        options: Optional[Dict[str, Any]] = None,
    ):
        # A falsy ``options`` (None or an empty dict) becomes a fresh empty dict.
        self.options = options or {}
        self.content_filter = content_filter

    @abstractmethod
    def generate_markdown(
        self,
        cleaned_html: str,
        base_url: str = "",
        html2text_options: Optional[Dict[str, Any]] = None,
        content_filter: Optional[RelevantContentFilter] = None,
        citations: bool = True,
        **kwargs
    ) -> MarkdownGenerationResult:
        """Produce a MarkdownGenerationResult for ``cleaned_html``; subclasses must override."""
|
||||
|
||||
class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
    """
    Standard markdown generation strategy.

    Pipeline:
        1. Convert the cleaned HTML to raw markdown.
        2. Optionally rewrite links as numbered citations plus a reference list.
        3. Optionally produce "fit" markdown from the content filter's output.
        4. Bundle everything in a MarkdownGenerationResult.

    Args:
        content_filter (Optional[RelevantContentFilter]): Filter used for fit markdown.
        options (Optional[Dict[str, Any]]): Fallback converter options. Defaults to None.
    """

    def __init__(
        self,
        content_filter: Optional[RelevantContentFilter] = None,
        options: Optional[Dict[str, Any]] = None,
    ):
        super().__init__(content_filter, options)

    def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]:
        """
        Replace inline markdown links and images with numbered citation markers.

        Every distinct (resolved) URL receives one number, in order of first
        appearance. A link becomes ``text⟨n⟩`` and an image ``![text⟨n⟩]``.

        Args:
            markdown (str): Markdown text.
            base_url (str): When non-empty, URLs not starting with http://,
                https:// or mailto: are joined onto it.

        Returns:
            Tuple[str, str]: The rewritten markdown and a "## References"
            section listing each numbered URL with its title/text.
        """
        absolute_prefixes = ('http://', 'https://', 'mailto:')
        citations = {}   # resolved url -> (number, description suffix)
        joined_urls = {}  # url as written -> url joined with base_url
        chunks = []
        cursor = 0
        next_number = 1

        for found in LINK_PATTERN.finditer(markdown):
            # Text between the previous match and this one is copied verbatim.
            chunks.append(markdown[cursor:found.start()])
            text, url, title = found.groups()

            if base_url and not url.startswith(absolute_prefixes):
                if url not in joined_urls:
                    joined_urls[url] = fast_urljoin(base_url, url)
                url = joined_urls[url]

            if url not in citations:
                labels = []
                if title:
                    labels.append(title)
                if text and text != title:
                    labels.append(text)
                suffix = ": " + " - ".join(labels) if labels else ""
                citations[url] = (next_number, suffix)
                next_number += 1

            number = citations[url][0]
            if found.group(0).startswith('!'):
                chunks.append(f"![{text}⟨{number}⟩]")
            else:
                chunks.append(f"{text}⟨{number}⟩")
            cursor = found.end()

        chunks.append(markdown[cursor:])

        # Reference list, ordered by citation number.
        reference_lines = ["\n\n## References\n\n"]
        for url, (number, suffix) in sorted(citations.items(), key=lambda item: item[1][0]):
            reference_lines.append(f"⟨{number}⟩ {url}{suffix}\n")

        return ''.join(chunks), ''.join(reference_lines)

    def generate_markdown(
        self,
        cleaned_html: str,
        base_url: str = "",
        html2text_options: Optional[Dict[str, Any]] = None,
        options: Optional[Dict[str, Any]] = None,
        content_filter: Optional[RelevantContentFilter] = None,
        citations: bool = True,
        **kwargs
    ) -> MarkdownGenerationResult:
        """
        Build raw, cited and (optionally) fit markdown from cleaned HTML.

        Failures of an individual stage are reported as an error string in the
        corresponding field; any other failure yields a result whose raw and
        cited markdown both carry the error message.

        Args:
            cleaned_html (str): Cleaned HTML content; non-strings are passed through str().
            base_url (str): Base URL for the converter and for URL joins.
            html2text_options (Optional[Dict[str, Any]]): Converter overrides (highest priority).
            options (Optional[Dict[str, Any]]): Used when html2text_options is empty.
            content_filter (Optional[RelevantContentFilter]): Overrides the instance's filter.
            citations (bool): Whether to convert links to citations.

        Returns:
            MarkdownGenerationResult: raw markdown, markdown with citations,
            references markdown, fit markdown and fit HTML.
        """
        try:
            converter = CustomHTML2Text(baseurl=base_url)
            settings = {
                'body_width': 0,  # Disable text wrapping
                'ignore_emphasis': False,
                'ignore_links': False,
                'ignore_images': False,
                'protect_links': True,
                'single_line_break': True,
                'mark_code': True,
                'escape_snob': False
            }

            # Only the first non-empty source of overrides is applied:
            # html2text_options, then options, then the instance's options.
            overrides = html2text_options or options or self.options
            if overrides:
                settings.update(overrides)

            converter.update_params(**settings)

            # Normalise the input to a string.
            if not cleaned_html:
                cleaned_html = ""
            elif not isinstance(cleaned_html, str):
                cleaned_html = str(cleaned_html)

            # Stage 1: raw markdown.
            try:
                raw_markdown = converter.handle(cleaned_html)
            except Exception as e:
                raw_markdown = f"Error converting HTML to markdown: {str(e)}"

            raw_markdown = raw_markdown.replace(' ```', '```')

            # Stage 2: citations.
            markdown_with_citations: str = raw_markdown
            references_markdown: str = ""
            if citations:
                try:
                    cited = self.convert_links_to_citations(raw_markdown, base_url)
                    markdown_with_citations, references_markdown = cited
                except Exception as e:
                    markdown_with_citations = raw_markdown
                    references_markdown = f"Error generating citations: {str(e)}"

            # Stage 3: fit markdown, only when some content filter is available.
            fit_markdown: Optional[str] = ""
            filtered_html: Optional[str] = ""
            if content_filter or self.content_filter:
                try:
                    content_filter = content_filter or self.content_filter
                    fragments = content_filter.filter_content(cleaned_html)
                    filtered_html = '\n'.join(f'<div>{s}</div>' for s in fragments)
                    fit_markdown = converter.handle(filtered_html)
                except Exception as e:
                    fit_markdown = f"Error generating fit markdown: {str(e)}"
                    filtered_html = ""

            return MarkdownGenerationResult(
                raw_markdown=raw_markdown or "",
                markdown_with_citations=markdown_with_citations or "",
                references_markdown=references_markdown or "",
                fit_markdown=fit_markdown or "",
                fit_html=filtered_html or "",
            )
        except Exception as e:
            # Last-resort fallback: surface the error through the result itself.
            error_msg = f"Error in markdown generation: {str(e)}"
            return MarkdownGenerationResult(
                raw_markdown=error_msg,
                markdown_with_citations=error_msg,
                references_markdown="",
                fit_markdown="",
                fit_html="",
            )
|
||||
168
crawl4ai/migrations.py
Normal file
@@ -0,0 +1,168 @@
|
||||
import os
|
||||
import asyncio
|
||||
import logging
|
||||
from pathlib import Path
|
||||
import aiosqlite
|
||||
from typing import Optional
|
||||
import xxhash
|
||||
import aiofiles
|
||||
import shutil
|
||||
import time
|
||||
from datetime import datetime
|
||||
from .async_logger import AsyncLogger, LogLevel
|
||||
|
||||
# Initialize logger
|
||||
logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True)
|
||||
|
||||
# logging.basicConfig(level=logging.INFO)
|
||||
# logger = logging.getLogger(__name__)
|
||||
|
||||
class DatabaseMigration:
    """Moves the content columns of ``crawled_data`` into hash-named files.

    The content directories are created next to the database file when the
    object is constructed.
    """

    def __init__(self, db_path: str):
        self.db_path = db_path
        self.content_paths = self._ensure_content_dirs(os.path.dirname(db_path))

    def _ensure_content_dirs(self, base_path: str) -> dict:
        """Create one directory per content type under ``base_path``.

        Returns:
            dict: content type key -> directory path.
        """
        subdirectories = {
            'html': 'html_content',
            'cleaned': 'cleaned_html',
            'markdown': 'markdown_content',
            'extracted': 'extracted_content',
            'screenshots': 'screenshots'
        }
        content_paths = {
            key: os.path.join(base_path, dirname)
            for key, dirname in subdirectories.items()
        }
        for directory in content_paths.values():
            os.makedirs(directory, exist_ok=True)
        return content_paths

    def _generate_content_hash(self, content: str) -> str:
        """Return the xxh64 hex digest of ``content`` (encoded with str.encode())."""
        hasher = xxhash.xxh64()
        hasher.update(content.encode())
        return hasher.hexdigest()

    async def _store_content(self, content: str, content_type: str) -> str:
        """Write ``content`` to a file named after its hash, unless it already exists.

        Returns:
            str: The content hash, or "" when ``content`` is empty/None.
        """
        if not content:
            return ""

        digest = self._generate_content_hash(content)
        target = os.path.join(self.content_paths[content_type], digest)

        # A file with the same hash name is assumed to hold the same content.
        if os.path.exists(target):
            return digest

        async with aiofiles.open(target, 'w', encoding='utf-8') as handle:
            await handle.write(content)
        return digest

    async def migrate_database(self):
        """Migrate existing database to file-based storage.

        Each row's html, cleaned_html, markdown, extracted_content and
        screenshot values are written to files and replaced in the table by
        their hashes. All updates are committed once, after the last row.
        Any exception is logged and re-raised.
        """
        logger.info("Starting database migration...", tag="INIT")

        select_sql = '''SELECT url, html, cleaned_html, markdown,
            extracted_content, screenshot FROM crawled_data'''
        update_sql = '''
            UPDATE crawled_data
            SET html = ?,
            cleaned_html = ?,
            markdown = ?,
            extracted_content = ?,
            screenshot = ?
            WHERE url = ?
            '''

        try:
            async with aiosqlite.connect(self.db_path) as db:
                # Read every row up front, before any update is issued.
                async with db.execute(select_sql) as cursor:
                    rows = await cursor.fetchall()

                migrated_count = 0
                for url, html, cleaned_html, markdown, extracted_content, screenshot in rows:
                    # Store each column as a file; keep the hashes in column order.
                    hashes = (
                        await self._store_content(html, 'html'),
                        await self._store_content(cleaned_html, 'cleaned'),
                        await self._store_content(markdown, 'markdown'),
                        await self._store_content(extracted_content, 'extracted'),
                        await self._store_content(screenshot, 'screenshots'),
                    )
                    await db.execute(update_sql, hashes + (url,))

                    migrated_count += 1
                    if migrated_count % 100 == 0:
                        logger.info(f"Migrated {migrated_count} records...", tag="INIT")

                await db.commit()
                logger.success(f"Migration completed. {migrated_count} records processed.", tag="COMPLETE")

        except Exception as e:
            logger.error(
                message="Migration failed: {error}",
                tag="ERROR",
                params={"error": str(e)}
            )
            raise e
|
||||
|
||||
async def backup_database(db_path: str) -> Optional[str]:
    """Create a timestamped copy of the database file.

    Args:
        db_path (str): Path of the database to back up.

    Returns:
        Optional[str]: Path of the backup (``<db_path>.backup_<YYYYmmdd_HHMMSS>``),
        or None when ``db_path`` does not exist.

    Raises:
        Exception: Whatever the copy raised, after it has been logged.
    """
    if not os.path.exists(db_path):
        logger.info("No existing database found. Skipping backup.", tag="INIT")
        return None

    # Create backup with timestamp
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    backup_path = f"{db_path}.backup_{timestamp}"

    try:
        # Wait for any potential write operations to finish
        await asyncio.sleep(1)

        # copy2 also copies file metadata such as timestamps.
        shutil.copy2(db_path, backup_path)
        logger.info(f"Database backup created at: {backup_path}", tag="COMPLETE")
        return backup_path
    except Exception as e:
        # Report this as a backup failure; the previous message wrongly said
        # "Migration failed", which is indistinguishable from a migration error.
        logger.error(
            message="Backup failed: {error}",
            tag="ERROR",
            params={"error": str(e)}
        )
        raise
|
||||
|
||||
async def run_migration(db_path: Optional[str] = None):
    """Back up the database, then migrate it.

    Args:
        db_path (Optional[str]): Database file; when None,
            ``~/.crawl4ai/crawl4ai.db`` is used.

    Nothing happens when the database file is missing or when
    backup_database returns no backup path.
    """
    if db_path is None:
        db_path = os.path.join(Path.home(), ".crawl4ai", "crawl4ai.db")

    if not os.path.exists(db_path):
        logger.info("No existing database found. Skipping migration.", tag="INIT")
        return

    # The migration only proceeds once a backup path has been returned.
    backup_path = await backup_database(db_path)
    if backup_path:
        await DatabaseMigration(db_path).migrate_database()
|
||||
|
||||
def main():
    """Command-line entry point: parse ``--db-path`` and run the migration."""
    import argparse

    cli = argparse.ArgumentParser(description='Migrate Crawl4AI database to file-based storage')
    cli.add_argument('--db-path', help='Custom database path')
    db_path = cli.parse_args().db_path

    asyncio.run(run_migration(db_path))
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -2,11 +2,61 @@ from functools import lru_cache
|
||||
from pathlib import Path
|
||||
import subprocess, os
|
||||
import shutil
|
||||
from crawl4ai.config import MODEL_REPO_BRANCH
|
||||
import tarfile
|
||||
from .model_loader import *
|
||||
import argparse
|
||||
import urllib.request
|
||||
from crawl4ai.config import MODEL_REPO_BRANCH
|
||||
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
|
||||
@lru_cache()
|
||||
def get_available_memory(device):
|
||||
import torch
|
||||
if device.type == 'cuda':
|
||||
return torch.cuda.get_device_properties(device).total_memory
|
||||
elif device.type == 'mps':
|
||||
return 48 * 1024 ** 3 # Assuming 8GB for MPS, as a conservative estimate
|
||||
else:
|
||||
return 0
|
||||
|
||||
@lru_cache()
|
||||
def calculate_batch_size(device):
|
||||
available_memory = get_available_memory(device)
|
||||
|
||||
if device.type == 'cpu':
|
||||
return 16
|
||||
elif device.type in ['cuda', 'mps']:
|
||||
# Adjust these thresholds based on your model size and available memory
|
||||
if available_memory >= 31 * 1024 ** 3: # > 32GB
|
||||
return 256
|
||||
elif available_memory >= 15 * 1024 ** 3: # > 16GB to 32GB
|
||||
return 128
|
||||
elif available_memory >= 8 * 1024 ** 3: # 8GB to 16GB
|
||||
return 64
|
||||
else:
|
||||
return 32
|
||||
else:
|
||||
return 16 # Default batch size
|
||||
|
||||
@lru_cache()
|
||||
def get_device():
|
||||
import torch
|
||||
if torch.cuda.is_available():
|
||||
device = torch.device('cuda')
|
||||
elif torch.backends.mps.is_available():
|
||||
device = torch.device('mps')
|
||||
else:
|
||||
device = torch.device('cpu')
|
||||
return device
|
||||
|
||||
def set_model_device(model):
|
||||
device = get_device()
|
||||
model.to(device)
|
||||
return model, device
|
||||
|
||||
@lru_cache()
|
||||
def get_home_folder():
|
||||
home_folder = os.path.join(Path.home(), ".crawl4ai")
|
||||
home_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
|
||||
os.makedirs(home_folder, exist_ok=True)
|
||||
os.makedirs(f"{home_folder}/cache", exist_ok=True)
|
||||
os.makedirs(f"{home_folder}/models", exist_ok=True)
|
||||
@@ -17,25 +67,38 @@ def load_bert_base_uncased():
|
||||
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', resume_download=None)
|
||||
model = BertModel.from_pretrained('bert-base-uncased', resume_download=None)
|
||||
model.eval()
|
||||
model, device = set_model_device(model)
|
||||
return tokenizer, model
|
||||
|
||||
@lru_cache()
|
||||
def load_bge_small_en_v1_5():
|
||||
def load_HF_embedding_model(model_name="BAAI/bge-small-en-v1.5") -> tuple:
|
||||
"""Load the Hugging Face model for embedding.
|
||||
|
||||
Args:
|
||||
model_name (str, optional): The model name to load. Defaults to "BAAI/bge-small-en-v1.5".
|
||||
|
||||
Returns:
|
||||
tuple: The tokenizer and model.
|
||||
"""
|
||||
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
|
||||
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-small-en-v1.5', resume_download=None)
|
||||
model = AutoModel.from_pretrained('BAAI/bge-small-en-v1.5', resume_download=None)
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name, resume_download=None)
|
||||
model = AutoModel.from_pretrained(model_name, resume_download=None)
|
||||
model.eval()
|
||||
model, device = set_model_device(model)
|
||||
return tokenizer, model
|
||||
|
||||
@lru_cache()
|
||||
def load_text_classifier():
|
||||
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
||||
from transformers import pipeline
|
||||
import torch
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("dstefa/roberta-base_topic_classification_nyt_news")
|
||||
model = AutoModelForSequenceClassification.from_pretrained("dstefa/roberta-base_topic_classification_nyt_news")
|
||||
model.eval()
|
||||
model, device = set_model_device(model)
|
||||
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
|
||||
|
||||
return pipe
|
||||
|
||||
@lru_cache()
|
||||
@@ -45,21 +108,23 @@ def load_text_multilabel_classifier():
|
||||
from scipy.special import expit
|
||||
import torch
|
||||
|
||||
# # Check for available device: CUDA, MPS (for Apple Silicon), or CPU
|
||||
# if torch.cuda.is_available():
|
||||
# device = torch.device("cuda")
|
||||
# elif torch.backends.mps.is_available():
|
||||
# device = torch.device("mps")
|
||||
# else:
|
||||
# device = torch.device("cpu")
|
||||
# # return load_spacy_model(), torch.device("cpu")
|
||||
|
||||
|
||||
MODEL = "cardiffnlp/tweet-topic-21-multi"
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL, resume_download=None)
|
||||
model = AutoModelForSequenceClassification.from_pretrained(MODEL, resume_download=None)
|
||||
model.eval()
|
||||
model, device = set_model_device(model)
|
||||
class_mapping = model.config.id2label
|
||||
|
||||
# Check for available device: CUDA, MPS (for Apple Silicon), or CPU
|
||||
if torch.cuda.is_available():
|
||||
device = torch.device("cuda")
|
||||
elif torch.backends.mps.is_available():
|
||||
device = torch.device("mps")
|
||||
else:
|
||||
device = torch.device("cpu")
|
||||
|
||||
model.to(device)
|
||||
|
||||
def _classifier(texts, threshold=0.5, max_length=64):
|
||||
tokens = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
|
||||
tokens = {key: val.to(device) for key, val in tokens.items()} # Move tokens to the selected device
|
||||
@@ -78,7 +143,7 @@ def load_text_multilabel_classifier():
|
||||
|
||||
return batch_labels
|
||||
|
||||
return _classifier
|
||||
return _classifier, device
|
||||
|
||||
@lru_cache()
|
||||
def load_nltk_punkt():
|
||||
@@ -89,6 +154,67 @@ def load_nltk_punkt():
|
||||
nltk.download('punkt')
|
||||
return nltk.data.find('tokenizers/punkt')
|
||||
|
||||
@lru_cache()
|
||||
def load_spacy_model():
|
||||
import spacy
|
||||
name = "models/reuters"
|
||||
home_folder = get_home_folder()
|
||||
model_folder = Path(home_folder) / name
|
||||
|
||||
# Check if the model directory already exists
|
||||
if not (model_folder.exists() and any(model_folder.iterdir())):
|
||||
repo_url = "https://github.com/unclecode/crawl4ai.git"
|
||||
branch = MODEL_REPO_BRANCH
|
||||
repo_folder = Path(home_folder) / "crawl4ai"
|
||||
|
||||
print("[LOG] ⏬ Downloading Spacy model for the first time...")
|
||||
|
||||
# Remove existing repo folder if it exists
|
||||
if repo_folder.exists():
|
||||
try:
|
||||
shutil.rmtree(repo_folder)
|
||||
if model_folder.exists():
|
||||
shutil.rmtree(model_folder)
|
||||
except PermissionError:
|
||||
print("[WARNING] Unable to remove existing folders. Please manually delete the following folders and try again:")
|
||||
print(f"- {repo_folder}")
|
||||
print(f"- {model_folder}")
|
||||
return None
|
||||
|
||||
try:
|
||||
# Clone the repository
|
||||
subprocess.run(
|
||||
["git", "clone", "-b", branch, repo_url, str(repo_folder)],
|
||||
stdout=subprocess.DEVNULL,
|
||||
stderr=subprocess.DEVNULL,
|
||||
check=True
|
||||
)
|
||||
|
||||
# Create the models directory if it doesn't exist
|
||||
models_folder = Path(home_folder) / "models"
|
||||
models_folder.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Copy the reuters model folder to the models directory
|
||||
source_folder = repo_folder / "models" / "reuters"
|
||||
shutil.copytree(source_folder, model_folder)
|
||||
|
||||
# Remove the cloned repository
|
||||
shutil.rmtree(repo_folder)
|
||||
|
||||
print("[LOG] ✅ Spacy Model downloaded successfully")
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f"An error occurred while cloning the repository: {e}")
|
||||
return None
|
||||
except Exception as e:
|
||||
print(f"An error occurred: {e}")
|
||||
return None
|
||||
|
||||
try:
|
||||
return spacy.load(str(model_folder))
|
||||
except Exception as e:
|
||||
print(f"Error loading spacy model: {e}")
|
||||
return None
|
||||
|
||||
def download_all_models(remove_existing=False):
|
||||
"""Download all models required for Crawl4AI."""
|
||||
if remove_existing:
|
||||
@@ -104,12 +230,15 @@ def download_all_models(remove_existing=False):
|
||||
print("[LOG] Existing models removed.")
|
||||
|
||||
# Load each model to trigger download
|
||||
print("[LOG] Downloading BERT Base Uncased...")
|
||||
load_bert_base_uncased()
|
||||
print("[LOG] Downloading BGE Small EN v1.5...")
|
||||
load_bge_small_en_v1_5()
|
||||
# print("[LOG] Downloading BERT Base Uncased...")
|
||||
# load_bert_base_uncased()
|
||||
# print("[LOG] Downloading BGE Small EN v1.5...")
|
||||
# load_bge_small_en_v1_5()
|
||||
# print("[LOG] Downloading ONNX model...")
|
||||
# load_onnx_all_MiniLM_l6_v2()
|
||||
print("[LOG] Downloading text classifier...")
|
||||
load_text_multilabel_classifier
|
||||
_, device = load_text_multilabel_classifier()
|
||||
print(f"[LOG] Text classifier loaded on {device}")
|
||||
print("[LOG] Downloading custom NLTK Punkt model...")
|
||||
load_nltk_punkt()
|
||||
print("[LOG] ✅ All models downloaded successfully.")
|
||||
@@ -124,4 +253,4 @@ def main():
|
||||
download_all_models(remove_existing=args.remove_existing)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
|
||||
|
||||
@@ -1,16 +1,61 @@
|
||||
from pydantic import BaseModel, HttpUrl
|
||||
from typing import List
|
||||
from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
|
||||
from dataclasses import dataclass
|
||||
from .ssl_certificate import SSLCertificate
|
||||
|
||||
@dataclass
|
||||
class TokenUsage:
|
||||
completion_tokens: int = 0
|
||||
prompt_tokens: int = 0
|
||||
total_tokens: int = 0
|
||||
completion_tokens_details: Optional[dict] = None
|
||||
prompt_tokens_details: Optional[dict] = None
|
||||
|
||||
|
||||
class UrlModel(BaseModel):
|
||||
url: HttpUrl
|
||||
forced: bool = False
|
||||
|
||||
class MarkdownGenerationResult(BaseModel):
|
||||
raw_markdown: str
|
||||
markdown_with_citations: str
|
||||
references_markdown: str
|
||||
fit_markdown: Optional[str] = None
|
||||
fit_html: Optional[str] = None
|
||||
|
||||
class CrawlResult(BaseModel):
|
||||
url: str
|
||||
html: str
|
||||
success: bool
|
||||
cleaned_html: str = None
|
||||
markdown: str = None
|
||||
extracted_content: str = None
|
||||
metadata: dict = None
|
||||
error_message: str = None
|
||||
cleaned_html: Optional[str] = None
|
||||
media: Dict[str, List[Dict]] = {}
|
||||
links: Dict[str, List[Dict]] = {}
|
||||
downloaded_files: Optional[List[str]] = None
|
||||
screenshot: Optional[str] = None
|
||||
pdf : Optional[bytes] = None
|
||||
markdown: Optional[Union[str, MarkdownGenerationResult]] = None
|
||||
markdown_v2: Optional[MarkdownGenerationResult] = None
|
||||
fit_markdown: Optional[str] = None
|
||||
fit_html: Optional[str] = None
|
||||
extracted_content: Optional[str] = None
|
||||
metadata: Optional[dict] = None
|
||||
error_message: Optional[str] = None
|
||||
session_id: Optional[str] = None
|
||||
response_headers: Optional[dict] = None
|
||||
status_code: Optional[int] = None
|
||||
ssl_certificate: Optional[SSLCertificate] = None
|
||||
class Config:
|
||||
arbitrary_types_allowed = True
|
||||
|
||||
class AsyncCrawlResponse(BaseModel):
|
||||
html: str
|
||||
response_headers: Dict[str, str]
|
||||
status_code: int
|
||||
screenshot: Optional[str] = None
|
||||
pdf_data: Optional[bytes] = None
|
||||
get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None
|
||||
downloaded_files: Optional[List[str]] = None
|
||||
ssl_certificate: Optional[SSLCertificate] = None
|
||||
|
||||
class Config:
|
||||
arbitrary_types_allowed = True
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
PROMPT_EXTRACT_BLOCKS = """YHere is the URL of the webpage:
|
||||
PROMPT_EXTRACT_BLOCKS = """Here is the URL of the webpage:
|
||||
<url>{URL}</url>
|
||||
|
||||
And here is the cleaned HTML content of that webpage:
|
||||
@@ -29,7 +29,7 @@ To generate the JSON objects:
|
||||
|
||||
5. Make sure the generated JSON is complete and parsable, with no errors or omissions.
|
||||
|
||||
6. Make sur to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
|
||||
6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
|
||||
|
||||
Please provide your output within <blocks> tags, like this:
|
||||
|
||||
@@ -79,7 +79,7 @@ To generate the JSON objects:
|
||||
2. For each block:
|
||||
a. Assign it an index based on its order in the content.
|
||||
b. Analyze the content and generate ONE semantic tag that describe what the block is about.
|
||||
c. Extract the text content, EXACTLY SAME AS GIVE DATA, clean it up if needed, and store it as a list of strings in the "content" field.
|
||||
c. Extract the text content, EXACTLY SAME AS THE GIVE DATA, clean it up if needed, and store it as a list of strings in the "content" field.
|
||||
|
||||
3. Ensure that the order of the JSON objects matches the order of the blocks as they appear in the original HTML content.
|
||||
|
||||
@@ -87,7 +87,7 @@ To generate the JSON objects:
|
||||
|
||||
5. Make sure the generated JSON is complete and parsable, with no errors or omissions.
|
||||
|
||||
6. Make sur to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
|
||||
6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
|
||||
|
||||
7. Never alter the extracted content, just copy and paste it as it is.
|
||||
|
||||
@@ -142,7 +142,7 @@ To generate the JSON objects:
|
||||
|
||||
5. Make sure the generated JSON is complete and parsable, with no errors or omissions.
|
||||
|
||||
6. Make sur to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
|
||||
6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
|
||||
|
||||
7. Never alter the extracted content, just copy and paste it as it is.
|
||||
|
||||
@@ -164,4 +164,41 @@ Please provide your output within <blocks> tags, like this:
|
||||
|
||||
**Make sure to follow the user instruction to extract blocks aligin with the instruction.**
|
||||
|
||||
Remember, the output should be a complete, parsable JSON wrapped in <blocks> tags, with no omissions or errors. The JSON objects should semantically break down the content into relevant blocks, maintaining the original order."""
|
||||
Remember, the output should be a complete, parsable JSON wrapped in <blocks> tags, with no omissions or errors. The JSON objects should semantically break down the content into relevant blocks, maintaining the original order."""
|
||||
|
||||
# Prompt template for schema-guided extraction. The placeholders {URL},
# {HTML}, {REQUEST} and {SCHEMA} are substituted by the caller before the
# prompt is sent to the model.
PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION = """Here is the content from the URL:
<url>{URL}</url>

<url_content>
{HTML}
</url_content>

The user has made the following request for what information to extract from the above content:

<user_request>
{REQUEST}
</user_request>

<schema_block>
{SCHEMA}
</schema_block>

Please carefully read the URL content and the user's request. If the user provided a desired JSON schema in the <schema_block> above, extract the requested information from the URL content according to that schema. If no schema was provided, infer an appropriate JSON schema based on the user's request that will best capture the key information they are looking for.

Extraction instructions:
Return the extracted information as a list of JSON objects, with each object in the list corresponding to a block of content from the URL, in the same order as it appears on the page. Wrap the entire JSON list in <blocks>...</blocks> XML tags.

Quality Reflection:
Before outputting your final answer, double check that the JSON you are returning is complete, containing all the information requested by the user, and is valid JSON that could be parsed by json.loads() with no errors or omissions. The outputted JSON objects should fully match the schema, either provided or inferred.

Quality Score:
After reflecting, score the quality and completeness of the JSON data you are about to return on a scale of 1 to 5. Write the score inside <score> tags.

Avoid Common Mistakes:
- Do NOT add any comments using "//" or "#" in the JSON output. It causes parsing errors.
- Make sure the JSON is properly formatted with curly braces, square brackets, and commas in the right places.
- Do not miss closing </blocks> tag at the end of the JSON output.
- Do not generate Python code that shows how to do the task; your task is to extract the information and return it in JSON format.

Result
Output the final list of JSON objects, wrapped in <blocks>...</blocks> XML tags. Make sure to close the tag properly."""
|
||||
|
||||
181
crawl4ai/ssl_certificate.py
Normal file
@@ -0,0 +1,181 @@
|
||||
"""SSL Certificate class for handling certificate operations."""
|
||||
|
||||
import ssl
|
||||
import socket
|
||||
import base64
|
||||
import json
|
||||
from typing import Dict, Any, Optional
|
||||
from urllib.parse import urlparse
|
||||
import OpenSSL.crypto
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class SSLCertificate:
    """
    A class representing an SSL certificate with methods to export in various formats.

    Attributes:
        _cert_info (Dict[str, Any]): The certificate information, with every
            ``bytes`` key/value decoded to ``str``.

    Methods:
        from_url(url: str, timeout: int = 10) -> Optional['SSLCertificate']: Create SSLCertificate instance from a URL.
        to_json(filepath: Optional[str] = None) -> Optional[str]: Export the certificate information as JSON.
        to_pem(filepath: Optional[str] = None) -> Optional[str]: Export the certificate as PEM.
        to_der(filepath: Optional[str] = None) -> Optional[bytes]: Export the certificate as DER.

    Properties:
        issuer, subject, valid_from, valid_until, fingerprint
    """

    def __init__(self, cert_info: Dict[str, Any]):
        self._cert_info = self._decode_cert_data(cert_info)

    @staticmethod
    def _hostname_from_url(url: str) -> Optional[str]:
        """Return the bare host of ``url`` (no userinfo, port or IPv6 brackets)."""
        # ``netloc.split(':')[0]`` would return the user name for
        # ``user:pw@host`` and ``[`` fragments for IPv6 literals.
        return urlparse(url).hostname

    @staticmethod
    def from_url(url: str, timeout: int = 10) -> Optional['SSLCertificate']:
        """
        Create SSLCertificate instance from a URL.

        The connection is always made to port 443 of the URL's host,
        regardless of any port present in the URL.

        Args:
            url (str): URL of the website.
            timeout (int): Timeout for the connection (default: 10).

        Returns:
            Optional[SSLCertificate]: SSLCertificate instance if successful, None otherwise.
        """
        try:
            hostname = SSLCertificate._hostname_from_url(url)
            if not hostname:
                return None

            context = ssl.create_default_context()
            with socket.create_connection((hostname, 443), timeout=timeout) as sock:
                with context.wrap_socket(sock, server_hostname=hostname) as ssock:
                    cert_binary = ssock.getpeercert(binary_form=True)

            x509 = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_ASN1, cert_binary)

            cert_info = {
                "subject": dict(x509.get_subject().get_components()),
                "issuer": dict(x509.get_issuer().get_components()),
                "version": x509.get_version(),
                "serial_number": hex(x509.get_serial_number()),
                "not_before": x509.get_notBefore(),
                "not_after": x509.get_notAfter(),
                "fingerprint": x509.digest("sha256").hex(),
                "signature_algorithm": x509.get_signature_algorithm(),
                # Kept base64-encoded so the whole dict stays JSON-serialisable.
                "raw_cert": base64.b64encode(cert_binary),
            }

            # Add extensions
            extensions = []
            for i in range(x509.get_extension_count()):
                ext = x509.get_extension(i)
                extensions.append({
                    "name": ext.get_short_name(),
                    "value": str(ext),
                })
            cert_info["extensions"] = extensions

            return SSLCertificate(cert_info)

        except Exception:
            # Best effort: any network, TLS or parsing failure yields None.
            return None

    @staticmethod
    def _decode_cert_data(data: Any) -> Any:
        """Helper method to recursively decode bytes in certificate data."""
        if isinstance(data, bytes):
            return data.decode('utf-8')
        elif isinstance(data, dict):
            return {
                (k.decode('utf-8') if isinstance(k, bytes) else k): SSLCertificate._decode_cert_data(v)
                for k, v in data.items()
            }
        elif isinstance(data, list):
            return [SSLCertificate._decode_cert_data(item) for item in data]
        return data

    def to_json(self, filepath: Optional[str] = None) -> Optional[str]:
        """
        Export certificate as JSON.

        Args:
            filepath (Optional[str]): Path to save the JSON file (default: None).

        Returns:
            Optional[str]: The JSON string when no ``filepath`` is given;
            None when the JSON was written to ``filepath`` instead.
        """
        json_str = json.dumps(self._cert_info, indent=2, ensure_ascii=False)
        if filepath:
            Path(filepath).write_text(json_str, encoding='utf-8')
            return None
        return json_str

    def to_pem(self, filepath: Optional[str] = None) -> Optional[str]:
        """
        Export certificate as PEM.

        Args:
            filepath (Optional[str]): Path to save the PEM file (default: None).

        Returns:
            Optional[str]: The PEM string when no ``filepath`` is given;
            None when it was written to ``filepath`` or when conversion failed.
        """
        try:
            x509 = OpenSSL.crypto.load_certificate(
                OpenSSL.crypto.FILETYPE_ASN1,
                base64.b64decode(self._cert_info['raw_cert'])
            )
            pem_data = OpenSSL.crypto.dump_certificate(
                OpenSSL.crypto.FILETYPE_PEM,
                x509
            ).decode('utf-8')

            if filepath:
                Path(filepath).write_text(pem_data, encoding='utf-8')
                return None
            return pem_data
        except Exception:
            return None

    def to_der(self, filepath: Optional[str] = None) -> Optional[bytes]:
        """
        Export certificate as DER.

        Args:
            filepath (Optional[str]): Path to save the DER file (default: None).

        Returns:
            Optional[bytes]: The DER bytes when no ``filepath`` is given;
            None when they were written to ``filepath`` or when decoding failed.
        """
        try:
            der_data = base64.b64decode(self._cert_info['raw_cert'])
            if filepath:
                Path(filepath).write_bytes(der_data)
                return None
            return der_data
        except Exception:
            return None

    @property
    def issuer(self) -> Dict[str, str]:
        """Get certificate issuer information."""
        return self._cert_info.get('issuer', {})

    @property
    def subject(self) -> Dict[str, str]:
        """Get certificate subject information."""
        return self._cert_info.get('subject', {})

    @property
    def valid_from(self) -> str:
        """Get certificate validity start date."""
        return self._cert_info.get('not_before', '')

    @property
    def valid_until(self) -> str:
        """Get certificate validity end date."""
        return self._cert_info.get('not_after', '')

    @property
    def fingerprint(self) -> str:
        """Get certificate fingerprint."""
        return self._cert_info.get('fingerprint', '')
|
||||
@@ -1,146 +0,0 @@
|
||||
import spacy
|
||||
from spacy.training import Example
|
||||
import random
|
||||
import nltk
|
||||
from nltk.corpus import reuters
|
||||
import torch
|
||||
|
||||
def save_spacy_model_as_torch(nlp, model_dir="models/reuters"):
    """Export the text categorizer's weights and the pipeline vocabulary.

    Writes ``model_weights.pth`` (a ``torch.save`` of a name -> tensor dict)
    and ``vocab.txt`` (one ``word<TAB>index`` pair per line) into
    ``model_dir``, which must already exist.

    Args:
        nlp: A loaded spaCy pipeline containing a ``textcat_multilabel`` pipe.
        model_dir: Directory that receives both output files.
    """
    # Extract the TextCategorizer component
    textcat = nlp.get_pipe("textcat_multilabel")

    # Convert the weights to a PyTorch state dictionary
    state_dict = {name: torch.tensor(param.data) for name, param in textcat.model.named_parameters()}

    # Save the state dictionary
    torch.save(state_dict, f"{model_dir}/model_weights.pth")

    # Extract and save the vocabulary. An explicit UTF-8 encoding keeps
    # non-ASCII tokens from failing on platforms with a narrower default.
    vocab = extract_vocab(nlp)
    with open(f"{model_dir}/vocab.txt", "w", encoding="utf-8") as vocab_file:
        vocab_file.writelines(f"{word}\t{idx}\n" for word, idx in vocab.items())

    print(f"Model weights and vocabulary saved to: {model_dir}")
|
||||
|
||||
def extract_vocab(nlp):
    """Map every string in ``nlp.vocab.strings`` to its enumeration index.

    If a string occurs more than once, the index of its last occurrence wins.
    """
    index_by_token = {}
    for position, token in enumerate(nlp.vocab.strings):
        index_by_token[token] = position
    return index_by_token
|
||||
|
||||
# NOTE(review): these statements appear to sit at module level, so they would
# run on import and before the ``__main__`` guard further down trains anything;
# "models/reuters" must therefore already exist on disk — confirm intended.
nlp = spacy.load("models/reuters")
save_spacy_model_as_torch(nlp, model_dir="models")
|
||||
|
||||
def train_and_save_reuters_model(model_dir="models/reuters"):
    """Train a multi-label text categorizer on the NLTK Reuters corpus and save it.

    Builds a blank English pipeline with a ``textcat_multilabel`` pipe, trains
    it for 5 passes over the corpus in minibatches of 8 with dropout 0.2, and
    writes the pipeline to ``model_dir``.

    Args:
        model_dir: Directory the trained pipeline is saved to.
    """
    # Ensure the Reuters corpus is downloaded
    nltk.download('reuters')
    nltk.download('punkt')
    if not reuters.fileids():
        print("Reuters corpus not found.")
        return

    # Load a blank English spaCy model
    nlp = spacy.blank("en")

    # Create a TextCategorizer with the ensemble model for multi-label classification
    textcat = nlp.add_pipe("textcat_multilabel")

    # The full label list does not change per document, so fetch it once
    # instead of once per corpus file.
    all_labels = reuters.categories()
    for label in all_labels:
        textcat.add_label(label)

    # Prepare training data
    train_examples = []
    for fileid in reuters.fileids():
        doc_labels = set(reuters.categories(fileid))
        cats = {label: label in doc_labels for label in all_labels}
        doc = nlp.make_doc(reuters.raw(fileid))
        train_examples.append(Example.from_dict(doc, {'cats': cats}))

    # Initialize the text categorizer with the example objects
    nlp.initialize(lambda: train_examples)

    # Train the model
    random.seed(1)
    spacy.util.fix_random_seed(1)
    for i in range(5):  # Adjust iterations for better accuracy
        random.shuffle(train_examples)
        losses = {}
        # Create batches of data
        for batch in spacy.util.minibatch(train_examples, size=8):
            nlp.update(batch, drop=0.2, losses=losses)
        print(f"Losses at iteration {i}: {losses}")

    # Save the trained model
    nlp.to_disk(model_dir)
    print(f"Model saved to: {model_dir}")
|
||||
|
||||
def train_model(model_dir, additional_epochs=0):
    """Train (or continue training) the Reuters text categorizer in ``model_dir``.

    Loads the pipeline from ``model_dir`` when it exists; otherwise builds a
    blank English pipeline with a ``textcat_multilabel`` pipe. Runs
    ``5 + additional_epochs`` passes over the corpus and saves the result
    back to ``model_dir``.

    Args:
        model_dir: Directory to load the pipeline from and save it to.
        additional_epochs: Extra passes on top of the base 5.
    """
    # Load the model if it exists, otherwise start with a blank model
    try:
        nlp = spacy.load(model_dir)
    except IOError:
        print("No existing model found. Starting with a new model.")
        nlp = spacy.blank("en")
        textcat = nlp.add_pipe("textcat_multilabel")
        for label in reuters.categories():
            textcat.add_label(label)
        # A freshly built pipeline must always be initialized. Checking
        # ``pipe_names`` afterwards cannot detect this case, because the
        # pipe has just been added above.
        needs_initialization = True
    else:
        print("Model loaded from disk.")
        needs_initialization = 'textcat_multilabel' not in nlp.pipe_names

    # Prepare training data (label list fetched once, not once per document)
    all_labels = reuters.categories()
    train_examples = []
    for fileid in reuters.fileids():
        doc_labels = set(reuters.categories(fileid))
        cats = {label: label in doc_labels for label in all_labels}
        doc = nlp.make_doc(reuters.raw(fileid))
        train_examples.append(Example.from_dict(doc, {'cats': cats}))

    # Initialize the model if it was newly created
    if needs_initialization:
        nlp.initialize(lambda: train_examples)
    else:
        print("Continuing training with existing model.")

    # Train the model
    random.seed(1)
    spacy.util.fix_random_seed(1)
    num_epochs = 5 + additional_epochs
    for i in range(num_epochs):
        random.shuffle(train_examples)
        losses = {}
        for batch in spacy.util.minibatch(train_examples, size=8):
            nlp.update(batch, drop=0.2, losses=losses)
        print(f"Losses at iteration {i}: {losses}")

    # Save the trained model
    nlp.to_disk(model_dir)
    print(f"Model saved to: {model_dir}")
|
||||
|
||||
def load_model_and_predict(model_dir, text, tok_k = 3):
    """Load the pipeline in ``model_dir`` and return the best-scoring categories.

    Args:
        model_dir: Directory containing a saved spaCy pipeline.
        text: Text to classify.
        tok_k: How many of the highest-scoring categories to return.

    Returns:
        A list of ``(label, score)`` pairs, highest score first.
    """
    pipeline = spacy.load(model_dir)
    scored_labels = list(pipeline(text).cats.items())

    # Highest score first; ties keep their original relative order.
    scored_labels.sort(key=lambda pair: pair[1], reverse=True)
    best = scored_labels[:tok_k]

    print(f"Top {tok_k} categories:")
    return best
|
||||
|
||||
if __name__ == "__main__":
    # Train from scratch, then continue training the saved model
    # (5 base passes + 5 additional ones inside train_model).
    train_and_save_reuters_model()
    train_model("models/reuters", additional_epochs=5)
    # NOTE(review): prediction loads "reuters_model_10", not the
    # "models/reuters" directory written by the two calls above — confirm.
    model_directory = "reuters_model_10"
    print(reuters.categories())
    example_text = "Apple Inc. is reportedly buying a startup for $1 billion"
    r =load_model_and_predict(model_directory, example_text)
    print(r)
|
||||
305
crawl4ai/user_agent_generator.py
Normal file
@@ -0,0 +1,305 @@
|
||||
import random
|
||||
from typing import Optional, Literal, List, Dict, Tuple
|
||||
import re
|
||||
|
||||
|
||||
class UserAgentGenerator:
    """
    Generate random user agents with specified constraints.

    Attributes:
        desktop_platforms (dict): OS name -> {variant name: platform token} for desktop systems.
        mobile_platforms (dict): OS name -> {device name: platform token} for mobile systems.
        browser_combinations (dict): Number of tokens -> list of allowed combinations;
            entries are browser names or the engine names "gecko" / "webkit".
        rendering_engines (dict): Engine name -> engine token (a list of tokens for "gecko").
        chrome_versions (list): Possible Chrome version tokens.
        firefox_versions (list): Possible Firefox version tokens.
        edge_versions (list): Possible Edge version tokens.
        safari_versions (list): Possible Safari version tokens.

    Methods:
        get_browser_stack(num_browsers=1, browser_type=None): Pick a combination of version tokens.
        generate(device_type=None, os_type=None, device_brand=None, browser_type=None, num_browsers=3):
            Build a complete user agent string.
        generate_with_client_hints(**kwargs): Build a user agent plus a matching Sec-CH-UA value.
        get_random_platform(device_type, os_type, device_brand): Pick the platform token.
        parse_user_agent(user_agent): Extract browser major versions from a user agent.
        generate_client_hints(user_agent): Build a Sec-CH-UA value for a user agent.
    """

    def __init__(self):
        self.desktop_platforms = {
            "windows": {
                "10_64": "(Windows NT 10.0; Win64; x64)",
                "10_32": "(Windows NT 10.0; WOW64)",
            },
            "macos": {
                "intel": "(Macintosh; Intel Mac OS X 10_15_7)",
                "newer": "(Macintosh; Intel Mac OS X 10.15; rv:109.0)",
            },
            "linux": {
                "generic": "(X11; Linux x86_64)",
                "ubuntu": "(X11; Ubuntu; Linux x86_64)",
                "chrome_os": "(X11; CrOS x86_64 14541.0.0)",
            }
        }

        self.mobile_platforms = {
            "android": {
                "samsung": "(Linux; Android 13; SM-S901B)",
                "pixel": "(Linux; Android 12; Pixel 6)",
                "oneplus": "(Linux; Android 13; OnePlus 9 Pro)",
                "xiaomi": "(Linux; Android 12; M2102J20SG)",
            },
            "ios": {
                "iphone": "(iPhone; CPU iPhone OS 16_5 like Mac OS X)",
                "ipad": "(iPad; CPU OS 16_5 like Mac OS X)",
            }
        }

        # Browser Combinations
        self.browser_combinations = {
            1: [
                ["chrome"],
                ["firefox"],
                ["safari"],
                ["edge"]
            ],
            2: [
                ["gecko", "firefox"],
                ["chrome", "safari"],
                ["webkit", "safari"]
            ],
            3: [
                ["chrome", "safari", "edge"],
                ["webkit", "chrome", "safari"]
            ]
        }

        # Rendering Engines with versions
        self.rendering_engines = {
            "chrome_webkit": "AppleWebKit/537.36",
            "safari_webkit": "AppleWebKit/605.1.15",
            # Firefox uses this one constant token. Kept as a list so callers
            # can still extend it; the truncated "Gecko/2010010" entry is gone.
            "gecko": [
                "Gecko/20100101",
            ]
        }

        # Browser Versions
        self.chrome_versions = [
            "Chrome/119.0.6045.199",
            "Chrome/118.0.5993.117",
            "Chrome/117.0.5938.149",
            "Chrome/116.0.5845.187",
            "Chrome/115.0.5790.171",
        ]

        self.edge_versions = [
            "Edg/119.0.2151.97",
            "Edg/118.0.2088.76",
            "Edg/117.0.2045.47",
            "Edg/116.0.1938.81",
            "Edg/115.0.1901.203",
        ]

        self.safari_versions = [
            "Safari/537.36",  # For Chrome-based
            "Safari/605.1.15",
            "Safari/604.1",
            "Safari/602.1",
            "Safari/601.5.17",
        ]

        self.firefox_versions = [
            "Firefox/119.0",
            "Firefox/118.0.2",
            "Firefox/117.0.1",
            "Firefox/116.0",
            "Firefox/115.0.3",
            "Firefox/114.0.2",
            "Firefox/113.0.1",
            "Firefox/112.0",
            "Firefox/111.0.1",
            "Firefox/110.0",
        ]

    def get_browser_stack(self, num_browsers: int = 1, browser_type: Optional[str] = None) -> List[str]:
        """
        Get a valid combination of browser versions.

        How it works:
        1. Check if the number of browsers is supported.
        2. Randomly choose a combination of browsers, preferring combinations
           that contain ``browser_type`` when one is given.
        3. Iterate through the combination and add browser versions.
        4. Return the browser stack.

        Args:
            num_browsers: Number of browser specifications (1-3)
            browser_type: Optional browser name the combination should contain.
                If no combination of the requested size contains it, the
                constraint is ignored rather than raising.

        Returns:
            List[str]: A list of browser versions.

        Raises:
            ValueError: If ``num_browsers`` has no configured combinations.
        """
        if num_browsers not in self.browser_combinations:
            raise ValueError(f"Unsupported number of browsers: {num_browsers}")

        candidates = self.browser_combinations[num_browsers]
        if browser_type:
            matching = [combo for combo in candidates if browser_type in combo]
            # Best effort: fall back to every combination when none match.
            candidates = matching or candidates

        combination = random.choice(candidates)
        version_pools = {
            "chrome": self.chrome_versions,
            "firefox": self.firefox_versions,
            "safari": self.safari_versions,
            "edge": self.edge_versions,
            "gecko": self.rendering_engines["gecko"],
        }

        browser_stack = []
        for browser in combination:
            if browser == "webkit":
                browser_stack.append(self.rendering_engines["chrome_webkit"])
            elif browser in version_pools:
                browser_stack.append(random.choice(version_pools[browser]))

        return browser_stack

    def generate(self,
                 device_type: Optional[Literal['desktop', 'mobile']] = None,
                 os_type: Optional[str] = None,
                 device_brand: Optional[str] = None,
                 browser_type: Optional[Literal['chrome', 'edge', 'safari', 'firefox']] = None,
                 num_browsers: int = 3) -> str:
        """
        Generate a random user agent with specified constraints.

        Args:
            device_type: 'desktop' or 'mobile'
            os_type: 'windows', 'macos', 'linux', 'android', 'ios'
            device_brand: Specific device brand
            browser_type: 'chrome', 'edge', 'safari', or 'firefox'; honoured
                when a combination of ``num_browsers`` tokens contains it.
            num_browsers: Number of browser specifications (1-3)

        Returns:
            str: The space-joined user agent string.
        """
        # Get platform string
        platform = self.get_random_platform(device_type, os_type, device_brand)

        # Start with Mozilla
        components = ["Mozilla/5.0", platform]

        # Add browser stack
        browser_stack = self.get_browser_stack(num_browsers, browser_type)

        # Add appropriate legacy token based on browser stack
        if any("Firefox" in token for token in browser_stack):
            gecko_tokens = self.rendering_engines["gecko"]
            # Reuse the Gecko token already drawn for the stack, if any, so
            # the engine appears exactly once in the result.
            drawn = [token for token in browser_stack if token in gecko_tokens]
            components.append(drawn[0] if drawn else random.choice(gecko_tokens))
        elif any("Chrome" in token or "Safari" in token for token in browser_stack):
            components.append(self.rendering_engines["chrome_webkit"])
            components.append("(KHTML, like Gecko)")

        # Add browser versions, skipping engine tokens emitted just above
        remaining = [token for token in browser_stack if token not in components]
        components.extend(remaining)

        return " ".join(components)

    def generate_with_client_hints(self, **kwargs) -> Tuple[str, str]:
        """Generate both user agent and matching client hints"""
        user_agent = self.generate(**kwargs)
        client_hints = self.generate_client_hints(user_agent)
        return user_agent, client_hints

    def get_random_platform(self, device_type, os_type, device_brand):
        """Helper method to get random platform based on constraints"""
        if device_type == 'desktop':
            platforms = self.desktop_platforms
        elif device_type == 'mobile':
            platforms = self.mobile_platforms
        else:
            platforms = {**self.desktop_platforms, **self.mobile_platforms}

        # A recognised os_type overrides the device_type selection entirely.
        if os_type:
            for platform_group in (self.desktop_platforms, self.mobile_platforms):
                if os_type in platform_group:
                    platforms = {os_type: platform_group[os_type]}
                    break

        os_key = random.choice(list(platforms.keys()))
        variants = platforms[os_key]
        if device_brand and device_brand in variants:
            return variants[device_brand]
        return random.choice(list(variants.values()))

    def parse_user_agent(self, user_agent: str) -> Dict[str, str]:
        """Parse a user agent string to extract browser and version information"""
        browsers = {
            'chrome': r'Chrome/(\d+)',
            'edge': r'Edg/(\d+)',
            'safari': r'Version/(\d+)',
            'firefox': r'Firefox/(\d+)'
        }

        result = {}
        for browser, pattern in browsers.items():
            match = re.search(pattern, user_agent)
            if match:
                result[browser] = match.group(1)

        return result

    def generate_client_hints(self, user_agent: str) -> str:
        """Generate Sec-CH-UA header value based on user agent string"""
        browsers = self.parse_user_agent(user_agent)

        # Client hints components
        hints = []

        # Handle different browser combinations
        if 'chrome' in browsers:
            hints.append(f'"Chromium";v="{browsers["chrome"]}"')
            hints.append('"Not_A Brand";v="8"')

            if 'edge' in browsers:
                hints.append(f'"Microsoft Edge";v="{browsers["edge"]}"')
            else:
                hints.append(f'"Google Chrome";v="{browsers["chrome"]}"')

        elif 'firefox' in browsers:
            # Firefox doesn't typically send Sec-CH-UA
            return '""'

        elif 'safari' in browsers:
            # Safari's format for client hints
            hints.append(f'"Safari";v="{browsers["safari"]}"')
            hints.append('"Not_A Brand";v="8"')

        return ', '.join(hints)
|
||||
|
||||
# Example usage:
|
||||
if __name__ == "__main__":
    # Demo: print one unconstrained user agent, then a few labelled variants.
    generator = UserAgentGenerator()
    print(generator.generate())

    labelled_cases = [
        ("\nSingle browser (Chrome):", {"num_browsers": 1, "browser_type": 'chrome'}),
        ("\nTwo browsers (Gecko/Firefox):", {"num_browsers": 2}),
        ("\nThree browsers (Chrome/Safari/Edge):", {"num_browsers": 3}),
        (
            "\nFirefox on Linux:",
            {"device_type": 'desktop', "os_type": 'linux', "browser_type": 'firefox', "num_browsers": 2},
        ),
        (
            "\nChrome/Safari/Edge on Windows:",
            {"device_type": 'desktop', "os_type": 'windows', "num_browsers": 3},
        ),
    ]
    for heading, options in labelled_cases:
        print(heading)
        print(generator.generate(**options))
|
||||
1241
crawl4ai/utils.py
0
crawl4ai/utils.scraping.py
Normal file
30
crawl4ai/version_manager.py
Normal file
@@ -0,0 +1,30 @@
|
||||
# version_manager.py
|
||||
import os
|
||||
from pathlib import Path
|
||||
from packaging import version
|
||||
from . import __version__
|
||||
|
||||
class VersionManager:
    """Track the library version recorded in the user's ``.crawl4ai`` directory."""

    def __init__(self):
        self.home_dir = Path.home() / ".crawl4ai"
        self.version_file = self.home_dir / "version.txt"

    def get_installed_version(self):
        """Get the version recorded in home directory.

        Returns:
            The parsed version, or None when the file is missing, unreadable
            or does not contain a parsable version.
        """
        if not self.version_file.exists():
            return None
        try:
            return version.parse(self.version_file.read_text().strip())
        except Exception:
            # Best effort, but let KeyboardInterrupt / SystemExit propagate
            # (a bare ``except:`` would swallow those too).
            return None

    def update_version(self):
        """Update the version file to current library version"""
        # The directory may not exist yet on a fresh installation.
        self.home_dir.mkdir(parents=True, exist_ok=True)
        self.version_file.write_text(__version__.__version__)

    def needs_update(self):
        """Check if database needs update based on version"""
        installed = self.get_installed_version()
        current = version.parse(__version__.__version__)
        return installed is None or installed < current
|
||||
|
||||
@@ -10,47 +10,35 @@ from .extraction_strategy import *
|
||||
from .crawler_strategy import *
|
||||
from typing import List
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from .content_scraping_strategy import WebScrapingStrategy
|
||||
from .config import *
|
||||
import warnings
|
||||
import json
|
||||
warnings.filterwarnings("ignore", message='Field "model_name" has conflict with protected namespace "model_".')
|
||||
|
||||
|
||||
class WebCrawler:
|
||||
def __init__(
|
||||
self,
|
||||
# db_path: str = None,
|
||||
crawler_strategy: CrawlerStrategy = None,
|
||||
always_by_pass_cache: bool = False,
|
||||
):
|
||||
# self.db_path = db_path
|
||||
self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy()
|
||||
def __init__(self, crawler_strategy: CrawlerStrategy = None, always_by_pass_cache: bool = False, verbose: bool = False):
|
||||
self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose)
|
||||
self.always_by_pass_cache = always_by_pass_cache
|
||||
|
||||
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
|
||||
self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
|
||||
self.crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
|
||||
os.makedirs(self.crawl4ai_folder, exist_ok=True)
|
||||
os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
|
||||
|
||||
# If db_path is not provided, use the default path
|
||||
# if not db_path:
|
||||
# self.db_path = f"{self.crawl4ai_folder}/crawl4ai.db"
|
||||
|
||||
# flush_db()
|
||||
init_db()
|
||||
|
||||
self.ready = False
|
||||
|
||||
def warmup(self):
|
||||
print("[LOG] 🌤️ Warming up the WebCrawler")
|
||||
result = self.run(
|
||||
url='https://crawl4ai.uccode.io/',
|
||||
self.run(
|
||||
url='https://google.com/',
|
||||
word_count_threshold=5,
|
||||
extraction_strategy= NoExtractionStrategy(),
|
||||
extraction_strategy=NoExtractionStrategy(),
|
||||
bypass_cache=False,
|
||||
verbose = False
|
||||
verbose=False
|
||||
)
|
||||
self.ready = True
|
||||
print("[LOG] 🌞 WebCrawler is ready to crawl")
|
||||
|
||||
|
||||
def fetch_page(
|
||||
self,
|
||||
url_model: UrlModel,
|
||||
@@ -58,6 +46,8 @@ class WebCrawler:
|
||||
api_token: str = None,
|
||||
extract_blocks_flag: bool = True,
|
||||
word_count_threshold=MIN_WORD_THRESHOLD,
|
||||
css_selector: str = None,
|
||||
screenshot: bool = False,
|
||||
use_cached_html: bool = False,
|
||||
extraction_strategy: ExtractionStrategy = None,
|
||||
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
||||
@@ -69,111 +59,12 @@ class WebCrawler:
|
||||
extraction_strategy or NoExtractionStrategy(),
|
||||
chunking_strategy,
|
||||
bypass_cache=url_model.forced,
|
||||
css_selector=css_selector,
|
||||
screenshot=screenshot,
|
||||
**kwargs,
|
||||
)
|
||||
pass
|
||||
|
||||
|
||||
def run(
|
||||
self,
|
||||
url: str,
|
||||
word_count_threshold=MIN_WORD_THRESHOLD,
|
||||
extraction_strategy: ExtractionStrategy = None,
|
||||
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
||||
bypass_cache: bool = False,
|
||||
css_selector: str = None,
|
||||
verbose=True,
|
||||
**kwargs,
|
||||
) -> CrawlResult:
|
||||
extraction_strategy = extraction_strategy or NoExtractionStrategy()
|
||||
extraction_strategy.verbose = verbose
|
||||
# Check if extraction strategy is an instance of ExtractionStrategy if not raise an error
|
||||
if not isinstance(extraction_strategy, ExtractionStrategy):
|
||||
raise ValueError("Unsupported extraction strategy")
|
||||
if not isinstance(chunking_strategy, ChunkingStrategy):
|
||||
raise ValueError("Unsupported chunking strategy")
|
||||
|
||||
# make sure word_count_threshold is not lesser than MIN_WORD_THRESHOLD
|
||||
if word_count_threshold < MIN_WORD_THRESHOLD:
|
||||
word_count_threshold = MIN_WORD_THRESHOLD
|
||||
|
||||
# Check cache first
|
||||
if not bypass_cache and not self.always_by_pass_cache:
|
||||
cached = get_cached_url(url)
|
||||
if cached:
|
||||
return CrawlResult(
|
||||
**{
|
||||
"url": cached[0],
|
||||
"html": cached[1],
|
||||
"cleaned_html": cached[2],
|
||||
"markdown": cached[3],
|
||||
"extracted_content": cached[4],
|
||||
"success": cached[5],
|
||||
"error_message": "",
|
||||
}
|
||||
)
|
||||
|
||||
# Initialize WebDriver for crawling
|
||||
t = time.time()
|
||||
html = self.crawler_strategy.crawl(url)
|
||||
success = True
|
||||
error_message = ""
|
||||
# Extract content from HTML
|
||||
try:
|
||||
result = get_content_of_website(html, word_count_threshold, css_selector=css_selector)
|
||||
if result is None:
|
||||
raise ValueError(f"Failed to extract content from the website: {url}")
|
||||
except InvalidCSSSelectorError as e:
|
||||
raise ValueError(str(e))
|
||||
|
||||
cleaned_html = result.get("cleaned_html", html)
|
||||
markdown = result.get("markdown", "")
|
||||
|
||||
# Print a profession LOG style message, show time taken and say crawling is done
|
||||
if verbose:
|
||||
print(
|
||||
f"[LOG] 🚀 Crawling done for {url}, success: {success}, time taken: {time.time() - t} seconds"
|
||||
)
|
||||
|
||||
extracted_content = []
|
||||
if verbose:
|
||||
print(f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}")
|
||||
t = time.time()
|
||||
# Split markdown into sections
|
||||
sections = chunking_strategy.chunk(markdown)
|
||||
# sections = merge_chunks_based_on_token_threshold(sections, CHUNK_TOKEN_THRESHOLD)
|
||||
|
||||
extracted_content = extraction_strategy.run(
|
||||
url, sections,
|
||||
)
|
||||
extracted_content = json.dumps(extracted_content)
|
||||
|
||||
if verbose:
|
||||
print(
|
||||
f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t} seconds."
|
||||
)
|
||||
|
||||
# Cache the result
|
||||
cleaned_html = beautify_html(cleaned_html)
|
||||
cache_url(
|
||||
url,
|
||||
html,
|
||||
cleaned_html,
|
||||
markdown,
|
||||
extracted_content,
|
||||
success,
|
||||
)
|
||||
|
||||
return CrawlResult(
|
||||
url=url,
|
||||
html=html,
|
||||
cleaned_html=cleaned_html,
|
||||
markdown=markdown,
|
||||
extracted_content=extracted_content,
|
||||
success=success,
|
||||
error_message=error_message,
|
||||
)
|
||||
|
||||
def fetch_pages(
|
||||
self,
|
||||
url_models: List[UrlModel],
|
||||
@@ -182,6 +73,8 @@ class WebCrawler:
|
||||
extract_blocks_flag: bool = True,
|
||||
word_count_threshold=MIN_WORD_THRESHOLD,
|
||||
use_cached_html: bool = False,
|
||||
css_selector: str = None,
|
||||
screenshot: bool = False,
|
||||
extraction_strategy: ExtractionStrategy = None,
|
||||
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
||||
**kwargs,
|
||||
@@ -199,6 +92,8 @@ class WebCrawler:
|
||||
[api_token] * len(url_models),
|
||||
[extract_blocks_flag] * len(url_models),
|
||||
[word_count_threshold] * len(url_models),
|
||||
[css_selector] * len(url_models),
|
||||
[screenshot] * len(url_models),
|
||||
[use_cached_html] * len(url_models),
|
||||
[extraction_strategy] * len(url_models),
|
||||
[chunking_strategy] * len(url_models),
|
||||
@@ -207,3 +102,152 @@ class WebCrawler:
|
||||
)
|
||||
|
||||
return results
|
||||
|
||||
def run(
|
||||
self,
|
||||
url: str,
|
||||
word_count_threshold=MIN_WORD_THRESHOLD,
|
||||
extraction_strategy: ExtractionStrategy = None,
|
||||
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
||||
bypass_cache: bool = False,
|
||||
css_selector: str = None,
|
||||
screenshot: bool = False,
|
||||
user_agent: str = None,
|
||||
verbose=True,
|
||||
**kwargs,
|
||||
) -> CrawlResult:
|
||||
try:
|
||||
extraction_strategy = extraction_strategy or NoExtractionStrategy()
|
||||
extraction_strategy.verbose = verbose
|
||||
if not isinstance(extraction_strategy, ExtractionStrategy):
|
||||
raise ValueError("Unsupported extraction strategy")
|
||||
if not isinstance(chunking_strategy, ChunkingStrategy):
|
||||
raise ValueError("Unsupported chunking strategy")
|
||||
|
||||
word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD)
|
||||
|
||||
cached = None
|
||||
screenshot_data = None
|
||||
extracted_content = None
|
||||
if not bypass_cache and not self.always_by_pass_cache:
|
||||
cached = get_cached_url(url)
|
||||
|
||||
if kwargs.get("warmup", True) and not self.ready:
|
||||
return None
|
||||
|
||||
if cached:
|
||||
html = sanitize_input_encode(cached[1])
|
||||
extracted_content = sanitize_input_encode(cached[4])
|
||||
if screenshot:
|
||||
screenshot_data = cached[9]
|
||||
if not screenshot_data:
|
||||
cached = None
|
||||
|
||||
if not cached or not html:
|
||||
if user_agent:
|
||||
self.crawler_strategy.update_user_agent(user_agent)
|
||||
t1 = time.time()
|
||||
html = sanitize_input_encode(self.crawler_strategy.crawl(url, **kwargs))
|
||||
t2 = time.time()
|
||||
if verbose:
|
||||
print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds")
|
||||
if screenshot:
|
||||
screenshot_data = self.crawler_strategy.take_screenshot()
|
||||
|
||||
|
||||
crawl_result = self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot_data, verbose, bool(cached), **kwargs)
|
||||
crawl_result.success = bool(html)
|
||||
return crawl_result
|
||||
except Exception as e:
|
||||
if not hasattr(e, "msg"):
|
||||
e.msg = str(e)
|
||||
print(f"[ERROR] 🚫 Failed to crawl {url}, error: {e.msg}")
|
||||
return CrawlResult(url=url, html="", success=False, error_message=e.msg)
|
||||
|
||||
def process_html(
|
||||
self,
|
||||
url: str,
|
||||
html: str,
|
||||
extracted_content: str,
|
||||
word_count_threshold: int,
|
||||
extraction_strategy: ExtractionStrategy,
|
||||
chunking_strategy: ChunkingStrategy,
|
||||
css_selector: str,
|
||||
screenshot: bool,
|
||||
verbose: bool,
|
||||
is_cached: bool,
|
||||
**kwargs,
|
||||
) -> CrawlResult:
|
||||
t = time.time()
|
||||
# Extract content from HTML
|
||||
try:
|
||||
t1 = time.time()
|
||||
scrapping_strategy = WebScrapingStrategy()
|
||||
extra_params = {k: v for k, v in kwargs.items() if k not in ["only_text", "image_description_min_word_threshold"]}
|
||||
result = scrapping_strategy.scrap(
|
||||
url,
|
||||
html,
|
||||
word_count_threshold=word_count_threshold,
|
||||
css_selector=css_selector,
|
||||
only_text=kwargs.get("only_text", False),
|
||||
image_description_min_word_threshold=kwargs.get(
|
||||
"image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
|
||||
),
|
||||
**extra_params,
|
||||
)
|
||||
|
||||
# result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False))
|
||||
if verbose:
|
||||
print(f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - t1:.2f} seconds")
|
||||
|
||||
if result is None:
|
||||
raise ValueError(f"Failed to extract content from the website: {url}")
|
||||
except InvalidCSSSelectorError as e:
|
||||
raise ValueError(str(e))
|
||||
|
||||
cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
|
||||
markdown = sanitize_input_encode(result.get("markdown", ""))
|
||||
media = result.get("media", [])
|
||||
links = result.get("links", [])
|
||||
metadata = result.get("metadata", {})
|
||||
|
||||
if extracted_content is None:
|
||||
if verbose:
|
||||
print(f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}")
|
||||
|
||||
sections = chunking_strategy.chunk(markdown)
|
||||
extracted_content = extraction_strategy.run(url, sections)
|
||||
extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False)
|
||||
|
||||
if verbose:
|
||||
print(f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t:.2f} seconds.")
|
||||
|
||||
screenshot = None if not screenshot else screenshot
|
||||
|
||||
if not is_cached:
|
||||
cache_url(
|
||||
url,
|
||||
html,
|
||||
cleaned_html,
|
||||
markdown,
|
||||
extracted_content,
|
||||
True,
|
||||
json.dumps(media),
|
||||
json.dumps(links),
|
||||
json.dumps(metadata),
|
||||
screenshot=screenshot,
|
||||
)
|
||||
|
||||
return CrawlResult(
|
||||
url=url,
|
||||
html=html,
|
||||
cleaned_html=format_html(cleaned_html),
|
||||
markdown=markdown,
|
||||
media=media,
|
||||
links=links,
|
||||
metadata=metadata,
|
||||
screenshot=screenshot,
|
||||
extracted_content=extracted_content,
|
||||
success=True,
|
||||
error_message="",
|
||||
)
|
||||
@@ -1,10 +1,67 @@
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
web:
|
||||
build: .
|
||||
command: uvicorn main:app --host 0.0.0.0 --port 80 --workers $(nproc)
|
||||
# Local build services for different platforms
|
||||
crawl4ai-amd64:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: Dockerfile
|
||||
args:
|
||||
PYTHON_VERSION: "3.10"
|
||||
INSTALL_TYPE: ${INSTALL_TYPE:-basic}
|
||||
ENABLE_GPU: false
|
||||
platforms:
|
||||
- linux/amd64
|
||||
profiles: ["local-amd64"]
|
||||
extends: &base-config
|
||||
file: docker-compose.yml
|
||||
service: base-config
|
||||
|
||||
crawl4ai-arm64:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: Dockerfile
|
||||
args:
|
||||
PYTHON_VERSION: "3.10"
|
||||
INSTALL_TYPE: ${INSTALL_TYPE:-basic}
|
||||
ENABLE_GPU: false
|
||||
platforms:
|
||||
- linux/arm64
|
||||
profiles: ["local-arm64"]
|
||||
extends: *base-config
|
||||
|
||||
# Hub services for different platforms and versions
|
||||
crawl4ai-hub-amd64:
|
||||
image: unclecode/crawl4ai:${VERSION:-basic}-amd64
|
||||
profiles: ["hub-amd64"]
|
||||
extends: *base-config
|
||||
|
||||
crawl4ai-hub-arm64:
|
||||
image: unclecode/crawl4ai:${VERSION:-basic}-arm64
|
||||
profiles: ["hub-arm64"]
|
||||
extends: *base-config
|
||||
|
||||
# Base configuration to be extended
|
||||
base-config:
|
||||
ports:
|
||||
- "80:80"
|
||||
- "11235:11235"
|
||||
- "8000:8000"
|
||||
- "9222:9222"
|
||||
- "8080:8080"
|
||||
environment:
|
||||
- PYTHONUNBUFFERED=1
|
||||
- CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-}
|
||||
- OPENAI_API_KEY=${OPENAI_API_KEY:-}
|
||||
- CLAUDE_API_KEY=${CLAUDE_API_KEY:-}
|
||||
volumes:
|
||||
- /dev/shm:/dev/shm
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
memory: 4G
|
||||
reservations:
|
||||
memory: 1G
|
||||
restart: unless-stopped
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:11235/health"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 40s
|
||||
BIN
docs/assets/pitch-dark.png
Normal file
|
After Width: | Height: | Size: 33 KiB |
64
docs/assets/pitch-dark.svg
Normal file
@@ -0,0 +1,64 @@
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 800 500">
|
||||
<!-- Background -->
|
||||
<rect width="800" height="500" fill="#1a1a1a"/>
|
||||
|
||||
<!-- Opportunities Section -->
|
||||
<g transform="translate(50,50)">
|
||||
<!-- Opportunity 1 Box -->
|
||||
<rect x="0" y="0" width="300" height="150" rx="10" fill="#1a2d3d" stroke="#64b5f6" stroke-width="2"/>
|
||||
<text x="150" y="30" text-anchor="middle" font-family="Arial" font-weight="bold" font-size="16" fill="#64b5f6">Data Capitalization Opportunity</text>
|
||||
<text x="150" y="60" text-anchor="middle" font-family="Arial" font-size="12" fill="#e0e0e0">
|
||||
<tspan x="150" dy="0">Transform digital footprints into assets</tspan>
|
||||
<tspan x="150" dy="20">Personal data as capital</tspan>
|
||||
<tspan x="150" dy="20">Enterprise knowledge valuation</tspan>
|
||||
<tspan x="150" dy="20">New form of wealth creation</tspan>
|
||||
</text>
|
||||
|
||||
<!-- Opportunity 2 Box -->
|
||||
<rect x="0" y="200" width="300" height="150" rx="10" fill="#1a2d1a" stroke="#81c784" stroke-width="2"/>
|
||||
<text x="150" y="230" text-anchor="middle" font-family="Arial" font-weight="bold" font-size="16" fill="#81c784">Authentic Data Potential</text>
|
||||
<text x="150" y="260" text-anchor="middle" font-family="Arial" font-size="12" fill="#e0e0e0">
|
||||
<tspan x="150" dy="0">Vast reservoir of real insights</tspan>
|
||||
<tspan x="150" dy="20">Enhanced AI development</tspan>
|
||||
<tspan x="150" dy="20">Diverse human knowledge</tspan>
|
||||
<tspan x="150" dy="20">Willing participation model</tspan>
|
||||
</text>
|
||||
</g>
|
||||
|
||||
<!-- Development Pathway -->
|
||||
<g transform="translate(450,50)">
|
||||
<!-- Step 1 Box -->
|
||||
<rect x="0" y="0" width="300" height="100" rx="10" fill="#2d1a2d" stroke="#ce93d8" stroke-width="2"/>
|
||||
<text x="150" y="35" text-anchor="middle" font-family="Arial" font-weight="bold" font-size="16" fill="#ce93d8">1. Open-Source Foundation</text>
|
||||
<text x="150" y="65" text-anchor="middle" font-family="Arial" font-size="12" fill="#e0e0e0">Data extraction engine & community development</text>
|
||||
|
||||
<!-- Step 2 Box -->
|
||||
<rect x="0" y="125" width="300" height="100" rx="10" fill="#2d1a2d" stroke="#ce93d8" stroke-width="2"/>
|
||||
<text x="150" y="160" text-anchor="middle" font-family="Arial" font-weight="bold" font-size="16" fill="#ce93d8">2. Data Capitalization Platform</text>
|
||||
<text x="150" y="190" text-anchor="middle" font-family="Arial" font-size="12" fill="#e0e0e0">Tools to structure & value digital assets</text>
|
||||
|
||||
<!-- Step 3 Box -->
|
||||
<rect x="0" y="250" width="300" height="100" rx="10" fill="#2d1a2d" stroke="#ce93d8" stroke-width="2"/>
|
||||
<text x="150" y="285" text-anchor="middle" font-family="Arial" font-weight="bold" font-size="16" fill="#ce93d8">3. Shared Data Marketplace</text>
|
||||
<text x="150" y="315" text-anchor="middle" font-family="Arial" font-size="12" fill="#e0e0e0">Economic platform for data exchange</text>
|
||||
</g>
|
||||
|
||||
<!-- Connecting Arrows -->
|
||||
<g transform="translate(400,125)">
|
||||
<path d="M-20,0 L40,0" stroke="#666" stroke-width="2" marker-end="url(#arrowhead)"/>
|
||||
<path d="M-20,200 L40,200" stroke="#666" stroke-width="2" marker-end="url(#arrowhead)"/>
|
||||
</g>
|
||||
|
||||
<!-- Arrow Marker -->
|
||||
<defs>
|
||||
<marker id="arrowhead" markerWidth="10" markerHeight="7" refX="9" refY="3.5" orient="auto">
|
||||
<polygon points="0 0, 10 3.5, 0 7" fill="#666"/>
|
||||
</marker>
|
||||
</defs>
|
||||
|
||||
<!-- Vision Box at Bottom -->
|
||||
<g transform="translate(200,420)">
|
||||
<rect x="0" y="0" width="400" height="60" rx="10" fill="#2d2613" stroke="#ffd54f" stroke-width="2"/>
|
||||
<text x="200" y="35" text-anchor="middle" font-family="Arial" font-weight="bold" font-size="16" fill="#ffd54f">Economic Vision: Shared Data Economy</text>
|
||||
</g>
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 3.8 KiB |
@@ -1,12 +0,0 @@
|
||||
{
|
||||
"RegexChunking": "### RegexChunking\n\n`RegexChunking` is a text chunking strategy that splits a given text into smaller parts using regular expressions.\nThis is useful for preparing large texts for processing by language models, ensuring they are divided into manageable segments.\n\n#### Constructor Parameters:\n- `patterns` (list, optional): A list of regular expression patterns used to split the text. Default is to split by double newlines (`['\\n\\n']`).\n\n#### Example usage:\n```python\nchunker = RegexChunking(patterns=[r'\\n\\n', r'\\. '])\nchunks = chunker.chunk(\"This is a sample text. It will be split into chunks.\")\n```",
|
||||
|
||||
"NlpSentenceChunking": "### NlpSentenceChunking\n\n`NlpSentenceChunking` uses a natural language processing model to chunk a given text into sentences. This approach leverages SpaCy to accurately split text based on sentence boundaries.\n\n#### Constructor Parameters:\n- None.\n\n#### Example usage:\n```python\nchunker = NlpSentenceChunking()\nchunks = chunker.chunk(\"This is a sample text. It will be split into sentences.\")\n```",
|
||||
|
||||
"TopicSegmentationChunking": "### TopicSegmentationChunking\n\n`TopicSegmentationChunking` uses the TextTiling algorithm to segment a given text into topic-based chunks. This method identifies thematic boundaries in the text.\n\n#### Constructor Parameters:\n- `num_keywords` (int, optional): The number of keywords to extract for each topic segment. Default is `3`.\n\n#### Example usage:\n```python\nchunker = TopicSegmentationChunking(num_keywords=3)\nchunks = chunker.chunk(\"This is a sample text. It will be split into topic-based segments.\")\n```",
|
||||
|
||||
"FixedLengthWordChunking": "### FixedLengthWordChunking\n\n`FixedLengthWordChunking` splits a given text into chunks of fixed length, based on the number of words.\n\n#### Constructor Parameters:\n- `chunk_size` (int, optional): The number of words in each chunk. Default is `100`.\n\n#### Example usage:\n```python\nchunker = FixedLengthWordChunking(chunk_size=100)\nchunks = chunker.chunk(\"This is a sample text. It will be split into fixed-length word chunks.\")\n```",
|
||||
|
||||
"SlidingWindowChunking": "### SlidingWindowChunking\n\n`SlidingWindowChunking` uses a sliding window approach to chunk a given text. Each chunk has a fixed length, and the window slides by a specified step size.\n\n#### Constructor Parameters:\n- `window_size` (int, optional): The number of words in each chunk. Default is `100`.\n- `step` (int, optional): The number of words to slide the window. Default is `50`.\n\n#### Example usage:\n```python\nchunker = SlidingWindowChunking(window_size=100, step=50)\nchunks = chunker.chunk(\"This is a sample text. It will be split using a sliding window approach.\")\n```"
|
||||
}
|
||||
|
||||
189
docs/deprecated/docker-deployment.md
Normal file
@@ -0,0 +1,189 @@
|
||||
# 🐳 Using Docker (Legacy)
|
||||
|
||||
Crawl4AI is available as Docker images for easy deployment. You can either pull directly from Docker Hub (recommended) or build from the repository.
|
||||
|
||||
---
|
||||
|
||||
<details>
|
||||
<summary>🐳 <strong>Option 1: Docker Hub (Recommended)</strong></summary>
|
||||
|
||||
Choose the appropriate image based on your platform and needs:
|
||||
|
||||
### For AMD64 (Regular Linux/Windows):
|
||||
```bash
|
||||
# Basic version (recommended)
|
||||
docker pull unclecode/crawl4ai:basic-amd64
|
||||
docker run -p 11235:11235 unclecode/crawl4ai:basic-amd64
|
||||
|
||||
# Full ML/LLM support
|
||||
docker pull unclecode/crawl4ai:all-amd64
|
||||
docker run -p 11235:11235 unclecode/crawl4ai:all-amd64
|
||||
|
||||
# With GPU support
|
||||
docker pull unclecode/crawl4ai:gpu-amd64
|
||||
docker run -p 11235:11235 unclecode/crawl4ai:gpu-amd64
|
||||
```
|
||||
|
||||
### For ARM64 (M1/M2 Macs, ARM servers):
|
||||
```bash
|
||||
# Basic version (recommended)
|
||||
docker pull unclecode/crawl4ai:basic-arm64
|
||||
docker run -p 11235:11235 unclecode/crawl4ai:basic-arm64
|
||||
|
||||
# Full ML/LLM support
|
||||
docker pull unclecode/crawl4ai:all-arm64
|
||||
docker run -p 11235:11235 unclecode/crawl4ai:all-arm64
|
||||
|
||||
# With GPU support
|
||||
docker pull unclecode/crawl4ai:gpu-arm64
|
||||
docker run -p 11235:11235 unclecode/crawl4ai:gpu-arm64
|
||||
```
|
||||
|
||||
Need more memory? Add `--shm-size`:
|
||||
```bash
|
||||
docker run --shm-size=2gb -p 11235:11235 unclecode/crawl4ai:basic-amd64
|
||||
```
|
||||
|
||||
Test the installation:
|
||||
```bash
|
||||
curl http://localhost:11235/health
|
||||
```
|
||||
|
||||
### For Raspberry Pi (32-bit) (coming soon):
|
||||
```bash
|
||||
# Pull and run basic version (recommended for Raspberry Pi)
|
||||
docker pull unclecode/crawl4ai:basic-armv7
|
||||
docker run -p 11235:11235 unclecode/crawl4ai:basic-armv7
|
||||
|
||||
# With increased shared memory if needed
|
||||
docker run --shm-size=2gb -p 11235:11235 unclecode/crawl4ai:basic-armv7
|
||||
```
|
||||
|
||||
Note: Due to hardware constraints, only the basic version is recommended for Raspberry Pi.
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>🐳 <strong>Option 2: Build from Repository</strong></summary>
|
||||
|
||||
Build the image locally based on your platform:
|
||||
|
||||
```bash
|
||||
# Clone the repository
|
||||
git clone https://github.com/unclecode/crawl4ai.git
|
||||
cd crawl4ai
|
||||
|
||||
# For AMD64 (Regular Linux/Windows)
|
||||
docker build --platform linux/amd64 \
|
||||
--tag crawl4ai:local \
|
||||
--build-arg INSTALL_TYPE=basic \
|
||||
.
|
||||
|
||||
# For ARM64 (M1/M2 Macs, ARM servers)
|
||||
docker build --platform linux/arm64 \
|
||||
--tag crawl4ai:local \
|
||||
--build-arg INSTALL_TYPE=basic \
|
||||
.
|
||||
```
|
||||
|
||||
Build options:
|
||||
- INSTALL_TYPE=basic (default): Basic crawling features
|
||||
- INSTALL_TYPE=all: Full ML/LLM support
|
||||
- ENABLE_GPU=true: Add GPU support
|
||||
|
||||
Example with all options:
|
||||
```bash
|
||||
docker build --platform linux/amd64 \
|
||||
--tag crawl4ai:local \
|
||||
--build-arg INSTALL_TYPE=all \
|
||||
--build-arg ENABLE_GPU=true \
|
||||
.
|
||||
```
|
||||
|
||||
Run your local build:
|
||||
```bash
|
||||
# Regular run
|
||||
docker run -p 11235:11235 crawl4ai:local
|
||||
|
||||
# With increased shared memory
|
||||
docker run --shm-size=2gb -p 11235:11235 crawl4ai:local
|
||||
```
|
||||
|
||||
Test the installation:
|
||||
```bash
|
||||
curl http://localhost:11235/health
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>🐳 <strong>Option 3: Using Docker Compose</strong></summary>
|
||||
|
||||
Docker Compose provides a more structured way to run Crawl4AI, especially when dealing with environment variables and multiple configurations.
|
||||
|
||||
```bash
|
||||
# Clone the repository
|
||||
git clone https://github.com/unclecode/crawl4ai.git
|
||||
cd crawl4ai
|
||||
```
|
||||
|
||||
### For AMD64 (Regular Linux/Windows):
|
||||
```bash
|
||||
# Build and run locally
|
||||
docker-compose --profile local-amd64 up
|
||||
|
||||
# Run from Docker Hub
|
||||
VERSION=basic docker-compose --profile hub-amd64 up # Basic version
|
||||
VERSION=all docker-compose --profile hub-amd64 up # Full ML/LLM support
|
||||
VERSION=gpu docker-compose --profile hub-amd64 up # GPU support
|
||||
```
|
||||
|
||||
### For ARM64 (M1/M2 Macs, ARM servers):
|
||||
```bash
|
||||
# Build and run locally
|
||||
docker-compose --profile local-arm64 up
|
||||
|
||||
# Run from Docker Hub
|
||||
VERSION=basic docker-compose --profile hub-arm64 up # Basic version
|
||||
VERSION=all docker-compose --profile hub-arm64 up # Full ML/LLM support
|
||||
VERSION=gpu docker-compose --profile hub-arm64 up # GPU support
|
||||
```
|
||||
|
||||
Environment variables (optional):
|
||||
```bash
|
||||
# Create a .env file
|
||||
CRAWL4AI_API_TOKEN=your_token
|
||||
OPENAI_API_KEY=your_openai_key
|
||||
CLAUDE_API_KEY=your_claude_key
|
||||
```
|
||||
|
||||
The compose file includes:
|
||||
- Memory management (4GB limit, 1GB reserved)
|
||||
- Shared memory volume for browser support
|
||||
- Health checks
|
||||
- Auto-restart policy
|
||||
- All necessary port mappings
|
||||
|
||||
Test the installation:
|
||||
```bash
|
||||
curl http://localhost:11235/health
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>🚀 <strong>One-Click Deployment</strong></summary>
|
||||
|
||||
Deploy your own instance of Crawl4AI with one click:
|
||||
|
||||
[](https://www.digitalocean.com/?repo=https://github.com/unclecode/crawl4ai/tree/0.3.74&refcode=a0780f1bdb3d&utm_campaign=Referral_Invite&utm_medium=Referral_Program&utm_source=badge)
|
||||
|
||||
> 💡 **Recommended specs**: 4GB RAM minimum. Select "professional-xs" or higher when deploying for stable operation.
|
||||
|
||||
The deploy will:
|
||||
- Set up a Docker container with Crawl4AI
|
||||
- Configure Playwright and all dependencies
|
||||
- Start the FastAPI server on port `11235`
|
||||
- Set up health checks and auto-deployment
|
||||
|
||||
</details>
|
||||
114
docs/examples/amazon_product_extraction_direct_url.py
Normal file
@@ -0,0 +1,114 @@
|
||||
"""
|
||||
This example demonstrates how to use JSON CSS extraction to scrape product information
|
||||
from Amazon search results. It shows how to extract structured data like product titles,
|
||||
prices, ratings, and other details using CSS selectors.
|
||||
"""
|
||||
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
import json
|
||||
|
||||
async def extract_amazon_products():
|
||||
# Initialize browser config
|
||||
browser_config = BrowserConfig(
|
||||
browser_type="chromium",
|
||||
headless=True
|
||||
)
|
||||
|
||||
# Initialize crawler config with JSON CSS extraction strategy
|
||||
crawler_config = CrawlerRunConfig(
|
||||
extraction_strategy=JsonCssExtractionStrategy(
|
||||
schema={
|
||||
"name": "Amazon Product Search Results",
|
||||
"baseSelector": "[data-component-type='s-search-result']",
|
||||
"fields": [
|
||||
{
|
||||
"name": "asin",
|
||||
"selector": "",
|
||||
"type": "attribute",
|
||||
"attribute": "data-asin"
|
||||
},
|
||||
{
|
||||
"name": "title",
|
||||
"selector": "h2 a span",
|
||||
"type": "text"
|
||||
},
|
||||
{
|
||||
"name": "url",
|
||||
"selector": "h2 a",
|
||||
"type": "attribute",
|
||||
"attribute": "href"
|
||||
},
|
||||
{
|
||||
"name": "image",
|
||||
"selector": ".s-image",
|
||||
"type": "attribute",
|
||||
"attribute": "src"
|
||||
},
|
||||
{
|
||||
"name": "rating",
|
||||
"selector": ".a-icon-star-small .a-icon-alt",
|
||||
"type": "text"
|
||||
},
|
||||
{
|
||||
"name": "reviews_count",
|
||||
"selector": "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span",
|
||||
"type": "text"
|
||||
},
|
||||
{
|
||||
"name": "price",
|
||||
"selector": ".a-price .a-offscreen",
|
||||
"type": "text"
|
||||
},
|
||||
{
|
||||
"name": "original_price",
|
||||
"selector": ".a-price.a-text-price .a-offscreen",
|
||||
"type": "text"
|
||||
},
|
||||
{
|
||||
"name": "sponsored",
|
||||
"selector": ".puis-sponsored-label-text",
|
||||
"type": "exists"
|
||||
},
|
||||
{
|
||||
"name": "delivery_info",
|
||||
"selector": "[data-cy='delivery-recipe'] .a-color-base",
|
||||
"type": "text",
|
||||
"multiple": True
|
||||
}
|
||||
]
|
||||
}
|
||||
)
|
||||
)
|
||||
|
||||
# Example search URL (you should replace with your actual Amazon URL)
|
||||
url = "https://www.amazon.com/s?k=Samsung+Galaxy+Tab"
|
||||
|
||||
# Use context manager for proper resource handling
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
# Extract the data
|
||||
result = await crawler.arun(url=url, config=crawler_config)
|
||||
|
||||
# Process and print the results
|
||||
if result and result.extracted_content:
|
||||
# Parse the JSON string into a list of products
|
||||
products = json.loads(result.extracted_content)
|
||||
|
||||
# Process each product in the list
|
||||
for product in products:
|
||||
print("\nProduct Details:")
|
||||
print(f"ASIN: {product.get('asin')}")
|
||||
print(f"Title: {product.get('title')}")
|
||||
print(f"Price: {product.get('price')}")
|
||||
print(f"Original Price: {product.get('original_price')}")
|
||||
print(f"Rating: {product.get('rating')}")
|
||||
print(f"Reviews: {product.get('reviews_count')}")
|
||||
print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}")
|
||||
if product.get('delivery_info'):
|
||||
print(f"Delivery: {' '.join(product['delivery_info'])}")
|
||||
print("-" * 80)
|
||||
|
||||
if __name__ == "__main__":
|
||||
import asyncio
|
||||
asyncio.run(extract_amazon_products())
|
||||
145
docs/examples/amazon_product_extraction_using_hooks.py
Normal file
@@ -0,0 +1,145 @@
|
||||
"""
|
||||
This example demonstrates how to use JSON CSS extraction to scrape product information
|
||||
from Amazon search results. It shows how to extract structured data like product titles,
|
||||
prices, ratings, and other details using CSS selectors.
|
||||
"""
|
||||
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
import json
|
||||
from playwright.async_api import Page, BrowserContext
|
||||
|
||||
async def extract_amazon_products():
|
||||
# Initialize browser config
|
||||
browser_config = BrowserConfig(
|
||||
# browser_type="chromium",
|
||||
headless=True
|
||||
)
|
||||
|
||||
# Initialize crawler config with JSON CSS extraction strategy nav-search-submit-button
|
||||
crawler_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
|
||||
extraction_strategy=JsonCssExtractionStrategy(
|
||||
schema={
|
||||
"name": "Amazon Product Search Results",
|
||||
"baseSelector": "[data-component-type='s-search-result']",
|
||||
"fields": [
|
||||
{
|
||||
"name": "asin",
|
||||
"selector": "",
|
||||
"type": "attribute",
|
||||
"attribute": "data-asin"
|
||||
},
|
||||
{
|
||||
"name": "title",
|
||||
"selector": "h2 a span",
|
||||
"type": "text"
|
||||
},
|
||||
{
|
||||
"name": "url",
|
||||
"selector": "h2 a",
|
||||
"type": "attribute",
|
||||
"attribute": "href"
|
||||
},
|
||||
{
|
||||
"name": "image",
|
||||
"selector": ".s-image",
|
||||
"type": "attribute",
|
||||
"attribute": "src"
|
||||
},
|
||||
{
|
||||
"name": "rating",
|
||||
"selector": ".a-icon-star-small .a-icon-alt",
|
||||
"type": "text"
|
||||
},
|
||||
{
|
||||
"name": "reviews_count",
|
||||
"selector": "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span",
|
||||
"type": "text"
|
||||
},
|
||||
{
|
||||
"name": "price",
|
||||
"selector": ".a-price .a-offscreen",
|
||||
"type": "text"
|
||||
},
|
||||
{
|
||||
"name": "original_price",
|
||||
"selector": ".a-price.a-text-price .a-offscreen",
|
||||
"type": "text"
|
||||
},
|
||||
{
|
||||
"name": "sponsored",
|
||||
"selector": ".puis-sponsored-label-text",
|
||||
"type": "exists"
|
||||
},
|
||||
{
|
||||
"name": "delivery_info",
|
||||
"selector": "[data-cy='delivery-recipe'] .a-color-base",
|
||||
"type": "text",
|
||||
"multiple": True
|
||||
}
|
||||
]
|
||||
}
|
||||
)
|
||||
)
|
||||
|
||||
url = "https://www.amazon.com/"
|
||||
|
||||
async def after_goto(page: Page, context: BrowserContext, url: str, response: dict, **kwargs):
|
||||
"""Hook called after navigating to each URL"""
|
||||
print(f"[HOOK] after_goto - Successfully loaded: {url}")
|
||||
|
||||
try:
|
||||
# Wait for search box to be available
|
||||
search_box = await page.wait_for_selector('#twotabsearchtextbox', timeout=1000)
|
||||
|
||||
# Type the search query
|
||||
await search_box.fill('Samsung Galaxy Tab')
|
||||
|
||||
# Get the search button and prepare for navigation
|
||||
search_button = await page.wait_for_selector('#nav-search-submit-button', timeout=1000)
|
||||
|
||||
# Click with navigation waiting
|
||||
await search_button.click()
|
||||
|
||||
# Wait for search results to load
|
||||
await page.wait_for_selector('[data-component-type="s-search-result"]', timeout=10000)
|
||||
print("[HOOK] Search completed and results loaded!")
|
||||
|
||||
except Exception as e:
|
||||
print(f"[HOOK] Error during search operation: {str(e)}")
|
||||
|
||||
return page
|
||||
|
||||
# Use context manager for proper resource handling
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
|
||||
crawler.crawler_strategy.set_hook("after_goto", after_goto)
|
||||
|
||||
# Extract the data
|
||||
result = await crawler.arun(url=url, config=crawler_config)
|
||||
|
||||
# Process and print the results
|
||||
if result and result.extracted_content:
|
||||
# Parse the JSON string into a list of products
|
||||
products = json.loads(result.extracted_content)
|
||||
|
||||
# Process each product in the list
|
||||
for product in products:
|
||||
print("\nProduct Details:")
|
||||
print(f"ASIN: {product.get('asin')}")
|
||||
print(f"Title: {product.get('title')}")
|
||||
print(f"Price: {product.get('price')}")
|
||||
print(f"Original Price: {product.get('original_price')}")
|
||||
print(f"Rating: {product.get('rating')}")
|
||||
print(f"Reviews: {product.get('reviews_count')}")
|
||||
print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}")
|
||||
if product.get('delivery_info'):
|
||||
print(f"Delivery: {' '.join(product['delivery_info'])}")
|
||||
print("-" * 80)
|
||||
|
||||
if __name__ == "__main__":
|
||||
import asyncio
|
||||
asyncio.run(extract_amazon_products())
|
||||
129
docs/examples/amazon_product_extraction_using_use_javascript.py
Normal file
@@ -0,0 +1,129 @@
|
||||
"""
|
||||
This example demonstrates how to use JSON CSS extraction to scrape product information
|
||||
from Amazon search results. It shows how to extract structured data like product titles,
|
||||
prices, ratings, and other details using CSS selectors.
|
||||
"""
|
||||
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
import json
|
||||
from playwright.async_api import Page, BrowserContext
|
||||
|
||||
async def extract_amazon_products():
    """Search Amazon through injected JavaScript and print every parsed result card.

    The crawl starts at the Amazon home page. A JavaScript snippet fills the
    search box and clicks the submit button, ``wait_for`` is set to the
    search-result selector, and a JSON/CSS schema is applied to the page.
    Each extracted product is printed to stdout.
    """
    # Headless browser; the browser type is left at the library default.
    browser_config = BrowserConfig(headless=True)

    # Entry page: the search itself is triggered by the injected script.
    url = "https://www.amazon.com/"

    # Async flavour: the two DOM operations wrapped in an awaited arrow function.
    js_code_to_search = """
        const task = async () => {
            document.querySelector('#twotabsearchtextbox').value = 'Samsung Galaxy Tab';
            document.querySelector('#nav-search-submit-button').click();
        }
        await task();
    """
    # Plain flavour of the same two statements. Shown for reference only; it is
    # not passed to the run configuration below.
    js_code_to_search_sync = """
        document.querySelector('#twotabsearchtextbox').value = 'Samsung Galaxy Tab';
        document.querySelector('#nav-search-submit-button').click();
    """

    # One entry per value pulled out of each search-result element.
    product_fields = [
        {"name": "asin", "selector": "", "type": "attribute", "attribute": "data-asin"},
        {"name": "title", "selector": "h2 a span", "type": "text"},
        {"name": "url", "selector": "h2 a", "type": "attribute", "attribute": "href"},
        {"name": "image", "selector": ".s-image", "type": "attribute", "attribute": "src"},
        {"name": "rating", "selector": ".a-icon-star-small .a-icon-alt", "type": "text"},
        {
            "name": "reviews_count",
            "selector": "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span",
            "type": "text",
        },
        {"name": "price", "selector": ".a-price .a-offscreen", "type": "text"},
        {
            "name": "original_price",
            "selector": ".a-price.a-text-price .a-offscreen",
            "type": "text",
        },
        {"name": "sponsored", "selector": ".puis-sponsored-label-text", "type": "exists"},
        {
            "name": "delivery_info",
            "selector": "[data-cy='delivery-recipe'] .a-color-base",
            "type": "text",
            "multiple": True,
        },
    ]
    product_schema = {
        "name": "Amazon Product Search Results",
        "baseSelector": "[data-component-type='s-search-result']",
        "fields": product_fields,
    }

    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        js_code=js_code_to_search,
        wait_for='css:[data-component-type="s-search-result"]',
        extraction_strategy=JsonCssExtractionStrategy(schema=product_schema),
    )

    # Label/key pairs for the fields that are printed verbatim.
    simple_fields = (
        ("ASIN", "asin"),
        ("Title", "title"),
        ("Price", "price"),
        ("Original Price", "original_price"),
        ("Rating", "rating"),
        ("Reviews", "reviews_count"),
    )

    # The context manager takes care of starting and closing the browser.
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=url, config=crawler_config)

        if result and result.extracted_content:
            # extracted_content is a JSON string holding a list of products.
            products = json.loads(result.extracted_content)

            for product in products:
                print("\nProduct Details:")
                for label, key in simple_fields:
                    print(f"{label}: {product.get(key)}")
                print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}")
                delivery_parts = product.get('delivery_info')
                if delivery_parts:
                    print(f"Delivery: {' '.join(product['delivery_info'])}")
                print("-" * 80)


if __name__ == "__main__":
    import asyncio

    asyncio.run(extract_amazon_products())
|
||||
BIN
docs/examples/assets/audio.mp3
Normal file
BIN
docs/examples/assets/basic.png
Normal file
|
After Width: | Height: | Size: 372 KiB |
BIN
docs/examples/assets/cosine_extraction.png
Normal file
|
After Width: | Height: | Size: 403 KiB |
BIN
docs/examples/assets/css_js.png
Normal file
|
After Width: | Height: | Size: 537 KiB |
BIN
docs/examples/assets/css_selector.png
Normal file
|
After Width: | Height: | Size: 375 KiB |
BIN
docs/examples/assets/exec_script.png
Normal file
|
After Width: | Height: | Size: 469 KiB |
BIN
docs/examples/assets/llm_extraction.png
Normal file
|
After Width: | Height: | Size: 477 KiB |
BIN
docs/examples/assets/semantic_extraction_cosine.png
Normal file
|
After Width: | Height: | Size: 419 KiB |
BIN
docs/examples/assets/semantic_extraction_llm.png
Normal file
|
After Width: | Height: | Size: 485 KiB |
48
docs/examples/async_webcrawler_multiple_urls_example.py
Normal file
@@ -0,0 +1,48 @@
|
||||
# File: async_webcrawler_multiple_urls_example.py
|
||||
import os, sys
|
||||
# append 2 parent directories to sys.path to import crawl4ai
|
||||
parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
sys.path.append(parent_dir)
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
|
||||
async def main():
    """Crawl a fixed list of sites with ``arun_many`` and print a summary per URL."""
    # Sites to fetch in a single arun_many call.
    urls = [
        "https://example.com",
        "https://python.org",
        "https://github.com",
        "https://stackoverflow.com",
        "https://news.ycombinator.com"
    ]

    # Crawling parameter forwarded to the crawler.
    word_count_threshold = 100

    async with AsyncWebCrawler(verbose=True) as crawler:
        results = await crawler.arun_many(
            urls=urls,
            word_count_threshold=word_count_threshold,
            bypass_cache=True,
            verbose=True
        )

        for result in results:
            # Failures first, so the success path reads straight through.
            if not result.success:
                print(f"Failed to crawl: {result.url}")
                print(f"Error: {result.error_message}")
                print("---")
                continue

            print(f"Successfully crawled: {result.url}")
            print(f"Title: {result.metadata.get('title', 'N/A')}")
            word_total = len(result.markdown.split())
            print(f"Word count: {word_total}")
            link_total = len(result.links.get('internal', [])) + len(result.links.get('external', []))
            print(f"Number of links: {link_total}")
            image_total = len(result.media.get('images', []))
            print(f"Number of images: {image_total}")
            print("---")


if __name__ == "__main__":
    asyncio.run(main())
|
||||
128
docs/examples/browser_optimization_example.py
Normal file
@@ -0,0 +1,128 @@
|
||||
"""
|
||||
This example demonstrates optimal browser usage patterns in Crawl4AI:
|
||||
1. Sequential crawling with session reuse
|
||||
2. Parallel crawling with browser instance reuse
|
||||
3. Performance optimization settings
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
from typing import List
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
|
||||
|
||||
async def crawl_sequential(urls: List[str]):
    """
    Crawl ``urls`` one after another, passing the same session id to every call.

    A single ``AsyncWebCrawler`` is started once and closed in a ``finally``
    block. For each URL the outcome is printed: the raw markdown length on
    success, or the crawler's error message on failure.

    Args:
        urls: Pages to crawl, in order.
    """
    print("\n=== Sequential Crawling with Session Reuse ===")

    # Configure browser with optimized settings
    browser_config = BrowserConfig(
        headless=True,
        browser_args=[
            "--disable-gpu",  # Disable GPU acceleration
            "--disable-dev-shm-usage",  # Disable /dev/shm usage
            "--no-sandbox",  # Required for Docker
        ],
        viewport={
            "width": 800,
            "height": 600,
        },  # Smaller viewport for better performance
    )

    # Configure crawl settings
    crawl_config = CrawlerRunConfig(
        markdown_generator=DefaultMarkdownGenerator(
            # content_filter=PruningContentFilter(), In case you need fit_markdown
        ),
    )

    # Create single crawler instance
    crawler = AsyncWebCrawler(config=browser_config)
    await crawler.start()

    try:
        session_id = "session1"  # Use same session for all URLs
        for url in urls:
            result = await crawler.arun(
                url=url,
                config=crawl_config,
                session_id=session_id,  # Reuse same browser tab
            )
            if result.success:
                print(f"Successfully crawled {url}")
                print(f"Content length: {len(result.markdown_v2.raw_markdown)}")
            else:
                # Report the failure instead of skipping the URL silently.
                print(f"Failed to crawl {url}: {result.error_message}")
    finally:
        # Always release the browser, even if a crawl raised.
        await crawler.close()
|
||||
|
||||
|
||||
async def crawl_parallel(urls: List[str], max_concurrent: int = 3):
    """
    Crawl ``urls`` in batches of ``max_concurrent`` using one shared crawler.

    Each batch is awaited with ``asyncio.gather(..., return_exceptions=True)``,
    so one failing task does not cancel the rest of its batch. Every URL gets a
    printed outcome: success, a crawl reported as unsuccessful, or an exception.

    Args:
        urls: Pages to crawl.
        max_concurrent: Number of crawls launched together in one batch.

    Raises:
        ValueError: If ``max_concurrent`` is smaller than 1.
    """
    # A zero step makes range() fail and a negative step silently crawls
    # nothing, so reject both up front, before a browser is started.
    if max_concurrent < 1:
        raise ValueError("max_concurrent must be at least 1")

    print("\n=== Parallel Crawling with Browser Reuse ===")

    browser_config = BrowserConfig(
        headless=True,
        browser_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
        viewport={"width": 800, "height": 600},
    )

    crawl_config = CrawlerRunConfig(
        markdown_generator=DefaultMarkdownGenerator(
            # content_filter=PruningContentFilter(), In case you need fit_markdown
        ),
    )

    # Create single crawler instance for all parallel tasks
    crawler = AsyncWebCrawler(config=browser_config)
    await crawler.start()

    try:
        # Create tasks in batches to control concurrency
        for i in range(0, len(urls), max_concurrent):
            batch = urls[i : i + max_concurrent]
            tasks = []

            for j, url in enumerate(batch):
                session_id = (
                    f"parallel_session_{j}"  # Different session per concurrent task
                )
                task = crawler.arun(url=url, config=crawl_config, session_id=session_id)
                tasks.append(task)

            # Wait for batch to complete
            results = await asyncio.gather(*tasks, return_exceptions=True)

            # Process results
            for url, result in zip(batch, results):
                if isinstance(result, Exception):
                    print(f"Error crawling {url}: {str(result)}")
                elif result.success:
                    print(f"Successfully crawled {url}")
                    print(f"Content length: {len(result.markdown_v2.raw_markdown)}")
                else:
                    # Crawl finished without raising but was not successful.
                    print(f"Failed to crawl {url}: {result.error_message}")
    finally:
        # Always release the browser, even if a batch raised.
        await crawler.close()
|
||||
|
||||
|
||||
async def main():
    """Run the sequential demo and then the parallel demo on four sample URLs."""
    # https://example.com/page1 ... https://example.com/page4
    urls = [f"https://example.com/page{page_number}" for page_number in range(1, 5)]

    # Sequential first, then parallel in batches of two.
    await crawl_sequential(urls)
    await crawl_parallel(urls, max_concurrent=2)


if __name__ == "__main__":
    asyncio.run(main())
|
||||
3
docs/examples/chainlit.md
Normal file
@@ -0,0 +1,3 @@
|
||||
# Welcome to Crawl4AI! 🚀🤖
|
||||
|
||||
Hi there, Developer! 👋 Here is an example of a research pipeline: share a URL in your conversation with any LLM, and the content of the crawled pages will be supplied to the model as context.
|
||||
67
docs/examples/crawlai_vs_firecrawl.py
Normal file
@@ -0,0 +1,67 @@
|
||||
import os, time
|
||||
# append the path to the root of the project
|
||||
import sys
|
||||
import asyncio
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
|
||||
from firecrawl import FirecrawlApp
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
__data__ = os.path.join(os.path.dirname(__file__), '..', '..') + '/.data'
|
||||
|
||||
async def compare():
    """Scrape the same page with Firecrawl and Crawl4AI and print comparable stats.

    Three runs are timed: Firecrawl, Crawl4AI without JavaScript, and Crawl4AI
    with a "Load More" click. For each run the elapsed time, the markdown
    length and the number of "cldnry.s-nbcnews.com" occurrences are printed,
    and the markdown is saved under the module-level ``__data__`` directory.

    Raises:
        KeyError: If the FIRECRAWL_API_KEY environment variable is not set.
    """
    app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY'])

    # The reports are written into __data__; create it so open() cannot fail
    # with FileNotFoundError on a fresh checkout.
    os.makedirs(__data__, exist_ok=True)

    # Test Firecrawl with a simple crawl
    start = time.time()
    scrape_status = app.scrape_url(
        'https://www.nbcnews.com/business',
        params={'formats': ['markdown', 'html']}
    )
    end = time.time()
    print(f"Time taken: {end - start} seconds")
    print(len(scrape_status['markdown']))
    # save the markdown content with provider name
    # (explicit UTF-8: scraped pages routinely contain non-ASCII characters)
    with open(f"{__data__}/firecrawl_simple.md", "w", encoding="utf-8") as f:
        f.write(scrape_status['markdown'])
    # Count how many "cldnry.s-nbcnews.com" are in the markdown
    print(scrape_status['markdown'].count("cldnry.s-nbcnews.com"))

    async with AsyncWebCrawler() as crawler:
        start = time.time()
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            # js_code=["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"],
            word_count_threshold=0,
            bypass_cache=True,
            verbose=False
        )
        end = time.time()
        print(f"Time taken: {end - start} seconds")
        print(len(result.markdown))
        # save the markdown content with provider name
        with open(f"{__data__}/crawl4ai_simple.md", "w", encoding="utf-8") as f:
            f.write(result.markdown)
        # count how many "cldnry.s-nbcnews.com" are in the markdown
        print(result.markdown.count("cldnry.s-nbcnews.com"))

        start = time.time()
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            js_code=["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"],
            word_count_threshold=0,
            bypass_cache=True,
            verbose=False
        )
        end = time.time()
        print(f"Time taken: {end - start} seconds")
        print(len(result.markdown))
        # save the markdown content with provider name
        with open(f"{__data__}/crawl4ai_js.md", "w", encoding="utf-8") as f:
            f.write(result.markdown)
        # count how many "cldnry.s-nbcnews.com" are in the markdown
        print(result.markdown.count("cldnry.s-nbcnews.com"))


if __name__ == "__main__":
    asyncio.run(compare())
|
||||
|
||||
357
docs/examples/docker_example.py
Normal file
@@ -0,0 +1,357 @@
|
||||
import requests
|
||||
import json
|
||||
import time
|
||||
import sys
|
||||
import base64
|
||||
import os
|
||||
from typing import Dict, Any
|
||||
|
||||
class Crawl4AiTester:
    """Small HTTP client used to exercise a running Crawl4AI server.

    It wraps three endpoints that all accept the same JSON request body:
    ``/crawl`` (submit a task, then poll ``/task/<id>``), ``/crawl_sync`` and
    ``/crawl_direct``.
    """

    def __init__(self, base_url: str = "http://localhost:11235", api_token: str = None):
        """Store the server address and build the authorization header.

        Args:
            base_url: Root URL of the server, without a trailing slash.
            api_token: Bearer token. When omitted, the CRAWL4AI_API_TOKEN
                environment variable is used, then the placeholder
                "test_api_code".
        """
        self.base_url = base_url
        self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN') or "test_api_code"  # Check environment variable as fallback
        self.headers = {'Authorization': f'Bearer {self.api_token}'} if self.api_token else {}

    def submit_and_wait(self, request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]:
        """Submit a crawl task and poll every two seconds until it finishes.

        Args:
            request_data: JSON body posted to ``/crawl``.
            timeout: Maximum number of seconds to keep polling.

        Returns:
            The task status payload once its "status" is "completed".

        Raises:
            TimeoutError: If the task is not finished within ``timeout`` seconds.
            requests.HTTPError: If the server answers with an error status.
            Exception: If the token is rejected (HTTP 403) or the task fails.
        """
        # Submit crawl job
        response = requests.post(f"{self.base_url}/crawl", json=request_data, headers=self.headers)
        if response.status_code == 403:
            raise Exception("API token is invalid or missing")
        # Surface any other HTTP error as such, instead of a confusing
        # KeyError/JSON error on the next line (matches the sibling methods).
        response.raise_for_status()
        task_id = response.json()["task_id"]
        print(f"Task ID: {task_id}")

        # Poll for result
        start_time = time.time()
        while True:
            if time.time() - start_time > timeout:
                raise TimeoutError(f"Task {task_id} did not complete within {timeout} seconds")

            result = requests.get(f"{self.base_url}/task/{task_id}", headers=self.headers)
            result.raise_for_status()
            status = result.json()

            if status["status"] == "failed":
                print("Task failed:", status.get("error"))
                raise Exception(f"Task failed: {status.get('error')}")

            if status["status"] == "completed":
                return status

            time.sleep(2)

    def submit_sync(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
        """Post to ``/crawl_sync`` and return the decoded JSON response.

        Raises:
            TimeoutError: If the server answers with HTTP 408.
            requests.HTTPError: For any other error status.
        """
        response = requests.post(f"{self.base_url}/crawl_sync", json=request_data, headers=self.headers, timeout=60)
        if response.status_code == 408:
            raise TimeoutError("Task did not complete within server timeout")
        response.raise_for_status()
        return response.json()

    def crawl_direct(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
        """Directly crawl without using task queue"""
        response = requests.post(
            f"{self.base_url}/crawl_direct",
            json=request_data,
            headers=self.headers
        )
        response.raise_for_status()
        return response.json()
|
||||
|
||||
def test_docker_deployment(version="basic"):
    """Wait for the local server's health endpoint, then run the example checks.

    The health endpoint is tried up to five times, five seconds apart; the
    process exits with status 1 if it never answers. The cosine-extraction
    check only runs for the "full" and "transformer" versions.
    """
    # Point base_url (and optionally api_token) elsewhere to test another server.
    tester = Crawl4AiTester(base_url="http://localhost:11235")
    print(f"Testing Crawl4AI Docker {version} version")

    # Health check with timeout and retry
    max_retries = 5
    for attempt in range(1, max_retries + 1):
        try:
            health = requests.get(f"{tester.base_url}/health", timeout=10)
            print("Health check:", health.json())
        except requests.exceptions.RequestException:
            if attempt == max_retries:
                print(f"Failed to connect after {max_retries} attempts")
                sys.exit(1)
            print(f"Waiting for service to start (attempt {attempt}/{max_retries})...")
            time.sleep(5)
        else:
            # Server answered; stop retrying.
            break

    # Basic checks; test_basic_crawl is listed twice, as in the original run order.
    for basic_check in (
        test_basic_crawl_direct,
        test_basic_crawl,
        test_basic_crawl,
        test_basic_crawl_sync,
    ):
        basic_check(tester)

    if version in ["full", "transformer"]:
        test_cosine_extraction(tester)

    for feature_check in (
        test_js_execution,
        test_css_selector,
        test_structured_extraction,
        test_llm_extraction,
        test_llm_with_ollama,
        test_screenshot,
    ):
        feature_check(tester)
|
||||
|
||||
|
||||
def test_basic_crawl(tester: Crawl4AiTester):
    """Submit a plain crawl through the task queue and check markdown came back."""
    print("\n=== Testing Basic Crawl ===")
    payload = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 10,
        "session_id": "test"
    }

    outcome = tester.submit_and_wait(payload)
    crawl = outcome["result"]
    markdown_length = len(crawl["markdown"])
    print(f"Basic crawl result length: {markdown_length}")
    assert crawl["success"]
    assert markdown_length > 0
|
||||
|
||||
def test_basic_crawl_sync(tester: Crawl4AiTester):
    """Run a plain crawl through the synchronous endpoint and check the response."""
    print("\n=== Testing Basic Crawl (Sync) ===")
    payload = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 10,
        "session_id": "test"
    }

    outcome = tester.submit_sync(payload)
    markdown_length = len(outcome['result']['markdown'])
    print(f"Basic crawl result length: {markdown_length}")
    assert outcome['status'] == 'completed'
    assert outcome['result']['success']
    assert markdown_length > 0
|
||||
|
||||
def test_basic_crawl_direct(tester: Crawl4AiTester):
    """Run a plain crawl through the direct endpoint with the cache bypassed."""
    print("\n=== Testing Basic Crawl (Direct) ===")
    payload = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 10,
        # No session id is sent for the direct call.
        # Other cache modes: "enabled", "disabled", "read_only", "write_only"
        "cache_mode": "bypass"
    }

    outcome = tester.crawl_direct(payload)
    markdown_length = len(outcome['result']['markdown'])
    print(f"Basic crawl result length: {markdown_length}")
    assert outcome['result']['success']
    assert markdown_length > 0
|
||||
|
||||
def test_js_execution(tester: Crawl4AiTester):
    """Crawl with a "Load More" click script and a wait_for selector."""
    print("\n=== Testing JS Execution ===")
    # Finds the first button whose text contains "Load More" and clicks it.
    load_more_script = "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
    payload = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 8,
        "js_code": [load_more_script],
        "wait_for": "article.tease-card:nth-child(10)",
        "crawler_params": {"headless": True}
    }

    outcome = tester.submit_and_wait(payload)
    crawl = outcome["result"]
    print(f"JS execution result length: {len(crawl['markdown'])}")
    assert crawl["success"]
|
||||
|
||||
def test_css_selector(tester: Crawl4AiTester):
    """Crawl with a css_selector restricting the page to teaser descriptions."""
    print("\n=== Testing CSS Selector ===")
    payload = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 7,
        "css_selector": ".wide-tease-item__description",
        "crawler_params": {"headless": True},
        "extra": {"word_count_threshold": 10}
    }

    outcome = tester.submit_and_wait(payload)
    crawl = outcome["result"]
    print(f"CSS selector result length: {len(crawl['markdown'])}")
    assert crawl["success"]
|
||||
|
||||
def test_structured_extraction(tester: Crawl4AiTester):
    """Extract rows from the Coinbase explore page with a JSON/CSS schema.

    The success flag is asserted before the extracted content is parsed, and
    the item list is asserted non-empty before its first element is printed,
    so a failed crawl or empty extraction fails on the assertion itself rather
    than on a TypeError or IndexError further down.
    """
    print("\n=== Testing Structured Extraction ===")
    schema = {
        "name": "Coinbase Crypto Prices",
        "baseSelector": ".cds-tableRow-t45thuk",
        "fields": [
            {
                "name": "crypto",
                "selector": "td:nth-child(1) h2",
                "type": "text",
            },
            {
                "name": "symbol",
                "selector": "td:nth-child(1) p",
                "type": "text",
            },
            {
                "name": "price",
                "selector": "td:nth-child(2)",
                "type": "text",
            }
        ],
    }

    request = {
        "urls": "https://www.coinbase.com/explore",
        "priority": 9,
        "extraction_config": {
            "type": "json_css",
            "params": {
                "schema": schema
            }
        }
    }

    result = tester.submit_and_wait(request)
    # Check the crawl itself before touching its payload.
    assert result["result"]["success"]
    extracted = json.loads(result["result"]["extracted_content"])
    print(f"Extracted {len(extracted)} items")
    # Guard the extracted[0] access below.
    assert len(extracted) > 0
    print("Sample item:", json.dumps(extracted[0], indent=2))
|
||||
|
||||
def test_llm_extraction(tester: Crawl4AiTester):
    """Ask an OpenAI model to extract model pricing from the OpenAI pricing page.

    Best effort: any exception (including a failed assertion) is caught and
    printed, since the check cannot succeed without an OPENAI_API_KEY.
    """
    print("\n=== Testing LLM Extraction ===")

    def _string_property(description):
        # JSON-schema fragment for a described string field.
        return {"type": "string", "description": description}

    schema = {
        "type": "object",
        "properties": {
            "model_name": _string_property("Name of the OpenAI model."),
            "input_fee": _string_property("Fee for input token for the OpenAI model."),
            "output_fee": _string_property("Fee for output token for the OpenAI model."),
        },
        "required": ["model_name", "input_fee", "output_fee"]
    }

    llm_params = {
        "provider": "openai/gpt-4o-mini",
        "api_token": os.getenv("OPENAI_API_KEY"),
        "schema": schema,
        "extraction_type": "schema",
        "instruction": """From the crawled content, extract all mentioned model names along with their fees for input and output tokens."""
    }
    payload = {
        "urls": "https://openai.com/api/pricing",
        "priority": 8,
        "extraction_config": {"type": "llm", "params": llm_params},
        "crawler_params": {"word_count_threshold": 1}
    }

    try:
        outcome = tester.submit_and_wait(payload)
        entries = json.loads(outcome["result"]["extracted_content"])
        print(f"Extracted {len(entries)} model pricing entries")
        print("Sample entry:", json.dumps(entries[0], indent=2))
        assert outcome["result"]["success"]
    except Exception as e:
        print(f"LLM extraction test failed (might be due to missing API key): {str(e)}")
|
||||
|
||||
def test_llm_with_ollama(tester: Crawl4AiTester):
    """Ask the "ollama/llama2" provider to summarise the NBC business page.

    Best effort: any exception (including a failed assertion) is caught and
    printed instead of propagating.
    """
    print("\n=== Testing LLM with Ollama ===")
    article_properties = {
        "article_title": {
            "type": "string",
            "description": "The main title of the news article"
        },
        "summary": {
            "type": "string",
            "description": "A brief summary of the article content"
        },
        "main_topics": {
            "type": "array",
            "items": {"type": "string"},
            "description": "Main topics or themes discussed in the article"
        }
    }
    schema = {"type": "object", "properties": article_properties}

    llm_params = {
        "provider": "ollama/llama2",
        "schema": schema,
        "extraction_type": "schema",
        "instruction": "Extract the main article information including title, summary, and main topics."
    }
    payload = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 8,
        "extraction_config": {"type": "llm", "params": llm_params},
        "extra": {"word_count_threshold": 1},
        "crawler_params": {"verbose": True}
    }

    try:
        outcome = tester.submit_and_wait(payload)
        article = json.loads(outcome["result"]["extracted_content"])
        print("Extracted content:", json.dumps(article, indent=2))
        assert outcome["result"]["success"]
    except Exception as e:
        print(f"Ollama extraction test failed: {str(e)}")
|
||||
|
||||
def test_cosine_extraction(tester: Crawl4AiTester):
    """Run the "cosine" extraction type with a business/finance semantic filter.

    Best effort: any exception (including a failed assertion) is caught and
    printed instead of propagating.
    """
    print("\n=== Testing Cosine Extraction ===")
    cosine_params = {
        "semantic_filter": "business finance economy",
        "word_count_threshold": 10,
        "max_dist": 0.2,
        "top_k": 3
    }
    payload = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 8,
        "extraction_config": {"type": "cosine", "params": cosine_params}
    }

    try:
        outcome = tester.submit_and_wait(payload)
        clusters = json.loads(outcome["result"]["extracted_content"])
        print(f"Extracted {len(clusters)} text clusters")
        print("First cluster tags:", clusters[0]["tags"])
        assert outcome["result"]["success"]
    except Exception as e:
        print(f"Cosine extraction test failed: {str(e)}")
|
||||
|
||||
def test_screenshot(tester: Crawl4AiTester):
    """Request a screenshot with the crawl and save it as test_screenshot.jpg."""
    print("\n=== Testing Screenshot ===")
    payload = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 5,
        "screenshot": True,
        "crawler_params": {"headless": True}
    }

    outcome = tester.submit_and_wait(payload)
    encoded_image = outcome["result"]["screenshot"]
    print("Screenshot captured:", bool(encoded_image))

    if encoded_image:
        # The screenshot arrives base64-encoded; decode before opening the file
        # so a decoding error leaves no empty file behind.
        image_bytes = base64.b64decode(outcome["result"]["screenshot"])
        with open("test_screenshot.jpg", "wb") as image_file:
            image_file.write(image_bytes)
        print("Screenshot saved as test_screenshot.jpg")

    assert outcome["result"]["success"]
|
||||
|
||||
if __name__ == "__main__":
    # The first command-line argument selects the image flavour, e.g. "full";
    # without one, the "basic" checks run.
    cli_args = sys.argv[1:]
    version = cli_args[0] if cli_args else "basic"
    test_docker_deployment(version)
|
||||
115
docs/examples/extraction_strategies_example.py
Normal file
@@ -0,0 +1,115 @@
|
||||
"""
|
||||
Example demonstrating different extraction strategies with various input formats.
|
||||
This example shows how to:
|
||||
1. Use different input formats (markdown, HTML, fit_markdown)
|
||||
2. Work with JSON-based extractors (CSS and XPath)
|
||||
3. Use LLM-based extraction with different input formats
|
||||
4. Configure browser and crawler settings properly
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
from typing import Dict, Any
|
||||
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
from crawl4ai.extraction_strategy import (
|
||||
LLMExtractionStrategy,
|
||||
JsonCssExtractionStrategy,
|
||||
JsonXPathExtractionStrategy
|
||||
)
|
||||
from crawl4ai.chunking_strategy import RegexChunking, IdentityChunking
|
||||
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
|
||||
async def run_extraction(crawler: AsyncWebCrawler, url: str, strategy, name: str):
    """Crawl ``url`` with one extraction strategy and print a short report.

    Args:
        crawler: An already started crawler.
        url: Page to crawl.
        strategy: Extraction strategy placed into the run configuration.
        name: Label used in the printed report and in error messages.

    Any exception is caught and printed, so one failing strategy does not
    stop the caller from trying the next one.
    """
    try:
        # A content filter is attached so fit_markdown is available.
        markdown_generator = DefaultMarkdownGenerator(
            content_filter=PruningContentFilter()
        )
        config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            extraction_strategy=strategy,
            markdown_generator=markdown_generator,
        )

        result = await crawler.arun(url=url, config=config)

        if not result.success:
            print(f"Error in {name}: Crawl failed")
            return

        print(f"\n=== {name} Results ===")
        print(f"Extracted Content: {result.extracted_content}")
        raw_length = len(result.markdown_v2.raw_markdown)
        print(f"Raw Markdown Length: {raw_length}")
        citations_length = len(result.markdown_v2.markdown_with_citations)
        print(f"Citations Markdown Length: {citations_length}")

    except Exception as e:
        print(f"Error in {name}: {str(e)}")
|
||||
|
||||
async def main():
    """Build five extraction strategies and run each against one example URL.

    Three LLM strategies differ only in input format and instruction; the
    other two are schema-driven CSS and XPath extractors.
    """
    # Example URL (replace with actual URL)
    url = "https://example.com/product-page"

    browser_config = BrowserConfig(headless=True, verbose=True)

    # Settings shared by all three LLM strategies.
    llm_common = {
        "provider": "openai/gpt-4o-mini",
        "api_token": os.getenv("OPENAI_API_KEY"),
    }

    # 1. LLM extraction: no explicit input_format, then HTML, then fit_markdown.
    markdown_strategy = LLMExtractionStrategy(
        instruction="Extract product information including name, price, and description",
        **llm_common,
    )
    html_strategy = LLMExtractionStrategy(
        input_format="html",
        instruction="Extract product information from HTML including structured data",
        **llm_common,
    )
    fit_markdown_strategy = LLMExtractionStrategy(
        input_format="fit_markdown",
        instruction="Extract product information from cleaned markdown",
        **llm_common,
    )

    # 2. JSON CSS extraction: every field is a text node under ".product".
    css_field_selectors = (
        ("title", "h1.product-title"),
        ("price", ".price"),
        ("description", ".description"),
    )
    css_schema = {
        "baseSelector": ".product",
        "fields": [
            {"name": field_name, "selector": selector, "type": "text"}
            for field_name, selector in css_field_selectors
        ]
    }
    css_strategy = JsonCssExtractionStrategy(schema=css_schema)

    # 3. JSON XPath extraction: same three fields expressed as XPath.
    xpath_schema = {
        "baseSelector": "//div[@class='product']",
        "fields": [
            {"name": "title", "selector": ".//h1[@class='product-title']/text()", "type": "text"},
            {"name": "price", "selector": ".//span[@class='price']/text()", "type": "text"},
            {"name": "description", "selector": ".//div[@class='description']/text()", "type": "text"}
        ]
    }
    xpath_strategy = JsonXPathExtractionStrategy(schema=xpath_schema)

    # Strategies in run order, paired with the label used in the report.
    labelled_strategies = [
        (markdown_strategy, "Markdown LLM"),
        (html_strategy, "HTML LLM"),
        (fit_markdown_strategy, "Fit Markdown LLM"),
        (css_strategy, "CSS Extraction"),
        (xpath_strategy, "XPath Extraction"),
    ]

    # The context manager starts and closes the browser.
    async with AsyncWebCrawler(config=browser_config) as crawler:
        for strategy, label in labelled_strategies:
            await run_extraction(crawler, url, strategy, label)


if __name__ == "__main__":
    asyncio.run(main())
|
||||
58
docs/examples/full_page_screenshot_and_pdf_export.md
Normal file
@@ -0,0 +1,58 @@
|
||||
# Capturing Full-Page Screenshots and PDFs from Massive Webpages with Crawl4AI
|
||||
|
||||
When dealing with very long web pages, traditional full-page screenshots can be slow or fail entirely. For large pages (like extensive Wikipedia articles), generating a single massive screenshot often leads to delays, memory issues, or rendering inconsistencies.
|
||||
|
||||
**The New Approach:**
|
||||
We’ve introduced a new feature that effortlessly handles even the biggest pages by first exporting them as a PDF, then converting that PDF into a high-quality image. This approach leverages the browser’s built-in PDF rendering, making it both stable and efficient for very long content. You can also save the PDF directly for your own use—no need for multiple passes or complex stitching logic.
|
||||
|
||||
**Key Benefits:**
|
||||
- **Reliability:** The PDF export never times out and works regardless of page length.
|
||||
- **Versatility:** Get both the PDF and a screenshot in one crawl, without reloading or reprocessing.
|
||||
- **Performance:** Skips manual scrolling and stitching images, reducing complexity and runtime.
|
||||
|
||||
**Simple Example:**
|
||||
```python
|
||||
import os, sys
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||
|
||||
# Adjust paths as needed
|
||||
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
sys.path.append(parent_dir)
|
||||
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
|
||||
async def main():
    """Crawl one long page and save its screenshot and PDF next to this script.

    Both artifacts are requested in a single ``arun`` call and, when present
    on the result, are base64-decoded and written to ``screenshot.png`` and
    ``page.pdf`` under ``__location__``.
    """
    # Imported up front: the screenshot branch and the PDF branch both decode
    # base64, and either one may run without the other.
    from base64 import b64decode

    async with AsyncWebCrawler() as crawler:
        # Request both PDF and screenshot
        result = await crawler.arun(
            url='https://en.wikipedia.org/wiki/List_of_common_misconceptions',
            cache_mode=CacheMode.BYPASS,
            pdf=True,
            screenshot=True
        )

        if result.success:
            # Save screenshot
            if result.screenshot:
                with open(os.path.join(__location__, "screenshot.png"), "wb") as f:
                    f.write(b64decode(result.screenshot))

            # Save PDF
            if result.pdf:
                with open(os.path.join(__location__, "page.pdf"), "wb") as f:
                    f.write(b64decode(result.pdf))
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**What Happens Under the Hood:**
|
||||
- Crawl4AI navigates to the target page.
|
||||
- If `pdf=True`, it exports the current page as a full PDF, capturing all of its content no matter the length.
|
||||
- If `screenshot=True`, and a PDF is already available, it directly converts the first page of that PDF to an image for you—no repeated loading or scrolling.
|
||||
- Finally, you get your PDF and/or screenshot ready to use.
|
||||
|
||||
**Conclusion:**
|
||||
With this feature, Crawl4AI becomes even more robust and versatile for large-scale content extraction. Whether you need a PDF snapshot or a quick screenshot, you now have a reliable solution for even the most extensive webpages.
|
||||
25
docs/examples/hello_world.py
Normal file
@@ -0,0 +1,25 @@
|
||||
import os, sys
|
||||
|
||||
sys.path.append(
|
||||
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
)
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import *
|
||||
|
||||
async def main():
    """Crawl the NBC News business page and print the start of its raw markdown."""
    async with AsyncWebCrawler() as crawler:
        # Markdown generation goes through a pruning content filter; the
        # cache is bypassed for this run.
        pruning_filter = PruningContentFilter(
            threshold=0.48,
            threshold_type="fixed",
            min_word_threshold=0,
        )
        run_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(content_filter=pruning_filter),
        )

        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            config=run_config,
        )

        # Only the first 500 characters are shown.
        preview = result.markdown_v2.raw_markdown[:500]
        print(preview)
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
107
docs/examples/hooks_example.py
Normal file
@@ -0,0 +1,107 @@
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
from playwright.async_api import Page, BrowserContext
|
||||
|
||||
async def main():
    """Register one callback for every crawler hook, then crawl example.com.

    Each hook prints a ``[HOOK]`` line when it fires so the order of the
    crawl lifecycle can be read from the console output.
    """
    print("🔗 Hooks Example: Demonstrating different hook use cases")

    # Configure browser settings
    browser_config = BrowserConfig(
        headless=True
    )

    # Configure crawler settings
    crawler_run_config = CrawlerRunConfig(
        js_code="window.scrollTo(0, document.body.scrollHeight);",
        wait_for="body",
        cache_mode=CacheMode.BYPASS
    )

    # Create crawler instance
    crawler = AsyncWebCrawler(config=browser_config)

    # Define and set hook functions
    async def on_browser_created(browser, context: BrowserContext, **kwargs):
        """Hook called after the browser is created"""
        print("[HOOK] on_browser_created - Browser is ready!")
        return browser

    async def on_page_context_created(page: Page, context: BrowserContext, **kwargs):
        """Hook called after a new page and context are created"""
        print("[HOOK] on_page_context_created - New page created!")
        # Example: Set a cookie that will be sent with requests in this context
        await context.add_cookies([{
            'name': 'session_id',
            'value': 'example_session',
            'domain': '.example.com',
            'path': '/'
        }])
        # Example: Set default viewport size
        await page.set_viewport_size({"width": 1920, "height": 1080})
        return page

    async def on_user_agent_updated(page: Page, context: BrowserContext, user_agent: str, **kwargs):
        """Hook called when the user agent is updated"""
        print(f"[HOOK] on_user_agent_updated - New user agent: {user_agent}")
        return page

    async def on_execution_started(page: Page, context: BrowserContext, **kwargs):
        """Hook called after custom JavaScript execution"""
        print("[HOOK] on_execution_started - Custom JS executed!")
        return page

    async def before_goto(page: Page, context: BrowserContext, url: str, **kwargs):
        """Hook called before navigating to each URL"""
        print(f"[HOOK] before_goto - About to visit: {url}")
        # Example: Add custom headers for the request
        await page.set_extra_http_headers({
            "Custom-Header": "my-value"
        })
        return page

    async def after_goto(page: Page, context: BrowserContext, url: str, response: dict, **kwargs):
        """Hook called after navigating to each URL"""
        print(f"[HOOK] after_goto - Successfully loaded: {url}")
        # Example: Wait for a specific element to be loaded
        try:
            await page.wait_for_selector('.content', timeout=1000)
            print("Content element found!")
        except Exception:
            # Best effort only. `Exception` (not a bare except) so that task
            # cancellation and Ctrl-C still propagate.
            print("Content element not found, continuing anyway")
        return page

    async def before_retrieve_html(page: Page, context: BrowserContext, **kwargs):
        """Hook called before retrieving the HTML content"""
        print("[HOOK] before_retrieve_html - About to get HTML content")
        # Example: Scroll to bottom to trigger lazy loading
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
        return page

    async def before_return_html(page: Page, context: BrowserContext, html:str, **kwargs):
        """Hook called before returning the HTML content"""
        print(f"[HOOK] before_return_html - Got HTML content (length: {len(html)})")
        # Example: You could modify the HTML content here if needed
        return page

    # Set all the hooks
    crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created)
    crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created)
    crawler.crawler_strategy.set_hook("on_user_agent_updated", on_user_agent_updated)
    crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
    crawler.crawler_strategy.set_hook("before_goto", before_goto)
    crawler.crawler_strategy.set_hook("after_goto", after_goto)
    crawler.crawler_strategy.set_hook("before_retrieve_html", before_retrieve_html)
    crawler.crawler_strategy.set_hook("before_return_html", before_return_html)

    await crawler.start()
    try:
        # Example usage: crawl a simple website
        url = 'https://example.com'
        result = await crawler.arun(url, config=crawler_run_config)
        print(f"\nCrawled URL: {result.url}")
        print(f"HTML length: {len(result.html)}")
    finally:
        # Always shut the crawler down, even when the crawl raises.
        await crawler.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
import asyncio
|
||||
asyncio.run(main())
|
||||
45
docs/examples/language_support_example.py
Normal file
@@ -0,0 +1,45 @@
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
async def main():
    """Show four ways of sending an ``Accept-Language`` header with a crawl.

    Every crawler is used as an async context manager so that it is closed
    when its example finishes, even if the crawl raises.
    """

    def preview(result):
        # `extracted_content` may be None; fall back to an empty string
        # rather than failing on the slice.
        return (result.extracted_content or "")[:100]

    # Example 1: Setting language when creating the crawler
    async with AsyncWebCrawler(
        crawler_strategy=AsyncPlaywrightCrawlerStrategy(
            headers={"Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7"}
        )
    ) as crawler1:
        result1 = await crawler1.arun("https://www.example.com")
    print("Example 1 result:", preview(result1))  # Print first 100 characters

    # Example 2: Setting language before crawling
    crawler2 = AsyncWebCrawler()
    # The header is set before the crawler is entered, as in the original flow.
    crawler2.crawler_strategy.headers["Accept-Language"] = "es-ES,es;q=0.9,en-US;q=0.8,en;q=0.7"
    async with crawler2:
        result2 = await crawler2.arun("https://www.example.com")
    print("Example 2 result:", preview(result2))

    # Example 3: Setting language when calling arun method
    async with AsyncWebCrawler() as crawler3:
        result3 = await crawler3.arun(
            "https://www.example.com",
            headers={"Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7"}
        )
    print("Example 3 result:", preview(result3))

    # Example 4: Crawling multiple pages with different languages
    urls = [
        ("https://www.example.com", "fr-FR,fr;q=0.9"),
        ("https://www.example.org", "es-ES,es;q=0.9"),
        ("https://www.example.net", "de-DE,de;q=0.9"),
    ]

    async with AsyncWebCrawler() as crawler4:
        results = await asyncio.gather(*[
            crawler4.arun(url, headers={"Accept-Language": lang})
            for url, lang in urls
        ])

    # gather() returns results in the same order as `urls`.
    for (url, _lang), result in zip(urls, results):
        print(f"Result for {url}:", preview(result))
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
40
docs/examples/llm_extraction_openai_pricing.py
Normal file
@@ -0,0 +1,40 @@
|
||||
from crawl4ai.extraction_strategy import *
|
||||
from crawl4ai.crawler_strategy import *
|
||||
import asyncio
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
url = r'https://openai.com/api/pricing/'
|
||||
|
||||
class OpenAIModelFee(BaseModel):
    # One pricing entry: a model name plus its input and output token fees.
    # Documented with comments rather than a docstring so the generated JSON
    # schema stays the same.
    model_name: str = Field(
        ...,
        description="Name of the OpenAI model.",
    )
    input_fee: str = Field(
        ...,
        description="Fee for input token for the OpenAI model.",
    )
    output_fee: str = Field(
        ...,
        description="Fee for output token for the OpenAI model.",
    )
|
||||
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
|
||||
async def main():
    """Extract model pricing from ``url`` with an LLM and save it to ``.data/data.json``.

    Prints whether the crawl succeeded and how many entries were extracted,
    then writes the extracted JSON text to disk unchanged.
    """
    # Imported here so this function does not depend on names leaking
    # through the module's star imports.
    import json
    import os

    # Use AsyncWebCrawler
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url=url,
            word_count_threshold=1,
            extraction_strategy=LLMExtractionStrategy(
                # Alternative: provider="openai/gpt-4o" with the OPENAI_API_KEY environment variable.
                provider="groq/llama-3.1-70b-versatile",
                api_token=os.getenv('GROQ_API_KEY'),
                schema=OpenAIModelFee.model_json_schema(),
                extraction_type="schema",
                instruction="From the crawled content, extract all mentioned model names along with their "
                "fees for input and output tokens. Make sure not to miss anything in the entire content. "
                'One extracted model JSON format should look like this: '
                '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }'
            ),
        )
        print("Success:", result.success)
        model_fees = json.loads(result.extracted_content)
        print(len(model_fees))

        # Create the output directory first; open() does not create parents.
        os.makedirs(".data", exist_ok=True)
        with open(".data/data.json", "w", encoding="utf-8") as f:
            f.write(result.extracted_content)
|
||||
|
||||
# Run the example only when executed as a script, not when imported.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
664
docs/examples/quickstart.ipynb
Normal file
610
docs/examples/quickstart_async.config.py
Normal file
@@ -0,0 +1,610 @@
|
||||
import os, sys
|
||||
|
||||
sys.path.append(
|
||||
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
)
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
import json
|
||||
import re
|
||||
from typing import Dict, List
|
||||
from bs4 import BeautifulSoup
|
||||
from pydantic import BaseModel, Field
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
|
||||
from crawl4ai.extraction_strategy import (
|
||||
JsonCssExtractionStrategy,
|
||||
LLMExtractionStrategy,
|
||||
)
|
||||
|
||||
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
|
||||
print("Crawl4AI: Advanced Web Crawling and Data Extraction")
|
||||
print("GitHub Repository: https://github.com/unclecode/crawl4ai")
|
||||
print("Twitter: @unclecode")
|
||||
print("Website: https://crawl4ai.com")
|
||||
|
||||
|
||||
# Basic Example - Simple Crawl
|
||||
async def simple_crawl():
    """Crawl the NBC News business page headlessly and print the first 500 markdown characters."""
    print("\n--- Basic Usage ---")

    headless_browser = BrowserConfig(headless=True)
    bypass_cache = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
    target = "https://www.nbcnews.com/business"

    async with AsyncWebCrawler(config=headless_browser) as crawler:
        result = await crawler.arun(url=target, config=bypass_cache)
        excerpt = result.markdown[:500]
        print(excerpt)
|
||||
|
||||
|
||||
async def clean_content():
    """Crawl a Wikipedia article and print the lengths of its raw and "fit" markdown."""
    # Markdown is generated with links ignored and passed through a pruning filter.
    markdown_generator = DefaultMarkdownGenerator(
        content_filter=PruningContentFilter(
            threshold=0.48,
            threshold_type="fixed",
            min_word_threshold=0,
        ),
        options={"ignore_links": True},
    )
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        excluded_tags=["nav", "footer", "aside"],
        remove_overlay_elements=True,
        markdown_generator=markdown_generator,
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://en.wikipedia.org/wiki/Apple",
            config=run_config,
        )
        raw_size = len(result.markdown_v2.raw_markdown)
        fit_size = len(result.markdown_v2.fit_markdown)
        print(f"Full Markdown Length: {raw_size}")
        print(f"Fit Markdown Length: {fit_size}")
|
||||
|
||||
async def link_analysis():
    """Crawl the NBC News business page and report the links found on it.

    Prints the number of internal and external links, then the href and text
    of the first five internal links.
    """
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.ENABLED,
        exclude_external_links=True,
        exclude_social_media_links=True,
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            config=run_config,
        )

        internal_count = len(result.links['internal'])
        print(f"Found {internal_count} internal links")
        external_count = len(result.links['external'])
        print(f"Found {external_count} external links")

        first_five = result.links['internal'][:5]
        for entry in first_five:
            print(f"Href: {entry['href']}\nText: {entry['text']}\n")
|
||||
|
||||
# JavaScript Execution Example
|
||||
async def simple_example_with_running_js_code():
    """Run a "Load More" click script on the page, then print the first 500 markdown characters."""
    print("\n--- Executing JavaScript and Using CSS Selectors ---")

    # Finds the first button whose text contains "Load More" and clicks it, if any.
    click_load_more = "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"

    js_browser = BrowserConfig(headless=True, java_script_enabled=True)
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        js_code=click_load_more,
        # Optional wait condition, currently disabled:
        # wait_for="() => { return Array.from(document.querySelectorAll('article.tease-card')).length > 10; }"
    )

    async with AsyncWebCrawler(config=js_browser) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            config=run_config,
        )
        excerpt = result.markdown[:500]
        print(excerpt)
|
||||
|
||||
|
||||
# CSS Selector Example
|
||||
async def simple_example_with_css_selector():
    """Crawl with a ``css_selector`` set and print the first 500 markdown characters."""
    print("\n--- Using CSS Selectors ---")

    selector = ".wide-tease-item__description"
    headless_browser = BrowserConfig(headless=True)
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, css_selector=selector)

    async with AsyncWebCrawler(config=headless_browser) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            config=run_config,
        )
        excerpt = result.markdown[:500]
        print(excerpt)
|
||||
|
||||
async def media_handling():
    """Crawl the NBC News business page and print details of its first five images."""
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        exclude_external_images=True,
        screenshot=True,
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            config=run_config,
        )
        first_images = result.media['images'][:5]
        for image in first_images:
            print(f"Image URL: {image['src']}, Alt: {image['alt']}, Score: {image['score']}")
|
||||
|
||||
async def custom_hook_workflow(verbose=True):
    """Crawl crawl4ai.com with a ``before_goto`` hook that logs before navigation.

    Args:
        verbose: Currently unused; kept so existing callers keep working.
    """

    async def before_goto(page, context=None, **kwargs):
        # Accept whatever extra keyword arguments the crawler passes to the
        # hook (such as the target URL) instead of failing on them, and hand
        # the page back as the other hook examples do.
        print("[Hook] Preparing to navigate...")
        return page

    async with AsyncWebCrawler() as crawler:
        # Set a 'before_goto' hook to run custom code just before navigation
        crawler.crawler_strategy.set_hook("before_goto", before_goto)

        # Perform the crawl operation
        result = await crawler.arun(
            url="https://crawl4ai.com"
        )
        # Newlines are flattened so the preview prints as a single line.
        print(result.markdown_v2.raw_markdown[:500].replace("\n", " -- "))
|
||||
|
||||
|
||||
# Proxy Example
|
||||
async def use_proxy():
    """Crawl through a placeholder proxy and print the first 500 markdown characters on success."""
    print("\n--- Using a Proxy ---")

    # Placeholder proxy endpoint and credentials.
    proxy_settings = {
        "server": "http://proxy.example.com:8080",
        "username": "username",
        "password": "password",
    }
    proxied_browser = BrowserConfig(headless=True, proxy_config=proxy_settings)
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

    async with AsyncWebCrawler(config=proxied_browser) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            config=run_config,
        )
        if not result.success:
            return
        print(result.markdown[:500])
|
||||
|
||||
|
||||
# Screenshot Example
|
||||
async def capture_and_save_screenshot(url: str, output_path: str):
    """Crawl ``url`` with screenshots enabled and write the decoded image to ``output_path``.

    Prints a success message with the path, or a failure message when the
    crawl did not succeed or returned no screenshot.
    """
    headless_browser = BrowserConfig(headless=True)
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, screenshot=True)

    async with AsyncWebCrawler(config=headless_browser) as crawler:
        result = await crawler.arun(url=url, config=run_config)

        if not (result.success and result.screenshot):
            print("Failed to capture screenshot")
            return

        import base64

        # The screenshot is base64 text; decode it to raw bytes before writing.
        image_bytes = base64.b64decode(result.screenshot)
        with open(output_path, "wb") as image_file:
            image_file.write(image_bytes)
        print(f"Screenshot saved successfully to {output_path}")
|
||||
|
||||
|
||||
# LLM Extraction Example
|
||||
class OpenAIModelFee(BaseModel):
    # One pricing entry: a model name plus its input and output token fees.
    # Documented with comments rather than a docstring so the generated JSON
    # schema stays the same.
    model_name: str = Field(
        ...,
        description="Name of the OpenAI model.",
    )
    input_fee: str = Field(
        ...,
        description="Fee for input token for the OpenAI model.",
    )
    output_fee: str = Field(
        ...,
        description="Fee for output token for the OpenAI model.",
    )
|
||||
|
||||
|
||||
async def extract_structured_data_using_llm(
    provider: str, api_token: str = None, extra_headers: Dict[str, str] = None
):
    """Extract model pricing from the OpenAI pricing page with an LLM and print it.

    Args:
        provider: Provider string passed to ``LLMExtractionStrategy``.
        api_token: Token for the provider. When it is ``None`` and
            ``provider`` is not exactly ``"ollama"``, the example is skipped.
        extra_headers: Optional headers forwarded under
            ``extra_args["extra_headers"]``.
    """
    print(f"\n--- Extracting Structured Data with {provider} ---")

    token_missing = api_token is None
    if token_missing and provider != "ollama":
        print(f"API token is required for {provider}. Skipping this example.")
        return

    headless_browser = BrowserConfig(headless=True)

    llm_args = {"temperature": 0, "top_p": 0.9, "max_tokens": 2000}
    if extra_headers:
        llm_args["extra_headers"] = extra_headers

    llm_strategy = LLMExtractionStrategy(
        provider=provider,
        api_token=api_token,
        schema=OpenAIModelFee.model_json_schema(),
        extraction_type="schema",
        instruction=(
            "From the crawled content, extract all mentioned model names along with their fees for input and output tokens.\n"
            "Do not miss any models in the entire content."
        ),
        extra_args=llm_args,
    )
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        word_count_threshold=1,
        page_timeout=80000,
        extraction_strategy=llm_strategy,
    )

    async with AsyncWebCrawler(config=headless_browser) as crawler:
        result = await crawler.arun(
            url="https://openai.com/api/pricing/",
            config=run_config,
        )
        print(result.extracted_content)
|
||||
|
||||
|
||||
# CSS Extraction Example
|
||||
async def extract_structured_data_using_css_extractor():
    """Extract course data from the KidoCode technology page with a CSS schema.

    A script first clicks through every tab so their content is in the DOM,
    then ``JsonCssExtractionStrategy`` pulls the fields listed in ``schema``.
    Prints how many items were extracted and, if there are any, the first one.
    """
    print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
    schema = {
        "name": "KidoCode Courses",
        "baseSelector": "section.charge-methodology .w-tab-content > div",
        "fields": [
            {
                "name": "section_title",
                "selector": "h3.heading-50",
                "type": "text",
            },
            {
                "name": "section_description",
                "selector": ".charge-content",
                "type": "text",
            },
            {
                "name": "course_name",
                "selector": ".text-block-93",
                "type": "text",
            },
            {
                "name": "course_description",
                "selector": ".course-content-text",
                "type": "text",
            },
            {
                "name": "course_icon",
                "selector": ".image-92",
                "type": "attribute",
                "attribute": "src",
            },
        ],
    }

    browser_config = BrowserConfig(headless=True, java_script_enabled=True)

    # Scrolls to and clicks each tab in turn, pausing 500 ms after each click.
    js_click_tabs = """
    (async () => {
        const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");
        for(let tab of tabs) {
            tab.scrollIntoView();
            tab.click();
            await new Promise(r => setTimeout(r, 500));
        }
    })();
    """

    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        extraction_strategy=JsonCssExtractionStrategy(schema),
        js_code=[js_click_tabs],
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://www.kidocode.com/degrees/technology", config=crawler_config
        )

        companies = json.loads(result.extracted_content)
        print(f"Successfully extracted {len(companies)} companies")
        # Only show a sample when something was extracted; indexing an empty
        # result would raise IndexError.
        if companies:
            print(json.dumps(companies[0], indent=2))
|
||||
|
||||
|
||||
# Dynamic Content Examples - Method 1
|
||||
async def crawl_dynamic_content_pages_method_1():
    """Page through three pages of TypeScript commits using a JS click plus a wait hook.

    The ``on_execution_started`` hook polls the first commit heading until it
    differs from the one seen previously, so each crawl sees the new page's
    commit list. The commit items are then counted with BeautifulSoup.
    """
    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
    first_commit = ""
    heading_selector = "li.Box-sc-g0xbh4-0 h4"

    async def on_execution_started(page, **kwargs):
        # Remembers the last heading seen across calls.
        nonlocal first_commit
        try:
            while True:
                await page.wait_for_selector(heading_selector)
                heading = await page.query_selector(heading_selector)
                heading_text = await heading.evaluate("(element) => element.textContent")
                normalized = re.sub(r"\s+", "", heading_text)
                if normalized and normalized != first_commit:
                    first_commit = normalized
                    return
                await asyncio.sleep(0.5)
        except Exception as e:
            print(f"Warning: New content didn't appear after JavaScript execution: {e}")

    browser_config = BrowserConfig(headless=False, java_script_enabled=True)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)

        url = "https://github.com/microsoft/TypeScript/commits/main"
        session_id = "typescript_commits_session"
        all_commits = []

        # Clicks the "next" pagination link when it exists.
        js_next_page = """
        const button = document.querySelector('a[data-testid="pagination-next-button"]');
        if (button) button.click();
        """

        for page_index in range(3):
            page_number = page_index + 1
            # The first request loads the page; later ones only run the click
            # script in the same session.
            is_follow_up = page_index > 0
            run_config = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                css_selector="li.Box-sc-g0xbh4-0",
                js_code=js_next_page if is_follow_up else None,
                js_only=is_follow_up,
                session_id=session_id,
            )

            result = await crawler.arun(url=url, config=run_config)
            assert result.success, f"Failed to crawl page {page_number}"

            parsed = BeautifulSoup(result.cleaned_html, "html.parser")
            page_commits = parsed.select("li")
            all_commits.extend(page_commits)

            print(f"Page {page_number}: Found {len(page_commits)} commits")

        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
|
||||
|
||||
|
||||
# Dynamic Content Examples - Method 2
|
||||
async def crawl_dynamic_content_pages_method_2():
    """Page through three pages of TypeScript commits, waiting inside the injected JS.

    The injected script clicks "next" and polls until the first commit
    heading changes. Commit titles are then extracted with a CSS schema.
    """
    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")

    browser_config = BrowserConfig(headless=False, java_script_enabled=True)

    # Records the first commit heading, clicks the "next" link, then polls
    # every 100 ms until the first heading is different.
    js_next_page_and_wait = """
    (async () => {
        const getCurrentCommit = () => {
            const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
            return commits.length > 0 ? commits[0].textContent.trim() : null;
        };

        const initialCommit = getCurrentCommit();
        const button = document.querySelector('a[data-testid="pagination-next-button"]');
        if (button) button.click();

        while (true) {
            await new Promise(resolve => setTimeout(resolve, 100));
            const newCommit = getCurrentCommit();
            if (newCommit && newCommit !== initialCommit) {
                break;
            }
        }
    })();
    """

    title_field = {
        "name": "title",
        "selector": "h4.markdown-title",
        "type": "text",
        "transform": "strip",
    }
    schema = {
        "name": "Commit Extractor",
        "baseSelector": "li.Box-sc-g0xbh4-0",
        "fields": [title_field],
    }

    async with AsyncWebCrawler(config=browser_config) as crawler:
        url = "https://github.com/microsoft/TypeScript/commits/main"
        session_id = "typescript_commits_session"
        all_commits = []

        extraction_strategy = JsonCssExtractionStrategy(schema)

        for page_index in range(3):
            page_number = page_index + 1
            # The first request loads the page; later ones only run the
            # click-and-wait script in the same session.
            is_follow_up = page_index > 0
            run_config = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                css_selector="li.Box-sc-g0xbh4-0",
                extraction_strategy=extraction_strategy,
                js_code=js_next_page_and_wait if is_follow_up else None,
                js_only=is_follow_up,
                session_id=session_id,
            )

            result = await crawler.arun(url=url, config=run_config)
            assert result.success, f"Failed to crawl page {page_number}"

            page_commits = json.loads(result.extracted_content)
            all_commits.extend(page_commits)
            print(f"Page {page_number}: Found {len(page_commits)} commits")

        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
|
||||
|
||||
|
||||
async def cosine_similarity_extraction():
    """Crawl an NBC News article with ``CosineStrategy`` and print the first five extracted items."""
    # Imported locally: the module-level import from
    # crawl4ai.extraction_strategy does not include CosineStrategy.
    from crawl4ai.extraction_strategy import CosineStrategy

    crawl_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        extraction_strategy=CosineStrategy(
            word_count_threshold=10,
            max_dist=0.2,  # Maximum distance between two words
            linkage_method="ward",  # Linkage method for hierarchical clustering (ward, complete, average, single)
            top_k=3,  # Number of top keywords to extract
            sim_threshold=0.3,  # Similarity threshold for clustering
            semantic_filter="McDonald's economic impact, American consumer trends",  # Keywords to filter the content semantically using embeddings
            verbose=True
        ),
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business/consumer/how-mcdonalds-e-coli-crisis-inflation-politics-reflect-american-story-rcna177156",
            config=crawl_config
        )
        print(json.loads(result.extracted_content)[:5])
|
||||
|
||||
# Browser Comparison
|
||||
async def crawl_custom_browser_type():
    """Crawl example.com with Firefox, WebKit and Chromium in turn, timing each.

    For every browser, prints the elapsed seconds (measured from just before
    the crawler is created) and the first 500 characters of markdown.
    """
    print("\n--- Browser Comparison ---")

    # (printed label, browser_type), in the order they are run.
    browsers = (
        ("Firefox:", "firefox"),
        ("WebKit:", "webkit"),
        ("Chromium:", "chromium"),  # Chromium (default)
    )

    for label, browser_type in browsers:
        browser_config = BrowserConfig(browser_type=browser_type, headless=True)
        started_at = time.time()
        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(
                url="https://www.example.com",
                config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
            )
            print(label, time.time() - started_at)
            print(result.markdown[:500])
|
||||
|
||||
|
||||
# Anti-Bot and User Simulation
|
||||
async def crawl_with_user_simulation():
    """Crawl a placeholder URL with a random mobile user agent and the user-simulation flags on."""
    user_agent_options = {"device_type": "mobile", "os_type": "android"}
    simulated_browser = BrowserConfig(
        headless=True,
        user_agent_mode="random",
        user_agent_generator_config=user_agent_options,
    )

    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        magic=True,
        simulate_user=True,
        override_navigator=True,
    )

    async with AsyncWebCrawler(config=simulated_browser) as crawler:
        # Replace the placeholder with a real URL before running.
        result = await crawler.arun(url="YOUR-URL-HERE", config=run_config)
        print(result.markdown)
|
||||
|
||||
async def ssl_certification():
    """Fetch example.com's SSL certificate, print its details and export it.

    The certificate is written as JSON, PEM and DER files into a ``tmp``
    directory next to this script, which is created if missing.
    """
    # Configure crawler to fetch SSL certificate
    config = CrawlerRunConfig(
        fetch_ssl_certificate=True,
        cache_mode=CacheMode.BYPASS  # Bypass cache to always get fresh certificates
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url='https://example.com',
            config=config
        )

        if result.success and result.ssl_certificate:
            cert = result.ssl_certificate

            # Output directory for the exported files. It is defined here so
            # the function does not depend on an outer `tmp_dir` name.
            tmp_dir = os.path.join(__location__, "tmp")
            os.makedirs(tmp_dir, exist_ok=True)

            # 1. Access certificate properties directly
            print("\nCertificate Information:")
            print(f"Issuer: {cert.issuer.get('CN', '')}")
            print(f"Valid until: {cert.valid_until}")
            print(f"Fingerprint: {cert.fingerprint}")

            # 2. Export certificate in different formats
            cert.to_json(os.path.join(tmp_dir, "certificate.json"))  # For analysis
            print("\nCertificate exported to:")
            print(f"- JSON: {os.path.join(tmp_dir, 'certificate.json')}")

            cert.to_pem(os.path.join(tmp_dir, "certificate.pem"))  # For web servers
            print(f"- PEM: {os.path.join(tmp_dir, 'certificate.pem')}")

            cert.to_der(os.path.join(tmp_dir, "certificate.der"))  # For Java apps
            print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}")
|
||||
|
||||
# Speed Comparison
|
||||
async def speed_comparison():
    """Time Firecrawl against two Crawl4AI runs on the same NBC News page.

    Three measurements are printed: a Firecrawl ``scrape_url`` call, a plain
    Crawl4AI crawl, and a Crawl4AI crawl whose markdown generator uses a
    ``PruningContentFilter``. Requires ``FIRECRAWL_API_KEY`` in the
    environment (a missing key raises ``KeyError``).
    """
    print("\n--- Speed Comparison ---")

    # Firecrawl comparison (imported lazily so the other examples do not need it)
    from firecrawl import FirecrawlApp

    target_url = "https://www.nbcnews.com/business"
    image_host = "cldnry.s-nbcnews.com"

    firecrawl_client = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
    started = time.time()
    scraped = firecrawl_client.scrape_url(
        target_url, params={"formats": ["markdown", "html"]}
    )
    finished = time.time()
    print("Firecrawl:")
    print(f"Time taken: {finished - started:.2f} seconds")
    print(f"Content length: {len(scraped['markdown'])} characters")
    print(f"Images found: {scraped['markdown'].count(image_host)}")
    print()

    # Crawl4AI comparisons
    browser_options = BrowserConfig(headless=True)

    async with AsyncWebCrawler(config=browser_options) as crawler:
        # Simple crawl
        started = time.time()
        plain = await crawler.arun(
            url=target_url,
            config=CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS, word_count_threshold=0
            ),
        )
        finished = time.time()
        print("Crawl4AI (simple crawl):")
        print(f"Time taken: {finished - started:.2f} seconds")
        print(f"Content length: {len(plain.markdown)} characters")
        print(f"Images found: {plain.markdown.count(image_host)}")
        print()

        # Advanced filtering
        started = time.time()
        pruning_filter = PruningContentFilter(
            threshold=0.48, threshold_type="fixed", min_word_threshold=0
        )
        filtered = await crawler.arun(
            url=target_url,
            config=CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                word_count_threshold=0,
                markdown_generator=DefaultMarkdownGenerator(
                    content_filter=pruning_filter
                ),
            ),
        )
        finished = time.time()
        print("Crawl4AI (Markdown Plus):")
        print(f"Time taken: {finished - started:.2f} seconds")
        print(f"Content length: {len(filtered.markdown_v2.raw_markdown)} characters")
        print(f"Fit Markdown: {len(filtered.markdown_v2.fit_markdown)} characters")
        print(f"Images found: {filtered.markdown.count(image_host)}")
        print()
|
||||
|
||||
|
||||
# Main execution
|
||||
async def main():
    """Entry point: run the enabled example(s).

    Only the LLM extraction example is awaited; every other example call is
    left commented out and can be enabled individually.
    """
    # --- Basic examples ---
    # await simple_crawl()
    # await simple_example_with_running_js_code()
    # await simple_example_with_css_selector()

    # --- Advanced examples ---
    # await extract_structured_data_using_css_extractor()
    openai_key = os.getenv("OPENAI_API_KEY")
    await extract_structured_data_using_llm("openai/gpt-4o", openai_key)
    # await crawl_dynamic_content_pages_method_1()
    # await crawl_dynamic_content_pages_method_2()

    # --- Browser comparisons ---
    # await crawl_custom_browser_type()

    # --- Performance testing ---
    # await speed_comparison()

    # --- Screenshot example ---
    # await capture_and_save_screenshot(
    #     "https://www.example.com",
    #     os.path.join(__location__, "tmp/example_screenshot.jpg")
    # )


if __name__ == "__main__":
    asyncio.run(main())
|
||||
640
docs/examples/quickstart_async.py
Normal file
@@ -0,0 +1,640 @@
|
||||
import os
import sys

# Append the directory three levels above this file to the import path so the
# in-repo ``crawl4ai`` package can be imported when running from a checkout.
# This must happen before the ``crawl4ai`` imports below.
sys.path.append(
    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
)

# SECURITY: a Firecrawl API key used to be hard-coded into os.environ here.
# Credentials must never be committed. Export FIRECRAWL_API_KEY in your shell
# before running speed_comparison(); the previously committed key should be
# treated as leaked and revoked.

import asyncio
# import nest_asyncio
# nest_asyncio.apply()

import time
import json
import re
from typing import Dict, List
from bs4 import BeautifulSoup
from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
from crawl4ai.extraction_strategy import (
    JsonCssExtractionStrategy,
    LLMExtractionStrategy,
)

# Absolute directory of this script; used to build output paths.
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

print("Crawl4AI: Advanced Web Crawling and Data Extraction")
print("GitHub Repository: https://github.com/unclecode/crawl4ai")
print("Twitter: @unclecode")
print("Website: https://crawl4ai.com")
|
||||
|
||||
|
||||
async def simple_crawl():
    """Crawl the NBC News business page (cache bypassed) and print a preview.

    Prints the first 500 characters of the generated markdown.
    """
    print("\n--- Basic Usage ---")
    async with AsyncWebCrawler(verbose=True) as crawler:
        page_result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            cache_mode=CacheMode.BYPASS,
        )
        preview = page_result.markdown[:500]
        print(preview)
|
||||
|
||||
async def simple_example_with_running_js_code():
    """Click the page's "Load More" button via injected JavaScript, then crawl.

    Prints the first 500 characters of the resulting markdown. A ``wait_for``
    predicate is prepared but not passed to ``arun`` (the argument is
    commented out).
    """
    print("\n--- Executing JavaScript and Using CSS Selectors ---")

    # Predicate for the (currently disabled) wait_for parameter.
    # A plain CSS selector such as "article.tease-card:nth-child(10)" would
    # also be accepted there.
    wait_for = """() => {
        return Array.from(document.querySelectorAll('article.tease-card')).length > 10;
    }"""

    click_load_more = (
        "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
    )

    async with AsyncWebCrawler(verbose=True) as crawler:
        scripts = [click_load_more]
        page_result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            js_code=scripts,
            # wait_for=wait_for,
            cache_mode=CacheMode.BYPASS,
        )
        print(page_result.markdown[:500])
|
||||
|
||||
async def simple_example_with_css_selector():
    """Crawl the NBC News business page restricted to one CSS selector.

    Passes ``.wide-tease-item__description`` as ``css_selector`` and prints
    the first 500 characters of the markdown.
    """
    print("\n--- Using CSS Selectors ---")
    crawl_kwargs = {
        "url": "https://www.nbcnews.com/business",
        "css_selector": ".wide-tease-item__description",
        "cache_mode": CacheMode.BYPASS,
    }
    async with AsyncWebCrawler(verbose=True) as crawler:
        page_result = await crawler.arun(**crawl_kwargs)
        print(page_result.markdown[:500])
|
||||
|
||||
async def use_proxy():
    """Crawl through a proxy and print a markdown preview on success.

    The proxy URL is a placeholder and must be replaced with a working proxy
    for the example to succeed.
    """
    print("\n--- Using a Proxy ---")
    print(
        "Note: Replace 'http://your-proxy-url:port' with a working proxy to run this example."
    )
    proxy_url = "http://your-proxy-url:port"
    async with AsyncWebCrawler(verbose=True, proxy=proxy_url) as crawler:
        page_result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            cache_mode=CacheMode.BYPASS,
        )
        if not page_result.success:
            return
        print(page_result.markdown[:500])  # first 500 characters only
|
||||
|
||||
async def capture_and_save_screenshot(url: str, output_path: str):
    """Crawl ``url`` with screenshot capture and save the image to ``output_path``.

    The crawler returns the screenshot as base64 text, which is decoded and
    written as raw bytes. The parent directory of ``output_path`` is created
    when missing, so a fresh checkout without a ``tmp/`` folder does not fail
    on ``open()``.

    Args:
        url: Page to capture.
        output_path: Destination file for the decoded screenshot bytes.
    """
    import base64
    import os

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url=url,
            screenshot=True,
            cache_mode=CacheMode.BYPASS,
        )

        if result.success and result.screenshot:
            # Decode the base64 screenshot data
            screenshot_data = base64.b64decode(result.screenshot)

            # Ensure the target directory exists before writing.
            parent_dir = os.path.dirname(output_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

            with open(output_path, 'wb') as f:
                f.write(screenshot_data)

            print(f"Screenshot saved successfully to {output_path}")
        else:
            print("Failed to capture screenshot")
|
||||
|
||||
# Schema for one extracted pricing row: a model name plus its input and output
# token fees, all kept as free-form strings. Documented with comments rather
# than a class docstring so the generated JSON schema stays unchanged.
class OpenAIModelFee(BaseModel):
    model_name: str = Field(
        ...,
        description="Name of the OpenAI model.",
    )
    input_fee: str = Field(
        ...,
        description="Fee for input token for the OpenAI model.",
    )
    output_fee: str = Field(
        ...,
        description="Fee for output token for the OpenAI model.",
    )
|
||||
|
||||
async def extract_structured_data_using_llm(provider: str, api_token: str = None, extra_headers: Dict[str, str] = None):
    """Extract OpenAI pricing rows from the pricing page with an LLM strategy.

    Args:
        provider: Provider/model identifier such as ``"openai/gpt-4o"`` or
            ``"ollama/llama3.2"``.
        api_token: API token for the provider. May be omitted for Ollama
            providers; for any other provider the example is skipped when
            no token is given.
        extra_headers: Optional headers forwarded to the LLM call through
            ``extra_args["extra_headers"]``.

    Prints the extracted content; returns ``None``.
    """
    print(f"\n--- Extracting Structured Data with {provider} ---")

    # Ollama needs no token. Providers are written as "ollama/<model>", so
    # match on the prefix: an exact comparison with "ollama" wrongly skipped
    # calls such as extract_structured_data_using_llm("ollama/llama3.2").
    is_ollama = provider == "ollama" or provider.startswith("ollama/")
    if api_token is None and not is_ollama:
        print(f"API token is required for {provider}. Skipping this example.")
        return

    extra_args = {
        "temperature": 0,
        "top_p": 0.9,
        "max_tokens": 2000,
        # any other supported parameters for litellm
    }
    if extra_headers:
        extra_args["extra_headers"] = extra_headers

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://openai.com/api/pricing/",
            word_count_threshold=1,
            extraction_strategy=LLMExtractionStrategy(
                provider=provider,
                api_token=api_token,
                schema=OpenAIModelFee.model_json_schema(),
                extraction_type="schema",
                instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
            Do not miss any models in the entire content. One extracted model JSON format should look like this:
            {"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}.""",
                extra_args=extra_args,
            ),
            cache_mode=CacheMode.BYPASS,
        )
        print(result.extracted_content)
|
||||
|
||||
async def extract_structured_data_using_css_extractor():
    """Extract course data from the KidoCode technology page via CSS selectors.

    Injected JavaScript clicks through every tab first (pausing 500 ms after
    each click), then a ``JsonCssExtractionStrategy`` extracts the fields.
    Prints how many items were extracted and the first item as indented JSON.
    """
    print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")

    # (field name, selector) pairs extracted as plain text.
    text_selectors = [
        ("section_title", "h3.heading-50"),
        ("section_description", ".charge-content"),
        ("course_name", ".text-block-93"),
        ("course_description", ".course-content-text"),
    ]
    field_specs = [
        {"name": field_name, "selector": selector, "type": "text"}
        for field_name, selector in text_selectors
    ]
    # The icon is read from the element's ``src`` attribute rather than its text.
    field_specs.append(
        {
            "name": "course_icon",
            "selector": ".image-92",
            "type": "attribute",
            "attribute": "src",
        }
    )
    schema = {
        "name": "KidoCode Courses",
        "baseSelector": "section.charge-methodology .w-tab-content > div",
        "fields": field_specs,
    }

    # JavaScript that clicks each tab in turn so all tab panels get rendered.
    js_click_tabs = """
    (async () => {
        const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");

        for(let tab of tabs) {
            // scroll to the tab
            tab.scrollIntoView();
            tab.click();
            // Wait for content to load and animations to complete
            await new Promise(r => setTimeout(r, 500));
        }
    })();
    """

    async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
        result = await crawler.arun(
            url="https://www.kidocode.com/degrees/technology",
            extraction_strategy=JsonCssExtractionStrategy(schema, verbose=True),
            js_code=[js_click_tabs],
            cache_mode=CacheMode.BYPASS,
        )

        extracted_items = json.loads(result.extracted_content)
        print(f"Successfully extracted {len(extracted_items)} companies")
        first_item = extracted_items[0]
        print(json.dumps(first_item, indent=2))
|
||||
|
||||
# Advanced Session-Based Crawling with Dynamic Content 🔄
|
||||
async def crawl_dynamic_content_pages_method_1():
    """Crawl three pages of TypeScript commits using a session and a hook.

    Pagination is driven by JavaScript that clicks the "next" button. An
    ``on_execution_started`` hook polls the page until the first commit title
    differs from the one seen on the previous page, so the HTML is read only
    after the new page has rendered. Commits are collected by parsing the
    cleaned HTML with BeautifulSoup.

    Raises:
        AssertionError: if any page fails to crawl.
    """
    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
    first_commit = ""

    async def on_execution_started(page):
        """Block until the first listed commit changes (new page loaded)."""
        nonlocal first_commit
        try:
            while True:
                await page.wait_for_selector("li.Box-sc-g0xbh4-0 h4")
                commit = await page.query_selector("li.Box-sc-g0xbh4-0 h4")
                commit = await commit.evaluate("(element) => element.textContent")
                # Strip all whitespace so formatting differences don't count as a change.
                commit = re.sub(r"\s+", "", commit)
                if commit and commit != first_commit:
                    first_commit = commit
                    break
                await asyncio.sleep(0.5)
        except Exception as e:
            # Best effort: report and let the crawl continue with whatever is on the page.
            print(f"Warning: New content didn't appear after JavaScript execution: {e}")

    async with AsyncWebCrawler(verbose=True) as crawler:
        crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)

        url = "https://github.com/microsoft/TypeScript/commits/main"
        session_id = "typescript_commits_session"
        all_commits = []

        js_next_page = """
        (() => {
            const button = document.querySelector('a[data-testid="pagination-next-button"]');
            if (button) button.click();
        })();
        """

        for page in range(3):  # Crawl 3 pages
            result = await crawler.arun(
                url=url,
                session_id=session_id,
                css_selector="li.Box-sc-g0xbh4-0",
                # Use ``js_code`` (as methods 2 and 3 do) rather than ``js`` so
                # the pagination script is passed under the same keyword everywhere.
                js_code=js_next_page if page > 0 else None,
                cache_mode=CacheMode.BYPASS,
                js_only=page > 0,
                headless=False,
            )

            assert result.success, f"Failed to crawl page {page + 1}"

            soup = BeautifulSoup(result.cleaned_html, "html.parser")
            commits = soup.select("li")
            all_commits.extend(commits)

            print(f"Page {page + 1}: Found {len(commits)} commits")

        await crawler.crawler_strategy.kill_session(session_id)
        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
|
||||
|
||||
async def crawl_dynamic_content_pages_method_2():
    """Crawl three pages of TypeScript commits; waiting is done in-page.

    The injected JavaScript clicks "next" and then polls every 100 ms until
    the first commit title changes. Commit titles are extracted with a
    ``JsonCssExtractionStrategy``.

    Raises:
        AssertionError: if any page fails to crawl.
    """
    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")

    commits_url = "https://github.com/microsoft/TypeScript/commits/main"
    session_id = "typescript_commits_session"

    js_next_page_and_wait = """
    (async () => {
        const getCurrentCommit = () => {
            const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
            return commits.length > 0 ? commits[0].textContent.trim() : null;
        };

        const initialCommit = getCurrentCommit();
        const button = document.querySelector('a[data-testid="pagination-next-button"]');
        if (button) button.click();

        // Poll for changes
        while (true) {
            await new Promise(resolve => setTimeout(resolve, 100)); // Wait 100ms
            const newCommit = getCurrentCommit();
            if (newCommit && newCommit !== initialCommit) {
                break;
            }
        }
    })();
    """

    title_field = {
        "name": "title",
        "selector": "h4.markdown-title",
        "type": "text",
        "transform": "strip",
    }
    schema = {
        "name": "Commit Extractor",
        "baseSelector": "li.Box-sc-g0xbh4-0",
        "fields": [title_field],
    }

    async with AsyncWebCrawler(verbose=True) as crawler:
        collected = []
        extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)

        for page_index in range(3):  # Crawl 3 pages
            is_followup = page_index > 0
            result = await crawler.arun(
                url=commits_url,
                session_id=session_id,
                css_selector="li.Box-sc-g0xbh4-0",
                extraction_strategy=extraction_strategy,
                # The first page is loaded normally; later pages only run JS.
                js_code=js_next_page_and_wait if is_followup else None,
                js_only=is_followup,
                cache_mode=CacheMode.BYPASS,
                headless=False,
            )

            assert result.success, f"Failed to crawl page {page_index + 1}"

            page_commits = json.loads(result.extracted_content)
            collected.extend(page_commits)

            print(f"Page {page_index + 1}: Found {len(page_commits)} commits")

        await crawler.crawler_strategy.kill_session(session_id)
        print(f"Successfully crawled {len(collected)} commits across 3 pages")
|
||||
|
||||
async def crawl_dynamic_content_pages_method_3():
    """Crawl three pages of TypeScript commits using a ``wait_for`` predicate.

    The pagination script stores the current first commit title in
    ``window.firstCommit`` and clicks "next". The ``wait_for`` predicate then
    returns true once the first commit title differs from the stored one.
    Commit titles are extracted with a ``JsonCssExtractionStrategy``.

    Raises:
        AssertionError: if any page fails to crawl.
    """
    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution using `wait_for` ---")

    commits_url = "https://github.com/microsoft/TypeScript/commits/main"
    session_id = "typescript_commits_session"

    js_next_page = """
    const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
    if (commits.length > 0) {
        window.firstCommit = commits[0].textContent.trim();
    }
    const button = document.querySelector('a[data-testid="pagination-next-button"]');
    if (button) button.click();
    """

    wait_for = """() => {
        const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
        if (commits.length === 0) return false;
        const firstCommit = commits[0].textContent.trim();
        return firstCommit !== window.firstCommit;
    }"""

    title_field = {
        "name": "title",
        "selector": "h4.markdown-title",
        "type": "text",
        "transform": "strip",
    }
    schema = {
        "name": "Commit Extractor",
        "baseSelector": "li.Box-sc-g0xbh4-0",
        "fields": [title_field],
    }

    async with AsyncWebCrawler(verbose=True) as crawler:
        collected = []
        extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)

        for page_index in range(3):  # Crawl 3 pages
            is_followup = page_index > 0
            result = await crawler.arun(
                url=commits_url,
                session_id=session_id,
                css_selector="li.Box-sc-g0xbh4-0",
                extraction_strategy=extraction_strategy,
                # Pagination script and wait predicate apply only after page one.
                js_code=js_next_page if is_followup else None,
                wait_for=wait_for if is_followup else None,
                js_only=is_followup,
                cache_mode=CacheMode.BYPASS,
                headless=False,
            )

            assert result.success, f"Failed to crawl page {page_index + 1}"

            page_commits = json.loads(result.extracted_content)
            collected.extend(page_commits)

            print(f"Page {page_index + 1}: Found {len(page_commits)} commits")

        await crawler.crawler_strategy.kill_session(session_id)
        print(f"Successfully crawled {len(collected)} commits across 3 pages")
|
||||
|
||||
async def crawl_custom_browser_type():
    """Crawl example.com with Firefox, WebKit and the default browser in turn.

    For each run, prints the first 500 markdown characters and the elapsed
    wall-clock time, measured from just before the crawler is created.
    """
    # Extra constructor arguments per run; the empty dict leaves the browser
    # type at the crawler's default (Chromium, per the original comment).
    browser_variants = (
        {"browser_type": "firefox"},
        {"browser_type": "webkit"},
        {},
    )

    for variant in browser_variants:
        started = time.time()
        async with AsyncWebCrawler(**variant, verbose=True, headless=True) as crawler:
            page_result = await crawler.arun(
                url="https://www.example.com",
                cache_mode=CacheMode.BYPASS,
            )
            print(page_result.markdown[:500])
            print("Time taken: ", time.time() - started)
|
||||
|
||||
async def crawl_with_user_simultion():
    """Crawl a placeholder URL with ``magic`` enabled and print the markdown.

    The ``simulate_user`` and ``override_navigator`` options are shown but
    left disabled.
    """
    target = "YOUR-URL-HERE"
    async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
        page_result = await crawler.arun(
            url=target,
            cache_mode=CacheMode.BYPASS,
            # Automatically detects and removes overlays, popups, and other
            # elements that block content
            magic=True,
            # simulate_user = True,  # random mouse movements and clicks to simulate user interaction
            # override_navigator = True  # overrides the navigator object to look like a real user
        )

        print(page_result.markdown)
|
||||
|
||||
async def speed_comparison():
    """Time Firecrawl against three Crawl4AI runs on the NBC News business page.

    Measurements printed: Firecrawl ``scrape_url``; a plain Crawl4AI crawl; a
    crawl with a pruning content filter; and a crawl that also executes a
    "Load More" click script. A closing note on interpreting the numbers is
    printed last. Requires ``FIRECRAWL_API_KEY`` in the environment (a
    missing key raises ``KeyError``).
    """
    # Imported lazily so the other examples do not require firecrawl.
    from firecrawl import FirecrawlApp

    target_url = "https://www.nbcnews.com/business"
    image_host = "cldnry.s-nbcnews.com"

    def build_pruning_generator():
        # A fresh generator per run, as in the timed calls below.
        # Alternative filter: BM25ContentFilter(user_query=None, bm25_threshold=1.0)
        return DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0)
        )

    firecrawl_client = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY'])
    started = time.time()
    scraped = firecrawl_client.scrape_url(
        target_url,
        params={'formats': ['markdown', 'html']},
    )
    finished = time.time()
    print("Firecrawl:")
    print(f"Time taken: {finished - started:.2f} seconds")
    print(f"Content length: {len(scraped['markdown'])} characters")
    print(f"Images found: {scraped['markdown'].count(image_host)}")
    print()

    async with AsyncWebCrawler() as crawler:
        # Crawl4AI simple crawl
        started = time.time()
        result = await crawler.arun(
            url=target_url,
            word_count_threshold=0,
            cache_mode=CacheMode.BYPASS,
            verbose=False,
        )
        finished = time.time()
        print("Crawl4AI (simple crawl):")
        print(f"Time taken: {finished - started:.2f} seconds")
        print(f"Content length: {len(result.markdown)} characters")
        print(f"Images found: {result.markdown.count(image_host)}")
        print()

        # Crawl4AI with advanced content filtering
        started = time.time()
        result = await crawler.arun(
            url=target_url,
            word_count_threshold=0,
            markdown_generator=build_pruning_generator(),
            cache_mode=CacheMode.BYPASS,
            verbose=False,
        )
        finished = time.time()
        print("Crawl4AI (Markdown Plus):")
        print(f"Time taken: {finished - started:.2f} seconds")
        print(f"Content length: {len(result.markdown_v2.raw_markdown)} characters")
        print(f"Fit Markdown: {len(result.markdown_v2.fit_markdown)} characters")
        print(f"Images found: {result.markdown.count(image_host)}")
        print()

        # Crawl4AI with JavaScript execution
        started = time.time()
        result = await crawler.arun(
            url=target_url,
            js_code=[
                "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
            ],
            word_count_threshold=0,
            cache_mode=CacheMode.BYPASS,
            markdown_generator=build_pruning_generator(),
            verbose=False,
        )
        finished = time.time()
        print("Crawl4AI (with JavaScript execution):")
        print(f"Time taken: {finished - started:.2f} seconds")
        print(f"Content length: {len(result.markdown)} characters")
        print(f"Fit Markdown: {len(result.markdown_v2.fit_markdown)} characters")
        print(f"Images found: {result.markdown.count(image_host)}")

    closing_note = (
        "\nNote on Speed Comparison:",
        "The speed test conducted here may not reflect optimal conditions.",
        "When we call Firecrawl's API, we're seeing its best performance,",
        "while Crawl4AI's performance is limited by the local network speed.",
        "For a more accurate comparison, it's recommended to run these tests",
        "on servers with a stable and fast internet connection.",
        "Despite these limitations, Crawl4AI still demonstrates faster performance.",
        "If you run these tests in an environment with better network conditions,",
        "you may observe an even more significant speed advantage for Crawl4AI.",
    )
    for note_line in closing_note:
        print(note_line)
|
||||
|
||||
async def generate_knowledge_graph():
    """Extract entities and relationships from an essay and save them as JSON.

    Crawls https://paulgraham.com/love.html with an ``LLMExtractionStrategy``
    whose schema is the nested ``KnowledgeGraph`` model, then writes the
    extracted content to ``kb.json`` next to this script.
    """
    # The models below carry no docstrings on purpose: pydantic would copy
    # them into the JSON schema that is sent to the LLM.

    class Entity(BaseModel):
        name: str
        description: str

    class Relationship(BaseModel):
        entity1: Entity
        entity2: Entity
        description: str
        relation_type: str

    class KnowledgeGraph(BaseModel):
        entities: List[Entity]
        relationships: List[Relationship]

    graph_schema = KnowledgeGraph.model_json_schema()
    extraction_strategy = LLMExtractionStrategy(
        provider='openai/gpt-4o-mini',  # Or any other provider, including Ollama and open source models
        api_token=os.getenv('OPENAI_API_KEY'),  # In case of Ollama just pass "no-token"
        schema=graph_schema,
        extraction_type="schema",
        instruction="""Extract entities and relationships from the given text.""",
    )

    essay_url = "https://paulgraham.com/love.html"
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url=essay_url,
            cache_mode=CacheMode.BYPASS,
            extraction_strategy=extraction_strategy,
            # magic=True
        )
        output_file = os.path.join(__location__, "kb.json")
        with open(output_file, "w") as f:
            f.write(result.extracted_content)
|
||||
|
||||
async def fit_markdown_remove_overlay():
    """Crawl the KidoCode technology page and dump its markdown variants.

    The crawler uses a random Android mobile user agent; markdown is generated
    with a ``PruningContentFilter`` and links ignored. On success the lengths
    of the raw, citation and fit markdown are printed, and the cleaned HTML
    plus the three markdown variants are written under ``output/`` next to
    this script.

    The output directory is created when missing and files are written as
    UTF-8, so the example neither fails on a fresh checkout nor depends on
    the platform's default encoding for non-ASCII page content.
    """
    async with AsyncWebCrawler(
        headless=True,  # Set to False to see what is happening
        verbose=True,
        user_agent_mode="random",
        user_agent_generator_config={
            "device_type": "mobile",
            "os_type": "android"
        },
    ) as crawler:
        result = await crawler.arun(
            url='https://www.kidocode.com/degrees/technology',
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(
                content_filter=PruningContentFilter(
                    threshold=0.48, threshold_type="fixed", min_word_threshold=0
                ),
                options={
                    "ignore_links": True
                }
            ),
            # Alternative generator:
            # markdown_generator=DefaultMarkdownGenerator(
            #     content_filter=BM25ContentFilter(user_query="", bm25_threshold=1.0),
            #     options={
            #         "ignore_links": True
            #     }
            # ),
        )

        if result.success:
            print(len(result.markdown_v2.raw_markdown))
            print(len(result.markdown_v2.markdown_with_citations))
            print(len(result.markdown_v2.fit_markdown))

            output_dir = os.path.join(__location__, "output")
            os.makedirs(output_dir, exist_ok=True)

            # (file name, content) pairs, written in the original order.
            outputs = (
                ("cleaned_html.html", result.cleaned_html),
                ("output_raw_markdown.md", result.markdown_v2.raw_markdown),
                ("output_markdown_with_citations.md", result.markdown_v2.markdown_with_citations),
                ("output_fit_markdown.md", result.markdown_v2.fit_markdown),
            )
            for file_name, content in outputs:
                with open(os.path.join(output_dir, file_name), "w", encoding="utf-8") as f:
                    f.write(content)

            print("Done")
|
||||
|
||||
|
||||
async def main():
    """Entry point: run the enabled example(s).

    Only ``crawl_dynamic_content_pages_method_3`` is awaited; every other
    example call is left commented out and can be enabled individually.
    """
    # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))

    # --- Basic examples ---
    # await simple_crawl()
    # await simple_example_with_running_js_code()
    # await simple_example_with_css_selector()
    # # await use_proxy()
    # await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
    # await extract_structured_data_using_css_extractor()

    # --- LLM extraction examples ---
    # await extract_structured_data_using_llm()
    # await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
    # await extract_structured_data_using_llm("ollama/llama3.2")

    # Custom headers can always be passed to the extraction strategy:
    # custom_headers = {
    #     "Authorization": "Bearer your-custom-token",
    #     "X-Custom-Header": "Some-Value"
    # }
    # await extract_structured_data_using_llm(extra_headers=custom_headers)

    # --- Session-based multi-page crawling ---
    # await crawl_dynamic_content_pages_method_1()
    # await crawl_dynamic_content_pages_method_2()
    await crawl_dynamic_content_pages_method_3()

    # --- Browser comparisons ---
    # await crawl_custom_browser_type()

    # --- Performance testing ---
    # await speed_comparison()


if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -12,7 +12,7 @@ console = Console()
|
||||
|
||||
@lru_cache()
|
||||
def create_crawler():
|
||||
crawler = WebCrawler()
|
||||
crawler = WebCrawler(verbose=True)
|
||||
crawler.warmup()
|
||||
return crawler
|
||||
|
||||
@@ -35,10 +35,26 @@ def cprint(message, press_any_key=False):
|
||||
|
||||
def basic_usage(crawler):
|
||||
cprint("🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]")
|
||||
result = crawler.run(url="https://www.nbcnews.com/business")
|
||||
result = crawler.run(url="https://www.nbcnews.com/business", only_text = True)
|
||||
cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
|
||||
print_result(result)
|
||||
|
||||
def basic_usage_some_params(crawler):
    """Run a basic crawl with ``word_count_threshold=1`` and ``only_text=True``.

    Args:
        crawler: Crawler object exposing ``run(url=..., **options)``.
    """
    cprint("🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]")
    crawl_options = {"word_count_threshold": 1, "only_text": True}
    outcome = crawler.run(url="https://www.nbcnews.com/business", **crawl_options)
    cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
    print_result(outcome)
|
||||
|
||||
def screenshot_usage(crawler):
    """Crawl with ``screenshot=True`` and save the image as ``screenshot.png``.

    The result's base64 screenshot is decoded and written to the current
    working directory, then the result is printed.

    Args:
        crawler: Crawler object exposing ``run(url=..., screenshot=...)``.
    """
    cprint("\n📸 [bold cyan]Let's take a screenshot of the page![/bold cyan]")
    outcome = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
    cprint("[LOG] 📦 [bold yellow]Screenshot result:[/bold yellow]")
    # Decode inside the ``with`` so the file is opened before decoding, as before.
    with open("screenshot.png", "wb") as image_file:
        image_bytes = base64.b64decode(outcome.screenshot)
        image_file.write(image_bytes)
    cprint("Screenshot saved to 'screenshot.png'!")
    print_result(outcome)
|
||||
|
||||
def understanding_parameters(crawler):
|
||||
cprint("\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]")
|
||||
cprint("By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action.")
|
||||
@@ -86,7 +102,7 @@ def add_extraction_strategy(crawler):
|
||||
cprint("CosineStrategy uses cosine similarity to extract semantically similar blocks of text. Let's see it in action!")
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3)
|
||||
extraction_strategy=CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold = 0.3, verbose=True)
|
||||
)
|
||||
cprint("[LOG] 📦 [bold yellow]CosineStrategy result:[/bold yellow]")
|
||||
print_result(result)
|
||||
@@ -156,14 +172,118 @@ def interactive_extraction(crawler):
|
||||
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
|
||||
loadMoreButton && loadMoreButton.click();
|
||||
"""
|
||||
crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
|
||||
crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
|
||||
# crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
|
||||
# crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
js = js_code
|
||||
)
|
||||
cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
|
||||
print_result(result)
|
||||
|
||||
def multiple_scrip(crawler):
    """Crawl while passing the same "Load More" click script twice.

    The script list is handed to ``crawler.run`` through the ``js`` keyword,
    then the result is printed.

    Args:
        crawler: Crawler object exposing ``run(url=..., js=...)``.
    """
    cprint("\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]", True)
    cprint("In this example we try to click the 'Load More' button on the page using JavaScript code.")
    click_script = """
    const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
    loadMoreButton && loadMoreButton.click();
    """
    js_code = [click_script] * 2
    # Previously the script was configured on a dedicated strategy instead:
    # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
    # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
    outcome = crawler.run(
        url="https://www.nbcnews.com/business",
        js=js_code,
    )
    cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
    print_result(outcome)
|
||||
|
||||
def using_crawler_hooks(crawler):
    """Demonstrate the crawler hooks by wiring one callback to each of them.

    A ``LocalSeleniumCrawlerStrategy`` is created, callbacks are registered for
    ``on_driver_created``, ``before_get_url``, ``after_get_url`` and
    ``before_return_html``, and a new ``WebCrawler`` built on that strategy
    crawls https://example.com. The result is printed.

    Args:
        crawler: Not read by this function; a dedicated crawler is built so the
            hooks can be attached to its strategy.
    """

    def on_driver_created(driver):
        """Maximize the window, log in to a sample site and add a cookie."""
        print("[HOOK] on_driver_created")
        driver.maximize_window()

        # Example customization: logging in to a hypothetical website.
        driver.get('https://example.com/login')

        from selenium.webdriver.common.by import By
        from selenium.webdriver.support import expected_conditions as EC
        from selenium.webdriver.support.ui import WebDriverWait

        # One waiter (10 second limit) serves both the form and the landing page.
        wait = WebDriverWait(driver, 10)
        wait.until(EC.presence_of_element_located((By.NAME, 'username')))

        credentials = (('username', 'testuser'), ('password', 'password123'))
        for field_name, value in credentials:
            driver.find_element(By.NAME, field_name).send_keys(value)
        driver.find_element(By.NAME, 'login').click()

        wait.until(EC.presence_of_element_located((By.ID, 'welcome')))

        # Add a custom cookie.
        driver.add_cookie({'name': 'test_cookie', 'value': 'cookie_value'})
        return driver

    def before_get_url(driver):
        """Attach a custom HTTP header through the DevTools protocol."""
        print("[HOOK] before_get_url")
        # The Network domain is enabled first, then the extra header is set.
        driver.execute_cdp_cmd('Network.enable', {})
        extra_headers = {'headers': {'X-Test-Header': 'test'}}
        driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', extra_headers)
        return driver

    def after_get_url(driver):
        """Print the URL the driver currently reports."""
        print("[HOOK] after_get_url")
        print(driver.current_url)
        return driver

    def before_return_html(driver, html):
        """Print the length of the HTML received by the hook."""
        print("[HOOK] before_return_html")
        print(len(html))
        return driver

    cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)

    crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
    hooks = {
        'on_driver_created': on_driver_created,
        'before_get_url': before_get_url,
        'after_get_url': after_get_url,
        'before_return_html': before_return_html,
    }
    for hook_name, callback in hooks.items():
        crawler_strategy.set_hook(hook_name, callback)

    hooked_crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
    hooked_crawler.warmup()
    result = hooked_crawler.run(url="https://example.com")

    cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
    print_result(result=result)
|
||||
|
||||
def using_crawler_hooks_dleay_example(crawler):
    """Crawl google.com with an ``after_get_url`` hook that sleeps five seconds.

    A new ``WebCrawler`` is built on a ``LocalSeleniumCrawlerStrategy`` whose
    ``after_get_url`` hook pauses execution, then the page is crawled with the
    cache bypassed and the result is printed.

    Args:
        crawler: Not read by this function; a dedicated crawler is built so the
            delay hook can be attached to its strategy.
    """

    def pause_after_load(driver):
        """Block for five seconds, announcing the pause and the resume."""
        print("Delaying for 5 seconds...")
        time.sleep(5)
        print("Resuming...")

    cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's add a delay after fetching the url to make sure entire page is fetched.[/bold cyan]")

    # Build and warm up a crawler whose strategy carries the delay hook.
    delayed_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
    delayed_strategy.set_hook('after_get_url', pause_after_load)
    delayed_crawler = WebCrawler(verbose=True, crawler_strategy=delayed_strategy)
    delayed_crawler.warmup()

    result = delayed_crawler.run(url="https://google.com", bypass_cache=True)

    cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
    print_result(result)
|
||||
|
||||
|
||||
|
||||
def main():
|
||||
cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
|
||||
cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]")
|
||||
@@ -171,15 +291,19 @@ def main():
|
||||
|
||||
crawler = create_crawler()
|
||||
|
||||
crawler.always_by_pass_cache = True
|
||||
basic_usage(crawler)
|
||||
# basic_usage_some_params(crawler)
|
||||
understanding_parameters(crawler)
|
||||
|
||||
crawler.always_by_pass_cache = True
|
||||
screenshot_usage(crawler)
|
||||
add_chunking_strategy(crawler)
|
||||
add_extraction_strategy(crawler)
|
||||
add_llm_extraction_strategy(crawler)
|
||||
targeted_extraction(crawler)
|
||||
interactive_extraction(crawler)
|
||||
multiple_scrip(crawler)
|
||||
|
||||
cprint("\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]")
|
||||
|
||||
735
docs/examples/quickstart_v0.ipynb
Normal file
@@ -0,0 +1,735 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "6yLvrXn7yZQI"
|
||||
},
|
||||
"source": [
|
||||
"# Crawl4AI: Advanced Web Crawling and Data Extraction\n",
|
||||
"\n",
|
||||
"Welcome to this interactive notebook showcasing Crawl4AI, an advanced asynchronous web crawling and data extraction library.\n",
|
||||
"\n",
|
||||
"- GitHub Repository: [https://github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)\n",
|
||||
"- Twitter: [@unclecode](https://twitter.com/unclecode)\n",
|
||||
"- Website: [https://crawl4ai.com](https://crawl4ai.com)\n",
|
||||
"\n",
|
||||
"Let's explore the powerful features of Crawl4AI!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "KIn_9nxFyZQK"
|
||||
},
|
||||
"source": [
|
||||
"## Installation\n",
|
||||
"\n",
|
||||
"First, let's install Crawl4AI from GitHub:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "mSnaxLf3zMog"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!sudo apt-get update && sudo apt-get install -y libwoff1 libopus0 libwebp6 libwebpdemux2 libenchant1c2a libgudev-1.0-0 libsecret-1-0 libhyphen0 libgdk-pixbuf2.0-0 libegl1 libnotify4 libxslt1.1 libevent-2.1-7 libgles2 libvpx6 libxcomposite1 libatk1.0-0 libatk-bridge2.0-0 libepoxy0 libgtk-3-0 libharfbuzz-icu0"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "xlXqaRtayZQK"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pip install crawl4ai\n",
|
||||
"!pip install nest-asyncio\n",
|
||||
"!playwright install"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "qKCE7TI7yZQL"
|
||||
},
|
||||
"source": [
|
||||
"Now, let's import the necessary libraries:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"id": "I67tr7aAyZQL"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import asyncio\n",
|
||||
"import nest_asyncio\n",
|
||||
"from crawl4ai import AsyncWebCrawler\n",
|
||||
"from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy\n",
|
||||
"import json\n",
|
||||
"import time\n",
|
||||
"from pydantic import BaseModel, Field\n",
|
||||
"\n",
|
||||
"nest_asyncio.apply()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "h7yR_Rt_yZQM"
|
||||
},
|
||||
"source": [
|
||||
"## Basic Usage\n",
|
||||
"\n",
|
||||
"Let's start with a simple crawl example:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "yBh6hf4WyZQM",
|
||||
"outputId": "0f83af5c-abba-4175-ed95-70b7512e6bcc"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[LOG] 🌤️ Warming up the AsyncWebCrawler\n",
|
||||
"[LOG] 🌞 AsyncWebCrawler is ready to crawl\n",
|
||||
"[LOG] 🚀 Content extracted for https://www.nbcnews.com/business, success: True, time taken: 0.05 seconds\n",
|
||||
"[LOG] 🚀 Extraction done for https://www.nbcnews.com/business, time taken: 0.05 seconds.\n",
|
||||
"18102\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"async def simple_crawl():\n",
|
||||
" async with AsyncWebCrawler(verbose=True) as crawler:\n",
|
||||
" result = await crawler.arun(url=\"https://www.nbcnews.com/business\")\n",
|
||||
" print(len(result.markdown))\n",
|
||||
"await simple_crawl()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "9rtkgHI28uI4"
|
||||
},
|
||||
"source": [
|
||||
"💡 By default, **Crawl4AI** caches the result of every URL, so the next time you call it, you’ll get an instant result. But if you want to bypass the cache, just set `bypass_cache=True`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "MzZ0zlJ9yZQM"
|
||||
},
|
||||
"source": [
|
||||
"## Advanced Features\n",
|
||||
"\n",
|
||||
"### Executing JavaScript and Using CSS Selectors"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "gHStF86xyZQM",
|
||||
"outputId": "34d0fb6d-4dec-4677-f76e-85a1f082829b"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[LOG] 🌤️ Warming up the AsyncWebCrawler\n",
|
||||
"[LOG] 🌞 AsyncWebCrawler is ready to crawl\n",
|
||||
"[LOG] 🕸️ Crawling https://www.nbcnews.com/business using AsyncPlaywrightCrawlerStrategy...\n",
|
||||
"[LOG] ✅ Crawled https://www.nbcnews.com/business successfully!\n",
|
||||
"[LOG] 🚀 Crawling done for https://www.nbcnews.com/business, success: True, time taken: 6.06 seconds\n",
|
||||
"[LOG] 🚀 Content extracted for https://www.nbcnews.com/business, success: True, time taken: 0.10 seconds\n",
|
||||
"[LOG] 🔥 Extracting semantic blocks for https://www.nbcnews.com/business, Strategy: AsyncWebCrawler\n",
|
||||
"[LOG] 🚀 Extraction done for https://www.nbcnews.com/business, time taken: 0.11 seconds.\n",
|
||||
"41135\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"async def js_and_css():\n",
|
||||
" async with AsyncWebCrawler(verbose=True) as crawler:\n",
|
||||
" js_code = [\"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();\"]\n",
|
||||
" result = await crawler.arun(\n",
|
||||
" url=\"https://www.nbcnews.com/business\",\n",
|
||||
" js_code=js_code,\n",
|
||||
" # css_selector=\"YOUR_CSS_SELECTOR_HERE\",\n",
|
||||
" bypass_cache=True\n",
|
||||
" )\n",
|
||||
" print(len(result.markdown))\n",
|
||||
"\n",
|
||||
"await js_and_css()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "cqE_W4coyZQM"
|
||||
},
|
||||
"source": [
|
||||
"### Using a Proxy\n",
|
||||
"\n",
|
||||
"Note: You'll need to replace the proxy URL with a working proxy for this example to run successfully."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "QjAyiAGqyZQM"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"async def use_proxy():\n",
|
||||
" async with AsyncWebCrawler(verbose=True, proxy=\"http://your-proxy-url:port\") as crawler:\n",
|
||||
" result = await crawler.arun(\n",
|
||||
" url=\"https://www.nbcnews.com/business\",\n",
|
||||
" bypass_cache=True\n",
|
||||
" )\n",
|
||||
" print(result.markdown[:500]) # Print first 500 characters\n",
|
||||
"\n",
|
||||
"# Uncomment the following line to run the proxy example\n",
|
||||
"# await use_proxy()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "XTZ88lbayZQN"
|
||||
},
|
||||
"source": [
|
||||
"### Extracting Structured Data with OpenAI\n",
|
||||
"\n",
|
||||
"Note: You'll need to set your OpenAI API key as an environment variable for this example to work."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "fIOlDayYyZQN",
|
||||
"outputId": "cb8359cc-dee0-4762-9698-5dfdcee055b8"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[LOG] 🌤️ Warming up the AsyncWebCrawler\n",
|
||||
"[LOG] 🌞 AsyncWebCrawler is ready to crawl\n",
|
||||
"[LOG] 🕸️ Crawling https://openai.com/api/pricing/ using AsyncPlaywrightCrawlerStrategy...\n",
|
||||
"[LOG] ✅ Crawled https://openai.com/api/pricing/ successfully!\n",
|
||||
"[LOG] 🚀 Crawling done for https://openai.com/api/pricing/, success: True, time taken: 3.77 seconds\n",
|
||||
"[LOG] 🚀 Content extracted for https://openai.com/api/pricing/, success: True, time taken: 0.21 seconds\n",
|
||||
"[LOG] 🔥 Extracting semantic blocks for https://openai.com/api/pricing/, Strategy: AsyncWebCrawler\n",
|
||||
"[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 0\n",
|
||||
"[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 1\n",
|
||||
"[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 2\n",
|
||||
"[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 3\n",
|
||||
"[LOG] Extracted 4 blocks from URL: https://openai.com/api/pricing/ block index: 3\n",
|
||||
"[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 4\n",
|
||||
"[LOG] Extracted 5 blocks from URL: https://openai.com/api/pricing/ block index: 0\n",
|
||||
"[LOG] Extracted 1 blocks from URL: https://openai.com/api/pricing/ block index: 4\n",
|
||||
"[LOG] Extracted 8 blocks from URL: https://openai.com/api/pricing/ block index: 1\n",
|
||||
"[LOG] Extracted 12 blocks from URL: https://openai.com/api/pricing/ block index: 2\n",
|
||||
"[LOG] 🚀 Extraction done for https://openai.com/api/pricing/, time taken: 8.55 seconds.\n",
|
||||
"5029\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"from google.colab import userdata\n",
|
||||
"os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')\n",
|
||||
"\n",
|
||||
"class OpenAIModelFee(BaseModel):\n",
|
||||
" model_name: str = Field(..., description=\"Name of the OpenAI model.\")\n",
|
||||
" input_fee: str = Field(..., description=\"Fee for input token for the OpenAI model.\")\n",
|
||||
" output_fee: str = Field(..., description=\"Fee for output token for the OpenAI model.\")\n",
|
||||
"\n",
|
||||
"async def extract_openai_fees():\n",
|
||||
" async with AsyncWebCrawler(verbose=True) as crawler:\n",
|
||||
" result = await crawler.arun(\n",
|
||||
" url='https://openai.com/api/pricing/',\n",
|
||||
" word_count_threshold=1,\n",
|
||||
" extraction_strategy=LLMExtractionStrategy(\n",
|
||||
" provider=\"openai/gpt-4o\", api_token=os.getenv('OPENAI_API_KEY'),\n",
|
||||
" schema=OpenAIModelFee.schema(),\n",
|
||||
" extraction_type=\"schema\",\n",
|
||||
" instruction=\"\"\"From the crawled content, extract all mentioned model names along with their fees for input and output tokens.\n",
|
||||
" Do not miss any models in the entire content. One extracted model JSON format should look like this:\n",
|
||||
" {\"model_name\": \"GPT-4\", \"input_fee\": \"US$10.00 / 1M tokens\", \"output_fee\": \"US$30.00 / 1M tokens\"}.\"\"\"\n",
|
||||
" ),\n",
|
||||
" bypass_cache=True,\n",
|
||||
" )\n",
|
||||
" print(len(result.extracted_content))\n",
|
||||
"\n",
|
||||
"# Run the OpenAI extraction example (requires the OPENAI_API_KEY set above)\n",
|
||||
"await extract_openai_fees()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "BypA5YxEyZQN"
|
||||
},
|
||||
"source": [
|
||||
"### Advanced Multi-Page Crawling with JavaScript Execution"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "tfkcVQ0b7mw-"
|
||||
},
|
||||
"source": [
|
||||
"## Advanced Multi-Page Crawling with JavaScript Execution\n",
|
||||
"\n",
|
||||
"This example demonstrates Crawl4AI's ability to handle complex crawling scenarios, specifically extracting commits from multiple pages of a GitHub repository. The challenge here is that clicking the \"Next\" button doesn't load a new page, but instead uses asynchronous JavaScript to update the content. This is a common hurdle in modern web crawling.\n",
|
||||
"\n",
|
||||
"To overcome this, we use Crawl4AI's custom JavaScript execution to simulate clicking the \"Next\" button, and implement a custom hook to detect when new data has loaded. Our strategy involves comparing the first commit's text before and after \"clicking\" Next, waiting until it changes to confirm new data has rendered. This showcases Crawl4AI's flexibility in handling dynamic content and its ability to implement custom logic for even the most challenging crawling tasks."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "qUBKGpn3yZQN",
|
||||
"outputId": "3e555b6a-ed33-42f4-cce9-499a923fbe17"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[LOG] 🌤️ Warming up the AsyncWebCrawler\n",
|
||||
"[LOG] 🌞 AsyncWebCrawler is ready to crawl\n",
|
||||
"[LOG] 🕸️ Crawling https://github.com/microsoft/TypeScript/commits/main using AsyncPlaywrightCrawlerStrategy...\n",
|
||||
"[LOG] ✅ Crawled https://github.com/microsoft/TypeScript/commits/main successfully!\n",
|
||||
"[LOG] 🚀 Crawling done for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 5.16 seconds\n",
|
||||
"[LOG] 🚀 Content extracted for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 0.28 seconds\n",
|
||||
"[LOG] 🔥 Extracting semantic blocks for https://github.com/microsoft/TypeScript/commits/main, Strategy: AsyncWebCrawler\n",
|
||||
"[LOG] 🚀 Extraction done for https://github.com/microsoft/TypeScript/commits/main, time taken: 0.28 seconds.\n",
|
||||
"Page 1: Found 35 commits\n",
|
||||
"[LOG] 🕸️ Crawling https://github.com/microsoft/TypeScript/commits/main using AsyncPlaywrightCrawlerStrategy...\n",
|
||||
"[LOG] ✅ Crawled https://github.com/microsoft/TypeScript/commits/main successfully!\n",
|
||||
"[LOG] 🚀 Crawling done for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 0.78 seconds\n",
|
||||
"[LOG] 🚀 Content extracted for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 0.90 seconds\n",
|
||||
"[LOG] 🔥 Extracting semantic blocks for https://github.com/microsoft/TypeScript/commits/main, Strategy: AsyncWebCrawler\n",
|
||||
"[LOG] 🚀 Extraction done for https://github.com/microsoft/TypeScript/commits/main, time taken: 0.90 seconds.\n",
|
||||
"Page 2: Found 35 commits\n",
|
||||
"[LOG] 🕸️ Crawling https://github.com/microsoft/TypeScript/commits/main using AsyncPlaywrightCrawlerStrategy...\n",
|
||||
"[LOG] ✅ Crawled https://github.com/microsoft/TypeScript/commits/main successfully!\n",
|
||||
"[LOG] 🚀 Crawling done for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 2.00 seconds\n",
|
||||
"[LOG] 🚀 Content extracted for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 0.74 seconds\n",
|
||||
"[LOG] 🔥 Extracting semantic blocks for https://github.com/microsoft/TypeScript/commits/main, Strategy: AsyncWebCrawler\n",
|
||||
"[LOG] 🚀 Extraction done for https://github.com/microsoft/TypeScript/commits/main, time taken: 0.75 seconds.\n",
|
||||
"Page 3: Found 35 commits\n",
|
||||
"Successfully crawled 105 commits across 3 pages\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import re\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"\n",
|
||||
"async def crawl_typescript_commits():\n",
|
||||
" first_commit = \"\"\n",
|
||||
" async def on_execution_started(page):\n",
|
||||
" nonlocal first_commit\n",
|
||||
" try:\n",
|
||||
" while True:\n",
|
||||
" await page.wait_for_selector('li.Box-sc-g0xbh4-0 h4')\n",
|
||||
" commit = await page.query_selector('li.Box-sc-g0xbh4-0 h4')\n",
|
||||
" commit = await commit.evaluate('(element) => element.textContent')\n",
|
||||
" commit = re.sub(r'\\s+', '', commit)\n",
|
||||
" if commit and commit != first_commit:\n",
|
||||
" first_commit = commit\n",
|
||||
" break\n",
|
||||
" await asyncio.sleep(0.5)\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"Warning: New content didn't appear after JavaScript execution: {e}\")\n",
|
||||
"\n",
|
||||
" async with AsyncWebCrawler(verbose=True) as crawler:\n",
|
||||
" crawler.crawler_strategy.set_hook('on_execution_started', on_execution_started)\n",
|
||||
"\n",
|
||||
" url = \"https://github.com/microsoft/TypeScript/commits/main\"\n",
|
||||
" session_id = \"typescript_commits_session\"\n",
|
||||
" all_commits = []\n",
|
||||
"\n",
|
||||
" js_next_page = \"\"\"\n",
|
||||
" const button = document.querySelector('a[data-testid=\"pagination-next-button\"]');\n",
|
||||
" if (button) button.click();\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" for page in range(3): # Crawl 3 pages\n",
|
||||
" result = await crawler.arun(\n",
|
||||
" url=url,\n",
|
||||
" session_id=session_id,\n",
|
||||
" css_selector=\"li.Box-sc-g0xbh4-0\",\n",
|
||||
" js=js_next_page if page > 0 else None,\n",
|
||||
" bypass_cache=True,\n",
|
||||
" js_only=page > 0\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" assert result.success, f\"Failed to crawl page {page + 1}\"\n",
|
||||
"\n",
|
||||
" soup = BeautifulSoup(result.cleaned_html, 'html.parser')\n",
|
||||
" commits = soup.select(\"li\")\n",
|
||||
" all_commits.extend(commits)\n",
|
||||
"\n",
|
||||
" print(f\"Page {page + 1}: Found {len(commits)} commits\")\n",
|
||||
"\n",
|
||||
" await crawler.crawler_strategy.kill_session(session_id)\n",
|
||||
" print(f\"Successfully crawled {len(all_commits)} commits across 3 pages\")\n",
|
||||
"\n",
|
||||
"await crawl_typescript_commits()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "EJRnYsp6yZQN"
|
||||
},
|
||||
"source": [
|
||||
"### Using JsonCssExtractionStrategy for Fast Structured Output"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "1ZMqIzB_8SYp"
|
||||
},
|
||||
"source": [
|
||||
"The JsonCssExtractionStrategy is a powerful feature of Crawl4AI that allows for precise, structured data extraction from web pages. Here's how it works:\n",
|
||||
"\n",
|
||||
"1. You define a schema that describes the pattern of data you're interested in extracting.\n",
|
||||
"2. The schema includes a base selector that identifies repeating elements on the page.\n",
|
||||
"3. Within the schema, you define fields, each with its own selector and type.\n",
|
||||
"4. These field selectors are applied within the context of each base selector element.\n",
|
||||
"5. The strategy supports nested structures, lists within lists, and various data types.\n",
|
||||
"6. You can even include computed fields for more complex data manipulation.\n",
|
||||
"\n",
|
||||
"This approach allows for highly flexible and precise data extraction, transforming semi-structured web content into clean, structured JSON data. It's particularly useful for extracting consistent data patterns from pages like product listings, news articles, or search results.\n",
|
||||
"\n",
|
||||
"For more details and advanced usage, check out the full documentation on the Crawl4AI website."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "trCMR2T9yZQN",
|
||||
"outputId": "718d36f4-cccf-40f4-8d8c-c3ba73524d16"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[LOG] 🌤️ Warming up the AsyncWebCrawler\n",
|
||||
"[LOG] 🌞 AsyncWebCrawler is ready to crawl\n",
|
||||
"[LOG] 🕸️ Crawling https://www.nbcnews.com/business using AsyncPlaywrightCrawlerStrategy...\n",
|
||||
"[LOG] ✅ Crawled https://www.nbcnews.com/business successfully!\n",
|
||||
"[LOG] 🚀 Crawling done for https://www.nbcnews.com/business, success: True, time taken: 7.00 seconds\n",
|
||||
"[LOG] 🚀 Content extracted for https://www.nbcnews.com/business, success: True, time taken: 0.32 seconds\n",
|
||||
"[LOG] 🔥 Extracting semantic blocks for https://www.nbcnews.com/business, Strategy: AsyncWebCrawler\n",
|
||||
"[LOG] 🚀 Extraction done for https://www.nbcnews.com/business, time taken: 0.48 seconds.\n",
|
||||
"Successfully extracted 11 news teasers\n",
|
||||
"{\n",
|
||||
" \"category\": \"Business News\",\n",
|
||||
" \"headline\": \"NBC ripped up its Olympics playbook for 2024 \\u2014 so far, the new strategy paid off\",\n",
|
||||
" \"summary\": \"The Olympics have long been key to NBCUniversal. Paris marked the 18th Olympic Games broadcast by NBC in the U.S.\",\n",
|
||||
" \"time\": \"13h ago\",\n",
|
||||
" \"image\": {\n",
|
||||
" \"src\": \"https://media-cldnry.s-nbcnews.com/image/upload/t_focal-200x100,f_auto,q_auto:best/rockcms/2024-09/240903-nbc-olympics-ch-1344-c7a486.jpg\",\n",
|
||||
" \"alt\": \"Mike Tirico.\"\n",
|
||||
" },\n",
|
||||
" \"link\": \"https://www.nbcnews.com/business\"\n",
|
||||
"}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"async def extract_news_teasers():\n",
|
||||
" schema = {\n",
|
||||
" \"name\": \"News Teaser Extractor\",\n",
|
||||
" \"baseSelector\": \".wide-tease-item__wrapper\",\n",
|
||||
" \"fields\": [\n",
|
||||
" {\n",
|
||||
" \"name\": \"category\",\n",
|
||||
" \"selector\": \".unibrow span[data-testid='unibrow-text']\",\n",
|
||||
" \"type\": \"text\",\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"name\": \"headline\",\n",
|
||||
" \"selector\": \".wide-tease-item__headline\",\n",
|
||||
" \"type\": \"text\",\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"name\": \"summary\",\n",
|
||||
" \"selector\": \".wide-tease-item__description\",\n",
|
||||
" \"type\": \"text\",\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"name\": \"time\",\n",
|
||||
" \"selector\": \"[data-testid='wide-tease-date']\",\n",
|
||||
" \"type\": \"text\",\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"name\": \"image\",\n",
|
||||
" \"type\": \"nested\",\n",
|
||||
" \"selector\": \"picture.teasePicture img\",\n",
|
||||
" \"fields\": [\n",
|
||||
" {\"name\": \"src\", \"type\": \"attribute\", \"attribute\": \"src\"},\n",
|
||||
" {\"name\": \"alt\", \"type\": \"attribute\", \"attribute\": \"alt\"},\n",
|
||||
" ],\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"name\": \"link\",\n",
|
||||
" \"selector\": \"a[href]\",\n",
|
||||
" \"type\": \"attribute\",\n",
|
||||
" \"attribute\": \"href\",\n",
|
||||
" },\n",
|
||||
" ],\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)\n",
|
||||
"\n",
|
||||
" async with AsyncWebCrawler(verbose=True) as crawler:\n",
|
||||
" result = await crawler.arun(\n",
|
||||
" url=\"https://www.nbcnews.com/business\",\n",
|
||||
" extraction_strategy=extraction_strategy,\n",
|
||||
" bypass_cache=True,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" assert result.success, \"Failed to crawl the page\"\n",
|
||||
"\n",
|
||||
" news_teasers = json.loads(result.extracted_content)\n",
|
||||
" print(f\"Successfully extracted {len(news_teasers)} news teasers\")\n",
|
||||
" print(json.dumps(news_teasers[0], indent=2))\n",
|
||||
"\n",
|
||||
"await extract_news_teasers()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "FnyVhJaByZQN"
|
||||
},
|
||||
"source": [
|
||||
"## Speed Comparison\n",
|
||||
"\n",
|
||||
"Let's compare the speed of Crawl4AI with Firecrawl, a paid service. Note that we can't run Firecrawl in this Colab environment, so we'll simulate its performance based on previously recorded data."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "agDD186f3wig"
|
||||
},
|
||||
"source": [
|
||||
"💡 **Note on Speed Comparison:**\n",
|
||||
"\n",
|
||||
"The speed test conducted here is running on Google Colab, where the internet speed and performance can vary and may not reflect optimal conditions. When we call Firecrawl's API, we're seeing its best performance, while Crawl4AI's performance is limited by Colab's network speed.\n",
|
||||
"\n",
|
||||
"For a more accurate comparison, it's recommended to run these tests on your own servers or computers with a stable and fast internet connection. Despite these limitations, Crawl4AI still demonstrates faster performance in this environment.\n",
|
||||
"\n",
|
||||
"If you run these tests locally, you may observe an even more significant speed advantage for Crawl4AI compared to other services."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "F7KwHv8G1LbY"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pip install firecrawl"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "91813zILyZQN",
|
||||
"outputId": "663223db-ab89-4976-b233-05ceca62b19b"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Firecrawl (simulated):\n",
|
||||
"Time taken: 4.38 seconds\n",
|
||||
"Content length: 41967 characters\n",
|
||||
"Images found: 49\n",
|
||||
"\n",
|
||||
"Crawl4AI (simple crawl):\n",
|
||||
"Time taken: 4.22 seconds\n",
|
||||
"Content length: 18221 characters\n",
|
||||
"Images found: 49\n",
|
||||
"\n",
|
||||
"Crawl4AI (with JavaScript execution):\n",
|
||||
"Time taken: 9.13 seconds\n",
|
||||
"Content length: 34243 characters\n",
|
||||
"Images found: 89\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"from google.colab import userdata\n",
|
||||
"os.environ['FIRECRAWL_API_KEY'] = userdata.get('FIRECRAWL_API_KEY')\n",
|
||||
"import time\n",
|
||||
"from firecrawl import FirecrawlApp\n",
|
||||
"\n",
|
||||
"async def speed_comparison():\n",
|
||||
" # Simulated Firecrawl performance\n",
|
||||
" app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY'])\n",
|
||||
" start = time.time()\n",
|
||||
" scrape_status = app.scrape_url(\n",
|
||||
" 'https://www.nbcnews.com/business',\n",
|
||||
" params={'formats': ['markdown', 'html']}\n",
|
||||
" )\n",
|
||||
" end = time.time()\n",
|
||||
" print(\"Firecrawl (simulated):\")\n",
|
||||
" print(f\"Time taken: {end - start:.2f} seconds\")\n",
|
||||
" print(f\"Content length: {len(scrape_status['markdown'])} characters\")\n",
|
||||
" print(f\"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}\")\n",
|
||||
" print()\n",
|
||||
"\n",
|
||||
" async with AsyncWebCrawler() as crawler:\n",
|
||||
" # Crawl4AI simple crawl\n",
|
||||
" start = time.time()\n",
|
||||
" result = await crawler.arun(\n",
|
||||
" url=\"https://www.nbcnews.com/business\",\n",
|
||||
" word_count_threshold=0,\n",
|
||||
" bypass_cache=True,\n",
|
||||
" verbose=False\n",
|
||||
" )\n",
|
||||
" end = time.time()\n",
|
||||
" print(\"Crawl4AI (simple crawl):\")\n",
|
||||
" print(f\"Time taken: {end - start:.2f} seconds\")\n",
|
||||
" print(f\"Content length: {len(result.markdown)} characters\")\n",
|
||||
" print(f\"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}\")\n",
|
||||
" print()\n",
|
||||
"\n",
|
||||
" # Crawl4AI with JavaScript execution\n",
|
||||
" start = time.time()\n",
|
||||
" result = await crawler.arun(\n",
|
||||
" url=\"https://www.nbcnews.com/business\",\n",
|
||||
" js_code=[\"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();\"],\n",
|
||||
" word_count_threshold=0,\n",
|
||||
" bypass_cache=True,\n",
|
||||
" verbose=False\n",
|
||||
" )\n",
|
||||
" end = time.time()\n",
|
||||
" print(\"Crawl4AI (with JavaScript execution):\")\n",
|
||||
" print(f\"Time taken: {end - start:.2f} seconds\")\n",
|
||||
" print(f\"Content length: {len(result.markdown)} characters\")\n",
|
||||
" print(f\"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}\")\n",
|
||||
"\n",
|
||||
"await speed_comparison()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "OBFFYVJIyZQN"
|
||||
},
|
||||
"source": [
|
||||
"If you run this on a local machine with a fast, stable internet connection:\n",
|
||||
"- Simple crawl: Crawl4AI is typically over 3-4 times faster than Firecrawl.\n",
|
||||
"- With JavaScript execution: Even when executing JavaScript to load more content (potentially doubling the number of images found), Crawl4AI is still faster than Firecrawl's simple crawl.\n",
|
||||
"\n",
|
||||
"Please note that actual performance may vary depending on network conditions and the specific content being crawled."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "A6_1RK1_yZQO"
|
||||
},
|
||||
"source": [
|
||||
"## Conclusion\n",
|
||||
"\n",
|
||||
"In this notebook, we've explored the powerful features of Crawl4AI, including:\n",
|
||||
"\n",
|
||||
"1. Basic crawling\n",
|
||||
"2. JavaScript execution and CSS selector usage\n",
|
||||
"3. Proxy support\n",
|
||||
"4. Structured data extraction with OpenAI\n",
|
||||
"5. Advanced multi-page crawling with JavaScript execution\n",
|
||||
"6. Fast structured output using JsonCssExtractionStrategy\n",
|
||||
"7. Speed comparison with other services\n",
|
||||
"\n",
|
||||
"Crawl4AI offers a fast, flexible, and powerful solution for web crawling and data extraction tasks. Its asynchronous architecture and advanced features make it suitable for a wide range of applications, from simple web scraping to complex, multi-page data extraction scenarios.\n",
|
||||
"\n",
|
||||
"For more information and advanced usage, please visit the [Crawl4AI documentation](https://crawl4ai.com/mkdocs/).\n",
|
||||
"\n",
|
||||
"Happy crawling!"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0
|
||||
}
|
||||
195
docs/examples/research_assistant.py
Normal file
@@ -0,0 +1,195 @@
|
||||
# Make sure to install the required packages: chainlit and groq
|
||||
import os, time
|
||||
from openai import AsyncOpenAI
|
||||
import chainlit as cl
|
||||
import re
|
||||
import requests
|
||||
from io import BytesIO
|
||||
from chainlit.element import ElementBased
|
||||
from groq import Groq
|
||||
|
||||
# Import threadpools to run the crawl_url function in a separate thread
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
client = AsyncOpenAI(base_url="https://api.groq.com/openai/v1", api_key=os.getenv("GROQ_API_KEY"))
|
||||
|
||||
# Instrument the OpenAI client
|
||||
cl.instrument_openai()
|
||||
|
||||
settings = {
|
||||
"model": "llama3-8b-8192",
|
||||
"temperature": 0.5,
|
||||
"max_tokens": 500,
|
||||
"top_p": 1,
|
||||
"frequency_penalty": 0,
|
||||
"presence_penalty": 0,
|
||||
}
|
||||
|
||||
def extract_urls(text):
    """Return every http(s) URL found in *text*, in order of appearance.

    A URL is a run of non-whitespace characters starting with ``http://``
    or ``https://``. Punctuation that belongs to the surrounding sentence
    rather than to the URL (a trailing ``.``, ``,``, ``;``, ``:``, ``!``,
    ``?``, quote, or an unbalanced closing parenthesis) is trimmed so that
    "see https://example.com." yields ``https://example.com``.

    Args:
        text: Free-form text, e.g. a chat message.

    Returns:
        A list of URL strings; empty when *text* contains none.
    """
    url_pattern = re.compile(r'(https?://\S+)')
    sentence_punctuation = '.,;:!?\'"'

    urls = []
    for url in url_pattern.findall(text):
        while url:
            last = url[-1]
            if last in sentence_punctuation:
                url = url[:-1]
            elif last == ')' and url.count('(') < url.count(')'):
                # A ")" with no matching "(" inside the URL closes a
                # parenthesis opened in the surrounding text.
                url = url[:-1]
            else:
                break
        urls.append(url)
    return urls
|
||||
|
||||
def crawl_url(url, timeout=120):
    """Crawl *url* through the hosted Crawl4AI REST endpoint and return its markdown.

    Args:
        url: The page to crawl.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled server would hang the calling
            worker thread indefinitely.

    Returns:
        The ``markdown`` field of the first entry in the response's
        ``results`` list.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        requests.Timeout: If no response arrives within *timeout* seconds.
    """
    payload = {
        "urls": [url],
        "include_raw_html": True,
        "word_count_threshold": 10,
        "extraction_strategy": "NoExtractionStrategy",
        "chunking_strategy": "RegexChunking"
    }
    response = requests.post(
        "https://crawl4ai.com/crawl", json=payload, timeout=timeout
    )
    # Fail with a clear HTTP error instead of a confusing JSON/KeyError below.
    response.raise_for_status()
    first_result = response.json()['results'][0]
    return first_result['markdown']
|
||||
|
||||
@cl.on_chat_start
async def on_chat_start():
    """Initialise the per-user session and greet the user.

    The session stores the running chat ``history`` (a list of messages)
    and the crawled ``context`` (a dict, initially empty).
    """
    initial_state = {"history": [], "context": {}}
    cl.user_session.set("session", initial_state)

    greeting = cl.Message(
        content="Welcome to the chat! How can I assist you today?"
    )
    await greeting.send()
|
||||
|
||||
@cl.on_message
async def on_message(message: cl.Message):
    """Handle one user message: crawl any URLs in it and stream the LLM's reply.

    Each URL found in the message is crawled and stored in the session
    context under a ``REF_<n>`` key. The accumulated context is sent to the
    model as ``<appendix>`` blocks inside the system prompt, the reply is
    streamed back token by token, and a list of the stored source URLs is
    appended to the displayed message when any sources exist.

    Args:
        message: The incoming chat message; only ``message.content`` is read.
    """
    # Function-scope import: asyncio is not part of the module's import block.
    import asyncio

    user_session = cl.user_session.get("session")

    # Extract URLs from the user's message
    urls = extract_urls(message.content)

    # crawl_url blocks on network I/O, so run the crawls in worker threads and
    # await them; waiting on future.result() here would stall the event loop.
    results = await asyncio.gather(
        *(asyncio.to_thread(crawl_url, url) for url in urls)
    )

    for url, result in zip(urls, results):
        ref_number = f"REF_{len(user_session['context']) + 1}"
        user_session["context"][ref_number] = {
            "url": url,
            "content": result
        }

    user_session["history"].append({
        "role": "user",
        "content": message.content
    })

    # Create a system message that includes the context
    context_messages = [
        f'<appendix ref="{ref}">\n{data["content"]}\n</appendix>'
        for ref, data in user_session["context"].items()
    ]
    if context_messages:
        system_message = {
            "role": "system",
            "content": (
                "You are a helpful bot. Use the following context for answering questions. "
                "Refer to the sources using the REF number in square brackets, e.g., [1], only if the source is given in the appendices below.\n\n"
                "If the question requires any information from the provided appendices or context, refer to the sources. "
                "If not, there is no need to add a references section. "
                "At the end of your response, provide a reference section listing the URLs and their REF numbers only if sources from the appendices were used.\n\n"
                # The explicit "+" matters: without it the "\n\n" literal is
                # merged with the instructions above and the whole prompt
                # becomes the join separator instead of a prefix.
                + "\n\n".join(context_messages)
            )
        }
    else:
        system_message = {
            "role": "system",
            "content": "You are a helpful assistant."
        }

    msg = cl.Message(content="")
    await msg.send()

    # Get response from the LLM
    stream = await client.chat.completions.create(
        messages=[
            system_message,
            *user_session["history"]
        ],
        stream=True,
        **settings
    )

    assistant_response = ""
    async for part in stream:
        if token := part.choices[0].delta.content:
            assistant_response += token
            await msg.stream_token(token)

    # Add assistant message to the history
    user_session["history"].append({
        "role": "assistant",
        "content": assistant_response
    })
    await msg.update()

    # Append the reference section to the assistant's response, but only when
    # there is at least one source; an empty "References:" heading is noise.
    if user_session["context"]:
        reference_lines = "".join(
            f"[{ref.split('_')[1]}]: {data['url']}\n"
            for ref, data in user_session["context"].items()
        )
        msg.content += "\n\nReferences:\n" + reference_lines
        await msg.update()
|
||||
|
||||
|
||||
@cl.on_audio_chunk
async def on_audio_chunk(chunk: cl.AudioChunk):
    """Accumulate incoming audio chunks in a per-session in-memory buffer.

    On the first chunk of a stream a fresh buffer is created and stored in
    the user session together with the stream's MIME type; every chunk's
    data is then appended to that buffer so the whole recording can be
    transcribed once the stream ends.
    """
    if chunk.isStart:
        extension = chunk.mimeType.split('/')[1]
        stream = BytesIO()
        # The file-like object needs a name with an extension so whisper
        # can recognise the file type.
        stream.name = f"input_audio.{extension}"
        # Start a new recording for this session.
        cl.user_session.set("audio_buffer", stream)
        cl.user_session.set("audio_mime_type", chunk.mimeType)

    # Append this chunk; transcription happens at the end of the stream.
    cl.user_session.get("audio_buffer").write(chunk.data)
|
||||
|
||||
@cl.step(type="tool")
async def speech_to_text(audio_file):
    """Transcribe *audio_file* with the ``whisper-large-v3`` model.

    The request goes through the module-level async ``client``.

    Args:
        audio_file: The audio payload passed as the ``file`` argument of the
            transcription call.

    Returns:
        The ``text`` attribute of the transcription response.
    """
    response = await client.audio.transcriptions.create(
        model="whisper-large-v3", file=audio_file
    )

    return response.text
|
||||
|
||||
|
||||
@cl.on_audio_end
async def on_audio_end(elements: list[ElementBased]):
    """Transcribe the recorded audio and process it like a typed message.

    Reads the buffered audio and its MIME type from the user session,
    transcribes it, prints how long transcription took, shows the
    transcription as a user message and hands it to ``on_message``.
    """
    # Pull the recording and its MIME type back out of the session.
    recording: BytesIO = cl.user_session.get("audio_buffer")
    mime_type: str = cl.user_session.get("audio_mime_type")

    recording.seek(0)  # Rewind so the whole recording is read
    whisper_input = (recording.name, recording.read(), mime_type)

    started = time.time()
    transcription = await speech_to_text(whisper_input)
    elapsed = time.time() - started
    print(f"Transcription took {elapsed} seconds")

    user_msg = cl.Message(
        author="You",
        type="user_message",
        content=transcription
    )
    await user_msg.send()
    await on_message(user_msg)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from chainlit.cli import run_chainlit
|
||||
run_chainlit(__file__)
|
||||
|
||||
|
||||
64
docs/examples/rest_call.py
Normal file
@@ -0,0 +1,64 @@
|
||||
|
||||
import base64
import os

import requests

# Basic request: crawl one page and ask for a screenshot.
data = {
    "urls": ["https://www.nbcnews.com/business"],
    "screenshot": True,
}

# A timeout keeps the script from hanging on a stalled server, and
# raise_for_status() surfaces HTTP errors before the JSON is inspected.
response = requests.post("https://crawl4ai.com/crawl", json=data, timeout=120)
response.raise_for_status()
result = response.json()['results'][0]
print(result.keys())
# dict_keys(['url', 'html', 'success', 'cleaned_html', 'media',
# 'links', 'screenshot', 'markdown', 'extracted_content',
# 'metadata', 'error_message'])

# The screenshot is base64-encoded; skip the write if the field is empty.
if result.get('screenshot'):
    with open("screenshot.png", "wb") as f:
        f.write(base64.b64decode(result['screenshot']))
else:
    print("No screenshot returned")

# The payloads below are alternative request bodies; each one can be posted
# to the same endpoint in place of the first `data`.

# Example of filtering the content using CSS selectors
data = {
    "urls": [
        "https://www.nbcnews.com/business"
    ],
    "css_selector": "article",
    "screenshot": True,
}

# Example of executing a JS script on the page before extracting the content
data = {
    "urls": [
        "https://www.nbcnews.com/business"
    ],
    "screenshot": True,
    'js' : ["""
    const loadMoreButton = Array.from(document.querySelectorAll('button')).
    find(button => button.textContent.includes('Load More'));
    loadMoreButton && loadMoreButton.click();
    """]
}

# Example of using a custom extraction strategy
data = {
    "urls": [
        "https://www.nbcnews.com/business"
    ],
    "extraction_strategy": "CosineStrategy",
    "extraction_strategy_args": {
        "semantic_filter": "inflation rent prices"
    },
}

# Example of using LLM to extract content
data = {
    "urls": [
        "https://www.nbcnews.com/business"
    ],
    "extraction_strategy": "LLMExtractionStrategy",
    "extraction_strategy_args": {
        "provider": "groq/llama3-8b-8192",
        "api_token": os.environ.get("GROQ_API_KEY"),
        "instruction": """I am interested in only financial news,
        and translate them in French."""
    },
}
|
||||
|
||||
106
docs/examples/sample_ecommerce.html
Normal file
@@ -0,0 +1,106 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Sample E-commerce Page for JsonCssExtractionStrategy Testing</title>
|
||||
<style>
|
||||
body { font-family: Arial, sans-serif; line-height: 1.6; padding: 20px; }
|
||||
.category { border: 1px solid #ddd; margin-bottom: 20px; padding: 10px; }
|
||||
.product { border: 1px solid #eee; margin: 10px 0; padding: 10px; }
|
||||
.product-details, .product-reviews, .related-products { margin-top: 10px; }
|
||||
.review { background-color: #f9f9f9; margin: 5px 0; padding: 5px; }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Sample E-commerce Product Catalog</h1>
|
||||
<div id="catalog"></div>
|
||||
|
||||
<script>
|
||||
const categories = ['Electronics', 'Home & Kitchen', 'Books'];
|
||||
const products = [
|
||||
{
|
||||
name: 'Smartphone X',
|
||||
price: '$999',
|
||||
brand: 'TechCorp',
|
||||
model: 'X-2000',
|
||||
features: ['5G capable', '6.5" OLED screen', '128GB storage'],
|
||||
reviews: [
|
||||
{ reviewer: 'John D.', rating: '4.5', text: 'Great phone, love the camera!' },
|
||||
{ reviewer: 'Jane S.', rating: '5', text: 'Best smartphone I\'ve ever owned.' }
|
||||
],
|
||||
related: [
|
||||
{ name: 'Phone Case', price: '$29.99' },
|
||||
{ name: 'Screen Protector', price: '$9.99' }
|
||||
]
|
||||
},
|
||||
{
|
||||
name: 'Laptop Pro',
|
||||
price: '$1499',
|
||||
brand: 'TechMaster',
|
||||
model: 'LT-3000',
|
||||
features: ['Intel i7 processor', '16GB RAM', '512GB SSD'],
|
||||
reviews: [
|
||||
{ reviewer: 'Alice W.', rating: '4', text: 'Powerful machine, but a bit heavy.' },
|
||||
{ reviewer: 'Bob M.', rating: '5', text: 'Perfect for my development work!' }
|
||||
],
|
||||
related: [
|
||||
{ name: 'Laptop Bag', price: '$49.99' },
|
||||
{ name: 'Wireless Mouse', price: '$24.99' }
|
||||
]
|
||||
}
|
||||
];
|
||||
|
||||
function createProductHTML(product) {
|
||||
return `
|
||||
<div class="product">
|
||||
<h3 class="product-name">${product.name}</h3>
|
||||
<p class="product-price">${product.price}</p>
|
||||
<div class="product-details">
|
||||
<span class="brand">${product.brand}</span>
|
||||
<span class="model">${product.model}</span>
|
||||
</div>
|
||||
<ul class="product-features">
|
||||
${product.features.map(feature => `<li>${feature}</li>`).join('')}
|
||||
</ul>
|
||||
<div class="product-reviews">
|
||||
${product.reviews.map(review => `
|
||||
<div class="review">
|
||||
<span class="reviewer">${review.reviewer}</span>
|
||||
<span class="rating">${review.rating}</span>
|
||||
<p class="review-text">${review.text}</p>
|
||||
</div>
|
||||
`).join('')}
|
||||
</div>
|
||||
<ul class="related-products">
|
||||
${product.related.map(item => `
|
||||
<li>
|
||||
<span class="related-name">${item.name}</span>
|
||||
<span class="related-price">${item.price}</span>
|
||||
</li>
|
||||
`).join('')}
|
||||
</ul>
|
||||
</div>
|
||||
`;
|
||||
}
|
||||
|
||||
function createCategoryHTML(category, products) {
|
||||
return `
|
||||
<div class="category">
|
||||
<h2 class="category-name">${category}</h2>
|
||||
${products.map(createProductHTML).join('')}
|
||||
</div>
|
||||
`;
|
||||
}
|
||||
|
||||
/**
 * Render every category (each listing the full `products` array) into the
 * `#catalog` container.
 *
 * The markup for all categories is built first and inserted in a single
 * call. Appending with `innerHTML +=` inside the loop would re-serialise
 * and re-parse the container's existing content on every iteration.
 */
function populateCatalog() {
    const catalog = document.getElementById('catalog');
    const markup = categories
        .map(category => createCategoryHTML(category, products))
        .join('');
    // 'beforeend' appends after any existing children, as `+=` did.
    catalog.insertAdjacentHTML('beforeend', markup);
}
|
||||
|
||||
populateCatalog();
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
46
docs/examples/ssl_example.py
Normal file
@@ -0,0 +1,46 @@
|
||||
"""Example showing how to work with SSL certificates in Crawl4AI."""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
||||
|
||||
# Create tmp directory if it doesn't exist
|
||||
parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
tmp_dir = os.path.join(parent_dir, "tmp")
|
||||
os.makedirs(tmp_dir, exist_ok=True)
|
||||
|
||||
async def main():
    """Fetch the SSL certificate of https://example.com, print it, and export it.

    The certificate's issuer, expiry and fingerprint are printed, then the
    certificate is written to ``tmp_dir`` as JSON, PEM and DER files.
    """
    # Configure crawler to fetch SSL certificate
    config = CrawlerRunConfig(
        fetch_ssl_certificate=True,
        cache_mode=CacheMode.BYPASS  # Bypass cache to always get fresh certificates
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url='https://example.com',
            config=config
        )

        if not (result.success and result.ssl_certificate):
            # Say so explicitly rather than exiting without any output.
            print("Could not retrieve an SSL certificate for https://example.com")
            return

        cert = result.ssl_certificate

        # 1. Access certificate properties directly
        print("\nCertificate Information:")
        print(f"Issuer: {cert.issuer.get('CN', '')}")
        print(f"Valid until: {cert.valid_until}")
        print(f"Fingerprint: {cert.fingerprint}")

        # 2. Export certificate in different formats.
        # Each path is built once and reused for the export and the message.
        json_path = os.path.join(tmp_dir, "certificate.json")
        pem_path = os.path.join(tmp_dir, "certificate.pem")
        der_path = os.path.join(tmp_dir, "certificate.der")

        cert.to_json(json_path)  # For analysis
        print("\nCertificate exported to:")
        print(f"- JSON: {json_path}")

        cert.to_pem(pem_path)  # For web servers
        print(f"- PEM: {pem_path}")

        cert.to_der(der_path)  # For Java apps
        print(f"- DER: {der_path}")
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
225
docs/examples/storage_state_tutorial.md
Normal file
@@ -0,0 +1,225 @@
|
||||
### Using `storage_state` to Pre-Load Cookies and LocalStorage
|
||||
|
||||
Crawl4AI’s `AsyncWebCrawler` lets you preserve and reuse session data, including cookies and localStorage, across multiple runs. By providing a `storage_state`, you can start your crawls already “logged in” or with any other necessary session data—no need to repeat the login flow every time.
|
||||
|
||||
#### What is `storage_state`?
|
||||
|
||||
`storage_state` can be:
|
||||
|
||||
- A dictionary containing cookies and localStorage data.
|
||||
- A path to a JSON file that holds this information.
|
||||
|
||||
When you pass `storage_state` to the crawler, it applies these cookies and localStorage entries before loading any pages. This means your crawler effectively starts in a known authenticated or pre-configured state.
|
||||
|
||||
#### Example Structure
|
||||
|
||||
Here’s an example storage state:
|
||||
|
||||
```json
|
||||
{
|
||||
"cookies": [
|
||||
{
|
||||
"name": "session",
|
||||
"value": "abcd1234",
|
||||
"domain": "example.com",
|
||||
"path": "/",
|
||||
"expires": 1675363572.037711,
|
||||
"httpOnly": false,
|
||||
"secure": false,
|
||||
"sameSite": "None"
|
||||
}
|
||||
],
|
||||
"origins": [
|
||||
{
|
||||
"origin": "https://example.com",
|
||||
"localStorage": [
|
||||
{ "name": "token", "value": "my_auth_token" },
|
||||
{ "name": "refreshToken", "value": "my_refresh_token" }
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
This JSON sets a `session` cookie and two localStorage entries (`token` and `refreshToken`) for `https://example.com`.
|
||||
|
||||
---
|
||||
|
||||
### Passing `storage_state` as a Dictionary
|
||||
|
||||
You can directly provide the data as a dictionary:
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
|
||||
async def main():
|
||||
storage_dict = {
|
||||
"cookies": [
|
||||
{
|
||||
"name": "session",
|
||||
"value": "abcd1234",
|
||||
"domain": "example.com",
|
||||
"path": "/",
|
||||
"expires": 1675363572.037711,
|
||||
"httpOnly": False,
|
||||
"secure": False,
|
||||
"sameSite": "None"
|
||||
}
|
||||
],
|
||||
"origins": [
|
||||
{
|
||||
"origin": "https://example.com",
|
||||
"localStorage": [
|
||||
{"name": "token", "value": "my_auth_token"},
|
||||
{"name": "refreshToken", "value": "my_refresh_token"}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
async with AsyncWebCrawler(
|
||||
headless=True,
|
||||
storage_state=storage_dict
|
||||
) as crawler:
|
||||
result = await crawler.arun(url='https://example.com/protected')
|
||||
if result.success:
|
||||
print("Crawl succeeded with pre-loaded session data!")
|
||||
print("Page HTML length:", len(result.html))
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Passing `storage_state` as a File
|
||||
|
||||
If you prefer a file-based approach, save the JSON above to `mystate.json` and reference it:
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
|
||||
async def main():
|
||||
async with AsyncWebCrawler(
|
||||
headless=True,
|
||||
storage_state="mystate.json" # Uses a JSON file instead of a dictionary
|
||||
) as crawler:
|
||||
result = await crawler.arun(url='https://example.com/protected')
|
||||
if result.success:
|
||||
print("Crawl succeeded with pre-loaded session data!")
|
||||
print("Page HTML length:", len(result.html))
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Using `storage_state` to Avoid Repeated Logins (Sign In Once, Use Later)
|
||||
|
||||
A common scenario is when you need to log in to a site (entering username/password, etc.) to access protected pages. Doing so every crawl is cumbersome. Instead, you can:
|
||||
|
||||
1. Perform the login once in a hook.
|
||||
2. After login completes, export the resulting `storage_state` to a file.
|
||||
3. On subsequent runs, provide that `storage_state` to skip the login step.
|
||||
|
||||
**Step-by-Step Example:**
|
||||
|
||||
**First Run (Perform Login and Save State):**
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
|
||||
async def on_browser_created_hook(browser):
|
||||
# Access the default context and create a page
|
||||
context = browser.contexts[0]
|
||||
page = await context.new_page()
|
||||
|
||||
# Navigate to the login page
|
||||
await page.goto("https://example.com/login", wait_until="domcontentloaded")
|
||||
|
||||
# Fill in credentials and submit
|
||||
await page.fill("input[name='username']", "myuser")
|
||||
await page.fill("input[name='password']", "mypassword")
|
||||
await page.click("button[type='submit']")
|
||||
await page.wait_for_load_state("networkidle")
|
||||
|
||||
# Now the site sets tokens in localStorage and cookies
|
||||
# Export this state to a file so we can reuse it
|
||||
await context.storage_state(path="my_storage_state.json")
|
||||
await page.close()
|
||||
|
||||
async def main():
|
||||
# First run: perform login and export the storage_state
|
||||
async with AsyncWebCrawler(
|
||||
headless=True,
|
||||
verbose=True,
|
||||
hooks={"on_browser_created": on_browser_created_hook},
|
||||
use_persistent_context=True,
|
||||
user_data_dir="./my_user_data"
|
||||
) as crawler:
|
||||
|
||||
# After on_browser_created_hook runs, we have storage_state saved to my_storage_state.json
|
||||
result = await crawler.arun(
|
||||
url='https://example.com/protected-page',
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}),
|
||||
)
|
||||
print("First run result success:", result.success)
|
||||
if result.success:
|
||||
print("Protected page HTML length:", len(result.html))
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**Second Run (Reuse Saved State, No Login Needed):**
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
|
||||
async def main():
|
||||
# Second run: no need to hook on_browser_created this time.
|
||||
# Just provide the previously saved storage state.
|
||||
async with AsyncWebCrawler(
|
||||
headless=True,
|
||||
verbose=True,
|
||||
use_persistent_context=True,
|
||||
user_data_dir="./my_user_data",
|
||||
storage_state="my_storage_state.json" # Reuse previously exported state
|
||||
) as crawler:
|
||||
|
||||
# Now the crawler starts already logged in
|
||||
result = await crawler.arun(
|
||||
url='https://example.com/protected-page',
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}),
|
||||
)
|
||||
print("Second run result success:", result.success)
|
||||
if result.success:
|
||||
print("Protected page HTML length:", len(result.html))
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**What’s Happening Here?**
|
||||
|
||||
- During the first run, the `on_browser_created_hook` logs into the site.
|
||||
- After logging in, the hook exports the current session (cookies, localStorage, etc.) to `my_storage_state.json` by calling `context.storage_state(path=...)`.
|
||||
- On subsequent runs, passing `storage_state="my_storage_state.json"` starts the browser context with these tokens already in place, skipping the login steps.
|
||||
|
||||
**Sign Out Scenario:**
|
||||
If the website allows you to sign out by clearing tokens or by navigating to a sign-out URL, you can also run a script that uses `on_browser_created_hook` or `arun` to simulate signing out, then export the resulting `storage_state` again. That would give you a baseline “logged out” state to start fresh from next time.
|
||||
|
||||
---
|
||||
|
||||
### Conclusion
|
||||
|
||||
By using `storage_state`, you can skip repetitive actions, like logging in, and jump straight into crawling protected content. Whether you provide a file path or a dictionary, this powerful feature helps maintain state between crawls, simplifying your data extraction pipelines.
|
||||
46
docs/examples/summarize_page.py
Normal file
@@ -0,0 +1,46 @@
|
||||
import os
|
||||
import time
|
||||
import json
|
||||
from crawl4ai.web_crawler import WebCrawler
|
||||
from crawl4ai.chunking_strategy import *
|
||||
from crawl4ai.extraction_strategy import *
|
||||
from crawl4ai.crawler_strategy import *
|
||||
|
||||
url = r'https://marketplace.visualstudio.com/items?itemName=Unclecode.groqopilot'

crawler = WebCrawler()
crawler.warmup()

from pydantic import BaseModel, Field


class PageSummary(BaseModel):
    """Schema the LLM extraction is asked to fill in for the crawled page."""

    title: str = Field(..., description="Title of the page.")
    summary: str = Field(..., description="Summary of the page.")
    brief_summary: str = Field(..., description="Brief summary of the page.")
    keywords: list = Field(..., description="Keywords assigned to the page.")


# Crawl the page and let the LLM return JSON matching PageSummary's schema.
result = crawler.run(
    url=url,
    word_count_threshold=1,
    extraction_strategy= LLMExtractionStrategy(
        provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),
        schema=PageSummary.model_json_schema(),
        extraction_type="schema",
        apply_chunking =False,
        instruction="From the crawled content, extract the following details: "
        "1. Title of the page "
        "2. Summary of the page, which is a detailed summary "
        "3. Brief summary of the page, which is a paragraph text "
        "4. Keywords assigned to the page, which is a list of keywords. "
        'The extracted JSON format should look like this: '
        '{ "title": "Page Title", "summary": "Detailed summary of the page.", "brief_summary": "Brief summary in a paragraph.", "keywords": ["keyword1", "keyword2", "keyword3"] }'
    ),
    bypass_cache=True,
)

page_summary = json.loads(result.extracted_content)

print(page_summary)

# open() does not create missing directories, so make sure ".data" exists.
os.makedirs(".data", exist_ok=True)
with open(".data/page_summary.json", "w", encoding="utf-8") as f:
    f.write(result.extracted_content)
|
||||
117
docs/examples/tutorial_dynamic_clicks.md
Normal file
@@ -0,0 +1,117 @@
|
||||
# Tutorial: Clicking Buttons to Load More Content with Crawl4AI
|
||||
|
||||
## Introduction
|
||||
|
||||
When scraping dynamic websites, it’s common to encounter “Load More” or “Next” buttons that must be clicked to reveal new content. Crawl4AI provides a straightforward way to handle these situations using JavaScript execution and waiting conditions. In this tutorial, we’ll cover two approaches:
|
||||
|
||||
1. **Step-by-step (Session-based) Approach:** Multiple calls to `arun()` to progressively load more content.
|
||||
2. **Single-call Approach:** Execute a more complex JavaScript snippet inside a single `arun()` call to handle all clicks at once before the extraction.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- A working installation of Crawl4AI
|
||||
- Basic familiarity with Python’s `async`/`await` syntax
|
||||
|
||||
## Step-by-Step Approach
|
||||
|
||||
Use a session ID to maintain state across multiple `arun()` calls:
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||
|
||||
js_code = [
|
||||
# This JS finds the “Next” button and clicks it
|
||||
"const nextButton = document.querySelector('button.next'); nextButton && nextButton.click();"
|
||||
]
|
||||
|
||||
wait_for_condition = "css:.new-content-class"
|
||||
|
||||
async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
|
||||
# 1. Load the initial page
|
||||
result_initial = await crawler.arun(
|
||||
url="https://example.com",
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
session_id="my_session"
|
||||
)
|
||||
|
||||
# 2. Click the 'Next' button and wait for new content
|
||||
result_next = await crawler.arun(
|
||||
url="https://example.com",
|
||||
session_id="my_session",
|
||||
js_code=js_code,
|
||||
wait_for=wait_for_condition,
|
||||
js_only=True,
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
|
||||
# `result_next` now contains the updated HTML after clicking 'Next'
|
||||
```
|
||||
|
||||
**Key Points:**
|
||||
- **`session_id`**: Keeps the same browser context open.
|
||||
- **`js_code`**: Executes JavaScript in the context of the already loaded page.
|
||||
- **`wait_for`**: Makes the crawler wait until the given condition is met (here, until an element matching the CSS selector appears) before it returns the result.
|
||||
- **`js_only=True`**: Runs the JS in the current session without reloading the page.
|
||||
|
||||
By repeating the `arun()` call multiple times and modifying the `js_code` (e.g., clicking different modules or pages), you can iteratively load all the desired content.
|
||||
|
||||
## Single-call Approach
|
||||
|
||||
If the page allows it, you can run a single `arun()` call with a more elaborate JavaScript snippet that:
|
||||
- Iterates over all the modules or "Next" buttons
|
||||
- Clicks them one by one
|
||||
- Waits for content updates between each click
|
||||
- Once done, returns control to Crawl4AI for extraction.
|
||||
|
||||
Example snippet:
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||
|
||||
js_code = [
|
||||
# Example JS that clicks multiple modules:
|
||||
"""
|
||||
(async () => {
|
||||
const modules = document.querySelectorAll('.module-item');
|
||||
for (let i = 0; i < modules.length; i++) {
|
||||
modules[i].scrollIntoView();
|
||||
modules[i].click();
|
||||
// Wait for each module’s content to load, adjust 100ms as needed
|
||||
await new Promise(r => setTimeout(r, 100));
|
||||
}
|
||||
})();
|
||||
"""
|
||||
]
|
||||
|
||||
async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
js_code=js_code,
|
||||
wait_for="css:.final-loaded-content-class",
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
|
||||
# `result` now contains all content after all modules have been clicked in one go.
|
||||
```
|
||||
|
||||
**Key Points:**
|
||||
- All interactions (clicks and waits) happen before the extraction.
|
||||
- Ideal for pages where all steps can be done in a single pass.
|
||||
|
||||
## Choosing the Right Approach
|
||||
|
||||
- **Step-by-Step (Session-based)**:
|
||||
- Good when you need fine-grained control or must dynamically check conditions before clicking the next page.
|
||||
- Useful if the page requires multiple conditions checked at runtime.
|
||||
|
||||
- **Single-call**:
|
||||
- Perfect if the sequence of interactions is known in advance.
|
||||
- Cleaner code if the page’s structure is consistent and predictable.
|
||||
|
||||
## Conclusion
|
||||
|
||||
Crawl4AI makes it easy to handle dynamic content:
|
||||
- Use session IDs and multiple `arun()` calls for stepwise crawling.
|
||||
- Or pack all actions into one `arun()` call if the interactions are well-defined upfront.
|
||||
|
||||
This flexibility ensures you can handle a wide range of dynamic web pages efficiently.
|
||||
277
docs/examples/v0.3.74.overview.py
Normal file
@@ -0,0 +1,277 @@
|
||||
import os, sys
|
||||
# append the parent directory to the sys.path
|
||||
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
sys.path.append(parent_dir)
|
||||
parent_parent_dir = os.path.dirname(parent_dir)
|
||||
sys.path.append(parent_parent_dir)
|
||||
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
__data__ = os.path.join(__location__, "__data")
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
import aiohttp
|
||||
import json
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||
from crawl4ai.content_filter_strategy import BM25ContentFilter
|
||||
|
||||
# 1. File Download Processing Example
|
||||
async def download_example():
    """Example of downloading files from Python.org

    Opens the Python downloads page with downloads enabled, clicks the first
    link whose href ends in ".exe", then prints the path and size of every
    file the crawler reports as downloaded.
    """
    # downloads_path = os.path.join(os.getcwd(), "downloads")
    # Files are stored under ~/.crawl4ai/downloads; create it if missing.
    downloads_path = os.path.join(Path.home(), ".crawl4ai", "downloads")
    os.makedirs(downloads_path, exist_ok=True)

    print(f"Downloads will be saved to: {downloads_path}")

    async with AsyncWebCrawler(
        accept_downloads=True,
        downloads_path=downloads_path,
        verbose=True
    ) as crawler:
        result = await crawler.arun(
            url="https://www.python.org/downloads/",
            js_code="""
            // Find and click the first Windows installer link
            const downloadLink = document.querySelector('a[href$=".exe"]');
            if (downloadLink) {
            console.log('Found download link:', downloadLink.href);
            downloadLink.click();
            } else {
            console.log('No .exe download link found');
            }
            """,
            delay_before_return_html=1, # Wait 1 second to give the download time to start
            cache_mode=CacheMode.BYPASS
        )

        if result.downloaded_files:
            print("\nDownload successful!")
            print("Downloaded files:")
            for file_path in result.downloaded_files:
                print(f"- {file_path}")
                # Size is reported in MiB (bytes / 1024 / 1024).
                print(f" File size: {os.path.getsize(file_path) / (1024*1024):.2f} MB")
        else:
            print("\nNo files were downloaded")
|
||||
|
||||
# 2. Local File and Raw HTML Processing Example
|
||||
async def local_and_raw_html_example():
    """Example of processing local files and raw HTML

    Writes a small HTML file under ``__data__``, crawls it through a
    ``file://`` URL, crawls an in-memory HTML string through a ``raw:`` URL,
    and prints the markdown produced for both. The temporary file is removed
    even if a crawl fails.
    """
    # Create a sample HTML file; open() will not create the directory itself.
    os.makedirs(__data__, exist_ok=True)
    sample_file = os.path.join(__data__, "sample.html")
    with open(sample_file, "w", encoding="utf-8") as f:
        f.write("""
        <html><body>
        <h1>Test Content</h1>
        <p>This is a test paragraph.</p>
        </body></html>
        """)

    try:
        async with AsyncWebCrawler(verbose=True) as crawler:
            # Process local file
            local_result = await crawler.arun(
                url=f"file://{os.path.abspath(sample_file)}"
            )

            # Process raw HTML
            raw_html = """
            <html><body>
            <h1>Raw HTML Test</h1>
            <p>This is a test of raw HTML processing.</p>
            </body></html>
            """
            raw_result = await crawler.arun(
                url=f"raw:{raw_html}"
            )
    finally:
        # Clean up, whether or not the crawls succeeded
        os.remove(sample_file)

    print("Local file content:", local_result.markdown)
    print("\nRaw HTML content:", raw_result.markdown)
|
||||
|
||||
# 3. Enhanced Markdown Generation Example
|
||||
async def markdown_generation_example():
    """Crawl a Wikipedia article and show each markdown variant produced.

    The page is crawled once with a BM25 content filter. The lengths of the
    legacy ``markdown`` field and of every ``markdown_v2`` variant are
    printed, each variant is saved as a UTF-8 file in the
    ``markdown_examples`` directory under ``__data__``, and a short sample
    of the citations and references is shown.

    ``AsyncWebCrawler`` and ``BM25ContentFilter`` are resolved from the
    module-level imports. They must not be re-imported inside this function:
    a function-local import makes the name local to the whole function, so
    any use before the import line raises ``UnboundLocalError``.
    """
    async with AsyncWebCrawler(verbose=True) as crawler:
        # Create a content filter (optional)
        content_filter = BM25ContentFilter(
            # user_query="History and cultivation",
            bm25_threshold=1.0
        )

        result = await crawler.arun(
            url="https://en.wikipedia.org/wiki/Apple",
            css_selector="main div#bodyContent",
            content_filter=content_filter,
            cache_mode=CacheMode.BYPASS
        )

        md = result.markdown_v2

        print("\nMarkdown Generation Results:")
        print(f"1. Original markdown length: {len(result.markdown)}")
        print("2. New markdown versions (markdown_v2):")
        print(f" - Raw markdown length: {len(md.raw_markdown)}")
        print(f" - Citations markdown length: {len(md.markdown_with_citations)}")
        print(f" - References section length: {len(md.references_markdown)}")
        if md.fit_markdown:
            print(f" - Filtered markdown length: {len(md.fit_markdown)}")

        # Save examples to files
        output_dir = os.path.join(__data__, "markdown_examples")
        os.makedirs(output_dir, exist_ok=True)

        outputs = [
            ("1_raw_markdown.md", md.raw_markdown),
            ("2_citations_markdown.md", md.markdown_with_citations),
            ("3_references.md", md.references_markdown),
        ]
        # The filtered variant is only written when the filter produced text.
        if md.fit_markdown:
            outputs.append(("4_filtered_markdown.md", md.fit_markdown))

        for filename, text in outputs:
            # Page text contains non-ASCII characters; the platform default
            # encoding (e.g. cp1252 on Windows) could fail to encode them.
            with open(os.path.join(output_dir, filename), "w", encoding="utf-8") as f:
                f.write(text)

        print(f"\nMarkdown examples saved to: {output_dir}")

        # Show a sample of citations and references
        print("\nSample of markdown with citations:")
        print(md.markdown_with_citations[:500] + "...\n")
        print("Sample of references:")
        print('\n'.join(md.references_markdown.split('\n')[:10]) + "...")
|
||||
|
||||
# 4. Browser Management Example
|
||||
async def browser_management_example():
    """Crawl two sites through a managed browser with an on-disk profile.

    The browser profile lives in ``~/.crawl4ai/browser_profile`` and the
    browser is launched non-headless. Two pages are fetched one after the
    other with the cache bypassed; the success flag and, on success, the
    page title of the last fetch are printed.
    """
    # Use the specified user directory path
    profile_dir = os.path.join(Path.home(), ".crawl4ai", "browser_profile")
    os.makedirs(profile_dir, exist_ok=True)

    print(f"Browser profile will be saved to: {profile_dir}")

    crawler_options = {
        "use_managed_browser": True,
        "user_data_dir": profile_dir,
        "headless": False,
        "verbose": True,
    }
    # GitHub comes last: it is the page whose result is reported below.
    targets = ("https://crawl4ai.com", "https://github.com/trending")

    async with AsyncWebCrawler(**crawler_options) as crawler:
        for target in targets:
            result = await crawler.arun(
                url=target,
                # session_id="persistent_session_1",
                cache_mode=CacheMode.BYPASS,
            )

        print("\nBrowser session result:", result.success)
        if result.success:
            page_title = result.metadata.get('title', 'No title found')
            print("Page title:", page_title)
|
||||
|
||||
# 5. API Usage Example
|
||||
async def api_example():
    """Submit a crawl job to a local Crawl4AI server and poll for the result.

    A JSON/CSS extraction job for Hacker News is posted to
    ``http://localhost:11235/crawl``. The returned task id is then polled
    once per second at ``/task/{task_id}`` until the task completes, and the
    first four extracted items are printed.

    The bearer token is read from ``CRAWL4AI_API_TOKEN`` and falls back to
    ``"test_api_code"``.

    Raises:
        aiohttp.ClientResponseError: If the server answers with an HTTP
            error status.
        RuntimeError: If the server reports the task as failed.
        TimeoutError: If the task has not completed within the polling
            window.
    """
    api_token = os.getenv('CRAWL4AI_API_TOKEN') or "test_api_code"
    headers = {'Authorization': f'Bearer {api_token}'}

    base_url = "http://localhost:11235"
    poll_interval = 1  # seconds between status checks
    max_wait = 300  # seconds before giving up on the task

    crawl_request = {
        "urls": ["https://news.ycombinator.com"],  # Hacker News as an example
        "extraction_config": {
            "type": "json_css",
            "params": {
                "schema": {
                    "name": "Hacker News Articles",
                    "baseSelector": ".athing",
                    "fields": [
                        {
                            "name": "title",
                            "selector": ".title a",
                            "type": "text"
                        },
                        {
                            "name": "score",
                            "selector": ".score",
                            "type": "text"
                        },
                        {
                            "name": "url",
                            "selector": ".title a",
                            "type": "attribute",
                            "attribute": "href"
                        }
                    ]
                }
            }
        },
        "crawler_params": {
            "headless": True,
            # "use_managed_browser": True
        },
        "cache_mode": "bypass",
        # "screenshot": True,
        # "magic": True
    }

    async with aiohttp.ClientSession() as session:
        # Submit crawl job
        async with session.post(
            f"{base_url}/crawl",
            json=crawl_request,
            headers=headers
        ) as response:
            # Surface auth/server errors here instead of a KeyError below.
            response.raise_for_status()
            task_data = await response.json()
        task_id = task_data["task_id"]

        # Check task status until it finishes, fails, or the deadline passes.
        loop = asyncio.get_running_loop()
        deadline = loop.time() + max_wait
        while True:
            async with session.get(
                f"{base_url}/task/{task_id}",
                headers=headers
            ) as status_response:
                status_response.raise_for_status()
                result = await status_response.json()

            status = result["status"]
            print(f"Task status: {status}")

            if status == "completed":
                print("Task completed!")
                print("Results:")
                news = json.loads(result["results"][0]['extracted_content'])
                print(json.dumps(news[:4], indent=2))
                break

            # NOTE(review): "failed" is assumed to be the server's terminal
            # error status - confirm against the API before relying on it.
            if status == "failed":
                raise RuntimeError(
                    f"Crawl task {task_id} failed: {result.get('error')}"
                )

            if loop.time() >= deadline:
                raise TimeoutError(
                    f"Crawl task {task_id} did not complete within {max_wait} seconds"
                )

            await asyncio.sleep(poll_interval)
|
||||
|
||||
# Main execution
|
||||
async def main():
    """Run the examples that are currently enabled, in order.

    Only the browser-management and API examples are active; the other
    examples are kept as comments so they can be switched back on.
    """
    # print("Running Crawl4AI feature examples...")

    enabled_examples = (
        # print("\n1. Running Download Example:")
        # download_example,

        # print("\n2. Running Markdown Generation Example:")
        # markdown_generation_example,

        # print("\n3. Running Local and Raw HTML Example:")
        # local_and_raw_html_example,

        # print("\n4. Running Browser Management Example:")
        browser_management_example,

        # print("\n5. Running API Example:")
        api_example,
    )

    for example in enabled_examples:
        await example()


if __name__ == "__main__":
    asyncio.run(main())
|
||||
443
docs/examples/v0_4_24_walkthrough.py
Normal file
@@ -0,0 +1,443 @@
|
||||
"""
|
||||
Crawl4AI v0.4.24 Feature Walkthrough
|
||||
===================================
|
||||
|
||||
This script demonstrates the new features introduced in Crawl4AI v0.4.24.
|
||||
Each section includes detailed examples and explanations of the new capabilities.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import json
|
||||
import re
|
||||
from typing import List, Optional, Dict, Any
|
||||
from pydantic import BaseModel, Field
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
CacheMode,
|
||||
LLMExtractionStrategy,
|
||||
JsonCssExtractionStrategy
|
||||
)
|
||||
from crawl4ai.content_filter_strategy import RelevantContentFilter
|
||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Sample HTML for demonstrations
|
||||
SAMPLE_HTML = """
|
||||
<div class="article-list">
|
||||
<article class="post" data-category="tech" data-author="john">
|
||||
<h2 class="title"><a href="/post-1">First Post</a></h2>
|
||||
<div class="meta">
|
||||
<a href="/author/john" class="author">John Doe</a>
|
||||
<span class="date">2023-12-31</span>
|
||||
</div>
|
||||
<div class="content">
|
||||
<p>First post content...</p>
|
||||
<a href="/read-more-1" class="read-more">Read More</a>
|
||||
</div>
|
||||
</article>
|
||||
<article class="post" data-category="science" data-author="jane">
|
||||
<h2 class="title"><a href="/post-2">Second Post</a></h2>
|
||||
<div class="meta">
|
||||
<a href="/author/jane" class="author">Jane Smith</a>
|
||||
<span class="date">2023-12-30</span>
|
||||
</div>
|
||||
<div class="content">
|
||||
<p>Second post content...</p>
|
||||
<a href="/read-more-2" class="read-more">Read More</a>
|
||||
</div>
|
||||
</article>
|
||||
</div>
|
||||
"""
|
||||
|
||||
async def demo_ssl_features():
    """
    Enhanced SSL & Security Features Demo
    -----------------------------------

    Crawls ``https://example.com`` with ``fetch_ssl_certificate=True`` and,
    when the crawl succeeds and a certificate was captured, exports the
    certificate to ``ssl_certificate.json`` in the current working directory.

    The crawl outcome is checked before the certificate is touched: on a
    failed crawl the error message is printed and nothing is exported.
    """
    print("\n1. Enhanced SSL & Security Demo")
    print("--------------------------------")

    browser_config = BrowserConfig()

    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        fetch_ssl_certificate=True  # Enable SSL certificate fetching
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://example.com",
            config=run_config
        )
        print(f"SSL Crawl Success: {result.success}")

        if not result.success:
            print(f"SSL Error: {result.error_message}")
            return

        # A successful crawl may still carry no certificate; guard so the
        # export does not fail with AttributeError on None.
        if result.ssl_certificate is None:
            print("No SSL certificate was returned for this URL")
            return

        result.ssl_certificate.to_json(
            os.path.join(os.getcwd(), "ssl_certificate.json")
        )
|
||||
|
||||
async def demo_content_filtering():
    """
    Smart Content Filtering Demo
    ----------------------

    Defines a news-oriented ``RelevantContentFilter`` subclass, plugs it into
    ``DefaultMarkdownGenerator`` and crawls Hacker News with it, printing the
    first 500 characters of the resulting markdown.
    """
    print("\n2. Smart Content Filtering Demo")
    print("--------------------------------")

    class CustomNewsFilter(RelevantContentFilter):
        """Keep headers and link-light text blocks; drop page chrome."""

        def __init__(self):
            super().__init__()
            # Class/id fragments that mark navigation, ads and similar chrome.
            self.negative_patterns = re.compile(
                r'nav|footer|header|sidebar|ads|comment|share|related|recommended|popular|trending',
                re.I
            )
            # Higher threshold for news content
            self.min_word_count = 30

        def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
            """
            Apply the news-specific filtering rules to an HTML document.

            Args:
                html (str): HTML content to be filtered
                min_word_threshold (int, optional): Minimum word count for a
                    text block; falls back to ``self.min_word_count`` when
                    not given (or falsy).

            Returns:
                List[str]: Cleaned HTML blocks that passed the filter
            """
            if not (html and isinstance(html, str)):
                return []

            soup = BeautifulSoup(html, 'lxml')
            if not soup.body:
                # Wrap fragments so a <body> is always available.
                soup = BeautifulSoup(f'<body>{html}</body>', 'lxml')
            body = soup.find('body')

            word_floor = min_word_threshold or self.min_word_count
            kept_blocks = []

            for _, _, tag_type, element in self.extract_text_chunks(body, word_floor):
                # Skip if element has negative class/id
                if self.is_excluded(element):
                    continue

                # Headers are always kept; they matter in news articles.
                if tag_type == 'header':
                    kept_blocks.append(self.clean_element(element))
                    continue

                text = element.get_text(strip=True)
                if len(text.split()) < word_floor:
                    continue

                # Share of the block's characters that sit inside links.
                anchor_text = ' '.join(
                    anchor.get_text(strip=True) for anchor in element.find_all('a')
                )
                link_density = len(anchor_text) / len(text) if text else 1

                # Accept if link density is reasonable
                if link_density < 0.5:
                    kept_blocks.append(self.clean_element(element))

            return kept_blocks

    # Create markdown generator with custom filter
    news_markdown = DefaultMarkdownGenerator(content_filter=CustomNewsFilter())
    run_config = CrawlerRunConfig(
        markdown_generator=news_markdown,
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://news.ycombinator.com",
            config=run_config,
        )
        print("Filtered Content Sample:")
        print(result.markdown[:500])  # Show first 500 chars
|
||||
|
||||
async def demo_json_extraction():
    """
    Improved JSON Extraction Demo
    ---------------------------

    Builds a ``JsonCssExtractionStrategy`` schema that reads attributes from
    base elements (``baseFields``) and nests a list of posts under the
    article list, then runs it against ``SAMPLE_HTML`` through the ``raw:``
    URL prefix and prints the extracted content.
    """
    print("\n3. Improved JSON Extraction Demo")
    print("--------------------------------")

    # Fields read from inside each <article class="post">.
    post_fields = [
        {
            "name": "title",
            "selector": "h2.title a",
            "type": "text",
            "baseFields": [
                {"name": "url", "type": "attribute", "attribute": "href"}
            ]
        },
        {
            "name": "author",
            "selector": "div.meta a.author",
            "type": "text",
            "baseFields": [
                {"name": "profile_url", "type": "attribute", "attribute": "href"}
            ]
        },
        {
            "name": "date",
            "selector": "span.date",
            "type": "text"
        },
        {
            "name": "read_more",
            "selector": "a.read-more",
            "type": "nested",
            "fields": [
                {"name": "text", "type": "text"},
                {"name": "url", "type": "attribute", "attribute": "href"}
            ]
        }
    ]

    # NOTE(review): SAMPLE_HTML carries no data-list-id or data-post-id
    # attributes, and data-category sits on the articles rather than on
    # div.article-list - presumably those base fields come back empty.
    posts_field = {
        "name": "posts",
        "selector": "article.post",
        "type": "nested_list",
        "baseFields": [
            {"name": "post_id", "type": "attribute", "attribute": "data-post-id"},
            {"name": "author_id", "type": "attribute", "attribute": "data-author"}
        ],
        "fields": post_fields
    }

    blog_schema = {
        "name": "Blog Posts",
        "baseSelector": "div.article-list",
        "baseFields": [
            {"name": "list_id", "type": "attribute", "attribute": "data-list-id"},
            {"name": "category", "type": "attribute", "attribute": "data-category"}
        ],
        "fields": [posts_field]
    }

    json_strategy = JsonCssExtractionStrategy(schema=blog_schema)

    # Demonstrate extraction from raw HTML
    run_config = CrawlerRunConfig(
        extraction_strategy=json_strategy,
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler() as crawler:
        # The raw: prefix hands the HTML string straight to the crawler.
        raw_url = "raw:" + SAMPLE_HTML
        result = await crawler.arun(url=raw_url, config=run_config)
        print("Extracted Content:")
        print(result.extracted_content)
|
||||
|
||||
async def demo_input_formats():
    """
    Input Format Handling Demo
    ----------------------

    Demonstrates how LLM extraction can work with different input formats:
    1. Markdown (default) - Good for simple text extraction
    2. HTML - Better when you need structure and attributes

    The same job-posting HTML is extracted twice, once per input format, and
    each result is printed as indented JSON. When a run yields no extracted
    content, the crawl's error message is printed instead of attempting to
    parse it.

    Both strategies read the API token from ``OPENAI_API_KEY``.
    """
    print("\n4. Input Format Handling Demo")
    print("---------------------------")

    # Create a dummy HTML with rich structure
    dummy_html = """
    <div class="job-posting" data-post-id="12345">
        <header class="job-header">
            <h1 class="job-title">Senior AI/ML Engineer</h1>
            <div class="job-meta">
                <span class="department">AI Research Division</span>
                <span class="location" data-remote="hybrid">San Francisco (Hybrid)</span>
            </div>
            <div class="salary-info" data-currency="USD">
                <span class="range">$150,000 - $220,000</span>
                <span class="period">per year</span>
            </div>
        </header>

        <section class="requirements">
            <div class="technical-skills">
                <h3>Technical Requirements</h3>
                <ul class="required-skills">
                    <li class="skill required" data-priority="must-have">
                        5+ years experience in Machine Learning
                    </li>
                    <li class="skill required" data-priority="must-have">
                        Proficiency in Python and PyTorch/TensorFlow
                    </li>
                    <li class="skill preferred" data-priority="nice-to-have">
                        Experience with distributed training systems
                    </li>
                </ul>
            </div>

            <div class="soft-skills">
                <h3>Professional Skills</h3>
                <ul class="required-skills">
                    <li class="skill required" data-priority="must-have">
                        Strong problem-solving abilities
                    </li>
                    <li class="skill preferred" data-priority="nice-to-have">
                        Experience leading technical teams
                    </li>
                </ul>
            </div>
        </section>

        <section class="timeline">
            <time class="deadline" datetime="2024-02-28">
                Application Deadline: February 28, 2024
            </time>
        </section>

        <footer class="contact-section">
            <div class="hiring-manager">
                <h4>Hiring Manager</h4>
                <div class="contact-info">
                    <span class="name">Dr. Sarah Chen</span>
                    <span class="title">Director of AI Research</span>
                    <span class="email">ai.hiring@example.com</span>
                </div>
            </div>
            <div class="team-info">
                <p>Join our team of 50+ researchers working on cutting-edge AI applications</p>
            </div>
        </footer>
    </div>
    """

    # Use raw:// prefix to pass HTML content directly
    url = f"raw://{dummy_html}"

    # Define our schema using Pydantic (BaseModel, Field, List and Optional
    # come from the module-level imports).
    class JobRequirement(BaseModel):
        category: str = Field(description="Category of the requirement (e.g., Technical, Soft Skills)")
        items: List[str] = Field(description="List of specific requirements in this category")
        priority: str = Field(description="Priority level (Required/Preferred) based on the HTML class or context")

    class JobPosting(BaseModel):
        title: str = Field(description="Job title")
        department: str = Field(description="Department or team")
        location: str = Field(description="Job location, including remote options")
        salary_range: Optional[str] = Field(description="Salary range if specified")
        requirements: List[JobRequirement] = Field(description="Categorized job requirements")
        application_deadline: Optional[str] = Field(description="Application deadline if specified")
        contact_info: Optional[dict] = Field(description="Contact information from footer or contact section")

    # First try with markdown (default)
    markdown_strategy = LLMExtractionStrategy(
        provider="openai/gpt-4o",
        api_token=os.getenv("OPENAI_API_KEY"),
        schema=JobPosting.model_json_schema(),
        extraction_type="schema",
        instruction="""
        Extract job posting details into structured data. Focus on the visible text content
        and organize requirements into categories.
        """,
        input_format="markdown"  # default
    )

    # Then with HTML for better structure understanding
    # NOTE(review): this strategy uses a different model (gpt-4) than the
    # markdown one (gpt-4o), so output differences are not attributable to
    # the input format alone - confirm whether that is intended.
    html_strategy = LLMExtractionStrategy(
        provider="openai/gpt-4",
        api_token=os.getenv("OPENAI_API_KEY"),
        schema=JobPosting.model_json_schema(),
        extraction_type="schema",
        instruction="""
        Extract job posting details, using HTML structure to:
        1. Identify requirement priorities from CSS classes (e.g., 'required' vs 'preferred')
        2. Extract contact info from the page footer or dedicated contact section
        3. Parse salary information from specially formatted elements
        4. Determine application deadline from timestamp or date elements

        Use HTML attributes and classes to enhance extraction accuracy.
        """,
        input_format="html"  # explicitly use HTML
    )

    # Markdown first, then HTML; the order of the output is unchanged.
    runs = (
        ("Markdown-based", markdown_strategy),
        ("HTML-based", html_strategy),
    )

    async with AsyncWebCrawler() as crawler:
        for label, strategy in runs:
            run_config = CrawlerRunConfig(extraction_strategy=strategy)
            result = await crawler.arun(url=url, config=run_config)

            print(f"\n{label} Extraction Result:")
            # json.loads(None) would raise TypeError; report the failure and
            # still attempt the remaining format.
            if not result.extracted_content:
                print(f"No content was extracted: {result.error_message}")
                continue

            items = json.loads(result.extracted_content)
            print(json.dumps(items, indent=2))
|
||||
|
||||
# Main execution
|
||||
async def main():
    """Print the walkthrough banner and run the enabled demos in order.

    The input-format demo is left disabled as a comment.
    """
    banner = "Crawl4AI v0.4.24 Feature Walkthrough"
    print(banner)
    print("====================================")

    # Run all demos
    enabled_demos = (
        demo_ssl_features,
        demo_content_filtering,
        demo_json_extraction,
        # demo_input_formats,
    )
    for demo in enabled_demos:
        await demo()


if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -1,10 +0,0 @@
|
||||
{
|
||||
"NoExtractionStrategy": "### NoExtractionStrategy\n\n`NoExtractionStrategy` is a basic extraction strategy that returns the entire HTML content without any modification. It is useful for cases where no specific extraction is required. Only clean html, and amrkdown.\n\n#### Constructor Parameters:\nNone.\n\n#### Example usage:\n```python\nextractor = NoExtractionStrategy()\nextracted_content = extractor.extract(url, html)\n```",
|
||||
|
||||
"LLMExtractionStrategy": "### LLMExtractionStrategy\n\n`LLMExtractionStrategy` uses a Language Model (LLM) to extract meaningful blocks or chunks from the given HTML content. This strategy leverages an external provider for language model completions.\n\n#### Constructor Parameters:\n- `provider` (str, optional): The provider to use for the language model completions. Default is `DEFAULT_PROVIDER` (e.g., openai/gpt-4).\n- `api_token` (str, optional): The API token for the provider. If not provided, it will try to load from the environment variable `OPENAI_API_KEY`.\n- `instruction` (str, optional): An instruction to guide the LLM on how to perform the extraction. This allows users to specify the type of data they are interested in or set the tone of the response. Default is `None`.\n\n#### Example usage:\n```python\nextractor = LLMExtractionStrategy(provider='openai', api_token='your_api_token', instruction='Extract only news about AI.')\nextracted_content = extractor.extract(url, html)\n```\n\nBy providing clear instructions, users can tailor the extraction process to their specific needs, enhancing the relevance and utility of the extracted content.",
|
||||
|
||||
"CosineStrategy": "### CosineStrategy\n\n`CosineStrategy` uses hierarchical clustering based on cosine similarity to extract clusters of text from the given HTML content. This strategy is suitable for identifying related content sections.\n\n#### Constructor Parameters:\n- `semantic_filter` (str, optional): A string containing keywords for filtering relevant documents before clustering. If provided, documents are filtered based on their cosine similarity to the keyword filter embedding. Default is `None`.\n- `word_count_threshold` (int, optional): Minimum number of words per cluster. Default is `20`.\n- `max_dist` (float, optional): The maximum cophenetic distance on the dendrogram to form clusters. Default is `0.2`.\n- `linkage_method` (str, optional): The linkage method for hierarchical clustering. Default is `'ward'`.\n- `top_k` (int, optional): Number of top categories to extract. Default is `3`.\n- `model_name` (str, optional): The model name for embedding generation. Default is `'BAAI/bge-small-en-v1.5'`.\n\n#### Example usage:\n```python\nextractor = CosineStrategy(semantic_filter='artificial intelligence', word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name='BAAI/bge-small-en-v1.5')\nextracted_content = extractor.extract(url, html)\n```\n\n#### Cosine Similarity Filtering\n\nWhen a `semantic_filter` is provided, the `CosineStrategy` applies an embedding-based filtering process to select relevant documents before performing hierarchical clustering.",
|
||||
|
||||
"TopicExtractionStrategy": "### TopicExtractionStrategy\n\n`TopicExtractionStrategy` uses the TextTiling algorithm to segment the HTML content into topics and extracts keywords for each segment. This strategy is useful for identifying and summarizing thematic content.\n\n#### Constructor Parameters:\n- `num_keywords` (int, optional): Number of keywords to represent each topic segment. Default is `3`.\n\n#### Example usage:\n```python\nextractor = TopicExtractionStrategy(num_keywords=3)\nextracted_content = extractor.extract(url, html)\n```"
|
||||
}
|
||||
|
||||
136
docs/md_v2/advanced/content-processing.md
Normal file
@@ -0,0 +1,136 @@
|
||||
# Content Processing
|
||||
|
||||
Crawl4AI provides powerful content processing capabilities that help you extract clean, relevant content from web pages. This guide covers media handling, link analysis, and metadata extraction.
|
||||
|
||||
## Media Processing
|
||||
|
||||
Crawl4AI provides comprehensive media extraction and analysis capabilities. It automatically detects and processes various types of media elements while maintaining their context and relevance.
|
||||
|
||||
### Image Processing
|
||||
|
||||
The library handles various image scenarios, including:
|
||||
- Regular images
|
||||
- Lazy-loaded images
|
||||
- Background images
|
||||
- Responsive images
|
||||
- Image metadata and context
|
||||
|
||||
```python
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
|
||||
config = CrawlerRunConfig()
|
||||
result = await crawler.arun(url="https://example.com", config=config)
|
||||
|
||||
for image in result.media["images"]:
|
||||
# Each image includes rich metadata
|
||||
print(f"Source: {image['src']}")
|
||||
print(f"Alt text: {image['alt']}")
|
||||
print(f"Description: {image['desc']}")
|
||||
print(f"Context: {image['context']}") # Surrounding text
|
||||
print(f"Relevance score: {image['score']}") # 0-10 score
|
||||
```
|
||||
|
||||
### Handling Lazy-Loaded Content
|
||||
|
||||
Crawl4AI already handles lazy loading for media elements. You can customize the wait time for lazy-loaded content with `CrawlerRunConfig`:
|
||||
|
||||
```python
|
||||
config = CrawlerRunConfig(
|
||||
wait_for="css:img[data-src]", # Wait for lazy images
|
||||
delay_before_return_html=2.0 # Additional wait time
|
||||
)
|
||||
result = await crawler.arun(url="https://example.com", config=config)
|
||||
```
|
||||
|
||||
### Video and Audio Content
|
||||
|
||||
The library extracts video and audio elements with their metadata:
|
||||
|
||||
```python
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
|
||||
config = CrawlerRunConfig()
|
||||
result = await crawler.arun(url="https://example.com", config=config)
|
||||
|
||||
# Process videos
|
||||
for video in result.media["videos"]:
|
||||
print(f"Video source: {video['src']}")
|
||||
print(f"Type: {video['type']}")
|
||||
print(f"Duration: {video.get('duration')}")
|
||||
print(f"Thumbnail: {video.get('poster')}")
|
||||
|
||||
# Process audio
|
||||
for audio in result.media["audios"]:
|
||||
print(f"Audio source: {audio['src']}")
|
||||
print(f"Type: {audio['type']}")
|
||||
print(f"Duration: {audio.get('duration')}")
|
||||
```
|
||||
|
||||
## Link Analysis
|
||||
|
||||
Crawl4AI provides sophisticated link analysis capabilities, helping you understand the relationship between pages and identify important navigation patterns.
|
||||
|
||||
### Link Classification
|
||||
|
||||
The library automatically categorizes links into:
|
||||
- Internal links (same domain)
|
||||
- External links (different domains)
|
||||
- Social media links
|
||||
- Navigation links
|
||||
- Content links
|
||||
|
||||
```python
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
|
||||
config = CrawlerRunConfig()
|
||||
result = await crawler.arun(url="https://example.com", config=config)
|
||||
|
||||
# Analyze internal links
|
||||
for link in result.links["internal"]:
|
||||
print(f"Internal: {link['href']}")
|
||||
print(f"Link text: {link['text']}")
|
||||
print(f"Context: {link['context']}") # Surrounding text
|
||||
print(f"Type: {link['type']}") # nav, content, etc.
|
||||
|
||||
# Analyze external links
|
||||
for link in result.links["external"]:
|
||||
print(f"External: {link['href']}")
|
||||
print(f"Domain: {link['domain']}")
|
||||
print(f"Type: {link['type']}")
|
||||
```
|
||||
|
||||
### Smart Link Filtering
|
||||
|
||||
Control which links are included in the results with `CrawlerRunConfig`:
|
||||
|
||||
```python
|
||||
config = CrawlerRunConfig(
|
||||
exclude_external_links=True, # Remove external links
|
||||
exclude_social_media_links=True, # Remove social media links
|
||||
exclude_social_media_domains=[ # Custom social media domains
|
||||
"facebook.com", "twitter.com", "instagram.com"
|
||||
],
|
||||
exclude_domains=["ads.example.com"] # Exclude specific domains
|
||||
)
|
||||
result = await crawler.arun(url="https://example.com", config=config)
|
||||
```
|
||||
|
||||
## Metadata Extraction
|
||||
|
||||
Crawl4AI automatically extracts and processes page metadata, providing valuable information about the content:
|
||||
|
||||
```python
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
|
||||
config = CrawlerRunConfig()
|
||||
result = await crawler.arun(url="https://example.com", config=config)
|
||||
|
||||
metadata = result.metadata
|
||||
print(f"Title: {metadata['title']}")
|
||||
print(f"Description: {metadata['description']}")
|
||||
print(f"Keywords: {metadata['keywords']}")
|
||||
print(f"Author: {metadata['author']}")
|
||||
print(f"Published Date: {metadata['published_date']}")
|
||||
print(f"Modified Date: {metadata['modified_date']}")
|
||||
print(f"Language: {metadata['language']}")
|
||||
```
|
||||
121
docs/md_v2/advanced/hooks-auth.md
Normal file
@@ -0,0 +1,121 @@
|
||||
# Hooks & Auth for AsyncWebCrawler
|
||||
|
||||
Crawl4AI's `AsyncWebCrawler` allows you to customize the behavior of the web crawler using hooks. Hooks are asynchronous functions called at specific points in the crawling process, allowing you to modify the crawler's behavior or perform additional actions. This documentation demonstrates how to use hooks, including the new `on_page_context_created` hook, in a way that is compatible with `BrowserConfig` and `CrawlerRunConfig`.
|
||||
|
||||
## Example: Using Crawler Hooks with AsyncWebCrawler
|
||||
|
||||
In this example, we'll:
|
||||
|
||||
1. Configure the browser and set up authentication when it's created.
|
||||
2. Apply custom routing and initial actions when the page context is created.
|
||||
3. Add custom headers before navigating to the URL.
|
||||
4. Log the current URL after navigation.
|
||||
5. Perform actions after JavaScript execution.
|
||||
6. Log the length of the HTML before returning it.
|
||||
|
||||
### Hook Definitions
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from playwright.async_api import Page, Browser, BrowserContext
|
||||
|
||||
def log_routing(route):
|
||||
# Example: block loading images
|
||||
if route.request.resource_type == "image":
|
||||
print(f"[HOOK] Blocking image request: {route.request.url}")
|
||||
asyncio.create_task(route.abort())
|
||||
else:
|
||||
asyncio.create_task(route.continue_())
|
||||
|
||||
async def on_browser_created(browser: Browser, **kwargs):
|
||||
print("[HOOK] on_browser_created")
|
||||
# Example: Set browser viewport size and log in
|
||||
context = await browser.new_context(viewport={"width": 1920, "height": 1080})
|
||||
page = await context.new_page()
|
||||
await page.goto("https://example.com/login")
|
||||
await page.fill("input[name='username']", "testuser")
|
||||
await page.fill("input[name='password']", "password123")
|
||||
await page.click("button[type='submit']")
|
||||
await page.wait_for_selector("#welcome")
|
||||
await context.add_cookies([{"name": "auth_token", "value": "abc123", "url": "https://example.com"}])
|
||||
await page.close()
|
||||
await context.close()
|
||||
|
||||
async def on_page_context_created(context: BrowserContext, page: Page, **kwargs):
|
||||
print("[HOOK] on_page_context_created")
|
||||
await context.route("**", log_routing)
|
||||
|
||||
async def before_goto(page: Page, context: BrowserContext, **kwargs):
|
||||
print("[HOOK] before_goto")
|
||||
await page.set_extra_http_headers({"X-Test-Header": "test"})
|
||||
|
||||
async def after_goto(page: Page, context: BrowserContext, **kwargs):
|
||||
print("[HOOK] after_goto")
|
||||
print(f"Current URL: {page.url}")
|
||||
|
||||
async def on_execution_started(page: Page, context: BrowserContext, **kwargs):
|
||||
print("[HOOK] on_execution_started")
|
||||
await page.evaluate("console.log('Custom JS executed')")
|
||||
|
||||
async def before_return_html(page: Page, context: BrowserContext, html: str, **kwargs):
|
||||
print("[HOOK] before_return_html")
|
||||
print(f"HTML length: {len(html)}")
|
||||
return page
|
||||
```
|
||||
|
||||
### Using the Hooks with AsyncWebCrawler
|
||||
|
||||
```python
|
||||
async def main():
|
||||
print("\n🔗 Using Crawler Hooks: Customize AsyncWebCrawler with hooks!")
|
||||
|
||||
# Configure browser and crawler settings
|
||||
browser_config = BrowserConfig(
|
||||
headless=True,
|
||||
viewport_width=1920,
|
||||
viewport_height=1080
|
||||
)
|
||||
|
||||
crawler_run_config = CrawlerRunConfig(
|
||||
js_code="window.scrollTo(0, document.body.scrollHeight);",
|
||||
wait_for="footer"
|
||||
)
|
||||
|
||||
# Initialize crawler
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created)
|
||||
crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created)
|
||||
crawler.crawler_strategy.set_hook("before_goto", before_goto)
|
||||
crawler.crawler_strategy.set_hook("after_goto", after_goto)
|
||||
crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
|
||||
crawler.crawler_strategy.set_hook("before_return_html", before_return_html)
|
||||
|
||||
# Run the crawler
|
||||
result = await crawler.arun(url="https://example.com", config=crawler_run_config)
|
||||
|
||||
print("\n📦 Crawler Hooks Result:")
|
||||
print(result)
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
### Explanation of Hooks
|
||||
|
||||
- **`on_browser_created`**: Called when the browser is created. Use this to configure the browser or handle authentication (e.g., logging in and setting cookies).
|
||||
- **`on_page_context_created`**: Called when a new page context is created. Use this to apply routing, block resources, or inject custom logic before navigating to the URL.
|
||||
- **`before_goto`**: Called before navigating to the URL. Use this to add custom headers or perform other pre-navigation actions.
|
||||
- **`after_goto`**: Called after navigation. Use this to verify content or log the URL.
|
||||
- **`on_execution_started`**: Called after executing custom JavaScript. Use this to perform additional actions.
|
||||
- **`before_return_html`**: Called before returning the HTML content. Use this to log details or preprocess the content.
|
||||
|
||||
### Additional Customizations
|
||||
|
||||
- **Resource Management**: Use `on_page_context_created` to block or modify requests (e.g., block images, fonts, or third-party scripts).
|
||||
- **Dynamic Headers**: Use `before_goto` to add or modify headers dynamically based on the URL.
|
||||
- **Authentication**: Use `on_browser_created` to handle login processes and set authentication cookies or tokens.
|
||||
- **Content Analysis**: Use `before_return_html` to analyze or modify the extracted HTML content.
|
||||
|
||||
These hooks provide powerful customization options for tailoring the crawling process to your needs.
|
||||
|
||||
156
docs/md_v2/advanced/identity_based_crawling.md
Normal file
@@ -0,0 +1,156 @@
|
||||
### Preserve Your Identity with Crawl4AI
|
||||
|
||||
Crawl4AI empowers you to navigate and interact with the web using your authentic digital identity, ensuring that you are recognized as a human and not mistaken for a bot. This document introduces Managed Browsers, the recommended approach for preserving your rights to access the web, and Magic Mode, a simplified solution for specific scenarios.
|
||||
|
||||
---
|
||||
|
||||
### Managed Browsers: Your Digital Identity Solution
|
||||
|
||||
**Managed Browsers** enable developers to create and use persistent browser profiles. These profiles store local storage, cookies, and other session-related data, allowing you to interact with websites as a recognized user. By leveraging your unique identity, Managed Browsers ensure that your experience reflects your rights as a human browsing the web.
|
||||
|
||||
#### Why Use Managed Browsers?
|
||||
1. **Authentic Browsing Experience**: Managed Browsers retain session data and browser fingerprints, mirroring genuine user behavior.
|
||||
2. **Effortless Configuration**: Once you interact with the site using the browser (e.g., solving a CAPTCHA), the session data is saved and reused, providing seamless access.
|
||||
3. **Empowered Data Access**: By using your identity, Managed Browsers let you access the data you can view on your own screen without artificial restrictions.
|
||||
|
||||
#### Steps to Use Managed Browsers
|
||||
|
||||
1. **Set Up the Browser Configuration**:
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=False, # Set to False for initial setup to view browser actions
|
||||
verbose=True,
|
||||
user_agent_mode="random",
|
||||
use_managed_browser=True, # Enables persistent browser sessions
|
||||
browser_type="chromium",
|
||||
user_data_dir="/path/to/user_profile_data" # Path to save session data
|
||||
)
|
||||
```
|
||||
|
||||
2. **Perform an Initial Run**:
|
||||
- Run the crawler with `headless=False`.
|
||||
- Manually interact with the site (e.g., solve CAPTCHA or log in).
|
||||
- The browser session saves cookies, local storage, and other required data.
|
||||
|
||||
3. **Subsequent Runs**:
|
||||
- Switch to `headless=True` for automation.
|
||||
- The session data is reused, allowing seamless crawling.
|
||||
|
||||
#### Example: Extracting Data Using Managed Browsers
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
|
||||
async def main():
|
||||
# Define schema for structured data extraction
|
||||
schema = {
|
||||
"name": "Example Data",
|
||||
"baseSelector": "div.example",
|
||||
"fields": [
|
||||
{"name": "title", "selector": "h1", "type": "text"},
|
||||
{"name": "link", "selector": "a", "type": "attribute", "attribute": "href"}
|
||||
]
|
||||
}
|
||||
|
||||
# Configure crawler
|
||||
browser_config = BrowserConfig(
|
||||
headless=True, # Automate subsequent runs
|
||||
verbose=True,
|
||||
use_managed_browser=True,
|
||||
user_data_dir="/path/to/user_profile_data"
|
||||
)
|
||||
|
||||
crawl_config = CrawlerRunConfig(
|
||||
extraction_strategy=JsonCssExtractionStrategy(schema),
|
||||
wait_for="css:div.example" # Wait for the targeted element to load
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
config=crawl_config
|
||||
)
|
||||
|
||||
if result.success:
|
||||
print("Extracted Data:", result.extracted_content)
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
### Benefits of Managed Browsers Over Other Methods
|
||||
Managed Browsers eliminate the need for manual detection workarounds by enabling developers to work directly with their identity and user profile data. This approach ensures maximum compatibility with websites and simplifies the crawling process while preserving your right to access data freely.
|
||||
|
||||
---
|
||||
|
||||
### Magic Mode: Simplified Automation
|
||||
|
||||
While Managed Browsers are the preferred approach, **Magic Mode** provides an alternative for scenarios where persistent user profiles are unnecessary or infeasible. Magic Mode automates user-like behavior and simplifies configuration.
|
||||
|
||||
#### What Magic Mode Does:
|
||||
- Simulates human browsing by randomizing interaction patterns and timing.
|
||||
- Masks browser automation signals.
|
||||
- Handles cookie popups and modals.
|
||||
- Modifies navigator properties for enhanced compatibility.
|
||||
|
||||
#### Using Magic Mode
|
||||
|
||||
```python
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
magic=True # Enables all automation features
|
||||
)
|
||||
```
|
||||
|
||||
Magic Mode is particularly useful for:
|
||||
- Quick prototyping when a Managed Browser setup is not available.
|
||||
- Basic sites requiring minimal interaction or configuration.
|
||||
|
||||
#### Example: Combining Magic Mode with Additional Options
|
||||
|
||||
```python
|
||||
async def crawl_with_magic_mode(url: str):
|
||||
async with AsyncWebCrawler(headless=True) as crawler:
|
||||
result = await crawler.arun(
|
||||
url=url,
|
||||
magic=True,
|
||||
remove_overlay_elements=True, # Remove popups/modals
|
||||
page_timeout=60000 # Increased timeout for complex pages
|
||||
)
|
||||
|
||||
return result.markdown if result.success else None
|
||||
```
|
||||
|
||||
### Magic Mode vs. Managed Browsers
|
||||
While Magic Mode simplifies many tasks, it cannot match the reliability and authenticity of Managed Browsers. By using your identity and persistent profiles, Managed Browsers render Magic Mode largely unnecessary. However, Magic Mode remains a viable fallback for specific situations where user identity is not a factor.
|
||||
|
||||
---
|
||||
|
||||
### Key Comparison: Managed Browsers vs. Magic Mode
|
||||
|
||||
| Feature | **Managed Browsers** | **Magic Mode** |
|
||||
|-------------------------|------------------------------------------|-------------------------------------|
|
||||
| **Session Persistence** | Retains cookies and local storage. | No session retention. |
|
||||
| **Human Interaction** | Uses real user profiles and data. | Simulates human-like patterns. |
|
||||
| **Complex Sites** | Best suited for heavily configured sites. | Works well with simpler challenges. |
|
||||
| **Setup Complexity** | Requires initial manual interaction. | Fully automated, one-line setup. |
|
||||
|
||||
#### Recommendation:
|
||||
- Use **Managed Browsers** for reliable, session-based crawling and data extraction.
|
||||
- Use **Magic Mode** for quick prototyping or when persistent profiles are not required.
|
||||
|
||||
---
|
||||
|
||||
### Conclusion
|
||||
|
||||
- **Use Managed Browsers** to preserve your digital identity and ensure reliable, identity-based crawling with persistent sessions. This approach works seamlessly for even the most complex websites.
|
||||
- **Leverage Magic Mode** for quick automation or in scenarios where persistent user profiles are not needed.
|
||||
|
||||
By combining these approaches, Crawl4AI provides unparalleled flexibility and capability for your crawling needs.
|
||||
|
||||
52
docs/md_v2/advanced/magic-mode.md
Normal file
@@ -0,0 +1,52 @@
|
||||
# Magic Mode & Anti-Bot Protection
|
||||
|
||||
Crawl4AI provides powerful anti-detection capabilities, with Magic Mode being the simplest and most comprehensive solution.
|
||||
|
||||
## Magic Mode
|
||||
|
||||
The easiest way to bypass anti-bot protections:
|
||||
|
||||
```python
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
magic=True # Enables all anti-detection features
|
||||
)
|
||||
```
|
||||
|
||||
Magic Mode automatically:
|
||||
- Masks browser automation signals
|
||||
- Simulates human-like behavior
|
||||
- Overrides navigator properties
|
||||
- Handles cookie consent popups
|
||||
- Manages browser fingerprinting
|
||||
- Randomizes timing patterns
|
||||
|
||||
## Manual Anti-Bot Options
|
||||
|
||||
While Magic Mode is recommended, you can also configure individual anti-detection features:
|
||||
|
||||
```python
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
simulate_user=True, # Simulate human behavior
|
||||
override_navigator=True # Mask automation signals
|
||||
)
|
||||
```
|
||||
|
||||
Note: When `magic=True` is used, you don't need to set these individual options.
|
||||
|
||||
## Example: Handling Protected Sites
|
||||
|
||||
```python
|
||||
async def crawl_protected_site(url: str):
|
||||
async with AsyncWebCrawler(headless=True) as crawler:
|
||||
result = await crawler.arun(
|
||||
url=url,
|
||||
magic=True,
|
||||
remove_overlay_elements=True, # Remove popups/modals
|
||||
page_timeout=60000 # Increased timeout for protection checks
|
||||
)
|
||||
|
||||
return result.markdown if result.success else None
|
||||
```
|
||||
188
docs/md_v2/advanced/managed_browser.md
Normal file
@@ -0,0 +1,188 @@
|
||||
# Creating Browser Instances, Contexts, and Pages
|
||||
|
||||
## 1 Introduction
|
||||
|
||||
### Overview of Browser Management in Crawl4AI
|
||||
Crawl4AI's browser management system is designed to provide developers with advanced tools for handling complex web crawling tasks. By managing browser instances, contexts, and pages, Crawl4AI provides strong performance, anti-bot evasion, and session persistence for high-volume, dynamic web crawling.
|
||||
|
||||
### Key Objectives
|
||||
- **Anti-Bot Handling**:
|
||||
- Implements stealth techniques to evade detection mechanisms used by modern websites.
|
||||
- Simulates human-like behavior, such as mouse movements, scrolling, and key presses.
|
||||
- Supports integration with third-party services to bypass CAPTCHA challenges.
|
||||
- **Persistent Sessions**:
|
||||
- Retains session data (cookies, local storage) for workflows requiring user authentication.
|
||||
- Allows seamless continuation of tasks across multiple runs without re-authentication.
|
||||
- **Scalable Crawling**:
|
||||
- Optimized resource utilization for handling thousands of URLs concurrently.
|
||||
- Flexible configuration options to tailor crawling behavior to specific requirements.
|
||||
|
||||
---
|
||||
|
||||
## 2 Browser Creation Methods
|
||||
|
||||
### Standard Browser Creation
|
||||
Standard browser creation initializes a browser instance with default or minimal configurations. It is suitable for tasks that do not require session persistence or heavy customization.
|
||||
|
||||
#### Features and Limitations
|
||||
- **Features**:
|
||||
- Quick and straightforward setup for small-scale tasks.
|
||||
- Supports headless and headful modes.
|
||||
- **Limitations**:
|
||||
- Lacks advanced customization options like session reuse.
|
||||
- May struggle with sites employing strict anti-bot measures.
|
||||
|
||||
#### Example Usage
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig
|
||||
|
||||
browser_config = BrowserConfig(browser_type="chromium", headless=True)
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun("https://crawl4ai.com")
|
||||
print(result.markdown)
|
||||
```
|
||||
|
||||
### Persistent Contexts
|
||||
Persistent contexts create browser sessions with stored data, enabling workflows that require maintaining login states or other session-specific information.
|
||||
|
||||
#### Benefits of Using `user_data_dir`
|
||||
- **Session Persistence**:
|
||||
- Stores cookies, local storage, and cache between crawling sessions.
|
||||
- Reduces overhead for repetitive logins or multi-step workflows.
|
||||
- **Enhanced Performance**:
|
||||
- Leverages pre-loaded resources for faster page loading.
|
||||
- **Flexibility**:
|
||||
- Adapts to complex workflows requiring user-specific configurations.
|
||||
|
||||
#### Example: Setting Up Persistent Contexts
|
||||
```python
|
||||
config = BrowserConfig(user_data_dir="/path/to/user/data")
|
||||
async with AsyncWebCrawler(config=config) as crawler:
|
||||
result = await crawler.arun("https://crawl4ai.com")
|
||||
print(result.markdown)
|
||||
```
|
||||
|
||||
### Managed Browser
|
||||
The `ManagedBrowser` class offers a high-level abstraction for managing browser instances, emphasizing resource management, debugging capabilities, and anti-bot measures.
|
||||
|
||||
#### How It Works
|
||||
- **Browser Process Management**:
|
||||
- Automates initialization and cleanup of browser processes.
|
||||
- Optimizes resource usage by pooling and reusing browser instances.
|
||||
- **Debugging Support**:
|
||||
- Integrates with debugging tools like Chrome Developer Tools for real-time inspection.
|
||||
- **Anti-Bot Measures**:
|
||||
- Implements stealth plugins to mimic real user behavior and bypass bot detection.
|
||||
|
||||
#### Features
|
||||
- **Customizable Configurations**:
|
||||
- Supports advanced options such as viewport resizing, proxy settings, and header manipulation.
|
||||
- **Debugging and Logging**:
|
||||
- Logs detailed browser interactions for debugging and performance analysis.
|
||||
- **Scalability**:
|
||||
- Handles multiple browser instances concurrently, scaling dynamically based on workload.
|
||||
|
||||
#### Example: Using `ManagedBrowser`
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig
|
||||
|
||||
config = BrowserConfig(headless=False, debug_port=9222)
|
||||
async with AsyncWebCrawler(config=config) as crawler:
|
||||
result = await crawler.arun("https://crawl4ai.com")
|
||||
print(result.markdown)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 3 Context and Page Management
|
||||
|
||||
### Creating and Configuring Browser Contexts
|
||||
Browser contexts act as isolated environments within a single browser instance, enabling independent browsing sessions with their own cookies, cache, and storage.
|
||||
|
||||
#### Customizations
|
||||
- **Headers and Cookies**:
|
||||
- Define custom headers to mimic specific devices or browsers.
|
||||
- Set cookies for authenticated sessions.
|
||||
- **Session Reuse**:
|
||||
- Retain and reuse session data across multiple requests.
|
||||
- Example: Preserve login states for authenticated crawls.
|
||||
|
||||
#### Example: Context Initialization
|
||||
```python
|
||||
from crawl4ai import CrawlerRunConfig
|
||||
|
||||
config = CrawlerRunConfig(headers={"User-Agent": "Crawl4AI/1.0"})
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun("https://crawl4ai.com", config=config)
|
||||
print(result.markdown)
|
||||
```
|
||||
|
||||
### Creating Pages
|
||||
Pages represent individual tabs or views within a browser context. They are responsible for rendering content, executing JavaScript, and handling user interactions.
|
||||
|
||||
#### Key Features
|
||||
- **IFrame Handling**:
|
||||
- Extract content from embedded iframes.
|
||||
- Navigate and interact with nested content.
|
||||
- **Viewport Customization**:
|
||||
- Adjust viewport size to match target device dimensions.
|
||||
- **Lazy Loading**:
|
||||
- Ensure dynamic elements are fully loaded before extraction.
|
||||
|
||||
#### Example: Page Initialization
|
||||
```python
|
||||
config = CrawlerRunConfig(viewport_width=1920, viewport_height=1080)
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun("https://crawl4ai.com", config=config)
|
||||
print(result.markdown)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 4 Advanced Features and Best Practices
|
||||
|
||||
### Debugging and Logging
|
||||
Remote debugging provides a powerful way to troubleshoot complex crawling workflows.
|
||||
|
||||
#### Example: Enabling Remote Debugging
|
||||
```python
|
||||
config = BrowserConfig(debug_port=9222)
|
||||
async with AsyncWebCrawler(config=config) as crawler:
|
||||
result = await crawler.arun("https://crawl4ai.com")
|
||||
```
|
||||
|
||||
### Anti-Bot Techniques
|
||||
- **Human Behavior Simulation**:
|
||||
- Mimic real user actions, such as scrolling, clicking, and typing.
|
||||
- Example: Use JavaScript to simulate interactions.
|
||||
- **Captcha Handling**:
|
||||
- Integrate with third-party services like 2Captcha or AntiCaptcha for automated solving.
|
||||
|
||||
#### Example: Simulating User Actions
|
||||
```python
|
||||
js_code = """
|
||||
(async () => {
|
||||
document.querySelector('input[name="search"]').value = 'test';
|
||||
document.querySelector('button[type="submit"]').click();
|
||||
})();
|
||||
"""
|
||||
config = CrawlerRunConfig(js_code=[js_code])
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun("https://crawl4ai.com", config=config)
|
||||
```
|
||||
|
||||
### Optimizations for Performance and Scalability
|
||||
- **Persistent Contexts**:
|
||||
- Reuse browser contexts to minimize resource consumption.
|
||||
- **Concurrent Crawls**:
|
||||
- Use `arun_many` with a controlled semaphore count for efficient batch processing.
|
||||
|
||||
#### Example: Scaling Crawls
|
||||
```python
|
||||
urls = ["https://example1.com", "https://example2.com"]
|
||||
config = CrawlerRunConfig(semaphore_count=10)
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
results = await crawler.arun_many(urls, config=config)
|
||||
for result in results:
|
||||
print(result.url, result.markdown)
|
||||
```
|
||||