Compare commits
88 Commits
vr0.4.3b1
...
feature/sc
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f7ce2d42c9 | ||
|
|
f6edb8342e | ||
|
|
ca3f0126d3 | ||
|
|
858c18df39 | ||
|
|
2c8f2ec5a6 | ||
|
|
9ef43bc5f0 | ||
|
|
84ffdaab9a | ||
|
|
78223bc847 | ||
|
|
60ce8bbf55 | ||
|
|
85847ff13f | ||
|
|
f34b4878cf | ||
|
|
d9324e3454 | ||
|
|
0ff95c83bc | ||
|
|
bb6450f458 | ||
|
|
513d008de5 | ||
|
|
dde14eba7d | ||
|
|
d0586f09a9 | ||
|
|
09ac7ed008 | ||
|
|
97796f39d2 | ||
|
|
4d7f91b378 | ||
|
|
69a77222ef | ||
|
|
0afc3e9e5e | ||
|
|
65d33bcc0f | ||
|
|
6a01008a2b | ||
|
|
cf3e1e748d | ||
|
|
6dc01eae3a | ||
|
|
7b7fe84e0d | ||
|
|
5c36f4308f | ||
|
|
45809d1c91 | ||
|
|
357414c345 | ||
|
|
260b9120c3 | ||
|
|
976ea52167 | ||
|
|
e6ef8d91ba | ||
|
|
2d69bf2366 | ||
|
|
dee5fe9851 | ||
|
|
6e78c56dda | ||
|
|
67fa06c09b | ||
|
|
26d78d8512 | ||
|
|
1079965453 | ||
|
|
a677c2b61d | ||
|
|
6dfa9cb703 | ||
|
|
8878b3d032 | ||
|
|
1ab9d115cf | ||
|
|
f9c601eb7e | ||
|
|
ad5e5d21ca | ||
|
|
26d821c0de | ||
|
|
010677cbee | ||
|
|
fe52311bf4 | ||
|
|
01b73950ee | ||
|
|
12880f1ffa | ||
|
|
53be88b677 | ||
|
|
3427ead8b8 | ||
|
|
32652189b0 | ||
|
|
7a5f83b76f | ||
|
|
7c0fa269a6 | ||
|
|
2f5e0598bb | ||
|
|
ff731e4ea1 | ||
|
|
9530ded83a | ||
|
|
155c756238 | ||
|
|
a888c91790 | ||
|
|
a98d51a62c | ||
|
|
ee3001b1f7 | ||
|
|
b13fd71040 | ||
|
|
2226ef53c8 | ||
|
|
3d52b551f2 | ||
|
|
f8e85b1499 | ||
|
|
c1797037c0 | ||
|
|
60670b2af6 | ||
|
|
0d357ab7d2 | ||
|
|
bae4665949 | ||
|
|
d11c004fbb | ||
|
|
3d1c9a8434 | ||
|
|
be472c624c | ||
|
|
06b21dcc50 | ||
|
|
0f0f60527d | ||
|
|
8105fd178e | ||
|
|
ce7fce4b16 | ||
|
|
de28b59aca | ||
|
|
04d8b47b92 | ||
|
|
2943feeecf | ||
|
|
8a7d29ce85 | ||
|
|
159bd875bd | ||
|
|
d743adac68 | ||
|
|
7fe220dbd5 | ||
|
|
65e013d9d1 | ||
|
|
7f3e2e47ed | ||
|
|
78f26ac263 | ||
|
|
44ce12c62c |
59
.github/DISCUSSION_TEMPLATE/feature-requests.yml
vendored
Normal file
59
.github/DISCUSSION_TEMPLATE/feature-requests.yml
vendored
Normal file
@@ -0,0 +1,59 @@
|
||||
title: "[Feature Request]: "
|
||||
labels: ["⚙️ New"]
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
Thank you for your interest in suggesting a new feature! Before you submit, please take a moment to check if it already exists in
|
||||
this discussions category to avoid duplicates. 😊
|
||||
|
||||
- type: textarea
|
||||
id: needs_to_be_done
|
||||
attributes:
|
||||
label: What needs to be done?
|
||||
description: Please describe the feature or functionality you'd like to see.
|
||||
placeholder: "e.g., Return alt text along with images scraped from a webpage in Result"
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: problem_to_solve
|
||||
attributes:
|
||||
label: What problem does this solve?
|
||||
description: Explain the pain point or issue this feature will help address.
|
||||
placeholder: "e.g., Bypass CAPTCHAs added by Cloudflare"
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: target_users
|
||||
attributes:
|
||||
label: Target users/beneficiaries
|
||||
description: Who would benefit from this feature? (e.g., specific teams, developers, users, etc.)
|
||||
placeholder: "e.g., Marketing teams, developers"
|
||||
validations:
|
||||
required: false
|
||||
|
||||
- type: textarea
|
||||
id: current_workarounds
|
||||
attributes:
|
||||
label: Current alternatives/workarounds
|
||||
description: Are there any existing solutions or workarounds? How does this feature improve upon them?
|
||||
placeholder: "e.g., Users manually select the css classes mapped to data fields to extract them"
|
||||
validations:
|
||||
required: false
|
||||
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
### 💡 Implementation Ideas
|
||||
|
||||
- type: textarea
|
||||
id: proposed_approach
|
||||
attributes:
|
||||
label: Proposed approach
|
||||
description: Share any ideas you have for how this feature could be implemented. Point out any challenges you foresee
|
||||
and the success metrics for this feature
|
||||
placeholder: "e.g., Implement a breadth first traversal algorithm for scraper"
|
||||
validations:
|
||||
required: false
|
||||
127
.github/ISSUE_TEMPLATE/bug_report.yml
vendored
Normal file
127
.github/ISSUE_TEMPLATE/bug_report.yml
vendored
Normal file
@@ -0,0 +1,127 @@
|
||||
name: Bug Report
|
||||
description: Report a bug with Crawl4AI.
|
||||
title: "[Bug]: "
|
||||
labels: ["🐞 Bug","🩺 Needs Triage"]
|
||||
body:
|
||||
- type: input
|
||||
id: crawl4ai_version
|
||||
attributes:
|
||||
label: crawl4ai version
|
||||
description: Specify the version of crawl4ai you are using.
|
||||
placeholder: "e.g., 2.0.0"
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: expected_behavior
|
||||
attributes:
|
||||
label: Expected Behavior
|
||||
description: Describe what you expected to happen.
|
||||
placeholder: "Provide a detailed explanation of the expected outcome."
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: current_behavior
|
||||
attributes:
|
||||
label: Current Behavior
|
||||
description: Describe what is happening instead of the expected behavior.
|
||||
placeholder: "Describe the actual result or issue you encountered."
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: dropdown
|
||||
id: reproducible
|
||||
attributes:
|
||||
label: Is this reproducible?
|
||||
description: Indicate whether this bug can be reproduced consistently.
|
||||
options:
|
||||
- "Yes"
|
||||
- "No"
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: inputs
|
||||
attributes:
|
||||
label: Inputs Causing the Bug
|
||||
description: Provide details about the inputs causing the issue.
|
||||
placeholder: |
|
||||
- URL(s):
|
||||
- Settings used:
|
||||
- Input data (if applicable):
|
||||
render: bash
|
||||
|
||||
- type: textarea
|
||||
id: steps_to_reproduce
|
||||
attributes:
|
||||
label: Steps to Reproduce
|
||||
description: Provide step-by-step instructions to reproduce the issue.
|
||||
placeholder: |
|
||||
1. Go to...
|
||||
2. Click on...
|
||||
3. Observe the issue...
|
||||
render: bash
|
||||
|
||||
- type: textarea
|
||||
id: code_snippets
|
||||
attributes:
|
||||
label: Code snippets
|
||||
description: Provide code snippets (if any). Add comments as necessary.
|
||||
placeholder: print("Hello world")
|
||||
render: python
|
||||
|
||||
# Header Section with Title
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
## Supporting Information
|
||||
Please provide the following details to help us understand and resolve your issue. This will assist us in reproducing and diagnosing the problem
|
||||
|
||||
- type: input
|
||||
id: os
|
||||
attributes:
|
||||
label: OS
|
||||
description: Please provide the operating system & distro where the issue occurs.
|
||||
placeholder: "e.g., Windows, macOS, Linux"
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: input
|
||||
id: python_version
|
||||
attributes:
|
||||
label: Python version
|
||||
description: Specify the Python version being used.
|
||||
placeholder: "e.g., 3.8.5"
|
||||
validations:
|
||||
required: true
|
||||
|
||||
# Browser Field
|
||||
- type: input
|
||||
id: browser
|
||||
attributes:
|
||||
label: Browser
|
||||
description: Provide the name of the browser you are using.
|
||||
placeholder: "e.g., Chrome, Firefox, Safari"
|
||||
validations:
|
||||
required: false
|
||||
|
||||
# Browser Version Field
|
||||
- type: input
|
||||
id: browser_version
|
||||
attributes:
|
||||
label: Browser version
|
||||
description: Provide the version of the browser you are using.
|
||||
placeholder: "e.g., 91.0.4472.124"
|
||||
validations:
|
||||
required: false
|
||||
|
||||
# Error Logs Field (Text Area)
|
||||
- type: textarea
|
||||
id: error_logs
|
||||
attributes:
|
||||
label: Error logs & Screenshots (if applicable)
|
||||
description: If you encountered any errors, please provide the error logs. Attach any relevant screenshots to help us understand the issue.
|
||||
placeholder: "Paste error logs here and attach your screenshots"
|
||||
validations:
|
||||
required: false
|
||||
8
.github/ISSUE_TEMPLATE/config.yml
vendored
Normal file
8
.github/ISSUE_TEMPLATE/config.yml
vendored
Normal file
@@ -0,0 +1,8 @@
|
||||
blank_issues_enabled: false
|
||||
contact_links:
|
||||
- name: Feature Requests
|
||||
url: https://github.com/unclecode/crawl4ai/discussions/categories/feature-requests
|
||||
about: "Suggest new features or enhancements for Crawl4AI"
|
||||
- name: Forums - Q&A
|
||||
url: https://github.com/unclecode/crawl4ai/discussions/categories/forums-q-a
|
||||
about: "Ask questions or engage in general discussions about Crawl4AI"
|
||||
19
.github/pull_request_template.md
vendored
Normal file
19
.github/pull_request_template.md
vendored
Normal file
@@ -0,0 +1,19 @@
|
||||
## Summary
|
||||
Please include a summary of the change and/or which issues are fixed.
|
||||
|
||||
eg: `Fixes #123` (Tag GitHub issue numbers in this format, so it automatically links the issues with your PR)
|
||||
|
||||
## List of files changed and why
|
||||
eg: quickstart.py - To update the example as per new changes
|
||||
|
||||
## How Has This Been Tested?
|
||||
Please describe the tests that you ran to verify your changes.
|
||||
|
||||
## Checklist:
|
||||
|
||||
- [ ] My code follows the style guidelines of this project
|
||||
- [ ] I have performed a self-review of my own code
|
||||
- [ ] I have commented my code, particularly in hard-to-understand areas
|
||||
- [ ] I have made corresponding changes to the documentation
|
||||
- [ ] I have added/updated unit tests that prove my fix is effective or that my feature works
|
||||
- [ ] New and existing unit tests pass locally with my changes
|
||||
7
.gitignore
vendored
7
.gitignore
vendored
@@ -226,8 +226,15 @@ tree.md
|
||||
.local
|
||||
.do
|
||||
/plans
|
||||
plans/
|
||||
|
||||
# Codeium
|
||||
.codeiumignore
|
||||
todo/
|
||||
|
||||
# windsurf rules
|
||||
.windsurfrules
|
||||
|
||||
|
||||
# windsurf rules
|
||||
.windsurfrules
|
||||
|
||||
13
CHANGELOG.md
13
CHANGELOG.md
@@ -5,9 +5,12 @@ All notable changes to Crawl4AI will be documented in this file.
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
---
|
||||
|
||||
### Changed
|
||||
Okay, here's a detailed changelog in Markdown format, generated from the provided git diff and commit history. I've focused on user-facing changes, fixes, and features, and grouped them as requested:
|
||||
|
||||
## Version 0.4.3 (2025-01-21)
|
||||
## Version 0.4.3b2 (2025-01-21)
|
||||
|
||||
This release introduces several powerful new features, including robots.txt compliance, dynamic proxy support, LLM-powered schema generation, and improved documentation.
|
||||
|
||||
@@ -29,7 +32,7 @@ This release introduces several powerful new features, including robots.txt comp
|
||||
|
||||
- **URL Redirection Tracking:**
|
||||
- Added URL redirection tracking to capture the final URL after any redirects.
|
||||
- The final URL is now available in the `final_url` field of the `AsyncCrawlResponse` object.
|
||||
- The final URL is now available in the `redirected_url` field of the `AsyncCrawlResponse` object.
|
||||
|
||||
- **Enhanced Streamlined Documentation:**
|
||||
- Refactored and improved the documentation structure for clarity and ease of use.
|
||||
@@ -135,9 +138,11 @@ This release introduces several powerful new features, including robots.txt comp
|
||||
- **Multiple Element Selection**: Modified `_get_elements` in `JsonCssExtractionStrategy` to return all matching elements instead of just the first one, ensuring comprehensive extraction. ([#extraction_strategy.py](crawl4ai/extraction_strategy.py))
|
||||
- **Error Handling in Scrolling**: Added robust error handling to ensure scrolling proceeds safely even if a configuration is missing. ([#async_crawler_strategy.py](crawl4ai/async_crawler_strategy.py))
|
||||
|
||||
#### Other
|
||||
- **Git Ignore Update**: Added `/plans` to `.gitignore` for better development environment consistency. ([#.gitignore](.gitignore))
|
||||
## [0.4.267] - 2025-01-06
|
||||
|
||||
### Added
|
||||
- **Windows Event Loop Configuration**: Introduced a utility function `configure_windows_event_loop` to resolve `NotImplementedError` for asyncio subprocesses on Windows. ([#utils.py](crawl4ai/utils.py), [#tutorials/async-webcrawler-basics.md](docs/md_v3/tutorials/async-webcrawler-basics.md))
|
||||
- **`page_need_scroll` Method**: Added a method to determine if a page requires scrolling before taking actions in `AsyncPlaywrightCrawlerStrategy`. ([#async_crawler_strategy.py](crawl4ai/async_crawler_strategy.py))
|
||||
|
||||
## [0.4.24] - 2024-12-31
|
||||
|
||||
|
||||
131
CODE_OF_CONDUCT.md
Normal file
131
CODE_OF_CONDUCT.md
Normal file
@@ -0,0 +1,131 @@
|
||||
# Crawl4AI Code of Conduct
|
||||
|
||||
## Our Pledge
|
||||
|
||||
We as members, contributors, and leaders pledge to make participation in our
|
||||
community a harassment-free experience for everyone, regardless of age, body
|
||||
size, visible or invisible disability, ethnicity, sex characteristics, gender
|
||||
identity and expression, level of experience, education, socio-economic status,
|
||||
nationality, personal appearance, race, caste, color, religion, or sexual
|
||||
identity and orientation.
|
||||
|
||||
We pledge to act and interact in ways that contribute to an open, welcoming,
|
||||
diverse, inclusive, and healthy community.
|
||||
|
||||
## Our Standards
|
||||
|
||||
Examples of behavior that contributes to a positive environment for our
|
||||
community include:
|
||||
|
||||
* Demonstrating empathy and kindness toward other people
|
||||
* Being respectful of differing opinions, viewpoints, and experiences
|
||||
* Giving and gracefully accepting constructive feedback
|
||||
* Accepting responsibility and apologizing to those affected by our mistakes,
|
||||
and learning from the experience
|
||||
* Focusing on what is best not just for us as individuals, but for the overall
|
||||
community
|
||||
|
||||
Examples of unacceptable behavior include:
|
||||
|
||||
* The use of sexualized language or imagery, and sexual attention or advances of
|
||||
any kind
|
||||
* Trolling, insulting or derogatory comments, and personal or political attacks
|
||||
* Public or private harassment
|
||||
* Publishing others' private information, such as a physical or email address,
|
||||
without their explicit permission
|
||||
* Other conduct which could reasonably be considered inappropriate in a
|
||||
professional setting
|
||||
|
||||
## Enforcement Responsibilities
|
||||
|
||||
Community leaders are responsible for clarifying and enforcing our standards of
|
||||
acceptable behavior and will take appropriate and fair corrective action in
|
||||
response to any behavior that they deem inappropriate, threatening, offensive,
|
||||
or harmful.
|
||||
|
||||
Community leaders have the right and responsibility to remove, edit, or reject
|
||||
comments, commits, code, wiki edits, issues, and other contributions that are
|
||||
not aligned to this Code of Conduct, and will communicate reasons for moderation
|
||||
decisions when appropriate.
|
||||
|
||||
## Scope
|
||||
|
||||
This Code of Conduct applies within all community spaces, and also applies when
|
||||
an individual is officially representing the community in public spaces.
|
||||
Examples of representing our community include using an official email address,
|
||||
posting via an official social media account, or acting as an appointed
|
||||
representative at an online or offline event.
|
||||
|
||||
## Enforcement
|
||||
|
||||
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
||||
reported to the community leaders responsible for enforcement at
|
||||
unclecode@crawl4ai.com. All complaints will be reviewed and investigated promptly and fairly.
|
||||
|
||||
All community leaders are obligated to respect the privacy and security of the
|
||||
reporter of any incident.
|
||||
|
||||
## Enforcement Guidelines
|
||||
|
||||
Community leaders will follow these Community Impact Guidelines in determining
|
||||
the consequences for any action they deem in violation of this Code of Conduct:
|
||||
|
||||
### 1. Correction
|
||||
|
||||
**Community Impact**: Use of inappropriate language or other behavior deemed
|
||||
unprofessional or unwelcome in the community.
|
||||
|
||||
**Consequence**: A private, written warning from community leaders, providing
|
||||
clarity around the nature of the violation and an explanation of why the
|
||||
behavior was inappropriate. A public apology may be requested.
|
||||
|
||||
### 2. Warning
|
||||
|
||||
**Community Impact**: A violation through a single incident or series of
|
||||
actions.
|
||||
|
||||
**Consequence**: A warning with consequences for continued behavior. No
|
||||
interaction with the people involved, including unsolicited interaction with
|
||||
those enforcing the Code of Conduct, for a specified period of time. This
|
||||
includes avoiding interactions in community spaces as well as external channels
|
||||
like social media. Violating these terms may lead to a temporary or permanent
|
||||
ban.
|
||||
|
||||
### 3. Temporary Ban
|
||||
|
||||
**Community Impact**: A serious violation of community standards, including
|
||||
sustained inappropriate behavior.
|
||||
|
||||
**Consequence**: A temporary ban from any sort of interaction or public
|
||||
communication with the community for a specified period of time. No public or
|
||||
private interaction with the people involved, including unsolicited interaction
|
||||
with those enforcing the Code of Conduct, is allowed during this period.
|
||||
Violating these terms may lead to a permanent ban.
|
||||
|
||||
### 4. Permanent Ban
|
||||
|
||||
**Community Impact**: Demonstrating a pattern of violation of community
|
||||
standards, including sustained inappropriate behavior, harassment of an
|
||||
individual, or aggression toward or disparagement of classes of individuals.
|
||||
|
||||
**Consequence**: A permanent ban from any sort of public interaction within the
|
||||
community.
|
||||
|
||||
## Attribution
|
||||
|
||||
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
|
||||
version 2.1, available at
|
||||
[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
|
||||
|
||||
Community Impact Guidelines were inspired by
|
||||
[Mozilla's code of conduct enforcement ladder][Mozilla CoC].
|
||||
|
||||
For answers to common questions about this code of conduct, see the FAQ at
|
||||
[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
|
||||
[https://www.contributor-covenant.org/translations][translations].
|
||||
|
||||
[homepage]: https://www.contributor-covenant.org
|
||||
[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
|
||||
[Mozilla CoC]: https://github.com/mozilla/diversity
|
||||
[FAQ]: https://www.contributor-covenant.org/faq
|
||||
[translations]: https://www.contributor-covenant.org/translations
|
||||
@@ -6,7 +6,7 @@ We would like to thank the following people for their contributions to Crawl4AI:
|
||||
|
||||
- [Unclecode](https://github.com/unclecode) - Project Creator and Main Developer
|
||||
- [Nasrin](https://github.com/ntohidi) - Project Manager and Developer
|
||||
- [Aravind Karnam](https://github.com/aravindkarnam) - Developer
|
||||
- [Aravind Karnam](https://github.com/aravindkarnam) - Head of Community and Product
|
||||
|
||||
## Community Contributors
|
||||
|
||||
|
||||
22
README.md
22
README.md
@@ -15,14 +15,15 @@
|
||||
[](https://github.com/unclecode/crawl4ai/blob/main/LICENSE)
|
||||
[](https://github.com/psf/black)
|
||||
[](https://github.com/PyCQA/bandit)
|
||||
[](code_of_conduct.md)
|
||||
|
||||
</div>
|
||||
|
||||
Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease.
|
||||
|
||||
[✨ Check out latest update v0.4.3b1x](#-recent-updates)
|
||||
[✨ Check out latest update v0.4.3bx](#-recent-updates)
|
||||
|
||||
🎉 **Version 0.4.3b1 is out!** This release brings exciting new features like a Memory Dispatcher System, Streaming Support, LLM-Powered Markdown Generation, Schema Generation, and Robots.txt Compliance! [Read the release notes →](https://docs.crawl4ai.com/blog)
|
||||
🎉 **Version 0.4.3bx is out!** This release brings exciting new features like a Memory Dispatcher System, Streaming Support, LLM-Powered Markdown Generation, Schema Generation, and Robots.txt Compliance! [Read the release notes →](https://docs.crawl4ai.com/blog)
|
||||
|
||||
<details>
|
||||
<summary>🤓 <strong>My Personal Story</strong></summary>
|
||||
@@ -31,7 +32,7 @@ My journey with computers started in childhood when my dad, a computer scientist
|
||||
|
||||
Fast forward to 2023, I was working on a tool for a project and needed a crawler to convert a webpage into markdown. While exploring solutions, I found one that claimed to be open-source but required creating an account and generating an API token. Worse, it turned out to be a SaaS model charging $16, and its quality didn’t meet my standards. Frustrated, I realized this was a deeper problem. That frustration turned into turbo anger mode, and I decided to build my own solution. In just a few days, I created Crawl4AI. To my surprise, it went viral, earning thousands of GitHub stars and resonating with a global community.
|
||||
|
||||
I made Crawl4AI open-source for two reasons. First, it’s my way of giving back to the open-source community that has supported me throughout my career. Second, I believe data should be accessible to everyone, not locked behind paywalls or monopolized by a few. Open access to data lays the foundation for the democratization of AI—a vision where individuals can train their own models and take ownership of their information. This library is the first step in a larger journey to create the best open-source data extraction and generation tool the world has ever seen, built collaboratively by a passionate community.
|
||||
I made Crawl4AI open-source for two reasons. First, it’s my way of giving back to the open-source community that has supported me throughout my career. Second, I believe data should be accessible to everyone, not locked behind paywalls or monopolized by a few. Open access to data lays the foundation for the democratization of AI, a vision where individuals can train their own models and take ownership of their information. This library is the first step in a larger journey to create the best open-source data extraction and generation tool the world has ever seen, built collaboratively by a passionate community.
|
||||
|
||||
Thank you to everyone who has supported this project, used it, and shared feedback. Your encouragement motivates me to dream even bigger. Join us, file issues, submit PRs, or spread the word. Together, we can build a tool that truly empowers people to access their own data and reshape the future of AI.
|
||||
</details>
|
||||
@@ -52,6 +53,9 @@ Thank you to everyone who has supported this project, used it, and shared feedba
|
||||
# Install the package
|
||||
pip install -U crawl4ai
|
||||
|
||||
# For pre release versions
|
||||
pip install crawl4ai --pre
|
||||
|
||||
# Run post-installation setup
|
||||
crawl4ai-setup
|
||||
|
||||
@@ -443,7 +447,7 @@ if __name__ == "__main__":
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>🤖 <strong>Using You own Browswer with Custome User Profile</strong></summary>
|
||||
<summary>🤖 <strong>Using Your own Browser with Custom User Profile</strong></summary>
|
||||
|
||||
```python
|
||||
import os, sys
|
||||
@@ -491,15 +495,13 @@ async def test_news_crawl():
|
||||
- **Ollama Support**: Use open-source or self-hosted models for private or cost-effective extraction.
|
||||
- **🏎️ Faster Scraping Option**: New `LXMLWebScrapingStrategy` offers **10-20x speedup** for large, complex pages (experimental).
|
||||
- **🤖 robots.txt Compliance**: Respect website rules with `check_robots_txt=True` and efficient local caching.
|
||||
- **➡️ URL Redirection Tracking**: The `final_url` field now captures the final destination after any redirects.
|
||||
- **🔄 Proxy Rotation**: Built-in support for dynamic proxy switching and IP verification, with support for authenticated proxies and session persistence.
|
||||
- **➡️ URL Redirection Tracking**: The `redirected_url` field now captures the final destination after any redirects.
|
||||
- **🪞 Improved Mirroring**: The `LXMLWebScrapingStrategy` now has much greater fidelity, allowing for almost pixel-perfect mirroring of websites.
|
||||
- **📈 Enhanced Monitoring**: Track memory, CPU, and individual crawler status with `CrawlerMonitor`.
|
||||
- **📝 Improved Documentation**: More examples, clearer explanations, and updated tutorials.
|
||||
|
||||
Read the full details in our [0.4.248 Release Notes](https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md).
|
||||
|
||||
Here's a clear markdown explanation for your users about version numbering:
|
||||
|
||||
Read the full details in our [0.4.3bx Release Notes](https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md).
|
||||
|
||||
## Version Numbering in Crawl4AI
|
||||
|
||||
@@ -570,7 +572,7 @@ To check our development plans and upcoming features, visit our [Roadmap](https:
|
||||
|
||||
## 🤝 Contributing
|
||||
|
||||
We welcome contributions from the open-source community. Check out our [contribution guidelines](https://github.com/unclecode/crawl4ai/blob/main/CONTRIBUTING.md) for more information.
|
||||
We welcome contributions from the open-source community. Check out our [contribution guidelines](https://github.com/unclecode/crawl4ai/blob/main/CONTRIBUTORS.md) for more information.
|
||||
|
||||
## 📄 License
|
||||
|
||||
|
||||
@@ -16,7 +16,7 @@ from .extraction_strategy import (
|
||||
)
|
||||
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
||||
from .markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
from .content_filter_strategy import PruningContentFilter, BM25ContentFilter, LLMContentFilter
|
||||
from .content_filter_strategy import PruningContentFilter, BM25ContentFilter, LLMContentFilter, RelevantContentFilter
|
||||
from .models import CrawlResult, MarkdownGenerationResult
|
||||
from .async_dispatcher import (
|
||||
MemoryAdaptiveDispatcher,
|
||||
@@ -44,6 +44,7 @@ __all__ = [
|
||||
"ChunkingStrategy",
|
||||
"RegexChunking",
|
||||
"DefaultMarkdownGenerator",
|
||||
"RelevantContentFilter",
|
||||
"PruningContentFilter",
|
||||
"BM25ContentFilter",
|
||||
"LLMContentFilter",
|
||||
|
||||
@@ -1,2 +1,2 @@
|
||||
# crawl4ai/_version.py
|
||||
__version__ = "0.4.3b1"
|
||||
__version__ = "0.4.3b3"
|
||||
|
||||
@@ -6,12 +6,16 @@ from .config import (
|
||||
IMAGE_SCORE_THRESHOLD,
|
||||
SOCIAL_MEDIA_DOMAINS,
|
||||
)
|
||||
from .user_agent_generator import UserAgentGenerator
|
||||
|
||||
from .user_agent_generator import UserAgentGenerator, UAGen, ValidUAGenerator, OnlineUAGenerator
|
||||
from .extraction_strategy import ExtractionStrategy
|
||||
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
||||
from .deep_crawl import DeepCrawlStrategy
|
||||
from .markdown_generation_strategy import MarkdownGenerationStrategy
|
||||
from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter, LLMContentFilter, PruningContentFilter
|
||||
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
|
||||
from typing import Optional, Union, List
|
||||
from .cache_context import CacheMode
|
||||
|
||||
|
||||
class BrowserConfig:
|
||||
@@ -29,6 +33,7 @@ class BrowserConfig:
|
||||
Default: True.
|
||||
use_managed_browser (bool): Launch the browser using a managed approach (e.g., via CDP), allowing
|
||||
advanced manipulation. Default: False.
|
||||
cdp_url (str): URL for the Chrome DevTools Protocol (CDP) endpoint. Default: "ws://localhost:9222/devtools/browser/".
|
||||
debugging_port (int): Port for the browser debugging protocol. Default: 9222.
|
||||
use_persistent_context (bool): Use a persistent browser context (like a persistent profile).
|
||||
Automatically sets use_managed_browser=True. Default: False.
|
||||
@@ -77,17 +82,18 @@ class BrowserConfig:
|
||||
browser_type: str = "chromium",
|
||||
headless: bool = True,
|
||||
use_managed_browser: bool = False,
|
||||
cdp_url: str = None,
|
||||
use_persistent_context: bool = False,
|
||||
user_data_dir: str = None,
|
||||
chrome_channel: str = "chromium",
|
||||
channel: str = "chromium",
|
||||
proxy: Optional[str] = None,
|
||||
proxy: str = None,
|
||||
proxy_config: dict = None,
|
||||
viewport_width: int = 1080,
|
||||
viewport_height: int = 600,
|
||||
accept_downloads: bool = False,
|
||||
downloads_path: str = None,
|
||||
storage_state=None,
|
||||
storage_state : Union[str, dict, None]=None,
|
||||
ignore_https_errors: bool = True,
|
||||
java_script_enabled: bool = True,
|
||||
sleep_on_close: bool = False,
|
||||
@@ -95,19 +101,23 @@ class BrowserConfig:
|
||||
cookies: list = None,
|
||||
headers: dict = None,
|
||||
user_agent: str = (
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 "
|
||||
"(KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47"
|
||||
# "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 "
|
||||
# "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
|
||||
# "(KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47"
|
||||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36"
|
||||
),
|
||||
user_agent_mode: str = None,
|
||||
user_agent_generator_config: dict = None,
|
||||
user_agent_mode: str = "",
|
||||
user_agent_generator_config: dict = {},
|
||||
text_mode: bool = False,
|
||||
light_mode: bool = False,
|
||||
extra_args: list = None,
|
||||
debugging_port: int = 9222,
|
||||
host: str = "localhost",
|
||||
):
|
||||
self.browser_type = browser_type
|
||||
self.headless = headless
|
||||
self.use_managed_browser = use_managed_browser
|
||||
self.cdp_url = cdp_url
|
||||
self.use_persistent_context = use_persistent_context
|
||||
self.user_data_dir = user_data_dir
|
||||
self.chrome_channel = chrome_channel or self.browser_type or "chromium"
|
||||
@@ -136,17 +146,15 @@ class BrowserConfig:
|
||||
self.verbose = verbose
|
||||
self.debugging_port = debugging_port
|
||||
|
||||
user_agenr_generator = UserAgentGenerator()
|
||||
if self.user_agent_mode != "random" and self.user_agent_generator_config:
|
||||
self.user_agent = user_agenr_generator.generate(
|
||||
fa_user_agenr_generator = ValidUAGenerator()
|
||||
if self.user_agent_mode == "random":
|
||||
self.user_agent = fa_user_agenr_generator.generate(
|
||||
**(self.user_agent_generator_config or {})
|
||||
)
|
||||
elif self.user_agent_mode == "random":
|
||||
self.user_agent = user_agenr_generator.generate()
|
||||
else:
|
||||
pass
|
||||
|
||||
self.browser_hint = user_agenr_generator.generate_client_hints(self.user_agent)
|
||||
|
||||
self.browser_hint = UAGen.generate_client_hints(self.user_agent)
|
||||
self.headers.setdefault("sec-ch-ua", self.browser_hint)
|
||||
|
||||
# If persistent context is requested, ensure managed browser is enabled
|
||||
@@ -159,6 +167,7 @@ class BrowserConfig:
|
||||
browser_type=kwargs.get("browser_type", "chromium"),
|
||||
headless=kwargs.get("headless", True),
|
||||
use_managed_browser=kwargs.get("use_managed_browser", False),
|
||||
cdp_url=kwargs.get("cdp_url"),
|
||||
use_persistent_context=kwargs.get("use_persistent_context", False),
|
||||
user_data_dir=kwargs.get("user_data_dir"),
|
||||
chrome_channel=kwargs.get("chrome_channel", "chromium"),
|
||||
@@ -191,6 +200,7 @@ class BrowserConfig:
|
||||
"browser_type": self.browser_type,
|
||||
"headless": self.headless,
|
||||
"use_managed_browser": self.use_managed_browser,
|
||||
"cdp_url": self.cdp_url,
|
||||
"use_persistent_context": self.use_persistent_context,
|
||||
"user_data_dir": self.user_data_dir,
|
||||
"chrome_channel": self.chrome_channel,
|
||||
@@ -373,6 +383,11 @@ class CrawlerRunConfig:
|
||||
stream (bool): If True, stream the page content as it is being loaded.
|
||||
url: str = None # This is not a compulsory parameter
|
||||
check_robots_txt (bool): Whether to check robots.txt rules before crawling. Default: False
|
||||
user_agent (str): Custom User-Agent string to use. Default: None
|
||||
user_agent_mode (str or None): Mode for generating the user agent (e.g., "random"). If None, use the provided
|
||||
user_agent as-is. Default: None.
|
||||
user_agent_generator_config (dict or None): Configuration for user agent generation if user_agent_mode is set.
|
||||
Default: None.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@@ -381,8 +396,9 @@ class CrawlerRunConfig:
|
||||
word_count_threshold: int = MIN_WORD_THRESHOLD,
|
||||
extraction_strategy: ExtractionStrategy = None,
|
||||
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
||||
deep_crawl_strategy: DeepCrawlStrategy = None,
|
||||
markdown_generator: MarkdownGenerationStrategy = None,
|
||||
content_filter=None,
|
||||
content_filter : RelevantContentFilter = None,
|
||||
only_text: bool = False,
|
||||
css_selector: str = None,
|
||||
excluded_tags: list = None,
|
||||
@@ -396,7 +412,7 @@ class CrawlerRunConfig:
|
||||
# SSL Parameters
|
||||
fetch_ssl_certificate: bool = False,
|
||||
# Caching Parameters
|
||||
cache_mode=None,
|
||||
cache_mode: CacheMode =None,
|
||||
session_id: str = None,
|
||||
bypass_cache: bool = False,
|
||||
disable_cache: bool = False,
|
||||
@@ -444,6 +460,9 @@ class CrawlerRunConfig:
|
||||
stream: bool = False,
|
||||
url: str = None,
|
||||
check_robots_txt: bool = False,
|
||||
user_agent: str = None,
|
||||
user_agent_mode: str = None,
|
||||
user_agent_generator_config: dict = {},
|
||||
):
|
||||
self.url = url
|
||||
|
||||
@@ -451,6 +470,7 @@ class CrawlerRunConfig:
|
||||
self.word_count_threshold = word_count_threshold
|
||||
self.extraction_strategy = extraction_strategy
|
||||
self.chunking_strategy = chunking_strategy
|
||||
self.deep_crawl_strategy = deep_crawl_strategy
|
||||
self.markdown_generator = markdown_generator
|
||||
self.content_filter = content_filter
|
||||
self.only_text = only_text
|
||||
@@ -526,6 +546,11 @@ class CrawlerRunConfig:
|
||||
# Robots.txt Handling Parameters
|
||||
self.check_robots_txt = check_robots_txt
|
||||
|
||||
# User Agent Parameters
|
||||
self.user_agent = user_agent
|
||||
self.user_agent_mode = user_agent_mode
|
||||
self.user_agent_generator_config = user_agent_generator_config
|
||||
|
||||
# Validate type of extraction strategy and chunking strategy if they are provided
|
||||
if self.extraction_strategy is not None and not isinstance(
|
||||
self.extraction_strategy, ExtractionStrategy
|
||||
@@ -533,6 +558,14 @@ class CrawlerRunConfig:
|
||||
raise ValueError(
|
||||
"extraction_strategy must be an instance of ExtractionStrategy"
|
||||
)
|
||||
|
||||
if self.deep_crawl_strategy is not None and not isinstance(
|
||||
self.deep_crawl_strategy, DeepCrawlStrategy
|
||||
):
|
||||
raise ValueError(
|
||||
"deep_crawl_strategy must be an instance of DeepCrawlStrategy"
|
||||
)
|
||||
|
||||
if self.chunking_strategy is not None and not isinstance(
|
||||
self.chunking_strategy, ChunkingStrategy
|
||||
):
|
||||
@@ -551,6 +584,7 @@ class CrawlerRunConfig:
|
||||
word_count_threshold=kwargs.get("word_count_threshold", 200),
|
||||
extraction_strategy=kwargs.get("extraction_strategy"),
|
||||
chunking_strategy=kwargs.get("chunking_strategy", RegexChunking()),
|
||||
deep_crawl_strategy=kwargs.get("deep_crawl_strategy"),
|
||||
markdown_generator=kwargs.get("markdown_generator"),
|
||||
content_filter=kwargs.get("content_filter"),
|
||||
only_text=kwargs.get("only_text", False),
|
||||
@@ -623,6 +657,9 @@ class CrawlerRunConfig:
|
||||
stream=kwargs.get("stream", False),
|
||||
url=kwargs.get("url"),
|
||||
check_robots_txt=kwargs.get("check_robots_txt", False),
|
||||
user_agent=kwargs.get("user_agent"),
|
||||
user_agent_mode=kwargs.get("user_agent_mode"),
|
||||
user_agent_generator_config=kwargs.get("user_agent_generator_config", {}),
|
||||
)
|
||||
|
||||
# Create a function that returns a dict of the object
|
||||
@@ -631,6 +668,7 @@ class CrawlerRunConfig:
|
||||
"word_count_threshold": self.word_count_threshold,
|
||||
"extraction_strategy": self.extraction_strategy,
|
||||
"chunking_strategy": self.chunking_strategy,
|
||||
"deep_crawl_strategy": self.deep_crawl_strategy,
|
||||
"markdown_generator": self.markdown_generator,
|
||||
"content_filter": self.content_filter,
|
||||
"only_text": self.only_text,
|
||||
@@ -686,6 +724,9 @@ class CrawlerRunConfig:
|
||||
"stream": self.stream,
|
||||
"url": self.url,
|
||||
"check_robots_txt": self.check_robots_txt,
|
||||
"user_agent": self.user_agent,
|
||||
"user_agent_mode": self.user_agent_mode,
|
||||
"user_agent_generator_config": self.user_agent_generator_config,
|
||||
}
|
||||
|
||||
def clone(self, **kwargs):
|
||||
|
||||
@@ -23,6 +23,7 @@ from .async_logger import AsyncLogger
|
||||
from playwright_stealth import StealthConfig
|
||||
from .ssl_certificate import SSLCertificate
|
||||
from .utils import get_home_folder, get_chromium_path
|
||||
from .user_agent_generator import ValidUAGenerator, OnlineUAGenerator
|
||||
|
||||
stealth_config = StealthConfig(
|
||||
webdriver=True,
|
||||
@@ -102,6 +103,7 @@ class ManagedBrowser:
|
||||
logger=None,
|
||||
host: str = "localhost",
|
||||
debugging_port: int = 9222,
|
||||
cdp_url: Optional[str] = None,
|
||||
):
|
||||
"""
|
||||
Initialize the ManagedBrowser instance.
|
||||
@@ -116,6 +118,7 @@ class ManagedBrowser:
|
||||
logger (logging.Logger): Logger instance for logging messages. Default: None.
|
||||
host (str): Host for debugging the browser. Default: "localhost".
|
||||
debugging_port (int): Port for debugging the browser. Default: 9222.
|
||||
cdp_url (str or None): CDP URL to connect to the browser. Default: None.
|
||||
"""
|
||||
self.browser_type = browser_type
|
||||
self.user_data_dir = user_data_dir
|
||||
@@ -126,12 +129,20 @@ class ManagedBrowser:
|
||||
self.host = host
|
||||
self.logger = logger
|
||||
self.shutting_down = False
|
||||
self.cdp_url = cdp_url
|
||||
|
||||
async def start(self) -> str:
|
||||
"""
|
||||
Starts the browser process and returns the CDP endpoint URL.
|
||||
If user_data_dir is not provided, creates a temporary directory.
|
||||
Starts the browser process or returns CDP endpoint URL.
|
||||
If cdp_url is provided, returns it directly.
|
||||
If user_data_dir is not provided for local browser, creates a temporary directory.
|
||||
|
||||
Returns:
|
||||
str: CDP endpoint URL
|
||||
"""
|
||||
# If CDP URL provided, just return it
|
||||
if self.cdp_url:
|
||||
return self.cdp_url
|
||||
|
||||
# Create temp dir if needed
|
||||
if not self.user_data_dir:
|
||||
@@ -554,7 +565,7 @@ class BrowserManager:
|
||||
Context: Browser context object with the specified configurations
|
||||
"""
|
||||
# Base settings
|
||||
user_agent = self.config.headers.get("User-Agent", self.config.user_agent)
|
||||
user_agent = self.config.headers.get("User-Agent", self.config.user_agent)
|
||||
viewport_settings = {
|
||||
"width": self.config.viewport_width,
|
||||
"height": self.config.viewport_height,
|
||||
@@ -633,9 +644,12 @@ class BrowserManager:
|
||||
if crawlerRunConfig.proxy_config:
|
||||
proxy_settings = {
|
||||
"server": crawlerRunConfig.proxy_config.get("server"),
|
||||
"username": crawlerRunConfig.proxy_config.get("username"),
|
||||
"password": crawlerRunConfig.proxy_config.get("password"),
|
||||
}
|
||||
if crawlerRunConfig.proxy_config.get("username"):
|
||||
proxy_settings.update({
|
||||
"username": crawlerRunConfig.proxy_config.get("username"),
|
||||
"password": crawlerRunConfig.proxy_config.get("password"),
|
||||
})
|
||||
context_settings["proxy"] = proxy_settings
|
||||
|
||||
if self.config.text_mode:
|
||||
@@ -1251,16 +1265,18 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
config.url = url
|
||||
response_headers = {}
|
||||
status_code = None
|
||||
final_url = url
|
||||
redirected_url = url
|
||||
|
||||
# Reset downloaded files list for new crawl
|
||||
self._downloaded_files = []
|
||||
|
||||
# Handle user agent with magic mode
|
||||
user_agent = self.browser_config.user_agent
|
||||
if config.magic and self.browser_config.user_agent_mode != "random":
|
||||
self.browser_config.user_agent = UserAgentGenerator().generate(
|
||||
**(self.browser_config.user_agent_generator_config or {})
|
||||
user_agent_to_override = config.user_agent
|
||||
if user_agent_to_override:
|
||||
self.browser_config.user_agent = user_agent_to_override
|
||||
elif config.magic or config.user_agent_mode == "random":
|
||||
self.browser_config.user_agent = ValidUAGenerator().generate(
|
||||
**(config.user_agent_generator_config or {})
|
||||
)
|
||||
|
||||
# Get page for session
|
||||
@@ -1333,7 +1349,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
response = await page.goto(
|
||||
url, wait_until=config.wait_until, timeout=config.page_timeout
|
||||
)
|
||||
final_url = page.url
|
||||
redirected_url = page.url
|
||||
except Error as e:
|
||||
raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")
|
||||
|
||||
@@ -1613,7 +1629,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
downloaded_files=(
|
||||
self._downloaded_files if self._downloaded_files else None
|
||||
),
|
||||
final_url=final_url,
|
||||
redirected_url=redirected_url,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
|
||||
@@ -10,13 +10,19 @@ import asyncio
|
||||
|
||||
# from contextlib import nullcontext, asynccontextmanager
|
||||
from contextlib import asynccontextmanager
|
||||
from .models import CrawlResult, MarkdownGenerationResult, CrawlerTaskResult, DispatchResult
|
||||
|
||||
from .models import (
|
||||
CrawlResult,
|
||||
MarkdownGenerationResult,
|
||||
CrawlerTaskResult,
|
||||
DispatchResult,
|
||||
)
|
||||
from .async_database import async_db_manager
|
||||
from .chunking_strategy import * # noqa: F403
|
||||
from .chunking_strategy import RegexChunking, ChunkingStrategy, IdentityChunking
|
||||
from .content_filter_strategy import * # noqa: F403
|
||||
from .content_filter_strategy import RelevantContentFilter
|
||||
from .extraction_strategy import * # noqa: F403
|
||||
from .extraction_strategy import * # noqa: F403
|
||||
from .extraction_strategy import NoExtractionStrategy, ExtractionStrategy
|
||||
from .async_crawler_strategy import (
|
||||
AsyncCrawlerStrategy,
|
||||
@@ -30,8 +36,9 @@ from .markdown_generation_strategy import (
|
||||
)
|
||||
from .async_logger import AsyncLogger
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from .async_dispatcher import * # noqa: F403
|
||||
from .async_dispatcher import * # noqa: F403
|
||||
from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter
|
||||
from .deep_crawl import DeepCrawlStrategy
|
||||
|
||||
from .config import MIN_WORD_THRESHOLD
|
||||
from .utils import (
|
||||
@@ -46,11 +53,18 @@ from .utils import (
|
||||
from typing import Union, AsyncGenerator, List, TypeVar
|
||||
from collections.abc import AsyncGenerator
|
||||
|
||||
CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult)
|
||||
RunManyReturn = Union[List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
|
||||
|
||||
from .__version__ import __version__ as crawl4ai_version
|
||||
|
||||
CrawlResultT = TypeVar("CrawlResultT", bound=CrawlResult)
|
||||
RunManyReturn = Union[List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
|
||||
|
||||
DeepCrawlSingleReturn = Union[List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
|
||||
DeepCrawlManyReturn = Union[
|
||||
List[List[CrawlResultT]],
|
||||
AsyncGenerator[CrawlResultT, None],
|
||||
]
|
||||
|
||||
|
||||
class AsyncWebCrawler:
|
||||
"""
|
||||
@@ -257,7 +271,7 @@ class AsyncWebCrawler:
|
||||
|
||||
@asynccontextmanager
|
||||
async def nullcontext(self):
|
||||
"""异步空上下文管理器"""
|
||||
"""Asynchronous null context manager"""
|
||||
yield
|
||||
|
||||
async def arun(
|
||||
@@ -282,7 +296,7 @@ class AsyncWebCrawler:
|
||||
user_agent: str = None,
|
||||
verbose=True,
|
||||
**kwargs,
|
||||
) -> CrawlResult:
|
||||
) -> Union[CrawlResult, DeepCrawlSingleReturn]:
|
||||
"""
|
||||
Runs the crawler for a single source: URL (web, local file, or raw HTML).
|
||||
|
||||
@@ -384,6 +398,23 @@ class AsyncWebCrawler:
|
||||
extracted_content = None
|
||||
start_time = time.perf_counter()
|
||||
|
||||
if crawler_config.deep_crawl_strategy:
|
||||
if crawler_config.stream:
|
||||
return crawler_config.deep_crawl_strategy.arun(
|
||||
start_url=url,
|
||||
crawler=self,
|
||||
crawler_run_config=crawler_config,
|
||||
)
|
||||
else:
|
||||
results = []
|
||||
async for result in crawler_config.deep_crawl_strategy.arun(
|
||||
start_url=url,
|
||||
crawler=self,
|
||||
crawler_run_config=crawler_config,
|
||||
):
|
||||
results.append(result)
|
||||
return results
|
||||
|
||||
# Try to get cached result if appropriate
|
||||
if cache_context.should_read():
|
||||
cached_result = await async_db_manager.aget_cached_url(url)
|
||||
@@ -420,14 +451,18 @@ class AsyncWebCrawler:
|
||||
|
||||
# Check robots.txt if enabled
|
||||
if config and config.check_robots_txt:
|
||||
if not await self.robots_parser.can_fetch(url, self.browser_config.user_agent):
|
||||
if not await self.robots_parser.can_fetch(
|
||||
url, self.browser_config.user_agent
|
||||
):
|
||||
return CrawlResult(
|
||||
url=url,
|
||||
html="",
|
||||
success=False,
|
||||
status_code=403,
|
||||
error_message="Access denied by robots.txt",
|
||||
response_headers={"X-Robots-Status": "Blocked by robots.txt"}
|
||||
response_headers={
|
||||
"X-Robots-Status": "Blocked by robots.txt"
|
||||
},
|
||||
)
|
||||
|
||||
# Pass config to crawl method
|
||||
@@ -449,7 +484,7 @@ class AsyncWebCrawler:
|
||||
)
|
||||
|
||||
# Process the HTML content
|
||||
crawl_result : CrawlResult = await self.aprocess_html(
|
||||
crawl_result: CrawlResult = await self.aprocess_html(
|
||||
url=url,
|
||||
html=html,
|
||||
extracted_content=extracted_content,
|
||||
@@ -462,7 +497,7 @@ class AsyncWebCrawler:
|
||||
)
|
||||
|
||||
crawl_result.status_code = async_response.status_code
|
||||
crawl_result.redirected_url = async_response.final_url or url
|
||||
crawl_result.redirected_url = async_response.redirected_url or url
|
||||
crawl_result.response_headers = async_response.response_headers
|
||||
crawl_result.downloaded_files = async_response.downloaded_files
|
||||
crawl_result.ssl_certificate = (
|
||||
@@ -717,7 +752,7 @@ class AsyncWebCrawler:
|
||||
async def arun_many(
|
||||
self,
|
||||
urls: List[str],
|
||||
config: Optional[CrawlerRunConfig] = None,
|
||||
config: Optional[CrawlerRunConfig] = None,
|
||||
dispatcher: Optional[BaseDispatcher] = None,
|
||||
# Legacy parameters maintained for backwards compatibility
|
||||
word_count_threshold=MIN_WORD_THRESHOLD,
|
||||
@@ -731,8 +766,8 @@ class AsyncWebCrawler:
|
||||
pdf: bool = False,
|
||||
user_agent: str = None,
|
||||
verbose=True,
|
||||
**kwargs
|
||||
) -> RunManyReturn:
|
||||
**kwargs,
|
||||
) -> Union[RunManyReturn, DeepCrawlManyReturn]:
|
||||
"""
|
||||
Runs the crawler for multiple URLs concurrently using a configurable dispatcher strategy.
|
||||
|
||||
@@ -763,6 +798,22 @@ class AsyncWebCrawler:
|
||||
):
|
||||
print(f"Processed {result.url}: {len(result.markdown)} chars")
|
||||
"""
|
||||
|
||||
async def merge_async_generators(generators):
    """Interleave items from several async generators as they become ready.

    One ``__anext__`` request is kept in flight per generator. Whenever a
    request completes, its value is yielded and the next item of that same
    generator is requested. A generator is dropped once it is exhausted.

    Args:
        generators: Iterable of async generators to merge.

    Yields:
        Items produced by the generators, in completion order.

    Raises:
        Exception: Whatever one of the generators raises is propagated, after
            the requests still pending on the other generators have been
            cancelled.
    """
    # Map each in-flight ``__anext__`` task to the generator it belongs to.
    tasks = {asyncio.create_task(gen.__anext__()): gen for gen in generators}
    try:
        while tasks:
            done, _ = await asyncio.wait(
                tasks, return_when=asyncio.FIRST_COMPLETED
            )

            for task in done:
                gen = tasks.pop(task)
                try:
                    result = task.result()
                except StopAsyncIteration:
                    # Generator is exhausted: do not schedule it again.
                    continue

                yield result
                # Ask the same generator for its next item.
                tasks[asyncio.create_task(gen.__anext__())] = gen
    finally:
        # Reached when a generator failed or the consumer stopped early.
        # Cancel what is still in flight so no task is left dangling, and
        # await the tasks so their outcomes are always retrieved.
        pending = list(tasks)
        for task in pending:
            task.cancel()
        if pending:
            await asyncio.gather(*pending, return_exceptions=True)
|
||||
|
||||
if config is None:
|
||||
config = CrawlerRunConfig(
|
||||
word_count_threshold=word_count_threshold,
|
||||
@@ -786,7 +837,9 @@ class AsyncWebCrawler:
|
||||
)
|
||||
|
||||
transform_result = lambda task_result: (
|
||||
setattr(task_result.result, 'dispatch_result',
|
||||
setattr(
|
||||
task_result.result,
|
||||
"dispatch_result",
|
||||
DispatchResult(
|
||||
task_id=task_result.task_id,
|
||||
memory_usage=task_result.memory_usage,
|
||||
@@ -794,20 +847,46 @@ class AsyncWebCrawler:
|
||||
start_time=task_result.start_time,
|
||||
end_time=task_result.end_time,
|
||||
error_message=task_result.error_message,
|
||||
)
|
||||
) or task_result.result
|
||||
),
|
||||
)
|
||||
or task_result.result
|
||||
)
|
||||
|
||||
stream = config.stream
|
||||
|
||||
|
||||
if config.deep_crawl_strategy:
|
||||
if config.stream:
|
||||
generators = []
|
||||
for url in urls:
|
||||
generators.append(
|
||||
config.deep_crawl_strategy.arun(
|
||||
start_url=url, crawler=self, crawler_run_config=config
|
||||
)
|
||||
)
|
||||
return merge_async_generators(generators)
|
||||
else:
|
||||
results = []
|
||||
for url in urls:
|
||||
url_results = []
|
||||
async for result in config.deep_crawl_strategy.arun(
|
||||
start_url=url, crawler=self, crawler_run_config=config
|
||||
):
|
||||
url_results.append(result)
|
||||
results.append(url_results)
|
||||
return results
|
||||
|
||||
if stream:
|
||||
|
||||
async def result_transformer():
|
||||
async for task_result in dispatcher.run_urls_stream(crawler=self, urls=urls, config=config):
|
||||
async for task_result in dispatcher.run_urls_stream(
|
||||
crawler=self, urls=urls, config=config
|
||||
):
|
||||
yield transform_result(task_result)
|
||||
|
||||
return result_transformer()
|
||||
else:
|
||||
_results = await dispatcher.run_urls(crawler=self, urls=urls, config=config)
|
||||
return [transform_result(res) for res in _results]
|
||||
return [transform_result(res) for res in _results]
|
||||
|
||||
async def aclear_cache(self):
|
||||
"""Clear the cache database."""
|
||||
|
||||
@@ -84,3 +84,4 @@ SHOW_DEPRECATION_WARNINGS = True
|
||||
SCREENSHOT_HEIGHT_TRESHOLD = 10000
|
||||
PAGE_TIMEOUT = 60000
|
||||
DOWNLOAD_PAGE_TIMEOUT = 60000
|
||||
DEEP_CRAWL_BATCH_SIZE = 5
|
||||
|
||||
29
crawl4ai/deep_crawl/__init__.py
Normal file
29
crawl4ai/deep_crawl/__init__.py
Normal file
@@ -0,0 +1,29 @@
|
||||
from .bfs_deep_crawl_strategy import BFSDeepCrawlStrategy
|
||||
from .filters import (
|
||||
URLFilter,
|
||||
FilterChain,
|
||||
URLPatternFilter,
|
||||
ContentTypeFilter,
|
||||
DomainFilter,
|
||||
)
|
||||
from .scorers import (
|
||||
KeywordRelevanceScorer,
|
||||
PathDepthScorer,
|
||||
FreshnessScorer,
|
||||
CompositeScorer,
|
||||
)
|
||||
from .deep_crawl_strategty import DeepCrawlStrategy
|
||||
|
||||
__all__ = [
|
||||
"BFSDeepCrawlStrategy",
|
||||
"FilterChain",
|
||||
"URLFilter",
|
||||
"URLPatternFilter",
|
||||
"ContentTypeFilter",
|
||||
"DomainFilter",
|
||||
"KeywordRelevanceScorer",
|
||||
"PathDepthScorer",
|
||||
"FreshnessScorer",
|
||||
"CompositeScorer",
|
||||
"DeepCrawlStrategy",
|
||||
]
|
||||
193
crawl4ai/deep_crawl/bfs_deep_crawl_strategy.py
Normal file
193
crawl4ai/deep_crawl/bfs_deep_crawl_strategy.py
Normal file
@@ -0,0 +1,193 @@
|
||||
from typing import AsyncGenerator, Optional, Dict, Set, List
|
||||
from datetime import datetime
|
||||
import asyncio
|
||||
import logging
|
||||
from urllib.parse import urlparse
|
||||
from ..models import CrawlResult, TraversalStats
|
||||
from .filters import FilterChain
|
||||
from .scorers import URLScorer
|
||||
from .deep_crawl_strategty import DeepCrawlStrategy
|
||||
from ..config import DEEP_CRAWL_BATCH_SIZE
|
||||
|
||||
|
||||
class BFSDeepCrawlStrategy(DeepCrawlStrategy):
    """Best-first deep crawl strategy with URL filtering and scoring.

    Links discovered on crawled pages are pushed onto a priority queue as
    ``(score, depth, url, parent_url)`` tuples and crawled in batches of at
    most ``DEEP_CRAWL_BATCH_SIZE`` through ``crawler.arun_many``. Lower
    scores are dequeued first.

    ``stats`` and the cancellation event are created once in ``__init__``,
    so they are shared by every ``arun`` call made on the same instance.
    """

    def __init__(
        self,
        max_depth: int,
        filter_chain: FilterChain,
        url_scorer: URLScorer,
        process_external_links: bool = False,
        logger: Optional[logging.Logger] = None,
    ):
        """Configure the traversal.

        Args:
            max_depth: Maximum link depth to follow; the start URL is depth 0.
            filter_chain: Filters applied to every discovered URL.
            url_scorer: Scorer used to prioritise URLs. When falsy, every
                URL gets score 0.
            process_external_links: Also follow links listed under
                ``result.links["external"]``.
            logger: Logger to use. Defaults to this module's logger.
        """
        self.max_depth = max_depth
        self.filter_chain = filter_chain
        self.url_scorer = url_scorer
        self.process_external_links = process_external_links
        self.logger = logger or logging.getLogger(__name__)

        # Crawl control
        self.stats = TraversalStats(start_time=datetime.now())
        self._cancel_event = asyncio.Event()

    async def can_process_url(self, url: str, depth: int) -> bool:
        """Return True when ``url`` is well formed and accepted by the filters.

        The URL must have an http/https scheme and a network location that
        contains a dot. The filter chain is skipped for depth 0 (start page).

        Args:
            url: Candidate URL.
            depth: Depth at which the URL would be crawled.

        Returns:
            bool: False if validation fails or a filter rejects the URL.
        """
        try:
            parsed = urlparse(url)
            if not all([parsed.scheme, parsed.netloc]):
                raise ValueError("Invalid URL")
            if parsed.scheme not in ("http", "https"):
                raise ValueError("URL must be HTTP or HTTPS")
            if not parsed.netloc or "." not in parsed.netloc:
                raise ValueError("Invalid domain")
        except Exception as e:
            self.logger.warning(f"Invalid URL: {url}. Error: {str(e)}")
            return False

        # Apply the filter chain if it's not the start page
        if depth != 0 and not self.filter_chain.apply(url):
            return False

        return True

    async def _process_links(
        self,
        result: CrawlResult,
        source_url: str,
        queue: asyncio.PriorityQueue,
        visited: Set[str],
        depths: Dict[str, int],
    ) -> None:
        """Queue the links found in ``result`` for crawling.

        Each link is skipped if it was already queued, validated and filtered
        through ``can_process_url``, scored, then put on ``queue`` one level
        deeper than ``source_url``. Nothing is queued once that depth would
        exceed ``max_depth``.

        Args:
            result: Crawl result whose ``links`` are examined.
            source_url: URL the result belongs to; must be a key of ``depths``.
            queue: Priority queue receiving ``(score, depth, url, parent)``.
            visited: URLs already queued; updated in place.
            depths: Maps each queued URL to its depth; updated in place.
        """
        next_depth = depths[source_url] + 1
        # If depth limit reached, exit without processing links
        if next_depth > self.max_depth:
            return

        # Copy so that the crawl result's own link lists are never mutated.
        links_to_process = list(result.links["internal"])
        if self.process_external_links:
            links_to_process.extend(result.links["external"])

        for link in links_to_process:
            url = link["href"]
            if url in visited:
                continue
            if not await self.can_process_url(url, next_depth):
                self.stats.urls_skipped += 1
                continue

            # Record the URL as soon as it is queued so that pages linking to
            # each other cannot enqueue the same URL again.
            visited.add(url)
            score = self.url_scorer.score(url) if self.url_scorer else 0
            await queue.put((score, next_depth, url, source_url))
            depths[url] = next_depth
            self.stats.total_depth_reached = max(
                self.stats.total_depth_reached, next_depth
            )

    @staticmethod
    async def _iter_results(results):
        """Yield crawl results from either an async iterator or a plain iterable."""
        if hasattr(results, "__aiter__"):
            async for item in results:
                yield item
        else:
            for item in results:
                yield item

    async def arun(
        self,
        start_url: str,
        crawler: "AsyncWebCrawler",
        crawler_run_config: Optional["CrawlerRunConfig"] = None,
    ) -> AsyncGenerator[CrawlResult, None]:
        """Crawl outward from ``start_url`` and yield each successful result.

        Args:
            start_url: URL the traversal starts from (depth 0).
            crawler: Crawler whose ``arun_many`` fetches each batch.
            crawler_run_config: Configuration for the page crawls. It is
                cloned with ``deep_crawl_strategy=None`` and ``stream=True``.

        Yields:
            CrawlResult: Successful results, annotated with ``depth``,
            ``score`` and ``parent_url``. Failed crawls are only logged.
        """
        queue = asyncio.PriorityQueue()
        await queue.put((0, 0, start_url, None))
        visited: Set[str] = {start_url}
        depths: Dict[str, int] = {start_url: 0}

        # Very important: child crawls must not deep-crawl again, otherwise
        # the traversal would recurse. Cloned once, as it never changes.
        batch_config = (
            crawler_run_config.clone(deep_crawl_strategy=None, stream=True)
            if crawler_run_config
            else None
        )

        try:
            # Cancellation (see shutdown) is honoured between batches.
            while not queue.empty() and not self._cancel_event.is_set():
                # Collect the next batch of URLs to crawl. The mapping is
                # local to this call, so it needs no lock.
                active_crawls = {}
                while (
                    len(active_crawls) < DEEP_CRAWL_BATCH_SIZE
                    and not queue.empty()
                ):
                    score, depth, url, parent_url = await queue.get()
                    active_crawls[url] = {
                        "depth": depth,
                        "score": score,
                        "parent_url": parent_url,
                    }
                    self.stats.current_depth = depth

                try:
                    batch_results = await crawler.arun_many(
                        urls=list(active_crawls),
                        config=batch_config,
                    )
                    # arun_many returns a list instead of a stream when the
                    # config does not enable streaming (e.g. config is None).
                    async for result in self._iter_results(batch_results):
                        crawl_info = active_crawls.pop(result.url, None)

                        if crawl_info and result.success:
                            await self._process_links(
                                result, result.url, queue, visited, depths
                            )
                            result.depth = crawl_info["depth"]
                            result.score = crawl_info["score"]
                            result.parent_url = crawl_info["parent_url"]
                            yield result
                        else:
                            self.logger.warning(
                                f"Failed to crawl {result.url}: {result.error_message}"
                            )
                except Exception as e:
                    self.logger.error(f"Batch processing error: {e}")
                    # The batch is dropped, not retried: retrying a batch
                    # that keeps failing would loop forever.
                    continue

                # URLs for which no matching result came back are dropped too.
                for missing_url in active_crawls:
                    self.logger.warning(f"No result returned for {missing_url}")

        except Exception as e:
            self.logger.error(f"Error in crawl process: {e}")
            raise

        finally:
            self.stats.end_time = datetime.now()

    async def shutdown(self):
        """Clean up resources and stop crawling"""
        self._cancel_event.set()
|
||||
30
crawl4ai/deep_crawl/deep_crawl_strategty.py
Normal file
30
crawl4ai/deep_crawl/deep_crawl_strategty.py
Normal file
@@ -0,0 +1,30 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import AsyncGenerator, Optional
|
||||
|
||||
from ..models import CrawlResult
|
||||
|
||||
|
||||
class DeepCrawlStrategy(ABC):
    """Interface implemented by deep crawl (multi-page traversal) strategies."""

    @abstractmethod
    async def arun(
        self,
        start_url: str,
        crawler: "AsyncWebCrawler",
        crawler_run_config: Optional["CrawlerRunConfig"] = None,
    ) -> AsyncGenerator[CrawlResult, None]:
        """Traverse the given URL using the specified crawler.

        The first parameter is named ``start_url`` because that is the
        keyword the crawler passes (``arun(start_url=..., crawler=...,
        crawler_run_config=...)``) and the name used by
        ``BFSDeepCrawlStrategy``.

        Args:
            start_url (str): The starting URL for the traversal.
            crawler (AsyncWebCrawler): The crawler instance to use for traversal.
            crawler_run_config (CrawlerRunConfig, optional): The configuration for the crawler.

        Returns:
            AsyncGenerator[CrawlResult, None]: An async generator yielding crawl results.
        """

    @abstractmethod
    async def shutdown(self):
        """Clean up resources used by the strategy"""
|
||||
868
crawl4ai/deep_crawl/filters.py
Normal file
868
crawl4ai/deep_crawl/filters.py
Normal file
@@ -0,0 +1,868 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import List, Pattern, Set, Union, FrozenSet
|
||||
import re, time
|
||||
from urllib.parse import urlparse
|
||||
from array import array
|
||||
import logging
|
||||
from functools import lru_cache
|
||||
import fnmatch
|
||||
from dataclasses import dataclass
|
||||
from typing import ClassVar
|
||||
import weakref
|
||||
import mimetypes
|
||||
|
||||
|
||||
@dataclass
class FilterStats:
    """Counters recording how many URLs a filter or filter chain has evaluated."""

    # PERF: Using dataclass creates overhead with __init__ and property access
    # PERF: Could use __slots__ to reduce memory footprint
    # PERF: Consider using array.array('I') for atomic increments
    total_urls: int = 0  # every URL evaluated
    rejected_urls: int = 0  # URLs that did not pass
    passed_urls: int = 0  # URLs that passed
|
||||
|
||||
|
||||
class URLFilter(ABC):
    """Abstract base for URL filters; subclasses implement ``apply``.

    Every filter carries a display ``name``, its own ``FilterStats`` and a
    logger named ``urlfilter.<name>``.
    """

    # PERF: Logger creation is expensive, consider lazy initialization
    # PERF: stats object creation adds overhead for each filter instance
    def __init__(self, name: str = None):
        """Set up the filter; ``name`` falls back to the class name."""
        if not name:
            name = self.__class__.__name__
        self.name = name
        self.stats = FilterStats()
        self.logger = logging.getLogger(f"urlfilter.{self.name}")

    @abstractmethod
    def apply(self, url: str) -> bool:
        """Return whether ``url`` passes this filter."""

    def _update_stats(self, passed: bool):
        """Count one evaluated URL as passed or rejected."""
        # PERF: Consider removing stats entirely in production/fast mode
        counters = self.stats
        counters.total_urls += 1
        # Booleans add as 0/1, so exactly one of the two counters moves.
        counters.passed_urls += passed
        counters.rejected_urls += not passed
|
||||
|
||||
|
||||
class FilterChain:
    """Ordered collection of URL filters that a URL must all pass."""

    # PERF: List traversal for each URL is expensive
    # PERF: Consider adding fast path for single filter case
    def __init__(self, filters: List[URLFilter] = None):
        """Store the filters (an empty list when none are given)."""
        self.filters = filters or []
        self.stats = FilterStats()
        self.logger = logging.getLogger("urlfilter.chain")

    def apply(self, url: str) -> bool:
        """Return True only if every filter accepts ``url``.

        Filters run in order and evaluation stops at the first rejection,
        which is logged at debug level. The chain's own stats are updated
        for every call.
        """
        # PERF: Could reorder filters by rejection rate
        counters = self.stats
        counters.total_urls += 1

        # First filter that rejects the URL, if any; later ones are not run.
        rejecting = next(
            (candidate for candidate in self.filters if not candidate.apply(url)),
            None,
        )
        if rejecting is None:
            counters.passed_urls += 1
            return True

        counters.rejected_urls += 1
        self.logger.debug(f"URL {url} rejected by {rejecting.name}")
        return False
|
||||
|
||||
|
||||
class URLPatternFilter(URLFilter):
    """Filter that accepts a URL when any configured pattern is found in it.

    String patterns are treated as globs (translated with
    ``fnmatch.translate``) when ``use_glob`` is true, otherwise as regular
    expressions. Pre-compiled patterns are used as given.
    """

    # PERF: Multiple regex compilation is slow
    # PERF: List of patterns causes multiple regex evaluations
    def __init__(
        self,
        patterns: Union[str, Pattern, List[Union[str, Pattern]]],
        use_glob: bool = True,
    ):
        """Normalise ``patterns`` to a list and compile each entry."""
        super().__init__()
        # A single string or compiled pattern is wrapped into a list.
        if isinstance(patterns, (str, Pattern)):
            patterns = [patterns]
        self.patterns = patterns
        self.use_glob = use_glob

        compiled = []
        for raw in self.patterns:
            if not isinstance(raw, str):
                # Already a compiled pattern.
                compiled.append(raw)
            elif use_glob:
                compiled.append(self._glob_to_regex(raw))
            else:
                compiled.append(re.compile(raw))
        self._compiled_patterns = compiled

    def _glob_to_regex(self, pattern: str) -> Pattern:
        """Compile a glob expression into a regular expression."""
        # PERF: Could cache common translations
        translated = fnmatch.translate(pattern)
        return re.compile(translated)

    def apply(self, url: str) -> bool:
        """Return True if at least one compiled pattern is found in ``url``."""
        matched = False
        for regex in self._compiled_patterns:
            if regex.search(url):
                matched = True
                break
        self._update_stats(matched)
        return matched
|
||||
|
||||
|
||||
class ContentTypeFilter(URLFilter):
    """Accept URLs whose file extension maps to one of the allowed MIME types.

    The MIME type is guessed from the last path segment only; URLs without a
    file extension are always accepted because nothing can be inferred.

    Attributes:
        allowed_types: Lower-cased MIME type fragments; a URL passes when any
            of them is a substring of the guessed type.
        check_extension: When false, every URL is accepted.
    """

    # PERF: mimetypes guessing is slow and results are not cached per extension
    def __init__(
        self, allowed_types: Union[str, List[str]], check_extension: bool = True
    ):
        super().__init__()
        self.allowed_types = (
            [allowed_types] if isinstance(allowed_types, str) else allowed_types
        )
        self.check_extension = check_extension
        self._normalize_types()

    def _normalize_types(self):
        """Normalize content type strings to lower case."""
        self.allowed_types = [t.lower() for t in self.allowed_types]

    def _check_extension(self, url: str) -> bool:
        """Return True when *url* has no extension or its guessed type is allowed."""
        # Parse once and look only at the final path segment, so a dot in a
        # directory name (e.g. "/v1.2/page") is not mistaken for an extension.
        filename = urlparse(url).path.rsplit("/", 1)[-1]
        ext = filename.rpartition(".")[2] if "." in filename else ""
        if not ext:
            return True

        # Guess from the bare file name: query strings and fragments were
        # already dropped by urlparse, so they cannot confuse the lookup.
        guessed_type = (mimetypes.guess_type(filename)[0] or "").lower()
        return any(allowed in guessed_type for allowed in self.allowed_types)

    def apply(self, url: str) -> bool:
        """Check if URL's content type is allowed and record the outcome."""
        result = True
        if self.check_extension:
            result = self._check_extension(url)
        self._update_stats(result)
        return result
|
||||
|
||||
|
||||
class DomainFilter(URLFilter):
    """Allow/deny URLs by their network location.

    Entries are compared case-insensitively against ``urlparse(url).netloc``.
    Plain entries must match exactly; entries containing glob wildcards
    (``*``, ``?``, ``[``) such as ``"ads.*"`` are matched with shell-style
    rules. Blocked entries win over allowed ones.

    Attributes:
        allowed_domains: Set of allowed entries, or ``None`` to allow any
            domain that is not blocked.
        blocked_domains: Set of blocked entries (possibly empty).

    Note:
        Wildcard entries are compiled once in ``__init__``; mutating the sets
        afterwards only affects exact matching. ``netloc`` includes any port
        or user-info present in the URL.
    """

    def __init__(
        self,
        allowed_domains: Union[str, List[str]] = None,
        blocked_domains: Union[str, List[str]] = None,
    ):
        super().__init__()
        self.allowed_domains = (
            set(self._normalize_domains(allowed_domains)) if allowed_domains else None
        )
        self.blocked_domains = (
            set(self._normalize_domains(blocked_domains)) if blocked_domains else set()
        )
        self._allowed_globs = self._compile_globs(self.allowed_domains)
        self._blocked_globs = self._compile_globs(self.blocked_domains)

    def _normalize_domains(self, domains: Union[str, List[str]]) -> List[str]:
        """Return *domains* as a list of lower-cased, stripped strings."""
        if isinstance(domains, str):
            domains = [domains]
        return [d.lower().strip() for d in domains]

    @staticmethod
    def _compile_globs(domains) -> tuple:
        """Compile the entries of *domains* that contain glob wildcards."""
        if not domains:
            return ()
        return tuple(
            re.compile(fnmatch.translate(entry))
            for entry in sorted(domains)
            if any(ch in entry for ch in "*?[")
        )

    @staticmethod
    def _matches(domain: str, exact, globs) -> bool:
        """Return True if *domain* is in *exact* or matches any compiled glob."""
        return domain in exact or any(glob.match(domain) for glob in globs)

    def _extract_domain(self, url: str) -> str:
        """Return the lower-cased network location of *url* ('' if absent)."""
        # PERF: urlparse is called for every URL check; could cache recent results
        return urlparse(url).netloc.lower()

    def apply(self, url: str) -> bool:
        """Return False for blocked or not-allowed domains, True otherwise."""
        domain = self._extract_domain(url)

        if self._matches(domain, self.blocked_domains, self._blocked_globs):
            self._update_stats(False)
            return False

        if self.allowed_domains is not None and not self._matches(
            domain, self.allowed_domains, self._allowed_globs
        ):
            self._update_stats(False)
            return False

        self._update_stats(True)
        return True
|
||||
|
||||
|
||||
# Example usage:
|
||||
def create_common_filter_chain() -> FilterChain:
    """Build a ready-made chain: URL patterns, then content type, then domains."""
    content_globs = [
        "*.html",
        "*.htm",  # HTML files
        "*/article/*",
        "*/blog/*",  # Common content paths
    ]
    html_types = ["text/html", "application/xhtml+xml"]
    unwanted_domains = ["ads.*", "analytics.*"]

    # Filters are constructed in the same order in which the chain applies them.
    pattern_stage = URLPatternFilter(content_globs)
    content_stage = ContentTypeFilter(html_types)
    domain_stage = DomainFilter(blocked_domains=unwanted_domains)
    return FilterChain([pattern_stage, content_stage, domain_stage])
|
||||
|
||||
|
||||
####################################################################################
|
||||
# Uncledoe: Optimized Version
|
||||
####################################################################################
|
||||
|
||||
|
||||
# Use __slots__ and array for maximum memory/speed efficiency
|
||||
class FastFilterStats:
    """Three URL counters (total, passed, rejected) stored in one typed array.

    ``__slots__`` removes the per-instance ``__dict__``; the counters live in
    ``_counters`` (typecode ``"I"``, unsigned int) and are exposed through
    read-only properties.
    """

    __slots__ = ("_counters",)

    def __init__(self):
        # Index layout: [0] total, [1] passed, [2] rejected -- all start at 0.
        self._counters = array("I", (0, 0, 0))

    total_urls = property(
        lambda self: self._counters[0], doc="Number of URLs seen."
    )
    passed_urls = property(
        lambda self: self._counters[1], doc="Number of URLs accepted."
    )
    rejected_urls = property(
        lambda self: self._counters[2], doc="Number of URLs rejected."
    )
|
||||
|
||||
|
||||
class FastURLFilter(ABC):
    """Abstract base for the optimized filters.

    Subclasses implement :meth:`apply` and report each decision through
    :meth:`_update_stats`.

    Attributes:
        name: Filter name; defaults to the concrete class name.
        stats: ``FastFilterStats`` counters for this filter.
    """

    __slots__ = ("name", "stats", "_logger_ref")

    def __init__(self, name: str = None):
        self.name = name or self.__class__.__name__
        self.stats = FastFilterStats()
        # The logger is looked up on first access and held via a weak reference.
        self._logger_ref = None

    @property
    def logger(self):
        """Logger named ``urlfilter.<name>``, resolved lazily."""
        ref = self._logger_ref
        if ref is None or ref() is None:
            resolved = logging.getLogger(f"urlfilter.{self.name}")
            self._logger_ref = weakref.ref(resolved)
        return self._logger_ref()

    @abstractmethod
    def apply(self, url: str) -> bool:
        pass

    def _update_stats(self, passed: bool):
        """Count one processed URL as passed or rejected."""
        counters = self.stats._counters
        counters[0] += 1  # total
        counters[1] += passed  # passed (bool adds 0 or 1)
        counters[2] += not passed  # rejected
|
||||
|
||||
|
||||
class FastFilterChain:
    """Optimized filter chain.

    Filters are stored in a tuple and consulted in order; the first filter
    that rejects a URL ends the evaluation.

    Attributes:
        filters: Tuple of filters applied by :meth:`apply`.
        stats: ``FastFilterStats`` counters for the chain as a whole.
    """

    __slots__ = ("filters", "stats", "_logger_ref")

    def __init__(self, filters: "List[FastURLFilter]" = None):
        self.filters = tuple(filters or [])  # Immutable tuple for speed
        self.stats = FastFilterStats()
        self._logger_ref = None

    @property
    def logger(self):
        """Logger named ``urlfilter.chain``, resolved lazily."""
        if self._logger_ref is None or self._logger_ref() is None:
            logger = logging.getLogger("urlfilter.chain")
            self._logger_ref = weakref.ref(logger)
        return self._logger_ref()

    def add_filter(self, filter_: "FastURLFilter") -> "FastFilterChain":
        """Append *filter_* to the end of the chain and return the chain.

        ``filters`` is a tuple, which has no ``append``; a new tuple is built
        and rebound instead so the method actually works.
        """
        self.filters = self.filters + (filter_,)
        return self  # Enable method chaining

    def apply(self, url: str) -> bool:
        """Return True when every filter accepts *url*; update chain counters."""
        counters = self.stats._counters
        counters[0] += 1  # total

        for f in self.filters:
            if not f.apply(url):
                counters[2] += 1  # rejected
                return False

        counters[1] += 1  # passed
        return True
|
||||
|
||||
class FastURLPatternFilter(FastURLFilter):
    """Pattern filter balancing speed and completeness.

    Patterns are sorted into buckets at construction time so that the common
    cases avoid regex evaluation:

    * ``*.ext`` globs are checked with ``str.endswith`` on the URL without its
      query string and fragment;
    * ``something/*`` globs are checked with a substring test (the same result
      a translated glob gives under ``search``);
    * everything else is compiled to a regular expression and searched.

    A URL is accepted when any pattern matches.
    """

    __slots__ = ('_simple_suffixes', '_simple_prefixes', '_domain_patterns', '_path_patterns')

    PATTERN_TYPES = {
        'SUFFIX': 1,  # *.html
        'PREFIX': 2,  # /foo/*
        'DOMAIN': 3,  # *.example.com
        'PATH': 4,  # Everything else
        'REGEX': 5,
    }

    # Characters that make a glob fragment non-literal.
    _GLOB_META = '*?['

    def __init__(self, patterns: Union[str, Pattern, List[Union[str, Pattern]]], use_glob: bool = True):
        """Categorize and compile *patterns*.

        Args:
            patterns: One pattern or a list of patterns (strings or compiled
                regular expressions).
            use_glob: When true (default) strings are globs, except those that
                look like regexes (leading ``^``, trailing ``$`` or ``\\d``).
                When false every string is compiled as a regular expression.
        """
        super().__init__()
        patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns

        self._simple_suffixes = set()
        self._simple_prefixes = set()
        self._domain_patterns = []
        self._path_patterns = []

        for pattern in patterns:
            if isinstance(pattern, str) and not use_glob:
                # Honor use_glob=False: the string is a regex, not a glob.
                self._path_patterns.append(re.compile(pattern))
                continue
            pattern_type = self._categorize_pattern(pattern)
            self._add_pattern(pattern, pattern_type)

    @classmethod
    def _is_literal(cls, fragment: str) -> bool:
        """Return True if *fragment* contains no glob metacharacters."""
        return not any(ch in fragment for ch in cls._GLOB_META)

    def _categorize_pattern(self, pattern: str) -> int:
        """Categorize pattern for specialized handling."""
        if not isinstance(pattern, str):
            return self.PATTERN_TYPES['PATH']

        # Check if it's a regex pattern
        if pattern.startswith('^') or pattern.endswith('$') or '\\d' in pattern:
            return self.PATTERN_TYPES['REGEX']

        if pattern.count('*') == 1:
            # Fast paths only apply when the rest of the glob is literal text;
            # "*.htm?" or "/a[bc]/*" must go through full glob translation.
            if pattern.startswith('*.') and self._is_literal(pattern[2:]):
                return self.PATTERN_TYPES['SUFFIX']
            if pattern.endswith('/*') and self._is_literal(pattern[:-2]):
                return self.PATTERN_TYPES['PREFIX']

        # NOTE(review): this only triggers for patterns that both start with
        # "*." and contain "://"; condition kept as in the original -- confirm
        # which pattern shapes were meant to land here.
        if '://' in pattern and pattern.startswith('*.'):
            return self.PATTERN_TYPES['DOMAIN']

        return self.PATTERN_TYPES['PATH']

    @staticmethod
    def _expand_braces(pattern: str) -> List[str]:
        """Expand ``{a,b}`` alternatives into a list of plain glob patterns."""
        group = re.search(r'\{([^{}]+)\}', pattern)
        if group is None:
            return [pattern]
        head, tail = pattern[:group.start()], pattern[group.end():]
        expanded = []
        for option in group.group(1).split(','):
            expanded.extend(FastURLPatternFilter._expand_braces(head + option + tail))
        return expanded

    def _add_pattern(self, pattern: str, pattern_type: int):
        """Add pattern to appropriate matcher."""
        types = self.PATTERN_TYPES
        if pattern_type == types['REGEX']:
            # Regex patterns are compiled directly, without glob translation.
            self._path_patterns.append(
                pattern if isinstance(pattern, Pattern) else re.compile(pattern)
            )
        elif pattern_type == types['SUFFIX']:
            # Keep the leading dot ("*.html" -> ".html") so "/html" cannot match.
            self._simple_suffixes.add(pattern[1:])
        elif pattern_type == types['PREFIX']:
            # Keep the trailing slash ("/foo/*" -> "/foo/") so "/foobar" cannot match.
            self._simple_prefixes.add(pattern[:-1])
        elif pattern_type == types['DOMAIN']:
            # Escape first, then turn the wildcards back into regex: "*." is a
            # single host label (no "/"), any other "*" matches anything.
            escaped = re.escape(pattern)
            escaped = escaped.replace(r'\*\.', r'[^/]+\.').replace(r'\*', '.*')
            self._domain_patterns.append(re.compile(escaped))
        elif isinstance(pattern, str):
            # fnmatch's "*" already crosses "/", so "**" is equivalent to "*".
            # Braces are expanded *before* translation; rewriting them to regex
            # syntax first would be escaped again by fnmatch.translate.
            for glob in self._expand_braces(pattern.replace('**', '*')):
                self._path_patterns.append(re.compile(fnmatch.translate(glob)))
        else:
            self._path_patterns.append(pattern)

    def _matches(self, url: str) -> bool:
        """Return True if any configured pattern matches *url*."""
        # Quick suffix check (*.html) on the URL without query/fragment.
        if self._simple_suffixes:
            path = url.split('#', 1)[0].split('?', 1)[0]
            if any(path.endswith(suffix) for suffix in self._simple_suffixes):
                return True

        # Domain check
        if any(pattern.match(url) for pattern in self._domain_patterns):
            return True

        # Prefix check (/foo/*): a translated "X/*" glob under search() matches
        # exactly when "X/" occurs in the URL, so a substring test suffices.
        if any(prefix in url for prefix in self._simple_prefixes):
            return True

        # Complex patterns
        return any(pattern.search(url) for pattern in self._path_patterns)

    def apply(self, url: str) -> bool:
        """Hierarchical pattern matching; records the outcome in ``stats``.

        Deliberately not wrapped in ``lru_cache``: caching a bound method keeps
        every filter instance alive and would skip the stats update on hits.
        """
        matched = self._matches(url)
        self._update_stats(matched)
        return matched
|
||||
|
||||
|
||||
class FastContentTypeFilter(FastURLFilter):
    """Optimized content type filter using fast lookups.

    Instead of calling ``mimetypes`` per URL, the set of acceptable file
    extensions is pre-computed from ``_MIME_MAP`` at construction time. URLs
    whose last path segment has no extension are accepted.

    Attributes:
        allowed_types: Frozenset of lower-cased MIME type fragments; an
            extension is acceptable when any fragment is a substring of its
            mapped MIME type.
    """

    __slots__ = ("allowed_types", "_ext_map", "_check_extension")

    # Fast extension to mime type mapping
    _MIME_MAP = {
        # Text Formats
        "txt": "text/plain",
        "html": "text/html",
        "htm": "text/html",
        "xhtml": "application/xhtml+xml",
        "css": "text/css",
        "csv": "text/csv",
        "ics": "text/calendar",
        "js": "application/javascript",
        # Images
        "bmp": "image/bmp",
        "gif": "image/gif",
        "jpeg": "image/jpeg",
        "jpg": "image/jpeg",
        "png": "image/png",
        "svg": "image/svg+xml",
        "tiff": "image/tiff",
        "ico": "image/x-icon",
        "webp": "image/webp",
        # Audio
        "mp3": "audio/mpeg",
        "wav": "audio/wav",
        "ogg": "audio/ogg",
        "m4a": "audio/mp4",
        "aac": "audio/aac",
        # Video
        "mp4": "video/mp4",
        "mpeg": "video/mpeg",
        "webm": "video/webm",
        "avi": "video/x-msvideo",
        "mov": "video/quicktime",
        "flv": "video/x-flv",
        "wmv": "video/x-ms-wmv",
        "mkv": "video/x-matroska",
        # Applications
        "json": "application/json",
        "xml": "application/xml",
        "pdf": "application/pdf",
        "zip": "application/zip",
        "gz": "application/gzip",
        "tar": "application/x-tar",
        "rar": "application/vnd.rar",
        "7z": "application/x-7z-compressed",
        "exe": "application/vnd.microsoft.portable-executable",
        "msi": "application/x-msdownload",
        # Fonts
        "woff": "font/woff",
        "woff2": "font/woff2",
        "ttf": "font/ttf",
        "otf": "font/otf",
        # Microsoft Office
        "doc": "application/msword",
        "dot": "application/msword",
        "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "xls": "application/vnd.ms-excel",
        "ppt": "application/vnd.ms-powerpoint",
        "pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        # OpenDocument Formats
        "odt": "application/vnd.oasis.opendocument.text",
        "ods": "application/vnd.oasis.opendocument.spreadsheet",
        "odp": "application/vnd.oasis.opendocument.presentation",
        # Archives
        # NOTE: extensions are taken after the last dot, so the "tar.gz" key is
        # never looked up directly; ".tar.gz" files resolve through "gz".
        "tar.gz": "application/gzip",
        "tgz": "application/gzip",
        "bz2": "application/x-bzip2",
        # Others
        "rtf": "application/rtf",
        "apk": "application/vnd.android.package-archive",
        "epub": "application/epub+zip",
        "jar": "application/java-archive",
        "swf": "application/x-shockwave-flash",
        "midi": "audio/midi",
        "mid": "audio/midi",
        "ps": "application/postscript",
        "ai": "application/postscript",
        "eps": "application/postscript",
        # Custom or less common
        "bin": "application/octet-stream",
        "dmg": "application/x-apple-diskimage",
        "iso": "application/x-iso9660-image",
        "deb": "application/x-debian-package",
        "rpm": "application/x-rpm",
        "sqlite": "application/vnd.sqlite3",
        # Placeholder
        "unknown": "application/octet-stream",  # Fallback for unknown file types
    }

    @staticmethod
    @lru_cache(maxsize=1000)
    def _extract_extension(path: str) -> str:
        """Return the lower-cased extension of the last segment of *path*.

        Only the text after the final "/" is inspected, so dots in directory
        names never count as an extension. Returns "" when there is none.
        """
        filename = path.rpartition("/")[2]
        if "." not in filename:
            return ""
        return filename.rpartition(".")[2].lower()

    def __init__(
        self, allowed_types: Union[str, List[str]], check_extension: bool = True
    ):
        super().__init__()
        # Normalize and store as frozenset for fast lookup. Only a bare string
        # is wrapped; any other iterable of strings is accepted as-is.
        self.allowed_types = frozenset(
            t.lower()
            for t in (
                [allowed_types] if isinstance(allowed_types, str) else allowed_types
            )
        )
        self._check_extension = check_extension

        # Pre-compute extension map for allowed types
        self._ext_map = frozenset(
            ext
            for ext, mime in self._MIME_MAP.items()
            if any(allowed in mime for allowed in self.allowed_types)
        )

    def _check_url_cached(self, url: str) -> bool:
        """Return True when *url* has no extension or an allowed one.

        The name is historical: this method itself is no longer wrapped in
        ``lru_cache`` (caching a bound method keeps instances alive); the
        extension lookup it delegates to is still cached.
        """
        if not self._check_extension:
            return True

        # Strip fragment and query, then drop "scheme://host" so the dots of
        # the host name are never mistaken for a file extension.
        path = url.split("#", 1)[0].split("?", 1)[0]
        scheme_end = path.find("://")
        if scheme_end != -1:
            path_start = path.find("/", scheme_end + 3)
            path = path[path_start:] if path_start != -1 else ""

        ext = self._extract_extension(path)
        if not ext:
            return True

        return ext in self._ext_map

    def apply(self, url: str) -> bool:
        """Fast extension check; records the outcome in ``stats``."""
        result = self._check_url_cached(url)
        self._update_stats(result)
        return result
|
||||
|
||||
|
||||
class FastDomainFilter(FastURLFilter):
    """Optimized domain filter with fast lookups and caching.

    Entries are compared case-insensitively against the host part extracted
    from the URL. Plain entries must match exactly; entries containing glob
    wildcards (``*``, ``?``, ``[``) such as ``"ads.*"`` are matched with
    shell-style rules. Blocked entries win over allowed ones.
    """

    __slots__ = (
        "_allowed_domains",
        "_blocked_domains",
        "_domain_cache",
        "_allowed_globs",
        "_blocked_globs",
    )

    # Regex for fast domain extraction; stops at "/", "?" or "#" so that a URL
    # without a path ("https://host?x=1") still yields just the host.
    _DOMAIN_REGEX = re.compile(r"://([^/?#]+)")

    def __init__(
        self,
        allowed_domains: Union[str, List[str]] = None,
        blocked_domains: Union[str, List[str]] = None,
    ):
        super().__init__()

        # Convert inputs to frozensets for immutable, fast lookups
        self._allowed_domains = (
            frozenset(self._normalize_domains(allowed_domains))
            if allowed_domains
            else None
        )
        self._blocked_domains = (
            frozenset(self._normalize_domains(blocked_domains))
            if blocked_domains
            else frozenset()
        )
        # Wildcard entries can never succeed as exact set members, so they are
        # additionally compiled to anchored regular expressions.
        self._allowed_globs = self._compile_globs(self._allowed_domains)
        self._blocked_globs = self._compile_globs(self._blocked_domains)

    @staticmethod
    def _normalize_domains(domains: Union[str, List[str]]) -> Set[str]:
        """Fast domain normalization: lower-case and de-duplicate."""
        if isinstance(domains, str):
            return {domains.lower()}
        return {d.lower() for d in domains}

    @staticmethod
    def _compile_globs(domains) -> tuple:
        """Compile the entries of *domains* that contain glob wildcards."""
        if not domains:
            return ()
        return tuple(
            re.compile(fnmatch.translate(entry))
            for entry in sorted(domains)
            if any(ch in entry for ch in "*?[")
        )

    @staticmethod
    @lru_cache(maxsize=10000)
    def _extract_domain(url: str) -> str:
        """Return the lower-cased host part of *url*, or "" without a scheme."""
        match = FastDomainFilter._DOMAIN_REGEX.search(url)
        return match.group(1).lower() if match else ""

    def apply(self, url: str) -> bool:
        """Optimized domain checking with early returns."""
        # Skip processing if no filters
        if not self._blocked_domains and self._allowed_domains is None:
            self._update_stats(True)
            return True

        domain = self._extract_domain(url)

        # Early return for blocked domains
        if domain in self._blocked_domains or any(
            glob.match(domain) for glob in self._blocked_globs
        ):
            self._update_stats(False)
            return False

        # If no allowed domains specified, accept all non-blocked
        if self._allowed_domains is None:
            self._update_stats(True)
            return True

        # Final allowed domains check
        result = domain in self._allowed_domains or any(
            glob.match(domain) for glob in self._allowed_globs
        )
        self._update_stats(result)
        return result
|
||||
|
||||
|
||||
def create_fast_filter_chain() -> FastFilterChain:
    """Create an optimized filter chain with filters ordered by rejection rate"""
    # Domain filter first (fastest rejection)
    domain_stage = FastDomainFilter(blocked_domains=["ads.*", "analytics.*"])
    # Content filter second (medium speed)
    content_stage = FastContentTypeFilter(["text/html", "application/xhtml+xml"])
    # Pattern filter last (most expensive)
    content_globs = [
        "*.html",
        "*.htm",
        "*/article/*",
        "*/blog/*",
    ]
    pattern_stage = FastURLPatternFilter(content_globs)

    return FastFilterChain([domain_stage, content_stage, pattern_stage])
|
||||
|
||||
|
||||
def run_performance_test():
    """Benchmark the original filters against their ``Fast*`` counterparts.

    Prints timings for each individual filter on a 1000-URL subset, for both
    full chains on the complete data set, and the shallow object size of each
    filter. Nothing is returned.
    """
    import sys
    import time

    # Generate test URLs
    base_urls = [
        "https://example.com/article/123",
        "https://blog.example.com/post/456",
        "https://ads.example.com/tracking",
        "https://example.com/about.html",
        "https://analytics.example.com/script.js",
        "https://example.com/products.php",
        "https://subdomain.example.com/blog/post-123",
        "https://example.com/path/file.pdf",
    ]

    # Create more varied test data
    test_urls = []
    for base in base_urls:
        # Add original
        test_urls.append(base)
        # Add variations
        parts = base.split("/")
        for i in range(10):
            parts[-1] = f"page_{i}.html"
            test_urls.append("/".join(parts))

    # Multiply to get enough test data
    test_urls = test_urls * 10000  # 8 bases x 11 variants x 10,000 = 880k URLs

    def benchmark(name: str, func, *args, warmup=True, url_count=None):
        """Time one call of ``func(*args)`` and print duration and throughput.

        ``url_count`` is the number of URLs the call processes; it defaults to
        the full data set. Passing it for subset runs keeps the URLs/sec
        figure honest.
        """
        if warmup:
            # Warmup run
            func(*args)

        # Actual timing
        start = time.perf_counter_ns()
        result = func(*args)
        elapsed = (time.perf_counter_ns() - start) / 1_000_000  # Convert to ms
        processed = len(test_urls) if url_count is None else url_count
        print(
            f"{name:<30} {elapsed:>8.3f} ms ({processed/elapsed*1000:,.0f} URLs/sec)"
        )
        return result

    print("\nBenchmarking original vs optimized implementations...")
    print("-" * 70)

    # Original implementation
    pattern_filter = URLPatternFilter(["*.html", "*/article/*"])
    content_filter = ContentTypeFilter(["text/html"])
    domain_filter = DomainFilter(blocked_domains=["ads.*", "analytics.*"])
    chain = FilterChain([pattern_filter, content_filter, domain_filter])

    # Optimized implementation
    fast_pattern_filter = FastURLPatternFilter(["*.html", "*/article/*"])
    fast_content_filter = FastContentTypeFilter(["text/html"])
    fast_domain_filter = FastDomainFilter(blocked_domains=["ads.*", "analytics.*"])
    fast_chain = FastFilterChain(
        [fast_domain_filter, fast_content_filter, fast_pattern_filter]
    )

    # Test individual filters
    print("\nSingle filter performance (first 1000 URLs):")
    test_subset = test_urls[:1000]
    subset_size = len(test_subset)

    print("\nPattern Filters:")
    benchmark(
        "Original Pattern Filter",
        lambda: [pattern_filter.apply(url) for url in test_subset],
        url_count=subset_size,
    )
    benchmark(
        "Optimized Pattern Filter",
        lambda: [fast_pattern_filter.apply(url) for url in test_subset],
        url_count=subset_size,
    )

    print("\nContent Filters:")
    benchmark(
        "Original Content Filter",
        lambda: [content_filter.apply(url) for url in test_subset],
        url_count=subset_size,
    )
    benchmark(
        "Optimized Content Filter",
        lambda: [fast_content_filter.apply(url) for url in test_subset],
        url_count=subset_size,
    )

    print("\nDomain Filters:")
    benchmark(
        "Original Domain Filter",
        lambda: [domain_filter.apply(url) for url in test_subset],
        url_count=subset_size,
    )
    benchmark(
        "Optimized Domain Filter",
        lambda: [fast_domain_filter.apply(url) for url in test_subset],
        url_count=subset_size,
    )

    print("\nFull Chain Performance (all URLs):")
    # Test chain
    benchmark("Original Chain", lambda: [chain.apply(url) for url in test_urls])
    benchmark("Optimized Chain", lambda: [fast_chain.apply(url) for url in test_urls])

    # Memory usage. sys.getsizeof is shallow: it does not include the sets,
    # lists or compiled patterns each filter references.
    print("\nMemory Usage per Filter:")
    print(f"Original Pattern Filter: {sys.getsizeof(pattern_filter):,} bytes")
    print(f"Optimized Pattern Filter: {sys.getsizeof(fast_pattern_filter):,} bytes")
    print(f"Original Content Filter: {sys.getsizeof(content_filter):,} bytes")
    print(f"Optimized Content Filter: {sys.getsizeof(fast_content_filter):,} bytes")
    print(f"Original Domain Filter: {sys.getsizeof(domain_filter):,} bytes")
    print(f"Optimized Domain Filter: {sys.getsizeof(fast_domain_filter):,} bytes")
|
||||
|
||||
def test_pattern_filter():
    """Check ``FastURLPatternFilter`` against expected results, then time it.

    The speed comparison with ``URLPatternFilter`` only runs when every
    accuracy case passes. All output goes to stdout.
    """
    import time
    from itertools import chain

    # (pattern or list of patterns, {url: expected result})
    test_cases = [
        # Simple suffix patterns (*.html)
        ("*.html", {
            "https://example.com/page.html": True,
            "https://example.com/path/doc.html": True,
            "https://example.com/page.htm": False,
            "https://example.com/page.html?param=1": True,
        }),

        # Path prefix patterns (/foo/*)
        ("*/article/*", {
            "https://example.com/article/123": True,
            "https://example.com/blog/article/456": True,
            "https://example.com/articles/789": False,
            "https://example.com/article": False,
        }),

        # Complex patterns
        ("blog-*-[0-9]", {
            "https://example.com/blog-post-1": True,
            "https://example.com/blog-test-9": True,
            "https://example.com/blog-post": False,
            "https://example.com/blog-post-x": False,
        }),

        # Multiple patterns case
        (["*.pdf", "*/download/*"], {
            "https://example.com/doc.pdf": True,
            "https://example.com/download/file.txt": True,
            "https://example.com/path/download/doc": True,
            "https://example.com/uploads/file.txt": False,
        }),

        # Edge cases
        ("*", {
            "https://example.com": True,
            "": True,
            "http://test.com/path": True,
        }),

        # Complex regex
        (r"^https?://.*\.example\.com/\d+", {
            "https://sub.example.com/123": True,
            "http://test.example.com/456": True,
            "https://example.com/789": False,
            "https://sub.example.com/abc": False,
        }),
    ]

    def run_accuracy_test():
        """Print a line per case; return True only if every case matched."""
        print("\nAccuracy Tests:")
        print("-" * 50)

        failures = 0
        for patterns, expectations in test_cases:
            candidate = FastURLPatternFilter(patterns)

            for url, expected in expectations.items():
                result = candidate.apply(url)
                if result == expected:
                    print(f"✅ Passed: Pattern '{patterns}' with URL '{url}'")
                    continue
                print(f"❌ Failed: Pattern '{patterns}' with URL '{url}'")
                print(f"   Expected: {expected}, Got: {result}")
                failures += 1

        return failures == 0

    def run_speed_test():
        """Time the original and optimized pattern filters on the same URLs."""
        print("\nSpeed Tests:")
        print("-" * 50)

        # Every URL from the accuracy cases, repeated to get a sizeable workload
        all_urls = list(chain.from_iterable(urls for _, urls in test_cases))
        test_urls = all_urls * 10000

        # Test both implementations
        speed_patterns = ["*.html", "*/article/*", "blog-*"]
        original = URLPatternFilter(list(speed_patterns))
        optimized = FastURLPatternFilter(list(speed_patterns))

        def benchmark(name, filter_obj):
            began = time.perf_counter()
            for url in test_urls:
                filter_obj.apply(url)
            elapsed = time.perf_counter() - began
            urls_per_sec = len(test_urls) / elapsed
            print(f"{name:<20} {elapsed:.3f}s ({urls_per_sec:,.0f} URLs/sec)")

        benchmark("Original Filter:", original)
        benchmark("Optimized Filter:", optimized)

    # Run tests
    print("Running Pattern Filter Tests...")
    accuracy_passed = run_accuracy_test()

    if not accuracy_passed:
        print("\n❌ Some accuracy tests failed!")
    else:
        print("\n✨ All accuracy tests passed!")
        run_speed_test()
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run the benchmark suite. The pattern-filter accuracy
    # and speed checks are available but left disabled.
    run_performance_test()
    # test_pattern_filter()
|
||||
1204
crawl4ai/deep_crawl/scorers.py
Normal file
1204
crawl4ai/deep_crawl/scorers.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,3 +1,4 @@
|
||||
from __future__ import annotations
|
||||
from pydantic import BaseModel, HttpUrl
|
||||
from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
|
||||
from enum import Enum
|
||||
@@ -5,6 +6,7 @@ from dataclasses import dataclass
|
||||
from .ssl_certificate import SSLCertificate
|
||||
from datetime import datetime
|
||||
from datetime import timedelta
|
||||
from math import inf
|
||||
|
||||
|
||||
###############################
|
||||
@@ -95,6 +97,18 @@ class DispatchResult(BaseModel):
|
||||
error_message: str = ""
|
||||
|
||||
|
||||
@dataclass
class TraversalStats:
    """Statistics for the traversal process"""

    # Required: no default, must be supplied by the caller.
    start_time: datetime

    # URL counters, all starting at zero.
    urls_processed: int = 0
    urls_failed: int = 0
    urls_skipped: int = 0

    # Depth tracking, both starting at zero.
    total_depth_reached: int = 0
    current_depth: int = 0
|
||||
|
||||
|
||||
class CrawlResult(BaseModel):
|
||||
url: str
|
||||
html: str
|
||||
@@ -118,11 +132,14 @@ class CrawlResult(BaseModel):
|
||||
ssl_certificate: Optional[SSLCertificate] = None
|
||||
dispatch_result: Optional[DispatchResult] = None
|
||||
redirected_url: Optional[str] = None
|
||||
# Attributes for position
|
||||
depth: Optional[int] = None
|
||||
score: Optional[float] = -inf
|
||||
parent_url: Optional[str] = None
|
||||
|
||||
class Config:
|
||||
arbitrary_types_allowed = True
|
||||
|
||||
|
||||
class AsyncCrawlResponse(BaseModel):
|
||||
html: str
|
||||
response_headers: Dict[str, str]
|
||||
@@ -132,7 +149,7 @@ class AsyncCrawlResponse(BaseModel):
|
||||
get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None
|
||||
downloaded_files: Optional[List[str]] = None
|
||||
ssl_certificate: Optional[SSLCertificate] = None
|
||||
final_url: Optional[str] = None
|
||||
redirected_url: Optional[str] = None
|
||||
|
||||
class Config:
|
||||
arbitrary_types_allowed = True
|
||||
@@ -161,12 +178,12 @@ class Link(BaseModel):
|
||||
|
||||
class Media(BaseModel):
    """Container grouping ``MediaItem`` entries by kind: images, videos, audios.

    Each field is declared once; the earlier duplicate ``videos``/``audios``
    declarations were redundant because the later ones replaced them.
    """

    images: List[MediaItem] = []
    # Using MediaItem model for now, can be extended with Video model if needed
    videos: List[MediaItem] = []
    # Using MediaItem model for now, can be extended with Audio model if needed
    audios: List[MediaItem] = []
|
||||
|
||||
|
||||
class Links(BaseModel):
|
||||
|
||||
@@ -2,8 +2,146 @@ import random
|
||||
from typing import Optional, Literal, List, Dict, Tuple
|
||||
import re
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
import random
|
||||
from fake_useragent import UserAgent
|
||||
import requests
|
||||
from lxml import html
|
||||
import json
|
||||
from typing import Optional, List, Union, Dict
|
||||
|
||||
class UserAgentGenerator:
|
||||
class UAGen(ABC):
    """Interface for user-agent generators, plus a client-hints helper."""

    @abstractmethod
    def generate(self,
                 browsers: Optional[List[str]] = None,
                 os: Optional[Union[str, List[str]]] = None,
                 min_version: float = 0.0,
                 platforms: Optional[Union[str, List[str]]] = None,
                 pct_threshold: Optional[float] = None,
                 fallback: str = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36") -> Union[str, Dict]:
        pass

    @staticmethod
    def generate_client_hints(user_agent: str) -> str:
        """Generate Sec-CH-UA header value based on user agent string

        Chrome-based agents yield Chromium, a placeholder brand and either
        Microsoft Edge or Google Chrome; Firefox yields an empty quoted value;
        Safari yields its own brand plus the placeholder; anything else yields
        an empty string.
        """
        # Browser name -> regex capturing its major version number.
        version_patterns = {
            "chrome": r"Chrome/(\d+)",
            "edge": r"Edg/(\d+)",
            "safari": r"Version/(\d+)",
            "firefox": r"Firefox/(\d+)",
        }
        detected = {}
        for browser, pattern in version_patterns.items():
            found = re.search(pattern, user_agent)
            if found:
                detected[browser] = found.group(1)

        placeholder = '"Not_A Brand";v="8"'

        if "chrome" in detected:
            major = detected["chrome"]
            if "edge" in detected:
                brand = f'"Microsoft Edge";v="{detected["edge"]}"'
            else:
                brand = f'"Google Chrome";v="{major}"'
            return ", ".join([f'"Chromium";v="{major}"', placeholder, brand])

        if "firefox" in detected:
            # Firefox doesn't typically send Sec-CH-UA
            return '""'

        if "safari" in detected:
            # Safari's format for client hints
            return ", ".join([f'"Safari";v="{detected["safari"]}"', placeholder])

        return ", ".join([])
|
||||
|
||||
class ValidUAGenerator(UAGen):
    """User-agent generator backed by the ``UserAgent`` class."""

    def __init__(self):
        self.ua = UserAgent()

    def generate(self,
                 browsers: Optional[List[str]] = None,
                 os: Optional[Union[str, List[str]]] = None,
                 min_version: float = 0.0,
                 platforms: Optional[Union[str, List[str]]] = None,
                 pct_threshold: Optional[float] = None,
                 fallback: str = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36") -> str:
        """Return a random user agent string matching the given constraints.

        A fresh ``UserAgent`` is built on every call with the requested
        filters (falling back to desktop Chrome/Firefox/Edge on Windows or
        Mac OS X). ``pct_threshold`` is accepted for interface compatibility
        but is not used here.
        """
        options = {
            "browsers": browsers or ['Chrome', 'Firefox', 'Edge'],
            "os": os or ['Windows', 'Mac OS X'],
            "min_version": min_version,
            "platforms": platforms or ['desktop'],
            "fallback": fallback,
        }
        self.ua = UserAgent(**options)
        return self.ua.random
|
||||
|
||||
class OnlineUAGenerator(UAGen):
    """User-agent generator fed by the list published on useragents.me.

    The entries are used as dicts with at least a ``'ua'`` and a ``'pct'``
    key (assumed from how they are indexed here — verify against the feed).
    """

    def __init__(self):
        self.agents = []
        # Best effort: a failed download leaves the list empty.
        self._fetch_agents()

    def _fetch_agents(self):
        """Download the page and load the JSON held in its desktop textarea.

        Any failure (network, HTTP status, missing element, bad JSON) is
        printed and swallowed; ``self.agents`` is then left unchanged.
        """
        textarea_selector = '#most-common-desktop-useragents-json-csv > div:nth-child(1) > textarea'
        try:
            page = requests.get(
                'https://www.useragents.me/',
                timeout=5,
                headers={'Accept': 'text/html,application/xhtml+xml'},
            )
            page.raise_for_status()

            document = html.fromstring(page.content)
            textarea = document.cssselect(textarea_selector)[0]
            self.agents = json.loads(textarea.text)
        except Exception as e:
            print(f"Error fetching agents: {e}")

    def generate(
        self,
        browsers: Optional[List[str]] = None,
        os: Optional[Union[str, List[str]]] = None,
        min_version: float = 0.0,
        platforms: Optional[Union[str, List[str]]] = None,
        pct_threshold: Optional[float] = None,
        fallback: str = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36",
    ) -> Dict:
        """Return the first downloaded entry that passes every filter.

        Args:
            browsers: Keep entries whose ``'ua'`` contains any of these
                names (case-insensitive substring match).
            os: Same matching rule; a single string or a list.
            min_version: Accepted for interface compatibility; this
                implementation does not use it.
            platforms: Same matching rule; a single string or a list.
            pct_threshold: Keep entries whose ``'pct'`` is at least this
                value; a falsy value disables the check.
            fallback: User-agent string used when nothing is left.

        Returns:
            The first remaining entry, or ``{'ua': fallback, 'pct': 0}``
            when the filtered list is empty.
        """
        # Retry the download if the previous attempt produced nothing.
        if not self.agents:
            self._fetch_agents()

        candidates = self.agents

        if pct_threshold:
            candidates = [
                entry for entry in candidates if entry['pct'] >= pct_threshold
            ]

        # Collect the substring filters in their original order:
        # browsers, then operating systems, then platforms.
        needle_groups = []
        if browsers:
            needle_groups.append(browsers)
        if os:
            needle_groups.append([os] if isinstance(os, str) else os)
        if platforms:
            needle_groups.append(
                [platforms] if isinstance(platforms, str) else platforms
            )

        for needles in needle_groups:
            candidates = [
                entry
                for entry in candidates
                if any(needle.lower() in entry['ua'].lower() for needle in needles)
            ]

        if not candidates:
            return {'ua': fallback, 'pct': 0}
        return candidates[0]
|
||||
|
||||
|
||||
|
||||
class UserAgentGenerator():
|
||||
"""
|
||||
Generate random user agents with specified constraints.
|
||||
|
||||
@@ -187,9 +325,15 @@ class UserAgentGenerator:
|
||||
browser_stack = self.get_browser_stack(num_browsers)
|
||||
|
||||
# Add appropriate legacy token based on browser stack
|
||||
if "Firefox" in str(browser_stack):
|
||||
if "Firefox" in str(browser_stack) or browser_type == "firefox":
|
||||
components.append(random.choice(self.rendering_engines["gecko"]))
|
||||
elif "Chrome" in str(browser_stack) or "Safari" in str(browser_stack):
|
||||
elif "Chrome" in str(browser_stack) or "Safari" in str(browser_stack) or browser_type == "chrome":
|
||||
components.append(self.rendering_engines["chrome_webkit"])
|
||||
components.append("(KHTML, like Gecko)")
|
||||
elif "Edge" in str(browser_stack) or browser_type == "edge":
|
||||
components.append(self.rendering_engines["safari_webkit"])
|
||||
components.append("(KHTML, like Gecko)")
|
||||
elif "Safari" in str(browser_stack) or browser_type == "safari":
|
||||
components.append(self.rendering_engines["chrome_webkit"])
|
||||
components.append("(KHTML, like Gecko)")
|
||||
|
||||
@@ -273,27 +417,13 @@ class UserAgentGenerator:
|
||||
|
||||
# Example usage:
|
||||
if __name__ == "__main__":
|
||||
generator = UserAgentGenerator()
|
||||
print(generator.generate())
|
||||
|
||||
# Usage example:
|
||||
generator = ValidUAGenerator()
|
||||
ua = generator.generate()
|
||||
print(ua)
|
||||
|
||||
generator = OnlineUAGenerator()
|
||||
ua = generator.generate()
|
||||
print(ua)
|
||||
|
||||
print("\nSingle browser (Chrome):")
|
||||
print(generator.generate(num_browsers=1, browser_type="chrome"))
|
||||
|
||||
print("\nTwo browsers (Gecko/Firefox):")
|
||||
print(generator.generate(num_browsers=2))
|
||||
|
||||
print("\nThree browsers (Chrome/Safari/Edge):")
|
||||
print(generator.generate(num_browsers=3))
|
||||
|
||||
print("\nFirefox on Linux:")
|
||||
print(
|
||||
generator.generate(
|
||||
device_type="desktop",
|
||||
os_type="linux",
|
||||
browser_type="firefox",
|
||||
num_browsers=2,
|
||||
)
|
||||
)
|
||||
|
||||
print("\nChrome/Safari/Edge on Windows:")
|
||||
print(generator.generate(device_type="desktop", os_type="windows", num_browsers=3))
|
||||
|
||||
244
docs/deep_crawl/bfs_traversal_strategy.md
Normal file
244
docs/deep_crawl/bfs_traversal_strategy.md
Normal file
@@ -0,0 +1,244 @@
|
||||
# BFS Scraper Strategy: Smart Web Traversal
|
||||
|
||||
The BFS (Breadth-First Search) Scraper Strategy provides an intelligent way to traverse websites systematically. It crawls websites level by level, ensuring thorough coverage while respecting web crawling etiquette.
|
||||
|
||||
```mermaid
|
||||
flowchart TB
|
||||
Start([Start]) --> Init[Initialize BFS Strategy]
|
||||
Init --> InitStats[Initialize CrawlStats]
|
||||
InitStats --> InitQueue[Initialize Priority Queue]
|
||||
InitQueue --> AddStart[Add Start URL to Queue]
|
||||
|
||||
AddStart --> CheckState{Queue Not Empty or\nTasks Pending?}
|
||||
CheckState -->|No| Cleanup[Cleanup & Stats]
|
||||
Cleanup --> End([End])
|
||||
|
||||
CheckState -->|Yes| CheckCancel{Cancel\nRequested?}
|
||||
CheckCancel -->|Yes| Cleanup
|
||||
|
||||
CheckCancel -->|No| CheckConcurrent{Under Max\nConcurrent?}
|
||||
|
||||
CheckConcurrent -->|No| WaitComplete[Wait for Task Completion]
|
||||
WaitComplete --> YieldResult[Yield Result]
|
||||
YieldResult --> CheckState
|
||||
|
||||
CheckConcurrent -->|Yes| GetNextURL[Get Next URL from Queue]
|
||||
|
||||
GetNextURL --> ValidateURL{Already\nVisited?}
|
||||
ValidateURL -->|Yes| CheckState
|
||||
|
||||
ValidateURL -->|No| ProcessURL[Process URL]
|
||||
|
||||
subgraph URL_Processing [URL Processing]
|
||||
ProcessURL --> CheckValid{URL Valid?}
|
||||
CheckValid -->|No| UpdateStats[Update Skip Stats]
|
||||
|
||||
CheckValid -->|Yes| CheckRobots{Allowed by\nrobots.txt?}
|
||||
CheckRobots -->|No| UpdateRobotStats[Update Robot Stats]
|
||||
|
||||
CheckRobots -->|Yes| ApplyDelay[Apply Politeness Delay]
|
||||
ApplyDelay --> FetchContent[Fetch Content with Rate Limit]
|
||||
|
||||
FetchContent --> CheckError{Error?}
|
||||
CheckError -->|Yes| Retry{Retry\nNeeded?}
|
||||
Retry -->|Yes| FetchContent
|
||||
Retry -->|No| UpdateFailStats[Update Fail Stats]
|
||||
|
||||
CheckError -->|No| ExtractLinks[Extract & Process Links]
|
||||
ExtractLinks --> ScoreURLs[Score New URLs]
|
||||
ScoreURLs --> AddToQueue[Add to Priority Queue]
|
||||
end
|
||||
|
||||
ProcessURL --> CreateTask{Parallel\nProcessing?}
|
||||
CreateTask -->|Yes| AddTask[Add to Pending Tasks]
|
||||
CreateTask -->|No| DirectProcess[Process Directly]
|
||||
|
||||
AddTask --> CheckState
|
||||
DirectProcess --> YieldResult
|
||||
|
||||
UpdateStats --> CheckState
|
||||
UpdateRobotStats --> CheckState
|
||||
UpdateFailStats --> CheckState
|
||||
|
||||
classDef process fill:#90caf9,stroke:#000,stroke-width:2px;
|
||||
classDef decision fill:#fff59d,stroke:#000,stroke-width:2px;
|
||||
classDef error fill:#ef9a9a,stroke:#000,stroke-width:2px;
|
||||
classDef stats fill:#a5d6a7,stroke:#000,stroke-width:2px;
|
||||
|
||||
class Start,End stats;
|
||||
class CheckState,CheckCancel,CheckConcurrent,ValidateURL,CheckValid,CheckRobots,CheckError,Retry,CreateTask decision;
|
||||
class UpdateStats,UpdateRobotStats,UpdateFailStats,InitStats,Cleanup stats;
|
||||
class ProcessURL,FetchContent,ExtractLinks,ScoreURLs process;
|
||||
```
|
||||
|
||||
## How It Works
|
||||
|
||||
The BFS strategy crawls a website by:
|
||||
1. Starting from a root URL
|
||||
2. Processing all URLs at the current depth
|
||||
3. Moving to URLs at the next depth level
|
||||
4. Continuing until maximum depth is reached
|
||||
|
||||
This ensures systematic coverage of the website while maintaining control over the crawling process.
|
||||
|
||||
## Key Features
|
||||
|
||||
### 1. Smart URL Processing
|
||||
```python
|
||||
strategy = BFSScraperStrategy(
|
||||
max_depth=2,
|
||||
filter_chain=my_filters,
|
||||
url_scorer=my_scorer,
|
||||
max_concurrent=5
|
||||
)
|
||||
```
|
||||
- Controls crawl depth
|
||||
- Filters unwanted URLs
|
||||
- Scores URLs for priority
|
||||
- Manages concurrent requests
|
||||
|
||||
### 2. Polite Crawling
|
||||
The strategy automatically implements web crawling best practices:
|
||||
- Respects robots.txt
|
||||
- Implements rate limiting
|
||||
- Adds politeness delays
|
||||
- Manages concurrent requests
|
||||
|
||||
### 3. Link Processing Control
|
||||
```python
|
||||
strategy = BFSScraperStrategy(
|
||||
...,
|
||||
process_external_links=False # Only process internal links
|
||||
)
|
||||
```
|
||||
- Control whether to follow external links
|
||||
- Default: internal links only
|
||||
- Enable external links when needed
|
||||
|
||||
## Configuration Options
|
||||
|
||||
| Parameter | Description | Default |
|
||||
|-----------|-------------|---------|
|
||||
| max_depth | Maximum crawl depth | Required |
|
||||
| filter_chain | URL filtering rules | Required |
|
||||
| url_scorer | URL priority scoring | Required |
|
||||
| max_concurrent | Max parallel requests | 5 |
|
||||
| min_crawl_delay | Seconds between requests | 1 |
|
||||
| process_external_links | Follow external links | False |
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Set Appropriate Depth**
|
||||
- Start with smaller depths (2-3)
|
||||
- Increase based on needs
|
||||
- Consider site structure
|
||||
|
||||
2. **Configure Filters**
|
||||
- Use URL patterns
|
||||
- Filter by content type
|
||||
- Avoid unwanted sections
|
||||
|
||||
3. **Tune Performance**
|
||||
- Adjust max_concurrent
|
||||
- Set appropriate delays
|
||||
- Monitor resource usage
|
||||
|
||||
4. **Handle External Links**
|
||||
- Keep process_external_links=False for focused crawls
|
||||
- Enable only when needed
|
||||
- Consider additional filtering
|
||||
|
||||
## Example Usage
|
||||
|
||||
```python
|
||||
from crawl4ai.scraper import BFSScraperStrategy
|
||||
from crawl4ai.scraper.filters import FilterChain
|
||||
from crawl4ai.scraper.scorers import BasicURLScorer
|
||||
|
||||
# Configure strategy
|
||||
strategy = BFSScraperStrategy(
|
||||
max_depth=3,
|
||||
filter_chain=FilterChain([
|
||||
URLPatternFilter("*.example.com/*"),
|
||||
ContentTypeFilter(["text/html"])
|
||||
]),
|
||||
url_scorer=BasicURLScorer(),
|
||||
max_concurrent=5,
|
||||
min_crawl_delay=1,
|
||||
process_external_links=False
|
||||
)
|
||||
|
||||
# Use with AsyncWebScraper
|
||||
scraper = AsyncWebScraper(crawler, strategy)
|
||||
results = await scraper.ascrape("https://example.com")
|
||||
```
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
### 1. Site Mapping
|
||||
```python
|
||||
strategy = BFSScraperStrategy(
|
||||
max_depth=5,
|
||||
filter_chain=site_filter,
|
||||
url_scorer=depth_scorer,
|
||||
process_external_links=False
|
||||
)
|
||||
```
|
||||
Perfect for creating complete site maps or understanding site structure.
|
||||
|
||||
### 2. Content Aggregation
|
||||
```python
|
||||
strategy = BFSScraperStrategy(
|
||||
max_depth=2,
|
||||
filter_chain=content_filter,
|
||||
url_scorer=relevance_scorer,
|
||||
max_concurrent=3
|
||||
)
|
||||
```
|
||||
Ideal for collecting specific types of content (articles, products, etc.).
|
||||
|
||||
### 3. Link Analysis
|
||||
```python
|
||||
strategy = BFSScraperStrategy(
|
||||
max_depth=1,
|
||||
filter_chain=link_filter,
|
||||
url_scorer=link_scorer,
|
||||
process_external_links=True
|
||||
)
|
||||
```
|
||||
Useful for analyzing both internal and external link structures.
|
||||
|
||||
## Advanced Features
|
||||
|
||||
### Progress Monitoring
|
||||
```python
|
||||
async for result in scraper.ascrape(url):
|
||||
print(f"Current depth: {strategy.stats.current_depth}")
|
||||
print(f"Processed URLs: {strategy.stats.urls_processed}")
|
||||
```
|
||||
|
||||
### Custom URL Scoring
|
||||
```python
|
||||
class CustomScorer(URLScorer):
|
||||
def score(self, url: str) -> float:
|
||||
# Lower scores = higher priority
|
||||
return score_based_on_criteria(url)
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
1. **Slow Crawling**
|
||||
- Increase max_concurrent
|
||||
- Adjust min_crawl_delay
|
||||
- Check network conditions
|
||||
|
||||
2. **Missing Content**
|
||||
- Verify max_depth
|
||||
- Check filter settings
|
||||
- Review URL patterns
|
||||
|
||||
3. **High Resource Usage**
|
||||
- Reduce max_concurrent
|
||||
- Increase crawl delay
|
||||
- Add more specific filters
|
||||
|
||||
260
docs/deep_crawl/deep_crawl_quickstart.py
Normal file
260
docs/deep_crawl/deep_crawl_quickstart.py
Normal file
@@ -0,0 +1,260 @@
|
||||
from crawl4ai.async_configs import CrawlerRunConfig, BrowserConfig
|
||||
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
|
||||
from crawl4ai.deep_crawl import (
|
||||
BFSDeepCrawlStrategy,
|
||||
FilterChain,
|
||||
URLPatternFilter,
|
||||
ContentTypeFilter,
|
||||
DomainFilter,
|
||||
KeywordRelevanceScorer,
|
||||
PathDepthScorer,
|
||||
FreshnessScorer,
|
||||
CompositeScorer,
|
||||
)
|
||||
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
||||
import re
|
||||
import time
|
||||
import logging
|
||||
|
||||
# Browser settings shared by every example in this file.
browser_config = BrowserConfig(
    headless=True,
    viewport_width=800,
    viewport_height=600,
)
|
||||
|
||||
|
||||
async def basic_example():
    """
    Basic example: deep crawl the crawl4ai docs and print each fetched page

    - Keeps only URLs that pass the ``*/basic/*`` pattern filter
    - Keeps only pages that pass the ``text/html`` content-type filter
    - Limits the traversal to a depth of 2
    - Waits for the crawl to finish and receives all results at once
    """
    # Pattern filter first, then the content-type filter.
    url_filters = FilterChain(
        [
            URLPatternFilter("*/basic/*"),
            ContentTypeFilter(["text/html"]),
        ]
    )

    # No scorer is supplied; external links are processed as well.
    strategy = BFSDeepCrawlStrategy(
        max_depth=2,
        filter_chain=url_filters,
        url_scorer=None,
        process_external_links=True,
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        try:
            run_config = CrawlerRunConfig(deep_crawl_strategy=strategy)
            pages = await crawler.arun("https://crawl4ai.com/mkdocs", run_config)

            print(f"Crawled {len(pages)} pages:")
            for page in pages:
                print(f"- {page.url}: {len(page.html)} bytes")
        except Exception as e:
            # Example code: report the failure instead of propagating it.
            print(f"Error during scraping: {e}")
|
||||
|
||||
|
||||
async def advanced_example():
    """
    Advanced example: streamed, scored deep crawl of techcrunch.com

    - Combines domain, URL-pattern and content-type filters
    - Ranks URLs with a composite of keyword, path-depth and freshness scorers
    - Consumes results as they arrive (``stream=True``)
    - Logs per-page outcomes, periodic progress and final statistics
    """
    logging.basicConfig(level=logging.INFO)
    log = logging.getLogger("advanced_deep_crawler")

    # --- Filters -----------------------------------------------------------
    domain_rule = DomainFilter(
        allowed_domains=["techcrunch.com"],
        blocked_domains=["login.techcrunch.com", "legal.yahoo.com"],
    )
    path_rule = URLPatternFilter(
        [
            "*/article/*",
            "*/news/*",
            "*/blog/*",
            re.compile(r"\d{4}/\d{2}/.*"),  # Date-based URLs
        ]
    )
    type_rule = ContentTypeFilter(["text/html", "application/xhtml+xml"])
    filter_chain = FilterChain([domain_rule, path_rule, type_rule])

    # --- Scoring -----------------------------------------------------------
    keyword_scorer = KeywordRelevanceScorer(
        keywords=["news", "breaking", "update", "latest"], weight=1.0
    )
    depth_scorer = PathDepthScorer(optimal_depth=3, weight=0.7)
    freshness_scorer = FreshnessScorer(weight=0.9)
    scorer = CompositeScorer([keyword_scorer, depth_scorer, freshness_scorer])

    strategy = BFSDeepCrawlStrategy(
        max_depth=2, filter_chain=filter_chain, url_scorer=scorer
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Running totals reported in the ``finally`` block below.
        processed = 0
        errors = 0
        total_size = 0

        try:
            collected = []
            stream = await crawler.arun(
                "https://techcrunch.com",
                config=CrawlerRunConfig(deep_crawl_strategy=strategy, stream=True),
            )
            async for page in stream:
                processed += 1

                if not page.success:
                    errors += 1
                    log.error(f"Failed to process {page.url}: {page.error_message}")
                else:
                    total_size += len(page.html)
                    log.info(
                        f"Processed at depth: {page.depth} with score: {page.score:.3f} : \n {page.url}"
                    )
                    collected.append(page)

                # Emit a progress line after every tenth result.
                if processed % 10 == 0:
                    log.info(f"Progress: {processed} URLs processed")

        except Exception as e:
            log.error(f"Scraping error: {e}")

        finally:
            # The summary is printed even when the crawl stopped on an error.
            log.info("Scraping completed:")
            log.info(f"- URLs processed: {processed}")
            log.info(f"- Errors: {errors}")
            log.info(f"- Total content size: {total_size / 1024:.2f} KB")

            for url_filter in filter_chain.filters:
                log.info(f"{url_filter.name} stats:")
                log.info(f"- Passed: {url_filter.stats.passed_urls}")
                log.info(f"- Rejected: {url_filter.stats.rejected_urls}")

            log.info("Scoring statistics:")
            log.info(f"- Average score: {scorer.stats.average_score:.2f}")
            log.info(
                f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}"
            )
|
||||
|
||||
|
||||
async def basic_example_many_urls():
    """
    Deep crawl several start URLs with ``arun_many`` and print every page

    - Keeps only URLs that pass the ``*/basic/*`` pattern filter
    - Keeps only pages that pass the ``text/html`` content-type filter
    - Limits the traversal to a depth of 2 and skips external links
    - Collects all results before printing

    ``arun_many`` is consumed here as one iterable of results per start URL,
    so the results are flattened before counting; otherwise the "pages" total
    would report the number of start URLs instead.
    """
    filter_chain = FilterChain(
        [
            URLPatternFilter("*/basic/*"),
            ContentTypeFilter(["text/html"]),
        ]
    )
    # Initialize the strategy with basic configuration
    bfs_strategy = BFSDeepCrawlStrategy(
        max_depth=2,  # Only go 2 levels deep
        filter_chain=filter_chain,
        url_scorer=None,  # No scorer supplied
        process_external_links=False,
    )

    # Create the crawler
    async with AsyncWebCrawler(
        config=browser_config,
    ) as crawler:
        # Start scraping
        try:
            results = await crawler.arun_many(
                urls=["https://crawl4ai.com/mkdocs", "https://aravindkarnam.com"],
                config=CrawlerRunConfig(deep_crawl_strategy=bfs_strategy),
            )
            # Flatten the per-start-URL groups so the count reflects pages.
            pages = [page for url_result in results for page in url_result]
            print(f"Crawled {len(pages)} pages:")
            for page in pages:
                print(f"- {page.url}: {len(page.html)} bytes")

        except Exception as e:
            # Example code: report the failure instead of propagating it.
            print(f"Error during scraping: {e}")
|
||||
|
||||
async def basic_example_many_urls_stream():
    """
    Deep crawl several start URLs with ``arun_many`` in streaming mode

    - Keeps only URLs that pass the ``*/basic/*`` pattern filter
    - Keeps only pages that pass the ``text/html`` content-type filter
    - Limits the traversal to a depth of 2 and skips external links
    - Prints each result as soon as the stream yields it
    """
    url_filters = FilterChain(
        [
            URLPatternFilter("*/basic/*"),
            ContentTypeFilter(["text/html"]),
        ]
    )

    # No scorer is supplied; only internal links are followed.
    strategy = BFSDeepCrawlStrategy(
        max_depth=2,
        filter_chain=url_filters,
        url_scorer=None,
        process_external_links=False,
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        try:
            # With stream=True the awaited call is iterated asynchronously.
            stream = await crawler.arun_many(
                urls=["https://crawl4ai.com/mkdocs", "https://aravindkarnam.com"],
                config=CrawlerRunConfig(deep_crawl_strategy=strategy, stream=True),
            )
            async for page in stream:
                print(f"- {page.url}: {len(page.html)} bytes")
        except Exception as e:
            # Example code: report the failure instead of propagating it.
            print(f"Error during scraping: {e}")
|
||||
|
||||
if __name__ == "__main__":
    # ``time`` is already imported at the top of this file; only asyncio is
    # needed locally.
    import asyncio

    # Run basic example (the only one that is timed)
    start_time = time.perf_counter()
    print("Running basic Deep crawl example...")
    asyncio.run(basic_example())
    end_time = time.perf_counter()
    print(f"Basic deep crawl example completed in {end_time - start_time:.2f} seconds")

    # Run advanced example
    print("\nRunning advanced deep crawl example...")
    asyncio.run(advanced_example())

    # The two arun_many demos reuse the *basic* configuration, so the
    # progress messages name them accordingly.
    print("\nRunning basic deep crawl example with arun_many...")
    asyncio.run(basic_example_many_urls())

    print("\nRunning basic deep crawl example with arun_many streaming enabled...")
    asyncio.run(basic_example_many_urls_stream())
|
||||
342
docs/deep_crawl/filters_scrorers.md
Normal file
342
docs/deep_crawl/filters_scrorers.md
Normal file
@@ -0,0 +1,342 @@
|
||||
# URL Filters and Scorers
|
||||
|
||||
The crawl4ai library provides powerful URL filtering and scoring capabilities that help you control and prioritize your web crawling. This guide explains how to use these features effectively.
|
||||
|
||||
```mermaid
|
||||
flowchart TB
|
||||
Start([URL Input]) --> Chain[Filter Chain]
|
||||
|
||||
subgraph Chain Process
|
||||
Chain --> Pattern{URL Pattern\nFilter}
|
||||
Pattern -->|Match| Content{Content Type\nFilter}
|
||||
Pattern -->|No Match| Reject1[Reject URL]
|
||||
|
||||
Content -->|Allowed| Domain{Domain\nFilter}
|
||||
Content -->|Not Allowed| Reject2[Reject URL]
|
||||
|
||||
Domain -->|Allowed| Accept[Accept URL]
|
||||
Domain -->|Blocked| Reject3[Reject URL]
|
||||
end
|
||||
|
||||
subgraph Statistics
|
||||
Pattern --> UpdatePattern[Update Pattern Stats]
|
||||
Content --> UpdateContent[Update Content Stats]
|
||||
Domain --> UpdateDomain[Update Domain Stats]
|
||||
Accept --> UpdateChain[Update Chain Stats]
|
||||
Reject1 --> UpdateChain
|
||||
Reject2 --> UpdateChain
|
||||
Reject3 --> UpdateChain
|
||||
end
|
||||
|
||||
Accept --> End([End])
|
||||
Reject1 --> End
|
||||
Reject2 --> End
|
||||
Reject3 --> End
|
||||
|
||||
classDef process fill:#90caf9,stroke:#000,stroke-width:2px;
|
||||
classDef decision fill:#fff59d,stroke:#000,stroke-width:2px;
|
||||
classDef reject fill:#ef9a9a,stroke:#000,stroke-width:2px;
|
||||
classDef accept fill:#a5d6a7,stroke:#000,stroke-width:2px;
|
||||
|
||||
class Start,End accept;
|
||||
class Pattern,Content,Domain decision;
|
||||
class Reject1,Reject2,Reject3 reject;
|
||||
class Chain,UpdatePattern,UpdateContent,UpdateDomain,UpdateChain process;
|
||||
```
|
||||
|
||||
## URL Filters
|
||||
|
||||
URL filters help you control which URLs are crawled. Multiple filters can be chained together to create sophisticated filtering rules.
|
||||
|
||||
### Available Filters
|
||||
|
||||
1. **URL Pattern Filter**
|
||||
```python
|
||||
pattern_filter = URLPatternFilter([
|
||||
"*.example.com/*", # Glob pattern
|
||||
"*/article/*", # Path pattern
|
||||
re.compile(r"blog-\d+") # Regex pattern
|
||||
])
|
||||
```
|
||||
- Supports glob patterns and regex
|
||||
- Multiple patterns per filter
|
||||
- Pattern pre-compilation for performance
|
||||
|
||||
2. **Content Type Filter**
|
||||
```python
|
||||
content_filter = ContentTypeFilter([
|
||||
"text/html",
|
||||
"application/pdf"
|
||||
], check_extension=True)
|
||||
```
|
||||
- Filter by MIME types
|
||||
- Extension checking
|
||||
- Support for multiple content types
|
||||
|
||||
3. **Domain Filter**
|
||||
```python
|
||||
domain_filter = DomainFilter(
|
||||
allowed_domains=["example.com", "blog.example.com"],
|
||||
blocked_domains=["ads.example.com"]
|
||||
)
|
||||
```
|
||||
- Allow/block specific domains
|
||||
- Subdomain support
|
||||
- Efficient domain matching
|
||||
|
||||
### Creating Filter Chains
|
||||
|
||||
```python
|
||||
# Create and configure a filter chain
|
||||
filter_chain = FilterChain([
|
||||
URLPatternFilter(["*.example.com/*"]),
|
||||
ContentTypeFilter(["text/html"]),
|
||||
DomainFilter(blocked_domains=["ads.*"])
|
||||
])
|
||||
|
||||
# Add more filters
|
||||
filter_chain.add_filter(
|
||||
URLPatternFilter(["*/article/*"])
|
||||
)
|
||||
```
|
||||
|
||||
```mermaid
|
||||
flowchart TB
|
||||
Start([URL Input]) --> Composite[Composite Scorer]
|
||||
|
||||
subgraph Scoring Process
|
||||
Composite --> Keywords[Keyword Relevance]
|
||||
Composite --> Path[Path Depth]
|
||||
Composite --> Content[Content Type]
|
||||
Composite --> Fresh[Freshness]
|
||||
Composite --> Domain[Domain Authority]
|
||||
|
||||
Keywords --> KeywordScore[Calculate Score]
|
||||
Path --> PathScore[Calculate Score]
|
||||
Content --> ContentScore[Calculate Score]
|
||||
Fresh --> FreshScore[Calculate Score]
|
||||
Domain --> DomainScore[Calculate Score]
|
||||
|
||||
KeywordScore --> Weight1[Apply Weight]
|
||||
PathScore --> Weight2[Apply Weight]
|
||||
ContentScore --> Weight3[Apply Weight]
|
||||
FreshScore --> Weight4[Apply Weight]
|
||||
DomainScore --> Weight5[Apply Weight]
|
||||
end
|
||||
|
||||
Weight1 --> Combine[Combine Scores]
|
||||
Weight2 --> Combine
|
||||
Weight3 --> Combine
|
||||
Weight4 --> Combine
|
||||
Weight5 --> Combine
|
||||
|
||||
Combine --> Normalize{Normalize?}
|
||||
Normalize -->|Yes| NormalizeScore[Normalize Combined Score]
|
||||
Normalize -->|No| FinalScore[Final Score]
|
||||
NormalizeScore --> FinalScore
|
||||
|
||||
FinalScore --> Stats[Update Statistics]
|
||||
Stats --> End([End])
|
||||
|
||||
classDef process fill:#90caf9,stroke:#000,stroke-width:2px;
|
||||
classDef scorer fill:#fff59d,stroke:#000,stroke-width:2px;
|
||||
classDef calc fill:#a5d6a7,stroke:#000,stroke-width:2px;
|
||||
classDef decision fill:#ef9a9a,stroke:#000,stroke-width:2px;
|
||||
|
||||
class Start,End calc;
|
||||
class Keywords,Path,Content,Fresh,Domain scorer;
|
||||
class KeywordScore,PathScore,ContentScore,FreshScore,DomainScore process;
|
||||
class Normalize decision;
|
||||
```
|
||||
|
||||
## URL Scorers
|
||||
|
||||
URL scorers help prioritize which URLs to crawl first. Higher scores indicate higher priority.
|
||||
|
||||
### Available Scorers
|
||||
|
||||
1. **Keyword Relevance Scorer**
|
||||
```python
|
||||
keyword_scorer = KeywordRelevanceScorer(
|
||||
keywords=["python", "programming"],
|
||||
weight=1.0,
|
||||
case_sensitive=False
|
||||
)
|
||||
```
|
||||
- Score based on keyword matches
|
||||
- Case sensitivity options
|
||||
- Weighted scoring
|
||||
|
||||
2. **Path Depth Scorer**
|
||||
```python
|
||||
path_scorer = PathDepthScorer(
|
||||
optimal_depth=3, # Preferred URL depth
|
||||
weight=0.7
|
||||
)
|
||||
```
|
||||
- Score based on URL path depth
|
||||
- Configurable optimal depth
|
||||
- Diminishing returns for deeper paths
|
||||
|
||||
3. **Content Type Scorer**
|
||||
```python
|
||||
content_scorer = ContentTypeScorer({
|
||||
r'\.html$': 1.0,
|
||||
r'\.pdf$': 0.8,
|
||||
r'\.xml$': 0.6
|
||||
})
|
||||
```
|
||||
- Score based on file types
|
||||
- Configurable type weights
|
||||
- Pattern matching support
|
||||
|
||||
4. **Freshness Scorer**
|
||||
```python
|
||||
freshness_scorer = FreshnessScorer(weight=0.9)
|
||||
```
|
||||
- Score based on date indicators in URLs
|
||||
- Multiple date format support
|
||||
- Recency weighting
|
||||
|
||||
5. **Domain Authority Scorer**
|
||||
```python
|
||||
authority_scorer = DomainAuthorityScorer({
|
||||
"python.org": 1.0,
|
||||
"github.com": 0.9,
|
||||
"medium.com": 0.7
|
||||
})
|
||||
```
|
||||
- Score based on domain importance
|
||||
- Configurable domain weights
|
||||
- Default weight for unknown domains
|
||||
|
||||
### Combining Scorers
|
||||
|
||||
```python
|
||||
# Create a composite scorer
|
||||
composite_scorer = CompositeScorer([
|
||||
KeywordRelevanceScorer(["python"], weight=1.0),
|
||||
PathDepthScorer(optimal_depth=2, weight=0.7),
|
||||
FreshnessScorer(weight=0.8)
|
||||
], normalize=True)
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
### Filter Configuration
|
||||
|
||||
1. **Start Restrictive**
|
||||
```python
|
||||
# Begin with strict filters
|
||||
filter_chain = FilterChain([
|
||||
DomainFilter(allowed_domains=["example.com"]),
|
||||
ContentTypeFilter(["text/html"])
|
||||
])
|
||||
```
|
||||
|
||||
2. **Layer Filters**
|
||||
```python
|
||||
# Add more specific filters
|
||||
filter_chain.add_filter(
|
||||
URLPatternFilter(["*/article/*", "*/blog/*"])
|
||||
)
|
||||
```
|
||||
|
||||
3. **Monitor Filter Statistics**
|
||||
```python
|
||||
# Check filter performance
|
||||
for filter in filter_chain.filters:
|
||||
print(f"{filter.name}: {filter.stats.rejected_urls} rejected")
|
||||
```
|
||||
|
||||
### Scorer Configuration
|
||||
|
||||
1. **Balance Weights**
|
||||
```python
|
||||
# Balanced scoring configuration
|
||||
scorer = create_balanced_scorer()
|
||||
```
|
||||
|
||||
2. **Customize for Content**
|
||||
```python
|
||||
# News site configuration
|
||||
news_scorer = CompositeScorer([
|
||||
KeywordRelevanceScorer(["news", "article"], weight=1.0),
|
||||
FreshnessScorer(weight=1.0),
|
||||
PathDepthScorer(optimal_depth=2, weight=0.5)
|
||||
])
|
||||
```
|
||||
|
||||
3. **Monitor Scoring Statistics**
|
||||
```python
|
||||
# Check scoring distribution
|
||||
print(f"Average score: {scorer.stats.average_score}")
|
||||
print(f"Score range: {scorer.stats.min_score} - {scorer.stats.max_score}")
|
||||
```
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
### Blog Crawling
|
||||
```python
|
||||
blog_config = {
|
||||
'filters': FilterChain([
|
||||
URLPatternFilter(["*/blog/*", "*/post/*"]),
|
||||
ContentTypeFilter(["text/html"])
|
||||
]),
|
||||
'scorer': CompositeScorer([
|
||||
FreshnessScorer(weight=1.0),
|
||||
KeywordRelevanceScorer(["blog", "article"], weight=0.8)
|
||||
])
|
||||
}
|
||||
```
|
||||
|
||||
### Documentation Sites
|
||||
```python
|
||||
docs_config = {
|
||||
'filters': FilterChain([
|
||||
URLPatternFilter(["*/docs/*", "*/guide/*"]),
|
||||
ContentTypeFilter(["text/html", "application/pdf"])
|
||||
]),
|
||||
'scorer': CompositeScorer([
|
||||
PathDepthScorer(optimal_depth=3, weight=1.0),
|
||||
KeywordRelevanceScorer(["guide", "tutorial"], weight=0.9)
|
||||
])
|
||||
}
|
||||
```
|
||||
|
||||
### E-commerce Sites
|
||||
```python
|
||||
ecommerce_config = {
|
||||
'filters': FilterChain([
|
||||
URLPatternFilter(["*/product/*", "*/category/*"]),
|
||||
DomainFilter(blocked_domains=["ads.*", "tracker.*"])
|
||||
]),
|
||||
'scorer': CompositeScorer([
|
||||
PathDepthScorer(optimal_depth=2, weight=1.0),
|
||||
ContentTypeScorer({
|
||||
r'/product/': 1.0,
|
||||
r'/category/': 0.8
|
||||
})
|
||||
])
|
||||
}
|
||||
```
|
||||
|
||||
## Advanced Topics
|
||||
|
||||
### Custom Filters
|
||||
```python
|
||||
class CustomFilter(URLFilter):
|
||||
def apply(self, url: str) -> bool:
|
||||
# Your custom filtering logic
|
||||
return True
|
||||
```
|
||||
|
||||
### Custom Scorers
|
||||
```python
|
||||
class CustomScorer(URLScorer):
|
||||
def _calculate_score(self, url: str) -> float:
|
||||
# Your custom scoring logic
|
||||
return 1.0
|
||||
```
|
||||
|
||||
For more examples, check our [example repository](https://github.com/example/crawl4ai/examples).
|
||||
206
docs/deep_crawl/how_to_use.md
Normal file
206
docs/deep_crawl/how_to_use.md
Normal file
@@ -0,0 +1,206 @@
|
||||
# Scraper Examples Guide
|
||||
|
||||
This guide provides two complete examples of using the crawl4ai scraper: a basic implementation for simple use cases and an advanced implementation showcasing all features.
|
||||
|
||||
## Basic Example
|
||||
|
||||
The basic example demonstrates a simple blog scraping scenario:
|
||||
|
||||
```python
|
||||
from crawl4ai.scraper import AsyncWebScraper, BFSScraperStrategy, FilterChain
|
||||
|
||||
# Create simple filter chain
|
||||
filter_chain = FilterChain([
|
||||
URLPatternFilter("*/blog/*"),
|
||||
ContentTypeFilter(["text/html"])
|
||||
])
|
||||
|
||||
# Initialize strategy
|
||||
strategy = BFSScraperStrategy(
|
||||
max_depth=2,
|
||||
filter_chain=filter_chain,
|
||||
url_scorer=None,
|
||||
max_concurrent=3
|
||||
)
|
||||
|
||||
# Create and run scraper
|
||||
crawler = AsyncWebCrawler()
|
||||
scraper = AsyncWebScraper(crawler, strategy)
|
||||
result = await scraper.ascrape("https://example.com/blog/")
|
||||
```
|
||||
|
||||
### Features Demonstrated
|
||||
- Basic URL filtering
|
||||
- Simple content type filtering
|
||||
- Depth control
|
||||
- Concurrent request limiting
|
||||
- Result collection
|
||||
|
||||
## Advanced Example
|
||||
|
||||
The advanced example shows a sophisticated news site scraping setup with all features enabled:
|
||||
|
||||
```python
|
||||
# Create comprehensive filter chain
|
||||
filter_chain = FilterChain([
|
||||
DomainFilter(
|
||||
allowed_domains=["example.com"],
|
||||
blocked_domains=["ads.example.com"]
|
||||
),
|
||||
URLPatternFilter([
|
||||
"*/article/*",
|
||||
re.compile(r"\d{4}/\d{2}/.*")
|
||||
]),
|
||||
ContentTypeFilter(["text/html"])
|
||||
])
|
||||
|
||||
# Create intelligent scorer
|
||||
scorer = CompositeScorer([
|
||||
KeywordRelevanceScorer(
|
||||
keywords=["news", "breaking"],
|
||||
weight=1.0
|
||||
),
|
||||
PathDepthScorer(optimal_depth=3, weight=0.7),
|
||||
FreshnessScorer(weight=0.9)
|
||||
])
|
||||
|
||||
# Initialize advanced strategy
|
||||
strategy = BFSScraperStrategy(
|
||||
max_depth=4,
|
||||
filter_chain=filter_chain,
|
||||
url_scorer=scorer,
|
||||
max_concurrent=5
|
||||
)
|
||||
```
|
||||
|
||||
### Features Demonstrated
|
||||
1. **Advanced Filtering**
|
||||
- Domain filtering
|
||||
- Pattern matching
|
||||
- Content type control
|
||||
|
||||
2. **Intelligent Scoring**
|
||||
- Keyword relevance
|
||||
- Path optimization
|
||||
- Freshness priority
|
||||
|
||||
3. **Monitoring**
|
||||
- Progress tracking
|
||||
- Error handling
|
||||
- Statistics collection
|
||||
|
||||
4. **Resource Management**
|
||||
- Concurrent processing
|
||||
- Rate limiting
|
||||
- Cleanup handling
|
||||
|
||||
## Running the Examples
|
||||
|
||||
```bash
|
||||
# Basic usage
|
||||
python basic_scraper_example.py
|
||||
|
||||
# Advanced usage with logging
|
||||
PYTHONPATH=. python advanced_scraper_example.py
|
||||
```
|
||||
|
||||
## Example Output
|
||||
|
||||
### Basic Example
|
||||
```
|
||||
Crawled 15 pages:
|
||||
- https://example.com/blog/post1: 24560 bytes
|
||||
- https://example.com/blog/post2: 18920 bytes
|
||||
...
|
||||
```
|
||||
|
||||
### Advanced Example
|
||||
```
|
||||
INFO: Starting crawl of https://example.com/news/
|
||||
INFO: Processed: https://example.com/news/breaking/story1
|
||||
DEBUG: KeywordScorer: 0.85
|
||||
DEBUG: FreshnessScorer: 0.95
|
||||
INFO: Progress: 10 URLs processed
|
||||
...
|
||||
INFO: Scraping completed:
|
||||
INFO: - URLs processed: 50
|
||||
INFO: - Errors: 2
|
||||
INFO: - Total content size: 1240.50 KB
|
||||
```
|
||||
|
||||
## Customization
|
||||
|
||||
### Adding Custom Filters
|
||||
```python
|
||||
class CustomFilter(URLFilter):
|
||||
def apply(self, url: str) -> bool:
|
||||
# Your custom filtering logic
|
||||
return True
|
||||
|
||||
filter_chain.add_filter(CustomFilter())
|
||||
```
|
||||
|
||||
### Custom Scoring Logic
|
||||
```python
|
||||
class CustomScorer(URLScorer):
|
||||
def _calculate_score(self, url: str) -> float:
|
||||
# Your custom scoring logic
|
||||
return 1.0
|
||||
|
||||
scorer = CompositeScorer([
|
||||
CustomScorer(weight=1.0),
|
||||
...
|
||||
])
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Start Simple**
|
||||
- Begin with basic filtering
|
||||
- Add features incrementally
|
||||
- Test thoroughly at each step
|
||||
|
||||
2. **Monitor Performance**
|
||||
- Watch memory usage
|
||||
- Track processing times
|
||||
- Adjust concurrency as needed
|
||||
|
||||
3. **Handle Errors**
|
||||
- Implement proper error handling
|
||||
- Log important events
|
||||
- Track error statistics
|
||||
|
||||
4. **Optimize Resources**
|
||||
- Set appropriate delays
|
||||
- Limit concurrent requests
|
||||
- Use streaming for large crawls
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
Common issues and solutions:
|
||||
|
||||
1. **Too Many Requests**
|
||||
```python
|
||||
strategy = BFSScraperStrategy(
|
||||
max_concurrent=3, # Reduce concurrent requests
|
||||
min_crawl_delay=2 # Increase delay between requests
|
||||
)
|
||||
```
|
||||
|
||||
2. **Memory Issues**
|
||||
```python
|
||||
# Use streaming mode for large crawls
|
||||
async for result in scraper.ascrape(url, stream=True):
|
||||
process_result(result)
|
||||
```
|
||||
|
||||
3. **Missing Content**
|
||||
```python
|
||||
# Check your filter chain
|
||||
filter_chain = FilterChain([
|
||||
URLPatternFilter("*"), # Broaden patterns
|
||||
ContentTypeFilter(["*"]) # Accept all content
|
||||
])
|
||||
```
|
||||
|
||||
For more examples and use cases, visit our [GitHub repository](https://github.com/example/crawl4ai/examples).
|
||||
@@ -112,19 +112,19 @@ def create_performance_table(results):
|
||||
|
||||
|
||||
async def main():
|
||||
urls = [f"https://example.com/page{i}" for i in range(1, 20)]
|
||||
urls = [f"https://example.com/page{i}" for i in range(1, 40)]
|
||||
browser_config = BrowserConfig(headless=True, verbose=False)
|
||||
run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, scraping_strategy=LXMLWebScrapingStrategy())
|
||||
|
||||
results = {
|
||||
"Memory Adaptive": await memory_adaptive(urls, browser_config, run_config),
|
||||
"Memory Adaptive + Rate Limit": await memory_adaptive_with_rate_limit(
|
||||
urls, browser_config, run_config
|
||||
),
|
||||
"Semaphore": await semaphore(urls, browser_config, run_config),
|
||||
"Semaphore + Rate Limit": await semaphore_with_rate_limit(
|
||||
urls, browser_config, run_config
|
||||
),
|
||||
# "Memory Adaptive + Rate Limit": await memory_adaptive_with_rate_limit(
|
||||
# urls, browser_config, run_config
|
||||
# ),
|
||||
# "Semaphore": await semaphore(urls, browser_config, run_config),
|
||||
# "Semaphore + Rate Limit": await semaphore_with_rate_limit(
|
||||
# urls, browser_config, run_config
|
||||
# ),
|
||||
}
|
||||
|
||||
table = create_performance_table(results)
|
||||
|
||||
@@ -117,17 +117,17 @@ def test_scraping():
|
||||
timing_stats.report()
|
||||
|
||||
# Print stats of LXML output
|
||||
print("\nLXML Output:")
|
||||
print(f"\nExtracted links: {len(result_selected['links']['internal']) + len(result_selected['links']['external'])}")
|
||||
print(f"Extracted images: {len(result_selected['media']['images'])}")
|
||||
print(f"Clean HTML size: {len(result_selected['cleaned_html'])/1024:.2f} KB")
|
||||
print("\nTurbo Output:")
|
||||
print(f"\nExtracted links: {len(result_selected.links.internal) + len(result_selected.links.external)}")
|
||||
print(f"Extracted images: {len(result_selected.media.images)}")
|
||||
print(f"Clean HTML size: {len(result_selected.cleaned_html)/1024:.2f} KB")
|
||||
print(f"Scraping time: {t2 - t1:.2f} seconds")
|
||||
|
||||
# Print stats of original output
|
||||
print("\nOriginal Output:")
|
||||
print(f"\nExtracted links: {len(result_original['links']['internal']) + len(result_original['links']['external'])}")
|
||||
print(f"Extracted images: {len(result_original['media']['images'])}")
|
||||
print(f"Clean HTML size: {len(result_original['cleaned_html'])/1024:.2f} KB")
|
||||
print(f"\nExtracted links: {len(result_original.links.internal) + len(result_original.links.external)}")
|
||||
print(f"Extracted images: {len(result_original.media.images)}")
|
||||
print(f"Clean HTML size: {len(result_original.cleaned_html)/1024:.2f} KB")
|
||||
print(f"Scraping time: {t3 - t1:.2f} seconds")
|
||||
|
||||
|
||||
|
||||
@@ -1,56 +1,99 @@
|
||||
"""
|
||||
Crawl4ai v0.4.3 Features Demo
|
||||
Crawl4ai v0.4.3b2 Features Demo
|
||||
============================
|
||||
|
||||
This example demonstrates the major new features introduced in Crawl4ai v0.4.3.
|
||||
Each section showcases a specific feature with practical examples and explanations.
|
||||
This demonstration showcases three major categories of new features in Crawl4ai v0.4.3:
|
||||
|
||||
1. Efficiency & Speed:
|
||||
- Memory-efficient dispatcher strategies
|
||||
- New scraping algorithm
|
||||
- Streaming support for batch crawling
|
||||
|
||||
2. LLM Integration:
|
||||
- Automatic schema generation
|
||||
- LLM-powered content filtering
|
||||
- Smart markdown generation
|
||||
|
||||
3. Core Improvements:
|
||||
- Robots.txt compliance
|
||||
- Proxy rotation
|
||||
- Enhanced URL handling
|
||||
- Shared data among hooks
|
||||
- Page routes
|
||||
|
||||
Each demo function can be run independently or as part of the full suite.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
from crawl4ai import *
|
||||
import json
|
||||
import re
|
||||
import random
|
||||
from typing import Optional, Dict
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
CacheMode,
|
||||
DisplayMode,
|
||||
MemoryAdaptiveDispatcher,
|
||||
CrawlerMonitor,
|
||||
DefaultMarkdownGenerator,
|
||||
LXMLWebScrapingStrategy,
|
||||
JsonCssExtractionStrategy,
|
||||
LLMContentFilter
|
||||
)
|
||||
|
||||
|
||||
async def demo_memory_dispatcher():
|
||||
"""Demonstrates the new memory-efficient dispatcher system.
|
||||
|
||||
Key Features:
|
||||
- Adaptive memory management
|
||||
- Real-time performance monitoring
|
||||
- Concurrent session control
|
||||
"""
|
||||
1. Memory Dispatcher System Demo
|
||||
===============================
|
||||
Shows how to use the new memory dispatcher with monitoring
|
||||
"""
|
||||
print("\n=== 1. Memory Dispatcher System Demo ===")
|
||||
|
||||
# Configure crawler
|
||||
browser_config = BrowserConfig(headless=True, verbose=True)
|
||||
crawler_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS, markdown_generator=DefaultMarkdownGenerator()
|
||||
)
|
||||
|
||||
# Test URLs
|
||||
urls = ["http://example.com", "http://example.org", "http://example.net"] * 3
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
# Initialize dispatcher with monitoring
|
||||
monitor = CrawlerMonitor(
|
||||
max_visible_rows=10,
|
||||
display_mode=DisplayMode.DETAILED, # Can be DETAILED or AGGREGATED
|
||||
print("\n=== Memory Dispatcher Demo ===")
|
||||
|
||||
try:
|
||||
# Configuration
|
||||
browser_config = BrowserConfig(headless=True, verbose=False)
|
||||
crawler_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
markdown_generator=DefaultMarkdownGenerator()
|
||||
)
|
||||
|
||||
dispatcher = MemoryAdaptiveDispatcher(
|
||||
memory_threshold_percent=80.0, # Memory usage threshold
|
||||
check_interval=0.5, # How often to check memory
|
||||
max_session_permit=5, # Max concurrent crawls
|
||||
monitor=monitor, # Pass the monitor
|
||||
)
|
||||
|
||||
# Run with memory monitoring
|
||||
print("Starting batch crawl with memory monitoring...")
|
||||
results = await dispatcher.run_urls(
|
||||
urls=urls,
|
||||
crawler=crawler,
|
||||
config=crawler_config,
|
||||
)
|
||||
print(f"Completed {len(results)} URLs")
|
||||
# Test URLs
|
||||
urls = ["http://example.com", "http://example.org", "http://example.net"] * 3
|
||||
|
||||
print("\n📈 Initializing crawler with memory monitoring...")
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
monitor = CrawlerMonitor(
|
||||
max_visible_rows=10,
|
||||
display_mode=DisplayMode.DETAILED
|
||||
)
|
||||
|
||||
dispatcher = MemoryAdaptiveDispatcher(
|
||||
memory_threshold_percent=80.0,
|
||||
check_interval=0.5,
|
||||
max_session_permit=5,
|
||||
monitor=monitor
|
||||
)
|
||||
|
||||
print("\n🚀 Starting batch crawl...")
|
||||
results = await crawler.arun_many(
|
||||
urls=urls,
|
||||
config=crawler_config,
|
||||
dispatcher=dispatcher
|
||||
)
|
||||
print(f"\n✅ Completed {len(results)} URLs successfully")
|
||||
|
||||
except Exception as e:
|
||||
print(f"\n❌ Error in memory dispatcher demo: {str(e)}")
|
||||
|
||||
async def demo_streaming_support():
|
||||
"""
|
||||
@@ -60,7 +103,7 @@ async def demo_streaming_support():
|
||||
"""
|
||||
print("\n=== 2. Streaming Support Demo ===")
|
||||
|
||||
browser_config = BrowserConfig(headless=True, verbose=True)
|
||||
browser_config = BrowserConfig(headless=True, verbose=False)
|
||||
crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=True)
|
||||
|
||||
# Test URLs
|
||||
@@ -71,16 +114,17 @@ async def demo_streaming_support():
|
||||
dispatcher = MemoryAdaptiveDispatcher(max_session_permit=3, check_interval=0.5)
|
||||
|
||||
print("Starting streaming crawl...")
|
||||
async for result in dispatcher.run_urls_stream(
|
||||
urls=urls, crawler=crawler, config=crawler_config
|
||||
async for result in await crawler.arun_many(
|
||||
urls=urls,
|
||||
config=crawler_config,
|
||||
dispatcher=dispatcher
|
||||
):
|
||||
# Process each result as it arrives
|
||||
print(
|
||||
f"Received result for {result.url} - Success: {result.result.success}"
|
||||
f"Received result for {result.url} - Success: {result.success}"
|
||||
)
|
||||
if result.result.success:
|
||||
print(f"Content length: {len(result.result.markdown)}")
|
||||
|
||||
if result.success:
|
||||
print(f"Content length: {len(result.markdown)}")
|
||||
|
||||
async def demo_content_scraping():
|
||||
"""
|
||||
@@ -94,7 +138,10 @@ async def demo_content_scraping():
|
||||
url = "https://example.com/article"
|
||||
|
||||
# Configure with the new LXML strategy
|
||||
config = CrawlerRunConfig(scraping_strategy=LXMLWebScrapingStrategy(), verbose=True)
|
||||
config = CrawlerRunConfig(
|
||||
scraping_strategy=LXMLWebScrapingStrategy(),
|
||||
verbose=True
|
||||
)
|
||||
|
||||
print("Scraping content with LXML strategy...")
|
||||
async with crawler:
|
||||
@@ -102,7 +149,6 @@ async def demo_content_scraping():
|
||||
if result.success:
|
||||
print("Successfully scraped content using LXML strategy")
|
||||
|
||||
|
||||
async def demo_llm_markdown():
|
||||
"""
|
||||
4. LLM-Powered Markdown Generation Demo
|
||||
@@ -153,7 +199,6 @@ async def demo_llm_markdown():
|
||||
print(result.markdown_v2.fit_markdown[:500])
|
||||
print("Successfully generated LLM-filtered markdown")
|
||||
|
||||
|
||||
async def demo_robots_compliance():
|
||||
"""
|
||||
5. Robots.txt Compliance Demo
|
||||
@@ -177,9 +222,7 @@ async def demo_robots_compliance():
|
||||
elif result.success:
|
||||
print(f"Successfully crawled: {result.url}")
|
||||
|
||||
|
||||
|
||||
async def demo_llm_schema_generation():
|
||||
async def demo_json_schema_generation():
|
||||
"""
|
||||
7. LLM-Powered Schema Generation Demo
|
||||
=================================
|
||||
@@ -232,21 +275,77 @@ async def demo_llm_schema_generation():
|
||||
print(json.dumps(result.extracted_content, indent=2) if result.extracted_content else None)
|
||||
print("Successfully used generated schema for crawling")
|
||||
|
||||
async def demo_proxy_rotation():
|
||||
"""
|
||||
8. Proxy Rotation Demo
|
||||
===================
|
||||
Demonstrates how to rotate proxies for each request using Crawl4ai.
|
||||
"""
|
||||
print("\n=== 8. Proxy Rotation Demo ===")
|
||||
|
||||
async def get_next_proxy(proxy_file: str = f"proxies.txt") -> Optional[Dict]:
|
||||
"""Pick a random proxy from the comma-separated PROXIES environment variable (each entry formatted as ip:port:username:password)"""
|
||||
try:
|
||||
proxies = os.getenv("PROXIES", "").split(",")
|
||||
|
||||
ip, port, username, password = random.choice(proxies).split(":")
|
||||
return {
|
||||
"server": f"http://{ip}:{port}",
|
||||
"username": username,
|
||||
"password": password,
|
||||
"ip": ip # Store original IP for verification
|
||||
}
|
||||
except Exception as e:
|
||||
print(f"Error loading proxy: {e}")
|
||||
return None
|
||||
|
||||
# Create 2 test requests to httpbin
|
||||
urls = ["https://httpbin.org/ip"] * 2
|
||||
|
||||
browser_config = BrowserConfig(headless=True, verbose=False)
|
||||
run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
for url in urls:
|
||||
proxy = await get_next_proxy()
|
||||
if not proxy:
|
||||
print("No proxy available, skipping...")
|
||||
continue
|
||||
|
||||
# Create new config with proxy
|
||||
current_config = run_config.clone(proxy_config=proxy, user_agent="")
|
||||
result = await crawler.arun(url=url, config=current_config)
|
||||
|
||||
if result.success:
|
||||
ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
|
||||
print(f"Proxy {proxy['ip']} -> Response IP: {ip_match.group(0) if ip_match else 'Not found'}")
|
||||
verified = ip_match.group(0) == proxy['ip']
|
||||
if verified:
|
||||
print(f"✅ Proxy working! IP matches: {proxy['ip']}")
|
||||
else:
|
||||
print(f"❌ Proxy failed or IP mismatch!")
|
||||
else:
|
||||
print(f"Failed with proxy {proxy['ip']}")
|
||||
|
||||
async def main():
|
||||
"""Run all feature demonstrations."""
|
||||
demo_memory_dispatcher(),
|
||||
print("\n" + "=" * 50 + "\n")
|
||||
demo_streaming_support(),
|
||||
print("\n" + "=" * 50 + "\n")
|
||||
demo_content_scraping(),
|
||||
print("\n" + "=" * 50 + "\n")
|
||||
demo_llm_schema_generation(),
|
||||
print("\n" + "=" * 50 + "\n")
|
||||
demo_llm_markdown(),
|
||||
print("\n" + "=" * 50 + "\n")
|
||||
demo_robots_compliance(),
|
||||
print("\n" + "=" * 50 + "\n")
|
||||
print("\n📊 Running Crawl4ai v0.4.3 Feature Demos\n")
|
||||
|
||||
# Efficiency & Speed Demos
|
||||
print("\n🚀 EFFICIENCY & SPEED DEMOS")
|
||||
await demo_memory_dispatcher()
|
||||
await demo_streaming_support()
|
||||
await demo_content_scraping()
|
||||
|
||||
# # LLM Integration Demos
|
||||
print("\n🤖 LLM INTEGRATION DEMOS")
|
||||
await demo_json_schema_generation()
|
||||
await demo_llm_markdown()
|
||||
|
||||
# # Core Improvements
|
||||
print("\n🔧 CORE IMPROVEMENT DEMOS")
|
||||
await demo_robots_compliance()
|
||||
await demo_proxy_rotation()
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -5,16 +5,20 @@
|
||||
## 1. Introduction
|
||||
|
||||
When crawling many URLs:
|
||||
|
||||
- **Basic**: Use `arun()` in a loop (simple but less efficient)
|
||||
- **Better**: Use `arun_many()`, which efficiently handles multiple URLs with proper concurrency control
|
||||
- **Best**: Customize dispatcher behavior for your specific needs (memory management, rate limits, etc.)
|
||||
|
||||
**Why Dispatchers?**
|
||||
|
||||
- **Adaptive**: Memory-based dispatchers can pause or slow down based on system resources
|
||||
- **Rate-limiting**: Built-in rate limiting with exponential backoff for 429/503 responses
|
||||
- **Real-time Monitoring**: Live dashboard of ongoing tasks, memory usage, and performance
|
||||
- **Flexibility**: Choose between memory-adaptive or semaphore-based concurrency
|
||||
|
||||
---
|
||||
|
||||
## 2. Core Components
|
||||
|
||||
### 2.1 Rate Limiter
|
||||
@@ -22,34 +26,116 @@ When crawling many URLs:
|
||||
```python
|
||||
class RateLimiter:
|
||||
def __init__(
|
||||
base_delay: Tuple[float, float] = (1.0, 3.0), # Random delay range between requests
|
||||
max_delay: float = 60.0, # Maximum backoff delay
|
||||
max_retries: int = 3, # Retries before giving up
|
||||
rate_limit_codes: List[int] = [429, 503] # Status codes triggering backoff
|
||||
# Random delay range between requests
|
||||
base_delay: Tuple[float, float] = (1.0, 3.0),
|
||||
|
||||
# Maximum backoff delay
|
||||
max_delay: float = 60.0,
|
||||
|
||||
# Retries before giving up
|
||||
max_retries: int = 3,
|
||||
|
||||
# Status codes triggering backoff
|
||||
rate_limit_codes: List[int] = [429, 503]
|
||||
)
|
||||
```
|
||||
|
||||
The RateLimiter provides:
|
||||
- Random delays between requests
|
||||
- Exponential backoff on rate limit responses
|
||||
- Domain-specific rate limiting
|
||||
- Automatic retry handling
|
||||
This section explains the **RateLimiter**, focusing on its constructor parameters.
|
||||
|
||||
#### RateLimiter Constructor Parameters
|
||||
|
||||
The **RateLimiter** is a utility that helps manage the pace of requests to avoid overloading servers or getting blocked due to rate limits. It operates internally to delay requests and handle retries but can be configured using its constructor parameters.
|
||||
|
||||
**Parameters of the `RateLimiter` constructor:**
|
||||
|
||||
1. **`base_delay`** (`Tuple[float, float]`, default: `(1.0, 3.0)`)
|
||||
The range for a random delay (in seconds) between consecutive requests to the same domain.
|
||||
|
||||
- A random delay is chosen between `base_delay[0]` and `base_delay[1]` for each request.
|
||||
- This prevents sending requests at a predictable frequency, reducing the chances of triggering rate limits.
|
||||
|
||||
**Example:**
|
||||
If `base_delay = (2.0, 5.0)`, delays could be randomly chosen as `2.3s`, `4.1s`, etc.
|
||||
|
||||
---
|
||||
|
||||
2. **`max_delay`** (`float`, default: `60.0`)
|
||||
The maximum allowable delay when rate-limiting errors occur.
|
||||
|
||||
- When servers return rate-limit responses (e.g., 429 or 503), the delay increases exponentially with jitter.
|
||||
- The `max_delay` ensures the delay doesn’t grow unreasonably high, capping it at this value.
|
||||
|
||||
**Example:**
|
||||
For a `max_delay = 30.0`, even if backoff calculations suggest a delay of `45s`, it will cap at `30s`.
|
||||
|
||||
---
|
||||
|
||||
3. **`max_retries`** (`int`, default: `3`)
|
||||
The maximum number of retries for a request if rate-limiting errors occur.
|
||||
|
||||
- After encountering a rate-limit response, the `RateLimiter` retries the request up to this number of times.
|
||||
- If all retries fail, the request is marked as failed, and the process continues.
|
||||
|
||||
**Example:**
|
||||
If `max_retries = 3`, the system retries a failed request three times before giving up.
|
||||
|
||||
---
|
||||
|
||||
4. **`rate_limit_codes`** (`List[int]`, default: `[429, 503]`)
|
||||
A list of HTTP status codes that trigger the rate-limiting logic.
|
||||
|
||||
- These status codes indicate the server is overwhelmed or actively limiting requests.
|
||||
- You can customize this list to include other codes based on specific server behavior.
|
||||
|
||||
**Example:**
|
||||
If `rate_limit_codes = [429, 503, 504]`, the crawler will back off on these three error codes.
|
||||
|
||||
---
|
||||
|
||||
**How to Use the `RateLimiter`:**
|
||||
|
||||
Here’s an example of initializing and using a `RateLimiter` in your project:
|
||||
|
||||
```python
|
||||
from crawl4ai import RateLimiter
|
||||
|
||||
# Create a RateLimiter with custom settings
|
||||
rate_limiter = RateLimiter(
|
||||
base_delay=(2.0, 4.0), # Random delay between 2-4 seconds
|
||||
max_delay=30.0, # Cap delay at 30 seconds
|
||||
max_retries=5, # Retry up to 5 times on rate-limiting errors
|
||||
rate_limit_codes=[429, 503] # Handle these HTTP status codes
|
||||
)
|
||||
|
||||
# RateLimiter will handle delays and retries internally
|
||||
# No additional setup is required for its operation
|
||||
```
|
||||
|
||||
The `RateLimiter` integrates seamlessly with dispatchers like `MemoryAdaptiveDispatcher` and `SemaphoreDispatcher`, ensuring requests are paced correctly without user intervention. Its internal mechanisms manage delays and retries to avoid overwhelming servers while maximizing efficiency.
|
||||
|
||||
|
||||
### 2.2 Crawler Monitor
|
||||
|
||||
The CrawlerMonitor provides real-time visibility into crawling operations:
|
||||
|
||||
```python
|
||||
from crawl4ai import CrawlerMonitor, DisplayMode
|
||||
monitor = CrawlerMonitor(
|
||||
max_visible_rows=15, # Maximum rows in live display
|
||||
display_mode=DisplayMode.DETAILED # DETAILED or AGGREGATED view
|
||||
# Maximum rows in live display
|
||||
max_visible_rows=15,
|
||||
|
||||
# DETAILED or AGGREGATED view
|
||||
display_mode=DisplayMode.DETAILED
|
||||
)
|
||||
```
|
||||
|
||||
**Display Modes**:
|
||||
|
||||
1. **DETAILED**: Shows individual task status, memory usage, and timing
|
||||
2. **AGGREGATED**: Displays summary statistics and overall progress
|
||||
|
||||
---
|
||||
|
||||
## 3. Available Dispatchers
|
||||
|
||||
### 3.1 MemoryAdaptiveDispatcher (Default)
|
||||
@@ -57,6 +143,8 @@ monitor = CrawlerMonitor(
|
||||
Automatically manages concurrency based on system memory usage:
|
||||
|
||||
```python
|
||||
from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher
|
||||
|
||||
dispatcher = MemoryAdaptiveDispatcher(
|
||||
memory_threshold_percent=90.0, # Pause if memory exceeds this
|
||||
check_interval=1.0, # How often to check memory
|
||||
@@ -73,13 +161,37 @@ dispatcher = MemoryAdaptiveDispatcher(
|
||||
)
|
||||
```
|
||||
|
||||
**Constructor Parameters:**
|
||||
|
||||
1. **`memory_threshold_percent`** (`float`, default: `90.0`)
|
||||
Specifies the memory usage threshold (as a percentage). If system memory usage exceeds this value, the dispatcher pauses crawling to prevent system overload.
|
||||
|
||||
2. **`check_interval`** (`float`, default: `1.0`)
|
||||
The interval (in seconds) at which the dispatcher checks system memory usage.
|
||||
|
||||
3. **`max_session_permit`** (`int`, default: `10`)
|
||||
The maximum number of concurrent crawling tasks allowed. This ensures resource limits are respected while maintaining concurrency.
|
||||
|
||||
4. **`memory_wait_timeout`** (`float`, default: `300.0`)
|
||||
Optional timeout (in seconds). If memory usage exceeds `memory_threshold_percent` for longer than this duration, a `MemoryError` is raised.
|
||||
|
||||
5. **`rate_limiter`** (`RateLimiter`, default: `None`)
|
||||
Optional rate-limiting logic to avoid server-side blocking (e.g., for handling 429 or 503 errors). See **RateLimiter** for details.
|
||||
|
||||
6. **`monitor`** (`CrawlerMonitor`, default: `None`)
|
||||
Optional monitoring for real-time task tracking and performance insights. See **CrawlerMonitor** for details.
|
||||
|
||||
---
|
||||
|
||||
### 3.2 SemaphoreDispatcher
|
||||
|
||||
Provides simple concurrency control with a fixed limit:
|
||||
|
||||
```python
|
||||
from crawl4ai.async_dispatcher import SemaphoreDispatcher
|
||||
|
||||
dispatcher = SemaphoreDispatcher(
|
||||
max_session_permit=5, # Fixed concurrent tasks
|
||||
max_session_permit=20, # Maximum concurrent tasks
|
||||
rate_limiter=RateLimiter( # Optional rate limiting
|
||||
base_delay=(0.5, 1.0),
|
||||
max_delay=10.0
|
||||
@@ -91,6 +203,19 @@ dispatcher = SemaphoreDispatcher(
|
||||
)
|
||||
```
|
||||
|
||||
**Constructor Parameters:**
|
||||
|
||||
1. **`max_session_permit`** (`int`, default: `20`)
|
||||
The maximum number of concurrent crawling tasks allowed, irrespective of semaphore slots.
|
||||
|
||||
2. **`rate_limiter`** (`RateLimiter`, default: `None`)
|
||||
Optional rate-limiting logic to avoid overwhelming servers. See **RateLimiter** for details.
|
||||
|
||||
3. **`monitor`** (`CrawlerMonitor`, default: `None`)
|
||||
Optional monitoring for tracking task progress and resource usage. See **CrawlerMonitor** for details.
|
||||
|
||||
---
|
||||
|
||||
## 4. Usage Examples
|
||||
|
||||
### 4.1 Batch Processing (Default)
|
||||
@@ -128,6 +253,14 @@ async def crawl_batch():
|
||||
print(f"Failed to crawl {result.url}: {result.error_message}")
|
||||
```
|
||||
|
||||
**Review:**
|
||||
- **Purpose:** Executes a batch crawl and returns all results together once crawling is complete.
|
||||
- **Dispatcher:** Uses `MemoryAdaptiveDispatcher` to manage concurrency and system memory.
|
||||
- **Stream:** Disabled (`stream=False`), so all results are collected at once for post-processing.
|
||||
- **Best Use Case:** When you need to analyze results in bulk rather than individually during the crawl.
|
||||
|
||||
---
|
||||
|
||||
### 4.2 Streaming Mode
|
||||
|
||||
```python
|
||||
@@ -161,6 +294,14 @@ async def crawl_streaming():
|
||||
print(f"Failed to crawl {result.url}: {result.error_message}")
|
||||
```
|
||||
|
||||
**Review:**
|
||||
- **Purpose:** Enables streaming to process results as soon as they’re available.
|
||||
- **Dispatcher:** Uses `MemoryAdaptiveDispatcher` for concurrency and memory management.
|
||||
- **Stream:** Enabled (`stream=True`), allowing real-time processing during crawling.
|
||||
- **Best Use Case:** When you need to act on results immediately, such as for real-time analytics or progressive data storage.
|
||||
|
||||
---
|
||||
|
||||
### 4.3 Semaphore-based Crawling
|
||||
|
||||
```python
|
||||
@@ -189,6 +330,14 @@ async def crawl_with_semaphore(urls):
|
||||
return results
|
||||
```
|
||||
|
||||
**Review:**
|
||||
- **Purpose:** Uses `SemaphoreDispatcher` to limit concurrency with a fixed number of slots.
|
||||
- **Dispatcher:** Configured with a semaphore to control parallel crawling tasks.
|
||||
- **Rate Limiter:** Prevents servers from being overwhelmed by pacing requests.
|
||||
- **Best Use Case:** When you want precise control over the number of concurrent requests, independent of system memory.
|
||||
|
||||
---
|
||||
|
||||
### 4.4 Robots.txt Consideration
|
||||
|
||||
```python
|
||||
@@ -221,11 +370,13 @@ if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**Key Points**:
|
||||
- When `check_robots_txt=True`, each URL's robots.txt is checked before crawling
|
||||
- Robots.txt files are cached for efficiency
|
||||
- Failed robots.txt checks return 403 status code
|
||||
- Dispatcher handles robots.txt checks automatically for each URL
|
||||
**Review:**
|
||||
- **Purpose:** Ensures compliance with `robots.txt` rules for ethical and legal web crawling.
|
||||
- **Configuration:** Set `check_robots_txt=True` to validate each URL against `robots.txt` before crawling.
|
||||
- **Dispatcher:** Handles requests with concurrency limits (`semaphore_count=3`).
|
||||
- **Best Use Case:** When crawling websites that strictly enforce robots.txt policies or for responsible crawling practices.
|
||||
|
||||
---
|
||||
|
||||
## 5. Dispatch Results
|
||||
|
||||
@@ -255,20 +406,24 @@ for result in results:
|
||||
|
||||
## 6. Summary
|
||||
|
||||
1. **Two Dispatcher Types**:
|
||||
1. **Two Dispatcher Types**:
|
||||
|
||||
- MemoryAdaptiveDispatcher (default): Dynamic concurrency based on memory
|
||||
- SemaphoreDispatcher: Fixed concurrency limit
|
||||
|
||||
2. **Optional Components**:
|
||||
2. **Optional Components**:
|
||||
|
||||
- RateLimiter: Smart request pacing and backoff
|
||||
- CrawlerMonitor: Real-time progress visualization
|
||||
|
||||
3. **Key Benefits**:
|
||||
3. **Key Benefits**:
|
||||
|
||||
- Automatic memory management
|
||||
- Built-in rate limiting
|
||||
- Live progress monitoring
|
||||
- Flexible concurrency control
|
||||
|
||||
Choose the dispatcher that best fits your needs:
|
||||
|
||||
- **MemoryAdaptiveDispatcher**: For large crawls or limited resources
|
||||
- **SemaphoreDispatcher**: For simple, fixed-concurrency scenarios
|
||||
|
||||
@@ -38,7 +38,7 @@ async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
|
||||
Here's the corrected documentation:
|
||||
|
||||
## Rotating Proxies [COMING SOON]
|
||||
## Rotating Proxies
|
||||
|
||||
Example using a proxy rotation service dynamically:
|
||||
|
||||
|
||||
@@ -95,6 +95,10 @@ strong {
|
||||
|
||||
}
|
||||
|
||||
div.highlight {
|
||||
margin-bottom: 2em;
|
||||
}
|
||||
|
||||
.terminal-card > header {
|
||||
color: var(--font-color);
|
||||
text-align: center;
|
||||
@@ -231,6 +235,16 @@ pre {
|
||||
font-size: 2em;
|
||||
}
|
||||
|
||||
.terminal h2 {
|
||||
font-size: 1.5em;
|
||||
margin-bottom: 0.8em;
|
||||
}
|
||||
|
||||
.terminal h3 {
|
||||
font-size: 1.3em;
|
||||
margin-bottom: 0.8em;
|
||||
}
|
||||
|
||||
.terminal h1, .terminal h2, .terminal h3, .terminal h4, .terminal h5, .terminal h6 {
|
||||
text-shadow: 0 0 0px var(--font-color), 0 0 0px var(--font-color), 0 0 0px var(--font-color);
|
||||
}
|
||||
|
||||
137
docs/md_v2/basic/installation.md
Normal file
137
docs/md_v2/basic/installation.md
Normal file
@@ -0,0 +1,137 @@
|
||||
# Installation 💻
|
||||
|
||||
Crawl4AI offers flexible installation options to suit various use cases. You can install it as a Python package, use it with Docker, or run it as a local server.
|
||||
|
||||
## Option 1: Python Package Installation (Recommended)
|
||||
|
||||
Crawl4AI is now available on PyPI, making installation easier than ever. Choose the option that best fits your needs:
|
||||
|
||||
### Basic Installation
|
||||
|
||||
For basic web crawling and scraping tasks:
|
||||
|
||||
```bash
|
||||
pip install crawl4ai
|
||||
playwright install # Download the Playwright browser binaries
|
||||
```
|
||||
|
||||
### Installation with PyTorch
|
||||
|
||||
For advanced text clustering (includes CosineSimilarity cluster strategy):
|
||||
|
||||
```bash
|
||||
pip install crawl4ai[torch]
|
||||
```
|
||||
|
||||
### Installation with Transformers
|
||||
|
||||
For text summarization and Hugging Face models:
|
||||
|
||||
```bash
|
||||
pip install crawl4ai[transformer]
|
||||
```
|
||||
|
||||
### Full Installation
|
||||
|
||||
For all features:
|
||||
|
||||
```bash
|
||||
pip install crawl4ai[all]
|
||||
```
|
||||
|
||||
### Development Installation
|
||||
|
||||
For contributors who plan to modify the source code:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/unclecode/crawl4ai.git
|
||||
cd crawl4ai
|
||||
pip install -e ".[all]"
|
||||
playwright install # Download the Playwright browser binaries
|
||||
```
|
||||
|
||||
💡 After installation with "torch", "transformer", or "all" options, it's recommended to run the following CLI command to load the required models:
|
||||
|
||||
```bash
|
||||
crawl4ai-download-models
|
||||
```
|
||||
|
||||
This is optional but will boost the performance and speed of the crawler. You only need to do this once after installation.
|
||||
|
||||
## Playwright Installation Note for Ubuntu
|
||||
|
||||
If you encounter issues with Playwright installation on Ubuntu, you may need to install additional dependencies:
|
||||
|
||||
```bash
|
||||
sudo apt-get install -y \
|
||||
libwoff1 \
|
||||
libopus0 \
|
||||
libwebp7 \
|
||||
libwebpdemux2 \
|
||||
libenchant-2-2 \
|
||||
libgudev-1.0-0 \
|
||||
libsecret-1-0 \
|
||||
libhyphen0 \
|
||||
libgdk-pixbuf2.0-0 \
|
||||
libegl1 \
|
||||
libnotify4 \
|
||||
libxslt1.1 \
|
||||
libevent-2.1-7 \
|
||||
libgles2 \
|
||||
libxcomposite1 \
|
||||
libatk1.0-0 \
|
||||
libatk-bridge2.0-0 \
|
||||
libepoxy0 \
|
||||
libgtk-3-0 \
|
||||
libharfbuzz-icu0 \
|
||||
libgstreamer-gl1.0-0 \
|
||||
libgstreamer-plugins-bad1.0-0 \
|
||||
gstreamer1.0-plugins-good \
|
||||
gstreamer1.0-plugins-bad \
|
||||
libxt6 \
|
||||
libxaw7 \
|
||||
xvfb \
|
||||
fonts-noto-color-emoji \
|
||||
libfontconfig \
|
||||
libfreetype6 \
|
||||
xfonts-cyrillic \
|
||||
xfonts-scalable \
|
||||
fonts-liberation \
|
||||
fonts-ipafont-gothic \
|
||||
fonts-wqy-zenhei \
|
||||
fonts-tlwg-loma-otf \
|
||||
fonts-freefont-ttf
|
||||
```
|
||||
|
||||
## Option 2: Using Docker (Coming Soon)
|
||||
|
||||
Docker support for Crawl4AI is currently in progress and will be available soon. This will allow you to run Crawl4AI in a containerized environment, ensuring consistency across different systems.
|
||||
|
||||
## Option 3: Local Server Installation
|
||||
|
||||
For those who prefer to run Crawl4AI as a local server, instructions will be provided once the Docker implementation is complete.
|
||||
|
||||
## Verifying Your Installation
|
||||
|
||||
After installation, you can verify that Crawl4AI is working correctly by running a simple Python script:
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
|
||||
async def main():
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
result = await crawler.arun(url="https://www.example.com")
|
||||
print(result.markdown[:500]) # Print first 500 characters
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
This script should successfully crawl the example website and print the first 500 characters of the extracted content.
|
||||
|
||||
## Getting Help
|
||||
|
||||
If you encounter any issues during installation or usage, please check the [documentation](https://docs.crawl4ai.com/) or raise an issue on the [GitHub repository](https://github.com/unclecode/crawl4ai/issues).
|
||||
|
||||
Happy crawling! 🕷️🤖
|
||||
@@ -1,266 +1,138 @@
|
||||
# Crawl4AI 0.4.3b1 is Here: Faster, Smarter, and Ready for Real-World Crawling!
|
||||
# Crawl4AI 0.4.3: Major Performance Boost & LLM Integration
|
||||
|
||||
Hey, Crawl4AI enthusiasts! We're thrilled to announce the release of **Crawl4AI 0.4.3b1**, packed with powerful new features and enhancements that take web crawling to a whole new level of efficiency and intelligence. This release is all about giving you more control, better performance, and deeper insights into your crawled data.
|
||||
We're excited to announce Crawl4AI 0.4.3, focusing on three key areas: Speed & Efficiency, LLM Integration, and Core Platform Improvements. This release significantly improves crawling performance while adding powerful new LLM-powered features.
|
||||
|
||||
Let's dive into what's new!
|
||||
## ⚡ Speed & Efficiency Improvements
|
||||
|
||||
## 🚀 Major Feature Highlights
|
||||
|
||||
### 1. LLM-Powered Schema Generation: Zero to Structured Data in Seconds!
|
||||
|
||||
Tired of manually crafting CSS or XPath selectors? We've got you covered! Crawl4AI now features a revolutionary **schema generator** that uses the power of Large Language Models (LLMs) to automatically create extraction schemas for you.
|
||||
|
||||
**How it Works:**
|
||||
|
||||
1. **Provide HTML**: Feed in a sample HTML snippet that contains the type of data you want to extract (e.g., product listings, article sections).
|
||||
2. **Describe Your Needs (Optional)**: You can provide a natural language query like "extract all product names and prices" to guide the schema creation.
|
||||
3. **Choose Your LLM**: Use either **OpenAI** (GPT-4o recommended) for top-tier accuracy or **Ollama** for a local, open-source option.
|
||||
4. **Get Your Schema**: The tool outputs a ready-to-use JSON schema that works seamlessly with `JsonCssExtractionStrategy` or `JsonXPathExtractionStrategy`.
|
||||
|
||||
**Why You'll Love It:**
|
||||
|
||||
- **No More Tedious Selector Writing**: Let the LLM analyze the HTML and create the selectors for you!
|
||||
- **One-Time Cost**: Schema generation uses LLM, but once you have your schema, subsequent extractions are fast and LLM-free.
|
||||
- **Handles Complex Structures**: The LLM can understand nested elements, lists, and variations in layout—far beyond what simple CSS selectors can achieve.
|
||||
- **Learn by Example**: The generated schemas are a fantastic way to learn best practices for writing your own schemas.
|
||||
|
||||
**Example:**
|
||||
### 1. Memory-Adaptive Dispatcher System
|
||||
The new dispatcher system provides intelligent resource management and real-time monitoring:
|
||||
|
||||
```python
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
|
||||
# Sample HTML snippet (imagine this is part of a product listing page)
|
||||
html = """
|
||||
<div class="product">
|
||||
<h2 class="name">Awesome Gadget</h2>
|
||||
<span class="price">$99.99</span>
|
||||
</div>
|
||||
"""
|
||||
|
||||
# Generate schema using OpenAI
|
||||
schema = JsonCssExtractionStrategy.generate_schema(
|
||||
html,
|
||||
llm_provider="openai/gpt-4o",
|
||||
api_token="YOUR_API_TOKEN"
|
||||
)
|
||||
|
||||
# Or use Ollama for a local, open-source option
|
||||
# schema = JsonCssExtractionStrategy.generate_schema(
|
||||
# html,
|
||||
# llm_provider="ollama/llama3"
|
||||
# )
|
||||
|
||||
print(json.dumps(schema, indent=2))
|
||||
```
|
||||
|
||||
**Output (Schema):**
|
||||
|
||||
```json
|
||||
{
|
||||
"name": null,
|
||||
"baseSelector": "div.product",
|
||||
"fields": [
|
||||
{
|
||||
"name": "name",
|
||||
"selector": "h2.name",
|
||||
"type": "text"
|
||||
},
|
||||
{
|
||||
"name": "price",
|
||||
"selector": "span.price",
|
||||
"type": "text"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
You can now **save** this schema and use it for all your extractions on pages with the same structure. No more LLM costs, just **fast, reliable** data extraction!
|
||||
|
||||
### 2. Robots.txt Compliance: Crawl Responsibly
|
||||
|
||||
Crawl4AI now respects website rules! With the new `check_robots_txt=True` option in `CrawlerRunConfig`, the crawler automatically fetches, parses, and obeys each site's `robots.txt` file.
|
||||
|
||||
**Key Features**:
|
||||
|
||||
- **Efficient Caching**: Stores parsed `robots.txt` files locally for 7 days to avoid re-fetching.
|
||||
- **Automatic Integration**: Works seamlessly with both `arun()` and `arun_many()`.
|
||||
- **Clear Status Codes**: Returns a 403 status code if a URL is disallowed.
|
||||
- **Customizable**: Adjust the cache directory and TTL if needed.
|
||||
|
||||
**Example**:
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DisplayMode
|
||||
from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher, CrawlerMonitor
|
||||
|
||||
async def main():
|
||||
config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.ENABLED,
|
||||
check_robots_txt=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun("https://example.com/private-page", config=config)
|
||||
if result.status_code == 403:
|
||||
print("Access denied by robots.txt")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
### 3. Proxy Support in `CrawlerRunConfig`
|
||||
|
||||
Need more control over your proxy settings? Now you can configure proxies directly within `CrawlerRunConfig` for each crawl:
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
|
||||
async def main():
|
||||
config = CrawlerRunConfig(
|
||||
proxy_config={
|
||||
"server": "http://your-proxy.com:8080",
|
||||
"username": "your_username", # Optional
|
||||
"password": "your_password" # Optional
|
||||
}
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun("https://example.com", config=config)
|
||||
```
|
||||
|
||||
This allows for dynamic proxy assignment per URL or even per request.
|
||||
|
||||
### 4. LLM-Powered Markdown Filtering (Beta)
|
||||
|
||||
We're introducing an experimental **`LLMContentFilter`**! This filter, when used with the `DefaultMarkdownGenerator`, can produce highly focused markdown output by using an LLM to analyze content relevance.
|
||||
|
||||
**How it Works:**
|
||||
|
||||
1. You provide an **instruction** (e.g., "extract only the key technical details").
|
||||
2. The LLM analyzes each section of the page based on your instruction.
|
||||
3. Only the most relevant content is included in the final `fit_markdown`.
|
||||
|
||||
**Example**:
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
|
||||
async def main():
|
||||
llm_filter = LLMContentFilter(
|
||||
provider="openai/gpt-4o",
|
||||
api_token="YOUR_API_TOKEN", # Or use "ollama/llama3" with no token
|
||||
instruction="Extract the core educational content about Python classes."
|
||||
)
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
markdown_generator=DefaultMarkdownGenerator(content_filter=llm_filter)
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
"https://docs.python.org/3/tutorial/classes.html",
|
||||
config=config
|
||||
urls = ["https://example1.com", "https://example2.com"] * 50
|
||||
|
||||
# Configure memory-aware dispatch
|
||||
dispatcher = MemoryAdaptiveDispatcher(
|
||||
memory_threshold_percent=80.0, # Auto-throttle at 80% memory
|
||||
check_interval=0.5, # Check every 0.5 seconds
|
||||
max_session_permit=20, # Max concurrent sessions
|
||||
monitor=CrawlerMonitor( # Real-time monitoring
|
||||
display_mode=DisplayMode.DETAILED
|
||||
)
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
results = await dispatcher.run_urls(
|
||||
urls=urls,
|
||||
crawler=crawler,
|
||||
config=CrawlerRunConfig()
|
||||
)
|
||||
print(result.markdown_v2.fit_markdown)
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**Note**: This is a beta feature. We're actively working on improving its accuracy and performance.
|
||||
|
||||
### 5. Streamlined `arun_many()` with Dispatchers
|
||||
|
||||
We've simplified concurrent crawling! `arun_many()` now intelligently handles multiple URLs, either returning a **list** of results or an **async generator** for streaming.
|
||||
|
||||
**Basic Usage (Batch)**:
|
||||
### 2. Streaming Support
|
||||
Process crawled URLs in real-time instead of waiting for all results:
|
||||
|
||||
```python
|
||||
results = await crawler.arun_many(
|
||||
urls=["https://site1.com", "https://site2.com"],
|
||||
config=CrawlerRunConfig()
|
||||
)
|
||||
config = CrawlerRunConfig(stream=True)
|
||||
|
||||
for res in results:
|
||||
print(res.url, "crawled successfully:", res.success)
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
async for result in await crawler.arun_many(urls, config=config):
|
||||
print(f"Got result for {result.url}")
|
||||
# Process each result immediately
|
||||
```
|
||||
|
||||
**Streaming Mode**:
|
||||
### 3. LXML-Based Scraping
|
||||
New LXML scraping strategy offering up to 20x faster parsing:
|
||||
|
||||
```python
|
||||
async for result in await crawler.arun_many(
|
||||
urls=["https://site1.com", "https://site2.com"],
|
||||
config=CrawlerRunConfig(stream=True)
|
||||
):
|
||||
print("Just finished:", result.url)
|
||||
# Process each result immediately
|
||||
```
|
||||
|
||||
**Advanced:** You can now customize how `arun_many` handles concurrency by passing a **dispatcher**. See [Advanced Multi-URL Crawling](../advanced/multi-url-crawling.md) for details.
|
||||
|
||||
### 6. Enhanced Browser Context Management
|
||||
|
||||
We've improved how Crawl4AI manages browser contexts for better resource utilization and session handling.
|
||||
|
||||
- **`shared_data` in `CrawlerRunConfig`**: Pass data between hooks using the `shared_data` dictionary.
|
||||
- **Context Reuse**: The crawler now intelligently reuses browser contexts based on configuration, reducing overhead.
|
||||
|
||||
### 7. Faster Scraping with `LXMLWebScrapingStrategy`
|
||||
|
||||
Introducing a new, optional **`LXMLWebScrapingStrategy`** that can be **10-20x faster** than the default BeautifulSoup approach for large, complex pages.
|
||||
|
||||
**How to Use**:
|
||||
|
||||
```python
|
||||
from crawl4ai import LXMLWebScrapingStrategy
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
scraping_strategy=LXMLWebScrapingStrategy() # Add this line
|
||||
scraping_strategy=LXMLWebScrapingStrategy(),
|
||||
cache_mode=CacheMode.ENABLED
|
||||
)
|
||||
```
|
||||
|
||||
**When to Use**:
|
||||
- If profiling shows a bottleneck in `WebScrapingStrategy`.
|
||||
- For very large HTML documents where parsing speed matters.
|
||||
## 🤖 LLM Integration
|
||||
|
||||
**Caveats**:
|
||||
- It might not handle malformed HTML as gracefully as BeautifulSoup.
|
||||
- We're still gathering data, so report any issues!
|
||||
### 1. LLM-Powered Markdown Generation
|
||||
Smart content filtering and organization using LLMs:
|
||||
|
||||
---
|
||||
```python
|
||||
config = CrawlerRunConfig(
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=LLMContentFilter(
|
||||
provider="openai/gpt-4o",
|
||||
instruction="Extract technical documentation and code examples"
|
||||
)
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
## Try the Feature Demo Script!
|
||||
### 2. Automatic Schema Generation
|
||||
Generate extraction schemas instantly using LLMs instead of manual CSS/XPath writing:
|
||||
|
||||
We've prepared a Python script demonstrating these new features. You can find it at:
|
||||
```python
|
||||
schema = JsonCssExtractionStrategy.generate_schema(
|
||||
html_content,
|
||||
schema_type="CSS",
|
||||
query="Extract product name, price, and description"
|
||||
)
|
||||
```
|
||||
|
||||
[**`features_demo.py`**](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/0_4_3b1_feature_demo.py)
|
||||
## 🔧 Core Improvements
|
||||
|
||||
**To run the demo:**
|
||||
### 1. Proxy Support & Rotation
|
||||
Integrated proxy support with automatic rotation and verification:
|
||||
|
||||
1. Make sure you have Crawl4AI installed (`pip install crawl4ai`).
|
||||
2. Copy the `features_demo.py` script to your local environment.
|
||||
3. Set your OpenAI API key as an environment variable (if using OpenAI models):
|
||||
```bash
|
||||
export OPENAI_API_KEY="your_api_key"
|
||||
```
|
||||
4. Run the script:
|
||||
```bash
|
||||
python features_demo.py
|
||||
```
|
||||
```python
|
||||
config = CrawlerRunConfig(
|
||||
proxy_config={
|
||||
"server": "http://proxy:8080",
|
||||
"username": "user",
|
||||
"password": "pass"
|
||||
}
|
||||
)
|
||||
```
|
||||
|
||||
The script will execute various crawl scenarios, showcasing the new features and printing results to your console.
|
||||
### 2. Robots.txt Compliance
|
||||
Built-in robots.txt support with SQLite caching:
|
||||
|
||||
## Conclusion
|
||||
```python
|
||||
config = CrawlerRunConfig(check_robots_txt=True)
|
||||
result = await crawler.arun(url, config=config)
|
||||
if result.status_code == 403:
|
||||
print("Access blocked by robots.txt")
|
||||
```
|
||||
|
||||
Crawl4AI version 0.4.3b1 is a major step forward in flexibility, performance, and ease of use. With automatic schema generation, robots.txt handling, advanced content filtering, and streamlined multi-URL crawling, you can build powerful, efficient, and responsible web scrapers.
|
||||
### 3. URL Redirection Tracking
|
||||
Track final URLs after redirects:
|
||||
|
||||
We encourage you to try out these new capabilities, explore the updated documentation, and share your feedback! Your input is invaluable as we continue to improve Crawl4AI.
|
||||
```python
|
||||
result = await crawler.arun(url)
|
||||
print(f"Initial URL: {url}")
|
||||
print(f"Final URL: {result.redirected_url}")
|
||||
```
|
||||
|
||||
**Stay Connected:**
|
||||
## Performance Impact
|
||||
|
||||
- **Star** us on [GitHub](https://github.com/unclecode/crawl4ai) to show your support!
|
||||
- **Follow** [@unclecode](https://twitter.com/unclecode) on Twitter for updates and tips.
|
||||
- **Join** our community on Discord (link coming soon) to discuss your projects and get help.
|
||||
- Memory usage reduced by up to 40% with adaptive dispatcher
|
||||
- Parsing speed increased up to 20x with LXML strategy
|
||||
- Streaming reduces memory footprint for large crawls by ~60%
|
||||
|
||||
Happy crawling!
|
||||
## Getting Started
|
||||
|
||||
```bash
|
||||
pip install -U crawl4ai
|
||||
```
|
||||
|
||||
For complete examples, check the [examples directory](https://github.com/unclecode/crawl4ai/tree/main/docs/examples) in our repository.
|
||||
|
||||
## Stay Connected
|
||||
|
||||
- Star us on [GitHub](https://github.com/unclecode/crawl4ai)
|
||||
- Follow [@unclecode](https://twitter.com/unclecode)
|
||||
- Join our [Discord](https://discord.gg/crawl4ai)
|
||||
|
||||
Happy crawling! 🕷️
|
||||
11
mkdocs.yml
11
mkdocs.yml
@@ -1,4 +1,4 @@
|
||||
site_name: Crawl4AI Documentation
|
||||
site_name: Crawl4AI Documentation (v0.4.3b2)
|
||||
site_description: 🚀🤖 Crawl4AI, Open-source LLM-Friendly Web Crawler & Scraper
|
||||
site_url: https://docs.crawl4ai.com
|
||||
repo_url: https://github.com/unclecode/crawl4ai
|
||||
@@ -52,6 +52,11 @@ nav:
|
||||
theme:
|
||||
name: 'terminal'
|
||||
palette: 'dark'
|
||||
icon:
|
||||
repo: fontawesome/brands/github
|
||||
|
||||
plugins:
|
||||
- search
|
||||
|
||||
markdown_extensions:
|
||||
- pymdownx.highlight:
|
||||
@@ -64,6 +69,9 @@ markdown_extensions:
|
||||
- attr_list
|
||||
- tables
|
||||
|
||||
extra:
|
||||
version: !ENV [CRAWL4AI_VERSION, 'development']
|
||||
|
||||
extra_css:
|
||||
- assets/styles.css
|
||||
- assets/highlight.css
|
||||
@@ -72,3 +80,4 @@ extra_css:
|
||||
extra_javascript:
|
||||
- assets/highlight.min.js
|
||||
- assets/highlight_init.js
|
||||
- https://buttons.github.io/buttons.js
|
||||
@@ -37,6 +37,7 @@ dependencies = [
|
||||
"rich>=13.9.4",
|
||||
"cssselect>=1.2.0",
|
||||
"httpx==0.27.2",
|
||||
"fake-useragent>=2.0.3"
|
||||
]
|
||||
classifiers = [
|
||||
"Development Status :: 4 - Beta",
|
||||
|
||||
184
tests/test_scraper.py
Normal file
184
tests/test_scraper.py
Normal file
@@ -0,0 +1,184 @@
|
||||
# basic_scraper_example.py
|
||||
from crawl4ai.scraper import (
|
||||
AsyncWebScraper,
|
||||
BFSScraperStrategy,
|
||||
FilterChain,
|
||||
URLPatternFilter,
|
||||
ContentTypeFilter
|
||||
)
|
||||
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
||||
|
||||
async def basic_scraper_example():
    """Scrape the blog section of a site and print a per-page summary.

    The crawl is restricted to URLs matching ``*/blog/*`` with an HTML
    content type, goes at most two levels deep, and runs at most three
    requests at a time. Results are collected in one batch (no streaming);
    any exception raised while scraping is caught and printed.
    """
    # Filters are applied in this order: URL pattern first, then content type.
    blog_only = URLPatternFilter("*/blog/*")
    html_only = ContentTypeFilter(["text/html"])
    filter_chain = FilterChain([blog_only, html_only])

    # Breadth-first strategy with a shallow depth and modest concurrency.
    bfs_strategy = BFSScraperStrategy(
        max_depth=2,
        filter_chain=filter_chain,
        url_scorer=None,
        max_concurrent=3,
    )

    # The scraper drives the crawler according to the strategy.
    web_crawler = AsyncWebCrawler()
    blog_scraper = AsyncWebScraper(web_crawler, bfs_strategy)

    try:
        outcome = await blog_scraper.ascrape("https://example.com/blog/")

        # Report how many pages were crawled and the size of each one.
        print(f"Crawled {len(outcome.crawled_urls)} pages:")
        for page_url, page in outcome.extracted_data.items():
            print(f"- {page_url}: {len(page.html)} bytes")
    except Exception as exc:
        print(f"Error during scraping: {exc}")
|
||||
|
||||
# advanced_scraper_example.py
|
||||
import logging
|
||||
from crawl4ai.scraper import (
|
||||
AsyncWebScraper,
|
||||
BFSScraperStrategy,
|
||||
FilterChain,
|
||||
URLPatternFilter,
|
||||
ContentTypeFilter,
|
||||
DomainFilter,
|
||||
KeywordRelevanceScorer,
|
||||
PathDepthScorer,
|
||||
FreshnessScorer,
|
||||
CompositeScorer
|
||||
)
|
||||
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
||||
|
||||
async def advanced_scraper_example():
    """
    Advanced example: Intelligent news site scraping
    - Uses all filter types
    - Implements sophisticated scoring
    - Streams results
    - Includes monitoring and logging

    Results are consumed one at a time from the streaming scraper; running
    totals are kept in a local ``stats`` dict and logged once the crawl
    finishes, whether it succeeded or raised.
    """
    # ``re`` is needed for the date-based URL pattern below; the file does not
    # import it at module level, so import it here to avoid a NameError.
    import re

    # Set up logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("advanced_scraper")

    # Create sophisticated filter chain
    filter_chain = FilterChain([
        # Domain control
        DomainFilter(
            allowed_domains=["example.com", "blog.example.com"],
            blocked_domains=["ads.example.com", "tracker.example.com"]
        ),
        # URL patterns
        URLPatternFilter([
            "*/article/*",
            "*/news/*",
            "*/blog/*",
            re.compile(r"\d{4}/\d{2}/.*")  # Date-based URLs
        ]),
        # Content types
        ContentTypeFilter([
            "text/html",
            "application/xhtml+xml"
        ])
    ])

    # Create composite scorer
    scorer = CompositeScorer([
        # Prioritize by keywords
        KeywordRelevanceScorer(
            keywords=["news", "breaking", "update", "latest"],
            weight=1.0
        ),
        # Prefer optimal URL structure
        PathDepthScorer(
            optimal_depth=3,
            weight=0.7
        ),
        # Prioritize fresh content
        FreshnessScorer(weight=0.9)
    ])

    # Initialize strategy with advanced configuration
    strategy = BFSScraperStrategy(
        max_depth=4,
        filter_chain=filter_chain,
        url_scorer=scorer,
        max_concurrent=5,
        min_crawl_delay=1
    )

    # Create crawler and scraper
    crawler = AsyncWebCrawler()
    scraper = AsyncWebScraper(crawler, strategy)

    # Track statistics
    stats = {
        'processed': 0,
        'errors': 0,
        'total_size': 0
    }

    try:
        # Use streaming mode
        async for result in scraper.ascrape("https://example.com/news/", stream=True):
            stats['processed'] += 1

            if result.success:
                stats['total_size'] += len(result.html)
                logger.info(f"Processed: {result.url}")

                # Print scoring information
                for scorer_name, score in result.scores.items():
                    logger.debug(f"{scorer_name}: {score:.2f}")
            else:
                stats['errors'] += 1
                logger.error(f"Failed to process {result.url}: {result.error_message}")

            # Log progress regularly
            if stats['processed'] % 10 == 0:
                logger.info(f"Progress: {stats['processed']} URLs processed")

    except Exception as e:
        logger.error(f"Scraping error: {e}")

    finally:
        # Print final statistics
        logger.info("Scraping completed:")
        logger.info(f"- URLs processed: {stats['processed']}")
        logger.info(f"- Errors: {stats['errors']}")
        logger.info(f"- Total content size: {stats['total_size'] / 1024:.2f} KB")

        # Print filter statistics
        for filter_ in filter_chain.filters:
            logger.info(f"{filter_.name} stats:")
            logger.info(f"- Passed: {filter_.stats.passed_urls}")
            logger.info(f"- Rejected: {filter_.stats.rejected_urls}")

        # Print scorer statistics
        logger.info("Scoring statistics:")
        logger.info(f"- Average score: {scorer.stats.average_score:.2f}")
        logger.info(f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}")
|
||||
|
||||
if __name__ == "__main__":
    import asyncio

    # Run the examples back to back, announcing each one before it starts.
    for banner, example in (
        ("Running basic scraper example...", basic_scraper_example),
        ("\nRunning advanced scraper example...", advanced_scraper_example),
    ):
        print(banner)
        asyncio.run(example())
|
||||
Reference in New Issue
Block a user