diff --git a/README.md b/README.md index 9f1ef712..48124f2d 100644 --- a/README.md +++ b/README.md @@ -22,32 +22,26 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information ## Power and Simplicity of Crawl4AI 🚀 -Crawl4AI makes even complex web crawling tasks simple and intuitive. Below is an example of how you can execute JavaScript, filter data using keywords, and use a CSS selector to extract specific content—all in one go! +To show the simplicity, take a look at the first example: -**Example Task:** +```python +from crawl4ai import WebCrawler + +# Create the WebCrawler instance +crawler = WebCrawler() + +# Run the crawler with keyword filtering and CSS selector +result = crawler.run(url="https://www.example.com") +print(result) # {url, html, markdown, extracted_content, metadata} +``` + +Now let's try a complex task. Below is an example of how you can execute JavaScript, filter data using keywords, and use a CSS selector to extract specific content—all in one go! 1. Instantiate a WebCrawler object. 2. Execute custom JavaScript to click a "Load More" button. -3. Filter the data to include only content related to "technology". +3. Extract semantic chunks of content and filter the data to include only content related to technology. 4. Use a CSS selector to extract only paragraphs (`<p>` tags).
-**Example Code:**
-
-Simply, firtsy install the package:
-```bash
-virtualenv venv
-source venv/bin/activate
-# Install Crawl4AI
-pip install git+https://github.com/unclecode/crawl4ai.git
-```
-
-Run the following command to load the required models. This is optional, but it will boost the performance and speed of the crawler. You need to do this only once.
-```bash
-crawl4ai-download-models
-```
-
-Now, you can run the following code:
-
```python
# Import necessary modules
from crawl4ai import WebCrawler
@@ -137,7 +131,7 @@ To install Crawl4AI as a library, follow these steps:
```bash
virtualenv venv
source venv/bin/activate
-pip install git+https://github.com/unclecode/crawl4ai.git
+pip install "crawl4ai[all] @ git+https://github.com/unclecode/crawl4ai.git"
```
💡 Better to run the following CLI-command to load the required models. This is optional, but it will boost the performance and speed of the crawler. You need to do this only once.
@@ -150,12 +144,12 @@ virtualenv venv
source venv/bin/activate
git clone https://github.com/unclecode/crawl4ai.git
cd crawl4ai
-pip install -e .
+pip install -e .[all]
```
3. Use docker to run the local server:
```bash
-docker build -t crawl4ai .
+docker build -t crawl4ai .
# For Mac users
# docker build --platform linux/amd64 -t crawl4ai .
docker run -d -p 8000:80 crawl4ai
@@ -349,7 +343,7 @@ result = crawler.run(url="https://www.nbcnews.com/business")
| `include_raw_html` | Whether to include the raw HTML content in the response. | No | `false` |
| `bypass_cache` | Whether to force a fresh crawl even if the URL has been previously crawled. | No | `false` |
| `word_count_threshold`| The minimum number of words a block must contain to be considered meaningful (minimum value is 5). | No | `5` |
-| `extraction_strategy` | The strategy to use for extracting content from the HTML (e.g., "CosineStrategy"). | No | `CosineStrategy` |
+| `extraction_strategy` | The strategy to use for extracting content from the HTML (e.g., "CosineStrategy"). | No | `NoExtractionStrategy` |
| `chunking_strategy` | The strategy to use for chunking the text before processing (e.g., "RegexChunking"). | No | `RegexChunking` |
| `css_selector` | The CSS selector to target specific parts of the HTML for extraction. | No | `None` |
| `verbose` | Whether to enable verbose logging. | No | `true` |
diff --git a/docs/examples/quickstart.py b/docs/examples/quickstart.py
index de30c7f5..fa3e9045 100644
--- a/docs/examples/quickstart.py
+++ b/docs/examples/quickstart.py
@@ -170,22 +170,11 @@ def main():
cprint("If this is the first time you're running Crawl4ai, this might take a few seconds to load required model files.")
crawler = create_crawler()
-
- cprint("For the rest of this guide, I set crawler.always_by_pass_cache to True to force the crawler to bypass the cache. This is to ensure that we get fresh results for each run.")
- crawler.always_by_pass_cache = True
-
-
- cprint("\n🔍 [bold cyan]Time to explore another chunking strategy: NlpSentenceChunking![/bold cyan]", True)
- cprint("NlpSentenceChunking uses NLP techniques to split the text into sentences. Let's see how it performs!")
- result = crawler.run(
- url="https://www.nbcnews.com/business",
- chunking_strategy=NlpSentenceChunking()
- )
- cprint("[LOG] 📦 [bold yellow]NlpSentenceChunking result:[/bold yellow]")
- print_result(result)
basic_usage(crawler)
understanding_parameters(crawler)
+
+ crawler.always_by_pass_cache = True
add_chunking_strategy(crawler)
add_extraction_strategy(crawler)
add_llm_extraction_strategy(crawler)
diff --git a/pages/app.js b/pages/app.js
index a30581a5..200e29d3 100644
--- a/pages/app.js
+++ b/pages/app.js
@@ -69,9 +69,12 @@ axios
// Handle crawl button click
document.getElementById("crawl-btn").addEventListener("click", () => {
// validate input to have both URL and API token
- if (!document.getElementById("url-input").value || !document.getElementById("token-input").value) {
- alert("Please enter both URL(s) and API token.");
- return;
+ // if selected extraction strategy is LLMExtractionStrategy, then API token is required
+ if (document.getElementById("extraction-strategy-select").value === "LLMExtractionStrategy") {
+ if (!document.getElementById("url-input").value || !document.getElementById("token-input").value) {
+ alert("Please enter both URL(s) and API token.");
+ return;
+ }
}
const selectedProviderModel = document.getElementById("provider-model-select").value;
@@ -87,8 +90,6 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
const urls = urlsInput.split(",").map((url) => url.trim());
const data = {
urls: urls,
- provider_model: selectedProviderModel,
- api_token: apiToken,
include_raw_html: true,
bypass_cache: bypassCache,
extract_blocks: extractBlocks,
@@ -112,8 +113,8 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
localStorage.setItem("api_token", document.getElementById("token-input").value);
document.getElementById("loading").classList.remove("hidden");
- document.getElementById("result").classList.add("hidden");
- document.getElementById("code_help").classList.add("hidden");
+ document.getElementById("result").style.visibility = "hidden";
+ document.getElementById("code_help").style.visibility = "hidden";
axios
.post("/crawl", data)
@@ -128,18 +129,20 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
const extractionStrategy = data.extraction_strategy;
const isLLMExtraction = extractionStrategy === "LLMExtractionStrategy";
+ // REMOVE API TOKEN FROM CODE EXAMPLES
+ data.extraction_strategy_args.api_token = "your_api_token";
document.getElementById(
"curl-code"
).textContent = `curl -X POST -H "Content-Type: application/json" -d '${JSON.stringify({
...data,
api_token: isLLMExtraction ? "your_api_token" : undefined,
- })}' http://crawl4ai.uccode.io/crawl`;
+ }, null, 2)}' http://crawl4ai.com/crawl`;
document.getElementById("python-code").textContent = `import requests\n\ndata = ${JSON.stringify(
{ ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
null,
2
- )}\n\nresponse = requests.post("http://crawl4ai.uccode.io/crawl", json=data) # OR local host if your run locally \nprint(response.json())`;
+  )}\n\nresponse = requests.post("http://crawl4ai.com/crawl", json=data) # OR localhost if you run locally \nprint(response.json())`;
document.getElementById(
"nodejs-code"
@@ -147,7 +150,7 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
{ ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
null,
2
- )};\n\naxios.post("http://crawl4ai.uccode.io/crawl", data) // OR local host if your run locally \n .then(response => console.log(response.data))\n .catch(error => console.error(error));`;
+  )};\n\naxios.post("http://crawl4ai.com/crawl", data) // OR localhost if you run locally \n  .then(response => console.log(response.data))\n  .catch(error => console.error(error));`;
document.getElementById(
"library-code"
@@ -169,8 +172,8 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
document.getElementById("loading").classList.add("hidden");
- document.getElementById("result").classList.remove("hidden");
- document.getElementById("code_help").classList.remove("hidden");
+ document.getElementById("result").style.visibility = "visible";
+ document.getElementById("code_help").style.visibility = "visible";
// increment the total count
document.getElementById("total-count").textContent =
diff --git a/pages/partial/installation.html b/pages/partial/installation.html
index 01ff715b..6a6561cd 100644
--- a/pages/partial/installation.html
+++ b/pages/partial/installation.html
@@ -29,7 +29,7 @@
class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"
>virtualenv venv
source venv/bin/activate
-pip install git+https://github.com/unclecode/crawl4ai.git
+pip install "crawl4ai[all] @ git+https://github.com/unclecode/crawl4ai.git"
Loading... Please wait.
+Loading... Please wait.
-+-+-+-+