From 1cc67df3019089f350d669af688f0a577156c0d1 Mon Sep 17 00:00:00 2001 From: unclecode Date: Fri, 17 May 2024 16:53:03 +0800 Subject: [PATCH] chore: Update pip installation command and requirements, add new dependencies --- README.md | 42 ++++++++++++++------------------- docs/examples/quickstart.py | 15 ++---------- pages/app.js | 27 +++++++++++---------- pages/partial/installation.html | 4 ++-- pages/partial/try_it.html | 18 +++++++------- 5 files changed, 46 insertions(+), 60 deletions(-) diff --git a/README.md b/README.md index 9f1ef712..48124f2d 100644 --- a/README.md +++ b/README.md @@ -22,32 +22,26 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information ## Power and Simplicity of Crawl4AI 🚀 -Crawl4AI makes even complex web crawling tasks simple and intuitive. Below is an example of how you can execute JavaScript, filter data using keywords, and use a CSS selector to extract specific content—all in one go! +To show the simplicity take a look at the first example: -**Example Task:** +```python +from crawl4ai import WebCrawler + +# Create the WebCrawler instance +crawler = WebCrawler() + +# Run the crawler with keyword filtering and CSS selector +result = crawler.run(url="https://www.example.com") +print(result) # {url, html, markdown, extracted_content, metadata} +``` + +Now let's try a complex task. Below is an example of how you can execute JavaScript, filter data using keywords, and use a CSS selector to extract specific content—all in one go! 1. Instantiate a WebCrawler object. 2. Execute custom JavaScript to click a "Load More" button. -3. Filter the data to include only content related to "technology". +3. Extract semantical chunks of content and filter the data to include only content related to technology. 4. Use a CSS selector to extract only paragraphs (`

` tags). -**Example Code:** - -Simply, firtsy install the package: -```bash -virtualenv venv -source venv/bin/activate -# Install Crawl4AI -pip install git+https://github.com/unclecode/crawl4ai.git -``` - -Run the following command to load the required models. This is optional, but it will boost the performance and speed of the crawler. You need to do this only once. -```bash -crawl4ai-download-models -``` - -Now, you can run the following code: - ```python # Import necessary modules from crawl4ai import WebCrawler @@ -137,7 +131,7 @@ To install Crawl4AI as a library, follow these steps: ```bash virtualenv venv source venv/bin/activate -pip install git+https://github.com/unclecode/crawl4ai.git +pip install "crawl4ai[all] @ git+https://github.com/unclecode/crawl4ai.git" ``` 💡 Better to run the following CLI-command to load the required models. This is optional, but it will boost the performance and speed of the crawler. You need to do this only once. @@ -150,12 +144,12 @@ virtualenv venv source venv/bin/activate git clone https://github.com/unclecode/crawl4ai.git cd crawl4ai -pip install -e . +pip install -e .[all] ``` 3. Use docker to run the local server: ```bash -docker build -t crawl4ai . +docker build -t crawl4ai . # For Mac users # docker build --platform linux/amd64 -t crawl4ai . docker run -d -p 8000:80 crawl4ai @@ -349,7 +343,7 @@ result = crawler.run(url="https://www.nbcnews.com/business") | `include_raw_html` | Whether to include the raw HTML content in the response. | No | `false` | | `bypass_cache` | Whether to force a fresh crawl even if the URL has been previously crawled. | No | `false` | | `word_count_threshold`| The minimum number of words a block must contain to be considered meaningful (minimum value is 5). | No | `5` | -| `extraction_strategy` | The strategy to use for extracting content from the HTML (e.g., "CosineStrategy"). | No | `CosineStrategy` | +| `extraction_strategy` | The strategy to use for extracting content from the HTML (e.g., "CosineStrategy"). | No | `NoExtractionStrategy` | | `chunking_strategy` | The strategy to use for chunking the text before processing (e.g., "RegexChunking"). | No | `RegexChunking` | | `css_selector` | The CSS selector to target specific parts of the HTML for extraction. | No | `None` | | `verbose` | Whether to enable verbose logging. | No | `true` | diff --git a/docs/examples/quickstart.py b/docs/examples/quickstart.py index de30c7f5..fa3e9045 100644 --- a/docs/examples/quickstart.py +++ b/docs/examples/quickstart.py @@ -170,22 +170,11 @@ def main(): cprint("If this is the first time you're running Crawl4ai, this might take a few seconds to load required model files.") crawler = create_crawler() - - cprint("For the rest of this guide, I set crawler.always_by_pass_cache to True to force the crawler to bypass the cache. This is to ensure that we get fresh results for each run.") - crawler.always_by_pass_cache = True - - - cprint("\n🔍 [bold cyan]Time to explore another chunking strategy: NlpSentenceChunking![/bold cyan]", True) - cprint("NlpSentenceChunking uses NLP techniques to split the text into sentences. Let's see how it performs!") - result = crawler.run( - url="https://www.nbcnews.com/business", - chunking_strategy=NlpSentenceChunking() - ) - cprint("[LOG] 📦 [bold yellow]NlpSentenceChunking result:[/bold yellow]") - print_result(result) basic_usage(crawler) understanding_parameters(crawler) + + crawler.always_by_pass_cache = True add_chunking_strategy(crawler) add_extraction_strategy(crawler) add_llm_extraction_strategy(crawler) diff --git a/pages/app.js b/pages/app.js index a30581a5..200e29d3 100644 --- a/pages/app.js +++ b/pages/app.js @@ -69,9 +69,12 @@ axios // Handle crawl button click document.getElementById("crawl-btn").addEventListener("click", () => { // validate input to have both URL and API token - if (!document.getElementById("url-input").value || !document.getElementById("token-input").value) { - alert("Please enter both URL(s) and API token."); - return; + // if selected extraction strategy is LLMExtractionStrategy, then API token is required + if (document.getElementById("extraction-strategy-select").value === "LLMExtractionStrategy") { + if (!document.getElementById("url-input").value || !document.getElementById("token-input").value) { + alert("Please enter both URL(s) and API token."); + return; + } } const selectedProviderModel = document.getElementById("provider-model-select").value; @@ -87,8 +90,6 @@ document.getElementById("crawl-btn").addEventListener("click", () => { const urls = urlsInput.split(",").map((url) => url.trim()); const data = { urls: urls, - provider_model: selectedProviderModel, - api_token: apiToken, include_raw_html: true, bypass_cache: bypassCache, extract_blocks: extractBlocks, @@ -112,8 +113,8 @@ document.getElementById("crawl-btn").addEventListener("click", () => { localStorage.setItem("api_token", document.getElementById("token-input").value); document.getElementById("loading").classList.remove("hidden"); - document.getElementById("result").classList.add("hidden"); - document.getElementById("code_help").classList.add("hidden"); + document.getElementById("result").style.visibility = "hidden"; + document.getElementById("code_help").style.visibility = "hidden"; axios .post("/crawl", data) @@ -128,18 +129,20 @@ document.getElementById("crawl-btn").addEventListener("click", () => { const extractionStrategy = data.extraction_strategy; const isLLMExtraction = extractionStrategy === "LLMExtractionStrategy"; + // REMOVE API TOKEN FROM CODE EXAMPLES + data.extraction_strategy_args.api_token = "your_api_token"; document.getElementById( "curl-code" ).textContent = `curl -X POST -H "Content-Type: application/json" -d '${JSON.stringify({ ...data, api_token: isLLMExtraction ? "your_api_token" : undefined, - })}' http://crawl4ai.uccode.io/crawl`; + }, null, 2)}' http://crawl4ai.com/crawl`; document.getElementById("python-code").textContent = `import requests\n\ndata = ${JSON.stringify( { ...data, api_token: isLLMExtraction ? "your_api_token" : undefined }, null, 2 - )}\n\nresponse = requests.post("http://crawl4ai.uccode.io/crawl", json=data) # OR local host if your run locally \nprint(response.json())`; + )}\n\nresponse = requests.post("http://crawl4ai.com/crawl", json=data) # OR local host if your run locally \nprint(response.json())`; document.getElementById( "nodejs-code" @@ -147,7 +150,7 @@ document.getElementById("crawl-btn").addEventListener("click", () => { { ...data, api_token: isLLMExtraction ? "your_api_token" : undefined }, null, 2 - )};\n\naxios.post("http://crawl4ai.uccode.io/crawl", data) // OR local host if your run locally \n .then(response => console.log(response.data))\n .catch(error => console.error(error));`; + )};\n\naxios.post("http://crawl4ai.com/crawl", data) // OR local host if your run locally \n .then(response => console.log(response.data))\n .catch(error => console.error(error));`; document.getElementById( "library-code" @@ -169,8 +172,8 @@ document.getElementById("crawl-btn").addEventListener("click", () => { document.getElementById("loading").classList.add("hidden"); - document.getElementById("result").classList.remove("hidden"); - document.getElementById("code_help").classList.remove("hidden"); + document.getElementById("result").style.visibility = "visible"; + document.getElementById("code_help").style.visibility = "visible"; // increment the total count document.getElementById("total-count").textContent = diff --git a/pages/partial/installation.html b/pages/partial/installation.html index 01ff715b..6a6561cd 100644 --- a/pages/partial/installation.html +++ b/pages/partial/installation.html @@ -29,7 +29,7 @@ class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100" >virtualenv venv source venv/bin/activate -pip install git+https://github.com/unclecode/crawl4ai.git +pip install "crawl4ai[all] @ git+https://github.com/unclecode/crawl4ai.git"

  • @@ -46,7 +46,7 @@ pip install git+https://github.com/unclecode/crawl4ai.git source venv/bin/activate git clone https://github.com/unclecode/crawl4ai.git cd crawl4ai -pip install -e . +pip install -e .[all]
  • diff --git a/pages/partial/try_it.html b/pages/partial/try_it.html index 56f85062..b7fa2a13 100644 --- a/pages/partial/try_it.html +++ b/pages/partial/try_it.html @@ -46,9 +46,9 @@ id="extraction-strategy-select" class="border border-zinc-700 rounded px-4 py-1 bg-zinc-900 text-zinc-300" > + -
    @@ -99,7 +99,7 @@
    -
    +