diff --git a/deploy/railway/README.md b/deploy/railway/README.md deleted file mode 100644 index 155e7642..00000000 --- a/deploy/railway/README.md +++ /dev/null @@ -1,19 +0,0 @@ -# Railway Deployment - -## Quick Deploy -[](https://railway.app/template/crawl4ai) - -## Manual Setup -1. Fork this repository -2. Create a new Railway project -3. Configure environment variables: - - `INSTALL_TYPE`: basic or all - - `ENABLE_GPU`: true/false -4. Deploy! - -## Configuration -See `railway.toml` for: -- Memory limits -- Health checks -- Restart policies -- Scaling options \ No newline at end of file diff --git a/deploy/railway/button.json b/deploy/railway/button.json deleted file mode 100644 index 1fc52167..00000000 --- a/deploy/railway/button.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "name": "Crawl4AI", - "description": "LLM Friendly Web Crawler & Scraper", - "render": { - "dockerfile": { - "path": "Dockerfile" - } - }, - "env": [ - { - "key": "INSTALL_TYPE", - "description": "Installation type (basic/all)", - "default": "basic", - "required": true - }, - { - "key": "ENABLE_GPU", - "description": "Enable GPU support", - "default": "false", - "required": false - } - ], - "services": [ - { - "name": "web", - "dockerfile": "./Dockerfile", - "healthcheck": { - "path": "/health", - "port": 11235 - } - } - ] - } \ No newline at end of file diff --git a/deploy/railway/railway.toml b/deploy/railway/railway.toml deleted file mode 100644 index f24d8fab..00000000 --- a/deploy/railway/railway.toml +++ /dev/null @@ -1,18 +0,0 @@ -# railway.toml -[build] -builder = "DOCKERFILE" -dockerfilePath = "Dockerfile" - -[deploy] -startCommand = "uvicorn main:app --host 0.0.0.0 --port $PORT" -healthcheckPath = "/health" -restartPolicyType = "ON_FAILURE" -restartPolicyMaxRetries = 3 - -[deploy.memory] -soft = 2048 # 2GB min for Playwright -hard = 4096 # 4GB max - -[deploy.scaling] -min = 1 -max = 1 diff --git a/pages/app.css b/pages/app.css deleted file mode 100644 index 0e94a2e5..00000000 --- a/pages/app.css +++ /dev/null @@ -1,131 +0,0 @@ -:root { - --ifm-font-size-base: 100%; - --ifm-line-height-base: 1.65; - --ifm-font-family-base: system-ui, -apple-system, Segoe UI, Roboto, Ubuntu, Cantarell, Noto Sans, sans-serif, - BlinkMacSystemFont, "Segoe UI", Helvetica, Arial, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", - "Segoe UI Symbol"; -} -html { - -webkit-font-smoothing: antialiased; - -webkit-text-size-adjust: 100%; - text-size-adjust: 100%; - font: var(--ifm-font-size-base) / var(--ifm-line-height-base) var(--ifm-font-family-base); -} -body { - background-color: #1a202c; - color: #fff; -} -.tab-content { - max-height: 400px; - overflow: auto; -} -pre { - white-space: pre-wrap; - font-size: 14px; -} -pre code { - width: 100%; -} - -/* Custom styling for docs-item class and Markdown generated elements */ -.docs-item { - background-color: #2d3748; /* bg-gray-800 */ - padding: 1rem; /* p-4 */ - border-radius: 0.375rem; /* rounded */ - box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); /* shadow-md */ - margin-bottom: 1rem; /* space between items */ - line-height: 1.5; /* leading-normal */ -} - -.docs-item h3, -.docs-item h4 { - color: #ffffff; /* text-white */ - font-size: 1.25rem; /* text-xl */ - font-weight: 700; /* font-bold */ - margin-bottom: 0.5rem; /* mb-2 */ -} -.docs-item h4 { - font-size: 1rem; /* text-xl */ -} - -.docs-item p { - color: #e2e8f0; /* text-gray-300 */ - margin-bottom: 0.5rem; /* mb-2 */ -} - -.docs-item code { - background-color: #1a202c; /* bg-gray-900 */ - color: #e2e8f0; /* text-gray-300 */ - padding: 0.25rem 0.5rem; /* px-2 py-1 */ - border-radius: 0.25rem; /* rounded */ - font-size: 0.875rem; /* text-sm */ -} - -.docs-item pre { - background-color: #1a202c; /* bg-gray-900 */ - color: #e2e8f0; /* text-gray-300 */ - padding: 0.5rem; /* p-2 */ - border-radius: 0.375rem; /* rounded */ - overflow: auto; /* overflow-auto */ - margin-bottom: 0.5rem; /* mb-2 */ -} - -.docs-item div { - color: #e2e8f0; /* text-gray-300 */ - font-size: 1rem; /* prose prose-sm */ - line-height: 1.25rem; /* line-height for readability */ -} - -/* Adjustments to make prose class more suitable for dark mode */ -.prose { - max-width: none; /* max-w-none */ -} - -.prose p, -.prose ul { - margin-bottom: 1rem; /* mb-4 */ -} - -.prose code { - /* background-color: #4a5568; */ /* bg-gray-700 */ - color: #65a30d; /* text-white */ - padding: 0.25rem 0.5rem; /* px-1 py-0.5 */ - border-radius: 0.25rem; /* rounded */ - display: inline-block; /* inline-block */ -} - -.prose pre { - background-color: #1a202c; /* bg-gray-900 */ - color: #ffffff; /* text-white */ - padding: 0.5rem; /* p-2 */ - border-radius: 0.375rem; /* rounded */ -} - -.prose h3 { - color: #65a30d; /* text-white */ - font-size: 1.25rem; /* text-xl */ - font-weight: 700; /* font-bold */ - margin-bottom: 0.5rem; /* mb-2 */ -} - -body { - background-color: #1a1a1a; - color: #b3ff00; -} -.sidebar { - color: #b3ff00; - border-right: 1px solid #333; -} -.sidebar a { - color: #b3ff00; - text-decoration: none; -} -.sidebar a:hover { - background-color: #555; -} -.content-section { - display: none; -} -.content-section.active { - display: block; -} diff --git a/pages/app.js b/pages/app.js deleted file mode 100644 index 098008ab..00000000 --- a/pages/app.js +++ /dev/null @@ -1,356 +0,0 @@ -// JavaScript to manage dynamic form changes and logic -document.getElementById("extraction-strategy-select").addEventListener("change", function () { - const strategy = this.value; - const providerModelSelect = document.getElementById("provider-model-select"); - const tokenInput = document.getElementById("token-input"); - const instruction = document.getElementById("instruction"); - const semantic_filter = document.getElementById("semantic_filter"); - const instruction_div = document.getElementById("instruction_div"); - const semantic_filter_div = document.getElementById("semantic_filter_div"); - const llm_settings = document.getElementById("llm_settings"); - - if (strategy === "LLMExtractionStrategy") { - // providerModelSelect.disabled = false; - // tokenInput.disabled = false; - // semantic_filter.disabled = true; - // instruction.disabled = false; - llm_settings.classList.remove("hidden"); - instruction_div.classList.remove("hidden"); - semantic_filter_div.classList.add("hidden"); - } else if (strategy === "NoExtractionStrategy") { - semantic_filter_div.classList.add("hidden"); - instruction_div.classList.add("hidden"); - llm_settings.classList.add("hidden"); - } else { - // providerModelSelect.disabled = true; - // tokenInput.disabled = true; - // semantic_filter.disabled = false; - // instruction.disabled = true; - llm_settings.classList.add("hidden"); - instruction_div.classList.add("hidden"); - semantic_filter_div.classList.remove("hidden"); - } - - -}); - -// Get the selected provider model and token from local storage -const storedProviderModel = localStorage.getItem("provider_model"); -const storedToken = localStorage.getItem(storedProviderModel); - -if (storedProviderModel) { - document.getElementById("provider-model-select").value = storedProviderModel; -} - -if (storedToken) { - document.getElementById("token-input").value = storedToken; -} - -// Handle provider model dropdown change -document.getElementById("provider-model-select").addEventListener("change", () => { - const selectedProviderModel = document.getElementById("provider-model-select").value; - const storedToken = localStorage.getItem(selectedProviderModel); - - if (storedToken) { - document.getElementById("token-input").value = storedToken; - } else { - document.getElementById("token-input").value = ""; - } -}); - -// Fetch total count from the database -axios - .get("/total-count") - .then((response) => { - document.getElementById("total-count").textContent = response.data.count; - }) - .catch((error) => console.error(error)); - -// Handle crawl button click -document.getElementById("crawl-btn").addEventListener("click", () => { - // validate input to have both URL and API token - // if selected extraction strategy is LLMExtractionStrategy, then API token is required - if (document.getElementById("extraction-strategy-select").value === "LLMExtractionStrategy") { - if (!document.getElementById("url-input").value || !document.getElementById("token-input").value) { - alert("Please enter both URL(s) and API token."); - return; - } - } - - const selectedProviderModel = document.getElementById("provider-model-select").value; - const apiToken = document.getElementById("token-input").value; - const extractBlocks = document.getElementById("extract-blocks-checkbox").checked; - const bypassCache = document.getElementById("bypass-cache-checkbox").checked; - - // Save the selected provider model and token to local storage - localStorage.setItem("provider_model", selectedProviderModel); - localStorage.setItem(selectedProviderModel, apiToken); - - const urlsInput = document.getElementById("url-input").value; - const urls = urlsInput.split(",").map((url) => url.trim()); - const data = { - urls: urls, - include_raw_html: true, - bypass_cache: bypassCache, - extract_blocks: extractBlocks, - word_count_threshold: parseInt(document.getElementById("threshold").value), - extraction_strategy: document.getElementById("extraction-strategy-select").value, - extraction_strategy_args: { - provider: selectedProviderModel, - api_token: apiToken, - instruction: document.getElementById("instruction").value, - semantic_filter: document.getElementById("semantic_filter").value, - }, - chunking_strategy: document.getElementById("chunking-strategy-select").value, - chunking_strategy_args: {}, - css_selector: document.getElementById("css-selector").value, - screenshot: document.getElementById("screenshot-checkbox").checked, - // instruction: document.getElementById("instruction").value, - // semantic_filter: document.getElementById("semantic_filter").value, - verbose: true, - }; - - // import requests - - // data = { - // "urls": [ - // "https://www.nbcnews.com/business" - // ], - // "word_count_threshold": 10, - // "extraction_strategy": "NoExtractionStrategy", - // } - - // response = requests.post("https://crawl4ai.com/crawl", json=data) # OR local host if your run locally - // print(response.json()) - - // save api token to local storage - localStorage.setItem("api_token", document.getElementById("token-input").value); - - document.getElementById("loading").classList.remove("hidden"); - document.getElementById("result").style.visibility = "hidden"; - document.getElementById("code_help").style.visibility = "hidden"; - - axios - .post("/crawl", data) - .then((response) => { - const result = response.data.results[0]; - const parsedJson = JSON.parse(result.extracted_content); - document.getElementById("json-result").textContent = JSON.stringify(parsedJson, null, 2); - document.getElementById("cleaned-html-result").textContent = result.cleaned_html; - document.getElementById("markdown-result").textContent = result.markdown; - document.getElementById("media-result").textContent = JSON.stringify( result.media, null, 2); - if (result.screenshot){ - const imgElement = document.createElement("img"); - // Set the src attribute with the base64 data - imgElement.src = `data:image/png;base64,${result.screenshot}`; - document.getElementById("screenshot-result").innerHTML = ""; - document.getElementById("screenshot-result").appendChild(imgElement); - } - - // Update code examples dynamically - const extractionStrategy = data.extraction_strategy; - const isLLMExtraction = extractionStrategy === "LLMExtractionStrategy"; - - // REMOVE API TOKEN FROM CODE EXAMPLES - data.extraction_strategy_args.api_token = "your_api_token"; - - if (data.extraction_strategy === "NoExtractionStrategy") { - delete data.extraction_strategy_args; - delete data.extrac_blocks; - } - - if (data.chunking_strategy === "RegexChunking") { - delete data.chunking_strategy_args; - } - - delete data.verbose; - - if (data.css_selector === "") { - delete data.css_selector; - } - - if (!data.bypass_cache) { - delete data.bypass_cache; - } - - if (!data.extract_blocks) { - delete data.extract_blocks; - } - - if (!data.include_raw_html) { - delete data.include_raw_html; - } - - document.getElementById( - "curl-code" - ).textContent = `curl -X POST -H "Content-Type: application/json" -d '${JSON.stringify({ - ...data, - api_token: isLLMExtraction ? "your_api_token" : undefined, - }, null, 2)}' https://crawl4ai.com/crawl`; - - document.getElementById("python-code").textContent = `import requests\n\ndata = ${JSON.stringify( - { ...data, api_token: isLLMExtraction ? "your_api_token" : undefined }, - null, - 2 - )}\n\nresponse = requests.post("https://crawl4ai.com/crawl", json=data) # OR local host if your run locally \nprint(response.json())`; - - document.getElementById( - "nodejs-code" - ).textContent = `const axios = require('axios');\n\nconst data = ${JSON.stringify( - { ...data, api_token: isLLMExtraction ? "your_api_token" : undefined }, - null, - 2 - )};\n\naxios.post("https://crawl4ai.com/crawl", data) // OR local host if your run locally \n .then(response => console.log(response.data))\n .catch(error => console.error(error));`; - - document.getElementById( - "library-code" - ).textContent = `from crawl4ai.web_crawler import WebCrawler\nfrom crawl4ai.extraction_strategy import *\nfrom crawl4ai.chunking_strategy import *\n\ncrawler = WebCrawler()\ncrawler.warmup()\n\nresult = crawler.run(\n url='${ - urls[0] - }',\n word_count_threshold=${data.word_count_threshold},\n extraction_strategy=${ - isLLMExtraction - ? `${extractionStrategy}(provider="${data.provider_model}", api_token="${data.api_token}")` - : extractionStrategy + "()" - },\n chunking_strategy=${data.chunking_strategy}(),\n bypass_cache=${ - data.bypass_cache - },\n css_selector="${data.css_selector}"\n)\nprint(result)`; - - // Highlight code syntax - hljs.highlightAll(); - - // Select JSON tab by default - document.querySelector('.tab-btn[data-tab="json"]').click(); - - document.getElementById("loading").classList.add("hidden"); - - document.getElementById("result").style.visibility = "visible"; - document.getElementById("code_help").style.visibility = "visible"; - - // increment the total count - document.getElementById("total-count").textContent = - parseInt(document.getElementById("total-count").textContent) + 1; - }) - .catch((error) => { - console.error(error); - document.getElementById("loading").classList.add("hidden"); - }); -}); - -// Handle tab clicks -document.querySelectorAll(".tab-btn").forEach((btn) => { - btn.addEventListener("click", () => { - const tab = btn.dataset.tab; - document.querySelectorAll(".tab-btn").forEach((b) => b.classList.remove("bg-lime-700", "text-white")); - btn.classList.add("bg-lime-700", "text-white"); - document.querySelectorAll(".tab-content.code pre").forEach((el) => el.classList.add("hidden")); - document.getElementById(`${tab}-result`).parentElement.classList.remove("hidden"); - }); -}); - -// Handle code tab clicks -document.querySelectorAll(".code-tab-btn").forEach((btn) => { - btn.addEventListener("click", () => { - const tab = btn.dataset.tab; - document.querySelectorAll(".code-tab-btn").forEach((b) => b.classList.remove("bg-lime-700", "text-white")); - btn.classList.add("bg-lime-700", "text-white"); - document.querySelectorAll(".tab-content.result pre").forEach((el) => el.classList.add("hidden")); - document.getElementById(`${tab}-code`).parentElement.classList.remove("hidden"); - }); -}); - -// Handle copy to clipboard button clicks - -async function copyToClipboard(text) { - if (navigator.clipboard && navigator.clipboard.writeText) { - return navigator.clipboard.writeText(text); - } else { - return fallbackCopyTextToClipboard(text); - } -} - -function fallbackCopyTextToClipboard(text) { - return new Promise((resolve, reject) => { - const textArea = document.createElement("textarea"); - textArea.value = text; - - // Avoid scrolling to bottom - textArea.style.top = "0"; - textArea.style.left = "0"; - textArea.style.position = "fixed"; - - document.body.appendChild(textArea); - textArea.focus(); - textArea.select(); - - try { - const successful = document.execCommand("copy"); - if (successful) { - resolve(); - } else { - reject(); - } - } catch (err) { - reject(err); - } - - document.body.removeChild(textArea); - }); -} - -document.querySelectorAll(".copy-btn").forEach((btn) => { - btn.addEventListener("click", () => { - const target = btn.dataset.target; - const code = document.getElementById(target).textContent; - //navigator.clipboard.writeText(code).then(() => { - copyToClipboard(code).then(() => { - btn.textContent = "Copied!"; - setTimeout(() => { - btn.textContent = "Copy"; - }, 2000); - }); - }); -}); - -document.addEventListener("DOMContentLoaded", async () => { - try { - const extractionResponse = await fetch("/strategies/extraction"); - const extractionStrategies = await extractionResponse.json(); - - const chunkingResponse = await fetch("/strategies/chunking"); - const chunkingStrategies = await chunkingResponse.json(); - - renderStrategies("extraction-strategies", extractionStrategies); - renderStrategies("chunking-strategies", chunkingStrategies); - } catch (error) { - console.error("Error fetching strategies:", error); - } -}); - -function renderStrategies(containerId, strategies) { - const container = document.getElementById(containerId); - container.innerHTML = ""; // Clear any existing content - strategies = JSON.parse(strategies); - Object.entries(strategies).forEach(([strategy, description]) => { - const strategyElement = document.createElement("div"); - strategyElement.classList.add("bg-zinc-800", "p-4", "rounded", "shadow-md", "docs-item"); - - const strategyDescription = document.createElement("div"); - strategyDescription.classList.add("text-gray-300", "prose", "prose-sm"); - strategyDescription.innerHTML = marked.parse(description); - - strategyElement.appendChild(strategyDescription); - - container.appendChild(strategyElement); - }); -} -document.querySelectorAll(".sidebar a").forEach((link) => { - link.addEventListener("click", function (event) { - event.preventDefault(); - document.querySelectorAll(".content-section").forEach((section) => { - section.classList.remove("active"); - }); - const target = event.target.getAttribute("data-target"); - document.getElementById(target).classList.add("active"); - }); -}); -// Highlight code syntax -hljs.highlightAll(); diff --git a/pages/index copy.html b/pages/index copy.html deleted file mode 100644 index b61b7298..00000000 --- a/pages/index copy.html +++ /dev/null @@ -1,971 +0,0 @@ - - -
- - -Loading... Please wait.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- warmup() function.
- crawler = WebCrawler()
- crawler.warmup()
- result = crawler.run(url="https://www.nbcnews.com/business")
- result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)
- result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)
- always_by_pass_cache to True:crawler.always_by_pass_cache = True
- result = crawler.run(
- url="https://www.nbcnews.com/business",
- chunking_strategy=RegexChunking(patterns=["\n\n"])
- )
- result = crawler.run(
- url="https://www.nbcnews.com/business",
- chunking_strategy=NlpSentenceChunking()
- )
- result = crawler.run(
- url="https://www.nbcnews.com/business",
- extraction_strategy=CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3)
- )
- result = crawler.run(
- url="https://www.nbcnews.com/business",
- extraction_strategy=LLMExtractionStrategy(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'))
- )
- result = crawler.run(
- url="https://www.nbcnews.com/business",
- extraction_strategy=LLMExtractionStrategy(
- provider="openai/gpt-4o",
- api_token=os.getenv('OPENAI_API_KEY'),
- instruction="I am interested in only financial news"
- )
- )
- result = crawler.run(
- url="https://www.nbcnews.com/business",
- css_selector="h2"
- )
- js_code = """
- const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
- loadMoreButton && loadMoreButton.click();
- """
- crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
- crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
- result = crawler.run(url="https://www.nbcnews.com/business")
- - There are two ways to use Crawl4AI: as a library in your Python projects or as a standalone local - server. -
- -
- You can also try Crawl4AI in a Google Colab
-
-
To install Crawl4AI as a library, follow these steps:
- -pip install git+https://github.com/unclecode/crawl4ai.git
- virtualenv venv
-source venv/bin/activate
-git clone https://github.com/unclecode/crawl4ai.git
-cd crawl4ai
-pip install -e .
-
- from crawl4ai.web_crawler import WebCrawler
-from crawl4ai.chunking_strategy import *
-from crawl4ai.extraction_strategy import *
-import os
-
-crawler = WebCrawler()
-
-# Single page crawl
-single_url = UrlModel(url='https://www.nbcnews.com/business', forced=False)
-result = crawl4ai.fetch_page(
- url='https://www.nbcnews.com/business',
- word_count_threshold=5, # Minimum word count for a HTML tag to be considered as a worthy block
- chunking_strategy= RegexChunking( patterns = ["\\n\\n"]), # Default is RegexChunking
- extraction_strategy= CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3) # Default is CosineStrategy
- # extraction_strategy= LLMExtractionStrategy(provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY')),
- bypass_cache=False,
- extract_blocks =True, # Whether to extract semantical blocks of text from the HTML
- css_selector = "", # Eg: "div.article-body"
- verbose=True,
- include_raw_html=True, # Whether to include the raw HTML content in the response
-)
-print(result.model_dump())
-
- - For more information about how to run Crawl4AI as a local server, please refer to the - GitHub repository. -
- -| Parameter | -Description | -Required | -Default Value | -
|---|---|---|---|
| urls | -- A list of URLs to crawl and extract data from. - | -Yes | -- | -
| include_raw_html | -- Whether to include the raw HTML content in the response. - | -No | -false | -
| bypass_cache | -- Whether to force a fresh crawl even if the URL has been previously crawled. - | -No | -false | -
| extract_blocks | -- Whether to extract semantical blocks of text from the HTML. - | -No | -true | -
| word_count_threshold | -- The minimum number of words a block must contain to be considered meaningful (minimum - value is 5). - | -No | -5 | -
| extraction_strategy | -- The strategy to use for extracting content from the HTML (e.g., "CosineStrategy"). - | -No | -CosineStrategy | -
| chunking_strategy | -- The strategy to use for chunking the text before processing (e.g., "RegexChunking"). - | -No | -RegexChunking | -
| css_selector | -- The CSS selector to target specific parts of the HTML for extraction. - | -No | -None | -
| verbose | -Whether to enable verbose logging. | -No | -true | -
- In recent times, we've witnessed a surge of startups emerging, riding the AI hype wave and charging - for services that should rightfully be accessible to everyone. ππΈ One such example is scraping and - crawling web pages and transforming them into a format suitable for Large Language Models (LLMs). - πΈοΈπ€ We believe that building a business around this is not the right approach; instead, it should - definitely be open-source. ππ So, if you possess the skills to build such tools and share our - philosophy, we invite you to join our "Robinhood" band and help set these products free for the - benefit of all. π€πͺ -
-- To install and run Crawl4AI as a library or a local server, please refer to the π - GitHub repository. -
-Content for chunking strategies...
-Content for extraction strategies...
-Loading...
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- - In recent times, we've seen numerous startups emerging, riding the AI hype wave and charging for - services that should rightfully be accessible to everyone. ππΈ One for example is to scrap and crawl - a web page, and transform it o a form suitable for LLM. We don't think one should build a business - out of this, but definilty should be opened source. So if you possess the skills to build such things - and you have such philosphy you should join our "Robinhood" band and help set - these products free. ππ€ -
-- To install and run Crawl4AI locally or on your own service, the best way is to use Docker. π³ Follow - these steps: -
-git clone https://github.com/unclecode/crawl4ai.git
- cd crawl4aidocker build -t crawl4ai . On Mac, follow: π
- docker build --platform linux/amd64 -t crawl4ai .
- docker run -p 8000:80 crawl4ai- For more detailed instructions and advanced configuration options, please refer to the π - GitHub repository. -
-- In recent times, we've witnessed a surge of startups emerging, riding the AI hype wave and charging - for services that should rightfully be accessible to everyone. ππΈ One such example is scraping and - crawling web pages and transforming them into a format suitable for Large Language Models (LLMs). - πΈοΈπ€ We believe that building a business around this is not the right approach; instead, it should - definitely be open-source. ππ So, if you possess the skills to build such tools and share our - philosophy, we invite you to join our "Robinhood" band and help set these products free for the - benefit of all. π€πͺ -
-warmup() function.
- crawler = WebCrawler()
-crawler.warmup()
- result = crawler.run(url="https://www.nbcnews.com/business")
- result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)
- `bypass_cache` to True if you want to try different strategies for the same URL. Otherwise, the cached result will be returned. You can also set `always_by_pass_cache` in constructor to True to always bypass the cache.
- result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)
- always_by_pass_cache to True:crawler.always_by_pass_cache = True
- result = crawler.run(
- url="https://www.nbcnews.com/business",
- screenshot=True
-)
-with open("screenshot.png", "wb") as f:
- f.write(base64.b64decode(result.screenshot))
- result = crawler.run(
- url="https://www.nbcnews.com/business",
- chunking_strategy=RegexChunking(patterns=["\n\n"])
-)
- result = crawler.run(
- url="https://www.nbcnews.com/business",
- chunking_strategy=NlpSentenceChunking()
-)
- result = crawler.run(
- url="https://www.nbcnews.com/business",
- extraction_strategy=CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3)
-)
- result = crawler.run(
- url="https://www.nbcnews.com/business",
- extraction_strategy=LLMExtractionStrategy(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'))
-)
- result = crawler.run(
- url="https://www.nbcnews.com/business",
- extraction_strategy=LLMExtractionStrategy(
- provider="openai/gpt-4o",
- api_token=os.getenv('OPENAI_API_KEY'),
- instruction="I am interested in only financial news"
-)
-)
- result = crawler.run(
- url="https://www.nbcnews.com/business",
- css_selector="h2"
-)
- js_code = ["""
-const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
-loadMoreButton && loadMoreButton.click();
-"""]
-crawler = WebCrawler(verbos=crawler_strategy, always_by_pass_cache=True)
-result = crawler.run(url="https://www.nbcnews.com/business", js = js_code)
- - There are three ways to use Crawl4AI: -
To install Crawl4AI as a library, follow these steps:
- -virtualenv venv
-source venv/bin/activate
-pip install "crawl4ai[all] @ git+https://github.com/unclecode/crawl4ai.git"
-
- crawl4ai-download-models
- virtualenv venv
-source venv/bin/activate
-git clone https://github.com/unclecode/crawl4ai.git
-cd crawl4ai
-pip install -e .[all]
-
- docker build -t crawl4ai .
-# docker build --platform linux/amd64 -t crawl4ai . For Mac users
-docker run -d -p 8000:80 crawl4ai
- - For more information about how to run Crawl4AI as a local server, please refer to the - GitHub repository. -
-Loading... Please wait.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- There are three ways to use Crawl4AI:
-To install Crawl4AI as a library, follow these steps:
- -pip install git+https://github.com/unclecode/crawl4ai.git
- virtualenv venv
-source venv/bin/activate
-git clone https://github.com/unclecode/crawl4ai.git
-cd crawl4ai
-pip install -e .
-
- docker build -t crawl4ai .
-# docker build --platform linux/amd64 -t crawl4ai . For Mac users
-docker run -d -p 8000:80 crawl4ai
- - For more information about how to run Crawl4AI as a local server, please refer to the - GitHub repository. -
-warmup() function.
- crawler = WebCrawler()
-crawler.warmup()
- result = crawler.run(url="https://www.nbcnews.com/business")
- result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)
- `bypass_cache` to True if you want to try different strategies
- for the same URL. Otherwise, the cached result will be returned. You can also set
- `always_by_pass_cache` in constructor to True to always bypass the cache.
- result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)
- always_by_pass_cache to True:crawler.always_by_pass_cache = True
- result = crawler.run(
-url="https://www.nbcnews.com/business",
-chunking_strategy=RegexChunking(patterns=["\n\n"])
-)
- result = crawler.run(
-url="https://www.nbcnews.com/business",
-chunking_strategy=NlpSentenceChunking()
-)
- result = crawler.run(
-url="https://www.nbcnews.com/business",
-extraction_strategy=CosineStrategy(word_count_threshold=20, max_dist=0.2, linkage_method="ward", top_k=3)
-)
- result = crawler.run(
-url="https://www.nbcnews.com/business",
-extraction_strategy=LLMExtractionStrategy(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'))
-)
- result = crawler.run(
-url="https://www.nbcnews.com/business",
-extraction_strategy=LLMExtractionStrategy(
-provider="openai/gpt-4o",
-api_token=os.getenv('OPENAI_API_KEY'),
-instruction="I am interested in only financial news"
-)
-)
- result = crawler.run(
-url="https://www.nbcnews.com/business",
-css_selector="h2"
-)
- js_code = """
-const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
-loadMoreButton && loadMoreButton.click();
-"""
-crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
-crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
-result = crawler.run(url="https://www.nbcnews.com/business")
-
- RegexChunking is a text chunking strategy that splits a given text into smaller parts
- using regular expressions. This is useful for preparing large texts for processing by language
- models, ensuring they are divided into manageable segments.
-
patterns (list, optional): A list of regular expression patterns used to split the
- text. Default is to split by double newlines (['\n\n']).
- chunker = RegexChunking(patterns=[r'\n\n', r'\. '])
-chunks = chunker.chunk("This is a sample text. It will be split into chunks.")
-
-
- NlpSentenceChunking uses a natural language processing model to chunk a given text into
- sentences. This approach leverages SpaCy to accurately split text based on sentence boundaries.
-
chunker = NlpSentenceChunking()
-chunks = chunker.chunk("This is a sample text. It will be split into sentences.")
-
-
- TopicSegmentationChunking uses the TextTiling algorithm to segment a given text into
- topic-based chunks. This method identifies thematic boundaries in the text.
-
num_keywords (int, optional): The number of keywords to extract for each topic
- segment. Default is 3.
- chunker = TopicSegmentationChunking(num_keywords=3)
-chunks = chunker.chunk("This is a sample text. It will be split into topic-based segments.")
-
-
- FixedLengthWordChunking splits a given text into chunks of fixed length, based on the
- number of words.
-
chunk_size (int, optional): The number of words in each chunk. Default is
- 100.
- chunker = FixedLengthWordChunking(chunk_size=100)
-chunks = chunker.chunk("This is a sample text. It will be split into fixed-length word chunks.")
-
-
- SlidingWindowChunking uses a sliding window approach to chunk a given text. Each chunk
- has a fixed length, and the window slides by a specified step size.
-
window_size (int, optional): The number of words in each chunk. Default is
- 100.
- step (int, optional): The number of words to slide the window. Default is
- 50.
- chunker = SlidingWindowChunking(window_size=100, step=50)
-chunks = chunker.chunk("This is a sample text. It will be split using a sliding window approach.")
-
-
- NoExtractionStrategy is a basic extraction strategy that returns the entire HTML
- content without any modification. It is useful for cases where no specific extraction is required.
- Only clean html, and amrkdown.
-
None.
-extractor = NoExtractionStrategy()
-extracted_content = extractor.extract(url, html)
-
-
- LLMExtractionStrategy uses a Language Model (LLM) to extract meaningful blocks or
- chunks from the given HTML content. This strategy leverages an external provider for language model
- completions.
-
provider (str, optional): The provider to use for the language model completions.
- Default is DEFAULT_PROVIDER (e.g., openai/gpt-4).
- api_token (str, optional): The API token for the provider. If not provided, it will
- try to load from the environment variable OPENAI_API_KEY.
- instruction (str, optional): An instruction to guide the LLM on how to perform the
- extraction. This allows users to specify the type of data they are interested in or set the tone
- of the response. Default is None.
- extractor = LLMExtractionStrategy(provider='openai', api_token='your_api_token', instruction='Extract only news about AI.')
-extracted_content = extractor.extract(url, html)
-
- - By providing clear instructions, users can tailor the extraction process to their specific needs, - enhancing the relevance and utility of the extracted content. -
-
- CosineStrategy uses hierarchical clustering based on cosine similarity to extract
- clusters of text from the given HTML content. This strategy is suitable for identifying related
- content sections.
-
semantic_filter (str, optional): A string containing keywords for filtering relevant
- documents before clustering. If provided, documents are filtered based on their cosine
- similarity to the keyword filter embedding. Default is None.
- word_count_threshold (int, optional): Minimum number of words per cluster. Default
- is 20.
- max_dist (float, optional): The maximum cophenetic distance on the dendrogram to
- form clusters. Default is 0.2.
- linkage_method (str, optional): The linkage method for hierarchical clustering.
- Default is 'ward'.
- top_k (int, optional): Number of top categories to extract. Default is
- 3.
- model_name (str, optional): The model name for embedding generation. Default is
- 'BAAI/bge-small-en-v1.5'.
- extractor = CosineStrategy(semantic_filter='artificial intelligence', word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name='BAAI/bge-small-en-v1.5')
-extracted_content = extractor.extract(url, html)
-
-
- When a semantic_filter is provided, the CosineStrategy applies an
- embedding-based filtering process to select relevant documents before performing hierarchical
- clustering.
-
- TopicExtractionStrategy uses the TextTiling algorithm to segment the HTML content into
- topics and extracts keywords for each segment. This strategy is useful for identifying and
- summarizing thematic content.
-
num_keywords (int, optional): Number of keywords to represent each topic segment.
- Default is 3.
- extractor = TopicExtractionStrategy(num_keywords=3)
-extracted_content = extractor.extract(url, html)
-
-