- Test all methods

- Update index.html
- Update Readme
- Resolve some bugs
This commit is contained in:
unclecode
2024-05-14 21:27:41 +08:00
parent 5fea6c064b
commit f6e59157bf
17 changed files with 1004 additions and 402 deletions

1
.gitignore vendored
View File

@@ -166,5 +166,6 @@ Crawl4AI.egg-info/*
crawler_data.db
.vscode/
test_pad.py
test_pad*.py
.data/
Crawl4AI.egg-info/

View File

@@ -56,40 +56,28 @@ pip install -e .
2. Import the necessary modules in your Python script:
```python
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.models import UrlModel
from crawl4ai.chunking_strategy import *
from crawl4ai.extraction_strategy import *
import os
crawler = WebCrawler(db_path='crawler_data.db')
crawler = WebCrawler()
crawler.warmup() # IMPORTANT: Warmup the engine before running the first crawl
# Single page crawl
single_url = UrlModel(url='https://kidocode.com', forced=False)
result = crawl4ai.fetch_page(
single_url,
provider= "openai/gpt-3.5-turbo",
api_token = os.getenv('OPENAI_API_KEY'),
# Set `extract_blocks_flag` to True to enable the LLM to generate semantically clustered chunks
# and return them as JSON. Depending on the model and data size, this may take up to 1 minute.
# Without this setting, it will take between 5 to 20 seconds.
extract_blocks_flag=False
word_count_threshold=5 # Minimum word count for a HTML tag to be considered as a worthy block
result = crawler.run(
url='https://www.nbcnews.com/business',
word_count_threshold=5, # Minimum word count for a HTML tag to be considered as a worthy block
chunking_strategy= RegexChunking( patterns = ["\n\n"]), # Default is RegexChunking
extraction_strategy= CosineStrategy(word_count_threshold=20, max_dist=0.2, linkage_method='ward', top_k=3) # Default is CosineStrategy
# extraction_strategy= LLMExtractionStrategy(provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY')),
bypass_cache=False,
extract_blocks =True, # Whether to extract semantical blocks of text from the HTML
css_selector = "", # Eg: "div.article-body"
verbose=True,
include_raw_html=True, # Whether to include the raw HTML content in the response
)
print(result.model_dump())
# Multiple page crawl
urls = [
UrlModel(url='http://example.com', forced=False),
UrlModel(url='http://example.org', forced=False)
]
results = crawl4ai.fetch_pages(
urls,
provider= "openai/gpt-3.5-turbo",
api_token = os.getenv('OPENAI_API_KEY'),
extract_blocks_flag=True,
word_count_threshold=5
)
for res in results:
print(res.model_dump())
```
Running for the first time will download the chrome driver for selenium. Also creates a SQLite database file `crawler_data.db` in the current directory. This file will store the crawled data for future reference.
@@ -150,23 +138,22 @@ Set `extract_blocks_flag` to True to enable the LLM to generate semantically clu
import requests
import os
url = "http://localhost:8000/crawl" # Replace with the appropriate server URL
data = {
"urls": [
"https://example.com"
"https://www.nbcnews.com/business"
],
"provider_model": "groq/llama3-70b-8192",
"api_token": "your_api_token",
"include_raw_html": true,
"forced": false,
# Set `extract_blocks_flag` to True to enable the LLM to generate semantically clustered chunks
# and return them as JSON. Depending on the model and data size, this may take up to 1 minute.
# Without this setting, it will take between 5 to 20 seconds.
"extract_blocks_flag": False,
"word_count_threshold": 5
"bypass_cache": false,
"extract_blocks": true,
"word_count_threshold": 10,
"extraction_strategy": "CosineStrategy",
"chunking_strategy": "RegexChunking",
"css_selector": "",
"verbose": true
}
response = requests.post(url, json=data)
response = requests.post("http://crawl4ai.uccode.io/crawl", json=data) # OR http://localhost:8000 if your run locally
if response.status_code == 200:
result = response.json()["results"][0]
@@ -180,9 +167,9 @@ else:
print("Error:", response.status_code, response.text)
```
This code sends a POST request to the Crawl4AI server running on localhost, specifying the target URL (`https://example.com`) and the desired options (`grq_api_token`, `include_raw_html`, and `forced`). The server processes the request and returns the crawled data in JSON format.
This code sends a POST request to the Crawl4AI server (`http://crawl4ai.uccode.io/crawl`, or `http://localhost:8000/crawl` if you run it locally), specifying the target URL (`https://www.nbcnews.com/business`) and the desired options. The server processes the request and returns the crawled data in JSON format.
The response from the server includes the parsed JSON, cleaned HTML, and markdown representations of the crawled webpage. You can access and use this data in your Python application as needed.
The response from the server includes the semantical clusters, cleaned HTML, and markdown representations of the crawled webpage. You can access and use this data in your Python application as needed.
Make sure to replace `"http://localhost:8000/crawl"` with the appropriate server URL if your Crawl4AI server is running on a different host or port.
@@ -194,15 +181,17 @@ That's it! You can now integrate Crawl4AI into your Python projects and leverage
## 📖 Parameters
| Parameter | Description | Required | Default Value |
|----------------------|-------------------------------------------------------------------------------------------------|----------|---------------|
| `urls` | A list of URLs to crawl and extract data from. | Yes | - |
| `provider_model` | The provider and model to use for extracting relevant information (e.g., "groq/llama3-70b-8192"). | Yes | - |
| `api_token` | Your API token for the specified provider. | Yes | - |
| `include_raw_html` | Whether to include the raw HTML content in the response. | No | `false` |
| `forced` | Whether to force a fresh crawl even if the URL has been previously crawled. | No | `false` |
| `extract_blocks_flag`| Whether to extract semantical blocks of text from the HTML. | No | `false` |
| `word_count_threshold` | The minimum number of words a block must contain to be considered meaningful (minimum value is 5). | No | `5` |
| Parameter | Description | Required | Default Value |
|-----------------------|-------------------------------------------------------------------------------------------------------|----------|---------------------|
| `urls` | A list of URLs to crawl and extract data from. | Yes | - |
| `include_raw_html` | Whether to include the raw HTML content in the response. | No | `false` |
| `bypass_cache` | Whether to force a fresh crawl even if the URL has been previously crawled. | No | `false` |
| `extract_blocks` | Whether to extract semantical blocks of text from the HTML. | No | `true` |
| `word_count_threshold`| The minimum number of words a block must contain to be considered meaningful (minimum value is 5). | No | `5` |
| `extraction_strategy` | The strategy to use for extracting content from the HTML (e.g., "CosineStrategy"). | No | `CosineStrategy` |
| `chunking_strategy` | The strategy to use for chunking the text before processing (e.g., "RegexChunking"). | No | `RegexChunking` |
| `css_selector` | The CSS selector to target specific parts of the HTML for extraction. | No | `None` |
| `verbose` | Whether to enable verbose logging. | No | `true` |
## 🛠️ Configuration
Crawl4AI allows you to configure various parameters and settings in the `crawler/config.py` file. Here's an example of how you can adjust the parameters:
@@ -213,15 +202,17 @@ from dotenv import load_dotenv
load_dotenv() # Load environment variables from .env file
# Default provider
# Default provider, ONLY used when the extraction strategy is LLMExtractionStrategy
DEFAULT_PROVIDER = "openai/gpt-4-turbo"
# Provider-model dictionary
# Provider-model dictionary, ONLY used when the extraction strategy is LLMExtractionStrategy
PROVIDER_MODELS = {
"ollama/llama3": "no-token-needed", # Any model from Ollama no need for API token
"groq/llama3-70b-8192": os.getenv("GROQ_API_KEY"),
"groq/llama3-8b-8192": os.getenv("GROQ_API_KEY"),
"openai/gpt-3.5-turbo": os.getenv("OPENAI_API_KEY"),
"openai/gpt-4-turbo": os.getenv("OPENAI_API_KEY"),
"openai/gpt-4o": os.getenv("OPENAI_API_KEY"),
"anthropic/claude-3-haiku-20240307": os.getenv("ANTHROPIC_API_KEY"),
"anthropic/claude-3-opus-20240229": os.getenv("ANTHROPIC_API_KEY"),
"anthropic/claude-3-sonnet-20240229": os.getenv("ANTHROPIC_API_KEY"),
@@ -229,12 +220,14 @@ PROVIDER_MODELS = {
# Chunk token threshold
CHUNK_TOKEN_THRESHOLD = 1000
# Threshold for the minimum number of words in an HTML tag to be considered
MIN_WORD_THRESHOLD = 5
```
In the `crawler/config.py` file, you can:
REMEMBER: You only need to set the API keys for the providers if you choose `LLMExtractionStrategy` as the extraction strategy. If you choose `CosineStrategy`, you don't need to set any API keys.
- Set the default provider using the `DEFAULT_PROVIDER` variable.
- Add or modify the provider-model dictionary (`PROVIDER_MODELS`) to include your desired providers and their corresponding API keys. Crawl4AI supports various providers such as Groq, OpenAI, Anthropic, and more. You can add any provider supported by LiteLLM, as well as Ollama.
- Adjust the `CHUNK_TOKEN_THRESHOLD` value to control the splitting of web content into chunks for parallel processing. A higher value means fewer chunks and faster processing, but it may cause issues with weaker LLMs during extraction.

View File

@@ -3,15 +3,17 @@ from dotenv import load_dotenv
load_dotenv() # Load environment variables from .env file
# Default provider
# Default provider, ONLY used when the extraction strategy is LLMExtractionStrategy
DEFAULT_PROVIDER = "openai/gpt-4-turbo"
# Provider-model dictionary
# Provider-model dictionary, ONLY used when the extraction strategy is LLMExtractionStrategy
PROVIDER_MODELS = {
"ollama/llama3": "no-token-needed", # Any model from Ollama no need for API token
"groq/llama3-70b-8192": os.getenv("GROQ_API_KEY"),
"groq/llama3-8b-8192": os.getenv("GROQ_API_KEY"),
"openai/gpt-3.5-turbo": os.getenv("OPENAI_API_KEY"),
"openai/gpt-4-turbo": os.getenv("OPENAI_API_KEY"),
"openai/gpt-4o": os.getenv("OPENAI_API_KEY"),
"anthropic/claude-3-haiku-20240307": os.getenv("ANTHROPIC_API_KEY"),
"anthropic/claude-3-opus-20240229": os.getenv("ANTHROPIC_API_KEY"),
"anthropic/claude-3-sonnet-20240229": os.getenv("ANTHROPIC_API_KEY"),

View File

@@ -5,18 +5,20 @@ from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import InvalidArgumentException
import chromedriver_autoinstaller
from typing import List
import requests
import os
from pathlib import Path
class CrawlerStrategy(ABC):
@abstractmethod
def crawl(self, url: str) -> str:
def crawl(self, url: str, **kwargs) -> str:
pass
class CloudCrawlerStrategy(CrawlerStrategy):
def crawl(self, url: str) -> str:
def crawl(self, url: str, use_cached_html = False, css_selector = None) -> str:
data = {
"urls": [url],
"provider_model": "",
@@ -40,19 +42,34 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
self.options.add_argument("--disable-dev-shm-usage")
self.options.add_argument("--headless")
chromedriver_autoinstaller.install()
# chromedriver_autoinstaller.install()
self.service = Service(chromedriver_autoinstaller.install())
self.driver = webdriver.Chrome(service=self.service, options=self.options)
def crawl(self, url: str, use_cached_html = False) -> str:
def crawl(self, url: str, use_cached_html = False, css_selector = None) -> str:
if use_cached_html:
return get_content_of_website(url)
self.driver.get(url)
WebDriverWait(self.driver, 10).until(
EC.presence_of_all_elements_located((By.TAG_NAME, "html"))
)
html = self.driver.page_source
return html
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url.replace("/", "_"))
if os.path.exists(cache_file_path):
with open(cache_file_path, "r") as f:
return f.read()
try:
self.driver.get(url)
WebDriverWait(self.driver, 10).until(
EC.presence_of_all_elements_located((By.TAG_NAME, "html"))
)
html = self.driver.page_source
# Store in cache
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url.replace("/", "_"))
with open(cache_file_path, "w") as f:
f.write(html)
return html
except InvalidArgumentException:
raise InvalidArgumentException(f"Invalid URL {url}")
except Exception as e:
raise Exception(f"Failed to crawl {url}: {str(e)}")
def quit(self):
self.driver.quit()

View File

@@ -1,7 +1,15 @@
import os
from pathlib import Path
import sqlite3
from typing import Optional
from typing import Optional, Tuple
DB_PATH = os.path.join(Path.home(), ".crawl4ai")
os.makedirs(DB_PATH, exist_ok=True)
DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")
def init_db(db_path: str):
global DB_PATH
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
cursor.execute('''
@@ -16,46 +24,65 @@ def init_db(db_path: str):
''')
conn.commit()
conn.close()
DB_PATH = db_path
def get_cached_url(db_path: str, url: str) -> Optional[tuple]:
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
cursor.execute('SELECT url, html, cleaned_html, markdown, parsed_json, success FROM crawled_data WHERE url = ?', (url,))
result = cursor.fetchone()
conn.close()
return result
def check_db_path():
    """Ensure the module-level ``DB_PATH`` holds a usable value.

    Raises:
        ValueError: if ``DB_PATH`` is falsy (unset or an empty string).
    """
    if DB_PATH:
        return
    raise ValueError("Database path is not set or is empty.")
def cache_url(db_path: str, url: str, html: str, cleaned_html: str, markdown: str, parsed_json: str, success: bool):
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
cursor.execute('''
INSERT INTO crawled_data (url, html, cleaned_html, markdown, parsed_json, success)
VALUES (?, ?, ?, ?, ?, ?)
ON CONFLICT(url) DO UPDATE SET
html = excluded.html,
cleaned_html = excluded.cleaned_html,
markdown = excluded.markdown,
parsed_json = excluded.parsed_json,
success = excluded.success
''', (str(url), html, cleaned_html, markdown, parsed_json, success))
conn.commit()
conn.close()
def get_total_count(db_path: str) -> int:
def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool]]:
check_db_path()
try:
conn = sqlite3.connect(db_path)
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute('SELECT url, html, cleaned_html, markdown, parsed_json, success FROM crawled_data WHERE url = ?', (url,))
result = cursor.fetchone()
conn.close()
return result
except Exception as e:
print(f"Error retrieving cached URL: {e}")
return None
def cache_url(url: str, html: str, cleaned_html: str, markdown: str, parsed_json: str, success: bool):
    """Insert or update the ``crawled_data`` row for ``url``.

    The statement is an upsert: when a row with the same ``url`` already
    exists, its ``html``, ``cleaned_html``, ``markdown``, ``parsed_json`` and
    ``success`` columns are overwritten with the new values.

    Caching is best-effort: any error raised while talking to SQLite is
    printed and swallowed rather than propagated to the caller.

    Args:
        url: Value stored in (and used as the conflict key for) the ``url`` column.
        html: Value stored in the ``html`` column.
        cleaned_html: Value stored in the ``cleaned_html`` column.
        markdown: Value stored in the ``markdown`` column.
        parsed_json: Value stored in the ``parsed_json`` column.
        success: Value stored in the ``success`` column.

    Raises:
        ValueError: from ``check_db_path()`` when ``DB_PATH`` is empty.
    """
    check_db_path()
    conn = None
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        cursor.execute('''
            INSERT INTO crawled_data (url, html, cleaned_html, markdown, parsed_json, success)
            VALUES (?, ?, ?, ?, ?, ?)
            ON CONFLICT(url) DO UPDATE SET
                html = excluded.html,
                cleaned_html = excluded.cleaned_html,
                markdown = excluded.markdown,
                parsed_json = excluded.parsed_json,
                success = excluded.success
        ''', (url, html, cleaned_html, markdown, parsed_json, success))
        conn.commit()
    except Exception as e:
        print(f"Error caching URL: {e}")
    finally:
        # Close on every path; previously a failing execute/commit skipped
        # conn.close() and leaked the connection.
        if conn is not None:
            conn.close()
def get_total_count() -> int:
    """Return the number of rows in the ``crawled_data`` table.

    Best-effort: if the query fails for any reason the error is printed and
    ``0`` is returned instead of raising.

    Returns:
        The ``COUNT(*)`` of ``crawled_data``, or ``0`` on error.

    Raises:
        ValueError: from ``check_db_path()`` when ``DB_PATH`` is empty.
    """
    check_db_path()
    conn = None
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        cursor.execute('SELECT COUNT(*) FROM crawled_data')
        result = cursor.fetchone()
        return result[0]
    except Exception as e:
        print(f"Error getting total count: {e}")
        return 0
    finally:
        # Close on every path; previously an exception skipped conn.close()
        # and leaked the connection.
        if conn is not None:
            conn.close()
# Function to clear the database
def clear_db(db_path: str):
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
cursor.execute('DELETE FROM crawled_data')
conn.commit()
conn.close()
def clear_db():
    """Delete every row from the ``crawled_data`` table.

    Best-effort: any error raised while talking to SQLite is printed and
    swallowed rather than propagated to the caller.

    Raises:
        ValueError: from ``check_db_path()`` when ``DB_PATH`` is empty.
    """
    check_db_path()
    conn = None
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        cursor.execute('DELETE FROM crawled_data')
        conn.commit()
    except Exception as e:
        print(f"Error clearing database: {e}")
    finally:
        # Close on every path; previously an exception skipped conn.close()
        # and leaked the connection.
        if conn is not None:
            conn.close()

View File

@@ -7,6 +7,8 @@ from .prompts import PROMPT_EXTRACT_BLOCKS
from .config import *
from .utils import *
from functools import partial
from .model_loader import load_bert_base_uncased, load_bge_small_en_v1_5, load_spacy_model
class ExtractionStrategy(ABC):
"""
@@ -15,6 +17,7 @@ class ExtractionStrategy(ABC):
def __init__(self):
self.DEL = "<|DEL|>"
self.name = self.__class__.__name__
@abstractmethod
def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
@@ -67,7 +70,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
def extract(self, url: str, html: str) -> List[Dict[str, Any]]:
print("Extracting blocks ...")
print("[LOG] Extracting blocks from URL:", url)
variable_values = {
"URL": url,
"HTML": escape_json_string(sanitize_html(html)),
@@ -98,7 +101,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
"content": unparsed
})
print("Extracted", len(blocks), "blocks.")
print("[LOG] Extracted", len(blocks), "blocks from URL:", url)
return blocks
def _merge(self, documents):
@@ -125,6 +128,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
"""
Process sections sequentially with a delay for rate limiting issues, specifically for LLMExtractionStrategy.
"""
merged_sections = self._merge(sections)
parsed_json = []
if self.provider.startswith("groq/"):
@@ -144,7 +148,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
return parsed_json
class CosinegStrategy(ExtractionStrategy):
class CosineStrategy(ExtractionStrategy):
def __init__(self, word_count_threshold=20, max_dist=0.2, linkage_method='ward', top_k=3, model_name = 'BAAI/bge-small-en-v1.5'):
"""
Initialize the strategy with clustering parameters.
@@ -166,18 +170,11 @@ class CosinegStrategy(ExtractionStrategy):
self.timer = time.time()
if model_name == "bert-base-uncased":
self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', resume_download=None)
self.model = BertModel.from_pretrained('bert-base-uncased', resume_download=None)
elif model_name == "sshleifer/distilbart-cnn-12-6":
# self.model = IPEXModel.from_pretrained("Intel/bge-small-en-v1.5-rag-int8-static")
# self.tokenizer = AutoTokenizer.from_pretrained("Intel/bge-small-en-v1.5-rag-int8-static")
pass
self.tokenizer, self.model = load_bert_base_uncased()
elif model_name == "BAAI/bge-small-en-v1.5":
self.tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-small-en-v1.5', resume_download=None)
self.model = AutoModel.from_pretrained('BAAI/bge-small-en-v1.5', resume_download=None)
self.model.eval()
self.tokenizer, self.model = load_bge_small_en_v1_5()
self.nlp = spacy.load("models/reuters")
self.nlp = load_spacy_model()
print(f"[LOG] Model loaded {model_name}, models/reuters, took " + str(time.time() - self.timer) + " seconds")
def get_embeddings(self, sentences: List[str]):

20
crawl4ai/model_loader.py Normal file
View File

@@ -0,0 +1,20 @@
from functools import lru_cache
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
import spacy
@lru_cache()
def load_bert_base_uncased():
    """Load the ``bert-base-uncased`` tokenizer and model.

    The function takes no arguments and is wrapped in ``lru_cache``, so the
    pair is built on the first call and the same objects are returned on
    later calls.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    checkpoint = 'bert-base-uncased'
    return (
        BertTokenizer.from_pretrained(checkpoint, resume_download=None),
        BertModel.from_pretrained(checkpoint, resume_download=None),
    )
@lru_cache()
def load_bge_small_en_v1_5():
    """Load the ``BAAI/bge-small-en-v1.5`` tokenizer and model.

    The function takes no arguments and is wrapped in ``lru_cache``, so the
    pair is built on the first call and the same objects are returned on
    later calls.

    Returns:
        A ``(tokenizer, model)`` tuple; ``eval()`` has been called on the model.
    """
    checkpoint = 'BAAI/bge-small-en-v1.5'
    bge_tokenizer = AutoTokenizer.from_pretrained(checkpoint, resume_download=None)
    bge_model = AutoModel.from_pretrained(checkpoint, resume_download=None)
    # Put the model in evaluation mode before handing it out.
    bge_model.eval()
    return bge_tokenizer, bge_model
@lru_cache()
def load_spacy_model():
    """Load the spaCy pipeline identified by ``"models/reuters"``.

    The function takes no arguments and is wrapped in ``lru_cache``, so the
    pipeline is loaded on the first call and reused afterwards.

    NOTE(review): ``"models/reuters"`` looks like a path relative to the
    current working directory -- confirm where this is expected to run from.
    """
    reuters_pipeline = spacy.load("models/reuters")
    return reuters_pipeline

View File

@@ -10,6 +10,8 @@ from html2text import HTML2Text
from .prompts import PROMPT_EXTRACT_BLOCKS
from .config import *
class InvalidCSSSelectorError(Exception):
pass
def beautify_html(escaped_html):
"""
@@ -140,14 +142,26 @@ class CustomHTML2Text(HTML2Text):
super().handle_tag(tag, attrs, start)
def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD):
def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None):
try:
if not html:
return None
# Parse HTML content with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')
# Get the content within the <body> tag
body = soup.body
# If css_selector is provided, extract content based on the selector
if css_selector:
selected_elements = body.select(css_selector)
if not selected_elements:
raise InvalidCSSSelectorError(f"Invalid CSS selector , No elements found for CSS selector: {css_selector}")
div_tag = soup.new_tag('div')
for el in selected_elements:
div_tag.append(el)
body = div_tag
# Remove script, style, and other tags that don't carry useful content from body
for tag in body.find_all(['script', 'style', 'link', 'meta', 'noscript']):
tag.decompose()
@@ -255,7 +269,7 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD):
# Remove comments
for comment in soup.find_all(text=lambda text: isinstance(text, Comment)):
for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
comment.extract()
# Remove consecutive empty newlines and replace multiple spaces with a single space
@@ -281,7 +295,7 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD):
except Exception as e:
print('Error processing HTML content:', str(e))
return None
raise InvalidCSSSelectorError(f"Invalid CSS selector: {css_selector}") from e
def extract_xml_tags(string):
tags = re.findall(r'<(\w+)>', string)

View File

@@ -2,7 +2,7 @@ import os, time
from pathlib import Path
from .models import UrlModel, CrawlResult
from .database import init_db, get_cached_url, cache_url
from .database import init_db, get_cached_url, cache_url, DB_PATH
from .utils import *
from .chunking_strategy import *
from .extraction_strategy import *
@@ -10,6 +10,7 @@ from .crawler_strategy import *
from typing import List
from concurrent.futures import ThreadPoolExecutor
from .config import *
# from .model_loader import load_bert_base_uncased, load_bge_small_en_v1_5, load_spacy_model
class WebCrawler:
@@ -36,11 +37,11 @@ class WebCrawler:
def warmup(self):
print("[LOG] 🌤️ Warming up the WebCrawler")
single_url = UrlModel(url='https://crawl4ai.uccode.io/', forced=False)
result = self.run(
single_url,
url='https://crawl4ai.uccode.io/',
word_count_threshold=5,
extraction_strategy= CosinegStrategy(),
extraction_strategy= CosineStrategy(),
bypass_cache=False,
verbose = False
)
self.ready = True
@@ -60,10 +61,11 @@ class WebCrawler:
**kwargs,
) -> CrawlResult:
return self.run(
url_model,
url_model.url,
word_count_threshold,
extraction_strategy,
chunking_strategy,
bypass_cache=url_model.forced,
**kwargs,
)
pass
@@ -71,77 +73,85 @@ class WebCrawler:
def run(
self,
url_model: UrlModel,
url: str,
word_count_threshold=MIN_WORD_THRESHOLD,
extraction_strategy: ExtractionStrategy = NoExtractionStrategy(),
chunking_strategy: ChunkingStrategy = RegexChunking(),
bypass_cache: bool = False,
css_selector: str = None,
verbose=True,
**kwargs,
) -> CrawlResult:
# Check if extraction strategy is an instance of ExtractionStrategy if not raise an error
if not isinstance(extraction_strategy, ExtractionStrategy):
raise ValueError("Unsupported extraction strategy")
if not isinstance(chunking_strategy, ChunkingStrategy):
raise ValueError("Unsupported chunking strategy")
# make sure word_count_threshold is not lesser than MIN_WORD_THRESHOLD
if word_count_threshold < MIN_WORD_THRESHOLD:
word_count_threshold = MIN_WORD_THRESHOLD
# Check cache first
cached = get_cached_url(self.db_path, str(url_model.url))
if cached and not url_model.forced:
return CrawlResult(
**{
"url": cached[0],
"html": cached[1],
"cleaned_html": cached[2],
"markdown": cached[3],
"parsed_json": cached[4],
"success": cached[5],
"error_message": "",
}
)
if not bypass_cache:
cached = get_cached_url(url)
if cached:
return CrawlResult(
**{
"url": cached[0],
"html": cached[1],
"cleaned_html": cached[2],
"markdown": cached[3],
"parsed_json": cached[4],
"success": cached[5],
"error_message": "",
}
)
# Initialize WebDriver for crawling
t = time.time()
try:
html = self.crawler_strategy.crawl(str(url_model.url))
success = True
error_message = ""
except Exception as e:
html = ""
success = False
error_message = str(e)
html = self.crawler_strategy.crawl(url)
success = True
error_message = ""
# Extract content from HTML
result = get_content_of_website(html, word_count_threshold)
try:
result = get_content_of_website(html, word_count_threshold, css_selector=css_selector)
if result is None:
raise ValueError(f"Failed to extract content from the website: {url}")
except InvalidCSSSelectorError as e:
raise ValueError(str(e))
cleaned_html = result.get("cleaned_html", html)
markdown = result.get("markdown", "")
# Print a profession LOG style message, show time taken and say crawling is done
if verbose:
print(
f"[LOG] 🚀 Crawling done for {url_model.url}, success: {success}, time taken: {time.time() - t} seconds"
f"[LOG] 🚀 Crawling done for {url}, success: {success}, time taken: {time.time() - t} seconds"
)
parsed_json = []
if verbose:
print(f"[LOG] 🔥 Extracting semantic blocks for {url_model.url}")
print(f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}")
t = time.time()
# Split markdown into sections
sections = chunking_strategy.chunk(markdown)
# sections = merge_chunks_based_on_token_threshold(sections, CHUNK_TOKEN_THRESHOLD)
parsed_json = extraction_strategy.run(
str(url_model.url), sections,
url, sections,
)
parsed_json = json.dumps(parsed_json)
if verbose:
print(
f"[LOG] 🚀 Extraction done for {url_model.url}, time taken: {time.time() - t} seconds."
f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t} seconds."
)
# Cache the result
cleaned_html = beautify_html(cleaned_html)
cache_url(
self.db_path,
str(url_model.url),
url,
html,
cleaned_html,
markdown,
@@ -150,7 +160,7 @@ class WebCrawler:
)
return CrawlResult(
url=str(url_model.url),
url=url,
html=html,
cleaned_html=cleaned_html,
markdown=markdown,

View File

@@ -0,0 +1,12 @@
{
"RegexChunking": "### RegexChunking\n\n`RegexChunking` is a text chunking strategy that splits a given text into smaller parts using regular expressions.\nThis is useful for preparing large texts for processing by language models, ensuring they are divided into manageable segments.\n\n#### Constructor Parameters:\n- `patterns` (list, optional): A list of regular expression patterns used to split the text. Default is to split by double newlines (`['\\n\\n']`).\n\n#### Example usage:\n```python\nchunker = RegexChunking(patterns=[r'\\n\\n', r'\\. '])\nchunks = chunker.chunk(\"This is a sample text. It will be split into chunks.\")\n```",
"NlpSentenceChunking": "### NlpSentenceChunking\n\n`NlpSentenceChunking` uses a natural language processing model to chunk a given text into sentences. This approach leverages SpaCy to accurately split text based on sentence boundaries.\n\n#### Constructor Parameters:\n- `model` (str, optional): The SpaCy model to use for sentence detection. Default is `'en_core_web_sm'`.\n\n#### Example usage:\n```python\nchunker = NlpSentenceChunking(model='en_core_web_sm')\nchunks = chunker.chunk(\"This is a sample text. It will be split into sentences.\")\n```",
"TopicSegmentationChunking": "### TopicSegmentationChunking\n\n`TopicSegmentationChunking` uses the TextTiling algorithm to segment a given text into topic-based chunks. This method identifies thematic boundaries in the text.\n\n#### Constructor Parameters:\n- `num_keywords` (int, optional): The number of keywords to extract for each topic segment. Default is `3`.\n\n#### Example usage:\n```python\nchunker = TopicSegmentationChunking(num_keywords=3)\nchunks = chunker.chunk(\"This is a sample text. It will be split into topic-based segments.\")\n```",
"FixedLengthWordChunking": "### FixedLengthWordChunking\n\n`FixedLengthWordChunking` splits a given text into chunks of fixed length, based on the number of words.\n\n#### Constructor Parameters:\n- `chunk_size` (int, optional): The number of words in each chunk. Default is `100`.\n\n#### Example usage:\n```python\nchunker = FixedLengthWordChunking(chunk_size=100)\nchunks = chunker.chunk(\"This is a sample text. It will be split into fixed-length word chunks.\")\n```",
"SlidingWindowChunking": "### SlidingWindowChunking\n\n`SlidingWindowChunking` uses a sliding window approach to chunk a given text. Each chunk has a fixed length, and the window slides by a specified step size.\n\n#### Constructor Parameters:\n- `window_size` (int, optional): The number of words in each chunk. Default is `100`.\n- `step` (int, optional): The number of words to slide the window. Default is `50`.\n\n#### Example usage:\n```python\nchunker = SlidingWindowChunking(window_size=100, step=50)\nchunks = chunker.chunk(\"This is a sample text. It will be split using a sliding window approach.\")\n```"
}

View File

@@ -0,0 +1,10 @@
{
"NoExtractionStrategy": "### NoExtractionStrategy\n\n`NoExtractionStrategy` is a basic extraction strategy that returns the entire HTML content without any modification. It is useful for cases where no specific extraction is required. Only the cleaned HTML and markdown are produced.\n\n#### Constructor Parameters:\nNone.\n\n#### Example usage:\n```python\nextractor = NoExtractionStrategy()\nextracted_content = extractor.extract(url, html)\n```",
"LLMExtractionStrategy": "### LLMExtractionStrategy\n\n`LLMExtractionStrategy` uses a Language Model (LLM) to extract meaningful blocks or chunks from the given HTML content. This strategy leverages an external provider for language model completions.\n\n#### Constructor Parameters:\n- `provider` (str, optional): The provider to use for the language model completions. Default is `DEFAULT_PROVIDER` (following provider/model eg. openai/gpt-4o).\n- `api_token` (str, optional): The API token for the provider. If not provided, it will try to load from the environment variable `OPENAI_API_KEY`.\n\n#### Example usage:\n```python\nextractor = LLMExtractionStrategy(provider='openai', api_token='your_api_token')\nextracted_content = extractor.extract(url, html)\n```",
"CosineStrategy": "### CosineStrategy\n\n`CosineStrategy` uses hierarchical clustering based on cosine similarity to extract clusters of text from the given HTML content. This strategy is suitable for identifying related content sections.\n\n#### Constructor Parameters:\n- `word_count_threshold` (int, optional): Minimum number of words per cluster. Default is `20`.\n- `max_dist` (float, optional): The maximum cophenetic distance on the dendrogram to form clusters. Default is `0.2`.\n- `linkage_method` (str, optional): The linkage method for hierarchical clustering. Default is `'ward'`.\n- `top_k` (int, optional): Number of top categories to extract. Default is `3`.\n- `model_name` (str, optional): The model name for embedding generation. Default is `'BAAI/bge-small-en-v1.5'`.\n\n#### Example usage:\n```python\nextractor = CosineStrategy(word_count_threshold=20, max_dist=0.2, linkage_method='ward', top_k=3, model_name='BAAI/bge-small-en-v1.5')\nextracted_content = extractor.extract(url, html)\n```",
"TopicExtractionStrategy": "### TopicExtractionStrategy\n\n`TopicExtractionStrategy` uses the TextTiling algorithm to segment the HTML content into topics and extracts keywords for each segment. This strategy is useful for identifying and summarizing thematic content.\n\n#### Constructor Parameters:\n- `num_keywords` (int, optional): Number of keywords to represent each topic segment. Default is `3`.\n\n#### Example usage:\n```python\nextractor = TopicExtractionStrategy(num_keywords=3)\nextracted_content = extractor.extract(url, html)\n```"
}

33
docs/quickstart.py Normal file
View File

@@ -0,0 +1,33 @@
import os
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import *
from crawl4ai.extraction_strategy import *
def main():
    """Run the quickstart: warm up a crawler, crawl one page, print the result."""
    crawler = WebCrawler()
    crawler.warmup()

    # Strategy objects passed to the crawl below.
    chunker = RegexChunking(patterns=["\n\n"])  # Default is RegexChunking
    extractor = CosineStrategy(
        word_count_threshold=20, max_dist=0.2, linkage_method="ward", top_k=3
    )  # Default is CosineStrategy
    # Alternative extraction strategy:
    # extractor = LLMExtractionStrategy(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'))

    # Single page crawl
    crawl_options = {
        "url": "https://www.nbcnews.com/business",
        # Minimum word count for a HTML tag to be considered as a worthy block
        "word_count_threshold": 5,
        "chunking_strategy": chunker,
        "extraction_strategy": extractor,
        "bypass_cache": True,
        # Whether to extract semantical blocks of text from the HTML
        "extract_blocks": True,
        # Eg: "div.article-body" or all H2 tags like "h2"
        "css_selector": "",
        "verbose": True,
        # Whether to include the raw HTML content in the response
        "include_raw_html": True,
    }
    result = crawler.run(**crawl_options)

    print("[LOG] 📦 Crawl result:")
    print(result.model_dump())
if __name__ == "__main__":
main()

View File

@@ -1,32 +0,0 @@
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.models import UrlModel
from crawl4ai.utils import get_content_of_website
import os
def main():
    """Fetch a single page with the crawler and print the dumped result."""
    # The crawler is created with just the database path
    crawler = WebCrawler(db_path='crawler_data.db')

    # Single-page fetch: the options are gathered first, then passed as keywords
    target = UrlModel(url='https://www.nbcnews.com/business', forced=True)
    fetch_options = {
        "provider": "openai/gpt-3.5-turbo",
        "api_token": os.getenv('OPENAI_API_KEY'),
        "use_cached_html": True,
        "extract_blocks_flag": True,
        "word_count_threshold": 10,
    }
    result = crawler.fetch_page(target, **fetch_options)
    print(result.model_dump())

    # Fetch multiple pages
    # urls = [
    #     UrlModel(url='http://example.com', forced=False),
    #     UrlModel(url='http://example.org', forced=False)
    # ]
    # results = crawler.fetch_pages(urls, provider= "openai/gpt-4-turbo", api_token = os.getenv('OPENAI_API_KEY'))
    # for res in results:
    #     print(res.model_copy())
if __name__ == '__main__':
main()

151
main.py
View File

@@ -1,24 +1,19 @@
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import HTMLResponse
from fastapi.staticfiles import StaticFiles
from fastapi.responses import JSONResponse
from pydantic import BaseModel, HttpUrl
from typing import List, Optional
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.models import UrlModel
import asyncio
from concurrent.futures import ThreadPoolExecutor, as_completed
import chromedriver_autoinstaller
from functools import lru_cache
from crawl4ai.database import get_total_count, clear_db
import os
import uuid
# Import the CORS middleware
import importlib
import asyncio
from functools import lru_cache
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import HTMLResponse, JSONResponse
from fastapi.staticfiles import StaticFiles
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, HttpUrl
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Optional
# Task management
tasks = {}
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.database import get_total_count, clear_db
# Configuration
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
@@ -41,22 +36,25 @@ app.add_middleware(
# Mount the pages directory as a static directory
app.mount("/pages", StaticFiles(directory=__location__ + "/pages"), name="pages")
# chromedriver_autoinstaller.install() # Ensure chromedriver is installed
@lru_cache()
def get_crawler():
    """Return a WebCrawler; lru_cache makes every call after the first reuse it."""
    crawler = WebCrawler()
    return crawler
chromedriver_autoinstaller.install() # Ensure chromedriver is installed
class UrlsInput(BaseModel):
class CrawlRequest(BaseModel):
urls: List[HttpUrl]
provider_model: str
api_token: str
include_raw_html: Optional[bool] = False
forced: bool = False
bypass_cache: bool = False
extract_blocks: bool = True
word_count_threshold: Optional[int] = 5
extraction_strategy: Optional[str] = "CosineStrategy"
chunking_strategy: Optional[str] = "RegexChunking"
css_selector: Optional[str] = None
verbose: Optional[bool] = True
@lru_cache()
def get_crawler():
    """Return a WebCrawler bound to 'crawler_data.db'; lru_cache reuses it after the first call."""
    crawler = WebCrawler(db_path='crawler_data.db')
    return crawler
@app.get("/", response_class=HTMLResponse)
async def read_index():
@@ -66,20 +64,30 @@ async def read_index():
@app.get("/total-count")
async def get_total_url_count():
count = get_total_count(db_path='crawler_data.db')
count = get_total_count()
return JSONResponse(content={"count": count})
# Add endpoint to clear the database
@app.get("/clear-db")
async def clear_database():
clear_db(db_path='crawler_data.db')
clear_db()
return JSONResponse(content={"message": "Database cleared."})
def import_strategy(module_name: str, class_name: str):
    """Import ``module_name`` and return a new instance of its ``class_name`` class.

    The class is instantiated with no arguments.

    Args:
        module_name: Dotted path of the module to import.
        class_name: Name of the class to look up in that module.

    Returns:
        A freshly constructed instance of the requested class.

    Raises:
        HTTPException: 400 when the module cannot be imported, or when the
            module has no class called ``class_name``.
    """
    try:
        module = importlib.import_module(module_name)
    except ImportError as err:
        raise HTTPException(status_code=400, detail=f"Module {module_name} not found.") from err

    # Only the lookup is guarded: an AttributeError or ImportError raised inside
    # the constructor is a server-side failure and must not be reported as a
    # missing class or module.
    try:
        strategy_class = getattr(module, class_name)
    except AttributeError as err:
        raise HTTPException(status_code=400, detail=f"Class {class_name} not found in {module_name}.") from err

    # The name originates from the request, so refuse anything that is not a
    # class (functions, submodules, constants) instead of calling it.
    if not isinstance(strategy_class, type):
        raise HTTPException(status_code=400, detail=f"Class {class_name} not found in {module_name}.")

    return strategy_class()
@app.post("/crawl")
async def crawl_urls(urls_input: UrlsInput, request: Request):
async def crawl_urls(crawl_request: CrawlRequest, request: Request):
global current_requests
# Raise error if api_token is not provided
if not urls_input.api_token:
if not crawl_request.api_token:
raise HTTPException(status_code=401, detail="API token is required.")
async with lock:
if current_requests >= MAX_CONCURRENT_REQUESTS:
@@ -87,20 +95,30 @@ async def crawl_urls(urls_input: UrlsInput, request: Request):
current_requests += 1
try:
# Prepare URL models for crawling
url_models = [UrlModel(url=url, forced=urls_input.forced) for url in urls_input.urls]
extraction_strategy = import_strategy("crawl4ai.extraction_strategy", crawl_request.extraction_strategy)
chunking_strategy = import_strategy("crawl4ai.chunking_strategy", crawl_request.chunking_strategy)
# Use ThreadPoolExecutor to run the synchronous WebCrawler in async manner
with ThreadPoolExecutor() as executor:
loop = asyncio.get_event_loop()
futures = [
loop.run_in_executor(executor, get_crawler().fetch_page, url_model, urls_input.provider_model, urls_input.api_token, urls_input.extract_blocks, urls_input.word_count_threshold)
for url_model in url_models
loop.run_in_executor(
executor,
get_crawler().run,
str(url),
crawl_request.word_count_threshold,
extraction_strategy,
chunking_strategy,
crawl_request.bypass_cache,
crawl_request.css_selector,
crawl_request.verbose
)
for url in crawl_request.urls
]
results = await asyncio.gather(*futures)
# if include_raw_html is False, remove the raw HTML content from the results
if not urls_input.include_raw_html:
if not crawl_request.include_raw_html:
for result in results:
result.html = None
@@ -109,64 +127,17 @@ async def crawl_urls(urls_input: UrlsInput, request: Request):
async with lock:
current_requests -= 1
@app.post("/crawl_async")
async def crawl_urls(urls_input: UrlsInput, request: Request):
global current_requests
if not urls_input.api_token:
raise HTTPException(status_code=401, detail="API token is required.")
@app.get("/strategies/extraction", response_class=JSONResponse)
async def get_extraction_strategies():
    """Serve the text of docs/extraction_strategies.json.

    NOTE(review): the file text is handed to JSONResponse as a plain string,
    so the body is presumably a JSON-encoded string rather than the parsed
    object; confirm the client expects that before changing it.
    """
    # Read as UTF-8 explicitly so decoding does not depend on the platform's
    # default locale encoding.
    with open(f"{__location__}/docs/extraction_strategies.json", "r", encoding="utf-8") as file:
        return JSONResponse(content=file.read())
async with lock:
if current_requests >= MAX_CONCURRENT_REQUESTS:
raise HTTPException(status_code=429, detail="Too many requests - please try again later.")
current_requests += 1
@app.get("/strategies/chunking", response_class=JSONResponse)
async def get_chunking_strategies():
    """Serve the text of docs/chunking_strategies.json.

    NOTE(review): the file text is handed to JSONResponse as a plain string,
    so the body is presumably a JSON-encoded string rather than the parsed
    object; confirm the client expects that before changing it.
    """
    # Read as UTF-8 explicitly so decoding does not depend on the platform's
    # default locale encoding.
    with open(f"{__location__}/docs/chunking_strategies.json", "r", encoding="utf-8") as file:
        return JSONResponse(content=file.read())
task_id = str(uuid.uuid4())
tasks[task_id] = {"status": "pending", "results": None}
try:
url_models = [UrlModel(url=url, forced=urls_input.forced) for url in urls_input.urls]
loop = asyncio.get_running_loop()
loop.create_task(
process_crawl_task(url_models, urls_input.provider_model, urls_input.api_token, task_id, urls_input.extract_blocks)
)
return {"task_id": task_id}
finally:
async with lock:
current_requests -= 1
async def process_crawl_task(url_models, provider, api_token, task_id, extract_blocks_flag):
    """Fetch every URL model on a thread pool and record the outcome under ``task_id``.

    On success ``tasks[task_id]`` is set to ``{"status": "done", "results": ...}``.
    If anything raises, it is set to ``{"status": "failed", "error": str(exc)}``.
    """
    try:
        with ThreadPoolExecutor() as pool:
            event_loop = asyncio.get_running_loop()
            pending = []
            for model in url_models:
                # One executor job per URL model
                job = event_loop.run_in_executor(
                    pool,
                    get_crawler().fetch_page,
                    model,
                    provider,
                    api_token,
                    extract_blocks_flag,
                )
                pending.append(job)
            results = await asyncio.gather(*pending)
            tasks[task_id] = {"status": "done", "results": results}
    except Exception as exc:
        tasks[task_id] = {"status": "failed", "error": str(exc)}
@app.get("/task/{task_id}")
async def get_task_status(task_id: str):
    """Report the status of a task, with its results or error when available.

    Raises:
        HTTPException: 404 when ``task_id`` has no entry in ``tasks``.
    """
    entry = tasks.get(task_id)
    if not entry:
        raise HTTPException(status_code=404, detail="Task not found")

    state = entry['status']
    payload = {"status": state}
    if state == 'done':
        # Result objects are converted to plain dicts for the response
        payload["results"] = [item.dict() for item in entry['results']]
    elif state == 'failed':
        payload["error"] = entry['error']
    return payload
if __name__ == "__main__":
import uvicorn

View File

@@ -9,12 +9,15 @@
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@100..900&display=swap" rel="stylesheet" />
<link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet" />
<!-- <link href="https://cdn.jsdelivr.net/npm/tailwindcss@3.4.3/dist/tailwind.min.css" rel="stylesheet" /> -->
<script src="https://cdn.tailwindcss.com"></script>
<script src="https://cdn.jsdelivr.net/npm/axios/dist/axios.min.js"></script>
<link
rel="stylesheet"
href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/styles/vs2015.min.css"
href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/styles/monokai.min.css"
/>
<script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/highlight.min.js"></script>
<style>
:root {
@@ -46,138 +49,447 @@
width: 100%;
}
</style>
<style>
/* Custom styling for docs-item class and Markdown generated elements */
.docs-item {
background-color: #2d3748; /* bg-gray-800 */
padding: 1rem; /* p-4 */
border-radius: 0.375rem; /* rounded */
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); /* shadow-md */
margin-bottom: 1rem; /* space between items */
}
.docs-item h3,
.docs-item h4 {
color: #ffffff; /* text-white */
font-size: 1.25rem; /* text-xl */
font-weight: 700; /* font-bold */
margin-bottom: 0.5rem; /* mb-2 */
}
.docs-item p {
color: #e2e8f0; /* text-gray-300 */
margin-bottom: 0.5rem; /* mb-2 */
}
.docs-item code {
background-color: #1a202c; /* bg-gray-900 */
color: #e2e8f0; /* text-gray-300 */
padding: 0.25rem 0.5rem; /* px-2 py-1 */
border-radius: 0.25rem; /* rounded */
}
.docs-item pre {
background-color: #1a202c; /* bg-gray-900 */
color: #e2e8f0; /* text-gray-300 */
padding: 0.5rem; /* p-2 */
border-radius: 0.375rem; /* rounded */
overflow: auto; /* overflow-auto */
margin-bottom: 0.5rem; /* mb-2 */
}
.docs-item div {
color: #e2e8f0; /* text-gray-300 */
font-size: 1rem; /* prose prose-sm */
line-height: 1.25rem; /* line-height for readability */
}
/* Adjustments to make prose class more suitable for dark mode */
.prose {
max-width: none; /* max-w-none */
}
.prose p,
.prose ul {
margin-bottom: 1rem; /* mb-4 */
}
.prose code {
/* background-color: #4a5568; */ /* bg-gray-700 */
color: #65a30d; /* text-white */
padding: 0.25rem 0.5rem; /* px-1 py-0.5 */
border-radius: 0.25rem; /* rounded */
display: inline-block; /* inline-block */
}
.prose pre {
background-color: #1a202c; /* bg-gray-900 */
color: #ffffff; /* text-white */
padding: 0.5rem; /* p-2 */
border-radius: 0.375rem; /* rounded */
}
.prose h3 {
color: #65a30d; /* text-white */
font-size: 1.25rem; /* text-xl */
font-weight: 700; /* font-bold */
margin-bottom: 0.5rem; /* mb-2 */
}
</style>
</head>
<body>
<header class="bg-gray-900 text-white py-4">
<div class="container mx-auto px-4">
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper</h1>
<body class="bg-black text-gray-200">
<header class="bg-zinc-950 text-white py-4 flex">
<div class="mx-auto px-4">
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Web Data for your Thoughts</h1>
</div>
<div class="mx-auto px-4 flex font-bold text-xl gap-2">
<span>📊 Total Website Processed</span>
<span id="total-count" class="text-lime-400">2</span>
</div>
</header>
<!-- Add a section to show total-count websited already crawled -->
<section class="bg-gray-600 py-8">
<div class="container mx-auto px-4 flex font-bold text-xl gap-2">
<span>📊 Total Website Procceced</span>
<span id="total-count" class="text-blue-400">0</span>
</div>
</section>
<section class="try-it py-8 pb-20">
<section class="try-it py-8 px-16 pb-20">
<div class="container mx-auto px-4">
<h2 class="text-2xl font-bold mb-4">Try It Now</h2>
<div class="mb-4 flex w-full gap-2">
<div class="flex items-center gap-2 flex-col flex-grow">
<label for="url-input" class="text-white">URL(s)</label>
<input
type="text"
id="url-input"
value="https://www.nbcnews.com/business"
class="border border-gray-600 rounded px-4 py-2 flex-grow bg-gray-800 text-white w-full"
placeholder="Enter URL(s) separated by commas"
/>
</div>
<!-- Add a number set if 5 with a label word threshold -->
<div class="flex items-center gap-2 flex-col">
<label for="threshold" class="text-white">Min Words Threshold</label>
<select id="threshold" class="border border-gray-600 rounded px-4 py-3 bg-gray-800 text-white w-full">
<option value="5">5</option>
<option value="10" selected>10</option>
<option value="15">15</option>
<option value="20">20</option>
<option value="25">25</option>
</select>
<div class="grid grid-cols-1 lg:grid-cols-3 gap-4">
<div class="space-y-4">
<div class="flex flex-col">
<label for="url-input" class="text-lime-500 font-bold text-xs">URL(s)</label>
<input
type="text"
id="url-input"
value="https://www.nbcnews.com/business"
class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-lime-500"
placeholder="Enter URL(s) separated by commas"
/>
</div>
<div class="flex flex-col">
<label for="threshold" class="text-lime-500 font-bold text-xs">Min Words Threshold</label>
<select
id="threshold"
class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-lime-500"
>
<option value="5">5</option>
<option value="10" selected>10</option>
<option value="15">15</option>
<option value="20">20</option>
<option value="25">25</option>
</select>
</div>
<div class="flex flex-col">
<label for="css-selector" class="text-lime-500 font-bold text-xs">CSS Selector</label>
<input
type="text"
id="css-selector"
class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-lime-500"
placeholder="Enter CSS Selector"
/>
</div>
<div class="flex flex-col">
<label for="extraction-strategy-select" class="text-lime-500 font-bold text-xs"
>Extraction Strategy</label
>
<select
id="extraction-strategy-select"
class="border border-zinc-700 rounded px-4 py-1 bg-zinc-900 text-lime-500"
>
<option value="CosineStrategy">CosineStrategy</option>
<option value="LLMExtractionStrategy">LLMExtractionStrategy</option>
<option value="NoExtractionStrategy">NoExtractionStrategy</option>
</select>
</div>
<div class="flex flex-col">
<label for="chunking-strategy-select" class="text-lime-500 font-bold text-xs"
>Chunking Strategy</label
>
<select
id="chunking-strategy-select"
class="border border-zinc-700 rounded px-4 py-1 bg-zinc-900 text-lime-500"
>
<option value="RegexChunking">RegexChunking</option>
<option value="NlpSentenceChunking">NlpSentenceChunking</option>
<option value="TopicSegmentationChunking">TopicSegmentationChunking</option>
<option value="FixedLengthWordChunking">FixedLengthWordChunking</option>
<option value="SlidingWindowChunking">SlidingWindowChunking</option>
</select>
</div>
<div class="flex flex-col">
<label for="provider-model-select" class="text-lime-500 font-bold text-xs"
>Provider Model</label
>
<select
id="provider-model-select"
class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-lime-500"
disabled
>
<option value="groq/llama3-70b-8192">groq/llama3-70b-8192</option>
<option value="groq/llama3-8b-8192">groq/llama3-8b-8192</option>
<option value="openai/gpt-4-turbo">gpt-4-turbo</option>
<option value="openai/gpt-3.5-turbo">gpt-3.5-turbo</option>
<option value="anthropic/claude-3-haiku-20240307">claude-3-haiku</option>
<option value="anthropic/claude-3-opus-20240229">claude-3-opus</option>
<option value="anthropic/claude-3-sonnet-20240229">claude-3-sonnet</option>
</select>
</div>
<div class="flex flex-col">
<label for="token-input" class="text-lime-500 font-bold text-xs">API Token</label>
<input
type="password"
id="token-input"
class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-lime-500"
placeholder="Enter Groq API token"
disabled
/>
</div>
<div class="flex gap-3">
<div class="flex items-center gap-2">
<input type="checkbox" id="bypass-cache-checkbox" />
<label for="bypass-cache-checkbox" class="text-lime-500 font-bold">Bypass Cache</label>
</div>
<div class="flex items-center gap-2">
<input type="checkbox" id="extract-blocks-checkbox" checked />
<label for="extract-blocks-checkbox" class="text-lime-500 font-bold"
>Extract Blocks</label
>
</div>
<button id="crawl-btn" class="bg-lime-600 text-black font-bold px-4 py-0 rounded">
Crawl
</button>
</div>
</div>
<div class="flex items-center gap-2 flex-col">
<label for="provider-model-select" class="text-white">Provider Model</label>
<select
id="provider-model-select"
class="border border-gray-600 rounded px-4 py-3 bg-gray-800 text-white w-full"
>
<!-- Add your option values here -->
<option value="groq/llama3-70b-8192">groq/llama3-70b-8192</option>
<option value="groq/llama3-8b-8192">groq/llama3-8b-8192</option>
<option value="openai/gpt-4-turbo">gpt-4-turbo</option>
<option value="openai/gpt-3.5-turbo">gpt-3.5-turbo</option>
<option value="anthropic/claude-3-haiku-20240307">claude-3-haiku</option>
<option value="anthropic/claude-3-opus-20240229">claude-3-opus</option>
<option value="anthropic/claude-3-sonnet-20240229">claude-3-sonnet</option>
</select>
</div>
<div class="flex items-center gap-2 flex-col">
<label for="token-input" class="text-white">API Token</label>
<input
type="password"
id="token-input"
class="border border-gray-600 rounded px-4 py-2 flex-grow bg-gray-800 text-white w-full"
placeholder="Enter Groq API token"
/>
</div>
<div class="flex items-center justify-center gap-2 flex-col">
<label for="extract-blocks-checkbox" class="text-white">Extract Blocks</label>
<input type="checkbox" id="extract-blocks-checkbox" class="mr-2" checked />
</div>
<button id="crawl-btn" class="bg-blue-600 text-white px-4 py-2 rounded">Crawl</button>
</div>
<div class="grid grid-cols-1 md:grid-cols-2 gap-8">
<div id="loading" class="hidden mt-4">
<p>
Depends on the selected model, it may take up to 1 or 2 minutes to process the request.
Loading...
</p>
</div>
<div id="result" class="tab-container flex-1 h-full flex-col">
<div id="result" class=" ">
<div id="loading" class="hidden">
<p class="text-white">Loading... Please wait.</p>
</div>
<div class="tab-buttons flex gap-2">
<button class="tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="json">JSON</button>
<button class="tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="cleaned-html">
<button
class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
data-tab="json"
>
JSON
</button>
<button
class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
data-tab="cleaned-html"
>
Cleaned HTML
</button>
<button class="tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="markdown">
<button
class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
data-tab="markdown"
>
Markdown
</button>
</div>
<div class="tab-content code bg-gray-800 p-2 rounded h-full flex-1 border border-gray-600">
<pre class="h-full flex"><code id="json-result" class="language-json "></code></pre>
<div class="tab-content code bg-zinc-900 p-2 rounded h-full border border-zinc-700 text-sm">
<pre class="h-full flex"><code id="json-result" class="language-json"></code></pre>
<pre
class="hidden h-full flex"
><code id="cleaned-html-result" class="language-html "></code></pre>
><code id="cleaned-html-result" class="language-html"></code></pre>
<pre
class="hidden h-full flex"
><code id="markdown-result" class="language-markdown "></code></pre>
><code id="markdown-result" class="language-markdown"></code></pre>
</div>
</div>
<div id="code_help" class="tab-container flex-1 h-full">
<div id="code_help" class=" ">
<div class="tab-buttons flex gap-2">
<button class="code-tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="curl">cURL</button>
<button class="code-tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="python">
Python
<button
class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
data-tab="curl"
>
cURL
</button>
<button class="code-tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="nodejs">
<button
class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
data-tab="library"
>
Python Library
</button>
<button
class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
data-tab="python"
>
Python (Request)
</button>
<button
class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
data-tab="nodejs"
>
Node.js
</button>
</div>
<div class="tab-content result bg-gray-800 p-2 rounded h-full flex-1 border border-gray-600">
<div class="tab-content result bg-zinc-900 p-2 rounded h-full border border-zinc-700 text-sm">
<pre class="h-full flex relative">
<code id="curl-code" class="language-bash"></code>
<button class="absolute top-2 right-2 bg-gray-700 text-white px-2 py-1 rounded copy-btn" data-target="curl-code">Copy</button>
</pre>
<code id="curl-code" class="language-bash"></code>
<button class="absolute top-2 right-2 bg-zinc-700 text-white px-2 py-1 rounded copy-btn" data-target="curl-code">Copy</button>
</pre>
<pre class="hidden h-full flex relative">
<code id="python-code" class="language-python"></code>
<button class="absolute top-2 right-2 bg-gray-700 text-white px-2 py-1 rounded copy-btn" data-target="python-code">Copy</button>
</pre>
<code id="python-code" class="language-python"></code>
<button class="absolute top-2 right-2 bg-zinc-700 text-white px-2 py-1 rounded copy-btn" data-target="python-code">Copy</button>
</pre>
<pre class="hidden h-full flex relative">
<code id="nodejs-code" class="language-javascript"></code>
<button class="absolute top-2 right-2 bg-gray-700 text-white px-2 py-1 rounded copy-btn" data-target="nodejs-code">Copy</button>
</pre>
<code id="nodejs-code" class="language-javascript"></code>
<button class="absolute top-2 right-2 bg-zinc-700 text-white px-2 py-1 rounded copy-btn" data-target="nodejs-code">Copy</button>
</pre>
<pre class="hidden h-full flex relative">
<code id="library-code" class="language-python"></code>
<button class="absolute top-2 right-2 bg-zinc-700 text-white px-2 py-1 rounded copy-btn" data-target="library-code">Copy</button>
</pre>
</div>
</div>
</div>
</div>
</section>
<section class="bg-zinc-900 text-zinc-300 p-6 px-20">
<h1 class="text-3xl font-bold mb-4">Installation 💻</h1>
<p class="mb-4">There are two ways to use Crawl4AI: as a library in your Python projects or as a standalone local server.</p>
<section class="hero bg-gray-900 py-8">
<p class="mb-4">You can also try Crawl4AI in a Google Colab <a href = "https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" style="display: inline-block; width: 100px; height: 20px;"/></a></p>
<h2 class="text-2xl font-bold mb-2">Using Crawl4AI as a Library 📚</h2>
<p class="mb-4">To install Crawl4AI as a library, follow these steps:</p>
<ol class="list-decimal list-inside mb-4">
<li class="mb-2">
Install the package from GitHub:
<pre class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"><code>pip install git+https://github.com/unclecode/crawl4ai.git</code></pre>
</li>
<li class="mb-2">
Alternatively, you can clone the repository and install the package locally:
<pre class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"><code class = "language-python bash">virtualenv venv
source venv/bin/activate
git clone https://github.com/unclecode/crawl4ai.git
cd crawl4ai
pip install -e .
</code></pre>
</li>
<li>
Import the necessary modules in your Python script:
<pre class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"><code class = "language-python hljs">from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import *
from crawl4ai.extraction_strategy import *
import os
crawler = WebCrawler()
crawler.warmup() # IMPORTANT: Warmup the engine before running the first crawl
# Single page crawl
result = crawler.run(
url='https://www.nbcnews.com/business',
word_count_threshold=5, # Minimum word count for a HTML tag to be considered as a worthy block
chunking_strategy= RegexChunking( patterns = ["\\n\\n"]), # Default is RegexChunking
extraction_strategy= CosineStrategy(word_count_threshold=20, max_dist=0.2, linkage_method='ward', top_k=3), # Default is CosineStrategy
# extraction_strategy= LLMExtractionStrategy(provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY')),
bypass_cache=False,
extract_blocks =True, # Whether to extract semantical blocks of text from the HTML
css_selector = "", # Eg: "div.article-body"
verbose=True,
include_raw_html=True, # Whether to include the raw HTML content in the response
)
print(result.model_dump())
</code></pre>
</li>
</ol>
<p class="mb-4">For more information about how to run Crawl4AI as a local server, please refer to the <a href="https://github.com/unclecode/crawl4ai" class="text-blue-400">GitHub repository</a>.</p>
</section>
<section class="bg-zinc-900 text-zinc-300 p-6 px-20">
<h1 class="text-3xl font-bold mb-4">📖 Parameters</h1>
<div class="overflow-x-auto">
<table class="min-w-full bg-zinc-800 border border-zinc-700">
<thead>
<tr>
<th class="py-2 px-4 border-b border-zinc-700">Parameter</th>
<th class="py-2 px-4 border-b border-zinc-700">Description</th>
<th class="py-2 px-4 border-b border-zinc-700">Required</th>
<th class="py-2 px-4 border-b border-zinc-700">Default Value</th>
</tr>
</thead>
<tbody>
<tr>
<td class="py-2 px-4 border-b border-zinc-700">urls</td>
<td class="py-2 px-4 border-b border-zinc-700">
A list of URLs to crawl and extract data from.
</td>
<td class="py-2 px-4 border-b border-zinc-700">Yes</td>
<td class="py-2 px-4 border-b border-zinc-700">-</td>
</tr>
<tr>
<td class="py-2 px-4 border-b border-zinc-700">include_raw_html</td>
<td class="py-2 px-4 border-b border-zinc-700">
Whether to include the raw HTML content in the response.
</td>
<td class="py-2 px-4 border-b border-zinc-700">No</td>
<td class="py-2 px-4 border-b border-zinc-700">false</td>
</tr>
<tr>
<td class="py-2 px-4 border-b border-zinc-700">bypass_cache</td>
<td class="py-2 px-4 border-b border-zinc-700">
Whether to force a fresh crawl even if the URL has been previously crawled.
</td>
<td class="py-2 px-4 border-b border-zinc-700">No</td>
<td class="py-2 px-4 border-b border-zinc-700">false</td>
</tr>
<tr>
<td class="py-2 px-4 border-b border-zinc-700">extract_blocks</td>
<td class="py-2 px-4 border-b border-zinc-700">
Whether to extract semantical blocks of text from the HTML.
</td>
<td class="py-2 px-4 border-b border-zinc-700">No</td>
<td class="py-2 px-4 border-b border-zinc-700">true</td>
</tr>
<tr>
<td class="py-2 px-4 border-b border-zinc-700">word_count_threshold</td>
<td class="py-2 px-4 border-b border-zinc-700">
The minimum number of words a block must contain to be considered meaningful (minimum
value is 5).
</td>
<td class="py-2 px-4 border-b border-zinc-700">No</td>
<td class="py-2 px-4 border-b border-zinc-700">5</td>
</tr>
<tr>
<td class="py-2 px-4 border-b border-zinc-700">extraction_strategy</td>
<td class="py-2 px-4 border-b border-zinc-700">
The strategy to use for extracting content from the HTML (e.g., "CosineStrategy").
</td>
<td class="py-2 px-4 border-b border-zinc-700">No</td>
<td class="py-2 px-4 border-b border-zinc-700">CosineStrategy</td>
</tr>
<tr>
<td class="py-2 px-4 border-b border-zinc-700">chunking_strategy</td>
<td class="py-2 px-4 border-b border-zinc-700">
The strategy to use for chunking the text before processing (e.g., "RegexChunking").
</td>
<td class="py-2 px-4 border-b border-zinc-700">No</td>
<td class="py-2 px-4 border-b border-zinc-700">RegexChunking</td>
</tr>
<tr>
<td class="py-2 px-4 border-b border-zinc-700">css_selector</td>
<td class="py-2 px-4 border-b border-zinc-700">
The CSS selector to target specific parts of the HTML for extraction.
</td>
<td class="py-2 px-4 border-b border-zinc-700">No</td>
<td class="py-2 px-4 border-b border-zinc-700">None</td>
</tr>
<tr>
<td class="py-2 px-4">verbose</td>
<td class="py-2 px-4">Whether to enable verbose logging.</td>
<td class="py-2 px-4">No</td>
<td class="py-2 px-4">true</td>
</tr>
</tbody>
</table>
</div>
</section>
<section id="extraction" class="py-8 px-20">
<div class="overflow-x-auto mx-auto px-6">
<h2 class="text-2xl font-bold mb-4">Extraction Strategies</h2>
<div id="extraction-strategies" class="space-y-4"></div>
</div>
</section>
<section id="chunking" class="py-8 px-20">
<div class="overflow-x-auto mx-auto px-6">
<h2 class="text-2xl font-bold mb-4">Chunking Strategies</h2>
<div id="chunking-strategies" class="space-y-4"></div>
</div>
</section>
<section class="hero bg-zinc-900 py-8 px-20">
<div class="container mx-auto px-4">
<h2 class="text-3xl font-bold mb-4">🤔 Why building this?</h2>
<p class="text-lg mb-4">
@@ -192,7 +504,7 @@
</div>
</section>
<section class="installation py-8">
<section class="installation py-8 px-20">
<div class="container mx-auto px-4">
<h2 class="text-2xl font-bold mb-4">⚙️ Installation</h2>
<p class="mb-4">
@@ -202,7 +514,7 @@
</div>
</section>
<footer class="bg-gray-900 text-white py-4">
<footer class="bg-zinc-900 text-white py-4">
<div class="container mx-auto px-4">
<div class="flex justify-between items-center">
<p>© 2024 Crawl4AI. All rights reserved.</p>
@@ -219,18 +531,27 @@
target="_blank"
>🐦 Twitter</a
>
<!-- <a
href="https://discord.gg/your-invite-link"
class="text-white hover:text-gray-300 mx-2"
target="_blank"
>💬 Discord</a
> -->
</div>
</div>
</div>
</footer>
<script>
// JavaScript to manage dynamic form changes and logic
document.getElementById("extraction-strategy-select").addEventListener("change", function () {
    // The provider-model select and the token input are enabled only while
    // "LLMExtractionStrategy" is the selected strategy.
    const llmSelected = this.value === "LLMExtractionStrategy";
    const providerField = document.getElementById("provider-model-select");
    const tokenField = document.getElementById("token-input");
    providerField.disabled = !llmSelected;
    tokenField.disabled = !llmSelected;
});
// Get the selected provider model and token from local storage
const storedProviderModel = localStorage.getItem("provider_model");
const storedToken = localStorage.getItem(storedProviderModel);
@@ -274,6 +595,7 @@
const selectedProviderModel = document.getElementById("provider-model-select").value;
const apiToken = document.getElementById("token-input").value;
const extractBlocks = document.getElementById("extract-blocks-checkbox").checked;
const bypassCache = document.getElementById("bypass-cache-checkbox").checked;
// Save the selected provider model and token to local storage
localStorage.setItem("provider_model", selectedProviderModel);
@@ -286,17 +608,21 @@
provider_model: selectedProviderModel,
api_token: apiToken,
include_raw_html: true,
forced: false,
bypass_cache: bypassCache,
extract_blocks: extractBlocks,
word_count_threshold: parseInt(document.getElementById("threshold").value),
extraction_strategy: document.getElementById("extraction-strategy-select").value,
chunking_strategy: document.getElementById("chunking-strategy-select").value,
css_selector: document.getElementById("css-selector").value,
verbose: true,
};
// save api token to local storage
localStorage.setItem("api_token", document.getElementById("token-input").value);
document.getElementById("loading").classList.remove("hidden");
document.getElementById("result").classList.add("hidden");
document.getElementById("code_help").classList.add("hidden");
//document.getElementById("result").classList.add("hidden");
//document.getElementById("code_help").classList.add("hidden");
axios
.post("/crawl", data)
@@ -308,29 +634,44 @@
document.getElementById("markdown-result").textContent = result.markdown;
// Update code examples dynamically
// Update code examples dynamically
const extractionStrategy = data.extraction_strategy;
const isLLMExtraction = extractionStrategy === "LLMExtractionStrategy";
document.getElementById(
"curl-code"
).textContent = `curl -X POST -H "Content-Type: application/json" -d '${JSON.stringify({
...data,
api_token: "your_api_token",
})}' http://localhost:8000/crawl`;
api_token: isLLMExtraction ? "your_api_token" : undefined,
})}' http://crawl4ai.uccode.io/crawl`;
document.getElementById(
"python-code"
).textContent = `import requests\n\ndata = ${JSON.stringify(
{ ...data, api_token: "your_api_token" },
{ ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
null,
2
)}\n\nresponse = requests.post("http://localhost:8000/crawl", json=data)\nprint(response.json())`;
)}\n\nresponse = requests.post("http://crawl4ai.uccode.io/crawl", json=data) # OR local host if your run locally \nprint(response.json())`;
document.getElementById(
"nodejs-code"
).textContent = `const axios = require('axios');\n\nconst data = ${JSON.stringify(
{ ...data, api_token: "your_api_token" },
{ ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
null,
2
)};\n\naxios.post("http://localhost:8000/crawl", data)\n .then(response => console.log(response.data))\n .catch(error => console.error(error));`;
)};\n\naxios.post("http://crawl4ai.uccode.io/crawl", data) // OR local host if your run locally \n .then(response => console.log(response.data))\n .catch(error => console.error(error));`;
document.getElementById(
"library-code"
).textContent = `from crawl4ai.web_crawler import WebCrawler\nfrom crawl4ai.extraction_strategy import *\nfrom crawl4ai.chunking_strategy import *\n\ncrawler = WebCrawler()\ncrawler.warmup()\n\nresult = crawler.run(\n url='${
urls[0]
}',\n word_count_threshold=${data.word_count_threshold},\n extraction_strategy=${
isLLMExtraction
? `${extractionStrategy}(provider="${data.provider_model}", api_token="${data.api_token}")`
: extractionStrategy + "()"
},\n chunking_strategy=${data.chunking_strategy}(),\n bypass_cache=${
data.bypass_cache
},\n css_selector="${data.css_selector}"\n)\nprint(result)`;
// Highlight code syntax
hljs.highlightAll();
@@ -357,8 +698,8 @@
const tab = btn.dataset.tab;
document
.querySelectorAll(".tab-btn")
.forEach((b) => b.classList.remove("bg-blue-600", "text-white"));
btn.classList.add("bg-blue-600", "text-white");
.forEach((b) => b.classList.remove("bg-lime-700", "text-white"));
btn.classList.add("bg-lime-700", "text-white");
document.querySelectorAll(".tab-content.code pre").forEach((el) => el.classList.add("hidden"));
document.getElementById(`${tab}-result`).parentElement.classList.remove("hidden");
});
@@ -370,19 +711,58 @@
const tab = btn.dataset.tab;
document
.querySelectorAll(".code-tab-btn")
.forEach((b) => b.classList.remove("bg-blue-600", "text-white"));
btn.classList.add("bg-blue-600", "text-white");
.forEach((b) => b.classList.remove("bg-lime-700", "text-white"));
btn.classList.add("bg-lime-700", "text-white");
document.querySelectorAll(".tab-content.result pre").forEach((el) => el.classList.add("hidden"));
document.getElementById(`${tab}-code`).parentElement.classList.remove("hidden");
});
});
// Handle copy to clipboard button clicks
/**
 * Copy `text` to the clipboard.
 *
 * Uses `navigator.clipboard.writeText` when it is available and otherwise
 * hands the text to `fallbackCopyTextToClipboard`.
 *
 * @param text - Text to place on the clipboard.
 * @returns A promise that settles with the outcome of whichever path ran.
 */
async function copyToClipboard(text) {
  const clipboard = navigator.clipboard;
  const hasAsyncApi = Boolean(clipboard && clipboard.writeText);
  return hasAsyncApi ? clipboard.writeText(text) : fallbackCopyTextToClipboard(text);
}
/**
 * Copy `text` through a temporary <textarea> and `document.execCommand("copy")`.
 *
 * @param text - Text to place on the clipboard.
 * @returns A promise that resolves when `execCommand` reports success, rejects
 *   with no reason when it reports failure, and rejects with the thrown error
 *   when `execCommand` throws.
 */
function fallbackCopyTextToClipboard(text) {
  return new Promise((resolve, reject) => {
    const scratch = document.createElement("textarea");
    scratch.value = text;

    // Pin the element to the top-left corner so focusing it does not scroll the page.
    Object.assign(scratch.style, { top: "0", left: "0", position: "fixed" });

    document.body.appendChild(scratch);
    scratch.focus();
    scratch.select();

    try {
      const copied = document.execCommand("copy");
      if (!copied) {
        reject();
      } else {
        resolve();
      }
    } catch (err) {
      reject(err);
    }

    // The scratch element is only needed for the duration of the copy command.
    document.body.removeChild(scratch);
  });
}
document.querySelectorAll(".copy-btn").forEach((btn) => {
btn.addEventListener("click", () => {
const target = btn.dataset.target;
const code = document.getElementById(target).textContent;
navigator.clipboard.writeText(code).then(() => {
//navigator.clipboard.writeText(code).then(() => {
copyToClipboard(code).then(() => {
btn.textContent = "Copied!";
setTimeout(() => {
btn.textContent = "Copy";
@@ -390,6 +770,42 @@
});
});
});
// Once the DOM is ready, load both strategy catalogues from the server and
// render them into their documentation containers.
document.addEventListener("DOMContentLoaded", async () => {
  // Request an endpoint and return its decoded JSON body.
  const loadCatalogue = async (endpoint) => {
    const response = await fetch(endpoint);
    return response.json();
  };

  try {
    // Fetched one after the other: the chunking request is only issued once
    // the extraction payload has been decoded.
    const extractionStrategies = await loadCatalogue("/strategies/extraction");
    const chunkingStrategies = await loadCatalogue("/strategies/chunking");

    renderStrategies("extraction-strategies", extractionStrategies);
    renderStrategies("chunking-strategies", chunkingStrategies);
  } catch (error) {
    console.error("Error fetching strategies:", error);
  }
});
/**
 * Replace the contents of a container with one card per strategy description.
 *
 * @param containerId - `id` of the element that receives the cards.
 * @param strategies - JSON text encoding an object whose values are the
 *   descriptions to render; each value is passed through `marked.parse`.
 */
function renderStrategies(containerId, strategies) {
  const host = document.getElementById(containerId);
  host.innerHTML = ""; // Drop whatever was rendered previously

  const catalogue = JSON.parse(strategies);

  // Only the descriptions are displayed; the strategy names (keys) are not used.
  for (const description of Object.values(catalogue)) {
    const card = document.createElement("div");
    card.classList.add("bg-zinc-800", "p-4", "rounded", "shadow-md", "docs-item");

    const cardBody = document.createElement("div");
    cardBody.classList.add("text-gray-300", "prose", "prose-sm");
    // NOTE(review): the output of marked.parse is assigned as raw HTML — confirm the
    // descriptions always come from a trusted source.
    cardBody.innerHTML = marked.parse(description);

    card.appendChild(cardBody);
    host.appendChild(card);
  }
}
// Highlight code syntax
hljs.highlightAll();
</script>
</body>
</html>

0
tests/__init__.py Normal file
View File

111
tests/test_web_crawler.py Normal file
View File

@@ -0,0 +1,111 @@
import unittest, os
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import RegexChunking, FixedLengthWordChunking, SlidingWindowChunking
from crawl4ai.extraction_strategy import CosineStrategy, LLMExtractionStrategy, TopicExtractionStrategy, NoExtractionStrategy
class TestWebCrawler(unittest.TestCase):
    """Exercise ``WebCrawler`` with several chunking/extraction strategy pairings.

    Apart from ``test_warmup``, every test calls ``WebCrawler.run`` with a
    literal URL, so the suite presumably needs network access to pass.
    """

    # Page crawled by most scenarios below.
    _BUSINESS_URL = 'https://www.nbcnews.com/business'

    def setUp(self):
        """Build a fresh crawler for each test."""
        self.crawler = WebCrawler()

    def _assert_crawl_succeeds(self, failure_message, **run_options):
        """Call ``crawler.run`` with ``run_options`` and assert ``result.success``."""
        outcome = self.crawler.run(**run_options)
        self.assertTrue(outcome.success, failure_message)

    def test_warmup(self):
        """After ``warmup()`` the crawler's ``ready`` attribute must be truthy."""
        self.crawler.warmup()
        self.assertTrue(self.crawler.ready, "WebCrawler failed to warm up")

    def test_run_default_strategies(self):
        """Crawl with ``RegexChunking`` and ``CosineStrategy`` built with no arguments."""
        self._assert_crawl_succeeds(
            "Failed to crawl and extract using default strategies",
            url=self._BUSINESS_URL,
            word_count_threshold=5,
            chunking_strategy=RegexChunking(),
            extraction_strategy=CosineStrategy(),
            bypass_cache=True,
        )

    def test_run_different_strategies(self):
        """Crawl the same page with two non-default strategy pairings in turn."""
        page = self._BUSINESS_URL

        # Fixed-length word chunks + LLM extraction; the API token is read from
        # the OPENAI_API_KEY environment variable.
        self._assert_crawl_succeeds(
            "Failed to crawl and extract with FixedLengthWordChunking and LLMExtractionStrategy",
            url=page,
            word_count_threshold=5,
            chunking_strategy=FixedLengthWordChunking(chunk_size=100),
            extraction_strategy=LLMExtractionStrategy(
                provider="openai/gpt-3.5-turbo",
                api_token=os.getenv('OPENAI_API_KEY'),
            ),
            bypass_cache=True,
        )

        # Sliding-window chunks + topic extraction.
        self._assert_crawl_succeeds(
            "Failed to crawl and extract with SlidingWindowChunking and TopicExtractionStrategy",
            url=page,
            word_count_threshold=5,
            chunking_strategy=SlidingWindowChunking(window_size=100, step=50),
            extraction_strategy=TopicExtractionStrategy(num_keywords=5),
            bypass_cache=True,
        )

    def test_invalid_url(self):
        """A malformed URL must raise with "Invalid URL" in the message."""
        with self.assertRaises(Exception) as raised:
            self.crawler.run(url='invalid_url', bypass_cache=True)
        self.assertIn("Invalid URL", str(raised.exception))

    def test_unsupported_extraction_strategy(self):
        """Passing a plain string as the extraction strategy must raise."""
        with self.assertRaises(Exception) as raised:
            self.crawler.run(
                url=self._BUSINESS_URL,
                extraction_strategy="UnsupportedStrategy",
                bypass_cache=True,
            )
        self.assertIn("Unsupported extraction strategy", str(raised.exception))

    def test_invalid_css_selector(self):
        """The selector "invalid_selector" must raise ``ValueError``."""
        with self.assertRaises(ValueError) as raised:
            self.crawler.run(
                url=self._BUSINESS_URL,
                css_selector="invalid_selector",
                bypass_cache=True,
            )
        self.assertIn("Invalid CSS selector", str(raised.exception))

    def test_crawl_with_cache_and_bypass_cache(self):
        """Crawl once with ``bypass_cache=False``, then again with ``bypass_cache=True``."""
        page = self._BUSINESS_URL

        # First pass: caching left enabled.
        self._assert_crawl_succeeds(
            "Failed to crawl and cache the result",
            url=page,
            bypass_cache=False,
        )

        # Second pass: cache explicitly bypassed.
        self._assert_crawl_succeeds(
            "Failed to bypass cache and fetch fresh data",
            url=page,
            bypass_cache=True,
        )

    def test_fetch_multiple_pages(self):
        """Crawl two different sites and require every result to succeed."""
        pages = (
            self._BUSINESS_URL,
            'https://www.bbc.com/news',
        )

        # All crawls are performed before any assertion is evaluated.
        outcomes = [
            self.crawler.run(
                url=page,
                word_count_threshold=5,
                chunking_strategy=RegexChunking(),
                extraction_strategy=CosineStrategy(),
                bypass_cache=True,
            )
            for page in pages
        ]

        self.assertEqual(len(outcomes), 2, "Failed to crawl and extract multiple pages")
        for outcome in outcomes:
            self.assertTrue(outcome.success, "Failed to crawl and extract a page in the list")

    def test_run_fixed_length_word_chunking_and_no_extraction(self):
        """Crawl with ``FixedLengthWordChunking`` and ``NoExtractionStrategy``."""
        self._assert_crawl_succeeds(
            "Failed to crawl and extract with FixedLengthWordChunking and NoExtractionStrategy",
            url=self._BUSINESS_URL,
            word_count_threshold=5,
            chunking_strategy=FixedLengthWordChunking(chunk_size=100),
            extraction_strategy=NoExtractionStrategy(),
            bypass_cache=True,
        )

    def test_run_sliding_window_and_no_extraction(self):
        """Crawl with ``SlidingWindowChunking`` and ``NoExtractionStrategy``."""
        self._assert_crawl_succeeds(
            "Failed to crawl and extract with SlidingWindowChunking and NoExtractionStrategy",
            url=self._BUSINESS_URL,
            word_count_threshold=5,
            chunking_strategy=SlidingWindowChunking(window_size=100, step=50),
            extraction_strategy=NoExtractionStrategy(),
            bypass_cache=True,
        )
# Run the suite through unittest's command-line entry point when this file is
# executed directly rather than imported.
if __name__ == '__main__':
    unittest.main()