- Test all methods
- Update index.hml - Update Readme - Resolve some bugs
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -166,5 +166,6 @@ Crawl4AI.egg-info/*
|
||||
crawler_data.db
|
||||
.vscode/
|
||||
test_pad.py
|
||||
test_pad*.py
|
||||
.data/
|
||||
Crawl4AI.egg-info/
|
||||
97
README.md
97
README.md
@@ -56,40 +56,28 @@ pip install -e .
|
||||
2. Import the necessary modules in your Python script:
|
||||
```python
|
||||
from crawl4ai.web_crawler import WebCrawler
|
||||
from crawl4ai.models import UrlModel
|
||||
from crawl4ai.chunking_strategy import *
|
||||
from crawl4ai.extraction_strategy import *
|
||||
import os
|
||||
|
||||
crawler = WebCrawler(db_path='crawler_data.db')
|
||||
crawler = WebCrawler()
|
||||
crawler.warmup() # IMPORTANT: Warmup the engine before running the first crawl
|
||||
|
||||
# Single page crawl
|
||||
single_url = UrlModel(url='https://kidocode.com', forced=False)
|
||||
result = crawl4ai.fetch_page(
|
||||
single_url,
|
||||
provider= "openai/gpt-3.5-turbo",
|
||||
api_token = os.getenv('OPENAI_API_KEY'),
|
||||
# Set `extract_blocks_flag` to True to enable the LLM to generate semantically clustered chunks
|
||||
# and return them as JSON. Depending on the model and data size, this may take up to 1 minute.
|
||||
# Without this setting, it will take between 5 to 20 seconds.
|
||||
extract_blocks_flag=False
|
||||
word_count_threshold=5 # Minimum word count for a HTML tag to be considered as a worthy block
|
||||
result = crawler.run(
|
||||
url='https://www.nbcnews.com/business',
|
||||
word_count_threshold=5, # Minimum word count for a HTML tag to be considered as a worthy block
|
||||
chunking_strategy= RegexChunking( patterns = ["\n\n"]), # Default is RegexChunking
|
||||
extraction_strategy= CosineStrategy(word_count_threshold=20, max_dist=0.2, linkage_method='ward', top_k=3) # Default is CosineStrategy
|
||||
# extraction_strategy= LLMExtractionStrategy(provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY')),
|
||||
bypass_cache=False,
|
||||
extract_blocks =True, # Whether to extract semantical blocks of text from the HTML
|
||||
css_selector = "", # Eg: "div.article-body"
|
||||
verbose=True,
|
||||
include_raw_html=True, # Whether to include the raw HTML content in the response
|
||||
)
|
||||
|
||||
print(result.model_dump())
|
||||
|
||||
# Multiple page crawl
|
||||
urls = [
|
||||
UrlModel(url='http://example.com', forced=False),
|
||||
UrlModel(url='http://example.org', forced=False)
|
||||
]
|
||||
results = crawl4ai.fetch_pages(
|
||||
urls,
|
||||
provider= "openai/gpt-3.5-turbo",
|
||||
api_token = os.getenv('OPENAI_API_KEY'),
|
||||
extract_blocks_flag=True,
|
||||
word_count_threshold=5
|
||||
)
|
||||
|
||||
for res in results:
|
||||
print(res.model_dump())
|
||||
```
|
||||
|
||||
Running for the first time will download the chrome driver for selenium. Also creates a SQLite database file `crawler_data.db` in the current directory. This file will store the crawled data for future reference.
|
||||
@@ -150,23 +138,22 @@ Set `extract_blocks_flag` to True to enable the LLM to generate semantically clu
|
||||
import requests
|
||||
import os
|
||||
|
||||
url = "http://localhost:8000/crawl" # Replace with the appropriate server URL
|
||||
data = {
|
||||
"urls": [
|
||||
"https://example.com"
|
||||
"https://www.nbcnews.com/business"
|
||||
],
|
||||
"provider_model": "groq/llama3-70b-8192",
|
||||
"api_token": "your_api_token",
|
||||
"include_raw_html": true,
|
||||
"forced": false,
|
||||
# Set `extract_blocks_flag` to True to enable the LLM to generate semantically clustered chunks
|
||||
# and return them as JSON. Depending on the model and data size, this may take up to 1 minute.
|
||||
# Without this setting, it will take between 5 to 20 seconds.
|
||||
"extract_blocks_flag": False,
|
||||
"word_count_threshold": 5
|
||||
"bypass_cache": false,
|
||||
"extract_blocks": true,
|
||||
"word_count_threshold": 10,
|
||||
"extraction_strategy": "CosineStrategy",
|
||||
"chunking_strategy": "RegexChunking",
|
||||
"css_selector": "",
|
||||
"verbose": true
|
||||
}
|
||||
|
||||
response = requests.post(url, json=data)
|
||||
response = requests.post("http://crawl4ai.uccode.io/crawl", json=data) # OR http://localhost:8000 if your run locally
|
||||
|
||||
if response.status_code == 200:
|
||||
result = response.json()["results"][0]
|
||||
@@ -180,9 +167,9 @@ else:
|
||||
print("Error:", response.status_code, response.text)
|
||||
```
|
||||
|
||||
This code sends a POST request to the Crawl4AI server running on localhost, specifying the target URL (`https://example.com`) and the desired options (`grq_api_token`, `include_raw_html`, and `forced`). The server processes the request and returns the crawled data in JSON format.
|
||||
This code sends a POST request to the Crawl4AI server running on localhost, specifying the target URL (`http://crawl4ai.uccode.io/crawl`) and the desired options. The server processes the request and returns the crawled data in JSON format.
|
||||
|
||||
The response from the server includes the parsed JSON, cleaned HTML, and markdown representations of the crawled webpage. You can access and use this data in your Python application as needed.
|
||||
The response from the server includes the semantical clusters, cleaned HTML, and markdown representations of the crawled webpage. You can access and use this data in your Python application as needed.
|
||||
|
||||
Make sure to replace `"http://localhost:8000/crawl"` with the appropriate server URL if your Crawl4AI server is running on a different host or port.
|
||||
|
||||
@@ -194,15 +181,17 @@ That's it! You can now integrate Crawl4AI into your Python projects and leverage
|
||||
|
||||
## 📖 Parameters
|
||||
|
||||
| Parameter | Description | Required | Default Value |
|
||||
|----------------------|-------------------------------------------------------------------------------------------------|----------|---------------|
|
||||
| `urls` | A list of URLs to crawl and extract data from. | Yes | - |
|
||||
| `provider_model` | The provider and model to use for extracting relevant information (e.g., "groq/llama3-70b-8192"). | Yes | - |
|
||||
| `api_token` | Your API token for the specified provider. | Yes | - |
|
||||
| `include_raw_html` | Whether to include the raw HTML content in the response. | No | `false` |
|
||||
| `forced` | Whether to force a fresh crawl even if the URL has been previously crawled. | No | `false` |
|
||||
| `extract_blocks_flag`| Whether to extract semantical blocks of text from the HTML. | No | `false` |
|
||||
| `word_count_threshold` | The minimum number of words a block must contain to be considered meaningful (minimum value is 5). | No | `5` |
|
||||
| Parameter | Description | Required | Default Value |
|
||||
|-----------------------|-------------------------------------------------------------------------------------------------------|----------|---------------------|
|
||||
| `urls` | A list of URLs to crawl and extract data from. | Yes | - |
|
||||
| `include_raw_html` | Whether to include the raw HTML content in the response. | No | `false` |
|
||||
| `bypass_cache` | Whether to force a fresh crawl even if the URL has been previously crawled. | No | `false` |
|
||||
| `extract_blocks` | Whether to extract semantical blocks of text from the HTML. | No | `true` |
|
||||
| `word_count_threshold`| The minimum number of words a block must contain to be considered meaningful (minimum value is 5). | No | `5` |
|
||||
| `extraction_strategy` | The strategy to use for extracting content from the HTML (e.g., "CosineStrategy"). | No | `CosineStrategy` |
|
||||
| `chunking_strategy` | The strategy to use for chunking the text before processing (e.g., "RegexChunking"). | No | `RegexChunking` |
|
||||
| `css_selector` | The CSS selector to target specific parts of the HTML for extraction. | No | `None` |
|
||||
| `verbose` | Whether to enable verbose logging. | No | `true` |
|
||||
|
||||
## 🛠️ Configuration
|
||||
Crawl4AI allows you to configure various parameters and settings in the `crawler/config.py` file. Here's an example of how you can adjust the parameters:
|
||||
@@ -213,15 +202,17 @@ from dotenv import load_dotenv
|
||||
|
||||
load_dotenv() # Load environment variables from .env file
|
||||
|
||||
# Default provider
|
||||
# Default provider, ONLY used when the extraction strategy is LLMExtractionStrategy
|
||||
DEFAULT_PROVIDER = "openai/gpt-4-turbo"
|
||||
|
||||
# Provider-model dictionary
|
||||
# Provider-model dictionary, ONLY used when the extraction strategy is LLMExtractionStrategy
|
||||
PROVIDER_MODELS = {
|
||||
"ollama/llama3": "no-token-needed", # Any model from Ollama no need for API token
|
||||
"groq/llama3-70b-8192": os.getenv("GROQ_API_KEY"),
|
||||
"groq/llama3-8b-8192": os.getenv("GROQ_API_KEY"),
|
||||
"openai/gpt-3.5-turbo": os.getenv("OPENAI_API_KEY"),
|
||||
"openai/gpt-4-turbo": os.getenv("OPENAI_API_KEY"),
|
||||
"openai/gpt-4o": os.getenv("OPENAI_API_KEY"),
|
||||
"anthropic/claude-3-haiku-20240307": os.getenv("ANTHROPIC_API_KEY"),
|
||||
"anthropic/claude-3-opus-20240229": os.getenv("ANTHROPIC_API_KEY"),
|
||||
"anthropic/claude-3-sonnet-20240229": os.getenv("ANTHROPIC_API_KEY"),
|
||||
@@ -229,12 +220,14 @@ PROVIDER_MODELS = {
|
||||
|
||||
# Chunk token threshold
|
||||
CHUNK_TOKEN_THRESHOLD = 1000
|
||||
|
||||
# Threshold for the minimum number of words in an HTML tag to be considered
|
||||
MIN_WORD_THRESHOLD = 5
|
||||
```
|
||||
|
||||
In the `crawler/config.py` file, you can:
|
||||
|
||||
REMEBER: You only need to set the API keys for the providers in case you choose LLMExtractStrategy as the extraction strategy. If you choose CosineStrategy, you don't need to set the API keys.
|
||||
|
||||
- Set the default provider using the `DEFAULT_PROVIDER` variable.
|
||||
- Add or modify the provider-model dictionary (`PROVIDER_MODELS`) to include your desired providers and their corresponding API keys. Crawl4AI supports various providers such as Groq, OpenAI, Anthropic, and more. You can add any provider supported by LiteLLM, as well as Ollama.
|
||||
- Adjust the `CHUNK_TOKEN_THRESHOLD` value to control the splitting of web content into chunks for parallel processing. A higher value means fewer chunks and faster processing, but it may cause issues with weaker LLMs during extraction.
|
||||
|
||||
@@ -3,15 +3,17 @@ from dotenv import load_dotenv
|
||||
|
||||
load_dotenv() # Load environment variables from .env file
|
||||
|
||||
# Default provider
|
||||
# Default provider, ONLY used when the extraction strategy is LLMExtractionStrategy
|
||||
DEFAULT_PROVIDER = "openai/gpt-4-turbo"
|
||||
|
||||
# Provider-model dictionary
|
||||
# Provider-model dictionary, ONLY used when the extraction strategy is LLMExtractionStrategy
|
||||
PROVIDER_MODELS = {
|
||||
"ollama/llama3": "no-token-needed", # Any model from Ollama no need for API token
|
||||
"groq/llama3-70b-8192": os.getenv("GROQ_API_KEY"),
|
||||
"groq/llama3-8b-8192": os.getenv("GROQ_API_KEY"),
|
||||
"openai/gpt-3.5-turbo": os.getenv("OPENAI_API_KEY"),
|
||||
"openai/gpt-4-turbo": os.getenv("OPENAI_API_KEY"),
|
||||
"openai/gpt-4o": os.getenv("OPENAI_API_KEY"),
|
||||
"anthropic/claude-3-haiku-20240307": os.getenv("ANTHROPIC_API_KEY"),
|
||||
"anthropic/claude-3-opus-20240229": os.getenv("ANTHROPIC_API_KEY"),
|
||||
"anthropic/claude-3-sonnet-20240229": os.getenv("ANTHROPIC_API_KEY"),
|
||||
|
||||
@@ -5,18 +5,20 @@ from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.common.exceptions import InvalidArgumentException
|
||||
import chromedriver_autoinstaller
|
||||
from typing import List
|
||||
import requests
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
class CrawlerStrategy(ABC):
|
||||
@abstractmethod
|
||||
def crawl(self, url: str) -> str:
|
||||
def crawl(self, url: str, **kwargs) -> str:
|
||||
pass
|
||||
|
||||
class CloudCrawlerStrategy(CrawlerStrategy):
|
||||
def crawl(self, url: str) -> str:
|
||||
def crawl(self, url: str, use_cached_html = False, css_selector = None) -> str:
|
||||
data = {
|
||||
"urls": [url],
|
||||
"provider_model": "",
|
||||
@@ -40,19 +42,34 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
||||
self.options.add_argument("--disable-dev-shm-usage")
|
||||
self.options.add_argument("--headless")
|
||||
|
||||
chromedriver_autoinstaller.install()
|
||||
# chromedriver_autoinstaller.install()
|
||||
self.service = Service(chromedriver_autoinstaller.install())
|
||||
self.driver = webdriver.Chrome(service=self.service, options=self.options)
|
||||
|
||||
def crawl(self, url: str, use_cached_html = False) -> str:
|
||||
def crawl(self, url: str, use_cached_html = False, css_selector = None) -> str:
|
||||
if use_cached_html:
|
||||
return get_content_of_website(url)
|
||||
self.driver.get(url)
|
||||
WebDriverWait(self.driver, 10).until(
|
||||
EC.presence_of_all_elements_located((By.TAG_NAME, "html"))
|
||||
)
|
||||
html = self.driver.page_source
|
||||
return html
|
||||
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url.replace("/", "_"))
|
||||
if os.path.exists(cache_file_path):
|
||||
with open(cache_file_path, "r") as f:
|
||||
return f.read()
|
||||
|
||||
try:
|
||||
self.driver.get(url)
|
||||
WebDriverWait(self.driver, 10).until(
|
||||
EC.presence_of_all_elements_located((By.TAG_NAME, "html"))
|
||||
)
|
||||
html = self.driver.page_source
|
||||
|
||||
# Store in cache
|
||||
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url.replace("/", "_"))
|
||||
with open(cache_file_path, "w") as f:
|
||||
f.write(html)
|
||||
|
||||
return html
|
||||
except InvalidArgumentException:
|
||||
raise InvalidArgumentException(f"Invalid URL {url}")
|
||||
except Exception as e:
|
||||
raise Exception(f"Failed to crawl {url}: {str(e)}")
|
||||
|
||||
def quit(self):
|
||||
self.driver.quit()
|
||||
@@ -1,7 +1,15 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
import sqlite3
|
||||
from typing import Optional
|
||||
from typing import Optional, Tuple
|
||||
|
||||
DB_PATH = os.path.join(Path.home(), ".crawl4ai")
|
||||
os.makedirs(DB_PATH, exist_ok=True)
|
||||
DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")
|
||||
|
||||
def init_db(db_path: str):
|
||||
global DB_PATH
|
||||
conn = sqlite3.connect(db_path)
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('''
|
||||
@@ -16,46 +24,65 @@ def init_db(db_path: str):
|
||||
''')
|
||||
conn.commit()
|
||||
conn.close()
|
||||
DB_PATH = db_path
|
||||
|
||||
def get_cached_url(db_path: str, url: str) -> Optional[tuple]:
|
||||
conn = sqlite3.connect(db_path)
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('SELECT url, html, cleaned_html, markdown, parsed_json, success FROM crawled_data WHERE url = ?', (url,))
|
||||
result = cursor.fetchone()
|
||||
conn.close()
|
||||
return result
|
||||
def check_db_path():
|
||||
if not DB_PATH:
|
||||
raise ValueError("Database path is not set or is empty.")
|
||||
|
||||
def cache_url(db_path: str, url: str, html: str, cleaned_html: str, markdown: str, parsed_json: str, success: bool):
|
||||
conn = sqlite3.connect(db_path)
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('''
|
||||
INSERT INTO crawled_data (url, html, cleaned_html, markdown, parsed_json, success)
|
||||
VALUES (?, ?, ?, ?, ?, ?)
|
||||
ON CONFLICT(url) DO UPDATE SET
|
||||
html = excluded.html,
|
||||
cleaned_html = excluded.cleaned_html,
|
||||
markdown = excluded.markdown,
|
||||
parsed_json = excluded.parsed_json,
|
||||
success = excluded.success
|
||||
''', (str(url), html, cleaned_html, markdown, parsed_json, success))
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
def get_total_count(db_path: str) -> int:
|
||||
def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool]]:
|
||||
check_db_path()
|
||||
try:
|
||||
conn = sqlite3.connect(db_path)
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('SELECT url, html, cleaned_html, markdown, parsed_json, success FROM crawled_data WHERE url = ?', (url,))
|
||||
result = cursor.fetchone()
|
||||
conn.close()
|
||||
return result
|
||||
except Exception as e:
|
||||
print(f"Error retrieving cached URL: {e}")
|
||||
return None
|
||||
|
||||
def cache_url(url: str, html: str, cleaned_html: str, markdown: str, parsed_json: str, success: bool):
|
||||
check_db_path()
|
||||
try:
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('''
|
||||
INSERT INTO crawled_data (url, html, cleaned_html, markdown, parsed_json, success)
|
||||
VALUES (?, ?, ?, ?, ?, ?)
|
||||
ON CONFLICT(url) DO UPDATE SET
|
||||
html = excluded.html,
|
||||
cleaned_html = excluded.cleaned_html,
|
||||
markdown = excluded.markdown,
|
||||
parsed_json = excluded.parsed_json,
|
||||
success = excluded.success
|
||||
''', (url, html, cleaned_html, markdown, parsed_json, success))
|
||||
conn.commit()
|
||||
conn.close()
|
||||
except Exception as e:
|
||||
print(f"Error caching URL: {e}")
|
||||
|
||||
def get_total_count() -> int:
|
||||
check_db_path()
|
||||
try:
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('SELECT COUNT(*) FROM crawled_data')
|
||||
result = cursor.fetchone()
|
||||
conn.close()
|
||||
return result[0]
|
||||
except Exception as e:
|
||||
print(f"Error getting total count: {e}")
|
||||
return 0
|
||||
|
||||
# Crete function to cler the database
|
||||
def clear_db(db_path: str):
|
||||
conn = sqlite3.connect(db_path)
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('DELETE FROM crawled_data')
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
def clear_db():
|
||||
check_db_path()
|
||||
try:
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('DELETE FROM crawled_data')
|
||||
conn.commit()
|
||||
conn.close()
|
||||
except Exception as e:
|
||||
print(f"Error clearing database: {e}")
|
||||
@@ -7,6 +7,8 @@ from .prompts import PROMPT_EXTRACT_BLOCKS
|
||||
from .config import *
|
||||
from .utils import *
|
||||
from functools import partial
|
||||
from .model_loader import load_bert_base_uncased, load_bge_small_en_v1_5, load_spacy_model
|
||||
|
||||
|
||||
class ExtractionStrategy(ABC):
|
||||
"""
|
||||
@@ -15,6 +17,7 @@ class ExtractionStrategy(ABC):
|
||||
|
||||
def __init__(self):
|
||||
self.DEL = "<|DEL|>"
|
||||
self.name = self.__class__.__name__
|
||||
|
||||
@abstractmethod
|
||||
def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
|
||||
@@ -67,7 +70,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
|
||||
|
||||
def extract(self, url: str, html: str) -> List[Dict[str, Any]]:
|
||||
print("Extracting blocks ...")
|
||||
print("[LOG] Extracting blocks from URL:", url)
|
||||
variable_values = {
|
||||
"URL": url,
|
||||
"HTML": escape_json_string(sanitize_html(html)),
|
||||
@@ -98,7 +101,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
"content": unparsed
|
||||
})
|
||||
|
||||
print("Extracted", len(blocks), "blocks.")
|
||||
print("[LOG] Extracted", len(blocks), "blocks from URL:", url)
|
||||
return blocks
|
||||
|
||||
def _merge(self, documents):
|
||||
@@ -125,6 +128,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
"""
|
||||
Process sections sequentially with a delay for rate limiting issues, specifically for LLMExtractionStrategy.
|
||||
"""
|
||||
|
||||
merged_sections = self._merge(sections)
|
||||
parsed_json = []
|
||||
if self.provider.startswith("groq/"):
|
||||
@@ -144,7 +148,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
|
||||
return parsed_json
|
||||
|
||||
class CosinegStrategy(ExtractionStrategy):
|
||||
class CosineStrategy(ExtractionStrategy):
|
||||
def __init__(self, word_count_threshold=20, max_dist=0.2, linkage_method='ward', top_k=3, model_name = 'BAAI/bge-small-en-v1.5'):
|
||||
"""
|
||||
Initialize the strategy with clustering parameters.
|
||||
@@ -164,20 +168,13 @@ class CosinegStrategy(ExtractionStrategy):
|
||||
self.linkage_method = linkage_method
|
||||
self.top_k = top_k
|
||||
self.timer = time.time()
|
||||
|
||||
if model_name == "bert-base-uncased":
|
||||
self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', resume_download=None)
|
||||
self.model = BertModel.from_pretrained('bert-base-uncased', resume_download=None)
|
||||
elif model_name == "sshleifer/distilbart-cnn-12-6":
|
||||
# self.model = IPEXModel.from_pretrained("Intel/bge-small-en-v1.5-rag-int8-static")
|
||||
# self.tokenizer = AutoTokenizer.from_pretrained("Intel/bge-small-en-v1.5-rag-int8-static")
|
||||
pass
|
||||
elif model_name == "BAAI/bge-small-en-v1.5":
|
||||
self.tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-small-en-v1.5', resume_download=None)
|
||||
self.model = AutoModel.from_pretrained('BAAI/bge-small-en-v1.5', resume_download=None)
|
||||
self.model.eval()
|
||||
|
||||
self.nlp = spacy.load("models/reuters")
|
||||
if model_name == "bert-base-uncased":
|
||||
self.tokenizer, self.model = load_bert_base_uncased()
|
||||
elif model_name == "BAAI/bge-small-en-v1.5":
|
||||
self.tokenizer, self.model = load_bge_small_en_v1_5()
|
||||
|
||||
self.nlp = load_spacy_model()
|
||||
print(f"[LOG] Model loaded {model_name}, models/reuters, took " + str(time.time() - self.timer) + " seconds")
|
||||
|
||||
def get_embeddings(self, sentences: List[str]):
|
||||
|
||||
20
crawl4ai/model_loader.py
Normal file
20
crawl4ai/model_loader.py
Normal file
@@ -0,0 +1,20 @@
|
||||
from functools import lru_cache
|
||||
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
|
||||
import spacy
|
||||
|
||||
@lru_cache()
|
||||
def load_bert_base_uncased():
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', resume_download=None)
|
||||
model = BertModel.from_pretrained('bert-base-uncased', resume_download=None)
|
||||
return tokenizer, model
|
||||
|
||||
@lru_cache()
|
||||
def load_bge_small_en_v1_5():
|
||||
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-small-en-v1.5', resume_download=None)
|
||||
model = AutoModel.from_pretrained('BAAI/bge-small-en-v1.5', resume_download=None)
|
||||
model.eval()
|
||||
return tokenizer, model
|
||||
|
||||
@lru_cache()
|
||||
def load_spacy_model():
|
||||
return spacy.load("models/reuters")
|
||||
@@ -10,6 +10,8 @@ from html2text import HTML2Text
|
||||
from .prompts import PROMPT_EXTRACT_BLOCKS
|
||||
from .config import *
|
||||
|
||||
class InvalidCSSSelectorError(Exception):
|
||||
pass
|
||||
|
||||
def beautify_html(escaped_html):
|
||||
"""
|
||||
@@ -140,13 +142,25 @@ class CustomHTML2Text(HTML2Text):
|
||||
|
||||
super().handle_tag(tag, attrs, start)
|
||||
|
||||
def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD):
|
||||
def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None):
|
||||
try:
|
||||
if not html:
|
||||
return None
|
||||
# Parse HTML content with BeautifulSoup
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
|
||||
# Get the content within the <body> tag
|
||||
body = soup.body
|
||||
|
||||
# If css_selector is provided, extract content based on the selector
|
||||
if css_selector:
|
||||
selected_elements = body.select(css_selector)
|
||||
if not selected_elements:
|
||||
raise InvalidCSSSelectorError(f"Invalid CSS selector , No elements found for CSS selector: {css_selector}")
|
||||
div_tag = soup.new_tag('div')
|
||||
for el in selected_elements:
|
||||
div_tag.append(el)
|
||||
body = div_tag
|
||||
|
||||
# Remove script, style, and other tags that don't carry useful content from body
|
||||
for tag in body.find_all(['script', 'style', 'link', 'meta', 'noscript']):
|
||||
@@ -255,7 +269,7 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD):
|
||||
|
||||
|
||||
# Remove comments
|
||||
for comment in soup.find_all(text=lambda text: isinstance(text, Comment)):
|
||||
for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
|
||||
comment.extract()
|
||||
|
||||
# Remove consecutive empty newlines and replace multiple spaces with a single space
|
||||
@@ -281,7 +295,7 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD):
|
||||
|
||||
except Exception as e:
|
||||
print('Error processing HTML content:', str(e))
|
||||
return None
|
||||
raise InvalidCSSSelectorError(f"Invalid CSS selector: {css_selector}") from e
|
||||
|
||||
def extract_xml_tags(string):
|
||||
tags = re.findall(r'<(\w+)>', string)
|
||||
|
||||
@@ -2,7 +2,7 @@ import os, time
|
||||
from pathlib import Path
|
||||
|
||||
from .models import UrlModel, CrawlResult
|
||||
from .database import init_db, get_cached_url, cache_url
|
||||
from .database import init_db, get_cached_url, cache_url, DB_PATH
|
||||
from .utils import *
|
||||
from .chunking_strategy import *
|
||||
from .extraction_strategy import *
|
||||
@@ -10,6 +10,7 @@ from .crawler_strategy import *
|
||||
from typing import List
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from .config import *
|
||||
# from .model_loader import load_bert_base_uncased, load_bge_small_en_v1_5, load_spacy_model
|
||||
|
||||
|
||||
class WebCrawler:
|
||||
@@ -36,11 +37,11 @@ class WebCrawler:
|
||||
|
||||
def warmup(self):
|
||||
print("[LOG] 🌤️ Warming up the WebCrawler")
|
||||
single_url = UrlModel(url='https://crawl4ai.uccode.io/', forced=False)
|
||||
result = self.run(
|
||||
single_url,
|
||||
url='https://crawl4ai.uccode.io/',
|
||||
word_count_threshold=5,
|
||||
extraction_strategy= CosinegStrategy(),
|
||||
extraction_strategy= CosineStrategy(),
|
||||
bypass_cache=False,
|
||||
verbose = False
|
||||
)
|
||||
self.ready = True
|
||||
@@ -60,10 +61,11 @@ class WebCrawler:
|
||||
**kwargs,
|
||||
) -> CrawlResult:
|
||||
return self.run(
|
||||
url_model,
|
||||
url_model.url,
|
||||
word_count_threshold,
|
||||
extraction_strategy,
|
||||
chunking_strategy,
|
||||
bypass_cache=url_model.forced,
|
||||
**kwargs,
|
||||
)
|
||||
pass
|
||||
@@ -71,77 +73,85 @@ class WebCrawler:
|
||||
|
||||
def run(
|
||||
self,
|
||||
url_model: UrlModel,
|
||||
url: str,
|
||||
word_count_threshold=MIN_WORD_THRESHOLD,
|
||||
extraction_strategy: ExtractionStrategy = NoExtractionStrategy(),
|
||||
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
||||
bypass_cache: bool = False,
|
||||
css_selector: str = None,
|
||||
verbose=True,
|
||||
**kwargs,
|
||||
) -> CrawlResult:
|
||||
# Check if extraction strategy is an instance of ExtractionStrategy if not raise an error
|
||||
if not isinstance(extraction_strategy, ExtractionStrategy):
|
||||
raise ValueError("Unsupported extraction strategy")
|
||||
if not isinstance(chunking_strategy, ChunkingStrategy):
|
||||
raise ValueError("Unsupported chunking strategy")
|
||||
|
||||
# make sure word_count_threshold is not lesser than MIN_WORD_THRESHOLD
|
||||
if word_count_threshold < MIN_WORD_THRESHOLD:
|
||||
word_count_threshold = MIN_WORD_THRESHOLD
|
||||
|
||||
# Check cache first
|
||||
cached = get_cached_url(self.db_path, str(url_model.url))
|
||||
if cached and not url_model.forced:
|
||||
return CrawlResult(
|
||||
**{
|
||||
"url": cached[0],
|
||||
"html": cached[1],
|
||||
"cleaned_html": cached[2],
|
||||
"markdown": cached[3],
|
||||
"parsed_json": cached[4],
|
||||
"success": cached[5],
|
||||
"error_message": "",
|
||||
}
|
||||
)
|
||||
if not bypass_cache:
|
||||
cached = get_cached_url(url)
|
||||
if cached:
|
||||
return CrawlResult(
|
||||
**{
|
||||
"url": cached[0],
|
||||
"html": cached[1],
|
||||
"cleaned_html": cached[2],
|
||||
"markdown": cached[3],
|
||||
"parsed_json": cached[4],
|
||||
"success": cached[5],
|
||||
"error_message": "",
|
||||
}
|
||||
)
|
||||
|
||||
# Initialize WebDriver for crawling
|
||||
t = time.time()
|
||||
try:
|
||||
html = self.crawler_strategy.crawl(str(url_model.url))
|
||||
success = True
|
||||
error_message = ""
|
||||
except Exception as e:
|
||||
html = ""
|
||||
success = False
|
||||
error_message = str(e)
|
||||
|
||||
html = self.crawler_strategy.crawl(url)
|
||||
success = True
|
||||
error_message = ""
|
||||
# Extract content from HTML
|
||||
result = get_content_of_website(html, word_count_threshold)
|
||||
try:
|
||||
result = get_content_of_website(html, word_count_threshold, css_selector=css_selector)
|
||||
if result is None:
|
||||
raise ValueError(f"Failed to extract content from the website: {url}")
|
||||
except InvalidCSSSelectorError as e:
|
||||
raise ValueError(str(e))
|
||||
|
||||
cleaned_html = result.get("cleaned_html", html)
|
||||
markdown = result.get("markdown", "")
|
||||
|
||||
# Print a profession LOG style message, show time taken and say crawling is done
|
||||
if verbose:
|
||||
print(
|
||||
f"[LOG] 🚀 Crawling done for {url_model.url}, success: {success}, time taken: {time.time() - t} seconds"
|
||||
f"[LOG] 🚀 Crawling done for {url}, success: {success}, time taken: {time.time() - t} seconds"
|
||||
)
|
||||
|
||||
parsed_json = []
|
||||
if verbose:
|
||||
print(f"[LOG] 🔥 Extracting semantic blocks for {url_model.url}")
|
||||
print(f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}")
|
||||
t = time.time()
|
||||
# Split markdown into sections
|
||||
sections = chunking_strategy.chunk(markdown)
|
||||
# sections = merge_chunks_based_on_token_threshold(sections, CHUNK_TOKEN_THRESHOLD)
|
||||
|
||||
parsed_json = extraction_strategy.run(
|
||||
str(url_model.url), sections,
|
||||
url, sections,
|
||||
)
|
||||
parsed_json = json.dumps(parsed_json)
|
||||
|
||||
if verbose:
|
||||
print(
|
||||
f"[LOG] 🚀 Extraction done for {url_model.url}, time taken: {time.time() - t} seconds."
|
||||
f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t} seconds."
|
||||
)
|
||||
|
||||
# Cache the result
|
||||
cleaned_html = beautify_html(cleaned_html)
|
||||
cache_url(
|
||||
self.db_path,
|
||||
str(url_model.url),
|
||||
url,
|
||||
html,
|
||||
cleaned_html,
|
||||
markdown,
|
||||
@@ -150,7 +160,7 @@ class WebCrawler:
|
||||
)
|
||||
|
||||
return CrawlResult(
|
||||
url=str(url_model.url),
|
||||
url=url,
|
||||
html=html,
|
||||
cleaned_html=cleaned_html,
|
||||
markdown=markdown,
|
||||
|
||||
12
docs/chunking_strategies.json
Normal file
12
docs/chunking_strategies.json
Normal file
@@ -0,0 +1,12 @@
|
||||
{
|
||||
"RegexChunking": "### RegexChunking\n\n`RegexChunking` is a text chunking strategy that splits a given text into smaller parts using regular expressions.\nThis is useful for preparing large texts for processing by language models, ensuring they are divided into manageable segments.\n\n#### Constructor Parameters:\n- `patterns` (list, optional): A list of regular expression patterns used to split the text. Default is to split by double newlines (`['\\n\\n']`).\n\n#### Example usage:\n```python\nchunker = RegexChunking(patterns=[r'\\n\\n', r'\\. '])\nchunks = chunker.chunk(\"This is a sample text. It will be split into chunks.\")\n```",
|
||||
|
||||
"NlpSentenceChunking": "### NlpSentenceChunking\n\n`NlpSentenceChunking` uses a natural language processing model to chunk a given text into sentences. This approach leverages SpaCy to accurately split text based on sentence boundaries.\n\n#### Constructor Parameters:\n- `model` (str, optional): The SpaCy model to use for sentence detection. Default is `'en_core_web_sm'`.\n\n#### Example usage:\n```python\nchunker = NlpSentenceChunking(model='en_core_web_sm')\nchunks = chunker.chunk(\"This is a sample text. It will be split into sentences.\")\n```",
|
||||
|
||||
"TopicSegmentationChunking": "### TopicSegmentationChunking\n\n`TopicSegmentationChunking` uses the TextTiling algorithm to segment a given text into topic-based chunks. This method identifies thematic boundaries in the text.\n\n#### Constructor Parameters:\n- `num_keywords` (int, optional): The number of keywords to extract for each topic segment. Default is `3`.\n\n#### Example usage:\n```python\nchunker = TopicSegmentationChunking(num_keywords=3)\nchunks = chunker.chunk(\"This is a sample text. It will be split into topic-based segments.\")\n```",
|
||||
|
||||
"FixedLengthWordChunking": "### FixedLengthWordChunking\n\n`FixedLengthWordChunking` splits a given text into chunks of fixed length, based on the number of words.\n\n#### Constructor Parameters:\n- `chunk_size` (int, optional): The number of words in each chunk. Default is `100`.\n\n#### Example usage:\n```python\nchunker = FixedLengthWordChunking(chunk_size=100)\nchunks = chunker.chunk(\"This is a sample text. It will be split into fixed-length word chunks.\")\n```",
|
||||
|
||||
"SlidingWindowChunking": "### SlidingWindowChunking\n\n`SlidingWindowChunking` uses a sliding window approach to chunk a given text. Each chunk has a fixed length, and the window slides by a specified step size.\n\n#### Constructor Parameters:\n- `window_size` (int, optional): The number of words in each chunk. Default is `100`.\n- `step` (int, optional): The number of words to slide the window. Default is `50`.\n\n#### Example usage:\n```python\nchunker = SlidingWindowChunking(window_size=100, step=50)\nchunks = chunker.chunk(\"This is a sample text. It will be split using a sliding window approach.\")\n```"
|
||||
}
|
||||
|
||||
10
docs/extraction_strategies.json
Normal file
10
docs/extraction_strategies.json
Normal file
@@ -0,0 +1,10 @@
|
||||
{
|
||||
"NoExtractionStrategy": "### NoExtractionStrategy\n\n`NoExtractionStrategy` is a basic extraction strategy that returns the entire HTML content without any modification. It is useful for cases where no specific extraction is required. Only clean html, and amrkdown.\n\n#### Constructor Parameters:\nNone.\n\n#### Example usage:\n```python\nextractor = NoExtractionStrategy()\nextracted_content = extractor.extract(url, html)\n```",
|
||||
|
||||
"LLMExtractionStrategy": "### LLMExtractionStrategy\n\n`LLMExtractionStrategy` uses a Language Model (LLM) to extract meaningful blocks or chunks from the given HTML content. This strategy leverages an external provider for language model completions.\n\n#### Constructor Parameters:\n- `provider` (str, optional): The provider to use for the language model completions. Default is `DEFAULT_PROVIDER` (following provider/model eg. openai/gpt-4o).\n- `api_token` (str, optional): The API token for the provider. If not provided, it will try to load from the environment variable `OPENAI_API_KEY`.\n\n#### Example usage:\n```python\nextractor = LLMExtractionStrategy(provider='openai', api_token='your_api_token')\nextracted_content = extractor.extract(url, html)\n```",
|
||||
|
||||
"CosineStrategy": "### CosineStrategy\n\n`CosineStrategy` uses hierarchical clustering based on cosine similarity to extract clusters of text from the given HTML content. This strategy is suitable for identifying related content sections.\n\n#### Constructor Parameters:\n- `word_count_threshold` (int, optional): Minimum number of words per cluster. Default is `20`.\n- `max_dist` (float, optional): The maximum cophenetic distance on the dendrogram to form clusters. Default is `0.2`.\n- `linkage_method` (str, optional): The linkage method for hierarchical clustering. Default is `'ward'`.\n- `top_k` (int, optional): Number of top categories to extract. Default is `3`.\n- `model_name` (str, optional): The model name for embedding generation. Default is `'BAAI/bge-small-en-v1.5'`.\n\n#### Example usage:\n```python\nextractor = CosineStrategy(word_count_threshold=20, max_dist=0.2, linkage_method='ward', top_k=3, model_name='BAAI/bge-small-en-v1.5')\nextracted_content = extractor.extract(url, html)\n```",
|
||||
|
||||
"TopicExtractionStrategy": "### TopicExtractionStrategy\n\n`TopicExtractionStrategy` uses the TextTiling algorithm to segment the HTML content into topics and extracts keywords for each segment. This strategy is useful for identifying and summarizing thematic content.\n\n#### Constructor Parameters:\n- `num_keywords` (int, optional): Number of keywords to represent each topic segment. Default is `3`.\n\n#### Example usage:\n```python\nextractor = TopicExtractionStrategy(num_keywords=3)\nextracted_content = extractor.extract(url, html)\n```"
|
||||
}
|
||||
|
||||
33
docs/quickstart.py
Normal file
33
docs/quickstart.py
Normal file
@@ -0,0 +1,33 @@
|
||||
import os
|
||||
from crawl4ai.web_crawler import WebCrawler
|
||||
from crawl4ai.chunking_strategy import *
|
||||
from crawl4ai.extraction_strategy import *
|
||||
|
||||
|
||||
def main():
|
||||
crawler = WebCrawler()
|
||||
crawler.warmup()
|
||||
|
||||
# Single page crawl
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
word_count_threshold=5, # Minimum word count for a HTML tag to be considered as a worthy block
|
||||
chunking_strategy=RegexChunking(patterns=["\n\n"]), # Default is RegexChunking
|
||||
extraction_strategy=CosineStrategy(
|
||||
word_count_threshold=20, max_dist=0.2, linkage_method="ward", top_k=3
|
||||
), # Default is CosineStrategy
|
||||
# extraction_strategy= LLMExtractionStrategy(provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY')),
|
||||
bypass_cache=True,
|
||||
extract_blocks=True, # Whether to extract semantical blocks of text from the HTML
|
||||
css_selector="", # Eg: "div.article-body" or all H2 tags liek "h2"
|
||||
verbose=True,
|
||||
include_raw_html=True, # Whether to include the raw HTML content in the response
|
||||
)
|
||||
|
||||
|
||||
print("[LOG] 📦 Crawl result:")
|
||||
print(result.model_dump())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,32 +0,0 @@
|
||||
from crawl4ai.web_crawler import WebCrawler
|
||||
from crawl4ai.models import UrlModel
|
||||
from crawl4ai.utils import get_content_of_website
|
||||
import os
|
||||
|
||||
def main():
|
||||
# Initialize the WebCrawler with just the database path
|
||||
crawler = WebCrawler(db_path='crawler_data.db')
|
||||
|
||||
# Fetch a single page
|
||||
single_url = UrlModel(url='https://www.nbcnews.com/business', forced=True)
|
||||
result = crawler.fetch_page(
|
||||
single_url,
|
||||
provider= "openai/gpt-3.5-turbo",
|
||||
api_token = os.getenv('OPENAI_API_KEY'),
|
||||
use_cached_html = True,
|
||||
extract_blocks_flag=True,
|
||||
word_count_threshold=10
|
||||
)
|
||||
print(result.model_dump())
|
||||
|
||||
# Fetch multiple pages
|
||||
# urls = [
|
||||
# UrlModel(url='http://example.com', forced=False),
|
||||
# UrlModel(url='http://example.org', forced=False)
|
||||
# ]
|
||||
# results = crawler.fetch_pages(urls, provider= "openai/gpt-4-turbo", api_token = os.getenv('OPENAI_API_KEY'))
|
||||
# for res in results:
|
||||
# print(res.model_copy())
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
157
main.py
157
main.py
@@ -1,24 +1,19 @@
|
||||
from fastapi import FastAPI, HTTPException, Request
|
||||
from fastapi.responses import HTMLResponse
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from fastapi.responses import JSONResponse
|
||||
from pydantic import BaseModel, HttpUrl
|
||||
from typing import List, Optional
|
||||
from crawl4ai.web_crawler import WebCrawler
|
||||
from crawl4ai.models import UrlModel
|
||||
import asyncio
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
import chromedriver_autoinstaller
|
||||
from functools import lru_cache
|
||||
from crawl4ai.database import get_total_count, clear_db
|
||||
import os
|
||||
import uuid
|
||||
# Import the CORS middleware
|
||||
import importlib
|
||||
import asyncio
|
||||
from functools import lru_cache
|
||||
|
||||
from fastapi import FastAPI, HTTPException, Request
|
||||
from fastapi.responses import HTMLResponse, JSONResponse
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
|
||||
from pydantic import BaseModel, HttpUrl
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from typing import List, Optional
|
||||
|
||||
# Task management
|
||||
tasks = {}
|
||||
from crawl4ai.web_crawler import WebCrawler
|
||||
from crawl4ai.database import get_total_count, clear_db
|
||||
|
||||
# Configuration
|
||||
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
@@ -41,22 +36,25 @@ app.add_middleware(
|
||||
# Mount the pages directory as a static directory
|
||||
app.mount("/pages", StaticFiles(directory=__location__ + "/pages"), name="pages")
|
||||
|
||||
# chromedriver_autoinstaller.install() # Ensure chromedriver is installed
|
||||
@lru_cache()
|
||||
def get_crawler():
|
||||
# Initialize and return a WebCrawler instance
|
||||
return WebCrawler()
|
||||
|
||||
chromedriver_autoinstaller.install() # Ensure chromedriver is installed
|
||||
|
||||
class UrlsInput(BaseModel):
|
||||
class CrawlRequest(BaseModel):
|
||||
urls: List[HttpUrl]
|
||||
provider_model: str
|
||||
api_token: str
|
||||
include_raw_html: Optional[bool] = False
|
||||
forced: bool = False
|
||||
bypass_cache: bool = False
|
||||
extract_blocks: bool = True
|
||||
word_count_threshold: Optional[int] = 5
|
||||
extraction_strategy: Optional[str] = "CosineStrategy"
|
||||
chunking_strategy: Optional[str] = "RegexChunking"
|
||||
css_selector: Optional[str] = None
|
||||
verbose: Optional[bool] = True
|
||||
|
||||
@lru_cache()
|
||||
def get_crawler():
|
||||
# Initialize and return a WebCrawler instance
|
||||
return WebCrawler(db_path='crawler_data.db')
|
||||
|
||||
@app.get("/", response_class=HTMLResponse)
|
||||
async def read_index():
|
||||
@@ -66,20 +64,30 @@ async def read_index():
|
||||
|
||||
@app.get("/total-count")
|
||||
async def get_total_url_count():
|
||||
count = get_total_count(db_path='crawler_data.db')
|
||||
count = get_total_count()
|
||||
return JSONResponse(content={"count": count})
|
||||
|
||||
# Add endpoit to clear db
|
||||
@app.get("/clear-db")
|
||||
async def clear_database():
|
||||
clear_db(db_path='crawler_data.db')
|
||||
clear_db()
|
||||
return JSONResponse(content={"message": "Database cleared."})
|
||||
|
||||
def import_strategy(module_name: str, class_name: str):
|
||||
try:
|
||||
module = importlib.import_module(module_name)
|
||||
strategy_class = getattr(module, class_name)
|
||||
return strategy_class()
|
||||
except ImportError:
|
||||
raise HTTPException(status_code=400, detail=f"Module {module_name} not found.")
|
||||
except AttributeError:
|
||||
raise HTTPException(status_code=400, detail=f"Class {class_name} not found in {module_name}.")
|
||||
|
||||
@app.post("/crawl")
|
||||
async def crawl_urls(urls_input: UrlsInput, request: Request):
|
||||
async def crawl_urls(crawl_request: CrawlRequest, request: Request):
|
||||
global current_requests
|
||||
# Raise error if api_token is not provided
|
||||
if not urls_input.api_token:
|
||||
if not crawl_request.api_token:
|
||||
raise HTTPException(status_code=401, detail="API token is required.")
|
||||
async with lock:
|
||||
if current_requests >= MAX_CONCURRENT_REQUESTS:
|
||||
@@ -87,87 +95,50 @@ async def crawl_urls(urls_input: UrlsInput, request: Request):
|
||||
current_requests += 1
|
||||
|
||||
try:
|
||||
# Prepare URL models for crawling
|
||||
url_models = [UrlModel(url=url, forced=urls_input.forced) for url in urls_input.urls]
|
||||
extraction_strategy = import_strategy("crawl4ai.extraction_strategy", crawl_request.extraction_strategy)
|
||||
chunking_strategy = import_strategy("crawl4ai.chunking_strategy", crawl_request.chunking_strategy)
|
||||
|
||||
# Use ThreadPoolExecutor to run the synchronous WebCrawler in async manner
|
||||
with ThreadPoolExecutor() as executor:
|
||||
loop = asyncio.get_event_loop()
|
||||
futures = [
|
||||
loop.run_in_executor(executor, get_crawler().fetch_page, url_model, urls_input.provider_model, urls_input.api_token, urls_input.extract_blocks, urls_input.word_count_threshold)
|
||||
for url_model in url_models
|
||||
loop.run_in_executor(
|
||||
executor,
|
||||
get_crawler().run,
|
||||
str(url),
|
||||
crawl_request.word_count_threshold,
|
||||
extraction_strategy,
|
||||
chunking_strategy,
|
||||
crawl_request.bypass_cache,
|
||||
crawl_request.css_selector,
|
||||
crawl_request.verbose
|
||||
)
|
||||
for url in crawl_request.urls
|
||||
]
|
||||
results = await asyncio.gather(*futures)
|
||||
|
||||
# if include_raw_html is False, remove the raw HTML content from the results
|
||||
if not urls_input.include_raw_html:
|
||||
if not crawl_request.include_raw_html:
|
||||
for result in results:
|
||||
result.html = None
|
||||
|
||||
|
||||
return {"results": [result.dict() for result in results]}
|
||||
finally:
|
||||
async with lock:
|
||||
current_requests -= 1
|
||||
|
||||
@app.get("/strategies/extraction", response_class=JSONResponse)
|
||||
async def get_extraction_strategies():
|
||||
# Load docs/extraction_strategies.json" and return as JSON response
|
||||
with open(f"{__location__}/docs/extraction_strategies.json", "r") as file:
|
||||
return JSONResponse(content=file.read())
|
||||
|
||||
@app.post("/crawl_async")
|
||||
async def crawl_urls(urls_input: UrlsInput, request: Request):
|
||||
global current_requests
|
||||
if not urls_input.api_token:
|
||||
raise HTTPException(status_code=401, detail="API token is required.")
|
||||
|
||||
async with lock:
|
||||
if current_requests >= MAX_CONCURRENT_REQUESTS:
|
||||
raise HTTPException(status_code=429, detail="Too many requests - please try again later.")
|
||||
current_requests += 1
|
||||
|
||||
task_id = str(uuid.uuid4())
|
||||
tasks[task_id] = {"status": "pending", "results": None}
|
||||
|
||||
try:
|
||||
url_models = [UrlModel(url=url, forced=urls_input.forced) for url in urls_input.urls]
|
||||
|
||||
loop = asyncio.get_running_loop()
|
||||
loop.create_task(
|
||||
process_crawl_task(url_models, urls_input.provider_model, urls_input.api_token, task_id, urls_input.extract_blocks)
|
||||
)
|
||||
return {"task_id": task_id}
|
||||
finally:
|
||||
async with lock:
|
||||
current_requests -= 1
|
||||
|
||||
async def process_crawl_task(url_models, provider, api_token, task_id, extract_blocks_flag):
|
||||
try:
|
||||
with ThreadPoolExecutor() as executor:
|
||||
loop = asyncio.get_running_loop()
|
||||
futures = [
|
||||
loop.run_in_executor(executor, get_crawler().fetch_page, url_model, provider, api_token, extract_blocks_flag)
|
||||
for url_model in url_models
|
||||
]
|
||||
results = await asyncio.gather(*futures)
|
||||
|
||||
tasks[task_id] = {"status": "done", "results": results}
|
||||
except Exception as e:
|
||||
tasks[task_id] = {"status": "failed", "error": str(e)}
|
||||
|
||||
@app.get("/task/{task_id}")
|
||||
async def get_task_status(task_id: str):
|
||||
task = tasks.get(task_id)
|
||||
if not task:
|
||||
raise HTTPException(status_code=404, detail="Task not found")
|
||||
|
||||
if task['status'] == 'done':
|
||||
return {
|
||||
"status": task['status'],
|
||||
"results": [result.dict() for result in task['results']]
|
||||
}
|
||||
elif task['status'] == 'failed':
|
||||
return {
|
||||
"status": task['status'],
|
||||
"error": task['error']
|
||||
}
|
||||
else:
|
||||
return {"status": task['status']}
|
||||
@app.get("/strategies/chunking", response_class=JSONResponse)
|
||||
async def get_chunking_strategies():
|
||||
with open(f"{__location__}/docs/chunking_strategies.json", "r") as file:
|
||||
return JSONResponse(content=file.read())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import uvicorn
|
||||
uvicorn.run(app, host="0.0.0.0", port=8000)
|
||||
662
pages/index.html
662
pages/index.html
@@ -9,12 +9,15 @@
|
||||
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
|
||||
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@100..900&display=swap" rel="stylesheet" />
|
||||
|
||||
<link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet" />
|
||||
<!-- <link href="https://cdn.jsdelivr.net/npm/tailwindcss@3.4.3/dist/tailwind.min.css" rel="stylesheet" /> -->
|
||||
<script src="https://cdn.tailwindcss.com"></script>
|
||||
<script src="https://cdn.jsdelivr.net/npm/axios/dist/axios.min.js"></script>
|
||||
<link
|
||||
rel="stylesheet"
|
||||
href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/styles/vs2015.min.css"
|
||||
href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/styles/monokai.min.css"
|
||||
/>
|
||||
<script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
|
||||
|
||||
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/highlight.min.js"></script>
|
||||
<style>
|
||||
:root {
|
||||
@@ -46,138 +49,447 @@
|
||||
width: 100%;
|
||||
}
|
||||
</style>
|
||||
<style>
|
||||
/* Custom styling for docs-item class and Markdown generated elements */
|
||||
.docs-item {
|
||||
background-color: #2d3748; /* bg-gray-800 */
|
||||
padding: 1rem; /* p-4 */
|
||||
border-radius: 0.375rem; /* rounded */
|
||||
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); /* shadow-md */
|
||||
margin-bottom: 1rem; /* space between items */
|
||||
}
|
||||
|
||||
.docs-item h3,
|
||||
.docs-item h4 {
|
||||
color: #ffffff; /* text-white */
|
||||
font-size: 1.25rem; /* text-xl */
|
||||
font-weight: 700; /* font-bold */
|
||||
margin-bottom: 0.5rem; /* mb-2 */
|
||||
}
|
||||
|
||||
.docs-item p {
|
||||
color: #e2e8f0; /* text-gray-300 */
|
||||
margin-bottom: 0.5rem; /* mb-2 */
|
||||
}
|
||||
|
||||
.docs-item code {
|
||||
background-color: #1a202c; /* bg-gray-900 */
|
||||
color: #e2e8f0; /* text-gray-300 */
|
||||
padding: 0.25rem 0.5rem; /* px-2 py-1 */
|
||||
border-radius: 0.25rem; /* rounded */
|
||||
}
|
||||
|
||||
.docs-item pre {
|
||||
background-color: #1a202c; /* bg-gray-900 */
|
||||
color: #e2e8f0; /* text-gray-300 */
|
||||
padding: 0.5rem; /* p-2 */
|
||||
border-radius: 0.375rem; /* rounded */
|
||||
overflow: auto; /* overflow-auto */
|
||||
margin-bottom: 0.5rem; /* mb-2 */
|
||||
}
|
||||
|
||||
.docs-item div {
|
||||
color: #e2e8f0; /* text-gray-300 */
|
||||
font-size: 1rem; /* prose prose-sm */
|
||||
line-height: 1.25rem; /* line-height for readability */
|
||||
}
|
||||
|
||||
/* Adjustments to make prose class more suitable for dark mode */
|
||||
.prose {
|
||||
max-width: none; /* max-w-none */
|
||||
}
|
||||
|
||||
.prose p,
|
||||
.prose ul {
|
||||
margin-bottom: 1rem; /* mb-4 */
|
||||
}
|
||||
|
||||
.prose code {
|
||||
/* background-color: #4a5568; */ /* bg-gray-700 */
|
||||
color: #65a30d; /* text-white */
|
||||
padding: 0.25rem 0.5rem; /* px-1 py-0.5 */
|
||||
border-radius: 0.25rem; /* rounded */
|
||||
display: inline-block; /* inline-block */
|
||||
}
|
||||
|
||||
.prose pre {
|
||||
background-color: #1a202c; /* bg-gray-900 */
|
||||
color: #ffffff; /* text-white */
|
||||
padding: 0.5rem; /* p-2 */
|
||||
border-radius: 0.375rem; /* rounded */
|
||||
}
|
||||
|
||||
.prose h3 {
|
||||
color: #65a30d; /* text-white */
|
||||
font-size: 1.25rem; /* text-xl */
|
||||
font-weight: 700; /* font-bold */
|
||||
margin-bottom: 0.5rem; /* mb-2 */
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<header class="bg-gray-900 text-white py-4">
|
||||
<div class="container mx-auto px-4">
|
||||
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper</h1>
|
||||
<body class="bg-black text-gray-200">
|
||||
<header class="bg-zinc-950 text-white py-4 flex">
|
||||
<div class="mx-auto px-4">
|
||||
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Web Data for your Thoughts</h1>
|
||||
</div>
|
||||
<div class="mx-auto px-4 flex font-bold text-xl gap-2">
|
||||
<span>📊 Total Website Processed</span>
|
||||
<span id="total-count" class="text-lime-400">2</span>
|
||||
</div>
|
||||
</header>
|
||||
|
||||
<!-- Add a section to show total-count websited already crawled -->
|
||||
<section class="bg-gray-600 py-8">
|
||||
<div class="container mx-auto px-4 flex font-bold text-xl gap-2">
|
||||
<span>📊 Total Website Procceced</span>
|
||||
<span id="total-count" class="text-blue-400">0</span>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<section class="try-it py-8 pb-20">
|
||||
<section class="try-it py-8 px-16 pb-20">
|
||||
<div class="container mx-auto px-4">
|
||||
<h2 class="text-2xl font-bold mb-4">Try It Now</h2>
|
||||
<div class="mb-4 flex w-full gap-2">
|
||||
<div class="flex items-center gap-2 flex-col flex-grow">
|
||||
<label for="url-input" class="text-white">URL(s)</label>
|
||||
<input
|
||||
type="text"
|
||||
id="url-input"
|
||||
value="https://www.nbcnews.com/business"
|
||||
class="border border-gray-600 rounded px-4 py-2 flex-grow bg-gray-800 text-white w-full"
|
||||
placeholder="Enter URL(s) separated by commas"
|
||||
/>
|
||||
</div>
|
||||
<!-- Add a number set if 5 with a label word threshold -->
|
||||
<div class="flex items-center gap-2 flex-col">
|
||||
<label for="threshold" class="text-white">Min Words Threshold</label>
|
||||
<select id="threshold" class="border border-gray-600 rounded px-4 py-3 bg-gray-800 text-white w-full">
|
||||
<option value="5">5</option>
|
||||
<option value="10" selected>10</option>
|
||||
<option value="15">15</option>
|
||||
<option value="20">20</option>
|
||||
<option value="25">25</option>
|
||||
</select>
|
||||
<div class="grid grid-cols-1 lg:grid-cols-3 gap-4">
|
||||
<div class="space-y-4">
|
||||
<div class="flex flex-col">
|
||||
<label for="url-input" class="text-lime-500 font-bold text-xs">URL(s)</label>
|
||||
<input
|
||||
type="text"
|
||||
id="url-input"
|
||||
value="https://www.nbcnews.com/business"
|
||||
class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-lime-500"
|
||||
placeholder="Enter URL(s) separated by commas"
|
||||
/>
|
||||
</div>
|
||||
<div class="flex flex-col">
|
||||
<label for="threshold" class="text-lime-500 font-bold text-xs">Min Words Threshold</label>
|
||||
<select
|
||||
id="threshold"
|
||||
class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-lime-500"
|
||||
>
|
||||
<option value="5">5</option>
|
||||
<option value="10" selected>10</option>
|
||||
<option value="15">15</option>
|
||||
<option value="20">20</option>
|
||||
<option value="25">25</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="flex flex-col">
|
||||
<label for="css-selector" class="text-lime-500 font-bold text-xs">CSS Selector</label>
|
||||
<input
|
||||
type="text"
|
||||
id="css-selector"
|
||||
class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-lime-500"
|
||||
placeholder="Enter CSS Selector"
|
||||
/>
|
||||
</div>
|
||||
<div class="flex flex-col">
|
||||
<label for="extraction-strategy-select" class="text-lime-500 font-bold text-xs"
|
||||
>Extraction Strategy</label
|
||||
>
|
||||
<select
|
||||
id="extraction-strategy-select"
|
||||
class="border border-zinc-700 rounded px-4 py-1 bg-zinc-900 text-lime-500"
|
||||
>
|
||||
<option value="CosineStrategy">CosineStrategy</option>
|
||||
<option value="LLMExtractionStrategy">LLMExtractionStrategy</option>
|
||||
<option value="NoExtractionStrategy">NoExtractionStrategy</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="flex flex-col">
|
||||
<label for="chunking-strategy-select" class="text-lime-500 font-bold text-xs"
|
||||
>Chunking Strategy</label
|
||||
>
|
||||
<select
|
||||
id="chunking-strategy-select"
|
||||
class="border border-zinc-700 rounded px-4 py-1 bg-zinc-900 text-lime-500"
|
||||
>
|
||||
<option value="RegexChunking">RegexChunking</option>
|
||||
<option value="NlpSentenceChunking">NlpSentenceChunking</option>
|
||||
<option value="TopicSegmentationChunking">TopicSegmentationChunking</option>
|
||||
<option value="FixedLengthWordChunking">FixedLengthWordChunking</option>
|
||||
<option value="SlidingWindowChunking">SlidingWindowChunking</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="flex flex-col">
|
||||
<label for="provider-model-select" class="text-lime-500 font-bold text-xs"
|
||||
>Provider Model</label
|
||||
>
|
||||
<select
|
||||
id="provider-model-select"
|
||||
class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-lime-500"
|
||||
disabled
|
||||
>
|
||||
<option value="groq/llama3-70b-8192">groq/llama3-70b-8192</option>
|
||||
<option value="groq/llama3-8b-8192">groq/llama3-8b-8192</option>
|
||||
<option value="openai/gpt-4-turbo">gpt-4-turbo</option>
|
||||
<option value="openai/gpt-3.5-turbo">gpt-3.5-turbo</option>
|
||||
<option value="anthropic/claude-3-haiku-20240307">claude-3-haiku</option>
|
||||
<option value="anthropic/claude-3-opus-20240229">claude-3-opus</option>
|
||||
<option value="anthropic/claude-3-sonnet-20240229">claude-3-sonnet</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="flex flex-col">
|
||||
<label for="token-input" class="text-lime-500 font-bold text-xs">API Token</label>
|
||||
<input
|
||||
type="password"
|
||||
id="token-input"
|
||||
class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-lime-500"
|
||||
placeholder="Enter Groq API token"
|
||||
disabled
|
||||
/>
|
||||
</div>
|
||||
<div class="flex gap-3">
|
||||
<div class="flex items-center gap-2">
|
||||
<input type="checkbox" id="bypass-cache-checkbox" />
|
||||
<label for="bypass-cache-checkbox" class="text-lime-500 font-bold">Bypass Cache</label>
|
||||
</div>
|
||||
<div class="flex items-center gap-2">
|
||||
<input type="checkbox" id="extract-blocks-checkbox" checked />
|
||||
<label for="extract-blocks-checkbox" class="text-lime-500 font-bold"
|
||||
>Extract Blocks</label
|
||||
>
|
||||
</div>
|
||||
<button id="crawl-btn" class="bg-lime-600 text-black font-bold px-4 py-0 rounded">
|
||||
Crawl
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="flex items-center gap-2 flex-col">
|
||||
<label for="provider-model-select" class="text-white">Provider Model</label>
|
||||
|
||||
<select
|
||||
id="provider-model-select"
|
||||
class="border border-gray-600 rounded px-4 py-3 bg-gray-800 text-white w-full"
|
||||
>
|
||||
<!-- Add your option values here -->
|
||||
<option value="groq/llama3-70b-8192">groq/llama3-70b-8192</option>
|
||||
<option value="groq/llama3-8b-8192">groq/llama3-8b-8192</option>
|
||||
<option value="openai/gpt-4-turbo">gpt-4-turbo</option>
|
||||
<option value="openai/gpt-3.5-turbo">gpt-3.5-turbo</option>
|
||||
<option value="anthropic/claude-3-haiku-20240307">claude-3-haiku</option>
|
||||
<option value="anthropic/claude-3-opus-20240229">claude-3-opus</option>
|
||||
<option value="anthropic/claude-3-sonnet-20240229">claude-3-sonnet</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="flex items-center gap-2 flex-col">
|
||||
<label for="token-input" class="text-white">API Token</label>
|
||||
|
||||
<input
|
||||
type="password"
|
||||
id="token-input"
|
||||
class="border border-gray-600 rounded px-4 py-2 flex-grow bg-gray-800 text-white w-full"
|
||||
placeholder="Enter Groq API token"
|
||||
/>
|
||||
</div>
|
||||
<div class="flex items-center justify-center gap-2 flex-col">
|
||||
<label for="extract-blocks-checkbox" class="text-white">Extract Blocks</label>
|
||||
<input type="checkbox" id="extract-blocks-checkbox" class="mr-2" checked />
|
||||
</div>
|
||||
<button id="crawl-btn" class="bg-blue-600 text-white px-4 py-2 rounded">Crawl</button>
|
||||
</div>
|
||||
<div class="grid grid-cols-1 md:grid-cols-2 gap-8">
|
||||
<div id="loading" class="hidden mt-4">
|
||||
<p>
|
||||
Depends on the selected model, it may take up to 1 or 2 minutes to process the request.
|
||||
Loading...
|
||||
</p>
|
||||
</div>
|
||||
<div id="result" class="tab-container flex-1 h-full flex-col">
|
||||
<div id="result" class=" ">
|
||||
<div id="loading" class="hidden">
|
||||
<p class="text-white">Loading... Please wait.</p>
|
||||
</div>
|
||||
<div class="tab-buttons flex gap-2">
|
||||
<button class="tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="json">JSON</button>
|
||||
<button class="tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="cleaned-html">
|
||||
<button
|
||||
class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
|
||||
data-tab="json"
|
||||
>
|
||||
JSON
|
||||
</button>
|
||||
<button
|
||||
class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
|
||||
data-tab="cleaned-html"
|
||||
>
|
||||
Cleaned HTML
|
||||
</button>
|
||||
<button class="tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="markdown">
|
||||
<button
|
||||
class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
|
||||
data-tab="markdown"
|
||||
>
|
||||
Markdown
|
||||
</button>
|
||||
</div>
|
||||
<div class="tab-content code bg-gray-800 p-2 rounded h-full flex-1 border border-gray-600">
|
||||
<pre class="h-full flex"><code id="json-result" class="language-json "></code></pre>
|
||||
<div class="tab-content code bg-zinc-900 p-2 rounded h-full border border-zinc-700 text-sm">
|
||||
<pre class="h-full flex"><code id="json-result" class="language-json"></code></pre>
|
||||
<pre
|
||||
class="hidden h-full flex"
|
||||
><code id="cleaned-html-result" class="language-html "></code></pre>
|
||||
><code id="cleaned-html-result" class="language-html"></code></pre>
|
||||
<pre
|
||||
class="hidden h-full flex"
|
||||
><code id="markdown-result" class="language-markdown "></code></pre>
|
||||
><code id="markdown-result" class="language-markdown"></code></pre>
|
||||
</div>
|
||||
</div>
|
||||
<div id="code_help" class="tab-container flex-1 h-full">
|
||||
|
||||
<div id="code_help" class=" ">
|
||||
<div class="tab-buttons flex gap-2">
|
||||
<button class="code-tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="curl">cURL</button>
|
||||
<button class="code-tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="python">
|
||||
Python
|
||||
<button
|
||||
class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
|
||||
data-tab="curl"
|
||||
>
|
||||
cURL
|
||||
</button>
|
||||
<button class="code-tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="nodejs">
|
||||
<button
|
||||
class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
|
||||
data-tab="library"
|
||||
>
|
||||
Python Library
|
||||
</button>
|
||||
<button
|
||||
class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
|
||||
data-tab="python"
|
||||
>
|
||||
Python (Request)
|
||||
</button>
|
||||
<button
|
||||
class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
|
||||
data-tab="nodejs"
|
||||
>
|
||||
Node.js
|
||||
</button>
|
||||
</div>
|
||||
<div class="tab-content result bg-gray-800 p-2 rounded h-full flex-1 border border-gray-600">
|
||||
<div class="tab-content result bg-zinc-900 p-2 rounded h-full border border-zinc-700 text-sm">
|
||||
<pre class="h-full flex relative">
|
||||
<code id="curl-code" class="language-bash"></code>
|
||||
<button class="absolute top-2 right-2 bg-gray-700 text-white px-2 py-1 rounded copy-btn" data-target="curl-code">Copy</button>
|
||||
</pre>
|
||||
<code id="curl-code" class="language-bash"></code>
|
||||
<button class="absolute top-2 right-2 bg-zinc-700 text-white px-2 py-1 rounded copy-btn" data-target="curl-code">Copy</button>
|
||||
</pre>
|
||||
<pre class="hidden h-full flex relative">
|
||||
<code id="python-code" class="language-python"></code>
|
||||
<button class="absolute top-2 right-2 bg-gray-700 text-white px-2 py-1 rounded copy-btn" data-target="python-code">Copy</button>
|
||||
</pre>
|
||||
<code id="python-code" class="language-python"></code>
|
||||
<button class="absolute top-2 right-2 bg-zinc-700 text-white px-2 py-1 rounded copy-btn" data-target="python-code">Copy</button>
|
||||
</pre>
|
||||
<pre class="hidden h-full flex relative">
|
||||
<code id="nodejs-code" class="language-javascript"></code>
|
||||
<button class="absolute top-2 right-2 bg-gray-700 text-white px-2 py-1 rounded copy-btn" data-target="nodejs-code">Copy</button>
|
||||
</pre>
|
||||
<code id="nodejs-code" class="language-javascript"></code>
|
||||
<button class="absolute top-2 right-2 bg-zinc-700 text-white px-2 py-1 rounded copy-btn" data-target="nodejs-code">Copy</button>
|
||||
</pre>
|
||||
<pre class="hidden h-full flex relative">
|
||||
<code id="library-code" class="language-python"></code>
|
||||
<button class="absolute top-2 right-2 bg-zinc-700 text-white px-2 py-1 rounded copy-btn" data-target="library-code">Copy</button>
|
||||
</pre>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
<section class="bg-zinc-900 text-zinc-300 p-6 px-20">
|
||||
<h1 class="text-3xl font-bold mb-4">Installation 💻</h1>
|
||||
<p class="mb-4">There are two ways to use Crawl4AI: as a library in your Python projects or as a standalone local server.</p>
|
||||
|
||||
<p class="mb-4">You can also try Crawl4AI in a Google Colab <a href = "https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" style="display: inline-block; width: 100px; height: 20px;"/></a></p>
|
||||
|
||||
<h2 class="text-2xl font-bold mb-2">Using Crawl4AI as a Library 📚</h2>
|
||||
<p class="mb-4">To install Crawl4AI as a library, follow these steps:</p>
|
||||
|
||||
<ol class="list-decimal list-inside mb-4">
|
||||
<li class="mb-2">
|
||||
Install the package from GitHub:
|
||||
<pre class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"><code>pip install git+https://github.com/unclecode/crawl4ai.git</code></pre>
|
||||
</li>
|
||||
<li class="mb-2">
|
||||
Alternatively, you can clone the repository and install the package locally:
|
||||
<pre class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"><code class = "language-python bash">virtualenv venv
|
||||
source venv/bin/activate
|
||||
git clone https://github.com/unclecode/crawl4ai.git
|
||||
cd crawl4ai
|
||||
pip install -e .
|
||||
</code></pre>
|
||||
</li>
|
||||
<li>
|
||||
Import the necessary modules in your Python script:
|
||||
<pre class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"><code class = "language-python hljs">from crawl4ai.web_crawler import WebCrawler
|
||||
from crawl4ai.chunking_strategy import *
|
||||
from crawl4ai.extraction_strategy import *
|
||||
import os
|
||||
|
||||
crawler = WebCrawler()
crawler.warmup() # IMPORTANT: Warmup the engine before running the first crawl

# Single page crawl
result = crawler.run(
url='https://www.nbcnews.com/business',
|
||||
word_count_threshold=5, # Minimum word count for a HTML tag to be considered as a worthy block
|
||||
chunking_strategy= RegexChunking( patterns = ["\\n\\n"]), # Default is RegexChunking
|
||||
extraction_strategy= CosineStrategy(word_count_threshold=20, max_dist=0.2, linkage_method='ward', top_k=3) # Default is CosineStrategy
|
||||
# extraction_strategy= LLMExtractionStrategy(provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY')),
|
||||
bypass_cache=False,
|
||||
extract_blocks =True, # Whether to extract semantical blocks of text from the HTML
|
||||
css_selector = "", # Eg: "div.article-body"
|
||||
verbose=True,
|
||||
include_raw_html=True, # Whether to include the raw HTML content in the response
|
||||
)
|
||||
print(result.model_dump())
|
||||
</code></pre>
|
||||
</li>
|
||||
</ol>
|
||||
<p class="mb-4">For more information about how to run Crawl4AI as a local server, please refer to the <a href="https://github.com/unclecode/crawl4ai" class="text-blue-400">GitHub repository</a>.</p>
|
||||
|
||||
</section>
|
||||
|
||||
<section class="bg-zinc-900 text-zinc-300 p-6 px-20">
|
||||
<h1 class="text-3xl font-bold mb-4">📖 Parameters</h1>
|
||||
<div class="overflow-x-auto">
|
||||
<table class="min-w-full bg-zinc-800 border border-zinc-700">
|
||||
<thead>
|
||||
<tr>
|
||||
<th class="py-2 px-4 border-b border-zinc-700">Parameter</th>
|
||||
<th class="py-2 px-4 border-b border-zinc-700">Description</th>
|
||||
<th class="py-2 px-4 border-b border-zinc-700">Required</th>
|
||||
<th class="py-2 px-4 border-b border-zinc-700">Default Value</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">urls</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">
|
||||
A list of URLs to crawl and extract data from.
|
||||
</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">Yes</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">-</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">include_raw_html</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">
|
||||
Whether to include the raw HTML content in the response.
|
||||
</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">No</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">false</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">bypass_cache</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">
|
||||
Whether to force a fresh crawl even if the URL has been previously crawled.
|
||||
</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">No</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">false</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">extract_blocks</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">
|
||||
Whether to extract semantical blocks of text from the HTML.
|
||||
</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">No</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">true</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">word_count_threshold</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">
|
||||
The minimum number of words a block must contain to be considered meaningful (minimum
|
||||
value is 5).
|
||||
</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">No</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">5</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">extraction_strategy</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">
|
||||
The strategy to use for extracting content from the HTML (e.g., "CosineStrategy").
|
||||
</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">No</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">CosineStrategy</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">chunking_strategy</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">
|
||||
The strategy to use for chunking the text before processing (e.g., "RegexChunking").
|
||||
</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">No</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">RegexChunking</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">css_selector</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">
|
||||
The CSS selector to target specific parts of the HTML for extraction.
|
||||
</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">No</td>
|
||||
<td class="py-2 px-4 border-b border-zinc-700">None</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="py-2 px-4">verbose</td>
|
||||
<td class="py-2 px-4">Whether to enable verbose logging.</td>
|
||||
<td class="py-2 px-4">No</td>
|
||||
<td class="py-2 px-4">true</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<section id="extraction" class="py-8 px-20">
|
||||
<div class="overflow-x-auto mx-auto px-6">
|
||||
<h2 class="text-2xl font-bold mb-4">Extraction Strategies</h2>
|
||||
<div id="extraction-strategies" class="space-y-4"></div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<section id="chunking" class="py-8 px-20">
|
||||
<div class="overflow-x-auto mx-auto px-6">
|
||||
<h2 class="text-2xl font-bold mb-4">Chunking Strategies</h2>
|
||||
<div id="chunking-strategies" class="space-y-4"></div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<section class="hero bg-zinc-900 py-8 px-20">
|
||||
<div class="container mx-auto px-4">
|
||||
<h2 class="text-3xl font-bold mb-4">🤔 Why build this?</h2>
|
||||
<p class="text-lg mb-4">
|
||||
@@ -192,7 +504,7 @@
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<section class="installation py-8">
|
||||
<section class="installation py-8 px-20">
|
||||
<div class="container mx-auto px-4">
|
||||
<h2 class="text-2xl font-bold mb-4">⚙️ Installation</h2>
|
||||
<p class="mb-4">
|
||||
@@ -202,7 +514,7 @@
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<footer class="bg-gray-900 text-white py-4">
|
||||
<footer class="bg-zinc-900 text-white py-4">
|
||||
<div class="container mx-auto px-4">
|
||||
<div class="flex justify-between items-center">
|
||||
<p>© 2024 Crawl4AI. All rights reserved.</p>
|
||||
@@ -219,18 +531,27 @@
|
||||
target="_blank"
|
||||
>🐦 Twitter</a
|
||||
>
|
||||
<!-- <a
|
||||
href="https://discord.gg/your-invite-link"
|
||||
class="text-white hover:text-gray-300 mx-2"
|
||||
target="_blank"
|
||||
>💬 Discord</a
|
||||
> -->
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</footer>
|
||||
|
||||
<script>
|
||||
// JavaScript to manage dynamic form changes and logic
|
||||
document.getElementById("extraction-strategy-select").addEventListener("change", function () {
|
||||
const strategy = this.value;
|
||||
const providerModelSelect = document.getElementById("provider-model-select");
|
||||
const tokenInput = document.getElementById("token-input");
|
||||
|
||||
if (strategy === "LLMExtractionStrategy") {
|
||||
providerModelSelect.disabled = false;
|
||||
tokenInput.disabled = false;
|
||||
} else {
|
||||
providerModelSelect.disabled = true;
|
||||
tokenInput.disabled = true;
|
||||
}
|
||||
});
|
||||
|
||||
// Get the selected provider model and token from local storage
|
||||
const storedProviderModel = localStorage.getItem("provider_model");
|
||||
const storedToken = localStorage.getItem(storedProviderModel);
|
||||
@@ -274,6 +595,7 @@
|
||||
const selectedProviderModel = document.getElementById("provider-model-select").value;
|
||||
const apiToken = document.getElementById("token-input").value;
|
||||
const extractBlocks = document.getElementById("extract-blocks-checkbox").checked;
|
||||
const bypassCache = document.getElementById("bypass-cache-checkbox").checked;
|
||||
|
||||
// Save the selected provider model and token to local storage
|
||||
localStorage.setItem("provider_model", selectedProviderModel);
|
||||
@@ -286,17 +608,21 @@
|
||||
provider_model: selectedProviderModel,
|
||||
api_token: apiToken,
|
||||
include_raw_html: true,
|
||||
forced: false,
|
||||
bypass_cache: bypassCache,
|
||||
extract_blocks: extractBlocks,
|
||||
word_count_threshold: parseInt(document.getElementById("threshold").value),
|
||||
extraction_strategy: document.getElementById("extraction-strategy-select").value,
|
||||
chunking_strategy: document.getElementById("chunking-strategy-select").value,
|
||||
css_selector: document.getElementById("css-selector").value,
|
||||
verbose: true,
|
||||
};
|
||||
|
||||
// save api token to local storage
|
||||
localStorage.setItem("api_token", document.getElementById("token-input").value);
|
||||
|
||||
document.getElementById("loading").classList.remove("hidden");
|
||||
document.getElementById("result").classList.add("hidden");
|
||||
document.getElementById("code_help").classList.add("hidden");
|
||||
//document.getElementById("result").classList.add("hidden");
|
||||
//document.getElementById("code_help").classList.add("hidden");
|
||||
|
||||
axios
|
||||
.post("/crawl", data)
|
||||
@@ -308,29 +634,44 @@
|
||||
document.getElementById("markdown-result").textContent = result.markdown;
|
||||
|
||||
// Update code examples dynamically
|
||||
// Update code examples dynamically
|
||||
const extractionStrategy = data.extraction_strategy;
|
||||
const isLLMExtraction = extractionStrategy === "LLMExtractionStrategy";
|
||||
|
||||
document.getElementById(
|
||||
"curl-code"
|
||||
).textContent = `curl -X POST -H "Content-Type: application/json" -d '${JSON.stringify({
|
||||
...data,
|
||||
api_token: "your_api_token",
|
||||
})}' http://localhost:8000/crawl`;
|
||||
api_token: isLLMExtraction ? "your_api_token" : undefined,
|
||||
})}' http://crawl4ai.uccode.io/crawl`;
|
||||
|
||||
document.getElementById(
|
||||
"python-code"
|
||||
).textContent = `import requests\n\ndata = ${JSON.stringify(
|
||||
{ ...data, api_token: "your_api_token" },
|
||||
{ ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
|
||||
null,
|
||||
2
|
||||
)}\n\nresponse = requests.post("http://localhost:8000/crawl", json=data)\nprint(response.json())`;
|
||||
)}\n\nresponse = requests.post("http://crawl4ai.uccode.io/crawl", json=data) # OR localhost if you run locally \nprint(response.json())`;
|
||||
|
||||
document.getElementById(
|
||||
"nodejs-code"
|
||||
).textContent = `const axios = require('axios');\n\nconst data = ${JSON.stringify(
|
||||
{ ...data, api_token: "your_api_token" },
|
||||
{ ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
|
||||
null,
|
||||
2
|
||||
)};\n\naxios.post("http://localhost:8000/crawl", data)\n .then(response => console.log(response.data))\n .catch(error => console.error(error));`;
|
||||
)};\n\naxios.post("http://crawl4ai.uccode.io/crawl", data) // OR local host if your run locally \n .then(response => console.log(response.data))\n .catch(error => console.error(error));`;
|
||||
|
||||
document.getElementById(
|
||||
"library-code"
|
||||
).textContent = `from crawl4ai.web_crawler import WebCrawler\nfrom crawl4ai.extraction_strategy import *\nfrom crawl4ai.chunking_strategy import *\n\ncrawler = WebCrawler()\ncrawler.warmup()\n\nresult = crawler.run(\n url='${
|
||||
urls[0]
|
||||
}',\n word_count_threshold=${data.word_count_threshold},\n extraction_strategy=${
|
||||
isLLMExtraction
|
||||
? `${extractionStrategy}(provider="${data.provider_model}", api_token="${data.api_token}")`
|
||||
: extractionStrategy + "()"
|
||||
},\n chunking_strategy=${data.chunking_strategy}(),\n bypass_cache=${
|
||||
data.bypass_cache
|
||||
},\n css_selector="${data.css_selector}"\n)\nprint(result)`;
|
||||
|
||||
// Highlight code syntax
|
||||
hljs.highlightAll();
|
||||
|
||||
@@ -357,8 +698,8 @@
|
||||
const tab = btn.dataset.tab;
|
||||
document
|
||||
.querySelectorAll(".tab-btn")
|
||||
.forEach((b) => b.classList.remove("bg-blue-600", "text-white"));
|
||||
btn.classList.add("bg-blue-600", "text-white");
|
||||
.forEach((b) => b.classList.remove("bg-lime-700", "text-white"));
|
||||
btn.classList.add("bg-lime-700", "text-white");
|
||||
document.querySelectorAll(".tab-content.code pre").forEach((el) => el.classList.add("hidden"));
|
||||
document.getElementById(`${tab}-result`).parentElement.classList.remove("hidden");
|
||||
});
|
||||
@@ -370,19 +711,58 @@
|
||||
const tab = btn.dataset.tab;
|
||||
document
|
||||
.querySelectorAll(".code-tab-btn")
|
||||
.forEach((b) => b.classList.remove("bg-blue-600", "text-white"));
|
||||
btn.classList.add("bg-blue-600", "text-white");
|
||||
.forEach((b) => b.classList.remove("bg-lime-700", "text-white"));
|
||||
btn.classList.add("bg-lime-700", "text-white");
|
||||
document.querySelectorAll(".tab-content.result pre").forEach((el) => el.classList.add("hidden"));
|
||||
document.getElementById(`${tab}-code`).parentElement.classList.remove("hidden");
|
||||
});
|
||||
});
|
||||
|
||||
// Handle copy to clipboard button clicks
|
||||
|
||||
async function copyToClipboard(text) {
|
||||
if (navigator.clipboard && navigator.clipboard.writeText) {
|
||||
return navigator.clipboard.writeText(text);
|
||||
} else {
|
||||
return fallbackCopyTextToClipboard(text);
|
||||
}
|
||||
}
|
||||
|
||||
function fallbackCopyTextToClipboard(text) {
|
||||
return new Promise((resolve, reject) => {
|
||||
const textArea = document.createElement("textarea");
|
||||
textArea.value = text;
|
||||
|
||||
// Avoid scrolling to bottom
|
||||
textArea.style.top = "0";
|
||||
textArea.style.left = "0";
|
||||
textArea.style.position = "fixed";
|
||||
|
||||
document.body.appendChild(textArea);
|
||||
textArea.focus();
|
||||
textArea.select();
|
||||
|
||||
try {
|
||||
const successful = document.execCommand("copy");
|
||||
if (successful) {
|
||||
resolve();
|
||||
} else {
|
||||
reject();
|
||||
}
|
||||
} catch (err) {
|
||||
reject(err);
|
||||
}
|
||||
|
||||
document.body.removeChild(textArea);
|
||||
});
|
||||
}
|
||||
|
||||
document.querySelectorAll(".copy-btn").forEach((btn) => {
|
||||
btn.addEventListener("click", () => {
|
||||
const target = btn.dataset.target;
|
||||
const code = document.getElementById(target).textContent;
|
||||
navigator.clipboard.writeText(code).then(() => {
|
||||
//navigator.clipboard.writeText(code).then(() => {
|
||||
copyToClipboard(code).then(() => {
|
||||
btn.textContent = "Copied!";
|
||||
setTimeout(() => {
|
||||
btn.textContent = "Copy";
|
||||
@@ -390,6 +770,42 @@
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
document.addEventListener("DOMContentLoaded", async () => {
|
||||
try {
|
||||
const extractionResponse = await fetch("/strategies/extraction");
|
||||
const extractionStrategies = await extractionResponse.json();
|
||||
|
||||
const chunkingResponse = await fetch("/strategies/chunking");
|
||||
const chunkingStrategies = await chunkingResponse.json();
|
||||
|
||||
renderStrategies("extraction-strategies", extractionStrategies);
|
||||
renderStrategies("chunking-strategies", chunkingStrategies);
|
||||
} catch (error) {
|
||||
console.error("Error fetching strategies:", error);
|
||||
}
|
||||
});
|
||||
|
||||
function renderStrategies(containerId, strategies) {
|
||||
const container = document.getElementById(containerId);
|
||||
container.innerHTML = ""; // Clear any existing content
|
||||
strategies = JSON.parse(strategies);
|
||||
Object.entries(strategies).forEach(([strategy, description]) => {
|
||||
const strategyElement = document.createElement("div");
|
||||
strategyElement.classList.add("bg-zinc-800", "p-4", "rounded", "shadow-md", "docs-item");
|
||||
|
||||
const strategyDescription = document.createElement("div");
|
||||
strategyDescription.classList.add("text-gray-300", "prose", "prose-sm");
|
||||
strategyDescription.innerHTML = marked.parse(description);
|
||||
|
||||
strategyElement.appendChild(strategyDescription);
|
||||
|
||||
container.appendChild(strategyElement);
|
||||
});
|
||||
}
|
||||
|
||||
// Highlight code syntax
|
||||
hljs.highlightAll();
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
111
tests/test_web_crawler.py
Normal file
111
tests/test_web_crawler.py
Normal file
@@ -0,0 +1,111 @@
|
||||
import unittest, os
|
||||
from crawl4ai.web_crawler import WebCrawler
|
||||
from crawl4ai.chunking_strategy import RegexChunking, FixedLengthWordChunking, SlidingWindowChunking
|
||||
from crawl4ai.extraction_strategy import CosineStrategy, LLMExtractionStrategy, TopicExtractionStrategy, NoExtractionStrategy
|
||||
|
||||
class TestWebCrawler(unittest.TestCase):
|
||||
|
||||
def setUp(self):
|
||||
self.crawler = WebCrawler()
|
||||
|
||||
def test_warmup(self):
|
||||
self.crawler.warmup()
|
||||
self.assertTrue(self.crawler.ready, "WebCrawler failed to warm up")
|
||||
|
||||
def test_run_default_strategies(self):
|
||||
result = self.crawler.run(
|
||||
url='https://www.nbcnews.com/business',
|
||||
word_count_threshold=5,
|
||||
chunking_strategy=RegexChunking(),
|
||||
extraction_strategy=CosineStrategy(), bypass_cache=True
|
||||
)
|
||||
self.assertTrue(result.success, "Failed to crawl and extract using default strategies")
|
||||
|
||||
def test_run_different_strategies(self):
|
||||
url = 'https://www.nbcnews.com/business'
|
||||
|
||||
# Test with FixedLengthWordChunking and LLMExtractionStrategy
|
||||
result = self.crawler.run(
|
||||
url=url,
|
||||
word_count_threshold=5,
|
||||
chunking_strategy=FixedLengthWordChunking(chunk_size=100),
|
||||
extraction_strategy=LLMExtractionStrategy(provider="openai/gpt-3.5-turbo", api_token=os.getenv('OPENAI_API_KEY')), bypass_cache=True
|
||||
)
|
||||
self.assertTrue(result.success, "Failed to crawl and extract with FixedLengthWordChunking and LLMExtractionStrategy")
|
||||
|
||||
# Test with SlidingWindowChunking and TopicExtractionStrategy
|
||||
result = self.crawler.run(
|
||||
url=url,
|
||||
word_count_threshold=5,
|
||||
chunking_strategy=SlidingWindowChunking(window_size=100, step=50),
|
||||
extraction_strategy=TopicExtractionStrategy(num_keywords=5), bypass_cache=True
|
||||
)
|
||||
self.assertTrue(result.success, "Failed to crawl and extract with SlidingWindowChunking and TopicExtractionStrategy")
|
||||
|
||||
def test_invalid_url(self):
|
||||
with self.assertRaises(Exception) as context:
|
||||
self.crawler.run(url='invalid_url', bypass_cache=True)
|
||||
self.assertIn("Invalid URL", str(context.exception))
|
||||
|
||||
def test_unsupported_extraction_strategy(self):
|
||||
with self.assertRaises(Exception) as context:
|
||||
self.crawler.run(url='https://www.nbcnews.com/business', extraction_strategy="UnsupportedStrategy", bypass_cache=True)
|
||||
self.assertIn("Unsupported extraction strategy", str(context.exception))
|
||||
|
||||
def test_invalid_css_selector(self):
|
||||
with self.assertRaises(ValueError) as context:
|
||||
self.crawler.run(url='https://www.nbcnews.com/business', css_selector="invalid_selector", bypass_cache=True)
|
||||
self.assertIn("Invalid CSS selector", str(context.exception))
|
||||
|
||||
|
||||
def test_crawl_with_cache_and_bypass_cache(self):
|
||||
url = 'https://www.nbcnews.com/business'
|
||||
|
||||
# First crawl with cache enabled
|
||||
result = self.crawler.run(url=url, bypass_cache=False)
|
||||
self.assertTrue(result.success, "Failed to crawl and cache the result")
|
||||
|
||||
# Second crawl with bypass_cache=True
|
||||
result = self.crawler.run(url=url, bypass_cache=True)
|
||||
self.assertTrue(result.success, "Failed to bypass cache and fetch fresh data")
|
||||
|
||||
def test_fetch_multiple_pages(self):
|
||||
urls = [
|
||||
'https://www.nbcnews.com/business',
|
||||
'https://www.bbc.com/news'
|
||||
]
|
||||
results = []
|
||||
for url in urls:
|
||||
result = self.crawler.run(
|
||||
url=url,
|
||||
word_count_threshold=5,
|
||||
chunking_strategy=RegexChunking(),
|
||||
extraction_strategy=CosineStrategy(),
|
||||
bypass_cache=True
|
||||
)
|
||||
results.append(result)
|
||||
|
||||
self.assertEqual(len(results), 2, "Failed to crawl and extract multiple pages")
|
||||
for result in results:
|
||||
self.assertTrue(result.success, "Failed to crawl and extract a page in the list")
|
||||
|
||||
def test_run_fixed_length_word_chunking_and_no_extraction(self):
|
||||
result = self.crawler.run(
|
||||
url='https://www.nbcnews.com/business',
|
||||
word_count_threshold=5,
|
||||
chunking_strategy=FixedLengthWordChunking(chunk_size=100),
|
||||
extraction_strategy=NoExtractionStrategy(), bypass_cache=True
|
||||
)
|
||||
self.assertTrue(result.success, "Failed to crawl and extract with FixedLengthWordChunking and NoExtractionStrategy")
|
||||
|
||||
def test_run_sliding_window_and_no_extraction(self):
|
||||
result = self.crawler.run(
|
||||
url='https://www.nbcnews.com/business',
|
||||
word_count_threshold=5,
|
||||
chunking_strategy=SlidingWindowChunking(window_size=100, step=50),
|
||||
extraction_strategy=NoExtractionStrategy(), bypass_cache=True
|
||||
)
|
||||
self.assertTrue(result.success, "Failed to crawl and extract with SlidingWindowChunking and NoExtractionStrategy")
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
Reference in New Issue
Block a user