Update:
- Fix spaCy model issue - Update README and requirements.txt
This commit is contained in:
@@ -117,7 +117,10 @@ To install Crawl4AI as a library, follow these steps:
|
|||||||
|
|
||||||
1. Install the package from GitHub:
|
1. Install the package from GitHub:
|
||||||
```bash
|
```bash
|
||||||
|
virtualenv venv
|
||||||
|
source venv/bin/activate
|
||||||
pip install git+https://github.com/unclecode/crawl4ai.git
|
pip install git+https://github.com/unclecode/crawl4ai.git
|
||||||
|
python docs/examples/quickstart.py
|
||||||
```
|
```
|
||||||
|
|
||||||
2. Alternatively, you can clone the repository and install the package locally:
|
2. Alternatively, you can clone the repository and install the package locally:
|
||||||
@@ -192,6 +195,8 @@ For more information about the available parameters and their descriptions, refe
|
|||||||
|
|
||||||
## Python Library Usage 🚀
|
## Python Library Usage 🚀
|
||||||
|
|
||||||
|
🔥 A great way to try out Crawl4AI is to run `quickstart.py` in the `docs/examples` directory. This script demonstrates how to use Crawl4AI to crawl a website and extract content from it.
|
||||||
|
|
||||||
### Quickstart Guide
|
### Quickstart Guide
|
||||||
|
|
||||||
Create an instance of WebCrawler and call the `warmup()` function.
|
Create an instance of WebCrawler and call the `warmup()` function.
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ import re
|
|||||||
# from nltk.tokenize import word_tokenize, TextTilingTokenizer
|
# from nltk.tokenize import word_tokenize, TextTilingTokenizer
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
import string
|
import string
|
||||||
|
from .model_loader import load_spacy_en_core_web_sm
|
||||||
|
|
||||||
# Define the abstract base class for chunking strategies
|
# Define the abstract base class for chunking strategies
|
||||||
class ChunkingStrategy(ABC):
|
class ChunkingStrategy(ABC):
|
||||||
@@ -37,13 +38,7 @@ class RegexChunking(ChunkingStrategy):
|
|||||||
|
|
||||||
class NlpSentenceChunking(ChunkingStrategy):
|
class NlpSentenceChunking(ChunkingStrategy):
|
||||||
def __init__(self, model='en_core_web_sm'):
    """Create a sentence chunker backed by a spaCy pipeline.

    Args:
        model: Name of the spaCy model to use. The default,
            ``'en_core_web_sm'``, goes through the shared
            ``load_spacy_en_core_web_sm()`` loader, which downloads the
            model on first use. Any other name is handed to
            ``spacy.load`` directly.
    """
    if model == 'en_core_web_sm':
        self.nlp = load_spacy_en_core_web_sm()
    else:
        # Honour an explicitly requested model instead of silently
        # substituting the default one.
        import spacy
        self.nlp = spacy.load(model)
|
|
||||||
def chunk(self, text: str) -> list:
|
def chunk(self, text: str) -> list:
|
||||||
doc = self.nlp(text)
|
doc = self.nlp(text)
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ load_dotenv() # Load environment variables from .env file
|
|||||||
|
|
||||||
# Default provider, ONLY used when the extraction strategy is LLMExtractionStrategy
|
# Default provider, ONLY used when the extraction strategy is LLMExtractionStrategy
|
||||||
DEFAULT_PROVIDER = "openai/gpt-4-turbo"
|
DEFAULT_PROVIDER = "openai/gpt-4-turbo"
|
||||||
|
MODEL_REPO_BRANCH = "new-release-0.0.2"
|
||||||
# Provider-model dictionary, ONLY used when the extraction strategy is LLMExtractionStrategy
|
# Provider-model dictionary, ONLY used when the extraction strategy is LLMExtractionStrategy
|
||||||
PROVIDER_MODELS = {
|
PROVIDER_MODELS = {
|
||||||
"ollama/llama3": "no-token-needed", # Any model from Ollama no need for API token
|
"ollama/llama3": "no-token-needed", # Any model from Ollama no need for API token
|
||||||
|
|||||||
@@ -1,20 +1,86 @@
|
|||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
|
from .utils import get_home_folder
|
||||||
import spacy
|
from pathlib import Path
|
||||||
|
import subprocess, os
|
||||||
|
import shutil
|
||||||
|
from .config import MODEL_REPO_BRANCH
|
||||||
|
|
||||||
@lru_cache()
def load_bert_base_uncased():
    """Load the ``bert-base-uncased`` tokenizer and model.

    ``transformers`` is imported inside the function, so importing this
    module does not import it. The result is memoised by ``lru_cache``,
    so repeated calls return the same pair.

    Returns:
        tuple: ``(tokenizer, model)`` as returned by ``from_pretrained``.
    """
    # Only the two classes actually used here are imported.
    from transformers import BertModel, BertTokenizer

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', resume_download=None)
    model = BertModel.from_pretrained('bert-base-uncased', resume_download=None)
    return tokenizer, model
|
||||||
|
|
||||||
@lru_cache()
def load_bge_small_en_v1_5():
    """Load the ``BAAI/bge-small-en-v1.5`` tokenizer and model.

    ``transformers`` is imported inside the function, so importing this
    module does not import it. The model is switched to evaluation mode
    with ``model.eval()`` before being returned, and the result is
    memoised by ``lru_cache``.

    Returns:
        tuple: ``(tokenizer, model)`` as returned by ``from_pretrained``.
    """
    # Only the two classes actually used here are imported.
    from transformers import AutoModel, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-small-en-v1.5', resume_download=None)
    model = AutoModel.from_pretrained('BAAI/bge-small-en-v1.5', resume_download=None)
    model.eval()
    return tokenizer, model
|
||||||
|
|
||||||
|
@lru_cache()
def load_spacy_en_core_web_sm():
    """Return the ``en_core_web_sm`` spaCy pipeline, downloading it if needed.

    A first ``spacy.load`` is attempted; when it raises ``IOError`` the
    model is fetched with ``spacy.cli.download`` and loaded again.
    Progress is reported with ``print``. The result is memoised by
    ``lru_cache``.
    """
    import spacy

    model_name = "en_core_web_sm"
    try:
        print("[LOG] Loading spaCy model")
        pipeline = spacy.load(model_name)
    except IOError:
        # Model package is not installed yet: fetch it, then retry the load.
        print("[LOG] ⏬ Downloading spaCy model for the first time")
        spacy.cli.download(model_name)
        pipeline = spacy.load(model_name)

    print("[LOG] ✅ spaCy model loaded successfully")
    return pipeline
|
||||||
|
|
||||||
|
|
||||||
@lru_cache()
def load_spacy_model():
    """Load the "reuters" spaCy model, fetching it on first use.

    The model lives in ``<home>/models/reuters`` where ``<home>`` is the
    folder returned by ``get_home_folder()``. When that directory is
    missing or empty, the project repository is cloned (branch
    ``MODEL_REPO_BRANCH``) into ``<home>/crawl4ai``, its
    ``models/reuters`` folder is copied into place, and the clone is
    removed again.

    Download errors are reported with ``print`` rather than raised; the
    final ``spacy.load`` then raises if no usable model is on disk.

    Returns:
        The object returned by ``spacy.load`` for the model folder.
    """
    import spacy

    name = "models/reuters"
    home_folder = get_home_folder()
    model_folder = os.path.join(home_folder, name)

    # Download only when there is no populated model directory yet.
    model_path = Path(model_folder)
    if not (model_path.exists() and any(model_path.iterdir())):
        repo_url = "https://github.com/unclecode/crawl4ai.git"
        branch = MODEL_REPO_BRANCH
        repo_folder = os.path.join(home_folder, "crawl4ai")

        print("[LOG] ⏬ Downloading model for the first time...")

        # Start from a clean slate. ignore_errors makes this safe when either
        # folder does not exist (a leftover clone, or an empty model folder
        # that would make copytree fail).
        shutil.rmtree(repo_folder, ignore_errors=True)
        shutil.rmtree(model_folder, ignore_errors=True)

        downloaded = False
        try:
            # Shallow clone: only the current tree of the branch is needed.
            subprocess.run(
                ["git", "clone", "--depth", "1", "-b", branch, repo_url, repo_folder],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=True
            )

            # Create the models directory if it doesn't exist
            models_folder = os.path.join(home_folder, "models")
            os.makedirs(models_folder, exist_ok=True)

            # Copy the reuters model folder to the models directory
            source_folder = os.path.join(repo_folder, "models/reuters")
            shutil.copytree(source_folder, model_folder)
            downloaded = True

            print("[LOG] ✅ Model downloaded successfully")
        except subprocess.CalledProcessError as e:
            print(f"An error occurred while cloning the repository: {e}")
        except Exception as e:
            print(f"An error occurred: {e}")
        finally:
            # Always remove the temporary clone, even after a failure.
            shutil.rmtree(repo_folder, ignore_errors=True)
            if not downloaded:
                # Do not leave a partial copy that a later call would
                # mistake for a complete model.
                shutil.rmtree(model_folder, ignore_errors=True)

    return spacy.load(model_folder)
|
||||||
@@ -9,10 +9,19 @@ import os
|
|||||||
from html2text import HTML2Text
|
from html2text import HTML2Text
|
||||||
from .prompts import PROMPT_EXTRACT_BLOCKS
|
from .prompts import PROMPT_EXTRACT_BLOCKS
|
||||||
from .config import *
|
from .config import *
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
class InvalidCSSSelectorError(Exception):
    """Exception type for CSS selector problems.

    NOTE(review): presumably raised when a caller-supplied CSS selector
    is invalid; the raise sites are not visible here, so confirm.
    """
|
||||||
|
|
||||||
|
|
||||||
|
def get_home_folder():
    """Return the per-user Crawl4AI data folder, creating it if needed.

    The folder is ``.crawl4ai`` inside ``Path.home()``. Its ``cache`` and
    ``models`` subfolders are created as well, so callers can rely on
    all three existing.

    Returns:
        str: Path of the ``.crawl4ai`` folder.
    """
    home_folder = os.path.join(Path.home(), ".crawl4ai")
    # makedirs creates missing parents, so creating the subfolders also
    # creates home_folder itself.
    for subfolder in ("cache", "models"):
        os.makedirs(os.path.join(home_folder, subfolder), exist_ok=True)
    return home_folder
|
||||||
|
|
||||||
def beautify_html(escaped_html):
|
def beautify_html(escaped_html):
|
||||||
"""
|
"""
|
||||||
Beautifies an escaped HTML string.
|
Beautifies an escaped HTML string.
|
||||||
|
|||||||
@@ -34,13 +34,16 @@ class WebCrawler:
|
|||||||
# if not db_path:
|
# if not db_path:
|
||||||
# self.db_path = f"{self.crawl4ai_folder}/crawl4ai.db"
|
# self.db_path = f"{self.crawl4ai_folder}/crawl4ai.db"
|
||||||
|
|
||||||
flush_db()
|
# flush_db()
|
||||||
init_db()
|
init_db()
|
||||||
|
|
||||||
self.ready = False
|
self.ready = False
|
||||||
|
|
||||||
def warmup(self):
|
def warmup(self):
|
||||||
print("[LOG] 🌤️ Warming up the WebCrawler")
|
|
||||||
|
|
||||||
|
|
||||||
|
print("[LOG] 🌤️ Warming up the WebCrawler")
|
||||||
result = self.run(
|
result = self.run(
|
||||||
url='https://crawl4ai.uccode.io/',
|
url='https://crawl4ai.uccode.io/',
|
||||||
word_count_threshold=5,
|
word_count_threshold=5,
|
||||||
|
|||||||
@@ -1,18 +1,26 @@
|
|||||||
import os, time
|
import os
|
||||||
|
import time
|
||||||
from crawl4ai.web_crawler import WebCrawler
|
from crawl4ai.web_crawler import WebCrawler
|
||||||
from crawl4ai.chunking_strategy import *
|
from crawl4ai.chunking_strategy import *
|
||||||
from crawl4ai.extraction_strategy import *
|
from crawl4ai.extraction_strategy import *
|
||||||
from crawl4ai.crawler_strategy import *
|
from crawl4ai.crawler_strategy import *
|
||||||
from rich import print
|
from rich import print
|
||||||
from rich.console import Console
|
from rich.console import Console
|
||||||
|
from functools import lru_cache
|
||||||
|
|
||||||
console = Console()
|
console = Console()
|
||||||
|
|
||||||
|
@lru_cache()
def create_crawler():
    """Build a ``WebCrawler``, call its ``warmup()``, and return it.

    The function is wrapped in ``lru_cache``, so later calls return the
    same already-warmed instance instead of constructing a new one.
    """
    instance = WebCrawler()
    instance.warmup()
    return instance
|
||||||
|
|
||||||
def print_result(result):
    """Print a short preview of every non-empty string field of a result.

    Each such field is printed on its own line as ``key: <first 20
    characters>...``. Non-string and empty values are skipped.

    Args:
        result: Object exposing ``model_dump()`` that returns a mapping of
            field names to values (presumably a pydantic model; confirm).
    """
    # Imported locally so the helper is in scope whatever the module imports.
    from rich.markup import escape

    console.print("\t[bold]Result:[/bold]")
    for key, value in result.model_dump().items():
        if isinstance(value, str) and value:
            # Crawled text can contain square brackets; escape them so Rich
            # prints them literally instead of parsing them as markup tags.
            preview = escape(value[:20])
            console.print(f"\t{key}: [green]{preview}...[/green]")
|
||||||
|
|
||||||
def cprint(message, press_any_key=False):
|
def cprint(message, press_any_key=False):
|
||||||
@@ -21,26 +29,17 @@ def cprint(message, press_any_key=False):
|
|||||||
console.print("Press any key to continue...", style="")
|
console.print("Press any key to continue...", style="")
|
||||||
input()
|
input()
|
||||||
|
|
||||||
def main():
|
def basic_usage(crawler):
    """Run one crawl passing only a URL and print a preview of the result.

    Args:
        crawler: Object whose ``run(url=...)`` performs the crawl.
    """
    cprint("🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]")
    target_url = "https://www.nbcnews.com/business"
    outcome = crawler.run(url=target_url)
    cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
    print_result(outcome)
|
||||||
|
|
||||||
# Explanation of bypass_cache and include_raw_html
|
def understanding_parameters(crawler):
|
||||||
cprint("\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]")
|
cprint("\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]")
|
||||||
cprint("By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action. Becuase we already crawled this URL, the result will be fetched from the cache. Let's try it out!")
|
cprint("By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action.")
|
||||||
|
|
||||||
# Reads from cache
|
# First crawl (reads from cache)
|
||||||
cprint("1️⃣ First crawl (caches the result):", True)
|
cprint("1️⃣ First crawl (caches the result):", True)
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
result = crawler.run(url="https://www.nbcnews.com/business")
|
result = crawler.run(url="https://www.nbcnews.com/business")
|
||||||
@@ -57,16 +56,12 @@ def main():
|
|||||||
print_result(result)
|
print_result(result)
|
||||||
|
|
||||||
# Retrieve raw HTML content
|
# Retrieve raw HTML content
|
||||||
cprint("\n🔄 [bold cyan]By default 'include_raw_html' is set to True, which includes the raw HTML content in the response.[/bold cyan]", True)
|
cprint("\n🔄 [bold cyan]'include_raw_html' parameter example:[/bold cyan]", True)
|
||||||
result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)
|
result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)
|
||||||
cprint("[LOG] 📦 [bold yellow]Craw result (without raw HTML content):[/bold yellow]")
|
cprint("[LOG] 📦 [bold yellow]Crawl result (without raw HTML content):[/bold yellow]")
|
||||||
print_result(result)
|
print_result(result)
|
||||||
|
|
||||||
cprint("\n📄 The 'include_raw_html' parameter, when set to True, includes the raw HTML content in the response. By default is set to True. Let's move on to exploring different chunking strategies now!")
|
def add_chunking_strategy(crawler):
|
||||||
|
|
||||||
cprint("For the rest of this guide, I set crawler.always_by_pass_cache to True to force the crawler to bypass the cache. This is to ensure that we get fresh results for each run.", True)
|
|
||||||
crawler.always_by_pass_cache = True
|
|
||||||
|
|
||||||
# Adding a chunking strategy: RegexChunking
|
# Adding a chunking strategy: RegexChunking
|
||||||
cprint("\n🧩 [bold cyan]Let's add a chunking strategy: RegexChunking![/bold cyan]", True)
|
cprint("\n🧩 [bold cyan]Let's add a chunking strategy: RegexChunking![/bold cyan]", True)
|
||||||
cprint("RegexChunking is a simple chunking strategy that splits the text based on a given regex pattern. Let's see it in action!")
|
cprint("RegexChunking is a simple chunking strategy that splits the text based on a given regex pattern. Let's see it in action!")
|
||||||
@@ -86,9 +81,8 @@ def main():
|
|||||||
)
|
)
|
||||||
cprint("[LOG] 📦 [bold yellow]NlpSentenceChunking result:[/bold yellow]")
|
cprint("[LOG] 📦 [bold yellow]NlpSentenceChunking result:[/bold yellow]")
|
||||||
print_result(result)
|
print_result(result)
|
||||||
|
|
||||||
cprint("There are more chunking strategies to explore, make sure to check document, but let's move on to extraction strategies now!")
|
|
||||||
|
|
||||||
|
def add_extraction_strategy(crawler):
|
||||||
# Adding an extraction strategy: CosineStrategy
|
# Adding an extraction strategy: CosineStrategy
|
||||||
cprint("\n🧠 [bold cyan]Let's get smarter with an extraction strategy: CosineStrategy![/bold cyan]", True)
|
cprint("\n🧠 [bold cyan]Let's get smarter with an extraction strategy: CosineStrategy![/bold cyan]", True)
|
||||||
cprint("CosineStrategy uses cosine similarity to extract semantically similar blocks of text. Let's see it in action!")
|
cprint("CosineStrategy uses cosine similarity to extract semantically similar blocks of text. Let's see it in action!")
|
||||||
@@ -99,6 +93,7 @@ def main():
|
|||||||
cprint("[LOG] 📦 [bold yellow]CosineStrategy result:[/bold yellow]")
|
cprint("[LOG] 📦 [bold yellow]CosineStrategy result:[/bold yellow]")
|
||||||
print_result(result)
|
print_result(result)
|
||||||
|
|
||||||
|
# Using semantic_filter with CosineStrategy
|
||||||
cprint("You can pass other parameters like 'semantic_filter' to the CosineStrategy to extract semantically similar blocks of text. Let's see it in action!")
|
cprint("You can pass other parameters like 'semantic_filter' to the CosineStrategy to extract semantically similar blocks of text. Let's see it in action!")
|
||||||
result = crawler.run(
|
result = crawler.run(
|
||||||
url="https://www.nbcnews.com/business",
|
url="https://www.nbcnews.com/business",
|
||||||
@@ -106,10 +101,10 @@ def main():
|
|||||||
semantic_filter="inflation rent prices",
|
semantic_filter="inflation rent prices",
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
cprint("[LOG] 📦 [bold yellow]CosineStrategy result with semantic filter:[/bold yellow]")
|
cprint("[LOG] 📦 [bold yellow]CosineStrategy result with semantic filter:[/bold yellow]")
|
||||||
print_result(result)
|
print_result(result)
|
||||||
|
|
||||||
|
def add_llm_extraction_strategy(crawler):
|
||||||
# Adding an LLM extraction strategy without instructions
|
# Adding an LLM extraction strategy without instructions
|
||||||
cprint("\n🤖 [bold cyan]Time to bring in the big guns: LLMExtractionStrategy without instructions![/bold cyan]", True)
|
cprint("\n🤖 [bold cyan]Time to bring in the big guns: LLMExtractionStrategy without instructions![/bold cyan]", True)
|
||||||
cprint("LLMExtractionStrategy uses a large language model to extract relevant information from the web page. Let's see it in action!")
|
cprint("LLMExtractionStrategy uses a large language model to extract relevant information from the web page. Let's see it in action!")
|
||||||
@@ -120,8 +115,6 @@ def main():
|
|||||||
cprint("[LOG] 📦 [bold yellow]LLMExtractionStrategy (no instructions) result:[/bold yellow]")
|
cprint("[LOG] 📦 [bold yellow]LLMExtractionStrategy (no instructions) result:[/bold yellow]")
|
||||||
print_result(result)
|
print_result(result)
|
||||||
|
|
||||||
cprint("You can pass other providers like 'groq/llama3-70b-8192' or 'ollama/llama3' to the LLMExtractionStrategy.")
|
|
||||||
|
|
||||||
# Adding an LLM extraction strategy with instructions
|
# Adding an LLM extraction strategy with instructions
|
||||||
cprint("\n📜 [bold cyan]Let's make it even more interesting: LLMExtractionStrategy with instructions![/bold cyan]", True)
|
cprint("\n📜 [bold cyan]Let's make it even more interesting: LLMExtractionStrategy with instructions![/bold cyan]", True)
|
||||||
cprint("Let's say we are only interested in financial news. Let's see how LLMExtractionStrategy performs with instructions!")
|
cprint("Let's say we are only interested in financial news. Let's see how LLMExtractionStrategy performs with instructions!")
|
||||||
@@ -143,12 +136,11 @@ def main():
|
|||||||
api_token=os.getenv('OPENAI_API_KEY'),
|
api_token=os.getenv('OPENAI_API_KEY'),
|
||||||
instruction="Extract only content related to technology"
|
instruction="Extract only content related to technology"
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
cprint("[LOG] 📦 [bold yellow]LLMExtractionStrategy (with technology instruction) result:[/bold yellow]")
|
||||||
cprint("You can pass other instructions like 'Extract only content related to technology' to the LLMExtractionStrategy.")
|
print_result(result)
|
||||||
|
|
||||||
cprint("There are more extraction strategies to explore, make sure to check the documentation!")
|
|
||||||
|
|
||||||
|
def targeted_extraction(crawler):
|
||||||
# Using a CSS selector to extract only H2 tags
|
# Using a CSS selector to extract only H2 tags
|
||||||
cprint("\n🎯 [bold cyan]Targeted extraction: Let's use a CSS selector to extract only H2 tags![/bold cyan]", True)
|
cprint("\n🎯 [bold cyan]Targeted extraction: Let's use a CSS selector to extract only H2 tags![/bold cyan]", True)
|
||||||
result = crawler.run(
|
result = crawler.run(
|
||||||
@@ -158,6 +150,7 @@ def main():
|
|||||||
cprint("[LOG] 📦 [bold yellow]CSS Selector (H2 tags) result:[/bold yellow]")
|
cprint("[LOG] 📦 [bold yellow]CSS Selector (H2 tags) result:[/bold yellow]")
|
||||||
print_result(result)
|
print_result(result)
|
||||||
|
|
||||||
|
def interactive_extraction(crawler):
|
||||||
# Passing JavaScript code to interact with the page
|
# Passing JavaScript code to interact with the page
|
||||||
cprint("\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]", True)
|
cprint("\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]", True)
|
||||||
cprint("In this example we try to click the 'Load More' button on the page using JavaScript code.")
|
cprint("In this example we try to click the 'Load More' button on the page using JavaScript code.")
|
||||||
@@ -173,31 +166,26 @@ def main():
|
|||||||
cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
|
cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
|
||||||
print_result(result)
|
print_result(result)
|
||||||
|
|
||||||
|
def main():
    """Run the quickstart guide from start to finish.

    Creates (or reuses) the warmed-up crawler from ``create_crawler()``
    and then runs each demonstration step in order.
    """
    cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
    cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]")
    cprint("If this is the first time you're running Crawl4ai, this might take a few seconds to load required model files.")

    crawler = create_crawler()

    # These two steps must run with caching enabled: the second one
    # demonstrates that a repeated crawl is served from the cache.
    basic_usage(crawler)
    understanding_parameters(crawler)

    # Only after the cache demonstration, force fresh results for the
    # remaining steps.
    cprint("For the rest of this guide, I set crawler.always_by_pass_cache to True to force the crawler to bypass the cache. This is to ensure that we get fresh results for each run.", True)
    crawler.always_by_pass_cache = True

    add_chunking_strategy(crawler)
    add_extraction_strategy(crawler)
    add_llm_extraction_strategy(crawler)
    targeted_extraction(crawler)
    interactive_extraction(crawler)

    cprint("\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]")
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|
||||||
def old_main():
|
|
||||||
js_code = """const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"""
|
|
||||||
# js_code = None
|
|
||||||
crawler = WebCrawler( crawler_strategy=LocalSeleniumCrawlerStrategy(use_cached_html=False, js_code=js_code))
|
|
||||||
crawler.warmup()
|
|
||||||
# Single page crawl
|
|
||||||
result = crawler.run(
|
|
||||||
url="https://www.nbcnews.com/business",
|
|
||||||
word_count_threshold=5, # Minimum word count for a HTML tag to be considered as a worthy block
|
|
||||||
chunking_strategy=RegexChunking(patterns=["\n\n"]), # Default is RegexChunking
|
|
||||||
# extraction_strategy=CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3), # Default is CosineStrategy
|
|
||||||
extraction_strategy= LLMExtractionStrategy(provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'), instruction = "I am intrested in only financial news"),
|
|
||||||
bypass_cache=True,
|
|
||||||
extract_blocks=True, # Whether to extract semantical blocks of text from the HTML
|
|
||||||
css_selector="", # Eg: "div.article-body" or all H2 tags liek "h2"
|
|
||||||
verbose=True,
|
|
||||||
include_raw_html=True, # Whether to include the raw HTML content in the response
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
print("[LOG] 📦 Crawl result:")
|
|
||||||
print(result.model_dump())
|
|
||||||
|
|
||||||
@@ -14,4 +14,5 @@ python-dotenv
|
|||||||
nltk
|
nltk
|
||||||
lazy_import
|
lazy_import
|
||||||
rich
|
rich
|
||||||
# spacy
|
spacy
|
||||||
|
scikit-learn
|
||||||
Reference in New Issue
Block a user