Improve library loading
This commit is contained in:
@@ -35,8 +35,6 @@ Simply, first install the package:
|
|||||||
```bash
|
```bash
|
||||||
virtualenv venv
|
virtualenv venv
|
||||||
source venv/bin/activate
|
source venv/bin/activate
|
||||||
# Install the required packages
|
|
||||||
pip install transformers torch chromedriver_autoinstaller
|
|
||||||
# Install Crawl4AI
|
# Install Crawl4AI
|
||||||
pip install git+https://github.com/unclecode/crawl4ai.git
|
pip install git+https://github.com/unclecode/crawl4ai.git
|
||||||
```
|
```
|
||||||
@@ -137,7 +135,6 @@ To install Crawl4AI as a library, follow these steps:
|
|||||||
```bash
|
```bash
|
||||||
virtualenv venv
|
virtualenv venv
|
||||||
source venv/bin/activate
|
source venv/bin/activate
|
||||||
pip install transformers torch chromedriver_autoinstaller
|
|
||||||
pip install git+https://github.com/unclecode/crawl4ai.git
|
pip install git+https://github.com/unclecode/crawl4ai.git
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -151,7 +148,6 @@ virtualenv venv
|
|||||||
source venv/bin/activate
|
source venv/bin/activate
|
||||||
git clone https://github.com/unclecode/crawl4ai.git
|
git clone https://github.com/unclecode/crawl4ai.git
|
||||||
cd crawl4ai
|
cd crawl4ai
|
||||||
pip install transformers torch chromedriver_autoinstaller
|
|
||||||
pip install -e .
|
pip install -e .
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ from selenium.webdriver.support.ui import WebDriverWait
|
|||||||
from selenium.webdriver.support import expected_conditions as EC
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
from selenium.webdriver.chrome.options import Options
|
from selenium.webdriver.chrome.options import Options
|
||||||
from selenium.common.exceptions import InvalidArgumentException
|
from selenium.common.exceptions import InvalidArgumentException
|
||||||
import chromedriver_autoinstaller
|
|
||||||
from typing import List
|
from typing import List
|
||||||
import requests
|
import requests
|
||||||
import os
|
import os
|
||||||
@@ -38,6 +38,7 @@ class CloudCrawlerStrategy(CrawlerStrategy):
|
|||||||
class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
||||||
def __init__(self, use_cached_html=False, js_code=None):
|
def __init__(self, use_cached_html=False, js_code=None):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
print("[LOG] 🚀 Initializing LocalSeleniumCrawlerStrategy")
|
||||||
self.options = Options()
|
self.options = Options()
|
||||||
self.options.headless = True
|
self.options.headless = True
|
||||||
self.options.add_argument("--no-sandbox")
|
self.options.add_argument("--no-sandbox")
|
||||||
@@ -49,6 +50,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
self.js_code = js_code
|
self.js_code = js_code
|
||||||
|
|
||||||
# chromedriver_autoinstaller.install()
|
# chromedriver_autoinstaller.install()
|
||||||
|
import chromedriver_autoinstaller
|
||||||
self.service = Service(chromedriver_autoinstaller.install())
|
self.service = Service(chromedriver_autoinstaller.install())
|
||||||
self.driver = webdriver.Chrome(service=self.service, options=self.options)
|
self.driver = webdriver.Chrome(service=self.service, options=self.options)
|
||||||
|
|
||||||
|
|||||||
@@ -8,8 +8,8 @@ from .config import *
|
|||||||
from .utils import *
|
from .utils import *
|
||||||
from functools import partial
|
from functools import partial
|
||||||
from .model_loader import load_bert_base_uncased, load_bge_small_en_v1_5, load_spacy_model
|
from .model_loader import load_bert_base_uncased, load_bge_small_en_v1_5, load_spacy_model
|
||||||
from transformers import pipeline
|
|
||||||
from sklearn.metrics.pairwise import cosine_similarity
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
class ExtractionStrategy(ABC):
|
class ExtractionStrategy(ABC):
|
||||||
"""
|
"""
|
||||||
@@ -165,6 +165,7 @@ class CosineStrategy(ExtractionStrategy):
|
|||||||
:param top_k: Number of top categories to extract.
|
:param top_k: Number of top categories to extract.
|
||||||
"""
|
"""
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
from transformers import BertTokenizer, BertModel, pipeline
|
from transformers import BertTokenizer, BertModel, pipeline
|
||||||
from transformers import AutoTokenizer, AutoModel
|
from transformers import AutoTokenizer, AutoModel
|
||||||
import spacy
|
import spacy
|
||||||
@@ -196,6 +197,7 @@ class CosineStrategy(ExtractionStrategy):
|
|||||||
:param threshold: Cosine similarity threshold for filtering documents.
|
:param threshold: Cosine similarity threshold for filtering documents.
|
||||||
:return: Filtered list of documents.
|
:return: Filtered list of documents.
|
||||||
"""
|
"""
|
||||||
|
from sklearn.metrics.pairwise import cosine_similarity
|
||||||
if not semantic_filter:
|
if not semantic_filter:
|
||||||
return documents
|
return documents
|
||||||
# Compute embedding for the keyword filter
|
# Compute embedding for the keyword filter
|
||||||
@@ -409,6 +411,7 @@ class ContentSummarizationStrategy(ExtractionStrategy):
|
|||||||
|
|
||||||
:param model_name: The model to use for summarization.
|
:param model_name: The model to use for summarization.
|
||||||
"""
|
"""
|
||||||
|
from transformers import pipeline
|
||||||
self.summarizer = pipeline("summarization", model=model_name)
|
self.summarizer = pipeline("summarization", model=model_name)
|
||||||
|
|
||||||
def extract(self, url: str, text: str, provider: str = None, api_token: Optional[str] = None) -> List[Dict[str, Any]]:
|
def extract(self, url: str, text: str, provider: str = None, api_token: Optional[str] = None) -> List[Dict[str, Any]]:
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ from functools import lru_cache
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import subprocess, os
|
import subprocess, os
|
||||||
import shutil
|
import shutil
|
||||||
from .config import MODEL_REPO_BRANCH
|
from crawl4ai.config import MODEL_REPO_BRANCH
|
||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
def get_home_folder():
|
def get_home_folder():
|
||||||
@@ -40,7 +40,6 @@ def load_spacy_en_core_web_sm():
|
|||||||
print("[LOG] ✅ spaCy model loaded successfully")
|
print("[LOG] ✅ spaCy model loaded successfully")
|
||||||
return nlp
|
return nlp
|
||||||
|
|
||||||
|
|
||||||
@lru_cache()
|
@lru_cache()
|
||||||
def load_spacy_model():
|
def load_spacy_model():
|
||||||
import spacy
|
import spacy
|
||||||
@@ -92,11 +91,8 @@ def load_spacy_model():
|
|||||||
|
|
||||||
return spacy.load(model_folder)
|
return spacy.load(model_folder)
|
||||||
|
|
||||||
|
|
||||||
def download_all_models(remove_existing=False):
|
def download_all_models(remove_existing=False):
|
||||||
"""Download all models required for Crawl4AI."""
|
"""Download all models required for Crawl4AI."""
|
||||||
print("[LOG] Welcome to the Crawl4AI Model Downloader!")
|
|
||||||
print("[LOG] This script will download all the models required for Crawl4AI.")
|
|
||||||
if remove_existing:
|
if remove_existing:
|
||||||
print("[LOG] Removing existing models...")
|
print("[LOG] Removing existing models...")
|
||||||
home_folder = get_home_folder()
|
home_folder = get_home_folder()
|
||||||
@@ -121,6 +117,8 @@ def download_all_models(remove_existing=False):
|
|||||||
print("[LOG] ✅ All models downloaded successfully.")
|
print("[LOG] ✅ All models downloaded successfully.")
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
print("[LOG] Welcome to the Crawl4AI Model Downloader!")
|
||||||
|
print("[LOG] This script will download all the models required for Crawl4AI.")
|
||||||
parser = argparse.ArgumentParser(description="Crawl4AI Model Downloader")
|
parser = argparse.ArgumentParser(description="Crawl4AI Model Downloader")
|
||||||
parser.add_argument('--remove-existing', action='store_true', help="Remove existing models before downloading")
|
parser.add_argument('--remove-existing', action='store_true', help="Remove existing models before downloading")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|||||||
@@ -18,11 +18,11 @@ class WebCrawler:
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
# db_path: str = None,
|
# db_path: str = None,
|
||||||
crawler_strategy: CrawlerStrategy = LocalSeleniumCrawlerStrategy(),
|
crawler_strategy: CrawlerStrategy = None,
|
||||||
always_by_pass_cache: bool = False,
|
always_by_pass_cache: bool = False,
|
||||||
):
|
):
|
||||||
# self.db_path = db_path
|
# self.db_path = db_path
|
||||||
self.crawler_strategy = crawler_strategy
|
self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy()
|
||||||
self.always_by_pass_cache = always_by_pass_cache
|
self.always_by_pass_cache = always_by_pass_cache
|
||||||
|
|
||||||
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
|
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
|
||||||
|
|||||||
@@ -30,7 +30,6 @@
|
|||||||
><code>virtualenv venv
|
><code>virtualenv venv
|
||||||
source venv/bin/activate
|
source venv/bin/activate
|
||||||
pip install git+https://github.com/unclecode/crawl4ai.git
|
pip install git+https://github.com/unclecode/crawl4ai.git
|
||||||
pip install transformers torch chromedriver_autoinstaller
|
|
||||||
</code></pre>
|
</code></pre>
|
||||||
</li>
|
</li>
|
||||||
<li class="mb-4">
|
<li class="mb-4">
|
||||||
@@ -47,7 +46,6 @@ pip install transformers torch chromedriver_autoinstaller
|
|||||||
source venv/bin/activate
|
source venv/bin/activate
|
||||||
git clone https://github.com/unclecode/crawl4ai.git
|
git clone https://github.com/unclecode/crawl4ai.git
|
||||||
cd crawl4ai
|
cd crawl4ai
|
||||||
pip install transformers torch chromedriver_autoinstaller
|
|
||||||
pip install -e .
|
pip install -e .
|
||||||
</code></pre>
|
</code></pre>
|
||||||
</li>
|
</li>
|
||||||
|
|||||||
Reference in New Issue
Block a user