Improve library loading

unclecode
2024-05-16 21:19:02 +08:00
parent d19488a821
commit ea16dec587
6 changed files with 13 additions and 16 deletions

View File

@@ -35,8 +35,6 @@ Simply, first install the package:
 ```bash
 virtualenv venv
 source venv/bin/activate
-# Install the required packages
-pip install transformers torch chromedriver_autoinstaller
 # Install Crawl4AI
 pip install git+https://github.com/unclecode/crawl4ai.git
 ```
@@ -137,7 +135,6 @@ To install Crawl4AI as a library, follow these steps:
 ```bash
 virtualenv venv
 source venv/bin/activate
-pip install transformers torch chromedriver_autoinstaller
 pip install git+https://github.com/unclecode/crawl4ai.git
 ```
@@ -151,7 +148,6 @@ virtualenv venv
 source venv/bin/activate
 git clone https://github.com/unclecode/crawl4ai.git
 cd crawl4ai
-pip install transformers torch chromedriver_autoinstaller
 pip install -e .
 ```
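
The dropped `pip install transformers torch chromedriver_autoinstaller` line only stays safe if the package declares those requirements itself, so that `pip install git+https://github.com/unclecode/crawl4ai.git` resolves them. This commit doesn't touch the packaging file, but a hypothetical `setup.py` along these lines is what that assumption looks like:

```python
# Hypothetical setup.py sketch (not part of this commit): with the heavy
# dependencies declared here, the pip install of the git URL pulls them
# in, and the README no longer needs a separate install line.
from setuptools import setup, find_packages

setup(
    name="crawl4ai",
    packages=find_packages(),
    install_requires=[
        "selenium",
        "chromedriver_autoinstaller",
        "transformers",
        "torch",
    ],
)
```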

View File

@@ -6,7 +6,7 @@ from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.chrome.options import Options
 from selenium.common.exceptions import InvalidArgumentException
-import chromedriver_autoinstaller
 from typing import List
 import requests
 import os
@@ -38,6 +38,7 @@ class CloudCrawlerStrategy(CrawlerStrategy):
 class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
     def __init__(self, use_cached_html=False, js_code=None):
         super().__init__()
+        print("[LOG] 🚀 Initializing LocalSeleniumCrawlerStrategy")
         self.options = Options()
         self.options.headless = True
         self.options.add_argument("--no-sandbox")
@@ -49,6 +50,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
         self.js_code = js_code
         # chromedriver_autoinstaller.install()
+        import chromedriver_autoinstaller
         self.service = Service(chromedriver_autoinstaller.install())
         self.driver = webdriver.Chrome(service=self.service, options=self.options)
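
The change in this file is a deferred import: `chromedriver_autoinstaller` downloads a matching driver binary when `install()` runs, and a module-level import forces every consumer of the package to have it installed even if no Selenium crawler is ever constructed. A minimal sketch of the pattern, with `slow_lib` as a hypothetical stand-in for any heavy dependency:

```python
# lazy_import.py - a minimal sketch of the deferred-import pattern used
# above; `slow_lib` is a hypothetical stand-in for a heavy dependency.

def make_client():
    # The import executes on the first call, not when this module is
    # imported, so `import lazy_import` stays cheap.
    import slow_lib
    return slow_lib.Client()
```

Repeated calls are fine: Python caches modules in `sys.modules`, so only the first call pays the import cost.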

View File

@@ -8,8 +8,8 @@ from .config import *
 from .utils import *
 from functools import partial
 from .model_loader import load_bert_base_uncased, load_bge_small_en_v1_5, load_spacy_model
-from transformers import pipeline
-from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
 class ExtractionStrategy(ABC):
     """
@@ -165,6 +165,7 @@ class CosineStrategy(ExtractionStrategy):
         :param top_k: Number of top categories to extract.
         """
         super().__init__()
         from transformers import BertTokenizer, BertModel, pipeline
         from transformers import AutoTokenizer, AutoModel
         import spacy
@@ -196,6 +197,7 @@ class CosineStrategy(ExtractionStrategy):
         :param threshold: Cosine similarity threshold for filtering documents.
         :return: Filtered list of documents.
         """
+        from sklearn.metrics.pairwise import cosine_similarity
         if not semantic_filter:
             return documents
         # Compute embedding for the keyword filter
@@ -409,6 +411,7 @@ class ContentSummarizationStrategy(ExtractionStrategy):
         :param model_name: The model to use for summarization.
         """
+        from transformers import pipeline
         self.summarizer = pipeline("summarization", model=model_name)
     def extract(self, url: str, text: str, provider: str = None, api_token: Optional[str] = None) -> List[Dict[str, Any]]:
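
Same idea here: `transformers` (which drags in `torch`) and `sklearn` are among the slowest imports in the dependency tree, so they move from module level into the constructors and methods that actually use them; `import crawl4ai` no longer pays for models a caller never touches. A sketch of the resulting shape, using a hypothetical strategy class and placeholder model name:

```python
# Sketch only: mirrors the pattern in the hunks above, not the library's
# actual class. The heavy import happens at construction time.
class SummaryStrategy:
    def __init__(self, model_name: str = "t5-small"):  # placeholder model
        # Deferred: transformers/torch initialize here, not at import.
        from transformers import pipeline
        self.summarizer = pipeline("summarization", model=model_name)

    def extract(self, text: str) -> str:
        return self.summarizer(text)[0]["summary_text"]
```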

View File

@@ -2,7 +2,7 @@ from functools import lru_cache
 from pathlib import Path
 import subprocess, os
 import shutil
-from .config import MODEL_REPO_BRANCH
+from crawl4ai.config import MODEL_REPO_BRANCH
 import argparse
 def get_home_folder():
@@ -40,7 +40,6 @@ def load_spacy_en_core_web_sm():
print("[LOG] ✅ spaCy model loaded successfully") print("[LOG] ✅ spaCy model loaded successfully")
return nlp return nlp
@lru_cache() @lru_cache()
def load_spacy_model(): def load_spacy_model():
import spacy import spacy
@@ -92,11 +91,8 @@ def load_spacy_model():
     return spacy.load(model_folder)
 def download_all_models(remove_existing=False):
     """Download all models required for Crawl4AI."""
-    print("[LOG] Welcome to the Crawl4AI Model Downloader!")
-    print("[LOG] This script will download all the models required for Crawl4AI.")
     if remove_existing:
         print("[LOG] Removing existing models...")
         home_folder = get_home_folder()
@@ -121,6 +117,8 @@ def download_all_models(remove_existing=False):
print("[LOG] ✅ All models downloaded successfully.") print("[LOG] ✅ All models downloaded successfully.")
def main(): def main():
print("[LOG] Welcome to the Crawl4AI Model Downloader!")
print("[LOG] This script will download all the models required for Crawl4AI.")
parser = argparse.ArgumentParser(description="Crawl4AI Model Downloader") parser = argparse.ArgumentParser(description="Crawl4AI Model Downloader")
parser.add_argument('--remove-existing', action='store_true', help="Remove existing models before downloading") parser.add_argument('--remove-existing', action='store_true', help="Remove existing models before downloading")
args = parser.parse_args() args = parser.parse_args()
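
The `from .config` → `from crawl4ai.config` switch matters because this file defines a `main()` and is meant to be run directly; a module executed as a script has no parent package, so the relative form raises `ImportError: attempted relative import with no known parent package`. A minimal repro (hypothetical file name):

```python
# loader_demo.py - hypothetical file showing why the absolute import works
# when executed directly. The relative spelling below would fail here:
#     from .config import MODEL_REPO_BRANCH
# because a script run as `python loader_demo.py` has no parent package.
from crawl4ai.config import MODEL_REPO_BRANCH  # resolves via the installed package

if __name__ == "__main__":
    print(f"Models tracked on branch: {MODEL_REPO_BRANCH}")
```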

View File

@@ -18,11 +18,11 @@ class WebCrawler:
     def __init__(
         self,
         # db_path: str = None,
-        crawler_strategy: CrawlerStrategy = LocalSeleniumCrawlerStrategy(),
+        crawler_strategy: CrawlerStrategy = None,
         always_by_pass_cache: bool = False,
     ):
         # self.db_path = db_path
-        self.crawler_strategy = crawler_strategy
+        self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy()
         self.always_by_pass_cache = always_by_pass_cache
         # Create the .crawl4ai folder in the user's home directory if it doesn't exist
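
The `WebCrawler` hunk fixes a classic default-argument pitfall: Python evaluates default values once, when the `def` statement runs, so `crawler_strategy=LocalSeleniumCrawlerStrategy()` launched a headless browser the moment this module was imported, and every `WebCrawler()` then shared that single instance. Defaulting to `None` and constructing in the body defers the cost to call time. A compact illustration:

```python
class Expensive:
    def __init__(self):
        print("spinning up...")  # stand-in for launching a browser

def eager(dep=Expensive()):    # prints at definition time; all calls
    return dep                 # share one instance

def lazy(dep=None):
    return dep or Expensive()  # constructed per call, only when needed
```

One caveat of the `or` idiom: a falsy argument is silently replaced; `if dep is None` is the stricter spelling.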

View File

@@ -30,7 +30,6 @@
><code>virtualenv venv ><code>virtualenv venv
source venv/bin/activate source venv/bin/activate
pip install git+https://github.com/unclecode/crawl4ai.git pip install git+https://github.com/unclecode/crawl4ai.git
pip install transformers torch chromedriver_autoinstaller
</code></pre> </code></pre>
</li> </li>
<li class="mb-4"> <li class="mb-4">
@@ -47,7 +46,6 @@ pip install transformers torch chromedriver_autoinstaller
 source venv/bin/activate
 git clone https://github.com/unclecode/crawl4ai.git
 cd crawl4ai
-pip install transformers torch chromedriver_autoinstaller
 pip install -e .
 </code></pre>
 </li>