From ea16dec587b751b67d9b84a44ee7b84765656add Mon Sep 17 00:00:00 2001 From: unclecode Date: Thu, 16 May 2024 21:19:02 +0800 Subject: [PATCH] Improve library loading --- README.md | 4 ---- crawl4ai/crawler_strategy.py | 4 +++- crawl4ai/extraction_strategy.py | 7 +++++-- crawl4ai/model_loader.py | 8 +++----- crawl4ai/web_crawler.py | 4 ++-- pages/partial/installation.html | 2 -- 6 files changed, 13 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 942da285..f4e7f37a 100644 --- a/README.md +++ b/README.md @@ -35,8 +35,6 @@ Simply, firtsy install the package: ```bash virtualenv venv source venv/bin/activate -# Install the required packages -pip install transformers torch chromedriver_autoinstaller # Install Crawl4AI pip install git+https://github.com/unclecode/crawl4ai.git ``` @@ -137,7 +135,6 @@ To install Crawl4AI as a library, follow these steps: ```bash virtualenv venv source venv/bin/activate -pip install transformers torch chromedriver_autoinstaller pip install git+https://github.com/unclecode/crawl4ai.git ``` @@ -151,7 +148,6 @@ virtualenv venv source venv/bin/activate git clone https://github.com/unclecode/crawl4ai.git cd crawl4ai -pip install transformers torch chromedriver_autoinstaller pip install -e . ``` diff --git a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py index c1a06072..24add103 100644 --- a/crawl4ai/crawler_strategy.py +++ b/crawl4ai/crawler_strategy.py @@ -6,7 +6,7 @@ from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.chrome.options import Options from selenium.common.exceptions import InvalidArgumentException -import chromedriver_autoinstaller + from typing import List import requests import os @@ -38,6 +38,7 @@ class CloudCrawlerStrategy(CrawlerStrategy): class LocalSeleniumCrawlerStrategy(CrawlerStrategy): def __init__(self, use_cached_html=False, js_code=None): super().__init__() + print("[LOG] 🚀 Initializing LocalSeleniumCrawlerStrategy") self.options = Options() self.options.headless = True self.options.add_argument("--no-sandbox") @@ -49,6 +50,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): self.js_code = js_code # chromedriver_autoinstaller.install() + import chromedriver_autoinstaller self.service = Service(chromedriver_autoinstaller.install()) self.driver = webdriver.Chrome(service=self.service, options=self.options) diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index c9074eb2..59aa81a5 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -8,8 +8,8 @@ from .config import * from .utils import * from functools import partial from .model_loader import load_bert_base_uncased, load_bge_small_en_v1_5, load_spacy_model -from transformers import pipeline -from sklearn.metrics.pairwise import cosine_similarity + + import numpy as np class ExtractionStrategy(ABC): """ @@ -165,6 +165,7 @@ class CosineStrategy(ExtractionStrategy): :param top_k: Number of top categories to extract. """ super().__init__() + from transformers import BertTokenizer, BertModel, pipeline from transformers import AutoTokenizer, AutoModel import spacy @@ -196,6 +197,7 @@ class CosineStrategy(ExtractionStrategy): :param threshold: Cosine similarity threshold for filtering documents. :return: Filtered list of documents. """ + from sklearn.metrics.pairwise import cosine_similarity if not semantic_filter: return documents # Compute embedding for the keyword filter @@ -409,6 +411,7 @@ class ContentSummarizationStrategy(ExtractionStrategy): :param model_name: The model to use for summarization. """ + from transformers import pipeline self.summarizer = pipeline("summarization", model=model_name) def extract(self, url: str, text: str, provider: str = None, api_token: Optional[str] = None) -> List[Dict[str, Any]]: diff --git a/crawl4ai/model_loader.py b/crawl4ai/model_loader.py index 9c50c524..50288631 100644 --- a/crawl4ai/model_loader.py +++ b/crawl4ai/model_loader.py @@ -2,7 +2,7 @@ from functools import lru_cache from pathlib import Path import subprocess, os import shutil -from .config import MODEL_REPO_BRANCH +from crawl4ai.config import MODEL_REPO_BRANCH import argparse def get_home_folder(): @@ -40,7 +40,6 @@ def load_spacy_en_core_web_sm(): print("[LOG] ✅ spaCy model loaded successfully") return nlp - @lru_cache() def load_spacy_model(): import spacy @@ -92,11 +91,8 @@ def load_spacy_model(): return spacy.load(model_folder) - def download_all_models(remove_existing=False): """Download all models required for Crawl4AI.""" - print("[LOG] Welcome to the Crawl4AI Model Downloader!") - print("[LOG] This script will download all the models required for Crawl4AI.") if remove_existing: print("[LOG] Removing existing models...") home_folder = get_home_folder() @@ -121,6 +117,8 @@ def download_all_models(remove_existing=False): print("[LOG] ✅ All models downloaded successfully.") def main(): + print("[LOG] Welcome to the Crawl4AI Model Downloader!") + print("[LOG] This script will download all the models required for Crawl4AI.") parser = argparse.ArgumentParser(description="Crawl4AI Model Downloader") parser.add_argument('--remove-existing', action='store_true', help="Remove existing models before downloading") args = parser.parse_args() diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py index 88996c44..7fa4304f 100644 --- a/crawl4ai/web_crawler.py +++ b/crawl4ai/web_crawler.py @@ -18,11 +18,11 @@ class WebCrawler: def __init__( self, # db_path: str = None, - crawler_strategy: CrawlerStrategy = LocalSeleniumCrawlerStrategy(), + crawler_strategy: CrawlerStrategy = None, always_by_pass_cache: bool = False, ): # self.db_path = db_path - self.crawler_strategy = crawler_strategy + self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy() self.always_by_pass_cache = always_by_pass_cache # Create the .crawl4ai folder in the user's home directory if it doesn't exist diff --git a/pages/partial/installation.html b/pages/partial/installation.html index 6a80987e..01ff715b 100644 --- a/pages/partial/installation.html +++ b/pages/partial/installation.html @@ -30,7 +30,6 @@ >virtualenv venv source venv/bin/activate pip install git+https://github.com/unclecode/crawl4ai.git -pip install transformers torch chromedriver_autoinstaller
  • @@ -47,7 +46,6 @@ pip install transformers torch chromedriver_autoinstaller source venv/bin/activate git clone https://github.com/unclecode/crawl4ai.git cd crawl4ai -pip install transformers torch chromedriver_autoinstaller pip install -e .