From 9e43f7bedab944431a370f40c46624852092a40a Mon Sep 17 00:00:00 2001 From: unclecode Date: Wed, 31 Jul 2024 13:29:23 +0800 Subject: [PATCH] refactor: Temporarily disable fetching image file size in get_content_of_website_optimized Set the `image_size` variable to 0 in the `get_content_of_website_optimized` function to temporarily disable fetching the image file size. This change addresses performance issues and will be improved in a future update. Update Dockerfile for Linux users --- Dockerfile | 12 ++++++------ crawl4ai/crawler_strategy.py | 24 ++++++++++++------------ requirements.txt | 2 +- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/Dockerfile b/Dockerfile index 07c41ad7..e49e2f6f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,17 +24,17 @@ COPY . . # Install Crawl4AI using the local setup.py (which will use the default installation) RUN pip install --no-cache-dir . -# Install Google Chrome and ChromeDriver +# Install Google Chrome RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \ sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' && \ apt-get update && \ - apt-get install -y google-chrome-stable && \ - wget -O /tmp/chromedriver.zip http://chromedriver.storage.googleapis.com/`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE`/chromedriver_linux64.zip && \ - unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/ + apt-get install -y google-chrome-stable -# Set environment to use Chrome and ChromeDriver properly +# Update webdriver_manager to version 4.0.2 +RUN pip install --no-cache-dir webdriver_manager==4.0.2 + +# Set environment to use Chrome properly ENV CHROME_BIN=/usr/bin/google-chrome \ - CHROMEDRIVER=/usr/local/bin/chromedriver \ DISPLAY=:99 \ DBUS_SESSION_BUS_ADDRESS=/dev/null \ PYTHONUNBUFFERED=1 diff --git a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py index 78148f68..626b72d8 
100644 --- a/crawl4ai/crawler_strategy.py +++ b/crawl4ai/crawler_strategy.py @@ -137,10 +137,15 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): # self.service = Service(chromedriver_autoinstaller.install()) - chromedriver_path = ChromeDriverManager().install() - self.service = Service(chromedriver_path) - self.service.log_path = "NUL" - self.driver = webdriver.Chrome(service=self.service, options=self.options) + # chromedriver_path = ChromeDriverManager().install() + # self.service = Service(chromedriver_path) + # self.service.log_path = "NUL" + # self.driver = webdriver.Chrome(service=self.service, options=self.options) + + # Use selenium-manager (built into Selenium 4.10.0+) + self.service = Service() + self.driver = webdriver.Chrome(options=self.options) + self.driver = self.execute_hook('on_driver_created', self.driver) if kwargs.get("cookies"): @@ -292,7 +297,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): # Open the screenshot with PIL image = Image.open(BytesIO(screenshot)) - # Convert image to RGB mode + # Convert image to RGB mode (this will handle both RGB and RGBA images) rgb_image = image.convert('RGB') # Convert to JPEG and compress @@ -304,11 +309,6 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): print(f"[LOG] 📸 Screenshot taken and converted to base64") return img_base64 - except Exception as e: - if self.verbose: - print(f"[ERROR] Failed to take screenshot: {str(e)}") - return "" - except Exception as e: error_message = sanitize_input_encode(f"Failed to take screenshot: {str(e)}") print(error_message) @@ -321,7 +321,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): try: font = ImageFont.truetype("arial.ttf", 40) except IOError: - font = ImageFont.load_default(size=40) + font = ImageFont.load_default() # Define text color and wrap the text text_color = (255, 255, 255) @@ -340,6 +340,6 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8') return 
img_base64 - + def quit(self): self.driver.quit() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index ced41173..359f0b7b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,7 +12,7 @@ python-dotenv==1.0.1 requests==2.32.3 rich==13.7.1 scikit-learn==1.5.0 -selenium==4.21.0 +selenium==4.23.1 uvicorn==0.30.1 transformers==4.41.2 chromedriver-autoinstaller==0.6.4