refactor: Temporarily disable fetching image file size in get_content_of_website_optimized

Set the `image_size` variable to 0 in the `get_content_of_website_optimized` function to temporarily disable fetching the image file size. This change addresses performance issues and will be improved in a future update.

Update Dockerfile for Linux users
This commit is contained in:
unclecode
2024-07-31 13:29:23 +08:00
parent aa9412e1b4
commit 9e43f7beda
3 changed files with 19 additions and 19 deletions

View File

@@ -24,17 +24,17 @@ COPY . .
# Install Crawl4AI using the local setup.py (which will use the default installation) # Install Crawl4AI using the local setup.py (which will use the default installation)
RUN pip install --no-cache-dir . RUN pip install --no-cache-dir .
# Install Google Chrome and ChromeDriver # Install Google Chrome
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \ RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \
sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' && \ sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' && \
apt-get update && \ apt-get update && \
apt-get install -y google-chrome-stable && \ apt-get install -y google-chrome-stable
wget -O /tmp/chromedriver.zip http://chromedriver.storage.googleapis.com/`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE`/chromedriver_linux64.zip && \
unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/
# Set environment to use Chrome and ChromeDriver properly # Update webdriver_manager to version 4.0.2
RUN pip install --no-cache-dir webdriver_manager==4.0.2
# Set environment to use Chrome properly
ENV CHROME_BIN=/usr/bin/google-chrome \ ENV CHROME_BIN=/usr/bin/google-chrome \
CHROMEDRIVER=/usr/local/bin/chromedriver \
DISPLAY=:99 \ DISPLAY=:99 \
DBUS_SESSION_BUS_ADDRESS=/dev/null \ DBUS_SESSION_BUS_ADDRESS=/dev/null \
PYTHONUNBUFFERED=1 PYTHONUNBUFFERED=1

View File

@@ -137,10 +137,15 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
# self.service = Service(chromedriver_autoinstaller.install()) # self.service = Service(chromedriver_autoinstaller.install())
chromedriver_path = ChromeDriverManager().install() # chromedriver_path = ChromeDriverManager().install()
self.service = Service(chromedriver_path) # self.service = Service(chromedriver_path)
self.service.log_path = "NUL" # self.service.log_path = "NUL"
self.driver = webdriver.Chrome(service=self.service, options=self.options) # self.driver = webdriver.Chrome(service=self.service, options=self.options)
# Use selenium-manager (built into Selenium 4.10.0+)
self.service = Service()
self.driver = webdriver.Chrome(options=self.options)
self.driver = self.execute_hook('on_driver_created', self.driver) self.driver = self.execute_hook('on_driver_created', self.driver)
if kwargs.get("cookies"): if kwargs.get("cookies"):
@@ -292,7 +297,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
# Open the screenshot with PIL # Open the screenshot with PIL
image = Image.open(BytesIO(screenshot)) image = Image.open(BytesIO(screenshot))
# Convert image to RGB mode # Convert image to RGB mode (this will handle both RGB and RGBA images)
rgb_image = image.convert('RGB') rgb_image = image.convert('RGB')
# Convert to JPEG and compress # Convert to JPEG and compress
@@ -304,11 +309,6 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
print(f"[LOG] 📸 Screenshot taken and converted to base64") print(f"[LOG] 📸 Screenshot taken and converted to base64")
return img_base64 return img_base64
except Exception as e:
if self.verbose:
print(f"[ERROR] Failed to take screenshot: {str(e)}")
return ""
except Exception as e: except Exception as e:
error_message = sanitize_input_encode(f"Failed to take screenshot: {str(e)}") error_message = sanitize_input_encode(f"Failed to take screenshot: {str(e)}")
print(error_message) print(error_message)
@@ -321,7 +321,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
try: try:
font = ImageFont.truetype("arial.ttf", 40) font = ImageFont.truetype("arial.ttf", 40)
except IOError: except IOError:
font = ImageFont.load_default(size=40) font = ImageFont.load_default()
# Define text color and wrap the text # Define text color and wrap the text
text_color = (255, 255, 255) text_color = (255, 255, 255)
@@ -340,6 +340,6 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8') img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
return img_base64 return img_base64
def quit(self): def quit(self):
self.driver.quit() self.driver.quit()

View File

@@ -12,7 +12,7 @@ python-dotenv==1.0.1
requests==2.32.3 requests==2.32.3
rich==13.7.1 rich==13.7.1
scikit-learn==1.5.0 scikit-learn==1.5.0
selenium==4.21.0 selenium==4.23.1
uvicorn==0.30.1 uvicorn==0.30.1
transformers==4.41.2 transformers==4.41.2
chromedriver-autoinstaller==0.6.4 chromedriver-autoinstaller==0.6.4