refactor: Temporarily disable fetching image file size in get_content_of_website_optimized
Set the `image_size` variable to 0 in the `get_content_of_website_optimized` function to temporarily disable fetching the image file size. This change addresses performance issues; the image file size lookup will be improved in a future update. Also update the Dockerfile for Linux users.
This commit is contained in:
12
Dockerfile
12
Dockerfile
@@ -24,17 +24,17 @@ COPY . .
|
|||||||
# Install Crawl4AI using the local setup.py (which will use the default installation)
|
# Install Crawl4AI using the local setup.py (which will use the default installation)
|
||||||
RUN pip install --no-cache-dir .
|
RUN pip install --no-cache-dir .
|
||||||
|
|
||||||
# Install Google Chrome and ChromeDriver
|
# Install Google Chrome
|
||||||
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \
|
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \
|
||||||
sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' && \
|
sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' && \
|
||||||
apt-get update && \
|
apt-get update && \
|
||||||
apt-get install -y google-chrome-stable && \
|
apt-get install -y google-chrome-stable
|
||||||
wget -O /tmp/chromedriver.zip http://chromedriver.storage.googleapis.com/`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE`/chromedriver_linux64.zip && \
|
|
||||||
unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/
|
|
||||||
|
|
||||||
# Set environment to use Chrome and ChromeDriver properly
|
# Update webdriver_manager to version 4.0.2
|
||||||
|
RUN pip install --no-cache-dir webdriver_manager==4.0.2
|
||||||
|
|
||||||
|
# Set environment to use Chrome properly
|
||||||
ENV CHROME_BIN=/usr/bin/google-chrome \
|
ENV CHROME_BIN=/usr/bin/google-chrome \
|
||||||
CHROMEDRIVER=/usr/local/bin/chromedriver \
|
|
||||||
DISPLAY=:99 \
|
DISPLAY=:99 \
|
||||||
DBUS_SESSION_BUS_ADDRESS=/dev/null \
|
DBUS_SESSION_BUS_ADDRESS=/dev/null \
|
||||||
PYTHONUNBUFFERED=1
|
PYTHONUNBUFFERED=1
|
||||||
|
|||||||
@@ -137,10 +137,15 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
# self.service = Service(chromedriver_autoinstaller.install())
|
# self.service = Service(chromedriver_autoinstaller.install())
|
||||||
|
|
||||||
|
|
||||||
chromedriver_path = ChromeDriverManager().install()
|
# chromedriver_path = ChromeDriverManager().install()
|
||||||
self.service = Service(chromedriver_path)
|
# self.service = Service(chromedriver_path)
|
||||||
self.service.log_path = "NUL"
|
# self.service.log_path = "NUL"
|
||||||
self.driver = webdriver.Chrome(service=self.service, options=self.options)
|
# self.driver = webdriver.Chrome(service=self.service, options=self.options)
|
||||||
|
|
||||||
|
# Use selenium-manager (built into Selenium 4.10.0+)
|
||||||
|
self.service = Service()
|
||||||
|
self.driver = webdriver.Chrome(options=self.options)
|
||||||
|
|
||||||
self.driver = self.execute_hook('on_driver_created', self.driver)
|
self.driver = self.execute_hook('on_driver_created', self.driver)
|
||||||
|
|
||||||
if kwargs.get("cookies"):
|
if kwargs.get("cookies"):
|
||||||
@@ -292,7 +297,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
# Open the screenshot with PIL
|
# Open the screenshot with PIL
|
||||||
image = Image.open(BytesIO(screenshot))
|
image = Image.open(BytesIO(screenshot))
|
||||||
|
|
||||||
# Convert image to RGB mode
|
# Convert image to RGB mode (this will handle both RGB and RGBA images)
|
||||||
rgb_image = image.convert('RGB')
|
rgb_image = image.convert('RGB')
|
||||||
|
|
||||||
# Convert to JPEG and compress
|
# Convert to JPEG and compress
|
||||||
@@ -304,11 +309,6 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
print(f"[LOG] 📸 Screenshot taken and converted to base64")
|
print(f"[LOG] 📸 Screenshot taken and converted to base64")
|
||||||
|
|
||||||
return img_base64
|
return img_base64
|
||||||
except Exception as e:
|
|
||||||
if self.verbose:
|
|
||||||
print(f"[ERROR] Failed to take screenshot: {str(e)}")
|
|
||||||
return ""
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_message = sanitize_input_encode(f"Failed to take screenshot: {str(e)}")
|
error_message = sanitize_input_encode(f"Failed to take screenshot: {str(e)}")
|
||||||
print(error_message)
|
print(error_message)
|
||||||
@@ -321,7 +321,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
try:
|
try:
|
||||||
font = ImageFont.truetype("arial.ttf", 40)
|
font = ImageFont.truetype("arial.ttf", 40)
|
||||||
except IOError:
|
except IOError:
|
||||||
font = ImageFont.load_default(size=40)
|
font = ImageFont.load_default()
|
||||||
|
|
||||||
# Define text color and wrap the text
|
# Define text color and wrap the text
|
||||||
text_color = (255, 255, 255)
|
text_color = (255, 255, 255)
|
||||||
@@ -340,6 +340,6 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
|
img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
|
||||||
|
|
||||||
return img_base64
|
return img_base64
|
||||||
|
|
||||||
def quit(self):
|
def quit(self):
|
||||||
self.driver.quit()
|
self.driver.quit()
|
||||||
@@ -12,7 +12,7 @@ python-dotenv==1.0.1
|
|||||||
requests==2.32.3
|
requests==2.32.3
|
||||||
rich==13.7.1
|
rich==13.7.1
|
||||||
scikit-learn==1.5.0
|
scikit-learn==1.5.0
|
||||||
selenium==4.21.0
|
selenium==4.23.1
|
||||||
uvicorn==0.30.1
|
uvicorn==0.30.1
|
||||||
transformers==4.41.2
|
transformers==4.41.2
|
||||||
chromedriver-autoinstaller==0.6.4
|
chromedriver-autoinstaller==0.6.4
|
||||||
|
|||||||
Reference in New Issue
Block a user