Add a temporary solution for when we can't crawl websites in headless mode.

This commit is contained in:
unclecode
2024-06-29 23:25:50 +08:00
parent f8a11779fe
commit 5b28eed2c0

View File

@@ -83,14 +83,20 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
if kwargs.get("user_agent"):
self.options.add_argument("--user-agent=" + kwargs.get("user_agent"))
else:
# Set user agent
user_agent = kwargs.get("user_agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
self.options.add_argument(f"--user-agent={user_agent}")
self.options.add_argument("--no-sandbox")
self.options.add_argument(f"--user-agent={user_agent}")
self.options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
self.options.headless = kwargs.get("headless", True)
if self.options.headless:
self.options.add_argument("--headless")
self.options.add_argument("--disable-gpu")
self.options.add_argument("--window-size=1920,1080")
self.options.add_argument("--no-sandbox")
self.options.add_argument("--disable-dev-shm-usage")
self.options.add_argument("--disable-blink-features=AutomationControlled")
# self.options.add_argument("--disable-dev-shm-usage")
self.options.add_argument("--disable-gpu")
# self.options.add_argument("--disable-extensions")
@@ -187,10 +193,24 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
self.driver = self.execute_hook('before_get_url', self.driver)
if self.verbose:
print(f"[LOG] 🕸️ Crawling {url} using LocalSeleniumCrawlerStrategy...")
self.driver.get(url)
self.driver.get(url) #<html><head></head><body></body></html>
html = self.driver.page_source
WebDriverWait(self.driver, 10).until(
EC.presence_of_all_elements_located((By.TAG_NAME, "html"))
EC.presence_of_all_elements_located((By.TAG_NAME, "body"))
)
can_not_be_done_headless = False # Look at my creativity for naming variables
# TODO: Very ugly way for now but it works
if html == "<html><head></head><body></body></html>":
can_not_be_done_headless = True
options = Options()
options.headless = False
# set window size very small
options.add_argument("--window-size=10,10")
driver = webdriver.Chrome(service=self.service, options=options)
driver.get(url)
html = driver.page_source
driver.quit()
self.driver = self.execute_hook('after_get_url', self.driver)
# Execute JS code if provided
@@ -207,7 +227,8 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
lambda driver: driver.execute_script("return document.readyState") == "complete"
)
html = self.driver.page_source
if not can_not_be_done_headless:
html = self.driver.page_source
self.driver = self.execute_hook('before_return_html', self.driver, html)
# Store in cache