Refactor crawler_strategy.py to handle exceptions and improve error messages
This commit is contained in:
@@ -5,7 +5,7 @@ from selenium.webdriver.common.by import By
|
|||||||
from selenium.webdriver.support.ui import WebDriverWait
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
from selenium.webdriver.support import expected_conditions as EC
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
from selenium.webdriver.chrome.options import Options
|
from selenium.webdriver.chrome.options import Options
|
||||||
from selenium.common.exceptions import InvalidArgumentException
|
from selenium.common.exceptions import InvalidArgumentException, WebDriverException
|
||||||
from selenium.webdriver.chrome.service import Service as ChromeService
|
from selenium.webdriver.chrome.service import Service as ChromeService
|
||||||
from webdriver_manager.chrome import ChromeDriverManager
|
from webdriver_manager.chrome import ChromeDriverManager
|
||||||
|
|
||||||
@@ -220,9 +220,18 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
|
|
||||||
return html
|
return html
|
||||||
except InvalidArgumentException:
|
except InvalidArgumentException:
|
||||||
raise InvalidArgumentException(f"Invalid URL {url}")
|
if not hasattr(e, 'msg'):
|
||||||
|
e.msg = str(e)
|
||||||
|
raise InvalidArgumentException(f"Failed to crawl {url}: {e.msg}")
|
||||||
|
except WebDriverException as e:
|
||||||
|
# If e does not have msg attribute create it and set it to str(e)
|
||||||
|
if not hasattr(e, 'msg'):
|
||||||
|
e.msg = str(e)
|
||||||
|
raise WebDriverException(f"Failed to crawl {url}: {e.msg}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise Exception(f"Failed to crawl {url}: {str(e)}")
|
if not hasattr(e, 'msg'):
|
||||||
|
e.msg = str(e)
|
||||||
|
raise Exception(f"Failed to crawl {url}: {e.msg}")
|
||||||
|
|
||||||
def take_screenshot(self) -> str:
|
def take_screenshot(self) -> str:
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -129,6 +129,7 @@ class WebCrawler:
|
|||||||
verbose=True,
|
verbose=True,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> CrawlResult:
|
) -> CrawlResult:
|
||||||
|
try:
|
||||||
extraction_strategy = extraction_strategy or NoExtractionStrategy()
|
extraction_strategy = extraction_strategy or NoExtractionStrategy()
|
||||||
extraction_strategy.verbose = verbose
|
extraction_strategy.verbose = verbose
|
||||||
if not isinstance(extraction_strategy, ExtractionStrategy):
|
if not isinstance(extraction_strategy, ExtractionStrategy):
|
||||||
@@ -171,7 +172,14 @@ class WebCrawler:
|
|||||||
screenshot_data = self.crawler_strategy.take_screenshot()
|
screenshot_data = self.crawler_strategy.take_screenshot()
|
||||||
|
|
||||||
|
|
||||||
return self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot_data, verbose, bool(cached), **kwargs)
|
crawl_result = self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot_data, verbose, bool(cached), **kwargs)
|
||||||
|
crawl_result.success = bool(html)
|
||||||
|
return crawl_result
|
||||||
|
except Exception as e:
|
||||||
|
if not hasattr(e, "msg"):
|
||||||
|
e.msg = str(e)
|
||||||
|
print(f"[ERROR] 🚫 Failed to crawl {url}, error: {e.msg}")
|
||||||
|
return CrawlResult(url=url, html="", success=False, error_message=e.msg)
|
||||||
|
|
||||||
def process_html(
|
def process_html(
|
||||||
self,
|
self,
|
||||||
|
|||||||
Reference in New Issue
Block a user