Refactor crawler_strategy.py to handle exceptions and improve error messages
This commit is contained in:
@@ -5,7 +5,7 @@ from selenium.webdriver.common.by import By
|
|||||||
from selenium.webdriver.support.ui import WebDriverWait
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
from selenium.webdriver.support import expected_conditions as EC
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
from selenium.webdriver.chrome.options import Options
|
from selenium.webdriver.chrome.options import Options
|
||||||
from selenium.common.exceptions import InvalidArgumentException
|
from selenium.common.exceptions import InvalidArgumentException, WebDriverException
|
||||||
from selenium.webdriver.chrome.service import Service as ChromeService
|
from selenium.webdriver.chrome.service import Service as ChromeService
|
||||||
from webdriver_manager.chrome import ChromeDriverManager
|
from webdriver_manager.chrome import ChromeDriverManager
|
||||||
|
|
||||||
@@ -220,9 +220,18 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
|
|
||||||
return html
|
return html
|
||||||
except InvalidArgumentException:
|
except InvalidArgumentException:
|
||||||
raise InvalidArgumentException(f"Invalid URL {url}")
|
if not hasattr(e, 'msg'):
|
||||||
|
e.msg = str(e)
|
||||||
|
raise InvalidArgumentException(f"Failed to crawl {url}: {e.msg}")
|
||||||
|
except WebDriverException as e:
|
||||||
|
# If e does not have a msg attribute, create it and set it to str(e)
|
||||||
|
if not hasattr(e, 'msg'):
|
||||||
|
e.msg = str(e)
|
||||||
|
raise WebDriverException(f"Failed to crawl {url}: {e.msg}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise Exception(f"Failed to crawl {url}: {str(e)}")
|
if not hasattr(e, 'msg'):
|
||||||
|
e.msg = str(e)
|
||||||
|
raise Exception(f"Failed to crawl {url}: {e.msg}")
|
||||||
|
|
||||||
def take_screenshot(self) -> str:
|
def take_screenshot(self) -> str:
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -129,49 +129,57 @@ class WebCrawler:
|
|||||||
verbose=True,
|
verbose=True,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> CrawlResult:
|
) -> CrawlResult:
|
||||||
extraction_strategy = extraction_strategy or NoExtractionStrategy()
|
try:
|
||||||
extraction_strategy.verbose = verbose
|
extraction_strategy = extraction_strategy or NoExtractionStrategy()
|
||||||
if not isinstance(extraction_strategy, ExtractionStrategy):
|
extraction_strategy.verbose = verbose
|
||||||
raise ValueError("Unsupported extraction strategy")
|
if not isinstance(extraction_strategy, ExtractionStrategy):
|
||||||
if not isinstance(chunking_strategy, ChunkingStrategy):
|
raise ValueError("Unsupported extraction strategy")
|
||||||
raise ValueError("Unsupported chunking strategy")
|
if not isinstance(chunking_strategy, ChunkingStrategy):
|
||||||
|
raise ValueError("Unsupported chunking strategy")
|
||||||
# if word_count_threshold < MIN_WORD_THRESHOLD:
|
|
||||||
# word_count_threshold = MIN_WORD_THRESHOLD
|
|
||||||
|
|
||||||
word_count_threshold = max(word_count_threshold, 0)
|
# if word_count_threshold < MIN_WORD_THRESHOLD:
|
||||||
|
# word_count_threshold = MIN_WORD_THRESHOLD
|
||||||
|
|
||||||
|
word_count_threshold = max(word_count_threshold, 0)
|
||||||
|
|
||||||
# Check cache first
|
# Check cache first
|
||||||
cached = None
|
cached = None
|
||||||
screenshot_data = None
|
screenshot_data = None
|
||||||
extracted_content = None
|
extracted_content = None
|
||||||
if not bypass_cache and not self.always_by_pass_cache:
|
if not bypass_cache and not self.always_by_pass_cache:
|
||||||
cached = get_cached_url(url)
|
cached = get_cached_url(url)
|
||||||
|
|
||||||
if kwargs.get("warmup", True) and not self.ready:
|
if kwargs.get("warmup", True) and not self.ready:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
if cached:
|
if cached:
|
||||||
html = cached[1]
|
html = cached[1]
|
||||||
extracted_content = cached[4]
|
extracted_content = cached[4]
|
||||||
if screenshot:
|
if screenshot:
|
||||||
screenshot_data = cached[9]
|
screenshot_data = cached[9]
|
||||||
if not screenshot_data:
|
if not screenshot_data:
|
||||||
cached = None
|
cached = None
|
||||||
|
|
||||||
if not cached or not html:
|
if not cached or not html:
|
||||||
if user_agent:
|
if user_agent:
|
||||||
self.crawler_strategy.update_user_agent(user_agent)
|
self.crawler_strategy.update_user_agent(user_agent)
|
||||||
t1 = time.time()
|
t1 = time.time()
|
||||||
html = self.crawler_strategy.crawl(url)
|
html = self.crawler_strategy.crawl(url)
|
||||||
t2 = time.time()
|
t2 = time.time()
|
||||||
if verbose:
|
if verbose:
|
||||||
print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds")
|
print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds")
|
||||||
if screenshot:
|
if screenshot:
|
||||||
screenshot_data = self.crawler_strategy.take_screenshot()
|
screenshot_data = self.crawler_strategy.take_screenshot()
|
||||||
|
|
||||||
|
|
||||||
return self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot_data, verbose, bool(cached), **kwargs)
|
crawl_result = self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot_data, verbose, bool(cached), **kwargs)
|
||||||
|
crawl_result.success = bool(html)
|
||||||
|
return crawl_result
|
||||||
|
except Exception as e:
|
||||||
|
if not hasattr(e, "msg"):
|
||||||
|
e.msg = str(e)
|
||||||
|
print(f"[ERROR] 🚫 Failed to crawl {url}, error: {e.msg}")
|
||||||
|
return CrawlResult(url=url, html="", success=False, error_message=e.msg)
|
||||||
|
|
||||||
def process_html(
|
def process_html(
|
||||||
self,
|
self,
|
||||||
|
|||||||
Reference in New Issue
Block a user