Refactor crawler_strategy.py to handle exceptions and improve error messages

This commit is contained in:
unclecode
2024-06-26 15:04:33 +08:00
parent 7ba2142363
commit 4756d0a532
2 changed files with 60 additions and 43 deletions

View File

@@ -5,7 +5,7 @@ from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import InvalidArgumentException from selenium.common.exceptions import InvalidArgumentException, WebDriverException
from selenium.webdriver.chrome.service import Service as ChromeService from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager from webdriver_manager.chrome import ChromeDriverManager
@@ -220,9 +220,18 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
return html return html
except InvalidArgumentException: except InvalidArgumentException:
raise InvalidArgumentException(f"Invalid URL {url}") if not hasattr(e, 'msg'):
e.msg = str(e)
raise InvalidArgumentException(f"Failed to crawl {url}: {e.msg}")
except WebDriverException as e:
# If e does not have a msg attribute, create it and set it to str(e)
if not hasattr(e, 'msg'):
e.msg = str(e)
raise WebDriverException(f"Failed to crawl {url}: {e.msg}")
except Exception as e: except Exception as e:
raise Exception(f"Failed to crawl {url}: {str(e)}") if not hasattr(e, 'msg'):
e.msg = str(e)
raise Exception(f"Failed to crawl {url}: {e.msg}")
def take_screenshot(self) -> str: def take_screenshot(self) -> str:
try: try:

View File

@@ -129,49 +129,57 @@ class WebCrawler:
verbose=True, verbose=True,
**kwargs, **kwargs,
) -> CrawlResult: ) -> CrawlResult:
extraction_strategy = extraction_strategy or NoExtractionStrategy() try:
extraction_strategy.verbose = verbose extraction_strategy = extraction_strategy or NoExtractionStrategy()
if not isinstance(extraction_strategy, ExtractionStrategy): extraction_strategy.verbose = verbose
raise ValueError("Unsupported extraction strategy") if not isinstance(extraction_strategy, ExtractionStrategy):
if not isinstance(chunking_strategy, ChunkingStrategy): raise ValueError("Unsupported extraction strategy")
raise ValueError("Unsupported chunking strategy") if not isinstance(chunking_strategy, ChunkingStrategy):
raise ValueError("Unsupported chunking strategy")
# if word_count_threshold < MIN_WORD_THRESHOLD:
# word_count_threshold = MIN_WORD_THRESHOLD
word_count_threshold = max(word_count_threshold, 0) # if word_count_threshold < MIN_WORD_THRESHOLD:
# word_count_threshold = MIN_WORD_THRESHOLD
word_count_threshold = max(word_count_threshold, 0)
# Check cache first # Check cache first
cached = None cached = None
screenshot_data = None screenshot_data = None
extracted_content = None extracted_content = None
if not bypass_cache and not self.always_by_pass_cache: if not bypass_cache and not self.always_by_pass_cache:
cached = get_cached_url(url) cached = get_cached_url(url)
if kwargs.get("warmup", True) and not self.ready: if kwargs.get("warmup", True) and not self.ready:
return None return None
if cached: if cached:
html = cached[1] html = cached[1]
extracted_content = cached[4] extracted_content = cached[4]
if screenshot: if screenshot:
screenshot_data = cached[9] screenshot_data = cached[9]
if not screenshot_data: if not screenshot_data:
cached = None cached = None
if not cached or not html: if not cached or not html:
if user_agent: if user_agent:
self.crawler_strategy.update_user_agent(user_agent) self.crawler_strategy.update_user_agent(user_agent)
t1 = time.time() t1 = time.time()
html = self.crawler_strategy.crawl(url) html = self.crawler_strategy.crawl(url)
t2 = time.time() t2 = time.time()
if verbose: if verbose:
print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds") print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds")
if screenshot: if screenshot:
screenshot_data = self.crawler_strategy.take_screenshot() screenshot_data = self.crawler_strategy.take_screenshot()
return self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot_data, verbose, bool(cached), **kwargs) crawl_result = self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot_data, verbose, bool(cached), **kwargs)
crawl_result.success = bool(html)
return crawl_result
except Exception as e:
if not hasattr(e, "msg"):
e.msg = str(e)
print(f"[ERROR] 🚫 Failed to crawl {url}, error: {e.msg}")
return CrawlResult(url=url, html="", success=False, error_message=e.msg)
def process_html( def process_html(
self, self,