Refactor crawler_strategy.py to handle exceptions and improve error messages
This commit is contained in:
@@ -5,7 +5,7 @@ from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.common.exceptions import InvalidArgumentException
|
||||
from selenium.common.exceptions import InvalidArgumentException, WebDriverException
|
||||
from selenium.webdriver.chrome.service import Service as ChromeService
|
||||
from webdriver_manager.chrome import ChromeDriverManager
|
||||
|
||||
@@ -220,9 +220,18 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
||||
|
||||
return html
|
||||
except InvalidArgumentException:
|
||||
raise InvalidArgumentException(f"Invalid URL {url}")
|
||||
if not hasattr(e, 'msg'):
|
||||
e.msg = str(e)
|
||||
raise InvalidArgumentException(f"Failed to crawl {url}: {e.msg}")
|
||||
except WebDriverException as e:
|
||||
# If e does not have a msg attribute, create it and set it to str(e)
|
||||
if not hasattr(e, 'msg'):
|
||||
e.msg = str(e)
|
||||
raise WebDriverException(f"Failed to crawl {url}: {e.msg}")
|
||||
except Exception as e:
|
||||
raise Exception(f"Failed to crawl {url}: {str(e)}")
|
||||
if not hasattr(e, 'msg'):
|
||||
e.msg = str(e)
|
||||
raise Exception(f"Failed to crawl {url}: {e.msg}")
|
||||
|
||||
def take_screenshot(self) -> str:
|
||||
try:
|
||||
|
||||
@@ -129,49 +129,57 @@ class WebCrawler:
|
||||
verbose=True,
|
||||
**kwargs,
|
||||
) -> CrawlResult:
|
||||
extraction_strategy = extraction_strategy or NoExtractionStrategy()
|
||||
extraction_strategy.verbose = verbose
|
||||
if not isinstance(extraction_strategy, ExtractionStrategy):
|
||||
raise ValueError("Unsupported extraction strategy")
|
||||
if not isinstance(chunking_strategy, ChunkingStrategy):
|
||||
raise ValueError("Unsupported chunking strategy")
|
||||
|
||||
# if word_count_threshold < MIN_WORD_THRESHOLD:
|
||||
# word_count_threshold = MIN_WORD_THRESHOLD
|
||||
try:
|
||||
extraction_strategy = extraction_strategy or NoExtractionStrategy()
|
||||
extraction_strategy.verbose = verbose
|
||||
if not isinstance(extraction_strategy, ExtractionStrategy):
|
||||
raise ValueError("Unsupported extraction strategy")
|
||||
if not isinstance(chunking_strategy, ChunkingStrategy):
|
||||
raise ValueError("Unsupported chunking strategy")
|
||||
|
||||
word_count_threshold = max(word_count_threshold, 0)
|
||||
# if word_count_threshold < MIN_WORD_THRESHOLD:
|
||||
# word_count_threshold = MIN_WORD_THRESHOLD
|
||||
|
||||
word_count_threshold = max(word_count_threshold, 0)
|
||||
|
||||
# Check cache first
|
||||
cached = None
|
||||
screenshot_data = None
|
||||
extracted_content = None
|
||||
if not bypass_cache and not self.always_by_pass_cache:
|
||||
cached = get_cached_url(url)
|
||||
|
||||
if kwargs.get("warmup", True) and not self.ready:
|
||||
return None
|
||||
|
||||
if cached:
|
||||
html = cached[1]
|
||||
extracted_content = cached[4]
|
||||
if screenshot:
|
||||
screenshot_data = cached[9]
|
||||
if not screenshot_data:
|
||||
cached = None
|
||||
|
||||
if not cached or not html:
|
||||
if user_agent:
|
||||
self.crawler_strategy.update_user_agent(user_agent)
|
||||
t1 = time.time()
|
||||
html = self.crawler_strategy.crawl(url)
|
||||
t2 = time.time()
|
||||
if verbose:
|
||||
print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds")
|
||||
if screenshot:
|
||||
screenshot_data = self.crawler_strategy.take_screenshot()
|
||||
# Check cache first
|
||||
cached = None
|
||||
screenshot_data = None
|
||||
extracted_content = None
|
||||
if not bypass_cache and not self.always_by_pass_cache:
|
||||
cached = get_cached_url(url)
|
||||
|
||||
if kwargs.get("warmup", True) and not self.ready:
|
||||
return None
|
||||
|
||||
if cached:
|
||||
html = cached[1]
|
||||
extracted_content = cached[4]
|
||||
if screenshot:
|
||||
screenshot_data = cached[9]
|
||||
if not screenshot_data:
|
||||
cached = None
|
||||
|
||||
if not cached or not html:
|
||||
if user_agent:
|
||||
self.crawler_strategy.update_user_agent(user_agent)
|
||||
t1 = time.time()
|
||||
html = self.crawler_strategy.crawl(url)
|
||||
t2 = time.time()
|
||||
if verbose:
|
||||
print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds")
|
||||
if screenshot:
|
||||
screenshot_data = self.crawler_strategy.take_screenshot()
|
||||
|
||||
|
||||
return self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot_data, verbose, bool(cached), **kwargs)
|
||||
|
||||
crawl_result = self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot_data, verbose, bool(cached), **kwargs)
|
||||
crawl_result.success = bool(html)
|
||||
return crawl_result
|
||||
except Exception as e:
|
||||
if not hasattr(e, "msg"):
|
||||
e.msg = str(e)
|
||||
print(f"[ERROR] 🚫 Failed to crawl {url}, error: {e.msg}")
|
||||
return CrawlResult(url=url, html="", success=False, error_message=e.msg)
|
||||
|
||||
def process_html(
|
||||
self,
|
||||
|
||||
Reference in New Issue
Block a user