Refactor crawler_strategy.py to handle exceptions and improve error messages
This commit is contained in:
@@ -5,7 +5,7 @@ from selenium.webdriver.common.by import By
|
|||||||
from selenium.webdriver.support.ui import WebDriverWait
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
from selenium.webdriver.support import expected_conditions as EC
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
from selenium.webdriver.chrome.options import Options
|
from selenium.webdriver.chrome.options import Options
|
||||||
from selenium.common.exceptions import InvalidArgumentException
|
from selenium.common.exceptions import InvalidArgumentException, WebDriverException
|
||||||
from selenium.webdriver.chrome.service import Service as ChromeService
|
from selenium.webdriver.chrome.service import Service as ChromeService
|
||||||
from webdriver_manager.chrome import ChromeDriverManager
|
from webdriver_manager.chrome import ChromeDriverManager
|
||||||
|
|
||||||
@@ -220,9 +220,18 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
|
|
||||||
return html
|
return html
|
||||||
except InvalidArgumentException:
|
except InvalidArgumentException:
|
||||||
raise InvalidArgumentException(f"Invalid URL {url}")
|
if not hasattr(e, 'msg'):
|
||||||
|
e.msg = str(e)
|
||||||
|
raise InvalidArgumentException(f"Failed to crawl {url}: {e.msg}")
|
||||||
|
except WebDriverException as e:
|
||||||
|
# If e does not have msg attribute create it and set it to str(e)
|
||||||
|
if not hasattr(e, 'msg'):
|
||||||
|
e.msg = str(e)
|
||||||
|
raise WebDriverException(f"Failed to crawl {url}: {e.msg}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise Exception(f"Failed to crawl {url}: {str(e)}")
|
if not hasattr(e, 'msg'):
|
||||||
|
e.msg = str(e)
|
||||||
|
raise Exception(f"Failed to crawl {url}: {e.msg}")
|
||||||
|
|
||||||
def take_screenshot(self) -> str:
|
def take_screenshot(self) -> str:
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -129,6 +129,7 @@ class WebCrawler:
|
|||||||
verbose=True,
|
verbose=True,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> CrawlResult:
|
) -> CrawlResult:
|
||||||
|
try:
|
||||||
extraction_strategy = extraction_strategy or NoExtractionStrategy()
|
extraction_strategy = extraction_strategy or NoExtractionStrategy()
|
||||||
extraction_strategy.verbose = verbose
|
extraction_strategy.verbose = verbose
|
||||||
if not isinstance(extraction_strategy, ExtractionStrategy):
|
if not isinstance(extraction_strategy, ExtractionStrategy):
|
||||||
@@ -171,7 +172,14 @@ class WebCrawler:
|
|||||||
screenshot_data = self.crawler_strategy.take_screenshot()
|
screenshot_data = self.crawler_strategy.take_screenshot()
|
||||||
|
|
||||||
|
|
||||||
return self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot_data, verbose, bool(cached), **kwargs)
|
crawl_result = self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot_data, verbose, bool(cached), **kwargs)
|
||||||
|
crawl_result.success = bool(html)
|
||||||
|
return crawl_result
|
||||||
|
except Exception as e:
|
||||||
|
if not hasattr(e, "msg"):
|
||||||
|
e.msg = str(e)
|
||||||
|
print(f"[ERROR] 🚫 Failed to crawl {url}, error: {e.msg}")
|
||||||
|
return CrawlResult(url=url, html="", success=False, error_message=e.msg)
|
||||||
|
|
||||||
def process_html(
|
def process_html(
|
||||||
self,
|
self,
|
||||||
|
|||||||
Reference in New Issue
Block a user