Refactor crawler_strategy.py to handle exceptions and improve error messages

2024-06-26 15:04:33 +08:00
parent 7ba2142363
commit 4756d0a532
2 changed files with 60 additions and 43 deletions
--- a/crawl4ai/crawler_strategy.py
+++ b/crawl4ai/crawler_strategy.py
@@ -5,7 +5,7 @@ from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.chrome.options import Options
-from selenium.common.exceptions import InvalidArgumentException
+from selenium.common.exceptions import InvalidArgumentException, WebDriverException
 from selenium.webdriver.chrome.service import Service as ChromeService
 from webdriver_manager.chrome import ChromeDriverManager
@@ -220,9 +220,18 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
            return html
        except InvalidArgumentException:
-            raise InvalidArgumentException(f"Invalid URL {url}")
+            if not hasattr(e, 'msg'):
                e.msg = str(e)
            raise InvalidArgumentException(f"Failed to crawl {url}: {e.msg}")
        except WebDriverException as e:
            # If e does not have a msg attribute, create it and set it to str(e)
            if not hasattr(e, 'msg'):
                e.msg = str(e)
            raise WebDriverException(f"Failed to crawl {url}: {e.msg}")  
        except Exception as e:
-            raise Exception(f"Failed to crawl {url}: {str(e)}")
+            if not hasattr(e, 'msg'):
                e.msg = str(e)
            raise Exception(f"Failed to crawl {url}: {e.msg}")
    def take_screenshot(self) -> str:
        try:
--- a/crawl4ai/web_crawler.py
+++ b/crawl4ai/web_crawler.py
@@ -129,49 +129,57 @@ class WebCrawler:
            verbose=True,
            **kwargs,
        ) -> CrawlResult:
-            extraction_strategy = extraction_strategy or NoExtractionStrategy()
+            try:
-            extraction_strategy.verbose = verbose
+                extraction_strategy = extraction_strategy or NoExtractionStrategy()
-            if not isinstance(extraction_strategy, ExtractionStrategy):
+                extraction_strategy.verbose = verbose
-                raise ValueError("Unsupported extraction strategy")
+                if not isinstance(extraction_strategy, ExtractionStrategy):
-            if not isinstance(chunking_strategy, ChunkingStrategy):
+                    raise ValueError("Unsupported extraction strategy")
-                raise ValueError("Unsupported chunking strategy")
+                if not isinstance(chunking_strategy, ChunkingStrategy):
-            
+                    raise ValueError("Unsupported chunking strategy")
            # if word_count_threshold < MIN_WORD_THRESHOLD:
            #     word_count_threshold = MIN_WORD_THRESHOLD
-            word_count_threshold = max(word_count_threshold, 0)
+                # if word_count_threshold < MIN_WORD_THRESHOLD:
                #     word_count_threshold = MIN_WORD_THRESHOLD
                word_count_threshold = max(word_count_threshold, 0)
-            # Check cache first
+                # Check cache first
-            cached = None
+                cached = None
-            screenshot_data = None
+                screenshot_data = None
-            extracted_content = None
+                extracted_content = None
-            if not bypass_cache and not self.always_by_pass_cache:
+                if not bypass_cache and not self.always_by_pass_cache:
-                cached = get_cached_url(url)
+                    cached = get_cached_url(url)
-            
+                
-            if kwargs.get("warmup", True) and not self.ready:
+                if kwargs.get("warmup", True) and not self.ready:
-                return None
+                    return None
-            
+                
-            if cached:
+                if cached:
-                html = cached[1]
+                    html = cached[1]
-                extracted_content = cached[4]
+                    extracted_content = cached[4]
-                if screenshot:
+                    if screenshot:
-                    screenshot_data = cached[9]
+                        screenshot_data = cached[9]
-                    if not screenshot_data:
+                        if not screenshot_data:
-                        cached = None
+                            cached = None
-            
+                
-            if not cached or not html:
+                if not cached or not html:
-                if user_agent:
+                    if user_agent:
-                    self.crawler_strategy.update_user_agent(user_agent)
+                        self.crawler_strategy.update_user_agent(user_agent)
-                t1 = time.time()
+                    t1 = time.time()
-                html = self.crawler_strategy.crawl(url)
+                    html = self.crawler_strategy.crawl(url)
-                t2 = time.time()
+                    t2 = time.time()
-                if verbose:
+                    if verbose:
-                    print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds")
+                        print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds")
-                if screenshot:
+                    if screenshot:
-                    screenshot_data = self.crawler_strategy.take_screenshot()
+                        screenshot_data = self.crawler_strategy.take_screenshot()
-            
+                
-            return self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot_data, verbose, bool(cached), **kwargs)
+                crawl_result = self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot_data, verbose, bool(cached), **kwargs)
                crawl_result.success = bool(html)
                return crawl_result
            except Exception as e:
                if not hasattr(e, "msg"):
                    e.msg = str(e)
                print(f"[ERROR] 🚫 Failed to crawl {url}, error: {e.msg}")    
                return CrawlResult(url=url, html="", success=False, error_message=e.msg)
    def process_html(
            self,