From ad078c3f18347be5e4d6c703413d585f6b74ce25 Mon Sep 17 00:00:00 2001
From: unclecode
Date: Fri, 23 May 2025 16:05:44 +0800
Subject: [PATCH] fix(pdf): add timeout to PDF downloads to prevent hanging
 (#1141)

- Added timeout=(20, 600) to requests.get() to prevent indefinite hanging
- Added download progress logging for better visibility
- Improved error handling with specific timeout exceptions
- Better temp file cleanup tracking

Fixes #1141
---
 crawl4ai/processors/pdf/__init__.py | 38 ++++++++++++++++++++++++++--
 1 file changed, 34 insertions(+), 4 deletions(-)

diff --git a/crawl4ai/processors/pdf/__init__.py b/crawl4ai/processors/pdf/__init__.py
index 947641cb..a6627f13 100644
--- a/crawl4ai/processors/pdf/__init__.py
+++ b/crawl4ai/processors/pdf/__init__.py
@@ -14,7 +14,7 @@ class PDFCrawlerStrategy(AsyncCrawlerStrategy):
     async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
         # Just pass through with empty HTML - scraper will handle actual processing
         return AsyncCrawlResponse(
-            html="",  # Scraper will handle the real work
+            html="Scraper will handle the real work",  # Scraper will handle the real work
             response_headers={"Content-Type": "application/pdf"},
             status_code=200
         )
@@ -66,6 +66,7 @@ class PDFContentScrapingStrategy(ContentScrapingStrategy):
             image_save_dir=image_save_dir,
             batch_size=batch_size
         )
+        self._temp_files = []  # Track temp files for cleanup
 
     def scrap(self, url: str, html: str, **params) -> ScrapingResult:
         """
@@ -124,7 +125,13 @@ class PDFContentScrapingStrategy(ContentScrapingStrategy):
         finally:
             # Cleanup temp file if downloaded
             if url.startswith(("http://", "https://")):
-                Path(pdf_path).unlink(missing_ok=True)
+                try:
+                    Path(pdf_path).unlink(missing_ok=True)
+                    if pdf_path in self._temp_files:
+                        self._temp_files.remove(pdf_path)
+                except Exception as e:
+                    if self.logger:
+                        self.logger.warning(f"Failed to cleanup temp file {pdf_path}: {e}")
 
     async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
         # For simple cases, you can use the sync version
@@ -138,22 +145,45 @@ class PDFContentScrapingStrategy(ContentScrapingStrategy):
             # Create temp file with .pdf extension
             temp_file = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
+            self._temp_files.append(temp_file.name)
 
             try:
-                # Download PDF with streaming
-                response = requests.get(url, stream=True)
+                if self.logger:
+                    self.logger.info(f"Downloading PDF from {url}...")
+
+                # Download PDF with streaming and timeout
+                # Connection timeout: 20s, Read timeout: 600s (10 minutes for large PDFs)
+                response = requests.get(url, stream=True, timeout=(20, 60 * 10))
                 response.raise_for_status()
 
+                # Get file size if available
+                total_size = int(response.headers.get('content-length', 0))
+                downloaded = 0
+
                 # Write to temp file
                 with open(temp_file.name, 'wb') as f:
                     for chunk in response.iter_content(chunk_size=8192):
                         f.write(chunk)
+                        downloaded += len(chunk)
+                        if self.logger and total_size > 0:
+                            progress = (downloaded / total_size) * 100
+                            if progress % 10 < 0.1:  # Log every 10%
+                                self.logger.debug(f"PDF download progress: {progress:.0f}%")
+
+                if self.logger:
+                    self.logger.info(f"PDF downloaded successfully: {temp_file.name}")
 
                 return temp_file.name
+            except requests.exceptions.Timeout as e:
+                # Clean up temp file if download fails
+                Path(temp_file.name).unlink(missing_ok=True)
+                self._temp_files.remove(temp_file.name)
+                raise RuntimeError(f"Timeout downloading PDF from {url}: {str(e)}")
             except Exception as e:
                 # Clean up temp file if download fails
                 Path(temp_file.name).unlink(missing_ok=True)
+                self._temp_files.remove(temp_file.name)
                 raise RuntimeError(f"Failed to download PDF from {url}: {str(e)}")
 
         elif url.startswith("file://"):