fix(pdf): add timeout to PDF downloads to prevent hanging (#1141)
- Added timeout=(20, 600) to requests.get() to prevent indefinite hanging
- Added download progress logging for better visibility
- Improved error handling with specific timeout exceptions
- Better temp file cleanup tracking

Fixes #1141
This commit is contained in:
@@ -14,7 +14,7 @@ class PDFCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
|
async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
|
||||||
# Just pass through with empty HTML - scraper will handle actual processing
|
# Just pass through with empty HTML - scraper will handle actual processing
|
||||||
return AsyncCrawlResponse(
|
return AsyncCrawlResponse(
|
||||||
html="", # Scraper will handle the real work
|
html="Scraper will handle the real work", # Scraper will handle the real work
|
||||||
response_headers={"Content-Type": "application/pdf"},
|
response_headers={"Content-Type": "application/pdf"},
|
||||||
status_code=200
|
status_code=200
|
||||||
)
|
)
|
||||||
@@ -66,6 +66,7 @@ class PDFContentScrapingStrategy(ContentScrapingStrategy):
|
|||||||
image_save_dir=image_save_dir,
|
image_save_dir=image_save_dir,
|
||||||
batch_size=batch_size
|
batch_size=batch_size
|
||||||
)
|
)
|
||||||
|
self._temp_files = [] # Track temp files for cleanup
|
||||||
|
|
||||||
def scrap(self, url: str, html: str, **params) -> ScrapingResult:
|
def scrap(self, url: str, html: str, **params) -> ScrapingResult:
|
||||||
"""
|
"""
|
||||||
@@ -124,7 +125,13 @@ class PDFContentScrapingStrategy(ContentScrapingStrategy):
|
|||||||
finally:
|
finally:
|
||||||
# Cleanup temp file if downloaded
|
# Cleanup temp file if downloaded
|
||||||
if url.startswith(("http://", "https://")):
|
if url.startswith(("http://", "https://")):
|
||||||
Path(pdf_path).unlink(missing_ok=True)
|
try:
|
||||||
|
Path(pdf_path).unlink(missing_ok=True)
|
||||||
|
if pdf_path in self._temp_files:
|
||||||
|
self._temp_files.remove(pdf_path)
|
||||||
|
except Exception as e:
|
||||||
|
if self.logger:
|
||||||
|
self.logger.warning(f"Failed to cleanup temp file {pdf_path}: {e}")
|
||||||
|
|
||||||
async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
|
async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
|
||||||
# For simple cases, you can use the sync version
|
# For simple cases, you can use the sync version
|
||||||
@@ -138,22 +145,45 @@ class PDFContentScrapingStrategy(ContentScrapingStrategy):
|
|||||||
|
|
||||||
# Create temp file with .pdf extension
|
# Create temp file with .pdf extension
|
||||||
temp_file = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
|
temp_file = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
|
||||||
|
self._temp_files.append(temp_file.name)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Download PDF with streaming
|
if self.logger:
|
||||||
response = requests.get(url, stream=True)
|
self.logger.info(f"Downloading PDF from {url}...")
|
||||||
|
|
||||||
|
# Download PDF with streaming and timeout
|
||||||
|
# Connection timeout: 20s, Read timeout: 600s (10 minutes for large PDFs)
|
||||||
|
response = requests.get(url, stream=True, timeout=(20, 60 * 10))
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
|
|
||||||
|
# Get file size if available
|
||||||
|
total_size = int(response.headers.get('content-length', 0))
|
||||||
|
downloaded = 0
|
||||||
|
|
||||||
# Write to temp file
|
# Write to temp file
|
||||||
with open(temp_file.name, 'wb') as f:
|
with open(temp_file.name, 'wb') as f:
|
||||||
for chunk in response.iter_content(chunk_size=8192):
|
for chunk in response.iter_content(chunk_size=8192):
|
||||||
f.write(chunk)
|
f.write(chunk)
|
||||||
|
downloaded += len(chunk)
|
||||||
|
if self.logger and total_size > 0:
|
||||||
|
progress = (downloaded / total_size) * 100
|
||||||
|
if progress % 10 < 0.1: # Log every 10%
|
||||||
|
self.logger.debug(f"PDF download progress: {progress:.0f}%")
|
||||||
|
|
||||||
|
if self.logger:
|
||||||
|
self.logger.info(f"PDF downloaded successfully: {temp_file.name}")
|
||||||
|
|
||||||
return temp_file.name
|
return temp_file.name
|
||||||
|
|
||||||
|
except requests.exceptions.Timeout as e:
|
||||||
|
# Clean up temp file if download fails
|
||||||
|
Path(temp_file.name).unlink(missing_ok=True)
|
||||||
|
self._temp_files.remove(temp_file.name)
|
||||||
|
raise RuntimeError(f"Timeout downloading PDF from {url}: {str(e)}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# Clean up temp file if download fails
|
# Clean up temp file if download fails
|
||||||
Path(temp_file.name).unlink(missing_ok=True)
|
Path(temp_file.name).unlink(missing_ok=True)
|
||||||
|
self._temp_files.remove(temp_file.name)
|
||||||
raise RuntimeError(f"Failed to download PDF from {url}: {str(e)}")
|
raise RuntimeError(f"Failed to download PDF from {url}: {str(e)}")
|
||||||
|
|
||||||
elif url.startswith("file://"):
|
elif url.startswith("file://"):
|
||||||
|
|||||||
Reference in New Issue
Block a user