refactor(pdf): improve PDF processor dependency handling

Make PyPDF2 an optional dependency and improve import handling in PDF processor. Move imports inside methods to allow for lazy loading and better error handling. Add new 'pdf' optional dependency group in pyproject.toml. Clean up unused imports and remove deprecated files. BREAKING CHANGE: PyPDF2 is now an optional dependency. Users need to install with 'pip install crawl4ai[pdf]' to use PDF processing features.
2025-02-25 22:27:55 +08:00
parent 71ce01c9e1
commit 4bcd4cbda1
8 changed files with 67 additions and 388 deletions
--- a/crawl4ai/processors/pdf/init.py
+++ b/crawl4ai/processors/pdf/init.py
@@ -1,8 +1,6 @@
-from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Dict, List, Optional
+import asyncio
 from dataclasses import asdict
-
 from crawl4ai.async_logger import AsyncLogger
 from crawl4ai.async_crawler_strategy import AsyncCrawlerStrategy
 from crawl4ai.models import AsyncCrawlResponse, ScrapingResult