Enhance crawler capabilities and documentation

- Added llm.txt generator.
- Added SSL certificate extraction in AsyncWebCrawler.
- Introduced new content filters and chunking strategies for more robust data extraction.
- Updated documentation.
2024-12-25 21:34:31 +08:00
parent 84b311760f
commit d5ed451299
59 changed files with 2208 additions and 1763 deletions
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -23,11 +23,7 @@ from .config import SCREENSHOT_HEIGHT_TRESHOLD, DOWNLOAD_PAGE_TIMEOUT
 from .async_configs import BrowserConfig, CrawlerRunConfig
 from .async_logger import AsyncLogger
 from playwright_stealth import StealthConfig, stealth_async
-
-
-from io import BytesIO
-import base64
-from PIL import Image, ImageDraw, ImageFont
+from .utilities.ssl_utils import get_ssl_certificate

 stealth_config = StealthConfig(
    webdriver=True,
@@ -566,18 +562,6 @@ class AsyncCrawlerStrategy(ABC):
    async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]:
        pass

-    @abstractmethod
-    async def take_screenshot(self, **kwargs) -> str:
-        pass
-
-    @abstractmethod
-    def update_user_agent(self, user_agent: str):
-        pass
-
-    @abstractmethod
-    def set_hook(self, hook_type: str, hook: Callable):
-        pass
-

 class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
    def __init__(
@@ -928,6 +912,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            page.on("pageerror", lambda e: log_consol(e, "error"))

        try:
+            # Get SSL certificate information if requested and URL is HTTPS
+            ssl_certificate = None
+            if config.fetch_ssl_certificate and url.startswith('https://'):
+                ssl_certificate = get_ssl_certificate(url)
+
            # Set up download handling
            if self.browser_config.accept_downloads:
                page.on(
@@ -1155,6 +1144,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                screenshot=screenshot_data,
                pdf_data=pdf_data,
                get_delayed_content=get_delayed_content,
+                ssl_certificate=ssl_certificate,
                downloaded_files=(
                    self._downloaded_files if self._downloaded_files else None
                ),