feat(v0.3.6): Add screenshot capture, delayed content, and custom timeouts

- Implement screenshot capture functionality
- Add delayed content retrieval method
- Introduce custom page timeout parameter
- Enhance LLM support with multiple providers
- Improve database schema auto-updates
- Optimize image processing in WebScrappingStrategy
- Update error handling and logging
- Expand examples in quickstart_async.py
2024-10-12 13:42:42 +08:00
parent b99d20b725
commit ff3524d9b1
8 changed files with 127 additions and 22 deletions
--- a/crawl4ai/__init__.py
+++ b/crawl4ai/__init__.py
@@ -3,7 +3,7 @@
 from .async_webcrawler import AsyncWebCrawler
 from .models import CrawlResult

-__version__ = "0.3.5"
+__version__ = "0.3.6"

 __all__ = [
    "AsyncWebCrawler",
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -1,7 +1,7 @@
 import asyncio
 import base64, time
 from abc import ABC, abstractmethod
-from typing import Callable, Dict, Any, List, Optional
+from typing import Callable, Dict, Any, List, Optional, Awaitable
 import os
 from playwright.async_api import async_playwright, Page, Browser, Error
 from io import BytesIO
@@ -18,6 +18,10 @@ class AsyncCrawlResponse(BaseModel):
    response_headers: Dict[str, str]
    status_code: int
    screenshot: Optional[str] = None
+    get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None
+
+    class Config:
+        arbitrary_types_allowed = True

 class AsyncCrawlerStrategy(ABC):
    @abstractmethod
@@ -248,7 +252,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):

            if not kwargs.get("js_only", False):
                await self.execute_hook('before_goto', page)
-                response = await page.goto(url, wait_until="domcontentloaded", timeout=60000)
+                response = await page.goto(url, wait_until="domcontentloaded", timeout=kwargs.get("page_timeout", 60000))
                await self.execute_hook('after_goto', page)
                
                # Get status code and headers
@@ -295,6 +299,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                except Exception as e:
                    raise RuntimeError(f"Wait condition failed: {str(e)}")

+            # Check if kwargs has screenshot=True then take screenshot
+            screenshot_data = None
+            if kwargs.get("screenshot"):
+                screenshot_data = await self.take_screenshot(url)
+            
            html = await page.content()
            page = await self.execute_hook('before_return_html', page, html)

@@ -312,7 +321,20 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                        "status_code": status_code
                    }, f)

-            response = AsyncCrawlResponse(html=html, response_headers=response_headers, status_code=status_code)
+            
+            async def get_delayed_content(delay: float = 5.0) -> str:
+                if self.verbose:
+                    print(f"[LOG] Waiting for {delay} seconds before retrieving content for {url}")
+                await asyncio.sleep(delay)
+                return await page.content()
+                
+            response = AsyncCrawlResponse(
+                html=html, 
+                response_headers=response_headers, 
+                status_code=status_code,
+                screenshot=screenshot_data,
+                get_delayed_content=get_delayed_content
+            )
            return response
        except Error as e:
            raise Error(f"Failed to crawl {url}: {str(e)}")
@@ -383,11 +405,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
        results = await asyncio.gather(*tasks, return_exceptions=True)
        return [result if not isinstance(result, Exception) else str(result) for result in results]

-    async def take_screenshot(self, url: str) -> str:
+    async def take_screenshot(self, url: str, wait_time = 1000) -> str:
        async with await self.browser.new_context(user_agent=self.user_agent) as context:
            page = await context.new_page()
            try:
-                await page.goto(url, wait_until="domcontentloaded")
+                await page.goto(url, wait_until="domcontentloaded", timeout=30000)
+                # Wait for a specified time (default is 1 second)
+                await page.wait_for_timeout(wait_time)
                screenshot = await page.screenshot(full_page=True)
                return base64.b64encode(screenshot).decode('utf-8')
            except Exception as e:
--- a/crawl4ai/async_database.py
+++ b/crawl4ai/async_database.py
@@ -29,14 +29,31 @@ class AsyncDatabaseManager:
                )
            ''')
            await db.commit()
+        await self.update_db_schema()

-    async def aalter_db_add_screenshot(self, new_column: str = "media"):
+    async def update_db_schema(self):
+        async with aiosqlite.connect(self.db_path) as db:
+            # Check if the 'media' column exists
+            cursor = await db.execute("PRAGMA table_info(crawled_data)")
+            columns = await cursor.fetchall()
+            column_names = [column[1] for column in columns]
+            
+            if 'media' not in column_names:
+                await self.aalter_db_add_column('media')
+            
+            # Check for other missing columns and add them if necessary
+            for column in ['links', 'metadata', 'screenshot']:
+                if column not in column_names:
+                    await self.aalter_db_add_column(column)
+
+    async def aalter_db_add_column(self, new_column: str):
        try:
            async with aiosqlite.connect(self.db_path) as db:
                await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
                await db.commit()
+            print(f"Added column '{new_column}' to the database.")
        except Exception as e:
-            print(f"Error altering database to add screenshot column: {e}")
+            print(f"Error altering database to add {new_column} column: {e}")

    async def aget_cached_url(self, url: str) -> Optional[Tuple[str, str, str, str, str, str, str, bool, str]]:
        try:
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -202,11 +202,11 @@ class AsyncWebCrawler:
                )

            if result is None:
-                raise ValueError(f"Failed to extract content from the website: {url}")
+                raise ValueError(f"Process HTML, Failed to extract content from the website: {url}")
        except InvalidCSSSelectorError as e:
            raise ValueError(str(e))
        except Exception as e:
-            raise ValueError(f"Failed to extract content from the website: {url}, error: {str(e)}")
+            raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}")

        cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
        markdown = sanitize_input_encode(result.get("markdown", ""))
--- a/crawl4ai/content_scrapping_strategy.py
+++ b/crawl4ai/content_scrapping_strategy.py
@@ -170,10 +170,12 @@ class WebScrappingStrategy(ContentScrappingStrategy):
                    if isinstance(element, Comment):
                        element.extract()
                    return False
+                
+                # if element.name == 'img':
+                #     process_image(element, url, 0, 1)
+                #     return True

                if element.name in ['script', 'style', 'link', 'meta', 'noscript']:
-                    if element.name == 'img':
-                        process_image(element, url, 0, 1)
                    element.decompose()
                    return False