feat(v0.3.6): Add screenshot capture, delayed content, and custom timeouts
- Implement screenshot capture functionality
- Add delayed content retrieval method
- Introduce custom page timeout parameter
- Enhance LLM support with multiple providers
- Improve database schema auto-updates
- Optimize image processing in WebScrappingStrategy
- Update error handling and logging
- Expand examples in quickstart_async.py
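Taken together, the new options surface as keyword arguments to `AsyncWebCrawler.arun`. A minimal sketch of the intended call shape, based on the diffs below and assuming `arun` forwards extra keyword arguments to the crawler strategy (the URL and timeout value are illustrative):

import asyncio
from crawl4ai import AsyncWebCrawler

async def demo():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://example.com",   # illustrative URL
            screenshot=True,             # new: base64 screenshot in result.screenshot
            page_timeout=90000,          # new: per-request page.goto timeout in ms
            bypass_cache=True,
        )
        if result.success and result.screenshot:
            print(f"Captured screenshot ({len(result.screenshot)} base64 chars)")

asyncio.run(demo())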
.gitignore (vendored, 4 changes)
@@ -201,4 +201,6 @@ test_env/
 todo.md
 git_changes.py
 git_changes.md
 pypi_build.sh
+
+.tests/
CHANGELOG.md (33 changes)
@@ -1,5 +1,38 @@
 # Changelog
 
+## [0.3.6] - 2024-10-12
+
+### Added
+- New `.tests/` directory added to `.gitignore`
+- Screenshot functionality:
+  - Added `screenshot` column to the database schema
+  - Implemented `take_screenshot` method in `AsyncPlaywrightCrawlerStrategy`
+  - Added option to capture screenshots when crawling
+- Delayed content retrieval:
+  - New `get_delayed_content` method in `AsyncCrawlResponse`
+- Database schema updates:
+  - Auto-update mechanism for the database schema
+  - New columns: `media`, `links`, `metadata`, `screenshot`
+- LLM extraction examples in `quickstart_async.py`:
+  - Support for OpenAI, Hugging Face, and Ollama models
+
+### Changed
+- Updated version number to 0.3.6 in `__init__.py`
+- Improved error handling and logging in various components
+- Enhanced `WebScrappingStrategy` to handle image processing more efficiently
+- Modified `AsyncPlaywrightCrawlerStrategy` to support custom timeout values
+
+### Fixed
+- Adjusted image processing in `WebScrappingStrategy` to prevent premature decomposition of `img` tags
+
+### Removed
+- Removed `pypi_build.sh` from version control (added to `.gitignore`)
+
+### Developer Notes
+- Added examples for using different LLM providers in `quickstart_async.py`
+- Improved error messages for better debugging
+- Enhanced type hinting throughout the codebase
+
 ## [v0.3.5] - 2024-09-02
 
 Enhance AsyncWebCrawler with smart waiting and screenshot capabilities
__init__.py
@@ -3,7 +3,7 @@
 from .async_webcrawler import AsyncWebCrawler
 from .models import CrawlResult
 
-__version__ = "0.3.5"
+__version__ = "0.3.6"
 
 __all__ = [
     "AsyncWebCrawler",
@@ -1,7 +1,7 @@
 import asyncio
 import base64, time
 from abc import ABC, abstractmethod
-from typing import Callable, Dict, Any, List, Optional
+from typing import Callable, Dict, Any, List, Optional, Awaitable
 import os
 from playwright.async_api import async_playwright, Page, Browser, Error
 from io import BytesIO
@@ -18,6 +18,10 @@ class AsyncCrawlResponse(BaseModel):
     response_headers: Dict[str, str]
     status_code: int
     screenshot: Optional[str] = None
+    get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None
+
+    class Config:
+        arbitrary_types_allowed = True
 
 class AsyncCrawlerStrategy(ABC):
     @abstractmethod
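The new field stores an async closure rather than plain data, which is why the pydantic v1-style `Config.arbitrary_types_allowed` flag is set alongside it. A self-contained sketch of how the model accepts such a closure (the `_reread` helper is illustrative, not project code):

import asyncio
from typing import Awaitable, Callable, Dict, Optional
from pydantic import BaseModel

class AsyncCrawlResponse(BaseModel):
    html: str
    response_headers: Dict[str, str]
    status_code: int
    screenshot: Optional[str] = None
    get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None

    class Config:
        arbitrary_types_allowed = True

async def _reread(delay: Optional[float] = 5.0) -> str:
    await asyncio.sleep(delay or 0.0)  # stand-in for re-reading the live page
    return "<html>...</html>"

resp = AsyncCrawlResponse(html="<html/>", response_headers={}, status_code=200,
                          get_delayed_content=_reread)
print(asyncio.run(resp.get_delayed_content(0.1)))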
@@ -248,7 +252,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
 
         if not kwargs.get("js_only", False):
             await self.execute_hook('before_goto', page)
-            response = await page.goto(url, wait_until="domcontentloaded", timeout=60000)
+            response = await page.goto(url, wait_until="domcontentloaded", timeout=kwargs.get("page_timeout", 60000))
             await self.execute_hook('after_goto', page)
 
         # Get status code and headers
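Since the timeout now comes from `**kwargs` with a 60000 ms fallback, callers can raise it per request for slow pages. A hedged usage sketch, assuming `arun` passes its extra keyword arguments through to the strategy the same way the quickstart examples below do for `screenshot`:

import asyncio
from crawl4ai import AsyncWebCrawler

async def crawl_slow_page():
    async with AsyncWebCrawler() as crawler:
        # page_timeout is forwarded to page.goto; omit it to keep the 60000 ms default
        return await crawler.arun(url="https://example.com", page_timeout=120000)

asyncio.run(crawl_slow_page())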
@@ -295,6 +299,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
             except Exception as e:
                 raise RuntimeError(f"Wait condition failed: {str(e)}")
 
+        # Take a screenshot if requested via kwargs (screenshot=True)
+        screenshot_data = None
+        if kwargs.get("screenshot"):
+            screenshot_data = await self.take_screenshot(url)
+
         html = await page.content()
         page = await self.execute_hook('before_return_html', page, html)
@@ -312,7 +321,20 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                 "status_code": status_code
             }, f)
 
-        response = AsyncCrawlResponse(html=html, response_headers=response_headers, status_code=status_code)
+        async def get_delayed_content(delay: float = 5.0) -> str:
+            if self.verbose:
+                print(f"[LOG] Waiting for {delay} seconds before retrieving content for {url}")
+            await asyncio.sleep(delay)
+            return await page.content()
+
+        response = AsyncCrawlResponse(
+            html=html,
+            response_headers=response_headers,
+            status_code=status_code,
+            screenshot=screenshot_data,
+            get_delayed_content=get_delayed_content
+        )
        return response
     except Error as e:
         raise Error(f"Failed to crawl {url}: {str(e)}")
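A usage sketch for delayed retrieval, driving the strategy directly. The `crawl` entry point and the async context-manager protocol are assumed from the surrounding class, and the 5 s delay is illustrative:

import asyncio
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy

async def fetch_after_settle(url: str) -> str:
    async with AsyncPlaywrightCrawlerStrategy(verbose=True) as strategy:
        response = await strategy.crawl(url)
        if response.get_delayed_content:
            # Give client-side rendering another 5 s, then re-read the same page
            return await response.get_delayed_content(5.0)
        return response.html

print(asyncio.run(fetch_after_settle("https://example.com"))[:200])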
@@ -383,11 +405,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
         results = await asyncio.gather(*tasks, return_exceptions=True)
         return [result if not isinstance(result, Exception) else str(result) for result in results]
 
-    async def take_screenshot(self, url: str) -> str:
+    async def take_screenshot(self, url: str, wait_time: int = 1000) -> str:
         async with await self.browser.new_context(user_agent=self.user_agent) as context:
             page = await context.new_page()
             try:
-                await page.goto(url, wait_until="domcontentloaded")
+                await page.goto(url, wait_until="domcontentloaded", timeout=30000)
+                # Wait for a specified time (default is 1 second)
+                await page.wait_for_timeout(wait_time)
                 screenshot = await page.screenshot(full_page=True)
                 return base64.b64encode(screenshot).decode('utf-8')
             except Exception as e:
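`take_screenshot` returns the image base64-encoded (Playwright captures PNG by default), so callers decode before writing bytes. A small hedged helper; `strategy` is any started `AsyncPlaywrightCrawlerStrategy` instance, and the 3000 ms settle time is illustrative:

import base64

async def screenshot_to_file(strategy, url: str, path: str) -> None:
    # Decode the base64 string back to raw image bytes before writing
    b64 = await strategy.take_screenshot(url, wait_time=3000)
    with open(path, "wb") as f:
        f.write(base64.b64decode(b64))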
@@ -29,14 +29,31 @@ class AsyncDatabaseManager:
             )
             ''')
             await db.commit()
+            await self.update_db_schema()
 
-    async def aalter_db_add_screenshot(self, new_column: str = "media"):
+    async def update_db_schema(self):
+        async with aiosqlite.connect(self.db_path) as db:
+            # Check if the 'media' column exists
+            cursor = await db.execute("PRAGMA table_info(crawled_data)")
+            columns = await cursor.fetchall()
+            column_names = [column[1] for column in columns]
+
+            if 'media' not in column_names:
+                await self.aalter_db_add_column('media')
+
+            # Check for other missing columns and add them if necessary
+            for column in ['links', 'metadata', 'screenshot']:
+                if column not in column_names:
+                    await self.aalter_db_add_column(column)
+
+    async def aalter_db_add_column(self, new_column: str):
         try:
             async with aiosqlite.connect(self.db_path) as db:
                 await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
                 await db.commit()
+                print(f"Added column '{new_column}' to the database.")
         except Exception as e:
-            print(f"Error altering database to add screenshot column: {e}")
+            print(f"Error altering database to add {new_column} column: {e}")
 
     async def aget_cached_url(self, url: str) -> Optional[Tuple[str, str, str, str, str, str, str, bool, str]]:
         try:
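The migration is additive and idempotent: it reads the live column list via `PRAGMA table_info` and issues one `ALTER TABLE ... ADD COLUMN` per missing column, so re-running it against an already-updated database is a no-op. A standalone sketch of the same pattern against a scratch database (table and column names mirror the diff; the path is illustrative):

import asyncio
import aiosqlite

REQUIRED = ["media", "links", "metadata", "screenshot"]

async def ensure_columns(db_path: str = "/tmp/crawl4ai_demo.db") -> None:
    async with aiosqlite.connect(db_path) as db:
        await db.execute("CREATE TABLE IF NOT EXISTS crawled_data (url TEXT PRIMARY KEY)")
        cursor = await db.execute("PRAGMA table_info(crawled_data)")
        existing = {row[1] for row in await cursor.fetchall()}  # row[1] is the column name
        for column in REQUIRED:
            if column not in existing:
                # Additive-only change: safe to run on populated tables
                await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {column} TEXT DEFAULT ""')
        await db.commit()

asyncio.run(ensure_columns())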
@@ -202,11 +202,11 @@ class AsyncWebCrawler:
             )
 
             if result is None:
-                raise ValueError(f"Failed to extract content from the website: {url}")
+                raise ValueError(f"Process HTML: failed to extract content from the website: {url}")
         except InvalidCSSSelectorError as e:
             raise ValueError(str(e))
         except Exception as e:
-            raise ValueError(f"Failed to extract content from the website: {url}, error: {str(e)}")
+            raise ValueError(f"Process HTML: failed to extract content from the website: {url}, error: {str(e)}")
 
         cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
         markdown = sanitize_input_encode(result.get("markdown", ""))
@@ -170,10 +170,12 @@ class WebScrappingStrategy(ContentScrappingStrategy):
             if isinstance(element, Comment):
                 element.extract()
                 return False
 
+            # if element.name == 'img':
+            #     process_image(element, url, 0, 1)
+            #     return True
+
             if element.name in ['script', 'style', 'link', 'meta', 'noscript']:
-                if element.name == 'img':
-                    process_image(element, url, 0, 1)
                 element.decompose()
                 return False
 
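The old path could drop `img` elements during tag cleanup before `process_image` had captured them; the commit defers image handling (hence the commented-out early block) so the tags survive until media extraction. A minimal illustration of what a premature `decompose()` costs (plain BeautifulSoup, not project code):

from bs4 import BeautifulSoup

soup = BeautifulSoup('<p>text <img src="a.png" alt="chart"></p>', "html.parser")
img = soup.find("img")
img.decompose()   # the tag and its src/alt metadata are gone for good
print(soup)       # -> <p>text </p>: nothing left for media extraction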
quickstart_async.py
@@ -66,6 +66,29 @@ async def use_proxy():
     # )
     # print(result.markdown[:500])  # Print first 500 characters
 
+
+async def capture_and_save_screenshot(url: str, output_path: str):
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        result = await crawler.arun(
+            url=url,
+            screenshot=True,
+            bypass_cache=True
+        )
+
+        if result.success and result.screenshot:
+            import base64
+
+            # Decode the base64 screenshot data
+            screenshot_data = base64.b64decode(result.screenshot)
+
+            # Save the decoded bytes to the output file (Playwright captures PNG by default)
+            with open(output_path, 'wb') as f:
+                f.write(screenshot_data)
+
+            print(f"Screenshot saved successfully to {output_path}")
+        else:
+            print("Failed to capture screenshot")
+
 class OpenAIModelFee(BaseModel):
     model_name: str = Field(..., description="Name of the OpenAI model.")
     input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
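Invoking the new example helper is a one-liner; the URL and output name are illustrative, and a `.png` extension matches Playwright's default capture format:

import asyncio

asyncio.run(capture_and_save_screenshot("https://example.com", "example_screenshot.png"))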
@@ -73,13 +96,11 @@ class OpenAIModelFee(BaseModel):
         ..., description="Fee for output token for the OpenAI model."
     )
 
-async def extract_structured_data_using_llm():
-    print("\n--- Extracting Structured Data with OpenAI ---")
-    print(
-        "Note: Set your OpenAI API key as an environment variable to run this example."
-    )
-    if not os.getenv("OPENAI_API_KEY"):
-        print("OpenAI API key not found. Skipping this example.")
+async def extract_structured_data_using_llm(provider: str, api_token: str = None):
+    print(f"\n--- Extracting Structured Data with {provider} ---")
+    if api_token is None and not provider.startswith("ollama"):
+        print(f"API token is required for {provider}. Skipping this example.")
         return
 
     async with AsyncWebCrawler(verbose=True) as crawler:

@@ -87,8 +108,8 @@ async def extract_structured_data_using_llm():
             url="https://openai.com/api/pricing/",
             word_count_threshold=1,
             extraction_strategy=LLMExtractionStrategy(
-                provider="openai/gpt-4o",
-                api_token=os.getenv("OPENAI_API_KEY"),
+                provider=provider,
+                api_token=api_token,
                 schema=OpenAIModelFee.schema(),
                 extraction_type="schema",
                 instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
@@ -390,7 +411,13 @@ async def main():
     await js_and_css()
     await use_proxy()
     await extract_structured_data_using_css_extractor()
-    await extract_structured_data_using_llm()
+
+    # LLM extraction examples
+    await extract_structured_data_using_llm("openai/gpt-4", os.getenv("OPENAI_API_KEY"))
+    await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
+    await extract_structured_data_using_llm("ollama/llama3.2")
+
     # await crawl_dynamic_content_pages_method_1()
     # await crawl_dynamic_content_pages_method_2()
     await crawl_dynamic_content_pages_method_3()
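Provider strings take the `provider/model` form. Hosted providers read their API token from the environment, while Ollama runs locally and needs none. A hedged sketch of driving just the local path, assuming an Ollama server with `llama3.2` pulled:

import asyncio

async def run_local_llm_example():
    # No api_token: the guard in extract_structured_data_using_llm lets ollama/* through
    await extract_structured_data_using_llm("ollama/llama3.2")

asyncio.run(run_local_llm_example())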