From ff3524d9b1f76bb06a43a7721eb958db9bd01463 Mon Sep 17 00:00:00 2001 From: unclecode Date: Sat, 12 Oct 2024 13:42:42 +0800 Subject: [PATCH] feat(v0.3.6): Add screenshot capture, delayed content, and custom timeouts - Implement screenshot capture functionality - Add delayed content retrieval method - Introduce custom page timeout parameter - Enhance LLM support with multiple providers - Improve database schema auto-updates - Optimize image processing in WebScrappingStrategy - Update error handling and logging - Expand examples in quickstart_async.py --- .gitignore | 4 ++- CHANGELOG.md | 33 +++++++++++++++++++ crawl4ai/__init__.py | 2 +- crawl4ai/async_crawler_strategy.py | 34 ++++++++++++++++--- crawl4ai/async_database.py | 21 ++++++++++-- crawl4ai/async_webcrawler.py | 4 +-- crawl4ai/content_scrapping_strategy.py | 6 ++-- docs/examples/quickstart_async.py | 45 ++++++++++++++++++++------ 8 files changed, 127 insertions(+), 22 deletions(-) diff --git a/.gitignore b/.gitignore index 85882f69..8b8f014c 100644 --- a/.gitignore +++ b/.gitignore @@ -201,4 +201,6 @@ test_env/ todo.md git_changes.py git_changes.md -pypi_build.sh \ No newline at end of file +pypi_build.sh + +.tests/ \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 37b564ed..701d6903 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,38 @@ # Changelog +## [0.3.6] - 2024-10-12 + +### Added +- New `.tests/` directory added to `.gitignore` +- Screenshot functionality: + - Added `screenshot` column to the database schema + - Implemented `take_screenshot` method in `AsyncPlaywrightCrawlerStrategy` + - Added option to capture screenshots when crawling +- Delayed content retrieval: + - New `get_delayed_content` method in `AsyncCrawlResponse` +- Database schema updates: + - Auto-update mechanism for database schema + - New columns: 'media', 'links', 'metadata', 'screenshot' +- LLM extraction examples in `quickstart_async.py`: + - Support for OpenAI, Hugging Face, and Ollama models + 
+### Changed +- Updated version number to 0.3.6 in `__init__.py` +- Improved error handling and logging in various components +- Enhanced `WebScrappingStrategy` to handle image processing more efficiently +- Modified `AsyncPlaywrightCrawlerStrategy` to support custom timeout values + +### Fixed +- Adjusted image processing in `WebScrappingStrategy` to prevent premature decomposition of img tags + +### Removed +- Removed `pypi_build.sh` from version control (added to `.gitignore`) + +### Developer Notes +- Added examples for using different LLM providers in `quickstart_async.py` +- Improved error messages for better debugging +- Enhanced type hinting throughout the codebase + ## [v0.3.5] - 2024-09-02 Enhance AsyncWebCrawler with smart waiting and screenshot capabilities diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 186730e8..04da30f8 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -3,7 +3,7 @@ from .async_webcrawler import AsyncWebCrawler from .models import CrawlResult -__version__ = "0.3.5" +__version__ = "0.3.6" __all__ = [ "AsyncWebCrawler", diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 987925f8..28795a3e 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -1,7 +1,7 @@ import asyncio import base64, time from abc import ABC, abstractmethod -from typing import Callable, Dict, Any, List, Optional +from typing import Callable, Dict, Any, List, Optional, Awaitable import os from playwright.async_api import async_playwright, Page, Browser, Error from io import BytesIO @@ -18,6 +18,10 @@ class AsyncCrawlResponse(BaseModel): response_headers: Dict[str, str] status_code: int screenshot: Optional[str] = None + get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None + + class Config: + arbitrary_types_allowed = True class AsyncCrawlerStrategy(ABC): @abstractmethod @@ -248,7 +252,7 @@ class 
AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if not kwargs.get("js_only", False): await self.execute_hook('before_goto', page) - response = await page.goto(url, wait_until="domcontentloaded", timeout=60000) + response = await page.goto(url, wait_until="domcontentloaded", timeout=kwargs.get("page_timeout", 60000)) await self.execute_hook('after_goto', page) # Get status code and headers @@ -295,6 +299,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): except Exception as e: raise RuntimeError(f"Wait condition failed: {str(e)}") + # Check if kwargs has screenshot=True then take screenshot + screenshot_data = None + if kwargs.get("screenshot"): + screenshot_data = await self.take_screenshot(url) + html = await page.content() page = await self.execute_hook('before_return_html', page, html) @@ -312,7 +321,20 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): "status_code": status_code }, f) - response = AsyncCrawlResponse(html=html, response_headers=response_headers, status_code=status_code) + + async def get_delayed_content(delay: float = 5.0) -> str: + if self.verbose: + print(f"[LOG] Waiting for {delay} seconds before retrieving content for {url}") + await asyncio.sleep(delay) + return await page.content() + + response = AsyncCrawlResponse( + html=html, + response_headers=response_headers, + status_code=status_code, + screenshot=screenshot_data, + get_delayed_content=get_delayed_content + ) return response except Error as e: raise Error(f"Failed to crawl {url}: {str(e)}") @@ -383,11 +405,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): results = await asyncio.gather(*tasks, return_exceptions=True) return [result if not isinstance(result, Exception) else str(result) for result in results] - async def take_screenshot(self, url: str) -> str: + async def take_screenshot(self, url: str, wait_time = 1000) -> str: async with await self.browser.new_context(user_agent=self.user_agent) as context: page = await 
context.new_page() try: - await page.goto(url, wait_until="domcontentloaded") + await page.goto(url, wait_until="domcontentloaded", timeout=30000) + # Wait for a specified time (default is 1 second) + await page.wait_for_timeout(wait_time) screenshot = await page.screenshot(full_page=True) return base64.b64encode(screenshot).decode('utf-8') except Exception as e: diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py index baa53255..61d98e9c 100644 --- a/crawl4ai/async_database.py +++ b/crawl4ai/async_database.py @@ -29,14 +29,31 @@ class AsyncDatabaseManager: ) ''') await db.commit() + await self.update_db_schema() - async def aalter_db_add_screenshot(self, new_column: str = "media"): + async def update_db_schema(self): + async with aiosqlite.connect(self.db_path) as db: + # Check if the 'media' column exists + cursor = await db.execute("PRAGMA table_info(crawled_data)") + columns = await cursor.fetchall() + column_names = [column[1] for column in columns] + + if 'media' not in column_names: + await self.aalter_db_add_column('media') + + # Check for other missing columns and add them if necessary + for column in ['links', 'metadata', 'screenshot']: + if column not in column_names: + await self.aalter_db_add_column(column) + + async def aalter_db_add_column(self, new_column: str): try: async with aiosqlite.connect(self.db_path) as db: await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""') await db.commit() + print(f"Added column '{new_column}' to the database.") except Exception as e: - print(f"Error altering database to add screenshot column: {e}") + print(f"Error altering database to add {new_column} column: {e}") async def aget_cached_url(self, url: str) -> Optional[Tuple[str, str, str, str, str, str, str, bool, str]]: try: diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 88c05f03..d308e930 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -202,11 +202,11 @@ 
class AsyncWebCrawler: ) if result is None: - raise ValueError(f"Failed to extract content from the website: {url}") + raise ValueError(f"Process HTML, Failed to extract content from the website: {url}") except InvalidCSSSelectorError as e: raise ValueError(str(e)) except Exception as e: - raise ValueError(f"Failed to extract content from the website: {url}, error: {str(e)}") + raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}") cleaned_html = sanitize_input_encode(result.get("cleaned_html", "")) markdown = sanitize_input_encode(result.get("markdown", "")) diff --git a/crawl4ai/content_scrapping_strategy.py b/crawl4ai/content_scrapping_strategy.py index e3d2c57f..afd75892 100644 --- a/crawl4ai/content_scrapping_strategy.py +++ b/crawl4ai/content_scrapping_strategy.py @@ -170,10 +170,12 @@ class WebScrappingStrategy(ContentScrappingStrategy): if isinstance(element, Comment): element.extract() return False + + # if element.name == 'img': + # process_image(element, url, 0, 1) + # return True if element.name in ['script', 'style', 'link', 'meta', 'noscript']: - if element.name == 'img': - process_image(element, url, 0, 1) element.decompose() return False diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index 1a2d9570..836bdb1d 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -66,6 +66,29 @@ async def use_proxy(): # ) # print(result.markdown[:500]) # Print first 500 characters + +async def capture_and_save_screenshot(url: str, output_path: str): + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun( + url=url, + screenshot=True, + bypass_cache=True + ) + + if result.success and result.screenshot: + import base64 + + # Decode the base64 screenshot data + screenshot_data = base64.b64decode(result.screenshot) + + # Save the screenshot as a JPEG file + with open(output_path, 'wb') as f: + f.write(screenshot_data) + + 
print(f"Screenshot saved successfully to {output_path}") + else: + print("Failed to capture screenshot") + class OpenAIModelFee(BaseModel): model_name: str = Field(..., description="Name of the OpenAI model.") input_fee: str = Field(..., description="Fee for input token for the OpenAI model.") @@ -73,13 +96,11 @@ class OpenAIModelFee(BaseModel): ..., description="Fee for output token for the OpenAI model." ) -async def extract_structured_data_using_llm(): - print("\n--- Extracting Structured Data with OpenAI ---") - print( - "Note: Set your OpenAI API key as an environment variable to run this example." - ) - if not os.getenv("OPENAI_API_KEY"): - print("OpenAI API key not found. Skipping this example.") +async def extract_structured_data_using_llm(provider: str, api_token: str = None): + print(f"\n--- Extracting Structured Data with {provider} ---") + + if api_token is None and provider != "ollama": + print(f"API token is required for {provider}. Skipping this example.") return async with AsyncWebCrawler(verbose=True) as crawler: @@ -87,8 +108,8 @@ async def extract_structured_data_using_llm(): url="https://openai.com/api/pricing/", word_count_threshold=1, extraction_strategy=LLMExtractionStrategy( - provider="openai/gpt-4o", - api_token=os.getenv("OPENAI_API_KEY"), + provider=provider, + api_token=api_token, schema=OpenAIModelFee.schema(), extraction_type="schema", instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. 
@@ -390,7 +411,13 @@ async def main():
     await js_and_css()
     await use_proxy()
     await extract_structured_data_using_css_extractor()
+
+    # LLM extraction examples
     await extract_structured_data_using_llm()
+    await extract_structured_data_using_llm("openai/gpt-4", os.getenv("OPENAI_API_KEY"))
+    await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
+    await extract_structured_data_using_llm("ollama/llama3.2")
+
     # await crawl_dynamic_content_pages_method_1()
     # await crawl_dynamic_content_pages_method_2()
     await crawl_dynamic_content_pages_method_3()