feat(v0.3.6): Add screenshot capture, delayed content, and custom timeouts

- Implement screenshot capture functionality
- Add delayed content retrieval method
- Introduce custom page timeout parameter
- Enhance LLM support with multiple providers
- Improve database schema auto-updates
- Optimize image processing in WebScrappingStrategy
- Update error handling and logging
- Expand examples in quickstart_async.py
This commit is contained in:
unclecode
2024-10-12 13:42:42 +08:00
parent b99d20b725
commit ff3524d9b1
8 changed files with 127 additions and 22 deletions

2
.gitignore vendored
View File

@@ -202,3 +202,5 @@ todo.md
git_changes.py
git_changes.md
pypi_build.sh
.tests/

View File

@@ -1,5 +1,38 @@
# Changelog
## [0.3.6] - 2024-10-12
### Added
- New `.tests/` directory added to `.gitignore`
- Screenshot functionality:
- Added `screenshot` column to the database schema
- Implemented `take_screenshot` method in `AsyncPlaywrightCrawlerStrategy`
- Added option to capture screenshots when crawling
- Delayed content retrieval:
- New `get_delayed_content` method in `AsyncCrawlResponse`
- Database schema updates:
- Auto-update mechanism for database schema
- New columns: `media`, `links`, `metadata`, `screenshot`
- LLM extraction examples in `quickstart_async.py`:
- Support for OpenAI, Hugging Face, and Ollama models
### Changed
- Updated version number to 0.3.6 in `__init__.py`
- Improved error handling and logging in various components
- Enhanced `WebScrappingStrategy` to handle image processing more efficiently
- Modified `AsyncPlaywrightCrawlerStrategy` to support custom timeout values
### Fixed
- Adjusted image processing in `WebScrappingStrategy` to prevent premature decomposition of img tags
### Removed
- Removed `pypi_build.sh` from version control (added to `.gitignore`)
### Developer Notes
- Added examples for using different LLM providers in `quickstart_async.py`
- Improved error messages for better debugging
- Enhanced type hinting throughout the codebase
## [v0.3.5] - 2024-09-02
Enhance AsyncWebCrawler with smart waiting and screenshot capabilities

View File

@@ -3,7 +3,7 @@
from .async_webcrawler import AsyncWebCrawler
from .models import CrawlResult
__version__ = "0.3.5"
__version__ = "0.3.6"
__all__ = [
"AsyncWebCrawler",

View File

@@ -1,7 +1,7 @@
import asyncio
import base64, time
from abc import ABC, abstractmethod
from typing import Callable, Dict, Any, List, Optional
from typing import Callable, Dict, Any, List, Optional, Awaitable
import os
from playwright.async_api import async_playwright, Page, Browser, Error
from io import BytesIO
@@ -18,6 +18,10 @@ class AsyncCrawlResponse(BaseModel):
response_headers: Dict[str, str]
status_code: int
screenshot: Optional[str] = None
get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None
class Config:
arbitrary_types_allowed = True
class AsyncCrawlerStrategy(ABC):
@abstractmethod
@@ -248,7 +252,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
if not kwargs.get("js_only", False):
await self.execute_hook('before_goto', page)
response = await page.goto(url, wait_until="domcontentloaded", timeout=60000)
response = await page.goto(url, wait_until="domcontentloaded", timeout=kwargs.get("page_timeout", 60000))
await self.execute_hook('after_goto', page)
# Get status code and headers
@@ -295,6 +299,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
except Exception as e:
raise RuntimeError(f"Wait condition failed: {str(e)}")
# Check if kwargs has screenshot=True then take screenshot
screenshot_data = None
if kwargs.get("screenshot"):
screenshot_data = await self.take_screenshot(url)
html = await page.content()
page = await self.execute_hook('before_return_html', page, html)
@@ -312,7 +321,20 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
"status_code": status_code
}, f)
response = AsyncCrawlResponse(html=html, response_headers=response_headers, status_code=status_code)
async def get_delayed_content(delay: float = 5.0) -> str:
if self.verbose:
print(f"[LOG] Waiting for {delay} seconds before retrieving content for {url}")
await asyncio.sleep(delay)
return await page.content()
response = AsyncCrawlResponse(
html=html,
response_headers=response_headers,
status_code=status_code,
screenshot=screenshot_data,
get_delayed_content=get_delayed_content
)
return response
except Error as e:
raise Error(f"Failed to crawl {url}: {str(e)}")
@@ -383,11 +405,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
results = await asyncio.gather(*tasks, return_exceptions=True)
return [result if not isinstance(result, Exception) else str(result) for result in results]
async def take_screenshot(self, url: str) -> str:
async def take_screenshot(self, url: str, wait_time = 1000) -> str:
async with await self.browser.new_context(user_agent=self.user_agent) as context:
page = await context.new_page()
try:
await page.goto(url, wait_until="domcontentloaded")
await page.goto(url, wait_until="domcontentloaded", timeout=30000)
# Wait for a specified time (default is 1 second)
await page.wait_for_timeout(wait_time)
screenshot = await page.screenshot(full_page=True)
return base64.b64encode(screenshot).decode('utf-8')
except Exception as e:

View File

@@ -29,14 +29,31 @@ class AsyncDatabaseManager:
)
''')
await db.commit()
await self.update_db_schema()
async def aalter_db_add_screenshot(self, new_column: str = "media"):
async def update_db_schema(self):
    """Bring the ``crawled_data`` table up to date with the expected schema.

    Reads the table's current columns via ``PRAGMA table_info`` and adds any
    of the expected columns (``media``, ``links``, ``metadata``,
    ``screenshot``) that are missing, delegating the actual DDL to
    ``aalter_db_add_column``.
    """
    async with aiosqlite.connect(self.db_path) as db:
        # PRAGMA table_info yields one row per column; index 1 is the name.
        cursor = await db.execute("PRAGMA table_info(crawled_data)")
        rows = await cursor.fetchall()
        existing = [row[1] for row in rows]

        # Add each expected column that is not already present, in a fixed
        # order ('media' first, matching the original check order).
        for required in ('media', 'links', 'metadata', 'screenshot'):
            if required not in existing:
                await self.aalter_db_add_column(required)
async def aalter_db_add_column(self, new_column: str):
    """Add a TEXT column named *new_column* (default ``""``) to ``crawled_data``.

    Best-effort: any failure (e.g. the column already exists) is reported to
    stdout rather than raised, so schema updates never abort a crawl.

    Args:
        new_column: Name of the column to add.
    """
    try:
        async with aiosqlite.connect(self.db_path) as db:
            # NOTE(review): column name is interpolated into DDL; new_column is
            # only ever supplied internally (update_db_schema), but keep it that
            # way — never pass user input here.
            await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
            await db.commit()
            print(f"Added column '{new_column}' to the database.")
    except Exception as e:
        # Report the column actually being added, not a hard-coded name.
        print(f"Error altering database to add {new_column} column: {e}")
async def aget_cached_url(self, url: str) -> Optional[Tuple[str, str, str, str, str, str, str, bool, str]]:
try:

View File

@@ -202,11 +202,11 @@ class AsyncWebCrawler:
)
if result is None:
raise ValueError(f"Failed to extract content from the website: {url}")
raise ValueError(f"Process HTML, Failed to extract content from the website: {url}")
except InvalidCSSSelectorError as e:
raise ValueError(str(e))
except Exception as e:
raise ValueError(f"Failed to extract content from the website: {url}, error: {str(e)}")
raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}")
cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
markdown = sanitize_input_encode(result.get("markdown", ""))

View File

@@ -171,9 +171,11 @@ class WebScrappingStrategy(ContentScrappingStrategy):
element.extract()
return False
# if element.name == 'img':
# process_image(element, url, 0, 1)
# return True
if element.name in ['script', 'style', 'link', 'meta', 'noscript']:
if element.name == 'img':
process_image(element, url, 0, 1)
element.decompose()
return False

View File

@@ -66,6 +66,29 @@ async def use_proxy():
# )
# print(result.markdown[:500]) # Print first 500 characters
async def capture_and_save_screenshot(url: str, output_path: str):
    """Crawl *url* with screenshot capture enabled and write the image to *output_path*.

    The crawler returns the screenshot as a base64-encoded string on
    ``result.screenshot``; on success the decoded bytes are written to disk,
    otherwise a failure message is printed.

    Args:
        url: Page to crawl and capture.
        output_path: Destination file path for the decoded image bytes.
    """
    async with AsyncWebCrawler(verbose=True) as crawler:
        # bypass_cache forces a fresh fetch so the screenshot reflects the live page.
        result = await crawler.arun(
            url=url,
            screenshot=True,
            bypass_cache=True
        )

        if result.success and result.screenshot:
            import base64

            # Decode the base64 screenshot data
            screenshot_data = base64.b64decode(result.screenshot)

            # Write the decoded bytes verbatim; the image format is whatever the
            # crawler produced (presumably PNG, Playwright's default — TODO
            # confirm), regardless of the extension of output_path.
            with open(output_path, 'wb') as f:
                f.write(screenshot_data)
            print(f"Screenshot saved successfully to {output_path}")
        else:
            print("Failed to capture screenshot")
class OpenAIModelFee(BaseModel):
model_name: str = Field(..., description="Name of the OpenAI model.")
input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
@@ -73,13 +96,11 @@ class OpenAIModelFee(BaseModel):
..., description="Fee for output token for the OpenAI model."
)
async def extract_structured_data_using_llm():
print("\n--- Extracting Structured Data with OpenAI ---")
print(
"Note: Set your OpenAI API key as an environment variable to run this example."
)
if not os.getenv("OPENAI_API_KEY"):
print("OpenAI API key not found. Skipping this example.")
async def extract_structured_data_using_llm(provider: str, api_token: str = None):
print(f"\n--- Extracting Structured Data with {provider} ---")
if api_token is None and provider != "ollama":
print(f"API token is required for {provider}. Skipping this example.")
return
async with AsyncWebCrawler(verbose=True) as crawler:
@@ -87,8 +108,8 @@ async def extract_structured_data_using_llm():
url="https://openai.com/api/pricing/",
word_count_threshold=1,
extraction_strategy=LLMExtractionStrategy(
provider="openai/gpt-4o",
api_token=os.getenv("OPENAI_API_KEY"),
provider=provider,
api_token=api_token,
schema=OpenAIModelFee.schema(),
extraction_type="schema",
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
@@ -390,7 +411,13 @@ async def main():
await js_and_css()
await use_proxy()
await extract_structured_data_using_css_extractor()
# LLM extraction examples
await extract_structured_data_using_llm()
await extract_structured_data_using_llm("openai/gpt-4", os.getenv("OPENAI_API_KEY"))
await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
await extract_structured_data_using_llm("ollama/llama3.2")
# await crawl_dynamic_content_pages_method_1()
# await crawl_dynamic_content_pages_method_2()
await crawl_dynamic_content_pages_method_3()