From ff3524d9b1f76bb06a43a7721eb958db9bd01463 Mon Sep 17 00:00:00 2001 From: unclecode Date: Sat, 12 Oct 2024 13:42:42 +0800 Subject: [PATCH] feat(v0.3.6): Add screenshot capture, delayed content, and custom timeouts - Implement screenshot capture functionality - Add delayed content retrieval method - Introduce custom page timeout parameter - Enhance LLM support with multiple providers - Improve database schema auto-updates - Optimize image processing in WebScrappingStrategy - Update error handling and logging - Expand examples in quickstart_async.py --- .gitignore | 4 ++- CHANGELOG.md | 33 +++++++++++++++++++ crawl4ai/__init__.py | 2 +- crawl4ai/async_crawler_strategy.py | 34 ++++++++++++++++--- crawl4ai/async_database.py | 21 ++++++++++-- crawl4ai/async_webcrawler.py | 4 +-- crawl4ai/content_scrapping_strategy.py | 6 ++-- docs/examples/quickstart_async.py | 45 ++++++++++++++++++++------ 8 files changed, 127 insertions(+), 22 deletions(-) diff --git a/.gitignore b/.gitignore index 85882f69..8b8f014c 100644 --- a/.gitignore +++ b/.gitignore @@ -201,4 +201,6 @@ test_env/ todo.md git_changes.py git_changes.md -pypi_build.sh \ No newline at end of file +pypi_build.sh + +.tests/ \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 37b564ed..701d6903 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,38 @@ # Changelog +## [0.3.6] - 2024-10-12 + +### Added +- New `.tests/` directory added to `.gitignore` +- Screenshot functionality: + - Added `screenshot` column to the database schema + - Implemented `take_screenshot` method in `AsyncPlaywrightCrawlerStrategy` + - Added option to capture screenshots when crawling +- Delayed content retrieval: + - New `get_delayed_content` method in `AsyncCrawlResponse` +- Database schema updates: + - Auto-update mechanism for database schema + - New columns: 'media', 'links', 'metadata', 'screenshot' +- LLM extraction examples in `quickstart_async.py`: + - Support for OpenAI, Hugging Face, and Ollama models + 
+### Changed +- Updated version number to 0.3.6 in `__init__.py` +- Improved error handling and logging in various components +- Enhanced `WebScrappingStrategy` to handle image processing more efficiently +- Modified `AsyncPlaywrightCrawlerStrategy` to support custom timeout values + +### Fixed +- Adjusted image processing in `WebScrappingStrategy` to prevent premature decomposition of img tags + +### Removed +- Removed `pypi_build.sh` from version control (added to `.gitignore`) + +### Developer Notes +- Added examples for using different LLM providers in `quickstart_async.py` +- Improved error messages for better debugging +- Enhanced type hinting throughout the codebase + ## [v0.3.5] - 2024-09-02 Enhance AsyncWebCrawler with smart waiting and screenshot capabilities diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 186730e8..04da30f8 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -3,7 +3,7 @@ from .async_webcrawler import AsyncWebCrawler from .models import CrawlResult -__version__ = "0.3.5" +__version__ = "0.3.6" __all__ = [ "AsyncWebCrawler", diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 987925f8..28795a3e 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -1,7 +1,7 @@ import asyncio import base64, time from abc import ABC, abstractmethod -from typing import Callable, Dict, Any, List, Optional +from typing import Callable, Dict, Any, List, Optional, Awaitable import os from playwright.async_api import async_playwright, Page, Browser, Error from io import BytesIO @@ -18,6 +18,10 @@ class AsyncCrawlResponse(BaseModel): response_headers: Dict[str, str] status_code: int screenshot: Optional[str] = None + get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None + + class Config: + arbitrary_types_allowed = True class AsyncCrawlerStrategy(ABC): @abstractmethod @@ -248,7 +252,7 @@ class 
AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if not kwargs.get("js_only", False): await self.execute_hook('before_goto', page) - response = await page.goto(url, wait_until="domcontentloaded", timeout=60000) + response = await page.goto(url, wait_until="domcontentloaded", timeout=kwargs.get("page_timeout", 60000)) await self.execute_hook('after_goto', page) # Get status code and headers @@ -295,6 +299,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): except Exception as e: raise RuntimeError(f"Wait condition failed: {str(e)}") + # Check if kwargs has screenshot=True then take screenshot + screenshot_data = None + if kwargs.get("screenshot"): + screenshot_data = await self.take_screenshot(url) + html = await page.content() page = await self.execute_hook('before_return_html', page, html) @@ -312,7 +321,20 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): "status_code": status_code }, f) - response = AsyncCrawlResponse(html=html, response_headers=response_headers, status_code=status_code) + + async def get_delayed_content(delay: float = 5.0) -> str: + if self.verbose: + print(f"[LOG] Waiting for {delay} seconds before retrieving content for {url}") + await asyncio.sleep(delay) + return await page.content() + + response = AsyncCrawlResponse( + html=html, + response_headers=response_headers, + status_code=status_code, + screenshot=screenshot_data, + get_delayed_content=get_delayed_content + ) return response except Error as e: raise Error(f"Failed to crawl {url}: {str(e)}") @@ -383,11 +405,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): results = await asyncio.gather(*tasks, return_exceptions=True) return [result if not isinstance(result, Exception) else str(result) for result in results] - async def take_screenshot(self, url: str) -> str: + async def take_screenshot(self, url: str, wait_time = 1000) -> str: async with await self.browser.new_context(user_agent=self.user_agent) as context: page = await 
context.new_page() try: - await page.goto(url, wait_until="domcontentloaded") + await page.goto(url, wait_until="domcontentloaded", timeout=30000) + # Wait for a specified time (default is 1 second) + await page.wait_for_timeout(wait_time) screenshot = await page.screenshot(full_page=True) return base64.b64encode(screenshot).decode('utf-8') except Exception as e: diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py index baa53255..61d98e9c 100644 --- a/crawl4ai/async_database.py +++ b/crawl4ai/async_database.py @@ -29,14 +29,31 @@ class AsyncDatabaseManager: ) ''') await db.commit() + await self.update_db_schema() - async def aalter_db_add_screenshot(self, new_column: str = "media"): + async def update_db_schema(self): + async with aiosqlite.connect(self.db_path) as db: + # Check if the 'media' column exists + cursor = await db.execute("PRAGMA table_info(crawled_data)") + columns = await cursor.fetchall() + column_names = [column[1] for column in columns] + + if 'media' not in column_names: + await self.aalter_db_add_column('media') + + # Check for other missing columns and add them if necessary + for column in ['links', 'metadata', 'screenshot']: + if column not in column_names: + await self.aalter_db_add_column(column) + + async def aalter_db_add_column(self, new_column: str): try: async with aiosqlite.connect(self.db_path) as db: await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""') await db.commit() + print(f"Added column '{new_column}' to the database.") except Exception as e: - print(f"Error altering database to add screenshot column: {e}") + print(f"Error altering database to add {new_column} column: {e}") async def aget_cached_url(self, url: str) -> Optional[Tuple[str, str, str, str, str, str, str, bool, str]]: try: diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 88c05f03..d308e930 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -202,11 +202,11 @@ 
class AsyncWebCrawler: ) if result is None: - raise ValueError(f"Failed to extract content from the website: {url}") + raise ValueError(f"Process HTML, Failed to extract content from the website: {url}") except InvalidCSSSelectorError as e: raise ValueError(str(e)) except Exception as e: - raise ValueError(f"Failed to extract content from the website: {url}, error: {str(e)}") + raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}") cleaned_html = sanitize_input_encode(result.get("cleaned_html", "")) markdown = sanitize_input_encode(result.get("markdown", "")) diff --git a/crawl4ai/content_scrapping_strategy.py b/crawl4ai/content_scrapping_strategy.py index e3d2c57f..afd75892 100644 --- a/crawl4ai/content_scrapping_strategy.py +++ b/crawl4ai/content_scrapping_strategy.py @@ -170,10 +170,12 @@ class WebScrappingStrategy(ContentScrappingStrategy): if isinstance(element, Comment): element.extract() return False + + # if element.name == 'img': + # process_image(element, url, 0, 1) + # return True if element.name in ['script', 'style', 'link', 'meta', 'noscript']: - if element.name == 'img': - process_image(element, url, 0, 1) element.decompose() return False diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index 1a2d9570..836bdb1d 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -66,6 +66,29 @@ async def use_proxy(): # ) # print(result.markdown[:500]) # Print first 500 characters + +async def capture_and_save_screenshot(url: str, output_path: str): + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun( + url=url, + screenshot=True, + bypass_cache=True + ) + + if result.success and result.screenshot: + import base64 + + # Decode the base64 screenshot data + screenshot_data = base64.b64decode(result.screenshot) + + # Save the screenshot as a JPEG file + with open(output_path, 'wb') as f: + f.write(screenshot_data) + + 
print(f"Screenshot saved successfully to {output_path}") + else: + print("Failed to capture screenshot") + class OpenAIModelFee(BaseModel): model_name: str = Field(..., description="Name of the OpenAI model.") input_fee: str = Field(..., description="Fee for input token for the OpenAI model.") @@ -73,13 +96,11 @@ class OpenAIModelFee(BaseModel): ..., description="Fee for output token for the OpenAI model." ) -async def extract_structured_data_using_llm(): - print("\n--- Extracting Structured Data with OpenAI ---") - print( - "Note: Set your OpenAI API key as an environment variable to run this example." - ) - if not os.getenv("OPENAI_API_KEY"): - print("OpenAI API key not found. Skipping this example.") +async def extract_structured_data_using_llm(provider: str, api_token: str = None): + print(f"\n--- Extracting Structured Data with {provider} ---") + + if api_token is None and provider != "ollama": + print(f"API token is required for {provider}. Skipping this example.") return async with AsyncWebCrawler(verbose=True) as crawler: @@ -87,8 +108,8 @@ async def extract_structured_data_using_llm(): url="https://openai.com/api/pricing/", word_count_threshold=1, extraction_strategy=LLMExtractionStrategy( - provider="openai/gpt-4o", - api_token=os.getenv("OPENAI_API_KEY"), + provider=provider, + api_token=api_token, schema=OpenAIModelFee.schema(), extraction_type="schema", instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. 
@@ -390,7 +411,13 @@ async def main():
     await js_and_css()
     await use_proxy()
     await extract_structured_data_using_css_extractor()
+
+    # LLM extraction examples
     await extract_structured_data_using_llm()
+    await extract_structured_data_using_llm("openai/gpt-4", os.getenv("OPENAI_API_KEY"))
+    await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
+    await extract_structured_data_using_llm("ollama/llama3.2")
+
     # await crawl_dynamic_content_pages_method_1()
     # await crawl_dynamic_content_pages_method_2()
     await crawl_dynamic_content_pages_method_3()