feat(v0.3.6): Add screenshot capture, delayed content, and custom timeouts

- Implement screenshot capture functionality
- Add delayed content retrieval method
- Introduce custom page timeout parameter
- Enhance LLM support with multiple providers
- Improve database schema auto-updates
- Optimize image processing in WebScrappingStrategy
- Update error handling and logging
- Expand examples in quickstart_async.py
This commit is contained in:
unclecode
2024-10-12 13:42:42 +08:00
parent b99d20b725
commit ff3524d9b1
8 changed files with 127 additions and 22 deletions

2
.gitignore vendored
View File

@@ -202,3 +202,5 @@ todo.md
git_changes.py
git_changes.md
pypi_build.sh
.tests/

View File

@@ -1,5 +1,38 @@
# Changelog
## [0.3.6] - 2024-10-12
### Added
- New `.tests/` directory added to `.gitignore`
- Screenshot functionality:
- Added `screenshot` column to the database schema
- Implemented `take_screenshot` method in `AsyncPlaywrightCrawlerStrategy`
- Added option to capture screenshots when crawling
- Delayed content retrieval:
- New `get_delayed_content` method in `AsyncCrawlResponse`
- Database schema updates:
- Auto-update mechanism for database schema
- New columns: `media`, `links`, `metadata`, `screenshot`
- LLM extraction examples in `quickstart_async.py`:
- Support for OpenAI, Hugging Face, and Ollama models
### Changed
- Updated version number to 0.3.6 in `__init__.py`
- Improved error handling and logging in various components
- Enhanced `WebScrappingStrategy` to handle image processing more efficiently
- Modified `AsyncPlaywrightCrawlerStrategy` to support custom timeout values
### Fixed
- Adjusted image processing in `WebScrappingStrategy` to prevent premature decomposition of img tags
### Removed
- Removed `pypi_build.sh` from version control (added to `.gitignore`)
### Developer Notes
- Added examples for using different LLM providers in `quickstart_async.py`
- Improved error messages for better debugging
- Enhanced type hinting throughout the codebase
## [v0.3.5] - 2024-09-02
Enhance AsyncWebCrawler with smart waiting and screenshot capabilities

View File

@@ -3,7 +3,7 @@
from .async_webcrawler import AsyncWebCrawler
from .models import CrawlResult
__version__ = "0.3.5"
__version__ = "0.3.6"
__all__ = [
"AsyncWebCrawler",

View File

@@ -1,7 +1,7 @@
import asyncio
import base64, time
from abc import ABC, abstractmethod
from typing import Callable, Dict, Any, List, Optional
from typing import Callable, Dict, Any, List, Optional, Awaitable
import os
from playwright.async_api import async_playwright, Page, Browser, Error
from io import BytesIO
@@ -18,6 +18,10 @@ class AsyncCrawlResponse(BaseModel):
response_headers: Dict[str, str]
status_code: int
screenshot: Optional[str] = None
get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None
class Config:
arbitrary_types_allowed = True
class AsyncCrawlerStrategy(ABC):
@abstractmethod
@@ -248,7 +252,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
if not kwargs.get("js_only", False):
await self.execute_hook('before_goto', page)
response = await page.goto(url, wait_until="domcontentloaded", timeout=60000)
response = await page.goto(url, wait_until="domcontentloaded", timeout=kwargs.get("page_timeout", 60000))
await self.execute_hook('after_goto', page)
# Get status code and headers
@@ -295,6 +299,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
except Exception as e:
raise RuntimeError(f"Wait condition failed: {str(e)}")
# Check if kwargs has screenshot=True then take screenshot
screenshot_data = None
if kwargs.get("screenshot"):
screenshot_data = await self.take_screenshot(url)
html = await page.content()
page = await self.execute_hook('before_return_html', page, html)
@@ -312,7 +321,20 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
"status_code": status_code
}, f)
response = AsyncCrawlResponse(html=html, response_headers=response_headers, status_code=status_code)
async def get_delayed_content(delay: float = 5.0) -> str:
if self.verbose:
print(f"[LOG] Waiting for {delay} seconds before retrieving content for {url}")
await asyncio.sleep(delay)
return await page.content()
response = AsyncCrawlResponse(
html=html,
response_headers=response_headers,
status_code=status_code,
screenshot=screenshot_data,
get_delayed_content=get_delayed_content
)
return response
except Error as e:
raise Error(f"Failed to crawl {url}: {str(e)}")
@@ -383,11 +405,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
results = await asyncio.gather(*tasks, return_exceptions=True)
return [result if not isinstance(result, Exception) else str(result) for result in results]
async def take_screenshot(self, url: str) -> str:
async def take_screenshot(self, url: str, wait_time = 1000) -> str:
async with await self.browser.new_context(user_agent=self.user_agent) as context:
page = await context.new_page()
try:
await page.goto(url, wait_until="domcontentloaded")
await page.goto(url, wait_until="domcontentloaded", timeout=30000)
# Wait for a specified time (default is 1 second)
await page.wait_for_timeout(wait_time)
screenshot = await page.screenshot(full_page=True)
return base64.b64encode(screenshot).decode('utf-8')
except Exception as e:

View File

@@ -29,14 +29,31 @@ class AsyncDatabaseManager:
)
''')
await db.commit()
await self.update_db_schema()
async def aalter_db_add_screenshot(self, new_column: str = "media"):
async def update_db_schema(self):
    """Bring the ``crawled_data`` table up to date with the expected schema.

    Reads the table's current columns via ``PRAGMA table_info`` and adds any
    of the expected columns (``media``, ``links``, ``metadata``,
    ``screenshot``) that are missing, delegating the actual DDL to
    ``aalter_db_add_column``.
    """
    async with aiosqlite.connect(self.db_path) as db:
        # PRAGMA table_info yields one row per column; index 1 is the name.
        cursor = await db.execute("PRAGMA table_info(crawled_data)")
        rows = await cursor.fetchall()
        existing = [row[1] for row in rows]

        # Add each expected column that is not already present, in a fixed
        # order ('media' first, matching the original check order).
        for required in ('media', 'links', 'metadata', 'screenshot'):
            if required not in existing:
                await self.aalter_db_add_column(required)
async def aalter_db_add_column(self, new_column: str):
    """Add a TEXT column named *new_column* (default ``""``) to ``crawled_data``.

    Best-effort: any failure (e.g. the column already exists) is reported to
    stdout rather than raised, so schema updates never abort a crawl.

    Args:
        new_column: Name of the column to add.
    """
    try:
        async with aiosqlite.connect(self.db_path) as db:
            # NOTE(review): column name is interpolated into DDL; new_column is
            # only ever supplied internally (update_db_schema), but keep it that
            # way — never pass user input here.
            await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
            await db.commit()
            print(f"Added column '{new_column}' to the database.")
    except Exception as e:
        # Report the column actually being added, not a hard-coded name.
        print(f"Error altering database to add {new_column} column: {e}")
async def aget_cached_url(self, url: str) -> Optional[Tuple[str, str, str, str, str, str, str, bool, str]]:
try:

View File

@@ -202,11 +202,11 @@ class AsyncWebCrawler:
)
if result is None:
raise ValueError(f"Failed to extract content from the website: {url}")
raise ValueError(f"Process HTML, Failed to extract content from the website: {url}")
except InvalidCSSSelectorError as e:
raise ValueError(str(e))
except Exception as e:
raise ValueError(f"Failed to extract content from the website: {url}, error: {str(e)}")
raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}")
cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
markdown = sanitize_input_encode(result.get("markdown", ""))

View File

@@ -171,9 +171,11 @@ class WebScrappingStrategy(ContentScrappingStrategy):
element.extract()
return False
# if element.name == 'img':
# process_image(element, url, 0, 1)
# return True
if element.name in ['script', 'style', 'link', 'meta', 'noscript']:
if element.name == 'img':
process_image(element, url, 0, 1)
element.decompose()
return False

View File

@@ -66,6 +66,29 @@ async def use_proxy():
# )
# print(result.markdown[:500]) # Print first 500 characters
async def capture_and_save_screenshot(url: str, output_path: str):
    """Crawl *url* with screenshot capture enabled and write the image to *output_path*.

    The crawler returns the screenshot as a base64-encoded string on
    ``result.screenshot``; on success the decoded bytes are written to disk,
    otherwise a failure message is printed.

    Args:
        url: Page to crawl and capture.
        output_path: Destination file path for the decoded image bytes.
    """
    async with AsyncWebCrawler(verbose=True) as crawler:
        # bypass_cache forces a fresh fetch so the screenshot reflects the live page.
        result = await crawler.arun(
            url=url,
            screenshot=True,
            bypass_cache=True
        )

        if result.success and result.screenshot:
            import base64

            # Decode the base64 screenshot data
            screenshot_data = base64.b64decode(result.screenshot)

            # Write the decoded bytes verbatim; the image format is whatever the
            # crawler produced (presumably PNG, Playwright's default — TODO
            # confirm), regardless of the extension of output_path.
            with open(output_path, 'wb') as f:
                f.write(screenshot_data)
            print(f"Screenshot saved successfully to {output_path}")
        else:
            print("Failed to capture screenshot")
class OpenAIModelFee(BaseModel):
model_name: str = Field(..., description="Name of the OpenAI model.")
input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
@@ -73,13 +96,11 @@ class OpenAIModelFee(BaseModel):
..., description="Fee for output token for the OpenAI model."
)
async def extract_structured_data_using_llm():
print("\n--- Extracting Structured Data with OpenAI ---")
print(
"Note: Set your OpenAI API key as an environment variable to run this example."
)
if not os.getenv("OPENAI_API_KEY"):
print("OpenAI API key not found. Skipping this example.")
async def extract_structured_data_using_llm(provider: str, api_token: str = None):
print(f"\n--- Extracting Structured Data with {provider} ---")
if api_token is None and provider != "ollama":
print(f"API token is required for {provider}. Skipping this example.")
return
async with AsyncWebCrawler(verbose=True) as crawler:
@@ -87,8 +108,8 @@ async def extract_structured_data_using_llm():
url="https://openai.com/api/pricing/",
word_count_threshold=1,
extraction_strategy=LLMExtractionStrategy(
provider="openai/gpt-4o",
api_token=os.getenv("OPENAI_API_KEY"),
provider=provider,
api_token=api_token,
schema=OpenAIModelFee.schema(),
extraction_type="schema",
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
@@ -390,7 +411,13 @@ async def main():
await js_and_css()
await use_proxy()
await extract_structured_data_using_css_extractor()
# LLM extraction examples
await extract_structured_data_using_llm()
await extract_structured_data_using_llm("openai/gpt-4", os.getenv("OPENAI_API_KEY"))
await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
await extract_structured_data_using_llm("ollama/llama3.2")
# await crawl_dynamic_content_pages_method_1()
# await crawl_dynamic_content_pages_method_2()
await crawl_dynamic_content_pages_method_3()