feat(v0.3.6): Add screenshot capture, delayed content, and custom timeouts
- Implement screenshot capture functionality
- Add delayed content retrieval method
- Introduce custom page timeout parameter
- Enhance LLM support with multiple providers
- Improve database schema auto-updates
- Optimize image processing in WebScrappingStrategy
- Update error handling and logging
- Expand examples in quickstart_async.py
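Taken together, the new options surface as keyword arguments to `AsyncWebCrawler.arun`. A minimal sketch of the intended call shape, based on the diffs below and assuming `arun` forwards extra keyword arguments to the crawler strategy (the URL and timeout value are illustrative):

import asyncio
from crawl4ai import AsyncWebCrawler

async def demo():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://example.com",   # illustrative URL
            screenshot=True,             # new: base64 screenshot in result.screenshot
            page_timeout=90000,          # new: per-request page.goto timeout in ms
            bypass_cache=True,
        )
        if result.success and result.screenshot:
            print(f"Captured screenshot ({len(result.screenshot)} base64 chars)")

asyncio.run(demo())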
.gitignore (vendored, 4 changes)
@@ -201,4 +201,6 @@ test_env/
 todo.md
 git_changes.py
 git_changes.md
 pypi_build.sh
+
+.tests/
CHANGELOG.md (33 changes)
@@ -1,5 +1,38 @@
 # Changelog
 
+## [0.3.6] - 2024-10-12
+
+### Added
+- New `.tests/` directory added to `.gitignore`
+- Screenshot functionality:
+  - Added `screenshot` column to the database schema
+  - Implemented `take_screenshot` method in `AsyncPlaywrightCrawlerStrategy`
+  - Added option to capture screenshots when crawling
+- Delayed content retrieval:
+  - New `get_delayed_content` method in `AsyncCrawlResponse`
+- Database schema updates:
+  - Auto-update mechanism for the database schema
+  - New columns: `media`, `links`, `metadata`, `screenshot`
+- LLM extraction examples in `quickstart_async.py`:
+  - Support for OpenAI, Hugging Face, and Ollama models
+
+### Changed
+- Updated version number to 0.3.6 in `__init__.py`
+- Improved error handling and logging in various components
+- Enhanced `WebScrappingStrategy` to handle image processing more efficiently
+- Modified `AsyncPlaywrightCrawlerStrategy` to support custom timeout values
+
+### Fixed
+- Adjusted image processing in `WebScrappingStrategy` to prevent premature decomposition of `img` tags
+
+### Removed
+- Removed `pypi_build.sh` from version control (added to `.gitignore`)
+
+### Developer Notes
+- Added examples for using different LLM providers in `quickstart_async.py`
+- Improved error messages for better debugging
+- Enhanced type hinting throughout the codebase
+
 ## [v0.3.5] - 2024-09-02
 
 Enhance AsyncWebCrawler with smart waiting and screenshot capabilities
__init__.py
@@ -3,7 +3,7 @@
 from .async_webcrawler import AsyncWebCrawler
 from .models import CrawlResult
 
-__version__ = "0.3.5"
+__version__ = "0.3.6"
 
 __all__ = [
     "AsyncWebCrawler",
@@ -1,7 +1,7 @@
 import asyncio
 import base64, time
 from abc import ABC, abstractmethod
-from typing import Callable, Dict, Any, List, Optional
+from typing import Callable, Dict, Any, List, Optional, Awaitable
 import os
 from playwright.async_api import async_playwright, Page, Browser, Error
 from io import BytesIO
@@ -18,6 +18,10 @@ class AsyncCrawlResponse(BaseModel):
     response_headers: Dict[str, str]
     status_code: int
     screenshot: Optional[str] = None
+    get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None
+
+    class Config:
+        arbitrary_types_allowed = True
 
 class AsyncCrawlerStrategy(ABC):
     @abstractmethod
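The new field stores an async closure rather than plain data, which is why the pydantic v1-style `Config.arbitrary_types_allowed` flag is set alongside it. A self-contained sketch of how the model accepts such a closure (the `_reread` helper is illustrative, not project code):

import asyncio
from typing import Awaitable, Callable, Dict, Optional
from pydantic import BaseModel

class AsyncCrawlResponse(BaseModel):
    html: str
    response_headers: Dict[str, str]
    status_code: int
    screenshot: Optional[str] = None
    get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None

    class Config:
        arbitrary_types_allowed = True

async def _reread(delay: Optional[float] = 5.0) -> str:
    await asyncio.sleep(delay or 0.0)  # stand-in for re-reading the live page
    return "<html>...</html>"

resp = AsyncCrawlResponse(html="<html/>", response_headers={}, status_code=200,
                          get_delayed_content=_reread)
print(asyncio.run(resp.get_delayed_content(0.1)))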
@@ -248,7 +252,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
 
         if not kwargs.get("js_only", False):
             await self.execute_hook('before_goto', page)
-            response = await page.goto(url, wait_until="domcontentloaded", timeout=60000)
+            response = await page.goto(url, wait_until="domcontentloaded", timeout=kwargs.get("page_timeout", 60000))
             await self.execute_hook('after_goto', page)
 
         # Get status code and headers
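Since the timeout now comes from `**kwargs` with a 60000 ms fallback, callers can raise it per request for slow pages. A hedged usage sketch, assuming `arun` passes its extra keyword arguments through to the strategy the same way the quickstart examples below do for `screenshot`:

import asyncio
from crawl4ai import AsyncWebCrawler

async def crawl_slow_page():
    async with AsyncWebCrawler() as crawler:
        # page_timeout is forwarded to page.goto; omit it to keep the 60000 ms default
        return await crawler.arun(url="https://example.com", page_timeout=120000)

asyncio.run(crawl_slow_page())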
@@ -295,6 +299,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
             except Exception as e:
                 raise RuntimeError(f"Wait condition failed: {str(e)}")
 
+        # Take a screenshot if requested via kwargs (screenshot=True)
+        screenshot_data = None
+        if kwargs.get("screenshot"):
+            screenshot_data = await self.take_screenshot(url)
+
         html = await page.content()
         page = await self.execute_hook('before_return_html', page, html)
@@ -312,7 +321,20 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                 "status_code": status_code
             }, f)
 
-        response = AsyncCrawlResponse(html=html, response_headers=response_headers, status_code=status_code)
+        async def get_delayed_content(delay: float = 5.0) -> str:
+            if self.verbose:
+                print(f"[LOG] Waiting for {delay} seconds before retrieving content for {url}")
+            await asyncio.sleep(delay)
+            return await page.content()
+
+        response = AsyncCrawlResponse(
+            html=html,
+            response_headers=response_headers,
+            status_code=status_code,
+            screenshot=screenshot_data,
+            get_delayed_content=get_delayed_content
+        )
        return response
     except Error as e:
         raise Error(f"Failed to crawl {url}: {str(e)}")
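A usage sketch for delayed retrieval, driving the strategy directly. The `crawl` entry point and the async context-manager protocol are assumed from the surrounding class, and the 5 s delay is illustrative:

import asyncio
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy

async def fetch_after_settle(url: str) -> str:
    async with AsyncPlaywrightCrawlerStrategy(verbose=True) as strategy:
        response = await strategy.crawl(url)
        if response.get_delayed_content:
            # Give client-side rendering another 5 s, then re-read the same page
            return await response.get_delayed_content(5.0)
        return response.html

print(asyncio.run(fetch_after_settle("https://example.com"))[:200])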
@@ -383,11 +405,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
         results = await asyncio.gather(*tasks, return_exceptions=True)
         return [result if not isinstance(result, Exception) else str(result) for result in results]
 
-    async def take_screenshot(self, url: str) -> str:
+    async def take_screenshot(self, url: str, wait_time: int = 1000) -> str:
         async with await self.browser.new_context(user_agent=self.user_agent) as context:
             page = await context.new_page()
             try:
-                await page.goto(url, wait_until="domcontentloaded")
+                await page.goto(url, wait_until="domcontentloaded", timeout=30000)
+                # Wait for a specified time (default is 1 second)
+                await page.wait_for_timeout(wait_time)
                 screenshot = await page.screenshot(full_page=True)
                 return base64.b64encode(screenshot).decode('utf-8')
             except Exception as e:
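`take_screenshot` returns the image base64-encoded (Playwright captures PNG by default), so callers decode before writing bytes. A small hedged helper; `strategy` is any started `AsyncPlaywrightCrawlerStrategy` instance, and the 3000 ms settle time is illustrative:

import base64

async def screenshot_to_file(strategy, url: str, path: str) -> None:
    # Decode the base64 string back to raw image bytes before writing
    b64 = await strategy.take_screenshot(url, wait_time=3000)
    with open(path, "wb") as f:
        f.write(base64.b64decode(b64))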
@@ -29,14 +29,31 @@ class AsyncDatabaseManager:
             )
             ''')
             await db.commit()
+            await self.update_db_schema()
 
-    async def aalter_db_add_screenshot(self, new_column: str = "media"):
+    async def update_db_schema(self):
+        async with aiosqlite.connect(self.db_path) as db:
+            # Check if the 'media' column exists
+            cursor = await db.execute("PRAGMA table_info(crawled_data)")
+            columns = await cursor.fetchall()
+            column_names = [column[1] for column in columns]
+
+            if 'media' not in column_names:
+                await self.aalter_db_add_column('media')
+
+            # Check for other missing columns and add them if necessary
+            for column in ['links', 'metadata', 'screenshot']:
+                if column not in column_names:
+                    await self.aalter_db_add_column(column)
+
+    async def aalter_db_add_column(self, new_column: str):
         try:
             async with aiosqlite.connect(self.db_path) as db:
                 await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
                 await db.commit()
+                print(f"Added column '{new_column}' to the database.")
         except Exception as e:
-            print(f"Error altering database to add screenshot column: {e}")
+            print(f"Error altering database to add {new_column} column: {e}")
 
     async def aget_cached_url(self, url: str) -> Optional[Tuple[str, str, str, str, str, str, str, bool, str]]:
         try:
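The migration is additive and idempotent: it reads the live column list via `PRAGMA table_info` and issues one `ALTER TABLE ... ADD COLUMN` per missing column, so re-running it against an already-updated database is a no-op. A standalone sketch of the same pattern against a scratch database (table and column names mirror the diff; the path is illustrative):

import asyncio
import aiosqlite

REQUIRED = ["media", "links", "metadata", "screenshot"]

async def ensure_columns(db_path: str = "/tmp/crawl4ai_demo.db") -> None:
    async with aiosqlite.connect(db_path) as db:
        await db.execute("CREATE TABLE IF NOT EXISTS crawled_data (url TEXT PRIMARY KEY)")
        cursor = await db.execute("PRAGMA table_info(crawled_data)")
        existing = {row[1] for row in await cursor.fetchall()}  # row[1] is the column name
        for column in REQUIRED:
            if column not in existing:
                # Additive-only change: safe to run on populated tables
                await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {column} TEXT DEFAULT ""')
        await db.commit()

asyncio.run(ensure_columns())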
@@ -202,11 +202,11 @@ class AsyncWebCrawler:
             )
 
             if result is None:
-                raise ValueError(f"Failed to extract content from the website: {url}")
+                raise ValueError(f"Process HTML: failed to extract content from the website: {url}")
         except InvalidCSSSelectorError as e:
             raise ValueError(str(e))
         except Exception as e:
-            raise ValueError(f"Failed to extract content from the website: {url}, error: {str(e)}")
+            raise ValueError(f"Process HTML: failed to extract content from the website: {url}, error: {str(e)}")
 
         cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
         markdown = sanitize_input_encode(result.get("markdown", ""))
@@ -170,10 +170,12 @@ class WebScrappingStrategy(ContentScrappingStrategy):
             if isinstance(element, Comment):
                 element.extract()
                 return False
 
+            # if element.name == 'img':
+            #     process_image(element, url, 0, 1)
+            #     return True
+
             if element.name in ['script', 'style', 'link', 'meta', 'noscript']:
-                if element.name == 'img':
-                    process_image(element, url, 0, 1)
                 element.decompose()
                 return False
 
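The old path could drop `img` elements during tag cleanup before `process_image` had captured them; the commit defers image handling (hence the commented-out early block) so the tags survive until media extraction. A minimal illustration of what a premature `decompose()` costs (plain BeautifulSoup, not project code):

from bs4 import BeautifulSoup

soup = BeautifulSoup('<p>text <img src="a.png" alt="chart"></p>', "html.parser")
img = soup.find("img")
img.decompose()   # the tag and its src/alt metadata are gone for good
print(soup)       # -> <p>text </p>: nothing left for media extraction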
quickstart_async.py
@@ -66,6 +66,29 @@ async def use_proxy():
     # )
     # print(result.markdown[:500])  # Print first 500 characters
 
+
+async def capture_and_save_screenshot(url: str, output_path: str):
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        result = await crawler.arun(
+            url=url,
+            screenshot=True,
+            bypass_cache=True
+        )
+
+        if result.success and result.screenshot:
+            import base64
+
+            # Decode the base64 screenshot data
+            screenshot_data = base64.b64decode(result.screenshot)
+
+            # Save the decoded bytes to the output file (Playwright captures PNG by default)
+            with open(output_path, 'wb') as f:
+                f.write(screenshot_data)
+
+            print(f"Screenshot saved successfully to {output_path}")
+        else:
+            print("Failed to capture screenshot")
+
 class OpenAIModelFee(BaseModel):
     model_name: str = Field(..., description="Name of the OpenAI model.")
     input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
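Invoking the new example helper is a one-liner; the URL and output name are illustrative, and a `.png` extension matches Playwright's default capture format:

import asyncio

asyncio.run(capture_and_save_screenshot("https://example.com", "example_screenshot.png"))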
@@ -73,13 +96,11 @@ class OpenAIModelFee(BaseModel):
         ..., description="Fee for output token for the OpenAI model."
     )
 
-async def extract_structured_data_using_llm():
-    print("\n--- Extracting Structured Data with OpenAI ---")
-    print(
-        "Note: Set your OpenAI API key as an environment variable to run this example."
-    )
-    if not os.getenv("OPENAI_API_KEY"):
-        print("OpenAI API key not found. Skipping this example.")
+async def extract_structured_data_using_llm(provider: str, api_token: str = None):
+    print(f"\n--- Extracting Structured Data with {provider} ---")
+    if api_token is None and not provider.startswith("ollama"):
+        print(f"API token is required for {provider}. Skipping this example.")
         return
 
     async with AsyncWebCrawler(verbose=True) as crawler:

@@ -87,8 +108,8 @@ async def extract_structured_data_using_llm():
             url="https://openai.com/api/pricing/",
             word_count_threshold=1,
             extraction_strategy=LLMExtractionStrategy(
-                provider="openai/gpt-4o",
-                api_token=os.getenv("OPENAI_API_KEY"),
+                provider=provider,
+                api_token=api_token,
                 schema=OpenAIModelFee.schema(),
                 extraction_type="schema",
                 instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
@@ -390,7 +411,13 @@ async def main():
     await js_and_css()
     await use_proxy()
     await extract_structured_data_using_css_extractor()
-    await extract_structured_data_using_llm()
+
+    # LLM extraction examples
+    await extract_structured_data_using_llm("openai/gpt-4", os.getenv("OPENAI_API_KEY"))
+    await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
+    await extract_structured_data_using_llm("ollama/llama3.2")
+
     # await crawl_dynamic_content_pages_method_1()
     # await crawl_dynamic_content_pages_method_2()
     await crawl_dynamic_content_pages_method_3()
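Provider strings take the `provider/model` form. Hosted providers read their API token from the environment, while Ollama runs locally and needs none. A hedged sketch of driving just the local path, assuming an Ollama server with `llama3.2` pulled:

import asyncio

async def run_local_llm_example():
    # No api_token: the guard in extract_structured_data_using_llm lets ollama/* through
    await extract_structured_data_using_llm("ollama/llama3.2")

asyncio.run(run_local_llm_example())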