feat: Add screenshot functionality to crawl_urls

This commit adds a `screenshot` parameter (default `False`) to the crawling API — it is threaded through `crawl_urls` and the related `WebCrawler` methods — letting users request a screenshot of the page during crawling.

The commit message follows the established convention: it starts with a type prefix (`feat` for feature) followed by a concise, descriptive summary of the change.
This commit is contained in:
unclecode
2024-06-07 15:23:32 +08:00
parent 0533aeb814
commit 8e73a482a2
11 changed files with 147 additions and 27 deletions

View File

@@ -59,6 +59,8 @@ class WebCrawler:
api_token: str = None,
extract_blocks_flag: bool = True,
word_count_threshold=MIN_WORD_THRESHOLD,
css_selector: str = None,
screenshot: bool = False,
use_cached_html: bool = False,
extraction_strategy: ExtractionStrategy = None,
chunking_strategy: ChunkingStrategy = RegexChunking(),
@@ -70,6 +72,8 @@ class WebCrawler:
extraction_strategy or NoExtractionStrategy(),
chunking_strategy,
bypass_cache=url_model.forced,
css_selector=css_selector,
screenshot=screenshot,
**kwargs,
)
pass
@@ -83,6 +87,7 @@ class WebCrawler:
chunking_strategy: ChunkingStrategy = RegexChunking(),
bypass_cache: bool = False,
css_selector: str = None,
screenshot: bool = False,
verbose=True,
**kwargs,
) -> CrawlResult:
@@ -110,7 +115,8 @@ class WebCrawler:
"markdown": cached[3],
"extracted_content": cached[4],
"success": cached[5],
"media": json.loads(cached[6]),
"media": json.loads(cached[6] or "{}"),
"screenshot": cached[7],
"error_message": "",
}
)
@@ -118,6 +124,9 @@ class WebCrawler:
# Initialize WebDriver for crawling
t = time.time()
html = self.crawler_strategy.crawl(url)
base64_image = None
if screenshot:
base64_image = self.crawler_strategy.take_screenshot()
success = True
error_message = ""
# Extract content from HTML
@@ -166,6 +175,7 @@ class WebCrawler:
extracted_content,
success,
json.dumps(media),
screenshot=base64_image,
)
return CrawlResult(
@@ -174,6 +184,7 @@ class WebCrawler:
cleaned_html=cleaned_html,
markdown=markdown,
media=media,
screenshot=base64_image,
extracted_content=extracted_content,
success=success,
error_message=error_message,
@@ -187,6 +198,8 @@ class WebCrawler:
extract_blocks_flag: bool = True,
word_count_threshold=MIN_WORD_THRESHOLD,
use_cached_html: bool = False,
css_selector: str = None,
screenshot: bool = False,
extraction_strategy: ExtractionStrategy = None,
chunking_strategy: ChunkingStrategy = RegexChunking(),
**kwargs,
@@ -204,6 +217,8 @@ class WebCrawler:
[api_token] * len(url_models),
[extract_blocks_flag] * len(url_models),
[word_count_threshold] * len(url_models),
[css_selector] * len(url_models),
[screenshot] * len(url_models),
[use_cached_html] * len(url_models),
[extraction_strategy] * len(url_models),
[chunking_strategy] * len(url_models),