diff --git a/README.md b/README.md index 03762b97..9ac2e624 100644 --- a/README.md +++ b/README.md @@ -13,8 +13,8 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information ## Recent Changes ### v0.2.3 -- 🎨 Extract and return all media tags (Images, Audio, and Video). -- 🖼️ Take screenshots of the page. +- 🎨 Extract and return all media tags (Images, Audio, and Video). Check `result.media` +- 🖼️ Take [screenshots](#taking-screenshots) of the page. ### v0.2.2 - Support multiple JS scripts @@ -266,6 +266,14 @@ Crawl result without raw HTML content: result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False) ``` +### Taking Screenshots + +```python +result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True) +with open("screenshot.png", "wb") as f: + f.write(base64.b64decode(result.screenshot)) +``` + ### Adding a chunking strategy: RegexChunking Using RegexChunking: @@ -372,6 +380,7 @@ result = crawler.run(url="https://www.nbcnews.com/business") | `urls` | A list of URLs to crawl and extract data from. | Yes | - | | `include_raw_html` | Whether to include the raw HTML content in the response. | No | `false` | | `bypass_cache` | Whether to force a fresh crawl even if the URL has been previously crawled. | No | `false` | +| `screenshot` | Whether to take a screenshot of the page. | No | `false` | | `word_count_threshold`| The minimum number of words a block must contain to be considered meaningful (minimum value is 5). | No | `5` | | `extraction_strategy` | The strategy to use for extracting content from the HTML (e.g., "CosineStrategy"). | No | `NoExtractionStrategy` | | `chunking_strategy` | The strategy to use for chunking the text before processing (e.g., "RegexChunking"). 
| No | `RegexChunking` | diff --git a/docs/examples/quickstart.py b/docs/examples/quickstart.py index 2b6f9872..0fdd3772 100644 --- a/docs/examples/quickstart.py +++ b/docs/examples/quickstart.py @@ -35,10 +35,20 @@ def cprint(message, press_any_key=False): def basic_usage(crawler): cprint("🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]") - result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True) + result = crawler.run(url="https://www.nbcnews.com/business") cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]") print_result(result) +def screenshot_usage(crawler): + cprint("\n📸 [bold cyan]Let's take a screenshot of the page![/bold cyan]") + result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True) + cprint("[LOG] 📦 [bold yellow]Screenshot result:[/bold yellow]") + # Save the screenshot to a file + with open("screenshot.png", "wb") as f: + f.write(base64.b64decode(result.screenshot)) + cprint("Screenshot saved to 'screenshot.png'!") + print_result(result) + def understanding_parameters(crawler): cprint("\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]") cprint("By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action.") @@ -187,11 +197,11 @@ def main(): crawler = create_crawler() - crawler.always_by_pass_cache = True basic_usage(crawler) understanding_parameters(crawler) crawler.always_by_pass_cache = True + screenshot_usage(crawler) add_chunking_strategy(crawler) add_extraction_strategy(crawler) add_llm_extraction_strategy(crawler) diff --git a/pages/partial/how_to_guide.html b/pages/partial/how_to_guide.html index 3ad53ce4..270fcacb 100644 --- a/pages/partial/how_to_guide.html +++ b/pages/partial/how_to_guide.html @@ -50,6 +50,20 @@ crawler.warmup()
crawler.always_by_pass_cache = True
+ +
+ 📸 + Let's take a screenshot of the page! +
+
+
result = crawler.run(
+    url="https://www.nbcnews.com/business",
+    screenshot=True
+)
+with open("screenshot.png", "wb") as f:
+    f.write(base64.b64decode(result.screenshot))
+
+