From 4750810a67aba2b257a8c8a6d234d2cf397bd025 Mon Sep 17 00:00:00 2001 From: unclecode Date: Wed, 2 Oct 2024 17:34:56 +0800 Subject: [PATCH] Enhance AsyncWebCrawler with smart waiting and screenshot capabilities - Implement smart_wait function in AsyncPlaywrightCrawlerStrategy - Add screenshot support to AsyncCrawlResponse and AsyncWebCrawler - Improve error handling and timeout management in crawling process - Fix typo in CrawlResult model (responser_headers -> response_headers) - Update .gitignore to exclude additional files - Adjust import path in test_basic_crawling.py --- .gitignore | 6 +- CHANGELOG.md | 9 ++ crawl4ai/__init__.py | 2 +- crawl4ai/async_crawler_strategy.py | 55 ++++++-- crawl4ai/async_webcrawler.py | 9 +- crawl4ai/models.py | 2 +- .../async_webcrawler_multiple_urls_example.py | 48 +++++++ docs/examples/language_support_example.py | 45 +++++++ tests/async/test_basic_crawling.py | 2 +- tests/async/test_screenshot.py | 124 ++++++++++++++++++ 10 files changed, 281 insertions(+), 21 deletions(-) create mode 100644 docs/examples/async_webcrawler_multiple_urls_example.py create mode 100644 docs/examples/language_support_example.py create mode 100644 tests/async/test_screenshot.py diff --git a/.gitignore b/.gitignore index b48005ba..a593ae1d 100644 --- a/.gitignore +++ b/.gitignore @@ -196,4 +196,8 @@ docs/.DS_Store tmp/ test_env/ **/.DS_Store -**/.DS_Store \ No newline at end of file +**/.DS_Store + +todo.md +git_changes.py +git_changes.md \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index d054dd52..37b564ed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,14 @@ # Changelog +## [v0.3.5] - 2024-10-02 + +Enhance AsyncWebCrawler with smart waiting and screenshot capabilities + +- Implement smart_wait function in AsyncPlaywrightCrawlerStrategy +- Add screenshot support to AsyncCrawlResponse and AsyncWebCrawler +- Improve error handling and timeout management in crawling process +- Fix typo in CrawlResult model 
(responser_headers -> response_headers) + ## [v0.2.77] - 2024-08-04 Significant improvements in text processing and performance: diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index dcb55ab9..186730e8 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -3,7 +3,7 @@ from .async_webcrawler import AsyncWebCrawler from .models import CrawlResult -__version__ = "0.3.4" +__version__ = "0.3.5" __all__ = [ "AsyncWebCrawler", diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 35e3c59b..987925f8 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -12,10 +12,12 @@ import hashlib from pathlib import Path from playwright.async_api import ProxySettings from pydantic import BaseModel + class AsyncCrawlResponse(BaseModel): html: str response_headers: Dict[str, str] status_code: int + screenshot: Optional[str] = None class AsyncCrawlerStrategy(ABC): @abstractmethod @@ -139,6 +141,45 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): asyncio.create_task(self.kill_session(sid)) + async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000): + wait_for = wait_for.strip() + + if wait_for.startswith('js:'): + # Explicitly specified JavaScript + js_code = wait_for[3:].strip() + return await self.csp_compliant_wait(page, js_code, timeout) + elif wait_for.startswith('css:'): + # Explicitly specified CSS selector + css_selector = wait_for[4:].strip() + try: + await page.wait_for_selector(css_selector, timeout=timeout) + except Error as e: + if 'Timeout' in str(e): + raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{css_selector}'") + else: + raise ValueError(f"Invalid CSS selector: '{css_selector}'") + else: + # Auto-detect based on content + if wait_for.startswith('()') or wait_for.startswith('function'): + # It's likely a JavaScript function + return await self.csp_compliant_wait(page, wait_for, timeout) + else: + # Assume it's a CSS 
selector first + try: + await page.wait_for_selector(wait_for, timeout=timeout) + except Error as e: + if 'Timeout' in str(e): + raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{wait_for}'") + else: + # If it's not a timeout error, it might be an invalid selector + # Let's try to evaluate it as a JavaScript function as a fallback + try: + return await self.csp_compliant_wait(page, f"() => {{{wait_for}}}", timeout) + except Error: + raise ValueError(f"Invalid wait_for parameter: '{wait_for}'. " + "It should be either a valid CSS selector, a JavaScript function, " + "or explicitly prefixed with 'js:' or 'css:'.") + async def csp_compliant_wait(self, page: Page, user_wait_function: str, timeout: float = 30000): wrapper_js = f""" async () => {{ @@ -250,19 +291,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): wait_for = kwargs.get("wait_for") if wait_for: try: - await self.csp_compliant_wait(page, wait_for, timeout=kwargs.get("timeout", 30000)) + await self.smart_wait(page, wait_for, timeout=kwargs.get("timeout", 30000)) except Exception as e: - raise RuntimeError(f"Custom wait condition failed: {str(e)}") - # try: - # await page.wait_for_function(wait_for) - # # if callable(wait_for): - # # await page.wait_for_function(wait_for) - # # elif isinstance(wait_for, str): - # # await page.wait_for_selector(wait_for) - # # else: - # # raise ValueError("wait_for must be either a callable or a CSS selector string") - # except Error as e: - # raise Error(f"Custom wait condition failed: {str(e)}") + raise RuntimeError(f"Wait condition failed: {str(e)}") html = await page.content() page = await self.execute_hook('before_return_html', page, html) diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 34b192e5..88c05f03 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -80,7 +80,7 @@ class AsyncWebCrawler: word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD) - async_response : 
AsyncCrawlResponse = None + async_response: AsyncCrawlResponse = None cached = None screenshot_data = None extracted_content = None @@ -102,15 +102,14 @@ class AsyncWebCrawler: t1 = time.time() if user_agent: self.crawler_strategy.update_user_agent(user_agent) - async_response : AsyncCrawlResponse = await self.crawler_strategy.crawl(url, **kwargs) + async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(url, screenshot=screenshot, **kwargs) html = sanitize_input_encode(async_response.html) + screenshot_data = async_response.screenshot t2 = time.time() if verbose: print( f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds" ) - if screenshot: - screenshot_data = await self.crawler_strategy.take_screenshot(url) crawl_result = await self.aprocess_html( url, @@ -127,7 +126,7 @@ class AsyncWebCrawler: **kwargs, ) crawl_result.status_code = async_response.status_code if async_response else 200 - crawl_result.responser_headers = async_response.response_headers if async_response else {} + crawl_result.response_headers = async_response.response_headers if async_response else {} crawl_result.success = bool(html) crawl_result.session_id = kwargs.get("session_id", None) return crawl_result diff --git a/crawl4ai/models.py b/crawl4ai/models.py index eefb0cb9..151ccb4f 100644 --- a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -18,5 +18,5 @@ class CrawlResult(BaseModel): metadata: Optional[dict] = None error_message: Optional[str] = None session_id: Optional[str] = None - responser_headers: Optional[dict] = None + response_headers: Optional[dict] = None status_code: Optional[int] = None \ No newline at end of file diff --git a/docs/examples/async_webcrawler_multiple_urls_example.py b/docs/examples/async_webcrawler_multiple_urls_example.py new file mode 100644 index 00000000..1d63ac80 --- /dev/null +++ b/docs/examples/async_webcrawler_multiple_urls_example.py @@ -0,0 +1,48 @@ +# File: async_webcrawler_multiple_urls_example.py 
+import os, sys +# append 2 parent directories to sys.path to import crawl4ai +parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +sys.path.append(parent_dir) + +import asyncio +from crawl4ai import AsyncWebCrawler + +async def main(): + # Initialize the AsyncWebCrawler + async with AsyncWebCrawler(verbose=True) as crawler: + # List of URLs to crawl + urls = [ + "https://example.com", + "https://python.org", + "https://github.com", + "https://stackoverflow.com", + "https://news.ycombinator.com" + ] + + # Set up crawling parameters + word_count_threshold = 100 + + # Run the crawling process for multiple URLs + results = await crawler.arun_many( + urls=urls, + word_count_threshold=word_count_threshold, + bypass_cache=True, + verbose=True + ) + + # Process the results + for result in results: + if result.success: + print(f"Successfully crawled: {result.url}") + print(f"Title: {result.metadata.get('title', 'N/A')}") + print(f"Word count: {len(result.markdown.split())}") + print(f"Number of links: {len(result.links.get('internal', [])) + len(result.links.get('external', []))}") + print(f"Number of images: {len(result.media.get('images', []))}") + print("---") + else: + print(f"Failed to crawl: {result.url}") + print(f"Error: {result.error_message}") + print("---") + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/docs/examples/language_support_example.py b/docs/examples/language_support_example.py new file mode 100644 index 00000000..b74a8402 --- /dev/null +++ b/docs/examples/language_support_example.py @@ -0,0 +1,45 @@ +import asyncio +from crawl4ai import AsyncWebCrawler, AsyncPlaywrightCrawlerStrategy + +async def main(): + # Example 1: Setting language when creating the crawler + crawler1 = AsyncWebCrawler( + crawler_strategy=AsyncPlaywrightCrawlerStrategy( + headers={"Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7"} + ) + ) + result1 = await 
crawler1.arun("https://www.example.com") + print("Example 1 result:", result1.extracted_content[:100]) # Print first 100 characters + + # Example 2: Setting language before crawling + crawler2 = AsyncWebCrawler() + crawler2.crawler_strategy.headers["Accept-Language"] = "es-ES,es;q=0.9,en-US;q=0.8,en;q=0.7" + result2 = await crawler2.arun("https://www.example.com") + print("Example 2 result:", result2.extracted_content[:100]) + + # Example 3: Setting language when calling arun method + crawler3 = AsyncWebCrawler() + result3 = await crawler3.arun( + "https://www.example.com", + headers={"Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7"} + ) + print("Example 3 result:", result3.extracted_content[:100]) + + # Example 4: Crawling multiple pages with different languages + urls = [ + ("https://www.example.com", "fr-FR,fr;q=0.9"), + ("https://www.example.org", "es-ES,es;q=0.9"), + ("https://www.example.net", "de-DE,de;q=0.9"), + ] + + crawler4 = AsyncWebCrawler() + results = await asyncio.gather(*[ + crawler4.arun(url, headers={"Accept-Language": lang}) + for url, lang in urls + ]) + + for url, result in zip([u for u, _ in urls], results): + print(f"Result for {url}:", result.extracted_content[:100]) + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/tests/async/test_basic_crawling.py b/tests/async/test_basic_crawling.py index 7184f464..ce38ac2f 100644 --- a/tests/async/test_basic_crawling.py +++ b/tests/async/test_basic_crawling.py @@ -5,7 +5,7 @@ import asyncio import time # Add the parent directory to the Python path -parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) sys.path.append(parent_dir) from crawl4ai.async_webcrawler import AsyncWebCrawler diff --git a/tests/async/test_screenshot.py b/tests/async/test_screenshot.py new file mode 100644 index 00000000..0c4439f6 --- /dev/null +++ 
b/tests/async/test_screenshot.py @@ -0,0 +1,124 @@ +import os +import sys +import pytest +import asyncio +import base64 +from PIL import Image +import io + +# Add the parent directory to the Python path +parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(parent_dir) + +from crawl4ai.async_webcrawler import AsyncWebCrawler + +@pytest.mark.asyncio +async def test_basic_screenshot(): + async with AsyncWebCrawler(verbose=True) as crawler: + url = "https://example.com" # A static website + result = await crawler.arun(url=url, bypass_cache=True, screenshot=True) + + assert result.success + assert result.screenshot is not None + + # Verify the screenshot is a valid image + image_data = base64.b64decode(result.screenshot) + image = Image.open(io.BytesIO(image_data)) + assert image.format == "PNG" + +@pytest.mark.asyncio +async def test_screenshot_with_wait_for(): + async with AsyncWebCrawler(verbose=True) as crawler: + # Using a website with dynamic content + url = "https://www.youtube.com" + wait_for = "css:#content" # Wait for the main content to load + + result = await crawler.arun( + url=url, + bypass_cache=True, + screenshot=True, + wait_for=wait_for + ) + + assert result.success + assert result.screenshot is not None + + # Verify the screenshot is a valid image + image_data = base64.b64decode(result.screenshot) + image = Image.open(io.BytesIO(image_data)) + assert image.format == "PNG" + + # You might want to add more specific checks here, like image dimensions + # or even use image recognition to verify certain elements are present + +@pytest.mark.asyncio +async def test_screenshot_with_js_wait_for(): + async with AsyncWebCrawler(verbose=True) as crawler: + url = "https://www.amazon.com" + wait_for = "js:() => document.querySelector('#nav-logo-sprites') !== null" + + result = await crawler.arun( + url=url, + bypass_cache=True, + screenshot=True, + wait_for=wait_for + ) + + assert result.success + assert result.screenshot is not 
None + + image_data = base64.b64decode(result.screenshot) + image = Image.open(io.BytesIO(image_data)) + assert image.format == "PNG" + +@pytest.mark.asyncio +async def test_screenshot_without_wait_for(): + async with AsyncWebCrawler(verbose=True) as crawler: + url = "https://www.nytimes.com" # A website with lots of dynamic content + + result = await crawler.arun(url=url, bypass_cache=True, screenshot=True) + + assert result.success + assert result.screenshot is not None + + image_data = base64.b64decode(result.screenshot) + image = Image.open(io.BytesIO(image_data)) + assert image.format == "PNG" + +@pytest.mark.asyncio +async def test_screenshot_comparison(): + async with AsyncWebCrawler(verbose=True) as crawler: + url = "https://www.reddit.com" + wait_for = "css:#SHORTCUT_FOCUSABLE_DIV" + + # Take screenshot without wait_for + result_without_wait = await crawler.arun( + url=url, + bypass_cache=True, + screenshot=True + ) + + # Take screenshot with wait_for + result_with_wait = await crawler.arun( + url=url, + bypass_cache=True, + screenshot=True, + wait_for=wait_for + ) + + assert result_without_wait.success and result_with_wait.success + assert result_without_wait.screenshot is not None + assert result_with_wait.screenshot is not None + + # Compare the two screenshots + image_without_wait = Image.open(io.BytesIO(base64.b64decode(result_without_wait.screenshot))) + image_with_wait = Image.open(io.BytesIO(base64.b64decode(result_with_wait.screenshot))) + + # This is a simple size comparison. In a real-world scenario, you might want to use + # more sophisticated image comparison techniques. + assert image_with_wait.size[0] >= image_without_wait.size[0] + assert image_with_wait.size[1] >= image_without_wait.size[1] + +# Entry point for debugging +if __name__ == "__main__": + pytest.main([__file__, "-v"]) \ No newline at end of file