Compare commits

3 Commits

| Author | SHA1 | Date |
|---|---|---|
|  | 5b84ac9186 |  |
|  | 7ea5603576 |  |
|  | 4750810a67 |  |
.gitignore (vendored)

```diff
@@ -196,4 +196,8 @@ docs/.DS_Store
 tmp/
 test_env/
 **/.DS_Store
 **/.DS_Store
+
+todo.md
+git_changes.py
+git_changes.md
```
CHANGELOG.md

```diff
@@ -1,5 +1,14 @@
 # Changelog
 
+## [v0.3.5] - 2024-09-02
+
+Enhance AsyncWebCrawler with smart waiting and screenshot capabilities
+
+- Implement smart_wait function in AsyncPlaywrightCrawlerStrategy
+- Add screenshot support to AsyncCrawlResponse and AsyncWebCrawler
+- Improve error handling and timeout management in crawling process
+- Fix typo in CrawlResult model (responser_headers -> response_headers)
+
 ## [v0.2.77] - 2024-08-04
 
 Significant improvements in text processing and performance:
```
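The changelog entry above summarizes the new surface; as a quick orientation, here is a minimal usage sketch of the screenshot flow these commits enable. It is not part of the changeset, and the URL and output path are placeholders:

```python
import asyncio
import base64

from crawl4ai import AsyncWebCrawler

async def demo():
    async with AsyncWebCrawler(verbose=True) as crawler:
        # screenshot=True is forwarded into the crawl itself (see the
        # async_webcrawler.py hunks below); the result then carries a
        # base64-encoded PNG in result.screenshot.
        result = await crawler.arun("https://example.com", screenshot=True)
        if result.success and result.screenshot:
            with open("page.png", "wb") as f:
                f.write(base64.b64decode(result.screenshot))

asyncio.run(demo())
```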
crawl4ai/__init__.py

```diff
@@ -3,7 +3,7 @@
 from .async_webcrawler import AsyncWebCrawler
 from .models import CrawlResult
 
-__version__ = "0.3.4"
+__version__ = "0.3.5"
 
 __all__ = [
     "AsyncWebCrawler",
```
crawl4ai/async_crawler_strategy.py

```diff
@@ -12,10 +12,12 @@ import hashlib
 from pathlib import Path
 from playwright.async_api import ProxySettings
 from pydantic import BaseModel
 
 class AsyncCrawlResponse(BaseModel):
     html: str
     response_headers: Dict[str, str]
     status_code: int
+    screenshot: Optional[str] = None
+
 class AsyncCrawlerStrategy(ABC):
     @abstractmethod
@@ -139,6 +141,45 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
             asyncio.create_task(self.kill_session(sid))
 
 
+    async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000):
+        wait_for = wait_for.strip()
+
+        if wait_for.startswith('js:'):
+            # Explicitly specified JavaScript
+            js_code = wait_for[3:].strip()
+            return await self.csp_compliant_wait(page, js_code, timeout)
+        elif wait_for.startswith('css:'):
+            # Explicitly specified CSS selector
+            css_selector = wait_for[4:].strip()
+            try:
+                await page.wait_for_selector(css_selector, timeout=timeout)
+            except Error as e:
+                if 'Timeout' in str(e):
+                    raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{css_selector}'")
+                else:
+                    raise ValueError(f"Invalid CSS selector: '{css_selector}'")
+        else:
+            # Auto-detect based on content
+            if wait_for.startswith('()') or wait_for.startswith('function'):
+                # It's likely a JavaScript function
+                return await self.csp_compliant_wait(page, wait_for, timeout)
+            else:
+                # Assume it's a CSS selector first
+                try:
+                    await page.wait_for_selector(wait_for, timeout=timeout)
+                except Error as e:
+                    if 'Timeout' in str(e):
+                        raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{wait_for}'")
+                    else:
+                        # If it's not a timeout error, it might be an invalid selector
+                        # Let's try to evaluate it as a JavaScript function as a fallback
+                        try:
+                            return await self.csp_compliant_wait(page, f"() => {{{wait_for}}}", timeout)
+                        except Error:
+                            raise ValueError(f"Invalid wait_for parameter: '{wait_for}'. "
+                                             "It should be either a valid CSS selector, a JavaScript function, "
+                                             "or explicitly prefixed with 'js:' or 'css:'.")
+
     async def csp_compliant_wait(self, page: Page, user_wait_function: str, timeout: float = 30000):
         wrapper_js = f"""
         async () => {{
@@ -250,19 +291,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
             wait_for = kwargs.get("wait_for")
             if wait_for:
                 try:
-                    await self.csp_compliant_wait(page, wait_for, timeout=kwargs.get("timeout", 30000))
+                    await self.smart_wait(page, wait_for, timeout=kwargs.get("timeout", 30000))
                 except Exception as e:
-                    raise RuntimeError(f"Custom wait condition failed: {str(e)}")
-                # try:
-                #     await page.wait_for_function(wait_for)
-                # # if callable(wait_for):
-                # #     await page.wait_for_function(wait_for)
-                # # elif isinstance(wait_for, str):
-                # #     await page.wait_for_selector(wait_for)
-                # # else:
-                # #     raise ValueError("wait_for must be either a callable or a CSS selector string")
-                # except Error as e:
-                #     raise Error(f"Custom wait condition failed: {str(e)}")
+                    raise RuntimeError(f"Wait condition failed: {str(e)}")
 
             html = await page.content()
             page = await self.execute_hook('before_return_html', page, html)
```
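A short sketch of the three `wait_for` forms `smart_wait` accepts, as implemented in the hunk above; the URL and selectors are illustrative placeholders:

```python
import asyncio

from crawl4ai import AsyncWebCrawler

async def demo():
    async with AsyncWebCrawler(verbose=True) as crawler:
        # 'css:' prefix: wait for the selector to appear.
        await crawler.arun("https://example.com", wait_for="css:#content")

        # 'js:' prefix: wait until the JavaScript function returns true.
        await crawler.arun(
            "https://example.com",
            wait_for="js:() => document.readyState === 'complete'",
        )

        # No prefix: auto-detected; tried as a CSS selector first, then as
        # a JavaScript function, before raising ValueError.
        await crawler.arun("https://example.com", wait_for="#content")

asyncio.run(demo())
```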
crawl4ai/async_webcrawler.py

```diff
@@ -80,7 +80,7 @@ class AsyncWebCrawler:
 
         word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD)
 
-        async_response : AsyncCrawlResponse = None
+        async_response: AsyncCrawlResponse = None
         cached = None
         screenshot_data = None
         extracted_content = None
@@ -102,15 +102,14 @@ class AsyncWebCrawler:
             t1 = time.time()
             if user_agent:
                 self.crawler_strategy.update_user_agent(user_agent)
-            async_response : AsyncCrawlResponse = await self.crawler_strategy.crawl(url, **kwargs)
+            async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(url, screenshot=screenshot, **kwargs)
             html = sanitize_input_encode(async_response.html)
+            screenshot_data = async_response.screenshot
             t2 = time.time()
             if verbose:
                 print(
                     f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds"
                 )
-            if screenshot:
-                screenshot_data = await self.crawler_strategy.take_screenshot(url)
 
             crawl_result = await self.aprocess_html(
                 url,
@@ -127,7 +126,7 @@ class AsyncWebCrawler:
             **kwargs,
         )
         crawl_result.status_code = async_response.status_code if async_response else 200
-        crawl_result.responser_headers = async_response.response_headers if async_response else {}
+        crawl_result.response_headers = async_response.response_headers if async_response else {}
         crawl_result.success = bool(html)
         crawl_result.session_id = kwargs.get("session_id", None)
         return crawl_result
```
crawl4ai/models.py

```diff
@@ -18,5 +18,5 @@ class CrawlResult(BaseModel):
     metadata: Optional[dict] = None
     error_message: Optional[str] = None
     session_id: Optional[str] = None
-    responser_headers: Optional[dict] = None
+    response_headers: Optional[dict] = None
     status_code: Optional[int] = None
```
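For consumers of `CrawlResult`, the rename is a one-line change; a hedged sketch, where `result` stands for any value returned by `AsyncWebCrawler.arun`:

```python
# Before these commits the field was misspelled:
#   headers = result.responser_headers
# From v0.3.5 on, the corrected name applies:
headers = result.response_headers or {}
print(result.status_code, headers.get("content-type"))
```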
docs/examples/async_webcrawler_multiple_urls_example.py (new file, 48 lines)

```diff
@@ -0,0 +1,48 @@
+# File: async_webcrawler_multiple_urls_example.py
+import os, sys
+# append 2 parent directories to sys.path to import crawl4ai
+parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.append(parent_dir)
+
+import asyncio
+from crawl4ai import AsyncWebCrawler
+
+async def main():
+    # Initialize the AsyncWebCrawler
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        # List of URLs to crawl
+        urls = [
+            "https://example.com",
+            "https://python.org",
+            "https://github.com",
+            "https://stackoverflow.com",
+            "https://news.ycombinator.com"
+        ]
+
+        # Set up crawling parameters
+        word_count_threshold = 100
+
+        # Run the crawling process for multiple URLs
+        results = await crawler.arun_many(
+            urls=urls,
+            word_count_threshold=word_count_threshold,
+            bypass_cache=True,
+            verbose=True
+        )
+
+        # Process the results
+        for result in results:
+            if result.success:
+                print(f"Successfully crawled: {result.url}")
+                print(f"Title: {result.metadata.get('title', 'N/A')}")
+                print(f"Word count: {len(result.markdown.split())}")
+                print(f"Number of links: {len(result.links.get('internal', [])) + len(result.links.get('external', []))}")
+                print(f"Number of images: {len(result.media.get('images', []))}")
+                print("---")
+            else:
+                print(f"Failed to crawl: {result.url}")
+                print(f"Error: {result.error_message}")
+                print("---")
+
+if __name__ == "__main__":
+    asyncio.run(main())
```
docs/examples/language_support_example.py (new file, 45 lines)

```diff
@@ -0,0 +1,45 @@
+import asyncio
+from crawl4ai import AsyncWebCrawler, AsyncPlaywrightCrawlerStrategy
+
+async def main():
+    # Example 1: Setting language when creating the crawler
+    crawler1 = AsyncWebCrawler(
+        crawler_strategy=AsyncPlaywrightCrawlerStrategy(
+            headers={"Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7"}
+        )
+    )
+    result1 = await crawler1.arun("https://www.example.com")
+    print("Example 1 result:", result1.extracted_content[:100])  # Print first 100 characters
+
+    # Example 2: Setting language before crawling
+    crawler2 = AsyncWebCrawler()
+    crawler2.crawler_strategy.headers["Accept-Language"] = "es-ES,es;q=0.9,en-US;q=0.8,en;q=0.7"
+    result2 = await crawler2.arun("https://www.example.com")
+    print("Example 2 result:", result2.extracted_content[:100])
+
+    # Example 3: Setting language when calling arun method
+    crawler3 = AsyncWebCrawler()
+    result3 = await crawler3.arun(
+        "https://www.example.com",
+        headers={"Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7"}
+    )
+    print("Example 3 result:", result3.extracted_content[:100])
+
+    # Example 4: Crawling multiple pages with different languages
+    urls = [
+        ("https://www.example.com", "fr-FR,fr;q=0.9"),
+        ("https://www.example.org", "es-ES,es;q=0.9"),
+        ("https://www.example.net", "de-DE,de;q=0.9"),
+    ]
+
+    crawler4 = AsyncWebCrawler()
+    results = await asyncio.gather(*[
+        crawler4.arun(url, headers={"Accept-Language": lang})
+        for url, lang in urls
+    ])
+
+    for url, result in zip([u for u, _ in urls], results):
+        print(f"Result for {url}:", result.extracted_content[:100])
+
+if __name__ == "__main__":
+    asyncio.run(main())
```
```diff
@@ -5,7 +5,7 @@ import asyncio
 import time
 
 # Add the parent directory to the Python path
-parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 sys.path.append(parent_dir)
 
 from crawl4ai.async_webcrawler import AsyncWebCrawler
```
tests/async/test_screenshot.py (new file, 124 lines)

```diff
@@ -0,0 +1,124 @@
+import os
+import sys
+import pytest
+import asyncio
+import base64
+from PIL import Image
+import io
+
+# Add the parent directory to the Python path
+parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(parent_dir)
+
+from crawl4ai.async_webcrawler import AsyncWebCrawler
+
+@pytest.mark.asyncio
+async def test_basic_screenshot():
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        url = "https://example.com"  # A static website
+        result = await crawler.arun(url=url, bypass_cache=True, screenshot=True)
+
+        assert result.success
+        assert result.screenshot is not None
+
+        # Verify the screenshot is a valid image
+        image_data = base64.b64decode(result.screenshot)
+        image = Image.open(io.BytesIO(image_data))
+        assert image.format == "PNG"
+
+@pytest.mark.asyncio
+async def test_screenshot_with_wait_for():
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        # Using a website with dynamic content
+        url = "https://www.youtube.com"
+        wait_for = "css:#content"  # Wait for the main content to load
+
+        result = await crawler.arun(
+            url=url,
+            bypass_cache=True,
+            screenshot=True,
+            wait_for=wait_for
+        )
+
+        assert result.success
+        assert result.screenshot is not None
+
+        # Verify the screenshot is a valid image
+        image_data = base64.b64decode(result.screenshot)
+        image = Image.open(io.BytesIO(image_data))
+        assert image.format == "PNG"
+
+        # You might want to add more specific checks here, like image dimensions
+        # or even use image recognition to verify certain elements are present
+
+@pytest.mark.asyncio
+async def test_screenshot_with_js_wait_for():
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        url = "https://www.amazon.com"
+        wait_for = "js:() => document.querySelector('#nav-logo-sprites') !== null"
+
+        result = await crawler.arun(
+            url=url,
+            bypass_cache=True,
+            screenshot=True,
+            wait_for=wait_for
+        )
+
+        assert result.success
+        assert result.screenshot is not None
+
+        image_data = base64.b64decode(result.screenshot)
+        image = Image.open(io.BytesIO(image_data))
+        assert image.format == "PNG"
+
+@pytest.mark.asyncio
+async def test_screenshot_without_wait_for():
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        url = "https://www.nytimes.com"  # A website with lots of dynamic content
+
+        result = await crawler.arun(url=url, bypass_cache=True, screenshot=True)
+
+        assert result.success
+        assert result.screenshot is not None
+
+        image_data = base64.b64decode(result.screenshot)
+        image = Image.open(io.BytesIO(image_data))
+        assert image.format == "PNG"
+
+@pytest.mark.asyncio
+async def test_screenshot_comparison():
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        url = "https://www.reddit.com"
+        wait_for = "css:#SHORTCUT_FOCUSABLE_DIV"
+
+        # Take screenshot without wait_for
+        result_without_wait = await crawler.arun(
+            url=url,
+            bypass_cache=True,
+            screenshot=True
+        )
+
+        # Take screenshot with wait_for
+        result_with_wait = await crawler.arun(
+            url=url,
+            bypass_cache=True,
+            screenshot=True,
+            wait_for=wait_for
+        )
+
+        assert result_without_wait.success and result_with_wait.success
+        assert result_without_wait.screenshot is not None
+        assert result_with_wait.screenshot is not None
+
+        # Compare the two screenshots
+        image_without_wait = Image.open(io.BytesIO(base64.b64decode(result_without_wait.screenshot)))
+        image_with_wait = Image.open(io.BytesIO(base64.b64decode(result_with_wait.screenshot)))
+
+        # This is a simple size comparison. In a real-world scenario, you might want to use
+        # more sophisticated image comparison techniques.
+        assert image_with_wait.size[0] >= image_without_wait.size[0]
+        assert image_with_wait.size[1] >= image_without_wait.size[1]
+
+# Entry point for debugging
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
```