Compare commits

3 Commits

| Author | SHA1 | Date |
|---|---|---|
|  | 5b84ac9186 |  |
|  | 7ea5603576 |  |
|  | 4750810a67 |  |
.gitignore (vendored)

```diff
@@ -196,4 +196,8 @@ docs/.DS_Store
 tmp/
 test_env/
 **/.DS_Store
 **/.DS_Store
+
+todo.md
+git_changes.py
+git_changes.md
```
CHANGELOG.md

```diff
@@ -1,5 +1,14 @@
 # Changelog
 
+## [v0.3.5] - 2024-09-02
+
+Enhance AsyncWebCrawler with smart waiting and screenshot capabilities
+
+- Implement smart_wait function in AsyncPlaywrightCrawlerStrategy
+- Add screenshot support to AsyncCrawlResponse and AsyncWebCrawler
+- Improve error handling and timeout management in crawling process
+- Fix typo in CrawlResult model (responser_headers -> response_headers)
+
 ## [v0.2.77] - 2024-08-04
 
 Significant improvements in text processing and performance:
```
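The changelog entry above summarizes the new surface; as a quick orientation, here is a minimal usage sketch of the screenshot flow these commits enable. It is not part of the changeset, and the URL and output path are placeholders:

```python
import asyncio
import base64

from crawl4ai import AsyncWebCrawler

async def demo():
    async with AsyncWebCrawler(verbose=True) as crawler:
        # screenshot=True is forwarded into the crawl itself (see the
        # async_webcrawler.py hunks below); the result then carries a
        # base64-encoded PNG in result.screenshot.
        result = await crawler.arun("https://example.com", screenshot=True)
        if result.success and result.screenshot:
            with open("page.png", "wb") as f:
                f.write(base64.b64decode(result.screenshot))

asyncio.run(demo())
```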
crawl4ai/__init__.py

```diff
@@ -3,7 +3,7 @@
 from .async_webcrawler import AsyncWebCrawler
 from .models import CrawlResult
 
-__version__ = "0.3.4"
+__version__ = "0.3.5"
 
 __all__ = [
     "AsyncWebCrawler",
```
crawl4ai/async_crawler_strategy.py

```diff
@@ -12,10 +12,12 @@ import hashlib
 from pathlib import Path
 from playwright.async_api import ProxySettings
 from pydantic import BaseModel
 
 class AsyncCrawlResponse(BaseModel):
     html: str
     response_headers: Dict[str, str]
     status_code: int
+    screenshot: Optional[str] = None
+
 class AsyncCrawlerStrategy(ABC):
     @abstractmethod
@@ -139,6 +141,45 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
             asyncio.create_task(self.kill_session(sid))
 
 
+    async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000):
+        wait_for = wait_for.strip()
+
+        if wait_for.startswith('js:'):
+            # Explicitly specified JavaScript
+            js_code = wait_for[3:].strip()
+            return await self.csp_compliant_wait(page, js_code, timeout)
+        elif wait_for.startswith('css:'):
+            # Explicitly specified CSS selector
+            css_selector = wait_for[4:].strip()
+            try:
+                await page.wait_for_selector(css_selector, timeout=timeout)
+            except Error as e:
+                if 'Timeout' in str(e):
+                    raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{css_selector}'")
+                else:
+                    raise ValueError(f"Invalid CSS selector: '{css_selector}'")
+        else:
+            # Auto-detect based on content
+            if wait_for.startswith('()') or wait_for.startswith('function'):
+                # It's likely a JavaScript function
+                return await self.csp_compliant_wait(page, wait_for, timeout)
+            else:
+                # Assume it's a CSS selector first
+                try:
+                    await page.wait_for_selector(wait_for, timeout=timeout)
+                except Error as e:
+                    if 'Timeout' in str(e):
+                        raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{wait_for}'")
+                    else:
+                        # If it's not a timeout error, it might be an invalid selector
+                        # Let's try to evaluate it as a JavaScript function as a fallback
+                        try:
+                            return await self.csp_compliant_wait(page, f"() => {{{wait_for}}}", timeout)
+                        except Error:
+                            raise ValueError(f"Invalid wait_for parameter: '{wait_for}'. "
+                                             "It should be either a valid CSS selector, a JavaScript function, "
+                                             "or explicitly prefixed with 'js:' or 'css:'.")
+
     async def csp_compliant_wait(self, page: Page, user_wait_function: str, timeout: float = 30000):
         wrapper_js = f"""
         async () => {{
@@ -250,19 +291,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
             wait_for = kwargs.get("wait_for")
             if wait_for:
                 try:
-                    await self.csp_compliant_wait(page, wait_for, timeout=kwargs.get("timeout", 30000))
+                    await self.smart_wait(page, wait_for, timeout=kwargs.get("timeout", 30000))
                 except Exception as e:
-                    raise RuntimeError(f"Custom wait condition failed: {str(e)}")
-                # try:
-                #     await page.wait_for_function(wait_for)
-                # # if callable(wait_for):
-                # #     await page.wait_for_function(wait_for)
-                # # elif isinstance(wait_for, str):
-                # #     await page.wait_for_selector(wait_for)
-                # # else:
-                # #     raise ValueError("wait_for must be either a callable or a CSS selector string")
-                # except Error as e:
-                #     raise Error(f"Custom wait condition failed: {str(e)}")
+                    raise RuntimeError(f"Wait condition failed: {str(e)}")
 
             html = await page.content()
             page = await self.execute_hook('before_return_html', page, html)
```
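A short sketch of the three `wait_for` forms `smart_wait` accepts, as implemented in the hunk above; the URL and selectors are illustrative placeholders:

```python
import asyncio

from crawl4ai import AsyncWebCrawler

async def demo():
    async with AsyncWebCrawler(verbose=True) as crawler:
        # 'css:' prefix: wait for the selector to appear.
        await crawler.arun("https://example.com", wait_for="css:#content")

        # 'js:' prefix: wait until the JavaScript function returns true.
        await crawler.arun(
            "https://example.com",
            wait_for="js:() => document.readyState === 'complete'",
        )

        # No prefix: auto-detected; tried as a CSS selector first, then as
        # a JavaScript function, before raising ValueError.
        await crawler.arun("https://example.com", wait_for="#content")

asyncio.run(demo())
```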
crawl4ai/async_webcrawler.py

```diff
@@ -80,7 +80,7 @@ class AsyncWebCrawler:
 
         word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD)
 
-        async_response : AsyncCrawlResponse = None
+        async_response: AsyncCrawlResponse = None
         cached = None
         screenshot_data = None
         extracted_content = None
@@ -102,15 +102,14 @@ class AsyncWebCrawler:
             t1 = time.time()
             if user_agent:
                 self.crawler_strategy.update_user_agent(user_agent)
-            async_response : AsyncCrawlResponse = await self.crawler_strategy.crawl(url, **kwargs)
+            async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(url, screenshot=screenshot, **kwargs)
             html = sanitize_input_encode(async_response.html)
+            screenshot_data = async_response.screenshot
             t2 = time.time()
             if verbose:
                 print(
                     f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds"
                 )
-            if screenshot:
-                screenshot_data = await self.crawler_strategy.take_screenshot(url)
 
             crawl_result = await self.aprocess_html(
                 url,
@@ -127,7 +126,7 @@ class AsyncWebCrawler:
             **kwargs,
         )
         crawl_result.status_code = async_response.status_code if async_response else 200
-        crawl_result.responser_headers = async_response.response_headers if async_response else {}
+        crawl_result.response_headers = async_response.response_headers if async_response else {}
         crawl_result.success = bool(html)
         crawl_result.session_id = kwargs.get("session_id", None)
         return crawl_result
```
crawl4ai/models.py

```diff
@@ -18,5 +18,5 @@ class CrawlResult(BaseModel):
     metadata: Optional[dict] = None
     error_message: Optional[str] = None
     session_id: Optional[str] = None
-    responser_headers: Optional[dict] = None
+    response_headers: Optional[dict] = None
     status_code: Optional[int] = None
```
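For consumers of `CrawlResult`, the rename is a one-line change; a hedged sketch, where `result` stands for any value returned by `AsyncWebCrawler.arun`:

```python
# Before these commits the field was misspelled:
#   headers = result.responser_headers
# From v0.3.5 on, the corrected name applies:
headers = result.response_headers or {}
print(result.status_code, headers.get("content-type"))
```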
docs/examples/async_webcrawler_multiple_urls_example.py (new file, 48 lines)

```diff
@@ -0,0 +1,48 @@
+# File: async_webcrawler_multiple_urls_example.py
+import os, sys
+# append 2 parent directories to sys.path to import crawl4ai
+parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.append(parent_dir)
+
+import asyncio
+from crawl4ai import AsyncWebCrawler
+
+async def main():
+    # Initialize the AsyncWebCrawler
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        # List of URLs to crawl
+        urls = [
+            "https://example.com",
+            "https://python.org",
+            "https://github.com",
+            "https://stackoverflow.com",
+            "https://news.ycombinator.com"
+        ]
+
+        # Set up crawling parameters
+        word_count_threshold = 100
+
+        # Run the crawling process for multiple URLs
+        results = await crawler.arun_many(
+            urls=urls,
+            word_count_threshold=word_count_threshold,
+            bypass_cache=True,
+            verbose=True
+        )
+
+        # Process the results
+        for result in results:
+            if result.success:
+                print(f"Successfully crawled: {result.url}")
+                print(f"Title: {result.metadata.get('title', 'N/A')}")
+                print(f"Word count: {len(result.markdown.split())}")
+                print(f"Number of links: {len(result.links.get('internal', [])) + len(result.links.get('external', []))}")
+                print(f"Number of images: {len(result.media.get('images', []))}")
+                print("---")
+            else:
+                print(f"Failed to crawl: {result.url}")
+                print(f"Error: {result.error_message}")
+                print("---")
+
+if __name__ == "__main__":
+    asyncio.run(main())
```
docs/examples/language_support_example.py (new file, 45 lines)

```diff
@@ -0,0 +1,45 @@
+import asyncio
+from crawl4ai import AsyncWebCrawler, AsyncPlaywrightCrawlerStrategy
+
+async def main():
+    # Example 1: Setting language when creating the crawler
+    crawler1 = AsyncWebCrawler(
+        crawler_strategy=AsyncPlaywrightCrawlerStrategy(
+            headers={"Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7"}
+        )
+    )
+    result1 = await crawler1.arun("https://www.example.com")
+    print("Example 1 result:", result1.extracted_content[:100])  # Print first 100 characters
+
+    # Example 2: Setting language before crawling
+    crawler2 = AsyncWebCrawler()
+    crawler2.crawler_strategy.headers["Accept-Language"] = "es-ES,es;q=0.9,en-US;q=0.8,en;q=0.7"
+    result2 = await crawler2.arun("https://www.example.com")
+    print("Example 2 result:", result2.extracted_content[:100])
+
+    # Example 3: Setting language when calling arun method
+    crawler3 = AsyncWebCrawler()
+    result3 = await crawler3.arun(
+        "https://www.example.com",
+        headers={"Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7"}
+    )
+    print("Example 3 result:", result3.extracted_content[:100])
+
+    # Example 4: Crawling multiple pages with different languages
+    urls = [
+        ("https://www.example.com", "fr-FR,fr;q=0.9"),
+        ("https://www.example.org", "es-ES,es;q=0.9"),
+        ("https://www.example.net", "de-DE,de;q=0.9"),
+    ]
+
+    crawler4 = AsyncWebCrawler()
+    results = await asyncio.gather(*[
+        crawler4.arun(url, headers={"Accept-Language": lang})
+        for url, lang in urls
+    ])
+
+    for url, result in zip([u for u, _ in urls], results):
+        print(f"Result for {url}:", result.extracted_content[:100])
+
+if __name__ == "__main__":
+    asyncio.run(main())
```
```diff
@@ -5,7 +5,7 @@ import asyncio
 import time
 
 # Add the parent directory to the Python path
-parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 sys.path.append(parent_dir)
 
 from crawl4ai.async_webcrawler import AsyncWebCrawler
```
tests/async/test_screenshot.py (new file, 124 lines)

```diff
@@ -0,0 +1,124 @@
+import os
+import sys
+import pytest
+import asyncio
+import base64
+from PIL import Image
+import io
+
+# Add the parent directory to the Python path
+parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(parent_dir)
+
+from crawl4ai.async_webcrawler import AsyncWebCrawler
+
+@pytest.mark.asyncio
+async def test_basic_screenshot():
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        url = "https://example.com"  # A static website
+        result = await crawler.arun(url=url, bypass_cache=True, screenshot=True)
+
+        assert result.success
+        assert result.screenshot is not None
+
+        # Verify the screenshot is a valid image
+        image_data = base64.b64decode(result.screenshot)
+        image = Image.open(io.BytesIO(image_data))
+        assert image.format == "PNG"
+
+@pytest.mark.asyncio
+async def test_screenshot_with_wait_for():
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        # Using a website with dynamic content
+        url = "https://www.youtube.com"
+        wait_for = "css:#content"  # Wait for the main content to load
+
+        result = await crawler.arun(
+            url=url,
+            bypass_cache=True,
+            screenshot=True,
+            wait_for=wait_for
+        )
+
+        assert result.success
+        assert result.screenshot is not None
+
+        # Verify the screenshot is a valid image
+        image_data = base64.b64decode(result.screenshot)
+        image = Image.open(io.BytesIO(image_data))
+        assert image.format == "PNG"
+
+        # You might want to add more specific checks here, like image dimensions
+        # or even use image recognition to verify certain elements are present
+
+@pytest.mark.asyncio
+async def test_screenshot_with_js_wait_for():
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        url = "https://www.amazon.com"
+        wait_for = "js:() => document.querySelector('#nav-logo-sprites') !== null"
+
+        result = await crawler.arun(
+            url=url,
+            bypass_cache=True,
+            screenshot=True,
+            wait_for=wait_for
+        )
+
+        assert result.success
+        assert result.screenshot is not None
+
+        image_data = base64.b64decode(result.screenshot)
+        image = Image.open(io.BytesIO(image_data))
+        assert image.format == "PNG"
+
+@pytest.mark.asyncio
+async def test_screenshot_without_wait_for():
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        url = "https://www.nytimes.com"  # A website with lots of dynamic content
+
+        result = await crawler.arun(url=url, bypass_cache=True, screenshot=True)
+
+        assert result.success
+        assert result.screenshot is not None
+
+        image_data = base64.b64decode(result.screenshot)
+        image = Image.open(io.BytesIO(image_data))
+        assert image.format == "PNG"
+
+@pytest.mark.asyncio
+async def test_screenshot_comparison():
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        url = "https://www.reddit.com"
+        wait_for = "css:#SHORTCUT_FOCUSABLE_DIV"
+
+        # Take screenshot without wait_for
+        result_without_wait = await crawler.arun(
+            url=url,
+            bypass_cache=True,
+            screenshot=True
+        )
+
+        # Take screenshot with wait_for
+        result_with_wait = await crawler.arun(
+            url=url,
+            bypass_cache=True,
+            screenshot=True,
+            wait_for=wait_for
+        )
+
+        assert result_without_wait.success and result_with_wait.success
+        assert result_without_wait.screenshot is not None
+        assert result_with_wait.screenshot is not None
+
+        # Compare the two screenshots
+        image_without_wait = Image.open(io.BytesIO(base64.b64decode(result_without_wait.screenshot)))
+        image_with_wait = Image.open(io.BytesIO(base64.b64decode(result_with_wait.screenshot)))
+
+        # This is a simple size comparison. In a real-world scenario, you might want to use
+        # more sophisticated image comparison techniques.
+        assert image_with_wait.size[0] >= image_without_wait.size[0]
+        assert image_with_wait.size[1] >= image_without_wait.size[1]
+
+# Entry point for debugging
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
```