Compare commits
5 Commits
unclecode/
...
0.3.7
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b309bc34e1 | ||
|
|
b8147b64e0 | ||
|
|
aab6ea022e | ||
|
|
dd17ed0e63 | ||
|
|
768aa06ceb |
2
.gitignore
vendored
2
.gitignore
vendored
@@ -202,5 +202,7 @@ todo.md
|
|||||||
git_changes.py
|
git_changes.py
|
||||||
git_changes.md
|
git_changes.md
|
||||||
pypi_build.sh
|
pypi_build.sh
|
||||||
|
git_issues.py
|
||||||
|
git_issues.md
|
||||||
|
|
||||||
.tests/
|
.tests/
|
||||||
74
CHANGELOG.md
74
CHANGELOG.md
@@ -1,5 +1,79 @@
|
|||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
|
## [v0.3.71] - 2024-10-18
|
||||||
|
|
||||||
|
### Changes
|
||||||
|
1. **Version Update**:
|
||||||
|
- Updated version number from 0.3.7 to 0.3.71.
|
||||||
|
|
||||||
|
2. **Crawler Enhancements**:
|
||||||
|
- Added `sleep_on_close` option to AsyncPlaywrightCrawlerStrategy for delayed browser closure.
|
||||||
|
- Improved context creation with additional options:
|
||||||
|
- Enabled `accept_downloads` and `java_script_enabled`.
|
||||||
|
- Added a cookie to enable cookies by default.
|
||||||
|
|
||||||
|
3. **Error Handling Improvements**:
|
||||||
|
- Enhanced error messages in AsyncWebCrawler's `arun` method.
|
||||||
|
- Updated error reporting format for better visibility and consistency.
|
||||||
|
|
||||||
|
4. **Performance Optimization**:
|
||||||
|
- Commented out automatic page and context closure in the `crawl` method to potentially improve performance in certain scenarios.
|
||||||
|
|
||||||
|
### Documentation
|
||||||
|
- Updated quickstart notebook:
|
||||||
|
- Changed installation command to use the released package instead of GitHub repository.
|
||||||
|
- Updated kernel display name.
|
||||||
|
|
||||||
|
### Developer Notes
|
||||||
|
- Minor code refactoring and cleanup.
|
||||||
|
|
||||||
|
## [v0.3.7] - 2024-10-17
|
||||||
|
|
||||||
|
### New Features
|
||||||
|
1. **Enhanced Browser Stealth**:
|
||||||
|
- Implemented `playwright_stealth` for improved bot detection avoidance.
|
||||||
|
- Added `StealthConfig` for fine-tuned control over stealth parameters.
|
||||||
|
|
||||||
|
2. **User Simulation**:
|
||||||
|
- New `simulate_user` option to mimic human-like interactions (mouse movements, clicks, keyboard presses).
|
||||||
|
|
||||||
|
3. **Navigator Override**:
|
||||||
|
- Added `override_navigator` option to modify navigator properties, further improving bot detection evasion.
|
||||||
|
|
||||||
|
4. **Improved iframe Handling**:
|
||||||
|
- New `process_iframes` parameter to extract and integrate iframe content into the main page.
|
||||||
|
|
||||||
|
5. **Flexible Browser Selection**:
|
||||||
|
- Support for choosing between Chromium, Firefox, and WebKit browsers.
|
||||||
|
|
||||||
|
6. **Include Links in Markdown**:
|
||||||
|
- Added support for including links in Markdown content by defining a new flag, `include_links_on_markdown`, in the `crawl` method.
|
||||||
|
|
||||||
|
### Improvements
|
||||||
|
1. **Better Error Handling**:
|
||||||
|
- Enhanced error reporting in WebScrappingStrategy with detailed error messages and suggestions.
|
||||||
|
- Added console message and error logging for better debugging.
|
||||||
|
|
||||||
|
2. **Image Processing Enhancements**:
|
||||||
|
- Improved image dimension updating and filtering logic.
|
||||||
|
|
||||||
|
3. **Crawling Flexibility**:
|
||||||
|
- Added support for custom viewport sizes.
|
||||||
|
- Implemented delayed content retrieval with `delay_before_return_html` parameter.
|
||||||
|
|
||||||
|
4. **Performance Optimization**:
|
||||||
|
- Adjusted default semaphore count for parallel crawling.
|
||||||
|
|
||||||
|
### Bug Fixes
|
||||||
|
- Fixed an issue where the HTML content could be empty after processing.
|
||||||
|
|
||||||
|
### Examples
|
||||||
|
- Added new example `crawl_with_user_simulation()` demonstrating the use of user simulation and navigator override features.
|
||||||
|
|
||||||
|
### Developer Notes
|
||||||
|
- Refactored code for better maintainability and readability.
|
||||||
|
- Updated browser launch arguments for improved compatibility and performance.
|
||||||
|
|
||||||
## [v0.3.6] - 2024-10-12
|
## [v0.3.6] - 2024-10-12
|
||||||
|
|
||||||
### 1. Improved Crawling Control
|
### 1. Improved Crawling Control
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
from .async_webcrawler import AsyncWebCrawler
|
from .async_webcrawler import AsyncWebCrawler
|
||||||
from .models import CrawlResult
|
from .models import CrawlResult
|
||||||
|
|
||||||
__version__ = "0.3.6"
|
__version__ = "0.3.71"
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"AsyncWebCrawler",
|
"AsyncWebCrawler",
|
||||||
|
|||||||
558
crawl4ai/async_crawler_strategy copy.py
Normal file
558
crawl4ai/async_crawler_strategy copy.py
Normal file
@@ -0,0 +1,558 @@
|
|||||||
|
import asyncio
|
||||||
|
import base64
|
||||||
|
import time
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from typing import Callable, Dict, Any, List, Optional, Awaitable
|
||||||
|
import os
|
||||||
|
from playwright.async_api import async_playwright, Page, Browser, Error
|
||||||
|
from io import BytesIO
|
||||||
|
from PIL import Image, ImageDraw, ImageFont
|
||||||
|
from pathlib import Path
|
||||||
|
from playwright.async_api import ProxySettings
|
||||||
|
from pydantic import BaseModel
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
import uuid
|
||||||
|
from playwright_stealth import stealth_async
|
||||||
|
|
||||||
|
class AsyncCrawlResponse(BaseModel):
|
||||||
|
html: str
|
||||||
|
response_headers: Dict[str, str]
|
||||||
|
status_code: int
|
||||||
|
screenshot: Optional[str] = None
|
||||||
|
get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None
|
||||||
|
|
||||||
|
class Config:
|
||||||
|
arbitrary_types_allowed = True
|
||||||
|
|
||||||
|
class AsyncCrawlerStrategy(ABC):
|
||||||
|
@abstractmethod
|
||||||
|
async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]:
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
async def take_screenshot(self, url: str) -> str:
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def update_user_agent(self, user_agent: str):
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def set_hook(self, hook_type: str, hook: Callable):
|
||||||
|
pass
|
||||||
|
|
||||||
|
class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||||
|
def __init__(self, use_cached_html=False, js_code=None, **kwargs):
|
||||||
|
self.use_cached_html = use_cached_html
|
||||||
|
self.user_agent = kwargs.get(
|
||||||
|
"user_agent",
|
||||||
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
||||||
|
"(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
|
||||||
|
)
|
||||||
|
self.proxy = kwargs.get("proxy")
|
||||||
|
self.headless = kwargs.get("headless", True)
|
||||||
|
self.browser_type = kwargs.get("browser_type", "chromium")
|
||||||
|
self.headers = kwargs.get("headers", {})
|
||||||
|
self.sessions = {}
|
||||||
|
self.session_ttl = 1800
|
||||||
|
self.js_code = js_code
|
||||||
|
self.verbose = kwargs.get("verbose", False)
|
||||||
|
self.playwright = None
|
||||||
|
self.browser = None
|
||||||
|
self.hooks = {
|
||||||
|
'on_browser_created': None,
|
||||||
|
'on_user_agent_updated': None,
|
||||||
|
'on_execution_started': None,
|
||||||
|
'before_goto': None,
|
||||||
|
'after_goto': None,
|
||||||
|
'before_return_html': None,
|
||||||
|
'before_retrieve_html': None
|
||||||
|
}
|
||||||
|
|
||||||
|
async def __aenter__(self):
|
||||||
|
await self.start()
|
||||||
|
return self
|
||||||
|
|
||||||
|
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
||||||
|
await self.close()
|
||||||
|
|
||||||
|
async def start(self):
|
||||||
|
if self.playwright is None:
|
||||||
|
self.playwright = await async_playwright().start()
|
||||||
|
if self.browser is None:
|
||||||
|
browser_args = {
|
||||||
|
"headless": self.headless,
|
||||||
|
"args": [
|
||||||
|
"--disable-gpu",
|
||||||
|
"--no-sandbox",
|
||||||
|
"--disable-dev-shm-usage",
|
||||||
|
"--disable-blink-features=AutomationControlled",
|
||||||
|
"--disable-infobars",
|
||||||
|
"--window-position=0,0",
|
||||||
|
"--ignore-certificate-errors",
|
||||||
|
"--ignore-certificate-errors-spki-list",
|
||||||
|
# "--headless=new", # Use the new headless mode
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
# Add proxy settings if a proxy is specified
|
||||||
|
if self.proxy:
|
||||||
|
proxy_settings = ProxySettings(server=self.proxy)
|
||||||
|
browser_args["proxy"] = proxy_settings
|
||||||
|
|
||||||
|
# Select the appropriate browser based on the browser_type
|
||||||
|
if self.browser_type == "firefox":
|
||||||
|
self.browser = await self.playwright.firefox.launch(**browser_args)
|
||||||
|
elif self.browser_type == "webkit":
|
||||||
|
self.browser = await self.playwright.webkit.launch(**browser_args)
|
||||||
|
else:
|
||||||
|
self.browser = await self.playwright.chromium.launch(**browser_args)
|
||||||
|
|
||||||
|
await self.execute_hook('on_browser_created', self.browser)
|
||||||
|
|
||||||
|
async def close(self):
|
||||||
|
if self.browser:
|
||||||
|
await self.browser.close()
|
||||||
|
self.browser = None
|
||||||
|
if self.playwright:
|
||||||
|
await self.playwright.stop()
|
||||||
|
self.playwright = None
|
||||||
|
|
||||||
|
def __del__(self):
|
||||||
|
if self.browser or self.playwright:
|
||||||
|
asyncio.get_event_loop().run_until_complete(self.close())
|
||||||
|
|
||||||
|
def set_hook(self, hook_type: str, hook: Callable):
|
||||||
|
if hook_type in self.hooks:
|
||||||
|
self.hooks[hook_type] = hook
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Invalid hook type: {hook_type}")
|
||||||
|
|
||||||
|
async def execute_hook(self, hook_type: str, *args):
|
||||||
|
hook = self.hooks.get(hook_type)
|
||||||
|
if hook:
|
||||||
|
if asyncio.iscoroutinefunction(hook):
|
||||||
|
return await hook(*args)
|
||||||
|
else:
|
||||||
|
return hook(*args)
|
||||||
|
return args[0] if args else None
|
||||||
|
|
||||||
|
def update_user_agent(self, user_agent: str):
|
||||||
|
self.user_agent = user_agent
|
||||||
|
|
||||||
|
def set_custom_headers(self, headers: Dict[str, str]):
|
||||||
|
self.headers = headers
|
||||||
|
|
||||||
|
async def kill_session(self, session_id: str):
|
||||||
|
if session_id in self.sessions:
|
||||||
|
context, page, _ = self.sessions[session_id]
|
||||||
|
await page.close()
|
||||||
|
await context.close()
|
||||||
|
del self.sessions[session_id]
|
||||||
|
|
||||||
|
def _cleanup_expired_sessions(self):
|
||||||
|
current_time = time.time()
|
||||||
|
expired_sessions = [
|
||||||
|
sid for sid, (_, _, last_used) in self.sessions.items()
|
||||||
|
if current_time - last_used > self.session_ttl
|
||||||
|
]
|
||||||
|
for sid in expired_sessions:
|
||||||
|
asyncio.create_task(self.kill_session(sid))
|
||||||
|
|
||||||
|
async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000):
|
||||||
|
wait_for = wait_for.strip()
|
||||||
|
|
||||||
|
if wait_for.startswith('js:'):
|
||||||
|
# Explicitly specified JavaScript
|
||||||
|
js_code = wait_for[3:].strip()
|
||||||
|
return await self.csp_compliant_wait(page, js_code, timeout)
|
||||||
|
elif wait_for.startswith('css:'):
|
||||||
|
# Explicitly specified CSS selector
|
||||||
|
css_selector = wait_for[4:].strip()
|
||||||
|
try:
|
||||||
|
await page.wait_for_selector(css_selector, timeout=timeout)
|
||||||
|
except Error as e:
|
||||||
|
if 'Timeout' in str(e):
|
||||||
|
raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{css_selector}'")
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Invalid CSS selector: '{css_selector}'")
|
||||||
|
else:
|
||||||
|
# Auto-detect based on content
|
||||||
|
if wait_for.startswith('()') or wait_for.startswith('function'):
|
||||||
|
# It's likely a JavaScript function
|
||||||
|
return await self.csp_compliant_wait(page, wait_for, timeout)
|
||||||
|
else:
|
||||||
|
# Assume it's a CSS selector first
|
||||||
|
try:
|
||||||
|
await page.wait_for_selector(wait_for, timeout=timeout)
|
||||||
|
except Error as e:
|
||||||
|
if 'Timeout' in str(e):
|
||||||
|
raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{wait_for}'")
|
||||||
|
else:
|
||||||
|
# If it's not a timeout error, it might be an invalid selector
|
||||||
|
# Let's try to evaluate it as a JavaScript function as a fallback
|
||||||
|
try:
|
||||||
|
return await self.csp_compliant_wait(page, f"() => {{{wait_for}}}", timeout)
|
||||||
|
except Error:
|
||||||
|
raise ValueError(f"Invalid wait_for parameter: '{wait_for}'. "
|
||||||
|
"It should be either a valid CSS selector, a JavaScript function, "
|
||||||
|
"or explicitly prefixed with 'js:' or 'css:'.")
|
||||||
|
|
||||||
|
async def csp_compliant_wait(self, page: Page, user_wait_function: str, timeout: float = 30000):
|
||||||
|
wrapper_js = f"""
|
||||||
|
async () => {{
|
||||||
|
const userFunction = {user_wait_function};
|
||||||
|
const startTime = Date.now();
|
||||||
|
while (true) {{
|
||||||
|
if (await userFunction()) {{
|
||||||
|
return true;
|
||||||
|
}}
|
||||||
|
if (Date.now() - startTime > {timeout}) {{
|
||||||
|
throw new Error('Timeout waiting for condition');
|
||||||
|
}}
|
||||||
|
await new Promise(resolve => setTimeout(resolve, 100));
|
||||||
|
}}
|
||||||
|
}}
|
||||||
|
"""
|
||||||
|
|
||||||
|
try:
|
||||||
|
await page.evaluate(wrapper_js)
|
||||||
|
except TimeoutError:
|
||||||
|
raise TimeoutError(f"Timeout after {timeout}ms waiting for condition")
|
||||||
|
except Exception as e:
|
||||||
|
raise RuntimeError(f"Error in wait condition: {str(e)}")
|
||||||
|
|
||||||
|
async def process_iframes(self, page):
|
||||||
|
# Find all iframes
|
||||||
|
iframes = await page.query_selector_all('iframe')
|
||||||
|
|
||||||
|
for i, iframe in enumerate(iframes):
|
||||||
|
try:
|
||||||
|
# Add a unique identifier to the iframe
|
||||||
|
await iframe.evaluate(f'(element) => element.id = "iframe-{i}"')
|
||||||
|
|
||||||
|
# Get the frame associated with this iframe
|
||||||
|
frame = await iframe.content_frame()
|
||||||
|
|
||||||
|
if frame:
|
||||||
|
# Wait for the frame to load
|
||||||
|
await frame.wait_for_load_state('load', timeout=30000) # 30 seconds timeout
|
||||||
|
|
||||||
|
# Extract the content of the iframe's body
|
||||||
|
iframe_content = await frame.evaluate('() => document.body.innerHTML')
|
||||||
|
|
||||||
|
# Generate a unique class name for this iframe
|
||||||
|
class_name = f'extracted-iframe-content-{i}'
|
||||||
|
|
||||||
|
# Replace the iframe with a div containing the extracted content
|
||||||
|
_iframe = iframe_content.replace('`', '\\`')
|
||||||
|
await page.evaluate(f"""
|
||||||
|
() => {{
|
||||||
|
const iframe = document.getElementById('iframe-{i}');
|
||||||
|
const div = document.createElement('div');
|
||||||
|
div.innerHTML = `{_iframe}`;
|
||||||
|
div.className = '{class_name}';
|
||||||
|
iframe.replaceWith(div);
|
||||||
|
}}
|
||||||
|
""")
|
||||||
|
else:
|
||||||
|
print(f"Warning: Could not access content frame for iframe {i}")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error processing iframe {i}: {str(e)}")
|
||||||
|
|
||||||
|
# Return the page object
|
||||||
|
return page
|
||||||
|
|
||||||
|
async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
|
||||||
|
response_headers = {}
|
||||||
|
status_code = None
|
||||||
|
|
||||||
|
self._cleanup_expired_sessions()
|
||||||
|
session_id = kwargs.get("session_id")
|
||||||
|
if session_id:
|
||||||
|
context, page, _ = self.sessions.get(session_id, (None, None, None))
|
||||||
|
if not context:
|
||||||
|
context = await self.browser.new_context(
|
||||||
|
user_agent=self.user_agent,
|
||||||
|
viewport={"width": 1920, "height": 1080},
|
||||||
|
proxy={"server": self.proxy} if self.proxy else None
|
||||||
|
)
|
||||||
|
await context.set_extra_http_headers(self.headers)
|
||||||
|
page = await context.new_page()
|
||||||
|
self.sessions[session_id] = (context, page, time.time())
|
||||||
|
else:
|
||||||
|
context = await self.browser.new_context(
|
||||||
|
user_agent=self.user_agent,
|
||||||
|
viewport={"width": 1920, "height": 1080},
|
||||||
|
proxy={"server": self.proxy} if self.proxy else None
|
||||||
|
)
|
||||||
|
await context.set_extra_http_headers(self.headers)
|
||||||
|
|
||||||
|
if kwargs.get("override_navigator", False):
|
||||||
|
# Inject scripts to override navigator properties
|
||||||
|
await context.add_init_script("""
|
||||||
|
// Pass the Permissions Test.
|
||||||
|
const originalQuery = window.navigator.permissions.query;
|
||||||
|
window.navigator.permissions.query = (parameters) => (
|
||||||
|
parameters.name === 'notifications' ?
|
||||||
|
Promise.resolve({ state: Notification.permission }) :
|
||||||
|
originalQuery(parameters)
|
||||||
|
);
|
||||||
|
Object.defineProperty(navigator, 'webdriver', {
|
||||||
|
get: () => undefined
|
||||||
|
});
|
||||||
|
window.navigator.chrome = {
|
||||||
|
runtime: {},
|
||||||
|
// Add other properties if necessary
|
||||||
|
};
|
||||||
|
Object.defineProperty(navigator, 'plugins', {
|
||||||
|
get: () => [1, 2, 3, 4, 5],
|
||||||
|
});
|
||||||
|
Object.defineProperty(navigator, 'languages', {
|
||||||
|
get: () => ['en-US', 'en'],
|
||||||
|
});
|
||||||
|
Object.defineProperty(document, 'hidden', {
|
||||||
|
get: () => false
|
||||||
|
});
|
||||||
|
Object.defineProperty(document, 'visibilityState', {
|
||||||
|
get: () => 'visible'
|
||||||
|
});
|
||||||
|
""")
|
||||||
|
|
||||||
|
page = await context.new_page()
|
||||||
|
|
||||||
|
try:
|
||||||
|
if self.verbose:
|
||||||
|
print(f"[LOG] 🕸️ Crawling {url} using AsyncPlaywrightCrawlerStrategy...")
|
||||||
|
|
||||||
|
if self.use_cached_html:
|
||||||
|
cache_file_path = os.path.join(
|
||||||
|
Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
|
||||||
|
)
|
||||||
|
if os.path.exists(cache_file_path):
|
||||||
|
html = ""
|
||||||
|
with open(cache_file_path, "r") as f:
|
||||||
|
html = f.read()
|
||||||
|
# retrieve response headers and status code from cache
|
||||||
|
with open(cache_file_path + ".meta", "r") as f:
|
||||||
|
meta = json.load(f)
|
||||||
|
response_headers = meta.get("response_headers", {})
|
||||||
|
status_code = meta.get("status_code")
|
||||||
|
response = AsyncCrawlResponse(
|
||||||
|
html=html, response_headers=response_headers, status_code=status_code
|
||||||
|
)
|
||||||
|
return response
|
||||||
|
|
||||||
|
if not kwargs.get("js_only", False):
|
||||||
|
await self.execute_hook('before_goto', page)
|
||||||
|
|
||||||
|
response = await page.goto("about:blank")
|
||||||
|
await stealth_async(page)
|
||||||
|
response = await page.goto(
|
||||||
|
url, wait_until="domcontentloaded", timeout=kwargs.get("page_timeout", 60000)
|
||||||
|
)
|
||||||
|
|
||||||
|
# await stealth_async(page)
|
||||||
|
# response = await page.goto("about:blank")
|
||||||
|
# await stealth_async(page)
|
||||||
|
# await page.evaluate(f"window.location.href = '{url}'")
|
||||||
|
|
||||||
|
await self.execute_hook('after_goto', page)
|
||||||
|
|
||||||
|
# Get status code and headers
|
||||||
|
status_code = response.status
|
||||||
|
response_headers = response.headers
|
||||||
|
else:
|
||||||
|
status_code = 200
|
||||||
|
response_headers = {}
|
||||||
|
|
||||||
|
await page.wait_for_selector('body')
|
||||||
|
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||||
|
|
||||||
|
js_code = kwargs.get("js_code", kwargs.get("js", self.js_code))
|
||||||
|
if js_code:
|
||||||
|
if isinstance(js_code, str):
|
||||||
|
await page.evaluate(js_code)
|
||||||
|
elif isinstance(js_code, list):
|
||||||
|
for js in js_code:
|
||||||
|
await page.evaluate(js)
|
||||||
|
|
||||||
|
await page.wait_for_load_state('networkidle')
|
||||||
|
# Check for on execution event
|
||||||
|
await self.execute_hook('on_execution_started', page)
|
||||||
|
|
||||||
|
if kwargs.get("simulate_user", False):
|
||||||
|
# Simulate user interactions
|
||||||
|
await page.mouse.move(100, 100)
|
||||||
|
await page.mouse.down()
|
||||||
|
await page.mouse.up()
|
||||||
|
await page.keyboard.press('ArrowDown')
|
||||||
|
|
||||||
|
# Handle the wait_for parameter
|
||||||
|
wait_for = kwargs.get("wait_for")
|
||||||
|
if wait_for:
|
||||||
|
try:
|
||||||
|
await self.smart_wait(page, wait_for, timeout=kwargs.get("page_timeout", 60000))
|
||||||
|
except Exception as e:
|
||||||
|
raise RuntimeError(f"Wait condition failed: {str(e)}")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Update image dimensions
|
||||||
|
update_image_dimensions_js = """
|
||||||
|
() => {
|
||||||
|
return new Promise((resolve) => {
|
||||||
|
const filterImage = (img) => {
|
||||||
|
// Filter out images that are too small
|
||||||
|
if (img.width < 100 && img.height < 100) return false;
|
||||||
|
|
||||||
|
// Filter out images that are not visible
|
||||||
|
const rect = img.getBoundingClientRect();
|
||||||
|
if (rect.width === 0 || rect.height === 0) return false;
|
||||||
|
|
||||||
|
// Filter out images with certain class names (e.g., icons, thumbnails)
|
||||||
|
if (img.classList.contains('icon') || img.classList.contains('thumbnail')) return false;
|
||||||
|
|
||||||
|
// Filter out images with certain patterns in their src (e.g., placeholder images)
|
||||||
|
if (img.src.includes('placeholder') || img.src.includes('icon')) return false;
|
||||||
|
|
||||||
|
return true;
|
||||||
|
};
|
||||||
|
|
||||||
|
const images = Array.from(document.querySelectorAll('img')).filter(filterImage);
|
||||||
|
let imagesLeft = images.length;
|
||||||
|
|
||||||
|
if (imagesLeft === 0) {
|
||||||
|
resolve();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const checkImage = (img) => {
|
||||||
|
if (img.complete && img.naturalWidth !== 0) {
|
||||||
|
img.setAttribute('width', img.naturalWidth);
|
||||||
|
img.setAttribute('height', img.naturalHeight);
|
||||||
|
imagesLeft--;
|
||||||
|
if (imagesLeft === 0) resolve();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
images.forEach(img => {
|
||||||
|
checkImage(img);
|
||||||
|
if (!img.complete) {
|
||||||
|
img.onload = () => {
|
||||||
|
checkImage(img);
|
||||||
|
};
|
||||||
|
img.onerror = () => {
|
||||||
|
imagesLeft--;
|
||||||
|
if (imagesLeft === 0) resolve();
|
||||||
|
};
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// Fallback timeout of 5 seconds
|
||||||
|
setTimeout(() => resolve(), 5000);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
await page.evaluate(update_image_dimensions_js)
|
||||||
|
|
||||||
|
# Wait a bit for any onload events to complete
|
||||||
|
await page.wait_for_timeout(100)
|
||||||
|
|
||||||
|
# Process iframes
|
||||||
|
if kwargs.get("process_iframes", False):
|
||||||
|
page = await self.process_iframes(page)
|
||||||
|
|
||||||
|
await self.execute_hook('before_retrieve_html', page)
|
||||||
|
# Check if delay_before_return_html is set then wait for that time
|
||||||
|
delay_before_return_html = kwargs.get("delay_before_return_html")
|
||||||
|
if delay_before_return_html:
|
||||||
|
await asyncio.sleep(delay_before_return_html)
|
||||||
|
|
||||||
|
html = await page.content()
|
||||||
|
await self.execute_hook('before_return_html', page, html)
|
||||||
|
|
||||||
|
# Check if kwargs has screenshot=True then take screenshot
|
||||||
|
screenshot_data = None
|
||||||
|
if kwargs.get("screenshot"):
|
||||||
|
screenshot_data = await self.take_screenshot(url)
|
||||||
|
|
||||||
|
if self.verbose:
|
||||||
|
print(f"[LOG] ✅ Crawled {url} successfully!")
|
||||||
|
|
||||||
|
if self.use_cached_html:
|
||||||
|
cache_file_path = os.path.join(
|
||||||
|
Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
|
||||||
|
)
|
||||||
|
with open(cache_file_path, "w", encoding="utf-8") as f:
|
||||||
|
f.write(html)
|
||||||
|
# store response headers and status code in cache
|
||||||
|
with open(cache_file_path + ".meta", "w", encoding="utf-8") as f:
|
||||||
|
json.dump({
|
||||||
|
"response_headers": response_headers,
|
||||||
|
"status_code": status_code
|
||||||
|
}, f)
|
||||||
|
|
||||||
|
async def get_delayed_content(delay: float = 5.0) -> str:
|
||||||
|
if self.verbose:
|
||||||
|
print(f"[LOG] Waiting for {delay} seconds before retrieving content for {url}")
|
||||||
|
await asyncio.sleep(delay)
|
||||||
|
return await page.content()
|
||||||
|
|
||||||
|
response = AsyncCrawlResponse(
|
||||||
|
html=html,
|
||||||
|
response_headers=response_headers,
|
||||||
|
status_code=status_code,
|
||||||
|
screenshot=screenshot_data,
|
||||||
|
get_delayed_content=get_delayed_content
|
||||||
|
)
|
||||||
|
return response
|
||||||
|
except Error as e:
|
||||||
|
raise Error(f"Failed to crawl {url}: {str(e)}")
|
||||||
|
finally:
|
||||||
|
if not session_id:
|
||||||
|
await page.close()
|
||||||
|
await context.close()
|
||||||
|
|
||||||
|
async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]:
|
||||||
|
semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed
|
||||||
|
semaphore = asyncio.Semaphore(semaphore_count)
|
||||||
|
|
||||||
|
async def crawl_with_semaphore(url):
|
||||||
|
async with semaphore:
|
||||||
|
return await self.crawl(url, **kwargs)
|
||||||
|
|
||||||
|
tasks = [crawl_with_semaphore(url) for url in urls]
|
||||||
|
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||||
|
return [result if not isinstance(result, Exception) else str(result) for result in results]
|
||||||
|
|
||||||
|
async def take_screenshot(self, url: str, wait_time=1000) -> str:
|
||||||
|
async with await self.browser.new_context(user_agent=self.user_agent) as context:
|
||||||
|
page = await context.new_page()
|
||||||
|
try:
|
||||||
|
await page.goto(url, wait_until="domcontentloaded", timeout=30000)
|
||||||
|
# Wait for a specified time (default is 1 second)
|
||||||
|
await page.wait_for_timeout(wait_time)
|
||||||
|
screenshot = await page.screenshot(full_page=True)
|
||||||
|
return base64.b64encode(screenshot).decode('utf-8')
|
||||||
|
except Exception as e:
|
||||||
|
error_message = f"Failed to take screenshot: {str(e)}"
|
||||||
|
print(error_message)
|
||||||
|
|
||||||
|
# Generate an error image
|
||||||
|
img = Image.new('RGB', (800, 600), color='black')
|
||||||
|
draw = ImageDraw.Draw(img)
|
||||||
|
font = ImageFont.load_default()
|
||||||
|
draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)
|
||||||
|
|
||||||
|
buffered = BytesIO()
|
||||||
|
img.save(buffered, format="JPEG")
|
||||||
|
return base64.b64encode(buffered.getvalue()).decode('utf-8')
|
||||||
|
finally:
|
||||||
|
await page.close()
|
||||||
|
|
||||||
@@ -1,17 +1,35 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
import base64, time
|
import base64
|
||||||
|
import time
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import Callable, Dict, Any, List, Optional, Awaitable
|
from typing import Callable, Dict, Any, List, Optional, Awaitable
|
||||||
import os
|
import os
|
||||||
from playwright.async_api import async_playwright, Page, Browser, Error
|
from playwright.async_api import async_playwright, Page, Browser, Error
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from PIL import Image, ImageDraw, ImageFont
|
from PIL import Image, ImageDraw, ImageFont
|
||||||
from .utils import sanitize_input_encode, calculate_semaphore_count
|
|
||||||
import json, uuid
|
|
||||||
import hashlib
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from playwright.async_api import ProxySettings
|
from playwright.async_api import ProxySettings
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
import uuid
|
||||||
|
from playwright_stealth import StealthConfig, stealth_async
|
||||||
|
|
||||||
|
stealth_config = StealthConfig(
|
||||||
|
webdriver=True,
|
||||||
|
chrome_app=True,
|
||||||
|
chrome_csi=True,
|
||||||
|
chrome_load_times=True,
|
||||||
|
chrome_runtime=True,
|
||||||
|
navigator_languages=True,
|
||||||
|
navigator_plugins=True,
|
||||||
|
navigator_permissions=True,
|
||||||
|
webgl_vendor=True,
|
||||||
|
outerdimensions=True,
|
||||||
|
navigator_hardware_concurrency=True,
|
||||||
|
media_codecs=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class AsyncCrawlResponse(BaseModel):
|
class AsyncCrawlResponse(BaseModel):
|
||||||
html: str
|
html: str
|
||||||
@@ -47,10 +65,14 @@ class AsyncCrawlerStrategy(ABC):
|
|||||||
class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||||
def __init__(self, use_cached_html=False, js_code=None, **kwargs):
|
def __init__(self, use_cached_html=False, js_code=None, **kwargs):
|
||||||
self.use_cached_html = use_cached_html
|
self.use_cached_html = use_cached_html
|
||||||
self.user_agent = kwargs.get("user_agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
|
self.user_agent = kwargs.get(
|
||||||
|
"user_agent",
|
||||||
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
||||||
|
"(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
|
||||||
|
)
|
||||||
self.proxy = kwargs.get("proxy")
|
self.proxy = kwargs.get("proxy")
|
||||||
self.headless = kwargs.get("headless", True)
|
self.headless = kwargs.get("headless", True)
|
||||||
self.browser_type = kwargs.get("browser_type", "chromium") # New parameter
|
self.browser_type = kwargs.get("browser_type", "chromium")
|
||||||
self.headers = kwargs.get("headers", {})
|
self.headers = kwargs.get("headers", {})
|
||||||
self.sessions = {}
|
self.sessions = {}
|
||||||
self.session_ttl = 1800
|
self.session_ttl = 1800
|
||||||
@@ -58,6 +80,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
self.verbose = kwargs.get("verbose", False)
|
self.verbose = kwargs.get("verbose", False)
|
||||||
self.playwright = None
|
self.playwright = None
|
||||||
self.browser = None
|
self.browser = None
|
||||||
|
self.sleep_on_close = kwargs.get("sleep_on_close", False)
|
||||||
self.hooks = {
|
self.hooks = {
|
||||||
'on_browser_created': None,
|
'on_browser_created': None,
|
||||||
'on_user_agent_updated': None,
|
'on_user_agent_updated': None,
|
||||||
@@ -83,9 +106,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
"headless": self.headless,
|
"headless": self.headless,
|
||||||
"args": [
|
"args": [
|
||||||
"--disable-gpu",
|
"--disable-gpu",
|
||||||
"--disable-dev-shm-usage",
|
|
||||||
"--disable-setuid-sandbox",
|
|
||||||
"--no-sandbox",
|
"--no-sandbox",
|
||||||
|
"--disable-dev-shm-usage",
|
||||||
|
"--disable-blink-features=AutomationControlled",
|
||||||
|
"--disable-infobars",
|
||||||
|
"--window-position=0,0",
|
||||||
|
"--ignore-certificate-errors",
|
||||||
|
"--ignore-certificate-errors-spki-list",
|
||||||
|
# "--headless=new", # Use the new headless mode
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -94,7 +122,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
proxy_settings = ProxySettings(server=self.proxy)
|
proxy_settings = ProxySettings(server=self.proxy)
|
||||||
browser_args["proxy"] = proxy_settings
|
browser_args["proxy"] = proxy_settings
|
||||||
|
|
||||||
|
|
||||||
# Select the appropriate browser based on the browser_type
|
# Select the appropriate browser based on the browser_type
|
||||||
if self.browser_type == "firefox":
|
if self.browser_type == "firefox":
|
||||||
self.browser = await self.playwright.firefox.launch(**browser_args)
|
self.browser = await self.playwright.firefox.launch(**browser_args)
|
||||||
@@ -106,6 +133,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
await self.execute_hook('on_browser_created', self.browser)
|
await self.execute_hook('on_browser_created', self.browser)
|
||||||
|
|
||||||
async def close(self):
|
async def close(self):
|
||||||
|
if self.sleep_on_close:
|
||||||
|
await asyncio.sleep(500)
|
||||||
if self.browser:
|
if self.browser:
|
||||||
await self.browser.close()
|
await self.browser.close()
|
||||||
self.browser = None
|
self.browser = None
|
||||||
@@ -147,8 +176,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
|
|
||||||
def _cleanup_expired_sessions(self):
|
def _cleanup_expired_sessions(self):
|
||||||
current_time = time.time()
|
current_time = time.time()
|
||||||
expired_sessions = [sid for sid, (_, _, last_used) in self.sessions.items()
|
expired_sessions = [
|
||||||
if current_time - last_used > self.session_ttl]
|
sid for sid, (_, _, last_used) in self.sessions.items()
|
||||||
|
if current_time - last_used > self.session_ttl
|
||||||
|
]
|
||||||
for sid in expired_sessions:
|
for sid in expired_sessions:
|
||||||
asyncio.create_task(self.kill_session(sid))
|
asyncio.create_task(self.kill_session(sid))
|
||||||
|
|
||||||
@@ -256,7 +287,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
# Return the page object
|
# Return the page object
|
||||||
return page
|
return page
|
||||||
|
|
||||||
|
|
||||||
async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
|
async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
|
||||||
response_headers = {}
|
response_headers = {}
|
||||||
status_code = None
|
status_code = None
|
||||||
@@ -268,25 +298,70 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
if not context:
|
if not context:
|
||||||
context = await self.browser.new_context(
|
context = await self.browser.new_context(
|
||||||
user_agent=self.user_agent,
|
user_agent=self.user_agent,
|
||||||
proxy={"server": self.proxy} if self.proxy else None
|
viewport={"width": 1920, "height": 1080},
|
||||||
|
proxy={"server": self.proxy} if self.proxy else None,
|
||||||
|
accept_downloads=True,
|
||||||
|
java_script_enabled=True
|
||||||
)
|
)
|
||||||
|
await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}])
|
||||||
await context.set_extra_http_headers(self.headers)
|
await context.set_extra_http_headers(self.headers)
|
||||||
page = await context.new_page()
|
page = await context.new_page()
|
||||||
self.sessions[session_id] = (context, page, time.time())
|
self.sessions[session_id] = (context, page, time.time())
|
||||||
else:
|
else:
|
||||||
context = await self.browser.new_context(
|
context = await self.browser.new_context(
|
||||||
user_agent=self.user_agent,
|
user_agent=self.user_agent,
|
||||||
|
viewport={"width": 1920, "height": 1080},
|
||||||
proxy={"server": self.proxy} if self.proxy else None
|
proxy={"server": self.proxy} if self.proxy else None
|
||||||
)
|
)
|
||||||
await context.set_extra_http_headers(self.headers)
|
await context.set_extra_http_headers(self.headers)
|
||||||
|
|
||||||
|
if kwargs.get("override_navigator", False) or kwargs.get("simulate_user", False) or kwargs.get("magic", False):
|
||||||
|
# Inject scripts to override navigator properties
|
||||||
|
await context.add_init_script("""
|
||||||
|
// Pass the Permissions Test.
|
||||||
|
const originalQuery = window.navigator.permissions.query;
|
||||||
|
window.navigator.permissions.query = (parameters) => (
|
||||||
|
parameters.name === 'notifications' ?
|
||||||
|
Promise.resolve({ state: Notification.permission }) :
|
||||||
|
originalQuery(parameters)
|
||||||
|
);
|
||||||
|
Object.defineProperty(navigator, 'webdriver', {
|
||||||
|
get: () => undefined
|
||||||
|
});
|
||||||
|
window.navigator.chrome = {
|
||||||
|
runtime: {},
|
||||||
|
// Add other properties if necessary
|
||||||
|
};
|
||||||
|
Object.defineProperty(navigator, 'plugins', {
|
||||||
|
get: () => [1, 2, 3, 4, 5],
|
||||||
|
});
|
||||||
|
Object.defineProperty(navigator, 'languages', {
|
||||||
|
get: () => ['en-US', 'en'],
|
||||||
|
});
|
||||||
|
Object.defineProperty(document, 'hidden', {
|
||||||
|
get: () => false
|
||||||
|
});
|
||||||
|
Object.defineProperty(document, 'visibilityState', {
|
||||||
|
get: () => 'visible'
|
||||||
|
});
|
||||||
|
""")
|
||||||
|
|
||||||
page = await context.new_page()
|
page = await context.new_page()
|
||||||
|
# await stealth_async(page) #, stealth_config)
|
||||||
|
|
||||||
|
# Add console message and error logging
|
||||||
|
if kwargs.get("log_console", False):
|
||||||
|
page.on("console", lambda msg: print(f"Console: {msg.text}"))
|
||||||
|
page.on("pageerror", lambda exc: print(f"Page Error: {exc}"))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if self.verbose:
|
if self.verbose:
|
||||||
print(f"[LOG] 🕸️ Crawling {url} using AsyncPlaywrightCrawlerStrategy...")
|
print(f"[LOG] 🕸️ Crawling {url} using AsyncPlaywrightCrawlerStrategy...")
|
||||||
|
|
||||||
if self.use_cached_html:
|
if self.use_cached_html:
|
||||||
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest())
|
cache_file_path = os.path.join(
|
||||||
|
Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
|
||||||
|
)
|
||||||
if os.path.exists(cache_file_path):
|
if os.path.exists(cache_file_path):
|
||||||
html = ""
|
html = ""
|
||||||
with open(cache_file_path, "r") as f:
|
with open(cache_file_path, "r") as f:
|
||||||
@@ -296,12 +371,21 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
meta = json.load(f)
|
meta = json.load(f)
|
||||||
response_headers = meta.get("response_headers", {})
|
response_headers = meta.get("response_headers", {})
|
||||||
status_code = meta.get("status_code")
|
status_code = meta.get("status_code")
|
||||||
response = AsyncCrawlResponse(html=html, response_headers=response_headers, status_code=status_code)
|
response = AsyncCrawlResponse(
|
||||||
|
html=html, response_headers=response_headers, status_code=status_code
|
||||||
|
)
|
||||||
return response
|
return response
|
||||||
|
|
||||||
if not kwargs.get("js_only", False):
|
if not kwargs.get("js_only", False):
|
||||||
await self.execute_hook('before_goto', page)
|
await self.execute_hook('before_goto', page)
|
||||||
response = await page.goto(url, wait_until="domcontentloaded", timeout=kwargs.get("page_timeout", 60000))
|
|
||||||
|
response = await page.goto(
|
||||||
|
url, wait_until="domcontentloaded", timeout=kwargs.get("page_timeout", 60000)
|
||||||
|
)
|
||||||
|
|
||||||
|
# response = await page.goto("about:blank")
|
||||||
|
# await page.evaluate(f"window.location.href = '{url}'")
|
||||||
|
|
||||||
await self.execute_hook('after_goto', page)
|
await self.execute_hook('after_goto', page)
|
||||||
|
|
||||||
# Get status code and headers
|
# Get status code and headers
|
||||||
@@ -311,37 +395,29 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
status_code = 200
|
status_code = 200
|
||||||
response_headers = {}
|
response_headers = {}
|
||||||
|
|
||||||
|
|
||||||
await page.wait_for_selector('body')
|
await page.wait_for_selector('body')
|
||||||
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||||
|
|
||||||
js_code = kwargs.get("js_code", kwargs.get("js", self.js_code))
|
js_code = kwargs.get("js_code", kwargs.get("js", self.js_code))
|
||||||
if js_code:
|
if js_code:
|
||||||
if isinstance(js_code, str):
|
if isinstance(js_code, str):
|
||||||
r = await page.evaluate(js_code)
|
await page.evaluate(js_code)
|
||||||
elif isinstance(js_code, list):
|
elif isinstance(js_code, list):
|
||||||
for js in js_code:
|
for js in js_code:
|
||||||
await page.evaluate(js)
|
await page.evaluate(js)
|
||||||
|
|
||||||
# await page.wait_for_timeout(100)
|
|
||||||
await page.wait_for_load_state('networkidle')
|
await page.wait_for_load_state('networkidle')
|
||||||
# Check for on execution even
|
# Check for on execution event
|
||||||
await self.execute_hook('on_execution_started', page)
|
await self.execute_hook('on_execution_started', page)
|
||||||
|
|
||||||
# New code to handle the wait_for parameter
|
if kwargs.get("simulate_user", False) or kwargs.get("magic", False):
|
||||||
# Example usage:
|
# Simulate user interactions
|
||||||
# await crawler.crawl(
|
await page.mouse.move(100, 100)
|
||||||
# url,
|
await page.mouse.down()
|
||||||
# js_code="// some JavaScript code",
|
await page.mouse.up()
|
||||||
# wait_for="""() => {
|
await page.keyboard.press('ArrowDown')
|
||||||
# return document.querySelector('#my-element') !== null;
|
|
||||||
# }"""
|
# Handle the wait_for parameter
|
||||||
# )
|
|
||||||
# Example of using a CSS selector:
|
|
||||||
# await crawler.crawl(
|
|
||||||
# url,
|
|
||||||
# wait_for="#my-element"
|
|
||||||
# )
|
|
||||||
wait_for = kwargs.get("wait_for")
|
wait_for = kwargs.get("wait_for")
|
||||||
if wait_for:
|
if wait_for:
|
||||||
try:
|
try:
|
||||||
@@ -349,13 +425,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise RuntimeError(f"Wait condition failed: {str(e)}")
|
raise RuntimeError(f"Wait condition failed: {str(e)}")
|
||||||
|
|
||||||
# Check if kwargs has screenshot=True then take screenshot
|
# Update image dimensions
|
||||||
screenshot_data = None
|
|
||||||
if kwargs.get("screenshot"):
|
|
||||||
screenshot_data = await self.take_screenshot(url)
|
|
||||||
|
|
||||||
|
|
||||||
# New code to update image dimensions
|
|
||||||
update_image_dimensions_js = """
|
update_image_dimensions_js = """
|
||||||
() => {
|
() => {
|
||||||
return new Promise((resolve) => {
|
return new Promise((resolve) => {
|
||||||
@@ -429,11 +499,18 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
html = await page.content()
|
html = await page.content()
|
||||||
await self.execute_hook('before_return_html', page, html)
|
await self.execute_hook('before_return_html', page, html)
|
||||||
|
|
||||||
|
# Check if kwargs has screenshot=True then take screenshot
|
||||||
|
screenshot_data = None
|
||||||
|
if kwargs.get("screenshot"):
|
||||||
|
screenshot_data = await self.take_screenshot(url)
|
||||||
|
|
||||||
if self.verbose:
|
if self.verbose:
|
||||||
print(f"[LOG] ✅ Crawled {url} successfully!")
|
print(f"[LOG] ✅ Crawled {url} successfully!")
|
||||||
|
|
||||||
if self.use_cached_html:
|
if self.use_cached_html:
|
||||||
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest())
|
cache_file_path = os.path.join(
|
||||||
|
Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
|
||||||
|
)
|
||||||
with open(cache_file_path, "w", encoding="utf-8") as f:
|
with open(cache_file_path, "w", encoding="utf-8") as f:
|
||||||
f.write(html)
|
f.write(html)
|
||||||
# store response headers and status code in cache
|
# store response headers and status code in cache
|
||||||
@@ -443,7 +520,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
"status_code": status_code
|
"status_code": status_code
|
||||||
}, f)
|
}, f)
|
||||||
|
|
||||||
|
|
||||||
async def get_delayed_content(delay: float = 5.0) -> str:
|
async def get_delayed_content(delay: float = 5.0) -> str:
|
||||||
if self.verbose:
|
if self.verbose:
|
||||||
print(f"[LOG] Waiting for {delay} seconds before retrieving content for {url}")
|
print(f"[LOG] Waiting for {delay} seconds before retrieving content for {url}")
|
||||||
@@ -459,63 +535,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
)
|
)
|
||||||
return response
|
return response
|
||||||
except Error as e:
|
except Error as e:
|
||||||
raise Error(f"Failed to crawl {url}: {str(e)}")
|
raise Error(f"[ERROR] 🚫 crawl(): Failed to crawl {url}: {str(e)}")
|
||||||
finally:
|
# finally:
|
||||||
if not session_id:
|
# if not session_id:
|
||||||
await page.close()
|
# await page.close()
|
||||||
|
# await context.close()
|
||||||
# try:
|
|
||||||
# html = await _crawl()
|
|
||||||
# return sanitize_input_encode(html)
|
|
||||||
# except Error as e:
|
|
||||||
# raise Error(f"Failed to crawl {url}: {str(e)}")
|
|
||||||
# except Exception as e:
|
|
||||||
# raise Exception(f"Failed to crawl {url}: {str(e)}")
|
|
||||||
|
|
||||||
async def execute_js(self, session_id: str, js_code: str, wait_for_js: str = None, wait_for_css: str = None) -> AsyncCrawlResponse:
|
|
||||||
"""
|
|
||||||
Execute JavaScript code in a specific session and optionally wait for a condition.
|
|
||||||
|
|
||||||
:param session_id: The ID of the session to execute the JS code in.
|
|
||||||
:param js_code: The JavaScript code to execute.
|
|
||||||
:param wait_for_js: JavaScript condition to wait for after execution.
|
|
||||||
:param wait_for_css: CSS selector to wait for after execution.
|
|
||||||
:return: AsyncCrawlResponse containing the page's HTML and other information.
|
|
||||||
:raises ValueError: If the session does not exist.
|
|
||||||
"""
|
|
||||||
if not session_id:
|
|
||||||
raise ValueError("Session ID must be provided")
|
|
||||||
|
|
||||||
if session_id not in self.sessions:
|
|
||||||
raise ValueError(f"No active session found for session ID: {session_id}")
|
|
||||||
|
|
||||||
context, page, last_used = self.sessions[session_id]
|
|
||||||
|
|
||||||
try:
|
|
||||||
await page.evaluate(js_code)
|
|
||||||
|
|
||||||
if wait_for_js:
|
|
||||||
await page.wait_for_function(wait_for_js)
|
|
||||||
|
|
||||||
if wait_for_css:
|
|
||||||
await page.wait_for_selector(wait_for_css)
|
|
||||||
|
|
||||||
# Get the updated HTML content
|
|
||||||
html = await page.content()
|
|
||||||
|
|
||||||
# Get response headers and status code (assuming these are available)
|
|
||||||
response_headers = await page.evaluate("() => JSON.stringify(performance.getEntriesByType('resource')[0].responseHeaders)")
|
|
||||||
status_code = await page.evaluate("() => performance.getEntriesByType('resource')[0].responseStatus")
|
|
||||||
|
|
||||||
# Update the last used time for this session
|
|
||||||
self.sessions[session_id] = (context, page, time.time())
|
|
||||||
|
|
||||||
return AsyncCrawlResponse(html=html, response_headers=response_headers, status_code=status_code)
|
|
||||||
except Error as e:
|
|
||||||
raise Error(f"Failed to execute JavaScript or wait for condition in session {session_id}: {str(e)}")
|
|
||||||
|
|
||||||
async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]:
|
async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]:
|
||||||
semaphore_count = kwargs.get('semaphore_count', calculate_semaphore_count())
|
semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed
|
||||||
semaphore = asyncio.Semaphore(semaphore_count)
|
semaphore = asyncio.Semaphore(semaphore_count)
|
||||||
|
|
||||||
async def crawl_with_semaphore(url):
|
async def crawl_with_semaphore(url):
|
||||||
@@ -550,3 +577,4 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
return base64.b64encode(buffered.getvalue()).decode('utf-8')
|
return base64.b64encode(buffered.getvalue()).decode('utf-8')
|
||||||
finally:
|
finally:
|
||||||
await page.close()
|
await page.close()
|
||||||
|
|
||||||
|
|||||||
@@ -133,8 +133,8 @@ class AsyncWebCrawler:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
if not hasattr(e, "msg"):
|
if not hasattr(e, "msg"):
|
||||||
e.msg = str(e)
|
e.msg = str(e)
|
||||||
print(f"[ERROR] 🚫 Failed to crawl {url}, error: {e.msg}")
|
print(f"[ERROR] 🚫 arun(): Failed to crawl {url}, error: {e.msg}")
|
||||||
return CrawlResult(url=url, html="", success=False, error_message=e.msg)
|
return CrawlResult(url=url, html="", markdown = f"[ERROR] 🚫 arun(): Failed to crawl {url}, error: {e.msg}", success=False, error_message=e.msg)
|
||||||
|
|
||||||
async def arun_many(
|
async def arun_many(
|
||||||
self,
|
self,
|
||||||
@@ -195,6 +195,7 @@ class AsyncWebCrawler:
|
|||||||
image_description_min_word_threshold=kwargs.get(
|
image_description_min_word_threshold=kwargs.get(
|
||||||
"image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
|
"image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
|
||||||
),
|
),
|
||||||
|
**kwargs,
|
||||||
)
|
)
|
||||||
if verbose:
|
if verbose:
|
||||||
print(
|
print(
|
||||||
|
|||||||
@@ -33,6 +33,7 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
|||||||
return await asyncio.to_thread(self._get_content_of_website_optimized, url, html, **kwargs)
|
return await asyncio.to_thread(self._get_content_of_website_optimized, url, html, **kwargs)
|
||||||
|
|
||||||
def _get_content_of_website_optimized(self, url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, **kwargs) -> Dict[str, Any]:
|
def _get_content_of_website_optimized(self, url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, **kwargs) -> Dict[str, Any]:
|
||||||
|
success = True
|
||||||
if not html:
|
if not html:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@@ -273,10 +274,41 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
|||||||
if base64_pattern.match(src):
|
if base64_pattern.match(src):
|
||||||
# Replace base64 data with empty string
|
# Replace base64 data with empty string
|
||||||
img['src'] = base64_pattern.sub('', src)
|
img['src'] = base64_pattern.sub('', src)
|
||||||
|
|
||||||
|
try:
|
||||||
|
str(body)
|
||||||
|
except Exception as e:
|
||||||
|
# Reset body to the original HTML
|
||||||
|
success = False
|
||||||
|
body = BeautifulSoup(html, 'html.parser')
|
||||||
|
|
||||||
|
# Create a new div with a special ID
|
||||||
|
error_div = body.new_tag('div', id='crawl4ai_error_message')
|
||||||
|
error_div.string = '''
|
||||||
|
Crawl4AI Error: This page is not fully supported.
|
||||||
|
|
||||||
|
Possible reasons:
|
||||||
|
1. The page may have restrictions that prevent crawling.
|
||||||
|
2. The page might not be fully loaded.
|
||||||
|
|
||||||
|
Suggestions:
|
||||||
|
- Try calling the crawl function with these parameters:
|
||||||
|
magic=True,
|
||||||
|
- Set headless=False to visualize what's happening on the page.
|
||||||
|
|
||||||
|
If the issue persists, please check the page's structure and any potential anti-crawling measures.
|
||||||
|
'''
|
||||||
|
|
||||||
|
# Append the error div to the body
|
||||||
|
body.body.append(error_div)
|
||||||
|
|
||||||
|
print(f"[LOG] 😧 Error: After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.")
|
||||||
|
|
||||||
|
|
||||||
cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ')
|
cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ')
|
||||||
|
|
||||||
h = CustomHTML2Text()
|
h = CustomHTML2Text()
|
||||||
h.ignore_links = True
|
h.ignore_links = not kwargs.get('include_links_on_markdown', False)
|
||||||
h.body_width = 0
|
h.body_width = 0
|
||||||
try:
|
try:
|
||||||
markdown = h.handle(cleaned_html)
|
markdown = h.handle(cleaned_html)
|
||||||
@@ -294,7 +326,7 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
|||||||
return {
|
return {
|
||||||
'markdown': markdown,
|
'markdown': markdown,
|
||||||
'cleaned_html': cleaned_html,
|
'cleaned_html': cleaned_html,
|
||||||
'success': True,
|
'success': success,
|
||||||
'media': media,
|
'media': media,
|
||||||
'links': links,
|
'links': links,
|
||||||
'metadata': meta
|
'metadata': meta
|
||||||
|
|||||||
@@ -692,8 +692,8 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
|
|||||||
for img in imgs:
|
for img in imgs:
|
||||||
src = img.get('src', '')
|
src = img.get('src', '')
|
||||||
if base64_pattern.match(src):
|
if base64_pattern.match(src):
|
||||||
# Replace base64 data with empty string
|
|
||||||
img['src'] = base64_pattern.sub('', src)
|
img['src'] = base64_pattern.sub('', src)
|
||||||
|
|
||||||
cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ')
|
cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ')
|
||||||
cleaned_html = sanitize_html(cleaned_html)
|
cleaned_html = sanitize_html(cleaned_html)
|
||||||
|
|
||||||
|
|||||||
@@ -47,8 +47,7 @@
|
|||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# !pip install \"crawl4ai @ git+https://github.com/unclecode/crawl4ai.git\"\n",
|
"!pip install crawl4ai\n",
|
||||||
"!pip install \"crawl4ai @ git+https://github.com/unclecode/crawl4ai.git@staging\"\n",
|
|
||||||
"!pip install nest-asyncio\n",
|
"!pip install nest-asyncio\n",
|
||||||
"!playwright install"
|
"!playwright install"
|
||||||
]
|
]
|
||||||
@@ -714,7 +713,7 @@
|
|||||||
"provenance": []
|
"provenance": []
|
||||||
},
|
},
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
"display_name": "Python 3",
|
"display_name": "venv",
|
||||||
"language": "python",
|
"language": "python",
|
||||||
"name": "python3"
|
"name": "python3"
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -379,6 +379,18 @@ async def crawl_custom_browser_type():
|
|||||||
print(result.markdown[:500])
|
print(result.markdown[:500])
|
||||||
print("Time taken: ", time.time() - start)
|
print("Time taken: ", time.time() - start)
|
||||||
|
|
||||||
|
async def crawl_with_user_simultion():
|
||||||
|
async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
|
||||||
|
url = "YOUR-URL-HERE"
|
||||||
|
result = await crawler.arun(
|
||||||
|
url=url,
|
||||||
|
bypass_cache=True,
|
||||||
|
simulate_user = True,# Causes a series of random mouse movements and clicks to simulate user interaction
|
||||||
|
override_navigator = True # Overrides the navigator object to make it look like a real user
|
||||||
|
)
|
||||||
|
|
||||||
|
print(result.markdown)
|
||||||
|
|
||||||
async def speed_comparison():
|
async def speed_comparison():
|
||||||
# print("\n--- Speed Comparison ---")
|
# print("\n--- Speed Comparison ---")
|
||||||
# print("Firecrawl (simulated):")
|
# print("Firecrawl (simulated):")
|
||||||
@@ -455,7 +467,7 @@ async def main():
|
|||||||
# LLM extraction examples
|
# LLM extraction examples
|
||||||
await extract_structured_data_using_llm()
|
await extract_structured_data_using_llm()
|
||||||
await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
|
await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
|
||||||
await extract_structured_data_using_llm("openai/gpt-4", os.getenv("OPENAI_API_KEY"))
|
await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
|
||||||
await extract_structured_data_using_llm("ollama/llama3.2")
|
await extract_structured_data_using_llm("ollama/llama3.2")
|
||||||
|
|
||||||
# You always can pass custom headers to the extraction strategy
|
# You always can pass custom headers to the extraction strategy
|
||||||
|
|||||||
@@ -8,3 +8,4 @@ playwright==1.47.0
|
|||||||
python-dotenv==1.0.1
|
python-dotenv==1.0.1
|
||||||
requests>=2.26.0,<2.32.3
|
requests>=2.26.0,<2.32.3
|
||||||
beautifulsoup4==4.12.3
|
beautifulsoup4==4.12.3
|
||||||
|
playwright_stealth==1.0.6
|
||||||
Reference in New Issue
Block a user