Improved database management and error handling, updated README instructions, refined .gitignore, enhanced async web crawling capabilities, and updated dependencies.
This commit is contained in:
@@ -1,558 +0,0 @@
|
||||
import asyncio
|
||||
import base64
|
||||
import time
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Callable, Dict, Any, List, Optional, Awaitable
|
||||
import os
|
||||
from playwright.async_api import async_playwright, Page, Browser, Error
|
||||
from io import BytesIO
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
from pathlib import Path
|
||||
from playwright.async_api import ProxySettings
|
||||
from pydantic import BaseModel
|
||||
import hashlib
|
||||
import json
|
||||
import uuid
|
||||
from playwright_stealth import stealth_async
|
||||
|
||||
class AsyncCrawlResponse(BaseModel):
    """Result object returned by an asynchronous crawl of a single URL."""

    # Page markup captured by the crawler.
    html: str
    # Response headers as a name -> value mapping.
    response_headers: Dict[str, str]
    # Status code reported for the navigation.
    status_code: int
    # Optional screenshot string; None when no screenshot was taken.
    screenshot: Optional[str] = None
    # Optional async callable: takes an optional float and resolves to a string.
    get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None

    class Config:
        # Needed so the callable field above is accepted as a field type.
        arbitrary_types_allowed = True
|
||||
|
||||
class AsyncCrawlerStrategy(ABC):
    """Abstract interface that every asynchronous crawler backend implements."""

    @abstractmethod
    async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
        """Crawl a single ``url`` and return its response."""

    @abstractmethod
    async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]:
        """Crawl every entry of ``urls`` and return one result per entry."""

    @abstractmethod
    async def take_screenshot(self, url: str) -> str:
        """Return a screenshot of ``url`` as a string."""

    @abstractmethod
    def update_user_agent(self, user_agent: str):
        """Replace the user agent used for subsequent crawls."""

    @abstractmethod
    def set_hook(self, hook_type: str, hook: Callable):
        """Register ``hook`` under the name ``hook_type``."""
|
||||
|
||||
class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
def __init__(self, use_cached_html=False, js_code=None, **kwargs):
|
||||
self.use_cached_html = use_cached_html
|
||||
self.user_agent = kwargs.get(
|
||||
"user_agent",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
||||
"(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
|
||||
)
|
||||
self.proxy = kwargs.get("proxy")
|
||||
self.headless = kwargs.get("headless", True)
|
||||
self.browser_type = kwargs.get("browser_type", "chromium")
|
||||
self.headers = kwargs.get("headers", {})
|
||||
self.sessions = {}
|
||||
self.session_ttl = 1800
|
||||
self.js_code = js_code
|
||||
self.verbose = kwargs.get("verbose", False)
|
||||
self.playwright = None
|
||||
self.browser = None
|
||||
self.hooks = {
|
||||
'on_browser_created': None,
|
||||
'on_user_agent_updated': None,
|
||||
'on_execution_started': None,
|
||||
'before_goto': None,
|
||||
'after_goto': None,
|
||||
'before_return_html': None,
|
||||
'before_retrieve_html': None
|
||||
}
|
||||
|
||||
async def __aenter__(self):
|
||||
await self.start()
|
||||
return self
|
||||
|
||||
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
||||
await self.close()
|
||||
|
||||
async def start(self):
    """Start Playwright and launch the configured browser if not yet running."""
    if self.playwright is None:
        self.playwright = await async_playwright().start()

    if self.browser is None:
        launch_flags = [
            "--disable-gpu",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--disable-blink-features=AutomationControlled",
            "--disable-infobars",
            "--window-position=0,0",
            "--ignore-certificate-errors",
            "--ignore-certificate-errors-spki-list",
        ]
        launch_options = {"headless": self.headless, "args": launch_flags}

        # Route traffic through the proxy only when one is configured.
        if self.proxy:
            launch_options["proxy"] = ProxySettings(server=self.proxy)

        # Any browser_type other than firefox/webkit falls back to Chromium.
        if self.browser_type == "firefox":
            engine = self.playwright.firefox
        elif self.browser_type == "webkit":
            engine = self.playwright.webkit
        else:
            engine = self.playwright.chromium
        self.browser = await engine.launch(**launch_options)

        # NOTE(review): indentation was lost in this view; the hook is assumed
        # to fire only when a browser was just created - confirm against the file.
        await self.execute_hook('on_browser_created', self.browser)
|
||||
|
||||
async def close(self):
    """Close the browser, then stop Playwright, clearing each reference afterwards."""
    # (attribute name, shutdown method) pairs, in the order they must run.
    for attribute, shutdown in (("browser", "close"), ("playwright", "stop")):
        resource = getattr(self, attribute)
        if resource:
            await getattr(resource, shutdown)()
            setattr(self, attribute, None)
|
||||
|
||||
def __del__(self):
|
||||
if self.browser or self.playwright:
|
||||
asyncio.get_event_loop().run_until_complete(self.close())
|
||||
|
||||
def set_hook(self, hook_type: str, hook: Callable):
    """Register ``hook`` under ``hook_type``.

    Raises:
        ValueError: If ``hook_type`` is not one of the known hook names.
    """
    # Guard clause: reject unknown names before touching the registry.
    if hook_type not in self.hooks:
        raise ValueError(f"Invalid hook type: {hook_type}")
    self.hooks[hook_type] = hook
|
||||
|
||||
async def execute_hook(self, hook_type: str, *args):
    """Run the hook registered for ``hook_type`` and return its result.

    Args:
        hook_type: Name of the hook to run.
        *args: Positional arguments forwarded to the hook.

    Returns:
        The hook's result (awaited when it is awaitable). When no hook is
        registered, the first positional argument, or ``None`` without one.
    """
    import inspect

    hook = self.hooks.get(hook_type)
    if not hook:
        return args[0] if args else None

    outcome = hook(*args)
    # Inspect the result rather than the callable: a plain function or
    # callable object that returns a coroutine must be awaited too.
    if inspect.isawaitable(outcome):
        outcome = await outcome
    return outcome
|
||||
|
||||
def update_user_agent(self, user_agent: str):
    """Remember ``user_agent``; it is applied to browser contexts created afterwards."""
    self.user_agent = user_agent
|
||||
|
||||
def set_custom_headers(self, headers: Dict[str, str]):
    """Replace the stored extra HTTP headers with ``headers`` (kept by reference)."""
    self.headers = headers
|
||||
|
||||
async def kill_session(self, session_id: str):
    """Close the page and context of ``session_id`` and forget the session.

    Unknown session ids are ignored. The session entry is removed and the
    context is closed even if closing the page raises; that error still
    propagates to the caller.
    """
    entry = self.sessions.pop(session_id, None)
    if entry is None:
        return

    context, page, _ = entry
    try:
        await page.close()
    finally:
        # Always release the context so a failing page.close() cannot leak it.
        await context.close()
|
||||
|
||||
def _cleanup_expired_sessions(self):
|
||||
current_time = time.time()
|
||||
expired_sessions = [
|
||||
sid for sid, (_, _, last_used) in self.sessions.items()
|
||||
if current_time - last_used > self.session_ttl
|
||||
]
|
||||
for sid in expired_sessions:
|
||||
asyncio.create_task(self.kill_session(sid))
|
||||
|
||||
async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000):
    """Wait until the condition described by ``wait_for`` holds on ``page``.

    ``wait_for`` may be prefixed with ``js:`` (a JavaScript function) or
    ``css:`` (a selector). Without a prefix, text starting with ``()`` or
    ``function`` is treated as JavaScript; anything else is tried as a CSS
    selector first and, if the selector is rejected, as a JavaScript body.

    Args:
        page: Page to wait on.
        wait_for: Condition as described above.
        timeout: Limit in milliseconds.

    Raises:
        TimeoutError: The condition did not hold within ``timeout``.
        ValueError: ``wait_for`` is neither a usable selector nor usable JavaScript.
    """
    wait_for = wait_for.strip()

    if wait_for.startswith('js:'):
        # Explicitly specified JavaScript
        return await self.csp_compliant_wait(page, wait_for[3:].strip(), timeout)

    if wait_for.startswith('css:'):
        # Explicitly specified CSS selector
        css_selector = wait_for[4:].strip()
        try:
            await page.wait_for_selector(css_selector, timeout=timeout)
        except Error as e:
            if 'Timeout' in str(e):
                raise TimeoutError(
                    f"Timeout after {timeout}ms waiting for selector '{css_selector}'"
                ) from e
            raise ValueError(f"Invalid CSS selector: '{css_selector}'") from e
        return None

    # Auto-detect based on content
    if wait_for.startswith(('()', 'function')):
        return await self.csp_compliant_wait(page, wait_for, timeout)

    try:
        await page.wait_for_selector(wait_for, timeout=timeout)
    except Error as e:
        if 'Timeout' in str(e):
            raise TimeoutError(
                f"Timeout after {timeout}ms waiting for selector '{wait_for}'"
            ) from e
        # Not a timeout, so the selector itself was rejected: retry the text
        # as the body of a JavaScript function.
        try:
            return await self.csp_compliant_wait(page, f"() => {{{wait_for}}}", timeout)
        except TimeoutError:
            raise
        except (Error, RuntimeError) as js_error:
            # csp_compliant_wait wraps evaluation failures in RuntimeError, so
            # that type has to be handled here for the ValueError to be reachable.
            if 'Timeout' in str(js_error):
                raise TimeoutError(
                    f"Timeout after {timeout}ms waiting for condition"
                ) from js_error
            raise ValueError(f"Invalid wait_for parameter: '{wait_for}'. "
                             "It should be either a valid CSS selector, a JavaScript function, "
                             "or explicitly prefixed with 'js:' or 'css:'.") from js_error
    return None
|
||||
|
||||
async def csp_compliant_wait(self, page: Page, user_wait_function: str, timeout: float = 30000):
    """Poll a JavaScript predicate in ``page`` until it returns a truthy value.

    Args:
        page: Page in which the predicate is evaluated.
        user_wait_function: Source of a JavaScript function; it is called
            roughly every 100 ms and may be async.
        timeout: Limit in milliseconds, enforced by the injected script.

    Raises:
        TimeoutError: The predicate stayed falsy for longer than ``timeout``.
        RuntimeError: Evaluating the script failed for any other reason.
    """
    wrapper_js = f"""
    async () => {{
        const userFunction = {user_wait_function};
        const startTime = Date.now();
        while (true) {{
            if (await userFunction()) {{
                return true;
            }}
            if (Date.now() - startTime > {timeout}) {{
                throw new Error('Timeout waiting for condition');
            }}
            await new Promise(resolve => setTimeout(resolve, 100));
        }}
    }}
    """

    try:
        await page.evaluate(wrapper_js)
    except TimeoutError as e:
        raise TimeoutError(f"Timeout after {timeout}ms waiting for condition") from e
    except Exception as e:
        # The script above reports expiry by throwing this exact message, which
        # arrives here as a generic evaluation error rather than a TimeoutError.
        if 'Timeout waiting for condition' in str(e):
            raise TimeoutError(f"Timeout after {timeout}ms waiting for condition") from e
        raise RuntimeError(f"Error in wait condition: {str(e)}") from e
|
||||
|
||||
async def process_iframes(self, page):
    """Replace each ``<iframe>`` on ``page`` with a ``<div>`` holding its body HTML.

    Every iframe is tagged with the id ``iframe-<index>``; when its frame is
    reachable, the iframe element is swapped for a div of class
    ``extracted-iframe-content-<index>``. Failures are printed and skipped.

    Returns:
        The same ``page`` object.
    """
    iframes = await page.query_selector_all('iframe')

    for i, iframe in enumerate(iframes):
        try:
            # Tag the iframe so it can be located again from page context.
            await iframe.evaluate(f'(element) => element.id = "iframe-{i}"')

            frame = await iframe.content_frame()
            if not frame:
                print(f"Warning: Could not access content frame for iframe {i}")
                continue

            await frame.wait_for_load_state('load', timeout=30000)  # 30 seconds timeout
            iframe_content = await frame.evaluate('() => document.body.innerHTML')

            # Hand the markup over as an argument instead of splicing it into
            # the script text: embedded backticks, backslashes or "${...}"
            # would otherwise corrupt (or run inside) the template literal.
            await page.evaluate(
                """
                ([iframeId, html, className]) => {
                    const iframe = document.getElementById(iframeId);
                    const div = document.createElement('div');
                    div.innerHTML = html;
                    div.className = className;
                    iframe.replaceWith(div);
                }
                """,
                [f'iframe-{i}', iframe_content, f'extracted-iframe-content-{i}'],
            )
        except Exception as e:
            print(f"Error processing iframe {i}: {str(e)}")

    return page
|
||||
|
||||
async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
    """Load ``url`` in a browser page and return its HTML with response metadata.

    Keyword options read here: ``session_id`` (reuse a page across calls),
    ``override_navigator``, ``js_only`` (skip navigation), ``page_timeout``,
    ``js_code`` / ``js``, ``simulate_user``, ``wait_for``, ``process_iframes``,
    ``delay_before_return_html`` and ``screenshot``.

    Raises:
        RuntimeError: The ``wait_for`` condition failed.
        Error: A Playwright error occurred; re-raised with the URL in the message.
    """
    response_headers = {}
    status_code = None

    self._cleanup_expired_sessions()
    session_id = kwargs.get("session_id")
    if session_id:
        context, page, _ = self.sessions.get(session_id, (None, None, None))
        if not context:
            context = await self.browser.new_context(
                user_agent=self.user_agent,
                viewport={"width": 1920, "height": 1080},
                proxy={"server": self.proxy} if self.proxy else None
            )
            await context.set_extra_http_headers(self.headers)
            page = await context.new_page()
        # Refresh last-used on every call so an active session is not expired
        # session_ttl seconds after its creation.
        self.sessions[session_id] = (context, page, time.time())
    else:
        context = await self.browser.new_context(
            user_agent=self.user_agent,
            viewport={"width": 1920, "height": 1080},
            proxy={"server": self.proxy} if self.proxy else None
        )
        await context.set_extra_http_headers(self.headers)

        if kwargs.get("override_navigator", False):
            # Inject scripts to override navigator properties
            await context.add_init_script("""
            // Pass the Permissions Test.
            const originalQuery = window.navigator.permissions.query;
            window.navigator.permissions.query = (parameters) => (
                parameters.name === 'notifications' ?
                    Promise.resolve({ state: Notification.permission }) :
                    originalQuery(parameters)
            );
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined
            });
            window.navigator.chrome = {
                runtime: {},
                // Add other properties if necessary
            };
            Object.defineProperty(navigator, 'plugins', {
                get: () => [1, 2, 3, 4, 5],
            });
            Object.defineProperty(navigator, 'languages', {
                get: () => ['en-US', 'en'],
            });
            Object.defineProperty(document, 'hidden', {
                get: () => false
            });
            Object.defineProperty(document, 'visibilityState', {
                get: () => 'visible'
            });
            """)

        page = await context.new_page()

    try:
        if self.verbose:
            print(f"[LOG] 🕸️ Crawling {url} using AsyncPlaywrightCrawlerStrategy...")

        cache_file_path = None
        if self.use_cached_html:
            cache_file_path = os.path.join(
                Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
            )
            meta_file_path = cache_file_path + ".meta"
            # Serve from cache only when both halves exist; a missing ".meta"
            # falls through to a live crawl instead of crashing.
            if os.path.exists(cache_file_path) and os.path.exists(meta_file_path):
                # Read with the same encoding the writer below uses.
                with open(cache_file_path, "r", encoding="utf-8") as f:
                    html = f.read()
                # retrieve response headers and status code from cache
                with open(meta_file_path, "r", encoding="utf-8") as f:
                    meta = json.load(f)
                response_headers = meta.get("response_headers", {})
                status_code = meta.get("status_code")
                return AsyncCrawlResponse(
                    html=html, response_headers=response_headers, status_code=status_code
                )

        if not kwargs.get("js_only", False):
            await self.execute_hook('before_goto', page)

            # Apply stealth patches on a blank page before the real navigation.
            response = await page.goto("about:blank")
            await stealth_async(page)
            response = await page.goto(
                url, wait_until="domcontentloaded", timeout=kwargs.get("page_timeout", 60000)
            )

            await self.execute_hook('after_goto', page)

            # Get status code and headers
            status_code = response.status
            response_headers = response.headers
        else:
            # No navigation happened, so there is no response to inspect.
            status_code = 200
            response_headers = {}

        await page.wait_for_selector('body')
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")

        js_code = kwargs.get("js_code", kwargs.get("js", self.js_code))
        if js_code:
            if isinstance(js_code, str):
                await page.evaluate(js_code)
            elif isinstance(js_code, list):
                for js in js_code:
                    await page.evaluate(js)

            # NOTE(review): indentation was lost in this view; these two steps
            # are assumed to run only after user JS executed - confirm.
            await page.wait_for_load_state('networkidle')
            # Check for on execution event
            await self.execute_hook('on_execution_started', page)

        if kwargs.get("simulate_user", False):
            # Simulate user interactions
            await page.mouse.move(100, 100)
            await page.mouse.down()
            await page.mouse.up()
            await page.keyboard.press('ArrowDown')

        # Handle the wait_for parameter
        wait_for = kwargs.get("wait_for")
        if wait_for:
            try:
                await self.smart_wait(page, wait_for, timeout=kwargs.get("page_timeout", 60000))
            except Exception as e:
                raise RuntimeError(f"Wait condition failed: {str(e)}") from e

        # Update image dimensions
        update_image_dimensions_js = """
        () => {
            return new Promise((resolve) => {
                const filterImage = (img) => {
                    // Filter out images that are too small
                    if (img.width < 100 && img.height < 100) return false;

                    // Filter out images that are not visible
                    const rect = img.getBoundingClientRect();
                    if (rect.width === 0 || rect.height === 0) return false;

                    // Filter out images with certain class names (e.g., icons, thumbnails)
                    if (img.classList.contains('icon') || img.classList.contains('thumbnail')) return false;

                    // Filter out images with certain patterns in their src (e.g., placeholder images)
                    if (img.src.includes('placeholder') || img.src.includes('icon')) return false;

                    return true;
                };

                const images = Array.from(document.querySelectorAll('img')).filter(filterImage);
                let imagesLeft = images.length;

                if (imagesLeft === 0) {
                    resolve();
                    return;
                }

                const checkImage = (img) => {
                    if (img.complete && img.naturalWidth !== 0) {
                        img.setAttribute('width', img.naturalWidth);
                        img.setAttribute('height', img.naturalHeight);
                        imagesLeft--;
                        if (imagesLeft === 0) resolve();
                    }
                };

                images.forEach(img => {
                    checkImage(img);
                    if (!img.complete) {
                        img.onload = () => {
                            checkImage(img);
                        };
                        img.onerror = () => {
                            imagesLeft--;
                            if (imagesLeft === 0) resolve();
                        };
                    }
                });

                // Fallback timeout of 5 seconds
                setTimeout(() => resolve(), 5000);
            });
        }
        """
        await page.evaluate(update_image_dimensions_js)

        # Wait a bit for any onload events to complete
        await page.wait_for_timeout(100)

        # Process iframes
        if kwargs.get("process_iframes", False):
            page = await self.process_iframes(page)

        await self.execute_hook('before_retrieve_html', page)
        # Check if delay_before_return_html is set then wait for that time
        delay_before_return_html = kwargs.get("delay_before_return_html")
        if delay_before_return_html:
            await asyncio.sleep(delay_before_return_html)

        html = await page.content()
        await self.execute_hook('before_return_html', page, html)

        # Check if kwargs has screenshot=True then take screenshot
        screenshot_data = None
        if kwargs.get("screenshot"):
            screenshot_data = await self.take_screenshot(url)

        if self.verbose:
            print(f"[LOG] ✅ Crawled {url} successfully!")

        if self.use_cached_html:
            # The cache directory may not exist yet on a first run.
            os.makedirs(os.path.dirname(cache_file_path), exist_ok=True)
            with open(cache_file_path, "w", encoding="utf-8") as f:
                f.write(html)
            # store response headers and status code in cache
            with open(cache_file_path + ".meta", "w", encoding="utf-8") as f:
                json.dump({
                    "response_headers": response_headers,
                    "status_code": status_code
                }, f)

        async def get_delayed_content(delay: float = 5.0) -> str:
            # NOTE(review): without a session_id the page is closed in the
            # finally block below, so this closure presumably only works for
            # session-backed crawls - confirm with callers.
            if self.verbose:
                print(f"[LOG] Waiting for {delay} seconds before retrieving content for {url}")
            await asyncio.sleep(delay)
            return await page.content()

        return AsyncCrawlResponse(
            html=html,
            response_headers=response_headers,
            status_code=status_code,
            screenshot=screenshot_data,
            get_delayed_content=get_delayed_content
        )
    except Error as e:
        raise Error(f"Failed to crawl {url}: {str(e)}") from e
    finally:
        # Session pages stay open for reuse; one-off pages are released here.
        if not session_id:
            await page.close()
            await context.close()
|
||||
|
||||
async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]:
    """Crawl ``urls`` concurrently, at most ``semaphore_count`` (default 5) at a time.

    All keyword arguments are forwarded to ``crawl``. Results keep the order of
    ``urls``; a crawl that raised an ``Exception`` is represented by the
    exception's string form instead of a response.
    """
    limiter = asyncio.Semaphore(kwargs.get('semaphore_count', 5))

    async def bounded_crawl(target):
        # The semaphore caps how many crawls are in flight at once.
        async with limiter:
            return await self.crawl(target, **kwargs)

    outcomes = await asyncio.gather(
        *(bounded_crawl(target) for target in urls), return_exceptions=True
    )

    responses = []
    for outcome in outcomes:
        responses.append(str(outcome) if isinstance(outcome, Exception) else outcome)
    return responses
|
||||
|
||||
async def take_screenshot(self, url: str, wait_time=1000) -> str:
    """Return a base64-encoded full-page screenshot of ``url``.

    The page is opened in a fresh browser context and given ``wait_time``
    milliseconds after ``domcontentloaded`` before the capture. If anything
    fails, the error is printed and a base64-encoded JPEG showing the error
    text is returned instead.
    """
    async with await self.browser.new_context(user_agent=self.user_agent) as context:
        page = await context.new_page()
        try:
            await page.goto(url, wait_until="domcontentloaded", timeout=30000)
            # Wait for a specified time (default is 1 second)
            await page.wait_for_timeout(wait_time)
            capture = await page.screenshot(full_page=True)
            encoded = base64.b64encode(capture)
            return encoded.decode('utf-8')
        except Exception as e:
            error_message = f"Failed to take screenshot: {str(e)}"
            print(error_message)

            # Draw the error text in white on a black 800x600 placeholder.
            placeholder = Image.new('RGB', (800, 600), color='black')
            canvas = ImageDraw.Draw(placeholder)
            canvas.text(
                (10, 10), error_message, fill=(255, 255, 255), font=ImageFont.load_default()
            )

            jpeg_buffer = BytesIO()
            placeholder.save(jpeg_buffer, format="JPEG")
            return base64.b64encode(jpeg_buffer.getvalue()).decode('utf-8')
        finally:
            await page.close()
|
||||
|
||||
@@ -3,7 +3,8 @@ import base64
|
||||
import time
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Callable, Dict, Any, List, Optional, Awaitable
|
||||
import os
|
||||
import os, sys, shutil
|
||||
import tempfile, subprocess
|
||||
from playwright.async_api import async_playwright, Page, Browser, Error
|
||||
from io import BytesIO
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
@@ -13,6 +14,7 @@ from pydantic import BaseModel
|
||||
import hashlib
|
||||
import json
|
||||
import uuid
|
||||
|
||||
from playwright_stealth import StealthConfig, stealth_async
|
||||
|
||||
stealth_config = StealthConfig(
|
||||
@@ -31,6 +33,106 @@ stealth_config = StealthConfig(
|
||||
)
|
||||
|
||||
|
||||
class ManagedBrowser:
    """Run a locally installed browser as a child process with remote debugging on.

    ``start()`` launches the process and returns the ``http://localhost:<port>``
    endpoint; ``cleanup()`` stops the process and removes any profile directory
    this object created.
    """

    def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None,
                 headless: bool = False, debugging_port: int = 9222):
        """Store launch settings; nothing is started here.

        Args:
            browser_type: ``"chromium"`` or ``"firefox"``; other values are
                rejected when the command line is built.
            user_data_dir: Profile directory; a temporary one is created by
                ``start()`` when omitted.
            headless: Whether to pass the browser's headless flag.
            debugging_port: Remote-debugging port (defaults to 9222).
        """
        self.browser_type = browser_type
        self.user_data_dir = user_data_dir
        self.headless = headless
        self.browser_process = None
        self.temp_dir = None
        self.debugging_port = debugging_port

    async def start(self) -> str:
        """
        Starts the browser process and returns the CDP endpoint URL.
        If user_data_dir is not provided, creates a temporary directory.

        Raises:
            NotImplementedError: ``browser_type`` is not supported.
            Exception: The process could not be started or exited right away.
        """
        # Create temp dir if needed
        if not self.user_data_dir:
            self.temp_dir = tempfile.mkdtemp(prefix="browser-profile-")
            self.user_data_dir = self.temp_dir

        try:
            args = self._get_browser_args()
        except NotImplementedError:
            # Do not leave the freshly created profile directory behind.
            await self.cleanup()
            raise

        # Start browser process
        try:
            # Output is discarded: nothing reads these streams, and an unread
            # PIPE can fill up and block the browser.
            self.browser_process = subprocess.Popen(
                args,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL
            )
            await asyncio.sleep(2)  # Give browser time to start
            exit_code = self.browser_process.poll()
            if exit_code is not None:
                raise RuntimeError(f"browser process exited early with code {exit_code}")
            return f"http://localhost:{self.debugging_port}"
        except Exception as e:
            await self.cleanup()
            raise Exception(f"Failed to start browser: {e}") from e

    def _get_browser_path(self) -> Optional[str]:
        """Returns the browser executable path based on OS and browser type.

        Returns ``None`` when no executable is known for the combination.
        """
        if sys.platform == "darwin":  # macOS
            paths = {
                "chromium": "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
                "firefox": "/Applications/Firefox.app/Contents/MacOS/firefox",
                "webkit": "/Applications/Safari.app/Contents/MacOS/Safari"
            }
        elif sys.platform == "win32":  # Windows
            paths = {
                "chromium": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
                "firefox": "C:\\Program Files\\Mozilla Firefox\\firefox.exe",
                "webkit": None  # WebKit not supported on Windows
            }
        else:  # Linux
            paths = {
                "chromium": "google-chrome",
                "firefox": "firefox",
                "webkit": None  # WebKit not supported on Linux
            }

        return paths.get(self.browser_type)

    def _get_browser_args(self) -> List[str]:
        """Returns the full command line: executable followed by its arguments.

        Raises:
            NotImplementedError: ``browser_type`` is neither chromium nor firefox.
        """
        base_args = [self._get_browser_path()]

        if self.browser_type == "chromium":
            args = [
                f"--remote-debugging-port={self.debugging_port}",
                f"--user-data-dir={self.user_data_dir}",
            ]
            if self.headless:
                args.append("--headless=new")
        elif self.browser_type == "firefox":
            args = [
                "--remote-debugging-port", str(self.debugging_port),
                "--profile", self.user_data_dir,
            ]
            if self.headless:
                args.append("--headless")
        else:
            raise NotImplementedError(f"Browser type {self.browser_type} not supported")

        return base_args + args

    async def cleanup(self):
        """Cleanup browser process and temporary directory.

        Errors are printed rather than raised; the object can be started
        again afterwards.
        """
        process, self.browser_process = self.browser_process, None
        if process:
            try:
                if process.poll() is None:
                    process.terminate()
                    await asyncio.sleep(1)
                    if process.poll() is None:
                        process.kill()
                # Reap the child so it does not linger as a zombie.
                process.wait(timeout=5)
            except Exception as e:
                print(f"Error terminating browser: {e}")

        temp_dir, self.temp_dir = self.temp_dir, None
        if temp_dir:
            # Forget the deleted profile path so a later start() makes a new one.
            if self.user_data_dir == temp_dir:
                self.user_data_dir = None
            if os.path.exists(temp_dir):
                try:
                    shutil.rmtree(temp_dir)
                except Exception as e:
                    print(f"Error removing temporary directory: {e}")
|
||||
|
||||
class AsyncCrawlResponse(BaseModel):
|
||||
html: str
|
||||
response_headers: Dict[str, str]
|
||||
@@ -82,6 +184,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
self.playwright = None
|
||||
self.browser = None
|
||||
self.sleep_on_close = kwargs.get("sleep_on_close", False)
|
||||
self.use_managed_browser = kwargs.get("use_managed_browser", False)
|
||||
self.user_data_dir = kwargs.get("user_data_dir", None)
|
||||
self.managed_browser = None
|
||||
self.hooks = {
|
||||
'on_browser_created': None,
|
||||
'on_user_agent_updated': None,
|
||||
@@ -103,36 +208,46 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
if self.playwright is None:
|
||||
self.playwright = await async_playwright().start()
|
||||
if self.browser is None:
|
||||
browser_args = {
|
||||
"headless": self.headless,
|
||||
"args": [
|
||||
"--disable-gpu",
|
||||
"--no-sandbox",
|
||||
"--disable-dev-shm-usage",
|
||||
"--disable-blink-features=AutomationControlled",
|
||||
"--disable-infobars",
|
||||
"--window-position=0,0",
|
||||
"--ignore-certificate-errors",
|
||||
"--ignore-certificate-errors-spki-list",
|
||||
# "--headless=new", # Use the new headless mode
|
||||
]
|
||||
}
|
||||
|
||||
# Add proxy settings if a proxy is specified
|
||||
if self.proxy:
|
||||
proxy_settings = ProxySettings(server=self.proxy)
|
||||
browser_args["proxy"] = proxy_settings
|
||||
elif self.proxy_config:
|
||||
proxy_settings = ProxySettings(server=self.proxy_config.get("server"), username=self.proxy_config.get("username"), password=self.proxy_config.get("password"))
|
||||
browser_args["proxy"] = proxy_settings
|
||||
|
||||
# Select the appropriate browser based on the browser_type
|
||||
if self.browser_type == "firefox":
|
||||
self.browser = await self.playwright.firefox.launch(**browser_args)
|
||||
elif self.browser_type == "webkit":
|
||||
self.browser = await self.playwright.webkit.launch(**browser_args)
|
||||
if self.use_managed_browser:
|
||||
# Use managed browser approach
|
||||
self.managed_browser = ManagedBrowser(
|
||||
browser_type=self.browser_type,
|
||||
user_data_dir=self.user_data_dir,
|
||||
headless=self.headless
|
||||
)
|
||||
cdp_url = await self.managed_browser.start()
|
||||
self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url)
|
||||
else:
|
||||
self.browser = await self.playwright.chromium.launch(**browser_args)
|
||||
browser_args = {
|
||||
"headless": self.headless,
|
||||
"args": [
|
||||
"--disable-gpu",
|
||||
"--no-sandbox",
|
||||
"--disable-dev-shm-usage",
|
||||
"--disable-blink-features=AutomationControlled",
|
||||
"--disable-infobars",
|
||||
"--window-position=0,0",
|
||||
"--ignore-certificate-errors",
|
||||
"--ignore-certificate-errors-spki-list",
|
||||
# "--headless=new", # Use the new headless mode
|
||||
]
|
||||
}
|
||||
|
||||
# Add proxy settings if a proxy is specified
|
||||
if self.proxy:
|
||||
proxy_settings = ProxySettings(server=self.proxy)
|
||||
browser_args["proxy"] = proxy_settings
|
||||
elif self.proxy_config:
|
||||
proxy_settings = ProxySettings(server=self.proxy_config.get("server"), username=self.proxy_config.get("username"), password=self.proxy_config.get("password"))
|
||||
browser_args["proxy"] = proxy_settings
|
||||
|
||||
# Select the appropriate browser based on the browser_type
|
||||
if self.browser_type == "firefox":
|
||||
self.browser = await self.playwright.firefox.launch(**browser_args)
|
||||
elif self.browser_type == "webkit":
|
||||
self.browser = await self.playwright.webkit.launch(**browser_args)
|
||||
else:
|
||||
self.browser = await self.playwright.chromium.launch(**browser_args)
|
||||
|
||||
await self.execute_hook('on_browser_created', self.browser)
|
||||
|
||||
@@ -142,6 +257,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
if self.browser:
|
||||
await self.browser.close()
|
||||
self.browser = None
|
||||
if self.managed_browser:
|
||||
await self.managed_browser.cleanup()
|
||||
self.managed_browser = None
|
||||
if self.playwright:
|
||||
await self.playwright.stop()
|
||||
self.playwright = None
|
||||
@@ -399,7 +517,48 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
status_code = 200
|
||||
response_headers = {}
|
||||
|
||||
await page.wait_for_selector('body')
|
||||
# Replace the current wait_for_selector line with this more robust check:
|
||||
try:
|
||||
# First wait for body to exist, regardless of visibility
|
||||
await page.wait_for_selector('body', state='attached', timeout=30000)
|
||||
|
||||
# Then wait for it to become visible by checking CSS
|
||||
await page.wait_for_function("""
|
||||
() => {
|
||||
const body = document.body;
|
||||
const style = window.getComputedStyle(body);
|
||||
return style.display !== 'none' &&
|
||||
style.visibility !== 'hidden' &&
|
||||
style.opacity !== '0';
|
||||
}
|
||||
""", timeout=30000)
|
||||
|
||||
except Error as e:
|
||||
# If waiting fails, let's try to diagnose the issue
|
||||
visibility_info = await page.evaluate("""
|
||||
() => {
|
||||
const body = document.body;
|
||||
const style = window.getComputedStyle(body);
|
||||
return {
|
||||
display: style.display,
|
||||
visibility: style.visibility,
|
||||
opacity: style.opacity,
|
||||
hasContent: body.innerHTML.length,
|
||||
classList: Array.from(body.classList)
|
||||
}
|
||||
}
|
||||
""")
|
||||
|
||||
if self.verbose:
|
||||
print(f"Body visibility debug info: {visibility_info}")
|
||||
|
||||
# Even if body is hidden, we might still want to proceed
|
||||
if kwargs.get('ignore_body_visibility', True):
|
||||
if self.verbose:
|
||||
print("Proceeding despite hidden body...")
|
||||
pass
|
||||
else:
|
||||
raise Error(f"Body element is hidden: {visibility_info}")
|
||||
|
||||
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||
|
||||
|
||||
@@ -2,18 +2,82 @@ import os
|
||||
from pathlib import Path
|
||||
import aiosqlite
|
||||
import asyncio
|
||||
from typing import Optional, Tuple
|
||||
from typing import Optional, Tuple, Dict
|
||||
from contextlib import asynccontextmanager
|
||||
import logging
|
||||
|
||||
# Set up logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
DB_PATH = os.path.join(Path.home(), ".crawl4ai")
|
||||
os.makedirs(DB_PATH, exist_ok=True)
|
||||
DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")
|
||||
|
||||
class AsyncDatabaseManager:
|
||||
def __init__(self):
|
||||
def __init__(self, pool_size: int = 10, max_retries: int = 3):
    """Store pool settings and create the synchronisation primitives.

    Args:
        pool_size: Upper bound on connections handed out at the same time.
        max_retries: Attempts made by ``execute_with_retry`` per operation.
    """
    # Configuration.
    self.db_path = DB_PATH
    self.pool_size = pool_size
    self.max_retries = max_retries
    # Runtime state: open connections keyed by id() of the task using them.
    self.connection_pool: Dict[int, aiosqlite.Connection] = {}
    # pool_lock guards the dict above; the semaphore enforces pool_size.
    self.pool_lock = asyncio.Lock()
    self.connection_semaphore = asyncio.Semaphore(pool_size)
|
||||
|
||||
async def initialize(self):
    """Prepare the database for use by delegating to ``ainit_db``."""
    await self.ainit_db()
|
||||
|
||||
async def cleanup(self):
    """Close every connection still held in the pool and empty the pool."""
    async with self.pool_lock:
        pool = self.connection_pool
        for connection in pool.values():
            await connection.close()
        pool.clear()
|
||||
|
||||
@asynccontextmanager
async def get_connection(self):
    """Yield a database connection for the current task.

    At most ``pool_size`` callers hold a connection at once. A connection is
    opened on first use by a task and closed when the ``async with`` that
    opened it exits; a nested ``get_connection()`` in the same task reuses the
    outer connection and leaves it open for the outer block.

    Exceptions raised while the connection is in use are logged and re-raised.
    """
    async with self.connection_semaphore:
        task_id = id(asyncio.current_task())
        # Only the call that opened the connection is allowed to close it.
        owns_connection = False
        try:
            async with self.pool_lock:
                conn = self.connection_pool.get(task_id)
                if conn is None:
                    conn = await aiosqlite.connect(
                        self.db_path,
                        timeout=30.0
                    )
                    try:
                        await conn.execute('PRAGMA journal_mode = WAL')
                        await conn.execute('PRAGMA busy_timeout = 5000')
                    except Exception:
                        # Not registered in the pool yet, so close it here.
                        await conn.close()
                        raise
                    self.connection_pool[task_id] = conn
                    owns_connection = True

            yield conn

        except Exception as e:
            logger.error(f"Connection error: {e}")
            raise
        finally:
            if owns_connection:
                async with self.pool_lock:
                    pooled = self.connection_pool.pop(task_id, None)
                if pooled is not None:
                    await pooled.close()
|
||||
|
||||
async def execute_with_retry(self, operation, *args):
    """Run ``operation(db, *args)`` on a pooled connection, retrying on failure.

    After the operation succeeds the connection is committed and the
    operation's result is returned.

    Args:
        operation: Coroutine function called as ``operation(db, *args)``.
        *args: Extra positional arguments forwarded to ``operation``.

    Returns:
        Whatever ``operation`` returns.

    Raises:
        Exception: the error from the final attempt, after it has been logged.
    """
    # Always make at least one attempt: with max_retries <= 0 the loop would
    # otherwise never run and every call would silently return None.
    attempts = max(1, self.max_retries)
    for attempt in range(attempts):
        try:
            async with self.get_connection() as db:
                result = await operation(db, *args)
                await db.commit()
                return result
        except Exception as e:
            if attempt == attempts - 1:
                logger.error(f"Operation failed after {attempts} attempts: {e}")
                raise
            # Linear backoff: wait 1s, 2s, 3s, ... between attempts.
            await asyncio.sleep(1 * (attempt + 1))
|
||||
|
||||
async def ainit_db(self):
|
||||
async with aiosqlite.connect(self.db_path) as db:
|
||||
"""Initialize database schema"""
|
||||
async def _init(db):
|
||||
await db.execute('''
|
||||
CREATE TABLE IF NOT EXISTS crawled_data (
|
||||
url TEXT PRIMARY KEY,
|
||||
@@ -28,87 +92,101 @@ class AsyncDatabaseManager:
|
||||
screenshot TEXT DEFAULT ""
|
||||
)
|
||||
''')
|
||||
await db.commit()
|
||||
|
||||
await self.execute_with_retry(_init)
|
||||
await self.update_db_schema()
|
||||
|
||||
async def update_db_schema(self):
    """Add any expected column that the ``crawled_data`` table is missing."""
    async def _list_columns(db):
        # The code reads the column name from index 1 of each table_info row.
        cursor = await db.execute("PRAGMA table_info(crawled_data)")
        rows = await cursor.fetchall()
        return [row[1] for row in rows]

    existing = await self.execute_with_retry(_list_columns)

    expected = ('media', 'links', 'metadata', 'screenshot')
    missing = [name for name in expected if name not in existing]
    for name in missing:
        await self.aalter_db_add_column(name)
|
||||
|
||||
async def aalter_db_add_column(self, new_column: str):
    """Add a TEXT column named ``new_column`` (default ``""``) to ``crawled_data``.

    Errors are not caught here; they propagate from ``execute_with_retry``.
    """
    # NOTE(review): the column name is interpolated straight into the SQL
    # text, so it must come from trusted code, never from user input.
    statement = f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""'

    async def _add_column(db):
        await db.execute(statement)
        logger.info(f"Added column '{new_column}' to the database.")

    await self.execute_with_retry(_add_column)
|
||||
|
||||
async def aget_cached_url(self, url: str) -> Optional[Tuple[str, str, str, str, str, bool, str, str, str, str]]:
    """Look up the cached row for ``url``.

    Args:
        url: The URL used as the primary key of the cache entry.

    Returns:
        A 10-item row in SELECT order -- ``(url, html, cleaned_html, markdown,
        extracted_content, success, media, links, metadata, screenshot)`` --
        or ``None`` when the URL is not cached or the lookup fails (the
        failure is logged, not raised).
    """
    # NOTE(review): ``success`` is annotated as bool to mirror acache_url();
    # SQLite may hand it back as a 0/1 integer -- confirm before relying on
    # identity checks such as ``is True``.
    async def _get(db):
        async with db.execute(
            'SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot FROM crawled_data WHERE url = ?',
            (url,)
        ) as cursor:
            return await cursor.fetchone()

    try:
        return await self.execute_with_retry(_get)
    except Exception as e:
        logger.error(f"Error retrieving cached URL: {e}")
        return None
|
||||
|
||||
async def acache_url(self, url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media: str = "{}", links: str = "{}", metadata: str = "{}", screenshot: str = ""):
    """Store the crawl result for ``url``, overwriting any existing row.

    The statement is an upsert keyed on ``url``. Failures are logged and
    swallowed, so nothing is raised to the caller.
    """
    # Bound parameters, in the same order as the column list in the INSERT.
    row_values = (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot)

    async def _upsert(db):
        await db.execute('''
            INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(url) DO UPDATE SET
                html = excluded.html,
                cleaned_html = excluded.cleaned_html,
                markdown = excluded.markdown,
                extracted_content = excluded.extracted_content,
                success = excluded.success,
                media = excluded.media,
                links = excluded.links,
                metadata = excluded.metadata,
                screenshot = excluded.screenshot
        ''', row_values)

    try:
        await self.execute_with_retry(_upsert)
    except Exception as e:
        logger.error(f"Error caching URL: {e}")
|
||||
|
||||
async def aget_total_count(self) -> int:
    """Return the number of rows in ``crawled_data``; 0 if the query fails."""
    async def _count_rows(db):
        async with db.execute('SELECT COUNT(*) FROM crawled_data') as cursor:
            row = await cursor.fetchone()
        if not row:
            return 0
        return row[0]

    try:
        return await self.execute_with_retry(_count_rows)
    except Exception as e:
        # Best effort: report the failure and fall back to an empty count.
        logger.error(f"Error getting total count: {e}")
        return 0
|
||||
|
||||
async def aclear_db(self):
    """Delete every row from ``crawled_data`` while keeping the table itself.

    Failures are logged and swallowed.
    """
    async def _delete_all(db):
        await db.execute('DELETE FROM crawled_data')

    try:
        await self.execute_with_retry(_delete_all)
    except Exception as e:
        logger.error(f"Error clearing database: {e}")
|
||||
|
||||
async def aflush_db(self):
    """Drop the ``crawled_data`` table if it exists.

    Failures are logged and swallowed.
    """
    async def _drop_table(db):
        await db.execute('DROP TABLE IF EXISTS crawled_data')

    try:
        await self.execute_with_retry(_drop_table)
    except Exception as e:
        logger.error(f"Error flushing database: {e}")
|
||||
|
||||
# Create a singleton instance
|
||||
async_db_manager = AsyncDatabaseManager()
|
||||
@@ -48,7 +48,8 @@ class AsyncWebCrawler:
|
||||
async def awarmup(self):
|
||||
if self.verbose:
|
||||
print("[LOG] 🌤️ Warming up the AsyncWebCrawler")
|
||||
await async_db_manager.ainit_db()
|
||||
# await async_db_manager.ainit_db()
|
||||
await async_db_manager.initialize()
|
||||
await self.arun(
|
||||
url="https://google.com/",
|
||||
word_count_threshold=5,
|
||||
@@ -125,6 +126,7 @@ class AsyncWebCrawler:
|
||||
verbose,
|
||||
bool(cached),
|
||||
async_response=async_response,
|
||||
bypass_cache=bypass_cache,
|
||||
**kwargs,
|
||||
)
|
||||
crawl_result.status_code = async_response.status_code if async_response else 200
|
||||
@@ -243,7 +245,7 @@ class AsyncWebCrawler:
|
||||
|
||||
screenshot = None if not screenshot else screenshot
|
||||
|
||||
if not is_cached:
|
||||
if not is_cached or kwargs.get("bypass_cache", False) or self.always_by_pass_cache:
|
||||
await async_db_manager.acache_url(
|
||||
url,
|
||||
html,
|
||||
@@ -274,7 +276,8 @@ class AsyncWebCrawler:
|
||||
)
|
||||
|
||||
async def aclear_cache(self):
    """Delete all cached crawl results, then release pooled connections."""
    # cleanup() only closes connections. The rows have to be deleted
    # explicitly, otherwise "clearing" the cache leaves every cached page
    # in place.
    await async_db_manager.aclear_db()
    await async_db_manager.cleanup()
|
||||
|
||||
async def aflush_cache(self):
    """Flush the cache by delegating to the shared manager's ``aflush_db``."""
    await async_db_manager.aflush_db()
|
||||
|
||||
@@ -14,12 +14,97 @@ from .utils import (
|
||||
sanitize_html,
|
||||
extract_metadata,
|
||||
InvalidCSSSelectorError,
|
||||
CustomHTML2Text,
|
||||
# CustomHTML2Text,
|
||||
normalize_url,
|
||||
is_external_url
|
||||
|
||||
)
|
||||
|
||||
from .html2text import HTML2Text
|
||||
class CustomHTML2Text(HTML2Text):
    """HTML-to-Markdown converter that can pass selected tags through as raw HTML.

    Tags listed in ``preserve_tags`` are not converted: the element, its
    attributes and everything nested inside it are buffered and written out
    verbatim once the outermost preserved element closes. ``<pre>`` blocks
    are emitted as fenced code blocks.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.inside_pre = False
        self.inside_code = False
        self.preserve_tags = set()  # Set of tags to preserve
        self.current_preserved_tag = None  # outermost tag currently being buffered
        self.preserved_content = []  # raw HTML fragments of the current block
        self.preserve_depth = 0  # nesting level of open preserved tags

        # Configuration options
        self.skip_internal_links = False
        self.single_line_break = False
        self.mark_code = False
        self.include_sup_sub = False
        self.body_width = 0
        self.ignore_mailto_links = True
        self.ignore_links = False
        self.escape_backslash = False
        self.escape_dot = False
        self.escape_plus = False
        self.escape_dash = False
        self.escape_snob = False

    def update_params(self, **kwargs):
        """Update parameters and set preserved tags.

        ``preserve_tags`` is stored as a set; every other keyword is set as
        an attribute of the same name.
        """
        for key, value in kwargs.items():
            if key == 'preserve_tags':
                self.preserve_tags = set(value)
            else:
                setattr(self, key, value)

    @staticmethod
    def _format_open_tag(tag, attrs):
        """Serialize an opening tag, skipping attributes whose value is None."""
        attr_str = ''.join(f' {k}="{v}"' for k, v in attrs.items() if v is not None)
        return f'<{tag}{attr_str}>'

    def handle_tag(self, tag, attrs, start):
        """Route a tag to the preservation buffer, the <pre> handler, or the base class.

        Args:
            tag: Tag name.
            attrs: Mapping of attribute name to value (value may be None).
            start: True for an opening tag, False for a closing tag.
        """
        # Handle preserved tags
        if tag in self.preserve_tags:
            if start:
                if self.preserve_depth == 0:
                    self.current_preserved_tag = tag
                    self.preserved_content = []
                # Record the opening tag at every depth so nested preserved
                # elements keep their markup in the output.
                self.preserved_content.append(self._format_open_tag(tag, attrs))
                self.preserve_depth += 1
            elif self.preserve_depth > 0:
                self.preserve_depth -= 1
                self.preserved_content.append(f'</{tag}>')
                if self.preserve_depth == 0:
                    # Output the preserved HTML block with proper spacing
                    preserved_html = ''.join(self.preserved_content)
                    self.o('\n' + preserved_html + '\n')
                    self.current_preserved_tag = None
            # A closing tag with no matching opener (depth already 0) is
            # ignored so the depth counter can never go negative.
            return

        # If we're inside a preserved tag, collect all content
        if self.preserve_depth > 0:
            if start:
                self.preserved_content.append(self._format_open_tag(tag, attrs))
            else:
                self.preserved_content.append(f'</{tag}>')
            return

        # Handle pre tags
        if tag == 'pre':
            if start:
                self.o('```\n')
                self.inside_pre = True
            else:
                self.o('\n```')
                self.inside_pre = False
        elif tag in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            # NOTE(review): heading tags are dropped here and never reach the
            # base class -- confirm this is intentional.
            pass
        else:
            super().handle_tag(tag, attrs, start)

    def handle_data(self, data, entity_char=False):
        """Override handle_data to capture content within preserved tags."""
        if self.preserve_depth > 0:
            self.preserved_content.append(data)
            return
        super().handle_data(data, entity_char)
|
||||
|
||||
class ContentScrappingStrategy(ABC):
|
||||
@abstractmethod
|
||||
def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
|
||||
|
||||
@@ -178,7 +178,7 @@ def escape_json_string(s):
|
||||
|
||||
return s
|
||||
|
||||
class CustomHTML2Text(HTML2Text):
|
||||
class CustomHTML2Text_v0(HTML2Text):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.inside_pre = False
|
||||
@@ -981,6 +981,19 @@ def format_html(html_string):
|
||||
return soup.prettify()
|
||||
|
||||
def normalize_url(href, base_url):
    """Resolve ``href`` against ``base_url`` and return the absolute URL.

    Surrounding whitespace in ``href`` is stripped before joining.

    Raises:
        ValueError: if ``base_url`` has no scheme or no network location.
    """
    from urllib.parse import urljoin, urlparse

    # A usable base needs both a scheme and a host.
    base_parts = urlparse(base_url)
    if not (base_parts.scheme and base_parts.netloc):
        raise ValueError(f"Invalid base URL format: {base_url}")

    # urljoin covers absolute, root-relative, relative and scheme-relative hrefs.
    return urljoin(base_url, href.strip())
|
||||
|
||||
def normalize_url_tmp(href, base_url):
|
||||
"""Normalize URLs to ensure consistent format"""
|
||||
# Extract protocol and domain from base URL
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user