Compare commits
5 Commits
scraper-uc
...
0.3.7
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b309bc34e1 | ||
|
|
b8147b64e0 | ||
|
|
aab6ea022e | ||
|
|
dd17ed0e63 | ||
|
|
768aa06ceb |
5
.gitignore
vendored
5
.gitignore
vendored
@@ -205,7 +205,4 @@ pypi_build.sh
|
|||||||
git_issues.py
|
git_issues.py
|
||||||
git_issues.md
|
git_issues.md
|
||||||
|
|
||||||
.tests/
|
.tests/
|
||||||
.issues/
|
|
||||||
.docs/
|
|
||||||
.issues/
|
|
||||||
74
CHANGELOG.md
74
CHANGELOG.md
@@ -1,5 +1,79 @@
|
|||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
|
## [v0.3.71] - 2024-10-18
|
||||||
|
|
||||||
|
### Changes
|
||||||
|
1. **Version Update**:
|
||||||
|
- Updated version number from 0.3.7 to 0.3.71.
|
||||||
|
|
||||||
|
2. **Crawler Enhancements**:
|
||||||
|
- Added `sleep_on_close` option to AsyncPlaywrightCrawlerStrategy for delayed browser closure.
|
||||||
|
- Improved context creation with additional options:
|
||||||
|
- Enabled `accept_downloads` and `java_script_enabled`.
|
||||||
|
- Added a cookie to enable cookies by default.
|
||||||
|
|
||||||
|
3. **Error Handling Improvements**:
|
||||||
|
- Enhanced error messages in AsyncWebCrawler's `arun` method.
|
||||||
|
- Updated error reporting format for better visibility and consistency.
|
||||||
|
|
||||||
|
4. **Performance Optimization**:
|
||||||
|
- Commented out automatic page and context closure in `crawl` method to potentially improve performance in certain scenarios.
|
||||||
|
|
||||||
|
### Documentation
|
||||||
|
- Updated quickstart notebook:
|
||||||
|
- Changed installation command to use the released package instead of GitHub repository.
|
||||||
|
- Updated kernel display name.
|
||||||
|
|
||||||
|
### Developer Notes
|
||||||
|
- Minor code refactoring and cleanup.
|
||||||
|
|
||||||
|
## [v0.3.7] - 2024-10-17
|
||||||
|
|
||||||
|
### New Features
|
||||||
|
1. **Enhanced Browser Stealth**:
|
||||||
|
- Implemented `playwright_stealth` for improved bot detection avoidance.
|
||||||
|
- Added `StealthConfig` for fine-tuned control over stealth parameters.
|
||||||
|
|
||||||
|
2. **User Simulation**:
|
||||||
|
- New `simulate_user` option to mimic human-like interactions (mouse movements, clicks, keyboard presses).
|
||||||
|
|
||||||
|
3. **Navigator Override**:
|
||||||
|
- Added `override_navigator` option to modify navigator properties, further improving bot detection evasion.
|
||||||
|
|
||||||
|
4. **Improved iframe Handling**:
|
||||||
|
- New `process_iframes` parameter to extract and integrate iframe content into the main page.
|
||||||
|
|
||||||
|
5. **Flexible Browser Selection**:
|
||||||
|
- Support for choosing between Chromium, Firefox, and WebKit browsers.
|
||||||
|
|
||||||
|
6. **Include Links in Markdown**:
|
||||||
|
- Added support for including links in Markdown content, by defining a new flag `include_links_on_markdown` in `crawl` method.
|
||||||
|
|
||||||
|
### Improvements
|
||||||
|
1. **Better Error Handling**:
|
||||||
|
- Enhanced error reporting in WebScrappingStrategy with detailed error messages and suggestions.
|
||||||
|
- Added console message and error logging for better debugging.
|
||||||
|
|
||||||
|
2. **Image Processing Enhancements**:
|
||||||
|
- Improved image dimension updating and filtering logic.
|
||||||
|
|
||||||
|
3. **Crawling Flexibility**:
|
||||||
|
- Added support for custom viewport sizes.
|
||||||
|
- Implemented delayed content retrieval with `delay_before_return_html` parameter.
|
||||||
|
|
||||||
|
4. **Performance Optimization**:
|
||||||
|
- Adjusted default semaphore count for parallel crawling.
|
||||||
|
|
||||||
|
### Bug Fixes
|
||||||
|
- Fixed an issue where the HTML content could be empty after processing.
|
||||||
|
|
||||||
|
### Examples
|
||||||
|
- Added new example `crawl_with_user_simulation()` demonstrating the use of user simulation and navigator override features.
|
||||||
|
|
||||||
|
### Developer Notes
|
||||||
|
- Refactored code for better maintainability and readability.
|
||||||
|
- Updated browser launch arguments for improved compatibility and performance.
|
||||||
|
|
||||||
## [v0.3.6] - 2024-10-12
|
## [v0.3.6] - 2024-10-12
|
||||||
|
|
||||||
### 1. Improved Crawling Control
|
### 1. Improved Crawling Control
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
from .async_webcrawler import AsyncWebCrawler
|
from .async_webcrawler import AsyncWebCrawler
|
||||||
from .models import CrawlResult
|
from .models import CrawlResult
|
||||||
|
|
||||||
__version__ = "0.3.6"
|
__version__ = "0.3.71"
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"AsyncWebCrawler",
|
"AsyncWebCrawler",
|
||||||
|
|||||||
558
crawl4ai/async_crawler_strategy copy.py
Normal file
558
crawl4ai/async_crawler_strategy copy.py
Normal file
@@ -0,0 +1,558 @@
|
|||||||
|
import asyncio
|
||||||
|
import base64
|
||||||
|
import time
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from typing import Callable, Dict, Any, List, Optional, Awaitable
|
||||||
|
import os
|
||||||
|
from playwright.async_api import async_playwright, Page, Browser, Error
|
||||||
|
from io import BytesIO
|
||||||
|
from PIL import Image, ImageDraw, ImageFont
|
||||||
|
from pathlib import Path
|
||||||
|
from playwright.async_api import ProxySettings
|
||||||
|
from pydantic import BaseModel
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
import uuid
|
||||||
|
from playwright_stealth import stealth_async
|
||||||
|
|
||||||
|
class AsyncCrawlResponse(BaseModel):
    """Result of a single crawl: page HTML plus response metadata.

    Fields:
        html: The retrieved page content.
        response_headers: HTTP response headers of the final navigation.
        status_code: HTTP status code of the final navigation.
        screenshot: Base64-encoded screenshot, if one was requested.
        get_delayed_content: Optional async callable that waits a delay and
            re-reads the live page content.
    """

    html: str
    response_headers: Dict[str, str]
    status_code: int
    screenshot: Optional[str] = None
    get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None

    class Config:
        # The delayed-content callable is not a pydantic-native type,
        # so arbitrary types must be permitted.
        arbitrary_types_allowed = True
|
class AsyncCrawlerStrategy(ABC):
    """Abstract interface every asynchronous crawler strategy must implement."""

    @abstractmethod
    async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
        """Fetch a single URL and return its crawl response."""
        pass

    @abstractmethod
    async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]:
        """Fetch several URLs, typically with bounded concurrency."""
        pass

    @abstractmethod
    async def take_screenshot(self, url: str) -> str:
        """Return a base64-encoded screenshot of the page at *url*."""
        pass

    @abstractmethod
    def update_user_agent(self, user_agent: str):
        """Replace the User-Agent string used for subsequent requests."""
        pass

    @abstractmethod
    def set_hook(self, hook_type: str, hook: Callable):
        """Register a callback for a named lifecycle hook."""
        pass
class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
    """Playwright-backed crawler strategy with session reuse and lifecycle hooks.

    Manages a single shared browser instance; per-call contexts/pages are
    created on demand, or cached by ``session_id`` with a TTL.
    """

    def __init__(self, use_cached_html=False, js_code=None, **kwargs):
        self.use_cached_html = use_cached_html
        self.user_agent = kwargs.get(
            "user_agent",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        )
        self.proxy = kwargs.get("proxy")
        self.headless = kwargs.get("headless", True)
        self.browser_type = kwargs.get("browser_type", "chromium")
        self.headers = kwargs.get("headers", {})
        # session_id -> (context, page, last_used_timestamp)
        self.sessions = {}
        self.session_ttl = 1800  # seconds before an idle session is reaped
        self.js_code = js_code
        self.verbose = kwargs.get("verbose", False)
        self.playwright = None
        self.browser = None
        # Lifecycle hooks; each may be a sync or async callable (or None).
        self.hooks = {
            'on_browser_created': None,
            'on_user_agent_updated': None,
            'on_execution_started': None,
            'before_goto': None,
            'after_goto': None,
            'before_return_html': None,
            'before_retrieve_html': None
        }

    async def __aenter__(self):
        await self.start()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.close()

    async def start(self):
        """Lazily start Playwright and launch the configured browser."""
        if self.playwright is None:
            self.playwright = await async_playwright().start()
        if self.browser is None:
            browser_args = {
                "headless": self.headless,
                "args": [
                    "--disable-gpu",
                    "--no-sandbox",
                    "--disable-dev-shm-usage",
                    "--disable-blink-features=AutomationControlled",
                    "--disable-infobars",
                    "--window-position=0,0",
                    "--ignore-certificate-errors",
                    "--ignore-certificate-errors-spki-list",
                    # "--headless=new", # Use the new headless mode
                ]
            }

            # Add proxy settings if a proxy is specified
            if self.proxy:
                proxy_settings = ProxySettings(server=self.proxy)
                browser_args["proxy"] = proxy_settings

            # Select the appropriate browser based on the browser_type
            if self.browser_type == "firefox":
                self.browser = await self.playwright.firefox.launch(**browser_args)
            elif self.browser_type == "webkit":
                self.browser = await self.playwright.webkit.launch(**browser_args)
            else:
                self.browser = await self.playwright.chromium.launch(**browser_args)

            await self.execute_hook('on_browser_created', self.browser)

    async def close(self):
        """Shut down the browser and stop Playwright."""
        if self.browser:
            await self.browser.close()
            self.browser = None
        if self.playwright:
            await self.playwright.stop()
            self.playwright = None

    def __del__(self):
        # Best-effort cleanup if the strategy was never closed explicitly.
        if self.browser or self.playwright:
            asyncio.get_event_loop().run_until_complete(self.close())

    def set_hook(self, hook_type: str, hook: Callable):
        """Register *hook* under *hook_type*; raises ValueError for unknown types."""
        if hook_type in self.hooks:
            self.hooks[hook_type] = hook
        else:
            raise ValueError(f"Invalid hook type: {hook_type}")

    async def execute_hook(self, hook_type: str, *args):
        """Invoke the registered hook (sync or async); returns first arg if no hook."""
        hook = self.hooks.get(hook_type)
        if hook:
            if asyncio.iscoroutinefunction(hook):
                return await hook(*args)
            else:
                return hook(*args)
        return args[0] if args else None

    def update_user_agent(self, user_agent: str):
        self.user_agent = user_agent

    def set_custom_headers(self, headers: Dict[str, str]):
        self.headers = headers

    async def kill_session(self, session_id: str):
        """Close and forget the cached context/page for *session_id*."""
        if session_id in self.sessions:
            context, page, _ = self.sessions[session_id]
            await page.close()
            await context.close()
            del self.sessions[session_id]

    def _cleanup_expired_sessions(self):
        """Schedule teardown of sessions idle longer than the TTL."""
        current_time = time.time()
        expired_sessions = [
            sid for sid, (_, _, last_used) in self.sessions.items()
            if current_time - last_used > self.session_ttl
        ]
        for sid in expired_sessions:
            asyncio.create_task(self.kill_session(sid))

    async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000):
        """Wait for *wait_for*, auto-detecting CSS selector vs. JS predicate.

        Explicit prefixes ``js:`` / ``css:`` force the interpretation;
        otherwise a CSS selector is tried first with a JS fallback.
        """
        wait_for = wait_for.strip()

        if wait_for.startswith('js:'):
            # Explicitly specified JavaScript
            js_code = wait_for[3:].strip()
            return await self.csp_compliant_wait(page, js_code, timeout)
        elif wait_for.startswith('css:'):
            # Explicitly specified CSS selector
            css_selector = wait_for[4:].strip()
            try:
                await page.wait_for_selector(css_selector, timeout=timeout)
            except Error as e:
                if 'Timeout' in str(e):
                    raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{css_selector}'")
                else:
                    raise ValueError(f"Invalid CSS selector: '{css_selector}'")
        else:
            # Auto-detect based on content
            if wait_for.startswith('()') or wait_for.startswith('function'):
                # It's likely a JavaScript function
                return await self.csp_compliant_wait(page, wait_for, timeout)
            else:
                # Assume it's a CSS selector first
                try:
                    await page.wait_for_selector(wait_for, timeout=timeout)
                except Error as e:
                    if 'Timeout' in str(e):
                        raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{wait_for}'")
                    else:
                        # If it's not a timeout error, it might be an invalid selector
                        # Let's try to evaluate it as a JavaScript function as a fallback
                        try:
                            return await self.csp_compliant_wait(page, f"() => {{{wait_for}}}", timeout)
                        except Error:
                            raise ValueError(f"Invalid wait_for parameter: '{wait_for}'. "
                                             "It should be either a valid CSS selector, a JavaScript function, "
                                             "or explicitly prefixed with 'js:' or 'css:'.")

    async def csp_compliant_wait(self, page: Page, user_wait_function: str, timeout: float = 30000):
        """Poll *user_wait_function* in-page until truthy or *timeout* ms elapse."""
        wrapper_js = f"""
        async () => {{
            const userFunction = {user_wait_function};
            const startTime = Date.now();
            while (true) {{
                if (await userFunction()) {{
                    return true;
                }}
                if (Date.now() - startTime > {timeout}) {{
                    throw new Error('Timeout waiting for condition');
                }}
                await new Promise(resolve => setTimeout(resolve, 100));
            }}
        }}
        """

        try:
            await page.evaluate(wrapper_js)
        except TimeoutError:
            raise TimeoutError(f"Timeout after {timeout}ms waiting for condition")
        except Exception as e:
            raise RuntimeError(f"Error in wait condition: {str(e)}")

    async def process_iframes(self, page):
        """Inline each iframe's body into the parent DOM as a tagged <div>."""
        # Find all iframes
        iframes = await page.query_selector_all('iframe')

        for i, iframe in enumerate(iframes):
            try:
                # Add a unique identifier to the iframe
                await iframe.evaluate(f'(element) => element.id = "iframe-{i}"')

                # Get the frame associated with this iframe
                frame = await iframe.content_frame()

                if frame:
                    # Wait for the frame to load
                    await frame.wait_for_load_state('load', timeout=30000)  # 30 seconds timeout

                    # Extract the content of the iframe's body
                    iframe_content = await frame.evaluate('() => document.body.innerHTML')

                    # Generate a unique class name for this iframe
                    class_name = f'extracted-iframe-content-{i}'

                    # Replace the iframe with a div containing the extracted content
                    _iframe = iframe_content.replace('`', '\\`')
                    await page.evaluate(f"""
                        () => {{
                            const iframe = document.getElementById('iframe-{i}');
                            const div = document.createElement('div');
                            div.innerHTML = `{_iframe}`;
                            div.className = '{class_name}';
                            iframe.replaceWith(div);
                        }}
                    """)
                else:
                    print(f"Warning: Could not access content frame for iframe {i}")
            except Exception as e:
                print(f"Error processing iframe {i}: {str(e)}")

        # Return the page object
        return page

    async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
        """Navigate to *url*, run optional JS/waits, and return the page HTML.

        Honours session reuse (``session_id``), HTML caching, stealth
        navigation, user simulation, iframe inlining and screenshots via
        keyword arguments.
        """
        response_headers = {}
        status_code = None

        self._cleanup_expired_sessions()
        session_id = kwargs.get("session_id")
        if session_id:
            # Reuse (or lazily create) the cached context/page for this session.
            context, page, _ = self.sessions.get(session_id, (None, None, None))
            if not context:
                context = await self.browser.new_context(
                    user_agent=self.user_agent,
                    viewport={"width": 1920, "height": 1080},
                    proxy={"server": self.proxy} if self.proxy else None
                )
                await context.set_extra_http_headers(self.headers)
                page = await context.new_page()
                self.sessions[session_id] = (context, page, time.time())
        else:
            context = await self.browser.new_context(
                user_agent=self.user_agent,
                viewport={"width": 1920, "height": 1080},
                proxy={"server": self.proxy} if self.proxy else None
            )
            await context.set_extra_http_headers(self.headers)

            if kwargs.get("override_navigator", False):
                # Inject scripts to override navigator properties
                await context.add_init_script("""
                    // Pass the Permissions Test.
                    const originalQuery = window.navigator.permissions.query;
                    window.navigator.permissions.query = (parameters) => (
                        parameters.name === 'notifications' ?
                            Promise.resolve({ state: Notification.permission }) :
                            originalQuery(parameters)
                    );
                    Object.defineProperty(navigator, 'webdriver', {
                        get: () => undefined
                    });
                    window.navigator.chrome = {
                        runtime: {},
                        // Add other properties if necessary
                    };
                    Object.defineProperty(navigator, 'plugins', {
                        get: () => [1, 2, 3, 4, 5],
                    });
                    Object.defineProperty(navigator, 'languages', {
                        get: () => ['en-US', 'en'],
                    });
                    Object.defineProperty(document, 'hidden', {
                        get: () => false
                    });
                    Object.defineProperty(document, 'visibilityState', {
                        get: () => 'visible'
                    });
                """)

            page = await context.new_page()

        try:
            if self.verbose:
                print(f"[LOG] 🕸️ Crawling {url} using AsyncPlaywrightCrawlerStrategy...")

            if self.use_cached_html:
                cache_file_path = os.path.join(
                    Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
                )
                if os.path.exists(cache_file_path):
                    html = ""
                    with open(cache_file_path, "r") as f:
                        html = f.read()
                    # retrieve response headers and status code from cache
                    with open(cache_file_path + ".meta", "r") as f:
                        meta = json.load(f)
                        response_headers = meta.get("response_headers", {})
                        status_code = meta.get("status_code")
                    response = AsyncCrawlResponse(
                        html=html, response_headers=response_headers, status_code=status_code
                    )
                    return response

            if not kwargs.get("js_only", False):
                await self.execute_hook('before_goto', page)

                response = await page.goto("about:blank")
                await stealth_async(page)
                response = await page.goto(
                    url, wait_until="domcontentloaded", timeout=kwargs.get("page_timeout", 60000)
                )

                # await stealth_async(page)
                # response = await page.goto("about:blank")
                # await stealth_async(page)
                # await page.evaluate(f"window.location.href = '{url}'")

                await self.execute_hook('after_goto', page)

                # Get status code and headers
                status_code = response.status
                response_headers = response.headers
            else:
                status_code = 200
                response_headers = {}

            await page.wait_for_selector('body')
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")

            js_code = kwargs.get("js_code", kwargs.get("js", self.js_code))
            if js_code:
                if isinstance(js_code, str):
                    await page.evaluate(js_code)
                elif isinstance(js_code, list):
                    for js in js_code:
                        await page.evaluate(js)

            await page.wait_for_load_state('networkidle')
            # Check for on execution event
            await self.execute_hook('on_execution_started', page)

            if kwargs.get("simulate_user", False):
                # Simulate user interactions
                await page.mouse.move(100, 100)
                await page.mouse.down()
                await page.mouse.up()
                await page.keyboard.press('ArrowDown')

            # Handle the wait_for parameter
            wait_for = kwargs.get("wait_for")
            if wait_for:
                try:
                    await self.smart_wait(page, wait_for, timeout=kwargs.get("page_timeout", 60000))
                except Exception as e:
                    raise RuntimeError(f"Wait condition failed: {str(e)}")

            # Update image dimensions
            update_image_dimensions_js = """
            () => {
                return new Promise((resolve) => {
                    const filterImage = (img) => {
                        // Filter out images that are too small
                        if (img.width < 100 && img.height < 100) return false;

                        // Filter out images that are not visible
                        const rect = img.getBoundingClientRect();
                        if (rect.width === 0 || rect.height === 0) return false;

                        // Filter out images with certain class names (e.g., icons, thumbnails)
                        if (img.classList.contains('icon') || img.classList.contains('thumbnail')) return false;

                        // Filter out images with certain patterns in their src (e.g., placeholder images)
                        if (img.src.includes('placeholder') || img.src.includes('icon')) return false;

                        return true;
                    };

                    const images = Array.from(document.querySelectorAll('img')).filter(filterImage);
                    let imagesLeft = images.length;

                    if (imagesLeft === 0) {
                        resolve();
                        return;
                    }

                    const checkImage = (img) => {
                        if (img.complete && img.naturalWidth !== 0) {
                            img.setAttribute('width', img.naturalWidth);
                            img.setAttribute('height', img.naturalHeight);
                            imagesLeft--;
                            if (imagesLeft === 0) resolve();
                        }
                    };

                    images.forEach(img => {
                        checkImage(img);
                        if (!img.complete) {
                            img.onload = () => {
                                checkImage(img);
                            };
                            img.onerror = () => {
                                imagesLeft--;
                                if (imagesLeft === 0) resolve();
                            };
                        }
                    });

                    // Fallback timeout of 5 seconds
                    setTimeout(() => resolve(), 5000);
                });
            }
            """
            await page.evaluate(update_image_dimensions_js)

            # Wait a bit for any onload events to complete
            await page.wait_for_timeout(100)

            # Process iframes
            if kwargs.get("process_iframes", False):
                page = await self.process_iframes(page)

            await self.execute_hook('before_retrieve_html', page)
            # Check if delay_before_return_html is set then wait for that time
            delay_before_return_html = kwargs.get("delay_before_return_html")
            if delay_before_return_html:
                await asyncio.sleep(delay_before_return_html)

            html = await page.content()
            await self.execute_hook('before_return_html', page, html)

            # Check if kwargs has screenshot=True then take screenshot
            screenshot_data = None
            if kwargs.get("screenshot"):
                screenshot_data = await self.take_screenshot(url)

            if self.verbose:
                print(f"[LOG] ✅ Crawled {url} successfully!")

            if self.use_cached_html:
                cache_file_path = os.path.join(
                    Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
                )
                with open(cache_file_path, "w", encoding="utf-8") as f:
                    f.write(html)
                # store response headers and status code in cache
                with open(cache_file_path + ".meta", "w", encoding="utf-8") as f:
                    json.dump({
                        "response_headers": response_headers,
                        "status_code": status_code
                    }, f)

            async def get_delayed_content(delay: float = 5.0) -> str:
                if self.verbose:
                    print(f"[LOG] Waiting for {delay} seconds before retrieving content for {url}")
                await asyncio.sleep(delay)
                return await page.content()

            response = AsyncCrawlResponse(
                html=html,
                response_headers=response_headers,
                status_code=status_code,
                screenshot=screenshot_data,
                get_delayed_content=get_delayed_content
            )
            return response
        except Error as e:
            raise Error(f"Failed to crawl {url}: {str(e)}")
        finally:
            if not session_id:
                await page.close()
                await context.close()

    async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]:
        """Crawl *urls* concurrently, bounded by ``semaphore_count`` (default 5)."""
        semaphore_count = kwargs.get('semaphore_count', 5)  # Adjust as needed
        semaphore = asyncio.Semaphore(semaphore_count)

        async def crawl_with_semaphore(url):
            async with semaphore:
                return await self.crawl(url, **kwargs)

        tasks = [crawl_with_semaphore(url) for url in urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        # Exceptions are converted to their string form rather than raised.
        return [result if not isinstance(result, Exception) else str(result) for result in results]

    async def take_screenshot(self, url: str, wait_time=1000) -> str:
        """Return a base64 full-page screenshot; on failure, a rendered error image."""
        async with await self.browser.new_context(user_agent=self.user_agent) as context:
            page = await context.new_page()
            try:
                await page.goto(url, wait_until="domcontentloaded", timeout=30000)
                # Wait for a specified time (default is 1 second)
                await page.wait_for_timeout(wait_time)
                screenshot = await page.screenshot(full_page=True)
                return base64.b64encode(screenshot).decode('utf-8')
            except Exception as e:
                error_message = f"Failed to take screenshot: {str(e)}"
                print(error_message)

                # Generate an error image
                img = Image.new('RGB', (800, 600), color='black')
                draw = ImageDraw.Draw(img)
                font = ImageFont.load_default()
                draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)

                buffered = BytesIO()
                img.save(buffered, format="JPEG")
                return base64.b64encode(buffered.getvalue()).decode('utf-8')
            finally:
                await page.close()
||||||
@@ -1,17 +1,35 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
import base64, time
|
import base64
|
||||||
|
import time
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import Callable, Dict, Any, List, Optional, Awaitable
|
from typing import Callable, Dict, Any, List, Optional, Awaitable
|
||||||
import os
|
import os
|
||||||
from playwright.async_api import async_playwright, Page, Browser, Error
|
from playwright.async_api import async_playwright, Page, Browser, Error
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from PIL import Image, ImageDraw, ImageFont
|
from PIL import Image, ImageDraw, ImageFont
|
||||||
from .utils import sanitize_input_encode, calculate_semaphore_count
|
|
||||||
import json, uuid
|
|
||||||
import hashlib
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from playwright.async_api import ProxySettings
|
from playwright.async_api import ProxySettings
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
import uuid
|
||||||
|
from playwright_stealth import StealthConfig, stealth_async
|
||||||
|
|
||||||
|
stealth_config = StealthConfig(
|
||||||
|
webdriver=True,
|
||||||
|
chrome_app=True,
|
||||||
|
chrome_csi=True,
|
||||||
|
chrome_load_times=True,
|
||||||
|
chrome_runtime=True,
|
||||||
|
navigator_languages=True,
|
||||||
|
navigator_plugins=True,
|
||||||
|
navigator_permissions=True,
|
||||||
|
webgl_vendor=True,
|
||||||
|
outerdimensions=True,
|
||||||
|
navigator_hardware_concurrency=True,
|
||||||
|
media_codecs=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class AsyncCrawlResponse(BaseModel):
|
class AsyncCrawlResponse(BaseModel):
|
||||||
html: str
|
html: str
|
||||||
@@ -47,10 +65,14 @@ class AsyncCrawlerStrategy(ABC):
|
|||||||
class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||||
def __init__(self, use_cached_html=False, js_code=None, **kwargs):
|
def __init__(self, use_cached_html=False, js_code=None, **kwargs):
|
||||||
self.use_cached_html = use_cached_html
|
self.use_cached_html = use_cached_html
|
||||||
self.user_agent = kwargs.get("user_agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
|
self.user_agent = kwargs.get(
|
||||||
|
"user_agent",
|
||||||
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
||||||
|
"(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
|
||||||
|
)
|
||||||
self.proxy = kwargs.get("proxy")
|
self.proxy = kwargs.get("proxy")
|
||||||
self.headless = kwargs.get("headless", True)
|
self.headless = kwargs.get("headless", True)
|
||||||
self.browser_type = kwargs.get("browser_type", "chromium") # New parameter
|
self.browser_type = kwargs.get("browser_type", "chromium")
|
||||||
self.headers = kwargs.get("headers", {})
|
self.headers = kwargs.get("headers", {})
|
||||||
self.sessions = {}
|
self.sessions = {}
|
||||||
self.session_ttl = 1800
|
self.session_ttl = 1800
|
||||||
@@ -58,6 +80,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
self.verbose = kwargs.get("verbose", False)
|
self.verbose = kwargs.get("verbose", False)
|
||||||
self.playwright = None
|
self.playwright = None
|
||||||
self.browser = None
|
self.browser = None
|
||||||
|
self.sleep_on_close = kwargs.get("sleep_on_close", False)
|
||||||
self.hooks = {
|
self.hooks = {
|
||||||
'on_browser_created': None,
|
'on_browser_created': None,
|
||||||
'on_user_agent_updated': None,
|
'on_user_agent_updated': None,
|
||||||
@@ -83,9 +106,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
"headless": self.headless,
|
"headless": self.headless,
|
||||||
"args": [
|
"args": [
|
||||||
"--disable-gpu",
|
"--disable-gpu",
|
||||||
"--disable-dev-shm-usage",
|
|
||||||
"--disable-setuid-sandbox",
|
|
||||||
"--no-sandbox",
|
"--no-sandbox",
|
||||||
|
"--disable-dev-shm-usage",
|
||||||
|
"--disable-blink-features=AutomationControlled",
|
||||||
|
"--disable-infobars",
|
||||||
|
"--window-position=0,0",
|
||||||
|
"--ignore-certificate-errors",
|
||||||
|
"--ignore-certificate-errors-spki-list",
|
||||||
|
# "--headless=new", # Use the new headless mode
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -94,7 +122,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
proxy_settings = ProxySettings(server=self.proxy)
|
proxy_settings = ProxySettings(server=self.proxy)
|
||||||
browser_args["proxy"] = proxy_settings
|
browser_args["proxy"] = proxy_settings
|
||||||
|
|
||||||
|
|
||||||
# Select the appropriate browser based on the browser_type
|
# Select the appropriate browser based on the browser_type
|
||||||
if self.browser_type == "firefox":
|
if self.browser_type == "firefox":
|
||||||
self.browser = await self.playwright.firefox.launch(**browser_args)
|
self.browser = await self.playwright.firefox.launch(**browser_args)
|
||||||
@@ -106,6 +133,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
await self.execute_hook('on_browser_created', self.browser)
|
await self.execute_hook('on_browser_created', self.browser)
|
||||||
|
|
||||||
async def close(self):
|
async def close(self):
|
||||||
|
if self.sleep_on_close:
|
||||||
|
await asyncio.sleep(500)
|
||||||
if self.browser:
|
if self.browser:
|
||||||
await self.browser.close()
|
await self.browser.close()
|
||||||
self.browser = None
|
self.browser = None
|
||||||
@@ -147,8 +176,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
|
|
||||||
def _cleanup_expired_sessions(self):
|
def _cleanup_expired_sessions(self):
|
||||||
current_time = time.time()
|
current_time = time.time()
|
||||||
expired_sessions = [sid for sid, (_, _, last_used) in self.sessions.items()
|
expired_sessions = [
|
||||||
if current_time - last_used > self.session_ttl]
|
sid for sid, (_, _, last_used) in self.sessions.items()
|
||||||
|
if current_time - last_used > self.session_ttl
|
||||||
|
]
|
||||||
for sid in expired_sessions:
|
for sid in expired_sessions:
|
||||||
asyncio.create_task(self.kill_session(sid))
|
asyncio.create_task(self.kill_session(sid))
|
||||||
|
|
||||||
@@ -188,8 +219,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
return await self.csp_compliant_wait(page, f"() => {{{wait_for}}}", timeout)
|
return await self.csp_compliant_wait(page, f"() => {{{wait_for}}}", timeout)
|
||||||
except Error:
|
except Error:
|
||||||
raise ValueError(f"Invalid wait_for parameter: '{wait_for}'. "
|
raise ValueError(f"Invalid wait_for parameter: '{wait_for}'. "
|
||||||
"It should be either a valid CSS selector, a JavaScript function, "
|
"It should be either a valid CSS selector, a JavaScript function, "
|
||||||
"or explicitly prefixed with 'js:' or 'css:'.")
|
"or explicitly prefixed with 'js:' or 'css:'.")
|
||||||
|
|
||||||
async def csp_compliant_wait(self, page: Page, user_wait_function: str, timeout: float = 30000):
|
async def csp_compliant_wait(self, page: Page, user_wait_function: str, timeout: float = 30000):
|
||||||
wrapper_js = f"""
|
wrapper_js = f"""
|
||||||
@@ -254,8 +285,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
print(f"Error processing iframe {i}: {str(e)}")
|
print(f"Error processing iframe {i}: {str(e)}")
|
||||||
|
|
||||||
# Return the page object
|
# Return the page object
|
||||||
return page
|
return page
|
||||||
|
|
||||||
|
|
||||||
async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
|
async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
|
||||||
response_headers = {}
|
response_headers = {}
|
||||||
@@ -268,25 +298,70 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
if not context:
|
if not context:
|
||||||
context = await self.browser.new_context(
|
context = await self.browser.new_context(
|
||||||
user_agent=self.user_agent,
|
user_agent=self.user_agent,
|
||||||
proxy={"server": self.proxy} if self.proxy else None
|
viewport={"width": 1920, "height": 1080},
|
||||||
|
proxy={"server": self.proxy} if self.proxy else None,
|
||||||
|
accept_downloads=True,
|
||||||
|
java_script_enabled=True
|
||||||
)
|
)
|
||||||
|
await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}])
|
||||||
await context.set_extra_http_headers(self.headers)
|
await context.set_extra_http_headers(self.headers)
|
||||||
page = await context.new_page()
|
page = await context.new_page()
|
||||||
self.sessions[session_id] = (context, page, time.time())
|
self.sessions[session_id] = (context, page, time.time())
|
||||||
else:
|
else:
|
||||||
context = await self.browser.new_context(
|
context = await self.browser.new_context(
|
||||||
user_agent=self.user_agent,
|
user_agent=self.user_agent,
|
||||||
proxy={"server": self.proxy} if self.proxy else None
|
viewport={"width": 1920, "height": 1080},
|
||||||
|
proxy={"server": self.proxy} if self.proxy else None
|
||||||
)
|
)
|
||||||
await context.set_extra_http_headers(self.headers)
|
await context.set_extra_http_headers(self.headers)
|
||||||
|
|
||||||
|
if kwargs.get("override_navigator", False) or kwargs.get("simulate_user", False) or kwargs.get("magic", False):
|
||||||
|
# Inject scripts to override navigator properties
|
||||||
|
await context.add_init_script("""
|
||||||
|
// Pass the Permissions Test.
|
||||||
|
const originalQuery = window.navigator.permissions.query;
|
||||||
|
window.navigator.permissions.query = (parameters) => (
|
||||||
|
parameters.name === 'notifications' ?
|
||||||
|
Promise.resolve({ state: Notification.permission }) :
|
||||||
|
originalQuery(parameters)
|
||||||
|
);
|
||||||
|
Object.defineProperty(navigator, 'webdriver', {
|
||||||
|
get: () => undefined
|
||||||
|
});
|
||||||
|
window.navigator.chrome = {
|
||||||
|
runtime: {},
|
||||||
|
// Add other properties if necessary
|
||||||
|
};
|
||||||
|
Object.defineProperty(navigator, 'plugins', {
|
||||||
|
get: () => [1, 2, 3, 4, 5],
|
||||||
|
});
|
||||||
|
Object.defineProperty(navigator, 'languages', {
|
||||||
|
get: () => ['en-US', 'en'],
|
||||||
|
});
|
||||||
|
Object.defineProperty(document, 'hidden', {
|
||||||
|
get: () => false
|
||||||
|
});
|
||||||
|
Object.defineProperty(document, 'visibilityState', {
|
||||||
|
get: () => 'visible'
|
||||||
|
});
|
||||||
|
""")
|
||||||
|
|
||||||
page = await context.new_page()
|
page = await context.new_page()
|
||||||
|
# await stealth_async(page) #, stealth_config)
|
||||||
|
|
||||||
|
# Add console message and error logging
|
||||||
|
if kwargs.get("log_console", False):
|
||||||
|
page.on("console", lambda msg: print(f"Console: {msg.text}"))
|
||||||
|
page.on("pageerror", lambda exc: print(f"Page Error: {exc}"))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if self.verbose:
|
if self.verbose:
|
||||||
print(f"[LOG] 🕸️ Crawling {url} using AsyncPlaywrightCrawlerStrategy...")
|
print(f"[LOG] 🕸️ Crawling {url} using AsyncPlaywrightCrawlerStrategy...")
|
||||||
|
|
||||||
if self.use_cached_html:
|
if self.use_cached_html:
|
||||||
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest())
|
cache_file_path = os.path.join(
|
||||||
|
Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
|
||||||
|
)
|
||||||
if os.path.exists(cache_file_path):
|
if os.path.exists(cache_file_path):
|
||||||
html = ""
|
html = ""
|
||||||
with open(cache_file_path, "r") as f:
|
with open(cache_file_path, "r") as f:
|
||||||
@@ -296,12 +371,21 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
meta = json.load(f)
|
meta = json.load(f)
|
||||||
response_headers = meta.get("response_headers", {})
|
response_headers = meta.get("response_headers", {})
|
||||||
status_code = meta.get("status_code")
|
status_code = meta.get("status_code")
|
||||||
response = AsyncCrawlResponse(html=html, response_headers=response_headers, status_code=status_code)
|
response = AsyncCrawlResponse(
|
||||||
|
html=html, response_headers=response_headers, status_code=status_code
|
||||||
|
)
|
||||||
return response
|
return response
|
||||||
|
|
||||||
if not kwargs.get("js_only", False):
|
if not kwargs.get("js_only", False):
|
||||||
await self.execute_hook('before_goto', page)
|
await self.execute_hook('before_goto', page)
|
||||||
response = await page.goto(url, wait_until="domcontentloaded", timeout=kwargs.get("page_timeout", 60000))
|
|
||||||
|
response = await page.goto(
|
||||||
|
url, wait_until="domcontentloaded", timeout=kwargs.get("page_timeout", 60000)
|
||||||
|
)
|
||||||
|
|
||||||
|
# response = await page.goto("about:blank")
|
||||||
|
# await page.evaluate(f"window.location.href = '{url}'")
|
||||||
|
|
||||||
await self.execute_hook('after_goto', page)
|
await self.execute_hook('after_goto', page)
|
||||||
|
|
||||||
# Get status code and headers
|
# Get status code and headers
|
||||||
@@ -311,37 +395,29 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
status_code = 200
|
status_code = 200
|
||||||
response_headers = {}
|
response_headers = {}
|
||||||
|
|
||||||
|
|
||||||
await page.wait_for_selector('body')
|
await page.wait_for_selector('body')
|
||||||
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||||
|
|
||||||
js_code = kwargs.get("js_code", kwargs.get("js", self.js_code))
|
js_code = kwargs.get("js_code", kwargs.get("js", self.js_code))
|
||||||
if js_code:
|
if js_code:
|
||||||
if isinstance(js_code, str):
|
if isinstance(js_code, str):
|
||||||
r = await page.evaluate(js_code)
|
await page.evaluate(js_code)
|
||||||
elif isinstance(js_code, list):
|
elif isinstance(js_code, list):
|
||||||
for js in js_code:
|
for js in js_code:
|
||||||
await page.evaluate(js)
|
await page.evaluate(js)
|
||||||
|
|
||||||
# await page.wait_for_timeout(100)
|
|
||||||
await page.wait_for_load_state('networkidle')
|
await page.wait_for_load_state('networkidle')
|
||||||
# Check for on execution even
|
# Check for on execution event
|
||||||
await self.execute_hook('on_execution_started', page)
|
await self.execute_hook('on_execution_started', page)
|
||||||
|
|
||||||
# New code to handle the wait_for parameter
|
if kwargs.get("simulate_user", False) or kwargs.get("magic", False):
|
||||||
# Example usage:
|
# Simulate user interactions
|
||||||
# await crawler.crawl(
|
await page.mouse.move(100, 100)
|
||||||
# url,
|
await page.mouse.down()
|
||||||
# js_code="// some JavaScript code",
|
await page.mouse.up()
|
||||||
# wait_for="""() => {
|
await page.keyboard.press('ArrowDown')
|
||||||
# return document.querySelector('#my-element') !== null;
|
|
||||||
# }"""
|
# Handle the wait_for parameter
|
||||||
# )
|
|
||||||
# Example of using a CSS selector:
|
|
||||||
# await crawler.crawl(
|
|
||||||
# url,
|
|
||||||
# wait_for="#my-element"
|
|
||||||
# )
|
|
||||||
wait_for = kwargs.get("wait_for")
|
wait_for = kwargs.get("wait_for")
|
||||||
if wait_for:
|
if wait_for:
|
||||||
try:
|
try:
|
||||||
@@ -349,13 +425,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise RuntimeError(f"Wait condition failed: {str(e)}")
|
raise RuntimeError(f"Wait condition failed: {str(e)}")
|
||||||
|
|
||||||
# Check if kwargs has screenshot=True then take screenshot
|
# Update image dimensions
|
||||||
screenshot_data = None
|
|
||||||
if kwargs.get("screenshot"):
|
|
||||||
screenshot_data = await self.take_screenshot(url)
|
|
||||||
|
|
||||||
|
|
||||||
# New code to update image dimensions
|
|
||||||
update_image_dimensions_js = """
|
update_image_dimensions_js = """
|
||||||
() => {
|
() => {
|
||||||
return new Promise((resolve) => {
|
return new Promise((resolve) => {
|
||||||
@@ -428,12 +498,19 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
|
|
||||||
html = await page.content()
|
html = await page.content()
|
||||||
await self.execute_hook('before_return_html', page, html)
|
await self.execute_hook('before_return_html', page, html)
|
||||||
|
|
||||||
|
# Check if kwargs has screenshot=True then take screenshot
|
||||||
|
screenshot_data = None
|
||||||
|
if kwargs.get("screenshot"):
|
||||||
|
screenshot_data = await self.take_screenshot(url)
|
||||||
|
|
||||||
if self.verbose:
|
if self.verbose:
|
||||||
print(f"[LOG] ✅ Crawled {url} successfully!")
|
print(f"[LOG] ✅ Crawled {url} successfully!")
|
||||||
|
|
||||||
if self.use_cached_html:
|
if self.use_cached_html:
|
||||||
cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest())
|
cache_file_path = os.path.join(
|
||||||
|
Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
|
||||||
|
)
|
||||||
with open(cache_file_path, "w", encoding="utf-8") as f:
|
with open(cache_file_path, "w", encoding="utf-8") as f:
|
||||||
f.write(html)
|
f.write(html)
|
||||||
# store response headers and status code in cache
|
# store response headers and status code in cache
|
||||||
@@ -443,7 +520,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
"status_code": status_code
|
"status_code": status_code
|
||||||
}, f)
|
}, f)
|
||||||
|
|
||||||
|
|
||||||
async def get_delayed_content(delay: float = 5.0) -> str:
|
async def get_delayed_content(delay: float = 5.0) -> str:
|
||||||
if self.verbose:
|
if self.verbose:
|
||||||
print(f"[LOG] Waiting for {delay} seconds before retrieving content for {url}")
|
print(f"[LOG] Waiting for {delay} seconds before retrieving content for {url}")
|
||||||
@@ -459,63 +535,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
)
|
)
|
||||||
return response
|
return response
|
||||||
except Error as e:
|
except Error as e:
|
||||||
raise Error(f"Failed to crawl {url}: {str(e)}")
|
raise Error(f"[ERROR] 🚫 crawl(): Failed to crawl {url}: {str(e)}")
|
||||||
finally:
|
# finally:
|
||||||
if not session_id:
|
# if not session_id:
|
||||||
await page.close()
|
# await page.close()
|
||||||
|
# await context.close()
|
||||||
|
|
||||||
# try:
|
|
||||||
# html = await _crawl()
|
|
||||||
# return sanitize_input_encode(html)
|
|
||||||
# except Error as e:
|
|
||||||
# raise Error(f"Failed to crawl {url}: {str(e)}")
|
|
||||||
# except Exception as e:
|
|
||||||
# raise Exception(f"Failed to crawl {url}: {str(e)}")
|
|
||||||
|
|
||||||
async def execute_js(self, session_id: str, js_code: str, wait_for_js: str = None, wait_for_css: str = None) -> AsyncCrawlResponse:
|
|
||||||
"""
|
|
||||||
Execute JavaScript code in a specific session and optionally wait for a condition.
|
|
||||||
|
|
||||||
:param session_id: The ID of the session to execute the JS code in.
|
|
||||||
:param js_code: The JavaScript code to execute.
|
|
||||||
:param wait_for_js: JavaScript condition to wait for after execution.
|
|
||||||
:param wait_for_css: CSS selector to wait for after execution.
|
|
||||||
:return: AsyncCrawlResponse containing the page's HTML and other information.
|
|
||||||
:raises ValueError: If the session does not exist.
|
|
||||||
"""
|
|
||||||
if not session_id:
|
|
||||||
raise ValueError("Session ID must be provided")
|
|
||||||
|
|
||||||
if session_id not in self.sessions:
|
|
||||||
raise ValueError(f"No active session found for session ID: {session_id}")
|
|
||||||
|
|
||||||
context, page, last_used = self.sessions[session_id]
|
|
||||||
|
|
||||||
try:
|
|
||||||
await page.evaluate(js_code)
|
|
||||||
|
|
||||||
if wait_for_js:
|
|
||||||
await page.wait_for_function(wait_for_js)
|
|
||||||
|
|
||||||
if wait_for_css:
|
|
||||||
await page.wait_for_selector(wait_for_css)
|
|
||||||
|
|
||||||
# Get the updated HTML content
|
|
||||||
html = await page.content()
|
|
||||||
|
|
||||||
# Get response headers and status code (assuming these are available)
|
|
||||||
response_headers = await page.evaluate("() => JSON.stringify(performance.getEntriesByType('resource')[0].responseHeaders)")
|
|
||||||
status_code = await page.evaluate("() => performance.getEntriesByType('resource')[0].responseStatus")
|
|
||||||
|
|
||||||
# Update the last used time for this session
|
|
||||||
self.sessions[session_id] = (context, page, time.time())
|
|
||||||
|
|
||||||
return AsyncCrawlResponse(html=html, response_headers=response_headers, status_code=status_code)
|
|
||||||
except Error as e:
|
|
||||||
raise Error(f"Failed to execute JavaScript or wait for condition in session {session_id}: {str(e)}")
|
|
||||||
|
|
||||||
async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]:
|
async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]:
|
||||||
semaphore_count = kwargs.get('semaphore_count', calculate_semaphore_count())
|
semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed
|
||||||
semaphore = asyncio.Semaphore(semaphore_count)
|
semaphore = asyncio.Semaphore(semaphore_count)
|
||||||
|
|
||||||
async def crawl_with_semaphore(url):
|
async def crawl_with_semaphore(url):
|
||||||
@@ -526,7 +553,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||||
return [result if not isinstance(result, Exception) else str(result) for result in results]
|
return [result if not isinstance(result, Exception) else str(result) for result in results]
|
||||||
|
|
||||||
async def take_screenshot(self, url: str, wait_time = 1000) -> str:
|
async def take_screenshot(self, url: str, wait_time=1000) -> str:
|
||||||
async with await self.browser.new_context(user_agent=self.user_agent) as context:
|
async with await self.browser.new_context(user_agent=self.user_agent) as context:
|
||||||
page = await context.new_page()
|
page = await context.new_page()
|
||||||
try:
|
try:
|
||||||
@@ -549,4 +576,5 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
img.save(buffered, format="JPEG")
|
img.save(buffered, format="JPEG")
|
||||||
return base64.b64encode(buffered.getvalue()).decode('utf-8')
|
return base64.b64encode(buffered.getvalue()).decode('utf-8')
|
||||||
finally:
|
finally:
|
||||||
await page.close()
|
await page.close()
|
||||||
|
|
||||||
|
|||||||
@@ -133,8 +133,8 @@ class AsyncWebCrawler:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
if not hasattr(e, "msg"):
|
if not hasattr(e, "msg"):
|
||||||
e.msg = str(e)
|
e.msg = str(e)
|
||||||
print(f"[ERROR] 🚫 Failed to crawl {url}, error: {e.msg}")
|
print(f"[ERROR] 🚫 arun(): Failed to crawl {url}, error: {e.msg}")
|
||||||
return CrawlResult(url=url, html="", success=False, error_message=e.msg)
|
return CrawlResult(url=url, html="", markdown = f"[ERROR] 🚫 arun(): Failed to crawl {url}, error: {e.msg}", success=False, error_message=e.msg)
|
||||||
|
|
||||||
async def arun_many(
|
async def arun_many(
|
||||||
self,
|
self,
|
||||||
@@ -195,6 +195,7 @@ class AsyncWebCrawler:
|
|||||||
image_description_min_word_threshold=kwargs.get(
|
image_description_min_word_threshold=kwargs.get(
|
||||||
"image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
|
"image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
|
||||||
),
|
),
|
||||||
|
**kwargs,
|
||||||
)
|
)
|
||||||
if verbose:
|
if verbose:
|
||||||
print(
|
print(
|
||||||
|
|||||||
@@ -33,6 +33,7 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
|||||||
return await asyncio.to_thread(self._get_content_of_website_optimized, url, html, **kwargs)
|
return await asyncio.to_thread(self._get_content_of_website_optimized, url, html, **kwargs)
|
||||||
|
|
||||||
def _get_content_of_website_optimized(self, url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, **kwargs) -> Dict[str, Any]:
|
def _get_content_of_website_optimized(self, url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, **kwargs) -> Dict[str, Any]:
|
||||||
|
success = True
|
||||||
if not html:
|
if not html:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@@ -273,10 +274,41 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
|||||||
if base64_pattern.match(src):
|
if base64_pattern.match(src):
|
||||||
# Replace base64 data with empty string
|
# Replace base64 data with empty string
|
||||||
img['src'] = base64_pattern.sub('', src)
|
img['src'] = base64_pattern.sub('', src)
|
||||||
|
|
||||||
|
try:
|
||||||
|
str(body)
|
||||||
|
except Exception as e:
|
||||||
|
# Reset body to the original HTML
|
||||||
|
success = False
|
||||||
|
body = BeautifulSoup(html, 'html.parser')
|
||||||
|
|
||||||
|
# Create a new div with a special ID
|
||||||
|
error_div = body.new_tag('div', id='crawl4ai_error_message')
|
||||||
|
error_div.string = '''
|
||||||
|
Crawl4AI Error: This page is not fully supported.
|
||||||
|
|
||||||
|
Possible reasons:
|
||||||
|
1. The page may have restrictions that prevent crawling.
|
||||||
|
2. The page might not be fully loaded.
|
||||||
|
|
||||||
|
Suggestions:
|
||||||
|
- Try calling the crawl function with these parameters:
|
||||||
|
magic=True,
|
||||||
|
- Set headless=False to visualize what's happening on the page.
|
||||||
|
|
||||||
|
If the issue persists, please check the page's structure and any potential anti-crawling measures.
|
||||||
|
'''
|
||||||
|
|
||||||
|
# Append the error div to the body
|
||||||
|
body.body.append(error_div)
|
||||||
|
|
||||||
|
print(f"[LOG] 😧 Error: After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.")
|
||||||
|
|
||||||
|
|
||||||
cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ')
|
cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ')
|
||||||
|
|
||||||
h = CustomHTML2Text()
|
h = CustomHTML2Text()
|
||||||
h.ignore_links = True
|
h.ignore_links = not kwargs.get('include_links_on_markdown', False)
|
||||||
h.body_width = 0
|
h.body_width = 0
|
||||||
try:
|
try:
|
||||||
markdown = h.handle(cleaned_html)
|
markdown = h.handle(cleaned_html)
|
||||||
@@ -294,7 +326,7 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
|||||||
return {
|
return {
|
||||||
'markdown': markdown,
|
'markdown': markdown,
|
||||||
'cleaned_html': cleaned_html,
|
'cleaned_html': cleaned_html,
|
||||||
'success': True,
|
'success': success,
|
||||||
'media': media,
|
'media': media,
|
||||||
'links': links,
|
'links': links,
|
||||||
'metadata': meta
|
'metadata': meta
|
||||||
|
|||||||
@@ -1,3 +0,0 @@
|
|||||||
from .async_web_scraper import AsyncWebScraper
|
|
||||||
from .bfs_scraper_strategy import BFSScraperStrategy
|
|
||||||
from .filters import URLFilter, FilterChain, URLPatternFilter, ContentTypeFilter
|
|
||||||
@@ -1,123 +0,0 @@
|
|||||||
from typing import Union, AsyncGenerator, Optional
|
|
||||||
from .scraper_strategy import ScraperStrategy
|
|
||||||
from .models import ScraperResult, CrawlResult
|
|
||||||
from ..async_webcrawler import AsyncWebCrawler
|
|
||||||
import logging
|
|
||||||
from dataclasses import dataclass
|
|
||||||
from contextlib import asynccontextmanager
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class ScrapingProgress:
|
|
||||||
"""Tracks the progress of a scraping operation."""
|
|
||||||
processed_urls: int = 0
|
|
||||||
failed_urls: int = 0
|
|
||||||
current_url: Optional[str] = None
|
|
||||||
|
|
||||||
class AsyncWebScraper:
|
|
||||||
"""
|
|
||||||
A high-level web scraper that combines an async crawler with a scraping strategy.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
crawler (AsyncWebCrawler): The async web crawler implementation
|
|
||||||
strategy (ScraperStrategy): The scraping strategy to use
|
|
||||||
logger (Optional[logging.Logger]): Custom logger for the scraper
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
crawler: AsyncWebCrawler,
|
|
||||||
strategy: ScraperStrategy,
|
|
||||||
logger: Optional[logging.Logger] = None
|
|
||||||
):
|
|
||||||
if not isinstance(crawler, AsyncWebCrawler):
|
|
||||||
raise TypeError("crawler must be an instance of AsyncWebCrawler")
|
|
||||||
if not isinstance(strategy, ScraperStrategy):
|
|
||||||
raise TypeError("strategy must be an instance of ScraperStrategy")
|
|
||||||
|
|
||||||
self.crawler = crawler
|
|
||||||
self.strategy = strategy
|
|
||||||
self.logger = logger or logging.getLogger(__name__)
|
|
||||||
self._progress = ScrapingProgress()
|
|
||||||
|
|
||||||
@property
|
|
||||||
def progress(self) -> ScrapingProgress:
|
|
||||||
"""Get current scraping progress."""
|
|
||||||
return self._progress
|
|
||||||
|
|
||||||
@asynccontextmanager
|
|
||||||
async def _error_handling_context(self, url: str):
|
|
||||||
"""Context manager for handling errors during scraping."""
|
|
||||||
try:
|
|
||||||
yield
|
|
||||||
except Exception as e:
|
|
||||||
self.logger.error(f"Error scraping {url}: {str(e)}")
|
|
||||||
self._progress.failed_urls += 1
|
|
||||||
raise
|
|
||||||
|
|
||||||
async def ascrape(
|
|
||||||
self,
|
|
||||||
url: str,
|
|
||||||
parallel_processing: bool = True,
|
|
||||||
stream: bool = False
|
|
||||||
) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]:
|
|
||||||
"""
|
|
||||||
Scrape a website starting from the given URL.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
url: Starting URL for scraping
|
|
||||||
parallel_processing: Whether to process URLs in parallel
|
|
||||||
stream: If True, yield results as they come; if False, collect all results
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Either an async generator yielding CrawlResults or a final ScraperResult
|
|
||||||
"""
|
|
||||||
self._progress = ScrapingProgress() # Reset progress
|
|
||||||
|
|
||||||
async with self._error_handling_context(url):
|
|
||||||
if stream:
|
|
||||||
return self._ascrape_yielding(url, parallel_processing)
|
|
||||||
return await self._ascrape_collecting(url, parallel_processing)
|
|
||||||
|
|
||||||
async def _ascrape_yielding(
|
|
||||||
self,
|
|
||||||
url: str,
|
|
||||||
parallel_processing: bool
|
|
||||||
) -> AsyncGenerator[CrawlResult, None]:
|
|
||||||
"""Stream scraping results as they become available."""
|
|
||||||
try:
|
|
||||||
result_generator = self.strategy.ascrape(url, self.crawler, parallel_processing)
|
|
||||||
async for res in result_generator:
|
|
||||||
self._progress.processed_urls += 1
|
|
||||||
self._progress.current_url = res.url
|
|
||||||
yield res
|
|
||||||
except Exception as e:
|
|
||||||
self.logger.error(f"Error in streaming scrape: {str(e)}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
async def _ascrape_collecting(
|
|
||||||
self,
|
|
||||||
url: str,
|
|
||||||
parallel_processing: bool
|
|
||||||
) -> ScraperResult:
|
|
||||||
"""Collect all scraping results before returning."""
|
|
||||||
extracted_data = {}
|
|
||||||
|
|
||||||
try:
|
|
||||||
result_generator = self.strategy.ascrape(url, self.crawler, parallel_processing)
|
|
||||||
async for res in result_generator:
|
|
||||||
self._progress.processed_urls += 1
|
|
||||||
self._progress.current_url = res.url
|
|
||||||
extracted_data[res.url] = res
|
|
||||||
|
|
||||||
return ScraperResult(
|
|
||||||
url=url,
|
|
||||||
crawled_urls=list(extracted_data.keys()),
|
|
||||||
extracted_data=extracted_data,
|
|
||||||
stats={
|
|
||||||
'processed_urls': self._progress.processed_urls,
|
|
||||||
'failed_urls': self._progress.failed_urls
|
|
||||||
}
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
self.logger.error(f"Error in collecting scrape: {str(e)}")
|
|
||||||
raise
|
|
||||||
@@ -1,327 +0,0 @@
|
|||||||
from abc import ABC, abstractmethod
|
|
||||||
from typing import Union, AsyncGenerator, Optional, Dict, Set
|
|
||||||
from dataclasses import dataclass
|
|
||||||
from datetime import datetime
|
|
||||||
import asyncio
|
|
||||||
import logging
|
|
||||||
from urllib.parse import urljoin, urlparse, urlunparse
|
|
||||||
from urllib.robotparser import RobotFileParser
|
|
||||||
import validators
|
|
||||||
import time
|
|
||||||
from aiolimiter import AsyncLimiter
|
|
||||||
from tenacity import retry, stop_after_attempt, wait_exponential
|
|
||||||
from collections import defaultdict
|
|
||||||
|
|
||||||
from .models import ScraperResult, CrawlResult
|
|
||||||
from .filters import FilterChain
|
|
||||||
from .scorers import URLScorer
|
|
||||||
from ..async_webcrawler import AsyncWebCrawler
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class CrawlStats:
|
|
||||||
"""Statistics for the crawling process"""
|
|
||||||
start_time: datetime
|
|
||||||
urls_processed: int = 0
|
|
||||||
urls_failed: int = 0
|
|
||||||
urls_skipped: int = 0
|
|
||||||
total_depth_reached: int = 0
|
|
||||||
current_depth: int = 0
|
|
||||||
robots_blocked: int = 0
|
|
||||||
|
|
||||||
class ScraperStrategy(ABC):
|
|
||||||
"""Base class for scraping strategies"""
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
async def ascrape(
|
|
||||||
self,
|
|
||||||
url: str,
|
|
||||||
crawler: AsyncWebCrawler,
|
|
||||||
parallel_processing: bool = True,
|
|
||||||
stream: bool = False
|
|
||||||
) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]:
|
|
||||||
"""Abstract method for scraping implementation"""
|
|
||||||
pass
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
async def can_process_url(self, url: str) -> bool:
|
|
||||||
"""Check if URL can be processed based on strategy rules"""
|
|
||||||
pass
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
async def shutdown(self):
|
|
||||||
"""Clean up resources used by the strategy"""
|
|
||||||
pass
|
|
||||||
|
|
||||||
class BFSScraperStrategy(ScraperStrategy):
|
|
||||||
"""Breadth-First Search scraping strategy with politeness controls"""
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
max_depth: int,
|
|
||||||
filter_chain: FilterChain,
|
|
||||||
url_scorer: URLScorer,
|
|
||||||
max_concurrent: int = 5,
|
|
||||||
min_crawl_delay: int = 1,
|
|
||||||
timeout: int = 30,
|
|
||||||
logger: Optional[logging.Logger] = None
|
|
||||||
):
|
|
||||||
self.max_depth = max_depth
|
|
||||||
self.filter_chain = filter_chain
|
|
||||||
self.url_scorer = url_scorer
|
|
||||||
self.max_concurrent = max_concurrent
|
|
||||||
self.min_crawl_delay = min_crawl_delay
|
|
||||||
self.timeout = timeout
|
|
||||||
self.logger = logger or logging.getLogger(__name__)
|
|
||||||
|
|
||||||
# Crawl control
|
|
||||||
self.stats = CrawlStats(start_time=datetime.now())
|
|
||||||
self._cancel_event = asyncio.Event()
|
|
||||||
self.process_external_links = False
|
|
||||||
|
|
||||||
# Rate limiting and politeness
|
|
||||||
self.rate_limiter = AsyncLimiter(1, 1)
|
|
||||||
self.last_crawl_time = defaultdict(float)
|
|
||||||
self.robot_parsers: Dict[str, RobotFileParser] = {}
|
|
||||||
self.domain_queues: Dict[str, asyncio.Queue] = defaultdict(asyncio.Queue)
|
|
||||||
|
|
||||||
async def can_process_url(self, url: str) -> bool:
|
|
||||||
"""Check if URL can be processed based on robots.txt and filters
|
|
||||||
This is our gatekeeper method that determines if a URL should be processed. It:
|
|
||||||
- Validates URL format using the validators library
|
|
||||||
- Checks robots.txt permissions for the domain
|
|
||||||
- Applies custom filters from the filter chain
|
|
||||||
- Updates statistics for blocked URLs
|
|
||||||
- Returns False early if any check fails
|
|
||||||
"""
|
|
||||||
if not validators.url(url):
|
|
||||||
self.logger.warning(f"Invalid URL: {url}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
robot_parser = await self._get_robot_parser(url)
|
|
||||||
if robot_parser and not robot_parser.can_fetch("*", url):
|
|
||||||
self.stats.robots_blocked += 1
|
|
||||||
self.logger.info(f"Blocked by robots.txt: {url}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
return self.filter_chain.apply(url)
|
|
||||||
|
|
||||||
async def _get_robot_parser(self, url: str) -> Optional[RobotFileParser]:
|
|
||||||
"""Get or create robots.txt parser for domain.
|
|
||||||
This is our robots.txt manager that:
|
|
||||||
- Uses domain-level caching of robot parsers
|
|
||||||
- Creates and caches new parsers as needed
|
|
||||||
- Handles failed robots.txt fetches gracefully
|
|
||||||
- Returns None if robots.txt can't be fetched, allowing crawling to proceed
|
|
||||||
"""
|
|
||||||
domain = urlparse(url).netloc
|
|
||||||
if domain not in self.robot_parsers:
|
|
||||||
parser = RobotFileParser()
|
|
||||||
try:
|
|
||||||
robots_url = f"{urlparse(url).scheme}://{domain}/robots.txt"
|
|
||||||
parser.set_url(robots_url)
|
|
||||||
parser.read()
|
|
||||||
self.robot_parsers[domain] = parser
|
|
||||||
except Exception as e:
|
|
||||||
self.logger.warning(f"Error fetching robots.txt for {domain}: {e}")
|
|
||||||
return None
|
|
||||||
return self.robot_parsers[domain]
|
|
||||||
|
|
||||||
# Retries up to 3 times with exponential backoff (waits between 4s and 10s)
# on any exception raised below, including the re-raised TimeoutError.
@retry(stop=stop_after_attempt(3),
       wait=wait_exponential(multiplier=1, min=4, max=10))
async def _crawl_with_retry(
    self,
    crawler: AsyncWebCrawler,
    url: str
) -> CrawlResult:
    """Crawl URL with retry logic.

    Args:
        crawler: Crawler instance used to fetch the page.
        url: The URL to crawl.

    Returns:
        The crawl result for *url*.

    Raises:
        asyncio.TimeoutError: If a single attempt exceeds ``self.timeout``.
            (``asyncio.timeout`` requires Python 3.11+.)
    """
    try:
        async with asyncio.timeout(self.timeout):
            return await crawler.arun(url)
    except asyncio.TimeoutError:
        self.logger.error(f"Timeout crawling {url}")
        raise
|
|
||||||
|
|
||||||
async def process_url(
    self,
    url: str,
    depth: int,
    crawler: AsyncWebCrawler,
    queue: asyncio.PriorityQueue,
    visited: Set[str],
    depths: Dict[str, int]
) -> Optional[CrawlResult]:
    """Process a single URL and extract links.

    This is our main URL processing workhorse that:
    - Checks for cancellation
    - Validates URLs through can_process_url
    - Implements politeness delays per domain
    - Applies rate limiting
    - Handles crawling with retries
    - Updates various statistics
    - Processes extracted links
    - Returns the crawl result or None on failure
    """

    # Bail out quickly if a shutdown was requested.
    if self._cancel_event.is_set():
        return None

    if not await self.can_process_url(url):
        self.stats.urls_skipped += 1
        return None

    # Politeness delay: wait until min_crawl_delay has elapsed since the
    # last request to this domain. last_crawl_time is stamped *before*
    # the fetch, so the delay is measured between request starts.
    domain = urlparse(url).netloc
    time_since_last = time.time() - self.last_crawl_time[domain]
    if time_since_last < self.min_crawl_delay:
        await asyncio.sleep(self.min_crawl_delay - time_since_last)
    self.last_crawl_time[domain] = time.time()

    # Crawl with rate limiting
    try:
        async with self.rate_limiter:
            result = await self._crawl_with_retry(crawler, url)
            self.stats.urls_processed += 1
    except Exception as e:
        # All retries exhausted (or a non-retryable failure): count and skip.
        self.logger.error(f"Error crawling {url}: {e}")
        self.stats.urls_failed += 1
        return None

    # Queue any newly discovered links for later crawling.
    await self._process_links(result, url, depth, queue, visited, depths)

    return result
|
|
||||||
|
|
||||||
async def _process_links(
    self,
    result: CrawlResult,
    source_url: str,
    depth: int,
    queue: asyncio.PriorityQueue,
    visited: Set[str],
    depths: Dict[str, int]
):
    """Process extracted links from crawl result.

    This is our link processor that:
    - Handles both internal and external links
    - Checks depth limits
    - Scores URLs for priority
    - Updates depth tracking
    - Adds valid URLs to the queue
    - Updates maximum depth statistics

    Args:
        result: The crawl result whose ``links`` dict is consumed.
        source_url: URL the links were extracted from (used for depth).
        depth: Depth of *source_url* (kept for interface compatibility).
        queue: Priority queue receiving ``(score, depth, url)`` tuples.
        visited: URLs already seen; used to skip duplicates.
        depths: Mapping of URL -> depth, updated for queued links.

    Fix over the original: ``links_ro_process`` was built as a *list of
    link dicts* but then iterated as if it held the category names, so
    ``result.links[link_type]`` indexed the dict with a dict and raised a
    TypeError. We now iterate category names ("internal"/"external").
    """
    # External links are opt-in via self.process_external_links.
    link_types = ["internal"]
    if self.process_external_links:
        link_types.append("external")

    for link_type in link_types:
        for link in result.links[link_type]:
            url = link['href']

            if url not in visited and await self.can_process_url(url):
                new_depth = depths[source_url] + 1
                if new_depth <= self.max_depth:
                    # Lower scores are dequeued first by the priority queue.
                    score = self.url_scorer.score(url)
                    await queue.put((score, new_depth, url))
                    depths[url] = new_depth
                    self.stats.total_depth_reached = max(
                        self.stats.total_depth_reached,
                        new_depth
                    )
|
|
||||||
|
|
||||||
async def ascrape(
    self,
    start_url: str,
    crawler: AsyncWebCrawler,
    parallel_processing: bool = True
) -> AsyncGenerator[CrawlResult, None]:
    """Implement BFS crawling strategy.

    Yields CrawlResult objects as pages are crawled, starting from
    *start_url*, until the queue drains or cancellation is requested.
    """

    # Initialize crawl state
    """
    queue: A priority queue where items are tuples of (score, depth, url)
    Score: Determines crawling priority (lower = higher priority)
    Depth: Current distance from start_url
    URL: The actual URL to crawl
    visited: Keeps track of URLs we've already seen to avoid cycles
    depths: Maps URLs to their depths from the start URL
    pending_tasks: Tracks currently running crawl tasks
    """
    queue = asyncio.PriorityQueue()
    await queue.put((0, 0, start_url))
    visited: Set[str] = set()
    depths = {start_url: 0}
    pending_tasks = set()

    try:
        while (not queue.empty() or pending_tasks) and not self._cancel_event.is_set():
            """
            This sets up our main control loop which:
            - Continues while there are URLs to process (not queue.empty())
            - Or while there are tasks still running (pending_tasks)
            - Can be interrupted via cancellation (not self._cancel_event.is_set())
            """
            # Start new tasks up to max_concurrent
            while not queue.empty() and len(pending_tasks) < self.max_concurrent:
                """
                This section manages task creation:
                Checks if we can start more tasks (under max_concurrent limit)
                Gets the next URL from the priority queue
                Marks URLs as visited immediately to prevent duplicates
                Updates current depth in stats
                Either:
                Creates a new async task (parallel mode)
                Processes URL directly (sequential mode)
                """
                _, depth, url = await queue.get()
                # A URL may be enqueued more than once; the visited check
                # here is what actually dedupes the crawl.
                if url not in visited:
                    visited.add(url)
                    self.stats.current_depth = depth

                    if parallel_processing:
                        task = asyncio.create_task(
                            self.process_url(url, depth, crawler, queue, visited, depths)
                        )
                        pending_tasks.add(task)
                    else:
                        result = await self.process_url(
                            url, depth, crawler, queue, visited, depths
                        )
                        if result:
                            yield result

            # Process completed tasks
            """
            This section manages completed tasks:
            Waits for any task to complete using asyncio.wait
            Uses FIRST_COMPLETED to handle results as soon as they're ready
            Yields successful results to the caller
            Updates pending_tasks to remove completed ones
            """
            if pending_tasks:
                done, pending_tasks = await asyncio.wait(
                    pending_tasks,
                    return_when=asyncio.FIRST_COMPLETED
                )
                for task in done:
                    # Re-raises any exception the task produced.
                    result = await task
                    if result:
                        yield result

    except Exception as e:
        self.logger.error(f"Error in crawl process: {e}")
        raise

    finally:
        # Clean up any remaining tasks
        # NOTE(review): tasks are cancelled but not awaited here, so their
        # cancellation may still be in flight when this generator exits —
        # confirm that is acceptable for callers.
        for task in pending_tasks:
            task.cancel()
        self.stats.end_time = datetime.now()
|
|
||||||
|
|
||||||
async def shutdown(self):
    """Clean up resources and stop crawling.

    Sets the cancel event so the ascrape loop stops scheduling new work,
    then drops cached per-domain state.
    """
    self._cancel_event.set()
    # Clear caches and close connections
    self.robot_parsers.clear()   # cached robots.txt parsers keyed by domain
    self.domain_queues.clear()   # per-domain URL queues
|
|
||||||
@@ -1,205 +0,0 @@
|
|||||||
# from .url_filter import URLFilter, FilterChain
|
|
||||||
# from .content_type_filter import ContentTypeFilter
|
|
||||||
# from .url_pattern_filter import URLPatternFilter
|
|
||||||
|
|
||||||
from abc import ABC, abstractmethod
|
|
||||||
from typing import List, Pattern, Set, Union
|
|
||||||
import re
|
|
||||||
from urllib.parse import urlparse
|
|
||||||
import mimetypes
|
|
||||||
import logging
|
|
||||||
from dataclasses import dataclass
|
|
||||||
import fnmatch
|
|
||||||
|
|
||||||
@dataclass
class FilterStats:
    """Statistics for filter applications"""
    # Total number of URLs this filter has examined.
    total_urls: int = 0
    # Number of URLs the filter rejected.
    rejected_urls: int = 0
    # Number of URLs the filter let through.
    passed_urls: int = 0
|
|
||||||
|
|
||||||
class URLFilter(ABC):
    """Abstract base for single-URL accept/reject predicates.

    Subclasses implement :meth:`apply` and should report each decision
    through :meth:`_update_stats` so ``self.stats`` stays accurate.
    """

    def __init__(self, name: str = None):
        # Default to the subclass name so log lines stay identifiable.
        self.name = name or self.__class__.__name__
        self.stats = FilterStats()
        self.logger = logging.getLogger(f"urlfilter.{self.name}")

    @abstractmethod
    def apply(self, url: str) -> bool:
        """Return True when *url* passes this filter."""
        pass

    def _update_stats(self, passed: bool):
        """Record one accept/reject decision in self.stats."""
        self.stats.total_urls += 1
        bucket = "passed_urls" if passed else "rejected_urls"
        setattr(self.stats, bucket, getattr(self.stats, bucket) + 1)
|
|
||||||
|
|
||||||
class FilterChain:
    """Runs a URL through an ordered series of URLFilter instances."""

    def __init__(self, filters: List[URLFilter] = None):
        self.filters = filters or []
        self.stats = FilterStats()
        self.logger = logging.getLogger("urlfilter.chain")

    def add_filter(self, filter_: URLFilter) -> 'FilterChain':
        """Append *filter_* to the chain and return self for chaining."""
        self.filters.append(filter_)
        return self

    def apply(self, url: str) -> bool:
        """Return True only if every filter in the chain accepts *url*.

        The first rejection short-circuits the rest of the chain.
        """
        self.stats.total_urls += 1

        for filter_ in self.filters:
            if filter_.apply(url):
                continue
            self.stats.rejected_urls += 1
            self.logger.debug(f"URL {url} rejected by {filter_.name}")
            return False

        self.stats.passed_urls += 1
        return True
|
|
||||||
|
|
||||||
class URLPatternFilter(URLFilter):
    """Filter URLs based on glob patterns or regex.

    pattern_filter = URLPatternFilter([
        "*.example.com/*",       # Glob pattern
        "*/article/*",           # Path pattern
        re.compile(r"blog-\d+")  # Regex pattern
    ])

    - Supports glob patterns and regex
    - Multiple patterns per filter
    - Pattern pre-compilation for performance
    """

    def __init__(self, patterns: Union[str, Pattern, List[Union[str, Pattern]]],
                 use_glob: bool = True):
        super().__init__()
        self.patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns
        self.use_glob = use_glob

        # Pre-compile everything once; strings become regexes (via glob
        # translation when use_glob), pre-compiled patterns pass through.
        compiled = []
        for raw in self.patterns:
            if isinstance(raw, str):
                compiled.append(self._glob_to_regex(raw) if use_glob else re.compile(raw))
            else:
                compiled.append(raw)
        self._compiled_patterns = compiled

    def _glob_to_regex(self, pattern: str) -> Pattern:
        """Convert a glob pattern into an equivalent compiled regex."""
        return re.compile(fnmatch.translate(pattern))

    def apply(self, url: str) -> bool:
        """Return True when any configured pattern matches *url*."""
        matched = False
        for candidate in self._compiled_patterns:
            if candidate.search(url):
                matched = True
                break
        self._update_stats(matched)
        return matched
|
|
||||||
|
|
||||||
class ContentTypeFilter(URLFilter):
    """Filter URLs based on expected content type.

    content_filter = ContentTypeFilter([
        "text/html",
        "application/pdf"
    ], check_extension=True)

    - Filter by MIME types
    - Extension checking
    - Support for multiple content types
    """

    def __init__(self, allowed_types: Union[str, List[str]],
                 check_extension: bool = True):
        super().__init__()
        self.allowed_types = [allowed_types] if isinstance(allowed_types, str) else allowed_types
        self.check_extension = check_extension
        self._normalize_types()

    def _normalize_types(self):
        """Lower-case every configured content type for comparison."""
        self.allowed_types = [t.lower() for t in self.allowed_types]

    def _check_extension(self, url: str) -> bool:
        """Guess the content type from the URL path's extension."""
        path = urlparse(url).path
        ext = path.split('.')[-1].lower() if '.' in path else ''
        if not ext:
            # No extension: could be dynamic content, let it through.
            return True

        guessed = (mimetypes.guess_type(url)[0] or '').lower()
        # Substring match so e.g. "text/html" matches "text/html; charset=...".
        return any(allowed in guessed for allowed in self.allowed_types)

    def apply(self, url: str) -> bool:
        """Return True when the URL's implied content type is allowed."""
        verdict = self._check_extension(url) if self.check_extension else True
        self._update_stats(verdict)
        return verdict
|
|
||||||
|
|
||||||
class DomainFilter(URLFilter):
    """Filter URLs based on allowed/blocked domains.

    domain_filter = DomainFilter(
        allowed_domains=["example.com", "blog.example.com"],
        blocked_domains=["ads.example.com"]
    )

    - Allow/block specific domains
    - Exact netloc matching via set lookups
    """

    def __init__(self, allowed_domains: Union[str, List[str]] = None,
                 blocked_domains: Union[str, List[str]] = None):
        super().__init__()
        # allowed_domains is None when no allow-list was given (everything
        # not blocked passes); blocked_domains defaults to an empty set.
        self.allowed_domains = set(self._normalize_domains(allowed_domains)) if allowed_domains else None
        self.blocked_domains = set(self._normalize_domains(blocked_domains)) if blocked_domains else set()

    def _normalize_domains(self, domains: Union[str, List[str]]) -> List[str]:
        """Lower-case and strip each configured domain."""
        values = [domains] if isinstance(domains, str) else domains
        return [entry.lower().strip() for entry in values]

    def _extract_domain(self, url: str) -> str:
        """Return the lower-cased netloc of *url*."""
        return urlparse(url).netloc.lower()

    def apply(self, url: str) -> bool:
        """Return True unless the domain is blocked or outside the allow-list."""
        domain = self._extract_domain(url)

        blocked = domain in self.blocked_domains
        outside_allowlist = (
            self.allowed_domains is not None and domain not in self.allowed_domains
        )
        verdict = not (blocked or outside_allowlist)
        self._update_stats(verdict)
        return verdict
|
|
||||||
|
|
||||||
# Example usage:
|
|
||||||
def create_common_filter_chain() -> FilterChain:
    """Create a commonly used filter chain.

    Combines an HTML-ish URL pattern filter, a content-type filter and a
    domain block-list into one chain.
    """
    return FilterChain([
        URLPatternFilter([
            "*.html", "*.htm",  # HTML files
            "*/article/*", "*/blog/*"  # Common content paths
        ]),
        ContentTypeFilter([
            "text/html",
            "application/xhtml+xml"
        ]),
        DomainFilter(
            # NOTE(review): DomainFilter compares netlocs by exact string
            # equality, so these glob-style entries ("ads.*") will never
            # match a real domain — confirm whether pattern support was
            # intended here.
            blocked_domains=["ads.*", "analytics.*"]
        )
    ])
|
|
||||||
@@ -1,8 +0,0 @@
|
|||||||
from pydantic import BaseModel
|
|
||||||
from typing import List, Dict
|
|
||||||
from ..models import CrawlResult
|
|
||||||
|
|
||||||
class ScraperResult(BaseModel):
    """Aggregate result of a completed multi-page scrape."""
    # The starting URL of the scrape.
    url: str
    # Every URL that was crawled during the run.
    crawled_urls: List[str]
    # Per-URL crawl results, keyed by URL.
    extracted_data: Dict[str,CrawlResult]
|
|
||||||
@@ -1,268 +0,0 @@
|
|||||||
# from .url_scorer import URLScorer
|
|
||||||
# from .keyword_relevance_scorer import KeywordRelevanceScorer
|
|
||||||
|
|
||||||
from abc import ABC, abstractmethod
|
|
||||||
from typing import List, Dict, Optional, Union
|
|
||||||
from dataclasses import dataclass
|
|
||||||
from urllib.parse import urlparse, unquote
|
|
||||||
import re
|
|
||||||
from collections import defaultdict
|
|
||||||
import math
|
|
||||||
import logging
|
|
||||||
|
|
||||||
@dataclass
class ScoringStats:
    """Running aggregates over the scores assigned to URLs."""
    # Count of scores folded in so far.
    urls_scored: int = 0
    # Sum of all scores (numerator of the average).
    total_score: float = 0.0
    # Smallest score seen; +inf until the first update.
    min_score: float = float('inf')
    # Largest score seen; -inf until the first update.
    max_score: float = float('-inf')

    def update(self, score: float):
        """Fold one score into the running aggregates."""
        self.urls_scored += 1
        self.total_score += score
        if score < self.min_score:
            self.min_score = score
        if score > self.max_score:
            self.max_score = score

    @property
    def average_score(self) -> float:
        """Mean of all scores seen so far (0.0 before any update)."""
        if not self.urls_scored:
            return 0.0
        return self.total_score / self.urls_scored
|
|
||||||
|
|
||||||
class URLScorer(ABC):
    """Abstract base for URL scoring strategies.

    Subclasses implement :meth:`_calculate_score`; :meth:`score` applies
    the configured weight and records the result in ``self.stats``.
    """

    def __init__(self, weight: float = 1.0, name: str = None):
        self.weight = weight
        # Default to the subclass name so loggers stay identifiable.
        self.name = name or self.__class__.__name__
        self.stats = ScoringStats()
        self.logger = logging.getLogger(f"urlscorer.{self.name}")

    @abstractmethod
    def _calculate_score(self, url: str) -> float:
        """Return the raw (unweighted) score for *url*."""
        pass

    def score(self, url: str) -> float:
        """Return the weighted score for *url* and record it in stats."""
        weighted = self._calculate_score(url) * self.weight
        self.stats.update(weighted)
        return weighted
|
|
||||||
|
|
||||||
class CompositeScorer(URLScorer):
    """Aggregates several scorers, optionally averaging their outputs."""

    def __init__(self, scorers: List[URLScorer], normalize: bool = True):
        super().__init__(name="CompositeScorer")
        self.scorers = scorers
        # When True, the combined score is the mean rather than the sum.
        self.normalize = normalize

    def _calculate_score(self, url: str) -> float:
        """Sum (or average) the weighted scores of all child scorers."""
        per_scorer = [child.score(url) for child in self.scorers]
        combined = sum(per_scorer)
        if self.normalize and per_scorer:
            combined = combined / len(per_scorer)
        return combined
|
|
||||||
|
|
||||||
class KeywordRelevanceScorer(URLScorer):
    """Score URLs based on keyword relevance.

    keyword_scorer = KeywordRelevanceScorer(
        keywords=["python", "programming"],
        weight=1.0,
        case_sensitive=False
    )

    - Score based on keyword matches
    - Case sensitivity options
    - Weighted scoring
    """

    def __init__(self, keywords: List[str], weight: float = 1.0,
                 case_sensitive: bool = False):
        super().__init__(weight=weight)
        self.keywords = keywords
        self.case_sensitive = case_sensitive
        self._compile_keywords()

    def _compile_keywords(self):
        """Pre-compile one escaped regex per keyword."""
        flags = 0 if self.case_sensitive else re.IGNORECASE
        self.patterns = [re.compile(re.escape(word), flags) for word in self.keywords]

    def _calculate_score(self, url: str) -> float:
        """Fraction of keywords present in the percent-decoded URL (0..1)."""
        if not self.patterns:
            return 0.0
        haystack = unquote(url)
        hits = sum(bool(pattern.search(haystack)) for pattern in self.patterns)
        return hits / len(self.patterns)
|
|
||||||
|
|
||||||
class PathDepthScorer(URLScorer):
    """Score URLs based on their path depth.

    path_scorer = PathDepthScorer(
        optimal_depth=3,  # Preferred URL depth
        weight=0.7
    )

    - Score based on URL path depth
    - Configurable optimal depth
    - Diminishing returns for deeper paths
    """

    def __init__(self, optimal_depth: int = 3, weight: float = 1.0):
        super().__init__(weight=weight)
        self.optimal_depth = optimal_depth

    def _calculate_score(self, url: str) -> float:
        """Score in (0, 1]: 1.0 at optimal depth, decaying with distance."""
        segments = [part for part in urlparse(url).path.split('/') if part]
        gap = abs(len(segments) - self.optimal_depth)
        return 1.0 / (1.0 + gap)
|
|
||||||
|
|
||||||
class ContentTypeScorer(URLScorer):
    """Score URLs based on content type preferences.

    content_scorer = ContentTypeScorer({
        r'\.html$': 1.0,
        r'\.pdf$': 0.8,
        r'\.xml$': 0.6
    })

    - Score based on file types
    - Configurable type weights
    - Pattern matching support
    """

    def __init__(self, type_weights: Dict[str, float], weight: float = 1.0):
        super().__init__(weight=weight)
        self.type_weights = type_weights
        self._compile_patterns()

    def _compile_patterns(self):
        """Pre-compile each pattern, preserving insertion order."""
        self.patterns = {re.compile(raw): w for raw, w in self.type_weights.items()}

    def _calculate_score(self, url: str) -> float:
        """Weight of the first matching pattern, or 0.0 when none match."""
        return next(
            (w for pattern, w in self.patterns.items() if pattern.search(url)),
            0.0,
        )
|
|
||||||
|
|
||||||
class FreshnessScorer(URLScorer):
    """Score URLs based on freshness indicators.

    freshness_scorer = FreshnessScorer(weight=0.9)

    - Recognises yyyy/mm/dd, yyyy-mm-dd and bare-year URL segments
    - More recent years score higher (0.1 penalty per year of age)
    - URLs without any date indicator receive a neutral 0.5
    """

    def __init__(self, weight: float = 1.0):
        super().__init__(weight=weight)
        # Ordered most-specific first; only the year (group 1) is scored.
        self.date_patterns = [
            r'/(\d{4})/(\d{2})/(\d{2})/',  # yyyy/mm/dd
            r'(\d{4})[-_](\d{2})[-_](\d{2})',  # yyyy-mm-dd
            r'/(\d{4})/',  # year only
        ]
        self._compile_patterns()

    def _compile_patterns(self):
        """Prepare date patterns"""
        self.compiled_patterns = [re.compile(p) for p in self.date_patterns]

    def _calculate_score(self, url: str) -> float:
        """Score in [0, 1] from the first date indicator found in *url*.

        Fix over the original: the reference year was hard-coded to 2024,
        so scores drifted stale as time passed, went negative for old
        dates, and exceeded 1.0 for future dates. The current year is now
        used and the result is clamped to [0, 1].
        """
        from datetime import date  # local import: keeps module deps unchanged
        current_year = date.today().year
        for pattern in self.compiled_patterns:
            if match := pattern.search(url):
                year = int(match.group(1))
                freshness = 1.0 - (current_year - year) * 0.1
                return min(1.0, max(0.0, freshness))
        return 0.5  # Default score for URLs without dates
|
|
||||||
|
|
||||||
class DomainAuthorityScorer(URLScorer):
    """Score URLs based on domain authority.

    authority_scorer = DomainAuthorityScorer({
        "python.org": 1.0,
        "github.com": 0.9,
        "medium.com": 0.7
    })

    - Score based on domain importance
    - Configurable domain weights
    - Default weight for unknown domains
    """

    def __init__(self, domain_weights: Dict[str, float],
                 default_weight: float = 0.5, weight: float = 1.0):
        super().__init__(weight=weight)
        self.domain_weights = domain_weights
        self.default_weight = default_weight

    def _calculate_score(self, url: str) -> float:
        """Look up the URL's lower-cased host in the weight table."""
        host = urlparse(url).netloc.lower()
        return self.domain_weights.get(host, self.default_weight)
|
|
||||||
|
|
||||||
def create_balanced_scorer() -> CompositeScorer:
    """Create a balanced composite scorer.

    Combines keyword relevance, path depth, content type and freshness
    with hand-tuned weights; CompositeScorer averages the child scores
    by default (normalize=True).
    """
    return CompositeScorer([
        KeywordRelevanceScorer(
            keywords=["article", "blog", "news", "research"],
            weight=1.0
        ),
        PathDepthScorer(
            optimal_depth=3,
            weight=0.7
        ),
        ContentTypeScorer(
            type_weights={
                r'\.html?$': 1.0,
                r'\.pdf$': 0.8,
                r'\.xml$': 0.6
            },
            weight=0.8
        ),
        FreshnessScorer(
            weight=0.9
        )
    ])
|
|
||||||
|
|
||||||
# Example Usage:
|
|
||||||
"""
|
|
||||||
# Create a composite scorer
|
|
||||||
scorer = CompositeScorer([
|
|
||||||
KeywordRelevanceScorer(["python", "programming"], weight=1.0),
|
|
||||||
PathDepthScorer(optimal_depth=2, weight=0.7),
|
|
||||||
FreshnessScorer(weight=0.8),
|
|
||||||
DomainAuthorityScorer(
|
|
||||||
domain_weights={
|
|
||||||
"python.org": 1.0,
|
|
||||||
"github.com": 0.9,
|
|
||||||
"medium.com": 0.7
|
|
||||||
},
|
|
||||||
weight=0.9
|
|
||||||
)
|
|
||||||
])
|
|
||||||
|
|
||||||
# Score a URL
|
|
||||||
score = scorer.score("https://python.org/article/2024/01/new-features")
|
|
||||||
|
|
||||||
# Access statistics
|
|
||||||
print(f"Average score: {scorer.stats.average_score}")
|
|
||||||
print(f"URLs scored: {scorer.stats.urls_scored}")
|
|
||||||
"""
|
|
||||||
@@ -1,26 +0,0 @@
|
|||||||
from abc import ABC, abstractmethod
|
|
||||||
from .models import ScraperResult, CrawlResult
|
|
||||||
from ..models import CrawlResult
|
|
||||||
from ..async_webcrawler import AsyncWebCrawler
|
|
||||||
from typing import Union, AsyncGenerator
|
|
||||||
|
|
||||||
class ScraperStrategy(ABC):
    """Abstract interface for multi-page scraping strategies."""

    @abstractmethod
    async def ascrape(self, url: str, crawler: AsyncWebCrawler, parallel_processing: bool = True, stream: bool = False) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]:
        """Scrape the given URL using the specified crawler.

        Args:
            url (str): The starting URL for the scrape.
            crawler (AsyncWebCrawler): The web crawler instance.
            parallel_processing (bool): Whether to use parallel processing. Defaults to True.
            stream (bool): If True, yields individual crawl results as they are ready;
                if False, accumulates results and returns a final ScraperResult.

        Yields:
            CrawlResult: Individual crawl results if stream is True.

        Returns:
            ScraperResult: A summary of the scrape results containing the final extracted data
                and the list of crawled URLs if stream is False.
        """
        pass
|
|
||||||
@@ -692,8 +692,8 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
|
|||||||
for img in imgs:
|
for img in imgs:
|
||||||
src = img.get('src', '')
|
src = img.get('src', '')
|
||||||
if base64_pattern.match(src):
|
if base64_pattern.match(src):
|
||||||
# Replace base64 data with empty string
|
|
||||||
img['src'] = base64_pattern.sub('', src)
|
img['src'] = base64_pattern.sub('', src)
|
||||||
|
|
||||||
cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ')
|
cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ')
|
||||||
cleaned_html = sanitize_html(cleaned_html)
|
cleaned_html = sanitize_html(cleaned_html)
|
||||||
|
|
||||||
|
|||||||
@@ -47,8 +47,7 @@
|
|||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# !pip install \"crawl4ai @ git+https://github.com/unclecode/crawl4ai.git\"\n",
|
"!pip install crawl4ai\n",
|
||||||
"!pip install \"crawl4ai @ git+https://github.com/unclecode/crawl4ai.git@staging\"\n",
|
|
||||||
"!pip install nest-asyncio\n",
|
"!pip install nest-asyncio\n",
|
||||||
"!playwright install"
|
"!playwright install"
|
||||||
]
|
]
|
||||||
@@ -714,7 +713,7 @@
|
|||||||
"provenance": []
|
"provenance": []
|
||||||
},
|
},
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
"display_name": "Python 3",
|
"display_name": "venv",
|
||||||
"language": "python",
|
"language": "python",
|
||||||
"name": "python3"
|
"name": "python3"
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -379,6 +379,18 @@ async def crawl_custom_browser_type():
|
|||||||
print(result.markdown[:500])
|
print(result.markdown[:500])
|
||||||
print("Time taken: ", time.time() - start)
|
print("Time taken: ", time.time() - start)
|
||||||
|
|
||||||
|
async def crawl_with_user_simultion():
|
||||||
|
async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
|
||||||
|
url = "YOUR-URL-HERE"
|
||||||
|
result = await crawler.arun(
|
||||||
|
url=url,
|
||||||
|
bypass_cache=True,
|
||||||
|
simulate_user = True,# Causes a series of random mouse movements and clicks to simulate user interaction
|
||||||
|
override_navigator = True # Overrides the navigator object to make it look like a real user
|
||||||
|
)
|
||||||
|
|
||||||
|
print(result.markdown)
|
||||||
|
|
||||||
async def speed_comparison():
|
async def speed_comparison():
|
||||||
# print("\n--- Speed Comparison ---")
|
# print("\n--- Speed Comparison ---")
|
||||||
# print("Firecrawl (simulated):")
|
# print("Firecrawl (simulated):")
|
||||||
@@ -455,7 +467,7 @@ async def main():
|
|||||||
# LLM extraction examples
|
# LLM extraction examples
|
||||||
await extract_structured_data_using_llm()
|
await extract_structured_data_using_llm()
|
||||||
await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
|
await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
|
||||||
await extract_structured_data_using_llm("openai/gpt-4", os.getenv("OPENAI_API_KEY"))
|
await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
|
||||||
await extract_structured_data_using_llm("ollama/llama3.2")
|
await extract_structured_data_using_llm("ollama/llama3.2")
|
||||||
|
|
||||||
# You always can pass custom headers to the extraction strategy
|
# You always can pass custom headers to the extraction strategy
|
||||||
|
|||||||
@@ -1,166 +0,0 @@
|
|||||||
# AsyncWebScraper: Smart Web Crawling Made Easy
|
|
||||||
|
|
||||||
AsyncWebScraper is a powerful and flexible web scraping tool that makes it easy to collect data from websites efficiently. Whether you need to scrape a few pages or an entire website, AsyncWebScraper handles the complexity of web crawling while giving you fine-grained control over the process.
|
|
||||||
|
|
||||||
## How It Works
|
|
||||||
|
|
||||||
```mermaid
|
|
||||||
flowchart TB
|
|
||||||
Start([Start]) --> Init[Initialize AsyncWebScraper\nwith Crawler and Strategy]
|
|
||||||
Init --> InputURL[Receive URL to scrape]
|
|
||||||
InputURL --> Decision{Stream or\nCollect?}
|
|
||||||
|
|
||||||
%% Streaming Path
|
|
||||||
Decision -->|Stream| StreamInit[Initialize Streaming Mode]
|
|
||||||
StreamInit --> StreamStrategy[Call Strategy.ascrape]
|
|
||||||
StreamStrategy --> AsyncGen[Create Async Generator]
|
|
||||||
AsyncGen --> ProcessURL[Process Next URL]
|
|
||||||
ProcessURL --> FetchContent[Fetch Page Content]
|
|
||||||
FetchContent --> Extract[Extract Data]
|
|
||||||
Extract --> YieldResult[Yield CrawlResult]
|
|
||||||
YieldResult --> CheckMore{More URLs?}
|
|
||||||
CheckMore -->|Yes| ProcessURL
|
|
||||||
CheckMore -->|No| StreamEnd([End Stream])
|
|
||||||
|
|
||||||
%% Collecting Path
|
|
||||||
Decision -->|Collect| CollectInit[Initialize Collection Mode]
|
|
||||||
CollectInit --> CollectStrategy[Call Strategy.ascrape]
|
|
||||||
CollectStrategy --> CollectGen[Create Async Generator]
|
|
||||||
CollectGen --> ProcessURLColl[Process Next URL]
|
|
||||||
ProcessURLColl --> FetchContentColl[Fetch Page Content]
|
|
||||||
FetchContentColl --> ExtractColl[Extract Data]
|
|
||||||
ExtractColl --> StoreColl[Store in Dictionary]
|
|
||||||
StoreColl --> CheckMoreColl{More URLs?}
|
|
||||||
CheckMoreColl -->|Yes| ProcessURLColl
|
|
||||||
CheckMoreColl -->|No| CreateResult[Create ScraperResult]
|
|
||||||
CreateResult --> ReturnResult([Return Result])
|
|
||||||
|
|
||||||
%% Parallel Processing
|
|
||||||
subgraph Parallel
|
|
||||||
ProcessURL
|
|
||||||
FetchContent
|
|
||||||
Extract
|
|
||||||
ProcessURLColl
|
|
||||||
FetchContentColl
|
|
||||||
ExtractColl
|
|
||||||
end
|
|
||||||
|
|
||||||
%% Error Handling
|
|
||||||
FetchContent --> ErrorCheck{Error?}
|
|
||||||
ErrorCheck -->|Yes| LogError[Log Error]
|
|
||||||
LogError --> UpdateStats[Update Error Stats]
|
|
||||||
UpdateStats --> CheckMore
|
|
||||||
ErrorCheck -->|No| Extract
|
|
||||||
|
|
||||||
FetchContentColl --> ErrorCheckColl{Error?}
|
|
||||||
ErrorCheckColl -->|Yes| LogErrorColl[Log Error]
|
|
||||||
LogErrorColl --> UpdateStatsColl[Update Error Stats]
|
|
||||||
UpdateStatsColl --> CheckMoreColl
|
|
||||||
ErrorCheckColl -->|No| ExtractColl
|
|
||||||
|
|
||||||
%% Style definitions
|
|
||||||
classDef process fill:#90caf9,stroke:#000,stroke-width:2px;
|
|
||||||
classDef decision fill:#fff59d,stroke:#000,stroke-width:2px;
|
|
||||||
classDef error fill:#ef9a9a,stroke:#000,stroke-width:2px;
|
|
||||||
classDef start fill:#a5d6a7,stroke:#000,stroke-width:2px;
|
|
||||||
|
|
||||||
class Start,StreamEnd,ReturnResult start;
|
|
||||||
class Decision,CheckMore,CheckMoreColl,ErrorCheck,ErrorCheckColl decision;
|
|
||||||
class LogError,LogErrorColl,UpdateStats,UpdateStatsColl error;
|
|
||||||
class ProcessURL,FetchContent,Extract,ProcessURLColl,FetchContentColl,ExtractColl process;
|
|
||||||
```
|
|
||||||
|
|
||||||
AsyncWebScraper uses an intelligent crawling system that can navigate through websites following your specified strategy. It supports two main modes of operation:
|
|
||||||
|
|
||||||
### 1. Streaming Mode
|
|
||||||
```python
|
|
||||||
async for result in scraper.ascrape(url, stream=True):
|
|
||||||
print(f"Found data on {result.url}")
|
|
||||||
process_data(result.data)
|
|
||||||
```
|
|
||||||
- Perfect for processing large websites
|
|
||||||
- Memory efficient - handles one page at a time
|
|
||||||
- Ideal for real-time data processing
|
|
||||||
- Great for monitoring or continuous scraping tasks
|
|
||||||
|
|
||||||
### 2. Collection Mode
|
|
||||||
```python
|
|
||||||
result = await scraper.ascrape(url)
|
|
||||||
print(f"Scraped {len(result.crawled_urls)} pages")
|
|
||||||
process_all_data(result.extracted_data)
|
|
||||||
```
|
|
||||||
- Collects all data before returning
|
|
||||||
- Best for when you need the complete dataset
|
|
||||||
- Easier to work with for batch processing
|
|
||||||
- Includes comprehensive statistics
|
|
||||||
|
|
||||||
## Key Features
|
|
||||||
|
|
||||||
- **Smart Crawling**: Automatically follows relevant links while avoiding duplicates
|
|
||||||
- **Parallel Processing**: Scrapes multiple pages simultaneously for better performance
|
|
||||||
- **Memory Efficient**: Choose between streaming and collecting based on your needs
|
|
||||||
- **Error Resilient**: Continues working even if some pages fail to load
|
|
||||||
- **Progress Tracking**: Monitor the scraping progress in real-time
|
|
||||||
- **Customizable**: Configure crawling strategy, filters, and scoring to match your needs
|
|
||||||
|
|
||||||
## Quick Start
|
|
||||||
|
|
||||||
```python
|
|
||||||
from crawl4ai.scraper import AsyncWebScraper, BFSStrategy
|
|
||||||
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
|
||||||
|
|
||||||
# Initialize the scraper
|
|
||||||
crawler = AsyncWebCrawler()
|
|
||||||
strategy = BFSStrategy(
|
|
||||||
max_depth=2, # How deep to crawl
|
|
||||||
url_pattern="*.example.com/*" # What URLs to follow
|
|
||||||
)
|
|
||||||
scraper = AsyncWebScraper(crawler, strategy)
|
|
||||||
|
|
||||||
# Start scraping
|
|
||||||
async def main():
|
|
||||||
# Collect all results
|
|
||||||
result = await scraper.ascrape("https://example.com")
|
|
||||||
print(f"Found {len(result.extracted_data)} pages")
|
|
||||||
|
|
||||||
# Or stream results
|
|
||||||
async for page in scraper.ascrape("https://example.com", stream=True):
|
|
||||||
print(f"Processing {page.url}")
|
|
||||||
|
|
||||||
```
|
|
||||||
|
|
||||||
## Best Practices
|
|
||||||
|
|
||||||
1. **Choose the Right Mode**
|
|
||||||
- Use streaming for large websites or real-time processing
|
|
||||||
- Use collecting for smaller sites or when you need the complete dataset
|
|
||||||
|
|
||||||
2. **Configure Depth**
|
|
||||||
- Start with a small depth (2-3) and increase if needed
|
|
||||||
- Higher depths mean exponentially more pages to crawl
|
|
||||||
|
|
||||||
3. **Set Appropriate Filters**
|
|
||||||
- Use URL patterns to stay within relevant sections
|
|
||||||
- Set content type filters to only process useful pages
|
|
||||||
|
|
||||||
4. **Handle Resources Responsibly**
|
|
||||||
- Enable parallel processing for faster results
|
|
||||||
- Consider the target website's capacity
|
|
||||||
- Implement appropriate delays between requests
|
|
||||||
|
|
||||||
## Common Use Cases
|
|
||||||
|
|
||||||
- **Content Aggregation**: Collect articles, blog posts, or news from multiple pages
|
|
||||||
- **Data Extraction**: Gather product information, prices, or specifications
|
|
||||||
- **Site Mapping**: Create a complete map of a website's structure
|
|
||||||
- **Content Monitoring**: Track changes or updates across multiple pages
|
|
||||||
- **Data Mining**: Extract and analyze patterns across web pages
|
|
||||||
|
|
||||||
## Advanced Features
|
|
||||||
|
|
||||||
- Custom scoring algorithms for prioritizing important pages
|
|
||||||
- URL filters for focusing on specific site sections
|
|
||||||
- Content type filtering for processing only relevant pages
|
|
||||||
- Progress tracking for monitoring long-running scrapes
|
|
||||||
|
|
||||||
Need more help? Check out our [examples repository](https://github.com/example/crawl4ai/examples) or join our [community Discord](https://discord.gg/example).
|
|
||||||
@@ -1,244 +0,0 @@
|
|||||||
# BFS Scraper Strategy: Smart Web Traversal
|
|
||||||
|
|
||||||
The BFS (Breadth-First Search) Scraper Strategy provides an intelligent way to traverse websites systematically. It crawls websites level by level, ensuring thorough coverage while respecting web crawling etiquette.
|
|
||||||
|
|
||||||
```mermaid
|
|
||||||
flowchart TB
|
|
||||||
Start([Start]) --> Init[Initialize BFS Strategy]
|
|
||||||
Init --> InitStats[Initialize CrawlStats]
|
|
||||||
InitStats --> InitQueue[Initialize Priority Queue]
|
|
||||||
InitQueue --> AddStart[Add Start URL to Queue]
|
|
||||||
|
|
||||||
AddStart --> CheckState{Queue Empty or\nTasks Pending?}
|
|
||||||
CheckState -->|No| Cleanup[Cleanup & Stats]
|
|
||||||
Cleanup --> End([End])
|
|
||||||
|
|
||||||
CheckState -->|Yes| CheckCancel{Cancel\nRequested?}
|
|
||||||
CheckCancel -->|Yes| Cleanup
|
|
||||||
|
|
||||||
CheckCancel -->|No| CheckConcurrent{Under Max\nConcurrent?}
|
|
||||||
|
|
||||||
CheckConcurrent -->|No| WaitComplete[Wait for Task Completion]
|
|
||||||
WaitComplete --> YieldResult[Yield Result]
|
|
||||||
YieldResult --> CheckState
|
|
||||||
|
|
||||||
CheckConcurrent -->|Yes| GetNextURL[Get Next URL from Queue]
|
|
||||||
|
|
||||||
GetNextURL --> ValidateURL{Already\nVisited?}
|
|
||||||
ValidateURL -->|Yes| CheckState
|
|
||||||
|
|
||||||
ValidateURL -->|No| ProcessURL[Process URL]
|
|
||||||
|
|
||||||
subgraph URL_Processing [URL Processing]
|
|
||||||
ProcessURL --> CheckValid{URL Valid?}
|
|
||||||
CheckValid -->|No| UpdateStats[Update Skip Stats]
|
|
||||||
|
|
||||||
CheckValid -->|Yes| CheckRobots{Allowed by\nrobots.txt?}
|
|
||||||
CheckRobots -->|No| UpdateRobotStats[Update Robot Stats]
|
|
||||||
|
|
||||||
CheckRobots -->|Yes| ApplyDelay[Apply Politeness Delay]
|
|
||||||
ApplyDelay --> FetchContent[Fetch Content with Rate Limit]
|
|
||||||
|
|
||||||
FetchContent --> CheckError{Error?}
|
|
||||||
CheckError -->|Yes| Retry{Retry\nNeeded?}
|
|
||||||
Retry -->|Yes| FetchContent
|
|
||||||
Retry -->|No| UpdateFailStats[Update Fail Stats]
|
|
||||||
|
|
||||||
CheckError -->|No| ExtractLinks[Extract & Process Links]
|
|
||||||
ExtractLinks --> ScoreURLs[Score New URLs]
|
|
||||||
ScoreURLs --> AddToQueue[Add to Priority Queue]
|
|
||||||
end
|
|
||||||
|
|
||||||
ProcessURL --> CreateTask{Parallel\nProcessing?}
|
|
||||||
CreateTask -->|Yes| AddTask[Add to Pending Tasks]
|
|
||||||
CreateTask -->|No| DirectProcess[Process Directly]
|
|
||||||
|
|
||||||
AddTask --> CheckState
|
|
||||||
DirectProcess --> YieldResult
|
|
||||||
|
|
||||||
UpdateStats --> CheckState
|
|
||||||
UpdateRobotStats --> CheckState
|
|
||||||
UpdateFailStats --> CheckState
|
|
||||||
|
|
||||||
classDef process fill:#90caf9,stroke:#000,stroke-width:2px;
|
|
||||||
classDef decision fill:#fff59d,stroke:#000,stroke-width:2px;
|
|
||||||
classDef error fill:#ef9a9a,stroke:#000,stroke-width:2px;
|
|
||||||
classDef stats fill:#a5d6a7,stroke:#000,stroke-width:2px;
|
|
||||||
|
|
||||||
class Start,End stats;
|
|
||||||
class CheckState,CheckCancel,CheckConcurrent,ValidateURL,CheckValid,CheckRobots,CheckError,Retry,CreateTask decision;
|
|
||||||
class UpdateStats,UpdateRobotStats,UpdateFailStats,InitStats,Cleanup stats;
|
|
||||||
class ProcessURL,FetchContent,ExtractLinks,ScoreURLs process;
|
|
||||||
```
|
|
||||||
|
|
||||||
## How It Works
|
|
||||||
|
|
||||||
The BFS strategy crawls a website by:
|
|
||||||
1. Starting from a root URL
|
|
||||||
2. Processing all URLs at the current depth
|
|
||||||
3. Moving to URLs at the next depth level
|
|
||||||
4. Continuing until maximum depth is reached
|
|
||||||
|
|
||||||
This ensures systematic coverage of the website while maintaining control over the crawling process.
|
|
||||||
|
|
||||||
## Key Features
|
|
||||||
|
|
||||||
### 1. Smart URL Processing
|
|
||||||
```python
|
|
||||||
strategy = BFSScraperStrategy(
|
|
||||||
max_depth=2,
|
|
||||||
filter_chain=my_filters,
|
|
||||||
url_scorer=my_scorer,
|
|
||||||
max_concurrent=5
|
|
||||||
)
|
|
||||||
```
|
|
||||||
- Controls crawl depth
|
|
||||||
- Filters unwanted URLs
|
|
||||||
- Scores URLs for priority
|
|
||||||
- Manages concurrent requests
|
|
||||||
|
|
||||||
### 2. Polite Crawling
|
|
||||||
The strategy automatically implements web crawling best practices:
|
|
||||||
- Respects robots.txt
|
|
||||||
- Implements rate limiting
|
|
||||||
- Adds politeness delays
|
|
||||||
- Manages concurrent requests
|
|
||||||
|
|
||||||
### 3. Link Processing Control
|
|
||||||
```python
|
|
||||||
strategy = BFSScraperStrategy(
|
|
||||||
...,
|
|
||||||
process_external_links=False # Only process internal links
|
|
||||||
)
|
|
||||||
```
|
|
||||||
- Control whether to follow external links
|
|
||||||
- Default: internal links only
|
|
||||||
- Enable external links when needed
|
|
||||||
|
|
||||||
## Configuration Options
|
|
||||||
|
|
||||||
| Parameter | Description | Default |
|
|
||||||
|-----------|-------------|---------|
|
|
||||||
| max_depth | Maximum crawl depth | Required |
|
|
||||||
| filter_chain | URL filtering rules | Required |
|
|
||||||
| url_scorer | URL priority scoring | Required |
|
|
||||||
| max_concurrent | Max parallel requests | 5 |
|
|
||||||
| min_crawl_delay | Seconds between requests | 1 |
|
|
||||||
| process_external_links | Follow external links | False |
|
|
||||||
|
|
||||||
## Best Practices
|
|
||||||
|
|
||||||
1. **Set Appropriate Depth**
|
|
||||||
- Start with smaller depths (2-3)
|
|
||||||
- Increase based on needs
|
|
||||||
- Consider site structure
|
|
||||||
|
|
||||||
2. **Configure Filters**
|
|
||||||
- Use URL patterns
|
|
||||||
- Filter by content type
|
|
||||||
- Avoid unwanted sections
|
|
||||||
|
|
||||||
3. **Tune Performance**
|
|
||||||
- Adjust max_concurrent
|
|
||||||
- Set appropriate delays
|
|
||||||
- Monitor resource usage
|
|
||||||
|
|
||||||
4. **Handle External Links**
|
|
||||||
- Keep external_links=False for focused crawls
|
|
||||||
- Enable only when needed
|
|
||||||
- Consider additional filtering
|
|
||||||
|
|
||||||
## Example Usage
|
|
||||||
|
|
||||||
```python
|
|
||||||
from crawl4ai.scraper import BFSScraperStrategy
|
|
||||||
from crawl4ai.scraper.filters import FilterChain
|
|
||||||
from crawl4ai.scraper.scorers import BasicURLScorer
|
|
||||||
|
|
||||||
# Configure strategy
|
|
||||||
strategy = BFSScraperStrategy(
|
|
||||||
max_depth=3,
|
|
||||||
filter_chain=FilterChain([
|
|
||||||
URLPatternFilter("*.example.com/*"),
|
|
||||||
ContentTypeFilter(["text/html"])
|
|
||||||
]),
|
|
||||||
url_scorer=BasicURLScorer(),
|
|
||||||
max_concurrent=5,
|
|
||||||
min_crawl_delay=1,
|
|
||||||
process_external_links=False
|
|
||||||
)
|
|
||||||
|
|
||||||
# Use with AsyncWebScraper
|
|
||||||
scraper = AsyncWebScraper(crawler, strategy)
|
|
||||||
results = await scraper.ascrape("https://example.com")
|
|
||||||
```
|
|
||||||
|
|
||||||
## Common Use Cases
|
|
||||||
|
|
||||||
### 1. Site Mapping
|
|
||||||
```python
|
|
||||||
strategy = BFSScraperStrategy(
|
|
||||||
max_depth=5,
|
|
||||||
filter_chain=site_filter,
|
|
||||||
url_scorer=depth_scorer,
|
|
||||||
process_external_links=False
|
|
||||||
)
|
|
||||||
```
|
|
||||||
Perfect for creating complete site maps or understanding site structure.
|
|
||||||
|
|
||||||
### 2. Content Aggregation
|
|
||||||
```python
|
|
||||||
strategy = BFSScraperStrategy(
|
|
||||||
max_depth=2,
|
|
||||||
filter_chain=content_filter,
|
|
||||||
url_scorer=relevance_scorer,
|
|
||||||
max_concurrent=3
|
|
||||||
)
|
|
||||||
```
|
|
||||||
Ideal for collecting specific types of content (articles, products, etc.).
|
|
||||||
|
|
||||||
### 3. Link Analysis
|
|
||||||
```python
|
|
||||||
strategy = BFSScraperStrategy(
|
|
||||||
max_depth=1,
|
|
||||||
filter_chain=link_filter,
|
|
||||||
url_scorer=link_scorer,
|
|
||||||
process_external_links=True
|
|
||||||
)
|
|
||||||
```
|
|
||||||
Useful for analyzing both internal and external link structures.
|
|
||||||
|
|
||||||
## Advanced Features
|
|
||||||
|
|
||||||
### Progress Monitoring
|
|
||||||
```python
|
|
||||||
async for result in scraper.ascrape(url):
|
|
||||||
print(f"Current depth: {strategy.stats.current_depth}")
|
|
||||||
print(f"Processed URLs: {strategy.stats.urls_processed}")
|
|
||||||
```
|
|
||||||
|
|
||||||
### Custom URL Scoring
|
|
||||||
```python
|
|
||||||
class CustomScorer(URLScorer):
|
|
||||||
def score(self, url: str) -> float:
|
|
||||||
# Lower scores = higher priority
|
|
||||||
return score_based_on_criteria(url)
|
|
||||||
```
|
|
||||||
|
|
||||||
## Troubleshooting
|
|
||||||
|
|
||||||
1. **Slow Crawling**
|
|
||||||
- Increase max_concurrent
|
|
||||||
- Adjust min_crawl_delay
|
|
||||||
- Check network conditions
|
|
||||||
|
|
||||||
2. **Missing Content**
|
|
||||||
- Verify max_depth
|
|
||||||
- Check filter settings
|
|
||||||
- Review URL patterns
|
|
||||||
|
|
||||||
3. **High Resource Usage**
|
|
||||||
- Reduce max_concurrent
|
|
||||||
- Increase crawl delay
|
|
||||||
- Add more specific filters
|
|
||||||
|
|
||||||
@@ -1,342 +0,0 @@
|
|||||||
# URL Filters and Scorers
|
|
||||||
|
|
||||||
The crawl4ai library provides powerful URL filtering and scoring capabilities that help you control and prioritize your web crawling. This guide explains how to use these features effectively.
|
|
||||||
|
|
||||||
```mermaid
|
|
||||||
flowchart TB
|
|
||||||
Start([URL Input]) --> Chain[Filter Chain]
|
|
||||||
|
|
||||||
subgraph Chain Process
|
|
||||||
Chain --> Pattern{URL Pattern\nFilter}
|
|
||||||
Pattern -->|Match| Content{Content Type\nFilter}
|
|
||||||
Pattern -->|No Match| Reject1[Reject URL]
|
|
||||||
|
|
||||||
Content -->|Allowed| Domain{Domain\nFilter}
|
|
||||||
Content -->|Not Allowed| Reject2[Reject URL]
|
|
||||||
|
|
||||||
Domain -->|Allowed| Accept[Accept URL]
|
|
||||||
Domain -->|Blocked| Reject3[Reject URL]
|
|
||||||
end
|
|
||||||
|
|
||||||
subgraph Statistics
|
|
||||||
Pattern --> UpdatePattern[Update Pattern Stats]
|
|
||||||
Content --> UpdateContent[Update Content Stats]
|
|
||||||
Domain --> UpdateDomain[Update Domain Stats]
|
|
||||||
Accept --> UpdateChain[Update Chain Stats]
|
|
||||||
Reject1 --> UpdateChain
|
|
||||||
Reject2 --> UpdateChain
|
|
||||||
Reject3 --> UpdateChain
|
|
||||||
end
|
|
||||||
|
|
||||||
Accept --> End([End])
|
|
||||||
Reject1 --> End
|
|
||||||
Reject2 --> End
|
|
||||||
Reject3 --> End
|
|
||||||
|
|
||||||
classDef process fill:#90caf9,stroke:#000,stroke-width:2px;
|
|
||||||
classDef decision fill:#fff59d,stroke:#000,stroke-width:2px;
|
|
||||||
classDef reject fill:#ef9a9a,stroke:#000,stroke-width:2px;
|
|
||||||
classDef accept fill:#a5d6a7,stroke:#000,stroke-width:2px;
|
|
||||||
|
|
||||||
class Start,End accept;
|
|
||||||
class Pattern,Content,Domain decision;
|
|
||||||
class Reject1,Reject2,Reject3 reject;
|
|
||||||
class Chain,UpdatePattern,UpdateContent,UpdateDomain,UpdateChain process;
|
|
||||||
```
|
|
||||||
|
|
||||||
## URL Filters
|
|
||||||
|
|
||||||
URL filters help you control which URLs are crawled. Multiple filters can be chained together to create sophisticated filtering rules.
|
|
||||||
|
|
||||||
### Available Filters
|
|
||||||
|
|
||||||
1. **URL Pattern Filter**
|
|
||||||
```python
|
|
||||||
pattern_filter = URLPatternFilter([
|
|
||||||
"*.example.com/*", # Glob pattern
|
|
||||||
"*/article/*", # Path pattern
|
|
||||||
re.compile(r"blog-\d+") # Regex pattern
|
|
||||||
])
|
|
||||||
```
|
|
||||||
- Supports glob patterns and regex
|
|
||||||
- Multiple patterns per filter
|
|
||||||
- Pattern pre-compilation for performance
|
|
||||||
|
|
||||||
2. **Content Type Filter**
|
|
||||||
```python
|
|
||||||
content_filter = ContentTypeFilter([
|
|
||||||
"text/html",
|
|
||||||
"application/pdf"
|
|
||||||
], check_extension=True)
|
|
||||||
```
|
|
||||||
- Filter by MIME types
|
|
||||||
- Extension checking
|
|
||||||
- Support for multiple content types
|
|
||||||
|
|
||||||
3. **Domain Filter**
|
|
||||||
```python
|
|
||||||
domain_filter = DomainFilter(
|
|
||||||
allowed_domains=["example.com", "blog.example.com"],
|
|
||||||
blocked_domains=["ads.example.com"]
|
|
||||||
)
|
|
||||||
```
|
|
||||||
- Allow/block specific domains
|
|
||||||
- Subdomain support
|
|
||||||
- Efficient domain matching
|
|
||||||
|
|
||||||
### Creating Filter Chains
|
|
||||||
|
|
||||||
```python
|
|
||||||
# Create and configure a filter chain
|
|
||||||
filter_chain = FilterChain([
|
|
||||||
URLPatternFilter(["*.example.com/*"]),
|
|
||||||
ContentTypeFilter(["text/html"]),
|
|
||||||
DomainFilter(blocked_domains=["ads.*"])
|
|
||||||
])
|
|
||||||
|
|
||||||
# Add more filters
|
|
||||||
filter_chain.add_filter(
|
|
||||||
URLPatternFilter(["*/article/*"])
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
```mermaid
|
|
||||||
flowchart TB
|
|
||||||
Start([URL Input]) --> Composite[Composite Scorer]
|
|
||||||
|
|
||||||
subgraph Scoring Process
|
|
||||||
Composite --> Keywords[Keyword Relevance]
|
|
||||||
Composite --> Path[Path Depth]
|
|
||||||
Composite --> Content[Content Type]
|
|
||||||
Composite --> Fresh[Freshness]
|
|
||||||
Composite --> Domain[Domain Authority]
|
|
||||||
|
|
||||||
Keywords --> KeywordScore[Calculate Score]
|
|
||||||
Path --> PathScore[Calculate Score]
|
|
||||||
Content --> ContentScore[Calculate Score]
|
|
||||||
Fresh --> FreshScore[Calculate Score]
|
|
||||||
Domain --> DomainScore[Calculate Score]
|
|
||||||
|
|
||||||
KeywordScore --> Weight1[Apply Weight]
|
|
||||||
PathScore --> Weight2[Apply Weight]
|
|
||||||
ContentScore --> Weight3[Apply Weight]
|
|
||||||
FreshScore --> Weight4[Apply Weight]
|
|
||||||
DomainScore --> Weight5[Apply Weight]
|
|
||||||
end
|
|
||||||
|
|
||||||
Weight1 --> Combine[Combine Scores]
|
|
||||||
Weight2 --> Combine
|
|
||||||
Weight3 --> Combine
|
|
||||||
Weight4 --> Combine
|
|
||||||
Weight5 --> Combine
|
|
||||||
|
|
||||||
Combine --> Normalize{Normalize?}
|
|
||||||
Normalize -->|Yes| NormalizeScore[Normalize Combined Score]
|
|
||||||
Normalize -->|No| FinalScore[Final Score]
|
|
||||||
NormalizeScore --> FinalScore
|
|
||||||
|
|
||||||
FinalScore --> Stats[Update Statistics]
|
|
||||||
Stats --> End([End])
|
|
||||||
|
|
||||||
classDef process fill:#90caf9,stroke:#000,stroke-width:2px;
|
|
||||||
classDef scorer fill:#fff59d,stroke:#000,stroke-width:2px;
|
|
||||||
classDef calc fill:#a5d6a7,stroke:#000,stroke-width:2px;
|
|
||||||
classDef decision fill:#ef9a9a,stroke:#000,stroke-width:2px;
|
|
||||||
|
|
||||||
class Start,End calc;
|
|
||||||
class Keywords,Path,Content,Fresh,Domain scorer;
|
|
||||||
class KeywordScore,PathScore,ContentScore,FreshScore,DomainScore process;
|
|
||||||
class Normalize decision;
|
|
||||||
```
|
|
||||||
|
|
||||||
## URL Scorers
|
|
||||||
|
|
||||||
URL scorers help prioritize which URLs to crawl first. Higher scores indicate higher priority.
|
|
||||||
|
|
||||||
### Available Scorers
|
|
||||||
|
|
||||||
1. **Keyword Relevance Scorer**
|
|
||||||
```python
|
|
||||||
keyword_scorer = KeywordRelevanceScorer(
|
|
||||||
keywords=["python", "programming"],
|
|
||||||
weight=1.0,
|
|
||||||
case_sensitive=False
|
|
||||||
)
|
|
||||||
```
|
|
||||||
- Score based on keyword matches
|
|
||||||
- Case sensitivity options
|
|
||||||
- Weighted scoring
|
|
||||||
|
|
||||||
2. **Path Depth Scorer**
|
|
||||||
```python
|
|
||||||
path_scorer = PathDepthScorer(
|
|
||||||
optimal_depth=3, # Preferred URL depth
|
|
||||||
weight=0.7
|
|
||||||
)
|
|
||||||
```
|
|
||||||
- Score based on URL path depth
|
|
||||||
- Configurable optimal depth
|
|
||||||
- Diminishing returns for deeper paths
|
|
||||||
|
|
||||||
3. **Content Type Scorer**
|
|
||||||
```python
|
|
||||||
content_scorer = ContentTypeScorer({
|
|
||||||
r'\.html$': 1.0,
|
|
||||||
r'\.pdf$': 0.8,
|
|
||||||
r'\.xml$': 0.6
|
|
||||||
})
|
|
||||||
```
|
|
||||||
- Score based on file types
|
|
||||||
- Configurable type weights
|
|
||||||
- Pattern matching support
|
|
||||||
|
|
||||||
4. **Freshness Scorer**
|
|
||||||
```python
|
|
||||||
freshness_scorer = FreshnessScorer(weight=0.9)
|
|
||||||
```
|
|
||||||
- Score based on date indicators in URLs
|
|
||||||
- Multiple date format support
|
|
||||||
- Recency weighting
|
|
||||||
|
|
||||||
5. **Domain Authority Scorer**
|
|
||||||
```python
|
|
||||||
authority_scorer = DomainAuthorityScorer({
|
|
||||||
"python.org": 1.0,
|
|
||||||
"github.com": 0.9,
|
|
||||||
"medium.com": 0.7
|
|
||||||
})
|
|
||||||
```
|
|
||||||
- Score based on domain importance
|
|
||||||
- Configurable domain weights
|
|
||||||
- Default weight for unknown domains
|
|
||||||
|
|
||||||
### Combining Scorers
|
|
||||||
|
|
||||||
```python
|
|
||||||
# Create a composite scorer
|
|
||||||
composite_scorer = CompositeScorer([
|
|
||||||
KeywordRelevanceScorer(["python"], weight=1.0),
|
|
||||||
PathDepthScorer(optimal_depth=2, weight=0.7),
|
|
||||||
FreshnessScorer(weight=0.8)
|
|
||||||
], normalize=True)
|
|
||||||
```
|
|
||||||
|
|
||||||
## Best Practices
|
|
||||||
|
|
||||||
### Filter Configuration
|
|
||||||
|
|
||||||
1. **Start Restrictive**
|
|
||||||
```python
|
|
||||||
# Begin with strict filters
|
|
||||||
filter_chain = FilterChain([
|
|
||||||
DomainFilter(allowed_domains=["example.com"]),
|
|
||||||
ContentTypeFilter(["text/html"])
|
|
||||||
])
|
|
||||||
```
|
|
||||||
|
|
||||||
2. **Layer Filters**
|
|
||||||
```python
|
|
||||||
# Add more specific filters
|
|
||||||
filter_chain.add_filter(
|
|
||||||
URLPatternFilter(["*/article/*", "*/blog/*"])
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
3. **Monitor Filter Statistics**
|
|
||||||
```python
|
|
||||||
# Check filter performance
|
|
||||||
for filter in filter_chain.filters:
|
|
||||||
print(f"{filter.name}: {filter.stats.rejected_urls} rejected")
|
|
||||||
```
|
|
||||||
|
|
||||||
### Scorer Configuration
|
|
||||||
|
|
||||||
1. **Balance Weights**
|
|
||||||
```python
|
|
||||||
# Balanced scoring configuration
|
|
||||||
scorer = create_balanced_scorer()
|
|
||||||
```
|
|
||||||
|
|
||||||
2. **Customize for Content**
|
|
||||||
```python
|
|
||||||
# News site configuration
|
|
||||||
news_scorer = CompositeScorer([
|
|
||||||
KeywordRelevanceScorer(["news", "article"], weight=1.0),
|
|
||||||
FreshnessScorer(weight=1.0),
|
|
||||||
PathDepthScorer(optimal_depth=2, weight=0.5)
|
|
||||||
])
|
|
||||||
```
|
|
||||||
|
|
||||||
3. **Monitor Scoring Statistics**
|
|
||||||
```python
|
|
||||||
# Check scoring distribution
|
|
||||||
print(f"Average score: {scorer.stats.average_score}")
|
|
||||||
print(f"Score range: {scorer.stats.min_score} - {scorer.stats.max_score}")
|
|
||||||
```
|
|
||||||
|
|
||||||
## Common Use Cases
|
|
||||||
|
|
||||||
### Blog Crawling
|
|
||||||
```python
|
|
||||||
blog_config = {
|
|
||||||
'filters': FilterChain([
|
|
||||||
URLPatternFilter(["*/blog/*", "*/post/*"]),
|
|
||||||
ContentTypeFilter(["text/html"])
|
|
||||||
]),
|
|
||||||
'scorer': CompositeScorer([
|
|
||||||
FreshnessScorer(weight=1.0),
|
|
||||||
KeywordRelevanceScorer(["blog", "article"], weight=0.8)
|
|
||||||
])
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
### Documentation Sites
|
|
||||||
```python
|
|
||||||
docs_config = {
|
|
||||||
'filters': FilterChain([
|
|
||||||
URLPatternFilter(["*/docs/*", "*/guide/*"]),
|
|
||||||
ContentTypeFilter(["text/html", "application/pdf"])
|
|
||||||
]),
|
|
||||||
'scorer': CompositeScorer([
|
|
||||||
PathDepthScorer(optimal_depth=3, weight=1.0),
|
|
||||||
KeywordRelevanceScorer(["guide", "tutorial"], weight=0.9)
|
|
||||||
])
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
### E-commerce Sites
|
|
||||||
```python
|
|
||||||
ecommerce_config = {
|
|
||||||
'filters': FilterChain([
|
|
||||||
URLPatternFilter(["*/product/*", "*/category/*"]),
|
|
||||||
DomainFilter(blocked_domains=["ads.*", "tracker.*"])
|
|
||||||
]),
|
|
||||||
'scorer': CompositeScorer([
|
|
||||||
PathDepthScorer(optimal_depth=2, weight=1.0),
|
|
||||||
ContentTypeScorer({
|
|
||||||
r'/product/': 1.0,
|
|
||||||
r'/category/': 0.8
|
|
||||||
})
|
|
||||||
])
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## Advanced Topics
|
|
||||||
|
|
||||||
### Custom Filters
|
|
||||||
```python
|
|
||||||
class CustomFilter(URLFilter):
|
|
||||||
def apply(self, url: str) -> bool:
|
|
||||||
# Your custom filtering logic
|
|
||||||
return True
|
|
||||||
```
|
|
||||||
|
|
||||||
### Custom Scorers
|
|
||||||
```python
|
|
||||||
class CustomScorer(URLScorer):
|
|
||||||
def _calculate_score(self, url: str) -> float:
|
|
||||||
# Your custom scoring logic
|
|
||||||
return 1.0
|
|
||||||
```
|
|
||||||
|
|
||||||
For more examples, check our [example repository](https://github.com/example/crawl4ai/examples).
|
|
||||||
@@ -1,206 +0,0 @@
|
|||||||
# Scraper Examples Guide
|
|
||||||
|
|
||||||
This guide provides two complete examples of using the crawl4ai scraper: a basic implementation for simple use cases and an advanced implementation showcasing all features.
|
|
||||||
|
|
||||||
## Basic Example
|
|
||||||
|
|
||||||
The basic example demonstrates a simple blog scraping scenario:
|
|
||||||
|
|
||||||
```python
|
|
||||||
from crawl4ai.scraper import AsyncWebScraper, BFSScraperStrategy, FilterChain
|
|
||||||
|
|
||||||
# Create simple filter chain
|
|
||||||
filter_chain = FilterChain([
|
|
||||||
URLPatternFilter("*/blog/*"),
|
|
||||||
ContentTypeFilter(["text/html"])
|
|
||||||
])
|
|
||||||
|
|
||||||
# Initialize strategy
|
|
||||||
strategy = BFSScraperStrategy(
|
|
||||||
max_depth=2,
|
|
||||||
filter_chain=filter_chain,
|
|
||||||
url_scorer=None,
|
|
||||||
max_concurrent=3
|
|
||||||
)
|
|
||||||
|
|
||||||
# Create and run scraper
|
|
||||||
crawler = AsyncWebCrawler()
|
|
||||||
scraper = AsyncWebScraper(crawler, strategy)
|
|
||||||
result = await scraper.ascrape("https://example.com/blog/")
|
|
||||||
```
|
|
||||||
|
|
||||||
### Features Demonstrated
|
|
||||||
- Basic URL filtering
|
|
||||||
- Simple content type filtering
|
|
||||||
- Depth control
|
|
||||||
- Concurrent request limiting
|
|
||||||
- Result collection
|
|
||||||
|
|
||||||
## Advanced Example
|
|
||||||
|
|
||||||
The advanced example shows a sophisticated news site scraping setup with all features enabled:
|
|
||||||
|
|
||||||
```python
|
|
||||||
# Create comprehensive filter chain
|
|
||||||
filter_chain = FilterChain([
|
|
||||||
DomainFilter(
|
|
||||||
allowed_domains=["example.com"],
|
|
||||||
blocked_domains=["ads.example.com"]
|
|
||||||
),
|
|
||||||
URLPatternFilter([
|
|
||||||
"*/article/*",
|
|
||||||
re.compile(r"\d{4}/\d{2}/.*")
|
|
||||||
]),
|
|
||||||
ContentTypeFilter(["text/html"])
|
|
||||||
])
|
|
||||||
|
|
||||||
# Create intelligent scorer
|
|
||||||
scorer = CompositeScorer([
|
|
||||||
KeywordRelevanceScorer(
|
|
||||||
keywords=["news", "breaking"],
|
|
||||||
weight=1.0
|
|
||||||
),
|
|
||||||
PathDepthScorer(optimal_depth=3, weight=0.7),
|
|
||||||
FreshnessScorer(weight=0.9)
|
|
||||||
])
|
|
||||||
|
|
||||||
# Initialize advanced strategy
|
|
||||||
strategy = BFSScraperStrategy(
|
|
||||||
max_depth=4,
|
|
||||||
filter_chain=filter_chain,
|
|
||||||
url_scorer=scorer,
|
|
||||||
max_concurrent=5
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
### Features Demonstrated
|
|
||||||
1. **Advanced Filtering**
|
|
||||||
- Domain filtering
|
|
||||||
- Pattern matching
|
|
||||||
- Content type control
|
|
||||||
|
|
||||||
2. **Intelligent Scoring**
|
|
||||||
- Keyword relevance
|
|
||||||
- Path optimization
|
|
||||||
- Freshness priority
|
|
||||||
|
|
||||||
3. **Monitoring**
|
|
||||||
- Progress tracking
|
|
||||||
- Error handling
|
|
||||||
- Statistics collection
|
|
||||||
|
|
||||||
4. **Resource Management**
|
|
||||||
- Concurrent processing
|
|
||||||
- Rate limiting
|
|
||||||
- Cleanup handling
|
|
||||||
|
|
||||||
## Running the Examples
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Basic usage
|
|
||||||
python basic_scraper_example.py
|
|
||||||
|
|
||||||
# Advanced usage with logging
|
|
||||||
PYTHONPATH=. python advanced_scraper_example.py
|
|
||||||
```
|
|
||||||
|
|
||||||
## Example Output
|
|
||||||
|
|
||||||
### Basic Example
|
|
||||||
```
|
|
||||||
Crawled 15 pages:
|
|
||||||
- https://example.com/blog/post1: 24560 bytes
|
|
||||||
- https://example.com/blog/post2: 18920 bytes
|
|
||||||
...
|
|
||||||
```
|
|
||||||
|
|
||||||
### Advanced Example
|
|
||||||
```
|
|
||||||
INFO: Starting crawl of https://example.com/news/
|
|
||||||
INFO: Processed: https://example.com/news/breaking/story1
|
|
||||||
DEBUG: KeywordScorer: 0.85
|
|
||||||
DEBUG: FreshnessScorer: 0.95
|
|
||||||
INFO: Progress: 10 URLs processed
|
|
||||||
...
|
|
||||||
INFO: Scraping completed:
|
|
||||||
INFO: - URLs processed: 50
|
|
||||||
INFO: - Errors: 2
|
|
||||||
INFO: - Total content size: 1240.50 KB
|
|
||||||
```
|
|
||||||
|
|
||||||
## Customization
|
|
||||||
|
|
||||||
### Adding Custom Filters
|
|
||||||
```python
|
|
||||||
class CustomFilter(URLFilter):
|
|
||||||
def apply(self, url: str) -> bool:
|
|
||||||
# Your custom filtering logic
|
|
||||||
return True
|
|
||||||
|
|
||||||
filter_chain.add_filter(CustomFilter())
|
|
||||||
```
|
|
||||||
|
|
||||||
### Custom Scoring Logic
|
|
||||||
```python
|
|
||||||
class CustomScorer(URLScorer):
|
|
||||||
def _calculate_score(self, url: str) -> float:
|
|
||||||
# Your custom scoring logic
|
|
||||||
return 1.0
|
|
||||||
|
|
||||||
scorer = CompositeScorer([
|
|
||||||
CustomScorer(weight=1.0),
|
|
||||||
...
|
|
||||||
])
|
|
||||||
```
|
|
||||||
|
|
||||||
## Best Practices
|
|
||||||
|
|
||||||
1. **Start Simple**
|
|
||||||
- Begin with basic filtering
|
|
||||||
- Add features incrementally
|
|
||||||
- Test thoroughly at each step
|
|
||||||
|
|
||||||
2. **Monitor Performance**
|
|
||||||
- Watch memory usage
|
|
||||||
- Track processing times
|
|
||||||
- Adjust concurrency as needed
|
|
||||||
|
|
||||||
3. **Handle Errors**
|
|
||||||
- Implement proper error handling
|
|
||||||
- Log important events
|
|
||||||
- Track error statistics
|
|
||||||
|
|
||||||
4. **Optimize Resources**
|
|
||||||
- Set appropriate delays
|
|
||||||
- Limit concurrent requests
|
|
||||||
- Use streaming for large crawls
|
|
||||||
|
|
||||||
## Troubleshooting
|
|
||||||
|
|
||||||
Common issues and solutions:
|
|
||||||
|
|
||||||
1. **Too Many Requests**
|
|
||||||
```python
|
|
||||||
strategy = BFSScraperStrategy(
|
|
||||||
max_concurrent=3, # Reduce concurrent requests
|
|
||||||
min_crawl_delay=2 # Increase delay between requests
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
2. **Memory Issues**
|
|
||||||
```python
|
|
||||||
# Use streaming mode for large crawls
|
|
||||||
async for result in scraper.ascrape(url, stream=True):
|
|
||||||
process_result(result)
|
|
||||||
```
|
|
||||||
|
|
||||||
3. **Missing Content**
|
|
||||||
```python
|
|
||||||
# Check your filter chain
|
|
||||||
filter_chain = FilterChain([
|
|
||||||
URLPatternFilter("*"), # Broaden patterns
|
|
||||||
ContentTypeFilter(["*"]) # Accept all content
|
|
||||||
])
|
|
||||||
```
|
|
||||||
|
|
||||||
For more examples and use cases, visit our [GitHub repository](https://github.com/example/crawl4ai/examples).
|
|
||||||
@@ -1,184 +0,0 @@
|
|||||||
# basic_scraper_example.py
|
|
||||||
from crawl4ai.scraper import (
|
|
||||||
AsyncWebScraper,
|
|
||||||
BFSScraperStrategy,
|
|
||||||
FilterChain,
|
|
||||||
URLPatternFilter,
|
|
||||||
ContentTypeFilter
|
|
||||||
)
|
|
||||||
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
|
||||||
|
|
||||||
async def basic_scraper_example():
|
|
||||||
"""
|
|
||||||
Basic example: Scrape a blog site for articles
|
|
||||||
- Crawls only HTML pages
|
|
||||||
- Stays within the blog section
|
|
||||||
- Collects all results at once
|
|
||||||
"""
|
|
||||||
# Create a simple filter chain
|
|
||||||
filter_chain = FilterChain([
|
|
||||||
# Only crawl pages within the blog section
|
|
||||||
URLPatternFilter("*/blog/*"),
|
|
||||||
# Only process HTML pages
|
|
||||||
ContentTypeFilter(["text/html"])
|
|
||||||
])
|
|
||||||
|
|
||||||
# Initialize the strategy with basic configuration
|
|
||||||
strategy = BFSScraperStrategy(
|
|
||||||
max_depth=2, # Only go 2 levels deep
|
|
||||||
filter_chain=filter_chain,
|
|
||||||
url_scorer=None, # Use default scoring
|
|
||||||
max_concurrent=3 # Limit concurrent requests
|
|
||||||
)
|
|
||||||
|
|
||||||
# Create the crawler and scraper
|
|
||||||
crawler = AsyncWebCrawler()
|
|
||||||
scraper = AsyncWebScraper(crawler, strategy)
|
|
||||||
|
|
||||||
# Start scraping
|
|
||||||
try:
|
|
||||||
result = await scraper.ascrape("https://example.com/blog/")
|
|
||||||
|
|
||||||
# Process results
|
|
||||||
print(f"Crawled {len(result.crawled_urls)} pages:")
|
|
||||||
for url, data in result.extracted_data.items():
|
|
||||||
print(f"- {url}: {len(data.html)} bytes")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Error during scraping: {e}")
|
|
||||||
|
|
||||||
# advanced_scraper_example.py
|
|
||||||
import logging
|
|
||||||
from crawl4ai.scraper import (
|
|
||||||
AsyncWebScraper,
|
|
||||||
BFSScraperStrategy,
|
|
||||||
FilterChain,
|
|
||||||
URLPatternFilter,
|
|
||||||
ContentTypeFilter,
|
|
||||||
DomainFilter,
|
|
||||||
KeywordRelevanceScorer,
|
|
||||||
PathDepthScorer,
|
|
||||||
FreshnessScorer,
|
|
||||||
CompositeScorer
|
|
||||||
)
|
|
||||||
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
|
||||||
|
|
||||||
async def advanced_scraper_example():
|
|
||||||
"""
|
|
||||||
Advanced example: Intelligent news site scraping
|
|
||||||
- Uses all filter types
|
|
||||||
- Implements sophisticated scoring
|
|
||||||
- Streams results
|
|
||||||
- Includes monitoring and logging
|
|
||||||
"""
|
|
||||||
# Set up logging
|
|
||||||
logging.basicConfig(level=logging.INFO)
|
|
||||||
logger = logging.getLogger("advanced_scraper")
|
|
||||||
|
|
||||||
# Create sophisticated filter chain
|
|
||||||
filter_chain = FilterChain([
|
|
||||||
# Domain control
|
|
||||||
DomainFilter(
|
|
||||||
allowed_domains=["example.com", "blog.example.com"],
|
|
||||||
blocked_domains=["ads.example.com", "tracker.example.com"]
|
|
||||||
),
|
|
||||||
# URL patterns
|
|
||||||
URLPatternFilter([
|
|
||||||
"*/article/*",
|
|
||||||
"*/news/*",
|
|
||||||
"*/blog/*",
|
|
||||||
re.compile(r"\d{4}/\d{2}/.*") # Date-based URLs
|
|
||||||
]),
|
|
||||||
# Content types
|
|
||||||
ContentTypeFilter([
|
|
||||||
"text/html",
|
|
||||||
"application/xhtml+xml"
|
|
||||||
])
|
|
||||||
])
|
|
||||||
|
|
||||||
# Create composite scorer
|
|
||||||
scorer = CompositeScorer([
|
|
||||||
# Prioritize by keywords
|
|
||||||
KeywordRelevanceScorer(
|
|
||||||
keywords=["news", "breaking", "update", "latest"],
|
|
||||||
weight=1.0
|
|
||||||
),
|
|
||||||
# Prefer optimal URL structure
|
|
||||||
PathDepthScorer(
|
|
||||||
optimal_depth=3,
|
|
||||||
weight=0.7
|
|
||||||
),
|
|
||||||
# Prioritize fresh content
|
|
||||||
FreshnessScorer(weight=0.9)
|
|
||||||
])
|
|
||||||
|
|
||||||
# Initialize strategy with advanced configuration
|
|
||||||
strategy = BFSScraperStrategy(
|
|
||||||
max_depth=4,
|
|
||||||
filter_chain=filter_chain,
|
|
||||||
url_scorer=scorer,
|
|
||||||
max_concurrent=5,
|
|
||||||
min_crawl_delay=1
|
|
||||||
)
|
|
||||||
|
|
||||||
# Create crawler and scraper
|
|
||||||
crawler = AsyncWebCrawler()
|
|
||||||
scraper = AsyncWebScraper(crawler, strategy)
|
|
||||||
|
|
||||||
# Track statistics
|
|
||||||
stats = {
|
|
||||||
'processed': 0,
|
|
||||||
'errors': 0,
|
|
||||||
'total_size': 0
|
|
||||||
}
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Use streaming mode
|
|
||||||
async for result in scraper.ascrape("https://example.com/news/", stream=True):
|
|
||||||
stats['processed'] += 1
|
|
||||||
|
|
||||||
if result.success:
|
|
||||||
stats['total_size'] += len(result.html)
|
|
||||||
logger.info(f"Processed: {result.url}")
|
|
||||||
|
|
||||||
# Print scoring information
|
|
||||||
for scorer_name, score in result.scores.items():
|
|
||||||
logger.debug(f"{scorer_name}: {score:.2f}")
|
|
||||||
else:
|
|
||||||
stats['errors'] += 1
|
|
||||||
logger.error(f"Failed to process {result.url}: {result.error_message}")
|
|
||||||
|
|
||||||
# Log progress regularly
|
|
||||||
if stats['processed'] % 10 == 0:
|
|
||||||
logger.info(f"Progress: {stats['processed']} URLs processed")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Scraping error: {e}")
|
|
||||||
|
|
||||||
finally:
|
|
||||||
# Print final statistics
|
|
||||||
logger.info("Scraping completed:")
|
|
||||||
logger.info(f"- URLs processed: {stats['processed']}")
|
|
||||||
logger.info(f"- Errors: {stats['errors']}")
|
|
||||||
logger.info(f"- Total content size: {stats['total_size'] / 1024:.2f} KB")
|
|
||||||
|
|
||||||
# Print filter statistics
|
|
||||||
for filter_ in filter_chain.filters:
|
|
||||||
logger.info(f"{filter_.name} stats:")
|
|
||||||
logger.info(f"- Passed: {filter_.stats.passed_urls}")
|
|
||||||
logger.info(f"- Rejected: {filter_.stats.rejected_urls}")
|
|
||||||
|
|
||||||
# Print scorer statistics
|
|
||||||
logger.info("Scoring statistics:")
|
|
||||||
logger.info(f"- Average score: {scorer.stats.average_score:.2f}")
|
|
||||||
logger.info(f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}")
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
import asyncio
|
|
||||||
|
|
||||||
# Run basic example
|
|
||||||
print("Running basic scraper example...")
|
|
||||||
asyncio.run(basic_scraper_example())
|
|
||||||
|
|
||||||
print("\nRunning advanced scraper example...")
|
|
||||||
asyncio.run(advanced_scraper_example())
|
|
||||||
@@ -7,4 +7,5 @@ pillow==10.4.0
|
|||||||
playwright==1.47.0
|
playwright==1.47.0
|
||||||
python-dotenv==1.0.1
|
python-dotenv==1.0.1
|
||||||
requests>=2.26.0,<2.32.3
|
requests>=2.26.0,<2.32.3
|
||||||
beautifulsoup4==4.12.3
|
beautifulsoup4==4.12.3
|
||||||
|
playwright_stealth==1.0.6
|
||||||
@@ -1,184 +0,0 @@
|
|||||||
# basic_scraper_example.py
|
|
||||||
from crawl4ai.scraper import (
|
|
||||||
AsyncWebScraper,
|
|
||||||
BFSScraperStrategy,
|
|
||||||
FilterChain,
|
|
||||||
URLPatternFilter,
|
|
||||||
ContentTypeFilter
|
|
||||||
)
|
|
||||||
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
|
||||||
|
|
||||||
async def basic_scraper_example():
|
|
||||||
"""
|
|
||||||
Basic example: Scrape a blog site for articles
|
|
||||||
- Crawls only HTML pages
|
|
||||||
- Stays within the blog section
|
|
||||||
- Collects all results at once
|
|
||||||
"""
|
|
||||||
# Create a simple filter chain
|
|
||||||
filter_chain = FilterChain([
|
|
||||||
# Only crawl pages within the blog section
|
|
||||||
URLPatternFilter("*/blog/*"),
|
|
||||||
# Only process HTML pages
|
|
||||||
ContentTypeFilter(["text/html"])
|
|
||||||
])
|
|
||||||
|
|
||||||
# Initialize the strategy with basic configuration
|
|
||||||
strategy = BFSScraperStrategy(
|
|
||||||
max_depth=2, # Only go 2 levels deep
|
|
||||||
filter_chain=filter_chain,
|
|
||||||
url_scorer=None, # Use default scoring
|
|
||||||
max_concurrent=3 # Limit concurrent requests
|
|
||||||
)
|
|
||||||
|
|
||||||
# Create the crawler and scraper
|
|
||||||
crawler = AsyncWebCrawler()
|
|
||||||
scraper = AsyncWebScraper(crawler, strategy)
|
|
||||||
|
|
||||||
# Start scraping
|
|
||||||
try:
|
|
||||||
result = await scraper.ascrape("https://example.com/blog/")
|
|
||||||
|
|
||||||
# Process results
|
|
||||||
print(f"Crawled {len(result.crawled_urls)} pages:")
|
|
||||||
for url, data in result.extracted_data.items():
|
|
||||||
print(f"- {url}: {len(data.html)} bytes")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Error during scraping: {e}")
|
|
||||||
|
|
||||||
# advanced_scraper_example.py
|
|
||||||
import logging
|
|
||||||
from crawl4ai.scraper import (
|
|
||||||
AsyncWebScraper,
|
|
||||||
BFSScraperStrategy,
|
|
||||||
FilterChain,
|
|
||||||
URLPatternFilter,
|
|
||||||
ContentTypeFilter,
|
|
||||||
DomainFilter,
|
|
||||||
KeywordRelevanceScorer,
|
|
||||||
PathDepthScorer,
|
|
||||||
FreshnessScorer,
|
|
||||||
CompositeScorer
|
|
||||||
)
|
|
||||||
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
|
||||||
|
|
||||||
async def advanced_scraper_example():
|
|
||||||
"""
|
|
||||||
Advanced example: Intelligent news site scraping
|
|
||||||
- Uses all filter types
|
|
||||||
- Implements sophisticated scoring
|
|
||||||
- Streams results
|
|
||||||
- Includes monitoring and logging
|
|
||||||
"""
|
|
||||||
# Set up logging
|
|
||||||
logging.basicConfig(level=logging.INFO)
|
|
||||||
logger = logging.getLogger("advanced_scraper")
|
|
||||||
|
|
||||||
# Create sophisticated filter chain
|
|
||||||
filter_chain = FilterChain([
|
|
||||||
# Domain control
|
|
||||||
DomainFilter(
|
|
||||||
allowed_domains=["example.com", "blog.example.com"],
|
|
||||||
blocked_domains=["ads.example.com", "tracker.example.com"]
|
|
||||||
),
|
|
||||||
# URL patterns
|
|
||||||
URLPatternFilter([
|
|
||||||
"*/article/*",
|
|
||||||
"*/news/*",
|
|
||||||
"*/blog/*",
|
|
||||||
re.compile(r"\d{4}/\d{2}/.*") # Date-based URLs
|
|
||||||
]),
|
|
||||||
# Content types
|
|
||||||
ContentTypeFilter([
|
|
||||||
"text/html",
|
|
||||||
"application/xhtml+xml"
|
|
||||||
])
|
|
||||||
])
|
|
||||||
|
|
||||||
# Create composite scorer
|
|
||||||
scorer = CompositeScorer([
|
|
||||||
# Prioritize by keywords
|
|
||||||
KeywordRelevanceScorer(
|
|
||||||
keywords=["news", "breaking", "update", "latest"],
|
|
||||||
weight=1.0
|
|
||||||
),
|
|
||||||
# Prefer optimal URL structure
|
|
||||||
PathDepthScorer(
|
|
||||||
optimal_depth=3,
|
|
||||||
weight=0.7
|
|
||||||
),
|
|
||||||
# Prioritize fresh content
|
|
||||||
FreshnessScorer(weight=0.9)
|
|
||||||
])
|
|
||||||
|
|
||||||
# Initialize strategy with advanced configuration
|
|
||||||
strategy = BFSScraperStrategy(
|
|
||||||
max_depth=4,
|
|
||||||
filter_chain=filter_chain,
|
|
||||||
url_scorer=scorer,
|
|
||||||
max_concurrent=5,
|
|
||||||
min_crawl_delay=1
|
|
||||||
)
|
|
||||||
|
|
||||||
# Create crawler and scraper
|
|
||||||
crawler = AsyncWebCrawler()
|
|
||||||
scraper = AsyncWebScraper(crawler, strategy)
|
|
||||||
|
|
||||||
# Track statistics
|
|
||||||
stats = {
|
|
||||||
'processed': 0,
|
|
||||||
'errors': 0,
|
|
||||||
'total_size': 0
|
|
||||||
}
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Use streaming mode
|
|
||||||
async for result in scraper.ascrape("https://example.com/news/", stream=True):
|
|
||||||
stats['processed'] += 1
|
|
||||||
|
|
||||||
if result.success:
|
|
||||||
stats['total_size'] += len(result.html)
|
|
||||||
logger.info(f"Processed: {result.url}")
|
|
||||||
|
|
||||||
# Print scoring information
|
|
||||||
for scorer_name, score in result.scores.items():
|
|
||||||
logger.debug(f"{scorer_name}: {score:.2f}")
|
|
||||||
else:
|
|
||||||
stats['errors'] += 1
|
|
||||||
logger.error(f"Failed to process {result.url}: {result.error_message}")
|
|
||||||
|
|
||||||
# Log progress regularly
|
|
||||||
if stats['processed'] % 10 == 0:
|
|
||||||
logger.info(f"Progress: {stats['processed']} URLs processed")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Scraping error: {e}")
|
|
||||||
|
|
||||||
finally:
|
|
||||||
# Print final statistics
|
|
||||||
logger.info("Scraping completed:")
|
|
||||||
logger.info(f"- URLs processed: {stats['processed']}")
|
|
||||||
logger.info(f"- Errors: {stats['errors']}")
|
|
||||||
logger.info(f"- Total content size: {stats['total_size'] / 1024:.2f} KB")
|
|
||||||
|
|
||||||
# Print filter statistics
|
|
||||||
for filter_ in filter_chain.filters:
|
|
||||||
logger.info(f"{filter_.name} stats:")
|
|
||||||
logger.info(f"- Passed: {filter_.stats.passed_urls}")
|
|
||||||
logger.info(f"- Rejected: {filter_.stats.rejected_urls}")
|
|
||||||
|
|
||||||
# Print scorer statistics
|
|
||||||
logger.info("Scoring statistics:")
|
|
||||||
logger.info(f"- Average score: {scorer.stats.average_score:.2f}")
|
|
||||||
logger.info(f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}")
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
import asyncio
|
|
||||||
|
|
||||||
# Run basic example
|
|
||||||
print("Running basic scraper example...")
|
|
||||||
asyncio.run(basic_scraper_example())
|
|
||||||
|
|
||||||
print("\nRunning advanced scraper example...")
|
|
||||||
asyncio.run(advanced_scraper_example())
|
|
||||||
Reference in New Issue
Block a user