Apply Ruff Corrections
This commit is contained in:
@@ -1,19 +1,18 @@
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
from playwright.async_api import Page, BrowserContext
|
||||
|
||||
|
||||
async def main():
|
||||
print("🔗 Hooks Example: Demonstrating different hook use cases")
|
||||
|
||||
# Configure browser settings
|
||||
browser_config = BrowserConfig(
|
||||
headless=True
|
||||
)
|
||||
|
||||
browser_config = BrowserConfig(headless=True)
|
||||
|
||||
# Configure crawler settings
|
||||
crawler_run_config = CrawlerRunConfig(
|
||||
js_code="window.scrollTo(0, document.body.scrollHeight);",
|
||||
wait_for="body",
|
||||
cache_mode=CacheMode.BYPASS
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
)
|
||||
|
||||
# Create crawler instance
|
||||
@@ -30,16 +29,22 @@ async def main():
|
||||
"""Hook called after a new page and context are created"""
|
||||
print("[HOOK] on_page_context_created - New page created!")
|
||||
# Example: Set default viewport size
|
||||
await context.add_cookies([{
|
||||
'name': 'session_id',
|
||||
'value': 'example_session',
|
||||
'domain': '.example.com',
|
||||
'path': '/'
|
||||
}])
|
||||
await context.add_cookies(
|
||||
[
|
||||
{
|
||||
"name": "session_id",
|
||||
"value": "example_session",
|
||||
"domain": ".example.com",
|
||||
"path": "/",
|
||||
}
|
||||
]
|
||||
)
|
||||
await page.set_viewport_size({"width": 1080, "height": 800})
|
||||
return page
|
||||
|
||||
async def on_user_agent_updated(page: Page, context: BrowserContext, user_agent: str, **kwargs):
|
||||
async def on_user_agent_updated(
|
||||
page: Page, context: BrowserContext, user_agent: str, **kwargs
|
||||
):
|
||||
"""Hook called when the user agent is updated"""
|
||||
print(f"[HOOK] on_user_agent_updated - New user agent: {user_agent}")
|
||||
return page
|
||||
@@ -53,17 +58,17 @@ async def main():
|
||||
"""Hook called before navigating to each URL"""
|
||||
print(f"[HOOK] before_goto - About to visit: {url}")
|
||||
# Example: Add custom headers for the request
|
||||
await page.set_extra_http_headers({
|
||||
"Custom-Header": "my-value"
|
||||
})
|
||||
await page.set_extra_http_headers({"Custom-Header": "my-value"})
|
||||
return page
|
||||
|
||||
async def after_goto(page: Page, context: BrowserContext, url: str, response: dict, **kwargs):
|
||||
async def after_goto(
|
||||
page: Page, context: BrowserContext, url: str, response: dict, **kwargs
|
||||
):
|
||||
"""Hook called after navigating to each URL"""
|
||||
print(f"[HOOK] after_goto - Successfully loaded: {url}")
|
||||
# Example: Wait for a specific element to be loaded
|
||||
try:
|
||||
await page.wait_for_selector('.content', timeout=1000)
|
||||
await page.wait_for_selector(".content", timeout=1000)
|
||||
print("Content element found!")
|
||||
except:
|
||||
print("Content element not found, continuing anyway")
|
||||
@@ -76,7 +81,9 @@ async def main():
|
||||
await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
|
||||
return page
|
||||
|
||||
async def before_return_html(page: Page, context: BrowserContext, html:str, **kwargs):
|
||||
async def before_return_html(
|
||||
page: Page, context: BrowserContext, html: str, **kwargs
|
||||
):
|
||||
"""Hook called before returning the HTML content"""
|
||||
print(f"[HOOK] before_return_html - Got HTML content (length: {len(html)})")
|
||||
# Example: You could modify the HTML content here if needed
|
||||
@@ -84,7 +91,9 @@ async def main():
|
||||
|
||||
# Set all the hooks
|
||||
crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created)
|
||||
crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created)
|
||||
crawler.crawler_strategy.set_hook(
|
||||
"on_page_context_created", on_page_context_created
|
||||
)
|
||||
crawler.crawler_strategy.set_hook("on_user_agent_updated", on_user_agent_updated)
|
||||
crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
|
||||
crawler.crawler_strategy.set_hook("before_goto", before_goto)
|
||||
@@ -95,13 +104,15 @@ async def main():
|
||||
await crawler.start()
|
||||
|
||||
# Example usage: crawl a simple website
|
||||
url = 'https://example.com'
|
||||
url = "https://example.com"
|
||||
result = await crawler.arun(url, config=crawler_run_config)
|
||||
print(f"\nCrawled URL: {result.url}")
|
||||
print(f"HTML length: {len(result.html)}")
|
||||
|
||||
|
||||
await crawler.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import asyncio
|
||||
asyncio.run(main())
|
||||
|
||||
asyncio.run(main())
|
||||
|
||||
Reference in New Issue
Block a user