From f2d99126976e7113502140e756c0d1e35090b9df Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 26 Dec 2024 16:34:36 +0800 Subject: [PATCH] Renames browser_config param to config in AsyncWebCrawler Standardizes parameter naming convention across the codebase by renaming browser_config to the more concise config in AsyncWebCrawler constructor. Updates all documentation examples and internal usages to reflect the new parameter name for consistency. Also improves hook execution by adding url/response parameters to goto hooks and fixes parameter ordering in before_return_html hook. --- .local/issues_todo.md | 1 + .local/llm.txt/13_hooks_auth.md | 2 +- .local/llm.txt/1_introduction.ex.md | 4 +- .local/llm.txt/3_async_webcrawler.ex.md | 4 +- .local/llm.txt/3_async_webcrawler.xs.md | 4 +- .local/llm.txt/4_browser_context_page.ex.md | 8 +- .local/llm.txt/4_browser_context_page.sm.md | 10 +- .local/ttt/13_hooks_auth.md | 2 +- .local/ttt/3_async_webcrawler.ex.md | 4 +- .local/ttt/4_browser_context_page.ex.md | 8 +- .local/ttt/context.md | 14 +-- crawl4ai/async_crawler_strategy.py | 6 +- crawl4ai/async_webcrawler.py | 11 +- docs/examples/hooks_example.py | 107 ++++++++++++++++++++ docs/llm.txt/13_hooks_auth.md | 2 +- docs/llm.txt/1_introduction.md | 4 +- docs/llm.txt/3_async_webcrawler.md | 4 +- docs/llm.txt/3_async_webcrawler.q.md | 2 +- docs/llm.txt/3_async_webcrawler.xs.md | 4 +- docs/llm.txt/3_async_webcrawler.xs.q.md | 2 +- docs/llm.txt/4_browser_context_page.md | 8 +- docs/llm.txt/4_browser_context_page.q.md | 2 +- docs/llm.txt/4_browser_context_page.xs.md | 10 +- docs/md_v2/advanced/hooks-auth.md | 2 +- docs/md_v2/advanced/managed_browser.md | 8 +- docs/md_v2/basic/simple-crawling.md | 6 +- 26 files changed, 177 insertions(+), 62 deletions(-) create mode 100644 .local/issues_todo.md create mode 100644 docs/examples/hooks_example.py diff --git a/.local/issues_todo.md b/.local/issues_todo.md new file mode 100644 index 00000000..61bdc855 --- /dev/null +++ 
b/.local/issues_todo.md @@ -0,0 +1 @@ +Docker: https://github.com/unclecode/crawl4ai/issues/367 \ No newline at end of file diff --git a/.local/llm.txt/13_hooks_auth.md b/.local/llm.txt/13_hooks_auth.md index a8cd77b7..89258550 100644 --- a/.local/llm.txt/13_hooks_auth.md +++ b/.local/llm.txt/13_hooks_auth.md @@ -82,7 +82,7 @@ async def main(): ) # Initialize crawler - async with AsyncWebCrawler(browser_config=browser_config) as crawler: + async with AsyncWebCrawler(config=browser_config) as crawler: crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created) crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created) crawler.crawler_strategy.set_hook("before_goto", before_goto) diff --git a/.local/llm.txt/1_introduction.ex.md b/.local/llm.txt/1_introduction.ex.md index b2231c71..c83c2495 100644 --- a/.local/llm.txt/1_introduction.ex.md +++ b/.local/llm.txt/1_introduction.ex.md @@ -125,7 +125,7 @@ run_config = CrawlerRunConfig( ## 4. Basic Crawling & Simple Extraction ```python -async with AsyncWebCrawler(browser_config=browser_config) as crawler: +async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://news.example.com/article", config=run_config) print(result.markdown) # Basic markdown content ``` @@ -375,7 +375,7 @@ async def on_page_context_created_hook(context, page, **kwargs): await context.route("**/*.{png,jpg,jpeg}", lambda route: route.abort()) print("[HOOK] Image requests blocked") -async with AsyncWebCrawler(browser_config=browser_config) as crawler: +async with AsyncWebCrawler(config=browser_config) as crawler: crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created_hook) result = await crawler.arun("https://imageheavy.example.com", config=run_config) print("Crawl finished with images blocked.") diff --git a/.local/llm.txt/3_async_webcrawler.ex.md b/.local/llm.txt/3_async_webcrawler.ex.md index 8f113e97..7d122786 100644 --- 
a/.local/llm.txt/3_async_webcrawler.ex.md +++ b/.local/llm.txt/3_async_webcrawler.ex.md @@ -19,7 +19,7 @@ async def main(): browser_config = BrowserConfig(browser_type="chromium", headless=True) # Run the crawler asynchronously - async with AsyncWebCrawler(browser_config=browser_config) as crawler: + async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://example.com") print("Extracted Markdown:") print(result.markdown) @@ -52,7 +52,7 @@ browser_config = BrowserConfig( verbose=True ) -async with AsyncWebCrawler(browser_config=browser_config) as crawler: +async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://yourwebsite.com") print(result.markdown) ``` diff --git a/.local/llm.txt/3_async_webcrawler.xs.md b/.local/llm.txt/3_async_webcrawler.xs.md index ce6259b7..ac849a2d 100644 --- a/.local/llm.txt/3_async_webcrawler.xs.md +++ b/.local/llm.txt/3_async_webcrawler.xs.md @@ -10,7 +10,7 @@ from crawl4ai import AsyncWebCrawler, BrowserConfig import asyncio async def main(): - async with AsyncWebCrawler(browser_config=BrowserConfig(browser_type="chromium", headless=True)) as c: + async with AsyncWebCrawler(config=BrowserConfig(browser_type="chromium", headless=True)) as c: r = await c.arun("https://example.com") print(r.markdown) @@ -21,7 +21,7 @@ asyncio.run(main()) **Params:** `browser_type`, `headless`, `viewport_width`, `viewport_height`, `verbose`, `proxy`. 
```python browser_config = BrowserConfig(browser_type="firefox", headless=False) -async with AsyncWebCrawler(browser_config=browser_config) as c: +async with AsyncWebCrawler(config=browser_config) as c: r = await c.arun("https://site.com") ``` diff --git a/.local/llm.txt/4_browser_context_page.ex.md b/.local/llm.txt/4_browser_context_page.ex.md index 6a5efb54..f241cd68 100644 --- a/.local/llm.txt/4_browser_context_page.ex.md +++ b/.local/llm.txt/4_browser_context_page.ex.md @@ -37,7 +37,7 @@ Standard browser creation initializes a browser instance with default or minimal from crawl4ai import AsyncWebCrawler, BrowserConfig browser_config = BrowserConfig(browser_type="chromium", headless=True) -async with AsyncWebCrawler(browser_config=browser_config) as crawler: +async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://crawl4ai.com") print(result.markdown) ``` @@ -57,7 +57,7 @@ Persistent contexts create browser sessions with stored data, enabling workflows ##### Example: Setting Up Persistent Contexts ```python config = BrowserConfig(user_data_dir="/path/to/user/data") -async with AsyncWebCrawler(browser_config=config) as crawler: +async with AsyncWebCrawler(config=config) as crawler: result = await crawler.arun("https://crawl4ai.com") print(result.markdown) ``` @@ -88,7 +88,7 @@ The `ManagedBrowser` class offers a high-level abstraction for managing browser from crawl4ai import AsyncWebCrawler, BrowserConfig config = BrowserConfig(headless=False, debug_port=9222) -async with AsyncWebCrawler(browser_config=config) as crawler: +async with AsyncWebCrawler(config=config) as crawler: result = await crawler.arun("https://crawl4ai.com") print(result.markdown) ``` @@ -192,7 +192,7 @@ I'll help create a section about using command-line Chrome with a user data dire user_data_dir="/path/to/ChromeProfiles/CrawlProfile" # Use the same directory from step 1 ) - async with AsyncWebCrawler(browser_config=browser_config) as crawler: + 
async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://example.com") ``` diff --git a/.local/llm.txt/4_browser_context_page.sm.md b/.local/llm.txt/4_browser_context_page.sm.md index 236ca2db..85479c77 100644 --- a/.local/llm.txt/4_browser_context_page.sm.md +++ b/.local/llm.txt/4_browser_context_page.sm.md @@ -20,21 +20,21 @@ from crawl4ai import AsyncWebCrawler, BrowserConfig cfg = BrowserConfig(browser_type="chromium", headless=True) -async with AsyncWebCrawler(browser_config=cfg) as c: +async with AsyncWebCrawler(config=cfg) as c: r = await c.arun("https://example.com") ``` ### Persistent Contexts ```python cfg = BrowserConfig(user_data_dir="/path/to/data") -async with AsyncWebCrawler(browser_config=cfg) as c: +async with AsyncWebCrawler(config=cfg) as c: r = await c.arun("https://example.com") ``` ### Managed Browser ```python cfg = BrowserConfig(headless=False, debug_port=9222, use_managed_browser=True) -async with AsyncWebCrawler(browser_config=cfg) as c: +async with AsyncWebCrawler(config=cfg) as c: r = await c.arun("https://example.com") ``` @@ -80,7 +80,7 @@ cfg = BrowserConfig( use_managed_browser=True, user_data_dir="/path/to/Profile" ) -async with AsyncWebCrawler(browser_config=cfg) as c: +async with AsyncWebCrawler(config=cfg) as c: r = await c.arun("https://example.com") ``` @@ -96,7 +96,7 @@ cfg = BrowserConfig( ) crawl_cfg = CrawlerRunConfig(extraction_strategy=JsonCssExtractionStrategy(schema)) -async with AsyncWebCrawler(browser_config=cfg) as c: +async with AsyncWebCrawler(config=cfg) as c: r = await c.arun("https://example.com", config=crawl_cfg) ``` diff --git a/.local/ttt/13_hooks_auth.md b/.local/ttt/13_hooks_auth.md index a8cd77b7..89258550 100644 --- a/.local/ttt/13_hooks_auth.md +++ b/.local/ttt/13_hooks_auth.md @@ -82,7 +82,7 @@ async def main(): ) # Initialize crawler - async with AsyncWebCrawler(browser_config=browser_config) as crawler: + async with AsyncWebCrawler(config=browser_config) as 
crawler: crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created) crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created) crawler.crawler_strategy.set_hook("before_goto", before_goto) diff --git a/.local/ttt/3_async_webcrawler.ex.md b/.local/ttt/3_async_webcrawler.ex.md index 8f113e97..7d122786 100644 --- a/.local/ttt/3_async_webcrawler.ex.md +++ b/.local/ttt/3_async_webcrawler.ex.md @@ -19,7 +19,7 @@ async def main(): browser_config = BrowserConfig(browser_type="chromium", headless=True) # Run the crawler asynchronously - async with AsyncWebCrawler(browser_config=browser_config) as crawler: + async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://example.com") print("Extracted Markdown:") print(result.markdown) @@ -52,7 +52,7 @@ browser_config = BrowserConfig( verbose=True ) -async with AsyncWebCrawler(browser_config=browser_config) as crawler: +async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://yourwebsite.com") print(result.markdown) ``` diff --git a/.local/ttt/4_browser_context_page.ex.md b/.local/ttt/4_browser_context_page.ex.md index 6a5efb54..f241cd68 100644 --- a/.local/ttt/4_browser_context_page.ex.md +++ b/.local/ttt/4_browser_context_page.ex.md @@ -37,7 +37,7 @@ Standard browser creation initializes a browser instance with default or minimal from crawl4ai import AsyncWebCrawler, BrowserConfig browser_config = BrowserConfig(browser_type="chromium", headless=True) -async with AsyncWebCrawler(browser_config=browser_config) as crawler: +async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://crawl4ai.com") print(result.markdown) ``` @@ -57,7 +57,7 @@ Persistent contexts create browser sessions with stored data, enabling workflows ##### Example: Setting Up Persistent Contexts ```python config = BrowserConfig(user_data_dir="/path/to/user/data") -async with 
AsyncWebCrawler(browser_config=config) as crawler: +async with AsyncWebCrawler(config=config) as crawler: result = await crawler.arun("https://crawl4ai.com") print(result.markdown) ``` @@ -88,7 +88,7 @@ The `ManagedBrowser` class offers a high-level abstraction for managing browser from crawl4ai import AsyncWebCrawler, BrowserConfig config = BrowserConfig(headless=False, debug_port=9222) -async with AsyncWebCrawler(browser_config=config) as crawler: +async with AsyncWebCrawler(config=config) as crawler: result = await crawler.arun("https://crawl4ai.com") print(result.markdown) ``` @@ -192,7 +192,7 @@ I'll help create a section about using command-line Chrome with a user data dire user_data_dir="/path/to/ChromeProfiles/CrawlProfile" # Use the same directory from step 1 ) - async with AsyncWebCrawler(browser_config=browser_config) as crawler: + async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://example.com") ``` diff --git a/.local/ttt/context.md b/.local/ttt/context.md index c6cf9f29..09f5edfb 100644 --- a/.local/ttt/context.md +++ b/.local/ttt/context.md @@ -568,7 +568,7 @@ async def main(): ) # Initialize crawler - async with AsyncWebCrawler(browser_config=browser_config) as crawler: + async with AsyncWebCrawler(config=browser_config) as crawler: crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created) crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created) crawler.crawler_strategy.set_hook("before_goto", before_goto) @@ -1627,7 +1627,7 @@ async def main(): browser_config = BrowserConfig(browser_type="chromium", headless=True) # Run the crawler asynchronously - async with AsyncWebCrawler(browser_config=browser_config) as crawler: + async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://example.com") print("Extracted Markdown:") print(result.markdown) @@ -1660,7 +1660,7 @@ browser_config = BrowserConfig( verbose=True ) -async 
with AsyncWebCrawler(browser_config=browser_config) as crawler: +async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://yourwebsite.com") print(result.markdown) ``` @@ -1927,7 +1927,7 @@ Standard browser creation initializes a browser instance with default or minimal from crawl4ai import AsyncWebCrawler, BrowserConfig browser_config = BrowserConfig(browser_type="chromium", headless=True) -async with AsyncWebCrawler(browser_config=browser_config) as crawler: +async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://crawl4ai.com") print(result.markdown) ``` @@ -1947,7 +1947,7 @@ Persistent contexts create browser sessions with stored data, enabling workflows ##### Example: Setting Up Persistent Contexts ```python config = BrowserConfig(user_data_dir="/path/to/user/data") -async with AsyncWebCrawler(browser_config=config) as crawler: +async with AsyncWebCrawler(config=config) as crawler: result = await crawler.arun("https://crawl4ai.com") print(result.markdown) ``` @@ -1978,7 +1978,7 @@ The `ManagedBrowser` class offers a high-level abstraction for managing browser from crawl4ai import AsyncWebCrawler, BrowserConfig config = BrowserConfig(headless=False, debug_port=9222) -async with AsyncWebCrawler(browser_config=config) as crawler: +async with AsyncWebCrawler(config=config) as crawler: result = await crawler.arun("https://crawl4ai.com") print(result.markdown) ``` @@ -2082,7 +2082,7 @@ I'll help create a section about using command-line Chrome with a user data dire user_data_dir="/path/to/ChromeProfiles/CrawlProfile" # Use the same directory from step 1 ) - async with AsyncWebCrawler(browser_config=browser_config) as crawler: + async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://example.com") ``` diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index a1419ece..045fef64 100644 --- 
a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -928,7 +928,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Handle page navigation and content loading if not config.js_only: - await self.execute_hook("before_goto", page, context=context) + await self.execute_hook("before_goto", page, context=context, url=url) try: response = await page.goto( @@ -937,7 +937,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): except Error as e: raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}") - await self.execute_hook("after_goto", page, context=context) + await self.execute_hook("after_goto", page, context=context, url=url, response=response) if response is None: status_code = 200 @@ -1102,7 +1102,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Get final HTML content html = await page.content() - await self.execute_hook("before_return_html", page, html, context=context) + await self.execute_hook("before_return_html", page = page, html = html, context=context) # Handle PDF and screenshot generation start_export_time = time.perf_counter() diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 8cab693b..dde6c2ce 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -68,7 +68,7 @@ class AsyncWebCrawler: New way (recommended): browser_config = BrowserConfig(browser_type="chromium", headless=True) - crawler = AsyncWebCrawler(browser_config=browser_config) + crawler = AsyncWebCrawler(config=browser_config) """ _domain_last_hit = {} @@ -117,12 +117,19 @@ class AsyncWebCrawler: # Initialize crawler strategy + params = { + k:v for k, v in kwargs.items() if k in ['browser_congig', 'logger'] + } self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy( browser_config=browser_config, logger=self.logger, - **kwargs # Pass remaining kwargs for backwards compatibility + **params # Forward only the whitelisted kwargs built above. NOTE(review): 'browser_congig' looks like a typo for 'browser_config', and both whitelisted names are also passed explicitly here - confirm intent ) + 
# If the crawler strategy doesn't have a logger, use the crawler's logger + if not self.crawler_strategy.logger: + self.crawler_strategy.logger = self.logger + # Handle deprecated cache parameter if always_by_pass_cache is not None: if kwargs.get("warning", True): diff --git a/docs/examples/hooks_example.py b/docs/examples/hooks_example.py new file mode 100644 index 00000000..09e0bc17 --- /dev/null +++ b/docs/examples/hooks_example.py @@ -0,0 +1,107 @@ +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode +from playwright.async_api import BrowserContext, Page, Response + +async def main(): + print("🔗 Hooks Example: Demonstrating different hook use cases") + + # Configure browser settings + browser_config = BrowserConfig( + headless=True + ) + + # Configure crawler settings + crawler_run_config = CrawlerRunConfig( + js_code="window.scrollTo(0, document.body.scrollHeight);", + wait_for="body", + cache_mode=CacheMode.BYPASS + ) + + # Create crawler instance + crawler = AsyncWebCrawler(config=browser_config) + + # Define and set hook functions + async def on_browser_created(browser, context: BrowserContext, **kwargs): + """Hook called after the browser is created""" + print("[HOOK] on_browser_created - Browser is ready!") + # Example: Browser-wide setup would go here (this demo changes nothing) + return browser + + async def on_page_context_created(page: Page, context: BrowserContext, **kwargs): + """Hook called after a new page and context are created""" + print("[HOOK] on_page_context_created - New page created!") + # Example: Add a session cookie, then set the default viewport size + await context.add_cookies([{ + 'name': 'session_id', + 'value': 'example_session', + 'domain': '.example.com', + 'path': '/' + }]) + await page.set_viewport_size({"width": 1920, "height": 1080}) + return page + + async def on_user_agent_updated(page: Page, context: BrowserContext, user_agent: str, **kwargs): + """Hook called when the user agent is updated""" + print(f"[HOOK] on_user_agent_updated - New user agent: 
{user_agent}") + return page + + async def on_execution_started(page: Page, context: BrowserContext, **kwargs): + """Hook called after custom JavaScript execution""" + print("[HOOK] on_execution_started - Custom JS executed!") + return page + + async def before_goto(page: Page, context: BrowserContext, url: str, **kwargs): + """Hook called before navigating to each URL""" + print(f"[HOOK] before_goto - About to visit: {url}") + # Example: Add custom headers for the request + await page.set_extra_http_headers({ + "Custom-Header": "my-value" + }) + return page + + async def after_goto(page: Page, context: BrowserContext, url: str, response: "Response | None", **kwargs): + """Hook called after navigating to each URL""" + print(f"[HOOK] after_goto - Successfully loaded: {url}") + # Example: Wait for a specific element to be loaded (best effort) + try: + await page.wait_for_selector('.content', timeout=1000) + print("Content element found!") + except Exception: + print("Content element not found, continuing anyway") + return page + + async def before_retrieve_html(page: Page, context: BrowserContext, **kwargs): + """Hook called before retrieving the HTML content""" + print("[HOOK] before_retrieve_html - About to get HTML content") + # Example: Scroll to bottom to trigger lazy loading + await page.evaluate("window.scrollTo(0, document.body.scrollHeight);") + return page + + async def before_return_html(page: Page, context: BrowserContext, html: str, **kwargs): + """Hook called before returning the HTML content""" + print(f"[HOOK] before_return_html - Got HTML content (length: {len(html)})") + # Example: You could modify the HTML content here if needed + return page + + # Set all the hooks + crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created) + crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created) + crawler.crawler_strategy.set_hook("on_user_agent_updated", on_user_agent_updated) + crawler.crawler_strategy.set_hook("on_execution_started", 
on_execution_started) + crawler.crawler_strategy.set_hook("before_goto", before_goto) + crawler.crawler_strategy.set_hook("after_goto", after_goto) + crawler.crawler_strategy.set_hook("before_retrieve_html", before_retrieve_html) + crawler.crawler_strategy.set_hook("before_return_html", before_return_html) + + await crawler.start() + + # Example usage: crawl a simple website + url = 'https://example.com' + result = await crawler.arun(url, config=crawler_run_config) + print(f"\nCrawled URL: {result.url}") + print(f"HTML length: {len(result.html)}") + + await crawler.close() + +if __name__ == "__main__": + import asyncio + asyncio.run(main()) \ No newline at end of file diff --git a/docs/llm.txt/13_hooks_auth.md b/docs/llm.txt/13_hooks_auth.md index a8cd77b7..89258550 100644 --- a/docs/llm.txt/13_hooks_auth.md +++ b/docs/llm.txt/13_hooks_auth.md @@ -82,7 +82,7 @@ async def main(): ) # Initialize crawler - async with AsyncWebCrawler(browser_config=browser_config) as crawler: + async with AsyncWebCrawler(config=browser_config) as crawler: crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created) crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created) crawler.crawler_strategy.set_hook("before_goto", before_goto) diff --git a/docs/llm.txt/1_introduction.md b/docs/llm.txt/1_introduction.md index b2231c71..c83c2495 100644 --- a/docs/llm.txt/1_introduction.md +++ b/docs/llm.txt/1_introduction.md @@ -125,7 +125,7 @@ run_config = CrawlerRunConfig( ## 4. 
Basic Crawling & Simple Extraction ```python -async with AsyncWebCrawler(browser_config=browser_config) as crawler: +async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://news.example.com/article", config=run_config) print(result.markdown) # Basic markdown content ``` @@ -375,7 +375,7 @@ async def on_page_context_created_hook(context, page, **kwargs): await context.route("**/*.{png,jpg,jpeg}", lambda route: route.abort()) print("[HOOK] Image requests blocked") -async with AsyncWebCrawler(browser_config=browser_config) as crawler: +async with AsyncWebCrawler(config=browser_config) as crawler: crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created_hook) result = await crawler.arun("https://imageheavy.example.com", config=run_config) print("Crawl finished with images blocked.") diff --git a/docs/llm.txt/3_async_webcrawler.md b/docs/llm.txt/3_async_webcrawler.md index 8f113e97..7d122786 100644 --- a/docs/llm.txt/3_async_webcrawler.md +++ b/docs/llm.txt/3_async_webcrawler.md @@ -19,7 +19,7 @@ async def main(): browser_config = BrowserConfig(browser_type="chromium", headless=True) # Run the crawler asynchronously - async with AsyncWebCrawler(browser_config=browser_config) as crawler: + async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://example.com") print("Extracted Markdown:") print(result.markdown) @@ -52,7 +52,7 @@ browser_config = BrowserConfig( verbose=True ) -async with AsyncWebCrawler(browser_config=browser_config) as crawler: +async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://yourwebsite.com") print(result.markdown) ``` diff --git a/docs/llm.txt/3_async_webcrawler.q.md b/docs/llm.txt/3_async_webcrawler.q.md index e3993566..eea7d7b5 100644 --- a/docs/llm.txt/3_async_webcrawler.q.md +++ b/docs/llm.txt/3_async_webcrawler.q.md @@ -1,4 +1,4 @@ -quick_start: Basic async crawl setup requires 
BrowserConfig and AsyncWebCrawler initialization | getting started, basic usage, initialization | asyncio.run(AsyncWebCrawler(browser_config=BrowserConfig(browser_type="chromium", headless=True))) +quick_start: Basic async crawl setup requires BrowserConfig and AsyncWebCrawler initialization | getting started, basic usage, initialization | asyncio.run(AsyncWebCrawler(config=BrowserConfig(browser_type="chromium", headless=True))) browser_types: AsyncWebCrawler supports multiple browser types including Chromium and Firefox | supported browsers, browser options | BrowserConfig(browser_type="chromium") headless_mode: Browser can run in headless mode without UI for better performance | invisible browser, no GUI | BrowserConfig(headless=True) viewport_settings: Configure browser viewport dimensions for proper page rendering | screen size, window size | BrowserConfig(viewport_width=1920, viewport_height=1080) diff --git a/docs/llm.txt/3_async_webcrawler.xs.md b/docs/llm.txt/3_async_webcrawler.xs.md index ce6259b7..ac849a2d 100644 --- a/docs/llm.txt/3_async_webcrawler.xs.md +++ b/docs/llm.txt/3_async_webcrawler.xs.md @@ -10,7 +10,7 @@ from crawl4ai import AsyncWebCrawler, BrowserConfig import asyncio async def main(): - async with AsyncWebCrawler(browser_config=BrowserConfig(browser_type="chromium", headless=True)) as c: + async with AsyncWebCrawler(config=BrowserConfig(browser_type="chromium", headless=True)) as c: r = await c.arun("https://example.com") print(r.markdown) @@ -21,7 +21,7 @@ asyncio.run(main()) **Params:** `browser_type`, `headless`, `viewport_width`, `viewport_height`, `verbose`, `proxy`. 
```python browser_config = BrowserConfig(browser_type="firefox", headless=False) -async with AsyncWebCrawler(browser_config=browser_config) as c: +async with AsyncWebCrawler(config=browser_config) as c: r = await c.arun("https://site.com") ``` diff --git a/docs/llm.txt/3_async_webcrawler.xs.q.md b/docs/llm.txt/3_async_webcrawler.xs.q.md index c037271b..a22d5a1c 100644 --- a/docs/llm.txt/3_async_webcrawler.xs.q.md +++ b/docs/llm.txt/3_async_webcrawler.xs.q.md @@ -1,4 +1,4 @@ -setup_usage: Initialize AsyncWebCrawler with BrowserConfig for basic web crawling | crawler setup, initialization, basic usage | AsyncWebCrawler(browser_config=BrowserConfig(browser_type="chromium", headless=True)) +setup_usage: Initialize AsyncWebCrawler with BrowserConfig for basic web crawling | crawler setup, initialization, basic usage | AsyncWebCrawler(config=BrowserConfig(browser_type="chromium", headless=True)) browser_configuration: Configure browser settings including type, headless mode, viewport, and proxy | browser setup, browser settings, viewport config | BrowserConfig(browser_type="firefox", headless=False, viewport_width=1920) docker_setup: Run crawler in Docker using python slim image with playwright installation | docker configuration, containerization | FROM python:3.10-slim; RUN pip install crawl4ai playwright crawler_strategy: Use AsyncPlaywrightCrawlerStrategy as default crawler implementation | crawler implementation, strategy pattern | AsyncWebCrawler(crawler_strategy=AsyncPlaywrightCrawlerStrategy()) diff --git a/docs/llm.txt/4_browser_context_page.md b/docs/llm.txt/4_browser_context_page.md index 6a5efb54..f241cd68 100644 --- a/docs/llm.txt/4_browser_context_page.md +++ b/docs/llm.txt/4_browser_context_page.md @@ -37,7 +37,7 @@ Standard browser creation initializes a browser instance with default or minimal from crawl4ai import AsyncWebCrawler, BrowserConfig browser_config = BrowserConfig(browser_type="chromium", headless=True) -async with 
AsyncWebCrawler(browser_config=browser_config) as crawler: +async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://crawl4ai.com") print(result.markdown) ``` @@ -57,7 +57,7 @@ Persistent contexts create browser sessions with stored data, enabling workflows ##### Example: Setting Up Persistent Contexts ```python config = BrowserConfig(user_data_dir="/path/to/user/data") -async with AsyncWebCrawler(browser_config=config) as crawler: +async with AsyncWebCrawler(config=config) as crawler: result = await crawler.arun("https://crawl4ai.com") print(result.markdown) ``` @@ -88,7 +88,7 @@ The `ManagedBrowser` class offers a high-level abstraction for managing browser from crawl4ai import AsyncWebCrawler, BrowserConfig config = BrowserConfig(headless=False, debug_port=9222) -async with AsyncWebCrawler(browser_config=config) as crawler: +async with AsyncWebCrawler(config=config) as crawler: result = await crawler.arun("https://crawl4ai.com") print(result.markdown) ``` @@ -192,7 +192,7 @@ I'll help create a section about using command-line Chrome with a user data dire user_data_dir="/path/to/ChromeProfiles/CrawlProfile" # Use the same directory from step 1 ) - async with AsyncWebCrawler(browser_config=browser_config) as crawler: + async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://example.com") ``` diff --git a/docs/llm.txt/4_browser_context_page.q.md b/docs/llm.txt/4_browser_context_page.q.md index dd446986..94368feb 100644 --- a/docs/llm.txt/4_browser_context_page.q.md +++ b/docs/llm.txt/4_browser_context_page.q.md @@ -1,4 +1,4 @@ -browser_creation: Create standard browser instance with default configurations | browser initialization, basic setup, minimal config | AsyncWebCrawler(browser_config=BrowserConfig(browser_type="chromium", headless=True)) +browser_creation: Create standard browser instance with default configurations | browser initialization, basic setup, minimal config | 
AsyncWebCrawler(config=BrowserConfig(browser_type="chromium", headless=True)) persistent_context: Use persistent browser contexts to maintain session data and cookies | user_data_dir, session storage, login state | BrowserConfig(user_data_dir="/path/to/user/data") managed_browser: High-level browser management with resource optimization and debugging | browser process, stealth mode, debugging tools | BrowserConfig(headless=False, debug_port=9222) context_config: Configure browser context with custom headers and cookies | headers customization, session reuse | CrawlerRunConfig(headers={"User-Agent": "Crawl4AI/1.0"}) diff --git a/docs/llm.txt/4_browser_context_page.xs.md b/docs/llm.txt/4_browser_context_page.xs.md index 236ca2db..85479c77 100644 --- a/docs/llm.txt/4_browser_context_page.xs.md +++ b/docs/llm.txt/4_browser_context_page.xs.md @@ -20,21 +20,21 @@ from crawl4ai import AsyncWebCrawler, BrowserConfig cfg = BrowserConfig(browser_type="chromium", headless=True) -async with AsyncWebCrawler(browser_config=cfg) as c: +async with AsyncWebCrawler(config=cfg) as c: r = await c.arun("https://example.com") ``` ### Persistent Contexts ```python cfg = BrowserConfig(user_data_dir="/path/to/data") -async with AsyncWebCrawler(browser_config=cfg) as c: +async with AsyncWebCrawler(config=cfg) as c: r = await c.arun("https://example.com") ``` ### Managed Browser ```python cfg = BrowserConfig(headless=False, debug_port=9222, use_managed_browser=True) -async with AsyncWebCrawler(browser_config=cfg) as c: +async with AsyncWebCrawler(config=cfg) as c: r = await c.arun("https://example.com") ``` @@ -80,7 +80,7 @@ cfg = BrowserConfig( use_managed_browser=True, user_data_dir="/path/to/Profile" ) -async with AsyncWebCrawler(browser_config=cfg) as c: +async with AsyncWebCrawler(config=cfg) as c: r = await c.arun("https://example.com") ``` @@ -96,7 +96,7 @@ cfg = BrowserConfig( ) crawl_cfg = CrawlerRunConfig(extraction_strategy=JsonCssExtractionStrategy(schema)) -async with 
AsyncWebCrawler(browser_config=cfg) as c: +async with AsyncWebCrawler(config=cfg) as c: r = await c.arun("https://example.com", config=crawl_cfg) ``` diff --git a/docs/md_v2/advanced/hooks-auth.md b/docs/md_v2/advanced/hooks-auth.md index 3d1e1930..66042229 100644 --- a/docs/md_v2/advanced/hooks-auth.md +++ b/docs/md_v2/advanced/hooks-auth.md @@ -84,7 +84,7 @@ async def main(): ) # Initialize crawler - async with AsyncWebCrawler(browser_config=browser_config) as crawler: + async with AsyncWebCrawler(config=browser_config) as crawler: crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created) crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created) crawler.crawler_strategy.set_hook("before_goto", before_goto) diff --git a/docs/md_v2/advanced/managed_browser.md b/docs/md_v2/advanced/managed_browser.md index 5d1a5e3f..bbe07f2f 100644 --- a/docs/md_v2/advanced/managed_browser.md +++ b/docs/md_v2/advanced/managed_browser.md @@ -37,7 +37,7 @@ Standard browser creation initializes a browser instance with default or minimal from crawl4ai import AsyncWebCrawler, BrowserConfig browser_config = BrowserConfig(browser_type="chromium", headless=True) -async with AsyncWebCrawler(browser_config=browser_config) as crawler: +async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://crawl4ai.com") print(result.markdown) ``` @@ -57,7 +57,7 @@ Persistent contexts create browser sessions with stored data, enabling workflows #### Example: Setting Up Persistent Contexts ```python config = BrowserConfig(user_data_dir="/path/to/user/data") -async with AsyncWebCrawler(browser_config=config) as crawler: +async with AsyncWebCrawler(config=config) as crawler: result = await crawler.arun("https://crawl4ai.com") print(result.markdown) ``` @@ -87,7 +87,7 @@ The `ManagedBrowser` class offers a high-level abstraction for managing browser from crawl4ai import AsyncWebCrawler, BrowserConfig config = 
BrowserConfig(headless=False, debug_port=9222) -async with AsyncWebCrawler(browser_config=config) as crawler: +async with AsyncWebCrawler(config=config) as crawler: result = await crawler.arun("https://crawl4ai.com") print(result.markdown) ``` @@ -147,7 +147,7 @@ Remote debugging provides a powerful way to troubleshoot complex crawling workfl #### Example: Enabling Remote Debugging ```python config = BrowserConfig(debug_port=9222) -async with AsyncWebCrawler(browser_config=config) as crawler: +async with AsyncWebCrawler(config=config) as crawler: result = await crawler.arun("https://crawl4ai.com") ``` diff --git a/docs/md_v2/basic/simple-crawling.md b/docs/md_v2/basic/simple-crawling.md index 4115a7fb..ec63984c 100644 --- a/docs/md_v2/basic/simple-crawling.md +++ b/docs/md_v2/basic/simple-crawling.md @@ -15,7 +15,7 @@ async def main(): browser_config = BrowserConfig() # Default browser configuration run_config = CrawlerRunConfig() # Default crawl run configuration - async with AsyncWebCrawler(browser_config=browser_config) as crawler: + async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun( url="https://example.com", config=run_config @@ -89,7 +89,7 @@ Enable verbose logging in `BrowserConfig`: ```python browser_config = BrowserConfig(verbose=True) -async with AsyncWebCrawler(browser_config=browser_config) as crawler: +async with AsyncWebCrawler(config=browser_config) as crawler: run_config = CrawlerRunConfig() result = await crawler.arun(url="https://example.com", config=run_config) ``` @@ -119,7 +119,7 @@ async def main(): cache_mode=CacheMode.ENABLED # Use cache if available ) - async with AsyncWebCrawler(browser_config=browser_config) as crawler: + async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun( url="https://example.com", config=run_config