diff --git a/.local/issues_todo.md b/.local/issues_todo.md new file mode 100644 index 00000000..61bdc855 --- /dev/null +++ b/.local/issues_todo.md @@ -0,0 +1 @@ +Docker: https://github.com/unclecode/crawl4ai/issues/367 \ No newline at end of file diff --git a/.local/llm.txt/13_hooks_auth.md b/.local/llm.txt/13_hooks_auth.md index a8cd77b7..89258550 100644 --- a/.local/llm.txt/13_hooks_auth.md +++ b/.local/llm.txt/13_hooks_auth.md @@ -82,7 +82,7 @@ async def main(): ) # Initialize crawler - async with AsyncWebCrawler(browser_config=browser_config) as crawler: + async with AsyncWebCrawler(config=browser_config) as crawler: crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created) crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created) crawler.crawler_strategy.set_hook("before_goto", before_goto) diff --git a/.local/llm.txt/1_introduction.ex.md b/.local/llm.txt/1_introduction.ex.md index b2231c71..c83c2495 100644 --- a/.local/llm.txt/1_introduction.ex.md +++ b/.local/llm.txt/1_introduction.ex.md @@ -125,7 +125,7 @@ run_config = CrawlerRunConfig( ## 4. 
Basic Crawling & Simple Extraction ```python -async with AsyncWebCrawler(browser_config=browser_config) as crawler: +async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://news.example.com/article", config=run_config) print(result.markdown) # Basic markdown content ``` @@ -375,7 +375,7 @@ async def on_page_context_created_hook(context, page, **kwargs): await context.route("**/*.{png,jpg,jpeg}", lambda route: route.abort()) print("[HOOK] Image requests blocked") -async with AsyncWebCrawler(browser_config=browser_config) as crawler: +async with AsyncWebCrawler(config=browser_config) as crawler: crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created_hook) result = await crawler.arun("https://imageheavy.example.com", config=run_config) print("Crawl finished with images blocked.") diff --git a/.local/llm.txt/3_async_webcrawler.ex.md b/.local/llm.txt/3_async_webcrawler.ex.md index 8f113e97..7d122786 100644 --- a/.local/llm.txt/3_async_webcrawler.ex.md +++ b/.local/llm.txt/3_async_webcrawler.ex.md @@ -19,7 +19,7 @@ async def main(): browser_config = BrowserConfig(browser_type="chromium", headless=True) # Run the crawler asynchronously - async with AsyncWebCrawler(browser_config=browser_config) as crawler: + async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://example.com") print("Extracted Markdown:") print(result.markdown) @@ -52,7 +52,7 @@ browser_config = BrowserConfig( verbose=True ) -async with AsyncWebCrawler(browser_config=browser_config) as crawler: +async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://yourwebsite.com") print(result.markdown) ``` diff --git a/.local/llm.txt/3_async_webcrawler.xs.md b/.local/llm.txt/3_async_webcrawler.xs.md index ce6259b7..ac849a2d 100644 --- a/.local/llm.txt/3_async_webcrawler.xs.md +++ b/.local/llm.txt/3_async_webcrawler.xs.md @@ -10,7 +10,7 @@ from crawl4ai import 
AsyncWebCrawler, BrowserConfig import asyncio async def main(): - async with AsyncWebCrawler(browser_config=BrowserConfig(browser_type="chromium", headless=True)) as c: + async with AsyncWebCrawler(config=BrowserConfig(browser_type="chromium", headless=True)) as c: r = await c.arun("https://example.com") print(r.markdown) @@ -21,7 +21,7 @@ asyncio.run(main()) **Params:** `browser_type`, `headless`, `viewport_width`, `viewport_height`, `verbose`, `proxy`. ```python browser_config = BrowserConfig(browser_type="firefox", headless=False) -async with AsyncWebCrawler(browser_config=browser_config) as c: +async with AsyncWebCrawler(config=browser_config) as c: r = await c.arun("https://site.com") ``` diff --git a/.local/llm.txt/4_browser_context_page.ex.md b/.local/llm.txt/4_browser_context_page.ex.md index 6a5efb54..f241cd68 100644 --- a/.local/llm.txt/4_browser_context_page.ex.md +++ b/.local/llm.txt/4_browser_context_page.ex.md @@ -37,7 +37,7 @@ Standard browser creation initializes a browser instance with default or minimal from crawl4ai import AsyncWebCrawler, BrowserConfig browser_config = BrowserConfig(browser_type="chromium", headless=True) -async with AsyncWebCrawler(browser_config=browser_config) as crawler: +async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://crawl4ai.com") print(result.markdown) ``` @@ -57,7 +57,7 @@ Persistent contexts create browser sessions with stored data, enabling workflows ##### Example: Setting Up Persistent Contexts ```python config = BrowserConfig(user_data_dir="/path/to/user/data") -async with AsyncWebCrawler(browser_config=config) as crawler: +async with AsyncWebCrawler(config=config) as crawler: result = await crawler.arun("https://crawl4ai.com") print(result.markdown) ``` @@ -88,7 +88,7 @@ The `ManagedBrowser` class offers a high-level abstraction for managing browser from crawl4ai import AsyncWebCrawler, BrowserConfig config = BrowserConfig(headless=False, debug_port=9222) -async 
with AsyncWebCrawler(browser_config=config) as crawler: +async with AsyncWebCrawler(config=config) as crawler: result = await crawler.arun("https://crawl4ai.com") print(result.markdown) ``` @@ -192,7 +192,7 @@ I'll help create a section about using command-line Chrome with a user data dire user_data_dir="/path/to/ChromeProfiles/CrawlProfile" # Use the same directory from step 1 ) - async with AsyncWebCrawler(browser_config=browser_config) as crawler: + async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://example.com") ``` diff --git a/.local/llm.txt/4_browser_context_page.sm.md b/.local/llm.txt/4_browser_context_page.sm.md index 236ca2db..85479c77 100644 --- a/.local/llm.txt/4_browser_context_page.sm.md +++ b/.local/llm.txt/4_browser_context_page.sm.md @@ -20,21 +20,21 @@ from crawl4ai import AsyncWebCrawler, BrowserConfig cfg = BrowserConfig(browser_type="chromium", headless=True) -async with AsyncWebCrawler(browser_config=cfg) as c: +async with AsyncWebCrawler(config=cfg) as c: r = await c.arun("https://example.com") ``` ### Persistent Contexts ```python cfg = BrowserConfig(user_data_dir="/path/to/data") -async with AsyncWebCrawler(browser_config=cfg) as c: +async with AsyncWebCrawler(config=cfg) as c: r = await c.arun("https://example.com") ``` ### Managed Browser ```python cfg = BrowserConfig(headless=False, debug_port=9222, use_managed_browser=True) -async with AsyncWebCrawler(browser_config=cfg) as c: +async with AsyncWebCrawler(config=cfg) as c: r = await c.arun("https://example.com") ``` @@ -80,7 +80,7 @@ cfg = BrowserConfig( use_managed_browser=True, user_data_dir="/path/to/Profile" ) -async with AsyncWebCrawler(browser_config=cfg) as c: +async with AsyncWebCrawler(config=cfg) as c: r = await c.arun("https://example.com") ``` @@ -96,7 +96,7 @@ cfg = BrowserConfig( ) crawl_cfg = CrawlerRunConfig(extraction_strategy=JsonCssExtractionStrategy(schema)) -async with AsyncWebCrawler(browser_config=cfg) as c: +async 
with AsyncWebCrawler(config=cfg) as c: r = await c.arun("https://example.com", config=crawl_cfg) ``` diff --git a/.local/ttt/13_hooks_auth.md b/.local/ttt/13_hooks_auth.md index a8cd77b7..89258550 100644 --- a/.local/ttt/13_hooks_auth.md +++ b/.local/ttt/13_hooks_auth.md @@ -82,7 +82,7 @@ async def main(): ) # Initialize crawler - async with AsyncWebCrawler(browser_config=browser_config) as crawler: + async with AsyncWebCrawler(config=browser_config) as crawler: crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created) crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created) crawler.crawler_strategy.set_hook("before_goto", before_goto) diff --git a/.local/ttt/3_async_webcrawler.ex.md b/.local/ttt/3_async_webcrawler.ex.md index 8f113e97..7d122786 100644 --- a/.local/ttt/3_async_webcrawler.ex.md +++ b/.local/ttt/3_async_webcrawler.ex.md @@ -19,7 +19,7 @@ async def main(): browser_config = BrowserConfig(browser_type="chromium", headless=True) # Run the crawler asynchronously - async with AsyncWebCrawler(browser_config=browser_config) as crawler: + async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://example.com") print("Extracted Markdown:") print(result.markdown) @@ -52,7 +52,7 @@ browser_config = BrowserConfig( verbose=True ) -async with AsyncWebCrawler(browser_config=browser_config) as crawler: +async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://yourwebsite.com") print(result.markdown) ``` diff --git a/.local/ttt/4_browser_context_page.ex.md b/.local/ttt/4_browser_context_page.ex.md index 6a5efb54..f241cd68 100644 --- a/.local/ttt/4_browser_context_page.ex.md +++ b/.local/ttt/4_browser_context_page.ex.md @@ -37,7 +37,7 @@ Standard browser creation initializes a browser instance with default or minimal from crawl4ai import AsyncWebCrawler, BrowserConfig browser_config = BrowserConfig(browser_type="chromium", headless=True) 
-async with AsyncWebCrawler(browser_config=browser_config) as crawler: +async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://crawl4ai.com") print(result.markdown) ``` @@ -57,7 +57,7 @@ Persistent contexts create browser sessions with stored data, enabling workflows ##### Example: Setting Up Persistent Contexts ```python config = BrowserConfig(user_data_dir="/path/to/user/data") -async with AsyncWebCrawler(browser_config=config) as crawler: +async with AsyncWebCrawler(config=config) as crawler: result = await crawler.arun("https://crawl4ai.com") print(result.markdown) ``` @@ -88,7 +88,7 @@ The `ManagedBrowser` class offers a high-level abstraction for managing browser from crawl4ai import AsyncWebCrawler, BrowserConfig config = BrowserConfig(headless=False, debug_port=9222) -async with AsyncWebCrawler(browser_config=config) as crawler: +async with AsyncWebCrawler(config=config) as crawler: result = await crawler.arun("https://crawl4ai.com") print(result.markdown) ``` @@ -192,7 +192,7 @@ I'll help create a section about using command-line Chrome with a user data dire user_data_dir="/path/to/ChromeProfiles/CrawlProfile" # Use the same directory from step 1 ) - async with AsyncWebCrawler(browser_config=browser_config) as crawler: + async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://example.com") ``` diff --git a/.local/ttt/context.md b/.local/ttt/context.md index c6cf9f29..09f5edfb 100644 --- a/.local/ttt/context.md +++ b/.local/ttt/context.md @@ -568,7 +568,7 @@ async def main(): ) # Initialize crawler - async with AsyncWebCrawler(browser_config=browser_config) as crawler: + async with AsyncWebCrawler(config=browser_config) as crawler: crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created) crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created) crawler.crawler_strategy.set_hook("before_goto", before_goto) @@ -1627,7 +1627,7 
@@ async def main(): browser_config = BrowserConfig(browser_type="chromium", headless=True) # Run the crawler asynchronously - async with AsyncWebCrawler(browser_config=browser_config) as crawler: + async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://example.com") print("Extracted Markdown:") print(result.markdown) @@ -1660,7 +1660,7 @@ browser_config = BrowserConfig( verbose=True ) -async with AsyncWebCrawler(browser_config=browser_config) as crawler: +async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://yourwebsite.com") print(result.markdown) ``` @@ -1927,7 +1927,7 @@ Standard browser creation initializes a browser instance with default or minimal from crawl4ai import AsyncWebCrawler, BrowserConfig browser_config = BrowserConfig(browser_type="chromium", headless=True) -async with AsyncWebCrawler(browser_config=browser_config) as crawler: +async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://crawl4ai.com") print(result.markdown) ``` @@ -1947,7 +1947,7 @@ Persistent contexts create browser sessions with stored data, enabling workflows ##### Example: Setting Up Persistent Contexts ```python config = BrowserConfig(user_data_dir="/path/to/user/data") -async with AsyncWebCrawler(browser_config=config) as crawler: +async with AsyncWebCrawler(config=config) as crawler: result = await crawler.arun("https://crawl4ai.com") print(result.markdown) ``` @@ -1978,7 +1978,7 @@ The `ManagedBrowser` class offers a high-level abstraction for managing browser from crawl4ai import AsyncWebCrawler, BrowserConfig config = BrowserConfig(headless=False, debug_port=9222) -async with AsyncWebCrawler(browser_config=config) as crawler: +async with AsyncWebCrawler(config=config) as crawler: result = await crawler.arun("https://crawl4ai.com") print(result.markdown) ``` @@ -2082,7 +2082,7 @@ I'll help create a section about using command-line Chrome with a 
user data dire user_data_dir="/path/to/ChromeProfiles/CrawlProfile" # Use the same directory from step 1 ) - async with AsyncWebCrawler(browser_config=browser_config) as crawler: + async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://example.com") ``` diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index a1419ece..045fef64 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -928,7 +928,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Handle page navigation and content loading if not config.js_only: - await self.execute_hook("before_goto", page, context=context) + await self.execute_hook("before_goto", page, context=context, url=url) try: response = await page.goto( @@ -937,7 +937,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): except Error as e: raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}") - await self.execute_hook("after_goto", page, context=context) + await self.execute_hook("after_goto", page, context=context, url=url, response=response) if response is None: status_code = 200 @@ -1102,7 +1102,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Get final HTML content html = await page.content() - await self.execute_hook("before_return_html", page, html, context=context) + await self.execute_hook("before_return_html", page = page, html = html, context=context) # Handle PDF and screenshot generation start_export_time = time.perf_counter() diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 8cab693b..dde6c2ce 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -68,7 +68,7 @@ class AsyncWebCrawler: New way (recommended): browser_config = BrowserConfig(browser_type="chromium", headless=True) - crawler = AsyncWebCrawler(browser_config=browser_config) + crawler = AsyncWebCrawler(config=browser_config) """ _domain_last_hit = {} 
@@ -117,12 +117,19 @@ class AsyncWebCrawler: # Initialize crawler strategy + params = { + k:v for k, v in kwargs.items() if k in ['browser_config', 'logger'] + } self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy( browser_config=browser_config, logger=self.logger, - **kwargs # Pass remaining kwargs for backwards compatibility + **params # Pass remaining kwargs for backwards compatibility ) + # If the crawler strategy doesn't have a logger, use the crawler's logger + if not self.crawler_strategy.logger: + self.crawler_strategy.logger = self.logger + # Handle deprecated cache parameter if always_by_pass_cache is not None: if kwargs.get("warning", True): diff --git a/docs/examples/hooks_example.py b/docs/examples/hooks_example.py new file mode 100644 index 00000000..09e0bc17 --- /dev/null +++ b/docs/examples/hooks_example.py @@ -0,0 +1,107 @@ +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode +from playwright.async_api import Page, BrowserContext + +async def main(): + print("🔗 Hooks Example: Demonstrating different hook use cases") + + # Configure browser settings + browser_config = BrowserConfig( + headless=True + ) + + # Configure crawler settings + crawler_run_config = CrawlerRunConfig( + js_code="window.scrollTo(0, document.body.scrollHeight);", + wait_for="body", + cache_mode=CacheMode.BYPASS + ) + + # Create crawler instance + crawler = AsyncWebCrawler(config=browser_config) + + # Define and set hook functions + async def on_browser_created(browser, context: BrowserContext, **kwargs): + """Hook called after the browser is created""" + print("[HOOK] on_browser_created - Browser is ready!") + # Example: Set a cookie that will be used for all requests + return browser + + async def on_page_context_created(page: Page, context: BrowserContext, **kwargs): + """Hook called after a new page and context are created""" + print("[HOOK] on_page_context_created - New page created!") + # Example: Set default viewport size + await 
context.add_cookies([{ + 'name': 'session_id', + 'value': 'example_session', + 'domain': '.example.com', + 'path': '/' + }]) + await page.set_viewport_size({"width": 1920, "height": 1080}) + return page + + async def on_user_agent_updated(page: Page, context: BrowserContext, user_agent: str, **kwargs): + """Hook called when the user agent is updated""" + print(f"[HOOK] on_user_agent_updated - New user agent: {user_agent}") + return page + + async def on_execution_started(page: Page, context: BrowserContext, **kwargs): + """Hook called after custom JavaScript execution""" + print("[HOOK] on_execution_started - Custom JS executed!") + return page + + async def before_goto(page: Page, context: BrowserContext, url: str, **kwargs): + """Hook called before navigating to each URL""" + print(f"[HOOK] before_goto - About to visit: {url}") + # Example: Add custom headers for the request + await page.set_extra_http_headers({ + "Custom-Header": "my-value" + }) + return page + + async def after_goto(page: Page, context: BrowserContext, url: str, response: dict, **kwargs): + """Hook called after navigating to each URL""" + print(f"[HOOK] after_goto - Successfully loaded: {url}") + # Example: Wait for a specific element to be loaded + try: + await page.wait_for_selector('.content', timeout=1000) + print("Content element found!") + except: + print("Content element not found, continuing anyway") + return page + + async def before_retrieve_html(page: Page, context: BrowserContext, **kwargs): + """Hook called before retrieving the HTML content""" + print("[HOOK] before_retrieve_html - About to get HTML content") + # Example: Scroll to bottom to trigger lazy loading + await page.evaluate("window.scrollTo(0, document.body.scrollHeight);") + return page + + async def before_return_html(page: Page, context: BrowserContext, html:str, **kwargs): + """Hook called before returning the HTML content""" + print(f"[HOOK] before_return_html - Got HTML content (length: {len(html)})") + # Example: 
You could modify the HTML content here if needed + return page + + # Set all the hooks + crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created) + crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created) + crawler.crawler_strategy.set_hook("on_user_agent_updated", on_user_agent_updated) + crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started) + crawler.crawler_strategy.set_hook("before_goto", before_goto) + crawler.crawler_strategy.set_hook("after_goto", after_goto) + crawler.crawler_strategy.set_hook("before_retrieve_html", before_retrieve_html) + crawler.crawler_strategy.set_hook("before_return_html", before_return_html) + + await crawler.start() + + # Example usage: crawl a simple website + url = 'https://example.com' + result = await crawler.arun(url, config=crawler_run_config) + print(f"\nCrawled URL: {result.url}") + print(f"HTML length: {len(result.html)}") + + await crawler.close() + +if __name__ == "__main__": + import asyncio + asyncio.run(main()) \ No newline at end of file diff --git a/docs/llm.txt/13_hooks_auth.md b/docs/llm.txt/13_hooks_auth.md index a8cd77b7..89258550 100644 --- a/docs/llm.txt/13_hooks_auth.md +++ b/docs/llm.txt/13_hooks_auth.md @@ -82,7 +82,7 @@ async def main(): ) # Initialize crawler - async with AsyncWebCrawler(browser_config=browser_config) as crawler: + async with AsyncWebCrawler(config=browser_config) as crawler: crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created) crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created) crawler.crawler_strategy.set_hook("before_goto", before_goto) diff --git a/docs/llm.txt/1_introduction.md b/docs/llm.txt/1_introduction.md index b2231c71..c83c2495 100644 --- a/docs/llm.txt/1_introduction.md +++ b/docs/llm.txt/1_introduction.md @@ -125,7 +125,7 @@ run_config = CrawlerRunConfig( ## 4. 
Basic Crawling & Simple Extraction ```python -async with AsyncWebCrawler(browser_config=browser_config) as crawler: +async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://news.example.com/article", config=run_config) print(result.markdown) # Basic markdown content ``` @@ -375,7 +375,7 @@ async def on_page_context_created_hook(context, page, **kwargs): await context.route("**/*.{png,jpg,jpeg}", lambda route: route.abort()) print("[HOOK] Image requests blocked") -async with AsyncWebCrawler(browser_config=browser_config) as crawler: +async with AsyncWebCrawler(config=browser_config) as crawler: crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created_hook) result = await crawler.arun("https://imageheavy.example.com", config=run_config) print("Crawl finished with images blocked.") diff --git a/docs/llm.txt/3_async_webcrawler.md b/docs/llm.txt/3_async_webcrawler.md index 8f113e97..7d122786 100644 --- a/docs/llm.txt/3_async_webcrawler.md +++ b/docs/llm.txt/3_async_webcrawler.md @@ -19,7 +19,7 @@ async def main(): browser_config = BrowserConfig(browser_type="chromium", headless=True) # Run the crawler asynchronously - async with AsyncWebCrawler(browser_config=browser_config) as crawler: + async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://example.com") print("Extracted Markdown:") print(result.markdown) @@ -52,7 +52,7 @@ browser_config = BrowserConfig( verbose=True ) -async with AsyncWebCrawler(browser_config=browser_config) as crawler: +async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://yourwebsite.com") print(result.markdown) ``` diff --git a/docs/llm.txt/3_async_webcrawler.q.md b/docs/llm.txt/3_async_webcrawler.q.md index e3993566..eea7d7b5 100644 --- a/docs/llm.txt/3_async_webcrawler.q.md +++ b/docs/llm.txt/3_async_webcrawler.q.md @@ -1,4 +1,4 @@ -quick_start: Basic async crawl setup requires 
BrowserConfig and AsyncWebCrawler initialization | getting started, basic usage, initialization | asyncio.run(AsyncWebCrawler(browser_config=BrowserConfig(browser_type="chromium", headless=True))) +quick_start: Basic async crawl setup requires BrowserConfig and AsyncWebCrawler initialization | getting started, basic usage, initialization | asyncio.run(AsyncWebCrawler(config=BrowserConfig(browser_type="chromium", headless=True))) browser_types: AsyncWebCrawler supports multiple browser types including Chromium and Firefox | supported browsers, browser options | BrowserConfig(browser_type="chromium") headless_mode: Browser can run in headless mode without UI for better performance | invisible browser, no GUI | BrowserConfig(headless=True) viewport_settings: Configure browser viewport dimensions for proper page rendering | screen size, window size | BrowserConfig(viewport_width=1920, viewport_height=1080) diff --git a/docs/llm.txt/3_async_webcrawler.xs.md b/docs/llm.txt/3_async_webcrawler.xs.md index ce6259b7..ac849a2d 100644 --- a/docs/llm.txt/3_async_webcrawler.xs.md +++ b/docs/llm.txt/3_async_webcrawler.xs.md @@ -10,7 +10,7 @@ from crawl4ai import AsyncWebCrawler, BrowserConfig import asyncio async def main(): - async with AsyncWebCrawler(browser_config=BrowserConfig(browser_type="chromium", headless=True)) as c: + async with AsyncWebCrawler(config=BrowserConfig(browser_type="chromium", headless=True)) as c: r = await c.arun("https://example.com") print(r.markdown) @@ -21,7 +21,7 @@ asyncio.run(main()) **Params:** `browser_type`, `headless`, `viewport_width`, `viewport_height`, `verbose`, `proxy`. 
```python browser_config = BrowserConfig(browser_type="firefox", headless=False) -async with AsyncWebCrawler(browser_config=browser_config) as c: +async with AsyncWebCrawler(config=browser_config) as c: r = await c.arun("https://site.com") ``` diff --git a/docs/llm.txt/3_async_webcrawler.xs.q.md b/docs/llm.txt/3_async_webcrawler.xs.q.md index c037271b..a22d5a1c 100644 --- a/docs/llm.txt/3_async_webcrawler.xs.q.md +++ b/docs/llm.txt/3_async_webcrawler.xs.q.md @@ -1,4 +1,4 @@ -setup_usage: Initialize AsyncWebCrawler with BrowserConfig for basic web crawling | crawler setup, initialization, basic usage | AsyncWebCrawler(browser_config=BrowserConfig(browser_type="chromium", headless=True)) +setup_usage: Initialize AsyncWebCrawler with BrowserConfig for basic web crawling | crawler setup, initialization, basic usage | AsyncWebCrawler(config=BrowserConfig(browser_type="chromium", headless=True)) browser_configuration: Configure browser settings including type, headless mode, viewport, and proxy | browser setup, browser settings, viewport config | BrowserConfig(browser_type="firefox", headless=False, viewport_width=1920) docker_setup: Run crawler in Docker using python slim image with playwright installation | docker configuration, containerization | FROM python:3.10-slim; RUN pip install crawl4ai playwright crawler_strategy: Use AsyncPlaywrightCrawlerStrategy as default crawler implementation | crawler implementation, strategy pattern | AsyncWebCrawler(crawler_strategy=AsyncPlaywrightCrawlerStrategy()) diff --git a/docs/llm.txt/4_browser_context_page.md b/docs/llm.txt/4_browser_context_page.md index 6a5efb54..f241cd68 100644 --- a/docs/llm.txt/4_browser_context_page.md +++ b/docs/llm.txt/4_browser_context_page.md @@ -37,7 +37,7 @@ Standard browser creation initializes a browser instance with default or minimal from crawl4ai import AsyncWebCrawler, BrowserConfig browser_config = BrowserConfig(browser_type="chromium", headless=True) -async with 
AsyncWebCrawler(browser_config=browser_config) as crawler: +async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://crawl4ai.com") print(result.markdown) ``` @@ -57,7 +57,7 @@ Persistent contexts create browser sessions with stored data, enabling workflows ##### Example: Setting Up Persistent Contexts ```python config = BrowserConfig(user_data_dir="/path/to/user/data") -async with AsyncWebCrawler(browser_config=config) as crawler: +async with AsyncWebCrawler(config=config) as crawler: result = await crawler.arun("https://crawl4ai.com") print(result.markdown) ``` @@ -88,7 +88,7 @@ The `ManagedBrowser` class offers a high-level abstraction for managing browser from crawl4ai import AsyncWebCrawler, BrowserConfig config = BrowserConfig(headless=False, debug_port=9222) -async with AsyncWebCrawler(browser_config=config) as crawler: +async with AsyncWebCrawler(config=config) as crawler: result = await crawler.arun("https://crawl4ai.com") print(result.markdown) ``` @@ -192,7 +192,7 @@ I'll help create a section about using command-line Chrome with a user data dire user_data_dir="/path/to/ChromeProfiles/CrawlProfile" # Use the same directory from step 1 ) - async with AsyncWebCrawler(browser_config=browser_config) as crawler: + async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://example.com") ``` diff --git a/docs/llm.txt/4_browser_context_page.q.md b/docs/llm.txt/4_browser_context_page.q.md index dd446986..94368feb 100644 --- a/docs/llm.txt/4_browser_context_page.q.md +++ b/docs/llm.txt/4_browser_context_page.q.md @@ -1,4 +1,4 @@ -browser_creation: Create standard browser instance with default configurations | browser initialization, basic setup, minimal config | AsyncWebCrawler(browser_config=BrowserConfig(browser_type="chromium", headless=True)) +browser_creation: Create standard browser instance with default configurations | browser initialization, basic setup, minimal config | 
AsyncWebCrawler(config=BrowserConfig(browser_type="chromium", headless=True)) persistent_context: Use persistent browser contexts to maintain session data and cookies | user_data_dir, session storage, login state | BrowserConfig(user_data_dir="/path/to/user/data") managed_browser: High-level browser management with resource optimization and debugging | browser process, stealth mode, debugging tools | BrowserConfig(headless=False, debug_port=9222) context_config: Configure browser context with custom headers and cookies | headers customization, session reuse | CrawlerRunConfig(headers={"User-Agent": "Crawl4AI/1.0"}) diff --git a/docs/llm.txt/4_browser_context_page.xs.md b/docs/llm.txt/4_browser_context_page.xs.md index 236ca2db..85479c77 100644 --- a/docs/llm.txt/4_browser_context_page.xs.md +++ b/docs/llm.txt/4_browser_context_page.xs.md @@ -20,21 +20,21 @@ from crawl4ai import AsyncWebCrawler, BrowserConfig cfg = BrowserConfig(browser_type="chromium", headless=True) -async with AsyncWebCrawler(browser_config=cfg) as c: +async with AsyncWebCrawler(config=cfg) as c: r = await c.arun("https://example.com") ``` ### Persistent Contexts ```python cfg = BrowserConfig(user_data_dir="/path/to/data") -async with AsyncWebCrawler(browser_config=cfg) as c: +async with AsyncWebCrawler(config=cfg) as c: r = await c.arun("https://example.com") ``` ### Managed Browser ```python cfg = BrowserConfig(headless=False, debug_port=9222, use_managed_browser=True) -async with AsyncWebCrawler(browser_config=cfg) as c: +async with AsyncWebCrawler(config=cfg) as c: r = await c.arun("https://example.com") ``` @@ -80,7 +80,7 @@ cfg = BrowserConfig( use_managed_browser=True, user_data_dir="/path/to/Profile" ) -async with AsyncWebCrawler(browser_config=cfg) as c: +async with AsyncWebCrawler(config=cfg) as c: r = await c.arun("https://example.com") ``` @@ -96,7 +96,7 @@ cfg = BrowserConfig( ) crawl_cfg = CrawlerRunConfig(extraction_strategy=JsonCssExtractionStrategy(schema)) -async with 
AsyncWebCrawler(browser_config=cfg) as c: +async with AsyncWebCrawler(config=cfg) as c: r = await c.arun("https://example.com", config=crawl_cfg) ``` diff --git a/docs/md_v2/advanced/hooks-auth.md b/docs/md_v2/advanced/hooks-auth.md index 3d1e1930..66042229 100644 --- a/docs/md_v2/advanced/hooks-auth.md +++ b/docs/md_v2/advanced/hooks-auth.md @@ -84,7 +84,7 @@ async def main(): ) # Initialize crawler - async with AsyncWebCrawler(browser_config=browser_config) as crawler: + async with AsyncWebCrawler(config=browser_config) as crawler: crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created) crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created) crawler.crawler_strategy.set_hook("before_goto", before_goto) diff --git a/docs/md_v2/advanced/managed_browser.md b/docs/md_v2/advanced/managed_browser.md index 5d1a5e3f..bbe07f2f 100644 --- a/docs/md_v2/advanced/managed_browser.md +++ b/docs/md_v2/advanced/managed_browser.md @@ -37,7 +37,7 @@ Standard browser creation initializes a browser instance with default or minimal from crawl4ai import AsyncWebCrawler, BrowserConfig browser_config = BrowserConfig(browser_type="chromium", headless=True) -async with AsyncWebCrawler(browser_config=browser_config) as crawler: +async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://crawl4ai.com") print(result.markdown) ``` @@ -57,7 +57,7 @@ Persistent contexts create browser sessions with stored data, enabling workflows #### Example: Setting Up Persistent Contexts ```python config = BrowserConfig(user_data_dir="/path/to/user/data") -async with AsyncWebCrawler(browser_config=config) as crawler: +async with AsyncWebCrawler(config=config) as crawler: result = await crawler.arun("https://crawl4ai.com") print(result.markdown) ``` @@ -87,7 +87,7 @@ The `ManagedBrowser` class offers a high-level abstraction for managing browser from crawl4ai import AsyncWebCrawler, BrowserConfig config = 
BrowserConfig(headless=False, debug_port=9222) -async with AsyncWebCrawler(browser_config=config) as crawler: +async with AsyncWebCrawler(config=config) as crawler: result = await crawler.arun("https://crawl4ai.com") print(result.markdown) ``` @@ -147,7 +147,7 @@ Remote debugging provides a powerful way to troubleshoot complex crawling workfl #### Example: Enabling Remote Debugging ```python config = BrowserConfig(debug_port=9222) -async with AsyncWebCrawler(browser_config=config) as crawler: +async with AsyncWebCrawler(config=config) as crawler: result = await crawler.arun("https://crawl4ai.com") ``` diff --git a/docs/md_v2/basic/simple-crawling.md b/docs/md_v2/basic/simple-crawling.md index 4115a7fb..ec63984c 100644 --- a/docs/md_v2/basic/simple-crawling.md +++ b/docs/md_v2/basic/simple-crawling.md @@ -15,7 +15,7 @@ async def main(): browser_config = BrowserConfig() # Default browser configuration run_config = CrawlerRunConfig() # Default crawl run configuration - async with AsyncWebCrawler(browser_config=browser_config) as crawler: + async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun( url="https://example.com", config=run_config @@ -89,7 +89,7 @@ Enable verbose logging in `BrowserConfig`: ```python browser_config = BrowserConfig(verbose=True) -async with AsyncWebCrawler(browser_config=browser_config) as crawler: +async with AsyncWebCrawler(config=browser_config) as crawler: run_config = CrawlerRunConfig() result = await crawler.arun(url="https://example.com", config=run_config) ``` @@ -119,7 +119,7 @@ async def main(): cache_mode=CacheMode.ENABLED # Use cache if available ) - async with AsyncWebCrawler(browser_config=browser_config) as crawler: + async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun( url="https://example.com", config=run_config