diff --git a/README.md b/README.md index d9a68482..d3b35c51 100644 --- a/README.md +++ b/README.md @@ -544,6 +544,10 @@ async def test_news_crawl(): +--- + +> **πŸ’‘ Tip:** Some websites may use **CAPTCHA** based verification mechanisms to prevent automated access. If your workflow encounters such challenges, you may optionally integrate a third-party CAPTCHA-handling service such as [CapSolver](https://www.capsolver.com/blog/Partners/crawl4ai-capsolver/?utm_source=crawl4ai&utm_medium=github_pr&utm_campaign=crawl4ai_integration). They support reCAPTCHA v2/v3, Cloudflare Turnstile, Challenge, AWS WAF, and more. Please ensure that your usage complies with the target website’s terms of service and applicable laws. + ## ✨ Recent Updates
diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index 50fe99ba..bfa0d398 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -1,6 +1,7 @@ import os from typing import Union import warnings +import requests from .config import ( DEFAULT_PROVIDER, DEFAULT_PROVIDER_API_KEY, @@ -649,6 +650,85 @@ class BrowserConfig: return config return BrowserConfig.from_kwargs(config) + def set_nstproxy( + self, + token: str, + channel_id: str, + country: str = "ANY", + state: str = "", + city: str = "", + protocol: str = "http", + session_duration: int = 10, + ): + """ + Fetch a proxy from NSTProxy API and automatically assign it to proxy_config. + + Get your NSTProxy token from: https://app.nstproxy.com/profile + + Args: + token (str): NSTProxy API token. + channel_id (str): NSTProxy channel ID. + country (str, optional): Country code (default: "ANY"). + state (str, optional): State code (default: ""). + city (str, optional): City name (default: ""). + protocol (str, optional): Proxy protocol ("http" or "socks5"). Defaults to "http". + session_duration (int, optional): Session duration in minutes (0 = rotate each request). Defaults to 10. + + Raises: + ValueError: If the API response format is invalid. + PermissionError: If the API returns an error message. 
+ """ + + # --- Validate input early --- + if not token or not channel_id: + raise ValueError("[NSTProxy] token and channel_id are required") + + if protocol not in ("http", "socks5"): + raise ValueError(f"[NSTProxy] Invalid protocol: {protocol}") + + # --- Build NSTProxy API URL --- + params = { + "fType": 2, + "count": 1, + "channelId": channel_id, + "country": country, + "protocol": protocol, + "sessionDuration": session_duration, + "token": token, + } + if state: + params["state"] = state + if city: + params["city"] = city + + url = "https://api.nstproxy.com/api/v1/generate/apiproxies" + + try: + response = requests.get(url, params=params, timeout=10) + response.raise_for_status() + + data = response.json() + + # --- Handle API error response --- + if isinstance(data, dict) and data.get("err"): + raise PermissionError(f"[NSTProxy] API Error: {data.get('msg', 'Unknown error')}") + + if not isinstance(data, list) or not data: + raise ValueError("[NSTProxy] Invalid API response β€” expected a non-empty list") + + proxy_info = data[0] + + # --- Apply proxy config --- + self.proxy_config = ProxyConfig( + server=f"{protocol}://{proxy_info['ip']}:{proxy_info['port']}", + username=proxy_info["username"], + password=proxy_info["password"], + ) + + except Exception as e: + print(f"[NSTProxy] ❌ Failed to set proxy: {e}") + raise + class VirtualScrollConfig: """Configuration for virtual scroll handling. 
import asyncio

import capsolver
from crawl4ai import *


# TODO: set your config
# Docs: https://docs.capsolver.com/guide/captcha/awsWaf/
api_key = "CAP-xxxxxxxxxxxxxxxxxxxxx"  # your api key of capsolver
site_url = "https://nft.porsche.com/onboarding@6"  # page url of your target site
cookie_domain = ".nft.porsche.com"  # the domain name to which you want to apply the cookie
captcha_type = "AntiAwsWafTaskProxyLess"  # type of your target captcha
capsolver.api_key = api_key


async def main():
    """Solve an AWS WAF challenge with CapSolver, then crawl the target page."""
    browser_config = BrowserConfig(
        verbose=True,
        headless=False,
        use_persistent_context=True,
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # First visit establishes the browser session that will receive the cookie.
        await crawler.arun(
            url=site_url,
            cache_mode=CacheMode.BYPASS,
            session_id="session_captcha_test",
        )

        # Get the AWS WAF cookie using the CapSolver SDK.
        solution = capsolver.solve({
            "type": captcha_type,
            "websiteURL": site_url,
        })
        cookie = solution["cookie"]
        print("aws waf cookie:", cookie)

        # Inject the token cookie and reload; an f-string avoids the fragile
        # quote-escaping of manual triple-quote concatenation.
        js_code = f"""
        document.cookie = 'aws-waf-token={cookie};domain={cookie_domain};path=/';
        location.reload();
        """

        wait_condition = """() => {
            return document.title === 'Join Porsche’s journey into Web3';
        }"""

        run_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            session_id="session_captcha_test",
            js_code=js_code,
            js_only=True,
            wait_for=f"js:{wait_condition}",
        )

        result_next = await crawler.arun(
            url=site_url,
            config=run_config,
        )
        print(result_next.markdown)


if __name__ == "__main__":
    asyncio.run(main())
import asyncio

import capsolver
from crawl4ai import *


# TODO: set your config
# Docs: https://docs.capsolver.com/guide/captcha/cloudflare_challenge/
api_key = "CAP-xxxxxxxxxxxxxxxxxxxxx"  # your api key of capsolver
site_url = "https://gitlab.com/users/sign_in"  # page url of your target site
captcha_type = "AntiCloudflareTask"  # type of your target captcha
# your http proxy to solve cloudflare challenge
proxy_server = "proxy.example.com:8080"
proxy_username = "myuser"
proxy_password = "mypass"
capsolver.api_key = api_key


async def main():
    """Solve a Cloudflare challenge via CapSolver, then crawl with the cookies."""
    # Get challenge cookies using the CapSolver SDK. CapSolver expects the
    # proxy as a single "host:port:user:pass" string.
    solution = capsolver.solve({
        "type": captcha_type,
        "websiteURL": site_url,
        "proxy": f"{proxy_server}:{proxy_username}:{proxy_password}",
    })
    cookies = solution["cookies"]
    user_agent = solution["userAgent"]
    print("challenge cookies:", cookies)

    # Convert the {name: value} mapping into the cookie-dict list the
    # browser config expects.
    cookies_list = [
        {"name": name, "value": value, "url": site_url}
        for name, value in cookies.items()
    ]

    # The crawl must reuse the same user agent and proxy the challenge was
    # solved with, or Cloudflare will reject the cookies.
    browser_config = BrowserConfig(
        verbose=True,
        headless=False,
        use_persistent_context=True,
        user_agent=user_agent,
        cookies=cookies_list,
        proxy_config={
            "server": f"http://{proxy_server}",
            "username": proxy_username,
            "password": proxy_password,
        },
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url=site_url,
            cache_mode=CacheMode.BYPASS,
            session_id="session_captcha_test",
        )
        print(result.markdown)


if __name__ == "__main__":
    asyncio.run(main())
a/docs/examples/capsolver_captcha_solver/capsolver_api_integration/solve_cloudflare_turnstile.py b/docs/examples/capsolver_captcha_solver/capsolver_api_integration/solve_cloudflare_turnstile.py new file mode 100644 index 00000000..b1603067 --- /dev/null +++ b/docs/examples/capsolver_captcha_solver/capsolver_api_integration/solve_cloudflare_turnstile.py @@ -0,0 +1,64 @@ +import asyncio +import capsolver +from crawl4ai import * + + +# TODO: set your config +# Docs: https://docs.capsolver.com/guide/captcha/cloudflare_turnstile/ +api_key = "CAP-xxxxxxxxxxxxxxxxxxxxx" # your api key of capsolver +site_key = "0x4AAAAAAAGlwMzq_9z6S9Mh" # site key of your target site +site_url = "https://clifford.io/demo/cloudflare-turnstile" # page url of your target site +captcha_type = "AntiTurnstileTaskProxyLess" # type of your target captcha +capsolver.api_key = api_key + + +async def main(): + browser_config = BrowserConfig( + verbose=True, + headless=False, + use_persistent_context=True, + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + await crawler.arun( + url=site_url, + cache_mode=CacheMode.BYPASS, + session_id="session_captcha_test" + ) + + # get turnstile token using capsolver sdk + solution = capsolver.solve({ + "type": captcha_type, + "websiteURL": site_url, + "websiteKey": site_key, + }) + token = solution["token"] + print("turnstile token:", token) + + js_code = """ + document.querySelector(\'input[name="cf-turnstile-response"]\').value = \'"""+token+"""\'; + document.querySelector(\'button[type="submit"]\').click(); + """ + + wait_condition = """() => { + const items = document.querySelectorAll(\'h1\'); + return items.length === 0; + }""" + + run_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + session_id="session_captcha_test", + js_code=js_code, + js_only=True, + wait_for=f"js:{wait_condition}" + ) + + result_next = await crawler.arun( + url=site_url, + config=run_config, + ) + print(result_next.markdown) + + +if __name__ == "__main__": 
import asyncio

import capsolver
from crawl4ai import *


# TODO: set your config
# Docs: https://docs.capsolver.com/guide/captcha/ReCaptchaV2/
api_key = "CAP-xxxxxxxxxxxxxxxxxxxxx"  # your api key of capsolver
site_key = "6LfW6wATAAAAAHLqO2pb8bDBahxlMxNdo9g947u9"  # site key of your target site
site_url = "https://recaptcha-demo.appspot.com/recaptcha-v2-checkbox.php"  # page url of your target site
captcha_type = "ReCaptchaV2TaskProxyLess"  # type of your target captcha
capsolver.api_key = api_key


async def main():
    """Solve a reCAPTCHA v2 with CapSolver and submit the demo form."""
    browser_config = BrowserConfig(
        verbose=True,
        headless=False,
        use_persistent_context=True,
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # First visit establishes the browser session the token is injected into.
        await crawler.arun(
            url=site_url,
            cache_mode=CacheMode.BYPASS,
            session_id="session_captcha_test",
        )

        # Get the reCAPTCHA token using the CapSolver SDK.
        solution = capsolver.solve({
            "type": captcha_type,
            "websiteURL": site_url,
            "websiteKey": site_key,
        })
        token = solution["gRecaptchaResponse"]
        print("recaptcha token:", token)

        # Fill the hidden response textarea and submit. An f-string (with JS
        # braces doubled) avoids the fragile quote-escaped concatenation.
        js_code = f"""
        const textarea = document.getElementById('g-recaptcha-response');
        if (textarea) {{
            textarea.value = "{token}";
            document.querySelector('button.form-field[type="submit"]').click();
        }}
        """

        wait_condition = """() => {
            const items = document.querySelectorAll('h2');
            return items.length > 1;
        }"""

        run_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            session_id="session_captcha_test",
            js_code=js_code,
            js_only=True,
            wait_for=f"js:{wait_condition}",
        )

        result_next = await crawler.arun(
            url=site_url,
            config=run_config,
        )
        print(result_next.markdown)


if __name__ == "__main__":
    asyncio.run(main())
import asyncio

import capsolver
from crawl4ai import *


# TODO: set your config
# Docs: https://docs.capsolver.com/guide/captcha/ReCaptchaV3/
api_key = "CAP-xxxxxxxxxxxxxxxxxxxxx"  # your api key of capsolver
site_key = "6LdKlZEpAAAAAAOQjzC2v_d36tWxCl6dWsozdSy9"  # site key of your target site
site_url = "https://recaptcha-demo.appspot.com/recaptcha-v3-request-scores.php"  # page url of your target site
page_action = "examples/v3scores"  # page action of your target site
captcha_type = "ReCaptchaV3TaskProxyLess"  # type of your target captcha
capsolver.api_key = api_key


async def main():
    """Solve a reCAPTCHA v3 with CapSolver and inject the token into the verify call."""
    browser_config = BrowserConfig(
        verbose=True,
        headless=False,
        use_persistent_context=True,
    )

    # Get the reCAPTCHA token using the CapSolver SDK (v3 tokens expire
    # quickly, so solve right before use).
    solution = capsolver.solve({
        "type": captcha_type,
        "websiteURL": site_url,
        "websiteKey": site_key,
        "pageAction": page_action,
    })
    token = solution["gRecaptchaResponse"]
    print("recaptcha token:", token)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        await crawler.arun(
            url=site_url,
            cache_mode=CacheMode.BYPASS,
            session_id="session_captcha_test",
        )

        # Hook window.fetch so the verify request carries our solved token.
        # FIX: the original set the *action* query parameter to the token;
        # the verify endpoint takes `action` and `token` separately, so the
        # token must go into the `token` parameter.
        js_code = f"""
        const originalFetch = window.fetch;

        window.fetch = function(...args) {{
            if (typeof args[0] === 'string' && args[0].includes('/recaptcha-v3-verify.php')) {{
                const url = new URL(args[0], window.location.origin);
                url.searchParams.set('token', '{token}');
                args[0] = url.toString();
                document.querySelector('.token').innerHTML =
                    "fetch('/recaptcha-v3-verify.php?action=examples/v3scores&token={token}')";
                console.log('Fetch URL hooked:', args[0]);
            }}
            return originalFetch.apply(this, args);
        }};
        """

        wait_condition = """() => {
            return document.querySelector('.step3:not(.hidden)');
        }"""

        run_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            session_id="session_captcha_test",
            js_code=js_code,
            js_only=True,
            wait_for=f"js:{wait_condition}",
        )

        result_next = await crawler.arun(
            url=site_url,
            config=run_config,
        )
        print(result_next.markdown)


if __name__ == "__main__":
    asyncio.run(main())
"fetch('/recaptcha-v3-verify.php?action=examples/v3scores&token=""" + token + """')"; + console.log('Fetch URL hooked:', args[0]); + } + return originalFetch.apply(this, args); + }; + """ + + wait_condition = """() => { + return document.querySelector('.step3:not(.hidden)'); + }""" + + run_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + session_id="session_captcha_test", + js_code=js_code, + js_only=True, + wait_for=f"js:{wait_condition}" + ) + + result_next = await crawler.arun( + url=site_url, + config=run_config, + ) + print(result_next.markdown) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/docs/examples/capsolver_captcha_solver/capsolver_extension_integration/solve_aws_waf.py b/docs/examples/capsolver_captcha_solver/capsolver_extension_integration/solve_aws_waf.py new file mode 100644 index 00000000..d1238469 --- /dev/null +++ b/docs/examples/capsolver_captcha_solver/capsolver_extension_integration/solve_aws_waf.py @@ -0,0 +1,36 @@ +import time +import asyncio +from crawl4ai import * + + +# TODO: the user data directory that includes the capsolver extension +user_data_dir = "/browser-profile/Default1" + +""" +The capsolver extension supports more features, such as: + - Telling the extension when to start solving captcha. + - Calling functions to check whether the captcha has been solved, etc. 
import asyncio

from crawl4ai import *


# TODO: the user data directory that includes the capsolver extension
user_data_dir = "/browser-profile/Default1"

"""
The capsolver extension supports more features, such as:
  - Telling the extension when to start solving captcha.
  - Calling functions to check whether the captcha has been solved, etc.
Reference blog: https://docs.capsolver.com/guide/automation-tool-integration/
"""

browser_config = BrowserConfig(
    verbose=True,
    headless=False,
    user_data_dir=user_data_dir,
    use_persistent_context=True,
)


async def main():
    """Open the AWS-WAF-protected page; the CapSolver extension solves the captcha."""
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result_initial = await crawler.arun(
            url="https://nft.porsche.com/onboarding@6",
            cache_mode=CacheMode.BYPASS,
            session_id="session_captcha_test",
        )

        # Keep the session alive while the extension works. time.sleep()
        # would block the event loop inside an async function, so use the
        # asyncio-friendly sleep instead.
        await asyncio.sleep(300)


if __name__ == "__main__":
    asyncio.run(main())
import asyncio

from crawl4ai import *


# TODO: the user data directory that includes the capsolver extension
user_data_dir = "/browser-profile/Default1"

"""
The capsolver extension supports more features, such as:
  - Telling the extension when to start solving captcha.
  - Calling functions to check whether the captcha has been solved, etc.
Reference blog: https://docs.capsolver.com/guide/automation-tool-integration/
"""

browser_config = BrowserConfig(
    verbose=True,
    headless=False,
    user_data_dir=user_data_dir,
    use_persistent_context=True,
)


async def main():
    """Open the Cloudflare-protected page; the CapSolver extension solves the challenge."""
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result_initial = await crawler.arun(
            url="https://gitlab.com/users/sign_in",
            cache_mode=CacheMode.BYPASS,
            session_id="session_captcha_test",
        )

        # Keep the session alive while the extension works. time.sleep()
        # would block the event loop inside an async function, so use the
        # asyncio-friendly sleep instead.
        await asyncio.sleep(300)


if __name__ == "__main__":
    asyncio.run(main())
import asyncio

from crawl4ai import *


# TODO: the user data directory that includes the capsolver extension
user_data_dir = "/browser-profile/Default1"

"""
The capsolver extension supports more features, such as:
  - Telling the extension when to start solving captcha.
  - Calling functions to check whether the captcha has been solved, etc.
Reference blog: https://docs.capsolver.com/guide/automation-tool-integration/
"""

browser_config = BrowserConfig(
    verbose=True,
    headless=False,
    user_data_dir=user_data_dir,
    use_persistent_context=True,
)


async def main():
    """Open the Turnstile demo page; the CapSolver extension solves the widget."""
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result_initial = await crawler.arun(
            url="https://clifford.io/demo/cloudflare-turnstile",
            cache_mode=CacheMode.BYPASS,
            session_id="session_captcha_test",
        )

        # Keep the session alive while the extension works. time.sleep()
        # would block the event loop inside an async function, so use the
        # asyncio-friendly sleep instead.
        await asyncio.sleep(300)


if __name__ == "__main__":
    asyncio.run(main())
import asyncio

from crawl4ai import *


# TODO: the user data directory that includes the capsolver extension
user_data_dir = "/browser-profile/Default1"

"""
The capsolver extension supports more features, such as:
  - Telling the extension when to start solving captcha.
  - Calling functions to check whether the captcha has been solved, etc.
Reference blog: https://docs.capsolver.com/guide/automation-tool-integration/
"""

browser_config = BrowserConfig(
    verbose=True,
    headless=False,
    user_data_dir=user_data_dir,
    use_persistent_context=True,
)


async def main():
    """Open the reCAPTCHA v2 demo page; the CapSolver extension solves the captcha."""
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result_initial = await crawler.arun(
            url="https://recaptcha-demo.appspot.com/recaptcha-v2-checkbox.php",
            cache_mode=CacheMode.BYPASS,
            session_id="session_captcha_test",
        )

        # Keep the session alive while the extension works. time.sleep()
        # would block the event loop inside an async function, so use the
        # asyncio-friendly sleep instead.
        await asyncio.sleep(300)


if __name__ == "__main__":
    asyncio.run(main())
import asyncio

from crawl4ai import *


# TODO: the user data directory that includes the capsolver extension
user_data_dir = "/browser-profile/Default1"

"""
The capsolver extension supports more features, such as:
  - Telling the extension when to start solving captcha.
  - Calling functions to check whether the captcha has been solved, etc.
Reference blog: https://docs.capsolver.com/guide/automation-tool-integration/
"""

browser_config = BrowserConfig(
    verbose=True,
    headless=False,
    user_data_dir=user_data_dir,
    use_persistent_context=True,
)


async def main():
    """Open the reCAPTCHA v3 demo page; the CapSolver extension solves the captcha."""
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result_initial = await crawler.arun(
            url="https://recaptcha-demo.appspot.com/recaptcha-v3-request-scores.php",
            cache_mode=CacheMode.BYPASS,
            session_id="session_captcha_test",
        )

        # Keep the session alive while the extension works. time.sleep()
        # would block the event loop inside an async function, so use the
        # asyncio-friendly sleep instead.
        await asyncio.sleep(300)


if __name__ == "__main__":
    asyncio.run(main())
"ANY", "US", "DE" + + # Fetch proxy from NSTProxy API + api_url = ( + f"https://api.nstproxy.com/api/v1/generate/apiproxies" + f"?fType=2&channelId={CHANNEL_ID}&country={country}" + f"&protocol=http&sessionDuration=10&count=1&token={NST_TOKEN}" + ) + response = requests.get(api_url, timeout=10).json() + proxy = response[0] + + ip = proxy.get("ip") + port = proxy.get("port") + username = proxy.get("username", "") + password = proxy.get("password", "") + + browser_config = BrowserConfig(proxy_config={ + "server": f"http://{ip}:{port}", + "username": username, + "password": password, + }) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url="https://example.com") + print("[API Proxy] Status:", result.status_code) + + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/docs/examples/proxy/auth_proxy_example.py b/docs/examples/proxy/auth_proxy_example.py new file mode 100644 index 00000000..6fb838b4 --- /dev/null +++ b/docs/examples/proxy/auth_proxy_example.py @@ -0,0 +1,31 @@ +""" +NSTProxy Integration Examples for crawl4ai +------------------------------------------ + +NSTProxy is a premium residential proxy provider. +πŸ‘‰ Purchase Proxies: https://nstproxy.com +πŸ’° Use coupon code "crawl4ai" for 10% off your plan. + +""" +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig + + +async def main(): + """ + Example: Use NSTProxy with manual username/password authentication. 
+ """ + + browser_config = BrowserConfig(proxy_config={ + "server": "http://gate.nstproxy.io:24125", + "username": "your_username", + "password": "your_password", + }) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url="https://example.com") + print("[Auth Proxy] Status:", result.status_code) + + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/docs/examples/proxy/basic_proxy_example.py b/docs/examples/proxy/basic_proxy_example.py new file mode 100644 index 00000000..5a79525c --- /dev/null +++ b/docs/examples/proxy/basic_proxy_example.py @@ -0,0 +1,29 @@ +""" +NSTProxy Integration Examples for crawl4ai +------------------------------------------ + +NSTProxy is a premium residential proxy provider. +πŸ‘‰ Purchase Proxies: https://nstproxy.com +πŸ’° Use coupon code "crawl4ai" for 10% off your plan. + +""" +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig + + +async def main(): + # Using HTTP proxy + browser_config = BrowserConfig(proxy_config={"server": "http://gate.nstproxy.io:24125"}) + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url="https://example.com") + print("[HTTP Proxy] Status:", result.status_code) + + # Using SOCKS proxy + browser_config = BrowserConfig(proxy_config={"server": "socks5://gate.nstproxy.io:24125"}) + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url="https://example.com") + print("[SOCKS5 Proxy] Status:", result.status_code) + + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/docs/examples/proxy/nstproxy_example.py b/docs/examples/proxy/nstproxy_example.py new file mode 100644 index 00000000..4e8587b3 --- /dev/null +++ b/docs/examples/proxy/nstproxy_example.py @@ -0,0 +1,39 @@ +""" +NSTProxy Integration Examples for crawl4ai +------------------------------------------ + +NSTProxy is a premium 
residential proxy provider. +πŸ‘‰ Purchase Proxies: https://nstproxy.com +πŸ’° Use coupon code "crawl4ai" for 10% off your plan. + +""" +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig + + +async def main(): + """ + Example: Using NSTProxy with AsyncWebCrawler. + """ + + NST_TOKEN = "YOUR_NST_PROXY_TOKEN" # Get from https://app.nstproxy.com/profile + CHANNEL_ID = "YOUR_NST_PROXY_CHANNEL_ID" # Your NSTProxy Channel ID + + browser_config = BrowserConfig() + browser_config.set_nstproxy( + token=NST_TOKEN, + channel_id=CHANNEL_ID, + country="ANY", # e.g. "US", "JP", or "ANY" + state="", # optional, leave empty if not needed + city="", # optional, leave empty if not needed + session_duration=0 # Session duration in minutes,0 = rotate on every request + ) + + # === Run crawler === + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url="https://example.com") + print("[Nstproxy] Status:", result.status_code) + + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/docs/md_v2/core/examples.md b/docs/md_v2/core/examples.md index b1c52013..d5d58e02 100644 --- a/docs/md_v2/core/examples.md +++ b/docs/md_v2/core/examples.md @@ -11,6 +11,12 @@ This page provides a comprehensive list of example scripts that demonstrate vari | Quickstart Set 1 | Basic examples for getting started with Crawl4AI. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart_examples_set_1.py) | | Quickstart Set 2 | More advanced examples for working with Crawl4AI. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart_examples_set_2.py) | +## Proxies + +| Example | Description | Link | +|----------|--------------|------| +| **NSTProxy** | [NSTProxy](https://www.nstproxy.com/?utm_source=crawl4ai) Seamlessly integrates with crawl4ai β€” no setup required. 
Access high-performance residential, datacenter, ISP, and IPv6 proxies with smart rotation and anti-blocking technology. Starts from $0.1/GB. Use code crawl4ai for 10% off. | [View Code](https://github.com/unclecode/crawl4ai/tree/main/docs/examples/proxy) | + ## Browser & Crawling Features | Example | Description | Link | @@ -56,13 +62,14 @@ This page provides a comprehensive list of example scripts that demonstrate vari ## Anti-Bot & Stealth Features -| Example | Description | Link | -|---------|-------------|------| -| Stealth Mode Quick Start | Five practical examples showing how to use stealth mode for bypassing basic bot detection. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/stealth_mode_quick_start.py) | +| Example | Description | Link | +|----------------------------|-------------|------| +| Stealth Mode Quick Start | Five practical examples showing how to use stealth mode for bypassing basic bot detection. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/stealth_mode_quick_start.py) | | Stealth Mode Comprehensive | Comprehensive demonstration of stealth mode features with bot detection testing and comparisons. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/stealth_mode_example.py) | -| Undetected Browser | Simple example showing how to use the undetected browser adapter. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/hello_world_undetected.py) | -| Undetected Browser Demo | Basic demo comparing regular and undetected browser modes. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/undetected_simple_demo.py) | -| Undetected Tests | Advanced tests comparing regular vs undetected browsers on various bot detection services. | [View Folder](https://github.com/unclecode/crawl4ai/tree/main/docs/examples/undetectability/) | +| Undetected Browser | Simple example showing how to use the undetected browser adapter. 
| [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/hello_world_undetected.py) | +| Undetected Browser Demo | Basic demo comparing regular and undetected browser modes. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/undetected_simple_demo.py) | +| Undetected Tests | Advanced tests comparing regular vs undetected browsers on various bot detection services. | [View Folder](https://github.com/unclecode/crawl4ai/tree/main/docs/examples/undetectability/) | +| CapSolver Captcha Solver | Seamlessly integrate with [CapSolver](https://www.capsolver.com/?utm_source=crawl4ai&utm_medium=github_pr&utm_campaign=crawl4ai_integration) to automatically solve reCAPTCHA v2/v3, Cloudflare Turnstile / Challenges, AWS WAF and more for uninterrupted scraping and automation. | [View Folder](https://github.com/unclecode/crawl4ai/tree/main/docs/examples/capsolver_captcha_solver/) | ## Customization & Security