Merge pull request #1589 from capsolver/main

Add some examples of using capsolver to solve captcha
2025-11-10 10:45:16 +05:30
parent 40173eeb73 2ae9899eac
commit 006e29f308
12 changed files with 517 additions and 6 deletions
--- a/README.md
+++ b/README.md
@@ -95,6 +95,8 @@ if __name__ == "__main__":
    asyncio.run(main())
 ```

+If you encounter a CAPTCHA during your workflow, follow the [integration guide](https://www.capsolver.com/blog/Partners/crawl4ai-capsolver/?utm_source=crawl4ai&utm_medium=github_pr&utm_campaign=crawl4ai_integration) to integrate CapSolver. It supports reCAPTCHA v2/v3, Cloudflare Turnstile and Challenge, AWS WAF, and more.
+
 3. Or use the new command-line interface:
 ```bash
 # Basic crawl with markdown output
--- a/docs/examples/capsolver_captcha_solver/capsolver_api_integration/solve_aws_waf.py
+++ b/docs/examples/capsolver_captcha_solver/capsolver_api_integration/solve_aws_waf.py
@@ -0,0 +1,62 @@
+import asyncio
+import capsolver
+from crawl4ai import *
+
+
+# TODO: set your config
+# Docs: https://docs.capsolver.com/guide/captcha/awsWaf/
+api_key = "CAP-xxxxxxxxxxxxxxxxxxxxx"              # your api key of capsolver
+site_url = "https://nft.porsche.com/onboarding@6"  # page url of your target site
+cookie_domain = ".nft.porsche.com"                 # the domain name to which you want to apply the cookie
+captcha_type = "AntiAwsWafTaskProxyLess"           # type of your target captcha
+capsolver.api_key = api_key
+
+
+async def main():
+    """Load the page, obtain an AWS WAF cookie from CapSolver and reload with it."""
+    browser_config = BrowserConfig(
+        verbose=True,
+        headless=False,
+        use_persistent_context=True,
+    )
+    session_id = "session_captcha_test"  # the same id is passed to both arun() calls
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        await crawler.arun(url=site_url, cache_mode=CacheMode.BYPASS, session_id=session_id)
+
+        # capsolver.solve() is synchronous; run it in a worker thread so the event
+        # loop is not stalled while the browser session is open.
+        solution = await asyncio.to_thread(
+            capsolver.solve,
+            {"type": captcha_type, "websiteURL": site_url},
+        )
+        cookie = solution["cookie"]
+        print("aws waf cookie:", cookie)
+
+        # Store the solved value as the aws-waf-token cookie for cookie_domain,
+        # then reload the page so the next request is sent with that cookie.
+        js_code = """
+            document.cookie = \'aws-waf-token=""" + cookie + """;domain=""" + cookie_domain + """;path=/\';
+            location.reload();
+        """
+
+        # wait_for condition: true once the document title equals the expected title.
+        wait_condition = """() => {
+            return document.title === \'Join Porsche’s journey into Web3\';
+        }"""
+
+        run_config = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+            session_id=session_id,
+            js_code=js_code,
+            js_only=True,
+            wait_for=f"js:{wait_condition}",
+        )
+
+        # Second run: execute the script in the session and print the result.
+        result_next = await crawler.arun(url=site_url, config=run_config)
+        print(result_next.markdown)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/docs/examples/capsolver_captcha_solver/capsolver_api_integration/solve_cloudflare_challenge.py
+++ b/docs/examples/capsolver_captcha_solver/capsolver_api_integration/solve_cloudflare_challenge.py
@@ -0,0 +1,60 @@
+import asyncio
+import capsolver
+from crawl4ai import *
+
+
+# TODO: set your config
+# Docs: https://docs.capsolver.com/guide/captcha/cloudflare_challenge/
+api_key = "CAP-xxxxxxxxxxxxxxxxxxxxx"          # your api key of capsolver
+site_url = "https://gitlab.com/users/sign_in"  # page url of your target site
+captcha_type = "AntiCloudflareTask"            # type of your target captcha
+# your http proxy to solve cloudflare challenge
+proxy_server = "proxy.example.com:8080"
+proxy_username = "myuser"
+proxy_password = "mypass"
+capsolver.api_key = api_key
+
+
+async def main():
+    """Solve the Cloudflare challenge via CapSolver, then crawl with its cookies."""
+    # Ask the capsolver sdk for the challenge cookies, passing our proxy along.
+    task = {
+        "type": captcha_type,
+        "websiteURL": site_url,
+        "proxy": f"{proxy_server}:{proxy_username}:{proxy_password}",
+    }
+    solution = capsolver.solve(task)
+    cookies = solution["cookies"]
+    user_agent = solution["userAgent"]
+    print("challenge cookies:", cookies)
+
+    # Turn the name -> value mapping into cookie records bound to the target url.
+    cookies_list = [
+        {"name": name, "value": value, "url": site_url}
+        for name, value in cookies.items()
+    ]
+
+    # The browser gets the solver's user agent and cookies plus the same proxy.
+    proxy_settings = {
+        "server": f"http://{proxy_server}",
+        "username": proxy_username,
+        "password": proxy_password,
+    }
+    browser_config = BrowserConfig(
+        verbose=True,
+        headless=False,
+        use_persistent_context=True,
+        user_agent=user_agent,
+        cookies=cookies_list,
+        proxy_config=proxy_settings,
+    )
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(
+            url=site_url, cache_mode=CacheMode.BYPASS, session_id="session_captcha_test"
+        )
+        print(result.markdown)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/docs/examples/capsolver_captcha_solver/capsolver_api_integration/solve_cloudflare_turnstile.py
+++ b/docs/examples/capsolver_captcha_solver/capsolver_api_integration/solve_cloudflare_turnstile.py
@@ -0,0 +1,64 @@
+import asyncio
+import capsolver
+from crawl4ai import *
+
+
+# TODO: set your config
+# Docs: https://docs.capsolver.com/guide/captcha/cloudflare_turnstile/
+api_key = "CAP-xxxxxxxxxxxxxxxxxxxxx"                       # your api key of capsolver
+site_key = "0x4AAAAAAAGlwMzq_9z6S9Mh"                       # site key of your target site
+site_url = "https://clifford.io/demo/cloudflare-turnstile"  # page url of your target site
+captcha_type = "AntiTurnstileTaskProxyLess"                 # type of your target captcha
+capsolver.api_key = api_key
+
+
+async def main():
+    """Open the demo page, get a Turnstile token from CapSolver and submit it."""
+    browser_config = BrowserConfig(
+        verbose=True,
+        headless=False,
+        use_persistent_context=True,
+    )
+    session_id = "session_captcha_test"  # the same id is passed to both arun() calls
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        await crawler.arun(
+            url=site_url, cache_mode=CacheMode.BYPASS, session_id=session_id
+        )
+
+        # capsolver.solve() is synchronous; run it in a worker thread so the event
+        # loop is not stalled while the browser session is open.
+        solution = await asyncio.to_thread(
+            capsolver.solve,
+            {"type": captcha_type, "websiteURL": site_url, "websiteKey": site_key},
+        )
+        token = solution["token"]
+        print("turnstile token:", token)
+
+        # Write the token into the cf-turnstile-response input and click submit.
+        js_code = """
+            document.querySelector(\'input[name="cf-turnstile-response"]\').value = \'"""+token+"""\';
+            document.querySelector(\'button[type="submit"]\').click();
+        """
+
+        # wait_for condition: true once the page no longer contains any <h1>.
+        wait_condition = """() => {
+            const items = document.querySelectorAll(\'h1\');
+            return items.length === 0;
+        }"""
+
+        run_config = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+            session_id=session_id,
+            js_code=js_code,
+            js_only=True,
+            wait_for=f"js:{wait_condition}",
+        )
+
+        # Second run: execute the script in the session and print the result.
+        result_next = await crawler.arun(url=site_url, config=run_config)
+        print(result_next.markdown)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/docs/examples/capsolver_captcha_solver/capsolver_api_integration/solve_recaptcha_v2.py
+++ b/docs/examples/capsolver_captcha_solver/capsolver_api_integration/solve_recaptcha_v2.py
@@ -0,0 +1,67 @@
+import asyncio
+import capsolver
+from crawl4ai import *
+
+
+# TODO: set your config
+# Docs: https://docs.capsolver.com/guide/captcha/ReCaptchaV2/
+api_key = "CAP-xxxxxxxxxxxxxxxxxxxxx"                                      # your api key of capsolver
+site_key = "6LfW6wATAAAAAHLqO2pb8bDBahxlMxNdo9g947u9"                      # site key of your target site
+site_url = "https://recaptcha-demo.appspot.com/recaptcha-v2-checkbox.php"  # page url of your target site
+captcha_type = "ReCaptchaV2TaskProxyLess"                                  # type of your target captcha
+capsolver.api_key = api_key
+
+
+async def main():
+    """Open the demo page, get a reCAPTCHA v2 token from CapSolver and submit it."""
+    browser_config = BrowserConfig(
+        verbose=True,
+        headless=False,
+        use_persistent_context=True,
+    )
+    session_id = "session_captcha_test"  # the same id is passed to both arun() calls
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        await crawler.arun(
+            url=site_url, cache_mode=CacheMode.BYPASS, session_id=session_id
+        )
+
+        # capsolver.solve() is synchronous; run it in a worker thread so the event
+        # loop is not stalled while the browser session is open.
+        solution = await asyncio.to_thread(
+            capsolver.solve,
+            {"type": captcha_type, "websiteURL": site_url, "websiteKey": site_key},
+        )
+        token = solution["gRecaptchaResponse"]
+        print("recaptcha token:", token)
+
+        # If the g-recaptcha-response element exists, fill in the token and submit.
+        js_code = """
+            const textarea = document.getElementById(\'g-recaptcha-response\');
+            if (textarea) {
+                textarea.value = \"""" + token + """\";
+                document.querySelector(\'button.form-field[type="submit"]\').click();
+            }
+        """
+
+        # wait_for condition: true once the page contains more than one <h2>.
+        wait_condition = """() => {
+            const items = document.querySelectorAll(\'h2\');
+            return items.length > 1;
+        }"""
+
+        run_config = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+            session_id=session_id,
+            js_code=js_code,
+            js_only=True,
+            wait_for=f"js:{wait_condition}",
+        )
+
+        # Second run: execute the script in the session and print the result.
+        result_next = await crawler.arun(url=site_url, config=run_config)
+        print(result_next.markdown)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/docs/examples/capsolver_captcha_solver/capsolver_api_integration/solve_recaptcha_v3.py
+++ b/docs/examples/capsolver_captcha_solver/capsolver_api_integration/solve_recaptcha_v3.py
@@ -0,0 +1,75 @@
+import asyncio
+import capsolver
+from crawl4ai import *
+
+
+# TODO: set your config
+# Docs: https://docs.capsolver.com/guide/captcha/ReCaptchaV3/
+api_key = "CAP-xxxxxxxxxxxxxxxxxxxxx"                                            # your api key of capsolver
+site_key = "6LdKlZEpAAAAAAOQjzC2v_d36tWxCl6dWsozdSy9"                            # site key of your target site
+site_url = "https://recaptcha-demo.appspot.com/recaptcha-v3-request-scores.php"  # page url of your target site
+page_action = "examples/v3scores"                                                # page action of your target site
+captcha_type = "ReCaptchaV3TaskProxyLess"                                        # type of your target captcha
+capsolver.api_key = api_key
+
+
+async def main():
+    """Get a reCAPTCHA v3 token from CapSolver and inject it into the verify request."""
+    browser_config = BrowserConfig(
+        verbose=True,
+        headless=False,
+        use_persistent_context=True,
+    )
+    session_id = "session_captcha_test"  # the same id is passed to both arun() calls
+
+    # get recaptcha token using capsolver sdk (done before the browser is started)
+    solution = capsolver.solve({
+        "type": captcha_type,
+        "websiteURL": site_url,
+        "websiteKey": site_key,
+        "pageAction": page_action,
+    })
+    token = solution["gRecaptchaResponse"]
+    print("recaptcha token:", token)
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        await crawler.arun(url=site_url, cache_mode=CacheMode.BYPASS, session_id=session_id)
+
+        # Hook window.fetch so the verify request carries the CapSolver token.
+        # The request has the form "?action=examples/v3scores&token=...", so the
+        # solved value must replace the `token` parameter and leave `action` alone.
+        js_code = """
+            const originalFetch = window.fetch;
+
+            window.fetch = function(...args) {
+              if (typeof args[0] === 'string' && args[0].includes('/recaptcha-v3-verify.php')) {
+                const url = new URL(args[0], window.location.origin);
+                url.searchParams.set('token', '""" + token + """');
+                args[0] = url.toString();
+                document.querySelector('.token').innerHTML = "fetch('/recaptcha-v3-verify.php?action=examples/v3scores&token=""" + token + """')";
+                console.log('Fetch URL hooked:', args[0]);
+              }
+              return originalFetch.apply(this, args);
+            };
+        """
+
+        # wait_for condition: an element matching ".step3:not(.hidden)" exists.
+        wait_condition = """() => {
+            return document.querySelector('.step3:not(.hidden)');
+        }"""
+
+        run_config = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+            session_id=session_id,
+            js_code=js_code,
+            js_only=True,
+            wait_for=f"js:{wait_condition}",
+        )
+
+        # Second run: install the hook in the session and print the result.
+        result_next = await crawler.arun(url=site_url, config=run_config)
+        print(result_next.markdown)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/docs/examples/capsolver_captcha_solver/capsolver_extension_integration/solve_aws_waf.py
+++ b/docs/examples/capsolver_captcha_solver/capsolver_extension_integration/solve_aws_waf.py
@@ -0,0 +1,36 @@
+import time
+import asyncio
+from crawl4ai import *
+
+
+# TODO: the user data directory that includes the capsolver extension
+user_data_dir = "/browser-profile/Default1"
+
+"""
+The capsolver extension supports more features, such as:
+    - Telling the extension when to start solving captcha.
+    - Calling functions to check whether the captcha has been solved, etc.
+Reference blog: https://docs.capsolver.com/guide/automation-tool-integration/
+"""
+
+browser_config = BrowserConfig(
+    verbose=True,
+    headless=False,
+    user_data_dir=user_data_dir,
+    use_persistent_context=True,
+)
+
+async def main():
+    """Open the AWS WAF protected page with the module-level browser_config and keep it open."""
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        await crawler.arun(
+            url="https://nft.porsche.com/onboarding@6",
+            cache_mode=CacheMode.BYPASS,
+            session_id="session_captcha_test",
+        )
+        # do something later; asyncio.sleep (unlike time.sleep) does not block the event loop
+        await asyncio.sleep(300)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/docs/examples/capsolver_captcha_solver/capsolver_extension_integration/solve_cloudflare_challenge.py
+++ b/docs/examples/capsolver_captcha_solver/capsolver_extension_integration/solve_cloudflare_challenge.py
@@ -0,0 +1,36 @@
+import time
+import asyncio
+from crawl4ai import *
+
+
+# TODO: the user data directory that includes the capsolver extension
+user_data_dir = "/browser-profile/Default1"
+
+"""
+The capsolver extension supports more features, such as:
+    - Telling the extension when to start solving captcha.
+    - Calling functions to check whether the captcha has been solved, etc.
+Reference blog: https://docs.capsolver.com/guide/automation-tool-integration/
+"""
+
+browser_config = BrowserConfig(
+    verbose=True,
+    headless=False,
+    user_data_dir=user_data_dir,
+    use_persistent_context=True,
+)
+
+async def main():
+    """Open the Cloudflare challenge page with the module-level browser_config and keep it open."""
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        await crawler.arun(
+            url="https://gitlab.com/users/sign_in",
+            cache_mode=CacheMode.BYPASS,
+            session_id="session_captcha_test",
+        )
+        # do something later; asyncio.sleep (unlike time.sleep) does not block the event loop
+        await asyncio.sleep(300)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/docs/examples/capsolver_captcha_solver/capsolver_extension_integration/solve_cloudflare_turnstile.py
+++ b/docs/examples/capsolver_captcha_solver/capsolver_extension_integration/solve_cloudflare_turnstile.py
@@ -0,0 +1,36 @@
+import time
+import asyncio
+from crawl4ai import *
+
+
+# TODO: the user data directory that includes the capsolver extension
+user_data_dir = "/browser-profile/Default1"
+
+"""
+The capsolver extension supports more features, such as:
+    - Telling the extension when to start solving captcha.
+    - Calling functions to check whether the captcha has been solved, etc.
+Reference blog: https://docs.capsolver.com/guide/automation-tool-integration/
+"""
+
+browser_config = BrowserConfig(
+    verbose=True,
+    headless=False,
+    user_data_dir=user_data_dir,
+    use_persistent_context=True,
+)
+
+async def main():
+    """Open the Turnstile demo page with the module-level browser_config and keep it open."""
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        await crawler.arun(
+            url="https://clifford.io/demo/cloudflare-turnstile",
+            cache_mode=CacheMode.BYPASS,
+            session_id="session_captcha_test",
+        )
+        # do something later; asyncio.sleep (unlike time.sleep) does not block the event loop
+        await asyncio.sleep(300)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/docs/examples/capsolver_captcha_solver/capsolver_extension_integration/solve_recaptcha_v2.py
+++ b/docs/examples/capsolver_captcha_solver/capsolver_extension_integration/solve_recaptcha_v2.py
@@ -0,0 +1,36 @@
+import time
+import asyncio
+from crawl4ai import *
+
+
+# TODO: the user data directory that includes the capsolver extension
+user_data_dir = "/browser-profile/Default1"
+
+"""
+The capsolver extension supports more features, such as:
+    - Telling the extension when to start solving captcha.
+    - Calling functions to check whether the captcha has been solved, etc.
+Reference blog: https://docs.capsolver.com/guide/automation-tool-integration/
+"""
+
+browser_config = BrowserConfig(
+    verbose=True,
+    headless=False,
+    user_data_dir=user_data_dir,
+    use_persistent_context=True,
+)
+
+async def main():
+    """Open the reCAPTCHA v2 demo page with the module-level browser_config and keep it open."""
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        await crawler.arun(
+            url="https://recaptcha-demo.appspot.com/recaptcha-v2-checkbox.php",
+            cache_mode=CacheMode.BYPASS,
+            session_id="session_captcha_test",
+        )
+        # do something later; asyncio.sleep (unlike time.sleep) does not block the event loop
+        await asyncio.sleep(300)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/docs/examples/capsolver_captcha_solver/capsolver_extension_integration/solve_recaptcha_v3.py
+++ b/docs/examples/capsolver_captcha_solver/capsolver_extension_integration/solve_recaptcha_v3.py
@@ -0,0 +1,36 @@
+import time
+import asyncio
+from crawl4ai import *
+
+
+# TODO: the user data directory that includes the capsolver extension
+user_data_dir = "/browser-profile/Default1"
+
+"""
+The capsolver extension supports more features, such as:
+    - Telling the extension when to start solving captcha.
+    - Calling functions to check whether the captcha has been solved, etc.
+Reference blog: https://docs.capsolver.com/guide/automation-tool-integration/
+"""
+
+browser_config = BrowserConfig(
+    verbose=True,
+    headless=False,
+    user_data_dir=user_data_dir,
+    use_persistent_context=True,
+)
+
+async def main():
+    """Open the reCAPTCHA v3 demo page with the module-level browser_config and keep it open."""
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        await crawler.arun(
+            url="https://recaptcha-demo.appspot.com/recaptcha-v3-request-scores.php",
+            cache_mode=CacheMode.BYPASS,
+            session_id="session_captcha_test",
+        )
+        # do something later; asyncio.sleep (unlike time.sleep) does not block the event loop
+        await asyncio.sleep(300)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/docs/md_v2/core/examples.md
+++ b/docs/md_v2/core/examples.md
@@ -56,13 +56,14 @@ This page provides a comprehensive list of example scripts that demonstrate vari

 ## Anti-Bot & Stealth Features

-| Example | Description | Link |
-|---------|-------------|------|
-| Stealth Mode Quick Start | Five practical examples showing how to use stealth mode for bypassing basic bot detection. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/stealth_mode_quick_start.py) |
+| Example                    | Description | Link |
+|----------------------------|-------------|------|
+| Stealth Mode Quick Start   | Five practical examples showing how to use stealth mode for bypassing basic bot detection. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/stealth_mode_quick_start.py) |
 | Stealth Mode Comprehensive | Comprehensive demonstration of stealth mode features with bot detection testing and comparisons. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/stealth_mode_example.py) |
-| Undetected Browser | Simple example showing how to use the undetected browser adapter. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/hello_world_undetected.py) |
-| Undetected Browser Demo | Basic demo comparing regular and undetected browser modes. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/undetected_simple_demo.py) |
-| Undetected Tests | Advanced tests comparing regular vs undetected browsers on various bot detection services. | [View Folder](https://github.com/unclecode/crawl4ai/tree/main/docs/examples/undetectability/) |
+| Undetected Browser         | Simple example showing how to use the undetected browser adapter. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/hello_world_undetected.py) |
+| Undetected Browser Demo    | Basic demo comparing regular and undetected browser modes. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/undetected_simple_demo.py) |
+| Undetected Tests           | Advanced tests comparing regular vs undetected browsers on various bot detection services. | [View Folder](https://github.com/unclecode/crawl4ai/tree/main/docs/examples/undetectability/) |
+| CapSolver Captcha Solver   | Seamlessly integrate with [CapSolver](https://www.capsolver.com/?utm_source=crawl4ai&utm_medium=github_pr&utm_campaign=crawl4ai_integration) to automatically solve reCAPTCHA v2/v3, Cloudflare Turnstile / Challenges, AWS WAF and more for uninterrupted scraping and automation. | [View Folder](https://github.com/unclecode/crawl4ai/tree/main/docs/examples/capsolver_captcha_solver/) |

 ## Customization & Security