From 7dfe528d43670163faa5cd89d47520753f44a12d Mon Sep 17 00:00:00 2001 From: Soham Kukreti Date: Fri, 3 Oct 2025 22:00:46 +0530 Subject: [PATCH] fix(docs): standardize C4A-Script tutorial, add CLI identity-based crawling, and add sponsorship CTA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Switch installs to pip install -r requirements.txt (tutorial and app docs) - Update local run steps to python server.py and http://localhost:8000 - Set default PORT to 8000; update port-in-use commands and alt port 8001 - Replace unsupported :contains() example with accessible attribute selector - Update example URLs in tutorial servers to 127.0.0.1:8000 - Add “Identity-based crawling” section with crwl profiles CLI workflow and code usage - Replace legacy-docs note with sponsorship message in docs/md_v2/index.md - Minor copy and consistency fixes across pages --- docs/examples/c4a_script/tutorial/README.md | 10 +++--- docs/examples/c4a_script/tutorial/server.py | 2 +- .../md_v2/advanced/identity-based-crawling.md | 36 +++++++++++++++++++ docs/md_v2/apps/c4a-script/README.md | 10 +++--- docs/md_v2/apps/c4a-script/server.py | 4 +-- docs/md_v2/core/c4a-script.md | 10 +++--- docs/md_v2/index.md | 2 +- 7 files changed, 55 insertions(+), 19 deletions(-) diff --git a/docs/examples/c4a_script/tutorial/README.md b/docs/examples/c4a_script/tutorial/README.md index 81f855ee..2d6940bb 100644 --- a/docs/examples/c4a_script/tutorial/README.md +++ b/docs/examples/c4a_script/tutorial/README.md @@ -18,7 +18,7 @@ A comprehensive web-based tutorial for learning and experimenting with C4A-Scrip 2. **Install Dependencies** ```bash - pip install flask + pip install -r requirements.txt ``` 3. **Launch the Server** @@ -28,7 +28,7 @@ A comprehensive web-based tutorial for learning and experimenting with C4A-Scrip 4. 
**Open in Browser** ``` - http://localhost:8080 + http://localhost:8000 ``` **🌐 Try Online**: [Live Demo](https://docs.crawl4ai.com/c4a-script/demo) @@ -325,7 +325,7 @@ Powers the recording functionality: ### Configuration ```python # server.py configuration -PORT = 8080 +PORT = 8000 DEBUG = True THREADED = True ``` @@ -343,9 +343,9 @@ THREADED = True **Port Already in Use** ```bash # Kill existing process -lsof -ti:8080 | xargs kill -9 +lsof -ti:8000 | xargs kill -9 # Or use different port -python server.py --port 8081 +python server.py --port 8001 ``` **Blockly Not Loading** diff --git a/docs/examples/c4a_script/tutorial/server.py b/docs/examples/c4a_script/tutorial/server.py index f9cb81e9..2537e4c3 100644 --- a/docs/examples/c4a_script/tutorial/server.py +++ b/docs/examples/c4a_script/tutorial/server.py @@ -216,7 +216,7 @@ def get_examples(): 'name': 'Handle Cookie Banner', 'description': 'Accept cookies and close newsletter popup', 'script': '''# Handle cookie banner and newsletter -GO http://127.0.0.1:8080/playground/ +GO http://127.0.0.1:8000/playground/ WAIT `body` 2 IF (EXISTS `.cookie-banner`) THEN CLICK `.accept` IF (EXISTS `.newsletter-popup`) THEN CLICK `.close`''' diff --git a/docs/md_v2/advanced/identity-based-crawling.md b/docs/md_v2/advanced/identity-based-crawling.md index 3864f840..2b155857 100644 --- a/docs/md_v2/advanced/identity-based-crawling.md +++ b/docs/md_v2/advanced/identity-based-crawling.md @@ -82,6 +82,42 @@ If you installed Crawl4AI (which installs Playwright under the hood), you alread --- +### Creating a Profile Using the Crawl4AI CLI (Easiest) + +If you prefer a guided, interactive setup, use the built-in CLI to create and manage persistent browser profiles. + +1. Launch the profile manager: + ```bash + crwl profiles + ``` + +2. Choose "Create new profile" and enter a profile name. A Chromium window opens so you can log in to sites and configure settings. When finished, return to the terminal and press `q` to save the profile. 
+ +3. Profiles are saved under `~/.crawl4ai/profiles/` (for example: `/home/<username>/.crawl4ai/profiles/test_profile_1`) along with a `storage_state.json` for cookies and session data. + +4. Optionally, choose "List profiles" in the CLI to view available profiles and their paths. + +5. Use the saved path with `BrowserConfig.user_data_dir`: + ```python + from crawl4ai import AsyncWebCrawler, BrowserConfig + + profile_path = "/home/<username>/.crawl4ai/profiles/test_profile_1" + + browser_config = BrowserConfig( + headless=True, + use_managed_browser=True, + user_data_dir=profile_path, + browser_type="chromium", + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url="https://example.com/private") + ``` + +The CLI also supports listing and deleting profiles, and even testing a crawl directly from the menu. + +--- + ## 3. Using Managed Browsers in Crawl4AI Once you have a data directory with your session data, pass it to **`BrowserConfig`**: diff --git a/docs/md_v2/apps/c4a-script/README.md b/docs/md_v2/apps/c4a-script/README.md index 81f855ee..2d6940bb 100644 --- a/docs/md_v2/apps/c4a-script/README.md +++ b/docs/md_v2/apps/c4a-script/README.md @@ -18,7 +18,7 @@ A comprehensive web-based tutorial for learning and experimenting with C4A-Scrip 2. **Install Dependencies** ```bash - pip install flask + pip install -r requirements.txt ``` 3. **Launch the Server** @@ -28,7 +28,7 @@ A comprehensive web-based tutorial for learning and experimenting with C4A-Scrip 4. 
**Open in Browser** ``` - http://localhost:8080 + http://localhost:8000 ``` **🌐 Try Online**: [Live Demo](https://docs.crawl4ai.com/c4a-script/demo) @@ -325,7 +325,7 @@ Powers the recording functionality: ### Configuration ```python # server.py configuration -PORT = 8080 +PORT = 8000 DEBUG = True THREADED = True ``` @@ -343,9 +343,9 @@ THREADED = True **Port Already in Use** ```bash # Kill existing process -lsof -ti:8080 | xargs kill -9 +lsof -ti:8000 | xargs kill -9 # Or use different port -python server.py --port 8081 +python server.py --port 8001 ``` **Blockly Not Loading** diff --git a/docs/md_v2/apps/c4a-script/server.py b/docs/md_v2/apps/c4a-script/server.py index 6242789d..2537e4c3 100644 --- a/docs/md_v2/apps/c4a-script/server.py +++ b/docs/md_v2/apps/c4a-script/server.py @@ -216,7 +216,7 @@ def get_examples(): 'name': 'Handle Cookie Banner', 'description': 'Accept cookies and close newsletter popup', 'script': '''# Handle cookie banner and newsletter -GO http://127.0.0.1:8080/playground/ +GO http://127.0.0.1:8000/playground/ WAIT `body` 2 IF (EXISTS `.cookie-banner`) THEN CLICK `.accept` IF (EXISTS `.newsletter-popup`) THEN CLICK `.close`''' @@ -283,7 +283,7 @@ WAIT `.success-message` 5''' return jsonify(examples) if __name__ == '__main__': - port = int(os.environ.get('PORT', 8080)) + port = int(os.environ.get('PORT', 8000)) print(f""" ╔══════════════════════════════════════════════════════════╗ ║ C4A-Script Interactive Tutorial Server ║ diff --git a/docs/md_v2/core/c4a-script.md b/docs/md_v2/core/c4a-script.md index d92e426e..1af3da4e 100644 --- a/docs/md_v2/core/c4a-script.md +++ b/docs/md_v2/core/c4a-script.md @@ -69,12 +69,12 @@ The tutorial includes a Flask-based web interface with: cd docs/examples/c4a_script/tutorial/ # Install dependencies -pip install flask +pip install -r requirements.txt # Launch the tutorial server -python app.py +python server.py -# Open http://localhost:5000 in your browser +# Open http://localhost:8000 in your browser 
``` ## Core Concepts @@ -111,8 +111,8 @@ CLICK `.submit-btn` # By attribute CLICK `button[type="submit"]` -# By text content -CLICK `button:contains("Sign In")` +# By accessible attributes +CLICK `button[aria-label="Search"][title="Search"]` # Complex selectors CLICK `.form-container input[name="email"]` diff --git a/docs/md_v2/index.md b/docs/md_v2/index.md index d497ca89..e7566e7b 100644 --- a/docs/md_v2/index.md +++ b/docs/md_v2/index.md @@ -57,7 +57,7 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for large language models, AI agents, and data pipelines. Fully open source, flexible, and built for real-time performance, **Crawl4AI** empowers developers with unmatched speed, precision, and deployment ease. -> **Note**: If you're looking for the old documentation, you can access it [here](https://old.docs.crawl4ai.com). +> Enjoy using Crawl4AI? Consider **[becoming a sponsor](https://github.com/sponsors/unclecode)** to support ongoing development and community growth! ## 🎯 New: Adaptive Web Crawling