From 7c2fd5202ec5ed0de4a63b8a891598eeac28e627 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Thu, 1 May 2025 18:27:03 +0530 Subject: [PATCH 01/11] fix: incorrect params and commands in linkedin app readme --- docs/apps/linkdin/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/apps/linkdin/README.md b/docs/apps/linkdin/README.md index cce244ac..d14c5352 100644 --- a/docs/apps/linkdin/README.md +++ b/docs/apps/linkdin/README.md @@ -21,7 +21,7 @@ pip install crawl4ai openai sentence-transformers networkx pandas vis-network ri ### 1.2  Create / warm a LinkedIn browser profile ```bash -crwl profiler +crwl profiles ``` 1. The interactive shell shows **New profile** – hit **enter**. 2. Choose a name, e.g. `profile_linkedin_uc`. @@ -37,13 +37,13 @@ crwl profiler python c4ai_discover.py full \ --query "health insurance management" \ --geo 102713980 \ # Malaysia geoUrn - --title_filters "" \ # or "Product,Engineering" - --max_companies 10 \ # default set small for workshops - --max_people 20 \ # \^ same + --title-filters "" \ # or "Product,Engineering" + --max-companies 10 \ # default set small for workshops + --max-people 20 \ # \^ same --profile-name profile_linkedin_uc \ --outdir ./data \ --concurrency 2 \ - --log_level debug + --log-level debug ``` **Outputs** in `./data/`: * `companies.jsonl` – one JSON per company From baf7f6a6f52e9da50502b1da0f200dda02187896 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Fri, 2 May 2025 16:33:11 +0530 Subject: [PATCH 02/11] fix: typo in readme --- docs/apps/linkdin/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/apps/linkdin/README.md b/docs/apps/linkdin/README.md index d14c5352..7fe61bd7 100644 --- a/docs/apps/linkdin/README.md +++ b/docs/apps/linkdin/README.md @@ -121,6 +121,6 @@ The page fetches `data/company_graph.json` and the `org_chart_*.json` files auto --- ### TL;DR -`crwl profiler` → `c4ai_discover.py` → `c4ai_insights.py` → open 
`graph_view_template.html`. +`crwl profiles` → `c4ai_discover.py` → `c4ai_insights.py` → open `graph_view_template.html`. Live long and `import crawl4ai`. From 5cc58f9bb3ac049df3cb7a2ee31680b3aec30a41 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Fri, 2 May 2025 16:40:58 +0530 Subject: [PATCH 03/11] fix: 1. duplicate verbose flag 2. inconsistency in argument name --profile-name 3. duplicate initialisation of env_defaults --- docs/apps/linkdin/c4ai_discover.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/docs/apps/linkdin/c4ai_discover.py b/docs/apps/linkdin/c4ai_discover.py index 82874568..dca2fa69 100644 --- a/docs/apps/linkdin/c4ai_discover.py +++ b/docs/apps/linkdin/c4ai_discover.py @@ -272,7 +272,7 @@ def build_arg_parser() -> argparse.ArgumentParser: parser.add_argument("--title-filters", default="Product,Engineering", help="comma list of job keywords") parser.add_argument("--max-companies", type=int, default=1000) parser.add_argument("--max-people", type=int, default=500) - parser.add_argument("--profile-path", default=str(pathlib.Path.home() / ".crawl4ai/profiles/profile_linkedin_uc")) + parser.add_argument("--profile-name", default=str(pathlib.Path.home() / ".crawl4ai/profiles/profile_linkedin_uc")) parser.add_argument("--outdir", default="./output") parser.add_argument("--concurrency", type=int, default=4) parser.add_argument("--log-level", default="info", choices=["debug", "info", "warn", "error"]) @@ -355,8 +355,7 @@ async def async_main(opts): user_agent_generator_config= { "platforms": "mobile", "os": "Android" - }, - verbose=False, + } ) crawler = AsyncWebCrawler(config=bc) @@ -366,7 +365,7 @@ async def async_main(opts): # crawler = await next_crawler().start() try: # Build LinkedIn search URL - search_url = f"https://www.linkedin.com/search/results/companies/?keywords={quote(opts.query)}&geoUrn={opts.geo}" + search_url = 
f'https://www.linkedin.com/search/results/companies/?keywords={quote(opts.query)}&companyHqGeo="{opts.geo}"' logging.info("Seed URL => %s", search_url) companies: List[Dict] = [] @@ -425,14 +424,13 @@ def main(): if cli_opts.debug: opts = detect_debug_defaults(force=True) else: - env_defaults = detect_debug_defaults() env_defaults = detect_debug_defaults() opts = env_defaults if env_defaults else cli_opts if not getattr(opts, "cmd", None): opts.cmd = "full" - exit_code = asyncio.run(async_main(opts)) + exit_code = asyncio.run(async_main(cli_opts)) sys.exit(exit_code) From 6650b2f34a1849d01e00ca1bcce5772ebaf7cc54 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Fri, 2 May 2025 16:51:15 +0530 Subject: [PATCH 04/11] fix: replace openAI with litellm to support multiple llm providers --- .gitignore | 4 ++- docs/apps/linkdin/c4ai_insights.py | 55 ++++++++++++++++-------------- 2 files changed, 32 insertions(+), 27 deletions(-) diff --git a/.gitignore b/.gitignore index 1658a987..6a118cba 100644 --- a/.gitignore +++ b/.gitignore @@ -261,4 +261,6 @@ CLAUDE.md tests/**/test_site tests/**/reports -tests/**/benchmark_reports \ No newline at end of file +tests/**/benchmark_reports + +docs/**/data \ No newline at end of file diff --git a/docs/apps/linkdin/c4ai_insights.py b/docs/apps/linkdin/c4ai_insights.py index 8307c30d..94370258 100644 --- a/docs/apps/linkdin/c4ai_insights.py +++ b/docs/apps/linkdin/c4ai_insights.py @@ -43,7 +43,7 @@ import numpy as np import pandas as pd import hashlib -from openai import OpenAI # same SDK you pre-loaded +from litellm import completion #Support any LLM Provider # ─────────────────────────────────────────────────────────────────────────────── # Utils @@ -70,11 +70,12 @@ def dev_defaults() -> SimpleNamespace: out_dir="./insights_debug", embed_model="all-MiniLM-L6-v2", top_k=10, - openai_model="gpt-4.1", + llm_provider="openai/gpt-4.1", + llm_api_key=None, max_llm_tokens=8000, llm_temperature=1.0, - workers=4, # parallel processing - 
stub=False, # manual + workers=4, + stub=False ) # ─────────────────────────────────────────────────────────────────────────────── @@ -166,7 +167,7 @@ def build_company_graph(companies, embeds:np.ndarray, top_k:int) -> Dict[str,Any # ─────────────────────────────────────────────────────────────────────────────── # Org-chart via LLM # ─────────────────────────────────────────────────────────────────────────────── -async def infer_org_chart_llm(company, people, client:OpenAI, model_name:str, max_tokens:int, temperature:float, stub:bool): +async def infer_org_chart_llm(company, people, llm_provider:str, api_key:str, max_tokens:int, temperature:float, stub:bool): if stub: # Tiny fake org-chart when debugging offline chief = random.choice(people) @@ -202,15 +203,19 @@ Here is a JSON list of employees: Return JSON: {{ "nodes":[{{id,name,title,dept,yoe_total,yoe_current,seniority_score,decision_score,avatar_url,profile_url}}], "edges":[{{source,target,type,confidence}}] }} """} ] - resp = client.chat.completions.create( - model=model_name, + resp = completion( + model=llm_provider, messages=prompt, max_tokens=max_tokens, temperature=temperature, - response_format={"type":"json_object"} + response_format={"type":"json_object"}, + api_key=api_key ) chart = json.loads(resp.choices[0].message.content) - chart["meta"] = dict(model=model_name, generated_at=datetime.now(UTC).isoformat()) + chart["meta"] = dict( + model=llm_provider, + generated_at=datetime.now(UTC).isoformat() + ) return chart # ─────────────────────────────────────────────────────────────────────────────── @@ -270,15 +275,11 @@ async def run(opts): logging.info(f"[bold cyan]Loaded[/] {len(companies)} companies, {len(people)} people") logging.info("[bold]⇢[/] Embedding company descriptions…") - # embeds = embed_descriptions(companies, opts.embed_model, opts) + embeds = embed_descriptions(companies, opts.embed_model, opts) logging.info("[bold]⇢[/] Building similarity graph") - # company_graph = 
build_company_graph(companies, embeds, opts.top_k) - # dump_json(company_graph, out_dir/"company_graph.json") - - # OpenAI client (only built if not debugging) - stub = bool(opts.stub) - client = OpenAI() if not stub else None + company_graph = build_company_graph(companies, embeds, opts.top_k) + dump_json(company_graph, out_dir/"company_graph.json") # Filter companies that need processing to_process = [] @@ -311,14 +312,13 @@ async def run(opts): async def process_one(comp): handle = comp["handle"].strip("/").replace("/","_") persons = [p for p in people if p["company_handle"].strip("/") == comp["handle"].strip("/")] - chart = await infer_org_chart_llm( comp, persons, - client=client if client else OpenAI(api_key="sk-debug"), - model_name=opts.openai_model, + llm_provider=opts.llm_provider, + api_key=getattr(opts, 'llm_api_key', None), max_tokens=opts.max_llm_tokens, temperature=opts.llm_temperature, - stub=stub, + stub=opts.stub or False, ) chart["meta"]["company"] = comp["name"] @@ -354,18 +354,21 @@ def build_arg_parser(): p = argparse.ArgumentParser(description="Build graphs & visualisation from Stage-1 output") p.add_argument("--in", dest="in_dir", required=False, help="Stage-1 output dir", default=".") p.add_argument("--out", dest="out_dir", required=False, help="Destination dir", default=".") - p.add_argument("--embed_model", default="all-MiniLM-L6-v2") - p.add_argument("--top_k", type=int, default=10, help="Top-k neighbours per company") - p.add_argument("--openai_model", default="gpt-4.1") - p.add_argument("--max_llm_tokens", type=int, default=8024) - p.add_argument("--llm_temperature", type=float, default=1.0) + p.add_argument("--embed-model", default="all-MiniLM-L6-v2") + p.add_argument("--top-k", type=int, default=10, help="Top-k neighbours per company") + p.add_argument("--llm-provider", default="openai/gpt-4.1", + help="LLM model to use in format 'provider/model_name' (e.g., 'anthropic/claude-3')") + p.add_argument("--llm-api-key", help="API key for 
LLM provider (defaults to env vars)") + p.add_argument("--max-llm-tokens", type=int, default=8024) + p.add_argument("--llm-temperature", type=float, default=1.0) p.add_argument("--stub", action="store_true", help="Skip OpenAI call and generate tiny fake org charts") p.add_argument("--workers", type=int, default=4, help="Number of parallel workers for LLM inference") return p def main(): dbg = dev_defaults() - opts = dbg if True else build_arg_parser().parse_args() + # opts = dbg if True else build_arg_parser().parse_args() + opts = build_arg_parser().parse_args() asyncio.run(run(opts)) if __name__ == "__main__": From bd5a9ac632628f3e3b196ce86d3f5ed07f9414a0 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Fri, 2 May 2025 17:04:42 +0530 Subject: [PATCH 05/11] updated readme with arguments for litellm --- docs/apps/linkdin/README.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/apps/linkdin/README.md b/docs/apps/linkdin/README.md index 7fe61bd7..0441bf70 100644 --- a/docs/apps/linkdin/README.md +++ b/docs/apps/linkdin/README.md @@ -69,11 +69,12 @@ _See more: – t python c4ai_insights.py \ --in ./data \ --out ./data \ - --embed_model all-MiniLM-L6-v2 \ - --top_k 10 \ - --openai_model gpt-4.1 \ - --max_llm_tokens 8024 \ - --llm_temperature 1.0 \ + --embed_model all-MiniLM-L6-v2 \ + --llm-provider gemini/gemini-2.0-flash \ + --llm-api-key "" \ + --top-k 10 \ + --max-llm-tokens 8024 \ + --llm-temperature 1.0 \ --workers 4 ``` Emits next to the Stage‑1 files: From 87d4b0fff4d39dec9ab440c2e4f11b1ff0b4f6e3 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Fri, 2 May 2025 17:21:09 +0530 Subject: [PATCH 06/11] format bash scripts properly so copy & paste may work without issues --- docs/apps/linkdin/README.md | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/docs/apps/linkdin/README.md b/docs/apps/linkdin/README.md index 0441bf70..3b4ab504 100644 --- a/docs/apps/linkdin/README.md +++ 
b/docs/apps/linkdin/README.md @@ -34,15 +34,15 @@ crwl profiles ## 2  Discovery – scrape companies & people ```bash -python c4ai_discover.py full \ - --query "health insurance management" \ +python c4ai_discover.py full \ + --query "health insurance management" \ --geo 102713980 \ # Malaysia geoUrn --title-filters "" \ # or "Product,Engineering" --max-companies 10 \ # default set small for workshops --max-people 20 \ # \^ same - --profile-name profile_linkedin_uc \ - --outdir ./data \ - --concurrency 2 \ + --profile-name profile_linkedin_uc \ + --outdir ./data \ + --concurrency 2 \ --log-level debug ``` **Outputs** in `./data/`: @@ -66,15 +66,15 @@ _See more: – t ## 3  Insights – embeddings, org‑charts, decision makers ```bash -python c4ai_insights.py \ - --in ./data \ - --out ./data \ - --embed_model all-MiniLM-L6-v2 \ - --llm-provider gemini/gemini-2.0-flash \ - --llm-api-key "" \ - --top-k 10 \ - --max-llm-tokens 8024 \ - --llm-temperature 1.0 \ +python c4ai_insights.py \ + --in ./data \ + --out ./data \ + --embed-model all-MiniLM-L6-v2 \ + --llm-provider gemini/gemini-2.0-flash \ + --llm-api-key "" \ + --top-k 10 \ + --max-llm-tokens 8024 \ + --llm-temperature 1.0 \ --workers 4 ``` Emits next to the Stage‑1 files: From 38ebcbb304b806b81577e50befbc82d6a58f5d15 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Mon, 5 May 2025 10:34:38 +0530 Subject: [PATCH 07/11] fix: provide support for local llm by adding it to the arguments --- docs/apps/linkdin/c4ai_insights.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/apps/linkdin/c4ai_insights.py b/docs/apps/linkdin/c4ai_insights.py index 94370258..28b14cd8 100644 --- a/docs/apps/linkdin/c4ai_insights.py +++ b/docs/apps/linkdin/c4ai_insights.py @@ -74,8 +74,7 @@ def dev_defaults() -> SimpleNamespace: llm_api_key=None, max_llm_tokens=8000, llm_temperature=1.0, - workers=4, - stub=False + workers=4 ) # ─────────────────────────────────────────────────────────────────────────────── 
@@ -167,7 +166,7 @@ def build_company_graph(companies, embeds:np.ndarray, top_k:int) -> Dict[str,Any # ─────────────────────────────────────────────────────────────────────────────── # Org-chart via LLM # ─────────────────────────────────────────────────────────────────────────────── -async def infer_org_chart_llm(company, people, llm_provider:str, api_key:str, max_tokens:int, temperature:float, stub:bool): +async def infer_org_chart_llm(company, people, llm_provider:str, api_key:str, max_tokens:int, temperature:float, stub:bool=False, base_url:str=None): if stub: # Tiny fake org-chart when debugging offline chief = random.choice(people) @@ -209,7 +208,8 @@ Return JSON: {{ "nodes":[{{id,name,title,dept,yoe_total,yoe_current,seniority_sc max_tokens=max_tokens, temperature=temperature, response_format={"type":"json_object"}, - api_key=api_key + api_key=api_key, + base_url=base_url ) chart = json.loads(resp.choices[0].message.content) chart["meta"] = dict( @@ -315,10 +315,11 @@ async def run(opts): chart = await infer_org_chart_llm( comp, persons, llm_provider=opts.llm_provider, - api_key=getattr(opts, 'llm_api_key', None), + api_key=opts.llm_api_key or None, max_tokens=opts.max_llm_tokens, temperature=opts.llm_temperature, stub=opts.stub or False, + base_url=opts.llm_base_url or None ) chart["meta"]["company"] = comp["name"] @@ -359,6 +360,7 @@ def build_arg_parser(): p.add_argument("--llm-provider", default="openai/gpt-4.1", help="LLM model to use in format 'provider/model_name' (e.g., 'anthropic/claude-3')") p.add_argument("--llm-api-key", help="API key for LLM provider (defaults to env vars)") + p.add_argument("--llm-base-url", help="Base URL for LLM API endpoint") p.add_argument("--max-llm-tokens", type=int, default=8024) p.add_argument("--llm-temperature", type=float, default=1.0) p.add_argument("--stub", action="store_true", help="Skip OpenAI call and generate tiny fake org charts") From aaf05910ebef9f14a7a30673fa33c44bb1e94ffc Mon Sep 17 00:00:00 2001 From: 
Aravind Karnam Date: Tue, 6 May 2025 15:53:55 +0530 Subject: [PATCH 08/11] fix: removed unnecessary imports and installs --- docs/apps/linkdin/README.md | 2 +- docs/apps/linkdin/c4ai_discover.py | 1 - docs/apps/linkdin/c4ai_insights.py | 4 ++-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/apps/linkdin/README.md b/docs/apps/linkdin/README.md index 3b4ab504..57f97815 100644 --- a/docs/apps/linkdin/README.md +++ b/docs/apps/linkdin/README.md @@ -16,7 +16,7 @@ prospect‑wizard/ ### 1.1  Install dependencies ```bash -pip install crawl4ai openai sentence-transformers networkx pandas vis-network rich +pip install crawl4ai litellm sentence-transformers pandas rich ``` ### 1.2  Create / warm a LinkedIn browser profile diff --git a/docs/apps/linkdin/c4ai_discover.py b/docs/apps/linkdin/c4ai_discover.py index dca2fa69..ac6d2783 100644 --- a/docs/apps/linkdin/c4ai_discover.py +++ b/docs/apps/linkdin/c4ai_discover.py @@ -43,7 +43,6 @@ from rich.console import Console from rich.logging import RichHandler from datetime import datetime, UTC -from itertools import cycle from textwrap import dedent from types import SimpleNamespace from typing import Dict, List, Optional diff --git a/docs/apps/linkdin/c4ai_insights.py b/docs/apps/linkdin/c4ai_insights.py index 28b14cd8..60348f43 100644 --- a/docs/apps/linkdin/c4ai_insights.py +++ b/docs/apps/linkdin/c4ai_insights.py @@ -20,7 +20,7 @@ from __future__ import annotations # Imports & Third-party # ─────────────────────────────────────────────────────────────────────────────── -import argparse, asyncio, json, os, sys, pathlib, random, time, csv +import argparse, asyncio, json, pathlib, random from datetime import datetime, UTC from types import SimpleNamespace from pathlib import Path @@ -30,7 +30,7 @@ from rich.console import Console from rich.logging import RichHandler from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn import logging -from jinja2 import Environment, 
FileSystemLoader, select_autoescape + BASE_DIR = pathlib.Path(__file__).resolve().parent From 206a9dfabdcd4a292f49bcd818569497c78d1886 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 8 May 2025 17:13:35 +0800 Subject: [PATCH 09/11] feat(crawler): add session management and view-source support Add session_id feature to allow reusing browser pages across multiple crawls. Add support for view-source: protocol in URL handling. Fix browser config reference and string formatting issues. Update examples to demonstrate new session management features. BREAKING CHANGE: Browser page handling now persists when using session_id --- crawl4ai/async_crawler_strategy.py | 4 +-- crawl4ai/browser_profiler.py | 6 ++--- docs/examples/hello_world.py | 10 +++++--- docs/examples/session_id_example.py | 38 +++++++++++++++++++++++++++++ 4 files changed, 49 insertions(+), 9 deletions(-) create mode 100644 docs/examples/session_id_example.py diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 0d62c7b5..153205db 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -441,7 +441,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): status_code = 200 # Default for local/raw HTML screenshot_data = None - if url.startswith(("http://", "https://")): + if url.startswith(("http://", "https://", "view-source:")): return await self._crawl_web(url, config) elif url.startswith("file://"): @@ -784,7 +784,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): except Error: visibility_info = await self.check_visibility(page) - if self.config.verbose: + if self.browser_config.config.verbose: self.logger.debug( message="Body visibility info: {info}", tag="DEBUG", diff --git a/crawl4ai/browser_profiler.py b/crawl4ai/browser_profiler.py index 5a95b25d..961ba740 100644 --- a/crawl4ai/browser_profiler.py +++ b/crawl4ai/browser_profiler.py @@ -128,7 +128,7 @@ class BrowserProfiler: os.makedirs(profile_path, 
exist_ok=True) # Print instructions for the user with rich formatting - border = "{'='*80}" + border = f"{'='*80}" self.logger.info("{border}", tag="PROFILE", params={"border": f"\n{border}"}, colors={"border": LogColor.CYAN}) self.logger.info("Creating browser profile: {profile_name}", tag="PROFILE", params={"profile_name": profile_name}, colors={"profile_name": LogColor.GREEN}) self.logger.info("Profile directory: {profile_path}", tag="PROFILE", params={"profile_path": profile_path}, colors={"profile_path": LogColor.YELLOW}) @@ -607,8 +607,8 @@ class BrowserProfiler: os.makedirs(profile_path, exist_ok=True) # Print initial information - border = f"{Fore.CYAN}{'='*80}{Style.RESET_ALL}" - self.logger.info(f"\n{border}", tag="CDP") + border = f"{'='*80}" + self.logger.info("{border}", tag="CDP", params={"border": border}, colors={"border": LogColor.CYAN}) self.logger.info("Launching standalone browser with CDP debugging", tag="CDP") self.logger.info("Browser type: {browser_type}", tag="CDP", params={"browser_type": browser_type}, colors={"browser_type": LogColor.CYAN}) self.logger.info("Profile path: {profile_path}", tag="CDP", params={"profile_path": profile_path}, colors={"profile_path": LogColor.YELLOW}) diff --git a/docs/examples/hello_world.py b/docs/examples/hello_world.py index b9f1b328..2ba2e852 100644 --- a/docs/examples/hello_world.py +++ b/docs/examples/hello_world.py @@ -8,17 +8,19 @@ from crawl4ai import ( CrawlResult ) - -async def main(): - browser_config = BrowserConfig(headless=True, verbose=True) +async def main(): + browser_config = BrowserConfig( + headless=False, + verbose=True, + ) async with AsyncWebCrawler(config=browser_config) as crawler: crawler_config = CrawlerRunConfig( markdown_generator=DefaultMarkdownGenerator( content_filter=PruningContentFilter() ), ) - result : CrawlResult = await crawler.arun( + result: CrawlResult = await crawler.arun( url="https://www.helloworld.org", config=crawler_config ) 
print(result.markdown.raw_markdown[:500]) diff --git a/docs/examples/session_id_example.py b/docs/examples/session_id_example.py new file mode 100644 index 00000000..e49b7819 --- /dev/null +++ b/docs/examples/session_id_example.py @@ -0,0 +1,38 @@ +import asyncio +from crawl4ai import ( + AsyncWebCrawler, + BrowserConfig, + CrawlerRunConfig, + DefaultMarkdownGenerator, + PruningContentFilter, + CrawlResult +) + + + +async def main(): + browser_config = BrowserConfig( + headless=False, + verbose=True, + ) + async with AsyncWebCrawler(config=browser_config) as crawler: + crawler_config = CrawlerRunConfig( + session_id= "hello_world", # This help us to use the same page + ) + result : CrawlResult = await crawler.arun( + url="https://www.helloworld.org", config=crawler_config + ) + # Add a breakpoint here, then you will the page is open and browser is not closed + print(result.markdown.raw_markdown[:500]) + + new_config = crawler_config.clone(js_code=["(() => ({'data':'hello'}))()"], js_only=True) + result : CrawlResult = await crawler.arun( # This time there is no fetch and this only executes JS in the same opened page + url="https://www.helloworld.org", config= new_config + ) + print(result.js_execution_result) # You should see {'data':'hello'} in the console + + # Get direct access to Playwright paege object. This works only if you use the same session_id and pass same config + page, context = crawler.crawler_strategy.get_page(new_config) + +if __name__ == "__main__": + asyncio.run(main()) From a3e9ef91ad6c02972ff612a957b9135c7e965a52 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Mon, 12 May 2025 21:17:57 +0800 Subject: [PATCH 10/11] fix(crawler): remove automatic page closure in screenshot methods Removes automatic page closure in take_screenshot and take_screenshot_naive methods to prevent premature closure of pages that might still be needed in the calling context. This allows for more flexible page lifecycle management by the caller. 
BREAKING CHANGE: Page objects are no longer automatically closed after taking screenshots. Callers must explicitly handle page closure when appropriate. --- crawl4ai/__version__.py | 2 +- crawl4ai/async_crawler_strategy.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 1be2ccd8..799c3b53 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,3 +1,3 @@ # crawl4ai/_version.py -__version__ = "0.6.3" +__version__ = "0.6.4" diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 153205db..10d395ee 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -1467,8 +1467,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): buffered = BytesIO() img.save(buffered, format="JPEG") return base64.b64encode(buffered.getvalue()).decode("utf-8") - finally: - await page.close() + # finally: + # await page.close() async def take_screenshot_naive(self, page: Page) -> str: """ @@ -1501,8 +1501,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): buffered = BytesIO() img.save(buffered, format="JPEG") return base64.b64encode(buffered.getvalue()).decode("utf-8") - finally: - await page.close() + # finally: + # await page.close() async def export_storage_state(self, path: str = None) -> dict: """ From 897e0173618d20fea5d8952ccdbcdad0febc0fee Mon Sep 17 00:00:00 2001 From: UncleCode Date: Mon, 12 May 2025 21:20:10 +0800 Subject: [PATCH 11/11] Set version to 0.6.3 --- crawl4ai/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 799c3b53..1be2ccd8 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,3 +1,3 @@ # crawl4ai/_version.py -__version__ = "0.6.4" +__version__ = "0.6.3"