Merge next branch

This commit is contained in:
Aravind Karnam
2025-05-13 17:12:11 +05:30
8 changed files with 113 additions and 69 deletions

1
.gitignore vendored
View File

@@ -263,4 +263,5 @@ tests/**/test_site
tests/**/reports
tests/**/benchmark_reports
docs/**/data
.codecat/

View File

@@ -441,7 +441,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
status_code = 200 # Default for local/raw HTML
screenshot_data = None
if url.startswith(("http://", "https://")):
if url.startswith(("http://", "https://", "view-source:")):
return await self._crawl_web(url, config)
elif url.startswith("file://"):
@@ -816,7 +816,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
except Error:
visibility_info = await self.check_visibility(page)
if self.verbose:
if self.browser_config.config.verbose:
self.logger.debug(
message="Body visibility info: {info}",
tag="DEBUG",
@@ -1519,8 +1519,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
buffered = BytesIO()
img.save(buffered, format="JPEG")
return base64.b64encode(buffered.getvalue()).decode("utf-8")
finally:
await page.close()
# finally:
# await page.close()
async def take_screenshot_naive(self, page: Page) -> str:
"""
@@ -1553,8 +1553,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
buffered = BytesIO()
img.save(buffered, format="JPEG")
return base64.b64encode(buffered.getvalue()).decode("utf-8")
finally:
await page.close()
# finally:
# await page.close()
async def export_storage_state(self, path: str = None) -> dict:
"""

View File

@@ -128,7 +128,7 @@ class BrowserProfiler:
os.makedirs(profile_path, exist_ok=True)
# Print instructions for the user with rich formatting
border = "{'='*80}"
border = f"{'='*80}"
self.logger.info("{border}", tag="PROFILE", params={"border": f"\n{border}"}, colors={"border": LogColor.CYAN})
self.logger.info("Creating browser profile: {profile_name}", tag="PROFILE", params={"profile_name": profile_name}, colors={"profile_name": LogColor.GREEN})
self.logger.info("Profile directory: {profile_path}", tag="PROFILE", params={"profile_path": profile_path}, colors={"profile_path": LogColor.YELLOW})
@@ -607,8 +607,8 @@ class BrowserProfiler:
os.makedirs(profile_path, exist_ok=True)
# Print initial information
border = f"{Fore.CYAN}{'='*80}{Style.RESET_ALL}"
self.logger.info(f"\n{border}", tag="CDP")
border = f"{'='*80}"
self.logger.info("{border}", tag="CDP", params={"border": border}, colors={"border": LogColor.CYAN})
self.logger.info("Launching standalone browser with CDP debugging", tag="CDP")
self.logger.info("Browser type: {browser_type}", tag="CDP", params={"browser_type": browser_type}, colors={"browser_type": LogColor.CYAN})
self.logger.info("Profile path: {profile_path}", tag="CDP", params={"profile_path": profile_path}, colors={"profile_path": LogColor.YELLOW})

View File

@@ -16,12 +16,12 @@ prospectwizard/
### 1.1  Install dependencies
```bash
pip install crawl4ai openai sentence-transformers networkx pandas vis-network rich
pip install crawl4ai litellm sentence-transformers pandas rich
```
### 1.2  Create / warm a LinkedIn browser profile
```bash
crwl profiler
crwl profiles
```
1. The interactive shell shows **New profile** — hit **enter**.
2. Choose a name, e.g. `profile_linkedin_uc`.
@@ -37,13 +37,13 @@ crwl profiler
python c4ai_discover.py full \
--query "health insurance management" \
--geo 102713980 \ # Malaysia geoUrn
--title_filters "" \ # or "Product,Engineering"
--max_companies 10 \ # default set small for workshops
--max_people 20 \ # \^ same
--title-filters "" \ # or "Product,Engineering"
--max-companies 10 \ # default set small for workshops
--max-people 20 \ # \^ same
--profile-name profile_linkedin_uc \
--outdir ./data \
--concurrency 2 \
--log_level debug
--log-level debug
```
**Outputs** in `./data/`:
* `companies.jsonl` one JSON per company
@@ -67,13 +67,14 @@ _See more: <https://www.linkedin.com/search/results/companies/?geoUrn=XXX> t
```bash
python c4ai_insights.py \
--in ./data \
--in ./data \
--out ./data \
--embed_model all-MiniLM-L6-v2 \
--top_k 10 \
--openai_model gpt-4.1 \
--max_llm_tokens 8024 \
--llm_temperature 1.0 \
--embed-model all-MiniLM-L6-v2 \
--llm-provider gemini/gemini-2.0-flash \
--llm-api-key "" \
--top-k 10 \
--max-llm-tokens 8024 \
--llm-temperature 1.0 \
--workers 4
```
Emits next to the Stage1 files:
@@ -121,6 +122,6 @@ The page fetches `data/company_graph.json` and the `org_chart_*.json` files auto
---
### TL;DR
`crwl profiler` → `c4ai_discover.py` → `c4ai_insights.py` → open `graph_view_template.html`.
`crwl profiles` → `c4ai_discover.py` → `c4ai_insights.py` → open `graph_view_template.html`.
Live long and `import crawl4ai`.

View File

@@ -43,7 +43,6 @@ from rich.console import Console
from rich.logging import RichHandler
from datetime import datetime, UTC
from itertools import cycle
from textwrap import dedent
from types import SimpleNamespace
from typing import Dict, List, Optional
@@ -272,7 +271,7 @@ def build_arg_parser() -> argparse.ArgumentParser:
parser.add_argument("--title-filters", default="Product,Engineering", help="comma list of job keywords")
parser.add_argument("--max-companies", type=int, default=1000)
parser.add_argument("--max-people", type=int, default=500)
parser.add_argument("--profile-path", default=str(pathlib.Path.home() / ".crawl4ai/profiles/profile_linkedin_uc"))
parser.add_argument("--profile-name", default=str(pathlib.Path.home() / ".crawl4ai/profiles/profile_linkedin_uc"))
parser.add_argument("--outdir", default="./output")
parser.add_argument("--concurrency", type=int, default=4)
parser.add_argument("--log-level", default="info", choices=["debug", "info", "warn", "error"])
@@ -355,8 +354,7 @@ async def async_main(opts):
user_agent_generator_config= {
"platforms": "mobile",
"os": "Android"
},
verbose=False,
}
)
crawler = AsyncWebCrawler(config=bc)
@@ -366,7 +364,7 @@ async def async_main(opts):
# crawler = await next_crawler().start()
try:
# Build LinkedIn search URL
search_url = f"https://www.linkedin.com/search/results/companies/?keywords={quote(opts.query)}&geoUrn={opts.geo}"
search_url = f'https://www.linkedin.com/search/results/companies/?keywords={quote(opts.query)}&companyHqGeo="{opts.geo}"'
logging.info("Seed URL => %s", search_url)
companies: List[Dict] = []
@@ -425,14 +423,13 @@ def main():
if cli_opts.debug:
opts = detect_debug_defaults(force=True)
else:
env_defaults = detect_debug_defaults()
env_defaults = detect_debug_defaults()
opts = env_defaults if env_defaults else cli_opts
if not getattr(opts, "cmd", None):
opts.cmd = "full"
exit_code = asyncio.run(async_main(opts))
exit_code = asyncio.run(async_main(cli_opts))
sys.exit(exit_code)

View File

@@ -20,7 +20,7 @@ from __future__ import annotations
# Imports & Third-party
# ───────────────────────────────────────────────────────────────────────────────
import argparse, asyncio, json, os, sys, pathlib, random, time, csv
import argparse, asyncio, json, pathlib, random
from datetime import datetime, UTC
from types import SimpleNamespace
from pathlib import Path
@@ -30,7 +30,7 @@ from rich.console import Console
from rich.logging import RichHandler
from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn
import logging
from jinja2 import Environment, FileSystemLoader, select_autoescape
BASE_DIR = pathlib.Path(__file__).resolve().parent
@@ -43,7 +43,7 @@ import numpy as np
import pandas as pd
import hashlib
from openai import OpenAI # same SDK you pre-loaded
from litellm import completion #Support any LLM Provider
# ───────────────────────────────────────────────────────────────────────────────
# Utils
@@ -70,11 +70,11 @@ def dev_defaults() -> SimpleNamespace:
out_dir="./insights_debug",
embed_model="all-MiniLM-L6-v2",
top_k=10,
openai_model="gpt-4.1",
llm_provider="openai/gpt-4.1",
llm_api_key=None,
max_llm_tokens=8000,
llm_temperature=1.0,
workers=4, # parallel processing
stub=False, # manual
workers=4
)
# ───────────────────────────────────────────────────────────────────────────────
@@ -166,7 +166,7 @@ def build_company_graph(companies, embeds:np.ndarray, top_k:int) -> Dict[str,Any
# ───────────────────────────────────────────────────────────────────────────────
# Org-chart via LLM
# ───────────────────────────────────────────────────────────────────────────────
async def infer_org_chart_llm(company, people, client:OpenAI, model_name:str, max_tokens:int, temperature:float, stub:bool):
async def infer_org_chart_llm(company, people, llm_provider:str, api_key:str, max_tokens:int, temperature:float, stub:bool=False, base_url:str=None):
if stub:
# Tiny fake org-chart when debugging offline
chief = random.choice(people)
@@ -202,15 +202,20 @@ Here is a JSON list of employees:
Return JSON: {{ "nodes":[{{id,name,title,dept,yoe_total,yoe_current,seniority_score,decision_score,avatar_url,profile_url}}], "edges":[{{source,target,type,confidence}}] }}
"""}
]
resp = client.chat.completions.create(
model=model_name,
resp = completion(
model=llm_provider,
messages=prompt,
max_tokens=max_tokens,
temperature=temperature,
response_format={"type":"json_object"}
response_format={"type":"json_object"},
api_key=api_key,
base_url=base_url
)
chart = json.loads(resp.choices[0].message.content)
chart["meta"] = dict(model=model_name, generated_at=datetime.now(UTC).isoformat())
chart["meta"] = dict(
model=llm_provider,
generated_at=datetime.now(UTC).isoformat()
)
return chart
# ───────────────────────────────────────────────────────────────────────────────
@@ -270,15 +275,11 @@ async def run(opts):
logging.info(f"[bold cyan]Loaded[/] {len(companies)} companies, {len(people)} people")
logging.info("[bold]⇢[/] Embedding company descriptions…")
# embeds = embed_descriptions(companies, opts.embed_model, opts)
embeds = embed_descriptions(companies, opts.embed_model, opts)
logging.info("[bold]⇢[/] Building similarity graph")
# company_graph = build_company_graph(companies, embeds, opts.top_k)
# dump_json(company_graph, out_dir/"company_graph.json")
# OpenAI client (only built if not debugging)
stub = bool(opts.stub)
client = OpenAI() if not stub else None
company_graph = build_company_graph(companies, embeds, opts.top_k)
dump_json(company_graph, out_dir/"company_graph.json")
# Filter companies that need processing
to_process = []
@@ -311,14 +312,14 @@ async def run(opts):
async def process_one(comp):
handle = comp["handle"].strip("/").replace("/","_")
persons = [p for p in people if p["company_handle"].strip("/") == comp["handle"].strip("/")]
chart = await infer_org_chart_llm(
comp, persons,
client=client if client else OpenAI(api_key="sk-debug"),
model_name=opts.openai_model,
llm_provider=opts.llm_provider,
api_key=opts.llm_api_key or None,
max_tokens=opts.max_llm_tokens,
temperature=opts.llm_temperature,
stub=stub,
stub=opts.stub or False,
base_url=opts.llm_base_url or None
)
chart["meta"]["company"] = comp["name"]
@@ -354,18 +355,22 @@ def build_arg_parser():
p = argparse.ArgumentParser(description="Build graphs & visualisation from Stage-1 output")
p.add_argument("--in", dest="in_dir", required=False, help="Stage-1 output dir", default=".")
p.add_argument("--out", dest="out_dir", required=False, help="Destination dir", default=".")
p.add_argument("--embed_model", default="all-MiniLM-L6-v2")
p.add_argument("--top_k", type=int, default=10, help="Top-k neighbours per company")
p.add_argument("--openai_model", default="gpt-4.1")
p.add_argument("--max_llm_tokens", type=int, default=8024)
p.add_argument("--llm_temperature", type=float, default=1.0)
p.add_argument("--embed-model", default="all-MiniLM-L6-v2")
p.add_argument("--top-k", type=int, default=10, help="Top-k neighbours per company")
p.add_argument("--llm-provider", default="openai/gpt-4.1",
help="LLM model to use in format 'provider/model_name' (e.g., 'anthropic/claude-3')")
p.add_argument("--llm-api-key", help="API key for LLM provider (defaults to env vars)")
p.add_argument("--llm-base-url", help="Base URL for LLM API endpoint")
p.add_argument("--max-llm-tokens", type=int, default=8024)
p.add_argument("--llm-temperature", type=float, default=1.0)
p.add_argument("--stub", action="store_true", help="Skip OpenAI call and generate tiny fake org charts")
p.add_argument("--workers", type=int, default=4, help="Number of parallel workers for LLM inference")
return p
def main():
dbg = dev_defaults()
opts = dbg if True else build_arg_parser().parse_args()
# opts = dbg if True else build_arg_parser().parse_args()
opts = build_arg_parser().parse_args()
asyncio.run(run(opts))
if __name__ == "__main__":

View File

@@ -9,16 +9,18 @@ from crawl4ai import (
)
async def main():
browser_config = BrowserConfig(headless=True, verbose=True)
browser_config = BrowserConfig(
headless=False,
verbose=True,
)
async with AsyncWebCrawler(config=browser_config) as crawler:
crawler_config = CrawlerRunConfig(
markdown_generator=DefaultMarkdownGenerator(
content_filter=PruningContentFilter()
),
)
result : CrawlResult = await crawler.arun(
result: CrawlResult = await crawler.arun(
url="https://www.helloworld.org", config=crawler_config
)
print(result.markdown.raw_markdown[:500])

View File

@@ -0,0 +1,38 @@
import asyncio

from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CrawlerRunConfig,
    DefaultMarkdownGenerator,
    PruningContentFilter,
    CrawlResult,
)


async def main():
    """Demo: keep one browser page alive via ``session_id``, then run JS-only.

    Flow:
      1. First ``arun`` fetches the page and keeps it open (session_id pins it).
      2. Second ``arun`` with ``js_only=True`` re-uses that open page and only
         executes JavaScript in it — no new navigation/fetch happens.
      3. ``get_page`` hands back the raw Playwright page/context for the session.
    """
    browser_config = BrowserConfig(
        headless=False,
        verbose=True,
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        crawler_config = CrawlerRunConfig(
            session_id="hello_world",  # This lets us reuse the same page
        )
        result: CrawlResult = await crawler.arun(
            url="https://www.helloworld.org", config=crawler_config
        )
        # Add a breakpoint here, then you will see the page is open and the
        # browser is not closed (the session keeps it alive).
        print(result.markdown.raw_markdown[:500])

        # js_only=True: no fetch this time — only executes JS in the same
        # already-opened page.
        new_config = crawler_config.clone(
            js_code=["(() => ({'data':'hello'}))()"], js_only=True
        )
        result: CrawlResult = await crawler.arun(
            url="https://www.helloworld.org", config=new_config
        )
        print(result.js_execution_result)  # You should see {'data':'hello'} in the console

        # Get direct access to the Playwright page object. This works only if
        # you use the same session_id and pass the same config.
        page, context = crawler.crawler_strategy.get_page(new_config)


if __name__ == "__main__":
    asyncio.run(main())