Merge remote-tracking branch 'origin/linkedin-prep' into next

Author: UncleCode
Date:   2025-05-08 17:13:59 +08:00

4 changed files with 60 additions and 56 deletions

.gitignore

@@ -263,4 +263,5 @@ tests/**/test_site
 tests/**/reports
 tests/**/benchmark_reports
+docs/**/data
 .codecat/

Workshop README

@@ -16,12 +16,12 @@ prospectwizard/
 ### 1.1  Install dependencies
 ```bash
-pip install crawl4ai openai sentence-transformers networkx pandas vis-network rich
+pip install crawl4ai litellm sentence-transformers pandas rich
 ```
 ### 1.2  Create / warm a LinkedIn browser profile
 ```bash
-crwl profiler
+crwl profiles
 ```
 1. The interactive shell shows **New profile** → hit **enter**.
 2. Choose a name, e.g. `profile_linkedin_uc`.
@@ -37,13 +37,13 @@ crwl profiler
 python c4ai_discover.py full \
   --query "health insurance management" \
   --geo 102713980 \              # Malaysia geoUrn
-  --title_filters "" \           # or "Product,Engineering"
-  --max_companies 10 \           # default set small for workshops
-  --max_people 20 \              # ^ same
+  --title-filters "" \           # or "Product,Engineering"
+  --max-companies 10 \           # default set small for workshops
+  --max-people 20 \              # ^ same
   --profile-name profile_linkedin_uc \
   --outdir ./data \
   --concurrency 2 \
-  --log_level debug
+  --log-level debug
 ```
 **Outputs** in `./data/`:
 * `companies.jsonl` - one JSON per company
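The Stage-1 outputs are JSON Lines files (one JSON object per line), so downstream steps can stream them line by line. A minimal loading sketch, assuming only that layout (nothing else about the schema):

```python
# Minimal sketch: load a Stage-1 JSONL output produced by c4ai_discover.py.
# Only the one-object-per-line layout is assumed.
import json
from pathlib import Path

def read_jsonl(path: Path) -> list[dict]:
    with path.open(encoding="utf-8") as fh:
        return [json.loads(line) for line in fh if line.strip()]

companies = read_jsonl(Path("./data/companies.jsonl"))
print(f"{len(companies)} companies loaded")
```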
@@ -67,13 +67,14 @@ _See more: <https://www.linkedin.com/search/results/companies/?geoUrn=XXX> t
 ```bash
 python c4ai_insights.py \
   --in ./data \
   --out ./data \
-  --embed_model all-MiniLM-L6-v2 \
-  --top_k 10 \
-  --openai_model gpt-4.1 \
-  --max_llm_tokens 8024 \
-  --llm_temperature 1.0 \
+  --embed-model all-MiniLM-L6-v2 \
+  --llm-provider gemini/gemini-2.0-flash \
+  --llm-api-key "" \
+  --top-k 10 \
+  --max-llm-tokens 8024 \
+  --llm-temperature 1.0 \
   --workers 4
 ```
 Emits next to the Stage-1 files:
@@ -121,6 +122,6 @@ The page fetches `data/company_graph.json` and the `org_chart_*.json` files auto
 ---
 ### TL;DR
-`crwl profiler` → `c4ai_discover.py` → `c4ai_insights.py` → open `graph_view_template.html`.
+`crwl profiles` → `c4ai_discover.py` → `c4ai_insights.py` → open `graph_view_template.html`.
 Live long and `import crawl4ai`.

c4ai_discover.py

@@ -43,7 +43,6 @@ from rich.console import Console
 from rich.logging import RichHandler
 from datetime import datetime, UTC
-from itertools import cycle
 from textwrap import dedent
 from types import SimpleNamespace
 from typing import Dict, List, Optional
@@ -272,7 +271,7 @@ def build_arg_parser() -> argparse.ArgumentParser:
parser.add_argument("--title-filters", default="Product,Engineering", help="comma list of job keywords") parser.add_argument("--title-filters", default="Product,Engineering", help="comma list of job keywords")
parser.add_argument("--max-companies", type=int, default=1000) parser.add_argument("--max-companies", type=int, default=1000)
parser.add_argument("--max-people", type=int, default=500) parser.add_argument("--max-people", type=int, default=500)
parser.add_argument("--profile-path", default=str(pathlib.Path.home() / ".crawl4ai/profiles/profile_linkedin_uc")) parser.add_argument("--profile-name", default=str(pathlib.Path.home() / ".crawl4ai/profiles/profile_linkedin_uc"))
parser.add_argument("--outdir", default="./output") parser.add_argument("--outdir", default="./output")
parser.add_argument("--concurrency", type=int, default=4) parser.add_argument("--concurrency", type=int, default=4)
parser.add_argument("--log-level", default="info", choices=["debug", "info", "warn", "error"]) parser.add_argument("--log-level", default="info", choices=["debug", "info", "warn", "error"])
@@ -355,8 +354,7 @@ async def async_main(opts):
         user_agent_generator_config= {
             "platforms": "mobile",
             "os": "Android"
-        },
-        verbose=False,
+        }
     )
     crawler = AsyncWebCrawler(config=bc)
@@ -366,7 +364,7 @@ async def async_main(opts):
     # crawler = await next_crawler().start()
     try:
         # Build LinkedIn search URL
-        search_url = f"https://www.linkedin.com/search/results/companies/?keywords={quote(opts.query)}&geoUrn={opts.geo}"
+        search_url = f'https://www.linkedin.com/search/results/companies/?keywords={quote(opts.query)}&companyHqGeo="{opts.geo}"'
         logging.info("Seed URL => %s", search_url)

         companies: List[Dict] = []
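With the workshop defaults from the README (`--query "health insurance management"`, `--geo 102713980`), the new `companyHqGeo`-style seed URL comes out roughly as below; an illustration, not repo code:

```python
# Illustration of the new seed-URL shape using the workshop's example values.
from urllib.parse import quote

query, geo = "health insurance management", "102713980"
search_url = (
    "https://www.linkedin.com/search/results/companies/"
    f'?keywords={quote(query)}&companyHqGeo="{geo}"'
)
print(search_url)
# https://www.linkedin.com/search/results/companies/?keywords=health%20insurance%20management&companyHqGeo="102713980"
```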
@@ -425,14 +423,13 @@ def main():
     if cli_opts.debug:
         opts = detect_debug_defaults(force=True)
     else:
-        env_defaults = detect_debug_defaults()
         env_defaults = detect_debug_defaults()
         opts = env_defaults if env_defaults else cli_opts
     if not getattr(opts, "cmd", None):
         opts.cmd = "full"
-    exit_code = asyncio.run(async_main(opts))
+    exit_code = asyncio.run(async_main(cli_opts))
     sys.exit(exit_code)

c4ai_insights.py

@@ -20,7 +20,7 @@ from __future__ import annotations
 # Imports & Third-party
 # ───────────────────────────────────────────────────────────────────────────────
-import argparse, asyncio, json, os, sys, pathlib, random, time, csv
+import argparse, asyncio, json, pathlib, random
 from datetime import datetime, UTC
 from types import SimpleNamespace
 from pathlib import Path
@@ -30,7 +30,7 @@ from rich.console import Console
 from rich.logging import RichHandler
 from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn
 import logging
-from jinja2 import Environment, FileSystemLoader, select_autoescape

 BASE_DIR = pathlib.Path(__file__).resolve().parent
@@ -43,7 +43,7 @@ import numpy as np
 import pandas as pd
 import hashlib
-from openai import OpenAI          # same SDK you pre-loaded
+from litellm import completion     # Support any LLM Provider

 # ───────────────────────────────────────────────────────────────────────────────
 # Utils
@@ -70,11 +70,11 @@ def dev_defaults() -> SimpleNamespace:
         out_dir="./insights_debug",
         embed_model="all-MiniLM-L6-v2",
         top_k=10,
-        openai_model="gpt-4.1",
+        llm_provider="openai/gpt-4.1",
+        llm_api_key=None,
         max_llm_tokens=8000,
         llm_temperature=1.0,
-        workers=4,    # parallel processing
-        stub=False,   # manual
+        workers=4
     )

 # ───────────────────────────────────────────────────────────────────────────────
@@ -166,7 +166,7 @@ def build_company_graph(companies, embeds:np.ndarray, top_k:int) -> Dict[str,Any
 # ───────────────────────────────────────────────────────────────────────────────
 # Org-chart via LLM
 # ───────────────────────────────────────────────────────────────────────────────
-async def infer_org_chart_llm(company, people, client:OpenAI, model_name:str, max_tokens:int, temperature:float, stub:bool):
+async def infer_org_chart_llm(company, people, llm_provider:str, api_key:str, max_tokens:int, temperature:float, stub:bool=False, base_url:str=None):
     if stub:
         # Tiny fake org-chart when debugging offline
         chief = random.choice(people)
@@ -202,15 +202,20 @@ Here is a JSON list of employees:
 Return JSON: {{ "nodes":[{{id,name,title,dept,yoe_total,yoe_current,seniority_score,decision_score,avatar_url,profile_url}}], "edges":[{{source,target,type,confidence}}] }}
 """}
     ]
-    resp = client.chat.completions.create(
-        model=model_name,
+    resp = completion(
+        model=llm_provider,
         messages=prompt,
         max_tokens=max_tokens,
         temperature=temperature,
-        response_format={"type":"json_object"}
+        response_format={"type":"json_object"},
+        api_key=api_key,
+        base_url=base_url
     )
     chart = json.loads(resp.choices[0].message.content)
-    chart["meta"] = dict(model=model_name, generated_at=datetime.now(UTC).isoformat())
+    chart["meta"] = dict(
+        model=llm_provider,
+        generated_at=datetime.now(UTC).isoformat()
+    )
     return chart

 # ───────────────────────────────────────────────────────────────────────────────
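The `litellm.completion()` call keeps the OpenAI-style response shape (`resp.choices[0].message.content`) while routing on a `provider/model` string; when `api_key` is `None`, litellm falls back to the provider's environment variable. A minimal sketch of the call pattern with an illustrative model and prompt, not the repo's prompt:

```python
# Minimal sketch of the call pattern adopted above; model string and prompt
# are illustrative. completion() is litellm's synchronous entry point.
from litellm import completion

resp = completion(
    model="gemini/gemini-2.0-flash",           # "provider/model" routing
    messages=[{"role": "user", "content": 'Reply with {"ok": true} as JSON.'}],
    response_format={"type": "json_object"},
    max_tokens=64,
    temperature=0.0,
    api_key=None,                              # None -> falls back to env vars, e.g. GEMINI_API_KEY
)
print(resp.choices[0].message.content)
```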
@@ -270,15 +275,11 @@ async def run(opts):
logging.info(f"[bold cyan]Loaded[/] {len(companies)} companies, {len(people)} people") logging.info(f"[bold cyan]Loaded[/] {len(companies)} companies, {len(people)} people")
logging.info("[bold]⇢[/] Embedding company descriptions…") logging.info("[bold]⇢[/] Embedding company descriptions…")
# embeds = embed_descriptions(companies, opts.embed_model, opts) embeds = embed_descriptions(companies, opts.embed_model, opts)
logging.info("[bold]⇢[/] Building similarity graph") logging.info("[bold]⇢[/] Building similarity graph")
# company_graph = build_company_graph(companies, embeds, opts.top_k) company_graph = build_company_graph(companies, embeds, opts.top_k)
# dump_json(company_graph, out_dir/"company_graph.json") dump_json(company_graph, out_dir/"company_graph.json")
# OpenAI client (only built if not debugging)
stub = bool(opts.stub)
client = OpenAI() if not stub else None
# Filter companies that need processing # Filter companies that need processing
to_process = [] to_process = []
@@ -311,14 +312,14 @@ async def run(opts):
     async def process_one(comp):
         handle = comp["handle"].strip("/").replace("/","_")
         persons = [p for p in people if p["company_handle"].strip("/") == comp["handle"].strip("/")]
         chart = await infer_org_chart_llm(
             comp, persons,
-            client=client if client else OpenAI(api_key="sk-debug"),
-            model_name=opts.openai_model,
+            llm_provider=opts.llm_provider,
+            api_key=opts.llm_api_key or None,
             max_tokens=opts.max_llm_tokens,
             temperature=opts.llm_temperature,
-            stub=stub,
+            stub=opts.stub or False,
+            base_url=opts.llm_base_url or None
         )
         chart["meta"]["company"] = comp["name"]
@@ -354,18 +355,22 @@ def build_arg_parser():
     p = argparse.ArgumentParser(description="Build graphs & visualisation from Stage-1 output")
     p.add_argument("--in", dest="in_dir", required=False, help="Stage-1 output dir", default=".")
     p.add_argument("--out", dest="out_dir", required=False, help="Destination dir", default=".")
-    p.add_argument("--embed_model", default="all-MiniLM-L6-v2")
-    p.add_argument("--top_k", type=int, default=10, help="Top-k neighbours per company")
-    p.add_argument("--openai_model", default="gpt-4.1")
-    p.add_argument("--max_llm_tokens", type=int, default=8024)
-    p.add_argument("--llm_temperature", type=float, default=1.0)
+    p.add_argument("--embed-model", default="all-MiniLM-L6-v2")
+    p.add_argument("--top-k", type=int, default=10, help="Top-k neighbours per company")
+    p.add_argument("--llm-provider", default="openai/gpt-4.1",
+                   help="LLM model to use in format 'provider/model_name' (e.g., 'anthropic/claude-3')")
+    p.add_argument("--llm-api-key", help="API key for LLM provider (defaults to env vars)")
+    p.add_argument("--llm-base-url", help="Base URL for LLM API endpoint")
+    p.add_argument("--max-llm-tokens", type=int, default=8024)
+    p.add_argument("--llm-temperature", type=float, default=1.0)
     p.add_argument("--stub", action="store_true", help="Skip OpenAI call and generate tiny fake org charts")
     p.add_argument("--workers", type=int, default=4, help="Number of parallel workers for LLM inference")
     return p

 def main():
     dbg = dev_defaults()
-    opts = dbg if True else build_arg_parser().parse_args()
+    # opts = dbg if True else build_arg_parser().parse_args()
+    opts = build_arg_parser().parse_args()
     asyncio.run(run(opts))

 if __name__ == "__main__":