Merge remote-tracking branch 'origin/linkedin-prep' into next
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -263,4 +263,5 @@ tests/**/test_site
|
|||||||
tests/**/reports
|
tests/**/reports
|
||||||
tests/**/benchmark_reports
|
tests/**/benchmark_reports
|
||||||
|
|
||||||
|
docs/**/data
|
||||||
.codecat/
|
.codecat/
|
||||||
@@ -16,12 +16,12 @@ prospect‑wizard/
|
|||||||
|
|
||||||
### 1.1 Install dependencies
|
### 1.1 Install dependencies
|
||||||
```bash
|
```bash
|
||||||
pip install crawl4ai openai sentence-transformers networkx pandas vis-network rich
|
pip install crawl4ai litellm sentence-transformers pandas rich
|
||||||
```
|
```
|
||||||
|
|
||||||
### 1.2 Create / warm a LinkedIn browser profile
|
### 1.2 Create / warm a LinkedIn browser profile
|
||||||
```bash
|
```bash
|
||||||
crwl profiler
|
crwl profiles
|
||||||
```
|
```
|
||||||
1. The interactive shell shows **New profile** – hit **enter**.
|
1. The interactive shell shows **New profile** – hit **enter**.
|
||||||
2. Choose a name, e.g. `profile_linkedin_uc`.
|
2. Choose a name, e.g. `profile_linkedin_uc`.
|
||||||
@@ -37,13 +37,13 @@ crwl profiler
|
|||||||
python c4ai_discover.py full \
|
python c4ai_discover.py full \
|
||||||
--query "health insurance management" \
|
--query "health insurance management" \
|
||||||
--geo 102713980 \ # Malaysia geoUrn
|
--geo 102713980 \ # Malaysia geoUrn
|
||||||
--title_filters "" \ # or "Product,Engineering"
|
--title-filters "" \ # or "Product,Engineering"
|
||||||
--max_companies 10 \ # default set small for workshops
|
--max-companies 10 \ # default set small for workshops
|
||||||
--max_people 20 \ # \^ same
|
--max-people 20 \ # \^ same
|
||||||
--profile-name profile_linkedin_uc \
|
--profile-name profile_linkedin_uc \
|
||||||
--outdir ./data \
|
--outdir ./data \
|
||||||
--concurrency 2 \
|
--concurrency 2 \
|
||||||
--log_level debug
|
--log-level debug
|
||||||
```
|
```
|
||||||
**Outputs** in `./data/`:
|
**Outputs** in `./data/`:
|
||||||
* `companies.jsonl` – one JSON per company
|
* `companies.jsonl` – one JSON per company
|
||||||
@@ -67,13 +67,14 @@ _See more: <https://www.linkedin.com/search/results/companies/?geoUrn=XXX> – t
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
python c4ai_insights.py \
|
python c4ai_insights.py \
|
||||||
--in ./data \
|
--in ./data \
|
||||||
--out ./data \
|
--out ./data \
|
||||||
--embed_model all-MiniLM-L6-v2 \
|
--embed-model all-MiniLM-L6-v2 \
|
||||||
--top_k 10 \
|
--llm-provider gemini/gemini-2.0-flash \
|
||||||
--openai_model gpt-4.1 \
|
--llm-api-key "" \
|
||||||
--max_llm_tokens 8024 \
|
--top-k 10 \
|
||||||
--llm_temperature 1.0 \
|
--max-llm-tokens 8024 \
|
||||||
|
--llm-temperature 1.0 \
|
||||||
--workers 4
|
--workers 4
|
||||||
```
|
```
|
||||||
Emits next to the Stage‑1 files:
|
Emits next to the Stage‑1 files:
|
||||||
@@ -121,6 +122,6 @@ The page fetches `data/company_graph.json` and the `org_chart_*.json` files auto
|
|||||||
---
|
---
|
||||||
|
|
||||||
### TL;DR
|
### TL;DR
|
||||||
`crwl profiler` → `c4ai_discover.py` → `c4ai_insights.py` → open `graph_view_template.html`.
|
`crwl profiles` → `c4ai_discover.py` → `c4ai_insights.py` → open `graph_view_template.html`.
|
||||||
Live long and `import crawl4ai`.
|
Live long and `import crawl4ai`.
|
||||||
|
|
||||||
|
|||||||
@@ -43,7 +43,6 @@ from rich.console import Console
|
|||||||
from rich.logging import RichHandler
|
from rich.logging import RichHandler
|
||||||
|
|
||||||
from datetime import datetime, UTC
|
from datetime import datetime, UTC
|
||||||
from itertools import cycle
|
|
||||||
from textwrap import dedent
|
from textwrap import dedent
|
||||||
from types import SimpleNamespace
|
from types import SimpleNamespace
|
||||||
from typing import Dict, List, Optional
|
from typing import Dict, List, Optional
|
||||||
@@ -272,7 +271,7 @@ def build_arg_parser() -> argparse.ArgumentParser:
|
|||||||
parser.add_argument("--title-filters", default="Product,Engineering", help="comma list of job keywords")
|
parser.add_argument("--title-filters", default="Product,Engineering", help="comma list of job keywords")
|
||||||
parser.add_argument("--max-companies", type=int, default=1000)
|
parser.add_argument("--max-companies", type=int, default=1000)
|
||||||
parser.add_argument("--max-people", type=int, default=500)
|
parser.add_argument("--max-people", type=int, default=500)
|
||||||
parser.add_argument("--profile-path", default=str(pathlib.Path.home() / ".crawl4ai/profiles/profile_linkedin_uc"))
|
parser.add_argument("--profile-name", default=str(pathlib.Path.home() / ".crawl4ai/profiles/profile_linkedin_uc"))
|
||||||
parser.add_argument("--outdir", default="./output")
|
parser.add_argument("--outdir", default="./output")
|
||||||
parser.add_argument("--concurrency", type=int, default=4)
|
parser.add_argument("--concurrency", type=int, default=4)
|
||||||
parser.add_argument("--log-level", default="info", choices=["debug", "info", "warn", "error"])
|
parser.add_argument("--log-level", default="info", choices=["debug", "info", "warn", "error"])
|
||||||
@@ -355,8 +354,7 @@ async def async_main(opts):
|
|||||||
user_agent_generator_config= {
|
user_agent_generator_config= {
|
||||||
"platforms": "mobile",
|
"platforms": "mobile",
|
||||||
"os": "Android"
|
"os": "Android"
|
||||||
},
|
}
|
||||||
verbose=False,
|
|
||||||
)
|
)
|
||||||
crawler = AsyncWebCrawler(config=bc)
|
crawler = AsyncWebCrawler(config=bc)
|
||||||
|
|
||||||
@@ -366,7 +364,7 @@ async def async_main(opts):
|
|||||||
# crawler = await next_crawler().start()
|
# crawler = await next_crawler().start()
|
||||||
try:
|
try:
|
||||||
# Build LinkedIn search URL
|
# Build LinkedIn search URL
|
||||||
search_url = f"https://www.linkedin.com/search/results/companies/?keywords={quote(opts.query)}&geoUrn={opts.geo}"
|
search_url = f'https://www.linkedin.com/search/results/companies/?keywords={quote(opts.query)}&companyHqGeo="{opts.geo}"'
|
||||||
logging.info("Seed URL => %s", search_url)
|
logging.info("Seed URL => %s", search_url)
|
||||||
|
|
||||||
companies: List[Dict] = []
|
companies: List[Dict] = []
|
||||||
@@ -425,14 +423,13 @@ def main():
|
|||||||
if cli_opts.debug:
|
if cli_opts.debug:
|
||||||
opts = detect_debug_defaults(force=True)
|
opts = detect_debug_defaults(force=True)
|
||||||
else:
|
else:
|
||||||
env_defaults = detect_debug_defaults()
|
|
||||||
env_defaults = detect_debug_defaults()
|
env_defaults = detect_debug_defaults()
|
||||||
opts = env_defaults if env_defaults else cli_opts
|
opts = env_defaults if env_defaults else cli_opts
|
||||||
|
|
||||||
if not getattr(opts, "cmd", None):
|
if not getattr(opts, "cmd", None):
|
||||||
opts.cmd = "full"
|
opts.cmd = "full"
|
||||||
|
|
||||||
exit_code = asyncio.run(async_main(opts))
|
exit_code = asyncio.run(async_main(cli_opts))
|
||||||
sys.exit(exit_code)
|
sys.exit(exit_code)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ from __future__ import annotations
|
|||||||
# Imports & Third-party
|
# Imports & Third-party
|
||||||
# ───────────────────────────────────────────────────────────────────────────────
|
# ───────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
import argparse, asyncio, json, os, sys, pathlib, random, time, csv
|
import argparse, asyncio, json, pathlib, random
|
||||||
from datetime import datetime, UTC
|
from datetime import datetime, UTC
|
||||||
from types import SimpleNamespace
|
from types import SimpleNamespace
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -30,7 +30,7 @@ from rich.console import Console
|
|||||||
from rich.logging import RichHandler
|
from rich.logging import RichHandler
|
||||||
from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn
|
from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn
|
||||||
import logging
|
import logging
|
||||||
from jinja2 import Environment, FileSystemLoader, select_autoescape
|
|
||||||
|
|
||||||
BASE_DIR = pathlib.Path(__file__).resolve().parent
|
BASE_DIR = pathlib.Path(__file__).resolve().parent
|
||||||
|
|
||||||
@@ -43,7 +43,7 @@ import numpy as np
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
import hashlib
|
import hashlib
|
||||||
|
|
||||||
from openai import OpenAI # same SDK you pre-loaded
|
from litellm import completion #Support any LLM Provider
|
||||||
|
|
||||||
# ───────────────────────────────────────────────────────────────────────────────
|
# ───────────────────────────────────────────────────────────────────────────────
|
||||||
# Utils
|
# Utils
|
||||||
@@ -70,11 +70,11 @@ def dev_defaults() -> SimpleNamespace:
|
|||||||
out_dir="./insights_debug",
|
out_dir="./insights_debug",
|
||||||
embed_model="all-MiniLM-L6-v2",
|
embed_model="all-MiniLM-L6-v2",
|
||||||
top_k=10,
|
top_k=10,
|
||||||
openai_model="gpt-4.1",
|
llm_provider="openai/gpt-4.1",
|
||||||
|
llm_api_key=None,
|
||||||
max_llm_tokens=8000,
|
max_llm_tokens=8000,
|
||||||
llm_temperature=1.0,
|
llm_temperature=1.0,
|
||||||
workers=4, # parallel processing
|
workers=4
|
||||||
stub=False, # manual
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# ───────────────────────────────────────────────────────────────────────────────
|
# ───────────────────────────────────────────────────────────────────────────────
|
||||||
@@ -166,7 +166,7 @@ def build_company_graph(companies, embeds:np.ndarray, top_k:int) -> Dict[str,Any
|
|||||||
# ───────────────────────────────────────────────────────────────────────────────
|
# ───────────────────────────────────────────────────────────────────────────────
|
||||||
# Org-chart via LLM
|
# Org-chart via LLM
|
||||||
# ───────────────────────────────────────────────────────────────────────────────
|
# ───────────────────────────────────────────────────────────────────────────────
|
||||||
async def infer_org_chart_llm(company, people, client:OpenAI, model_name:str, max_tokens:int, temperature:float, stub:bool):
|
async def infer_org_chart_llm(company, people, llm_provider:str, api_key:str, max_tokens:int, temperature:float, stub:bool=False, base_url:str=None):
|
||||||
if stub:
|
if stub:
|
||||||
# Tiny fake org-chart when debugging offline
|
# Tiny fake org-chart when debugging offline
|
||||||
chief = random.choice(people)
|
chief = random.choice(people)
|
||||||
@@ -202,15 +202,20 @@ Here is a JSON list of employees:
|
|||||||
Return JSON: {{ "nodes":[{{id,name,title,dept,yoe_total,yoe_current,seniority_score,decision_score,avatar_url,profile_url}}], "edges":[{{source,target,type,confidence}}] }}
|
Return JSON: {{ "nodes":[{{id,name,title,dept,yoe_total,yoe_current,seniority_score,decision_score,avatar_url,profile_url}}], "edges":[{{source,target,type,confidence}}] }}
|
||||||
"""}
|
"""}
|
||||||
]
|
]
|
||||||
resp = client.chat.completions.create(
|
resp = completion(
|
||||||
model=model_name,
|
model=llm_provider,
|
||||||
messages=prompt,
|
messages=prompt,
|
||||||
max_tokens=max_tokens,
|
max_tokens=max_tokens,
|
||||||
temperature=temperature,
|
temperature=temperature,
|
||||||
response_format={"type":"json_object"}
|
response_format={"type":"json_object"},
|
||||||
|
api_key=api_key,
|
||||||
|
base_url=base_url
|
||||||
)
|
)
|
||||||
chart = json.loads(resp.choices[0].message.content)
|
chart = json.loads(resp.choices[0].message.content)
|
||||||
chart["meta"] = dict(model=model_name, generated_at=datetime.now(UTC).isoformat())
|
chart["meta"] = dict(
|
||||||
|
model=llm_provider,
|
||||||
|
generated_at=datetime.now(UTC).isoformat()
|
||||||
|
)
|
||||||
return chart
|
return chart
|
||||||
|
|
||||||
# ───────────────────────────────────────────────────────────────────────────────
|
# ───────────────────────────────────────────────────────────────────────────────
|
||||||
@@ -270,15 +275,11 @@ async def run(opts):
|
|||||||
logging.info(f"[bold cyan]Loaded[/] {len(companies)} companies, {len(people)} people")
|
logging.info(f"[bold cyan]Loaded[/] {len(companies)} companies, {len(people)} people")
|
||||||
|
|
||||||
logging.info("[bold]⇢[/] Embedding company descriptions…")
|
logging.info("[bold]⇢[/] Embedding company descriptions…")
|
||||||
# embeds = embed_descriptions(companies, opts.embed_model, opts)
|
embeds = embed_descriptions(companies, opts.embed_model, opts)
|
||||||
|
|
||||||
logging.info("[bold]⇢[/] Building similarity graph")
|
logging.info("[bold]⇢[/] Building similarity graph")
|
||||||
# company_graph = build_company_graph(companies, embeds, opts.top_k)
|
company_graph = build_company_graph(companies, embeds, opts.top_k)
|
||||||
# dump_json(company_graph, out_dir/"company_graph.json")
|
dump_json(company_graph, out_dir/"company_graph.json")
|
||||||
|
|
||||||
# OpenAI client (only built if not debugging)
|
|
||||||
stub = bool(opts.stub)
|
|
||||||
client = OpenAI() if not stub else None
|
|
||||||
|
|
||||||
# Filter companies that need processing
|
# Filter companies that need processing
|
||||||
to_process = []
|
to_process = []
|
||||||
@@ -311,14 +312,14 @@ async def run(opts):
|
|||||||
async def process_one(comp):
|
async def process_one(comp):
|
||||||
handle = comp["handle"].strip("/").replace("/","_")
|
handle = comp["handle"].strip("/").replace("/","_")
|
||||||
persons = [p for p in people if p["company_handle"].strip("/") == comp["handle"].strip("/")]
|
persons = [p for p in people if p["company_handle"].strip("/") == comp["handle"].strip("/")]
|
||||||
|
|
||||||
chart = await infer_org_chart_llm(
|
chart = await infer_org_chart_llm(
|
||||||
comp, persons,
|
comp, persons,
|
||||||
client=client if client else OpenAI(api_key="sk-debug"),
|
llm_provider=opts.llm_provider,
|
||||||
model_name=opts.openai_model,
|
api_key=opts.llm_api_key or None,
|
||||||
max_tokens=opts.max_llm_tokens,
|
max_tokens=opts.max_llm_tokens,
|
||||||
temperature=opts.llm_temperature,
|
temperature=opts.llm_temperature,
|
||||||
stub=stub,
|
stub=opts.stub or False,
|
||||||
|
base_url=opts.llm_base_url or None
|
||||||
)
|
)
|
||||||
chart["meta"]["company"] = comp["name"]
|
chart["meta"]["company"] = comp["name"]
|
||||||
|
|
||||||
@@ -354,18 +355,22 @@ def build_arg_parser():
|
|||||||
p = argparse.ArgumentParser(description="Build graphs & visualisation from Stage-1 output")
|
p = argparse.ArgumentParser(description="Build graphs & visualisation from Stage-1 output")
|
||||||
p.add_argument("--in", dest="in_dir", required=False, help="Stage-1 output dir", default=".")
|
p.add_argument("--in", dest="in_dir", required=False, help="Stage-1 output dir", default=".")
|
||||||
p.add_argument("--out", dest="out_dir", required=False, help="Destination dir", default=".")
|
p.add_argument("--out", dest="out_dir", required=False, help="Destination dir", default=".")
|
||||||
p.add_argument("--embed_model", default="all-MiniLM-L6-v2")
|
p.add_argument("--embed-model", default="all-MiniLM-L6-v2")
|
||||||
p.add_argument("--top_k", type=int, default=10, help="Top-k neighbours per company")
|
p.add_argument("--top-k", type=int, default=10, help="Top-k neighbours per company")
|
||||||
p.add_argument("--openai_model", default="gpt-4.1")
|
p.add_argument("--llm-provider", default="openai/gpt-4.1",
|
||||||
p.add_argument("--max_llm_tokens", type=int, default=8024)
|
help="LLM model to use in format 'provider/model_name' (e.g., 'anthropic/claude-3')")
|
||||||
p.add_argument("--llm_temperature", type=float, default=1.0)
|
p.add_argument("--llm-api-key", help="API key for LLM provider (defaults to env vars)")
|
||||||
|
p.add_argument("--llm-base-url", help="Base URL for LLM API endpoint")
|
||||||
|
p.add_argument("--max-llm-tokens", type=int, default=8024)
|
||||||
|
p.add_argument("--llm-temperature", type=float, default=1.0)
|
||||||
p.add_argument("--stub", action="store_true", help="Skip OpenAI call and generate tiny fake org charts")
|
p.add_argument("--stub", action="store_true", help="Skip OpenAI call and generate tiny fake org charts")
|
||||||
p.add_argument("--workers", type=int, default=4, help="Number of parallel workers for LLM inference")
|
p.add_argument("--workers", type=int, default=4, help="Number of parallel workers for LLM inference")
|
||||||
return p
|
return p
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
dbg = dev_defaults()
|
dbg = dev_defaults()
|
||||||
opts = dbg if True else build_arg_parser().parse_args()
|
# opts = dbg if True else build_arg_parser().parse_args()
|
||||||
|
opts = build_arg_parser().parse_args()
|
||||||
asyncio.run(run(opts))
|
asyncio.run(run(opts))
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
Reference in New Issue
Block a user