Files
crawl4ai/docs/apps/linkdin/c4ai_discover.py
UncleCode 8a5e23d374 feat(crawler): add separate timeout for wait_for condition
Adds a new wait_for_timeout parameter to CrawlerRunConfig that allows specifying
a separate timeout for the wait_for condition, independent of the page_timeout.
This provides more granular control over waiting behaviors in the crawler.

Also removes unused colorama dependency and updates LinkedIn crawler example.

BREAKING CHANGE: LinkedIn crawler example now uses different wait_for_images timing
2025-05-16 17:00:45 +08:00

440 lines
17 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
c4ai-discover — Stage-1 Discovery CLI
Scrapes LinkedIn company search + their people pages and dumps two newline-delimited
JSON files: companies.jsonl and people.jsonl.
Key design rules
----------------
* No BeautifulSoup — Crawl4AI only for network + HTML fetch.
* JsonCssExtractionStrategy for structured scraping; schema auto-generated once
from sample HTML provided by user and then cached under ./schemas/.
* Defaults are embedded so the file runs inside VS Code debugger without CLI args.
* If executed as a console script (argv > 1), CLI flags win.
* Lightweight deps: argparse + Crawl4AI stack.
Author: Tom @ Kidocode 20250426
"""
from __future__ import annotations
import warnings, re
# Suppress the FutureWarning raised from the soupsieve module about the
# deprecated ':contains' pseudo-class (matched by message, category and module).
warnings.filterwarnings(
"ignore",
message=r"The pseudo class ':contains' is deprecated, ':-soup-contains' should be used.*",
category=FutureWarning,
module=r"soupsieve"
)
# ───────────────────────────────────────────────────────────────────────────────
# Imports
# ───────────────────────────────────────────────────────────────────────────────
import argparse
import random
import asyncio
import json
import logging
import os
import pathlib
import sys
# 3rd-party rich for pretty logging
from rich.console import Console
from rich.logging import RichHandler
from datetime import datetime, UTC
from textwrap import dedent
from types import SimpleNamespace
from typing import Dict, List, Optional
from urllib.parse import quote
from pathlib import Path
from glob import glob
from crawl4ai import (
AsyncWebCrawler,
BrowserConfig,
CacheMode,
CrawlerRunConfig,
JsonCssExtractionStrategy,
BrowserProfiler,
LLMConfig,
)
# ───────────────────────────────────────────────────────────────────────────────
# Constants / paths
# ───────────────────────────────────────────────────────────────────────────────
# Directory containing this script; schemas, snippets and output are resolved
# relative to it.
BASE_DIR = pathlib.Path(__file__).resolve().parent

# Generated extraction schemas are cached here (created on import if missing).
SCHEMA_DIR = BASE_DIR / "schemas"
SCHEMA_DIR.mkdir(parents=True, exist_ok=True)
COMPANY_SCHEMA_PATH = SCHEMA_DIR / "company_card.json"
PEOPLE_SCHEMA_PATH = SCHEMA_DIR / "people_card.json"

# ---------- deterministic target JSON examples ----------
# Example of one extracted company record; serialized and handed to the schema
# generator as the target JSON shape.
_COMPANY_SCHEMA_EXAMPLE = {
    "handle": "/company/posify/",
    "profile_image": "https://media.licdn.com/dms/image/v2/.../logo.jpg",
    "name": "Management Research Services, Inc. (MRS, Inc)",
    "descriptor": "Insurance • Milwaukee, Wisconsin",
    "about": "Insurance • Milwaukee, Wisconsin",
    "followers": 1000,
}

# Example of one extracted person record, used the same way for people cards.
_PEOPLE_SCHEMA_EXAMPLE = {
    "profile_url": "https://www.linkedin.com/in/lily-ng/",
    "name": "Lily Ng",
    "headline": "VP Product @ Posify",
    "followers": 890,
    "connection_degree": "2nd",
    "avatar_url": "https://media.licdn.com/dms/image/v2/.../lily.jpg",
}

# Provided sample HTML snippets (trimmed) — used exactly once to cold-generate schema.
# Read explicitly as UTF-8 so non-ASCII markup is decoded the same way on every
# platform instead of depending on the locale's default encoding.
_SAMPLE_COMPANY_HTML = (BASE_DIR / "snippets" / "company.html").read_text(encoding="utf-8")
_SAMPLE_PEOPLE_HTML = (BASE_DIR / "snippets" / "people.html").read_text(encoding="utf-8")
# --------- tighter schema prompts ----------
# Natural-language instructions for generating the company-card schema.
# The text is runtime data (it is passed as the `query` when the schema is
# built), so it must not be reworded casually.
_COMPANY_SCHEMA_QUERY = dedent(
"""
Using the supplied <li> company-card HTML, build a JsonCssExtractionStrategy schema that,
for every card, outputs *exactly* the keys shown in the example JSON below.
JSON spec:
• handle href of the outermost <a> that wraps the logo/title, e.g. "/company/posify/"
• profile_image absolute URL of the <img> inside that link
• name text of the <a> inside the <span class*='t-16'>
• descriptor text line with industry • location
• about text of the <div class*='t-normal'> below the name (industry + geo)
• followers integer parsed from the <div> containing 'followers'
IMPORTANT: Do not use the base64 kind of classes to target element. It's not reliable.
The main div parent contains these li element is "div.search-results-container" you can use this.
The <ul> parent has "role" equal to "list". Using these two should be enough to target the <li> elements."
"""
)
# Natural-language instructions for generating the people-card schema; same
# role as the company prompt above, with the field list for person cards.
_PEOPLE_SCHEMA_QUERY = dedent(
"""
Using the supplied <li> people-card HTML, build a JsonCssExtractionStrategy schema that
outputs exactly the keys in the example JSON below.
Fields:
• profile_url href of the outermost profile link
• name text inside artdeco-entity-lockup__title
• headline inner text of artdeco-entity-lockup__subtitle
• followers integer parsed from the span inside lt-line-clamp--multi-line
• connection_degree '1st', '2nd', etc. from artdeco-entity-lockup__badge
• avatar_url src of the <img> within artdeco-entity-lockup__image
IMPORTANT: Do not use the base64 kind of classes to target element. It's not reliable.
The main div parent contains these li element is a "div" has these classes "artdeco-card org-people-profile-card__card-spacing org-people__card-margin-bottom".
"""
)
# ---------------------------------------------------------------------------
# Utility helpers
# ---------------------------------------------------------------------------
def _load_or_build_schema(
    path: pathlib.Path,
    sample_html: str,
    query: str,
    example_json: Dict,
    force = False
) -> Dict:
    """Return the extraction schema cached at ``path``, generating it on a miss.

    Args:
        path: JSON file where the schema is cached.
        sample_html: Sample markup handed to the schema generator.
        query: Natural-language instructions for the generator.
        example_json: Example of the desired output record.
        force: When true, regenerate even if ``path`` already exists.

    Returns:
        The schema dict, either parsed from ``path`` or freshly generated
        (in which case it is also written back to ``path``).
    """
    cache_hit = path.exists() and not force
    if cache_hit:
        cached_text = path.read_text()
        return json.loads(cached_text)

    logging.info("[SCHEMA] Generating schema %s", path.name)

    # Provider and token come from the environment, with the same fallbacks
    # as before.
    llm_settings = LLMConfig(
        provider=os.getenv("C4AI_SCHEMA_PROVIDER", "openai/gpt-4o"),
        api_token=os.getenv("OPENAI_API_KEY", "env:OPENAI_API_KEY"),
    )
    example_text = json.dumps(example_json, indent=2)
    generated = JsonCssExtractionStrategy.generate_schema(
        html=sample_html,
        llm_config=llm_settings,
        query=query,
        target_json_example=example_text,
    )

    # Persist so later runs skip the generation call.
    path.write_text(json.dumps(generated, indent=2))
    return generated
def _openai_friendly_number(text: str) -> Optional[int]:
    """Parse a human-readable count such as ``'1.5K followers'`` into an int.

    Thousands separators (commas) are removed first. The first run of digits
    is taken as the value; a ``K`` or ``M`` multiplier is honoured only when
    it directly follows the number (optionally after whitespace) and is not
    the first letter of a longer word. This keeps ``'12 members'`` at 12
    rather than treating the ``m`` of "members" as "million".

    Args:
        text: Raw text containing a count, e.g. ``'1,234 followers'``.

    Returns:
        The parsed integer, or ``None`` when ``text`` contains no digits.
    """
    match = re.search(
        r"(\d+)(?:(\.\d+)?\s*([km])(?![a-z]))?",
        text.replace(",", ""),
        re.IGNORECASE,
    )
    if match is None:
        return None

    whole, fraction, suffix = match.groups()
    multiplier = {"k": 1_000, "m": 1_000_000}.get((suffix or "").lower(), 1)
    if fraction:
        # Decimal part only exists together with a suffix ("1.5K" -> 1500);
        # round() guards against float artefacts such as 2299.9999.
        return round(float(whole + fraction) * multiplier)
    return int(whole) * multiplier
# ---------------------------------------------------------------------------
# Core async workers
# ---------------------------------------------------------------------------
async def crawl_company_search(crawler: AsyncWebCrawler, url: str, schema: Dict, limit: int) -> List[Dict]:
    """Paginate 10-item company search pages until `limit` is reached.

    Pages are requested as ``{url}&page=N`` starting at 1. Crawling stops when
    enough companies are collected, when a page fails or yields no extracted
    content, or when a page extracts to an empty batch.

    Args:
        crawler: Started crawler used to fetch each page.
        url: Search URL that already contains a query string.
        schema: JsonCssExtractionStrategy schema for company cards.
        limit: Desired number of companies; values below 10 are treated as 10.

    Returns:
        Up to ``max(limit, 10)`` company records with the keys ``handle``,
        ``name``, ``descriptor``, ``about``, ``followers``, ``people_url``
        and ``captured_at``.
    """
    extraction = JsonCssExtractionStrategy(schema)
    cfg = CrawlerRunConfig(
        extraction_strategy=extraction,
        cache_mode=CacheMode.BYPASS,
        wait_for=".search-marvel-srp",
        session_id="company_search",
        delay_before_return_html=1,
        magic=True,
        verbose=False,
    )

    target = max(limit, 10)
    companies: List[Dict] = []
    page = 1
    while len(companies) < target:
        paged_url = f"{url}&page={page}"
        res = await crawler.arun(paged_url, config=cfg)
        result = res[0]

        # Mirror crawl_people_page: a failed fetch or missing extraction ends
        # the pagination instead of crashing inside json.loads(None).
        if not result.success or not result.extracted_content:
            logging.warning(
                "Company search page %d failed or returned no content; stopping", page
            )
            break

        batch = json.loads(result.extracted_content)
        if not batch:
            break

        for item in batch:
            # `or ""` covers keys that are present but extracted as None.
            name = (item.get("name") or "").strip()
            handle = (item.get("handle") or "").strip()
            if not handle or not name:
                continue
            companies.append(
                {
                    "handle": handle,
                    "name": name,
                    "descriptor": item.get("descriptor"),
                    "about": item.get("about"),
                    "followers": _openai_friendly_number(str(item.get("followers", ""))),
                    # Normalise the slash so a handle without a trailing "/"
                    # still yields ".../people/".
                    "people_url": f"{handle.rstrip('/')}/people/",
                    # isoformat() on an aware UTC datetime already ends in
                    # "+00:00"; swap it for "Z" rather than appending a
                    # second zone designator.
                    "captured_at": datetime.now(UTC)
                    .isoformat(timespec="seconds")
                    .replace("+00:00", "Z"),
                }
            )

        # Report the page that was just processed, then advance.
        logging.info(
            f"[dim]Page {page}[/] — running total: {len(companies)}/{limit} companies"
        )
        page += 1

    return companies[:target]
async def crawl_people_page(
    crawler: AsyncWebCrawler,
    people_url: str,
    schema: Dict,
    limit: int,
    title_kw: str,
) -> List[Dict]:
    """Fetch one company's people page and return up to `limit` person records.

    Args:
        crawler: Started crawler used to fetch the page.
        people_url: URL of the company's people page (no query string).
        schema: JsonCssExtractionStrategy schema for people cards.
        limit: Maximum number of people to return.
        title_kw: Keyword string appended URL-quoted as ``?keywords=``.

    Returns:
        A list of dicts with the keys ``profile_url``, ``name``, ``headline``,
        ``followers``, ``connection_degree`` and ``avatar_url``. The list is
        empty when the crawl fails or nothing was extracted.
    """
    people_u = f"{people_url}?keywords={quote(title_kw)}"
    extraction = JsonCssExtractionStrategy(schema)
    cfg = CrawlerRunConfig(
        extraction_strategy=extraction,
        cache_mode=CacheMode.BYPASS,
        magic=True,
        wait_for=".org-people-profile-card__card-spacing",
        wait_for_images=5000,
        delay_before_return_html=1,
        session_id="people_search",
    )
    res = await crawler.arun(people_u, config=cfg)
    result = res[0]

    # A successful fetch can still carry no extracted content; treat that as
    # "no people" so one empty company page does not abort the whole run.
    if not result.success or not result.extracted_content:
        return []

    raw = json.loads(result.extracted_content)
    return [
        {
            "profile_url": p.get("profile_url"),
            "name": p.get("name"),
            "headline": p.get("headline"),
            "followers": _openai_friendly_number(str(p.get("followers", ""))),
            "connection_degree": p.get("connection_degree"),
            "avatar_url": p.get("avatar_url"),
        }
        for p in raw[:limit]
    ]
# ---------------------------------------------------------------------------
# CLI + main
# ---------------------------------------------------------------------------
def build_arg_parser() -> argparse.ArgumentParser:
    """Build the CLI parser with `full`, `companies` and `people` sub-commands.

    Every sub-command accepts the same set of flags; the only top-level flag
    is ``--debug``. The chosen sub-command is stored as ``cmd`` and is
    ``None`` when no sub-command is given.
    """
    default_profile = str(pathlib.Path.home() / ".crawl4ai/profiles/profile_linkedin_uc")

    # (option strings, add_argument keyword arguments) shared by all scopes.
    shared_flags = (
        (("--query",), dict(required=False, help="query keyword(s)")),
        (("--geo",), dict(required=False, type=int, help="LinkedIn geoUrn")),
        (("--title-filters",), dict(default="Product,Engineering", help="comma list of job keywords")),
        (("--max-companies",), dict(type=int, default=1000)),
        (("--max-people",), dict(type=int, default=500)),
        (("--profile-name",), dict(default=default_profile)),
        (("--outdir",), dict(default="./output")),
        (("--concurrency",), dict(type=int, default=4)),
        (("--log-level",), dict(default="info", choices=["debug", "info", "warn", "error"])),
    )

    root = argparse.ArgumentParser("c4ai-discover — Crawl4AI LinkedIn discovery")
    scopes = root.add_subparsers(dest="cmd", required=False, help="run scope")
    for scope_name in ("full", "companies", "people"):
        scoped = scopes.add_parser(scope_name)
        for option_strings, spec in shared_flags:
            scoped.add_argument(*option_strings, **spec)

    # global flags
    root.add_argument(
        "--debug",
        action="store_true",
        help="Use built-in demo defaults (same as C4AI_DEMO_DEBUG=1)",
    )
    return root
def detect_debug_defaults(force=False) -> SimpleNamespace:
    """Return the built-in demo options, or an empty namespace when not wanted.

    The demo options are returned when ``force`` is true, when a trace
    function is installed (``sys.gettrace()`` is not ``None``), or when the
    ``C4AI_DEMO_DEBUG`` environment variable is set to a non-empty value.
    Otherwise an attribute-less ``SimpleNamespace`` is returned.
    """
    wanted = (
        force
        or sys.gettrace() is not None
        or bool(os.getenv("C4AI_DEMO_DEBUG"))
    )
    if not wanted:
        return SimpleNamespace()

    # ----- debug-friendly defaults -----
    demo_options = {
        "cmd": "full",
        "query": "health insurance management",
        "geo": 102713980,
        # Empty string means "no title filter" (alternative: "Product,Engineering").
        "title_filters": "",
        "max_companies": 10,
        "max_people": 5,
        "profile_name": "profile_linkedin_uc",
        "outdir": "./debug_out",
        "concurrency": 2,
        "log_level": "debug",
    }
    return SimpleNamespace(**demo_options)
async def async_main(opts):
    """Run the discovery stage selected by ``opts.cmd`` and return an exit code.

    ``opts`` must provide ``cmd``, ``query``, ``geo``, ``title_filters``,
    ``max_companies``, ``max_people``, ``profile_name``, ``outdir`` and
    ``log_level``.

    * ``companies`` / ``full``: crawl the company search and append the
      records to ``<outdir>/companies.jsonl``.
    * ``people`` / ``full``: for each company (from this run, or loaded from
      a previous ``companies.jsonl``) crawl its people page and append the
      records to ``<outdir>/people.jsonl``.

    Returns:
        0 on success, 10 when the people stage has no ``companies.jsonl`` to
        load from.
    """
    # ─────────── logging setup ───────────
    console = Console()
    logging.basicConfig(
        level=opts.log_level.upper(),
        format="%(message)s",
        handlers=[RichHandler(console=console, markup=True, rich_tracebacks=True)],
    )

    # -------------------------------------------------------------------
    # Load or build schemas (one-time LLM call each)
    # -------------------------------------------------------------------
    company_schema = _load_or_build_schema(
        COMPANY_SCHEMA_PATH,
        _SAMPLE_COMPANY_HTML,
        _COMPANY_SCHEMA_QUERY,
        _COMPANY_SCHEMA_EXAMPLE,
    )
    people_schema = _load_or_build_schema(
        PEOPLE_SCHEMA_PATH,
        _SAMPLE_PEOPLE_HTML,
        _PEOPLE_SCHEMA_QUERY,
        _PEOPLE_SCHEMA_EXAMPLE,
    )

    outdir = BASE_DIR / pathlib.Path(opts.outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    # Output files are opened only by the stage that writes them. Opening
    # companies.jsonl in append mode up front would create it and make the
    # "missing file" check in the people stage impossible to trigger.
    companies_path = outdir / "companies.jsonl"
    people_path = outdir / "people.jsonl"

    # -------------------------------------------------------------------
    # Prepare a managed browser that reuses the stored browser profile
    # -------------------------------------------------------------------
    profiler = BrowserProfiler()
    profile_path = profiler.get_profile_path(opts.profile_name)
    bc = BrowserConfig(
        headless=False,
        verbose=False,
        user_data_dir=profile_path,
        use_managed_browser=True,
        user_agent_mode="random",
        user_agent_generator_config={
            "platforms": "mobile",
            "os": "Android",
        },
    )
    crawler = AsyncWebCrawler(config=bc)
    await crawler.start()

    # Single worker for simplicity; concurrency can be scaled by arun_many if needed.
    try:
        companies: List[Dict] = []

        if opts.cmd in ("companies", "full"):
            # Build LinkedIn search URL (only needed for the company stage).
            search_url = (
                "https://www.linkedin.com/search/results/companies/"
                f"?keywords={quote(opts.query or '')}"
            )
            if opts.geo is not None:
                search_url += f'&companyHqGeo="{opts.geo}"'
            logging.info("Seed URL => %s", search_url)

            companies = await crawl_company_search(
                crawler, search_url, company_schema, opts.max_companies
            )
            with companies_path.open("a", encoding="utf-8") as f_companies:
                for c in companies:
                    f_companies.write(json.dumps(c, ensure_ascii=False) + "\n")
            logging.info(f"[bold green]✓[/] Companies scraped so far: {len(companies)}")

        if opts.cmd in ("people", "full"):
            if not companies:
                # load from previous run
                if not companies_path.exists():
                    logging.error("companies.jsonl missing — run companies/full first")
                    return 10
                # Read with the same encoding used for writing; skip blank
                # lines so a stray newline does not raise JSONDecodeError.
                companies = [
                    json.loads(line)
                    for line in companies_path.read_text(encoding="utf-8").splitlines()
                    if line.strip()
                ]

            total_people = 0
            title_kw = " ".join(
                t.strip() for t in (opts.title_filters or "").split(",") if t.strip()
            )
            with people_path.open("a", encoding="utf-8") as f_people:
                for comp in companies:
                    people = await crawl_people_page(
                        crawler,
                        comp["people_url"],
                        people_schema,
                        opts.max_people,
                        title_kw,
                    )
                    for p in people:
                        rec = p | {
                            "company_handle": comp["handle"],
                            # isoformat() already ends in "+00:00" for UTC;
                            # replace it with "Z" instead of appending "Z".
                            "captured_at": datetime.now(UTC)
                            .isoformat(timespec="seconds")
                            .replace("+00:00", "Z"),
                        }
                        f_people.write(json.dumps(rec, ensure_ascii=False) + "\n")
                    total_people += len(people)
                    logging.info(
                        f"{comp['name']} — [cyan]{len(people)}[/] people extracted"
                    )
                    # Short random pause between company pages.
                    await asyncio.sleep(random.uniform(0.5, 1))
            logging.info("Total people scraped: %d", total_people)
    finally:
        await crawler.close()
    return 0
def main():
    """Parse CLI arguments, choose the effective options and run the crawl.

    The built-in demo defaults are used when ``--debug`` is passed, when the
    ``C4AI_DEMO_DEBUG`` environment variable is set, or when the script is
    started without any CLI arguments. In every other case the parsed CLI
    flags win. The process exits with the code returned by ``async_main``.
    """
    parser = build_arg_parser()
    cli_opts = parser.parse_args()

    # decide on debug defaults
    demo_requested = cli_opts.debug or bool(os.getenv("C4AI_DEMO_DEBUG"))
    if demo_requested or len(sys.argv) <= 1:
        opts = detect_debug_defaults(force=True)
    else:
        opts = cli_opts

    if not getattr(opts, "cmd", None):
        opts.cmd = "full"

    # The company search cannot be built without keywords; fail with a clear
    # usage error instead of a TypeError deep inside the crawl.
    if opts.cmd in ("companies", "full") and not getattr(opts, "query", None):
        parser.error("--query is required for the 'companies' and 'full' commands")

    # Pass the options that were actually selected (not the raw CLI namespace).
    exit_code = asyncio.run(async_main(opts))
    sys.exit(exit_code)


if __name__ == "__main__":
    main()