#!/usr/bin/env python3 """ c4ai-discover — Stage‑1 Discovery CLI Scrapes LinkedIn company search + their people pages and dumps two newline‑delimited JSON files: companies.jsonl and people.jsonl. Key design rules ---------------- * No BeautifulSoup — Crawl4AI only for network + HTML fetch. * JsonCssExtractionStrategy for structured scraping; schema auto‑generated once from sample HTML provided by user and then cached under ./schemas/. * Defaults are embedded so the file runs inside VS Code debugger without CLI args. * If executed as a console script (argv > 1), CLI flags win. * Lightweight deps: argparse + Crawl4AI stack. Author: Tom @ Kidocode 2025‑04‑26 """ from __future__ import annotations import warnings, re warnings.filterwarnings( "ignore", message=r"The pseudo class ':contains' is deprecated, ':-soup-contains' should be used.*", category=FutureWarning, module=r"soupsieve" ) # ─────────────────────────────────────────────────────────────────────────────── # Imports # ─────────────────────────────────────────────────────────────────────────────── import argparse import random import asyncio import json import logging import os import pathlib import sys # 3rd-party rich for pretty logging from rich.console import Console from rich.logging import RichHandler from datetime import datetime, UTC from textwrap import dedent from types import SimpleNamespace from typing import Dict, List, Optional from urllib.parse import quote from pathlib import Path from glob import glob from crawl4ai import ( AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig, JsonCssExtractionStrategy, BrowserProfiler, LLMConfig, ) # ─────────────────────────────────────────────────────────────────────────────── # Constants / paths # ─────────────────────────────────────────────────────────────────────────────── BASE_DIR = pathlib.Path(__file__).resolve().parent SCHEMA_DIR = BASE_DIR / "schemas" SCHEMA_DIR.mkdir(parents=True, exist_ok=True) COMPANY_SCHEMA_PATH = SCHEMA_DIR / "company_card.json" PEOPLE_SCHEMA_PATH = SCHEMA_DIR / "people_card.json" # ---------- deterministic target JSON examples ---------- _COMPANY_SCHEMA_EXAMPLE = { "handle": "/company/posify/", "profile_image": "https://media.licdn.com/dms/image/v2/.../logo.jpg", "name": "Management Research Services, Inc. (MRS, Inc)", "descriptor": "Insurance • Milwaukee, Wisconsin", "about": "Insurance • Milwaukee, Wisconsin", "followers": 1000 } _PEOPLE_SCHEMA_EXAMPLE = { "profile_url": "https://www.linkedin.com/in/lily-ng/", "name": "Lily Ng", "headline": "VP Product @ Posify", "followers": 890, "connection_degree": "2nd", "avatar_url": "https://media.licdn.com/dms/image/v2/.../lily.jpg" } # Provided sample HTML snippets (trimmed) — used exactly once to cold‑generate schema. _SAMPLE_COMPANY_HTML = (Path(__file__).resolve().parent / "snippets/company.html").read_text() _SAMPLE_PEOPLE_HTML = (Path(__file__).resolve().parent / "snippets/people.html").read_text() # --------- tighter schema prompts ---------- _COMPANY_SCHEMA_QUERY = dedent( """ Using the supplied
  • company-card HTML, build a JsonCssExtractionStrategy schema that, for every card, outputs *exactly* the keys shown in the example JSON below. JSON spec: • handle – href of the outermost that wraps the logo/title, e.g. "/company/posify/" • profile_image – absolute URL of the inside that link • name – text of the inside the • descriptor – text line with industry • location • about – text of the
    below the name (industry + geo) • followers – integer parsed from the
    containing 'followers' IMPORTANT: Do not use the base64 kind of classes to target element. It's not reliable. The main div parent contains these li element is "div.search-results-container" you can use this. The