Merge branch 'next' into 2025-APR-1

This commit is contained in:
Aravind Karnam
2025-05-07 10:25:25 +05:30
42 changed files with 4861 additions and 561 deletions

4
.gitignore vendored
View File

@@ -261,4 +261,6 @@ CLAUDE.md
tests/**/test_site tests/**/test_site
tests/**/reports tests/**/reports
tests/**/benchmark_reports tests/**/benchmark_reports
.codecat/

View File

@@ -5,6 +5,30 @@ All notable changes to Crawl4AI will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [0.6.2] - 2025-05-02
### Added
- New `RegexExtractionStrategy` for fast pattern-based extraction without requiring LLM
- Built-in patterns for emails, URLs, phone numbers, dates, and more
- Support for custom regex patterns
- `generate_pattern` utility for LLM-assisted pattern creation (one-time use)
- Added `fit_html` as a top-level field in `CrawlResult` for optimized HTML extraction
- Added support for network response body capture in network request tracking
### Changed
- Updated documentation for no-LLM extraction strategies
- Enhanced API reference to include RegexExtractionStrategy examples and usage
- Improved HTML preprocessing with optimized performance for extraction strategies
## [0.6.1] - 2025-04-24
### Added
- New dedicated `tables` field in `CrawlResult` model for better table extraction handling
- Updated crypto_analysis_example.py to use the new tables field with backward compatibility
### Changed
- Improved playground UI in Docker deployment with better endpoint handling and UI feedback
## [0.6.0] 20250422 ## [0.6.0] 20250422
### Added ### Added

View File

@@ -23,7 +23,8 @@ from .extraction_strategy import (
CosineStrategy, CosineStrategy,
JsonCssExtractionStrategy, JsonCssExtractionStrategy,
JsonXPathExtractionStrategy, JsonXPathExtractionStrategy,
JsonLxmlExtractionStrategy JsonLxmlExtractionStrategy,
RegexExtractionStrategy
) )
from .chunking_strategy import ChunkingStrategy, RegexChunking from .chunking_strategy import ChunkingStrategy, RegexChunking
from .markdown_generation_strategy import DefaultMarkdownGenerator from .markdown_generation_strategy import DefaultMarkdownGenerator
@@ -105,6 +106,7 @@ __all__ = [
"JsonCssExtractionStrategy", "JsonCssExtractionStrategy",
"JsonXPathExtractionStrategy", "JsonXPathExtractionStrategy",
"JsonLxmlExtractionStrategy", "JsonLxmlExtractionStrategy",
"RegexExtractionStrategy",
"ChunkingStrategy", "ChunkingStrategy",
"RegexChunking", "RegexChunking",
"DefaultMarkdownGenerator", "DefaultMarkdownGenerator",

View File

@@ -1,3 +1,3 @@
# crawl4ai/_version.py # crawl4ai/_version.py
__version__ = "0.6.0" __version__ = "0.6.3"

View File

@@ -427,7 +427,7 @@ class BrowserConfig:
host: str = "localhost", host: str = "localhost",
): ):
self.browser_type = browser_type self.browser_type = browser_type
self.headless = headless or True self.headless = headless
self.browser_mode = browser_mode self.browser_mode = browser_mode
self.use_managed_browser = use_managed_browser self.use_managed_browser = use_managed_browser
self.cdp_url = cdp_url self.cdp_url = cdp_url

View File

@@ -572,6 +572,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
async def handle_response_capture(response): async def handle_response_capture(response):
try: try:
try:
# body = await response.body()
# json_body = await response.json()
text_body = await response.text()
except Exception as e:
body = None
# json_body = None
# text_body = None
captured_requests.append({ captured_requests.append({
"event_type": "response", "event_type": "response",
"url": response.url, "url": response.url,
@@ -580,7 +588,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
"headers": dict(response.headers), # Convert Header dict "headers": dict(response.headers), # Convert Header dict
"from_service_worker": response.from_service_worker, "from_service_worker": response.from_service_worker,
"request_timing": response.request.timing, # Detailed timing info "request_timing": response.request.timing, # Detailed timing info
"timestamp": time.time() "timestamp": time.time(),
"body" : {
# "raw": body,
# "json": json_body,
"text": text_body
}
}) })
except Exception as e: except Exception as e:
if self.logger: if self.logger:

View File

@@ -171,7 +171,10 @@ class AsyncDatabaseManager:
f"Code context:\n{error_context['code_context']}" f"Code context:\n{error_context['code_context']}"
) )
self.logger.error( self.logger.error(
message=create_box_message(error_message, type="error"), message="{error}",
tag="ERROR",
params={"error": str(error_message)},
boxes=["error"],
) )
raise raise
@@ -189,7 +192,10 @@ class AsyncDatabaseManager:
f"Code context:\n{error_context['code_context']}" f"Code context:\n{error_context['code_context']}"
) )
self.logger.error( self.logger.error(
message=create_box_message(error_message, type="error"), message="{error}",
tag="ERROR",
params={"error": str(error_message)},
boxes=["error"],
) )
raise raise
finally: finally:

View File

@@ -1,10 +1,12 @@
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from enum import Enum from enum import Enum
from typing import Optional, Dict, Any from typing import Optional, Dict, Any, List
from colorama import Fore, Style, init
import os import os
from datetime import datetime from datetime import datetime
from urllib.parse import unquote from urllib.parse import unquote
from rich.console import Console
from rich.text import Text
from .utils import create_box_message
class LogLevel(Enum): class LogLevel(Enum):
@@ -21,6 +23,26 @@ class LogLevel(Enum):
FATAL = 10 FATAL = 10
def __str__(self):
return self.name.lower()
class LogColor(str, Enum):
"""Enum for log colors."""
DEBUG = "lightblack"
INFO = "cyan"
SUCCESS = "green"
WARNING = "yellow"
ERROR = "red"
CYAN = "cyan"
GREEN = "green"
YELLOW = "yellow"
MAGENTA = "magenta"
DIM_MAGENTA = "dim magenta"
def __str__(self):
"""Automatically convert rich color to string."""
return self.value
class AsyncLoggerBase(ABC): class AsyncLoggerBase(ABC):
@@ -52,6 +74,7 @@ class AsyncLoggerBase(ABC):
def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 100): def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 100):
pass pass
class AsyncLogger(AsyncLoggerBase): class AsyncLogger(AsyncLoggerBase):
""" """
Asynchronous logger with support for colored console output and file logging. Asynchronous logger with support for colored console output and file logging.
@@ -79,17 +102,11 @@ class AsyncLogger(AsyncLoggerBase):
} }
DEFAULT_COLORS = { DEFAULT_COLORS = {
LogLevel.DEBUG: Fore.LIGHTBLACK_EX, LogLevel.DEBUG: LogColor.DEBUG,
LogLevel.INFO: Fore.CYAN, LogLevel.INFO: LogColor.INFO,
LogLevel.SUCCESS: Fore.GREEN, LogLevel.SUCCESS: LogColor.SUCCESS,
LogLevel.WARNING: Fore.YELLOW, LogLevel.WARNING: LogColor.WARNING,
LogLevel.ERROR: Fore.RED, LogLevel.ERROR: LogColor.ERROR,
LogLevel.CRITICAL: Fore.RED + Style.BRIGHT,
LogLevel.ALERT: Fore.RED + Style.BRIGHT,
LogLevel.NOTICE: Fore.BLUE,
LogLevel.EXCEPTION: Fore.RED + Style.BRIGHT,
LogLevel.FATAL: Fore.RED + Style.BRIGHT,
LogLevel.DEFAULT: Fore.WHITE,
} }
def __init__( def __init__(
@@ -98,7 +115,7 @@ class AsyncLogger(AsyncLoggerBase):
log_level: LogLevel = LogLevel.DEBUG, log_level: LogLevel = LogLevel.DEBUG,
tag_width: int = 10, tag_width: int = 10,
icons: Optional[Dict[str, str]] = None, icons: Optional[Dict[str, str]] = None,
colors: Optional[Dict[LogLevel, str]] = None, colors: Optional[Dict[LogLevel, LogColor]] = None,
verbose: bool = True, verbose: bool = True,
): ):
""" """
@@ -112,13 +129,13 @@ class AsyncLogger(AsyncLoggerBase):
colors: Custom colors for different log levels colors: Custom colors for different log levels
verbose: Whether to output to console verbose: Whether to output to console
""" """
init() # Initialize colorama
self.log_file = log_file self.log_file = log_file
self.log_level = log_level self.log_level = log_level
self.tag_width = tag_width self.tag_width = tag_width
self.icons = icons or self.DEFAULT_ICONS self.icons = icons or self.DEFAULT_ICONS
self.colors = colors or self.DEFAULT_COLORS self.colors = colors or self.DEFAULT_COLORS
self.verbose = verbose self.verbose = verbose
self.console = Console()
# Create log file directory if needed # Create log file directory if needed
if log_file: if log_file:
@@ -143,16 +160,11 @@ class AsyncLogger(AsyncLoggerBase):
def _write_to_file(self, message: str): def _write_to_file(self, message: str):
"""Write a message to the log file if configured.""" """Write a message to the log file if configured."""
if self.log_file: if self.log_file:
text = Text.from_markup(message)
plain_text = text.plain
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
with open(self.log_file, "a", encoding="utf-8") as f: with open(self.log_file, "a", encoding="utf-8") as f:
# Strip ANSI color codes for file output f.write(f"[{timestamp}] {plain_text}\n")
clean_message = message.replace(Fore.RESET, "").replace(
Style.RESET_ALL, ""
)
for color in vars(Fore).values():
if isinstance(color, str):
clean_message = clean_message.replace(color, "")
f.write(f"[{timestamp}] {clean_message}\n")
def _log( def _log(
self, self,
@@ -160,8 +172,9 @@ class AsyncLogger(AsyncLoggerBase):
message: str, message: str,
tag: str, tag: str,
params: Optional[Dict[str, Any]] = None, params: Optional[Dict[str, Any]] = None,
colors: Optional[Dict[str, str]] = None, colors: Optional[Dict[str, LogColor]] = None,
base_color: Optional[str] = None, boxes: Optional[List[str]] = None,
base_color: Optional[LogColor] = None,
**kwargs, **kwargs,
): ):
""" """
@@ -173,55 +186,44 @@ class AsyncLogger(AsyncLoggerBase):
tag: Tag for the message tag: Tag for the message
params: Parameters to format into the message params: Parameters to format into the message
colors: Color overrides for specific parameters colors: Color overrides for specific parameters
boxes: Box overrides for specific parameters
base_color: Base color for the entire message base_color: Base color for the entire message
""" """
if level.value < self.log_level.value: if level.value < self.log_level.value:
return return
# Format the message with parameters if provided # avoid conflict with rich formatting
parsed_message = message.replace("[", "[[").replace("]", "]]")
if params: if params:
try: # FIXME: If there are formatting strings in floating point format,
# First format the message with raw parameters # this may result in colors and boxes not being applied properly.
formatted_message = message.format(**params) # such as {value:.2f}, the value is 0.23333 format it to 0.23,
# but we replace("0.23333", "[color]0.23333[/color]")
formatted_message = parsed_message.format(**params)
for key, value in params.items():
# value_str may discard `[` and `]`, so we need to replace it.
value_str = str(value).replace("[", "[[").replace("]", "]]")
# check is need apply color
if colors and key in colors:
color_str = f"[{colors[key]}]{value_str}[/{colors[key]}]"
formatted_message = formatted_message.replace(value_str, color_str)
value_str = color_str
# Then apply colors if specified # check is need apply box
color_map = { if boxes and key in boxes:
"green": Fore.GREEN, formatted_message = formatted_message.replace(value_str,
"red": Fore.RED, create_box_message(value_str, type=str(level)))
"yellow": Fore.YELLOW,
"blue": Fore.BLUE,
"cyan": Fore.CYAN,
"magenta": Fore.MAGENTA,
"white": Fore.WHITE,
"black": Fore.BLACK,
"reset": Style.RESET_ALL,
}
if colors:
for key, color in colors.items():
# Find the formatted value in the message and wrap it with color
if color in color_map:
color = color_map[color]
if key in params:
value_str = str(params[key])
formatted_message = formatted_message.replace(
value_str, f"{color}{value_str}{Style.RESET_ALL}"
)
except KeyError as e:
formatted_message = (
f"LOGGING ERROR: Missing parameter {e} in message template"
)
level = LogLevel.ERROR
else: else:
formatted_message = message formatted_message = parsed_message
# Construct the full log line # Construct the full log line
color = base_color or self.colors[level] color: LogColor = base_color or self.colors[level]
log_line = f"{color}{self._format_tag(tag)} {self._get_icon(tag)} {formatted_message}{Style.RESET_ALL}" log_line = f"[{color}]{self._format_tag(tag)} {self._get_icon(tag)} {formatted_message} [/{color}]"
# Output to console if verbose # Output to console if verbose
if self.verbose or kwargs.get("force_verbose", False): if self.verbose or kwargs.get("force_verbose", False):
print(log_line) self.console.print(log_line)
# Write to file if configured # Write to file if configured
self._write_to_file(log_line) self._write_to_file(log_line)
@@ -292,8 +294,8 @@ class AsyncLogger(AsyncLoggerBase):
"timing": timing, "timing": timing,
}, },
colors={ colors={
"status": Fore.GREEN if success else Fore.RED, "status": LogColor.SUCCESS if success else LogColor.ERROR,
"timing": Fore.YELLOW, "timing": LogColor.WARNING,
}, },
) )

View File

@@ -2,7 +2,6 @@ from .__version__ import __version__ as crawl4ai_version
import os import os
import sys import sys
import time import time
from colorama import Fore
from pathlib import Path from pathlib import Path
from typing import Optional, List from typing import Optional, List
import json import json
@@ -44,7 +43,6 @@ from .utils import (
sanitize_input_encode, sanitize_input_encode,
InvalidCSSSelectorError, InvalidCSSSelectorError,
fast_format_html, fast_format_html,
create_box_message,
get_error_context, get_error_context,
RobotsParser, RobotsParser,
preprocess_html_for_schema, preprocess_html_for_schema,
@@ -419,7 +417,7 @@ class AsyncWebCrawler:
self.logger.error_status( self.logger.error_status(
url=url, url=url,
error=create_box_message(error_message, type="error"), error=error_message,
tag="ERROR", tag="ERROR",
) )
@@ -496,13 +494,17 @@ class AsyncWebCrawler:
cleaned_html = sanitize_input_encode( cleaned_html = sanitize_input_encode(
result.get("cleaned_html", "")) result.get("cleaned_html", ""))
media = result.get("media", {}) media = result.get("media", {})
tables = media.pop("tables", []) if isinstance(media, dict) else []
links = result.get("links", {}) links = result.get("links", {})
metadata = result.get("metadata", {}) metadata = result.get("metadata", {})
else: else:
cleaned_html = sanitize_input_encode(result.cleaned_html) cleaned_html = sanitize_input_encode(result.cleaned_html)
media = result.media.model_dump() media = result.media.model_dump()
tables = media.pop("tables", [])
links = result.links.model_dump() links = result.links.model_dump()
metadata = result.metadata metadata = result.metadata
fit_html = preprocess_html_for_schema(html_content=html, text_threshold= 500, max_size= 300_000)
################################ ################################
# Generate Markdown # # Generate Markdown #
@@ -519,7 +521,7 @@ class AsyncWebCrawler:
html_source_selector = { html_source_selector = {
"raw_html": lambda: html, # The original raw HTML "raw_html": lambda: html, # The original raw HTML
"cleaned_html": lambda: cleaned_html, # The HTML after scraping strategy "cleaned_html": lambda: cleaned_html, # The HTML after scraping strategy
"fit_html": lambda: preprocess_html_for_schema(html_content=html), # Preprocessed raw HTML "fit_html": lambda: fit_html, # The HTML after preprocessing for schema
} }
markdown_input_html = cleaned_html # Default to cleaned_html markdown_input_html = cleaned_html # Default to cleaned_html
@@ -593,6 +595,7 @@ class AsyncWebCrawler:
content = { content = {
"markdown": markdown_result.raw_markdown, "markdown": markdown_result.raw_markdown,
"html": html, "html": html,
"fit_html": fit_html,
"cleaned_html": cleaned_html, "cleaned_html": cleaned_html,
"fit_markdown": markdown_result.fit_markdown, "fit_markdown": markdown_result.fit_markdown,
}.get(content_format, markdown_result.raw_markdown) }.get(content_format, markdown_result.raw_markdown)
@@ -600,7 +603,7 @@ class AsyncWebCrawler:
# Use IdentityChunking for HTML input, otherwise use provided chunking strategy # Use IdentityChunking for HTML input, otherwise use provided chunking strategy
chunking = ( chunking = (
IdentityChunking() IdentityChunking()
if content_format in ["html", "cleaned_html"] if content_format in ["html", "cleaned_html", "fit_html"]
else config.chunking_strategy else config.chunking_strategy
) )
sections = chunking.chunk(content) sections = chunking.chunk(content)
@@ -624,9 +627,11 @@ class AsyncWebCrawler:
return CrawlResult( return CrawlResult(
url=url, url=url,
html=html, html=html,
fit_html=fit_html,
cleaned_html=cleaned_html, cleaned_html=cleaned_html,
markdown=markdown_result, markdown=markdown_result,
media=media, media=media,
tables=tables, # NEW
links=links, links=links,
metadata=metadata, metadata=metadata,
screenshot=screenshot_data, screenshot=screenshot_data,

View File

@@ -5,7 +5,10 @@ import os
import sys import sys
import shutil import shutil
import tempfile import tempfile
import psutil
import signal
import subprocess import subprocess
import shlex
from playwright.async_api import BrowserContext from playwright.async_api import BrowserContext
import hashlib import hashlib
from .js_snippet import load_js_script from .js_snippet import load_js_script
@@ -193,6 +196,45 @@ class ManagedBrowser:
if self.browser_config.extra_args: if self.browser_config.extra_args:
args.extend(self.browser_config.extra_args) args.extend(self.browser_config.extra_args)
# ── make sure no old Chromium instance is owning the same port/profile ──
try:
if sys.platform == "win32":
if psutil is None:
raise RuntimeError("psutil not available, cannot clean old browser")
for p in psutil.process_iter(["pid", "name", "cmdline"]):
cl = " ".join(p.info.get("cmdline") or [])
if (
f"--remote-debugging-port={self.debugging_port}" in cl
and f"--user-data-dir={self.user_data_dir}" in cl
):
p.kill()
p.wait(timeout=5)
else: # macOS / Linux
# kill any process listening on the same debugging port
pids = (
subprocess.check_output(shlex.split(f"lsof -t -i:{self.debugging_port}"))
.decode()
.strip()
.splitlines()
)
for pid in pids:
try:
os.kill(int(pid), signal.SIGTERM)
except ProcessLookupError:
pass
# remove Chromium singleton locks, or new launch exits with
# “Opening in existing browser session.”
for f in ("SingletonLock", "SingletonSocket", "SingletonCookie"):
fp = os.path.join(self.user_data_dir, f)
if os.path.exists(fp):
os.remove(fp)
except Exception as _e:
# non-fatal — we'll try to start anyway, but log what happened
self.logger.warning(f"pre-launch cleanup failed: {_e}", tag="BROWSER")
# Start browser process # Start browser process
try: try:
@@ -922,7 +964,7 @@ class BrowserManager:
pages = context.pages pages = context.pages
page = next((p for p in pages if p.url == crawlerRunConfig.url), None) page = next((p for p in pages if p.url == crawlerRunConfig.url), None)
if not page: if not page:
page = await context.new_page() page = context.pages[0] # await context.new_page()
else: else:
# Otherwise, check if we have an existing context for this config # Otherwise, check if we have an existing context for this config
config_signature = self._make_config_signature(crawlerRunConfig) config_signature = self._make_config_signature(crawlerRunConfig)

View File

@@ -15,12 +15,12 @@ import shutil
import json import json
import subprocess import subprocess
import time import time
from typing import List, Dict, Optional, Any, Tuple from typing import List, Dict, Optional, Any
from colorama import Fore, Style, init from rich.console import Console
from .async_configs import BrowserConfig from .async_configs import BrowserConfig
from .browser_manager import ManagedBrowser from .browser_manager import ManagedBrowser
from .async_logger import AsyncLogger, AsyncLoggerBase from .async_logger import AsyncLogger, AsyncLoggerBase, LogColor
from .utils import get_home_folder from .utils import get_home_folder
@@ -45,8 +45,8 @@ class BrowserProfiler:
logger (AsyncLoggerBase, optional): Logger for outputting messages. logger (AsyncLoggerBase, optional): Logger for outputting messages.
If None, a default AsyncLogger will be created. If None, a default AsyncLogger will be created.
""" """
# Initialize colorama for colorful terminal output # Initialize rich console for colorful input prompts
init() self.console = Console()
# Create a logger if not provided # Create a logger if not provided
if logger is None: if logger is None:
@@ -127,26 +127,30 @@ class BrowserProfiler:
profile_path = os.path.join(self.profiles_dir, profile_name) profile_path = os.path.join(self.profiles_dir, profile_name)
os.makedirs(profile_path, exist_ok=True) os.makedirs(profile_path, exist_ok=True)
# Print instructions for the user with colorama formatting # Print instructions for the user with rich formatting
border = f"{Fore.CYAN}{'='*80}{Style.RESET_ALL}" border = "{'='*80}"
self.logger.info(f"\n{border}", tag="PROFILE") self.logger.info("{border}", tag="PROFILE", params={"border": f"\n{border}"}, colors={"border": LogColor.CYAN})
self.logger.info(f"Creating browser profile: {Fore.GREEN}{profile_name}{Style.RESET_ALL}", tag="PROFILE") self.logger.info("Creating browser profile: {profile_name}", tag="PROFILE", params={"profile_name": profile_name}, colors={"profile_name": LogColor.GREEN})
self.logger.info(f"Profile directory: {Fore.YELLOW}{profile_path}{Style.RESET_ALL}", tag="PROFILE") self.logger.info("Profile directory: {profile_path}", tag="PROFILE", params={"profile_path": profile_path}, colors={"profile_path": LogColor.YELLOW})
self.logger.info("\nInstructions:", tag="PROFILE") self.logger.info("\nInstructions:", tag="PROFILE")
self.logger.info("1. A browser window will open for you to set up your profile.", tag="PROFILE") self.logger.info("1. A browser window will open for you to set up your profile.", tag="PROFILE")
self.logger.info(f"2. {Fore.CYAN}Log in to websites{Style.RESET_ALL}, configure settings, etc. as needed.", tag="PROFILE") self.logger.info("{segment}, configure settings, etc. as needed.", tag="PROFILE", params={"segment": "2. Log in to websites"}, colors={"segment": LogColor.CYAN})
self.logger.info(f"3. When you're done, {Fore.YELLOW}press 'q' in this terminal{Style.RESET_ALL} to close the browser.", tag="PROFILE") self.logger.info("3. When you're done, {segment} to close the browser.", tag="PROFILE", params={"segment": "press 'q' in this terminal"}, colors={"segment": LogColor.YELLOW})
self.logger.info("4. The profile will be saved and ready to use with Crawl4AI.", tag="PROFILE") self.logger.info("4. The profile will be saved and ready to use with Crawl4AI.", tag="PROFILE")
self.logger.info(f"{border}\n", tag="PROFILE") self.logger.info("{border}", tag="PROFILE", params={"border": f"{border}\n"}, colors={"border": LogColor.CYAN})
browser_config.headless = False
browser_config.user_data_dir = profile_path
# Create managed browser instance # Create managed browser instance
managed_browser = ManagedBrowser( managed_browser = ManagedBrowser(
browser_type=browser_config.browser_type, browser_config=browser_config,
user_data_dir=profile_path, # user_data_dir=profile_path,
headless=False, # Must be visible # headless=False, # Must be visible
logger=self.logger, logger=self.logger,
debugging_port=browser_config.debugging_port # debugging_port=browser_config.debugging_port
) )
# Set up signal handlers to ensure cleanup on interrupt # Set up signal handlers to ensure cleanup on interrupt
@@ -181,7 +185,7 @@ class BrowserProfiler:
import select import select
# First output the prompt # First output the prompt
self.logger.info(f"{Fore.CYAN}Press '{Fore.WHITE}q{Fore.CYAN}' when you've finished using the browser...{Style.RESET_ALL}", tag="PROFILE") self.logger.info("Press 'q' when you've finished using the browser...", tag="PROFILE")
# Save original terminal settings # Save original terminal settings
fd = sys.stdin.fileno() fd = sys.stdin.fileno()
@@ -197,7 +201,7 @@ class BrowserProfiler:
if readable: if readable:
key = sys.stdin.read(1) key = sys.stdin.read(1)
if key.lower() == 'q': if key.lower() == 'q':
self.logger.info(f"{Fore.GREEN}Closing browser and saving profile...{Style.RESET_ALL}", tag="PROFILE") self.logger.info("Closing browser and saving profile...", tag="PROFILE", base_color=LogColor.GREEN)
user_done_event.set() user_done_event.set()
return return
@@ -223,7 +227,7 @@ class BrowserProfiler:
self.logger.error("Failed to start browser process.", tag="PROFILE") self.logger.error("Failed to start browser process.", tag="PROFILE")
return None return None
self.logger.info(f"Browser launched. {Fore.CYAN}Waiting for you to finish...{Style.RESET_ALL}", tag="PROFILE") self.logger.info("Browser launched. Waiting for you to finish...", tag="PROFILE")
# Start listening for keyboard input # Start listening for keyboard input
listener_task = asyncio.create_task(listen_for_quit_command()) listener_task = asyncio.create_task(listen_for_quit_command())
@@ -245,10 +249,10 @@ class BrowserProfiler:
self.logger.info("Terminating browser process...", tag="PROFILE") self.logger.info("Terminating browser process...", tag="PROFILE")
await managed_browser.cleanup() await managed_browser.cleanup()
self.logger.success(f"Browser closed. Profile saved at: {Fore.GREEN}{profile_path}{Style.RESET_ALL}", tag="PROFILE") self.logger.success(f"Browser closed. Profile saved at: {profile_path}", tag="PROFILE")
except Exception as e: except Exception as e:
self.logger.error(f"Error creating profile: {str(e)}", tag="PROFILE") self.logger.error(f"Error creating profile: {e!s}", tag="PROFILE")
await managed_browser.cleanup() await managed_browser.cleanup()
return None return None
finally: finally:
@@ -440,25 +444,27 @@ class BrowserProfiler:
``` ```
""" """
while True: while True:
self.logger.info(f"\n{Fore.CYAN}Profile Management Options:{Style.RESET_ALL}", tag="MENU") self.logger.info("\nProfile Management Options:", tag="MENU")
self.logger.info(f"1. {Fore.GREEN}Create a new profile{Style.RESET_ALL}", tag="MENU") self.logger.info("1. Create a new profile", tag="MENU", base_color=LogColor.GREEN)
self.logger.info(f"2. {Fore.YELLOW}List available profiles{Style.RESET_ALL}", tag="MENU") self.logger.info("2. List available profiles", tag="MENU", base_color=LogColor.YELLOW)
self.logger.info(f"3. {Fore.RED}Delete a profile{Style.RESET_ALL}", tag="MENU") self.logger.info("3. Delete a profile", tag="MENU", base_color=LogColor.RED)
# Only show crawl option if callback provided # Only show crawl option if callback provided
if crawl_callback: if crawl_callback:
self.logger.info(f"4. {Fore.CYAN}Use a profile to crawl a website{Style.RESET_ALL}", tag="MENU") self.logger.info("4. Use a profile to crawl a website", tag="MENU", base_color=LogColor.CYAN)
self.logger.info(f"5. {Fore.MAGENTA}Exit{Style.RESET_ALL}", tag="MENU") self.logger.info("5. Exit", tag="MENU", base_color=LogColor.MAGENTA)
exit_option = "5" exit_option = "5"
else: else:
self.logger.info(f"4. {Fore.MAGENTA}Exit{Style.RESET_ALL}", tag="MENU") self.logger.info("4. Exit", tag="MENU", base_color=LogColor.MAGENTA)
exit_option = "4" exit_option = "4"
choice = input(f"\n{Fore.CYAN}Enter your choice (1-{exit_option}): {Style.RESET_ALL}") self.logger.print(f"\n[cyan]Enter your choice (1-{exit_option}): [/cyan]", end="")
choice = input()
if choice == "1": if choice == "1":
# Create new profile # Create new profile
name = input(f"{Fore.GREEN}Enter a name for the new profile (or press Enter for auto-generated name): {Style.RESET_ALL}") self.console.print("[green]Enter a name for the new profile (or press Enter for auto-generated name): [/green]", end="")
name = input()
await self.create_profile(name or None) await self.create_profile(name or None)
elif choice == "2": elif choice == "2":
@@ -469,11 +475,11 @@ class BrowserProfiler:
self.logger.warning(" No profiles found. Create one first with option 1.", tag="PROFILES") self.logger.warning(" No profiles found. Create one first with option 1.", tag="PROFILES")
continue continue
# Print profile information with colorama formatting # Print profile information
self.logger.info("\nAvailable profiles:", tag="PROFILES") self.logger.info("\nAvailable profiles:", tag="PROFILES")
for i, profile in enumerate(profiles): for i, profile in enumerate(profiles):
self.logger.info(f"[{i+1}] {Fore.CYAN}{profile['name']}{Style.RESET_ALL}", tag="PROFILES") self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES")
self.logger.info(f" Path: {Fore.YELLOW}{profile['path']}{Style.RESET_ALL}", tag="PROFILES") self.logger.info(f" Path: {profile['path']}", tag="PROFILES", base_color=LogColor.YELLOW)
self.logger.info(f" Created: {profile['created'].strftime('%Y-%m-%d %H:%M:%S')}", tag="PROFILES") self.logger.info(f" Created: {profile['created'].strftime('%Y-%m-%d %H:%M:%S')}", tag="PROFILES")
self.logger.info(f" Browser type: {profile['type']}", tag="PROFILES") self.logger.info(f" Browser type: {profile['type']}", tag="PROFILES")
self.logger.info("", tag="PROFILES") # Empty line for spacing self.logger.info("", tag="PROFILES") # Empty line for spacing
@@ -486,12 +492,13 @@ class BrowserProfiler:
continue continue
# Display numbered list # Display numbered list
self.logger.info(f"\n{Fore.YELLOW}Available profiles:{Style.RESET_ALL}", tag="PROFILES") self.logger.info("\nAvailable profiles:", tag="PROFILES", base_color=LogColor.YELLOW)
for i, profile in enumerate(profiles): for i, profile in enumerate(profiles):
self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES") self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES")
# Get profile to delete # Get profile to delete
profile_idx = input(f"{Fore.RED}Enter the number of the profile to delete (or 'c' to cancel): {Style.RESET_ALL}") self.console.print("[red]Enter the number of the profile to delete (or 'c' to cancel): [/red]", end="")
profile_idx = input()
if profile_idx.lower() == 'c': if profile_idx.lower() == 'c':
continue continue
@@ -499,17 +506,18 @@ class BrowserProfiler:
idx = int(profile_idx) - 1 idx = int(profile_idx) - 1
if 0 <= idx < len(profiles): if 0 <= idx < len(profiles):
profile_name = profiles[idx]["name"] profile_name = profiles[idx]["name"]
self.logger.info(f"Deleting profile: {Fore.YELLOW}{profile_name}{Style.RESET_ALL}", tag="PROFILES") self.logger.info(f"Deleting profile: [yellow]{profile_name}[/yellow]", tag="PROFILES")
# Confirm deletion # Confirm deletion
confirm = input(f"{Fore.RED}Are you sure you want to delete this profile? (y/n): {Style.RESET_ALL}") self.console.print("[red]Are you sure you want to delete this profile? (y/n): [/red]", end="")
confirm = input()
if confirm.lower() == 'y': if confirm.lower() == 'y':
success = self.delete_profile(profiles[idx]["path"]) success = self.delete_profile(profiles[idx]["path"])
if success: if success:
self.logger.success(f"Profile {Fore.GREEN}{profile_name}{Style.RESET_ALL} deleted successfully", tag="PROFILES") self.logger.success(f"Profile {profile_name} deleted successfully", tag="PROFILES")
else: else:
self.logger.error(f"Failed to delete profile {Fore.RED}{profile_name}{Style.RESET_ALL}", tag="PROFILES") self.logger.error(f"Failed to delete profile {profile_name}", tag="PROFILES")
else: else:
self.logger.error("Invalid profile number", tag="PROFILES") self.logger.error("Invalid profile number", tag="PROFILES")
except ValueError: except ValueError:
@@ -523,12 +531,13 @@ class BrowserProfiler:
continue continue
# Display numbered list # Display numbered list
self.logger.info(f"\n{Fore.YELLOW}Available profiles:{Style.RESET_ALL}", tag="PROFILES") self.logger.info("\nAvailable profiles:", tag="PROFILES", base_color=LogColor.YELLOW)
for i, profile in enumerate(profiles): for i, profile in enumerate(profiles):
self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES") self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES")
# Get profile to use # Get profile to use
profile_idx = input(f"{Fore.CYAN}Enter the number of the profile to use (or 'c' to cancel): {Style.RESET_ALL}") self.console.print("[cyan]Enter the number of the profile to use (or 'c' to cancel): [/cyan]", end="")
profile_idx = input()
if profile_idx.lower() == 'c': if profile_idx.lower() == 'c':
continue continue
@@ -536,7 +545,8 @@ class BrowserProfiler:
idx = int(profile_idx) - 1 idx = int(profile_idx) - 1
if 0 <= idx < len(profiles): if 0 <= idx < len(profiles):
profile_path = profiles[idx]["path"] profile_path = profiles[idx]["path"]
url = input(f"{Fore.CYAN}Enter the URL to crawl: {Style.RESET_ALL}") self.console.print("[cyan]Enter the URL to crawl: [/cyan]", end="")
url = input()
if url: if url:
# Call the provided crawl callback # Call the provided crawl callback
await crawl_callback(profile_path, url) await crawl_callback(profile_path, url)
@@ -599,11 +609,11 @@ class BrowserProfiler:
# Print initial information # Print initial information
border = f"{Fore.CYAN}{'='*80}{Style.RESET_ALL}" border = f"{Fore.CYAN}{'='*80}{Style.RESET_ALL}"
self.logger.info(f"\n{border}", tag="CDP") self.logger.info(f"\n{border}", tag="CDP")
self.logger.info(f"Launching standalone browser with CDP debugging", tag="CDP") self.logger.info("Launching standalone browser with CDP debugging", tag="CDP")
self.logger.info(f"Browser type: {Fore.GREEN}{browser_type}{Style.RESET_ALL}", tag="CDP") self.logger.info("Browser type: {browser_type}", tag="CDP", params={"browser_type": browser_type}, colors={"browser_type": LogColor.CYAN})
self.logger.info(f"Profile path: {Fore.YELLOW}{profile_path}{Style.RESET_ALL}", tag="CDP") self.logger.info("Profile path: {profile_path}", tag="CDP", params={"profile_path": profile_path}, colors={"profile_path": LogColor.YELLOW})
self.logger.info(f"Debugging port: {Fore.CYAN}{debugging_port}{Style.RESET_ALL}", tag="CDP") self.logger.info(f"Debugging port: {debugging_port}", tag="CDP")
self.logger.info(f"Headless mode: {Fore.CYAN}{headless}{Style.RESET_ALL}", tag="CDP") self.logger.info(f"Headless mode: {headless}", tag="CDP")
# Create managed browser instance # Create managed browser instance
managed_browser = ManagedBrowser( managed_browser = ManagedBrowser(
@@ -646,7 +656,7 @@ class BrowserProfiler:
import select import select
# First output the prompt # First output the prompt
self.logger.info(f"{Fore.CYAN}Press '{Fore.WHITE}q{Fore.CYAN}' to stop the browser and exit...{Style.RESET_ALL}", tag="CDP") self.logger.info("Press 'q' to stop the browser and exit...", tag="CDP")
# Save original terminal settings # Save original terminal settings
fd = sys.stdin.fileno() fd = sys.stdin.fileno()
@@ -662,7 +672,7 @@ class BrowserProfiler:
if readable: if readable:
key = sys.stdin.read(1) key = sys.stdin.read(1)
if key.lower() == 'q': if key.lower() == 'q':
self.logger.info(f"{Fore.GREEN}Closing browser...{Style.RESET_ALL}", tag="CDP") self.logger.info("Closing browser...", tag="CDP")
user_done_event.set() user_done_event.set()
return return
@@ -716,20 +726,20 @@ class BrowserProfiler:
self.logger.error("Failed to start browser process.", tag="CDP") self.logger.error("Failed to start browser process.", tag="CDP")
return None return None
self.logger.info(f"Browser launched successfully. Retrieving CDP information...", tag="CDP") self.logger.info("Browser launched successfully. Retrieving CDP information...", tag="CDP")
# Get CDP URL and JSON config # Get CDP URL and JSON config
cdp_url, config_json = await get_cdp_json(debugging_port) cdp_url, config_json = await get_cdp_json(debugging_port)
if cdp_url: if cdp_url:
self.logger.success(f"CDP URL: {Fore.GREEN}{cdp_url}{Style.RESET_ALL}", tag="CDP") self.logger.success(f"CDP URL: {cdp_url}", tag="CDP")
if config_json: if config_json:
# Display relevant CDP information # Display relevant CDP information
self.logger.info(f"Browser: {Fore.CYAN}{config_json.get('Browser', 'Unknown')}{Style.RESET_ALL}", tag="CDP") self.logger.info(f"Browser: {config_json.get('Browser', 'Unknown')}", tag="CDP", colors={"Browser": LogColor.CYAN})
self.logger.info(f"Protocol Version: {config_json.get('Protocol-Version', 'Unknown')}", tag="CDP") self.logger.info(f"Protocol Version: {config_json.get('Protocol-Version', 'Unknown')}", tag="CDP", colors={"Protocol-Version": LogColor.CYAN})
if 'webSocketDebuggerUrl' in config_json: if 'webSocketDebuggerUrl' in config_json:
self.logger.info(f"WebSocket URL: {Fore.GREEN}{config_json['webSocketDebuggerUrl']}{Style.RESET_ALL}", tag="CDP") self.logger.info("WebSocket URL: {webSocketDebuggerUrl}", tag="CDP", params={"webSocketDebuggerUrl": config_json['webSocketDebuggerUrl']}, colors={"webSocketDebuggerUrl": LogColor.GREEN})
else: else:
self.logger.warning("Could not retrieve CDP configuration JSON", tag="CDP") self.logger.warning("Could not retrieve CDP configuration JSON", tag="CDP")
else: else:
@@ -757,7 +767,7 @@ class BrowserProfiler:
self.logger.info("Terminating browser process...", tag="CDP") self.logger.info("Terminating browser process...", tag="CDP")
await managed_browser.cleanup() await managed_browser.cleanup()
self.logger.success(f"Browser closed.", tag="CDP") self.logger.success("Browser closed.", tag="CDP")
except Exception as e: except Exception as e:
self.logger.error(f"Error launching standalone browser: {str(e)}", tag="CDP") self.logger.error(f"Error launching standalone browser: {str(e)}", tag="CDP")
@@ -972,3 +982,30 @@ class BrowserProfiler:
'info': browser_info 'info': browser_info
} }
if __name__ == "__main__":
# Example usage
profiler = BrowserProfiler()
# Create a new profile
import os
from pathlib import Path
home_dir = Path.home()
profile_path = asyncio.run(profiler.create_profile( str(home_dir / ".crawl4ai/profiles/test-profile")))
# Launch a standalone browser
asyncio.run(profiler.launch_standalone_browser())
# List profiles
profiles = profiler.list_profiles()
for profile in profiles:
print(f"Profile: {profile['name']}, Path: {profile['path']}")
# Delete a profile
success = profiler.delete_profile("my-profile")
if success:
print("Profile deleted successfully")
else:
print("Failed to delete profile")

View File

@@ -27,8 +27,7 @@ import json
import hashlib import hashlib
from pathlib import Path from pathlib import Path
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from .async_logger import AsyncLogger, LogLevel from .async_logger import AsyncLogger, LogLevel, LogColor
from colorama import Fore, Style
class RelevantContentFilter(ABC): class RelevantContentFilter(ABC):
@@ -846,8 +845,7 @@ class LLMContentFilter(RelevantContentFilter):
}, },
colors={ colors={
**AsyncLogger.DEFAULT_COLORS, **AsyncLogger.DEFAULT_COLORS,
LogLevel.INFO: Fore.MAGENTA LogLevel.INFO: LogColor.DIM_MAGENTA # Dimmed purple for LLM ops
+ Style.DIM, # Dimmed purple for LLM ops
}, },
) )
else: else:
@@ -892,7 +890,7 @@ class LLMContentFilter(RelevantContentFilter):
"Starting LLM markdown content filtering process", "Starting LLM markdown content filtering process",
tag="LLM", tag="LLM",
params={"provider": self.llm_config.provider}, params={"provider": self.llm_config.provider},
colors={"provider": Fore.CYAN}, colors={"provider": LogColor.CYAN},
) )
# Cache handling # Cache handling
@@ -929,7 +927,7 @@ class LLMContentFilter(RelevantContentFilter):
"LLM markdown: Split content into {chunk_count} chunks", "LLM markdown: Split content into {chunk_count} chunks",
tag="CHUNK", tag="CHUNK",
params={"chunk_count": len(html_chunks)}, params={"chunk_count": len(html_chunks)},
colors={"chunk_count": Fore.YELLOW}, colors={"chunk_count": LogColor.YELLOW},
) )
start_time = time.time() start_time = time.time()
@@ -1038,7 +1036,7 @@ class LLMContentFilter(RelevantContentFilter):
"LLM markdown: Completed processing in {time:.2f}s", "LLM markdown: Completed processing in {time:.2f}s",
tag="LLM", tag="LLM",
params={"time": end_time - start_time}, params={"time": end_time - start_time},
colors={"time": Fore.YELLOW}, colors={"time": LogColor.YELLOW},
) )
result = ordered_results if ordered_results else [] result = ordered_results if ordered_results else []

View File

@@ -1,9 +1,10 @@
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
import inspect import inspect
from typing import Any, List, Dict, Optional from typing import Any, List, Dict, Optional, Tuple, Pattern, Union
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
import json import json
import time import time
from enum import IntFlag, auto
from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH, PROMPT_EXTRACT_INFERRED_SCHEMA from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH, PROMPT_EXTRACT_INFERRED_SCHEMA
from .config import ( from .config import (
@@ -1668,3 +1669,303 @@ class JsonXPathExtractionStrategy(JsonElementExtractionStrategy):
def _get_element_attribute(self, element, attribute: str): def _get_element_attribute(self, element, attribute: str):
return element.get(attribute) return element.get(attribute)
"""
RegexExtractionStrategy
Fast, zero-LLM extraction of common entities via regular expressions.
"""
_CTRL = {c: rf"\x{ord(c):02x}" for c in map(chr, range(32)) if c not in "\t\n\r"}
_WB_FIX = re.compile(r"\x08") # stray back-space → word-boundary
_NEEDS_ESCAPE = re.compile(r"(?<!\\)\\(?![\\u])") # lone backslash
def _sanitize_schema(schema: Dict[str, str]) -> Dict[str, str]:
"""Fix common JSON-escape goofs coming from LLMs or manual edits."""
safe = {}
for label, pat in schema.items():
# 1⃣ replace accidental control chars (inc. the infamous back-space)
pat = _WB_FIX.sub(r"\\b", pat).translate(_CTRL)
# 2⃣ double any single backslash that JSON kept single
pat = _NEEDS_ESCAPE.sub(r"\\\\", pat)
# 3⃣ quick sanity compile
try:
re.compile(pat)
except re.error as e:
raise ValueError(f"Regex for '{label}' wont compile after fix: {e}") from None
safe[label] = pat
return safe
class RegexExtractionStrategy(ExtractionStrategy):
"""
A lean strategy that finds e-mails, phones, URLs, dates, money, etc.,
using nothing but pre-compiled regular expressions.
Extraction returns::
{
"url": "<page-url>",
"label": "<pattern-label>",
"value": "<matched-string>",
"span": [start, end]
}
Only `generate_schema()` touches an LLM, extraction itself is pure Python.
"""
# -------------------------------------------------------------- #
# Built-in patterns exposed as IntFlag so callers can bit-OR them
# -------------------------------------------------------------- #
class _B(IntFlag):
EMAIL = auto()
PHONE_INTL = auto()
PHONE_US = auto()
URL = auto()
IPV4 = auto()
IPV6 = auto()
UUID = auto()
CURRENCY = auto()
PERCENTAGE = auto()
NUMBER = auto()
DATE_ISO = auto()
DATE_US = auto()
TIME_24H = auto()
POSTAL_US = auto()
POSTAL_UK = auto()
HTML_COLOR_HEX = auto()
TWITTER_HANDLE = auto()
HASHTAG = auto()
MAC_ADDR = auto()
IBAN = auto()
CREDIT_CARD = auto()
NOTHING = auto()
ALL = (
EMAIL | PHONE_INTL | PHONE_US | URL | IPV4 | IPV6 | UUID
| CURRENCY | PERCENTAGE | NUMBER | DATE_ISO | DATE_US | TIME_24H
| POSTAL_US | POSTAL_UK | HTML_COLOR_HEX | TWITTER_HANDLE
| HASHTAG | MAC_ADDR | IBAN | CREDIT_CARD
)
# user-friendly aliases (RegexExtractionStrategy.Email, .IPv4, …)
Email = _B.EMAIL
PhoneIntl = _B.PHONE_INTL
PhoneUS = _B.PHONE_US
Url = _B.URL
IPv4 = _B.IPV4
IPv6 = _B.IPV6
Uuid = _B.UUID
Currency = _B.CURRENCY
Percentage = _B.PERCENTAGE
Number = _B.NUMBER
DateIso = _B.DATE_ISO
DateUS = _B.DATE_US
Time24h = _B.TIME_24H
PostalUS = _B.POSTAL_US
PostalUK = _B.POSTAL_UK
HexColor = _B.HTML_COLOR_HEX
TwitterHandle = _B.TWITTER_HANDLE
Hashtag = _B.HASHTAG
MacAddr = _B.MAC_ADDR
Iban = _B.IBAN
CreditCard = _B.CREDIT_CARD
All = _B.ALL
Nothing = _B(0) # no patterns
# ------------------------------------------------------------------ #
# Built-in pattern catalog
# ------------------------------------------------------------------ #
DEFAULT_PATTERNS: Dict[str, str] = {
# Communication
"email": r"[\w.+-]+@[\w-]+\.[\w.-]+",
"phone_intl": r"\+?\d[\d .()-]{7,}\d",
"phone_us": r"\(?\d{3}\)?[ -. ]?\d{3}[ -. ]?\d{4}",
# Web
"url": r"https?://[^\s\"'<>]+",
"ipv4": r"(?:\d{1,3}\.){3}\d{1,3}",
"ipv6": r"[A-F0-9]{1,4}(?::[A-F0-9]{1,4}){7}",
# IDs
"uuid": r"[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}",
# Money / numbers
"currency": r"(?:USD|EUR|RM|\$|€|£)\s?\d+(?:[.,]\d{2})?",
"percentage": r"\d+(?:\.\d+)?%",
"number": r"\b\d{1,3}(?:[,.\s]\d{3})*(?:\.\d+)?\b",
# Dates / Times
"date_iso": r"\d{4}-\d{2}-\d{2}",
"date_us": r"\d{1,2}/\d{1,2}/\d{2,4}",
"time_24h": r"\b(?:[01]?\d|2[0-3]):[0-5]\d(?:[:.][0-5]\d)?\b",
# Misc
"postal_us": r"\b\d{5}(?:-\d{4})?\b",
"postal_uk": r"\b[A-Z]{1,2}\d[A-Z\d]? ?\d[A-Z]{2}\b",
"html_color_hex": r"#[0-9A-Fa-f]{6}\b",
"twitter_handle": r"@[\w]{1,15}",
"hashtag": r"#[\w-]+",
"mac_addr": r"(?:[0-9A-Fa-f]{2}:){5}[0-9A-Fa-f]{2}",
"iban": r"[A-Z]{2}\d{2}[A-Z0-9]{11,30}",
"credit_card": r"\b(?:4\d{12}(?:\d{3})?|5[1-5]\d{14}|3[47]\d{13}|6(?:011|5\d{2})\d{12})\b",
}
_FLAGS = re.IGNORECASE | re.MULTILINE
_UNWANTED_PROPS = {
"provider": "Use llm_config instead",
"api_token": "Use llm_config instead",
}
# ------------------------------------------------------------------ #
# Construction
# ------------------------------------------------------------------ #
def __init__(
self,
pattern: "_B" = _B.NOTHING,
*,
custom: Optional[Union[Dict[str, str], List[Tuple[str, str]]]] = None,
input_format: str = "fit_html",
**kwargs,
) -> None:
"""
Args:
patterns: Custom patterns overriding or extending defaults.
Dict[label, regex] or list[tuple(label, regex)].
input_format: "html", "markdown" or "text".
**kwargs: Forwarded to ExtractionStrategy.
"""
super().__init__(input_format=input_format, **kwargs)
# 1⃣ take only the requested built-ins
merged: Dict[str, str] = {
key: rx
for key, rx in self.DEFAULT_PATTERNS.items()
if getattr(self._B, key.upper()).value & pattern
}
# 2⃣ apply user overrides / additions
if custom:
if isinstance(custom, dict):
merged.update(custom)
else: # iterable of (label, regex)
merged.update({lbl: rx for lbl, rx in custom})
self._compiled: Dict[str, Pattern] = {
lbl: re.compile(rx, self._FLAGS) for lbl, rx in merged.items()
}
# ------------------------------------------------------------------ #
# Extraction
# ------------------------------------------------------------------ #
def extract(self, url: str, content: str, *q, **kw) -> List[Dict[str, Any]]:
# text = self._plain_text(html)
out: List[Dict[str, Any]] = []
for label, cre in self._compiled.items():
for m in cre.finditer(content):
out.append(
{
"url": url,
"label": label,
"value": m.group(0),
"span": [m.start(), m.end()],
}
)
return out
# ------------------------------------------------------------------ #
# Helpers
# ------------------------------------------------------------------ #
def _plain_text(self, content: str) -> str:
if self.input_format == "text":
return content
return BeautifulSoup(content, "lxml").get_text(" ", strip=True)
# ------------------------------------------------------------------ #
# LLM-assisted pattern generator
# ------------------------------------------------------------------ #
# ------------------------------------------------------------------ #
# LLM-assisted one-off pattern builder
# ------------------------------------------------------------------ #
@staticmethod
def generate_pattern(
label: str,
html: str,
*,
query: Optional[str] = None,
examples: Optional[List[str]] = None,
llm_config: Optional[LLMConfig] = None,
**kwargs,
) -> Dict[str, str]:
"""
Ask an LLM for a single page-specific regex and return
{label: pattern} ── ready for RegexExtractionStrategy(custom=…)
"""
# ── guard deprecated kwargs
for k in RegexExtractionStrategy._UNWANTED_PROPS:
if k in kwargs:
raise AttributeError(
f"{k} is deprecated, {RegexExtractionStrategy._UNWANTED_PROPS[k]}"
)
# ── default LLM config
if llm_config is None:
llm_config = create_llm_config()
# ── system prompt hardened
system_msg = (
"You are an expert Python-regex engineer.\n"
f"Return **one** JSON object whose single key is exactly \"{label}\", "
"and whose value is a raw-string regex pattern that works with "
"the standard `re` module in Python.\n\n"
"Strict rules (obey every bullet):\n"
"• If a *user query* is supplied, treat it as the precise semantic target and optimise the "
" pattern to capture ONLY text that answers that query. If the query conflicts with the "
" sample HTML, the HTML wins.\n"
"• Tailor the pattern to the *sample HTML* reproduce its exact punctuation, spacing, "
" symbols, capitalisation, etc. Do **NOT** invent a generic form.\n"
"• Keep it minimal and fast: avoid unnecessary capturing, prefer non-capturing `(?: … )`, "
" and guard against catastrophic backtracking.\n"
"• Anchor with `^`, `$`, or `\\b` only when it genuinely improves precision.\n"
"• Use inline flags like `(?i)` when needed; no verbose flag comments.\n"
"• Output must be valid JSON no markdown, code fences, comments, or extra keys.\n"
"• The regex value must be a Python string literal: **double every backslash** "
"(e.g. `\\\\b`, `\\\\d`, `\\\\\\\\`).\n\n"
"Example valid output:\n"
f"{{\"{label}\": \"(?:RM|rm)\\\\s?\\\\d{{1,3}}(?:,\\\\d{{3}})*(?:\\\\.\\\\d{{2}})?\"}}"
)
# ── user message: cropped HTML + optional hints
user_parts = ["```html", html[:5000], "```"] # protect token budget
if query:
user_parts.append(f"\n\n## Query\n{query.strip()}")
if examples:
user_parts.append("## Examples\n" + "\n".join(examples[:20]))
user_msg = "\n\n".join(user_parts)
# ── LLM call (with retry/backoff)
resp = perform_completion_with_backoff(
provider=llm_config.provider,
prompt_with_variables="\n\n".join([system_msg, user_msg]),
json_response=True,
api_token=llm_config.api_token,
base_url=llm_config.base_url,
extra_args=kwargs,
)
# ── clean & load JSON (fix common escape mistakes *before* json.loads)
raw = resp.choices[0].message.content
raw = raw.replace("\x08", "\\b") # stray back-space → \b
raw = re.sub(r'(?<!\\)\\(?![\\u"])', r"\\\\", raw) # lone \ → \\
try:
pattern_dict = json.loads(raw)
except Exception as exc:
raise ValueError(f"LLM did not return valid JSON: {raw}") from exc
# quick sanity-compile
for lbl, pat in pattern_dict.items():
try:
re.compile(pat)
except re.error as e:
raise ValueError(f"Invalid regex for '{lbl}': {e}") from None
return pattern_dict

View File

@@ -1,4 +1,4 @@
from pydantic import BaseModel, HttpUrl, PrivateAttr from pydantic import BaseModel, HttpUrl, PrivateAttr, Field
from typing import List, Dict, Optional, Callable, Awaitable, Union, Any from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
from typing import AsyncGenerator from typing import AsyncGenerator
from typing import Generic, TypeVar from typing import Generic, TypeVar
@@ -129,6 +129,7 @@ class MarkdownGenerationResult(BaseModel):
class CrawlResult(BaseModel): class CrawlResult(BaseModel):
url: str url: str
html: str html: str
fit_html: Optional[str] = None
success: bool success: bool
cleaned_html: Optional[str] = None cleaned_html: Optional[str] = None
media: Dict[str, List[Dict]] = {} media: Dict[str, List[Dict]] = {}
@@ -150,6 +151,7 @@ class CrawlResult(BaseModel):
redirected_url: Optional[str] = None redirected_url: Optional[str] = None
network_requests: Optional[List[Dict[str, Any]]] = None network_requests: Optional[List[Dict[str, Any]]] = None
console_messages: Optional[List[Dict[str, Any]]] = None console_messages: Optional[List[Dict[str, Any]]] = None
tables: List[Dict] = Field(default_factory=list) # NEW [{headers,rows,caption,summary}]
class Config: class Config:
arbitrary_types_allowed = True arbitrary_types_allowed = True

View File

@@ -20,7 +20,6 @@ from urllib.parse import urljoin
import requests import requests
from requests.exceptions import InvalidSchema from requests.exceptions import InvalidSchema
import xxhash import xxhash
from colorama import Fore, Style, init
import textwrap import textwrap
import cProfile import cProfile
import pstats import pstats
@@ -441,14 +440,13 @@ def create_box_message(
str: A formatted string containing the styled message box. str: A formatted string containing the styled message box.
""" """
init()
# Define border and text colors for different types # Define border and text colors for different types
styles = { styles = {
"warning": (Fore.YELLOW, Fore.LIGHTYELLOW_EX, ""), "warning": ("yellow", "bright_yellow", ""),
"info": (Fore.BLUE, Fore.LIGHTBLUE_EX, ""), "info": ("blue", "bright_blue", ""),
"success": (Fore.GREEN, Fore.LIGHTGREEN_EX, ""), "debug": ("lightblack", "bright_black", ""),
"error": (Fore.RED, Fore.LIGHTRED_EX, "×"), "success": ("green", "bright_green", ""),
"error": ("red", "bright_red", "×"),
} }
border_color, text_color, prefix = styles.get(type.lower(), styles["info"]) border_color, text_color, prefix = styles.get(type.lower(), styles["info"])
@@ -480,12 +478,12 @@ def create_box_message(
# Create the box with colored borders and lighter text # Create the box with colored borders and lighter text
horizontal_line = h_line * (width - 1) horizontal_line = h_line * (width - 1)
box = [ box = [
f"{border_color}{tl}{horizontal_line}{tr}", f"[{border_color}]{tl}{horizontal_line}{tr}[/{border_color}]",
*[ *[
f"{border_color}{v_line}{text_color} {line:<{width-2}}{border_color}{v_line}" f"[{border_color}]{v_line}[{text_color}] {line:<{width-2}}[/{text_color}][{border_color}]{v_line}[/{border_color}]"
for line in formatted_lines for line in formatted_lines
], ],
f"{border_color}{bl}{horizontal_line}{br}{Style.RESET_ALL}", f"[{border_color}]{bl}{horizontal_line}{br}[/{border_color}]",
] ]
result = "\n".join(box) result = "\n".join(box)
@@ -2737,33 +2735,67 @@ def preprocess_html_for_schema(html_content, text_threshold=100, attr_value_thre
# Also truncate tail text if present # Also truncate tail text if present
if element.tail and len(element.tail.strip()) > text_threshold: if element.tail and len(element.tail.strip()) > text_threshold:
element.tail = element.tail.strip()[:text_threshold] + '...' element.tail = element.tail.strip()[:text_threshold] + '...'
# 4. Find repeated patterns and keep only a few examples # 4. Detect duplicates and drop them in a single pass
# This is a simplistic approach - more sophisticated pattern detection could be implemented seen: dict[tuple, None] = {}
pattern_elements = {} for el in list(tree.xpath('//*[@class]')): # snapshot once, XPath is fast
for element in tree.xpath('//*[contains(@class, "")]'): parent = el.getparent()
parent = element.getparent()
if parent is None: if parent is None:
continue continue
# Create a signature based on tag and classes cls = el.get('class')
classes = element.get('class', '') if not cls:
if not classes:
continue continue
signature = f"{element.tag}.{classes}"
# ── build signature ───────────────────────────────────────────
if signature in pattern_elements: h = xxhash.xxh64() # stream, no big join()
pattern_elements[signature].append(element) for txt in el.itertext():
h.update(txt)
sig = (el.tag, cls, h.intdigest()) # tuple cheaper & hashable
# ── first seen? keep else drop ─────────────
if sig in seen and parent is not None:
parent.remove(el) # duplicate
else: else:
pattern_elements[signature] = [element] seen[sig] = None
# Keep only 3 examples of each repeating pattern # # 4. Find repeated patterns and keep only a few examples
for signature, elements in pattern_elements.items(): # # This is a simplistic approach - more sophisticated pattern detection could be implemented
if len(elements) > 3: # pattern_elements = {}
# Keep the first 2 and last elements # for element in tree.xpath('//*[contains(@class, "")]'):
for element in elements[2:-1]: # parent = element.getparent()
if element.getparent() is not None: # if parent is None:
element.getparent().remove(element) # continue
# # Create a signature based on tag and classes
# classes = element.get('class', '')
# if not classes:
# continue
# innert_text = ''.join(element.xpath('.//text()'))
# innert_text_hash = xxhash.xxh64(innert_text.encode()).hexdigest()
# signature = f"{element.tag}.{classes}.{innert_text_hash}"
# if signature in pattern_elements:
# pattern_elements[signature].append(element)
# else:
# pattern_elements[signature] = [element]
# # Keep only first examples of each repeating pattern
# for signature, elements in pattern_elements.items():
# if len(elements) > 1:
# # Keep the first element and remove the rest
# for element in elements[1:]:
# if element.getparent() is not None:
# element.getparent().remove(element)
# # Keep only 3 examples of each repeating pattern
# for signature, elements in pattern_elements.items():
# if len(elements) > 3:
# # Keep the first 2 and last elements
# for element in elements[2:-1]:
# if element.getparent() is not None:
# element.getparent().remove(element)
# 5. Convert back to string # 5. Convert back to string
result = etree.tostring(tree, encoding='unicode', method='html') result = etree.tostring(tree, encoding='unicode', method='html')
@@ -2778,4 +2810,3 @@ def preprocess_html_for_schema(html_content, text_threshold=100, attr_value_thre
# Fallback for parsing errors # Fallback for parsing errors
return html_content[:max_size] if len(html_content) > max_size else html_content return html_content[:max_size] if len(html_content) > max_size else html_content

View File

@@ -1,8 +1,10 @@
import os import os
import json import json
import asyncio import asyncio
from typing import List, Tuple from typing import List, Tuple, Dict
from functools import partial from functools import partial
from uuid import uuid4
from datetime import datetime
import logging import logging
from typing import Optional, AsyncGenerator from typing import Optional, AsyncGenerator
@@ -272,7 +274,9 @@ async def handle_llm_request(
async def handle_task_status( async def handle_task_status(
redis: aioredis.Redis, redis: aioredis.Redis,
task_id: str, task_id: str,
base_url: str base_url: str,
*,
keep: bool = False
) -> JSONResponse: ) -> JSONResponse:
"""Handle task status check requests.""" """Handle task status check requests."""
task = await redis.hgetall(f"task:{task_id}") task = await redis.hgetall(f"task:{task_id}")
@@ -286,7 +290,7 @@ async def handle_task_status(
response = create_task_response(task, task_id, base_url) response = create_task_response(task, task_id, base_url)
if task["status"] in [TaskStatus.COMPLETED, TaskStatus.FAILED]: if task["status"] in [TaskStatus.COMPLETED, TaskStatus.FAILED]:
if should_cleanup_task(task["created_at"]): if not keep and should_cleanup_task(task["created_at"]):
await redis.delete(f"task:{task_id}") await redis.delete(f"task:{task_id}")
return JSONResponse(response) return JSONResponse(response)
@@ -520,4 +524,48 @@ async def handle_stream_crawl_request(
raise HTTPException( raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=str(e) detail=str(e)
) )
async def handle_crawl_job(
redis,
background_tasks: BackgroundTasks,
urls: List[str],
browser_config: Dict,
crawler_config: Dict,
config: Dict,
) -> Dict:
"""
Fire-and-forget version of handle_crawl_request.
Creates a task in Redis, runs the heavy work in a background task,
lets /crawl/job/{task_id} polling fetch the result.
"""
task_id = f"crawl_{uuid4().hex[:8]}"
await redis.hset(f"task:{task_id}", mapping={
"status": TaskStatus.PROCESSING, # <-- keep enum values consistent
"created_at": datetime.utcnow().isoformat(),
"url": json.dumps(urls), # store list as JSON string
"result": "",
"error": "",
})
async def _runner():
try:
result = await handle_crawl_request(
urls=urls,
browser_config=browser_config,
crawler_config=crawler_config,
config=config,
)
await redis.hset(f"task:{task_id}", mapping={
"status": TaskStatus.COMPLETED,
"result": json.dumps(result),
})
await asyncio.sleep(5) # Give Redis time to process the update
except Exception as exc:
await redis.hset(f"task:{task_id}", mapping={
"status": TaskStatus.FAILED,
"error": str(exc),
})
background_tasks.add_task(_runner)
return {"task_id": task_id}

View File

@@ -3,7 +3,7 @@ app:
title: "Crawl4AI API" title: "Crawl4AI API"
version: "1.0.0" version: "1.0.0"
host: "0.0.0.0" host: "0.0.0.0"
port: 11235 port: 11234
reload: False reload: False
workers: 1 workers: 1
timeout_keep_alive: 300 timeout_keep_alive: 300

99
deploy/docker/job.py Normal file
View File

@@ -0,0 +1,99 @@
"""
Job endpoints (enqueue + poll) for long-running LLM extraction and raw crawl.
Relies on the existing Redis task helpers in api.py
"""
from typing import Dict, Optional, Callable
from fastapi import APIRouter, BackgroundTasks, Depends, Request
from pydantic import BaseModel, HttpUrl
from api import (
handle_llm_request,
handle_crawl_job,
handle_task_status,
)
# ------------- dependency placeholders -------------
_redis = None # will be injected from server.py
_config = None
_token_dep: Callable = lambda: None # dummy until injected
# public router
router = APIRouter()
# === init hook called by server.py =========================================
def init_job_router(redis, config, token_dep) -> APIRouter:
"""Inject shared singletons and return the router for mounting."""
global _redis, _config, _token_dep
_redis, _config, _token_dep = redis, config, token_dep
return router
# ---------- payload models --------------------------------------------------
class LlmJobPayload(BaseModel):
url: HttpUrl
q: str
schema: Optional[str] = None
cache: bool = False
class CrawlJobPayload(BaseModel):
urls: list[HttpUrl]
browser_config: Dict = {}
crawler_config: Dict = {}
# ---------- LLM job ---------------------------------------------------------
@router.post("/llm/job", status_code=202)
async def llm_job_enqueue(
payload: LlmJobPayload,
background_tasks: BackgroundTasks,
request: Request,
_td: Dict = Depends(lambda: _token_dep()), # late-bound dep
):
return await handle_llm_request(
_redis,
background_tasks,
request,
str(payload.url),
query=payload.q,
schema=payload.schema,
cache=payload.cache,
config=_config,
)
@router.get("/llm/job/{task_id}")
async def llm_job_status(
request: Request,
task_id: str,
_td: Dict = Depends(lambda: _token_dep())
):
return await handle_task_status(_redis, task_id)
# ---------- CRAWL job -------------------------------------------------------
@router.post("/crawl/job", status_code=202)
async def crawl_job_enqueue(
payload: CrawlJobPayload,
background_tasks: BackgroundTasks,
_td: Dict = Depends(lambda: _token_dep()),
):
return await handle_crawl_job(
_redis,
background_tasks,
[str(u) for u in payload.urls],
payload.browser_config,
payload.crawler_config,
config=_config,
)
@router.get("/crawl/job/{task_id}")
async def crawl_job_status(
request: Request,
task_id: str,
_td: Dict = Depends(lambda: _token_dep())
):
return await handle_task_status(_redis, task_id, base_url=str(request.base_url))

42
deploy/docker/schemas.py Normal file
View File

@@ -0,0 +1,42 @@
from typing import List, Optional, Dict
from enum import Enum
from pydantic import BaseModel, Field
from utils import FilterType
class CrawlRequest(BaseModel):
urls: List[str] = Field(min_length=1, max_length=100)
browser_config: Optional[Dict] = Field(default_factory=dict)
crawler_config: Optional[Dict] = Field(default_factory=dict)
class MarkdownRequest(BaseModel):
"""Request body for the /md endpoint."""
url: str = Field(..., description="Absolute http/https URL to fetch")
f: FilterType = Field(FilterType.FIT,
description="Contentfilter strategy: FIT, RAW, BM25, or LLM")
q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters")
c: Optional[str] = Field("0", description="Cachebust / revision counter")
class RawCode(BaseModel):
code: str
class HTMLRequest(BaseModel):
url: str
class ScreenshotRequest(BaseModel):
url: str
screenshot_wait_for: Optional[float] = 2
output_path: Optional[str] = None
class PDFRequest(BaseModel):
url: str
output_path: Optional[str] = None
class JSEndpointRequest(BaseModel):
url: str
scripts: List[str] = Field(
...,
description="List of separated JavaScript snippets to execute"
)

View File

@@ -12,7 +12,7 @@ from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from auth import create_access_token, get_token_dependency, TokenRequest from auth import create_access_token, get_token_dependency, TokenRequest
from pydantic import BaseModel from pydantic import BaseModel
from typing import Optional, List, Dict from typing import Optional, List, Dict
from fastapi import Request, Depends from fastapi import Request, Depends
from fastapi.responses import FileResponse from fastapi.responses import FileResponse
import base64 import base64
import re import re
@@ -22,6 +22,16 @@ from api import (
handle_stream_crawl_request, handle_crawl_request, handle_stream_crawl_request, handle_crawl_request,
stream_results stream_results
) )
from schemas import (
CrawlRequest,
MarkdownRequest,
RawCode,
HTMLRequest,
ScreenshotRequest,
PDFRequest,
JSEndpointRequest,
)
from utils import ( from utils import (
FilterType, load_config, setup_logging, verify_email_domain FilterType, load_config, setup_logging, verify_email_domain
) )
@@ -37,23 +47,13 @@ from fastapi import (
FastAPI, HTTPException, Request, Path, Query, Depends FastAPI, HTTPException, Request, Path, Query, Depends
) )
from rank_bm25 import BM25Okapi from rank_bm25 import BM25Okapi
def chunk_code_functions(code: str) -> List[str]:
tree = ast.parse(code)
lines = code.splitlines()
chunks = []
for node in tree.body:
if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
start = node.lineno - 1
end = getattr(node, 'end_lineno', start + 1)
chunks.append("\n".join(lines[start:end]))
return chunks
from fastapi.responses import ( from fastapi.responses import (
StreamingResponse, RedirectResponse, PlainTextResponse, JSONResponse StreamingResponse, RedirectResponse, PlainTextResponse, JSONResponse
) )
from fastapi.middleware.httpsredirect import HTTPSRedirectMiddleware from fastapi.middleware.httpsredirect import HTTPSRedirectMiddleware
from fastapi.middleware.trustedhost import TrustedHostMiddleware from fastapi.middleware.trustedhost import TrustedHostMiddleware
from fastapi.staticfiles import StaticFiles from fastapi.staticfiles import StaticFiles
from job import init_job_router
from mcp_bridge import attach_mcp, mcp_resource, mcp_template, mcp_tool from mcp_bridge import attach_mcp, mcp_resource, mcp_template, mcp_tool
@@ -129,8 +129,6 @@ app.mount(
name="play", name="play",
) )
# Optional nicetohave: opening the root shows the playground
@app.get("/") @app.get("/")
async def root(): async def root():
@@ -211,48 +209,10 @@ def _safe_eval_config(expr: str) -> dict:
return obj.dump() return obj.dump()
# ───────────────────────── Schemas ─────────────────────────── # ── job router ──────────────────────────────────────────────
class CrawlRequest(BaseModel): app.include_router(init_job_router(redis, config, token_dep))
urls: List[str] = Field(min_length=1, max_length=100)
browser_config: Optional[Dict] = Field(default_factory=dict)
crawler_config: Optional[Dict] = Field(default_factory=dict)
# ────────────── Schemas ──────────────
class MarkdownRequest(BaseModel):
"""Request body for the /md endpoint."""
url: str = Field(..., description="Absolute http/https URL to fetch")
f: FilterType = Field(FilterType.FIT,
description="Contentfilter strategy: FIT, RAW, BM25, or LLM")
q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters")
c: Optional[str] = Field("0", description="Cachebust / revision counter")
class RawCode(BaseModel):
code: str
class HTMLRequest(BaseModel):
url: str
class ScreenshotRequest(BaseModel):
url: str
screenshot_wait_for: Optional[float] = 2
output_path: Optional[str] = None
class PDFRequest(BaseModel):
url: str
output_path: Optional[str] = None
class JSEndpointRequest(BaseModel):
url: str
scripts: List[str] = Field(
...,
description="List of separated JavaScript snippets to execute"
)
# ──────────────────────── Endpoints ────────────────────────── # ──────────────────────── Endpoints ──────────────────────────
@app.post("/token") @app.post("/token")
async def get_token(req: TokenRequest): async def get_token(req: TokenRequest):
if not verify_email_domain(req.email): if not verify_email_domain(req.email):
@@ -278,7 +238,8 @@ async def get_markdown(
_td: Dict = Depends(token_dep), _td: Dict = Depends(token_dep),
): ):
if not body.url.startswith(("http://", "https://")): if not body.url.startswith(("http://", "https://")):
raise HTTPException(400, "URL must be absolute and start with http/https") raise HTTPException(
400, "URL must be absolute and start with http/https")
markdown = await handle_markdown_request( markdown = await handle_markdown_request(
body.url, body.f, body.q, body.c, config body.url, body.f, body.q, body.c, config
) )
@@ -314,12 +275,13 @@ async def generate_html(
# Screenshot endpoint # Screenshot endpoint
@app.post("/screenshot") @app.post("/screenshot")
@limiter.limit(config["rate_limiting"]["default_limit"]) @limiter.limit(config["rate_limiting"]["default_limit"])
@mcp_tool("screenshot") @mcp_tool("screenshot")
async def generate_screenshot( async def generate_screenshot(
request: Request, request: Request,
body: ScreenshotRequest, body: ScreenshotRequest,
_td: Dict = Depends(token_dep), _td: Dict = Depends(token_dep),
): ):
""" """
@@ -327,7 +289,8 @@ async def generate_screenshot(
Use when you need an image snapshot of the rendered page. Its recommened to provide an output path to save the screenshot. Use when you need an image snapshot of the rendered page. Its recommened to provide an output path to save the screenshot.
Then in result instead of the screenshot you will get a path to the saved file. Then in result instead of the screenshot you will get a path to the saved file.
""" """
cfg = CrawlerRunConfig(screenshot=True, screenshot_wait_for=body.screenshot_wait_for) cfg = CrawlerRunConfig(
screenshot=True, screenshot_wait_for=body.screenshot_wait_for)
async with AsyncWebCrawler(config=BrowserConfig()) as crawler: async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
results = await crawler.arun(url=body.url, config=cfg) results = await crawler.arun(url=body.url, config=cfg)
screenshot_data = results[0].screenshot screenshot_data = results[0].screenshot
@@ -341,12 +304,13 @@ async def generate_screenshot(
# PDF endpoint # PDF endpoint
@app.post("/pdf") @app.post("/pdf")
@limiter.limit(config["rate_limiting"]["default_limit"]) @limiter.limit(config["rate_limiting"]["default_limit"])
@mcp_tool("pdf") @mcp_tool("pdf")
async def generate_pdf( async def generate_pdf(
request: Request, request: Request,
body: PDFRequest, body: PDFRequest,
_td: Dict = Depends(token_dep), _td: Dict = Depends(token_dep),
): ):
""" """
@@ -384,7 +348,7 @@ async def execute_js(
Your script will replace '{script}' and execute in the browser context. So provide either an IIFE or a sync/async function that returns a value. Your script will replace '{script}' and execute in the browser context. So provide either an IIFE or a sync/async function that returns a value.
Return Format: Return Format:
- The return result is an instance of CrawlResult, so you have access to markdown, links, and other stuff. If this is enough, you don't need to call again for other endpoints. - The return result is an instance of CrawlResult, so you have access to markdown, links, and other stuff. If this is enough, you don't need to call again for other endpoints.
```python ```python
class CrawlResult(BaseModel): class CrawlResult(BaseModel):
url: str url: str
@@ -418,7 +382,7 @@ async def execute_js(
fit_markdown: Optional[str] = None fit_markdown: Optional[str] = None
fit_html: Optional[str] = None fit_html: Optional[str] = None
``` ```
""" """
cfg = CrawlerRunConfig(js_code=body.scripts) cfg = CrawlerRunConfig(js_code=body.scripts)
async with AsyncWebCrawler(config=BrowserConfig()) as crawler: async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
@@ -507,6 +471,7 @@ async def crawl_stream(
}, },
) )
def chunk_code_functions(code_md: str) -> List[str]: def chunk_code_functions(code_md: str) -> List[str]:
"""Extract each function/class from markdown code blocks per file.""" """Extract each function/class from markdown code blocks per file."""
pattern = re.compile( pattern = re.compile(
@@ -530,6 +495,7 @@ def chunk_code_functions(code_md: str) -> List[str]:
chunks.append(f"# File: {file_path}\n{snippet}") chunks.append(f"# File: {file_path}\n{snippet}")
return chunks return chunks
def chunk_doc_sections(doc: str) -> List[str]: def chunk_doc_sections(doc: str) -> List[str]:
lines = doc.splitlines(keepends=True) lines = doc.splitlines(keepends=True)
sections = [] sections = []
@@ -545,6 +511,7 @@ def chunk_doc_sections(doc: str) -> List[str]:
sections.append("".join(current)) sections.append("".join(current))
return sections return sections
@app.get("/ask") @app.get("/ask")
@limiter.limit(config["rate_limiting"]["default_limit"]) @limiter.limit(config["rate_limiting"]["default_limit"])
@mcp_tool("ask") @mcp_tool("ask")
@@ -552,21 +519,24 @@ async def get_context(
request: Request, request: Request,
_td: Dict = Depends(token_dep), _td: Dict = Depends(token_dep),
context_type: str = Query("all", regex="^(code|doc|all)$"), context_type: str = Query("all", regex="^(code|doc|all)$"),
query: Optional[str] = Query(None, description="search query to filter chunks"), query: Optional[str] = Query(
score_ratio: float = Query(0.5, ge=0.0, le=1.0, description="min score as fraction of max_score"), None, description="search query to filter chunks"),
max_results: int = Query(20, ge=1, description="absolute cap on returned chunks"), score_ratio: float = Query(
0.5, ge=0.0, le=1.0, description="min score as fraction of max_score"),
max_results: int = Query(
20, ge=1, description="absolute cap on returned chunks"),
): ):
""" """
This end point is design for any questions about Crawl4ai library. It returns a plain text markdown with extensive information about Crawl4ai. This end point is design for any questions about Crawl4ai library. It returns a plain text markdown with extensive information about Crawl4ai.
You can use this as a context for any AI assistant. Use this endpoint for AI assistants to retrieve library context for decision making or code generation tasks. You can use this as a context for any AI assistant. Use this endpoint for AI assistants to retrieve library context for decision making or code generation tasks.
Alway is BEST practice you provide a query to filter the context. Otherwise the lenght of the response will be very long. Alway is BEST practice you provide a query to filter the context. Otherwise the lenght of the response will be very long.
Parameters: Parameters:
- context_type: Specify "code" for code context, "doc" for documentation context, or "all" for both. - context_type: Specify "code" for code context, "doc" for documentation context, or "all" for both.
- query: RECOMMENDED search query to filter paragraphs using BM25. You can leave this empty to get all the context. - query: RECOMMENDED search query to filter paragraphs using BM25. You can leave this empty to get all the context.
- score_ratio: Minimum score as a fraction of the maximum score for filtering results. - score_ratio: Minimum score as a fraction of the maximum score for filtering results.
- max_results: Maximum number of results to return. Default is 20. - max_results: Maximum number of results to return. Default is 20.
Returns: Returns:
- JSON response with the requested context. - JSON response with the requested context.
- If "code" is specified, returns the code context. - If "code" is specified, returns the code context.
@@ -576,7 +546,7 @@ async def get_context(
# load contexts # load contexts
base = os.path.dirname(__file__) base = os.path.dirname(__file__)
code_path = os.path.join(base, "c4ai-code-context.md") code_path = os.path.join(base, "c4ai-code-context.md")
doc_path = os.path.join(base, "c4ai-doc-context.md") doc_path = os.path.join(base, "c4ai-doc-context.md")
if not os.path.exists(code_path) or not os.path.exists(doc_path): if not os.path.exists(code_path) or not os.path.exists(doc_path):
raise HTTPException(404, "Context files not found") raise HTTPException(404, "Context files not found")
@@ -626,7 +596,7 @@ async def get_context(
] ]
return JSONResponse(results) return JSONResponse(results)
# attach MCP layer (adds /mcp/ws, /mcp/sse, /mcp/schema) # attach MCP layer (adds /mcp/ws, /mcp/sse, /mcp/schema)
print(f"MCP server running on {config['app']['host']}:{config['app']['port']}") print(f"MCP server running on {config['app']['host']}:{config['app']['port']}")

View File

@@ -193,7 +193,48 @@
<textarea id="urls" class="w-full bg-dark border border-border rounded p-2 h-32 text-sm mb-4" <textarea id="urls" class="w-full bg-dark border border-border rounded p-2 h-32 text-sm mb-4"
spellcheck="false">https://example.com</textarea> spellcheck="false">https://example.com</textarea>
<details class="mb-4"> <!-- Specific options for /md endpoint -->
<details id="md-options" class="mb-4 hidden">
<summary class="text-sm text-secondary cursor-pointer">/md Options</summary>
<div class="mt-2 space-y-3 p-2 border border-border rounded">
<div>
<label for="md-filter" class="block text-xs text-secondary mb-1">Filter Type</label>
<select id="md-filter" class="bg-dark border border-border rounded px-2 py-1 text-sm w-full">
<option value="fit">fit - Adaptive content filtering</option>
<option value="raw">raw - No filtering</option>
<option value="bm25">bm25 - BM25 keyword relevance</option>
<option value="llm">llm - LLM-based filtering</option>
</select>
</div>
<div>
<label for="md-query" class="block text-xs text-secondary mb-1">Query (for BM25/LLM filters)</label>
<input id="md-query" type="text" placeholder="Enter search terms or instructions"
class="bg-dark border border-border rounded px-2 py-1 text-sm w-full">
</div>
<div>
<label for="md-cache" class="block text-xs text-secondary mb-1">Cache Mode</label>
<select id="md-cache" class="bg-dark border border-border rounded px-2 py-1 text-sm w-full">
<option value="0">Write-Only (0)</option>
<option value="1">Enabled (1)</option>
</select>
</div>
</div>
</details>
<!-- Specific options for /llm endpoint -->
<details id="llm-options" class="mb-4 hidden">
<summary class="text-sm text-secondary cursor-pointer">/llm Options</summary>
<div class="mt-2 space-y-3 p-2 border border-border rounded">
<div>
<label for="llm-question" class="block text-xs text-secondary mb-1">Question</label>
<input id="llm-question" type="text" value="What is this page about?"
class="bg-dark border border-border rounded px-2 py-1 text-sm w-full">
</div>
</div>
</details>
<!-- Advanced config for /crawl endpoints -->
<details id="adv-config" class="mb-4">
<summary class="text-sm text-secondary cursor-pointer">Advanced Config <span <summary class="text-sm text-secondary cursor-pointer">Advanced Config <span
class="text-xs text-primary">(Python → autoJSON)</span></summary> class="text-xs text-primary">(Python → autoJSON)</span></summary>
@@ -437,6 +478,33 @@
cm.setValue(TEMPLATES[e.target.value]); cm.setValue(TEMPLATES[e.target.value]);
document.getElementById('cfg-status').textContent = ''; document.getElementById('cfg-status').textContent = '';
}); });
// Handle endpoint selection change to show appropriate options
document.getElementById('endpoint').addEventListener('change', function(e) {
const endpoint = e.target.value;
const mdOptions = document.getElementById('md-options');
const llmOptions = document.getElementById('llm-options');
const advConfig = document.getElementById('adv-config');
// Hide all option sections first
mdOptions.classList.add('hidden');
llmOptions.classList.add('hidden');
advConfig.classList.add('hidden');
// Show the appropriate section based on endpoint
if (endpoint === 'md') {
mdOptions.classList.remove('hidden');
// Auto-open the /md options
mdOptions.setAttribute('open', '');
} else if (endpoint === 'llm') {
llmOptions.classList.remove('hidden');
// Auto-open the /llm options
llmOptions.setAttribute('open', '');
} else {
// For /crawl endpoints, show the advanced config
advConfig.classList.remove('hidden');
}
});
async function pyConfigToJson() { async function pyConfigToJson() {
const code = cm.getValue().trim(); const code = cm.getValue().trim();
@@ -494,10 +562,18 @@
} }
// Generate code snippets // Generate code snippets
function generateSnippets(api, payload) { function generateSnippets(api, payload, method = 'POST') {
// Python snippet // Python snippet
const pyCodeEl = document.querySelector('#python-content code'); const pyCodeEl = document.querySelector('#python-content code');
const pySnippet = `import httpx\n\nasync def crawl():\n async with httpx.AsyncClient() as client:\n response = await client.post(\n "${window.location.origin}${api}",\n json=${JSON.stringify(payload, null, 4).replace(/\n/g, '\n ')}\n )\n return response.json()`; let pySnippet;
if (method === 'GET') {
// GET request (for /llm endpoint)
pySnippet = `import httpx\n\nasync def crawl():\n async with httpx.AsyncClient() as client:\n response = await client.get(\n "${window.location.origin}${api}"\n )\n return response.json()`;
} else {
// POST request (for /crawl and /md endpoints)
pySnippet = `import httpx\n\nasync def crawl():\n async with httpx.AsyncClient() as client:\n response = await client.post(\n "${window.location.origin}${api}",\n json=${JSON.stringify(payload, null, 4).replace(/\n/g, '\n ')}\n )\n return response.json()`;
}
pyCodeEl.textContent = pySnippet; pyCodeEl.textContent = pySnippet;
pyCodeEl.className = 'python hljs'; // Reset classes pyCodeEl.className = 'python hljs'; // Reset classes
@@ -505,7 +581,15 @@
// cURL snippet // cURL snippet
const curlCodeEl = document.querySelector('#curl-content code'); const curlCodeEl = document.querySelector('#curl-content code');
const curlSnippet = `curl -X POST ${window.location.origin}${api} \\\n -H "Content-Type: application/json" \\\n -d '${JSON.stringify(payload)}'`; let curlSnippet;
if (method === 'GET') {
// GET request (for /llm endpoint)
curlSnippet = `curl -X GET "${window.location.origin}${api}"`;
} else {
// POST request (for /crawl and /md endpoints)
curlSnippet = `curl -X POST ${window.location.origin}${api} \\\n -H "Content-Type: application/json" \\\n -d '${JSON.stringify(payload)}'`;
}
curlCodeEl.textContent = curlSnippet; curlCodeEl.textContent = curlSnippet;
curlCodeEl.className = 'bash hljs'; // Reset classes curlCodeEl.className = 'bash hljs'; // Reset classes
@@ -536,20 +620,39 @@
const endpointMap = { const endpointMap = {
crawl: '/crawl', crawl: '/crawl',
}; // crawl_stream: '/crawl/stream',
/*const endpointMap = {
crawl: '/crawl',
crawl_stream: '/crawl/stream',
md: '/md', md: '/md',
llm: '/llm' llm: '/llm'
};*/ };
const api = endpointMap[endpoint]; const api = endpointMap[endpoint];
const payload = { let payload;
urls,
...advConfig // Create appropriate payload based on endpoint type
}; if (endpoint === 'md') {
// Get values from the /md specific inputs
const filterType = document.getElementById('md-filter').value;
const query = document.getElementById('md-query').value.trim();
const cache = document.getElementById('md-cache').value;
// MD endpoint expects: { url, f, q, c }
payload = {
url: urls[0], // Take first URL
f: filterType, // Lowercase filter type as required by server
q: query || null, // Use the query if provided, otherwise null
c: cache
};
} else if (endpoint === 'llm') {
// LLM endpoint has a different URL pattern and uses query params
// This will be handled directly in the fetch below
payload = null;
} else {
// Default payload for /crawl and /crawl/stream
payload = {
urls,
...advConfig
};
}
updateStatus('processing'); updateStatus('processing');
@@ -557,7 +660,18 @@
const startTime = performance.now(); const startTime = performance.now();
let response, responseData; let response, responseData;
if (endpoint === 'crawl_stream') { if (endpoint === 'llm') {
// Special handling for LLM endpoint which uses URL pattern: /llm/{encoded_url}?q={query}
const url = urls[0];
const encodedUrl = encodeURIComponent(url);
// Get the question from the LLM-specific input
const question = document.getElementById('llm-question').value.trim() || "What is this page about?";
response = await fetch(`${api}/${encodedUrl}?q=${encodeURIComponent(question)}`, {
method: 'GET',
headers: { 'Accept': 'application/json' }
});
} else if (endpoint === 'crawl_stream') {
// Stream processing // Stream processing
response = await fetch(api, { response = await fetch(api, {
method: 'POST', method: 'POST',
@@ -597,7 +711,7 @@
document.querySelector('#response-content code').className = 'json hljs'; // Reset classes document.querySelector('#response-content code').className = 'json hljs'; // Reset classes
forceHighlightElement(document.querySelector('#response-content code')); forceHighlightElement(document.querySelector('#response-content code'));
} else { } else {
// Regular request // Regular request (handles /crawl and /md)
response = await fetch(api, { response = await fetch(api, {
method: 'POST', method: 'POST',
headers: { 'Content-Type': 'application/json' }, headers: { 'Content-Type': 'application/json' },
@@ -625,7 +739,16 @@
} }
forceHighlightElement(document.querySelector('#response-content code')); forceHighlightElement(document.querySelector('#response-content code'));
generateSnippets(api, payload);
// For generateSnippets, handle the LLM case specially
if (endpoint === 'llm') {
const url = urls[0];
const encodedUrl = encodeURIComponent(url);
const question = document.getElementById('llm-question').value.trim() || "What is this page about?";
generateSnippets(`${api}/${encodedUrl}?q=${encodeURIComponent(question)}`, null, 'GET');
} else {
generateSnippets(api, payload);
}
} catch (error) { } catch (error) {
console.error('Error:', error); console.error('Error:', error);
updateStatus('error'); updateStatus('error');
@@ -807,9 +930,24 @@
}); });
}); });
} }
// Function to initialize UI based on selected endpoint
function initUI() {
// Trigger the endpoint change handler to set initial UI state
const endpointSelect = document.getElementById('endpoint');
const event = new Event('change');
endpointSelect.dispatchEvent(event);
// Initialize copy buttons
initCopyButtons();
}
// Call this in your DOMContentLoaded or initialization // Initialize on page load
initCopyButtons(); document.addEventListener('DOMContentLoaded', initUI);
// Also call it immediately in case the script runs after DOM is already loaded
if (document.readyState !== 'loading') {
initUI();
}
</script> </script>
</body> </body>

View File

@@ -45,10 +45,10 @@ def datetime_handler(obj: any) -> Optional[str]:
return obj.isoformat() return obj.isoformat()
raise TypeError(f"Object of type {type(obj)} is not JSON serializable") raise TypeError(f"Object of type {type(obj)} is not JSON serializable")
def should_cleanup_task(created_at: str) -> bool: def should_cleanup_task(created_at: str, ttl_seconds: int = 3600) -> bool:
"""Check if task should be cleaned up based on creation time.""" """Check if task should be cleaned up based on creation time."""
created = datetime.fromisoformat(created_at) created = datetime.fromisoformat(created_at)
return (datetime.now() - created).total_seconds() > 3600 return (datetime.now() - created).total_seconds() > ttl_seconds
def decode_redis_hash(hash_data: Dict[bytes, bytes]) -> Dict[str, str]: def decode_redis_hash(hash_data: Dict[bytes, bytes]) -> Dict[str, str]:
"""Decode Redis hash data from bytes to strings.""" """Decode Redis hash data from bytes to strings."""

126
docs/apps/linkdin/README.md Normal file
View File

@@ -0,0 +1,126 @@
# Crawl4AIProspectWizard stepbystep guide
A threestage demo that goes from **LinkedIn scraping****LLM reasoning****graph visualisation**.
```
prospectwizard/
├─ c4ai_discover.py # Stage 1 scrape companies + people
├─ c4ai_insights.py # Stage 2 embeddings, orgcharts, scores
├─ graph_view_template.html # Stage 3 graph viewer (static HTML)
└─ data/ # output lands here (*.jsonl / *.json)
```
---
## 1  Install & boot a LinkedIn profile (onetime)
### 1.1  Install dependencies
```bash
pip install crawl4ai openai sentence-transformers networkx pandas vis-network rich
```
### 1.2  Create / warm a LinkedIn browser profile
```bash
crwl profiler
```
1. The interactive shell shows **New profile** hit **enter**.
2. Choose a name, e.g. `profile_linkedin_uc`.
3. A Chromium window opens log in to LinkedIn, solve whatever CAPTCHA, then close.
> Remember the **profile name**. All future runs take `--profile-name <your_name>`.
---
## 2  Discovery scrape companies & people
```bash
python c4ai_discover.py full \
--query "health insurance management" \
--geo 102713980 \ # Malaysia geoUrn
--title_filters "" \ # or "Product,Engineering"
--max_companies 10 \ # default set small for workshops
--max_people 20 \ # \^ same
--profile-name profile_linkedin_uc \
--outdir ./data \
--concurrency 2 \
--log_level debug
```
**Outputs** in `./data/`:
* `companies.jsonl` one JSON per company
* `people.jsonl` one JSON per employee
🛠️ **Dryrun:** `C4AI_DEMO_DEBUG=1 python c4ai_discover.py full --query coffee` uses bundled HTML snippets, no network.
### Handy geoUrn cheatsheet
| Location | geoUrn |
|----------|--------|
| Singapore | **103644278** |
| Malaysia | **102713980** |
| UnitedStates | **103644922** |
| UnitedKingdom | **102221843** |
| Australia | **101452733** |
_See more: <https://www.linkedin.com/search/results/companies/?geoUrn=XXX> the number after `geoUrn=` is what you need._
---
## 3  Insights embeddings, orgcharts, decision makers
```bash
python c4ai_insights.py \
--in ./data \
--out ./data \
--embed_model all-MiniLM-L6-v2 \
--top_k 10 \
--openai_model gpt-4.1 \
--max_llm_tokens 8024 \
--llm_temperature 1.0 \
--workers 4
```
Emits next to the Stage1 files:
* `company_graph.json` intercompany similarity graph
* `org_chart_<handle>.json` one per company
* `decision_makers.csv` handpicked who to pitch list
Flags reference (straight from `build_arg_parser()`):
| Flag | Default | Purpose |
|------|---------|---------|
| `--in` | `.` | Stage1 output dir |
| `--out` | `.` | Destination dir |
| `--embed_model` | `all-MiniLM-L6-v2` | SentenceTransformer model |
| `--top_k` | `10` | Neighbours per company in graph |
| `--openai_model` | `gpt-4.1` | LLM for scoring decision makers |
| `--max_llm_tokens` | `8024` | Token budget per LLM call |
| `--llm_temperature` | `1.0` | Creativity knob |
| `--stub` | off | Skip OpenAI and fabricate tiny charts |
| `--workers` | `4` | Parallel LLM workers |
---
## 4  Visualise interactive graph
After Stage 2 completes, simply open the HTML viewer from the project root:
```bash
open graph_view_template.html # or Live Server / Python -http
```
The page fetches `data/company_graph.json` and the `org_chart_*.json` files automatically; keep the `data/` folder beside the HTML file.
* Left pane → list of companies (clans).
* Click a node to load its orgchart on the right.
* Chat drawer lets you ask followup questions; context is pulled from `people.jsonl`.
---
## 5  Common snags
| Symptom | Fix |
|---------|-----|
| Infinite CAPTCHA | Use a residential proxy: `--proxy http://user:pass@ip:port` |
| 429 Too Many Requests | Lower `--concurrency`, rotate profile, add delay |
| Blank graph | Check JSON paths, clear `localStorage` in browser |
---
### TL;DR
`crwl profiler``c4ai_discover.py``c4ai_insights.py` → open `graph_view_template.html`.
Live long and `import crawl4ai`.

View File

@@ -0,0 +1,440 @@
#!/usr/bin/env python3
"""
c4ai-discover — Stage1 Discovery CLI
Scrapes LinkedIn company search + their people pages and dumps two newlinedelimited
JSON files: companies.jsonl and people.jsonl.
Key design rules
----------------
* No BeautifulSoup — Crawl4AI only for network + HTML fetch.
* JsonCssExtractionStrategy for structured scraping; schema autogenerated once
from sample HTML provided by user and then cached under ./schemas/.
* Defaults are embedded so the file runs inside VS Code debugger without CLI args.
* If executed as a console script (argv > 1), CLI flags win.
* Lightweight deps: argparse + Crawl4AI stack.
Author: Tom @ Kidocode 20250426
"""
from __future__ import annotations
import warnings, re
warnings.filterwarnings(
"ignore",
message=r"The pseudo class ':contains' is deprecated, ':-soup-contains' should be used.*",
category=FutureWarning,
module=r"soupsieve"
)
# ───────────────────────────────────────────────────────────────────────────────
# Imports
# ───────────────────────────────────────────────────────────────────────────────
import argparse
import random
import asyncio
import json
import logging
import os
import pathlib
import sys
# 3rd-party rich for pretty logging
from rich.console import Console
from rich.logging import RichHandler
from datetime import datetime, UTC
from itertools import cycle
from textwrap import dedent
from types import SimpleNamespace
from typing import Dict, List, Optional
from urllib.parse import quote
from pathlib import Path
from glob import glob
from crawl4ai import (
AsyncWebCrawler,
BrowserConfig,
CacheMode,
CrawlerRunConfig,
JsonCssExtractionStrategy,
BrowserProfiler,
LLMConfig,
)
# ───────────────────────────────────────────────────────────────────────────────
# Constants / paths
# ───────────────────────────────────────────────────────────────────────────────
BASE_DIR = pathlib.Path(__file__).resolve().parent
SCHEMA_DIR = BASE_DIR / "schemas"
SCHEMA_DIR.mkdir(parents=True, exist_ok=True)
COMPANY_SCHEMA_PATH = SCHEMA_DIR / "company_card.json"
PEOPLE_SCHEMA_PATH = SCHEMA_DIR / "people_card.json"
# ---------- deterministic target JSON examples ----------
_COMPANY_SCHEMA_EXAMPLE = {
"handle": "/company/posify/",
"profile_image": "https://media.licdn.com/dms/image/v2/.../logo.jpg",
"name": "Management Research Services, Inc. (MRS, Inc)",
"descriptor": "Insurance • Milwaukee, Wisconsin",
"about": "Insurance • Milwaukee, Wisconsin",
"followers": 1000
}
_PEOPLE_SCHEMA_EXAMPLE = {
"profile_url": "https://www.linkedin.com/in/lily-ng/",
"name": "Lily Ng",
"headline": "VP Product @ Posify",
"followers": 890,
"connection_degree": "2nd",
"avatar_url": "https://media.licdn.com/dms/image/v2/.../lily.jpg"
}
# Provided sample HTML snippets (trimmed) — used exactly once to coldgenerate schema.
_SAMPLE_COMPANY_HTML = (Path(__file__).resolve().parent / "snippets/company.html").read_text()
_SAMPLE_PEOPLE_HTML = (Path(__file__).resolve().parent / "snippets/people.html").read_text()
# --------- tighter schema prompts ----------
_COMPANY_SCHEMA_QUERY = dedent(
"""
Using the supplied <li> company-card HTML, build a JsonCssExtractionStrategy schema that,
for every card, outputs *exactly* the keys shown in the example JSON below.
JSON spec:
• handle href of the outermost <a> that wraps the logo/title, e.g. "/company/posify/"
• profile_image absolute URL of the <img> inside that link
• name text of the <a> inside the <span class*='t-16'>
• descriptor text line with industry • location
• about text of the <div class*='t-normal'> below the name (industry + geo)
• followers integer parsed from the <div> containing 'followers'
IMPORTANT: Do not use the base64 kind of classes to target element. It's not reliable.
The main div parent contains these li element is "div.search-results-container" you can use this.
The <ul> parent has "role" equal to "list". Using these two should be enough to target the <li> elements."
"""
)
_PEOPLE_SCHEMA_QUERY = dedent(
"""
Using the supplied <li> people-card HTML, build a JsonCssExtractionStrategy schema that
outputs exactly the keys in the example JSON below.
Fields:
• profile_url href of the outermost profile link
• name text inside artdeco-entity-lockup__title
• headline inner text of artdeco-entity-lockup__subtitle
• followers integer parsed from the span inside lt-line-clamp--multi-line
• connection_degree '1st', '2nd', etc. from artdeco-entity-lockup__badge
• avatar_url src of the <img> within artdeco-entity-lockup__image
IMPORTANT: Do not use the base64 kind of classes to target element. It's not reliable.
The main div parent contains these li element is a "div" has these classes "artdeco-card org-people-profile-card__card-spacing org-people__card-margin-bottom".
"""
)
# ---------------------------------------------------------------------------
# Utility helpers
# ---------------------------------------------------------------------------
def _load_or_build_schema(
path: pathlib.Path,
sample_html: str,
query: str,
example_json: Dict,
force = False
) -> Dict:
"""Load schema from path, else call generate_schema once and persist."""
if path.exists() and not force:
return json.loads(path.read_text())
logging.info("[SCHEMA] Generating schema %s", path.name)
schema = JsonCssExtractionStrategy.generate_schema(
html=sample_html,
llm_config=LLMConfig(
provider=os.getenv("C4AI_SCHEMA_PROVIDER", "openai/gpt-4o"),
api_token=os.getenv("OPENAI_API_KEY", "env:OPENAI_API_KEY"),
),
query=query,
target_json_example=json.dumps(example_json, indent=2),
)
path.write_text(json.dumps(schema, indent=2))
return schema
def _openai_friendly_number(text: str) -> Optional[int]:
"""Extract first int from text like '1K followers' (returns 1000)."""
import re
m = re.search(r"(\d[\d,]*)", text.replace(",", ""))
if not m:
return None
val = int(m.group(1))
if "k" in text.lower():
val *= 1000
if "m" in text.lower():
val *= 1_000_000
return val
# ---------------------------------------------------------------------------
# Core async workers
# ---------------------------------------------------------------------------
async def crawl_company_search(crawler: AsyncWebCrawler, url: str, schema: Dict, limit: int) -> List[Dict]:
"""Paginate 10-item company search pages until `limit` reached."""
extraction = JsonCssExtractionStrategy(schema)
cfg = CrawlerRunConfig(
extraction_strategy=extraction,
cache_mode=CacheMode.BYPASS,
wait_for = ".search-marvel-srp",
session_id="company_search",
delay_before_return_html=1,
magic = True,
verbose= False,
)
companies, page = [], 1
while len(companies) < max(limit, 10):
paged_url = f"{url}&page={page}"
res = await crawler.arun(paged_url, config=cfg)
batch = json.loads(res[0].extracted_content)
if not batch:
break
for item in batch:
name = item.get("name", "").strip()
handle = item.get("handle", "").strip()
if not handle or not name:
continue
descriptor = item.get("descriptor")
about = item.get("about")
followers = _openai_friendly_number(str(item.get("followers", "")))
companies.append(
{
"handle": handle,
"name": name,
"descriptor": descriptor,
"about": about,
"followers": followers,
"people_url": f"{handle}people/",
"captured_at": datetime.now(UTC).isoformat(timespec="seconds") + "Z",
}
)
page += 1
logging.info(
f"[dim]Page {page}[/] — running total: {len(companies)}/{limit} companies"
)
return companies[:max(limit, 10)]
async def crawl_people_page(
crawler: AsyncWebCrawler,
people_url: str,
schema: Dict,
limit: int,
title_kw: str,
) -> List[Dict]:
people_u = f"{people_url}?keywords={quote(title_kw)}"
extraction = JsonCssExtractionStrategy(schema)
cfg = CrawlerRunConfig(
extraction_strategy=extraction,
# scan_full_page=True,
cache_mode=CacheMode.BYPASS,
magic=True,
wait_for=".org-people-profile-card__card-spacing",
delay_before_return_html=1,
session_id="people_search",
)
res = await crawler.arun(people_u, config=cfg)
if not res[0].success:
return []
raw = json.loads(res[0].extracted_content)
people = []
for p in raw[:limit]:
followers = _openai_friendly_number(str(p.get("followers", "")))
people.append(
{
"profile_url": p.get("profile_url"),
"name": p.get("name"),
"headline": p.get("headline"),
"followers": followers,
"connection_degree": p.get("connection_degree"),
"avatar_url": p.get("avatar_url"),
}
)
return people
# ---------------------------------------------------------------------------
# CLI + main
# ---------------------------------------------------------------------------
def build_arg_parser() -> argparse.ArgumentParser:
ap = argparse.ArgumentParser("c4ai-discover — Crawl4AI LinkedIn discovery")
sub = ap.add_subparsers(dest="cmd", required=False, help="run scope")
def add_flags(parser: argparse.ArgumentParser):
parser.add_argument("--query", required=False, help="query keyword(s)")
parser.add_argument("--geo", required=False, type=int, help="LinkedIn geoUrn")
parser.add_argument("--title-filters", default="Product,Engineering", help="comma list of job keywords")
parser.add_argument("--max-companies", type=int, default=1000)
parser.add_argument("--max-people", type=int, default=500)
parser.add_argument("--profile-path", default=str(pathlib.Path.home() / ".crawl4ai/profiles/profile_linkedin_uc"))
parser.add_argument("--outdir", default="./output")
parser.add_argument("--concurrency", type=int, default=4)
parser.add_argument("--log-level", default="info", choices=["debug", "info", "warn", "error"])
add_flags(sub.add_parser("full"))
add_flags(sub.add_parser("companies"))
add_flags(sub.add_parser("people"))
# global flags
ap.add_argument(
"--debug",
action="store_true",
help="Use built-in demo defaults (same as C4AI_DEMO_DEBUG=1)",
)
return ap
def detect_debug_defaults(force = False) -> SimpleNamespace:
if not force and sys.gettrace() is None and not os.getenv("C4AI_DEMO_DEBUG"):
return SimpleNamespace()
# ----- debugfriendly defaults -----
return SimpleNamespace(
cmd="full",
query="health insurance management",
geo=102713980,
# title_filters="Product,Engineering",
title_filters="",
max_companies=10,
max_people=5,
profile_name="profile_linkedin_uc",
outdir="./debug_out",
concurrency=2,
log_level="debug",
)
async def async_main(opts):
# ─────────── logging setup ───────────
console = Console()
logging.basicConfig(
level=opts.log_level.upper(),
format="%(message)s",
handlers=[RichHandler(console=console, markup=True, rich_tracebacks=True)],
)
# -------------------------------------------------------------------
# Load or build schemas (onetime LLM call each)
# -------------------------------------------------------------------
company_schema = _load_or_build_schema(
COMPANY_SCHEMA_PATH,
_SAMPLE_COMPANY_HTML,
_COMPANY_SCHEMA_QUERY,
_COMPANY_SCHEMA_EXAMPLE,
# True
)
people_schema = _load_or_build_schema(
PEOPLE_SCHEMA_PATH,
_SAMPLE_PEOPLE_HTML,
_PEOPLE_SCHEMA_QUERY,
_PEOPLE_SCHEMA_EXAMPLE,
# True
)
outdir = BASE_DIR / pathlib.Path(opts.outdir)
outdir.mkdir(parents=True, exist_ok=True)
f_companies = (BASE_DIR / outdir / "companies.jsonl").open("a", encoding="utf-8")
f_people = (BASE_DIR / outdir / "people.jsonl").open("a", encoding="utf-8")
# -------------------------------------------------------------------
# Prepare crawler with cookie pool rotation
# -------------------------------------------------------------------
profiler = BrowserProfiler()
path = profiler.get_profile_path(opts.profile_name)
bc = BrowserConfig(
headless=False,
verbose=False,
user_data_dir=path,
use_managed_browser=True,
user_agent_mode = "random",
user_agent_generator_config= {
"platforms": "mobile",
"os": "Android"
},
verbose=False,
)
crawler = AsyncWebCrawler(config=bc)
await crawler.start()
# Single worker for simplicity; concurrency can be scaled by arun_many if needed.
# crawler = await next_crawler().start()
try:
# Build LinkedIn search URL
search_url = f"https://www.linkedin.com/search/results/companies/?keywords={quote(opts.query)}&geoUrn={opts.geo}"
logging.info("Seed URL => %s", search_url)
companies: List[Dict] = []
if opts.cmd in ("companies", "full"):
companies = await crawl_company_search(
crawler, search_url, company_schema, opts.max_companies
)
for c in companies:
f_companies.write(json.dumps(c, ensure_ascii=False) + "\n")
logging.info(f"[bold green]✓[/] Companies scraped so far: {len(companies)}")
if opts.cmd in ("people", "full"):
if not companies:
# load from previous run
src = outdir / "companies.jsonl"
if not src.exists():
logging.error("companies.jsonl missing — run companies/full first")
return 10
companies = [json.loads(l) for l in src.read_text().splitlines()]
total_people = 0
title_kw = " ".join([t.strip() for t in opts.title_filters.split(",") if t.strip()]) if opts.title_filters else ""
for comp in companies:
people = await crawl_people_page(
crawler,
comp["people_url"],
people_schema,
opts.max_people,
title_kw,
)
for p in people:
rec = p | {
"company_handle": comp["handle"],
# "captured_at": datetime.now(UTC).isoformat(timespec="seconds") + "Z",
"captured_at": datetime.now(UTC).isoformat(timespec="seconds") + "Z",
}
f_people.write(json.dumps(rec, ensure_ascii=False) + "\n")
total_people += len(people)
logging.info(
f"{comp['name']} — [cyan]{len(people)}[/] people extracted"
)
await asyncio.sleep(random.uniform(0.5, 1))
logging.info("Total people scraped: %d", total_people)
finally:
await crawler.close()
f_companies.close()
f_people.close()
return 0
def main():
parser = build_arg_parser()
cli_opts = parser.parse_args()
# decide on debug defaults
if cli_opts.debug:
opts = detect_debug_defaults(force=True)
else:
env_defaults = detect_debug_defaults()
env_defaults = detect_debug_defaults()
opts = env_defaults if env_defaults else cli_opts
if not getattr(opts, "cmd", None):
opts.cmd = "full"
exit_code = asyncio.run(async_main(opts))
sys.exit(exit_code)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,372 @@
#!/usr/bin/env python3
"""
Stage-2 Insights builder
------------------------
Reads companies.jsonl & people.jsonl (Stage-1 output) and produces:
• company_graph.json
• org_chart_<handle>.json (one per company)
• decision_makers.csv
• graph_view.html (interactive visualisation)
Run:
python c4ai_insights.py --in ./stage1_out --out ./stage2_out
Author : Tom @ Kidocode, 2025-04-28
"""
from __future__ import annotations
# ───────────────────────────────────────────────────────────────────────────────
# Imports & Third-party
# ───────────────────────────────────────────────────────────────────────────────
import argparse, asyncio, json, os, sys, pathlib, random, time, csv
from datetime import datetime, UTC
from types import SimpleNamespace
from pathlib import Path
from typing import List, Dict, Any
# Pretty CLI UX
from rich.console import Console
from rich.logging import RichHandler
from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn
import logging
from jinja2 import Environment, FileSystemLoader, select_autoescape
BASE_DIR = pathlib.Path(__file__).resolve().parent
# ───────────────────────────────────────────────────────────────────────────────
# 3rd-party deps
# ───────────────────────────────────────────────────────────────────────────────
import numpy as np
# from sentence_transformers import SentenceTransformer
# from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import hashlib
from openai import OpenAI # same SDK you pre-loaded
# ───────────────────────────────────────────────────────────────────────────────
# Utils
# ───────────────────────────────────────────────────────────────────────────────
def load_jsonl(path: Path) -> List[Dict[str, Any]]:
with open(path, "r", encoding="utf-8") as f:
return [json.loads(l) for l in f]
def dump_json(obj, path: Path):
with open(path, "w", encoding="utf-8") as f:
json.dump(obj, f, ensure_ascii=False, indent=2)
# ───────────────────────────────────────────────────────────────────────────────
# Constants
# ───────────────────────────────────────────────────────────────────────────────
BASE_DIR = pathlib.Path(__file__).resolve().parent
# ───────────────────────────────────────────────────────────────────────────────
# Debug defaults (mirrors Stage-1 trick)
# ───────────────────────────────────────────────────────────────────────────────
def dev_defaults() -> SimpleNamespace:
return SimpleNamespace(
in_dir="./debug_out",
out_dir="./insights_debug",
embed_model="all-MiniLM-L6-v2",
top_k=10,
openai_model="gpt-4.1",
max_llm_tokens=8000,
llm_temperature=1.0,
workers=4, # parallel processing
stub=False, # manual
)
# ───────────────────────────────────────────────────────────────────────────────
# Graph builders
# ───────────────────────────────────────────────────────────────────────────────
def embed_descriptions(companies, model_name:str, opts) -> np.ndarray:
from sentence_transformers import SentenceTransformer
logging.debug(f"Using embedding model: {model_name}")
cache_path = BASE_DIR / Path(opts.out_dir) / "embeds_cache.json"
cache = {}
if cache_path.exists():
with open(cache_path) as f:
cache = json.load(f)
# flush cache if model differs
if cache.get("_model") != model_name:
cache = {}
model = SentenceTransformer(model_name)
new_texts, new_indices = [], []
vectors = np.zeros((len(companies), 384), dtype=np.float32)
for idx, comp in enumerate(companies):
text = comp.get("about") or comp.get("descriptor","")
h = hashlib.sha1(text.encode("utf-8")).hexdigest()
cached = cache.get(comp["handle"])
if cached and cached["hash"] == h:
vectors[idx] = np.array(cached["vector"], dtype=np.float32)
else:
new_texts.append(text)
new_indices.append((idx, comp["handle"], h))
if new_texts:
embeds = model.encode(new_texts, show_progress_bar=False, convert_to_numpy=True)
for vec, (idx, handle, h) in zip(embeds, new_indices):
vectors[idx] = vec
cache[handle] = {"hash": h, "vector": vec.tolist()}
cache["_model"] = model_name
with open(cache_path, "w") as f:
json.dump(cache, f)
return vectors
def build_company_graph(companies, embeds:np.ndarray, top_k:int) -> Dict[str,Any]:
from sklearn.metrics.pairwise import cosine_similarity
sims = cosine_similarity(embeds)
nodes, edges = [], []
idx_of = {c["handle"]: i for i,c in enumerate(companies)}
for i,c in enumerate(companies):
node = dict(
id=c["handle"].strip("/"),
name=c["name"],
handle=c["handle"],
about=c.get("about",""),
people_url=c.get("people_url",""),
industry=c.get("descriptor","").split("")[0].strip(),
geoUrn=c.get("geoUrn"),
followers=c.get("followers",0),
# desc_embed=embeds[i].tolist(),
desc_embed=[],
)
nodes.append(node)
# pick top-k most similar except itself
top_idx = np.argsort(sims[i])[::-1][1:top_k+1]
for j in top_idx:
tgt = companies[j]
weight = float(sims[i,j])
if node["industry"] == tgt.get("descriptor","").split("")[0].strip():
weight += 0.10
if node["geoUrn"] == tgt.get("geoUrn"):
weight += 0.05
tgt['followers'] = tgt.get("followers", None) or 1
node["followers"] = node.get("followers", None) or 1
follower_ratio = min(node["followers"], tgt.get("followers",1)) / max(node["followers"] or 1, tgt.get("followers",1))
weight += 0.05 * follower_ratio
edges.append(dict(
source=node["id"],
target=tgt["handle"].strip("/"),
weight=round(weight,4),
drivers=dict(
embed_sim=round(float(sims[i,j]),4),
industry_match=0.10 if node["industry"] == tgt.get("descriptor","").split("")[0].strip() else 0,
geo_overlap=0.05 if node["geoUrn"] == tgt.get("geoUrn") else 0,
)
))
# return {"nodes":nodes,"edges":edges,"meta":{"generated_at":datetime.now(UTC).isoformat()}}
return {"nodes":nodes,"edges":edges,"meta":{"generated_at":datetime.now(UTC).isoformat()}}
# ───────────────────────────────────────────────────────────────────────────────
# Org-chart via LLM
# ───────────────────────────────────────────────────────────────────────────────
async def infer_org_chart_llm(company, people, client:OpenAI, model_name:str, max_tokens:int, temperature:float, stub:bool):
if stub:
# Tiny fake org-chart when debugging offline
chief = random.choice(people)
nodes = [{
"id": chief["profile_url"],
"name": chief["name"],
"title": chief["headline"],
"dept": chief["headline"].split()[:1][0],
"yoe_total": 8,
"yoe_current": 2,
"seniority_score": 0.8,
"decision_score": 0.9,
"avatar_url": chief.get("avatar_url")
}]
return {"nodes":nodes,"edges":[],"meta":{"debug_stub":True,"generated_at":datetime.now(UTC).isoformat()}}
prompt = [
{"role":"system","content":"You are an expert B2B org-chart reasoner."},
{"role":"user","content":f"""Here is the company description:
<company>
{json.dumps(company, ensure_ascii=False)}
</company>
Here is a JSON list of employees:
<employees>
{json.dumps(people, ensure_ascii=False)}
</employees>
1) Build a reporting tree (manager -> direct reports)
2) For each person output a decision_score 0-1 for buying new software
Return JSON: {{ "nodes":[{{id,name,title,dept,yoe_total,yoe_current,seniority_score,decision_score,avatar_url,profile_url}}], "edges":[{{source,target,type,confidence}}] }}
"""}
]
resp = client.chat.completions.create(
model=model_name,
messages=prompt,
max_tokens=max_tokens,
temperature=temperature,
response_format={"type":"json_object"}
)
chart = json.loads(resp.choices[0].message.content)
chart["meta"] = dict(model=model_name, generated_at=datetime.now(UTC).isoformat())
return chart
# ───────────────────────────────────────────────────────────────────────────────
# CSV flatten
# ───────────────────────────────────────────────────────────────────────────────
def export_decision_makers(charts_dir:Path, csv_path:Path, threshold:float=0.5):
rows=[]
for p in charts_dir.glob("org_chart_*.json"):
data=json.loads(p.read_text())
comp = p.stem.split("org_chart_")[1]
for n in data.get("nodes",[]):
if n.get("decision_score",0)>=threshold:
rows.append(dict(
company=comp,
person=n["name"],
title=n["title"],
decision_score=n["decision_score"],
profile_url=n["id"]
))
pd.DataFrame(rows).to_csv(csv_path,index=False)
# ───────────────────────────────────────────────────────────────────────────────
# HTML rendering
# ───────────────────────────────────────────────────────────────────────────────
def render_html(out:Path, template_dir:Path):
# From template folder cp graph_view.html and ai.js in out folder
import shutil
shutil.copy(template_dir/"graph_view_template.html", out / "graph_view.html")
shutil.copy(template_dir/"ai.js", out)
# ───────────────────────────────────────────────────────────────────────────────
# Main async pipeline
# ───────────────────────────────────────────────────────────────────────────────
async def run(opts):
# ── silence SDK noise ──────────────────────────────────────────────────────
for noisy in ("openai", "httpx", "httpcore"):
lg = logging.getLogger(noisy)
lg.setLevel(logging.WARNING) # or ERROR if you want total silence
lg.propagate = False # optional: stop them reaching root
# ────────────── logging bootstrap ──────────────
console = Console()
logging.basicConfig(
level="INFO",
format="%(message)s",
handlers=[RichHandler(console=console, markup=True, rich_tracebacks=True)],
)
in_dir = BASE_DIR / Path(opts.in_dir)
out_dir = BASE_DIR / Path(opts.out_dir)
out_dir.mkdir(parents=True, exist_ok=True)
companies = load_jsonl(in_dir/"companies.jsonl")
people = load_jsonl(in_dir/"people.jsonl")
logging.info(f"[bold cyan]Loaded[/] {len(companies)} companies, {len(people)} people")
logging.info("[bold]⇢[/] Embedding company descriptions…")
# embeds = embed_descriptions(companies, opts.embed_model, opts)
logging.info("[bold]⇢[/] Building similarity graph")
# company_graph = build_company_graph(companies, embeds, opts.top_k)
# dump_json(company_graph, out_dir/"company_graph.json")
# OpenAI client (only built if not debugging)
stub = bool(opts.stub)
client = OpenAI() if not stub else None
# Filter companies that need processing
to_process = []
for comp in companies:
handle = comp["handle"].strip("/").replace("/","_")
out_file = out_dir/f"org_chart_{handle}.json"
if out_file.exists() and False:
logging.info(f"[green]✓[/] Skipping existing {comp['name']}")
continue
to_process.append(comp)
if not to_process:
logging.info("[yellow]All companies already processed[/]")
else:
workers = getattr(opts, 'workers', 1)
parallel = workers > 1
logging.info(f"[bold]⇢[/] Inferring org-charts via LLM {f'(parallel={workers} workers)' if parallel else ''}")
with Progress(
SpinnerColumn(),
BarColumn(),
TextColumn("[progress.description]{task.description}"),
TimeElapsedColumn(),
console=console,
) as progress:
task = progress.add_task("Org charts", total=len(to_process))
async def process_one(comp):
handle = comp["handle"].strip("/").replace("/","_")
persons = [p for p in people if p["company_handle"].strip("/") == comp["handle"].strip("/")]
chart = await infer_org_chart_llm(
comp, persons,
client=client if client else OpenAI(api_key="sk-debug"),
model_name=opts.openai_model,
max_tokens=opts.max_llm_tokens,
temperature=opts.llm_temperature,
stub=stub,
)
chart["meta"]["company"] = comp["name"]
# Save the result immediately
dump_json(chart, out_dir/f"org_chart_{handle}.json")
progress.update(task, advance=1, description=f"{comp['name']} ({len(persons)} ppl)")
# Create tasks for all companies
tasks = [process_one(comp) for comp in to_process]
# Process in batches based on worker count
semaphore = asyncio.Semaphore(workers)
async def bounded_process(coro):
async with semaphore:
return await coro
# Run with concurrency control
await asyncio.gather(*(bounded_process(task) for task in tasks))
logging.info("[bold]⇢[/] Flattening decision-makers CSV")
export_decision_makers(out_dir, out_dir/"decision_makers.csv")
render_html(out_dir, template_dir=BASE_DIR/"templates")
logging.success = lambda msg, **k: console.print(f"[bold green]✓[/] {msg}", **k)
logging.success(f"Stage-2 artefacts written to {out_dir}")
# ───────────────────────────────────────────────────────────────────────────────
# CLI
# ───────────────────────────────────────────────────────────────────────────────
def build_arg_parser():
p = argparse.ArgumentParser(description="Build graphs & visualisation from Stage-1 output")
p.add_argument("--in", dest="in_dir", required=False, help="Stage-1 output dir", default=".")
p.add_argument("--out", dest="out_dir", required=False, help="Destination dir", default=".")
p.add_argument("--embed_model", default="all-MiniLM-L6-v2")
p.add_argument("--top_k", type=int, default=10, help="Top-k neighbours per company")
p.add_argument("--openai_model", default="gpt-4.1")
p.add_argument("--max_llm_tokens", type=int, default=8024)
p.add_argument("--llm_temperature", type=float, default=1.0)
p.add_argument("--stub", action="store_true", help="Skip OpenAI call and generate tiny fake org charts")
p.add_argument("--workers", type=int, default=4, help="Number of parallel workers for LLM inference")
return p
def main():
dbg = dev_defaults()
opts = dbg if True else build_arg_parser().parse_args()
asyncio.run(run(opts))
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,39 @@
{
"name": "LinkedIn Company Card",
"baseSelector": "div.search-results-container ul[role='list'] > li",
"fields": [
{
"name": "handle",
"selector": "a[href*='/company/']",
"type": "attribute",
"attribute": "href"
},
{
"name": "profile_image",
"selector": "a[href*='/company/'] img",
"type": "attribute",
"attribute": "src"
},
{
"name": "name",
"selector": "span[class*='t-16'] a",
"type": "text"
},
{
"name": "descriptor",
"selector": "div[class*='t-black t-normal']",
"type": "text"
},
{
"name": "about",
"selector": "p[class*='entity-result__summary--2-lines']",
"type": "text"
},
{
"name": "followers",
"selector": "div:contains('followers')",
"type": "regex",
"pattern": "(\\d+)\\s*followers"
}
]
}

View File

@@ -0,0 +1,38 @@
{
"name": "LinkedIn People Card",
"baseSelector": "li.org-people-profile-card__profile-card-spacing",
"fields": [
{
"name": "profile_url",
"selector": "a.eETATgYTipaVsmrBChiBJJvFsdPhNpulhPZUVLHLo",
"type": "attribute",
"attribute": "href"
},
{
"name": "name",
"selector": ".artdeco-entity-lockup__title .lt-line-clamp--single-line",
"type": "text"
},
{
"name": "headline",
"selector": ".artdeco-entity-lockup__subtitle .lt-line-clamp--multi-line",
"type": "text"
},
{
"name": "followers",
"selector": ".lt-line-clamp--multi-line.t-12",
"type": "text"
},
{
"name": "connection_degree",
"selector": ".artdeco-entity-lockup__badge .artdeco-entity-lockup__degree",
"type": "text"
},
{
"name": "avatar_url",
"selector": ".artdeco-entity-lockup__image img",
"type": "attribute",
"attribute": "src"
}
]
}

View File

@@ -0,0 +1,143 @@
<li class="yCLWzruNprmIzaZzFFonVFBtMrbaVYnuDFA">
<!----><!---->
<div class="IxlEPbRZwQYrRltKPvHAyjBmCdIWTAoYo" data-chameleon-result-urn="urn:li:company:362492"
data-view-name="search-entity-result-universal-template">
<div class="linked-area flex-1
cursor-pointer">
<div class="BAEgVqVuxosMJZodcelsgPoyRcrkiqgVCGHXNQ">
<div class="afcvrbGzNuyRlhPPQWrWirJtUdHAAtUlqxwvVA">
<div class="display-flex align-items-center">
<!---->
<a class="eETATgYTipaVsmrBChiBJJvFsdPhNpulhPZUVLHLo scale-down " aria-hidden="true"
tabindex="-1" href="https://www.linkedin.com/company/managment-research-services-inc./"
data-test-app-aware-link="">
<div class="ivm-image-view-model ">
<div class="ivm-view-attr__img-wrapper
">
<!---->
<!----> <img width="48"
src="https://media.licdn.com/dms/image/v2/C560BAQFWpusEOgW-ww/company-logo_100_100/company-logo_100_100/0/1630583697877/managment_research_services_inc_logo?e=1750896000&amp;v=beta&amp;t=Ch9vyEZdfng-1D1m_XqP5kjNpVXUBKkk9cNhMZUhx0E"
loading="lazy" height="48" alt="Management Research Services, Inc. (MRS, Inc)"
id="ember28"
class="ivm-view-attr__img--centered EntityPhoto-square-3 evi-image lazy-image ember-view">
</div>
</div>
</a>
</div>
</div>
<div
class="wympnVuDByXHvafWrMGJLZuchDmCRqLmWPwg MmzCPRicJimZvjJhvqTzDcDbdHhWPzspERzA pt3 pb3 t-12 t-black--light">
<div class="mb1">
<div class="t-roman t-sans">
<div class="display-flex">
<span class="TikBXjihYvcNUoIzkslUaEjfIuLmYxfs OoHEyXgsiIqGADjcOtTmfdpoYVXrLKTvkwI ">
<span class="CgaWLOzmXNuKbRIRARSErqCJcBPYudEKo
t-16">
<a class="eETATgYTipaVsmrBChiBJJvFsdPhNpulhPZUVLHLo "
href="https://www.linkedin.com/company/managment-research-services-inc./"
data-test-app-aware-link="">
<!---->Management Research Services, Inc. (MRS, Inc)<!---->
<!----> </a>
<!----> </span>
</span>
<!---->
</div>
</div>
<div class="LjmdKCEqKITHihFOiQsBAQylkdnsWhqZii
t-14 t-black t-normal">
<!---->Insurance • Milwaukee, Wisconsin<!---->
</div>
<div class="cTPhJiHyNLmxdQYFlsEOutjznmqrVHUByZwZ
t-14 t-normal">
<!---->1K followers<!---->
</div>
</div>
<!---->
<p class="yWzlqwKNlvCWVNoKqmzoDDEnBMUuyynaLg
entity-result__summary--2-lines
t-12 t-black--light
">
<!---->MRS combines 30 years of experience supporting the Life,<span class="white-space-pre">
</span><strong><!---->Health<!----></strong><span class="white-space-pre"> </span>and
Annuities<span class="white-space-pre"> </span><strong><!---->Insurance<!----></strong><span
class="white-space-pre"> </span>Industry with customized<span class="white-space-pre">
</span><strong><!---->insurance<!----></strong><span class="white-space-pre">
</span>underwriting solutions that efficiently support clients workflows. Supported by the
Agenium Platform (www.agenium.ai) our innovative underwriting solutions are guaranteed to
optimize requirements...<!---->
</p>
<!---->
</div>
<div class="qXxdnXtzRVFTnTnetmNpssucBwQBsWlUuk MmzCPRicJimZvjJhvqTzDcDbdHhWPzspERzA">
<!---->
<div>
<button aria-label="Follow Management Research Services, Inc. (MRS, Inc)" id="ember61"
class="artdeco-button artdeco-button--2 artdeco-button--secondary ember-view"
type="button"><!---->
<span class="artdeco-button__text">
Follow
</span></button>
<!---->
<!---->
</div>
</div>
</div>
</div>
</div>
</li>

View File

@@ -0,0 +1,94 @@
<li class="grid grid__col--lg-8 block org-people-profile-card__profile-card-spacing">
<div>
<section class="artdeco-card full-width qQdPErXQkSAbwApNgNfuxukTIPPykttCcZGOHk">
<!---->
<img width="210" src="data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7"
ariarole="presentation" loading="lazy" height="210" alt="" id="ember96"
class="evi-image lazy-image ghost-default ember-view org-people-profile-card__cover-photo org-people-profile-card__cover-photo--people">
<div class="org-people-profile-card__profile-info">
<div id="ember97"
class="artdeco-entity-lockup artdeco-entity-lockup--stacked-center artdeco-entity-lockup--size-7 ember-view">
<div id="ember98"
class="artdeco-entity-lockup__image artdeco-entity-lockup__image--type-circle ember-view"
type="circle">
<a class="eETATgYTipaVsmrBChiBJJvFsdPhNpulhPZUVLHLo "
id="org-people-profile-card__profile-image-0"
href="https://www.linkedin.com/in/speakerrayna?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAABsqUBoBr5x071PuGGpNtK3NlvSARiVXPIs"
data-test-app-aware-link="">
<img width="104"
src="https://media.licdn.com/dms/image/v2/D5603AQGs2Vyju4xZ7A/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1681741067031?e=1750896000&amp;v=beta&amp;t=Hvj--IrrmpVIH7pec7-l_PQok8vsS__CGeUqBWOw7co"
loading="lazy" height="104" alt="Dr. Rayna S." id="ember99"
class="evi-image lazy-image ember-view">
</a>
</div>
<div id="ember100" class="artdeco-entity-lockup__content ember-view">
<div id="ember101" class="artdeco-entity-lockup__title ember-view">
<a class="eETATgYTipaVsmrBChiBJJvFsdPhNpulhPZUVLHLo link-without-visited-state"
aria-label="View Dr. Rayna S.s profile"
href="https://www.linkedin.com/in/speakerrayna?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAABsqUBoBr5x071PuGGpNtK3NlvSARiVXPIs"
data-test-app-aware-link="">
<div id="ember103" class="ember-view lt-line-clamp lt-line-clamp--single-line AGabuksChUpCmjWshSnaZryLKSthOKkwclxY
t-black" style="">
Dr. Rayna S.
<!---->
</div>
</a>
</div>
<div id="ember104" class="artdeco-entity-lockup__badge ember-view"> <span class="a11y-text">3rd+
degree connection</span>
<span class="artdeco-entity-lockup__degree" aria-hidden="true">
·&nbsp;3rd
</span>
<!----><!---->
</div>
<div id="ember105" class="artdeco-entity-lockup__subtitle ember-view">
<div class="t-14 t-black--light t-normal">
<div id="ember107" class="ember-view lt-line-clamp lt-line-clamp--multi-line"
style="-webkit-line-clamp: 2">
Leadership and Talent Development Consultant and Professional Speaker
<!---->
</div>
</div>
</div>
<div id="ember108" class="artdeco-entity-lockup__caption ember-view"></div>
</div>
</div>
<span class="text-align-center">
<span id="ember110"
class="ember-view lt-line-clamp lt-line-clamp--multi-line t-12 t-black--light mt2"
style="-webkit-line-clamp: 3">
727 followers
<!----> </span>
</span>
</div>
<footer class="ph3 pb3">
<button aria-label="Follow Dr. Rayna S." id="ember111"
class="artdeco-button artdeco-button--2 artdeco-button--secondary ember-view full-width"
type="button"><!---->
<span class="artdeco-button__text">
Follow
</span></button>
</footer>
</section>
</div>
</li>

View File

@@ -0,0 +1,50 @@
// ==== File: ai.js ====
class ApiHandler {
constructor(apiKey = null) {
this.apiKey = apiKey || localStorage.getItem("openai_api_key") || "";
console.log("ApiHandler ready");
}
setApiKey(k) {
this.apiKey = k.trim();
if (this.apiKey) localStorage.setItem("openai_api_key", this.apiKey);
}
async *chatStream(messages, {model = "gpt-4o", temperature = 0.7} = {}) {
if (!this.apiKey) throw new Error("OpenAI API key missing");
const payload = {model, messages, stream: true, max_tokens: 1024};
const controller = new AbortController();
const res = await fetch("https://api.openai.com/v1/chat/completions", {
method: "POST",
headers: {
"Content-Type": "application/json",
Authorization: `Bearer ${this.apiKey}`,
},
body: JSON.stringify(payload),
signal: controller.signal,
});
if (!res.ok) throw new Error(`OpenAI: ${res.statusText}`);
const reader = res.body.getReader();
const dec = new TextDecoder();
let buf = "";
while (true) {
const {done, value} = await reader.read();
if (done) break;
buf += dec.decode(value, {stream: true});
for (const line of buf.split("\n")) {
if (!line.startsWith("data: ")) continue;
if (line.includes("[DONE]")) return;
const json = JSON.parse(line.slice(6));
const delta = json.choices?.[0]?.delta?.content;
if (delta) yield delta;
}
buf = buf.endsWith("\n") ? "" : buf; // keep partial line
}
}
}
window.API = new ApiHandler();

File diff suppressed because it is too large Load Diff

51
docs/codebase/browser.md Normal file
View File

@@ -0,0 +1,51 @@
### browser_manager.py
| Function | What it does |
|---|---|
| `ManagedBrowser.build_browser_flags` | Returns baseline Chromium CLI flags, disables GPU and sandbox, plugs locale, timezone, stealth tweaks, and any extras from `BrowserConfig`. |
| `ManagedBrowser.__init__` | Stores config and logger, creates temp dir, preps internal state. |
| `ManagedBrowser.start` | Spawns or connects to the Chromium process, returns its CDP endpoint plus the `subprocess.Popen` handle. |
| `ManagedBrowser._initial_startup_check` | Pings the CDP endpoint once to be sure the browser is alive, raises if not. |
| `ManagedBrowser._monitor_browser_process` | Async-loops on the subprocess, logs exits or crashes, restarts if policy allows. |
| `ManagedBrowser._get_browser_path_WIP` | Old helper that maps OS + browser type to an executable path. |
| `ManagedBrowser._get_browser_path` | Current helper, checks env vars, Playwright cache, and OS defaults for the real executable. |
| `ManagedBrowser._get_browser_args` | Builds the final CLI arg list by merging user flags, stealth flags, and defaults. |
| `ManagedBrowser.cleanup` | Terminates the browser, stops monitors, deletes the temp dir. |
| `ManagedBrowser.create_profile` | Opens a visible browser so a human can log in, then zips the resulting user-data-dir to `~/.crawl4ai/profiles/<name>`. |
| `ManagedBrowser.list_profiles` | Thin wrapper, now forwarded to `BrowserProfiler.list_profiles()`. |
| `ManagedBrowser.delete_profile` | Thin wrapper, now forwarded to `BrowserProfiler.delete_profile()`. |
| `BrowserManager.__init__` | Holds the global Playwright instance, browser handle, config signature cache, session map, and logger. |
| `BrowserManager.start` | Boots the underlying `ManagedBrowser`, then spins up the default Playwright browser context with stealth patches. |
| `BrowserManager._build_browser_args` | Translates `CrawlerRunConfig` (proxy, UA, timezone, headless flag, etc.) into Playwright `launch_args`. |
| `BrowserManager.setup_context` | Applies locale, geolocation, permissions, cookies, and UA overrides on a fresh context. |
| `BrowserManager.create_browser_context` | Internal helper that actually calls `browser.new_context(**options)` after running `setup_context`. |
| `BrowserManager._make_config_signature` | Hashes the non-ephemeral parts of `CrawlerRunConfig` so contexts can be reused safely. |
| `BrowserManager.get_page` | Returns a ready `Page` for a given session id, reusing an existing one or creating a new context/page, injects helper scripts, updates `last_used`. |
| `BrowserManager.kill_session` | Force-closes a context/page for a session and removes it from the session map. |
| `BrowserManager._cleanup_expired_sessions` | Periodic sweep that drops sessions idle longer than `ttl_seconds`. |
| `BrowserManager.close` | Gracefully shuts down all contexts, the browser, Playwright, and background tasks. |
---
### browser_profiler.py
| Function | What it does |
|---|---|
| `BrowserProfiler.__init__` | Sets up profile folder paths, async logger, and signal handlers. |
| `BrowserProfiler.create_profile` | Launches a visible browser with a new user-data-dir for manual login, on exit compresses and stores it as a named profile. |
| `BrowserProfiler.cleanup_handler` | General SIGTERM/SIGINT cleanup wrapper that kills child processes. |
| `BrowserProfiler.sigint_handler` | Handles Ctrl-C during an interactive session, makes sure the browser shuts down cleanly. |
| `BrowserProfiler.listen_for_quit_command` | Async REPL that exits when the user types `q`. |
| `BrowserProfiler.list_profiles` | Enumerates `~/.crawl4ai/profiles`, prints profile name, browser type, size, and last modified. |
| `BrowserProfiler.get_profile_path` | Returns the absolute path of a profile given its name, or `None` if missing. |
| `BrowserProfiler.delete_profile` | Removes a profile folder or a direct path from disk, with optional confirmation prompt. |
| `BrowserProfiler.interactive_manager` | Text UI loop for listing, creating, deleting, or launching profiles. |
| `BrowserProfiler.launch_standalone_browser` | Starts a non-headless Chromium with remote debugging enabled and keeps it alive for manual tests. |
| `BrowserProfiler.get_cdp_json` | Pulls `/json/version` from a CDP endpoint and returns the parsed JSON. |
| `BrowserProfiler.launch_builtin_browser` | Spawns a headless Chromium in the background, saves `{wsEndpoint, pid, started_at}` to `~/.crawl4ai/builtin_browser.json`. |
| `BrowserProfiler.get_builtin_browser_info` | Reads that JSON file, verifies the PID, and returns browser status info. |
| `BrowserProfiler._is_browser_running` | Cross-platform helper that checks if a PID is still alive. |
| `BrowserProfiler.kill_builtin_browser` | Terminates the background builtin browser and removes its status file. |
| `BrowserProfiler.get_builtin_browser_status` | Returns `{running: bool, wsEndpoint, pid, started_at}` for quick health checks. |
Let me know what you want to tweak or dive into next.

40
docs/codebase/cli.md Normal file
View File

@@ -0,0 +1,40 @@
### `cli.py` command surface
| Command | Inputs / flags | What it does |
|---|---|---|
| **profiles** | *(none)* | Opens the interactive profile manager, lets you list, create, delete saved browser profiles that live in `~/.crawl4ai/profiles`. |
| **browser status** | | Prints whether the always-on *builtin* browser is running, shows its CDP URL, PID, start time. |
| **browser stop** | | Kills the builtin browser and deletes its status file. |
| **browser view** | `--url, -u` URL *(optional)* | Pops a visible window of the builtin browser, navigates to `URL` or `about:blank`. |
| **config list** | | Dumps every global setting, showing current value, default, and description. |
| **config get** | `key` | Prints the value of a single setting, falls back to default if unset. |
| **config set** | `key value` | Persists a new value in the global config (stored under `~/.crawl4ai/config.yml`). |
| **examples** | | Just spits out real-world CLI usage samples. |
| **crawl** | `url` *(positional)*<br>`--browser-config,-B` path<br>`--crawler-config,-C` path<br>`--filter-config,-f` path<br>`--extraction-config,-e` path<br>`--json-extract,-j` [desc]\*<br>`--schema,-s` path<br>`--browser,-b` k=v list<br>`--crawler,-c` k=v list<br>`--output,-o` all,json,markdown,md,markdown-fit,md-fit *(default all)*<br>`--output-file,-O` path<br>`--bypass-cache,-b` *(flag, default true — note flag reuse)*<br>`--question,-q` str<br>`--verbose,-v` *(flag)*<br>`--profile,-p` profile-name | One-shot crawl + extraction. Builds `BrowserConfig` and `CrawlerRunConfig` from inline flags or separate YAML/JSON files, runs `AsyncWebCrawler.run()`, can route through a named saved profile and pipe the result to stdout or a file. |
| **(default)** | Same flags as **crawl**, plus `--example` | Shortcut so you can type just `crwl https://site.com`. When first arg is not a known sub-command, it falls through to *crawl*. |
\* `--json-extract/-j` with no value turns on LLM-based JSON extraction using an auto schema, supplying a string lets you prompt-engineer the field descriptions.
> Quick mental model
> `profiles` = manage identities,
> `browser ...` = control long-running headless Chrome that all crawls can piggy-back on,
> `crawl` = do the actual work,
> `config` = tweak global defaults,
> everything else is sugar.
### Quick-fire “profile” usage cheatsheet
| Scenario | Command (copy-paste ready) | Notes |
|---|---|---|
| **Launch interactive Profile Manager UI** | `crwl profiles` | Opens TUI with options: 1 List, 2 Create, 3 Delete, 4 Use-to-crawl, 5 Exit. |
| **Create a fresh profile** | `crwl profiles` → choose **2** → name it → browser opens → log in → press **q** in terminal | Saves to `~/.crawl4ai/profiles/<name>`. |
| **List saved profiles** | `crwl profiles` → choose **1** | Shows name, browser type, size, last-modified. |
| **Delete a profile** | `crwl profiles` → choose **3** → pick the profile index → confirm | Removes the folder. |
| **Crawl with a profile (default alias)** | `crwl https://site.com/dashboard -p my-profile` | Keeps login cookies, sets `use_managed_browser=true` under the hood. |
| **Crawl + verbose JSON output** | `crwl https://site.com -p my-profile -o json -v` | Any other `crawl` flags work the same. |
| **Crawl with extra browser tweaks** | `crwl https://site.com -p my-profile -b "headless=true,viewport_width=1680"` | CLI overrides go on top of the profile. |
| **Same but via explicit sub-command** | `crwl crawl https://site.com -p my-profile` | Identical to default alias. |
| **Use profile from inside Profile Manager** | `crwl profiles` → choose **4** → pick profile → enter URL → follow prompts | Handy when demo-ing to non-CLI folks. |
| **One-off crawl with a profile folder path (no name lookup)** | `crwl https://site.com -b "user_data_dir=$HOME/.crawl4ai/profiles/my-profile,use_managed_browser=true"` | Bypasses registry, useful for CI scripts. |
| **Launch a dev browser on CDP port with the same identity** | `crwl cdp -d $HOME/.crawl4ai/profiles/my-profile -P 9223` | Lets Puppeteer/Playwright attach for debugging. |

View File

@@ -391,12 +391,14 @@ async def main():
# Process results # Process results
raw_df = pd.DataFrame() raw_df = pd.DataFrame()
for result in results: for result in results:
if result.success and result.media["tables"]: # Use the new tables field, falling back to media["tables"] for backward compatibility
tables = result.tables if hasattr(result, "tables") and result.tables else result.media.get("tables", [])
if result.success and tables:
# Extract primary market table # Extract primary market table
# DataFrame # DataFrame
raw_df = pd.DataFrame( raw_df = pd.DataFrame(
result.media["tables"][0]["rows"], tables[0]["rows"],
columns=result.media["tables"][0]["headers"], columns=tables[0]["headers"],
) )
break break

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,149 @@
#!/usr/bin/env python3
"""
demo_docker_polling.py
Quick sanity-check for the asynchronous crawl job endpoints:
• POST /crawl/job enqueue work, get task_id
• GET /crawl/job/{id} poll status / fetch result
The style matches demo_docker_api.py (console.rule banners, helper
functions, coloured status lines). Adjust BASE_URL as needed.
Run: python demo_docker_polling.py
"""
import asyncio, json, os, time, urllib.parse
from typing import Dict, List
import httpx
from rich.console import Console
from rich.panel import Panel
from rich.syntax import Syntax
console = Console()
BASE_URL = os.getenv("BASE_URL", "http://localhost:11234")
SIMPLE_URL = "https://example.org"
LINKS_URL = "https://httpbin.org/links/10/1"
# --- helpers --------------------------------------------------------------
def print_payload(payload: Dict):
console.print(Panel(Syntax(json.dumps(payload, indent=2),
"json", theme="monokai", line_numbers=False),
title="Payload", border_style="cyan", expand=False))
async def check_server_health(client: httpx.AsyncClient) -> bool:
try:
resp = await client.get("/health")
if resp.is_success:
console.print("[green]Server healthy[/]")
return True
except Exception:
pass
console.print("[bold red]Server is not responding on /health[/]")
return False
async def poll_for_result(client: httpx.AsyncClient, task_id: str,
poll_interval: float = 1.5, timeout: float = 90.0):
"""Hit /crawl/job/{id} until COMPLETED/FAILED or timeout."""
start = time.time()
while True:
resp = await client.get(f"/crawl/job/{task_id}")
resp.raise_for_status()
data = resp.json()
status = data.get("status")
if status.upper() in ("COMPLETED", "FAILED"):
return data
if time.time() - start > timeout:
raise TimeoutError(f"Task {task_id} did not finish in {timeout}s")
await asyncio.sleep(poll_interval)
# --- demo functions -------------------------------------------------------
async def demo_poll_single_url(client: httpx.AsyncClient):
payload = {
"urls": [SIMPLE_URL],
"browser_config": {"type": "BrowserConfig",
"params": {"headless": True}},
"crawler_config": {"type": "CrawlerRunConfig",
"params": {"cache_mode": "BYPASS"}}
}
console.rule("[bold blue]Demo A: /crawl/job Single URL[/]", style="blue")
print_payload(payload)
# enqueue
resp = await client.post("/crawl/job", json=payload)
console.print(f"Enqueue status: [bold]{resp.status_code}[/]")
resp.raise_for_status()
task_id = resp.json()["task_id"]
console.print(f"Task ID: [yellow]{task_id}[/]")
# poll
console.print("Polling…")
result = await poll_for_result(client, task_id)
console.print(Panel(Syntax(json.dumps(result, indent=2),
"json", theme="fruity"),
title="Final result", border_style="green"))
if result["status"] == "COMPLETED":
console.print("[green]✅ Crawl succeeded[/]")
else:
console.print("[red]❌ Crawl failed[/]")
async def demo_poll_multi_url(client: httpx.AsyncClient):
payload = {
"urls": [SIMPLE_URL, LINKS_URL],
"browser_config": {"type": "BrowserConfig",
"params": {"headless": True}},
"crawler_config": {"type": "CrawlerRunConfig",
"params": {"cache_mode": "BYPASS"}}
}
console.rule("[bold magenta]Demo B: /crawl/job Multi-URL[/]",
style="magenta")
print_payload(payload)
resp = await client.post("/crawl/job", json=payload)
console.print(f"Enqueue status: [bold]{resp.status_code}[/]")
resp.raise_for_status()
task_id = resp.json()["task_id"]
console.print(f"Task ID: [yellow]{task_id}[/]")
console.print("Polling…")
result = await poll_for_result(client, task_id)
console.print(Panel(Syntax(json.dumps(result, indent=2),
"json", theme="fruity"),
title="Final result", border_style="green"))
if result["status"] == "COMPLETED":
console.print(
f"[green]✅ {len(json.loads(result['result'])['results'])} URLs crawled[/]")
else:
console.print("[red]❌ Crawl failed[/]")
# --- main runner ----------------------------------------------------------
async def main_demo():
async with httpx.AsyncClient(base_url=BASE_URL, timeout=300.0) as client:
if not await check_server_health(client):
return
await demo_poll_single_url(client)
await demo_poll_multi_url(client)
console.rule("[bold green]Polling demos complete[/]", style="green")
if __name__ == "__main__":
try:
asyncio.run(main_demo())
except KeyboardInterrupt:
console.print("\n[yellow]Interrupted by user[/]")
except Exception:
console.print_exception(show_locals=False)

View File

@@ -3,42 +3,19 @@ from crawl4ai import (
AsyncWebCrawler, AsyncWebCrawler,
BrowserConfig, BrowserConfig,
CrawlerRunConfig, CrawlerRunConfig,
CacheMode,
DefaultMarkdownGenerator, DefaultMarkdownGenerator,
PruningContentFilter, PruningContentFilter,
CrawlResult CrawlResult
) )
async def example_cdp():
browser_conf = BrowserConfig(
headless=False,
cdp_url="http://localhost:9223"
)
crawler_config = CrawlerRunConfig(
session_id="test",
js_code = """(() => { return {"result": "Hello World!"} })()""",
js_only=True
)
async with AsyncWebCrawler(
config=browser_conf,
verbose=True,
) as crawler:
result : CrawlResult = await crawler.arun(
url="https://www.helloworld.org",
config=crawler_config,
)
print(result.js_execution_result)
async def main(): async def main():
browser_config = BrowserConfig(headless=True, verbose=True) browser_config = BrowserConfig(headless=True, verbose=True)
async with AsyncWebCrawler(config=browser_config) as crawler: async with AsyncWebCrawler(config=browser_config) as crawler:
crawler_config = CrawlerRunConfig( crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
markdown_generator=DefaultMarkdownGenerator( markdown_generator=DefaultMarkdownGenerator(
content_filter=PruningContentFilter( content_filter=PruningContentFilter()
threshold=0.48, threshold_type="fixed", min_word_threshold=0
)
), ),
) )
result : CrawlResult = await crawler.arun( result : CrawlResult = await crawler.arun(

View File

@@ -0,0 +1,143 @@
# == File: regex_extraction_quickstart.py ==
"""
Miniquick-start for RegexExtractionStrategy
────────────────────────────────────────────
3 bite-sized demos that parallel the style of *quickstart_examples_set_1.py*:
1. **Default catalog** scrape a page and pull out e-mails / phones / URLs, etc.
2. **Custom pattern** add your own regex at instantiation time.
3. **LLM-assisted schema** ask the model to write a pattern, cache it, then
run extraction _without_ further LLM calls.
Run the whole thing with::
python regex_extraction_quickstart.py
"""
import os, json, asyncio
from pathlib import Path
from typing import List
from crawl4ai import (
AsyncWebCrawler,
CrawlerRunConfig,
CrawlResult,
RegexExtractionStrategy,
LLMConfig,
)
# ────────────────────────────────────────────────────────────────────────────
# 1. Default-catalog extraction
# ────────────────────────────────────────────────────────────────────────────
async def demo_regex_default() -> None:
print("\n=== 1. Regex extraction default patterns ===")
url = "https://www.iana.org/domains/example" # has e-mail + URLs
strategy = RegexExtractionStrategy(
pattern = RegexExtractionStrategy.Url | RegexExtractionStrategy.Currency
)
config = CrawlerRunConfig(extraction_strategy=strategy)
async with AsyncWebCrawler() as crawler:
result: CrawlResult = await crawler.arun(url, config=config)
print(f"Fetched {url} - success={result.success}")
if result.success:
data = json.loads(result.extracted_content)
for d in data[:10]:
print(f" {d['label']:<12} {d['value']}")
print(f"... total matches: {len(data)}")
else:
print(" !!! crawl failed")
# ────────────────────────────────────────────────────────────────────────────
# 2. Custom pattern override / extension
# ────────────────────────────────────────────────────────────────────────────
async def demo_regex_custom() -> None:
print("\n=== 2. Regex extraction custom price pattern ===")
url = "https://www.apple.com/shop/buy-mac/macbook-pro"
price_pattern = {"usd_price": r"\$\s?\d{1,3}(?:,\d{3})*(?:\.\d{2})?"}
strategy = RegexExtractionStrategy(custom = price_pattern)
config = CrawlerRunConfig(extraction_strategy=strategy)
async with AsyncWebCrawler() as crawler:
result: CrawlResult = await crawler.arun(url, config=config)
if result.success:
data = json.loads(result.extracted_content)
for d in data:
print(f" {d['value']}")
if not data:
print(" (No prices found - page layout may have changed)")
else:
print(" !!! crawl failed")
# ────────────────────────────────────────────────────────────────────────────
# 3. One-shot LLM pattern generation, then fast extraction
# ────────────────────────────────────────────────────────────────────────────
async def demo_regex_generate_pattern() -> None:
print("\n=== 3. generate_pattern → regex extraction ===")
cache_dir = Path(__file__).parent / "tmp"
cache_dir.mkdir(exist_ok=True)
pattern_file = cache_dir / "price_pattern.json"
url = "https://www.lazada.sg/tag/smartphone/"
# ── 3-A. build or load the cached pattern
if pattern_file.exists():
pattern = json.load(pattern_file.open(encoding="utf-8"))
print("Loaded cached pattern:", pattern)
else:
print("Generating pattern via LLM…")
llm_cfg = LLMConfig(
provider="openai/gpt-4o-mini",
api_token="env:OPENAI_API_KEY",
)
# pull one sample page as HTML context
async with AsyncWebCrawler() as crawler:
html = (await crawler.arun(url)).fit_html
pattern = RegexExtractionStrategy.generate_pattern(
label="price",
html=html,
query="Prices in Malaysian Ringgit (e.g. RM1,299.00 or RM200)",
llm_config=llm_cfg,
)
json.dump(pattern, pattern_file.open("w", encoding="utf-8"), indent=2)
print("Saved pattern:", pattern_file)
# ── 3-B. extraction pass zero LLM calls
strategy = RegexExtractionStrategy(custom=pattern)
config = CrawlerRunConfig(extraction_strategy=strategy, delay_before_return_html=3)
async with AsyncWebCrawler() as crawler:
result: CrawlResult = await crawler.arun(url, config=config)
if result.success:
data = json.loads(result.extracted_content)
for d in data[:15]:
print(f" {d['value']}")
print(f"... total matches: {len(data)}")
else:
print(" !!! crawl failed")
# ────────────────────────────────────────────────────────────────────────────
# Entrypoint
# ────────────────────────────────────────────────────────────────────────────
async def main() -> None:
# await demo_regex_default()
# await demo_regex_custom()
await demo_regex_generate_pattern()
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -10,6 +10,7 @@ class CrawlResult(BaseModel):
html: str html: str
success: bool success: bool
cleaned_html: Optional[str] = None cleaned_html: Optional[str] = None
fit_html: Optional[str] = None # Preprocessed HTML optimized for extraction
media: Dict[str, List[Dict]] = {} media: Dict[str, List[Dict]] = {}
links: Dict[str, List[Dict]] = {} links: Dict[str, List[Dict]] = {}
downloaded_files: Optional[List[str]] = None downloaded_files: Optional[List[str]] = None
@@ -50,7 +51,7 @@ if not result.success:
``` ```
### 1.3 **`status_code`** *(Optional[int])* ### 1.3 **`status_code`** *(Optional[int])*
**What**: The pages HTTP status code (e.g., 200, 404). **What**: The page's HTTP status code (e.g., 200, 404).
**Usage**: **Usage**:
```python ```python
if result.status_code == 404: if result.status_code == 404:
@@ -82,7 +83,7 @@ if result.response_headers:
``` ```
### 1.7 **`ssl_certificate`** *(Optional[SSLCertificate])* ### 1.7 **`ssl_certificate`** *(Optional[SSLCertificate])*
**What**: If `fetch_ssl_certificate=True` in your CrawlerRunConfig, **`result.ssl_certificate`** contains a [**`SSLCertificate`**](../advanced/ssl-certificate.md) object describing the sites certificate. You can export the cert in multiple formats (PEM/DER/JSON) or access its properties like `issuer`, **What**: If `fetch_ssl_certificate=True` in your CrawlerRunConfig, **`result.ssl_certificate`** contains a [**`SSLCertificate`**](../advanced/ssl-certificate.md) object describing the site's certificate. You can export the cert in multiple formats (PEM/DER/JSON) or access its properties like `issuer`,
`subject`, `valid_from`, `valid_until`, etc. `subject`, `valid_from`, `valid_until`, etc.
**Usage**: **Usage**:
```python ```python
@@ -109,14 +110,6 @@ print(len(result.html))
print(result.cleaned_html[:500]) # Show a snippet print(result.cleaned_html[:500]) # Show a snippet
``` ```
### 2.3 **`fit_html`** *(Optional[str])*
**What**: If a **content filter** or heuristic (e.g., Pruning/BM25) modifies the HTML, the “fit” or post-filter version.
**When**: This is **only** present if your `markdown_generator` or `content_filter` produces it.
**Usage**:
```python
if result.markdown.fit_html:
print("High-value HTML content:", result.markdown.fit_html[:300])
```
--- ---
@@ -135,7 +128,7 @@ Crawl4AI can convert HTML→Markdown, optionally including:
- **`raw_markdown`** *(str)*: The full HTML→Markdown conversion. - **`raw_markdown`** *(str)*: The full HTML→Markdown conversion.
- **`markdown_with_citations`** *(str)*: Same markdown, but with link references as academic-style citations. - **`markdown_with_citations`** *(str)*: Same markdown, but with link references as academic-style citations.
- **`references_markdown`** *(str)*: The reference list or footnotes at the end. - **`references_markdown`** *(str)*: The reference list or footnotes at the end.
- **`fit_markdown`** *(Optional[str])*: If content filtering (Pruning/BM25) was applied, the filtered fit text. - **`fit_markdown`** *(Optional[str])*: If content filtering (Pruning/BM25) was applied, the filtered "fit" text.
- **`fit_html`** *(Optional[str])*: The HTML that led to `fit_markdown`. - **`fit_html`** *(Optional[str])*: The HTML that led to `fit_markdown`.
**Usage**: **Usage**:
@@ -157,7 +150,7 @@ print(result.markdown.raw_markdown[:200])
print(result.markdown.fit_markdown) print(result.markdown.fit_markdown)
print(result.markdown.fit_html) print(result.markdown.fit_html)
``` ```
**Important**: Fit content (in `fit_markdown`/`fit_html`) exists in result.markdown, only if you used a **filter** (like **PruningContentFilter** or **BM25ContentFilter**) within a `MarkdownGenerationStrategy`. **Important**: "Fit" content (in `fit_markdown`/`fit_html`) exists in result.markdown, only if you used a **filter** (like **PruningContentFilter** or **BM25ContentFilter**) within a `MarkdownGenerationStrategy`.
--- ---
@@ -169,7 +162,7 @@ print(result.markdown.fit_html)
- `src` *(str)*: Media URL - `src` *(str)*: Media URL
- `alt` or `title` *(str)*: Descriptive text - `alt` or `title` *(str)*: Descriptive text
- `score` *(float)*: Relevance score if the crawlers heuristic found it important - `score` *(float)*: Relevance score if the crawler's heuristic found it "important"
- `desc` or `description` *(Optional[str])*: Additional context extracted from surrounding text - `desc` or `description` *(Optional[str])*: Additional context extracted from surrounding text
**Usage**: **Usage**:
@@ -263,7 +256,7 @@ A `DispatchResult` object providing additional concurrency and resource usage in
- **`task_id`**: A unique identifier for the parallel task. - **`task_id`**: A unique identifier for the parallel task.
- **`memory_usage`** (float): The memory (in MB) used at the time of completion. - **`memory_usage`** (float): The memory (in MB) used at the time of completion.
- **`peak_memory`** (float): The peak memory usage (in MB) recorded during the tasks execution. - **`peak_memory`** (float): The peak memory usage (in MB) recorded during the task's execution.
- **`start_time`** / **`end_time`** (datetime): Time range for this crawling task. - **`start_time`** / **`end_time`** (datetime): Time range for this crawling task.
- **`error_message`** (str): Any dispatcher- or concurrency-related error encountered. - **`error_message`** (str): Any dispatcher- or concurrency-related error encountered.
@@ -358,7 +351,7 @@ async def handle_result(result: CrawlResult):
# HTML # HTML
print("Original HTML size:", len(result.html)) print("Original HTML size:", len(result.html))
print("Cleaned HTML size:", len(result.cleaned_html or "")) print("Cleaned HTML size:", len(result.cleaned_html or ""))
# Markdown output # Markdown output
if result.markdown: if result.markdown:
print("Raw Markdown:", result.markdown.raw_markdown[:300]) print("Raw Markdown:", result.markdown.raw_markdown[:300])

View File

@@ -36,6 +36,45 @@ LLMExtractionStrategy(
) )
``` ```
### RegexExtractionStrategy
Used for fast pattern-based extraction of common entities using regular expressions.
```python
RegexExtractionStrategy(
# Pattern Configuration
pattern: IntFlag = RegexExtractionStrategy.Nothing, # Bit flags of built-in patterns to use
custom: Optional[Dict[str, str]] = None, # Custom pattern dictionary {label: regex}
# Input Format
input_format: str = "fit_html", # "html", "markdown", "text" or "fit_html"
)
# Built-in Patterns as Bit Flags
RegexExtractionStrategy.Email # Email addresses
RegexExtractionStrategy.PhoneIntl # International phone numbers
RegexExtractionStrategy.PhoneUS # US-format phone numbers
RegexExtractionStrategy.Url # HTTP/HTTPS URLs
RegexExtractionStrategy.IPv4 # IPv4 addresses
RegexExtractionStrategy.IPv6 # IPv6 addresses
RegexExtractionStrategy.Uuid # UUIDs
RegexExtractionStrategy.Currency # Currency values (USD, EUR, etc)
RegexExtractionStrategy.Percentage # Percentage values
RegexExtractionStrategy.Number # Numeric values
RegexExtractionStrategy.DateIso # ISO format dates
RegexExtractionStrategy.DateUS # US format dates
RegexExtractionStrategy.Time24h # 24-hour format times
RegexExtractionStrategy.PostalUS # US postal codes
RegexExtractionStrategy.PostalUK # UK postal codes
RegexExtractionStrategy.HexColor # HTML hex color codes
RegexExtractionStrategy.TwitterHandle # Twitter handles
RegexExtractionStrategy.Hashtag # Hashtags
RegexExtractionStrategy.MacAddr # MAC addresses
RegexExtractionStrategy.Iban # International bank account numbers
RegexExtractionStrategy.CreditCard # Credit card numbers
RegexExtractionStrategy.All # All available patterns
```
### CosineStrategy ### CosineStrategy
Used for content similarity-based extraction and clustering. Used for content similarity-based extraction and clustering.
@@ -156,6 +195,55 @@ result = await crawler.arun(
data = json.loads(result.extracted_content) data = json.loads(result.extracted_content)
``` ```
### Regex Extraction
```python
import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, RegexExtractionStrategy
# Method 1: Use built-in patterns
strategy = RegexExtractionStrategy(
pattern = RegexExtractionStrategy.Email | RegexExtractionStrategy.Url
)
# Method 2: Use custom patterns
price_pattern = {"usd_price": r"\$\s?\d{1,3}(?:,\d{3})*(?:\.\d{2})?"}
strategy = RegexExtractionStrategy(custom=price_pattern)
# Method 3: Generate pattern with LLM assistance (one-time)
from crawl4ai import LLMConfig
async with AsyncWebCrawler() as crawler:
# Get sample HTML first
sample_result = await crawler.arun("https://example.com/products")
html = sample_result.fit_html
# Generate regex pattern once
pattern = RegexExtractionStrategy.generate_pattern(
label="price",
html=html,
query="Product prices in USD format",
llm_config=LLMConfig(provider="openai/gpt-4o-mini")
)
# Save pattern for reuse
import json
with open("price_pattern.json", "w") as f:
json.dump(pattern, f)
# Use pattern for extraction (no LLM calls)
strategy = RegexExtractionStrategy(custom=pattern)
result = await crawler.arun(
url="https://example.com/products",
config=CrawlerRunConfig(extraction_strategy=strategy)
)
# Process results
data = json.loads(result.extracted_content)
for item in data:
print(f"{item['label']}: {item['value']}")
```
### CSS Extraction ### CSS Extraction
```python ```python
@@ -220,12 +308,28 @@ result = await crawler.arun(
## Best Practices ## Best Practices
1. **Choose the Right Strategy** 1. **Choose the Right Strategy**
- Use `LLMExtractionStrategy` for complex, unstructured content - Use `RegexExtractionStrategy` for common data types like emails, phones, URLs, dates
- Use `JsonCssExtractionStrategy` for well-structured HTML - Use `JsonCssExtractionStrategy` for well-structured HTML with consistent patterns
- Use `LLMExtractionStrategy` for complex, unstructured content requiring reasoning
- Use `CosineStrategy` for content similarity and clustering - Use `CosineStrategy` for content similarity and clustering
2. **Optimize Chunking** 2. **Strategy Selection Guide**
```
Is the target data a common type (email/phone/date/URL)?
→ RegexExtractionStrategy
Does the page have consistent HTML structure?
→ JsonCssExtractionStrategy or JsonXPathExtractionStrategy
Is the data semantically complex or unstructured?
→ LLMExtractionStrategy
Need to find content similar to a specific topic?
→ CosineStrategy
```
3. **Optimize Chunking**
```python ```python
# For long documents # For long documents
strategy = LLMExtractionStrategy( strategy = LLMExtractionStrategy(
@@ -234,7 +338,26 @@ result = await crawler.arun(
) )
``` ```
3. **Handle Errors** 4. **Combine Strategies for Best Performance**
```python
# First pass: Extract structure with CSS
css_strategy = JsonCssExtractionStrategy(product_schema)
css_result = await crawler.arun(url, config=CrawlerRunConfig(extraction_strategy=css_strategy))
product_data = json.loads(css_result.extracted_content)
# Second pass: Extract specific fields with regex
descriptions = [product["description"] for product in product_data]
regex_strategy = RegexExtractionStrategy(
pattern=RegexExtractionStrategy.Email | RegexExtractionStrategy.PhoneUS,
custom={"dimension": r"\d+x\d+x\d+ (?:cm|in)"}
)
# Process descriptions with regex
for text in descriptions:
matches = regex_strategy.extract("", text) # Direct extraction
```
5. **Handle Errors**
```python ```python
try: try:
result = await crawler.arun( result = await crawler.arun(
@@ -247,11 +370,31 @@ result = await crawler.arun(
print(f"Extraction failed: {e}") print(f"Extraction failed: {e}")
``` ```
4. **Monitor Performance** 6. **Monitor Performance**
```python ```python
strategy = CosineStrategy( strategy = CosineStrategy(
verbose=True, # Enable logging verbose=True, # Enable logging
word_count_threshold=20, # Filter short content word_count_threshold=20, # Filter short content
top_k=5 # Limit results top_k=5 # Limit results
) )
```
7. **Cache Generated Patterns**
```python
# For RegexExtractionStrategy pattern generation
import json
from pathlib import Path
cache_dir = Path("./pattern_cache")
cache_dir.mkdir(exist_ok=True)
pattern_file = cache_dir / "product_pattern.json"
if pattern_file.exists():
with open(pattern_file) as f:
pattern = json.load(f)
else:
# Generate once with LLM
pattern = RegexExtractionStrategy.generate_pattern(...)
with open(pattern_file, "w") as f:
json.dump(pattern, f)
``` ```

View File

@@ -1,15 +1,20 @@
# Extracting JSON (No LLM) # Extracting JSON (No LLM)
One of Crawl4AIs **most powerful** features is extracting **structured JSON** from websites **without** relying on large language models. By defining a **schema** with CSS or XPath selectors, you can extract data instantly—even from complex or nested HTML structures—without the cost, latency, or environmental impact of an LLM. One of Crawl4AI's **most powerful** features is extracting **structured JSON** from websites **without** relying on large language models. Crawl4AI offers several strategies for LLM-free extraction:
1. **Schema-based extraction** with CSS or XPath selectors via `JsonCssExtractionStrategy` and `JsonXPathExtractionStrategy`
2. **Regular expression extraction** with `RegexExtractionStrategy` for fast pattern matching
These approaches let you extract data instantly—even from complex or nested HTML structures—without the cost, latency, or environmental impact of an LLM.
**Why avoid LLM for basic extractions?** **Why avoid LLM for basic extractions?**
1. **Faster & Cheaper**: No API calls or GPU overhead. 1. **Faster & Cheaper**: No API calls or GPU overhead.
2. **Lower Carbon Footprint**: LLM inference can be energy-intensive. A well-defined schema is practically carbon-free. 2. **Lower Carbon Footprint**: LLM inference can be energy-intensive. Pattern-based extraction is practically carbon-free.
3. **Precise & Repeatable**: CSS/XPath selectors do exactly what you specify. LLM outputs can vary or hallucinate. 3. **Precise & Repeatable**: CSS/XPath selectors and regex patterns do exactly what you specify. LLM outputs can vary or hallucinate.
4. **Scales Readily**: For thousands of pages, schema-based extraction runs quickly and in parallel. 4. **Scales Readily**: For thousands of pages, pattern-based extraction runs quickly and in parallel.
Below, well explore how to craft these schemas and use them with **JsonCssExtractionStrategy** (or **JsonXPathExtractionStrategy** if you prefer XPath). Well also highlight advanced features like **nested fields** and **base element attributes**. Below, we'll explore how to craft these schemas and use them with **JsonCssExtractionStrategy** (or **JsonXPathExtractionStrategy** if you prefer XPath). We'll also highlight advanced features like **nested fields** and **base element attributes**.
--- ---
@@ -17,17 +22,17 @@ Below, well explore how to craft these schemas and use them with **JsonCssExt
A schema defines: A schema defines:
1. A **base selector** that identifies each container element on the page (e.g., a product row, a blog post card). 1. A **base selector** that identifies each "container" element on the page (e.g., a product row, a blog post card).
2. **Fields** describing which CSS/XPath selectors to use for each piece of data you want to capture (text, attribute, HTML block, etc.). 2. **Fields** describing which CSS/XPath selectors to use for each piece of data you want to capture (text, attribute, HTML block, etc.).
3. **Nested** or **list** types for repeated or hierarchical structures. 3. **Nested** or **list** types for repeated or hierarchical structures.
For example, if you have a list of products, each one might have a name, price, reviews, and related products. This approach is faster and more reliable than an LLM for consistent, structured pages. For example, if you have a list of products, each one might have a name, price, reviews, and "related products." This approach is faster and more reliable than an LLM for consistent, structured pages.
--- ---
## 2. Simple Example: Crypto Prices ## 2. Simple Example: Crypto Prices
Lets begin with a **simple** schema-based extraction using the `JsonCssExtractionStrategy`. Below is a snippet that extracts cryptocurrency prices from a site (similar to the legacy Coinbase example). Notice we **dont** call any LLM: Let's begin with a **simple** schema-based extraction using the `JsonCssExtractionStrategy`. Below is a snippet that extracts cryptocurrency prices from a site (similar to the legacy Coinbase example). Notice we **don't** call any LLM:
```python ```python
import json import json
@@ -87,7 +92,7 @@ asyncio.run(extract_crypto_prices())
**Highlights**: **Highlights**:
- **`baseSelector`**: Tells us where each item (crypto row) is. - **`baseSelector`**: Tells us where each "item" (crypto row) is.
- **`fields`**: Two fields (`coin_name`, `price`) using simple CSS selectors. - **`fields`**: Two fields (`coin_name`, `price`) using simple CSS selectors.
- Each field defines a **`type`** (e.g., `text`, `attribute`, `html`, `regex`, etc.). - Each field defines a **`type`** (e.g., `text`, `attribute`, `html`, `regex`, etc.).
@@ -97,7 +102,7 @@ No LLM is needed, and the performance is **near-instant** for hundreds or thousa
### **XPath Example with `raw://` HTML** ### **XPath Example with `raw://` HTML**
Below is a short example demonstrating **XPath** extraction plus the **`raw://`** scheme. Well pass a **dummy HTML** directly (no network request) and define the extraction strategy in `CrawlerRunConfig`. Below is a short example demonstrating **XPath** extraction plus the **`raw://`** scheme. We'll pass a **dummy HTML** directly (no network request) and define the extraction strategy in `CrawlerRunConfig`.
```python ```python
import json import json
@@ -168,12 +173,12 @@ asyncio.run(extract_crypto_prices_xpath())
**Key Points**: **Key Points**:
1. **`JsonXPathExtractionStrategy`** is used instead of `JsonCssExtractionStrategy`. 1. **`JsonXPathExtractionStrategy`** is used instead of `JsonCssExtractionStrategy`.
2. **`baseSelector`** and each fields `"selector"` use **XPath** instead of CSS. 2. **`baseSelector`** and each field's `"selector"` use **XPath** instead of CSS.
3. **`raw://`** lets us pass `dummy_html` with no real network request—handy for local testing. 3. **`raw://`** lets us pass `dummy_html` with no real network request—handy for local testing.
4. Everything (including the extraction strategy) is in **`CrawlerRunConfig`**. 4. Everything (including the extraction strategy) is in **`CrawlerRunConfig`**.
Thats how you keep the config self-contained, illustrate **XPath** usage, and demonstrate the **raw** scheme for direct HTML input—all while avoiding the old approach of passing `extraction_strategy` directly to `arun()`. That's how you keep the config self-contained, illustrate **XPath** usage, and demonstrate the **raw** scheme for direct HTML input—all while avoiding the old approach of passing `extraction_strategy` directly to `arun()`.
--- ---
@@ -187,7 +192,7 @@ We have a **sample e-commerce** HTML file on GitHub (example):
``` ```
https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html
``` ```
This snippet includes categories, products, features, reviews, and related items. Lets see how to define a schema that fully captures that structure **without LLM**. This snippet includes categories, products, features, reviews, and related items. Let's see how to define a schema that fully captures that structure **without LLM**.
```python ```python
schema = { schema = {
@@ -333,24 +338,253 @@ async def extract_ecommerce_data():
asyncio.run(extract_ecommerce_data()) asyncio.run(extract_ecommerce_data())
``` ```
If all goes well, you get a **structured** JSON array with each category, containing an array of `products`. Each product includes `details`, `features`, `reviews`, etc. All of that **without** an LLM. If all goes well, you get a **structured** JSON array with each "category," containing an array of `products`. Each product includes `details`, `features`, `reviews`, etc. All of that **without** an LLM.
--- ---
## 4. Why “No LLM” Is Often Better ## 4. RegexExtractionStrategy - Fast Pattern-Based Extraction
1. **Zero Hallucination**: Schema-based extraction doesnt guess text. It either finds it or not. Crawl4AI now offers a powerful new zero-LLM extraction strategy: `RegexExtractionStrategy`. This strategy provides lightning-fast extraction of common data types like emails, phone numbers, URLs, dates, and more using pre-compiled regular expressions.
2. **Guaranteed Structure**: The same schema yields consistent JSON across many pages, so your downstream pipeline can rely on stable keys.
3. **Speed**: LLM-based extraction can be 101000x slower for large-scale crawling.
4. **Scalable**: Adding or updating a field is a matter of adjusting the schema, not re-tuning a model.
**When might you consider an LLM?** Possibly if the site is extremely unstructured or you want AI summarization. But always try a schema approach first for repeated or consistent data patterns. ### Key Features
- **Zero LLM Dependency**: Extracts data without any AI model calls
- **Blazing Fast**: Uses pre-compiled regex patterns for maximum performance
- **Built-in Patterns**: Includes ready-to-use patterns for common data types
- **Custom Patterns**: Add your own regex patterns for domain-specific extraction
- **LLM-Assisted Pattern Generation**: Optionally use an LLM once to generate optimized patterns, then reuse them without further LLM calls
### Simple Example: Extracting Common Entities
The easiest way to start is by using the built-in pattern catalog:
```python
import json
import asyncio
from crawl4ai import (
AsyncWebCrawler,
CrawlerRunConfig,
RegexExtractionStrategy
)
async def extract_with_regex():
# Create a strategy using built-in patterns for URLs and currencies
strategy = RegexExtractionStrategy(
pattern = RegexExtractionStrategy.Url | RegexExtractionStrategy.Currency
)
config = CrawlerRunConfig(extraction_strategy=strategy)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://example.com",
config=config
)
if result.success:
data = json.loads(result.extracted_content)
for item in data[:5]: # Show first 5 matches
print(f"{item['label']}: {item['value']}")
print(f"Total matches: {len(data)}")
asyncio.run(extract_with_regex())
```
### Available Built-in Patterns
`RegexExtractionStrategy` provides these common patterns as IntFlag attributes for easy combining:
```python
# Use individual patterns
strategy = RegexExtractionStrategy(pattern=RegexExtractionStrategy.Email)
# Combine multiple patterns
strategy = RegexExtractionStrategy(
pattern = (
RegexExtractionStrategy.Email |
RegexExtractionStrategy.PhoneUS |
RegexExtractionStrategy.Url
)
)
# Use all available patterns
strategy = RegexExtractionStrategy(pattern=RegexExtractionStrategy.All)
```
Available patterns include:
- `Email` - Email addresses
- `PhoneIntl` - International phone numbers
- `PhoneUS` - US-format phone numbers
- `Url` - HTTP/HTTPS URLs
- `IPv4` - IPv4 addresses
- `IPv6` - IPv6 addresses
- `Uuid` - UUIDs
- `Currency` - Currency values (USD, EUR, etc.)
- `Percentage` - Percentage values
- `Number` - Numeric values
- `DateIso` - ISO format dates
- `DateUS` - US format dates
- `Time24h` - 24-hour format times
- `PostalUS` - US postal codes
- `PostalUK` - UK postal codes
- `HexColor` - HTML hex color codes
- `TwitterHandle` - Twitter handles
- `Hashtag` - Hashtags
- `MacAddr` - MAC addresses
- `Iban` - International bank account numbers
- `CreditCard` - Credit card numbers
### Custom Pattern Example
For more targeted extraction, you can provide custom patterns:
```python
import json
import asyncio
from crawl4ai import (
AsyncWebCrawler,
CrawlerRunConfig,
RegexExtractionStrategy
)
async def extract_prices():
# Define a custom pattern for US Dollar prices
price_pattern = {"usd_price": r"\$\s?\d{1,3}(?:,\d{3})*(?:\.\d{2})?"}
# Create strategy with custom pattern
strategy = RegexExtractionStrategy(custom=price_pattern)
config = CrawlerRunConfig(extraction_strategy=strategy)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://www.example.com/products",
config=config
)
if result.success:
data = json.loads(result.extracted_content)
for item in data:
print(f"Found price: {item['value']}")
asyncio.run(extract_prices())
```
### LLM-Assisted Pattern Generation
For complex or site-specific patterns, you can use an LLM once to generate an optimized pattern, then save and reuse it without further LLM calls:
```python
import json
import asyncio
from pathlib import Path
from crawl4ai import (
AsyncWebCrawler,
CrawlerRunConfig,
RegexExtractionStrategy,
LLMConfig
)
async def extract_with_generated_pattern():
cache_dir = Path("./pattern_cache")
cache_dir.mkdir(exist_ok=True)
pattern_file = cache_dir / "price_pattern.json"
# 1. Generate or load pattern
if pattern_file.exists():
pattern = json.load(pattern_file.open())
print(f"Using cached pattern: {pattern}")
else:
print("Generating pattern via LLM...")
# Configure LLM
llm_config = LLMConfig(
provider="openai/gpt-4o-mini",
api_token="env:OPENAI_API_KEY",
)
# Get sample HTML for context
async with AsyncWebCrawler() as crawler:
result = await crawler.arun("https://example.com/products")
html = result.fit_html
# Generate pattern (one-time LLM usage)
pattern = RegexExtractionStrategy.generate_pattern(
label="price",
html=html,
query="Product prices in USD format",
llm_config=llm_config,
)
# Cache pattern for future use
json.dump(pattern, pattern_file.open("w"), indent=2)
# 2. Use pattern for extraction (no LLM calls)
strategy = RegexExtractionStrategy(custom=pattern)
config = CrawlerRunConfig(extraction_strategy=strategy)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://example.com/products",
config=config
)
if result.success:
data = json.loads(result.extracted_content)
for item in data[:10]:
print(f"Extracted: {item['value']}")
print(f"Total matches: {len(data)}")
asyncio.run(extract_with_generated_pattern())
```
This pattern allows you to:
1. Use an LLM once to generate a highly optimized regex for your specific site
2. Save the pattern to disk for reuse
3. Extract data using only regex (no further LLM calls) in production
### Extraction Results Format
The `RegexExtractionStrategy` returns results in a consistent format:
```json
[
{
"url": "https://example.com",
"label": "email",
"value": "contact@example.com",
"span": [145, 163]
},
{
"url": "https://example.com",
"label": "url",
"value": "https://support.example.com",
"span": [210, 235]
}
]
```
Each match includes:
- `url`: The source URL
- `label`: The pattern name that matched (e.g., "email", "phone_us")
- `value`: The extracted text
- `span`: The start and end positions in the source content
--- ---
## 5. Base Element Attributes & Additional Fields ## 5. Why "No LLM" Is Often Better
Its easy to **extract attributes** (like `href`, `src`, or `data-xxx`) from your base or nested elements using: 1. **Zero Hallucination**: Pattern-based extraction doesn't guess text. It either finds it or not.
2. **Guaranteed Structure**: The same schema or regex yields consistent JSON across many pages, so your downstream pipeline can rely on stable keys.
3. **Speed**: LLM-based extraction can be 101000x slower for large-scale crawling.
4. **Scalable**: Adding or updating a field is a matter of adjusting the schema or regex, not re-tuning a model.
**When might you consider an LLM?** Possibly if the site is extremely unstructured or you want AI summarization. But always try a schema or regex approach first for repeated or consistent data patterns.
---
## 6. Base Element Attributes & Additional Fields
It's easy to **extract attributes** (like `href`, `src`, or `data-xxx`) from your base or nested elements using:
```json ```json
{ {
@@ -361,11 +595,11 @@ Its easy to **extract attributes** (like `href`, `src`, or `data-xxx`) from y
} }
``` ```
You can define them in **`baseFields`** (extracted from the main container element) or in each fields sub-lists. This is especially helpful if you need an items link or ID stored in the parent `<div>`. You can define them in **`baseFields`** (extracted from the main container element) or in each field's sub-lists. This is especially helpful if you need an item's link or ID stored in the parent `<div>`.
--- ---
## 6. Putting It All Together: Larger Example ## 7. Putting It All Together: Larger Example
Consider a blog site. We have a schema that extracts the **URL** from each post card (via `baseFields` with an `"attribute": "href"`), plus the title, date, summary, and author: Consider a blog site. We have a schema that extracts the **URL** from each post card (via `baseFields` with an `"attribute": "href"`), plus the title, date, summary, and author:
@@ -389,19 +623,20 @@ Then run with `JsonCssExtractionStrategy(schema)` to get an array of blog post o
--- ---
## 7. Tips & Best Practices ## 8. Tips & Best Practices
1. **Inspect the DOM** in Chrome DevTools or Firefoxs Inspector to find stable selectors. 1. **Inspect the DOM** in Chrome DevTools or Firefox's Inspector to find stable selectors.
2. **Start Simple**: Verify you can extract a single field. Then add complexity like nested objects or lists. 2. **Start Simple**: Verify you can extract a single field. Then add complexity like nested objects or lists.
3. **Test** your schema on partial HTML or a test page before a big crawl. 3. **Test** your schema on partial HTML or a test page before a big crawl.
4. **Combine with JS Execution** if the site loads content dynamically. You can pass `js_code` or `wait_for` in `CrawlerRunConfig`. 4. **Combine with JS Execution** if the site loads content dynamically. You can pass `js_code` or `wait_for` in `CrawlerRunConfig`.
5. **Look at Logs** when `verbose=True`: if your selectors are off or your schema is malformed, itll often show warnings. 5. **Look at Logs** when `verbose=True`: if your selectors are off or your schema is malformed, it'll often show warnings.
6. **Use baseFields** if you need attributes from the container element (e.g., `href`, `data-id`), especially for the parent item. 6. **Use baseFields** if you need attributes from the container element (e.g., `href`, `data-id`), especially for the "parent" item.
7. **Performance**: For large pages, make sure your selectors are as narrow as possible. 7. **Performance**: For large pages, make sure your selectors are as narrow as possible.
8. **Consider Using Regex First**: For simple data types like emails, URLs, and dates, `RegexExtractionStrategy` is often the fastest approach.
--- ---
## 8. Schema Generation Utility ## 9. Schema Generation Utility
While manually crafting schemas is powerful and precise, Crawl4AI now offers a convenient utility to **automatically generate** extraction schemas using LLM. This is particularly useful when: While manually crafting schemas is powerful and precise, Crawl4AI now offers a convenient utility to **automatically generate** extraction schemas using LLM. This is particularly useful when:
@@ -481,27 +716,26 @@ strategy = JsonCssExtractionStrategy(css_schema)
- Use OpenAI for production-quality schemas - Use OpenAI for production-quality schemas
- Use Ollama for development, testing, or when you need a self-hosted solution - Use Ollama for development, testing, or when you need a self-hosted solution
That's it for **Extracting JSON (No LLM)**! You've seen how schema-based approaches (either CSS or XPath) can handle everything from simple lists to deeply nested product catalogs—instantly, with minimal overhead. Enjoy building robust scrapers that produce consistent, structured JSON for your data pipelines!
--- ---
## 9. Conclusion ## 10. Conclusion
With **JsonCssExtractionStrategy** (or **JsonXPathExtractionStrategy**), you can build powerful, **LLM-free** pipelines that: With Crawl4AI's LLM-free extraction strategies - `JsonCssExtractionStrategy`, `JsonXPathExtractionStrategy`, and now `RegexExtractionStrategy` - you can build powerful pipelines that:
- Scrape any consistent site for structured data. - Scrape any consistent site for structured data.
- Support nested objects, repeating lists, or advanced transformations. - Support nested objects, repeating lists, or pattern-based extraction.
- Scale to thousands of pages quickly and reliably. - Scale to thousands of pages quickly and reliably.
**Next Steps**: **Choosing the Right Strategy**:
- Combine your extracted JSON with advanced filtering or summarization in a second pass if needed. - Use **`RegexExtractionStrategy`** for fast extraction of common data types like emails, phones, URLs, dates, etc.
- For dynamic pages, combine strategies with `js_code` or infinite scroll hooking to ensure all content is loaded. - Use **`JsonCssExtractionStrategy`** or **`JsonXPathExtractionStrategy`** for structured data with clear HTML patterns
- If you need both: first extract structured data with JSON strategies, then use regex on specific fields
**Remember**: For repeated, structured data, you dont need to pay for or wait on an LLM. A well-crafted schema plus CSS or XPath gets you the data faster, cleaner, and cheaper—**the real power** of Crawl4AI. **Remember**: For repeated, structured data, you don't need to pay for or wait on an LLM. Well-crafted schemas and regex patterns get you the data faster, cleaner, and cheaper—**the real power** of Crawl4AI.
**Last Updated**: 2025-01-01 **Last Updated**: 2025-05-02
--- ---
Thats it for **Extracting JSON (No LLM)**! Youve seen how schema-based approaches (either CSS or XPath) can handle everything from simple lists to deeply nested product catalogs—instantly, with minimal overhead. Enjoy building robust scrapers that produce consistent, structured JSON for your data pipelines! That's it for **Extracting JSON (No LLM)**! You've seen how schema-based approaches (either CSS or XPath) and regex patterns can handle everything from simple lists to deeply nested product catalogs—instantly, with minimal overhead. Enjoy building robust scrapers that produce consistent, structured JSON for your data pipelines!

View File

@@ -0,0 +1,32 @@
from crawl4ai import BrowserProfiler
import asyncio
if __name__ == "__main__":
# Example usage
profiler = BrowserProfiler()
# Create a new profile
import os
from pathlib import Path
home_dir = Path.home()
profile_path = asyncio.run(profiler.create_profile( str(home_dir / ".crawl4ai/profiles/test-profile")))
print(f"Profile created at: {profile_path}")
# # Launch a standalone browser
# asyncio.run(profiler.launch_standalone_browser())
# # List profiles
# profiles = profiler.list_profiles()
# for profile in profiles:
# print(f"Profile: {profile['name']}, Path: {profile['path']}")
# # Delete a profile
# success = profiler.delete_profile("my-profile")
# if success:
# print("Profile deleted successfully")
# else:
# print("Failed to delete profile")