co-mono/packages/pi-channels/src/adapters/transcription.ts
2026-03-05 18:58:27 -08:00

250 lines
7.9 KiB
TypeScript

/**
* pi-channels — Pluggable audio transcription.
*
* Supports three providers:
* - "apple" — macOS SFSpeechRecognizer (free, offline, no API key)
* - "openai" — Whisper API
* - "elevenlabs" — Scribe API
*
* Usage:
* const provider = createTranscriptionProvider(config);
* const result = await provider.transcribe("/path/to/audio.ogg", "en");
*/
import { execFile } from "node:child_process";
import * as fs from "node:fs";
import * as path from "node:path";
import type { TranscriptionConfig } from "../types.js";
// ── Public interface ────────────────────────────────────────────
export interface TranscriptionResult {
ok: boolean;
text?: string;
error?: string;
}
export interface TranscriptionProvider {
transcribe(filePath: string, language?: string): Promise<TranscriptionResult>;
}
/** Create a transcription provider from config. */
export function createTranscriptionProvider(config: TranscriptionConfig): TranscriptionProvider {
switch (config.provider) {
case "apple":
return new AppleProvider(config);
case "openai":
return new OpenAIProvider(config);
case "elevenlabs":
return new ElevenLabsProvider(config);
default:
throw new Error(`Unknown transcription provider: ${config.provider}`);
}
}
// ── Helpers ─────────────────────────────────────────────────────
/** Resolve "env:VAR_NAME" patterns to actual environment variable values. */
function resolveEnvValue(value: string | undefined): string | undefined {
if (!value) return undefined;
if (value.startsWith("env:")) {
const envVar = value.slice(4);
return process.env[envVar] || undefined;
}
return value;
}
function validateFile(filePath: string): TranscriptionResult | null {
if (!fs.existsSync(filePath)) {
return { ok: false, error: `File not found: ${filePath}` };
}
const stat = fs.statSync(filePath);
// 25MB limit (Whisper max; Telegram max is 20MB)
if (stat.size > 25 * 1024 * 1024) {
return { ok: false, error: `File too large: ${(stat.size / 1024 / 1024).toFixed(1)}MB (max 25MB)` };
}
if (stat.size === 0) {
return { ok: false, error: "File is empty" };
}
return null;
}
// ── Apple Provider ──────────────────────────────────────────────
const SWIFT_HELPER_SRC = path.join(import.meta.dirname, "transcribe-apple.swift");
const SWIFT_HELPER_BIN = path.join(import.meta.dirname, "transcribe-apple");
class AppleProvider implements TranscriptionProvider {
private language: string | undefined;
private compilePromise: Promise<TranscriptionResult> | null = null;
constructor(config: TranscriptionConfig) {
this.language = config.language;
}
async transcribe(filePath: string, language?: string): Promise<TranscriptionResult> {
if (process.platform !== "darwin") {
return { ok: false, error: "Apple transcription is only available on macOS" };
}
const fileErr = validateFile(filePath);
if (fileErr) return fileErr;
// Compile Swift helper on first use (promise-based lock prevents races)
if (!this.compilePromise) {
this.compilePromise = this.compileHelper();
}
const compileResult = await this.compilePromise;
if (!compileResult.ok) return compileResult;
const lang = language || this.language;
const args = [filePath];
if (lang) args.push(lang);
return new Promise((resolve) => {
execFile(SWIFT_HELPER_BIN, args, { timeout: 60_000 }, (err, stdout, stderr) => {
if (err) {
resolve({ ok: false, error: stderr?.trim() || err.message });
return;
}
const text = stdout.trim();
if (!text) {
resolve({ ok: false, error: "Transcription returned empty result" });
return;
}
resolve({ ok: true, text });
});
});
}
private compileHelper(): Promise<TranscriptionResult> {
// Skip if already compiled and binary exists
if (fs.existsSync(SWIFT_HELPER_BIN)) {
return Promise.resolve({ ok: true });
}
if (!fs.existsSync(SWIFT_HELPER_SRC)) {
return Promise.resolve({
ok: false,
error: `Swift helper source not found: ${SWIFT_HELPER_SRC}`,
});
}
return new Promise((resolve) => {
execFile(
"swiftc",
["-O", "-o", SWIFT_HELPER_BIN, SWIFT_HELPER_SRC],
{ timeout: 30_000 },
(err, _stdout, stderr) => {
if (err) {
resolve({ ok: false, error: `Failed to compile Swift helper: ${stderr?.trim() || err.message}` });
return;
}
resolve({ ok: true });
},
);
});
}
}
// ── OpenAI Provider ─────────────────────────────────────────────
class OpenAIProvider implements TranscriptionProvider {
private apiKey: string;
private model: string;
private language: string | undefined;
constructor(config: TranscriptionConfig) {
const key = resolveEnvValue(config.apiKey);
if (!key) throw new Error("OpenAI transcription requires apiKey");
this.apiKey = key;
this.model = config.model || "whisper-1";
this.language = config.language;
}
async transcribe(filePath: string, language?: string): Promise<TranscriptionResult> {
const fileErr = validateFile(filePath);
if (fileErr) return fileErr;
const lang = language || this.language;
try {
const form = new FormData();
const fileBuffer = fs.readFileSync(filePath);
const filename = path.basename(filePath);
form.append("file", new Blob([fileBuffer]), filename);
form.append("model", this.model);
if (lang) form.append("language", lang);
const response = await fetch("https://api.openai.com/v1/audio/transcriptions", {
method: "POST",
headers: { Authorization: `Bearer ${this.apiKey}` },
body: form,
});
if (!response.ok) {
const body = await response.text();
return { ok: false, error: `OpenAI API error (${response.status}): ${body.slice(0, 200)}` };
}
const data = (await response.json()) as { text?: string };
if (!data.text) {
return { ok: false, error: "OpenAI returned empty transcription" };
}
return { ok: true, text: data.text };
} catch (err: any) {
return { ok: false, error: `OpenAI transcription failed: ${err.message}` };
}
}
}
// ── ElevenLabs Provider ─────────────────────────────────────────
class ElevenLabsProvider implements TranscriptionProvider {
private apiKey: string;
private model: string;
private language: string | undefined;
constructor(config: TranscriptionConfig) {
const key = resolveEnvValue(config.apiKey);
if (!key) throw new Error("ElevenLabs transcription requires apiKey");
this.apiKey = key;
this.model = config.model || "scribe_v1";
this.language = config.language;
}
async transcribe(filePath: string, language?: string): Promise<TranscriptionResult> {
const fileErr = validateFile(filePath);
if (fileErr) return fileErr;
const lang = language || this.language;
try {
const form = new FormData();
const fileBuffer = fs.readFileSync(filePath);
const filename = path.basename(filePath);
form.append("file", new Blob([fileBuffer]), filename);
form.append("model_id", this.model);
if (lang) form.append("language_code", lang);
const response = await fetch("https://api.elevenlabs.io/v1/speech-to-text", {
method: "POST",
headers: { "xi-api-key": this.apiKey },
body: form,
});
if (!response.ok) {
const body = await response.text();
return { ok: false, error: `ElevenLabs API error (${response.status}): ${body.slice(0, 200)}` };
}
const data = (await response.json()) as { text?: string };
if (!data.text) {
return { ok: false, error: "ElevenLabs returned empty transcription" };
}
return { ok: true, text: data.text };
} catch (err: any) {
return { ok: false, error: `ElevenLabs transcription failed: ${err.message}` };
}
}
}