Initial commit: PodcastYes — AI podcast platform

2026-06-07 03:58:32 -04:00
commit 155507f21a
151 changed files with 19826 additions and 0 deletions
@@ -0,0 +1,52 @@
+import { prisma } from "@/lib/db";
+import type { TokenUsage } from "./types";
+
+// Rough 2026 unit prices (USD). Tune in one place; admin cost dashboards read AiCostLog.
+const PRICE = {
+  gptInputPer1k: 0.0025, // USD per 1,000 input tokens (applied in scriptCostUsd)
+  gptOutputPer1k: 0.01, // USD per 1,000 output tokens (applied in scriptCostUsd)
+  elevenPer1kChars: 0.3, // USD per 1,000 synthesized characters (applied in audioCostUsd)
+  dallePerImage: 0.04, // USD per generated image (applied in artCostUsd)
+};
+
+/**
+ * Estimate the USD cost of a chat completion from its token usage.
+ * Input and output tokens are priced separately (per 1,000 tokens) and the sum
+ * is rounded to four decimal places.
+ */
+export function scriptCostUsd(usage: TokenUsage): number {
+  const inputCost = (usage.inputTokens / 1000) * PRICE.gptInputPer1k;
+  const outputCost = (usage.outputTokens / 1000) * PRICE.gptOutputPer1k;
+  return round4(inputCost + outputCost);
+}
+
+/** Estimate the USD cost of synthesizing `characters` characters, rounded to four decimals. */
+export function audioCostUsd(characters: number): number {
+  const thousands = characters / 1000;
+  return round4(thousands * PRICE.elevenPer1kChars);
+}
+
+/** Estimate the USD cost of generating `images` images, rounded to four decimals. */
+export function artCostUsd(images: number): number {
+  const total = images * PRICE.dallePerImage;
+  return round4(total);
+}
+
+/** One usage/cost record, as accepted by `recordCost`. */
+export interface CostEntry {
+  provider: "openai" | "elevenlabs"; // vendor the spend is attributed to
+  operation: "script" | "audio" | "art" | "repurpose"; // which capability produced the spend
+  units: number; // provider-specific quantity; callers pass tokens, characters, or image count
+  costUsd: number; // estimated cost in USD; persisted with four decimals by recordCost
+  episodeId?: string; // optional episode the spend belongs to
+  userId?: string; // optional user the spend belongs to
+}
+
+/**
+ * Persist one AI usage/cost line to `AiCostLog` for the admin monitoring dashboard.
+ * `costUsd` is written as a fixed four-decimal string; all other fields pass through.
+ */
+export async function recordCost(entry: CostEntry): Promise<void> {
+  const { provider, operation, units, costUsd, episodeId, userId } = entry;
+  await prisma.aiCostLog.create({
+    data: {
+      provider,
+      operation,
+      units,
+      costUsd: costUsd.toFixed(4),
+      episodeId,
+      userId,
+    },
+  });
+}
+
+/** Round `n` to four decimal places. */
+function round4(n: number): number {
+  const scaled = Math.round(n * 10000);
+  return scaled / 10000;
+}
@@ -0,0 +1,45 @@
+import { spawn } from "node:child_process";
+
+const FFMPEG = process.env.FFMPEG_PATH ?? "ffmpeg"; // ffmpeg executable; FFMPEG_PATH overrides the bare command name
+const FFPROBE = process.env.FFPROBE_PATH ?? "ffprobe"; // ffprobe executable; FFPROBE_PATH overrides the bare command name
+
+/**
+ * Run ffmpeg with the given args.
+ *
+ * stdin and stdout are ignored; stderr is captured so a failure can be reported.
+ *
+ * @param args Command-line arguments passed to ffmpeg verbatim.
+ * @returns Resolves when ffmpeg exits with code 0.
+ * @throws Error (as a rejection) when the process cannot be spawned, exits
+ *   non-zero, or is terminated by a signal. The message carries the last 600
+ *   characters of stderr.
+ */
+export function runFfmpeg(args: string[]): Promise<void> {
+  // Only the tail of stderr is ever reported, so only the tail is retained;
+  // otherwise the buffer grows with everything ffmpeg logs during the run.
+  const STDERR_TAIL_CHARS = 600;
+  return new Promise((resolve, reject) => {
+    const proc = spawn(FFMPEG, args, { stdio: ["ignore", "ignore", "pipe"] });
+    let stderr = "";
+    // Decode at the stream level so a multi-byte character split across two
+    // chunks is not corrupted.
+    proc.stderr.setEncoding("utf8");
+    proc.stderr.on("data", (d: string) => {
+      stderr = (stderr + d).slice(-STDERR_TAIL_CHARS);
+    });
+    proc.on("error", (err) =>
+      reject(new Error(`Failed to spawn ffmpeg (${FFMPEG}): ${err.message}`))
+    );
+    proc.on("close", (code, signal) => {
+      if (code === 0) {
+        resolve();
+        return;
+      }
+      // `code` is null when the process was terminated by a signal.
+      const how = code === null ? `was killed by ${signal ?? "a signal"}` : `exited ${code}`;
+      reject(new Error(`ffmpeg ${how}: ${stderr}`));
+    });
+  });
+}
+
+/**
+ * Probe a media file's duration with ffprobe.
+ *
+ * @param file Path of the file to probe.
+ * @returns The duration rounded to whole seconds, or `null` when ffprobe cannot
+ *   be spawned or its output is not a finite number. Never rejects.
+ */
+export function ffprobeDuration(file: string): Promise<number | null> {
+  return new Promise((resolve) => {
+    const proc = spawn(
+      FFPROBE,
+      [
+        "-v",
+        "error",
+        "-show_entries",
+        "format=duration",
+        "-of",
+        "default=noprint_wrappers=1:nokey=1",
+        file,
+      ],
+      // Only stdout is read. Leaving stderr as an unread pipe lets the child
+      // block once the pipe buffer fills, so discard it (and stdin) instead.
+      { stdio: ["ignore", "pipe", "ignore"] }
+    );
+    let out = "";
+    proc.stdout.on("data", (d) => {
+      out += d.toString();
+    });
+    // Spawn failure (e.g. ffprobe not installed) is reported as "unknown duration".
+    proc.on("error", () => resolve(null));
+    proc.on("close", () => {
+      const n = parseFloat(out.trim());
+      resolve(Number.isFinite(n) ? Math.round(n) : null);
+    });
+  });
+}
@@ -0,0 +1,16 @@
+import OpenAI from "openai";
+
+let client: OpenAI | null = null;
+
+/**
+ * Return the shared OpenAI client (used for GPT-4 scripts and DALL·E art),
+ * building it on first call and reusing it afterwards.
+ *
+ * @throws Error if `OPENAI_API_KEY` is not set when the client is first built.
+ */
+export function openai(): OpenAI {
+  if (client) return client;
+  const apiKey = process.env.OPENAI_API_KEY;
+  if (!apiKey) throw new Error("OPENAI_API_KEY is not set");
+  client = new OpenAI({ apiKey });
+  return client;
+}
+
+export const SCRIPT_MODEL = process.env.OPENAI_SCRIPT_MODEL ?? "gpt-4o"; // chat model id; OPENAI_SCRIPT_MODEL overrides the default
+export const ART_MODEL = process.env.OPENAI_ART_MODEL ?? "dall-e-3"; // image model id; OPENAI_ART_MODEL overrides the default
@@ -0,0 +1,228 @@
+import { Prisma } from "@prisma/client";
+import { prisma } from "@/lib/db";
+import { setEpisodeStatus } from "@/lib/episodes/status";
+import { scriptProvider, audioProvider, artProvider } from "@/lib/ai/providers";
+import { buildCoverPrompt } from "@/lib/ai/providers/openai-art";
+import { segmentScript } from "./segment";
+import { stitchMp3 } from "./stitch";
+import { storage, assetKey } from "@/lib/storage";
+import { recordCost, scriptCostUsd, audioCostUsd, artCostUsd } from "@/lib/ai/cost";
+import { incrementUsage } from "@/lib/usage/meter";
+import { sendEmail, emailLayout } from "@/lib/email";
+import { DEFAULT_VOICE_IDS } from "@/lib/ai/voices";
+import type { EpisodeConfig, StructuredScript } from "@/lib/ai/types";
+import type { GenerationType } from "@/lib/queue/jobs";
+
+/** An Episode row loaded together with its `speakers` and `user` relations. */
+type EpisodeWithRelations = Prisma.EpisodeGetPayload<{
+  include: { speakers: true; user: true };
+}>;
+
+/**
+ * Entry point of the episode generation pipeline (invoked by the worker).
+ *
+ * Stage order: script → segment → synthesize → stitch → art → save → meter.
+ * `type` picks the stages: "full" runs everything, "script" regenerates the
+ * script and then the audio, "audio" only the audio, "art" only the cover.
+ * After the selected stages the episode is metered, marked READY, and the
+ * owner is notified by email.
+ *
+ * @throws Error if the episode does not exist; stage errors propagate.
+ */
+export async function runEpisodeGeneration(
+  episodeId: string,
+  type: GenerationType = "full"
+): Promise<void> {
+  const episode = await loadEpisode(episodeId);
+  const config = toConfig(episode);
+
+  // Decide the stage set up front; a failing stage throws before it is used.
+  const isFull = type === "full";
+  const did = {
+    script: isFull || type === "script",
+    audio: isFull || type === "script" || type === "audio",
+    art: isFull || type === "art",
+  };
+
+  if (did.script) await generateScript(episode, config);
+  if (did.audio) await generateAudio(episode);
+  if (did.art) await generateArt(episode);
+
+  await setEpisodeStatus(episodeId, "SAVING", { stage: "Finalizing your episode" });
+  await meter(episode, did);
+  await setEpisodeStatus(episodeId, "READY", { stage: "Done" });
+  await notifyReady(episode);
+}
+
+/**
+ * Fetch an episode with its speakers and user.
+ * @throws Error when no episode has the given id.
+ */
+async function loadEpisode(episodeId: string): Promise<EpisodeWithRelations> {
+  const found = await prisma.episode.findUnique({
+    where: { id: episodeId },
+    include: { speakers: true, user: true },
+  });
+  if (found) return found;
+  throw new Error(`Episode ${episodeId} not found`);
+}
+
+/**
+ * Project an episode row onto the provider-facing `EpisodeConfig`.
+ * An episode without speakers gets a single default "host" speaker.
+ */
+function toConfig(episode: EpisodeWithRelations): EpisodeConfig {
+  const { title, topic, tone, format, language, targetLengthMin } = episode;
+  const speakers = episode.speakers.map(({ speakerKey, displayName }) => ({
+    speakerKey,
+    displayName,
+  }));
+  if (speakers.length === 0) speakers.push({ speakerKey: "host", displayName: "Host" });
+  return {
+    title,
+    topic,
+    tone,
+    format,
+    language,
+    targetLengthMin,
+    audience: episode.audience ?? undefined,
+    speakers,
+  };
+}
+
+// ─────────────── Stage 1: script ───────────────
+/**
+ * Generate the episode script, persist it, and log its cost.
+ *
+ * The Script row is upserted: the first run creates it, later runs replace the
+ * content, bump `version`, and refresh `model` so the row always names the
+ * model that produced the stored content. When the episode has no title, the
+ * generated title is saved and also written onto the in-memory `episode`.
+ */
+async function generateScript(episode: EpisodeWithRelations, config: EpisodeConfig) {
+  await setEpisodeStatus(episode.id, "SCRIPTING", { stage: "Writing the script" });
+  const provider = scriptProvider();
+  const { script, usage } = await provider.generate(config);
+  const content = script as unknown as Prisma.InputJsonValue;
+
+  await prisma.script.upsert({
+    where: { episodeId: episode.id },
+    create: {
+      episodeId: episode.id,
+      content,
+      model: provider.model,
+    },
+    // `model` is updated too; otherwise a regenerated script keeps the old model name.
+    update: { content, model: provider.model, version: { increment: 1 } },
+  });
+
+  // Adopt the generated title when the user didn't set one.
+  if (!episode.title?.trim() && script.title) {
+    await prisma.episode.update({ where: { id: episode.id }, data: { title: script.title } });
+    episode.title = script.title;
+  }
+
+  await recordCost({
+    provider: "openai",
+    operation: "script",
+    units: usage.inputTokens + usage.outputTokens,
+    costUsd: scriptCostUsd(usage),
+    episodeId: episode.id,
+    userId: episode.userId,
+  });
+}
+
+// ─────────────── Stages 2–4: segment → synthesize → stitch ───────────────
+/**
+ * Turn the stored script into one episode MP3.
+ *
+ * Loads the Script row, maps speakers to voices, splits the script into
+ * request-sized segments, synthesizes them in order, stitches the results,
+ * uploads the MP3, and upserts the AudioAsset row.
+ *
+ * Synthesis cost is logged as soon as characters have been synthesized, even
+ * when a later segment, the stitch, or the upload fails, so the cost log covers
+ * everything that was actually sent to the provider.
+ *
+ * @throws Error when no script exists or the script yields no spoken lines;
+ *   provider, ffmpeg, storage, and database errors propagate.
+ */
+async function generateAudio(episode: EpisodeWithRelations) {
+  await setEpisodeStatus(episode.id, "SYNTHESIZING", { stage: "Recording the audio" });
+
+  const scriptRow = await prisma.script.findUnique({ where: { episodeId: episode.id } });
+  if (!scriptRow) throw new Error("Cannot synthesize audio before a script exists");
+  const script = scriptRow.content as unknown as StructuredScript;
+
+  const voiceMap: Record<string, string> = {};
+  for (const s of episode.speakers) voiceMap[s.speakerKey] = s.elevenVoiceId;
+  const fallback =
+    episode.speakers[0]?.elevenVoiceId ?? DEFAULT_VOICE_IDS.host;
+
+  const provider = audioProvider();
+  const segments = segmentScript(script, voiceMap, fallback, provider.maxCharsPerRequest);
+  if (segments.length === 0) throw new Error("Script produced no spoken lines");
+
+  const buffers: Buffer[] = [];
+  let totalChars = 0;
+  const logSynthesisCost = () =>
+    recordCost({
+      provider: "elevenlabs",
+      operation: "audio",
+      units: totalChars,
+      costUsd: audioCostUsd(totalChars),
+      episodeId: episode.id,
+      userId: episode.userId,
+    });
+
+  try {
+    for (const seg of segments) {
+      // One voice: plain TTS over the joined text. Several voices: dialogue endpoint.
+      const res =
+        seg.uniqueVoices <= 1
+          ? await provider.synthesizeSpeech(
+              seg.turns.map((t) => t.text).join(" "),
+              seg.turns[0].voiceId,
+              { language: episode.language }
+            )
+          : await provider.synthesizeDialogue(seg.turns, { language: episode.language });
+      buffers.push(res.audio);
+      totalChars += res.characters;
+    }
+  } catch (err) {
+    // Earlier segments were already synthesized. Log them best-effort so the
+    // synthesis error is the one that propagates.
+    if (totalChars > 0) {
+      try {
+        await logSynthesisCost();
+      } catch (costErr) {
+        console.error("[generateAudio] cost log failed (non-fatal)", costErr);
+      }
+    }
+    throw err;
+  }
+
+  // Log before stitching/upload so a failure there cannot drop the cost row.
+  await logSynthesisCost();
+
+  await setEpisodeStatus(episode.id, "STITCHING", { stage: "Mixing the audio" });
+  const { data, durationSec } = await stitchMp3(buffers);
+
+  const key = assetKey("mp3", `${episode.id}.mp3`);
+  await storage().put(key, data, "audio/mpeg");
+
+  await prisma.audioAsset.upsert({
+    where: { episodeId: episode.id },
+    create: {
+      episodeId: episode.id,
+      storageKey: key,
+      durationSec,
+      sizeBytes: data.length,
+      segments: { count: segments.length } as Prisma.InputJsonValue,
+    },
+    update: {
+      storageKey: key,
+      durationSec,
+      sizeBytes: data.length,
+      segments: { count: segments.length } as Prisma.InputJsonValue,
+    },
+  });
+}
+
+// ─────────────── Stage 5: cover art ───────────────
+/**
+ * Generate cover art, store the PNG, and upsert the CoverArt row.
+ *
+ * The cost is logged right after the image is generated, so a storage or
+ * database failure afterwards cannot drop it. On regeneration the row's `model`
+ * is refreshed together with the prompt and storage key.
+ */
+async function generateArt(episode: EpisodeWithRelations) {
+  await setEpisodeStatus(episode.id, "ART", { stage: "Designing the cover art" });
+  const provider = artProvider();
+  const prompt = buildCoverPrompt(episode.topic, episode.tone, episode.title);
+  const { data, revisedPrompt } = await provider.generateCover(prompt);
+
+  await recordCost({
+    provider: "openai",
+    operation: "art",
+    units: 1,
+    costUsd: artCostUsd(1),
+    episodeId: episode.id,
+    userId: episode.userId,
+  });
+
+  const key = assetKey("art", `${episode.id}.png`);
+  await storage().put(key, data, "image/png");
+
+  // Store the provider's revised prompt when it returns one.
+  const storedPrompt = revisedPrompt ?? prompt;
+  await prisma.coverArt.upsert({
+    where: { episodeId: episode.id },
+    create: { episodeId: episode.id, storageKey: key, prompt: storedPrompt, model: provider.model },
+    // `model` is updated too; otherwise regenerated art keeps the old model name.
+    update: { storageKey: key, prompt: storedPrompt, model: provider.model },
+  });
+}
+
+// ─────────────── Stage 7: meter ───────────────
+/**
+ * Count each stage that ran against the owner's usage. The owner is the
+ * episode's organization when it has one, otherwise its user.
+ */
+async function meter(
+  episode: EpisodeWithRelations,
+  did: { script: boolean; audio: boolean; art: boolean }
+) {
+  const orgId = episode.organizationId;
+  const ownerId = orgId ?? episode.userId;
+  const ownerType = orgId ? "organization" : "user";
+  // Sequential on purpose: same order as the pipeline stages.
+  for (const kind of ["script", "audio", "art"] as const) {
+    if (did[kind]) await incrementUsage(ownerId, ownerType, kind);
+  }
+}
+
+/**
+ * Email the episode's user that generation finished. Best-effort: any failure
+ * while building or sending the email is logged and swallowed.
+ */
+async function notifyReady(episode: EpisodeWithRelations) {
+  const appUrl = process.env.NEXT_PUBLIC_APP_URL ?? "http://localhost:3000";
+  try {
+    const html = emailLayout(
+      "Your episode is ready",
+      `“${episode.title}” has finished generating — script, audio, and cover art are all set.`,
+      { label: "Open episode", url: `${appUrl}/episodes/${episode.id}` }
+    );
+    const text = `Your episode "${episode.title}" is ready: ${appUrl}/episodes/${episode.id}`;
+    await sendEmail({
+      to: episode.user.email,
+      subject: `🎙️ "${episode.title}" is ready`,
+      html,
+      text,
+    });
+  } catch (err) {
+    console.error("[notifyReady] email failed (non-fatal)", err);
+  }
+}
@@ -0,0 +1,53 @@
+import { z } from "zod";
+import { openai, SCRIPT_MODEL } from "@/lib/ai/openai";
+import type { StructuredScript, TokenUsage } from "@/lib/ai/types";
+
+/** Output formats an episode script can be repurposed into. */
+export type RepurposeFormat = "blog" | "social_thread" | "newsletter";
+
+// Per-format instruction placed at the top of the user message in repurposeScript.
+const FORMAT_PROMPTS: Record<RepurposeFormat, string> = {
+  blog: "Write an engaging, SEO-friendly blog post based on this episode. Include a compelling title and well-structured markdown body with headings and a short conclusion.",
+  social_thread:
+    "Write a punchy social thread (6–10 posts, numbered) summarizing the episode's best insights. Start with a strong hook. Put the whole thread in the markdown body.",
+  newsletter:
+    "Write a friendly email newsletter edition about this episode: a subject line as the title, a short intro, 3–4 key takeaways as bullets, and a call-to-action to listen. Markdown body.",
+};
+
+// Shape the model's JSON reply must satisfy: non-empty `title` and `body`.
+const outputSchema = z.object({ title: z.string().min(1), body: z.string().min(1) });
+/** Validated repurposed content: a title plus a markdown body. */
+export type RepurposedOutput = z.infer<typeof outputSchema>;
+
+/**
+ * Flatten a script into a plain transcript: each section becomes a `## title`
+ * heading followed by its turns, one per line; sections are separated by a
+ * blank line. Speaker keys are not included.
+ */
+function scriptToText(script: StructuredScript): string {
+  const blocks: string[] = [];
+  for (const section of script.sections) {
+    const lines = section.turns.map((turn) => turn.text);
+    blocks.push(`## ${section.title}\n${lines.join("\n")}`);
+  }
+  return blocks.join("\n\n");
+}
+
+/**
+ * Ask the chat model to rewrite an episode as a blog post, social thread, or
+ * newsletter.
+ *
+ * Only the first 9,000 characters of the flattened transcript are sent.
+ *
+ * @returns The validated `{ title, body }` content and the token usage reported
+ *   by the API (0 when absent).
+ * @throws If the reply is not valid JSON or does not match the output schema.
+ */
+export async function repurposeScript(
+  script: StructuredScript,
+  format: RepurposeFormat
+): Promise<{ content: RepurposedOutput; usage: TokenUsage }> {
+  const transcript = scriptToText(script).slice(0, 9000);
+  const systemPrompt =
+    "You are a content marketer who repurposes podcast episodes into other formats. Return STRICT JSON: { \"title\": string, \"body\": string } where body is markdown.";
+  const userPrompt = `${FORMAT_PROMPTS[format]}\n\nEpisode title: ${script.title}\n\nTranscript:\n${transcript}`;
+
+  const completion = await openai().chat.completions.create({
+    model: SCRIPT_MODEL,
+    messages: [
+      { role: "system", content: systemPrompt },
+      { role: "user", content: userPrompt },
+    ],
+    response_format: { type: "json_object" },
+    temperature: 0.7,
+  });
+
+  // A missing message falls back to "{}", which then fails schema validation.
+  const raw = completion.choices[0]?.message?.content ?? "{}";
+  const content = outputSchema.parse(JSON.parse(raw));
+  const usage: TokenUsage = {
+    inputTokens: completion.usage?.prompt_tokens ?? 0,
+    outputTokens: completion.usage?.completion_tokens ?? 0,
+  };
+  return { content, usage };
+}
@@ -0,0 +1,110 @@
+import type { DialogueTurn, ScriptSection, StructuredScript } from "../types";
+
+/** A group of dialogue turns produced by `segmentTurns`. */
+export interface AudioSegment {
+  turns: DialogueTurn[]; // turns in this segment, in script order
+  characters: number; // sum of the turns' text lengths
+  /** Distinct voices used in this segment (drives speech vs dialogue choice). */
+  uniqueVoices: number;
+}
+
+/**
+ * Flatten a script into an ordered list of voiced turns.
+ *
+ * Text is trimmed and turns that end up empty are dropped. A speaker key
+ * missing from `voiceMap` gets `fallbackVoiceId`.
+ */
+export function flattenTurns(
+  script: StructuredScript,
+  voiceMap: Record<string, string>,
+  fallbackVoiceId: string
+): DialogueTurn[] {
+  return script.sections.flatMap((section) =>
+    section.turns.flatMap((turn) => {
+      const text = turn.text.trim();
+      return text ? [{ text, voiceId: voiceMap[turn.speakerKey] ?? fallbackVoiceId }] : [];
+    })
+  );
+}
+
+/**
+ * Break an over-long chunk into pieces of at most `limit` characters, cutting
+ * at the last whitespace inside each window. A run with no whitespace is cut at
+ * `limit`, backing off one character rather than splitting a surrogate pair.
+ */
+function wrapOnWhitespace(chunk: string, limit: number): string[] {
+  const whitespace = /\s/;
+  const pieces: string[] = [];
+  let rest = chunk.trim();
+  while (rest.length > limit) {
+    // Scan back from the limit for a whitespace character to cut on.
+    let cut = limit;
+    while (cut > 0 && !whitespace.test(rest.charAt(cut))) cut--;
+    if (cut === 0) {
+      cut = limit;
+      const last = rest.charCodeAt(cut - 1);
+      if (cut > 1 && last >= 0xd800 && last <= 0xdbff) cut--;
+    }
+    pieces.push(rest.slice(0, cut).trimEnd());
+    rest = rest.slice(cut).trimStart();
+  }
+  if (rest) pieces.push(rest);
+  return pieces;
+}
+
+/**
+ * Split text longer than `maxChars` into pieces of at most `maxChars`.
+ *
+ * Sentences (runs ending in `.`, `!` or `?`) are packed greedily into each
+ * piece. A single sentence longer than the limit is wrapped on whitespace, so
+ * words are not cut in the middle unless one word alone exceeds the limit.
+ *
+ * @param text Text to split; returned unchanged as a one-element array when it
+ *   already fits.
+ * @param maxChars Maximum length of each piece; fractional values are floored.
+ * @returns Non-empty, trimmed pieces in original order.
+ * @throws RangeError when `text` does not fit and `maxChars` is below 1 or NaN
+ *   (no split could make progress).
+ */
+export function splitLongText(text: string, maxChars: number): string[] {
+  if (text.length <= maxChars) return [text];
+  if (!(maxChars >= 1)) {
+    throw new RangeError(`maxChars must be at least 1, got ${maxChars}`);
+  }
+  const limit = Math.floor(maxChars);
+
+  const sentences = text.match(/[^.!?]+[.!?]*\s*/g) ?? [text];
+  const parts: string[] = [];
+  let current = "";
+  for (const sentence of sentences) {
+    if (sentence.length > limit) {
+      // A single very long sentence: flush what we have, then wrap on whitespace.
+      if (current) {
+        parts.push(current.trim());
+        current = "";
+      }
+      parts.push(...wrapOnWhitespace(sentence, limit));
+      continue;
+    }
+    if ((current + sentence).length > limit) {
+      parts.push(current.trim());
+      current = sentence;
+    } else {
+      current += sentence;
+    }
+  }
+  if (current.trim()) parts.push(current.trim());
+  return parts.filter(Boolean);
+}
+
+/**
+ * Pack dialogue turns into segments whose combined text length stays within
+ * `maxChars`.
+ *
+ * Oversized turns are first split into several sub-turns that keep the same
+ * voice. Turns are then added greedily, in order; a new segment starts when the
+ * next turn would push the current one past the limit. Each segment is later
+ * sent to ElevenLabs as one request and the resulting MP3s are stitched.
+ */
+export function segmentTurns(turns: DialogueTurn[], maxChars: number): AudioSegment[] {
+  const pieces: DialogueTurn[] = turns.flatMap((turn) =>
+    splitLongText(turn.text, maxChars).map((text) => ({ text, voiceId: turn.voiceId }))
+  );
+
+  const segments: AudioSegment[] = [];
+  let pending: DialogueTurn[] = [];
+  let pendingChars = 0;
+
+  // Close the segment being built (if any) and start a fresh one.
+  const seal = (): void => {
+    if (pending.length === 0) return;
+    const voices = new Set(pending.map((t) => t.voiceId));
+    segments.push({ turns: pending, characters: pendingChars, uniqueVoices: voices.size });
+    pending = [];
+    pendingChars = 0;
+  };
+
+  for (const piece of pieces) {
+    const wouldOverflow = pendingChars + piece.text.length > maxChars;
+    if (wouldOverflow && pending.length > 0) seal();
+    pending.push(piece);
+    pendingChars += piece.text.length;
+  }
+  seal();
+  return segments;
+}
+
+/** Shortcut: flatten a script into voiced turns, then pack them into audio segments. */
+export function segmentScript(
+  script: StructuredScript,
+  voiceMap: Record<string, string>,
+  fallbackVoiceId: string,
+  maxChars: number
+): AudioSegment[] {
+  const turns = flattenTurns(script, voiceMap, fallbackVoiceId);
+  return segmentTurns(turns, maxChars);
+}
+
+/** Sum of the text lengths of every turn in the given sections (for cost/limit estimation). */
+export function totalCharacters(sections: ScriptSection[]): number {
+  let total = 0;
+  for (const section of sections) {
+    for (const turn of section.turns) total += turn.text.length;
+  }
+  return total;
+}
@@ -0,0 +1,60 @@
+import { promises as fs } from "node:fs";
+import os from "node:os";
+import path from "node:path";
+import { runFfmpeg, ffprobeDuration } from "../ffmpeg";
+
+/**
+ * Join per-segment MP3 buffers into a single normalized episode MP3.
+ *
+ * The segments are written to a temporary directory and fed to ffmpeg's concat
+ * demuxer, then re-encoded (not stream-copied) in one libmp3lame pass with
+ * loudness normalization. Re-encoding gives a uniform codec/bitrate and avoids
+ * the header/timestamp glitches that `-c copy` concat can produce. The
+ * temporary directory is removed afterwards, whether or not ffmpeg succeeded.
+ *
+ * @returns The encoded MP3 bytes and its probed duration in seconds (or null).
+ * @throws Error when `segments` is empty; ffmpeg and filesystem errors propagate.
+ */
+export async function stitchMp3(
+  segments: Buffer[]
+): Promise<{ data: Buffer; durationSec: number | null }> {
+  if (segments.length === 0) throw new Error("No audio segments to stitch");
+
+  const workDir = await fs.mkdtemp(path.join(os.tmpdir(), "podcastyes-"));
+  try {
+    // Write each segment and build its concat-list entry in the same pass.
+    const listLines: string[] = [];
+    for (const [index, segment] of segments.entries()) {
+      const segPath = path.join(workDir, `seg_${String(index).padStart(4, "0")}.mp3`);
+      await fs.writeFile(segPath, segment);
+      // Forward slashes so the list parses on Windows and Linux; single quotes escaped.
+      const listed = segPath.split(path.sep).join("/").replace(/'/g, "'\\''");
+      listLines.push(`file '${listed}'`);
+    }
+
+    const listPath = path.join(workDir, "list.txt");
+    await fs.writeFile(listPath, listLines.join("\n"));
+
+    const outPath = path.join(workDir, "episode.mp3");
+    const ffmpegArgs = [
+      "-y",
+      "-f",
+      "concat",
+      "-safe",
+      "0",
+      "-i",
+      listPath,
+      "-af",
+      "loudnorm=I=-16:TP=-1.5:LRA=11",
+      "-c:a",
+      "libmp3lame",
+      "-b:a",
+      "128k",
+      "-ar",
+      "44100",
+      outPath,
+    ];
+    await runFfmpeg(ffmpegArgs);
+
+    const data = await fs.readFile(outPath);
+    const durationSec = await ffprobeDuration(outPath);
+    return { data, durationSec };
+  } finally {
+    // Cleanup failures are ignored on purpose.
+    await fs.rm(workDir, { recursive: true, force: true }).catch(() => {});
+  }
+}
@@ -0,0 +1,93 @@
+import type { EpisodeConfig, StructuredScript } from "../types";
+
+// Per-format instruction appended to the "Format:" line of the script prompt.
+const FORMAT_GUIDANCE: Record<EpisodeConfig["format"], string> = {
+  SOLO: "A single host speaking directly to the listener. Use only the host speaker.",
+  INTERVIEW:
+    "A host interviewing a guest. Alternate naturally between host questions and guest answers.",
+  MULTI_HOST:
+    "A panel of co-hosts in lively conversation. Distribute lines across all speakers and let them react to each other.",
+};
+
+/** Target word count for an episode of `minutes` minutes, at roughly 150 spoken words per minute. */
+function wordBudget(minutes: number): number {
+  const wordsPerMinute = 150;
+  return Math.round(minutes * wordsPerMinute);
+}
+
+/**
+ * Build the chat messages (system + user) that ask the model for a complete
+ * episode script as JSON.
+ *
+ * The user message lists topic, tone, format guidance, optional audience,
+ * target length, and the allowed speaker keys, followed by the expected JSON
+ * shape. Blank lines separate the groups; the audience line is left out
+ * entirely when no audience is configured.
+ */
+export function buildScriptMessages(config: EpisodeConfig) {
+  const speakerList = config.speakers
+    .map((s) => `- key "${s.speakerKey}" = ${s.displayName}`)
+    .join("\n");
+
+  const system = [
+    "You are an expert podcast scriptwriter and showrunner.",
+    "You write natural, engaging, spoken-word scripts that sound great when read aloud by AI voices.",
+    "Avoid stage directions, sound-effect notes, and parentheticals — output only spoken dialogue.",
+    "Return STRICT JSON only, matching the requested schema. Do not include markdown fences.",
+  ].join(" ");
+
+  // The optional line is spread in conditionally. Filtering the whole array for
+  // falsy entries would also remove the intentional "" blank-line separators.
+  const user = [
+    `Write a complete podcast episode script in ${config.language}.`,
+    "",
+    `Topic: ${config.topic}`,
+    `Tone: ${config.tone}`,
+    `Format: ${config.format} — ${FORMAT_GUIDANCE[config.format]}`,
+    ...(config.audience ? [`Target audience: ${config.audience}`] : []),
+    `Approximate length: ${config.targetLengthMin} minutes (~${wordBudget(
+      config.targetLengthMin
+    )} words total).`,
+    "",
+    "Speakers (use ONLY these keys in `speakerKey`):",
+    speakerList,
+    "",
+    "Structure the episode into 3–6 sections (e.g. intro, main segments, outro).",
+    "Each section has a short title and a list of turns. Each turn is one speaker's spoken line.",
+    "",
+    "Return JSON with this exact shape:",
+    `{
+  "title": "string — a catchy episode title",
+  "sections": [
+    {
+      "id": "kebab-case-id",
+      "title": "string",
+      "turns": [
+        { "speakerKey": "host", "text": "spoken line..." }
+      ]
+    }
+  ]
+}`,
+  ].join("\n");
+
+  return [
+    { role: "system" as const, content: system },
+    { role: "user" as const, content: user },
+  ];
+}
+
+/**
+ * Build the chat messages that ask the model to rewrite one section of an
+ * existing script.
+ *
+ * When the section is found, its current turns are included so the model has
+ * the material it is asked to rewrite. When it is not found, the prompt falls
+ * back to the bare section id and carries no content.
+ *
+ * @param config Episode configuration (tone, language, speakers).
+ * @param script The existing script that contains the section.
+ * @param sectionId Id of the section to rewrite; echoed into the requested JSON shape.
+ */
+export function buildSectionMessages(
+  config: EpisodeConfig,
+  script: StructuredScript,
+  sectionId: string
+) {
+  const section = script.sections.find((s) => s.id === sectionId);
+  const speakerList = config.speakers.map((s) => `"${s.speakerKey}"=${s.displayName}`).join(", ");
+
+  const system =
+    "You are an expert podcast scriptwriter. Rewrite a single section of an existing episode, keeping the same speakers, tone, and language. Return STRICT JSON for just that one section.";
+
+  // Without the current turns the model would be rewriting from the title alone.
+  const currentContent = section
+    ? [
+        "",
+        "Current section content:",
+        ...section.turns.map((t) => `${t.speakerKey}: ${t.text}`),
+      ]
+    : [];
+
+  const user = [
+    `Episode title: ${script.title}`,
+    `Tone: ${config.tone}. Language: ${config.language}. Speakers: ${speakerList}.`,
+    "",
+    `Rewrite the section titled "${section?.title ?? sectionId}" (id "${sectionId}") to be fresh and engaging while serving the same purpose in the episode.`,
+    ...currentContent,
+    "",
+    "Return JSON with this exact shape:",
+    `{ "id": "${sectionId}", "title": "string", "turns": [ { "speakerKey": "host", "text": "..." } ] }`,
+  ].join("\n");
+
+  return [
+    { role: "system" as const, content: system },
+    { role: "user" as const, content: user },
+  ];
+}
@@ -0,0 +1,97 @@
+import type { AudioProvider, DialogueTurn, Voice } from "../types";
+
+const API = "https://api.elevenlabs.io/v1"; // base URL for every request in this file
+const TTS_MODEL = process.env.ELEVENLABS_TTS_MODEL ?? "eleven_multilingual_v2"; // model_id sent by synthesizeSpeech
+const DIALOGUE_MODEL = process.env.ELEVENLABS_DIALOGUE_MODEL ?? "eleven_v3"; // model_id sent by synthesizeDialogue
+const OUTPUT_FORMAT = "mp3_44100_128"; // output_format query value used for both synthesis endpoints
+
+/**
+ * Read the ElevenLabs API key from the environment on each call.
+ * @throws Error if `ELEVENLABS_API_KEY` is unset or empty.
+ */
+function apiKey(): string {
+  const key = process.env.ELEVENLABS_API_KEY;
+  if (key) return key;
+  throw new Error("ELEVENLABS_API_KEY is not set");
+}
+
+/** The fields of a voice entry in the `/voices` response that `listVoices` reads. */
+interface ElevenVoice {
+  voice_id: string;
+  name: string;
+  preview_url?: string;
+  labels?: Record<string, string>; // listVoices reads the gender, accent and description keys
+}
+
+/**
+ * `AudioProvider` backed by the ElevenLabs HTTP API.
+ *
+ * Every method throws an Error carrying the HTTP status and response body when
+ * the API answers with a non-2xx status, and throws if the API key is not set.
+ */
+export class ElevenLabsAudioProvider implements AudioProvider {
+  // Kept safely under the ~2,000-char dialogue limit to leave headroom.
+  readonly maxCharsPerRequest = 1800;
+
+  /**
+   * Synthesize `text` with a single voice.
+   * @param _opts Accepted for interface compatibility; currently unused.
+   * @returns The MP3 bytes and the number of characters sent.
+   */
+  async synthesizeSpeech(
+    text: string,
+    voiceId: string,
+    _opts?: { language?: string }
+  ): Promise<{ audio: Buffer; characters: number }> {
+    // The voice id is a path segment; encode it so an unexpected value cannot
+    // change the request path or query.
+    const res = await fetch(
+      `${API}/text-to-speech/${encodeURIComponent(voiceId)}?output_format=${OUTPUT_FORMAT}`,
+      {
+        method: "POST",
+        headers: {
+          "xi-api-key": apiKey(),
+          "Content-Type": "application/json",
+          accept: "audio/mpeg",
+        },
+        body: JSON.stringify({
+          text,
+          model_id: TTS_MODEL,
+          voice_settings: { stability: 0.5, similarity_boost: 0.75 },
+        }),
+      }
+    );
+    if (!res.ok) throw new Error(`ElevenLabs TTS ${res.status}: ${await safeText(res)}`);
+    return { audio: Buffer.from(await res.arrayBuffer()), characters: text.length };
+  }
+
+  /**
+   * Synthesize a multi-voice exchange in one request.
+   * @param _opts Accepted for interface compatibility; currently unused.
+   * @returns The MP3 bytes and the summed text length of all turns.
+   */
+  async synthesizeDialogue(
+    turns: DialogueTurn[],
+    _opts?: { language?: string }
+  ): Promise<{ audio: Buffer; characters: number }> {
+    const res = await fetch(`${API}/text-to-dialogue?output_format=${OUTPUT_FORMAT}`, {
+      method: "POST",
+      headers: {
+        "xi-api-key": apiKey(),
+        "Content-Type": "application/json",
+        accept: "audio/mpeg",
+      },
+      body: JSON.stringify({
+        inputs: turns.map((t) => ({ text: t.text, voice_id: t.voiceId })),
+        model_id: DIALOGUE_MODEL,
+      }),
+    });
+    if (!res.ok) throw new Error(`ElevenLabs dialogue ${res.status}: ${await safeText(res)}`);
+    const characters = turns.reduce((n, t) => n + t.text.length, 0);
+    return { audio: Buffer.from(await res.arrayBuffer()), characters };
+  }
+
+  /** List the voices returned by the `/voices` endpoint, mapped to `Voice`. */
+  async listVoices(): Promise<Voice[]> {
+    const res = await fetch(`${API}/voices`, { headers: { "xi-api-key": apiKey() } });
+    // Include the response body, as the synthesis methods do.
+    if (!res.ok) throw new Error(`ElevenLabs voices ${res.status}: ${await safeText(res)}`);
+    const data = (await res.json()) as { voices?: ElevenVoice[] };
+    return (data.voices ?? []).map((v) => ({
+      id: v.voice_id,
+      name: v.name,
+      gender: normalizeGender(v.labels?.gender),
+      accent: v.labels?.accent,
+      description: v.labels?.description,
+      previewUrl: v.preview_url,
+    }));
+  }
+}
+
+/** Map a gender label to `Voice["gender"]`: "male" and "female" pass through, anything else is "neutral". */
+function normalizeGender(g?: string): Voice["gender"] {
+  return g === "male" || g === "female" ? g : "neutral";
+}
+
+/** Read a response body as text for error messages, falling back to the status text if reading fails. */
+async function safeText(res: Response): Promise<string> {
+  let body: string;
+  try {
+    body = await res.text();
+  } catch {
+    body = res.statusText;
+  }
+  return body;
+}
@@ -0,0 +1,21 @@
+import { OpenAIScriptProvider } from "./openai-script";
+import { ElevenLabsAudioProvider } from "./elevenlabs-audio";
+import { OpenAIArtProvider } from "./openai-art";
+import type { ArtProvider, AudioProvider, ScriptProvider } from "../types";
+
+// Registry of active providers. Swapping a model later = change one line here.
+// Each slot starts empty and is filled on first use by its accessor function.
+let script: ScriptProvider | null = null;
+let audio: AudioProvider | null = null;
+let art: ArtProvider | null = null;
+
+/** Return the shared script provider, creating the OpenAI implementation on first call. */
+export function scriptProvider(): ScriptProvider {
+  if (script === null) script = new OpenAIScriptProvider();
+  return script;
+}
+
+/** Return the shared audio provider, creating the ElevenLabs implementation on first call. */
+export function audioProvider(): AudioProvider {
+  if (audio === null) audio = new ElevenLabsAudioProvider();
+  return audio;
+}
+
+/** Return the shared art provider, creating the OpenAI implementation on first call. */
+export function artProvider(): ArtProvider {
+  if (art === null) art = new OpenAIArtProvider();
+  return art;
+}
@@ -0,0 +1,36 @@
+import { openai, ART_MODEL } from "../openai";
+import type { ArtProvider } from "../types";
+
+/** `ArtProvider` that generates cover images through the OpenAI images API. */
+export class OpenAIArtProvider implements ArtProvider {
+  readonly model = ART_MODEL;
+
+  /**
+   * Generate one cover image for `prompt`.
+   *
+   * @param opts Optional image size; defaults to 1024x1024.
+   * @returns The decoded image bytes plus the revised prompt, when the API reports one.
+   * @throws Error when the response contains no base64 image data.
+   */
+  async generateCover(
+    prompt: string,
+    opts?: { size?: "1024x1024" }
+  ): Promise<{ data: Buffer; revisedPrompt?: string }> {
+    const size = opts?.size ?? "1024x1024";
+    const response = await openai().images.generate({
+      model: this.model,
+      prompt,
+      n: 1,
+      size,
+      response_format: "b64_json",
+    });
+    const first = response.data?.[0];
+    const encoded = first?.b64_json;
+    if (!first || !encoded) throw new Error("DALL·E returned no image data");
+    return {
+      data: Buffer.from(encoded, "base64"),
+      revisedPrompt: first.revised_prompt,
+    };
+  }
+}
+
+/**
+ * Build the cover-art prompt for an episode.
+ *
+ * @param topic Episode topic; also used as the title when no usable title is given.
+ * @param tone Mood/tone to convey.
+ * @param title Optional episode title. A missing, empty, or whitespace-only
+ *   title falls back to `topic` so the prompt never reads `titled ""`.
+ */
+export function buildCoverPrompt(topic: string, tone: string, title?: string): string {
+  // `??` would keep an empty string, so trim and fall back with `||`.
+  const heading = title?.trim() || topic;
+  return [
+    `Podcast cover art for an episode titled "${heading}".`,
+    `Topic: ${topic}. Mood/tone: ${tone}.`,
+    "Modern, bold, eye-catching square album-cover style.",
+    "Strong focal subject, clean composition, vibrant but tasteful colors.",
+    "No text, no words, no letters, no logos.",
+  ].join(" ");
+}
@@ -0,0 +1,90 @@
+import { z } from "zod";
+import { openai, SCRIPT_MODEL } from "../openai";
+import { buildScriptMessages, buildSectionMessages } from "../prompts/script";
+import type {
+  EpisodeConfig,
+  ScriptProvider,
+  ScriptSection,
+  StructuredScript,
+  TokenUsage,
+} from "../types";
+
+// One spoken line: a non-empty speaker key and non-empty text.
+const turnSchema = z.object({
+  speakerKey: z.string().min(1),
+  text: z.string().min(1),
+});
+
+// One section: non-empty id and title, with at least one turn.
+const sectionSchema = z.object({
+  id: z.string().min(1),
+  title: z.string().min(1),
+  turns: z.array(turnSchema).min(1),
+});
+
+// A whole script: non-empty title with at least one section.
+const scriptSchema = z.object({
+  title: z.string().min(1),
+  sections: z.array(sectionSchema).min(1),
+});
+
+/**
+ * Return a copy of `script` in which every turn's `speakerKey` is one of the
+ * configured keys. Keys the model invented are replaced by the first configured
+ * speaker's key ("host" when no speakers are configured).
+ */
+function normalizeSpeakers(script: StructuredScript, config: EpisodeConfig): StructuredScript {
+  const known = new Set(config.speakers.map((s) => s.speakerKey));
+  const fallback = config.speakers[0]?.speakerKey ?? "host";
+  const repair = (key: string): string => (known.has(key) ? key : fallback);
+
+  const sections = script.sections.map((section) => ({
+    ...section,
+    turns: section.turns.map((turn) => ({ ...turn, speakerKey: repair(turn.speakerKey) })),
+  }));
+  return { ...script, sections };
+}
+
+/** Convert the API's usage object to `TokenUsage`, treating missing counts as 0. */
+function usageFrom(u: { prompt_tokens?: number; completion_tokens?: number } | undefined): TokenUsage {
+  const inputTokens = u?.prompt_tokens ?? 0;
+  const outputTokens = u?.completion_tokens ?? 0;
+  return { inputTokens, outputTokens };
+}
+
+/** `ScriptProvider` that writes scripts through OpenAI chat completions in JSON mode. */
+export class OpenAIScriptProvider implements ScriptProvider {
+  readonly model = SCRIPT_MODEL;
+
+  /**
+   * Generate a full episode script.
+   *
+   * @returns The validated script with speaker keys coerced to the configured
+   *   set, plus token usage.
+   * @throws If the reply is not valid JSON or fails schema validation.
+   */
+  async generate(config: EpisodeConfig): Promise<{ script: StructuredScript; usage: TokenUsage }> {
+    const completion = await openai().chat.completions.create({
+      model: this.model,
+      messages: buildScriptMessages(config),
+      response_format: { type: "json_object" },
+      temperature: 0.8,
+    });
+    // A missing message falls back to "{}", which then fails schema validation.
+    const raw = completion.choices[0]?.message?.content ?? "{}";
+    const script = normalizeSpeakers(scriptSchema.parse(JSON.parse(raw)), config);
+    return { script, usage: usageFrom(completion.usage) };
+  }
+
+  /**
+   * Regenerate one section of an existing script.
+   *
+   * The returned section always carries the requested `sectionId`, whatever id
+   * the model echoed, and its speaker keys are coerced to the configured set.
+   *
+   * @throws If the reply is not valid JSON or fails schema validation.
+   */
+  async regenerateSection(
+    config: EpisodeConfig,
+    script: StructuredScript,
+    sectionId: string
+  ): Promise<{ section: ScriptSection; usage: TokenUsage }> {
+    const completion = await openai().chat.completions.create({
+      model: this.model,
+      messages: buildSectionMessages(config, script, sectionId),
+      response_format: { type: "json_object" },
+      temperature: 0.9,
+    });
+    const raw = completion.choices[0]?.message?.content ?? "{}";
+    const parsed = sectionSchema.parse(JSON.parse(raw));
+
+    const allowed = new Set(config.speakers.map((s) => s.speakerKey));
+    const fallback = config.speakers[0]?.speakerKey ?? "host";
+    const turns = parsed.turns.map((turn) => ({
+      ...turn,
+      speakerKey: allowed.has(turn.speakerKey) ? turn.speakerKey : fallback,
+    }));
+
+    return {
+      section: { ...parsed, id: sectionId, turns },
+      usage: usageFrom(completion.usage),
+    };
+  }
+}
@@ -0,0 +1,48 @@
+import { z } from "zod";
+import { openai, SCRIPT_MODEL } from "./openai";
+import type { TokenUsage } from "./types";
+
+/** One planned episode in the model response: title, topic and summary must all be non-empty strings. */
+const episodePlanSchema = z.object({
+  title: z.string().min(1),
+  topic: z.string().min(1),
+  summary: z.string().min(1),
+});
+
+/** Shape the season-planning response must satisfy: non-empty title and description, at least one episode. */
+const seasonSchema = z.object({
+  title: z.string().min(1),
+  description: z.string().min(1),
+  episodes: z.array(episodePlanSchema).min(1),
+});
+
+/** A validated season plan, inferred from the schema above. */
+export type SeasonPlan = z.infer<typeof seasonSchema>;
+
+/**
+ * Ask the script model to plan a season of episodes around a theme.
+ *
+ * The response is requested as a JSON object and validated against
+ * seasonSchema; invalid JSON or a schema mismatch throws. The schema only
+ * requires at least one episode, so the number of episodes returned is not
+ * checked against `input.count` here.
+ *
+ * @param input theme, requested episode count, tone, optional audience and language for the prompt.
+ * @returns the validated plan and the token usage of the call (0 for counters the API omits).
+ */
+export async function planSeason(input: {
+  theme: string;
+  count: number;
+  tone: string;
+  audience?: string;
+  language: string;
+}): Promise<{ plan: SeasonPlan; usage: TokenUsage }> {
+  const { theme, count, tone, audience, language } = input;
+
+  const systemPrompt =
+    "You are a podcast showrunner planning a cohesive season. Return STRICT JSON: { \"title\": string, \"description\": string, \"episodes\": [{ \"title\": string, \"topic\": string, \"summary\": string }] }.";
+  // The audience sentence is only included when an audience was supplied.
+  const audienceClause = audience ? `Audience: ${audience}.` : "";
+  const userPrompt = `Plan a ${count}-episode podcast season about: ${theme}.
+Tone: ${tone}. ${audienceClause} Language: ${language}.
+Give the season a title and short description, then ${count} episodes, each with a catchy title, a specific topic to cover, and a one-sentence summary.`;
+
+  const completion = await openai().chat.completions.create({
+    model: SCRIPT_MODEL,
+    messages: [
+      { role: "system", content: systemPrompt },
+      { role: "user", content: userPrompt },
+    ],
+    response_format: { type: "json_object" },
+    temperature: 0.85,
+  });
+
+  // A missing message body is parsed as an empty object and left to the schema to reject.
+  const rawJson = completion.choices[0]?.message?.content ?? "{}";
+  const plan = seasonSchema.parse(JSON.parse(rawJson));
+  const inputTokens = completion.usage?.prompt_tokens ?? 0;
+  const outputTokens = completion.usage?.completion_tokens ?? 0;
+  return { plan, usage: { inputTokens, outputTokens } };
+}
@@ -0,0 +1,103 @@
+/**
+ * Provider abstraction for the three AI capabilities. Each capability has a thin
+ * interface so the underlying model (GPT-4, ElevenLabs, DALL·E) can be swapped
+ * via the registry in providers/index.ts without touching call sites.
+ */
+
+/** Episode format selector; the value is one of these three string literals. */
+export type EpisodeFormat = "SOLO" | "INTERVIEW" | "MULTI_HOST";
+
+/** A speaking participant of an episode, identified by a stable key. */
+export interface SpeakerRole {
+  /** Stable key referenced by script turns, e.g. "host", "guest", "cohost". */
+  speakerKey: string;
+  /** NOTE(review): presumably the human-readable name of this speaker — confirm against callers. */
+  displayName: string;
+}
+
+/** Input describing the episode a ScriptProvider should write. */
+export interface EpisodeConfig {
+  /** Optional episode title. */
+  title?: string;
+  /** Subject the episode should cover. */
+  topic: string;
+  /** Desired tone, as free text. */
+  tone: string;
+  /** One of the EpisodeFormat literals. */
+  format: EpisodeFormat;
+  /** ISO language code, e.g. "en", "es". */
+  language: string;
+  /** Target episode length; the `Min` suffix suggests minutes — TODO confirm the unit with callers. */
+  targetLengthMin: number;
+  /** Optional description of the intended audience. */
+  audience?: string;
+  /** Speakers taking part; script turns refer to them by `speakerKey`. */
+  speakers: SpeakerRole[];
+}
+
+// ─────────────── Script ───────────────
+
+/** One turn of a script: a speaker key paired with the text for that turn. */
+export interface ScriptTurn {
+  /** Key of the speaker for this turn; corresponds to SpeakerRole.speakerKey. */
+  speakerKey: string;
+  /** Text of the turn. */
+  text: string;
+}
+
+/** A titled group of turns within a script. */
+export interface ScriptSection {
+  /** Section identifier; this is what ScriptProvider.regenerateSection receives as `sectionId`. */
+  id: string;
+  /** Section title. */
+  title: string;
+  /** The turns that make up this section. */
+  turns: ScriptTurn[];
+}
+
+/** A whole script: a title plus its sections. */
+export interface StructuredScript {
+  /** Script title. */
+  title: string;
+  /** Sections of the script. */
+  sections: ScriptSection[];
+}
+
+/** Token counts reported for a single model call. */
+export interface TokenUsage {
+  /** Tokens sent to the model (the OpenAI provider fills this from `prompt_tokens`). */
+  inputTokens: number;
+  /** Tokens produced by the model (the OpenAI provider fills this from `completion_tokens`). */
+  outputTokens: number;
+}
+
+/** Script-writing capability: generates whole scripts and regenerates single sections. */
+export interface ScriptProvider {
+  /** Identifier of the model this provider uses. */
+  readonly model: string;
+  /** Generate a full structured script for `config`, returning it together with the token usage. */
+  generate(config: EpisodeConfig): Promise<{ script: StructuredScript; usage: TokenUsage }>;
+  /**
+   * Produce a replacement for the section identified by `sectionId`, given the
+   * episode config and the current script; returns the new section and token usage.
+   */
+  regenerateSection(
+    config: EpisodeConfig,
+    script: StructuredScript,
+    sectionId: string
+  ): Promise<{ section: ScriptSection; usage: TokenUsage }>;
+}
+
+// ─────────────── Audio ───────────────
+
+/** Descriptor of a selectable synthesis voice; only `id` and `name` are required. */
+export interface Voice {
+  /** Voice identifier (the curated catalog uses ElevenLabs voice IDs). */
+  id: string;
+  /** Voice name, e.g. "Rachel". */
+  name: string;
+  /** Optional gender label. */
+  gender?: "male" | "female" | "neutral";
+  /** Optional accent label, e.g. "American" or "British". */
+  accent?: string;
+  /** Optional short description of the voice's character. */
+  description?: string;
+  /** NOTE(review): presumably a URL to an audio sample of the voice — confirm with the provider mapping. */
+  previewUrl?: string;
+}
+
+/** One line of multi-voice dialogue. */
+export interface DialogueTurn {
+  /** Text of the line. */
+  text: string;
+  /** Voice for this line — presumably a Voice.id; verify against the caller that builds the turns. */
+  voiceId: string;
+}
+
+/**
+ * Speech-synthesis capability: single-voice and multi-voice synthesis plus
+ * voice listing. Both synthesis methods resolve to the audio buffer and a
+ * `characters` count (NOTE(review): presumably the characters synthesized,
+ * used for cost accounting — confirm with the implementation).
+ */
+export interface AudioProvider {
+  /** Synthesize a single voice reading (used for SOLO and as a fallback). */
+  synthesizeSpeech(
+    text: string,
+    voiceId: string,
+    opts?: { language?: string }
+  ): Promise<{ audio: Buffer; characters: number }>;
+  /** Synthesize a multi-voice dialogue chunk (≤ provider char limit, ≤10 voices). */
+  synthesizeDialogue(
+    turns: DialogueTurn[],
+    opts?: { language?: string }
+  ): Promise<{ audio: Buffer; characters: number }>;
+  /** Live voice catalog for the account. */
+  listVoices(): Promise<Voice[]>;
+  /** Hard cap on characters per synthesis request (drives segmentation). */
+  readonly maxCharsPerRequest: number;
+}
+
+// ─────────────── Art ───────────────
+
+/** Cover-art capability. */
+export interface ArtProvider {
+  /** Identifier of the model this provider uses. */
+  readonly model: string;
+  /**
+   * Generate a cover image from a text prompt. The only size currently accepted is "1024x1024".
+   * Resolves to the image bytes and, optionally, a `revisedPrompt`
+   * (NOTE(review): presumably the prompt as rewritten by the image model — confirm with the implementation).
+   */
+  generateCover(
+    prompt: string,
+    opts?: { size?: "1024x1024" }
+  ): Promise<{ data: Buffer; revisedPrompt?: string }>;
+}
@@ -0,0 +1,36 @@
+import type { Voice } from "./types";
+
+/**
+ * Curated catalog of ElevenLabs premade voices (stable public voice IDs available
+ * to all accounts). Used by the create-episode wizard so it can render the voice
+ * picker without a live API call. The provider's listVoices() returns the live
+ * account catalog when needed.
+ *
+ * Every entry sets id, name, gender, accent and description; previewUrl is not set.
+ * The ids referenced by DEFAULT_VOICE_IDS (Rachel, Adam, George) are all listed here.
+ */
+export const VOICE_CATALOG: Voice[] = [
+  { id: "21m00Tcm4TlvDq8ikWAM", name: "Rachel", gender: "female", accent: "American", description: "Calm, narrational" },
+  { id: "EXAVITQu4vr4xnSDxMaL", name: "Sarah", gender: "female", accent: "American", description: "Soft, news" },
+  { id: "FGY2WhTYpPnrIDTdsKH5", name: "Laura", gender: "female", accent: "American", description: "Upbeat, social" },
+  { id: "XB0fDUnXU5powFXDhCwa", name: "Charlotte", gender: "female", accent: "British", description: "Warm, seductive" },
+  { id: "XrExE9yKIg1WjnnlVkGX", name: "Matilda", gender: "female", accent: "American", description: "Friendly, warm" },
+  { id: "pFZP5JQG7iQjIQuC4Bku", name: "Lily", gender: "female", accent: "British", description: "Confident narration" },
+  { id: "cgSgspJ2msm6clMCkdW9", name: "Jessica", gender: "female", accent: "American", description: "Expressive, young" },
+  { id: "9BWtsMINqrJLrRacOk9x", name: "Aria", gender: "female", accent: "American", description: "Husky, expressive" },
+  { id: "pNInz6obpgDQGcFmaJgB", name: "Adam", gender: "male", accent: "American", description: "Deep, narration" },
+  { id: "JBFqnCBsd6RMkjVDRZzb", name: "George", gender: "male", accent: "British", description: "Warm, mature" },
+  { id: "TX3LPaxmHKxFdv7VOQHJ", name: "Liam", gender: "male", accent: "American", description: "Articulate, young" },
+  { id: "onwK4e9ZLuTAKqWW03F9", name: "Daniel", gender: "male", accent: "British", description: "Authoritative, news" },
+  { id: "nPczCjzI2devNBz1zQrb", name: "Brian", gender: "male", accent: "American", description: "Deep, mature" },
+  { id: "iP95p4xoKVk53GoZ742B", name: "Chris", gender: "male", accent: "American", description: "Casual, conversational" },
+  { id: "bIHbv24MWmeRgasZH58o", name: "Will", gender: "male", accent: "American", description: "Friendly, chill" },
+  { id: "cjVigY5qzO86Huf0OWal", name: "Eric", gender: "male", accent: "American", description: "Smooth, classy" },
+];
+
+/**
+ * Default voice id for each of the speaker keys "host", "guest" and "cohost".
+ * The declared type is Record<string, string>, so a lookup with any other key
+ * type-checks as string but yields undefined at runtime.
+ */
+export const DEFAULT_VOICE_IDS: Record<string, string> = {
+  host: "21m00Tcm4TlvDq8ikWAM", // Rachel
+  guest: "pNInz6obpgDQGcFmaJgB", // Adam
+  cohost: "JBFqnCBsd6RMkjVDRZzb", // George
+};
+
+/** Look up a curated voice by id; yields undefined when the id is not in VOICE_CATALOG. */
+export function voiceById(id: string): Voice | undefined {
+  for (const voice of VOICE_CATALOG) {
+    if (voice.id === id) {
+      return voice;
+    }
+  }
+  return undefined;
+}