Sprint 6d — Migrate Gemini Live to @google/genai SDK
feat(geminiLive): rewrite with the GoogleGenAI SDK (vertexai: true, apiKey), replacing the raw WebSocket to generativelanguage.googleapis.com
feat(geminiLive): restore full setup config (systemInstruction, inputAudioTranscription, outputAudioTranscription, VAD)
fix(geminiLive): buildSetupFrame → SDK config object (no manual JSON)
fix(useT2LiveSession): cancelTokenRef for idempotent startDialogue, closeAllRef for stable unmount cleanup
chore: add @google/genai@^1.50.1 dependency
test: 11 geminiLive tests rewritten with SDK mock
292/292 backend tests green
This commit is contained in:
parent
d89b0b1e89
commit
0662e766d4
6 changed files with 970 additions and 331 deletions
|
|
@ -1,9 +1,38 @@
|
|||
import { WebSocket as NodeWebSocket } from "ws";
|
||||
/**
|
||||
* geminiLive.ts — Sprint 6d.
|
||||
*
|
||||
* Migration du WebSocket brut (`wss://generativelanguage.googleapis.com/...`)
|
||||
* vers le SDK officiel `@google/genai` v1.50.x. Motif : Google a migré les
|
||||
* clés API vers le mode "Vertex AI Express", incompatible avec l'endpoint WS
|
||||
* historique (réponse 403 systématique). Le SDK gère l'auth automatiquement
|
||||
* et accepte les clés Express bound à un service account.
|
||||
*
|
||||
* Interface publique (consommée par `routes/t2live.ts`) :
|
||||
* - openGeminiLiveSession(clientWs, opts) : ouvre une session Live et
|
||||
* proxifie les messages dans les deux sens entre le client (navigateur)
|
||||
* et Gemini, accumule les transcripts, gère timeouts + close codes.
|
||||
* - WebSocketLike : interface minimale pour le client WS (Hono adapter).
|
||||
* - buildT2SystemPrompt({role, contexte}) : prompt dynamique T2 Live.
|
||||
* - GEMINI_LIVE_MODEL, T2_SESSION_TIMEOUT_MS, T2_SESSION_WARNING_MS.
|
||||
*
|
||||
* Cf. docs/IMPLEMENTATION_T2_LIVE.md §3, docs/Prompt_t2live.md §3.
|
||||
*/
|
||||
|
||||
export const GEMINI_LIVE_URL =
|
||||
"wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent";
|
||||
import {
|
||||
GoogleGenAI,
|
||||
Modality,
|
||||
StartSensitivity,
|
||||
EndSensitivity,
|
||||
type Session,
|
||||
} from "@google/genai";
|
||||
|
||||
export const GEMINI_LIVE_MODEL = "models/gemini-2.5-flash-native-audio-latest";
|
||||
/**
|
||||
* Modèle Live cible. `gemini-3.1-flash-live-preview` est le choix par défaut
|
||||
* (Sprint 6d), à valider sur Express Mode via `test-gemini-live.js`. Fallback
|
||||
* documenté : `gemini-2.0-flash-live-001` (modèle Live garanti sur Express
|
||||
* d'après la doc Vertex Express).
|
||||
*/
|
||||
export const GEMINI_LIVE_MODEL = "gemini-3.1-flash-live-preview";
|
||||
|
||||
/** Timeout total session WS T2 Live : 3 min 30 (durée TCF) + marge évaluation. */
|
||||
export const T2_SESSION_TIMEOUT_MS = 210_000;
|
||||
|
|
@ -36,7 +65,6 @@ Règles à respecter impérativement :
|
|||
/**
|
||||
* Subset minimal d'une WebSocket — compatible avec :
|
||||
* - le wrapper exposé par @hono/node-ws (côté client navigateur)
|
||||
* - la WebSocket de `ws` (côté Gemini)
|
||||
* - les fakes basés sur EventEmitter dans les tests
|
||||
*/
|
||||
export interface WebSocketLike {
|
||||
|
|
@ -59,34 +87,32 @@ export interface OpenGeminiLiveSessionOptions {
|
|||
timeoutMs?: number;
|
||||
/** Override warning (par défaut T2_SESSION_WARNING_MS). */
|
||||
warningMs?: number;
|
||||
/** Injection pour les tests — fabrique de WebSocket vers Gemini. */
|
||||
geminiFactory?: (url: string) => WebSocketLike;
|
||||
/** Surcharge la clé API (par défaut : process.env.GEMINI_API_KEY). */
|
||||
apiKey?: string;
|
||||
/**
|
||||
* Injection pour les tests — fabrique de client SDK. Permet de remplacer
|
||||
* `new GoogleGenAI(...)` par un mock dans les tests sans toucher au code prod.
|
||||
*/
|
||||
clientFactory?: (apiKey: string) => GoogleGenAI;
|
||||
}
|
||||
|
||||
function buildSetupFrame(systemPrompt: string): string {
|
||||
return JSON.stringify({
|
||||
setup: {
|
||||
model: GEMINI_LIVE_MODEL,
|
||||
systemInstruction: {
|
||||
parts: [{ text: systemPrompt }],
|
||||
},
|
||||
generationConfig: {
|
||||
responseModalities: ["AUDIO"],
|
||||
},
|
||||
inputAudioTranscription: {},
|
||||
outputAudioTranscription: {},
|
||||
realtimeInputConfig: {
|
||||
automaticActivityDetection: {
|
||||
disabled: false,
|
||||
startOfSpeechSensitivity: "START_SENSITIVITY_LOW",
|
||||
endOfSpeechSensitivity: "END_SENSITIVITY_LOW",
|
||||
silenceDurationMs: 2000,
|
||||
},
|
||||
},
|
||||
},
|
||||
});
|
||||
/**
 * Minimal shape of a Live message handed back by the SDK.
 *
 * Declared locally rather than reusing the SDK's own `LiveServerMessage`
 * export, so that tests are not coupled to the SDK's exact shape. Every
 * field is optional: a given message carries only some of them.
 */

/** A transcription fragment (`inputTranscription` / `outputTranscription`). */
interface LiveTranscription {
  text?: string;
}

/** One part of a model turn; `inlineData.data` holds the inline payload. */
interface LiveModelTurnPart {
  inlineData?: {
    data?: string;
    mimeType?: string;
  };
}

/** Content section of a server message. */
interface LiveServerContent {
  modelTurn?: { parts?: LiveModelTurnPart[] };
  inputTranscription?: LiveTranscription;
  outputTranscription?: LiveTranscription;
  interrupted?: boolean;
  turnComplete?: boolean;
}

interface LiveServerMessage {
  serverContent?: LiveServerContent;
  setupComplete?: unknown;
}
|
||||
|
||||
interface TranscriptEntry {
|
||||
|
|
@ -104,54 +130,6 @@ function reconstructTranscript(entries: TranscriptEntry[]): string {
|
|||
.join("\n");
|
||||
}
|
||||
|
||||
/**
 * Attempts to read a Gemini frame as JSON and pull out the transcript
 * fragments it carries.
 *
 * @param data - Raw frame. Supported forms: a string, a `Buffer` /
 *   `Uint8Array`, an `ArrayBuffer`, an array of binary fragments, or (as a
 *   last resort) any object whose `toString()` yields the JSON text.
 * @returns
 *   - `null` when the frame is not a JSON object (e.g. a binary audio chunk
 *     or malformed JSON);
 *   - `{}` when the JSON object has no `serverContent`;
 *   - otherwise `{ inputText, outputText }`, where each field is the
 *     corresponding transcription text, or `undefined` when it is absent or
 *     not a string.
 */
function tryParseGeminiMessage(data: unknown): {
  inputText?: string;
  outputText?: string;
} | null {
  let text: string;
  if (typeof data === "string") {
    text = data;
  } else if (data instanceof Uint8Array) {
    // Covers Node `Buffer` too. Respect the view's offset/length so a
    // sub-view of a larger buffer is decoded correctly.
    text = Buffer.from(data.buffer, data.byteOffset, data.byteLength).toString(
      "utf8",
    );
  } else if (data instanceof ArrayBuffer) {
    // `ArrayBuffer.prototype.toString()` would give "[object ArrayBuffer]",
    // so decode the bytes explicitly.
    text = Buffer.from(data).toString("utf8");
  } else if (
    Array.isArray(data) &&
    data.every((part) => part instanceof Uint8Array)
  ) {
    // Fragmented frame: join the bytes before decoding (Array#toString would
    // insert commas between fragments and corrupt the JSON).
    text = Buffer.concat(data as Uint8Array[]).toString("utf8");
  } else if (typeof data === "object" && data !== null && "toString" in data) {
    // Fallback kept for wrapper objects that stringify to their payload.
    try {
      const rendered: unknown = (
        data as { toString: () => unknown }
      ).toString();
      if (typeof rendered !== "string") return null;
      text = rendered;
    } catch {
      return null;
    }
  } else {
    return null;
  }

  // Cheap filter: anything that is not a JSON object is treated as binary.
  if (!text.trimStart().startsWith("{")) return null;

  let parsed: unknown;
  try {
    parsed = JSON.parse(text);
  } catch {
    return null;
  }
  if (typeof parsed !== "object" || parsed === null) return null;

  const sc = (parsed as { serverContent?: unknown }).serverContent;
  if (typeof sc !== "object" || sc === null) return {};

  // Only accept genuine strings; the JSON comes from the network and its
  // shape is not guaranteed.
  const readText = (node: unknown): string | undefined => {
    if (typeof node !== "object" || node === null) return undefined;
    const value = (node as { text?: unknown }).text;
    return typeof value === "string" ? value : undefined;
  };

  const content = sc as {
    inputTranscription?: unknown;
    outputTranscription?: unknown;
  };
  return {
    inputText: readText(content.inputTranscription),
    outputText: readText(content.outputTranscription),
  };
}
|
||||
|
||||
/**
|
||||
* Détecte un signal de fin de session envoyé par le client : `{type:'end'}`.
|
||||
*/
|
||||
|
|
@ -178,19 +156,53 @@ function isEndSignal(data: unknown): boolean {
|
|||
}
|
||||
|
||||
/**
 * Extracts the base64 audio payload from a client frame shaped like
 * `{"type":"audio","data":"<base64>"}`.
 *
 * @param data - Raw client frame: a string, a `Buffer` / `Uint8Array`, or an
 *   `ArrayBuffer` holding UTF-8 JSON text. Any other value is rejected.
 * @returns The base64 string when the frame is a JSON object with
 *   `type === "audio"` and a non-empty string `data`; `null` otherwise
 *   (non-JSON input, malformed JSON, other message types, missing or empty
 *   payload).
 */
function parseAudioChunk(data: unknown): string | null {
  let text: string;
  if (typeof data === "string") {
    text = data;
  } else if (data instanceof Uint8Array) {
    // Covers Node `Buffer` too. Respect the view's offset/length so a
    // sub-view of a larger buffer is decoded correctly.
    text = Buffer.from(data.buffer, data.byteOffset, data.byteLength).toString(
      "utf8",
    );
  } else if (data instanceof ArrayBuffer) {
    text = Buffer.from(data).toString("utf8");
  } else {
    return null;
  }

  // Cheap filter before JSON.parse; leading whitespace is legal JSON.
  if (!text.trimStart().startsWith("{")) return null;

  let parsed: unknown;
  try {
    parsed = JSON.parse(text);
  } catch {
    return null;
  }
  if (typeof parsed !== "object" || parsed === null) return null;

  const frame = parsed as { type?: unknown; data?: unknown };
  if (frame.type !== "audio") return null;
  // An empty payload carries no audio: treat it as "not an audio chunk" so
  // the caller does not forward an empty frame.
  if (typeof frame.data !== "string" || frame.data.length === 0) return null;
  return frame.data;
}
|
||||
|
||||
/**
|
||||
* Ouvre une session Gemini Live via le SDK et proxifie les messages
|
||||
* dans les deux sens entre le client (navigateur) et Gemini.
|
||||
*
|
||||
* - À l'open Gemini : envoie le setup frame avec prompt dynamique + VAD
|
||||
* + inputAudioTranscription + outputAudioTranscription.
|
||||
* - Forward transparent des frames audio dans les deux directions.
|
||||
* - Accumule les transcripts (input = candidat, output = examinateur IA).
|
||||
* - Détecte signal client `{type:'end'}` → déclenche fin de session.
|
||||
* - Timeout 210 s : warning client à 180 s, fin auto à 210 s.
|
||||
* - En fin de session : appelle `onSessionEnd(transcript)` puis ferme Gemini.
|
||||
* Le client WS n'est PAS fermé ici — c'est l'appelant qui décide (envoi du
|
||||
* rapport puis close 1000).
|
||||
* - Erreur Gemini → close client 4006 GEMINI_DISCONNECTED.
|
||||
* - Init : `new GoogleGenAI({ vertexai: true, apiKey })` → mode Vertex Express
|
||||
* (compatible avec les clés API auto-bound à un service account).
|
||||
* - Setup config : modèle + responseModalities AUDIO + systemInstruction
|
||||
* + inputAudioTranscription + outputAudioTranscription + VAD.
|
||||
* - Forward client → Gemini : parse `{type:'audio', data: base64}` →
|
||||
* `session.sendRealtimeInput({audio: {data, mimeType: 'audio/pcm;rate=16000'}})`.
|
||||
* - Forward Gemini → client : `clientWs.send(JSON.stringify(msg))` (le frontend
|
||||
* parse `serverContent.modelTurn.parts[].inlineData.data`).
|
||||
* - Accumule input/outputTranscription pour la correction finale.
|
||||
* - Détecte `{type:'end'}` du client → fin de session.
|
||||
* - Timer 210 s : warning à 180 s, fin auto à 210 s.
|
||||
* - En fin : `onSessionEnd(transcript)` puis ferme la session SDK. Le client WS
|
||||
* n'est PAS fermé ici — c'est l'appelant qui décide (envoi du rapport puis
|
||||
* close 1000).
|
||||
* - Erreur SDK / close Gemini → close client 4006 GEMINI_DISCONNECTED.
|
||||
* - GEMINI_API_KEY absente → close client 4005 GEMINI_CONFIG.
|
||||
*/
|
||||
export function openGeminiLiveSession(
|
||||
|
|
@ -211,17 +223,14 @@ export function openGeminiLiveSession(
|
|||
contexte: opts.contexte,
|
||||
});
|
||||
|
||||
const url = `${GEMINI_LIVE_URL}?key=${apiKey}`;
|
||||
const factory =
|
||||
opts.geminiFactory ??
|
||||
((u: string) => new NodeWebSocket(u) as unknown as WebSocketLike);
|
||||
|
||||
const geminiWs = factory(url);
|
||||
const ai =
|
||||
opts.clientFactory?.(apiKey) ?? new GoogleGenAI({ vertexai: true, apiKey });
|
||||
|
||||
const transcriptEntries: TranscriptEntry[] = [];
|
||||
let sessionEnded = false;
|
||||
let warningTimer: ReturnType<typeof setTimeout> | null = null;
|
||||
let timeoutTimer: ReturnType<typeof setTimeout> | null = null;
|
||||
let session: Session | null = null;
|
||||
|
||||
const clearTimers = () => {
|
||||
if (warningTimer !== null) {
|
||||
|
|
@ -238,10 +247,12 @@ export function openGeminiLiveSession(
|
|||
if (sessionEnded) return;
|
||||
sessionEnded = true;
|
||||
clearTimers();
|
||||
try {
|
||||
geminiWs.close(1000);
|
||||
} catch {
|
||||
/* ignore */
|
||||
if (session) {
|
||||
try {
|
||||
session.close();
|
||||
} catch {
|
||||
/* ignore */
|
||||
}
|
||||
}
|
||||
if (opts.onSessionEnd) {
|
||||
try {
|
||||
|
|
@ -255,105 +266,153 @@ export function openGeminiLiveSession(
|
|||
}
|
||||
};
|
||||
|
||||
geminiWs.on("open", () => {
|
||||
console.log("[T2] Gemini WS opened");
|
||||
try {
|
||||
geminiWs.send(buildSetupFrame(systemPrompt));
|
||||
console.log("[T2] Setup frame sent");
|
||||
|
||||
// Démarrer les timers une fois la session Gemini effectivement ouverte.
|
||||
warningTimer = setTimeout(() => {
|
||||
if (sessionEnded) return;
|
||||
try {
|
||||
clientWs.send(
|
||||
JSON.stringify({
|
||||
type: "warning",
|
||||
message: "30 secondes restantes",
|
||||
}),
|
||||
);
|
||||
} catch {
|
||||
/* ignore */
|
||||
}
|
||||
}, warningMs);
|
||||
|
||||
timeoutTimer = setTimeout(() => {
|
||||
void endSession();
|
||||
}, timeoutMs);
|
||||
} catch {
|
||||
try {
|
||||
clientWs.close(4005, "GEMINI_CONFIG");
|
||||
} catch {
|
||||
/* ignore */
|
||||
}
|
||||
const handleSdkMessage = (msg: LiveServerMessage) => {
|
||||
// Accumuler transcripts pour la correction finale.
|
||||
const sc = msg.serverContent;
|
||||
if (sc?.inputTranscription?.text && sc.inputTranscription.text.length > 0) {
|
||||
transcriptEntries.push({
|
||||
speaker: "candidat",
|
||||
text: sc.inputTranscription.text,
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
geminiWs.on("message", (data) => {
|
||||
// Tentative d'extraction des transcripts — si JSON, on accumule ;
|
||||
// dans tous les cas (JSON ou audio binaire), on forward au client.
|
||||
const parsed = tryParseGeminiMessage(data);
|
||||
if (parsed) {
|
||||
if (parsed.inputText && parsed.inputText.length > 0) {
|
||||
transcriptEntries.push({
|
||||
speaker: "candidat",
|
||||
text: parsed.inputText,
|
||||
});
|
||||
}
|
||||
if (parsed.outputText && parsed.outputText.length > 0) {
|
||||
transcriptEntries.push({
|
||||
speaker: "examinateur",
|
||||
text: parsed.outputText,
|
||||
});
|
||||
}
|
||||
if (
|
||||
sc?.outputTranscription?.text &&
|
||||
sc.outputTranscription.text.length > 0
|
||||
) {
|
||||
transcriptEntries.push({
|
||||
speaker: "examinateur",
|
||||
text: sc.outputTranscription.text,
|
||||
});
|
||||
}
|
||||
|
||||
// Forward verbatim au client. Le frontend parse serverContent.modelTurn.
|
||||
try {
|
||||
clientWs.send(data);
|
||||
clientWs.send(JSON.stringify(msg));
|
||||
} catch {
|
||||
void endSession();
|
||||
}
|
||||
});
|
||||
};
|
||||
|
||||
// ── Ouverture de la session SDK ──────────────────────────────────────
|
||||
ai.live
|
||||
.connect({
|
||||
model: GEMINI_LIVE_MODEL,
|
||||
config: {
|
||||
responseModalities: [Modality.AUDIO],
|
||||
systemInstruction: systemPrompt,
|
||||
inputAudioTranscription: {},
|
||||
outputAudioTranscription: {},
|
||||
realtimeInputConfig: {
|
||||
automaticActivityDetection: {
|
||||
disabled: false,
|
||||
startOfSpeechSensitivity: StartSensitivity.START_SENSITIVITY_LOW,
|
||||
endOfSpeechSensitivity: EndSensitivity.END_SENSITIVITY_LOW,
|
||||
silenceDurationMs: 2000,
|
||||
},
|
||||
},
|
||||
},
|
||||
callbacks: {
|
||||
onopen: () => {
|
||||
console.log("[T2] Session Gemini ouverte (SDK)");
|
||||
// Démarrer les timers une fois la session effectivement ouverte.
|
||||
warningTimer = setTimeout(() => {
|
||||
if (sessionEnded) return;
|
||||
try {
|
||||
clientWs.send(
|
||||
JSON.stringify({
|
||||
type: "warning",
|
||||
message: "30 secondes restantes",
|
||||
}),
|
||||
);
|
||||
} catch {
|
||||
/* ignore */
|
||||
}
|
||||
}, warningMs);
|
||||
|
||||
timeoutTimer = setTimeout(() => {
|
||||
void endSession();
|
||||
}, timeoutMs);
|
||||
},
|
||||
onmessage: (msg: LiveServerMessage) => {
|
||||
handleSdkMessage(msg);
|
||||
},
|
||||
onerror: (err: unknown) => {
|
||||
console.log(
|
||||
"[T2] Erreur SDK :",
|
||||
err instanceof Error ? err.message : String(err),
|
||||
);
|
||||
if (!sessionEnded) {
|
||||
clearTimers();
|
||||
sessionEnded = true;
|
||||
try {
|
||||
clientWs.close(4006, "GEMINI_DISCONNECTED");
|
||||
} catch {
|
||||
/* ignore */
|
||||
}
|
||||
}
|
||||
},
|
||||
onclose: () => {
|
||||
console.log("[T2] Session Gemini fermée (SDK)");
|
||||
if (!sessionEnded) {
|
||||
clearTimers();
|
||||
try {
|
||||
clientWs.close(4006, "GEMINI_DISCONNECTED");
|
||||
} catch {
|
||||
/* ignore */
|
||||
}
|
||||
}
|
||||
},
|
||||
},
|
||||
})
|
||||
.then((s: Session) => {
|
||||
session = s;
|
||||
})
|
||||
.catch((err: unknown) => {
|
||||
console.log(
|
||||
"[T2] live.connect a échoué :",
|
||||
err instanceof Error ? err.message : String(err),
|
||||
);
|
||||
sessionEnded = true;
|
||||
clearTimers();
|
||||
try {
|
||||
clientWs.close(4006, "GEMINI_DISCONNECTED");
|
||||
} catch {
|
||||
/* ignore */
|
||||
}
|
||||
});
|
||||
|
||||
// ── Forward client → Gemini ──────────────────────────────────────────
|
||||
clientWs.on("message", (data) => {
|
||||
if (isEndSignal(data)) {
|
||||
void endSession();
|
||||
return;
|
||||
}
|
||||
try {
|
||||
geminiWs.send(data);
|
||||
} catch {
|
||||
void endSession();
|
||||
}
|
||||
});
|
||||
|
||||
geminiWs.on("close", () => {
|
||||
console.log("[T2] Gemini closed");
|
||||
if (!sessionEnded) {
|
||||
clearTimers();
|
||||
const audioBase64 = parseAudioChunk(data);
|
||||
if (audioBase64 !== null && session !== null && !sessionEnded) {
|
||||
try {
|
||||
clientWs.close(4006, "GEMINI_DISCONNECTED");
|
||||
} catch {
|
||||
/* ignore */
|
||||
session.sendRealtimeInput({
|
||||
audio: {
|
||||
data: audioBase64,
|
||||
mimeType: "audio/pcm;rate=16000",
|
||||
},
|
||||
});
|
||||
} catch (err) {
|
||||
console.log(
|
||||
"[T2] sendRealtimeInput a échoué :",
|
||||
err instanceof Error ? err.message : String(err),
|
||||
);
|
||||
void endSession();
|
||||
}
|
||||
}
|
||||
// Tout autre message client est ignoré (ex: ping keep-alive frontend).
|
||||
});
|
||||
|
||||
clientWs.on("close", () => {
|
||||
clearTimers();
|
||||
sessionEnded = true;
|
||||
try {
|
||||
geminiWs.close(1000);
|
||||
} catch {
|
||||
/* ignore */
|
||||
}
|
||||
});
|
||||
|
||||
geminiWs.on("error", (err) => {
|
||||
console.log("[T2] Gemini error:", (err as Error)?.message);
|
||||
if (!sessionEnded) {
|
||||
clearTimers();
|
||||
sessionEnded = true;
|
||||
if (session) {
|
||||
try {
|
||||
clientWs.close(4006, "GEMINI_DISCONNECTED");
|
||||
session.close();
|
||||
} catch {
|
||||
/* ignore */
|
||||
}
|
||||
|
|
@ -363,10 +422,12 @@ export function openGeminiLiveSession(
|
|||
clientWs.on("error", () => {
|
||||
clearTimers();
|
||||
sessionEnded = true;
|
||||
try {
|
||||
geminiWs.close(1011);
|
||||
} catch {
|
||||
/* ignore */
|
||||
if (session) {
|
||||
try {
|
||||
session.close();
|
||||
} catch {
|
||||
/* ignore */
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue