/**
 * Voice API wrappers.
 * TTS (text-to-speech) and STT (speech-to-text).
 */

import { getSettings } from './config.js';

/**
 * UI emotion label -> MiniMax `voice_setting.emotion` value.
 * Labels without an entry are not sent to the API.
 */
const MINIMAX_EMOTIONS = new Map([
  ['高兴', 'happy'],
  ['悲伤', 'sad'],
  ['愤怒', 'angry'],
  ['害怕', 'fearful'],
  ['厌恶', 'disgusted'],
  ['惊讶', 'surprised'],
  ['中性', 'calm'],
  ['生动', 'fluent'],
  ['低语', 'whisper'],
]);

/**
 * Build the voice API configuration from the saved settings.
 * @param {Object} [contact] - Optional character object. When `useCustomVoice` is truthy and
 *   `customTtsVoice` is non-empty, that voice replaces the global TTS voice.
 * @returns {Object} `{ stt: { url, key, model }, tts: { url, key, model, voice, speed, emotion, proxyUrl } }`
 */
export function getVoiceApiConfig(contact = null) {
  const settings = getSettings();

  // Global (base) configuration.
  const config = {
    stt: {
      url: settings.sttApiUrl || '',
      key: settings.sttApiKey || '',
      model: settings.sttModel || '',
    },
    tts: {
      url: settings.ttsApiUrl || '',
      key: settings.ttsApiKey || '',
      model: settings.ttsModel || '',
      voice: settings.ttsVoice || '',
      speed: settings.ttsSpeed || 1,
      emotion: settings.ttsEmotion || '默认',
      proxyUrl: settings.ttsProxyUrl || '',
    },
  };

  // Per-character TTS voice override.
  if (contact?.useCustomVoice && contact.customTtsVoice) {
    config.tts.voice = contact.customTtsVoice;
  }

  return config;
}

/**
 * Pick an upload file name whose extension matches the Blob's MIME type.
 * Unknown or empty types fall back to `audio.webm`.
 * @param {Blob} blob
 * @returns {string}
 */
function getAudioFileName(blob) {
  const type = blob.type || 'audio/webm';
  if (type.includes('webm')) return 'audio.webm';
  if (type.includes('ogg')) return 'audio.ogg';
  if (type.includes('mp4')) return 'audio.mp4';
  if (type.includes('mpeg') || type.includes('mp3')) return 'audio.mp3';
  if (type.includes('wav')) return 'audio.wav';
  if (type.includes('flac')) return 'audio.flac';
  return 'audio.webm';
}

/**
 * Extract a human-readable message from an API error body.
 *
 * Parsing is kept separate from throwing so that callers never have to guess, from the
 * message text, whether an exception came from `JSON.parse` or from their own `throw`.
 *
 * @param {string} bodyText - Raw response body.
 * @returns {string|null} The message, or `null` when the body is not a JSON object/array.
 */
function parseApiErrorMessage(bodyText) {
  let payload;
  try {
    payload = JSON.parse(bodyText);
  } catch {
    return null;
  }
  if (payload === null || typeof payload !== 'object') {
    return null;
  }

  // MiniMax reports errors in base_resp.status_msg; OpenAI-style APIs use error.message.
  const candidate = payload.base_resp?.status_msg
    || payload.error?.message
    || payload.message
    || payload.error
    || bodyText;
  return typeof candidate === 'string' ? candidate : JSON.stringify(candidate);
}

/**
 * Decode a hex string into bytes. A trailing unpaired digit is ignored.
 * @param {string} hex
 * @returns {Uint8Array}
 */
function hexToBytes(hex) {
  const byteCount = Math.floor(hex.length / 2);
  const bytes = new Uint8Array(byteCount);
  for (let i = 0; i < byteCount; i++) {
    bytes[i] = Number.parseInt(hex.slice(i * 2, i * 2 + 2), 16);
  }
  return bytes;
}

/**
 * Convert an audio Blob to 16-bit PCM WAV (for better API compatibility).
 * Exported for use by other modules.
 *
 * Conversion is best-effort: on any failure a warning is logged and the original Blob
 * is returned unchanged.
 *
 * @param {Blob} audioBlob
 * @returns {Promise<Blob>} A `audio/wav` Blob, or `audioBlob` itself if conversion failed.
 */
export async function convertToWav(audioBlob) {
  let audioContext = null;
  try {
    audioContext = new (window.AudioContext || window.webkitAudioContext)();
    const arrayBuffer = await audioBlob.arrayBuffer();
    const audioBuffer = await audioContext.decodeAudioData(arrayBuffer);

    // WAV layout parameters.
    const numChannels = audioBuffer.numberOfChannels;
    const sampleRate = audioBuffer.sampleRate;
    const format = 1; // PCM
    const bitDepth = 16;
    const bytesPerSample = bitDepth / 8;
    const blockAlign = numChannels * bytesPerSample;
    const samples = audioBuffer.length;
    const dataSize = samples * blockAlign;

    const buffer = new ArrayBuffer(44 + dataSize);
    const view = new DataView(buffer);

    const writeString = (offset, str) => {
      for (let i = 0; i < str.length; i++) {
        view.setUint8(offset + i, str.charCodeAt(i));
      }
    };

    // 44-byte RIFF/WAVE header (all multi-byte fields little-endian).
    writeString(0, 'RIFF');
    view.setUint32(4, 36 + dataSize, true);
    writeString(8, 'WAVE');
    writeString(12, 'fmt ');
    view.setUint32(16, 16, true);
    view.setUint16(20, format, true);
    view.setUint16(22, numChannels, true);
    view.setUint32(24, sampleRate, true);
    view.setUint32(28, sampleRate * blockAlign, true);
    view.setUint16(32, blockAlign, true);
    view.setUint16(34, bitDepth, true);
    writeString(36, 'data');
    view.setUint32(40, dataSize, true);

    // Interleave the channels and scale clamped [-1, 1] floats to signed 16-bit.
    const channelData = [];
    for (let ch = 0; ch < numChannels; ch++) {
      channelData.push(audioBuffer.getChannelData(ch));
    }

    let offset = 44;
    for (let i = 0; i < samples; i++) {
      for (let ch = 0; ch < numChannels; ch++) {
        const sample = Math.max(-1, Math.min(1, channelData[ch][i]));
        const intSample = sample < 0 ? sample * 0x8000 : sample * 0x7FFF;
        view.setInt16(offset, intSample, true);
        offset += 2;
      }
    }

    return new Blob([buffer], { type: 'audio/wav' });
  } catch (err) {
    console.warn('[可乐] WAV 转换失败，使用原格式:', err);
    return audioBlob;
  } finally {
    // Always release the context, including when decoding failed. A failing close()
    // must not discard an already-converted result, so it is only logged.
    if (audioContext) {
      try {
        await audioContext.close();
      } catch (closeErr) {
        console.warn('[可乐] AudioContext 关闭失败:', closeErr);
      }
    }
  }
}

/**
 * STT: speech to text.
 * @param {Blob} audioBlob - Audio data.
 * @param {Object} [options] - Options (currently unused).
 * @returns {Promise<string>} The recognized text (`''` when the response has no `text`).
 * @throws {Error} When STT is not configured, or the request fails / returns a non-OK status.
 */
export async function speechToText(audioBlob, options = {}) {
  const config = getVoiceApiConfig();

  if (!config.stt.url || !config.stt.key) {
    throw new Error('请先配置语音识别 (STT) API');
  }

  // Auto-complete the endpoint path.
  let sttUrl = config.stt.url.trim().replace(/\/+$/, '');
  if (!sttUrl.includes('/audio/transcriptions')) {
    sttUrl = `${sttUrl}/audio/transcriptions`;
  }

  // Anything that is not already WAV is converted to improve compatibility.
  let processedBlob = audioBlob;
  if (!audioBlob.type.includes('wav')) {
    console.log('[可乐] 转换音频为 WAV 格式...');
    processedBlob = await convertToWav(audioBlob);
  }

  // The file name extension must match the (possibly unconverted) audio type.
  const fileName = getAudioFileName(processedBlob);

  const formData = new FormData();
  formData.append('file', processedBlob, fileName);
  if (config.stt.model) {
    formData.append('model', config.stt.model);
  }

  try {
    console.log('[可乐] STT 请求:', {
      url: sttUrl,
      model: config.stt.model,
      originalType: audioBlob.type,
      processedType: processedBlob.type,
      audioSize: processedBlob.size,
      fileName: fileName,
    });

    const response = await fetch(sttUrl, {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${config.stt.key}`,
      },
      body: formData,
    });

    if (!response.ok) {
      const errorText = await response.text();
      console.error('[可乐] STT API 错误:', response.status, errorText);

      const apiMessage = parseApiErrorMessage(errorText);
      if (apiMessage !== null) {
        throw new Error(apiMessage);
      }
      throw new Error(`HTTP ${response.status}: ${errorText.substring(0, 200)}`);
    }

    const result = await response.json();
    console.log('[可乐] STT 响应:', result);

    return result.text || '';
  } catch (err) {
    console.error('[可乐] STT 请求失败:', err);
    throw err;
  }
}

/**
 * Build the request body for the MiniMax `t2a_v2` endpoint.
 * The emotion is only included when the UI label has a MiniMax equivalent.
 */
function buildMiniMaxRequestBody(model, voice, text, speed, emotion) {
  const body = {
    model: model,
    text: text,
    stream: false,
    voice_setting: {
      voice_id: voice,
      speed: speed,
      vol: 1,
      pitch: 0,
    },
    audio_setting: {
      sample_rate: 32000,
      bitrate: 128000,
      format: 'mp3',
      channel: 1,
    },
  };

  if (emotion && emotion !== '默认') {
    const mappedEmotion = MINIMAX_EMOTIONS.get(emotion);
    if (mappedEmotion) {
      body.voice_setting.emotion = mappedEmotion;
    }
  }

  return body;
}

/**
 * Build an OpenAI-style `/audio/speech` request body.
 * `speed` and `other_params` are only added when `allowExtras` is true.
 */
function buildStandardRequestBody(model, voice, text, speed, emotion, allowExtras) {
  const body = {
    model: model,
    voice: voice,
    input: text,
  };

  if (!allowExtras) {
    return body;
  }

  // Only send speed when it differs from the default.
  if (speed !== 1) {
    body.speed = speed;
  }

  // Extended parameters (supported by GPT-SoVITS and similar backends).
  if (emotion && emotion !== '默认') {
    body.other_params = {
      text_lang: '中英混合',
      prompt_lang: '中文',
      emotion: emotion,
    };
  }

  return body;
}

/**
 * TTS: text to speech.
 * @param {string} text - Text to synthesize.
 * @param {Object} [contact] - Character object (used for the per-character voice).
 * @param {Object} [options] - Per-call overrides: `model`, `voice`, `speed`, `emotion`.
 * @returns {Promise<Blob>} Audio Blob.
 * @throws {Error} When TTS is not configured, the text is empty, model/voice is missing,
 *   or the request fails / returns something that is not audio.
 */
export async function textToSpeech(text, contact = null, options = {}) {
  const config = getVoiceApiConfig(contact);

  if (!config.tts.url || !config.tts.key) {
    throw new Error('请先配置语音合成 (TTS) API');
  }

  if (!text || !text.trim()) {
    throw new Error('合成文字不能为空');
  }

  // Auto-complete the endpoint path.
  let ttsUrl = config.tts.url.trim().replace(/\/+$/, '');
  if (!ttsUrl.includes('/audio/speech')) {
    ttsUrl = `${ttsUrl}/audio/speech`;
  }

  const model = (options.model || config.tts.model || '').trim();
  const voice = (options.voice || config.tts.voice || '').trim();

  // Required fields.
  if (!model) {
    throw new Error('请先配置 TTS 模型');
  }
  if (!voice) {
    throw new Error('请先配置 TTS 音色');
  }

  // Backend detection is purely name/URL based.
  const modelLower = model.toLowerCase();
  const isGeminiTTS = modelLower.includes('gemini') && modelLower.includes('tts');
  const isGSVI = modelLower.includes('gsvi'); // GSVI (gsv2p.acgnai.top)
  const isMiniMax = ttsUrl.toLowerCase().includes('minimax') || ttsUrl.includes('/t2a_v2');

  if (isMiniMax) {
    // MiniMax uses /t2a_v2 instead of /audio/speech.
    ttsUrl = ttsUrl.replace(/\/audio\/speech$/, '/t2a_v2');
    if (!ttsUrl.includes('/t2a_v2')) {
      ttsUrl = `${ttsUrl.replace(/\/+$/, '')}/t2a_v2`;
    }

    // When a proxy is configured (CORS workaround), keep only the MiniMax path
    // and send it to the proxy origin.
    if (config.tts.proxyUrl) {
      const proxyBase = config.tts.proxyUrl.trim().replace(/\/+$/, '');
      const urlObj = new URL(ttsUrl);
      ttsUrl = proxyBase + urlObj.pathname;
      console.log('[可乐] MiniMax 使用代理:', ttsUrl);
    }
  }

  const speed = options.speed || config.tts.speed || 1;
  const emotion = options.emotion || config.tts.emotion;
  const trimmedText = text.trim();

  // GSVI and Gemini TTS only receive model/voice/input; other backends may get extras.
  const requestBody = isMiniMax
    ? buildMiniMaxRequestBody(model, voice, trimmedText, speed, emotion)
    : buildStandardRequestBody(model, voice, trimmedText, speed, emotion, !isGSVI && !isGeminiTTS);

  try {
    const textContent = requestBody.input || requestBody.text || '';
    console.log('[可乐] TTS 请求:', {
      url: ttsUrl,
      model: model,
      voice: voice,
      isGSVI: isGSVI,
      isGeminiTTS: isGeminiTTS,
      isMiniMax: isMiniMax,
      textLength: textContent.length,
      textFull: textContent, // full text is logged on purpose
    });

    const response = await fetch(ttsUrl, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'Accept': isMiniMax ? 'application/json' : 'audio/mpeg, audio/wav, audio/*',
        'Authorization': `Bearer ${config.tts.key}`,
      },
      body: JSON.stringify(requestBody),
    });

    if (!response.ok) {
      const errorText = await response.text();
      console.error('[可乐] TTS API 错误:');
      console.error(' 状态码:', response.status);
      console.error(' 响应内容:', errorText);
      console.error(' 请求URL:', ttsUrl);
      console.error(' 请求体:', JSON.stringify(requestBody, null, 2));

      const apiMessage = parseApiErrorMessage(errorText);
      if (apiMessage !== null) {
        throw new Error(apiMessage);
      }
      throw new Error(`HTTP ${response.status}: ${errorText.substring(0, 300)}`);
    }

    // MiniMax answers with JSON that carries the audio as a hex string.
    if (isMiniMax) {
      const jsonResp = await response.json();
      console.log('[可乐] MiniMax TTS 响应:', {
        status_code: jsonResp.base_resp?.status_code,
        status_msg: jsonResp.base_resp?.status_msg,
        audio_length: jsonResp.extra_info?.audio_length,
        audio_format: jsonResp.extra_info?.audio_format,
      });

      if (jsonResp.base_resp?.status_code !== 0) {
        throw new Error('MiniMax TTS 错误: ' + (jsonResp.base_resp?.status_msg || '未知错误'));
      }

      if (!jsonResp.data?.audio) {
        throw new Error('MiniMax TTS 未返回音频数据');
      }

      const bytes = hexToBytes(jsonResp.data.audio);
      const audioFormat = jsonResp.extra_info?.audio_format || 'mp3';
      return new Blob([bytes], { type: `audio/${audioFormat}` });
    }

    const audioBlob = await response.blob();
    const headerType = response.headers.get('content-type');

    console.log('[可乐] TTS 响应:', {
      音频大小: audioBlob.size,
      类型: audioBlob.type,
      响应头ContentType: headerType,
    });

    // Some APIs answer 200 with a JSON/text error body instead of audio.
    const contentType = headerType || audioBlob.type;
    if (contentType.includes('application/json') || contentType.includes('text/')) {
      const bodyText = await audioBlob.text();
      console.error('[可乐] TTS 返回了文本而非音频:', bodyText);

      const apiMessage = parseApiErrorMessage(bodyText);
      if (apiMessage !== null) {
        throw new Error('TTS 错误: ' + apiMessage);
      }
      throw new Error('TTS 返回了非音频数据: ' + bodyText.substring(0, 100));
    }

    // Reject payloads that are too small to be real audio.
    if (audioBlob.size < 100) {
      console.error('[可乐] TTS 返回的数据太小，可能不是有效音频');
      throw new Error('TTS 返回的音频数据无效');
    }

    // Some TTS APIs (e.g. GPT-SoVITS) return audio without a usable Content-Type.
    // Re-wrap the bytes with an explicit audio MIME type in that case.
    if (audioBlob.type && audioBlob.type.startsWith('audio/')) {
      return audioBlob;
    }

    // Prefer an audio/* header value; otherwise (including application/octet-stream)
    // fall back to audio/wav.
    let mimeType = 'audio/wav';
    if (headerType && headerType.startsWith('audio/')) {
      mimeType = headerType.split(';')[0].trim();
    }
    console.log('[可乐] TTS blob 类型为空，手动指定为:', mimeType);

    const arrayBuffer = await audioBlob.arrayBuffer();
    return new Blob([arrayBuffer], { type: mimeType });
  } catch (err) {
    console.error('[可乐] TTS 请求失败:', err);
    // Translate browser network failures into an actionable message.
    if (err.message?.includes('Failed to fetch') || err.message?.includes('NetworkError')) {
      throw new Error('网络连接失败，请检查 API 地址是否正确，或尝试使用代理', { cause: err });
    }
    throw err;
  }
}

/**
 * Play audio.
 *
 * For Blob input a temporary object URL is created and revoked once playback ends,
 * errors, or fails to start.
 *
 * @param {Blob|string} audio - Audio Blob or URL.
 * @returns {Promise<HTMLAudioElement>} Resolves with the Audio element when playback ends;
 *   rejects on a media error or when `play()` is rejected.
 */
export function playAudio(audio) {
  return new Promise((resolve, reject) => {
    const audioEl = new Audio();
    const objectUrl = audio instanceof Blob ? URL.createObjectURL(audio) : null;

    const releaseUrl = () => {
      if (objectUrl) {
        URL.revokeObjectURL(objectUrl);
      }
    };

    audioEl.src = objectUrl ?? audio;

    audioEl.onended = () => {
      releaseUrl();
      resolve(audioEl);
    };

    audioEl.onerror = (err) => {
      releaseUrl();
      reject(err);
    };

    // play() can reject without firing `error` (e.g. autoplay blocked); release the URL then too.
    audioEl.play().catch((err) => {
      releaseUrl();
      reject(err);
    });
  });
}

/**
 * Microphone recorder built on MediaRecorder.
 */
export class AudioRecorder {
  constructor() {
    this.mediaRecorder = null;
    this.audioChunks = [];
    this.stream = null;
    this.isRecording = false;
    this.mimeType = 'audio/webm';
  }

  /**
   * Start recording. Does nothing if a recording is already in progress.
   * @returns {Promise<void>}
   * @throws {Error} When the microphone cannot be opened or the recorder cannot be started.
   */
  async start() {
    if (this.isRecording) return;

    try {
      this.stream = await navigator.mediaDevices.getUserMedia({ audio: true });

      // Pick the best supported container/codec.
      this.mimeType = getSupportedMimeType();
      console.log('[可乐] 录音使用格式:', this.mimeType);

      this.mediaRecorder = new MediaRecorder(this.stream, { mimeType: this.mimeType });
      this.audioChunks = [];

      this.mediaRecorder.ondataavailable = (e) => {
        if (e.data.size > 0) {
          this.audioChunks.push(e.data);
        }
      };

      this.mediaRecorder.start(100); // collect data every 100 ms
      this.isRecording = true;
      console.log('[可乐] 开始录音');
    } catch (err) {
      console.error('[可乐] 无法获取麦克风权限:', err);
      // If the stream was opened but recorder setup failed, stop its tracks so the
      // microphone is not left active.
      this.cleanup();
      throw new Error('无法获取麦克风权限，请检查浏览器设置', { cause: err });
    }
  }

  /**
   * Stop recording.
   * @returns {Promise<Blob>} The recorded audio; rejects when nothing is being recorded.
   */
  stop() {
    return new Promise((resolve, reject) => {
      if (!this.isRecording || !this.mediaRecorder) {
        reject(new Error('没有正在进行的录音'));
        return;
      }

      // Captured now because cleanup() and later start() calls may change instance state.
      const mimeType = this.mimeType;

      this.mediaRecorder.onstop = () => {
        const audioBlob = new Blob(this.audioChunks, { type: mimeType });
        this.cleanup();
        console.log('[可乐] 录音结束，格式:', mimeType, '大小:', audioBlob.size);
        resolve(audioBlob);
      };

      this.mediaRecorder.stop();
      this.isRecording = false;
    });
  }

  /**
   * Cancel recording and discard any captured data.
   */
  cancel() {
    if (this.mediaRecorder && this.isRecording) {
      // Detach first so the final flush cannot write into the reset chunk list.
      this.mediaRecorder.ondataavailable = null;
      this.mediaRecorder.stop();
    }
    this.cleanup();
    this.isRecording = false;
  }

  /**
   * Release resources: stop all stream tracks and drop the recorder and chunks.
   */
  cleanup() {
    if (this.stream) {
      this.stream.getTracks().forEach((track) => track.stop());
      this.stream = null;
    }
    this.mediaRecorder = null;
    this.audioChunks = [];
  }

  /**
   * Check whether the browser supports recording.
   * The secure-context flag is logged for diagnostics but does not affect the result.
   * @returns {boolean}
   */
  static isSupported() {
    const hasGetUserMedia = !!(navigator.mediaDevices && navigator.mediaDevices.getUserMedia);
    const hasMediaRecorder = typeof MediaRecorder !== 'undefined';
    const isSecureContext = window.isSecureContext;

    console.log('[可乐] 录音支持检测:', {
      getUserMedia: hasGetUserMedia,
      MediaRecorder: hasMediaRecorder,
      isSecureContext: isSecureContext,
      protocol: location.protocol,
    });

    return hasGetUserMedia && hasMediaRecorder;
  }

  /**
   * Describe why recording is unsupported.
   * @returns {string}
   */
  static getUnsupportedReason() {
    if (!window.isSecureContext) {
      return '需要 HTTPS 安全连接才能使用录音功能';
    }
    if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
      return '浏览器不支持 getUserMedia API';
    }
    if (typeof MediaRecorder === 'undefined') {
      return '浏览器不支持 MediaRecorder API（iOS Safari 需要 iOS 14.3+）';
    }
    return '未知原因';
  }
}

/**
 * Return the first audio MIME type MediaRecorder reports as supported,
 * or `audio/webm` when none of the candidates is.
 * @returns {string}
 */
function getSupportedMimeType() {
  const types = [
    'audio/webm;codecs=opus',
    'audio/webm',
    'audio/ogg;codecs=opus',
    'audio/ogg',
    'audio/mp4',
    'audio/mpeg',
  ];

  for (const type of types) {
    if (MediaRecorder.isTypeSupported(type)) {
      return type;
    }
  }

  return 'audio/webm';
}

/**
 * Record a 1.5 s synthetic tone (frequency and gain ramps) from the given AudioContext.
 * The caller owns the context and is responsible for closing it.
 * @param {AudioContext} audioContext
 * @returns {Promise<Blob>} The recorded test audio.
 */
function recordTestTone(audioContext) {
  return new Promise((resolve) => {
    const oscillator = audioContext.createOscillator();
    const gainNode = audioContext.createGain();
    const destination = audioContext.createMediaStreamDestination();

    oscillator.connect(gainNode);
    gainNode.connect(destination);

    const now = audioContext.currentTime;

    // Frequency sweep.
    oscillator.frequency.setValueAtTime(200, now);
    oscillator.frequency.linearRampToValueAtTime(400, now + 0.5);
    oscillator.frequency.linearRampToValueAtTime(300, now + 1);
    oscillator.frequency.linearRampToValueAtTime(350, now + 1.5);

    // Volume envelope.
    gainNode.gain.setValueAtTime(0.3, now);
    gainNode.gain.linearRampToValueAtTime(0.5, now + 0.3);
    gainNode.gain.linearRampToValueAtTime(0.3, now + 1.2);
    gainNode.gain.linearRampToValueAtTime(0, now + 1.5);

    oscillator.start();

    const mimeType = getSupportedMimeType();
    console.log('[可乐] 录制音频格式:', mimeType);

    // A throw here rejects the promise; the caller's finally then closes the context.
    const recorder = new MediaRecorder(destination.stream, { mimeType });
    const chunks = [];

    recorder.ondataavailable = (e) => {
      if (e.data.size > 0) {
        chunks.push(e.data);
      }
    };

    recorder.onstop = () => {
      oscillator.stop();
      resolve(new Blob(chunks, { type: mimeType }));
    };

    recorder.start(100);

    // Record for 1.5 seconds.
    setTimeout(() => recorder.stop(), 1500);
  });
}

/**
 * Test the STT API by transcribing a short synthetic recording.
 * @returns {Promise<boolean>} Resolves `true` when the request succeeds.
 * @throws {Error} When STT is not configured, the test audio could not be generated,
 *   or the STT request fails.
 */
export async function testSttApi() {
  const config = getVoiceApiConfig();

  if (!config.stt.url || !config.stt.key) {
    throw new Error('请先填写 STT API 地址和密钥');
  }

  console.log('[可乐] 开始 STT 测试...');
  console.log('[可乐] STT 配置:', {
    url: config.stt.url,
    model: config.stt.model,
    keyLength: config.stt.key?.length || 0,
  });

  // Create the test audio (1.5 s).
  const audioContext = new (window.AudioContext || window.webkitAudioContext)();
  let blob;
  try {
    blob = await recordTestTone(audioContext);
  } finally {
    // Release the context whether or not recording succeeded.
    try {
      await audioContext.close();
    } catch (closeErr) {
      console.warn('[可乐] AudioContext 关闭失败:', closeErr);
    }
  }

  console.log('[可乐] 测试音频大小:', blob.size, 'bytes');

  if (blob.size < 100) {
    throw new Error('测试音频生成失败');
  }

  // speechToText converts to WAV automatically.
  const result = await speechToText(blob);
  console.log('[可乐] STT 测试结果:', result);
  return true;
}

/**
 * Test the TTS API by synthesizing a short fixed phrase.
 * @returns {Promise<Blob>} The synthesized audio.
 * @throws {Error} When TTS is not configured or the synthesis request fails.
 */
export async function testTtsApi() {
  const config = getVoiceApiConfig();

  if (!config.tts.url || !config.tts.key) {
    throw new Error('请先填写 TTS API 地址和密钥');
  }

  return await textToSpeech('测试语音合成');
}