Main idea is now working :)

Using OpenAI for TTS and Groq (in place of Ollama) for fast inference
Dobromir Popov
2024-06-12 10:05:09 +03:00
parent 8bafcd9dbe
commit 4627f16284
6 changed files with 308 additions and 20 deletions
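The new pipeline: transcription -> Groq chat completion for translation -> OpenAI tts-1 for speech, with the MP3 sent to participants as base64 over the WebSocket. A minimal sketch of the environment the server now expects (variable names taken from the diff below; values are placeholders, and OPENAI_API_KEY assumes the hardcoded key is moved into the env file):

    # .env (or .env.development when NODE_ENV=development)
    OPENAI_API_KEY=sk-...                # OpenAI TTS (tts-1)
    GROQ_API_KEY=gsk_...                 # Groq chat completions (llama3-8b-8192)
    LNN_API_URL=http://localhost:11434   # Ollama host for the Ollama-based paths
    LLN_MODEL=qwen2                      # Ollama model for those paths
    SERVER_PORT_HTTP=3000
    SERVER_PORT_WS=8080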


@@ -8,6 +8,14 @@ const path = require('path');
const dotenv = require('dotenv');
const ollama = require('ollama');
const axios = require('axios');
// import OpenAI from "openai";
const OpenAI = require('openai');
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY }); // key moved to env; never commit a hardcoded secret
const Groq = require('groq-sdk');
//const LLM = require("@themaximalist/llm.js"); //https://www.npmjs.com/package/@themaximalist/llm.js
const groq = new Groq({ apiKey: process.env.GROQ_API_KEY });
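// NOTE: these clients are constructed before dotenv.config() runs below, so
// GROQ_API_KEY (and OPENAI_API_KEY) must already be set in the shell environment,
// not only in the .env file.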
if (dotenv) {
const envFile = process.env.NODE_ENV === 'development' ? '.env.development' : '.env';
@@ -21,6 +29,7 @@ const PORT_HTTP = process.env.SERVER_PORT_HTTP || 3000;
const PORT_WS = process.env.SERVER_PORT_WS || 8080;
const TTS_API_URL = process.env.TTS_API_URL;
const LNN_API_URL = process.env.LNN_API_URL;
const LLN_MODEL = process.env.LLN_MODEL;
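// LLN_MODEL selects the model for the Ollama-based queryLLM/queryLLMAxios paths below.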
let language = "en";
let storeRecordings = false;
@@ -221,17 +230,50 @@ function detectLanguage(ws, formData) {
}
async function translateText(originalText, originalLanguage, targetLanguage) {
return queryLLMAxios("translate this text from " + originalLanguage + " to " + targetLanguage + ": " + originalText)
.then(response => {
console.log('Translation response:', response);
return response;
const prompt = "Translate this text from " + originalLanguage + " to " + targetLanguage + ": " + originalText;
// const llm = new LLM();
// llm.system("Translate voice transcriptions. Some words may be homophones, so please provide the most likely translation.");
// let result = await llm.chat(prompt, { service: "groq", model: "mixtral-8x7b-32768" });
// return result;
return groq.chat.completions
.create({
messages: [
{
role: "system",
content: "You are translating voice transcriptions from '" + originalLanguage + "' to '" + targetLanguage + "'. Reply with just the translation. It will be converted to speech using TTS - you can add more context if needed.",
},
{
role: "user",
content: originalText,
},
],
model: "llama3-8b-8192",
})
.then((chatCompletion) => {
let result = chatCompletion.choices[0]?.message?.content || "";
console.log(result);
return { response: result };
});
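// Resolving with { response: result } mirrors the old Ollama /api/generate payload
// shape, so callers that read jsonResp.response keep working unchanged.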
// return queryLLMAxios("translate this text from " + originalLanguage + " to " + targetLanguage + ": " + originalText)
// .then(response => {
// console.log('Translation response:', response);
// return response;
// });
}
async function queryLLM(prompt) {
const requestData = {
model: 'qwen2', // ollama3
model: LLN_MODEL || 'qwen2', // ollama3
prompt: prompt,
system: "you provide translations to the text transcribed from audio. The text is in a language you understand, and you can provide translations to any language you know.",
system: "Translate voice transcriptions. some words may be omonymous, so please provide the most likely translation.",
//format: "json"
};
const ola = new ollama.Ollama({ host: LNN_API_URL })
@@ -241,14 +283,14 @@ async function queryLLM(prompt) {
///obsolete function
async function queryLLMAxios(prompt) {
const requestData = {
model: 'qwen2',
model: LLN_MODEL || 'qwen2',
prompt: prompt,
"system": "talk like a pirate",
"system": "Translate voice transcriptions. some words may be omonymous, so please provide the most likely translation.",
"stream": false
};
try {
const response = await axios.post(LNN_API_URL + "/api/generate", requestData, {
const response = await axios.post(LNN_API_URL, requestData, {
headers: {
// 'Authorization': `Bearer ${OLLAMA_API_KEY}`,
'Content-Type': 'application/json'
@@ -261,7 +303,7 @@ async function queryLLMAxios(prompt) {
}
}
function transcribeAudio(ws, formData, sessionData) {
async function transcribeAudio(ws, formData, sessionData) {
const start = new Date().getTime();
queueCounter++;
@@ -289,16 +331,36 @@ function transcribeAudio(ws, formData, sessionData) {
chat.participants.forEach(sessionId => {
if (sessionId !== ws.sessionId) {
let targetLang = sessions.get(sessionId)?.language || 'en';
targetLang = "bg";
//targetLang = "bg";
if (targetLang !== sessionData.language) {
console.log('Translating message "'+body+'" from ' + sessionData.language + ' to ' + targetLang);
console.log('Translating message "' + body + '" from ' + sessionData.language + ' to ' + targetLang);
translateText(body, sessionData.language, targetLang)
.then(translation => {
const jsonResp = JSON.parse(translation);
msg.translations.push({ language: targetLang, text: jsonResp.response });
let jsonResp;
if (typeof translation === 'string') {
try {
jsonResp = JSON.parse(translation);
} catch (e) {
console.error('Failed to parse translation response:', e);
ws.send(JSON.stringify({ type: 'error', message: 'Invalid translation response' }));
return;
}
} else {
jsonResp = translation;
}
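// The Groq path resolves with an object; the legacy queryLLMAxios path returned
// a JSON string, hence both branches above.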
const participantSocket = Array.from(wss.clients).find(client => client.sessionId === sessionId);
if (participantSocket && participantSocket.readyState === WebSocket.OPEN) {
participantSocket.send(JSON.stringify({ type: 'text', text: sessionData.username + ': ' + jsonResp.response + "\n" }));
// Generate and send the speech audio
generateSpeech(jsonResp.response)
.then(audioBuffer => {
console.log('Generated audio for translation:', audioBuffer.length);
msg.translations.push({ language: targetLang, text: jsonResp.response, audio: audioBuffer.toString('base64') });
participantSocket.send(JSON.stringify({ type: 'audio', audio: audioBuffer.toString('base64') }));
})
.catch(err => console.error('Speech generation failed:', err));
}
})
.catch(err => console.error('Translation failed:', err));
}
@@ -306,6 +368,7 @@ function transcribeAudio(ws, formData, sessionData) {
const participantSocket = Array.from(wss.clients).find(client => client.sessionId === sessionId);
if (participantSocket && participantSocket.readyState === WebSocket.OPEN) {
participantSocket.send(JSON.stringify({ type: 'text', text: sessionData.username + ': ' + body + "\n" }));
participantSocket.send(JSON.stringify({ type: 'audio', audio: formData.toString('base64') }));
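// Participants who share the speaker's language get the original recording rather than TTS audio.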
}
}
}
@@ -336,6 +399,16 @@ function broadcastUserList() {
});
}
async function generateSpeech(text) {
const mp3 = await openai.audio.speech.create({
model: "tts-1",
voice: "alloy",
input: text,
});
const buffer = Buffer.from(await mp3.arrayBuffer());
return buffer;
}
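// Usage sketch (matches the translation handler above): base64-encode the MP3
// buffer and push it to a client as a { type: 'audio' } WebSocket message, e.g.:
//   const audio = await generateSpeech('Hello there');
//   socket.send(JSON.stringify({ type: 'audio', audio: audio.toString('base64') }));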
// HTTP Server
app.get('/', (req, res) => {
res.sendFile(path.join(__dirname, 'chat-client.html'));