main idea is now working :)
Using OpenAI for TTS and Groq (instead of local Ollama) for fast LLM inference
@@ -183,7 +183,7 @@
         break;
     case "text":
     case "transcriptionResult":
-        transcription.innerHTML += "\r\n" + json.text;
+        transcription.innerHTML += "<br />" + json.text;
         let latency = Date.now() - serverTime;
         if (autosend.checked) {
            // const arr = event.data.split(/[(\)]/);
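Note on the hunk above: innerHTML does not render "\r\n" as a line break, hence the switch to "<br />". Appending server-provided text through innerHTML also means any markup inside json.text is interpreted. A sketch of the same append using DOM text nodes, which treats json.text as plain text:

    // Equivalent to transcription.innerHTML += "<br />" + json.text,
    // but json.text cannot inject markup this way.
    transcription.appendChild(document.createElement('br'));
    transcription.appendChild(document.createTextNode(json.text));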
@@ -197,6 +197,13 @@
            //transcription.innerHTML = event.data;
         }
         break;
+    case 'audio':
+        const audioBuffer = Uint8Array.from(atob(json.audio), char => char.charCodeAt(0));
+        const audioBlob = new Blob([audioBuffer], { type: 'audio/mp3' });
+        const audioUrl = URL.createObjectURL(audioBlob);
+        const audio = new Audio(audioUrl);
+        audio.play();
+        break;

    case "userList":
        users = json.users;
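Each incoming audio message above creates an object URL that is never released. A minimal variant of the same decode-and-play flow that frees the URL after playback, assuming the same json.audio base64 field:

    case 'audio': {
        // Decode the base64 payload into bytes and wrap it as an MP3 blob.
        const audioBuffer = Uint8Array.from(atob(json.audio), char => char.charCodeAt(0));
        const audioBlob = new Blob([audioBuffer], { type: 'audio/mp3' });
        const audioUrl = URL.createObjectURL(audioBlob);
        const audio = new Audio(audioUrl);
        // Release the object URL once playback finishes.
        audio.addEventListener('ended', () => URL.revokeObjectURL(audioUrl));
        audio.play();
        break;
    }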
@@ -235,14 +242,14 @@
 }

 function userJoin(sessionId, username, language) {
-    socket.send(JSON.stringify({ type: 'join', username , language}));
+    socket.send(JSON.stringify({ type: 'join', username, language }));
     document.cookie = `sessionId=${sessionId}; path=/;`;
     document.cookie = `username=${username}; path=/;`;

     showClearSessionOption();
 }


 function clearSession() {
     document.cookie = "sessionId=; expires=Thu, 01 Jan 1970 00:00:00 UTC; path=/;";
     document.cookie = "username=; expires=Thu, 01 Jan 1970 00:00:00 UTC; path=/;";
@@ -303,7 +310,7 @@
     users.forEach(user => {
         const option = document.createElement('option');
         option.value = user.sessionId;
-        option.innerText = "["+user.language+"] " +user.username;
+        option.innerText = "[" + user.language + "] " + user.username;
         if (user.username === username) {
             option.innerText += " (me)";
         }
@@ -8,6 +8,14 @@ const path = require('path');
 const dotenv = require('dotenv');
 const ollama = require('ollama');
 const axios = require('axios');
+// import OpenAI from "openai";
+const OpenAI = require('openai');
+const openai = new OpenAI({ apiKey: "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" });
+
+const Groq = require('groq-sdk');
+//const LLM = require("@themaximalist/llm.js"); //https://www.npmjs.com/package/@themaximalist/llm.js
+const groq = new Groq({ apiKey: process.env.GROQ_API_KEY });
+

 if (dotenv) {
     const envFile = process.env.NODE_ENV === 'development' ? '.env.development' : '.env';
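The OpenAI key above is committed in plain text and should be rotated. Since this file already loads dotenv, a safer sketch, assuming an OPENAI_API_KEY entry in the same .env (the openai SDK also falls back to process.env.OPENAI_API_KEY when no apiKey is passed):

    const OpenAI = require('openai');
    // OPENAI_API_KEY is assumed to be defined in .env, like GROQ_API_KEY below.
    const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });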
@@ -21,6 +29,7 @@ const PORT_HTTP = process.env.SERVER_PORT_HTTP || 3000;
 const PORT_WS = process.env.SERVER_PORT_WS || 8080;
 const TTS_API_URL = process.env.TTS_API_URL;
 const LNN_API_URL = process.env.LNN_API_URL;
+const LLN_MODEL = process.env.LLN_MODEL;

 let language = "en";
 let storeRecordings = false;
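For reference, the environment variables the server now reads, as a sample .env (all values are illustrative placeholders, not taken from the commit):

    SERVER_PORT_HTTP=3000
    SERVER_PORT_WS=8080
    TTS_API_URL=http://localhost:5002
    LNN_API_URL=http://localhost:11434
    LLN_MODEL=qwen2
    GROQ_API_KEY=gsk_your_key_here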
@@ -221,17 +230,50 @@ function detectLanguage(ws, formData) {
 }

 async function translateText(originalText, originalLanguage, targetLanguage) {
-    return queryLLMAxios("translate this text from " + originalLanguage + " to " + targetLanguage + ": " + originalText)
-        .then(response => {
-            console.log('Translation response:', response);
-            return response;
+    const prompt = "Translate this text from " + originalLanguage + " to " + targetLanguage + ": " + originalText;
+
+    // const llm = new LLM();
+    // llm.system("Translate voice transcriptions. Some words may be homonymous, so please provide the most likely translation.");
+    // let result = await llm.chat(prompt, { service: "groq", model: "mixtral-8x7b-32768" });
+    // return result;
+
+    return groq.chat.completions
+        .create({
+            messages: [
+                {
+                    role: "system",
+                    content: "You are translating voice transcriptions from '" + originalLanguage + "' to '" + targetLanguage + "'. Reply with just the translation. It will be converted to speech using TTS - you can add more context if needed.",
+                },
+                {
+                    role: "user",
+                    content: originalText,
+                },
+            ],
+            model: "llama3-8b-8192",
+        })
+        .then((chatCompletion) => {
+            let result = chatCompletion.choices[0]?.message?.content || "";
+            console.log(result);
+            return { response: result };
         });
+
+    // return queryLLMAxios("translate this text from " + originalLanguage + " to " + targetLanguage + ": " + originalText)
+    //     .then(response => {
+    //         console.log('Translation response:', response);
+    //         return response;
+    //     });
 }
 async function queryLLM(prompt) {
     const requestData = {
-        model: 'qwen2', // ollama3
+        model: LLN_MODEL || 'qwen2', // ollama3
         prompt: prompt,
-        system: "you provide translations to the text transcribed from audio. The text is in a language you understand, and you can provide translations to any language you know.",
+        system: "Translate voice transcriptions. Some words may be homonymous, so please provide the most likely translation.",
         //format: "json"
     };
     const ola = new ollama.Ollama({ host: LNN_API_URL })
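Since translateText is already async, the Groq call above can use await instead of the .then chain; a sketch of an equivalent body, with the same model and messages:

    async function translateText(originalText, originalLanguage, targetLanguage) {
        const chatCompletion = await groq.chat.completions.create({
            messages: [
                { role: "system", content: "You are translating voice transcriptions from '" + originalLanguage + "' to '" + targetLanguage + "'. Reply with just the translation." },
                { role: "user", content: originalText },
            ],
            model: "llama3-8b-8192",
        });
        // Mirror the original shape so callers can keep reading jsonResp.response.
        return { response: chatCompletion.choices[0]?.message?.content || "" };
    }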
@@ -241,14 +283,14 @@ async function queryLLM(prompt) {
 ///obsolete function
 async function queryLLMAxios(prompt) {
     const requestData = {
-        model: 'qwen2',
+        model: LLN_MODEL || 'qwen2',
         prompt: prompt,
-        "system": "talk like a pirate",
+        "system": "Translate voice transcriptions. Some words may be homonymous, so please provide the most likely translation.",
         "stream": false
     };

     try {
-        const response = await axios.post(LNN_API_URL + "/api/generate", requestData, {
+        const response = await axios.post(LNN_API_URL, requestData, {
            headers: {
                // 'Authorization': `Bearer ${OLLAMA_API_KEY}`,
                'Content-Type': 'application/json'
@@ -261,7 +303,7 @@ async function queryLLMAxios(prompt) {
     }
 }

-function transcribeAudio(ws, formData, sessionData) {
+async function transcribeAudio(ws, formData, sessionData) {
     const start = new Date().getTime();
     queueCounter++;

@@ -289,16 +331,36 @@ function transcribeAudio(ws, formData, sessionData) {
     chat.participants.forEach(sessionId => {
         if (sessionId !== ws.sessionId) {
             let targetLang = sessions.get(sessionId)?.language || 'en';
-            targetLang = "bg";
+            //targetLang = "bg";
             if (targetLang !== sessionData.language) {
-                console.log('Translating message "'+body+'" from ' + sessionData.language + ' to ' + targetLang);
+                console.log('Translating message "' + body + '" from ' + sessionData.language + ' to ' + targetLang);
                 translateText(body, sessionData.language, targetLang)
                     .then(translation => {
-                        const jsonResp = JSON.parse(translation);
-                        msg.translations.push({ language: targetLang, text: jsonResp.response });
+                        let jsonResp;
+                        if (typeof translation === 'string') {
+                            try {
+                                jsonResp = JSON.parse(translation);
+                            } catch (e) {
+                                console.error('Failed to parse translation response:', e);
+                                ws.send(JSON.stringify({ type: 'error', message: 'Invalid translation response' }));
+                                return;
+                            }
+                        } else {
+                            jsonResp = translation;
+                        }
+
                         const participantSocket = Array.from(wss.clients).find(client => client.sessionId === sessionId);
                         if (participantSocket && participantSocket.readyState === WebSocket.OPEN) {
                             participantSocket.send(JSON.stringify({ type: 'text', text: sessionData.username + ': ' + jsonResp.response + "\n" }));
+
+                            // Generate and send the speech audio
+                            generateSpeech(jsonResp.response)
+                                .then(audioBuffer => {
+                                    console.log('Generated audio for translation:', audioBuffer.length);
+                                    msg.translations.push({ language: targetLang, text: jsonResp.response, audio: audioBuffer.toString('base64') });
+                                    participantSocket.send(JSON.stringify({ type: 'audio', audio: audioBuffer.toString('base64') }));
+                                });
                         }
                     });
             }
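One caveat in the hunk above: msg.translations is filled inside asynchronous .then callbacks, so code that saves or forwards msg right after the forEach may observe an incomplete translations array. If that ordering matters, one option is to collect the promises; a sketch, runnable inside the now-async transcribeAudio, assuming translateText resolves to { response } as in the new code:

    const pending = chat.participants
        .filter(sessionId => sessionId !== ws.sessionId)
        .map(sessionId => {
            const targetLang = sessions.get(sessionId)?.language || 'en';
            if (targetLang === sessionData.language) return Promise.resolve();
            return translateText(body, sessionData.language, targetLang)
                .then(jsonResp => msg.translations.push({ language: targetLang, text: jsonResp.response }));
        });
    // Every translation has been pushed once this resolves.
    await Promise.all(pending);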
@@ -306,6 +368,7 @@ function transcribeAudio(ws, formData, sessionData) {
             const participantSocket = Array.from(wss.clients).find(client => client.sessionId === sessionId);
             if (participantSocket && participantSocket.readyState === WebSocket.OPEN) {
                 participantSocket.send(JSON.stringify({ type: 'text', text: sessionData.username + ': ' + body + "\n" }));
+                participantSocket.send(JSON.stringify({ type: 'audio', audio: formData.toString('base64') }));
             }
         }
     }
@@ -336,6 +399,16 @@ function broadcastUserList() {
     });
 }

+async function generateSpeech(text) {
+    const mp3 = await openai.audio.speech.create({
+        model: "tts-1",
+        voice: "alloy",
+        input: text,
+    });
+    const buffer = Buffer.from(await mp3.arrayBuffer());
+    return buffer;
+}
+
 // HTTP Server
 app.get('/', (req, res) => {
     res.sendFile(path.join(__dirname, 'chat-client.html'));
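The new generateSpeech helper can also be exercised on its own; a minimal sketch that writes the returned MP3 buffer to disk (the output path is illustrative):

    const fs = require('fs');

    generateSpeech('Hello from the translator')
        .then(buffer => {
            fs.writeFileSync('tts-test.mp3', buffer); // illustrative path
            console.log('Wrote', buffer.length, 'bytes of MP3 audio');
        });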