various improvements:

using mono audio;
options to store recordings;
multiple backends;
new audio file name;
sessions;
popov 2023-03-15 14:20:35 +00:00
parent 5354d8c328
commit 1c15463b21
6 changed files with 588 additions and 357 deletions

.gitignore vendored
View File

@@ -1,2 +1,3 @@
node_modules/*
package-lock.json
rec/*

View File

@@ -0,0 +1 @@
{"key":"language","value":"bg"}

View File

@@ -0,0 +1 @@
{"key":"storeRecordings","value":"true"}

View File

@@ -37,7 +37,7 @@ COPY package*.json ./
# Install dependencies
RUN npm install ws express request #--only=production
RUN npm install ws express request node-persist body-parser dotenv #--only=production
# Copy the rest of the application files
COPY . .
@@ -45,6 +45,7 @@ COPY . .
# Start the application
#CMD ["npm", "start"]
CMD npm start
# portainer: '-c' 'echo Container started; trap "exit 0" 15; exec npm start'
EXPOSE 8080 8081

View File

@@ -3,45 +3,87 @@
<head>
<title>Real-time Speech-to-Text</title>
<style>
.recording {
background-color: red;
color: white;
}
</style>
<meta name="viewport"
content="width=device-width, initial-scale=1">
<!-- Add the Tailwind CSS library -->
<link rel="stylesheet"
href="https://cdnjs.cloudflare.com/ajax/libs/tailwindcss/2.2.19/tailwind.min.css">
</head>
<body>
<h1>Rt STT</h1>
<label class="toggle">
<body class="bg-gray-100">
<div class="container mx-auto px-4 py-8">
<h1 class="text-2xl font-bold mb-4 text-center">Rt STT</h1>
<div class="flex justify-center items-center mb-4">
<label class="toggle flex items-center">
<input type="checkbox"
id="autosend" />
<span class="slider">Continious</span>
id="autosend"
class="mr-2">
<span class="slider"></span>
<span class="ml-2">Continuous</span>
</label>
<select id="input-devices">
<select id="input-devices"
class="ml-4">
<option value="default">Default</option>
</select>
<select id="language-select">
<option value="en">English</option>
<option value="bg">Български</option>
</select>
<label class="toggle flex items-center ml-4">
<input type="checkbox"
id="store-recordings"
class="mr-2">
<span class="slider"></span>
<span class="ml-2">Store Recordings</span>
</div>
<div class="flex justify-center items-center mb-4">
<span id="record-actions">
<button id="record-button"
disabled>Start Recording</button>
<span id="connection-status"></span>
<div id="status-recording"></div>
<p id="transcription"></p>
disabled
class="bg-blue-500 hover:bg-blue-700 text-white font-bold py-2 px-4 rounded mr-4">
Start Recording</button>
<button id="record-button-speakers"
disabled
class="bg-blue-500 hover:bg-blue-700 text-white font-bold py-2 px-4 rounded mr-4">
Stream from speakers</button>
</span>
</div>
<div class="flex justify-center items-center mb-4">
<div id="connection-status"
style="margin-right: 5px;"></div>
</div>
<div class="flex justify-center items-center mb-4">
<div id="info"></div>
</div>
<div id="status-recording"
class="flex justify-center items-center mb-4">
</div>
<div class="relative rounded-lg border border-gray-300 shadow-sm">
<textarea id="transcription"
class="block w-full h-48 p-4 resize-none"
placeholder="Whisper something into the microphone..."></textarea>
<button id="copyButton"
class="absolute top-0 right-0 px-4 py-2 text-sm font-medium text-gray-700 bg-gray-200 hover:bg-gray-300 rounded-bl-lg focus:outline-none"
onclick="copyToClipboard('transcription')">
Copy
</button>
</div>
<canvas id="canvas"
width="500"
height="500"></canvas>
class="w-full"></canvas>
<script>
let sessionId;
let selectedDeviceId = "default";
let socket;
let audioRecorder;
let audioStream;
let recording = false;
let recordButton;
let connected = false;
let connectionStatus; //HTML auto generated
let statusRecording; //HTML auto generated
let audioContext;
let audioSampleRate;
let serverTime;
let volumeChecker;
@@ -51,7 +93,12 @@
let isSpeaking = false;
let soundDetected = false;
let speakingCount = 0;
let SILENCE_DELAY_MS = 50; //was 100 with good results
let SILENCE_DELAY_MS = 50;
let preDetect_IncludedAudio = 400; //ms
let soundCount_Threshold = 10;
let silenceCount_Threshold = 10;
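//assuming volumeChecker ticks every SILENCE_DELAY_MS (50 ms), ten consecutive
//loud ticks (~500 ms) switch to "speaking" and ten quiet ticks switch back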
const volumeHistory = [];
let canvas = document.getElementById("canvas");
@@ -59,6 +106,26 @@
let barWidth = 10;
let barSpacing = 5;
// Handle language select change
document.getElementById('language-select').addEventListener('change', (event) => {
const language = event.target.value;
fetch('/settings', {
method: 'POST',
body: JSON.stringify({ language, sessionId }),
headers: { 'Content-Type': 'application/json' },
credentials: 'same-origin'
});
});
document.getElementById('store-recordings').addEventListener('change', (event) => {
const storeRecordings = event.target.checked;
fetch('/settings', {
method: 'POST',
body: JSON.stringify({ storeRecordings, sessionId }),
headers: { 'Content-Type': 'application/json' },
credentials: 'same-origin'
});
});
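// Both listeners above POST the same shape; a shared helper is one possible
// refactor (a sketch, not part of this commit):
function postSetting(patch) {
    //e.g. postSetting({ language: 'bg' }) or postSetting({ storeRecordings: true })
    return fetch('/settings', {
        method: 'POST',
        body: JSON.stringify(Object.assign({ sessionId: sessionId }, patch)),
        headers: { 'Content-Type': 'application/json' },
        credentials: 'same-origin'
    });
}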
// Draw sliding bar graph
function drawSlidingBarGraph(lastVolumes) {
canvasCtx.clearRect(0, 0, canvas.width, canvas.height);
@@ -105,7 +172,7 @@
audioRecorder.start();
}
speakingCount++;
if (speakingCount > 7) {
if (speakingCount > soundCount_Threshold) {
statusRecording.innerHTML = "Listening...";
statusRecording.style.color = "green";
isSpeaking = true;
@@ -115,7 +182,7 @@
speakingCount = 0;
if (isSpeaking) {
silenceCount++;
if (silenceCount > 3) {
if (silenceCount > silenceCount_Threshold) {
if (autosend.checked) {
console.log("Was speakng and is now silence. (" + averageVolume + " averageVolume). Sending audio to server.");
soundDetected = true;
@@ -162,7 +229,6 @@
socket.onopen = () => {
console.log("WebSocket connection opened.");
connectionStatus.innerHTML = "Connected to " + wsurl;
transcription.innerHTML = "Whisper something into the microphone...";
recordButton.disabled = false;
connected = true;
};
@@ -184,12 +250,38 @@
};
function onmessage(event) {
//check if the message is json
try {
let json = JSON.parse(event.data);
//store the session id sent by the server
if (json.hasOwnProperty("sessionId")) {
sessionId = json.sessionId;
console.log("Got session id: " + sessionId);
}
if (json.hasOwnProperty("language")) {
languageSelect.value = json.language;
}
//storerecordings checkbox
if (json.hasOwnProperty("storeRecordings")) {
storeRecordings.checked = json.storeRecordings;
}
return;
} catch (e) {
//not json
}
let latency = Date.now() - serverTime;
console.log("Received message from server: " + event.data + " (latency: " + latency + "ms)");
info.innerHTML = "latency: " + latency + "ms";
if (autosend.checked) {
//append to the text on new line
transcription.innerHTML += "<br>>" + event.data;
statusRecording.innerHTML = "waiting...";
//data is in format (count)text. split it
const arr = event.data.split(/[(\)]/); // split the string at "(" or ")"
let queue = arr[1];
let text = arr[2].trim();
info.innerHTML = "latency: " + latency + "ms; server queue: " + queue + " requests";
transcription.value += text + " ";
statusRecording.innerHTML = "listening...";
statusRecording.style.color = "black";
} else {
//replace the text
@@ -197,48 +289,56 @@
}
}
const preBufferDuration = 500; // duration of pre-buffer in ms
function startListening() {
// Initialize canvas
canvasCtx.fillStyle = "green";
recording = true;
navigator.mediaDevices.getUserMedia({ audio: { sampleRate: 16000, echoCancellation: true } }).then((stream) => {
audioRecorder = new MediaRecorder(stream);
audioRecorder.start();
navigator.mediaDevices.getUserMedia({ audio: { sampleRate: 16000 } })
.then((stream) => {
audioStream = stream;
console.log("Started listening to microphone (sample rate: " + stream.getAudioTracks()[0].getSettings().sampleRate + " Hz, echoCancellation: " + stream.getAudioTracks()[0].getSettings().echoCancellation + ", " +audioRecorder.mimeType.audioChannels + " channels)");
//const preBufferLength = Math.ceil(preBufferDuration * audioRecorder.mimeType.audioSampleRate * audioRecorder.mimeType.audioChannels);
const preBufferLength = Math.ceil((preBufferDuration / 1000) * 48000 * 2); //samples for 500 ms at an assumed 48 kHz x 2 channels (duration is in ms)
const audioContext = new AudioContext();
const sourceNode = audioContext.createMediaStreamSource(audioStream);
const audioSampleRate = sourceNode.context.sampleRate;
console.log("Started listening to microphone (sample rate: " + audioSampleRate + " Hz, echoCancellation: " + stream.getAudioTracks()[0].getSettings().echoCancellation + ", " + sourceNode.channelCount + " channels)");
info.innerHTML = "Sample rate: " + audioSampleRate + " Hz";
var preBuffer = [];
//down-mix to mono: route only channel 0 of the split input into a single-channel merger
const channelSplitter = audioContext.createChannelSplitter(2);
const channelMerger = audioContext.createChannelMerger(1);
sourceNode.connect(channelSplitter);
channelSplitter.connect(channelMerger, 0, 0);
const outputNode = channelMerger;
//const singleChannelStream = new MediaStream();
//singleChannelStream.addTrack(outputNode.stream.getAudioTracks()[0]);
const mediaStreamDestination = audioContext.createMediaStreamDestination();
outputNode.connect(mediaStreamDestination);
const singleChannelStream = mediaStreamDestination.stream;
audioRecorder = new MediaRecorder(singleChannelStream);
audioRecorder.start();
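// Graph built above for this commit's mono-audio change: microphone source
// -> 2-way channel splitter -> single-channel merger fed only channel 0 ->
// MediaStreamDestination, so the MediaRecorder captures a mono stream.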
audioRecorder.addEventListener("dataavailable", (event) => {
console.log("Audio data available: " + event.data.size + " bytes");
console.log("Audio data available: " + (event.data.size / 1024).toFixed(2) + " KB in " + event.data.type + " format. (" + audioContext.sampleRate + " Hz)");
if (!soundDetected && autosend.checked) {
console.log("discarding audio data because not speaking");
//reset the pre-buffer and keep only the most recent chunk of audio
preBuffer = [];
const audioData = event.data;
//const start = audioBlob.size - preBufferLength; // calculate offset to trim pre-buffer
//const audioBlob = new Blob([...preBuffer, audioData], { type: "audio/wav" });
//const end = audioBlob.size;
//const slicedAudio = audioBlob.slice(start, end);
const audioBlob = new Blob([...preBuffer, audioData], { type: "audio/ogg; codecs=opus" });
const start = audioBlob.size - 500 * 48 * 2 / 8; // Assuming 48 kHz sampling rate and 16-bit PCM audio
const end = audioBlob.size;
const slicedAudio = audioBlob.slice(start, end);
preBuffer = slicedAudio;
//sendAudioToServer(event.data);
preBuffer.push(event.data);
return;
}
if (event.data.size > 0) {
let data = event.data;
if (preBuffer.length > 0) {
const audioBlob = new Blob([...preBuffer, audioData], { type: "audio/ogg; codecs=opus" });
const newEventChunk = new Blob([audioBlob, preBuffer], { type: "audio/ogg; codecs=opus" });
data = newEventChunk;
// const audioBlob = new Blob([...preBuffer, audioData], { type: "audio/ogg; codecs=opus" });
// const newEventChunk = new Blob([audioBlob, preBuffer], { type: "audio/ogg; codecs=opus" });
/// data = newEventChunk;
sendAudioToServerPost(preBuffer);
}
sendAudioToServer(data);
soundDetected = false;
@@ -249,7 +349,10 @@
});
recordButton.innerHTML = "Stop Recording";
recordButton.classList.add("recording");
recordButton.classList.toggle('bg-red-500');
recordButton.classList.toggle('bg-blue-500');
recordButton.classList.toggle('hover:bg-blue-700');
}
function getlast500ms(audioData, preBuffer) {
@@ -263,18 +366,40 @@
const newEvent = new Blob([remainingAudio], { type: "audio/ogg; codecs=opus" });
// Replace the original event data with the new event data
const newEventChunk = new Blob([newEvent, slicedAudio], { type: "audio/ogg; codecs=opus" });
return newEventChunk;
//const newEventChunk = new Blob([newEvent, slicedAudio], { type: "audio/ogg; codecs=opus" });
const newEventChunk = new Blob([slicedAudio], { type: "audio/ogg; codecs=opus" });
return audioData;
}
function stopListening() {
recording = false;
audioRecorder.stop();
recordButton.innerHTML = "Start Recording";
recordButton.classList.remove("recording");
//recordButton.classList.remove("recording");
recordButton.classList.toggle('bg-blue-500');
recordButton.classList.toggle('bg-red-500');
recordButton.classList.toggle('hover:bg-blue-700');
clearInterval(volumeChecker);
//stop using microphone
if (audioStream) {
// stop all tracks in the stream
audioStream.getTracks().forEach(track => track.stop());
// set the stream variable to null or undefined to indicate it's no longer in use
audioStream = null;
}
}
function sendAudioToServerPost(data) {
const blob = new Blob(data, { type: "audio/ogg; codecs=opus" });
var formData = new FormData();
formData.append('file', blob); //append the assembled Blob; the raw chunk array would be stringified
fetch('/upload', {
method: 'POST',
body: formData
});
}
function sendAudioToServer(data) {
if (connected) {
//const blob = new Blob(data, { type: 'audio/webm' });
@@ -356,10 +481,22 @@
//transcription = document.getElementById("transcription");
//autosend = document.getElementById("autosend");
statusRecording = document.getElementById("status-recording");
languageSelect = document.getElementById("language-select");
inputDevices = document.getElementById("input-devices");
storeRecordings = document.getElementById("store-recordings");
enumerateDevices();
connect(socket);
};
function copyToClipboard(id) {
var textarea = document.getElementById(id);
textarea.select();
document.execCommand('copy');
}
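// document.execCommand('copy') is deprecated; a minimal alternative using
// the async Clipboard API (assumes a secure context: HTTPS or localhost):
function copyToClipboardAsync(id) {
    const el = document.getElementById(id);
    return navigator.clipboard.writeText(el.value); //resolves once copied
}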
</script>
<script src="https://cdn.webrtc-experiment.com/MediaStreamRecorder.js"></script>
</body>

View File

@@ -13,18 +13,49 @@ console.log(process.env)
console.log(process.env.TTS_BACKEND_URL)
console.log(process.env.WS_URL)
let language = "en";
let storeRecordings = false;
let queueCounter = 0;
const storage = require('node-persist');
storage.init().then(() => {
storage.getItem('language').then((value) => {
if (value != undefined) { language = value; console.log('language: ' + language); }
else { storage.setItem('language', language).then(() => { console.log('language set to ' + language + " (default)"); }); }
});
storage.getItem('storeRecordings').then((value) => {
if (value != undefined) { storeRecordings = value; console.log('storeRecordings: ' + storeRecordings); }
else { storage.setItem('storeRecordings', storeRecordings).then(() => { console.log('storeRecordings set to ' + storeRecordings + " (default)"); }); }
});
});
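//node-persist keeps each key as a small JSON file of the form
//{"key":"language","value":"bg"} (by default under .node-persist/storage)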
//we use https://hub.docker.com/r/onerahmet/openai-whisper-asr-webservice to transcribe the audio
//docker run -p 9009:9009 -d onerahmet/openai-whisper-asr-webservice
wss.on('connection', (ws) => {
console.log('Client ' + ws._socket.remoteAddress + ' connected');
const sessions = new Map(); // Store session data
wss.on('connection', (ws, req) => {
ws.sessionId = Math.random().toString(36).slice(2);
sessions.set(ws.sessionId, { language: 'en' });
console.log('Client ' + ws._socket.remoteAddress + ' connected with session id ' + ws.sessionId);
//send cookie to client
ws.send(JSON.stringify({ sessionId: ws.sessionId, language: language, storeRecordings: storeRecordings }));
ws.on('message', (data) => {
let webSocket = ws;
const sessionData = sessions.get(webSocket.sessionId);
if (!sessionData) {
console.log('No session data found for session id ' + webSocket.sessionId);
}
let language = sessionData?.language || 'en';
//show the size of the audio data as 0.000 MB
console.log('Received data from client: ' + (data.length / 1024 / 1024).toFixed(3) + ' MB');
console.log('(queue ' + queueCounter + ') Received ' + (data.length / 1024 / 1024).toFixed(3) + ' MB audio from client. Current language: ' + language);
var request = require('request');
var formData = {
task: 'transcribe',
language: 'en-US', //bg-BG|en-US
language: sessionData.language,
output: 'json',
audio_file: {
value: data,
@@ -35,30 +66,35 @@ wss.on('connection', (ws) => {
}
};
storeRecordings = sessionData?.storeRecordings || storeRecordings;
if (storeRecordings) {
//"yyyymmdd-hhMMss"
var timestampfilename = Date.now("yyyymmdd-hhMMss");
//save the audio data to a file to /rec subfolder
var fs = require('fs');
fs.mkdir('rec', { recursive: true }, (err) => {
if (err) throw err;
});
fs.writeFile('rec/audio' + timestampfilename + '.ogg', data, function (err) {
if (err) {
return console.log(err);
}
console.log('Audio data saved to rec/audio' + timestampfilename + '.ogg');
});
}
//record start time
var start = new Date().getTime();
queueCounter++;
request.post({ url: process.env.TTS_BACKEND_URL, formData: formData }, function optionalCallback(err, httpResponse, body) {
queueCounter--;
if (err) {
return console.error('upload failed:', err);
}
console.log('Whisper decoded:', body);
ws.send(body);
//duration of the transcription, logged as 0.00s
var duration = new Date().getTime() - start;
//console.log('decoded (' + duration + 'ms):', body);
console.log('decoded (' + (duration / 1000).toFixed(2) + 's):', body);
webSocket.send("(" + queueCounter + ") " + body);
});
});
});
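//wire format sent to the client: "(<queue length>) <transcript>"; client.html
//splits it on the parentheses in its onmessage handler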
@@ -72,7 +108,11 @@ function transcribeAudio(audioData) {
// --- web server that serves client.html
const express = require('express');
const bodyParser = require('body-parser');
const app = express();
app.use(bodyParser.json());
// app.use(bodyParser.urlencoded({ extended: false })); // Parse request body as URL-encoded
const path = require('path');
app.get('/', (req, res) => {
@@ -84,6 +124,56 @@ app.get('/', (req, res) => {
app.get('/wsurl', (req, res) => {
res.type('text/plain').send(process.env.WS_URL);
});
//GET used to store default settings for all clients
app.get('/settings', (req, res) => {
if (req.query.language != undefined) {
language = req.query.language;
storage.setItem('language', language).then(() => { console.log('language set to ' + language); });
}
if (req.query.storeRecordings != undefined) {
storeRecordings = req.query.storeRecordings;
storage.setItem('storeRecordings', storeRecordings).then(() => { console.log('storeRecordings set to ' + storeRecordings); });
}
//send back the current settings as json
res.json({ language: language, storeRecordings: storeRecordings });
});
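//usage sketch (assuming the server is reached on its own port 8080):
//  curl 'http://localhost:8080/settings?language=bg&storeRecordings=true'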
//POST used to store settings for a specific client
app.post('/settings', (req, res) => {
//get the language from the json body ( { language: language, sessionId: sessionId })
const body = req.body;
const sid = body.sessionId;
const sessionData = sessions.get(sid);
if (!sessionData) { return res.status(404).send('Unknown sessionId'); } //guard against stale or missing ids
if (body.language != undefined) {
sessionData.language = body.language;
console.log(`Session ${sid}: language set to ${sessionData.language}`);
}
if(body.storeRecordings != undefined) {
sessionData.storeRecordings = body.storeRecordings;
console.log(`Session ${sid}: storeRecordings set to ${sessionData.storeRecordings}`);
}
res.send('OK');
});
//save the audio file
app.post('/upload', (req, res) => {
try {
//save the audio file
var timestampfilename = Date.now(); //epoch milliseconds; Date.now() ignores arguments
var fs = require('fs');
fs.mkdir('rec', { recursive: true }, (err) => {
if (err) throw err;
});
var file = fs.createWriteStream('rec/audio_slice_' + timestampfilename + '.ogg');
req.pipe(file);
res.send('OK');
} catch (err) {
console.log(err);
res.status(500).send('ERROR');
}
});
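//note: the client posts multipart/form-data, and req.pipe(file) writes the raw
//request body, so the saved .ogg also contains the multipart boundary and part
//headers; extracting clean audio would need a multipart parser such as multer
//(an assumption; this commit does not use one)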
app.listen(8080, () => {
console.log('Server listening on port 8080');
});