From 80ff1832cd3f847bbf9e0bc060a251b949e88704 Mon Sep 17 00:00:00 2001
From: Dobromir Popov
Date: Tue, 10 Sep 2024 02:31:14 +0300
Subject: [PATCH 1/3] win py agent maid - transcription service

---
 agent-mAId/main.py | 93 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 93 insertions(+)
 create mode 100644 agent-mAId/main.py

diff --git a/agent-mAId/main.py b/agent-mAId/main.py
new file mode 100644
index 0000000..707f8b6
--- /dev/null
+++ b/agent-mAId/main.py
@@ -0,0 +1,93 @@
+# let's create a new application which will run in the background at startup, will listen to the microphone while mouse or other (configurable) button is pressed and will send the audio to Whisper and will enter (emulate keypress) of the transcribed text result on key release
+# https://console.groq.com/docs/speech-text
+
+# import system_hooks
+# import audio_recorder
+# import whisper_api
+# import keyboard_emulator
+
+# def on_button_press():
+#     audio_recorder.start_recording()
+
+# def on_button_release():
+#     audio = audio_recorder.stop_recording()
+#     text = whisper_api.transcribe(audio)
+#     keyboard_emulator.type_text(text)
+
+# def main():
+#     system_hooks.set_startup_run()
+#     system_hooks.register_button_callback(on_button_press, on_button_release)
+#     system_hooks.run_event_loop()
+
+# if __name__ == "__main__":
+#     main()
+
+
+import pyaudio
+import wave
+import pyautogui
+import requests
+import keyboard
+
+# Constants
+API_URL = "https://api.openai.com/v1/whisper"
+API_KEY = "your_openai_api_key"
+BUTTON = 'ctrl'  # The button to listen for
+
+def record_audio(filename):
+    # Setup audio recording
+    audio = pyaudio.PyAudio()
+    stream = audio.open(format=pyaudio.paInt16, channels=1, rate=44100, input=True, frames_per_buffer=1024)
+
+    frames = []
+    print("Recording...")
+
+    # Record while button is pressed
+    while keyboard.is_pressed(BUTTON):
+        data = stream.read(1024)
+        frames.append(data)
+
+    print("Recording stopped.")
+    stream.stop_stream()
+    stream.close()
+    audio.terminate()
+
+    # Save the recorded audio
+    wave_file = wave.open(filename, 'wb')
+    wave_file.setnchannels(1)
+    wave_file.setsampwidth(audio.get_sample_size(pyaudio.paInt16))
+    wave_file.setframerate(44100)
+    wave_file.writeframes(b''.join(frames))
+    wave_file.close()
+
+def transcribe_audio(filename):
+    # Transcribe audio using Whisper API
+    with open(filename, 'rb') as audio_file:
+        response = requests.post(
+            API_URL,
+            headers={"Authorization": f"Bearer {API_KEY}"},
+            files={"file": audio_file}
+        )
+    return response.json().get('text', '')
+
+def simulate_keypress(text):
+    # Simulate keypress for each character in text
+    for char in text:
+        pyautogui.typewrite(char)
+    pyautogui.press('enter')
+
+def main():
+    filename = "output.wav"
+
+    print("Press and hold the button to record...")
+    keyboard.wait(BUTTON)  # Wait for button press
+    record_audio(filename)
+
+    print("Transcribing audio...")
+    transcribed_text = transcribe_audio(filename)
+
+    print("Entering text...")
+    simulate_keypress(transcribed_text)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file

From cb4e222bab1f8beffdc794bc857a5fb7e87c0f1f Mon Sep 17 00:00:00 2001
From: Dobromir Popov
Date: Tue, 10 Sep 2024 02:36:40 +0300
Subject: [PATCH 2/3] edits to transcribe agent

---
 agent-mAId/main.py | 31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/agent-mAId/main.py b/agent-mAId/main.py
index 707f8b6..96dd788 100644
--- a/agent-mAId/main.py
+++ b/agent-mAId/main.py
@@ -22,18 +22,20 @@
 # if __name__ == "__main__":
 #     main()
 
-
+import os
+from groq import Groq
 import pyaudio
 import wave
 import pyautogui
-import requests
 import keyboard
 
 # Constants
-API_URL = "https://api.openai.com/v1/whisper"
-API_KEY = "your_openai_api_key"
+API_KEY = "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"  # Make sure to use your actual API key
 BUTTON = 'ctrl'  # The button to listen for
 
+# Initialize the Groq client
+client = Groq(api_key=API_KEY)
+
 def record_audio(filename):
     # Setup audio recording
     audio = pyaudio.PyAudio()
@@ -42,8 +44,8 @@ def record_audio(filename):
     frames = []
     print("Recording...")
 
-    # Record while button is pressed
-    while keyboard.is_pressed(BUTTON):
+    # Record while button or mouse is pressed
+    while keyboard.is_pressed(BUTTON) or mouse.is_pressed(button='left'):
         data = stream.read(1024)
         frames.append(data)
 
@@ -61,14 +63,17 @@ def record_audio(filename):
     wave_file.close()
 
 def transcribe_audio(filename):
-    # Transcribe audio using Whisper API
-    with open(filename, 'rb') as audio_file:
-        response = requests.post(
-            API_URL,
-            headers={"Authorization": f"Bearer {API_KEY}"},
-            files={"file": audio_file}
+    # Open the audio file
+    with open(filename, "rb") as file:
+        # Create a transcription of the audio file
+        transcription = client.audio.transcriptions.create(
+            file=(filename, file.read()),  # Required audio file
+            model="distil-whisper-large-v3-en",  # Required model to use for transcription
+            prompt="Specify context or spelling",  # Optional
+            response_format="json",  # Optional
+            temperature=0.0  # Optional
         )
-    return response.json().get('text', '')
+    return transcription['text']
 
 def simulate_keypress(text):
     # Simulate keypress for each character in text

From 736ef27852ad7d4d748ff80c708aeb39099212ad Mon Sep 17 00:00:00 2001
From: Dobromir Popov
Date: Tue, 10 Sep 2024 02:37:01 +0300
Subject: [PATCH 3/3] MISC

---
 vision/notes.md | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)
 create mode 100644 vision/notes.md

diff --git a/vision/notes.md b/vision/notes.md
new file mode 100644
index 0000000..4d311c5
--- /dev/null
+++ b/vision/notes.md
@@ -0,0 +1,25 @@
+Visual options:
+-- OD:
+ - object detection w/ fine-tuning: YOLOv5: https://learnopencv.com/custom-object-detection-training-using-yolov5/
+
+-- V-aware:
+ - visual LLM: LLaVA: https://llava.hliu.cc/
+
+-- BOTH detection and comprehension:
+ - Phi
+   https://huggingface.co/microsoft/Phi-3-vision-128k-instruct
+   https://github.com/microsoft/Phi-3CookBook
+
+- LLaVA chat
+https://github.com/LLaVA-VL/LLaVA-Interactive-Demo?tab=readme-ov-file
+git clone https://github.com/LLaVA-VL/LLaVA-Interactive-Demo.git
+conda create -n llava_int -c conda-forge -c pytorch python=3.10.8 pytorch=2.0.1 -y
+conda activate llava_int
+cd LLaVA-Interactive-Demo
+pip install -r requirements.txt
+source setup.sh
+
+
+
+
+- decision making based on ENV, RL: https://github.com/OpenGenerativeAI/llm-colosseum
\ No newline at end of file
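
A minimal end-to-end sketch of the push-to-talk transcription flow that PATCH 2/3 moves toward, kept separate from the patches themselves. It assumes the third-party groq, pyaudio, keyboard, mouse and pyautogui packages are installed; the hotkey and model name mirror the patch, the API key is a placeholder, and both the explicit `import mouse` (which the patch omits) and the attribute-style `transcription.text` access (where the patch indexes `transcription['text']`) are assumptions about the Groq SDK rather than something the patches confirm.

# sketch_transcribe_agent.py -- illustrative sketch only, not part of the patches above
import io
import wave

import keyboard   # hotkey state: keyboard.is_pressed / keyboard.wait
import mouse      # assumed third-party package; PATCH 2/3 calls mouse.is_pressed without importing it
import pyaudio
import pyautogui
from groq import Groq

API_KEY = "gsk_your_key_here"  # placeholder; keep real keys out of source control
BUTTON = 'ctrl'
MODEL = "distil-whisper-large-v3-en"

client = Groq(api_key=API_KEY)

def record_while_held(rate=44100, chunk=1024):
    # Capture mono 16-bit audio for as long as the hotkey or the left mouse button is held.
    pa = pyaudio.PyAudio()
    stream = pa.open(format=pyaudio.paInt16, channels=1, rate=rate,
                     input=True, frames_per_buffer=chunk)
    frames = []
    while keyboard.is_pressed(BUTTON) or mouse.is_pressed(button='left'):
        frames.append(stream.read(chunk))
    stream.stop_stream()
    stream.close()

    # Wrap the raw frames in an in-memory WAV container so nothing is written to disk.
    buf = io.BytesIO()
    with wave.open(buf, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(pa.get_sample_size(pyaudio.paInt16))
        wf.setframerate(rate)
        wf.writeframes(b''.join(frames))
    pa.terminate()
    buf.seek(0)
    return buf

def transcribe(wav_buf):
    # Send the WAV bytes to Groq's transcription endpoint and return the recognized text.
    transcription = client.audio.transcriptions.create(
        file=("speech.wav", wav_buf.read()),
        model=MODEL,
        response_format="json",
        temperature=0.0,
    )
    # Attribute access is an assumption about the SDK's response object;
    # the patch indexes it as transcription['text'] instead.
    return transcription.text

def main():
    print("Hold the hotkey to record...")
    keyboard.wait(BUTTON)          # block until the hotkey goes down
    audio = record_while_held()
    text = transcribe(audio)
    pyautogui.typewrite(text)      # emulate typing the transcribed text into the focused window
    pyautogui.press('enter')

if __name__ == "__main__":
    main()

Keeping the audio in a BytesIO buffer avoids the temporary output.wav file the patch writes; the on-disk approach from the patch works just as well if the file is cleaned up afterwards.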