# gogo2/agent-mAId/main.py

# Goal: a background application that starts with the system, records from the microphone while a mouse button or another (configurable) button is held, sends the audio to Whisper, and types the transcribed text (via emulated keypresses) when the button is released.
# https://console.groq.com/docs/speech-text

# import system_hooks
# import audio_recorder
# import whisper_api
# import keyboard_emulator

# def on_button_press():
#     audio_recorder.start_recording()

# def on_button_release():
#     audio = audio_recorder.stop_recording()
#     text = whisper_api.transcribe(audio)
#     keyboard_emulator.type_text(text)

# def main():
#     system_hooks.set_startup_run()
#     system_hooks.register_button_callback(on_button_press, on_button_release)
#     system_hooks.run_event_loop()

# if __name__ == "__main__":
#     main()


import os
import wave

import keyboard
import pyaudio
import pyautogui
import requests

# Constants
# OpenAI speech-to-text (Whisper) transcription endpoint.
API_URL = "https://api.openai.com/v1/audio/transcriptions"
# Prefer the OPENAI_API_KEY environment variable so the secret stays out of
# source control; the placeholder remains as a backward-compatible fallback.
API_KEY = os.environ.get("OPENAI_API_KEY", "your_openai_api_key")
BUTTON = 'ctrl'  # The button to listen for

def record_audio(filename, *, rate=44100, channels=1, chunk=1024):
    """Record audio while ``BUTTON`` is held and save it as a WAV file.

    A PyAudio input stream is opened (no device index is passed, so PyAudio
    chooses the input device) and 16-bit PCM blocks are read for as long as
    ``keyboard.is_pressed(BUTTON)`` is true. The captured frames are then
    written to ``filename``.

    Args:
        filename: Path of the WAV file to create.
        rate: Sampling rate in Hz.
        channels: Number of input channels.
        chunk: Number of frames read from the stream per iteration.
    """
    sample_format = pyaudio.paInt16
    frames = []

    audio = pyaudio.PyAudio()
    try:
        # Query the sample width before PortAudio is shut down.
        sample_width = audio.get_sample_size(sample_format)
        stream = audio.open(
            format=sample_format,
            channels=channels,
            rate=rate,
            input=True,
            frames_per_buffer=chunk,
        )
        try:
            print("Recording...")

            # Record while button is pressed
            while keyboard.is_pressed(BUTTON):
                frames.append(stream.read(chunk))

            print("Recording stopped.")
        finally:
            # Release the stream even if a read fails mid-recording.
            stream.stop_stream()
            stream.close()
    finally:
        audio.terminate()

    # Save the recorded audio; the context manager closes the file on error too.
    with wave.open(filename, 'wb') as wave_file:
        wave_file.setnchannels(channels)
        wave_file.setsampwidth(sample_width)
        wave_file.setframerate(rate)
        wave_file.writeframes(b''.join(frames))

def transcribe_audio(filename, *, model="whisper-1", timeout=60):
    """Upload an audio file to the transcription endpoint and return its text.

    The file is sent as multipart form data to ``API_URL`` with a bearer
    token built from ``API_KEY``.

    Args:
        filename: Path of the audio file to upload.
        model: Model name sent in the ``model`` form field.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``text`` field of the JSON response, or ``''`` if it is absent.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    with open(filename, 'rb') as audio_file:
        response = requests.post(
            API_URL,
            headers={"Authorization": f"Bearer {API_KEY}"},
            # The transcription API requires a model name alongside the file.
            data={"model": model},
            files={"file": audio_file},
            # Without a timeout, requests may block indefinitely.
            timeout=timeout,
        )
    # Surface HTTP errors (bad key, wrong URL, ...) instead of returning ''.
    response.raise_for_status()
    return response.json().get('text', '')

def simulate_keypress(text):
    """Type ``text`` through emulated keystrokes, then press Enter.

    Args:
        text: The string to type, one keystroke per character.
    """
    # A single typewrite() call types every character of the string; calling
    # it once per character would add PyAutoGUI's global PAUSE delay after
    # each keystroke and make long transcriptions very slow to enter.
    # NOTE(review): PyAutoGUI appears to type only characters it has key
    # mappings for, so non-ASCII text may be skipped - confirm.
    pyautogui.typewrite(text)
    pyautogui.press('enter')

def main():
    """Run one record -> transcribe -> type cycle.

    Blocks until ``BUTTON`` is pressed, records to ``output.wav`` via
    ``record_audio``, transcribes that file, and types the resulting text.
    """
    wav_path = "output.wav"

    # Block here until the configured button goes down.
    print("Press and hold the button to record...")
    keyboard.wait(BUTTON)
    record_audio(wav_path)

    # Send the recording off and collect the recognised text.
    print("Transcribing audio...")
    text = transcribe_audio(wav_path)

    # Replay the text as keystrokes.
    print("Entering text...")
    simulate_keypress(text)

# Run the record/transcribe/type workflow only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()