Merge branch 'master' of https://git.d-popov.com/popov/ai-kevin

commit 0c5b59ef69

agent-mAId/main.py (new file)
@@ -0,0 +1,98 @@
# Goal: an app that runs in the background at startup, listens to the
# microphone while a mouse or other (configurable) button is held, sends the
# audio to Whisper, and enters (emulates keypresses of) the transcribed text
# on release.
# https://console.groq.com/docs/speech-text

# Initial pseudocode sketch:
#
# import system_hooks
# import audio_recorder
# import whisper_api
# import keyboard_emulator
#
# def on_button_press():
#     audio_recorder.start_recording()
#
# def on_button_release():
#     audio = audio_recorder.stop_recording()
#     text = whisper_api.transcribe(audio)
#     keyboard_emulator.type_text(text)
#
# def main():
#     system_hooks.set_startup_run()
#     system_hooks.register_button_callback(on_button_press, on_button_release)
#     system_hooks.run_event_loop()
#
# if __name__ == "__main__":
#     main()

import os
import wave

import keyboard
import mouse  # needed below: record_audio() also checks mouse.is_pressed()
import pyaudio
import pyautogui
from groq import Groq

# Constants
API_KEY = "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"  # replace with your actual API key
BUTTON = 'ctrl'  # The button to listen for

# Initialize the Groq client
client = Groq(api_key=API_KEY)
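# A safer pattern (an assumption, not part of the original commit) is to load
# the key from the environment instead of hardcoding it, which also puts the
# unused `import os` above to work; "GROQ_API_KEY" is an assumed variable name:
#
#     API_KEY = os.environ["GROQ_API_KEY"]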
def record_audio(filename):
    # Set up a mono, 44.1 kHz, 16-bit input stream
    audio = pyaudio.PyAudio()
    stream = audio.open(format=pyaudio.paInt16, channels=1, rate=44100,
                        input=True, frames_per_buffer=1024)

    frames = []
    print("Recording...")

    # Record while the hotkey or the left mouse button is held down
    while keyboard.is_pressed(BUTTON) or mouse.is_pressed(button='left'):
        data = stream.read(1024)
        frames.append(data)

    print("Recording stopped.")
    stream.stop_stream()
    stream.close()
    audio.terminate()

    # Save the recorded audio as a WAV file
    wave_file = wave.open(filename, 'wb')
    wave_file.setnchannels(1)
    wave_file.setsampwidth(audio.get_sample_size(pyaudio.paInt16))
    wave_file.setframerate(44100)
    wave_file.writeframes(b''.join(frames))
    wave_file.close()
def transcribe_audio(filename):
    # Open the audio file and send it to Groq's Whisper endpoint
    with open(filename, "rb") as file:
        transcription = client.audio.transcriptions.create(
            file=(filename, file.read()),          # Required audio file
            model="distil-whisper-large-v3-en",    # Required model to use for transcription
            prompt="Specify context or spelling",  # Optional
            response_format="json",                # Optional
            temperature=0.0,                       # Optional
        )
    return transcription.text  # the SDK returns an object, not a dict
def simulate_keypress(text):
    # Type the transcription one character at a time, then press Enter
    for char in text:
        pyautogui.typewrite(char)
    pyautogui.press('enter')
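# Hypothetical simplification (not in the original commit): pyautogui.write()
# accepts a whole string, so the per-character loop above could become:
#
#     pyautogui.write(text, interval=0.01)
#     pyautogui.press('enter')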
def main():
    filename = "output.wav"

    print("Press and hold the button to record...")
    keyboard.wait(BUTTON)  # Wait for button press
    record_audio(filename)

    print("Transcribing audio...")
    transcribed_text = transcribe_audio(filename)

    print("Entering text...")
    simulate_keypress(transcribed_text)


if __name__ == "__main__":
    main()
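Note that the pseudocode at the top of the file describes a persistent background app, while main() performs a single record/transcribe/type cycle and exits. A minimal sketch of a continuous loop built from the same functions (an assumption about the intended behavior, not part of this commit):

    def run_forever():
        # Hypothetical: repeat the press-to-talk cycle until Esc is pressed.
        while True:
            print(f"Hold '{BUTTON}' to dictate; press Esc to quit.")
            keyboard.wait(BUTTON)          # block until the hotkey goes down
            record_audio("output.wav")     # records while the button is held
            simulate_keypress(transcribe_audio("output.wav"))
            if keyboard.is_pressed('esc'):
                break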
vision/notes.md (new file)
@@ -0,0 +1,25 @@
Visual options:

-- OD:
- object detection w/ fine-tuning: YOLOv5: https://learnopencv.com/custom-object-detection-training-using-yolov5/
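For a quick smoke test before custom training, YOLOv5 can be pulled straight from torch.hub (a minimal sketch; 'image.jpg' is a placeholder path):

    import torch

    # Downloads the pretrained small model on first run
    model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)
    results = model('image.jpg')
    results.print()  # class / box / confidence summary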

-- V-aware:
- visual LLM: LLaVA: https://llava.hliu.cc/
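Besides the hosted demo, the upstream LLaVA repo also ships a simple CLI for local inference (sketched from memory of its README; verify the flags against the repo):

    python -m llava.serve.cli \
        --model-path liuhaotian/llava-v1.5-7b \
        --image-file "https://llava-vl.github.io/static/images/view.jpg"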

-- BOTH detection and comprehension:
- Phi-3 Vision
  https://huggingface.co/microsoft/Phi-3-vision-128k-instruct
  https://github.com/microsoft/Phi-3CookBook
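A minimal transformers sketch for Phi-3 Vision, following the pattern on the model card (the prompt tags and processor call are taken from that card; verify before use, and "frame.png" is a placeholder):

    from transformers import AutoModelForCausalLM, AutoProcessor
    from PIL import Image

    model_id = "microsoft/Phi-3-vision-128k-instruct"
    model = AutoModelForCausalLM.from_pretrained(
        model_id, trust_remote_code=True, torch_dtype="auto", device_map="auto")
    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

    image = Image.open("frame.png")
    prompt = "<|user|>\n<|image_1|>\nDescribe the objects in this frame.<|end|>\n<|assistant|>\n"
    inputs = processor(prompt, [image], return_tensors="pt").to(model.device)
    out = model.generate(**inputs, max_new_tokens=100)
    print(processor.decode(out[0][inputs["input_ids"].shape[1]:],
                           skip_special_tokens=True))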

- LLaVA chat
  https://github.com/LLaVA-VL/LLaVA-Interactive-Demo?tab=readme-ov-file

  git clone https://github.com/LLaVA-VL/LLaVA-Interactive-Demo.git
  conda create -n llava_int -c conda-forge -c pytorch python=3.10.8 pytorch=2.0.1 -y
  conda activate llava_int
  cd LLaVA-Interactive-Demo
  pip install -r requirements.txt
  source setup.sh

- decision making based on ENV, RL: https://github.com/OpenGenerativeAI/llm-colosseum