chat: record button constantly listens

rmackay9 · rmackay9 · commit b57b4a99295c · 2023-12-21T20:21:35.000+09:00
diff --git a/MAVProxy/modules/mavproxy_chat/chat_voice_to_text.py b/MAVProxy/modules/mavproxy_chat/chat_voice_to_text.py
@@ -1,9 +1,13 @@
 '''
 AI Chat Module voice-to-text class
 Randy Mackay, December 2023
+
+Audio threshold algorithm courtesy of Primusa on StackOverflow: https://stackoverflow.com/questions/18406570/python-record-audio-on-detected-sound
 '''
 
 import time
+import math
+import struct
 
 try:
     import pyaudio  # install using, "sudo apt-get install python3-pyaudio"
@@ -19,6 +23,15 @@ def __init__(self):
         self.client = None
         self.assistant = None
 
+        # initialise audio recording
+        self.p = pyaudio.PyAudio()
+
+        # flag to enable/disable listening and recording
+        self.listening_and_recording_enabled = False
+
+        # audio stream is opened during listening and closed at the end of recording
+        self.stream = None
+
     # set the OpenAI API key
     def set_api_key(self, api_key_str):
         self.client = OpenAI(api_key = api_key_str)
@@ -37,34 +50,84 @@ def check_connection(self):
         # return True if connected
         return self.client is not None
 
-    # record audio from microphone
-    # returns filename of recording or None if failed
-    def record_audio(self):
-        # Initialize PyAudio
-        p = pyaudio.PyAudio()
+    # listen for noise
+    # returns true if noise is detected, false if not
+    def listen_for_noise(self):
+        # check pyaudio is initialised
+        if self.p is None:
+            print("chat: pyaudio not initialised")
+            return False
 
         # Open stream
         try:
-            stream = p.open(format=pyaudio.paInt16, channels=1, rate=44100, input=True, frames_per_buffer=1024)
+            self.stream = self.p.open(format=pyaudio.paInt16, channels=1, rate=44100, input=True, frames_per_buffer=1024)
         except:
             print("chat: failed to connect to microphone")
+            return False
+
+        # enable listening and recording
+        self.listening_and_recording_enabled = True
+
+        # throw away first 2 seconds of audio which can be noisy
+        for i in range(0, int(44100 / 1024 * 2)):
+            self.stream.read(1024)
+
+        # listen for noise
+        noise_detected = False
+        while self.listening_and_recording_enabled and not noise_detected:
+            data = self.stream.read(1024)
+            if self.volume_over_threshold(data):
+                noise_detected = True
+
+        # return true if listening enabled and noise detected
+        if self.listening_and_recording_enabled and noise_detected:
+            return True
+
+        # stop and close the stream
+        self.stream.stop_stream()
+        self.stream.close()
+        return False
+
+    # stop listening for noise or recording audio
+    def stop_listening_or_recording(self):
+        self.listening_and_recording_enabled = False
+        
+    # record audio from microphone.  should only be called after listen_for_noise() returns true
+    # returns filename of recording on success
+    # returns None if failed to record or recording volume was below threshold (e.g. nothing recorded)
+    def record_audio(self):
+        # check pyaudio is initialised
+        if self.p is None:
+            print("chat: pyaudio not initialised")
             return None
 
-        # calculate time recording should stop
+        # check stream is open and active
+        if self.stream is None or not self.stream.is_active() or self.stream.is_stopped():
+            print("chat: failed to connect to microphone")
+            return None
+
+        # record for at least 2 seconds
         curr_time = time.time()
-        time_stop = curr_time + 5
+        stop_time = curr_time + 2
 
-        # record until specified time
+        # record until no noise is heard for 2 seconds or user unchecks the record button
+        noise_heard = False
         frames = []
-        while curr_time < time_stop:
-            data = stream.read(1024)
+        while curr_time < stop_time and self.listening_and_recording_enabled:
+            data = self.stream.read(1024)
             frames.append(data)
             curr_time = time.time()
+            if self.volume_over_threshold(data):
+                noise_heard = True
+                stop_time = curr_time + 2
+
+        # if no noise was heard, return None
+        if not noise_heard:
+            return None
 
-        # Stop and close the stream
-        stream.stop_stream()
-        stream.close()
-        p.terminate()
+        # stop and close the stream
+        self.stream.stop_stream()
+        self.stream.close()
 
         # Save audio file
         wf = wave.open("recording.wav", "wb")
@@ -89,3 +152,25 @@ def convert_audio_to_text(self, audio_filename):
             file=audio_file, 
             response_format="text")
         return transcript
+
+    # return true if the volume of a frame of audio is above a given threshold
+    @staticmethod
+    def volume_over_threshold(frame, threshold = 10):
+        # calculate number of samples in the frame
+        num_samples = len(frame) / pyaudio.PyAudio().get_sample_size(pyaudio.paInt16)
+
+        # protect against divide by zero
+        if num_samples == 0:
+            return False
+        format = "%dh" % (num_samples)
+        shorts = struct.unpack(format, frame)
+
+        # iterate over the frame and calculate the RMS volume
+        sum_squares = 0.0
+        for sample in shorts:
+            n = sample * (1.0/32768.0)
+            sum_squares += n*n
+        volume_rms = math.sqrt(sum_squares / num_samples) * 1000
+
+        # return true if volume is above threshold
+        return volume_rms >= threshold
diff --git a/MAVProxy/modules/mavproxy_chat/chat_window.py b/MAVProxy/modules/mavproxy_chat/chat_window.py
@@ -49,8 +49,8 @@ def __init__(self, mpstate):
         self.horiz_sizer = wx.BoxSizer(wx.HORIZONTAL)
 
         # add a record button
-        self.record_button = wx.Button(self.frame, id=-1, label="Rec", size=(75, 25))
-        self.frame.Bind(wx.EVT_BUTTON, self.record_button_click, self.record_button)
+        self.record_button = wx.ToggleButton(self.frame, id=-1, label="Rec", size=(75, 25))
+        self.frame.Bind(wx.EVT_TOGGLEBUTTON, self.record_button_click, self.record_button)
         self.horiz_sizer.Add(self.record_button, proportion = 0, flag = wx.ALIGN_TOP | wx.ALL, border = 5)
 
         # add an input text box
@@ -110,29 +110,38 @@ def apikey_close_button_click(self, event):
 
     # record button clicked
     def record_button_click(self, event):
-        # run record_button_click_execute in a new thread
-        th = Thread(target=self.record_button_click_execute, args=(event,))
-        th.start()
+        if self.record_button.GetValue():
+            # run record_button_click_execute in a new thread
+            th = Thread(target=self.record_button_click_execute, args=(event,))
+            th.start()
+        else:
+            self.chat_voice_to_text.stop_listening_or_recording()
 
     # record button clicked
     def record_button_click_execute(self, event):
-        # record audio
-        rec_filename = self.chat_voice_to_text.record_audio()
-        if rec_filename is None:
-            print("chat: audio recording failed")
-            self.set_status_text("Audio recording failed")
-            return
-
-        # convert audio to text and place in input box
-        text = self.chat_voice_to_text.convert_audio_to_text(rec_filename)
-        if text is None:
-            print("chat: audio to text conversion failed")
-            self.set_status_text("Audio to text conversion failed")
-            return
-        wx.CallAfter(self.text_input.SetValue, text)
-
-        # send text to assistant
-        self.send_text_to_assistant()
+        while True:
+            # listen for noise
+            if not self.chat_voice_to_text.listen_for_noise():
+                # exit if listening failed or user unclicked the record button
+                self.set_status_text("no noise detected")
+                wx.CallAfter(self.record_button.SetValue, False)
+                return
+
+            # noise heard, record audio
+            rec_filename = self.chat_voice_to_text.record_audio()
+            if rec_filename is None:
+                # audio was not recorded so return to listening
+                continue
+
+            # convert audio to text and place in input box
+            text = self.chat_voice_to_text.convert_audio_to_text(rec_filename)
+            if text is None:
+                self.set_status_text("audio to text conversion failed")
+                return
+            wx.CallAfter(self.text_input.SetValue, text)
+
+            # send text to assistant
+            self.send_text_to_assistant()
 
     # send button clicked
     def send_button_click(self, event):