How can I use the data from speech_recognition's listen() function as an embedding to compare with previously recorded .wav files of different speakers talking so that I can print (speaker): (recognized text)? If possible, as my memory system is an SD card, I'd like to get the audio data live instead of storing it. I have tried to do this following the errors but the cosine distance values end up always around 1, with no apparent relation to who is actually speaking:
import speech_recognition as sr
from pyannote.audio import Model, Inference
from scipy.spatial.distance import cdist
import torch
import numpy
recognizer = sr.Recognizer()
recognizer.dynamic_energy_threshold = False
recognizer.energy_threshold=1100
recognizer.pause_threshold=3
embedding_model = Model.from_pretrained("pyannote/embedding", use_auth_token="(my token)", device="cpu")
inference = Inference(embedding_model, device=torch.device("cuda" if torch.cuda.is_available() else "cpu"), window="whole")
Speaker_embedding = inference("voice_sample.wav")
with sr.Microphone() as source:
audio = recognizer.listen(source, timeout=10)
recognized_text = recognizer.recognize_google(audio, language="en")
audio_data = np.frombuffer(audio.get_raw_data(), dtype=np.int16).astype(np.float32) / 32768.0
live_audio = {"waveform": torch.tensor(audio_data).unsqueeze(0), "sample_rate": 16000}
live_embedding = inference(live_audio)
distance = cdist(live_embedding, Speaker_embedding, metric="cosine")[0, 0]
print(distance)
print(recognized_text)
The speech recognition works, though.
Also, weirdly, the example from https://huggingface.co/pyannote/embedding:
from pyannote.audio import Model
model = Model.from_pretrained("pyannote/embedding", use_auth_token="ACCESS_TOKEN_GOES_HERE")
from pyannote.audio import Inference
inference = Inference(model, window="whole")
embedding1 = inference("speaker1.wav")
embedding2 = inference("speaker2.wav")
# `embeddingX` is (1 x D) numpy array extracted from the file as a whole.
from scipy.spatial.distance import cdist
distance = cdist(embedding1, embedding2, metric="cosine")[0,0]
raises this:
ValueError: XA must be a 2-dimensional array (reshaping them with np.reshape(1, -1) is what then gives the strange distance values)