How to use speech_recognition and pyannote.audio simultaneously
10:13 21 Jan 2025

How can I use the data from speech_recognition's listen() function as an embedding to compare with previously recorded .wav files of different speakers talking so that I can print (speaker): (recognized text)? If possible, as my memory system is an SD card, I'd like to get the audio data live instead of storing it. I have tried to do this following the errors but the cosine distance values end up always around 1, with no apparent relation to who is actually speaking:

import speech_recognition as sr
from pyannote.audio import Model, Inference
from scipy.spatial.distance import cdist
import torch
import numpy

recognizer = sr.Recognizer()
recognizer.dynamic_energy_threshold = False
recognizer.energy_threshold=1100
recognizer.pause_threshold=3

embedding_model = Model.from_pretrained("pyannote/embedding", use_auth_token="(my token)", device="cpu")
inference = Inference(embedding_model, device=torch.device("cuda" if torch.cuda.is_available() else "cpu"), window="whole")
Speaker_embedding = inference("voice_sample.wav")

with sr.Microphone() as source:
    audio = recognizer.listen(source, timeout=10)
    recognized_text = recognizer.recognize_google(audio, language="en")
                        
    audio_data = np.frombuffer(audio.get_raw_data(), dtype=np.int16).astype(np.float32) / 32768.0
    live_audio = {"waveform": torch.tensor(audio_data).unsqueeze(0), "sample_rate": 16000}
    live_embedding = inference(live_audio)
    distance = cdist(live_embedding, Speaker_embedding, metric="cosine")[0, 0]
    print(distance)
    print(recognized_text)

The speech recognition works, though.

Also, weirdly, the example from https://huggingface.co/pyannote/embedding:

from pyannote.audio import Model 
model = Model.from_pretrained("pyannote/embedding", use_auth_token="ACCESS_TOKEN_GOES_HERE")

from pyannote.audio import Inference 
inference = Inference(model, window="whole") 
embedding1 = inference("speaker1.wav") 
embedding2 = inference("speaker2.wav") 
# `embeddingX` is (1 x D) numpy array extracted from the file as a whole.  

from scipy.spatial.distance import cdist 
distance = cdist(embedding1, embedding2, metric="cosine")[0,0]

raises this:
ValueError: XA must be a 2-dimensional array (reshaping them with np.reshape(1, -1) is what then gives the strange distance values)

python formatting speech-recognition cosine-similarity speaker-diarization