docs/usage/speech-embedding.md (16 changes: 8 additions & 8 deletions)
@@ -15,10 +15,10 @@ export SPEACHES_BASE_URL="http://localhost:8000"
 uvx speaches-cli registry ls --task speaker-embedding | jq '.data | [].id'

 # Downloading a model
-uvx speaches-cli model download deepghs/pyannote-embedding-onnx
+uvx speaches-cli model download Wespeaker/wespeaker-voxceleb-resnet34-LM

 # Check that the model has been installed
-uvx speaches-cli model ls --task speaker-embedding | jq '.data | map(select(.id == "deepghs/pyannote-embedding-onnx"))'
+uvx speaches-cli model ls --task speaker-embedding | jq '.data | map(select(.id == "Wespeaker/wespeaker-voxceleb-resnet34-LM"))'
 ```

 ## Usage
@@ -27,7 +27,7 @@ uvx speaches-cli model ls --task speaker-embedding | jq '.data | map(select(.id

 ```bash
 export SPEACHES_BASE_URL="http://localhost:8000"
-export EMBEDDING_MODEL_ID="deepghs/pyannote-embedding-onnx"
+export EMBEDDING_MODEL_ID="Wespeaker/wespeaker-voxceleb-resnet34-LM"

 curl -s "$SPEACHES_BASE_URL/v1/audio/speech/embedding" \
   -F "file=@audio.wav" \
@@ -43,7 +43,7 @@ curl -s "$SPEACHES_BASE_URL/v1/audio/speech/embedding" \

 with open('audio.wav', 'rb') as f:
     files = {'file': ('audio.wav', f)}
-    data = {'model': 'deepghs/pyannote-embedding-onnx'}
+    data = {'model': 'Wespeaker/wespeaker-voxceleb-resnet34-LM'}
     response = httpx.post(
         'http://localhost:8000/v1/audio/speech/embedding',
         files=files,
@@ -62,7 +62,7 @@ curl -s "$SPEACHES_BASE_URL/v1/audio/speech/embedding" \

 with open('audio.wav', 'rb') as f:
     files = {'file': ('audio.wav', f)}
-    data = {'model': 'deepghs/pyannote-embedding-onnx'}
+    data = {'model': 'Wespeaker/wespeaker-voxceleb-resnet34-LM'}
     response = requests.post(
         'http://localhost:8000/v1/audio/speech/embedding',
         files=files,
@@ -101,7 +101,7 @@ def cosine_similarity(embedding1: list[float], embedding2: list[float]) -> float
     vec2 = np.array(embedding2)
     return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

-model_id = 'deepghs/pyannote-embedding-onnx'
+model_id = 'Wespeaker/wespeaker-voxceleb-resnet34-LM'

 embedding1 = get_embedding('speaker1.wav', model_id)
 embedding2 = get_embedding('speaker2.wav', model_id)
@@ -154,7 +154,7 @@ class SpeakerVerifier:

 verifier = SpeakerVerifier(
     base_url='http://localhost:8000',
-    model_id='deepghs/pyannote-embedding-onnx',
+    model_id='Wespeaker/wespeaker-voxceleb-resnet34-LM',
     threshold=0.7
 )

@@ -182,7 +182,7 @@ The response follows a structure similar to OpenAI's text embedding endpoint:
       ]
     }
   ],
-  "model": "deepghs/pyannote-embedding-onnx",
+  "model": "Wespeaker/wespeaker-voxceleb-resnet34-LM",
   "usage": {
     "prompt_tokens": 48000,
     "total_tokens": 48000
pyproject.toml (3 changes: 2 additions & 1 deletion)
@@ -41,13 +41,14 @@ dependencies = [
     "opentelemetry-instrumentation-urllib3==0.50b0",
     "opentelemetry-instrumentation-openai-v2>=2.1b0",
     "opentelemetry-instrumentation-openai>=0.37.1",
+    "onnx-diarization>=0.1.0",
 ]

 [dependency-groups]
 dev = [
     {include-group = "docs"},
     {include-group = "lint"},
-    {include-group = "test"}
+    {include-group = "test"},
 ]
 docs = [
     "mdx-truly-sane-lists>=1.3",
src/speaches/diarization.py (9 changes: 9 additions & 0 deletions)
@@ -0,0 +1,9 @@
+from pydantic import BaseModel, ConfigDict
+
+from speaches.audio import Audio
+
+class KnownSpeaker(BaseModel):
+    name: str
+    audio: Audio
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
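
Background note (not part of the diff): pydantic v2 rejects plain classes as field annotations unless the model opts in with `arbitrary_types_allowed`, which is presumably why the new `KnownSpeaker` model needs it for the non-pydantic `Audio` type. A minimal standalone sketch of the pattern follows; `RawAudio` is a hypothetical stand-in, not the real `speaches.audio.Audio`.

```python
# Minimal sketch of the arbitrary_types_allowed pattern (illustrative only).
from pydantic import BaseModel, ConfigDict


class RawAudio:
    """Hypothetical stand-in for a non-pydantic type such as Audio."""

    def __init__(self, samples: bytes) -> None:
        self.samples = samples


class Speaker(BaseModel):
    name: str
    audio: RawAudio  # plain class: pydantic validates it only via isinstance()

    model_config = ConfigDict(arbitrary_types_allowed=True)


speaker = Speaker(name="alice", audio=RawAudio(b"\x00\x01"))
print(speaker.name, len(speaker.audio.samples))
```

With this config, pydantic falls back to an `isinstance` check for such fields instead of building a validation schema for them.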
src/speaches/executors/pyannote_speaker_embedding.py (178 changes: 0 additions & 178 deletions)

This file was deleted.

src/speaches/executors/pyannote_speaker_segmentation.py (2 changes: 1 addition & 1 deletion)
@@ -38,7 +38,7 @@ class PyannoteModelFiles(BaseModel):
 # MODEL_ID_BLACKLIST = {
 #     "eek/wespeaker-voxceleb-resnet293-LM"  # reason: doesn't have `task` tag, also has pytorch binary file, onnx model file isn't named `model.onnx`
 # }
-MODEL_ID_WHITELIST = {"onnx-community/pyannote-segmentation-3.0"}
+MODEL_ID_WHITELIST = {"fedirz/segmentation_community_1"}


 class PyannoteSpeakerSegmentationModelRegistry(ModelRegistry):
src/speaches/executors/shared/registry.py (39 changes: 26 additions & 13 deletions)
@@ -2,20 +2,22 @@

 from typing import TYPE_CHECKING

+from speaches.executors.pyannote_speaker_segmentation import (
+    PyannoteSpeakerSegmentationModelManager,
+    pyannote_speaker_segmentation_model_registry,
+)
+
 if TYPE_CHECKING:
     from speaches.config import Config

 from speaches.executors.kokoro import KokoroModelManager, kokoro_model_registry
 from speaches.executors.parakeet import ParakeetModelManager, parakeet_model_registry
 from speaches.executors.piper import PiperModelManager, piper_model_registry
-from speaches.executors.pyannote_speaker_embedding import (
-    PyannoteSpeakerEmbeddingModelManager,
-    pyannote_speaker_embedding_model_registry,
-)
 from speaches.executors.shared.executor import Executor
-from speaches.executors.silero_vad_v5 import (
-    SileroVADModelManager,
-    silero_vad_model_registry,
+from speaches.executors.silero_vad_v5 import SileroVADModelManager, silero_vad_model_registry
+from speaches.executors.wespeaker_speaker_embedding import (
+    WespeakerSpeakerEmbeddingModelManager,
+    wespeaker_speaker_embedding_model_registry,
 )
 from speaches.executors.whisper import WhisperModelManager, whisper_model_registry

@@ -46,12 +48,18 @@ def __init__(self, config: Config) -> None:
             model_registry=kokoro_model_registry,
             task="text-to-speech",
         )
-        self._pyannote_executor = Executor(
-            name="pyannote",
-            model_manager=PyannoteSpeakerEmbeddingModelManager(config.stt_model_ttl, config.unstable_ort_opts),
-            model_registry=pyannote_speaker_embedding_model_registry,
+        self._wespeaker_speaker_embedding_executor = Executor(
+            name="wespeaker-speaker-embedding",
+            model_manager=WespeakerSpeakerEmbeddingModelManager(0, config.unstable_ort_opts),  # HACK: hardcoded ttl
+            model_registry=wespeaker_speaker_embedding_model_registry,
             task="speaker-embedding",
         )
+        self._pyannote_speaker_segmentation_executor = Executor(
+            name="pyannote-speaker-segmentation",
+            model_manager=PyannoteSpeakerSegmentationModelManager(0, config.unstable_ort_opts),  # HACK: hardcoded ttl
+            model_registry=pyannote_speaker_segmentation_model_registry,
+            task="voice-activity-detection",
+        )
         self._vad_executor = Executor(
             name="vad",
             model_manager=SileroVADModelManager(config.vad_model_ttl, config.unstable_ort_opts),
@@ -73,7 +81,11 @@ def text_to_speech(self):  # noqa: ANN201

     @property
     def speaker_embedding(self):  # noqa: ANN201
-        return (self._pyannote_executor,)
+        return (self._wespeaker_speaker_embedding_executor,)
+
+    @property
+    def speaker_segmentation(self):  # noqa: ANN201
+        return (self._pyannote_speaker_segmentation_executor,)

     @property
     def vad(self):  # noqa: ANN201
@@ -85,7 +97,8 @@ def all_executors(self):  # noqa: ANN201
             self._parakeet_executor,
             self._piper_executor,
             self._kokoro_executor,
-            self._pyannote_executor,
+            self._wespeaker_speaker_embedding_executor,
+            self._pyannote_speaker_segmentation_executor,
             self._vad_executor,
         )

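
For context (not part of the diff, and not the real `Executor`/registry API): a standalone toy version of the registry wiring shown above. It only spells out why the per-task properties return tuples (a task can be backed by more than one executor) and how `all_executors` exposes the full set in one place. The names mirror the diff; everything else is a simplified assumption.

```python
# Toy illustration of the per-task executor registry pattern (sketch only).
from dataclasses import dataclass


@dataclass(frozen=True)
class ToyExecutor:
    name: str
    task: str


class ToyExecutorRegistry:
    def __init__(self) -> None:
        self._wespeaker_speaker_embedding_executor = ToyExecutor("wespeaker-speaker-embedding", "speaker-embedding")
        self._pyannote_speaker_segmentation_executor = ToyExecutor("pyannote-speaker-segmentation", "voice-activity-detection")
        self._vad_executor = ToyExecutor("vad", "voice-activity-detection")

    @property
    def speaker_embedding(self) -> tuple[ToyExecutor, ...]:
        return (self._wespeaker_speaker_embedding_executor,)

    @property
    def speaker_segmentation(self) -> tuple[ToyExecutor, ...]:
        return (self._pyannote_speaker_segmentation_executor,)

    @property
    def all_executors(self) -> tuple[ToyExecutor, ...]:
        return (
            self._wespeaker_speaker_embedding_executor,
            self._pyannote_speaker_segmentation_executor,
            self._vad_executor,
        )


registry = ToyExecutorRegistry()
print([executor.name for executor in registry.all_executors])
```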