Commit 9649e8f (1 parent: 068339b)

mlx-whisper

File tree: 3 files changed, +14 -63 lines

README.md (+6 -4)

````diff
@@ -2,20 +2,21 @@
 
 OpenAI Whisper API-style local server, runnig on FastAPI. This is for companies behind proxies or security firewalls.
 
-This API will be compatible with [OpenAI Whisper (speech to text) API](https://openai.com/blog/introducing-chatgpt-and-whisper-apis). See also [Create transcription - API Reference - OpenAI API](https://platform.openai.com/docs/api-reference/audio/create).
+This API will be compatible with [OpenAI Whisper (speech to text) API](https://openai.com/blog/introducing-chatgpt-and-whisper-apis). See also [Create transcription - API Reference - OpenAI API](https://platform.openai.com/docs/api-reference/audio/create).
 
 Some of code has been copied from [whisper-ui](https://github.com/hayabhay/whisper-ui)
 
 ## Setup
+
 This was built & tested on Python 3.10.8, Ubutu20.04/WSL2 but should also work on Python 3.9+.
 
 ```bash
 sudo apt install ffmpeg
-pip install fastapi python-multipart pydantic uvicorn ffmpeg-python openai-whisper
+pip install fastapi python-multipart pydantic uvicorn ffmpeg-python
 # or pip install -r requirements.txt
 ```
 
-or
+or
 
 ```bash
 docker compose build
@@ -24,12 +25,13 @@ docker compose build
 ## Usage
 
 ### server
+
 ```bash
 export PYTHONPATH=.
 uvicorn main:app --host 0.0.0.0
 ```
 
-or
+or
 
 ```bash
 docker compose up
````
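For reference, once the server is up (via uvicorn or docker compose), the endpoint defined in main.py below is called in the same shape as the OpenAI API. A minimal client sketch, assuming uvicorn's default port 8000, a local file named sample.mp3, and that response_format is accepted as a form field; none of these specifics are spelled out in this commit:

```python
# Hypothetical client for the local server; assumes http://localhost:8000
# (uvicorn's default port) and a local audio file named sample.mp3.
import requests

with open("sample.mp3", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/v1/audio/transcriptions",
        data={"model": "whisper-1", "response_format": "json"},
        files={"file": f},
    )
resp.raise_for_status()
print(resp.json()["text"])  # the default JSON response carries a 'text' field
```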

main.py (+8 -58)

````diff
@@ -10,73 +10,26 @@
 from datetime import timedelta
 
 import numpy as np
-import whisper
+import mlx_whisper
 
 app = FastAPI()
 
-#url https://api.openai.com/v1/audio/transcriptions \
-# -H "Authorization: Bearer $OPENAI_API_KEY" \
-# -H "Content-Type: multipart/form-data" \
-# -F model="whisper-1" \
-# -F file="@/path/to/file/openai.mp3"
-
-#{
-# "text": "Imagine the wildest idea that you've ever had, and you're curious about how it might scale to something that's a 100, a 1,000 times bigger..."
-#}
-
-# -----
-# copied from https://github.com/hayabhay/whisper-ui
-
-# Whisper transcription functions
-# ----------------
-@lru_cache(maxsize=1)
-def get_whisper_model(whisper_model: str):
-    """Get a whisper model from the cache or download it if it doesn't exist"""
-    model = whisper.load_model(whisper_model)
-    return model
+# @lru_cache(maxsize=1)
+# def get_whisper_model(whisper_model: str):
+#     """Get a whisper model from the cache or download it if it doesn't exist"""
+#     model = mlx_whisper.load_model(whisper_model)
+#     return model
 
 def transcribe(audio_path: str, whisper_model: str, **whisper_args):
     """Transcribe the audio file using whisper"""
-
-    # Get whisper model
-    # NOTE: If mulitple models are selected, this may keep all of them in memory depending on the cache size
-    transcriber = get_whisper_model(whisper_model)
-
-    # Set configs & transcribe
-    if whisper_args["temperature_increment_on_fallback"] is not None:
-        whisper_args["temperature"] = tuple(
-            np.arange(whisper_args["temperature"], 1.0 + 1e-6, whisper_args["temperature_increment_on_fallback"])
-        )
-    else:
-        whisper_args["temperature"] = [whisper_args["temperature"]]
-
-    del whisper_args["temperature_increment_on_fallback"]
-
-    transcript = transcriber.transcribe(
-        audio_path,
-        **whisper_args,
-    )
-
+    transcript = mlx_whisper.transcribe(audio_path)
     return transcript
 
-
 WHISPER_DEFAULT_SETTINGS = {
-    # "whisper_model": "base",
     "whisper_model": "large-v2",
-    "temperature": 0.0,
-    "temperature_increment_on_fallback": 0.2,
-    "no_speech_threshold": 0.6,
-    "logprob_threshold": -1.0,
-    "compression_ratio_threshold": 2.4,
-    "condition_on_previous_text": True,
-    "verbose": False,
-    # "verbose": True,
-    "task": "transcribe",
-    # "task": "translation",
 }
 
 UPLOAD_DIR="/tmp"
-# -----
 
 @app.post('/v1/audio/transcriptions')
 async def transcriptions(model: str = Form(...),
@@ -120,14 +73,12 @@ async def transcriptions(model: str = Form(...),
 
     transcript = transcribe(audio_path=upload_name, **WHISPER_DEFAULT_SETTINGS)
 
-
     if response_format in ['text']:
         return transcript['text']
 
     if response_format in ['srt']:
         ret = ""
         for seg in transcript['segments']:
-
             td_s = timedelta(milliseconds=seg["start"]*1000)
             td_e = timedelta(milliseconds=seg["end"]*1000)
 
@@ -151,11 +102,10 @@ async def transcriptions(model: str = Form(...),
         return ret
 
     if response_format in ['verbose_json']:
-        transcript.setdefault('task', WHISPER_DEFAULT_SETTINGS['task'])
+        transcript.setdefault('task', 'transcribe')
         transcript.setdefault('duration', transcript['segments'][-1]['end'])
         if transcript['language'] == 'ja':
             transcript['language'] = 'japanese'
         return transcript
 
     return {'text': transcript['text']}
-
````
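The replacement path hands the whole job to mlx_whisper.transcribe(), which returns a dict with the 'text', 'segments', and 'language' keys the handler consumes. A standalone sketch of that call; "sample.mp3" is a placeholder, and note that whisper_model is now accepted but unused, so the "large-v2" entry in WHISPER_DEFAULT_SETTINGS no longer selects the model:

```python
# Standalone sketch of the new mlx-whisper call path (Apple Silicon).
# "sample.mp3" is a placeholder file name, not part of the commit.
import mlx_whisper

result = mlx_whisper.transcribe("sample.mp3")

print(result["text"])           # full transcript; used for 'text'/json responses
print(result["language"])       # e.g. 'ja', remapped to 'japanese' for verbose_json
for seg in result["segments"]:  # per-segment timing used to build 'srt' output
    print(seg["start"], seg["end"], seg["text"])
```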

requirements.txt (-1)

````diff
@@ -19,7 +19,6 @@ nvidia-cublas-cu11==11.10.3.66
 nvidia-cuda-nvrtc-cu11==11.7.99
 nvidia-cuda-runtime-cu11==11.7.99
 nvidia-cudnn-cu11==8.5.0.96
-openai-whisper==20230308
 packaging==23.0
 pydantic==1.10.6
 python-multipart==0.0.6
````
