Commit 9649e8f (1 parent: 068339b)

mlx-whisper

File tree: 3 files changed, +14 -63 lines

README.md (+6 -4)

````diff
@@ -2,20 +2,21 @@
 
 OpenAI Whisper API-style local server, runnig on FastAPI. This is for companies behind proxies or security firewalls.
 
-This API will be compatible with [OpenAI Whisper (speech to text) API](https://openai.com/blog/introducing-chatgpt-and-whisper-apis). See also [Create transcription - API Reference - OpenAI API](https://platform.openai.com/docs/api-reference/audio/create).
+This API will be compatible with [OpenAI Whisper (speech to text) API](https://openai.com/blog/introducing-chatgpt-and-whisper-apis). See also [Create transcription - API Reference - OpenAI API](https://platform.openai.com/docs/api-reference/audio/create).
 
 Some of code has been copied from [whisper-ui](https://github.com/hayabhay/whisper-ui)
 
 ## Setup
+
 This was built & tested on Python 3.10.8, Ubutu20.04/WSL2 but should also work on Python 3.9+.
 
 ```bash
 sudo apt install ffmpeg
-pip install fastapi python-multipart pydantic uvicorn ffmpeg-python openai-whisper
+pip install fastapi python-multipart pydantic uvicorn ffmpeg-python
 # or pip install -r requirements.txt
 ```
 
-or
+or
 
 ```bash
 docker compose build
@@ -24,12 +25,13 @@ docker compose build
 ## Usage
 
 ### server
+
 ```bash
 export PYTHONPATH=.
 uvicorn main:app --host 0.0.0.0
 ```
 
-or
+or
 
 ```bash
 docker compose up
````
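For reference, once the server is up (via uvicorn or docker compose), the endpoint defined in main.py below is called in the same shape as the OpenAI API. A minimal client sketch, assuming uvicorn's default port 8000, a local file named sample.mp3, and that response_format is accepted as a form field; none of these specifics are spelled out in this commit:

```python
# Hypothetical client for the local server; assumes http://localhost:8000
# (uvicorn's default port) and a local audio file named sample.mp3.
import requests

with open("sample.mp3", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/v1/audio/transcriptions",
        data={"model": "whisper-1", "response_format": "json"},
        files={"file": f},
    )
resp.raise_for_status()
print(resp.json()["text"])  # the default JSON response carries a 'text' field
```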

main.py (+8 -58)

````diff
@@ -10,73 +10,26 @@
 from datetime import timedelta
 
 import numpy as np
-import whisper
+import mlx_whisper
 
 app = FastAPI()
 
-#url https://api.openai.com/v1/audio/transcriptions \
-# -H "Authorization: Bearer $OPENAI_API_KEY" \
-# -H "Content-Type: multipart/form-data" \
-# -F model="whisper-1" \
-# -F file="@/path/to/file/openai.mp3"
-
-#{
-# "text": "Imagine the wildest idea that you've ever had, and you're curious about how it might scale to something that's a 100, a 1,000 times bigger..."
-#}
-
-# -----
-# copied from https://github.com/hayabhay/whisper-ui
-
-# Whisper transcription functions
-# ----------------
-@lru_cache(maxsize=1)
-def get_whisper_model(whisper_model: str):
-    """Get a whisper model from the cache or download it if it doesn't exist"""
-    model = whisper.load_model(whisper_model)
-    return model
+# @lru_cache(maxsize=1)
+# def get_whisper_model(whisper_model: str):
+#     """Get a whisper model from the cache or download it if it doesn't exist"""
+#     model = mlx_whisper.load_model(whisper_model)
+#     return model
 
 def transcribe(audio_path: str, whisper_model: str, **whisper_args):
     """Transcribe the audio file using whisper"""
-
-    # Get whisper model
-    # NOTE: If mulitple models are selected, this may keep all of them in memory depending on the cache size
-    transcriber = get_whisper_model(whisper_model)
-
-    # Set configs & transcribe
-    if whisper_args["temperature_increment_on_fallback"] is not None:
-        whisper_args["temperature"] = tuple(
-            np.arange(whisper_args["temperature"], 1.0 + 1e-6, whisper_args["temperature_increment_on_fallback"])
-        )
-    else:
-        whisper_args["temperature"] = [whisper_args["temperature"]]
-
-    del whisper_args["temperature_increment_on_fallback"]
-
-    transcript = transcriber.transcribe(
-        audio_path,
-        **whisper_args,
-    )
-
+    transcript = mlx_whisper.transcribe(audio_path)
     return transcript
 
-
 WHISPER_DEFAULT_SETTINGS = {
-    # "whisper_model": "base",
     "whisper_model": "large-v2",
-    "temperature": 0.0,
-    "temperature_increment_on_fallback": 0.2,
-    "no_speech_threshold": 0.6,
-    "logprob_threshold": -1.0,
-    "compression_ratio_threshold": 2.4,
-    "condition_on_previous_text": True,
-    "verbose": False,
-    # "verbose": True,
-    "task": "transcribe",
-    # "task": "translation",
 }
 
 UPLOAD_DIR="/tmp"
-# -----
 
 @app.post('/v1/audio/transcriptions')
 async def transcriptions(model: str = Form(...),
@@ -120,14 +73,12 @@ async def transcriptions(model: str = Form(...),
 
     transcript = transcribe(audio_path=upload_name, **WHISPER_DEFAULT_SETTINGS)
 
-
     if response_format in ['text']:
         return transcript['text']
 
     if response_format in ['srt']:
         ret = ""
         for seg in transcript['segments']:
-
             td_s = timedelta(milliseconds=seg["start"]*1000)
             td_e = timedelta(milliseconds=seg["end"]*1000)
 
@@ -151,11 +102,10 @@ async def transcriptions(model: str = Form(...),
         return ret
 
     if response_format in ['verbose_json']:
-        transcript.setdefault('task', WHISPER_DEFAULT_SETTINGS['task'])
+        transcript.setdefault('task', 'transcribe')
         transcript.setdefault('duration', transcript['segments'][-1]['end'])
         if transcript['language'] == 'ja':
             transcript['language'] = 'japanese'
         return transcript
 
     return {'text': transcript['text']}
-
````
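The replacement path hands the whole job to mlx_whisper.transcribe(), which returns a dict with the 'text', 'segments', and 'language' keys the handler consumes. A standalone sketch of that call; "sample.mp3" is a placeholder, and note that whisper_model is now accepted but unused, so the "large-v2" entry in WHISPER_DEFAULT_SETTINGS no longer selects the model:

```python
# Standalone sketch of the new mlx-whisper call path (Apple Silicon).
# "sample.mp3" is a placeholder file name, not part of the commit.
import mlx_whisper

result = mlx_whisper.transcribe("sample.mp3")

print(result["text"])           # full transcript; used for 'text'/json responses
print(result["language"])       # e.g. 'ja', remapped to 'japanese' for verbose_json
for seg in result["segments"]:  # per-segment timing used to build 'srt' output
    print(seg["start"], seg["end"], seg["text"])
```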

requirements.txt (-1)

````diff
@@ -19,7 +19,6 @@ nvidia-cublas-cu11==11.10.3.66
 nvidia-cuda-nvrtc-cu11==11.7.99
 nvidia-cuda-runtime-cu11==11.7.99
 nvidia-cudnn-cu11==8.5.0.96
-openai-whisper==20230308
 packaging==23.0
 pydantic==1.10.6
 python-multipart==0.0.6
````
