More tidying/documentation
philmcmahon committed Jan 23, 2025
1 parent c889435 commit 5229a36
Showing 9 changed files with 44 additions and 21 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -9,4 +9,4 @@ build
localstack/
**/package-lock.json
**.DS_Store
worker-tmp-files/
worker-tmp-files/*
25 changes: 21 additions & 4 deletions README.md
@@ -1,16 +1,19 @@
# transcription-service

A self-service app for journalists to upload audio/video files and receive transcripts via email notification.

We use localstack to run SQS locally rather than needing to create 'dev' queues in AWS. This is set up via Docker.
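
In DEV, the app's SQS clients just need their endpoint pointed at localstack. A minimal sketch of that idea (the endpoint and queue URL below match what `scripts/setup.sh` creates, assuming `APP_NAME` is `transcription-service`; the client code itself is illustrative rather than the app's actual code):

```typescript
import { SQSClient, SendMessageCommand } from '@aws-sdk/client-sqs';

// Point the client at the localstack container instead of AWS
const sqsClient = new SQSClient({
	region: 'eu-west-1',
	endpoint: 'http://localhost:4566',
});

const sendExample = async () => {
	await sqsClient.send(
		new SendMessageCommand({
			// 000000000000 is localstack's dummy account id; the queue name
			// assumes APP_NAME=transcription-service in scripts/setup.sh
			QueueUrl:
				'http://localhost:4566/000000000000/transcription-service-task-queue-DEV.fifo',
			MessageBody: JSON.stringify({ example: true }),
			MessageGroupId: 'example', // required for FIFO queues
		}),
	);
};

sendExample();
```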

## Get started

1. Get Janus creds (for fetching creds from AWS Parameter Store)
2. Use the `scripts/setup.sh` script to install dependencies, set up the nginx mapping and create a Docker-based SQS queue:

```bash
nvm use
scripts/setup.sh
```

3. Run the [express](https://expressjs.com/) backend API:

@@ -25,23 +28,37 @@ npm run client::start

If all goes well, the frontend is available at https://transcribe.local.dev-gutools.co.uk and the backend at https://api.transcribe.local.dev-gutools.co.uk.


## Emulating a production deployment

Occasionally you will want to develop something which relies on the specific ways we deploy into production.

In development we run two web servers: the client Next.js dev server, which has features like auto-reloading on changes, and the API express server, which the dev server proxies to.

In production we run only an express server, which serves the client bundle whenever you hit a non-API endpoint, so that the clientside can handle routing for non-API paths.
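
The shape of that setup, as a minimal sketch (the route paths, bundle location and port are assumptions for illustration, not the app's actual code):

```typescript
import path from 'path';
import express from 'express';

const app = express();

// API routes are registered first...
app.get('/api/healthcheck', (_req, res) => {
	res.send('ok');
});

// ...then static assets and a catch-all fallback serve the client
// bundle, so the clientside router handles every non-API path
const clientDir = path.join(__dirname, 'client');
app.use(express.static(clientDir));
app.use((_req, res) => {
	res.sendFile(path.join(clientDir, 'index.html'));
});

app.listen(3000);
```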

If you are writing something that depends specifically on interactions between the API server and the frontend, you may want to check it works in production. First update the config value of `rootUrl` to `https://api.transcribe.local.dev-gutools.co.uk`, then run `npm run emulate-prod-locally`. This triggers a build and has your express web server serve the frontend bundle rather than the Next.js server.

Then you can test the app using [https://api.transcribe.local.dev-gutools.co.uk](https://api.transcribe.local.dev-gutools.co.uk)

## Purging local queue

If you change the structure of messages on the queue you'll probably want to purge all local messages. There's a script
for that!

```bash
./scripts/purge-local-queue.sh
```
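
Under the hood, a purge is a single SQS call per queue. Roughly the equivalent, sketched with the AWS SDK (the queue URL assumes the one `scripts/setup.sh` prints; the script itself may do this via the aws CLI instead):

```typescript
import { SQSClient, PurgeQueueCommand } from '@aws-sdk/client-sqs';

const sqsClient = new SQSClient({
	region: 'eu-west-1',
	endpoint: 'http://localhost:4566',
});

// PurgeQueue deletes every message in the queue in one call (SQS only
// allows one purge per queue every 60 seconds)
sqsClient
	.send(
		new PurgeQueueCommand({
			QueueUrl:
				'http://localhost:4566/000000000000/transcription-service-task-queue-DEV.fifo',
		}),
	)
	.then(() => console.log('local task queue purged'));
```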

## Whisper engine

This project currently makes use of both https://github.com/m-bain/whisperX and https://github.com/ggerganov/whisper.cpp.
WhisperX needs to run on a GPU instance with Nvidia CUDA drivers and a mountain of Python dependencies installed. To improve
transcription performance, these are baked into the AMI used for the transcription workers - see these PRs for further details:

- https://github.com/guardian/amigo/pull/1604
- https://github.com/guardian/amigo/pull/1606
- https://github.com/guardian/amigo/pull/1607

We are currently trialling WhisperX in the hope of improved performance and speaker diarization support. Assuming WhisperX
delivers satisfactory performance, cost and transcript quality, we intend to remove whisper.cpp, thereby significantly
simplifying our current infrastructure and the worker app.
4 changes: 1 addition & 3 deletions packages/cdk/lib/transcription-service.ts
@@ -288,7 +288,6 @@ export class TranscriptionService extends GuStack {

const workerApp = `${APP_NAME}-worker`;
const userData = UserData.forLinux({ shebang: '#!/bin/bash' });
const gpuUserData = UserData.forLinux({ shebang: '#!/bin/bash' });

const userDataCommands = [
`export STAGE=${props.stage}`,
@@ -299,7 +298,6 @@
].join('\n');

userData.addCommands(userDataCommands);
gpuUserData.addCommands(userDataCommands);

const loggingStreamName =
GuLoggingStreamNameParameter.getInstance(this).valueAsString;
@@ -420,7 +418,7 @@
volume: BlockDeviceVolume.ebs(100),
},
],
userData: gpuUserData,
userData,
},
);

6 changes: 5 additions & 1 deletion packages/client/src/components/UploadForm.tsx
@@ -476,6 +476,10 @@ export const UploadForm = () => {
htmlFor="diarization-checkbox"
value="Speaker identification"
/>
<p className="font-light">
Speaker identification is a new feature - please share any
feedback with us.
</p>
</div>
<div className={'ml-3'}>
<div className="flex h-5 items-center gap-2">
@@ -486,7 +490,7 @@
/>
<div className="flex flex-col">
<Label htmlFor="diarization" className="font-light text-base">
Attempt speaker identification
Request speaker identification
</Label>
</div>
</div>
2 changes: 1 addition & 1 deletion packages/worker/package.json
@@ -6,7 +6,7 @@
"scripts": {
"build": "esbuild --bundle --platform=node --target=node20 --outfile=dist/index.js src/index.ts",
"package": "docker run --rm -v $PWD:/worker $(docker build -q deb-build/) fpm",
"start": "STAGE=DEV nodemon --verbose --watch src src/index.ts"
"start": "STAGE=DEV nodemon --watch src src/index.ts"
},
"keywords": [],
"author": "",
10 changes: 4 additions & 6 deletions packages/worker/src/index.ts
@@ -43,7 +43,7 @@ import { MAX_RECEIVE_COUNT } from '@guardian/transcription-service-common';
import { checkSpotInterrupt } from './spot-termination';
import { AutoScalingClient } from '@aws-sdk/client-auto-scaling';

const POLLING_INTERVAL_SECONDS = 5;
const POLLING_INTERVAL_SECONDS = 15;

// Mutable variable is needed here to get feedback from checkSpotInterrupt
let INTERRUPTION_TIME: Date | undefined = undefined;
@@ -74,11 +74,9 @@ const main = async () => {
const asgName = isGpu
? `transcription-service-gpu-workers-${config.app.stage}`
: `transcription-service-workers-${config.app.stage}`;
const taskQueueUrl = isGpu
? config.app.gpuTaskQueueUrl
: config.app.taskQueueUrl;
const queueUrl = isGpu ? config.app.gpuTaskQueueUrl : config.app.taskQueueUrl;

console.log('QUEUE URL', taskQueueUrl, isGpu);
logger.info(`Worker reading from queue ${queueUrl}`);

if (config.app.stage !== 'DEV') {
// start job to regularly check the instance interruption (Note: deliberately not using await here so the job
@@ -99,7 +97,7 @@
await pollTranscriptionQueue(
pollCount,
sqsClient,
taskQueueUrl,
queueUrl,
autoScalingClient,
asgName,
metrics,
6 changes: 5 additions & 1 deletion packages/worker/src/transcribe.ts
@@ -308,6 +308,8 @@ export const runWhisperX = async (
const languageCodeParam =
languageCode === 'auto' ? [] : ['--language', languageCode];
const translateParam = translate ? ['--task', 'translate'] : [];
// On mac arm processors, we need to set the compute type to int8
// see https://github.com/m-bain/whisperX?tab=readme-ov-file#usage--command-line
const computeParam = stage === 'DEV' ? ['--compute', 'int8'] : [];
try {
const diarizeParam = diarize ? [`--diarize`] : [];
@@ -340,7 +342,9 @@ export const runWhisper = async (
) => {
const { containerId, numberOfThreads, model, wavPath } = whisperBaseParams;
if (!containerId) {
throw new Error("Container id undefined - can't run whisper container");
throw new Error(
"Container id undefined - can't run whisper container (has this worker ended up in whisperX mode?)",
);
}
const fileName = path.parse(wavPath).name;
logger.info(
6 changes: 4 additions & 2 deletions scripts/download_whisperx_models.py
@@ -59,8 +59,9 @@ def download_huggingface_align_models():
# Diarization - see https://github.com/m-bain/whisperX/blob/main/whisperx/diarize.py

def download_diarization_models(auth_token):
PYANNOTE_MODEL="pyannote/speaker-diarization-3.1"
Pipeline.from_pretrained(PYANNOTE_MODEL, use_auth_token=auth_token)
pyannote_model="pyannote/speaker-diarization-3.1"
print(f"Downloading diarization models {pyannote_model}")
Pipeline.from_pretrained(pyannote_model, use_auth_token=auth_token)

# faster-whisper models

@@ -88,6 +89,7 @@ def download_model(
Returns:
The path to the downloaded model.
"""
print(f"Downloading whisper model {model}")
repo_id = WHISPER_MODELS.get(model)

allow_patterns = [
4 changes: 2 additions & 2 deletions scripts/setup.sh
@@ -48,7 +48,7 @@ TASK_QUEUE_URL=$(aws --endpoint-url=http://localhost:4566 sqs create-queue --que
# We don't install the localstack dns so need to replace the endpoint with localhost
TASK_QUEUE_URL_LOCALHOST=${TASK_QUEUE_URL/sqs.eu-west-1.localhost.localstack.cloud/localhost}

echo "Created task queue in localstack, url: ${TASK_QUEUE_URL_LOCALHOST}"
echo "Created cpu task queue in localstack, url: ${TASK_QUEUE_URL_LOCALHOST}"

GPU_TASK_QUEUE_URL=$(aws --endpoint-url=http://localhost:4566 sqs create-queue --queue-name=$APP_NAME-gpu-task-queue-DEV.fifo \
--attributes '{
@@ -59,7 +59,7 @@ GPU_TASK_QUEUE_URL=$(aws --endpoint-url=http://localhost:4566 sqs create-queue -
# We don't install the localstack dns so need to replace the endpoint with localhost
GPU_TASK_QUEUE_URL_LOCALHOST=${GPU_TASK_QUEUE_URL/sqs.eu-west-1.localhost.localstack.cloud/localhost}

echo "Created task queue in localstack, url: ${GPU_TASK_QUEUE_URL_LOCALHOST}"
echo "Created gpu task queue in localstack, url: ${GPU_TASK_QUEUE_URL_LOCALHOST}"

#########
##### output queue
