Skip to content

Commit

Permalink
zip implementation in progress
Browse files Browse the repository at this point in the history
  • Loading branch information
marjisound committed Jan 17, 2025
1 parent 28b785e commit c80db41
Show file tree
Hide file tree
Showing 8 changed files with 666 additions and 16 deletions.
595 changes: 593 additions & 2 deletions package-lock.json

Large diffs are not rendered by default.

8 changes: 5 additions & 3 deletions packages/backend-common/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,16 @@
"name": "@guardian/transcription-service-backend-common",
"version": "1.0.0",
"dependencies": {
"@aws-sdk/client-auto-scaling": "^3.624.0",
"@aws-sdk/client-cloudwatch": "^3.624.0",
"@aws-sdk/client-dynamodb": "3.624.0",
"@aws-sdk/client-s3": "^3.624.0",
"@aws-sdk/client-secrets-manager": "^3.624.0",
"@aws-sdk/client-sqs": "^3.624.0",
"@aws-sdk/client-ssm": "^3.624.0",
"@aws-sdk/lib-dynamodb": "3.624.0",
"@aws-sdk/client-secrets-manager": "^3.624.0",
"@aws-sdk/client-cloudwatch": "^3.624.0",
"@aws-sdk/client-auto-scaling": "^3.624.0",
"@types/archiver": "^6.0.3",
"archiver": "^7.0.1",
"axios": "^1.7.4",
"winston": "^3.11.0"
},
Expand Down
4 changes: 2 additions & 2 deletions packages/backend-common/src/sqs.ts
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ const generateOutputSignedUrls = async (
const fileName = `${id}${translate ? '-translation' : ''}`;
const expiresIn = expiresInDays * 24 * 60 * 60;
const srtKey = `srt/${fileName}.srt`;
const jsonKey = `json/${fileName}.json`;
const jsonKey = `zip/${fileName}.zip`;
const textKey = `text/${fileName}.txt`;
const srtSignedS3Url = await getSignedUploadUrl(
region,
Expand Down Expand Up @@ -337,6 +337,6 @@ const generateOutputSignedUrls = async (
return {
srt: { url: srtSignedS3Url, key: srtKey },
text: { url: textSignedS3Url, key: textKey },
json: { url: jsonSignedS3Url, key: jsonKey },
zip: { url: jsonSignedS3Url, key: jsonKey },
};
};
31 changes: 31 additions & 0 deletions packages/backend-common/src/zip.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import * as stream from 'stream';
import archiver from 'archiver';
import { Transcripts } from '../../worker/src/transcribe';
import { promisify } from 'util';

/**
 * Bundles the srt, text and json transcripts into one zip archive that is
 * built entirely in memory.
 *
 * The archive contains three entries: `transcript.srt`, `transcript.txt` and
 * `transcript.json`.
 *
 * @param files - transcripts to archive; the `srt`, `text` and `json` fields are read
 * @returns a Blob of type `application/zip` holding the complete archive
 * @throws rejects with the underlying error if the archiver or the buffering
 *   stream fails, instead of waiting forever for a stream that never ends
 */
export const getZipBlob = async (files: Transcripts) => {
  // Maximum zlib compression level
  const archive = archiver('zip', { zlib: { level: 9 } });
  const bufferStream = new stream.PassThrough();
  const chunks: Uint8Array[] = [];

  // Collect the zip output as it is produced
  bufferStream.on('data', (chunk) => chunks.push(chunk));

  // pipe() does not forward errors to its destination. Without this listener
  // a failing archive would leave bufferStream open and the wait below would
  // never settle.
  archive.on('error', (err: Error) => bufferStream.destroy(err));

  archive.pipe(bufferStream);

  archive.append(files.srt, { name: 'transcript.srt' });
  archive.append(files.text, { name: 'transcript.txt' });
  archive.append(files.json, { name: 'transcript.json' });

  // Start waiting on the stream before finalizing, and await finalize() as
  // well so that its rejection is observed rather than left unhandled.
  const streamFinished = promisify(stream.finished)(bufferStream);
  await Promise.all([archive.finalize(), streamFinished]);

  return new Blob(chunks, { type: 'application/zip' });
};
4 changes: 2 additions & 2 deletions packages/common/src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,15 @@ export type SignedUrl = z.infer<typeof SignedUrl>;
/**
 * Zod schema describing the output locations of a job: one entry per output
 * file kind, each validated against the SignedUrl schema.
 */
const OutputBucketUrls = z.object({
srt: SignedUrl,
text: SignedUrl,
json: SignedUrl,
zip: SignedUrl,
});

/** Static type inferred from the OutputBucketUrls schema. */
export type OutputBucketUrls = z.infer<typeof OutputBucketUrls>;

/**
 * Zod schema holding one string key per output file kind (the `key` part of
 * the corresponding output URL entry, without the signed URL itself).
 */
const OutputBucketKeys = z.object({
srt: z.string(),
text: z.string(),
json: z.string(),
zip: z.string(),
});

/** Static type inferred from the OutputBucketKeys schema. */
export type OutputBucketKeys = z.infer<typeof OutputBucketKeys>;
Expand Down
8 changes: 6 additions & 2 deletions packages/worker/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,10 @@ const pollTranscriptionQueue = async (
file: fileToTranscribe,
numberOfThreads,
model: config.app.stage === 'PROD' ? 'medium' : 'tiny',
subtitleFormat:
job.transcriptDestinationService === DestinationService.Giant
? 'vtt'
: 'srt',
};

const transcriptResult = await getTranscriptionText(
Expand Down Expand Up @@ -301,7 +305,7 @@ const pollTranscriptionQueue = async (

const outputBucketKeys: OutputBucketKeys = {
srt: outputBucketUrls.srt.key,
json: outputBucketUrls.json.key,
zip: outputBucketUrls.zip.key,
text: outputBucketUrls.text.key,
};

Expand All @@ -318,7 +322,7 @@ const pollTranscriptionQueue = async (
translationOutputBucketKeys: job.translationOutputBucketUrls &&
transcriptResult.transcriptTranslations && {
srt: job.translationOutputBucketUrls.srt.key,
json: job.translationOutputBucketUrls.json.key,
zip: job.translationOutputBucketUrls.zip.key,
text: job.translationOutputBucketUrls.text.key,
},
isTranslation: job.translate,
Expand Down
16 changes: 13 additions & 3 deletions packages/worker/src/transcribe.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ import {
} from '@guardian/transcription-service-common';
import { runSpawnCommand } from '@guardian/transcription-service-backend-common/src/process';

/** Subtitle output formats; selects between whisper's `--output-vtt` and `--output-srt` flags. */
type SubtitleFormat = 'srt' | 'vtt';

interface FfmpegResult {
wavPath: string;
duration?: number;
Expand Down Expand Up @@ -38,6 +40,7 @@ export type WhisperBaseParams = {
file: string;
numberOfThreads: number;
model: WhisperModel;
subtitleFormat: SubtitleFormat;
};

const CONTAINER_FOLDER = '/input';
Expand Down Expand Up @@ -139,14 +142,15 @@ const runTranscription = async (
const params = whisperParams(
false,
whisperBaseParams.wavPath,
whisperBaseParams.subtitleFormat,
languageCode,
translate,
);
const { fileName, metadata } = await runWhisper(whisperBaseParams, params);

const srtPath = path.resolve(
path.parse(whisperBaseParams.file).dir,
`${fileName}.srt`,
`${fileName}.vtt`,
);
const textPath = path.resolve(
path.parse(whisperBaseParams.file).dir,
Expand Down Expand Up @@ -176,7 +180,11 @@ const transcribeAndTranslate = async (
whisperBaseParams: WhisperBaseParams,
): Promise<TranscriptionResult> => {
try {
const dlParams = whisperParams(true, whisperBaseParams.wavPath);
const dlParams = whisperParams(
true,
whisperBaseParams.wavPath,
whisperBaseParams.subtitleFormat,
);
const { metadata } = await runWhisper(whisperBaseParams, dlParams);
const languageCode =
languageCodes.find((c) => c === metadata.detectedLanguageCode) || 'auto';
Expand Down Expand Up @@ -245,6 +253,7 @@ const extractWhisperStderrData = (stderr: string): TranscriptionMetadata => {
const whisperParams = (
detectLanguageOnly: boolean,
file: string,
subtitleFormat: SubtitleFormat = 'srt',
languageCode: LanguageCode = 'auto',
translate: boolean = false,
) => {
Expand All @@ -255,8 +264,9 @@ const whisperParams = (
const containerOutputFilePath = path.resolve(CONTAINER_FOLDER, fileName);
logger.info(`Transcription output file path: ${containerOutputFilePath}`);
const translateParam: string[] = translate ? ['--translate'] : [];
logger.warn(`subtitleFormat is ${subtitleFormat}`);
return [
'--output-srt',
subtitleFormat == 'vtt' ? '--output-vtt' : '--output-srt',
'--output-txt',
'--output-json',
'--output-file',
Expand Down
16 changes: 14 additions & 2 deletions packages/worker/src/util.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import {
uploadToS3,
type OutputBucketUrls,
} from '@guardian/transcription-service-common';
import { getZipBlob } from '@guardian/transcription-service-backend-common/src/zip';

export const uploadAllTranscriptsToS3 = async (
destinationBucketUrls: OutputBucketUrls,
Expand All @@ -12,13 +13,24 @@ export const uploadAllTranscriptsToS3 = async (
const getBlob = (file: string) => new Blob([file as BlobPart]);
const blobs: [string, string, Blob][] = [
['srt', destinationBucketUrls.srt.url, getBlob(files.srt)],
['json', destinationBucketUrls.json.url, getBlob(files.json)],
['json', destinationBucketUrls.zip.url, getBlob(files.json)],
['text', destinationBucketUrls.text.url, getBlob(files.text)],
];

const zipBlob = await getZipBlob(files);

console.log(`zipBlob.type: ${zipBlob.type}`);

for (const blobDetail of blobs) {
const [fileFormat, url, blob] = blobDetail;
const response = await uploadToS3(url, blob);

const blobTest = blobDetail[0] === 'json' ? zipBlob : blob;

if (blobDetail[0] === 'json') {
console.log(`s3 url is: ${url}`);
}
const response = await uploadToS3(url, blobTest);

if (!response.isSuccess) {
throw new Error(
`Could not upload file format: ${fileFormat} to S3! ${response.errorMsg}`,
Expand Down

0 comments on commit c80db41

Please sign in to comment.