Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

zip implementation in progress #120

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
595 changes: 593 additions & 2 deletions package-lock.json

Large diffs are not rendered by default.

8 changes: 5 additions & 3 deletions packages/backend-common/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,16 @@
"name": "@guardian/transcription-service-backend-common",
"version": "1.0.0",
"dependencies": {
"@aws-sdk/client-auto-scaling": "^3.624.0",
"@aws-sdk/client-cloudwatch": "^3.624.0",
"@aws-sdk/client-dynamodb": "3.624.0",
"@aws-sdk/client-s3": "^3.624.0",
"@aws-sdk/client-secrets-manager": "^3.624.0",
"@aws-sdk/client-sqs": "^3.624.0",
"@aws-sdk/client-ssm": "^3.624.0",
"@aws-sdk/lib-dynamodb": "3.624.0",
"@aws-sdk/client-secrets-manager": "^3.624.0",
"@aws-sdk/client-cloudwatch": "^3.624.0",
"@aws-sdk/client-auto-scaling": "^3.624.0",
"@types/archiver": "^6.0.3",
"archiver": "^7.0.1",
"axios": "^1.7.4",
"winston": "^3.11.0"
},
Expand Down
4 changes: 2 additions & 2 deletions packages/backend-common/src/sqs.ts
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ const generateOutputSignedUrls = async (
const fileName = `${id}${translate ? '-translation' : ''}`;
const expiresIn = expiresInDays * 24 * 60 * 60;
const srtKey = `srt/${fileName}.srt`;
const jsonKey = `json/${fileName}.json`;
const jsonKey = `zip/${fileName}.zip`;
const textKey = `text/${fileName}.txt`;
const srtSignedS3Url = await getSignedUploadUrl(
region,
Expand Down Expand Up @@ -337,6 +337,6 @@ const generateOutputSignedUrls = async (
return {
srt: { url: srtSignedS3Url, key: srtKey },
text: { url: textSignedS3Url, key: textKey },
json: { url: jsonSignedS3Url, key: jsonKey },
zip: { url: jsonSignedS3Url, key: jsonKey },
};
};
31 changes: 31 additions & 0 deletions packages/backend-common/src/zip.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import * as stream from 'stream';
import archiver from 'archiver';
import { Transcripts } from '../../worker/src/transcribe';
import { promisify } from 'util';

/**
 * Bundles the srt, text and json transcripts into a single in-memory zip
 * archive.
 *
 * The archive entries are named `transcript.srt`, `transcript.txt` and
 * `transcript.json`, compressed at zlib level 9.
 *
 * @param files - transcript contents; `files.srt`, `files.text` and
 *   `files.json` are each appended as one archive entry.
 * @returns a Blob of type `application/zip` holding the finished archive.
 * @throws rejects with the underlying error if the archiver or the
 *   collecting stream fails, instead of waiting forever.
 */
export const getZipBlob = async (files: Transcripts) => {
	// Create an archive stream and an in-memory sink for its output
	const archive = archiver('zip', { zlib: { level: 9 } });
	const bufferStream = new stream.PassThrough();
	const chunks: Uint8Array[] = [];

	// Listen for 'data' events to collect chunks of the zip file
	bufferStream.on('data', (chunk) => chunks.push(chunk));

	// pipe() does not forward errors to its destination. Without this handler
	// an archiver failure would leave bufferStream open and the await below
	// would never settle.
	archive.on('error', (err: Error) => bufferStream.destroy(err));

	// Start watching the sink before any data flows so that neither its
	// completion nor its failure can be missed.
	const streamFinished = promisify(stream.finished)(bufferStream);

	// Pipe the archive data to the buffer stream
	archive.pipe(bufferStream);

	// Add files to the archive
	archive.append(files.srt, { name: 'transcript.srt' });
	archive.append(files.text, { name: 'transcript.txt' });
	archive.append(files.json, { name: 'transcript.json' });

	// finalize() returns a promise; await it together with the sink so a
	// rejection from either side propagates to the caller.
	await Promise.all([archive.finalize(), streamFinished]);

	return new Blob(chunks, { type: 'application/zip' });
};
4 changes: 2 additions & 2 deletions packages/common/src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,15 @@ export type SignedUrl = z.infer<typeof SignedUrl>;
const OutputBucketUrls = z.object({
srt: SignedUrl,
text: SignedUrl,
json: SignedUrl,
zip: SignedUrl,
});

export type OutputBucketUrls = z.infer<typeof OutputBucketUrls>;

const OutputBucketKeys = z.object({
srt: z.string(),
text: z.string(),
json: z.string(),
zip: z.string(),
});

export type OutputBucketKeys = z.infer<typeof OutputBucketKeys>;
Expand Down
8 changes: 6 additions & 2 deletions packages/worker/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,10 @@ const pollTranscriptionQueue = async (
file: fileToTranscribe,
numberOfThreads,
model: config.app.stage === 'PROD' ? 'medium' : 'tiny',
subtitleFormat:
job.transcriptDestinationService === DestinationService.Giant
? 'vtt'
: 'srt',
};

const transcriptResult = await getTranscriptionText(
Expand Down Expand Up @@ -301,7 +305,7 @@ const pollTranscriptionQueue = async (

const outputBucketKeys: OutputBucketKeys = {
srt: outputBucketUrls.srt.key,
json: outputBucketUrls.json.key,
zip: outputBucketUrls.zip.key,
text: outputBucketUrls.text.key,
};

Expand All @@ -318,7 +322,7 @@ const pollTranscriptionQueue = async (
translationOutputBucketKeys: job.translationOutputBucketUrls &&
transcriptResult.transcriptTranslations && {
srt: job.translationOutputBucketUrls.srt.key,
json: job.translationOutputBucketUrls.json.key,
zip: job.translationOutputBucketUrls.zip.key,
text: job.translationOutputBucketUrls.text.key,
},
isTranslation: job.translate,
Expand Down
16 changes: 13 additions & 3 deletions packages/worker/src/transcribe.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ import {
} from '@guardian/transcription-service-common';
import { runSpawnCommand } from '@guardian/transcription-service-backend-common/src/process';

type SubtitleFormat = 'srt' | 'vtt';

interface FfmpegResult {
wavPath: string;
duration?: number;
Expand Down Expand Up @@ -38,6 +40,7 @@ export type WhisperBaseParams = {
file: string;
numberOfThreads: number;
model: WhisperModel;
subtitleFormat: SubtitleFormat;
};

const CONTAINER_FOLDER = '/input';
Expand Down Expand Up @@ -139,14 +142,15 @@ const runTranscription = async (
const params = whisperParams(
false,
whisperBaseParams.wavPath,
whisperBaseParams.subtitleFormat,
languageCode,
translate,
);
const { fileName, metadata } = await runWhisper(whisperBaseParams, params);

const srtPath = path.resolve(
path.parse(whisperBaseParams.file).dir,
`${fileName}.srt`,
`${fileName}.vtt`,
);
const textPath = path.resolve(
path.parse(whisperBaseParams.file).dir,
Expand Down Expand Up @@ -176,7 +180,11 @@ const transcribeAndTranslate = async (
whisperBaseParams: WhisperBaseParams,
): Promise<TranscriptionResult> => {
try {
const dlParams = whisperParams(true, whisperBaseParams.wavPath);
const dlParams = whisperParams(
true,
whisperBaseParams.wavPath,
whisperBaseParams.subtitleFormat,
);
const { metadata } = await runWhisper(whisperBaseParams, dlParams);
const languageCode =
languageCodes.find((c) => c === metadata.detectedLanguageCode) || 'auto';
Expand Down Expand Up @@ -245,6 +253,7 @@ const extractWhisperStderrData = (stderr: string): TranscriptionMetadata => {
const whisperParams = (
detectLanguageOnly: boolean,
file: string,
subtitleFormat: SubtitleFormat = 'srt',
languageCode: LanguageCode = 'auto',
translate: boolean = false,
) => {
Expand All @@ -255,8 +264,9 @@ const whisperParams = (
const containerOutputFilePath = path.resolve(CONTAINER_FOLDER, fileName);
logger.info(`Transcription output file path: ${containerOutputFilePath}`);
const translateParam: string[] = translate ? ['--translate'] : [];
logger.warn(`subtitleFormat is ${subtitleFormat}`);
return [
'--output-srt',
subtitleFormat == 'vtt' ? '--output-vtt' : '--output-srt',
'--output-txt',
'--output-json',
'--output-file',
Expand Down
16 changes: 14 additions & 2 deletions packages/worker/src/util.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import {
uploadToS3,
type OutputBucketUrls,
} from '@guardian/transcription-service-common';
import { getZipBlob } from '@guardian/transcription-service-backend-common/src/zip';

export const uploadAllTranscriptsToS3 = async (
destinationBucketUrls: OutputBucketUrls,
Expand All @@ -12,13 +13,24 @@ export const uploadAllTranscriptsToS3 = async (
const getBlob = (file: string) => new Blob([file as BlobPart]);
const blobs: [string, string, Blob][] = [
['srt', destinationBucketUrls.srt.url, getBlob(files.srt)],
['json', destinationBucketUrls.json.url, getBlob(files.json)],
['json', destinationBucketUrls.zip.url, getBlob(files.json)],
['text', destinationBucketUrls.text.url, getBlob(files.text)],
];

const zipBlob = await getZipBlob(files);

console.log(`zipBlob.type: ${zipBlob.type}`);

for (const blobDetail of blobs) {
const [fileFormat, url, blob] = blobDetail;
const response = await uploadToS3(url, blob);

const blobTest = blobDetail[0] === 'json' ? zipBlob : blob;

if (blobDetail[0] === 'json') {
console.log(`s3 url is: ${url}`);
}
const response = await uploadToS3(url, blobTest);

if (!response.isSuccess) {
throw new Error(
`Could not upload file format: ${fileFormat} to S3! ${response.errorMsg}`,
Expand Down
Loading