From 529dbe3fabb9ccceb2052de20681004806af8681 Mon Sep 17 00:00:00 2001 From: MahmoudAshraf97 Date: Fri, 10 May 2024 23:50:37 +0300 Subject: [PATCH 1/8] initial code --- src/libtorchaudio/forced_align/cpu/compute.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp index 81f5f0a459..76c70c34bd 100644 --- a/src/libtorchaudio/forced_align/cpu/compute.cpp +++ b/src/libtorchaudio/forced_align/cpu/compute.cpp @@ -28,12 +28,12 @@ void forced_align_impl( .device(logProbs.device()) .dtype(logProbs.dtype())) .fill_(kNegInfinity); - torch::Tensor backPtr = torch::empty({T, S}, torch::kInt8).fill_(-1); + // Instead of backPtr, we will store the best previous index in the alphas tensor. + // The last column of the alphas tensor will be used to store this information. auto logProbs_a = logProbs.accessor(); auto targets_a = targets.accessor(); auto paths_a = paths.accessor(); auto alphas_a = alphas.accessor(); - auto backPtr_a = backPtr.accessor(); auto R = 0; for (auto i = 1; i < L; i++) { if (targets_a[batchIndex][i] == targets_a[batchIndex][i - 1]) { @@ -80,7 +80,7 @@ void forced_align_impl( if (start == 0) { alphas_a[curIdxOffset][0] = alphas_a[prevIdxOffset][0] + logProbs_a[batchIndex][t][blank]; - backPtr_a[t][0] = 0; + alphas_a[curIdxOffset][S - 1] = 0; // Store the best previous index startloop += 1; } @@ -102,24 +102,24 @@ void forced_align_impl( scalar_t result = 0.0; if (x2 > x1 && x2 > x0) { result = x2; - backPtr_a[t][i] = 2; + alphas_a[curIdxOffset][S - 1] = i - 2; // Store the best previous index } else if (x1 > x0 && x1 > x2) { result = x1; - backPtr_a[t][i] = 1; + alphas_a[curIdxOffset][S - 1] = i - 1; // Store the best previous index } else { result = x0; - backPtr_a[t][i] = 0; + alphas_a[curIdxOffset][S - 1] = i; // Store the best previous index } alphas_a[curIdxOffset][i] = result + logProbs_a[batchIndex][t][labelIdx]; } } auto idx1 
= (T - 1) % 2; - auto ltrIdx = alphas_a[idx1][S - 1] > alphas_a[idx1][S - 2] ? S - 1 : S - 2; + auto ltrIdx = alphas_a[idx1][S - 1]; // path stores the token index for each time step after force alignment. for (auto t = T - 1; t > -1; t--) { auto lbl_idx = ltrIdx % 2 == 0 ? blank : targets_a[batchIndex][ltrIdx / 2]; paths_a[batchIndex][t] = lbl_idx; - ltrIdx -= backPtr_a[t][ltrIdx]; + ltrIdx = alphas_a[(t) % 2][S - 1]; // Retrieve the best previous index } } From da405745f5976791a293813887abc52b5537216b Mon Sep 17 00:00:00 2001 From: MahmoudAshraf97 Date: Sat, 11 May 2024 01:12:31 +0300 Subject: [PATCH 2/8] . --- src/libtorchaudio/forced_align/cpu/compute.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp index 76c70c34bd..74a35c6f69 100644 --- a/src/libtorchaudio/forced_align/cpu/compute.cpp +++ b/src/libtorchaudio/forced_align/cpu/compute.cpp @@ -117,7 +117,8 @@ void forced_align_impl( auto ltrIdx = alphas_a[idx1][S - 1]; // path stores the token index for each time step after force alignment. for (auto t = T - 1; t > -1; t--) { - auto lbl_idx = ltrIdx % 2 == 0 ? blank : targets_a[batchIndex][ltrIdx / 2]; + auto lbl_idx = static_cast(ltrIdx) % 2 == 0 ? blank : targets_a[batchIndex][ltrIdx / 2]; + // auto lbl_idx = ltrIdx % 2 == 0 ? 
blank : targets_a[batchIndex][ltrIdx / 2]; paths_a[batchIndex][t] = lbl_idx; ltrIdx = alphas_a[(t) % 2][S - 1]; // Retrieve the best previous index } From 6b33076877a882050ad7ff22e1c3896860a94817 Mon Sep 17 00:00:00 2001 From: MahmoudAshraf97 Date: Sat, 11 May 2024 15:02:15 +0300 Subject: [PATCH 3/8] back to origins --- .../forced_align/cpu/compute.cpp | 30 ++++++++++++------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp index 74a35c6f69..26e9aae059 100644 --- a/src/libtorchaudio/forced_align/cpu/compute.cpp +++ b/src/libtorchaudio/forced_align/cpu/compute.cpp @@ -28,8 +28,10 @@ void forced_align_impl( .device(logProbs.device()) .dtype(logProbs.dtype())) .fill_(kNegInfinity); - // Instead of backPtr, we will store the best previous index in the alphas tensor. - // The last column of the alphas tensor will be used to store this information. + // Replace backPtr tensor with two std::vector + std::vector backPtrBit0(T * S, false); + std::vector backPtrBit1(T * S, false); + auto logProbs_a = logProbs.accessor(); auto targets_a = targets.accessor(); auto paths_a = paths.accessor(); @@ -80,7 +82,9 @@ void forced_align_impl( if (start == 0) { alphas_a[curIdxOffset][0] = alphas_a[prevIdxOffset][0] + logProbs_a[batchIndex][t][blank]; - alphas_a[curIdxOffset][S - 1] = 0; // Store the best previous index + // Set backPtr bits for t and 0 + // backPtrBit0[t * S + 0] = false; + // backPtrBit1[t * S + 0] = false; startloop += 1; } @@ -100,30 +104,34 @@ void forced_align_impl( x2 = alphas_a[prevIdxOffset][i - 2]; } scalar_t result = 0.0; + // Update backPtr bits based on the maximum value if (x2 > x1 && x2 > x0) { result = x2; - alphas_a[curIdxOffset][S - 1] = i - 2; // Store the best previous index + // backPtrBit0[t * S + i] = false; + backPtrBit1[t * S + i] = true; } else if (x1 > x0 && x1 > x2) { result = x1; - alphas_a[curIdxOffset][S - 1] = i - 1; // Store the 
best previous index + backPtrBit0[t * S + i] = true; + // backPtrBit1[t * S + i] = false; } else { result = x0; - alphas_a[curIdxOffset][S - 1] = i; // Store the best previous index + // backPtrBit0[t * S + i] = false; + // backPtrBit1[t * S + i] = false; } alphas_a[curIdxOffset][i] = result + logProbs_a[batchIndex][t][labelIdx]; } } auto idx1 = (T - 1) % 2; - auto ltrIdx = alphas_a[idx1][S - 1]; + auto ltrIdx = alphas_a[idx1][S - 1] > alphas_a[idx1][S - 2] ? S - 1 : S - 2; // path stores the token index for each time step after force alignment. for (auto t = T - 1; t > -1; t--) { - auto lbl_idx = static_cast(ltrIdx) % 2 == 0 ? blank : targets_a[batchIndex][ltrIdx / 2]; - // auto lbl_idx = ltrIdx % 2 == 0 ? blank : targets_a[batchIndex][ltrIdx / 2]; + auto lbl_idx = ltrIdx % 2 == 0 ? blank : targets_a[batchIndex][ltrIdx / 2]; paths_a[batchIndex][t] = lbl_idx; - ltrIdx = alphas_a[(t) % 2][S - 1]; // Retrieve the best previous index + // Calculate backPtr value from bits + int backPtrValue = (backPtrBit1[t * S + ltrIdx] << 1) | backPtrBit0[t * S + ltrIdx]; + ltrIdx -= backPtrValue; } } - std::tuple compute( const torch::Tensor& logProbs, const torch::Tensor& targets, From 6afc7cdaf0d1d92d57a316a8a58819c915dc234c Mon Sep 17 00:00:00 2001 From: MahmoudAshraf97 Date: Sat, 11 May 2024 16:25:55 +0300 Subject: [PATCH 4/8] final --- src/libtorchaudio/forced_align/cpu/compute.cpp | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp index 26e9aae059..ea117d7142 100644 --- a/src/libtorchaudio/forced_align/cpu/compute.cpp +++ b/src/libtorchaudio/forced_align/cpu/compute.cpp @@ -82,9 +82,6 @@ void forced_align_impl( if (start == 0) { alphas_a[curIdxOffset][0] = alphas_a[prevIdxOffset][0] + logProbs_a[batchIndex][t][blank]; - // Set backPtr bits for t and 0 - // backPtrBit0[t * S + 0] = false; - // backPtrBit1[t * S + 0] = false; startloop += 1; } @@ -104,19 
+101,14 @@ void forced_align_impl( x2 = alphas_a[prevIdxOffset][i - 2]; } scalar_t result = 0.0; - // Update backPtr bits based on the maximum value if (x2 > x1 && x2 > x0) { result = x2; - // backPtrBit0[t * S + i] = false; backPtrBit1[t * S + i] = true; } else if (x1 > x0 && x1 > x2) { result = x1; backPtrBit0[t * S + i] = true; - // backPtrBit1[t * S + i] = false; } else { result = x0; - // backPtrBit0[t * S + i] = false; - // backPtrBit1[t * S + i] = false; } alphas_a[curIdxOffset][i] = result + logProbs_a[batchIndex][t][labelIdx]; } @@ -128,10 +120,10 @@ void forced_align_impl( auto lbl_idx = ltrIdx % 2 == 0 ? blank : targets_a[batchIndex][ltrIdx / 2]; paths_a[batchIndex][t] = lbl_idx; // Calculate backPtr value from bits - int backPtrValue = (backPtrBit1[t * S + ltrIdx] << 1) | backPtrBit0[t * S + ltrIdx]; - ltrIdx -= backPtrValue; + ltrIdx -= (backPtrBit1[t * S + ltrIdx] << 1) | backPtrBit0[t * S + ltrIdx]; } } + std::tuple compute( const torch::Tensor& logProbs, const torch::Tensor& targets, From 15c4a9be79c34dbd7a48d6ebe6065c3083137a19 Mon Sep 17 00:00:00 2001 From: MahmoudAshraf97 Date: Sun, 12 May 2024 18:25:51 +0300 Subject: [PATCH 5/8] reduce the `backPtr` size because the first row is always unused --- src/libtorchaudio/forced_align/cpu/compute.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp index ea117d7142..082b18482e 100644 --- a/src/libtorchaudio/forced_align/cpu/compute.cpp +++ b/src/libtorchaudio/forced_align/cpu/compute.cpp @@ -29,8 +29,8 @@ void forced_align_impl( .dtype(logProbs.dtype())) .fill_(kNegInfinity); // Replace backPtr tensor with two std::vector - std::vector backPtrBit0(T * S, false); - std::vector backPtrBit1(T * S, false); + std::vector backPtrBit0((T-1) * S, false); + std::vector backPtrBit1((T-1) * S, false); auto logProbs_a = logProbs.accessor(); auto targets_a = targets.accessor(); @@ -103,10 
+103,10 @@ void forced_align_impl( scalar_t result = 0.0; if (x2 > x1 && x2 > x0) { result = x2; - backPtrBit1[t * S + i] = true; + backPtrBit1[(t-1) * S + i] = true; } else if (x1 > x0 && x1 > x2) { result = x1; - backPtrBit0[t * S + i] = true; + backPtrBit0[(t-1) * S + i] = true; } else { result = x0; } @@ -120,7 +120,7 @@ void forced_align_impl( auto lbl_idx = ltrIdx % 2 == 0 ? blank : targets_a[batchIndex][ltrIdx / 2]; paths_a[batchIndex][t] = lbl_idx; // Calculate backPtr value from bits - ltrIdx -= (backPtrBit1[t * S + ltrIdx] << 1) | backPtrBit0[t * S + ltrIdx]; + ltrIdx -= (backPtrBit1[(t-1) * S + ltrIdx] << 1) | backPtrBit0[(t-1) * S + ltrIdx]; } } From 6ebe49ce3beb270f81260dddcca410d2f90ee64a Mon Sep 17 00:00:00 2001 From: Mahmoud Ashraf Date: Tue, 14 May 2024 21:23:27 +0300 Subject: [PATCH 6/8] implement better trellis matrix structure --- .../forced_align/cpu/compute.cpp | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp index 082b18482e..e1f19bee29 100644 --- a/src/libtorchaudio/forced_align/cpu/compute.cpp +++ b/src/libtorchaudio/forced_align/cpu/compute.cpp @@ -28,10 +28,14 @@ void forced_align_impl( .device(logProbs.device()) .dtype(logProbs.dtype())) .fill_(kNegInfinity); - // Replace backPtr tensor with two std::vector - std::vector backPtrBit0((T-1) * S, false); - std::vector backPtrBit1((T-1) * S, false); + // Replace backPtr tensor with two std::vector + // allocate memory based on the expected needed size which is approximately + // S * (T-L), we will use a safety margin of (T-L) to avoid reallocation + std::vector backPtrBit0((S + 1) * (T - L), false); + std::vector backPtrBit1((S + 1) * (T - L), false); + unsigned int backPtr_offset[T - 1]; + unsigned int backPtr_seek[T - 1]; auto logProbs_a = logProbs.accessor(); auto targets_a = targets.accessor(); auto paths_a = paths.accessor(); @@ -56,6 +60,7 @@ void 
forced_align_impl( auto labelIdx = (i % 2 == 0) ? blank : targets_a[batchIndex][i / 2]; alphas_a[0][i] = logProbs_a[batchIndex][0][labelIdx]; } + unsigned int seek = 0; for (auto t = 1; t < T; t++) { if (T - t <= L + R) { if ((start % 2 == 1) && @@ -79,10 +84,13 @@ void forced_align_impl( for (auto j = 0; j < S; ++j) { alphas_a[curIdxOffset][j] = -std::numeric_limits::infinity(); } + backPtr_seek[t - 1] = seek; + backPtr_offset[t - 1] = start; if (start == 0) { alphas_a[curIdxOffset][0] = alphas_a[prevIdxOffset][0] + logProbs_a[batchIndex][t][blank]; startloop += 1; + seek += 1; } for (auto i = startloop; i < end; i++) { @@ -103,15 +111,16 @@ void forced_align_impl( scalar_t result = 0.0; if (x2 > x1 && x2 > x0) { result = x2; - backPtrBit1[(t-1) * S + i] = true; + backPtrBit1[seek + i - startloop] = true; } else if (x1 > x0 && x1 > x2) { result = x1; - backPtrBit0[(t-1) * S + i] = true; + backPtrBit0[seek + i - startloop] = true; } else { result = x0; } alphas_a[curIdxOffset][i] = result + logProbs_a[batchIndex][t][labelIdx]; } + seek += (end - startloop); } auto idx1 = (T - 1) % 2; auto ltrIdx = alphas_a[idx1][S - 1] > alphas_a[idx1][S - 2] ? S - 1 : S - 2; @@ -120,7 +129,9 @@ void forced_align_impl( auto lbl_idx = ltrIdx % 2 == 0 ? 
blank : targets_a[batchIndex][ltrIdx / 2]; paths_a[batchIndex][t] = lbl_idx; // Calculate backPtr value from bits - ltrIdx -= (backPtrBit1[(t-1) * S + ltrIdx] << 1) | backPtrBit0[(t-1) * S + ltrIdx]; + auto backPtr_idx = backPtr_seek[std::max(t - 1, static_cast(0))] + + ltrIdx - backPtr_offset[std::max(t - 1, static_cast(0))]; + ltrIdx -= (backPtrBit1[backPtr_idx] << 1) | backPtrBit0[backPtr_idx]; } } From 437e2a56e317852984aa26693d7f0f1e2299d0a9 Mon Sep 17 00:00:00 2001 From: Mahmoud Ashraf Date: Sun, 19 May 2024 13:57:25 +0300 Subject: [PATCH 7/8] avoid seek overflow --- src/libtorchaudio/forced_align/cpu/compute.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp index e1f19bee29..d8753de397 100644 --- a/src/libtorchaudio/forced_align/cpu/compute.cpp +++ b/src/libtorchaudio/forced_align/cpu/compute.cpp @@ -34,8 +34,8 @@ void forced_align_impl( // S * (T-L), we will use a safety margin of (T-L) to avoid reallocation std::vector backPtrBit0((S + 1) * (T - L), false); std::vector backPtrBit1((S + 1) * (T - L), false); - unsigned int backPtr_offset[T - 1]; - unsigned int backPtr_seek[T - 1]; + unsigned long long backPtr_offset[T - 1]; + unsigned long long backPtr_seek[T - 1]; auto logProbs_a = logProbs.accessor(); auto targets_a = targets.accessor(); auto paths_a = paths.accessor(); @@ -60,7 +60,7 @@ void forced_align_impl( auto labelIdx = (i % 2 == 0) ? blank : targets_a[batchIndex][i / 2]; alphas_a[0][i] = logProbs_a[batchIndex][0][labelIdx]; } - unsigned int seek = 0; + unsigned long long seek = 0; for (auto t = 1; t < T; t++) { if (T - t <= L + R) { if ((start % 2 == 1) && From c1a562a0b4ff7b331fe74e014d4e54c96d7b3c46 Mon Sep 17 00:00:00 2001 From: Mahmoud Ashraf Date: Wed, 19 Jun 2024 17:39:36 +0300 Subject: [PATCH 8/8] fix building on mac and windows * initial code * . 
* back to origins * final * reduce the `backPtr` size because the first row is always unused * dynamic vectors * fix casting * . * preallocation * fix casting * missing ; * prevent redundant setting * missing ; * fixed backPtr indexing * fixed initial size * fix seek update * wrap up * implement better trellis matrix structure * avoid seek overflow * fix building on mac and windows --- src/libtorchaudio/forced_align/cpu/compute.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp index d8753de397..1bc68826ec 100644 --- a/src/libtorchaudio/forced_align/cpu/compute.cpp +++ b/src/libtorchaudio/forced_align/cpu/compute.cpp @@ -34,8 +34,8 @@ void forced_align_impl( // S * (T-L), we will use a safety margin of (T-L) to avoid reallocation std::vector backPtrBit0((S + 1) * (T - L), false); std::vector backPtrBit1((S + 1) * (T - L), false); - unsigned long long backPtr_offset[T - 1]; - unsigned long long backPtr_seek[T - 1]; + std::vector backPtr_offset(T - 1); + std::vector backPtr_seek(T - 1); auto logProbs_a = logProbs.accessor(); auto targets_a = targets.accessor(); auto paths_a = paths.accessor(); @@ -129,8 +129,9 @@ void forced_align_impl( auto lbl_idx = ltrIdx % 2 == 0 ? blank : targets_a[batchIndex][ltrIdx / 2]; paths_a[batchIndex][t] = lbl_idx; // Calculate backPtr value from bits - auto backPtr_idx = backPtr_seek[std::max(t - 1, static_cast(0))] + - ltrIdx - backPtr_offset[std::max(t - 1, static_cast(0))]; + auto t_minus_one = t - 1 >= 0 ? t - 1 : 0; + auto backPtr_idx = backPtr_seek[t_minus_one] + + ltrIdx - backPtr_offset[t_minus_one]; ltrIdx -= (backPtrBit1[backPtr_idx] << 1) | backPtrBit0[backPtr_idx]; } }