From 529dbe3fabb9ccceb2052de20681004806af8681 Mon Sep 17 00:00:00 2001
From: MahmoudAshraf97 <hassouna97.ma@gmail.com>
Date: Fri, 10 May 2024 23:50:37 +0300
Subject: [PATCH 1/8] initial code: store best previous index in alphas

---
 src/libtorchaudio/forced_align/cpu/compute.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp
index 81f5f0a459..76c70c34bd 100644
--- a/src/libtorchaudio/forced_align/cpu/compute.cpp
+++ b/src/libtorchaudio/forced_align/cpu/compute.cpp
@@ -28,12 +28,12 @@ void forced_align_impl(
                                  .device(logProbs.device())
                                  .dtype(logProbs.dtype()))
                              .fill_(kNegInfinity);
-  torch::Tensor backPtr = torch::empty({T, S}, torch::kInt8).fill_(-1);
+  // Instead of backPtr, we will store the best previous index in the alphas tensor.
+  // The last column of the alphas tensor will be used to store this information.
   auto logProbs_a = logProbs.accessor<scalar_t, 3>();
   auto targets_a = targets.accessor<target_t, 2>();
   auto paths_a = paths.accessor<target_t, 2>();
   auto alphas_a = alphas.accessor<scalar_t, 2>();
-  auto backPtr_a = backPtr.accessor<int8_t, 2>();
   auto R = 0;
   for (auto i = 1; i < L; i++) {
     if (targets_a[batchIndex][i] == targets_a[batchIndex][i - 1]) {
@@ -80,7 +80,7 @@ void forced_align_impl(
     if (start == 0) {
       alphas_a[curIdxOffset][0] =
           alphas_a[prevIdxOffset][0] + logProbs_a[batchIndex][t][blank];
-      backPtr_a[t][0] = 0;
+      alphas_a[curIdxOffset][S - 1] = 0; // Store the best previous index
       startloop += 1;
     }
 
@@ -102,24 +102,24 @@ void forced_align_impl(
       scalar_t result = 0.0;
       if (x2 > x1 && x2 > x0) {
         result = x2;
-        backPtr_a[t][i] = 2;
+        alphas_a[curIdxOffset][S - 1] = i - 2; // Store the best previous index
       } else if (x1 > x0 && x1 > x2) {
         result = x1;
-        backPtr_a[t][i] = 1;
+        alphas_a[curIdxOffset][S - 1] = i - 1; // Store the best previous index
       } else {
         result = x0;
-        backPtr_a[t][i] = 0;
+        alphas_a[curIdxOffset][S - 1] = i; // Store the best previous index
       }
       alphas_a[curIdxOffset][i] = result + logProbs_a[batchIndex][t][labelIdx];
     }
   }
   auto idx1 = (T - 1) % 2;
-  auto ltrIdx = alphas_a[idx1][S - 1] > alphas_a[idx1][S - 2] ? S - 1 : S - 2;
+  auto ltrIdx = alphas_a[idx1][S - 1]; 
   // path stores the token index for each time step after force alignment.
   for (auto t = T - 1; t > -1; t--) {
     auto lbl_idx = ltrIdx % 2 == 0 ? blank : targets_a[batchIndex][ltrIdx / 2];
     paths_a[batchIndex][t] = lbl_idx;
-    ltrIdx -= backPtr_a[t][ltrIdx];
+    ltrIdx = alphas_a[(t) % 2][S - 1]; // Retrieve the best previous index
   }
 }
 

From da405745f5976791a293813887abc52b5537216b Mon Sep 17 00:00:00 2001
From: MahmoudAshraf97 <hassouna97.ma@gmail.com>
Date: Sat, 11 May 2024 01:12:31 +0300
Subject: [PATCH 2/8] cast ltrIdx to int before the parity check

---
 src/libtorchaudio/forced_align/cpu/compute.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp
index 76c70c34bd..74a35c6f69 100644
--- a/src/libtorchaudio/forced_align/cpu/compute.cpp
+++ b/src/libtorchaudio/forced_align/cpu/compute.cpp
@@ -117,7 +117,8 @@ void forced_align_impl(
   auto ltrIdx = alphas_a[idx1][S - 1]; 
   // path stores the token index for each time step after force alignment.
   for (auto t = T - 1; t > -1; t--) {
-    auto lbl_idx = ltrIdx % 2 == 0 ? blank : targets_a[batchIndex][ltrIdx / 2];
+    auto lbl_idx = static_cast<int>(ltrIdx) % 2 == 0 ? blank : targets_a[batchIndex][ltrIdx / 2]; 
+    // auto lbl_idx = ltrIdx % 2 == 0 ? blank : targets_a[batchIndex][ltrIdx / 2];
     paths_a[batchIndex][t] = lbl_idx;
     ltrIdx = alphas_a[(t) % 2][S - 1]; // Retrieve the best previous index
   }

From 6b33076877a882050ad7ff22e1c3896860a94817 Mon Sep 17 00:00:00 2001
From: MahmoudAshraf97 <hassouna97.ma@gmail.com>
Date: Sat, 11 May 2024 15:02:15 +0300
Subject: [PATCH 3/8] back to origins: backPtr as two std::vector<bool>

---
 .../forced_align/cpu/compute.cpp              | 30 ++++++++++++-------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp
index 74a35c6f69..26e9aae059 100644
--- a/src/libtorchaudio/forced_align/cpu/compute.cpp
+++ b/src/libtorchaudio/forced_align/cpu/compute.cpp
@@ -28,8 +28,10 @@ void forced_align_impl(
                                  .device(logProbs.device())
                                  .dtype(logProbs.dtype()))
                              .fill_(kNegInfinity);
-  // Instead of backPtr, we will store the best previous index in the alphas tensor.
-  // The last column of the alphas tensor will be used to store this information.
+  // Replace backPtr tensor with two std::vector<bool>
+  std::vector<bool> backPtrBit0(T * S, false);
+  std::vector<bool> backPtrBit1(T * S, false);
+
   auto logProbs_a = logProbs.accessor<scalar_t, 3>();
   auto targets_a = targets.accessor<target_t, 2>();
   auto paths_a = paths.accessor<target_t, 2>();
@@ -80,7 +82,9 @@ void forced_align_impl(
     if (start == 0) {
       alphas_a[curIdxOffset][0] =
           alphas_a[prevIdxOffset][0] + logProbs_a[batchIndex][t][blank];
-      alphas_a[curIdxOffset][S - 1] = 0; // Store the best previous index
+      // Set backPtr bits for t and 0
+      // backPtrBit0[t * S + 0] = false;
+      // backPtrBit1[t * S + 0] = false;
       startloop += 1;
     }
 
@@ -100,30 +104,34 @@ void forced_align_impl(
         x2 = alphas_a[prevIdxOffset][i - 2];
       }
       scalar_t result = 0.0;
+      // Update backPtr bits based on the maximum value
       if (x2 > x1 && x2 > x0) {
         result = x2;
-        alphas_a[curIdxOffset][S - 1] = i - 2; // Store the best previous index
+        // backPtrBit0[t * S + i] = false;
+        backPtrBit1[t * S + i] = true;
       } else if (x1 > x0 && x1 > x2) {
         result = x1;
-        alphas_a[curIdxOffset][S - 1] = i - 1; // Store the best previous index
+        backPtrBit0[t * S + i] = true;
+        // backPtrBit1[t * S + i] = false;
       } else {
         result = x0;
-        alphas_a[curIdxOffset][S - 1] = i; // Store the best previous index
+        // backPtrBit0[t * S + i] = false;
+        // backPtrBit1[t * S + i] = false;
       }
       alphas_a[curIdxOffset][i] = result + logProbs_a[batchIndex][t][labelIdx];
     }
   }
   auto idx1 = (T - 1) % 2;
-  auto ltrIdx = alphas_a[idx1][S - 1]; 
+  auto ltrIdx = alphas_a[idx1][S - 1] > alphas_a[idx1][S - 2] ? S - 1 : S - 2;
   // path stores the token index for each time step after force alignment.
   for (auto t = T - 1; t > -1; t--) {
-    auto lbl_idx = static_cast<int>(ltrIdx) % 2 == 0 ? blank : targets_a[batchIndex][ltrIdx / 2]; 
-    // auto lbl_idx = ltrIdx % 2 == 0 ? blank : targets_a[batchIndex][ltrIdx / 2];
+    auto lbl_idx = ltrIdx % 2 == 0 ? blank : targets_a[batchIndex][ltrIdx / 2];
     paths_a[batchIndex][t] = lbl_idx;
-    ltrIdx = alphas_a[(t) % 2][S - 1]; // Retrieve the best previous index
+    // Calculate backPtr value from bits
+    int backPtrValue = (backPtrBit1[t * S + ltrIdx] << 1) | backPtrBit0[t * S + ltrIdx];
+    ltrIdx -= backPtrValue;
   }
 }
-
 std::tuple<torch::Tensor, torch::Tensor> compute(
     const torch::Tensor& logProbs,
     const torch::Tensor& targets,

From 6afc7cdaf0d1d92d57a316a8a58819c915dc234c Mon Sep 17 00:00:00 2001
From: MahmoudAshraf97 <hassouna97.ma@gmail.com>
Date: Sat, 11 May 2024 16:25:55 +0300
Subject: [PATCH 4/8] final: clean up commented-out backPtr code

---
 src/libtorchaudio/forced_align/cpu/compute.cpp | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp
index 26e9aae059..ea117d7142 100644
--- a/src/libtorchaudio/forced_align/cpu/compute.cpp
+++ b/src/libtorchaudio/forced_align/cpu/compute.cpp
@@ -82,9 +82,6 @@ void forced_align_impl(
     if (start == 0) {
       alphas_a[curIdxOffset][0] =
           alphas_a[prevIdxOffset][0] + logProbs_a[batchIndex][t][blank];
-      // Set backPtr bits for t and 0
-      // backPtrBit0[t * S + 0] = false;
-      // backPtrBit1[t * S + 0] = false;
       startloop += 1;
     }
 
@@ -104,19 +101,14 @@ void forced_align_impl(
         x2 = alphas_a[prevIdxOffset][i - 2];
       }
       scalar_t result = 0.0;
-      // Update backPtr bits based on the maximum value
       if (x2 > x1 && x2 > x0) {
         result = x2;
-        // backPtrBit0[t * S + i] = false;
         backPtrBit1[t * S + i] = true;
       } else if (x1 > x0 && x1 > x2) {
         result = x1;
         backPtrBit0[t * S + i] = true;
-        // backPtrBit1[t * S + i] = false;
       } else {
         result = x0;
-        // backPtrBit0[t * S + i] = false;
-        // backPtrBit1[t * S + i] = false;
       }
       alphas_a[curIdxOffset][i] = result + logProbs_a[batchIndex][t][labelIdx];
     }
@@ -128,10 +120,10 @@ void forced_align_impl(
     auto lbl_idx = ltrIdx % 2 == 0 ? blank : targets_a[batchIndex][ltrIdx / 2];
     paths_a[batchIndex][t] = lbl_idx;
     // Calculate backPtr value from bits
-    int backPtrValue = (backPtrBit1[t * S + ltrIdx] << 1) | backPtrBit0[t * S + ltrIdx];
-    ltrIdx -= backPtrValue;
+    ltrIdx -= (backPtrBit1[t * S + ltrIdx] << 1) | backPtrBit0[t * S + ltrIdx];
   }
 }
+
 std::tuple<torch::Tensor, torch::Tensor> compute(
     const torch::Tensor& logProbs,
     const torch::Tensor& targets,

From 15c4a9be79c34dbd7a48d6ebe6065c3083137a19 Mon Sep 17 00:00:00 2001
From: MahmoudAshraf97 <hassouna97.ma@gmail.com>
Date: Sun, 12 May 2024 18:25:51 +0300
Subject: [PATCH 5/8] reduce the `backPtr` size because the first row is always
 unused

---
 src/libtorchaudio/forced_align/cpu/compute.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp
index ea117d7142..082b18482e 100644
--- a/src/libtorchaudio/forced_align/cpu/compute.cpp
+++ b/src/libtorchaudio/forced_align/cpu/compute.cpp
@@ -29,8 +29,8 @@ void forced_align_impl(
                                  .dtype(logProbs.dtype()))
                              .fill_(kNegInfinity);
   // Replace backPtr tensor with two std::vector<bool>
-  std::vector<bool> backPtrBit0(T * S, false);
-  std::vector<bool> backPtrBit1(T * S, false);
+  std::vector<bool> backPtrBit0((T-1) * S, false);
+  std::vector<bool> backPtrBit1((T-1) * S, false);
 
   auto logProbs_a = logProbs.accessor<scalar_t, 3>();
   auto targets_a = targets.accessor<target_t, 2>();
@@ -103,10 +103,10 @@ void forced_align_impl(
       scalar_t result = 0.0;
       if (x2 > x1 && x2 > x0) {
         result = x2;
-        backPtrBit1[t * S + i] = true;
+        backPtrBit1[(t-1) * S + i] = true;
       } else if (x1 > x0 && x1 > x2) {
         result = x1;
-        backPtrBit0[t * S + i] = true;
+        backPtrBit0[(t-1) * S + i] = true;
       } else {
         result = x0;
       }
@@ -120,7 +120,7 @@ void forced_align_impl(
     auto lbl_idx = ltrIdx % 2 == 0 ? blank : targets_a[batchIndex][ltrIdx / 2];
     paths_a[batchIndex][t] = lbl_idx;
     // Calculate backPtr value from bits
-    ltrIdx -= (backPtrBit1[t * S + ltrIdx] << 1) | backPtrBit0[t * S + ltrIdx];
+    ltrIdx -= (backPtrBit1[(t-1) * S + ltrIdx] << 1) | backPtrBit0[(t-1) * S + ltrIdx];
   }
 }
 

From 6ebe49ce3beb270f81260dddcca410d2f90ee64a Mon Sep 17 00:00:00 2001
From: Mahmoud Ashraf <hassouna97.ma@gmail.com>
Date: Tue, 14 May 2024 21:23:27 +0300
Subject: [PATCH 6/8] implement better trellis matrix structure

---
 .../forced_align/cpu/compute.cpp              | 23 ++++++++++++++-----
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp
index 082b18482e..e1f19bee29 100644
--- a/src/libtorchaudio/forced_align/cpu/compute.cpp
+++ b/src/libtorchaudio/forced_align/cpu/compute.cpp
@@ -28,10 +28,14 @@ void forced_align_impl(
                                  .device(logProbs.device())
                                  .dtype(logProbs.dtype()))
                              .fill_(kNegInfinity);
-  // Replace backPtr tensor with two std::vector<bool>
-  std::vector<bool> backPtrBit0((T-1) * S, false);
-  std::vector<bool> backPtrBit1((T-1) * S, false);
 
+  // Replace backPtr tensor with two std::vector<bool>
+  // allocate memory based on the expected needed size which is approximately
+  // S * (T-L), we will use a safety margin of (T-L) to avoid reallocation
+  std::vector<bool> backPtrBit0((S + 1) * (T - L), false);
+  std::vector<bool> backPtrBit1((S + 1) * (T - L), false);
+  unsigned int backPtr_offset[T - 1];
+  unsigned int backPtr_seek[T - 1];
   auto logProbs_a = logProbs.accessor<scalar_t, 3>();
   auto targets_a = targets.accessor<target_t, 2>();
   auto paths_a = paths.accessor<target_t, 2>();
@@ -56,6 +60,7 @@ void forced_align_impl(
     auto labelIdx = (i % 2 == 0) ? blank : targets_a[batchIndex][i / 2];
     alphas_a[0][i] = logProbs_a[batchIndex][0][labelIdx];
   }
+  unsigned int seek = 0;
   for (auto t = 1; t < T; t++) {
     if (T - t <= L + R) {
       if ((start % 2 == 1) &&
@@ -79,10 +84,13 @@ void forced_align_impl(
     for (auto j = 0; j < S; ++j) {
       alphas_a[curIdxOffset][j] = -std::numeric_limits<scalar_t>::infinity();
     }
+    backPtr_seek[t - 1] = seek;
+    backPtr_offset[t - 1] = start;
     if (start == 0) {
       alphas_a[curIdxOffset][0] =
           alphas_a[prevIdxOffset][0] + logProbs_a[batchIndex][t][blank];
       startloop += 1;
+      seek += 1;
     }
 
     for (auto i = startloop; i < end; i++) {
@@ -103,15 +111,16 @@ void forced_align_impl(
       scalar_t result = 0.0;
       if (x2 > x1 && x2 > x0) {
         result = x2;
-        backPtrBit1[(t-1) * S + i] = true;
+        backPtrBit1[seek + i - startloop] = true;
       } else if (x1 > x0 && x1 > x2) {
         result = x1;
-        backPtrBit0[(t-1) * S + i] = true;
+        backPtrBit0[seek + i - startloop] = true;
       } else {
         result = x0;
       }
       alphas_a[curIdxOffset][i] = result + logProbs_a[batchIndex][t][labelIdx];
     }
+    seek += (end - startloop);
   }
   auto idx1 = (T - 1) % 2;
   auto ltrIdx = alphas_a[idx1][S - 1] > alphas_a[idx1][S - 2] ? S - 1 : S - 2;
@@ -120,7 +129,9 @@ void forced_align_impl(
     auto lbl_idx = ltrIdx % 2 == 0 ? blank : targets_a[batchIndex][ltrIdx / 2];
     paths_a[batchIndex][t] = lbl_idx;
     // Calculate backPtr value from bits
-    ltrIdx -= (backPtrBit1[(t-1) * S + ltrIdx] << 1) | backPtrBit0[(t-1) * S + ltrIdx];
+    auto backPtr_idx = backPtr_seek[std::max(t - 1, static_cast<long int>(0))] +
+        ltrIdx - backPtr_offset[std::max(t - 1, static_cast<long int>(0))];
+    ltrIdx -= (backPtrBit1[backPtr_idx] << 1) | backPtrBit0[backPtr_idx];
   }
 }
 

From 437e2a56e317852984aa26693d7f0f1e2299d0a9 Mon Sep 17 00:00:00 2001
From: Mahmoud Ashraf <hassouna97.ma@gmail.com>
Date: Sun, 19 May 2024 13:57:25 +0300
Subject: [PATCH 7/8] avoid seek overflow

---
 src/libtorchaudio/forced_align/cpu/compute.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp
index e1f19bee29..d8753de397 100644
--- a/src/libtorchaudio/forced_align/cpu/compute.cpp
+++ b/src/libtorchaudio/forced_align/cpu/compute.cpp
@@ -34,8 +34,8 @@ void forced_align_impl(
   // S * (T-L), we will use a safety margin of (T-L) to avoid reallocation
   std::vector<bool> backPtrBit0((S + 1) * (T - L), false);
   std::vector<bool> backPtrBit1((S + 1) * (T - L), false);
-  unsigned int backPtr_offset[T - 1];
-  unsigned int backPtr_seek[T - 1];
+  unsigned long long backPtr_offset[T - 1];
+  unsigned long long backPtr_seek[T - 1];
   auto logProbs_a = logProbs.accessor<scalar_t, 3>();
   auto targets_a = targets.accessor<target_t, 2>();
   auto paths_a = paths.accessor<target_t, 2>();
@@ -60,7 +60,7 @@ void forced_align_impl(
     auto labelIdx = (i % 2 == 0) ? blank : targets_a[batchIndex][i / 2];
     alphas_a[0][i] = logProbs_a[batchIndex][0][labelIdx];
   }
-  unsigned int seek = 0;
+  unsigned long long seek = 0;
   for (auto t = 1; t < T; t++) {
     if (T - t <= L + R) {
       if ((start % 2 == 1) &&

From c1a562a0b4ff7b331fe74e014d4e54c96d7b3c46 Mon Sep 17 00:00:00 2001
From: Mahmoud Ashraf <hassouna97.ma@gmail.com>
Date: Wed, 19 Jun 2024 17:39:36 +0300
Subject: [PATCH 8/8] fix building on mac and windows

* initial code

* .

* back to origins

* final

* reduce the `backPtr` size because the first row is always unused

* dynamic vectors

* fix casting

* .

* preallocation

* fix casting

* missing ;

* prevent redundant setting

* missing ;

* fixed backPtr indexing

* fixed initial size

* fix seek update

* wrap up

* implement better trellis matrix structure

* avoid seek overflow

* fix building on mac and windows
---
 src/libtorchaudio/forced_align/cpu/compute.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp
index d8753de397..1bc68826ec 100644
--- a/src/libtorchaudio/forced_align/cpu/compute.cpp
+++ b/src/libtorchaudio/forced_align/cpu/compute.cpp
@@ -34,8 +34,8 @@ void forced_align_impl(
   // S * (T-L), we will use a safety margin of (T-L) to avoid reallocation
   std::vector<bool> backPtrBit0((S + 1) * (T - L), false);
   std::vector<bool> backPtrBit1((S + 1) * (T - L), false);
-  unsigned long long backPtr_offset[T - 1];
-  unsigned long long backPtr_seek[T - 1];
+  std::vector<unsigned long long> backPtr_offset(T - 1);
+  std::vector<unsigned long long> backPtr_seek(T - 1);
   auto logProbs_a = logProbs.accessor<scalar_t, 3>();
   auto targets_a = targets.accessor<target_t, 2>();
   auto paths_a = paths.accessor<target_t, 2>();
@@ -129,8 +129,9 @@ void forced_align_impl(
     auto lbl_idx = ltrIdx % 2 == 0 ? blank : targets_a[batchIndex][ltrIdx / 2];
     paths_a[batchIndex][t] = lbl_idx;
     // Calculate backPtr value from bits
-    auto backPtr_idx = backPtr_seek[std::max(t - 1, static_cast<long int>(0))] +
-        ltrIdx - backPtr_offset[std::max(t - 1, static_cast<long int>(0))];
+    auto t_minus_one = t - 1 >= 0 ? t - 1 : 0;
+    auto backPtr_idx = backPtr_seek[t_minus_one] +
+                       ltrIdx - backPtr_offset[t_minus_one];
     ltrIdx -= (backPtrBit1[backPtr_idx] << 1) | backPtrBit0[backPtr_idx];
   }
 }