[Inductor][CPP] Fix the vec codegen for tanh (pytorch#148254)

leslie-fang-intel · pytorchmergebot · commit 165e33531c48 · 2025-03-03T11:46:57.000Z
**Summary**
Fixes pytorch#148241. The previous vectorized code generation for `tanh` used a decomposed implementation, leading to numerical differences that were further amplified by `atan2`. For example, in the given test case after `tanh`, the eager output at `[0,0,11,47]` was `-5.820766091346741e-10`, while the compiled output was `1.4319084584712982e-08`. Because the two values differ in sign, `atan2` produced different outputs: `-2.3561` and `0.7853`, respectively. This issue is fixed by switching to the Sleef implementation.

**Test Plan**
```
python -u -m pytest -s -v test/inductor/test_cpu_repro.py -k test_tanh_atan2
```

Pull Request resolved: pytorch#148254
Approved by: https://github.com/malfet, https://github.com/jgong5
diff --git a/test/inductor/test_cpu_repro.py b/test/inductor/test_cpu_repro.py
@@ -1028,6 +1028,21 @@ def fn(a):
         a = torch.randn(1, 3)
         self.common(fn, (a,))
 
+    def test_tanh_atan2(self):
+        # https://github.com/pytorch/pytorch/issues/148241
+        class Model(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.shrink = nn.Tanhshrink()
+
+            def forward(self, x):
+                x = self.shrink(x)
+                x = torch.atan2(x, x)
+                return x
+
+        x = torch.randn(1, 3, 64, 64)
+        self.common(Model(), (x,))
+
     def test_index_propagation_issue_102065(self):
         def fn(x):
             x = torch.arange(x.numel())
diff --git a/torch/_inductor/codegen/cpp.py b/torch/_inductor/codegen/cpp.py
@@ -1406,10 +1406,7 @@ def tan(a):
 
     @staticmethod
     def tanh(a):
-        vec_one = f"decltype({a})(1)"
-        vec_two = f"decltype({a})(2)"
-        vec_minus_two = f"decltype({a})(-2)"
-        return f"{vec_two} / ({vec_one} + ({vec_minus_two} * {a}).exp()) - {vec_one}"
+        return f"{a}.tanh()"
 
     @staticmethod
     def reciprocal(a):