diff --git a/rpg_cpp/rpg b/rpg_cpp/rpg index e5628849..2c77056c 100755 Binary files a/rpg_cpp/rpg and b/rpg_cpp/rpg differ diff --git a/rpg_cpp/thirdparty/fftw-3.3.10/.Makefile 2.am.icloud b/rpg_cpp/thirdparty/fftw-3.3.10/.Makefile 2.am.icloud deleted file mode 100644 index 7226dd57..00000000 Binary files a/rpg_cpp/thirdparty/fftw-3.3.10/.Makefile 2.am.icloud and /dev/null differ diff --git a/rpg_cpp/thirdparty/fftw-3.3.10/.Makefile.am.icloud b/rpg_cpp/thirdparty/fftw-3.3.10/.Makefile.am.icloud deleted file mode 100644 index 3e9f86c9..00000000 Binary files a/rpg_cpp/thirdparty/fftw-3.3.10/.Makefile.am.icloud and /dev/null differ diff --git a/rpg_cpp/thirdparty/fftw-3.3.10/README b/rpg_cpp/thirdparty/fftw-3.3.10/README deleted file mode 100644 index 51babd1e..00000000 --- a/rpg_cpp/thirdparty/fftw-3.3.10/README +++ /dev/null @@ -1,65 +0,0 @@ -FFTW is a free collection of fast C routines for computing the -Discrete Fourier Transform in one or more dimensions. It includes -complex, real, symmetric, and parallel transforms, and can handle -arbitrary array sizes efficiently. FFTW is typically faster than -other publically-available FFT implementations, and is even -competitive with vendor-tuned libraries. (See our web page -http://fftw.org/ for extensive benchmarks.) To achieve this -performance, FFTW uses novel code-generation and runtime -self-optimization techniques (along with many other tricks). - -The doc/ directory contains the manual in texinfo, PDF, info, and HTML -formats. Frequently asked questions and answers can be found in the -doc/FAQ/ directory in ASCII and HTML. - -For a quick introduction to calling FFTW, see the "Tutorial" section -of the manual. - -INSTALLATION ------------- - -INSTALLATION FROM AN OFFICIAL RELEASE: - -Please read chapter 10 "Installation and Customization" of the manual. -In short: - - ./configure - make - make install - -INSTALLATION FROM THE GIT REPOSITORY: - -First, install these programs: - - ocaml, ocamlbuild, autoconf, automake, indent, and libtool. - -You also need the ocaml Num library, which was standard in Ocaml but -was removed without warning in OCaml 4.06.0 (3 Nov 2017). On Fedora -30, try installing the ocaml-num-devel package. - -Then, execute - - sh bootstrap.sh - make - -The bootstrap.sh script runs configure directly, but if you need to -re-run configure, you must pass the --enable-maintainer-mode flag: - - ./configure --enable-maintainer-mode [OTHER CONFIGURE FLAGS] - -Alternatively, you can run - - sh mkdist.sh - -which will run the entire bootstrapping process and generate -.tar.gz files similar to those for official releases. - -CONTACTS --------- - -FFTW was written by Matteo Frigo and Steven G. Johnson. You can -contact them at fftw@fftw.org. The latest version of FFTW, -benchmarks, links, and other information can be found at the FFTW home -page (http://www.fftw.org). You can also sign up to the fftw-announce -Google group to receive (infrequent) updates and information about new -releases. diff --git a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t1_10.c b/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t1_10.c deleted file mode 100644 index 1736e787..00000000 --- a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t1_10.c +++ /dev/null @@ -1,489 +0,0 @@ -/* - * Copyright (c) 2003, 2007-14 Matteo Frigo - * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - */ - -/* This file was automatically generated --- DO NOT EDIT */ -/* Generated on Tue Sep 14 10:44:27 EDT 2021 */ - -#include "dft/codelet-dft.h" - -#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA) - -/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 10 -name t1_10 -include dft/scalar/t.h */ - -/* - * This function contains 102 FP additions, 72 FP multiplications, - * (or, 48 additions, 18 multiplications, 54 fused multiply/add), - * 47 stack variables, 4 constants, and 40 memory accesses - */ -#include "dft/scalar/t.h" - -static void t1_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - DK(KP951056516, +0.951056516295153572116439333379382143405698634); - DK(KP559016994, +0.559016994374947424102293417182819058860154590); - DK(KP618033988, +0.618033988749894848204586834365638117720309180); - DK(KP250000000, +0.250000000000000000000000000000000000000000000); - { - INT m; - for (m = mb, W = W + (mb * 18); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 18, MAKE_VOLATILE_STRIDE(20, rs)) { - E T8, T23, T12, T1U, TM, TZ, T10, T1F, T1G, T1P, T16, T17, T18, T1s, T1x; - E T25, Tl, Ty, Tz, T1I, T1J, T1O, T13, T14, T15, T1h, T1m, T24; - { - E T1, T1T, T3, T6, T4, T1R, T2, T7, T1S, T5; - T1 = ri[0]; - T1T = ii[0]; - T3 = ri[WS(rs, 5)]; - T6 = ii[WS(rs, 5)]; - T2 = W[8]; - T4 = T2 * T3; - T1R = T2 * T6; - T5 = W[9]; - T7 = FMA(T5, T6, T4); - T1S = FNMS(T5, T3, T1R); - T8 = T1 - T7; - T23 = T1T - T1S; - T12 = T1 + T7; - T1U = T1S + T1T; - } - { - E TF, T1p, TY, T1w, TL, T1r, TS, T1u; - { - E TB, TE, TC, T1o, TA, TD; - TB = ri[WS(rs, 4)]; - TE = ii[WS(rs, 4)]; - TA = W[6]; - TC = TA * TB; - T1o = TA * TE; - TD = W[7]; - TF = FMA(TD, TE, TC); - T1p = FNMS(TD, TB, T1o); - } - { - E TU, TX, TV, T1v, TT, TW; - TU = ri[WS(rs, 1)]; - TX = ii[WS(rs, 1)]; - TT = W[0]; - TV = TT * TU; - T1v = TT * TX; - TW = W[1]; - TY = FMA(TW, TX, TV); - T1w = FNMS(TW, TU, T1v); - } - { - E TH, TK, TI, T1q, TG, TJ; - TH = ri[WS(rs, 9)]; - TK = ii[WS(rs, 9)]; - TG = W[16]; - TI = TG * TH; - T1q = TG * TK; - TJ = W[17]; - TL = FMA(TJ, TK, TI); - T1r = FNMS(TJ, TH, T1q); - } - { - E TO, TR, TP, T1t, TN, TQ; - TO = ri[WS(rs, 6)]; - TR = ii[WS(rs, 6)]; - TN = W[10]; - TP = TN * TO; - T1t = TN * TR; - TQ = W[11]; - TS = FMA(TQ, TR, TP); - T1u = FNMS(TQ, TO, T1t); - } - TM = TF - TL; - TZ = TS - TY; - T10 = TM + TZ; - T1F = T1p + T1r; - T1G = T1u + T1w; - T1P = T1F + T1G; - T16 = TF + TL; - T17 = TS + TY; - T18 = T16 + T17; - T1s = T1p - T1r; - T1x = T1u - T1w; - T25 = T1s + T1x; - } - { - E Te, T1e, Tx, T1l, Tk, T1g, Tr, T1j; - { - E Ta, Td, Tb, T1d, T9, Tc; - Ta = ri[WS(rs, 2)]; - Td = ii[WS(rs, 2)]; - T9 = W[2]; - Tb = T9 * Ta; - T1d = T9 * Td; - Tc = W[3]; - Te = FMA(Tc, Td, Tb); - T1e = FNMS(Tc, Ta, T1d); - } - { - E Tt, Tw, Tu, T1k, Ts, Tv; - Tt = ri[WS(rs, 3)]; - Tw = ii[WS(rs, 3)]; - Ts = W[4]; - Tu = Ts * Tt; - T1k = Ts * Tw; - Tv = W[5]; - Tx = FMA(Tv, Tw, Tu); - T1l = FNMS(Tv, Tt, T1k); - } - { - E Tg, Tj, Th, T1f, Tf, Ti; - Tg = ri[WS(rs, 7)]; - Tj = ii[WS(rs, 7)]; - Tf = W[12]; - Th = Tf * Tg; - T1f = Tf * Tj; - Ti = W[13]; - Tk = FMA(Ti, Tj, Th); - T1g = FNMS(Ti, Tg, T1f); - } - { - E Tn, Tq, To, T1i, Tm, Tp; - Tn = ri[WS(rs, 8)]; - Tq = ii[WS(rs, 8)]; - Tm = W[14]; - To = Tm * Tn; - T1i = Tm * Tq; - Tp = W[15]; - Tr = FMA(Tp, Tq, To); - T1j = FNMS(Tp, Tn, T1i); - } - Tl = Te - Tk; - Ty = Tr - Tx; - Tz = Tl + Ty; - T1I = T1e + T1g; - T1J = T1j + T1l; - T1O = T1I + T1J; - T13 = Te + Tk; - T14 = Tr + Tx; - T15 = T13 + T14; - T1h = T1e - T1g; - T1m = T1j - T1l; - T24 = T1h + T1m; - } - { - E T1b, T11, T1a, T1z, T1B, T1n, T1y, T1A, T1c; - T1b = Tz - T10; - T11 = Tz + T10; - T1a = FNMS(KP250000000, T11, T8); - T1n = T1h - T1m; - T1y = T1s - T1x; - T1z = FMA(KP618033988, T1y, T1n); - T1B = FNMS(KP618033988, T1n, T1y); - ri[WS(rs, 5)] = T8 + T11; - T1A = FNMS(KP559016994, T1b, T1a); - ri[WS(rs, 7)] = FNMS(KP951056516, T1B, T1A); - ri[WS(rs, 3)] = FMA(KP951056516, T1B, T1A); - T1c = FMA(KP559016994, T1b, T1a); - ri[WS(rs, 9)] = FNMS(KP951056516, T1z, T1c); - ri[WS(rs, 1)] = FMA(KP951056516, T1z, T1c); - } - { - E T28, T26, T27, T2c, T2e, T2a, T2b, T2d, T29; - T28 = T24 - T25; - T26 = T24 + T25; - T27 = FNMS(KP250000000, T26, T23); - T2a = Tl - Ty; - T2b = TM - TZ; - T2c = FMA(KP618033988, T2b, T2a); - T2e = FNMS(KP618033988, T2a, T2b); - ii[WS(rs, 5)] = T26 + T23; - T2d = FNMS(KP559016994, T28, T27); - ii[WS(rs, 3)] = FNMS(KP951056516, T2e, T2d); - ii[WS(rs, 7)] = FMA(KP951056516, T2e, T2d); - T29 = FMA(KP559016994, T28, T27); - ii[WS(rs, 1)] = FNMS(KP951056516, T2c, T29); - ii[WS(rs, 9)] = FMA(KP951056516, T2c, T29); - } - { - E T1D, T19, T1C, T1L, T1N, T1H, T1K, T1M, T1E; - T1D = T15 - T18; - T19 = T15 + T18; - T1C = FNMS(KP250000000, T19, T12); - T1H = T1F - T1G; - T1K = T1I - T1J; - T1L = FNMS(KP618033988, T1K, T1H); - T1N = FMA(KP618033988, T1H, T1K); - ri[0] = T12 + T19; - T1M = FMA(KP559016994, T1D, T1C); - ri[WS(rs, 4)] = FNMS(KP951056516, T1N, T1M); - ri[WS(rs, 6)] = FMA(KP951056516, T1N, T1M); - T1E = FNMS(KP559016994, T1D, T1C); - ri[WS(rs, 2)] = FNMS(KP951056516, T1L, T1E); - ri[WS(rs, 8)] = FMA(KP951056516, T1L, T1E); - } - { - E T1W, T1Q, T1V, T20, T22, T1Y, T1Z, T21, T1X; - T1W = T1O - T1P; - T1Q = T1O + T1P; - T1V = FNMS(KP250000000, T1Q, T1U); - T1Y = T16 - T17; - T1Z = T13 - T14; - T20 = FNMS(KP618033988, T1Z, T1Y); - T22 = FMA(KP618033988, T1Y, T1Z); - ii[0] = T1Q + T1U; - T21 = FMA(KP559016994, T1W, T1V); - ii[WS(rs, 4)] = FMA(KP951056516, T22, T21); - ii[WS(rs, 6)] = FNMS(KP951056516, T22, T21); - T1X = FNMS(KP559016994, T1W, T1V); - ii[WS(rs, 2)] = FMA(KP951056516, T20, T1X); - ii[WS(rs, 8)] = FNMS(KP951056516, T20, T1X); - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_FULL, 0, 10 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 10, "t1_10", twinstr, &GENUS, { 48, 18, 54, 0 }, 0, 0, 0 }; - -void X(codelet_t1_10) (planner *p) { - X(kdft_dit_register) (p, t1_10, &desc); -} -#else - -/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 10 -name t1_10 -include dft/scalar/t.h */ - -/* - * This function contains 102 FP additions, 60 FP multiplications, - * (or, 72 additions, 30 multiplications, 30 fused multiply/add), - * 45 stack variables, 4 constants, and 40 memory accesses - */ -#include "dft/scalar/t.h" - -static void t1_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - DK(KP587785252, +0.587785252292473129168705954639072768597652438); - DK(KP951056516, +0.951056516295153572116439333379382143405698634); - DK(KP250000000, +0.250000000000000000000000000000000000000000000); - DK(KP559016994, +0.559016994374947424102293417182819058860154590); - { - INT m; - for (m = mb, W = W + (mb * 18); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 18, MAKE_VOLATILE_STRIDE(20, rs)) { - E T7, T1O, TT, T1C, TF, TQ, TR, T1o, T1p, T1y, TX, TY, TZ, T1d, T1g; - E T1M, Ti, Tt, Tu, T1r, T1s, T1x, TU, TV, TW, T16, T19, T1L; - { - E T1, T1B, T6, T1A; - T1 = ri[0]; - T1B = ii[0]; - { - E T3, T5, T2, T4; - T3 = ri[WS(rs, 5)]; - T5 = ii[WS(rs, 5)]; - T2 = W[8]; - T4 = W[9]; - T6 = FMA(T2, T3, T4 * T5); - T1A = FNMS(T4, T3, T2 * T5); - } - T7 = T1 - T6; - T1O = T1B - T1A; - TT = T1 + T6; - T1C = T1A + T1B; - } - { - E Tz, T1b, TP, T1f, TE, T1c, TK, T1e; - { - E Tw, Ty, Tv, Tx; - Tw = ri[WS(rs, 4)]; - Ty = ii[WS(rs, 4)]; - Tv = W[6]; - Tx = W[7]; - Tz = FMA(Tv, Tw, Tx * Ty); - T1b = FNMS(Tx, Tw, Tv * Ty); - } - { - E TM, TO, TL, TN; - TM = ri[WS(rs, 1)]; - TO = ii[WS(rs, 1)]; - TL = W[0]; - TN = W[1]; - TP = FMA(TL, TM, TN * TO); - T1f = FNMS(TN, TM, TL * TO); - } - { - E TB, TD, TA, TC; - TB = ri[WS(rs, 9)]; - TD = ii[WS(rs, 9)]; - TA = W[16]; - TC = W[17]; - TE = FMA(TA, TB, TC * TD); - T1c = FNMS(TC, TB, TA * TD); - } - { - E TH, TJ, TG, TI; - TH = ri[WS(rs, 6)]; - TJ = ii[WS(rs, 6)]; - TG = W[10]; - TI = W[11]; - TK = FMA(TG, TH, TI * TJ); - T1e = FNMS(TI, TH, TG * TJ); - } - TF = Tz - TE; - TQ = TK - TP; - TR = TF + TQ; - T1o = T1b + T1c; - T1p = T1e + T1f; - T1y = T1o + T1p; - TX = Tz + TE; - TY = TK + TP; - TZ = TX + TY; - T1d = T1b - T1c; - T1g = T1e - T1f; - T1M = T1d + T1g; - } - { - E Tc, T14, Ts, T18, Th, T15, Tn, T17; - { - E T9, Tb, T8, Ta; - T9 = ri[WS(rs, 2)]; - Tb = ii[WS(rs, 2)]; - T8 = W[2]; - Ta = W[3]; - Tc = FMA(T8, T9, Ta * Tb); - T14 = FNMS(Ta, T9, T8 * Tb); - } - { - E Tp, Tr, To, Tq; - Tp = ri[WS(rs, 3)]; - Tr = ii[WS(rs, 3)]; - To = W[4]; - Tq = W[5]; - Ts = FMA(To, Tp, Tq * Tr); - T18 = FNMS(Tq, Tp, To * Tr); - } - { - E Te, Tg, Td, Tf; - Te = ri[WS(rs, 7)]; - Tg = ii[WS(rs, 7)]; - Td = W[12]; - Tf = W[13]; - Th = FMA(Td, Te, Tf * Tg); - T15 = FNMS(Tf, Te, Td * Tg); - } - { - E Tk, Tm, Tj, Tl; - Tk = ri[WS(rs, 8)]; - Tm = ii[WS(rs, 8)]; - Tj = W[14]; - Tl = W[15]; - Tn = FMA(Tj, Tk, Tl * Tm); - T17 = FNMS(Tl, Tk, Tj * Tm); - } - Ti = Tc - Th; - Tt = Tn - Ts; - Tu = Ti + Tt; - T1r = T14 + T15; - T1s = T17 + T18; - T1x = T1r + T1s; - TU = Tc + Th; - TV = Tn + Ts; - TW = TU + TV; - T16 = T14 - T15; - T19 = T17 - T18; - T1L = T16 + T19; - } - { - E T11, TS, T12, T1i, T1k, T1a, T1h, T1j, T13; - T11 = KP559016994 * (Tu - TR); - TS = Tu + TR; - T12 = FNMS(KP250000000, TS, T7); - T1a = T16 - T19; - T1h = T1d - T1g; - T1i = FMA(KP951056516, T1a, KP587785252 * T1h); - T1k = FNMS(KP587785252, T1a, KP951056516 * T1h); - ri[WS(rs, 5)] = T7 + TS; - T1j = T12 - T11; - ri[WS(rs, 7)] = T1j - T1k; - ri[WS(rs, 3)] = T1j + T1k; - T13 = T11 + T12; - ri[WS(rs, 9)] = T13 - T1i; - ri[WS(rs, 1)] = T13 + T1i; - } - { - E T1N, T1P, T1Q, T1U, T1W, T1S, T1T, T1V, T1R; - T1N = KP559016994 * (T1L - T1M); - T1P = T1L + T1M; - T1Q = FNMS(KP250000000, T1P, T1O); - T1S = Ti - Tt; - T1T = TF - TQ; - T1U = FMA(KP951056516, T1S, KP587785252 * T1T); - T1W = FNMS(KP587785252, T1S, KP951056516 * T1T); - ii[WS(rs, 5)] = T1P + T1O; - T1V = T1Q - T1N; - ii[WS(rs, 3)] = T1V - T1W; - ii[WS(rs, 7)] = T1W + T1V; - T1R = T1N + T1Q; - ii[WS(rs, 1)] = T1R - T1U; - ii[WS(rs, 9)] = T1U + T1R; - } - { - E T1m, T10, T1l, T1u, T1w, T1q, T1t, T1v, T1n; - T1m = KP559016994 * (TW - TZ); - T10 = TW + TZ; - T1l = FNMS(KP250000000, T10, TT); - T1q = T1o - T1p; - T1t = T1r - T1s; - T1u = FNMS(KP587785252, T1t, KP951056516 * T1q); - T1w = FMA(KP951056516, T1t, KP587785252 * T1q); - ri[0] = TT + T10; - T1v = T1m + T1l; - ri[WS(rs, 4)] = T1v - T1w; - ri[WS(rs, 6)] = T1v + T1w; - T1n = T1l - T1m; - ri[WS(rs, 2)] = T1n - T1u; - ri[WS(rs, 8)] = T1n + T1u; - } - { - E T1H, T1z, T1G, T1F, T1J, T1D, T1E, T1K, T1I; - T1H = KP559016994 * (T1x - T1y); - T1z = T1x + T1y; - T1G = FNMS(KP250000000, T1z, T1C); - T1D = TX - TY; - T1E = TU - TV; - T1F = FNMS(KP587785252, T1E, KP951056516 * T1D); - T1J = FMA(KP951056516, T1E, KP587785252 * T1D); - ii[0] = T1z + T1C; - T1K = T1H + T1G; - ii[WS(rs, 4)] = T1J + T1K; - ii[WS(rs, 6)] = T1K - T1J; - T1I = T1G - T1H; - ii[WS(rs, 2)] = T1F + T1I; - ii[WS(rs, 8)] = T1I - T1F; - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_FULL, 0, 10 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 10, "t1_10", twinstr, &GENUS, { 72, 30, 30, 0 }, 0, 0, 0 }; - -void X(codelet_t1_10) (planner *p) { - X(kdft_dit_register) (p, t1_10, &desc); -} -#endif diff --git a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t1_12.c b/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t1_12.c deleted file mode 100644 index f202ce2c..00000000 --- a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t1_12.c +++ /dev/null @@ -1,581 +0,0 @@ -/* - * Copyright (c) 2003, 2007-14 Matteo Frigo - * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - */ - -/* This file was automatically generated --- DO NOT EDIT */ -/* Generated on Tue Sep 14 10:44:28 EDT 2021 */ - -#include "dft/codelet-dft.h" - -#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA) - -/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 12 -name t1_12 -include dft/scalar/t.h */ - -/* - * This function contains 118 FP additions, 68 FP multiplications, - * (or, 72 additions, 22 multiplications, 46 fused multiply/add), - * 47 stack variables, 2 constants, and 48 memory accesses - */ -#include "dft/scalar/t.h" - -static void t1_12(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - DK(KP866025403, +0.866025403784438646763723170752936183471402627); - DK(KP500000000, +0.500000000000000000000000000000000000000000000); - { - INT m; - for (m = mb, W = W + (mb * 22); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 22, MAKE_VOLATILE_STRIDE(24, rs)) { - E T1, T2i, Tl, T2e, T10, T1Y, TG, T1S, Ty, T2r, T1s, T2f, T1d, T21, T1H; - E T1Z, Te, T2o, T1l, T2h, TT, T1V, T1A, T1T; - T1 = ri[0]; - T2i = ii[0]; - { - E Th, Tk, Ti, T2d, Tg, Tj; - Th = ri[WS(rs, 6)]; - Tk = ii[WS(rs, 6)]; - Tg = W[10]; - Ti = Tg * Th; - T2d = Tg * Tk; - Tj = W[11]; - Tl = FMA(Tj, Tk, Ti); - T2e = FNMS(Tj, Th, T2d); - } - { - E TW, TZ, TX, T1X, TV, TY; - TW = ri[WS(rs, 9)]; - TZ = ii[WS(rs, 9)]; - TV = W[16]; - TX = TV * TW; - T1X = TV * TZ; - TY = W[17]; - T10 = FMA(TY, TZ, TX); - T1Y = FNMS(TY, TW, T1X); - } - { - E TC, TF, TD, T1R, TB, TE; - TC = ri[WS(rs, 3)]; - TF = ii[WS(rs, 3)]; - TB = W[4]; - TD = TB * TC; - T1R = TB * TF; - TE = W[5]; - TG = FMA(TE, TF, TD); - T1S = FNMS(TE, TC, T1R); - } - { - E Tn, Tq, To, T1o, Tt, Tw, Tu, T1q, Tm, Ts; - Tn = ri[WS(rs, 10)]; - Tq = ii[WS(rs, 10)]; - Tm = W[18]; - To = Tm * Tn; - T1o = Tm * Tq; - Tt = ri[WS(rs, 2)]; - Tw = ii[WS(rs, 2)]; - Ts = W[2]; - Tu = Ts * Tt; - T1q = Ts * Tw; - { - E Tr, T1p, Tx, T1r, Tp, Tv; - Tp = W[19]; - Tr = FMA(Tp, Tq, To); - T1p = FNMS(Tp, Tn, T1o); - Tv = W[3]; - Tx = FMA(Tv, Tw, Tu); - T1r = FNMS(Tv, Tt, T1q); - Ty = Tr + Tx; - T2r = Tx - Tr; - T1s = T1p - T1r; - T2f = T1p + T1r; - } - } - { - E T12, T15, T13, T1D, T18, T1b, T19, T1F, T11, T17; - T12 = ri[WS(rs, 1)]; - T15 = ii[WS(rs, 1)]; - T11 = W[0]; - T13 = T11 * T12; - T1D = T11 * T15; - T18 = ri[WS(rs, 5)]; - T1b = ii[WS(rs, 5)]; - T17 = W[8]; - T19 = T17 * T18; - T1F = T17 * T1b; - { - E T16, T1E, T1c, T1G, T14, T1a; - T14 = W[1]; - T16 = FMA(T14, T15, T13); - T1E = FNMS(T14, T12, T1D); - T1a = W[9]; - T1c = FMA(T1a, T1b, T19); - T1G = FNMS(T1a, T18, T1F); - T1d = T16 + T1c; - T21 = T1c - T16; - T1H = T1E - T1G; - T1Z = T1E + T1G; - } - } - { - E T3, T6, T4, T1h, T9, Tc, Ta, T1j, T2, T8; - T3 = ri[WS(rs, 4)]; - T6 = ii[WS(rs, 4)]; - T2 = W[6]; - T4 = T2 * T3; - T1h = T2 * T6; - T9 = ri[WS(rs, 8)]; - Tc = ii[WS(rs, 8)]; - T8 = W[14]; - Ta = T8 * T9; - T1j = T8 * Tc; - { - E T7, T1i, Td, T1k, T5, Tb; - T5 = W[7]; - T7 = FMA(T5, T6, T4); - T1i = FNMS(T5, T3, T1h); - Tb = W[15]; - Td = FMA(Tb, Tc, Ta); - T1k = FNMS(Tb, T9, T1j); - Te = T7 + Td; - T2o = Td - T7; - T1l = T1i - T1k; - T2h = T1i + T1k; - } - } - { - E TI, TL, TJ, T1w, TO, TR, TP, T1y, TH, TN; - TI = ri[WS(rs, 7)]; - TL = ii[WS(rs, 7)]; - TH = W[12]; - TJ = TH * TI; - T1w = TH * TL; - TO = ri[WS(rs, 11)]; - TR = ii[WS(rs, 11)]; - TN = W[20]; - TP = TN * TO; - T1y = TN * TR; - { - E TM, T1x, TS, T1z, TK, TQ; - TK = W[13]; - TM = FMA(TK, TL, TJ); - T1x = FNMS(TK, TI, T1w); - TQ = W[21]; - TS = FMA(TQ, TR, TP); - T1z = FNMS(TQ, TO, T1y); - TT = TM + TS; - T1V = TS - TM; - T1A = T1x - T1z; - T1T = T1x + T1z; - } - } - { - E TA, T28, T2k, T2m, T1f, T2l, T2b, T2c; - { - E Tf, Tz, T2g, T2j; - Tf = T1 + Te; - Tz = Tl + Ty; - TA = Tf + Tz; - T28 = Tf - Tz; - T2g = T2e + T2f; - T2j = T2h + T2i; - T2k = T2g + T2j; - T2m = T2j - T2g; - } - { - E TU, T1e, T29, T2a; - TU = TG + TT; - T1e = T10 + T1d; - T1f = TU + T1e; - T2l = TU - T1e; - T29 = T1S + T1T; - T2a = T1Y + T1Z; - T2b = T29 - T2a; - T2c = T29 + T2a; - } - ri[WS(rs, 6)] = TA - T1f; - ii[WS(rs, 6)] = T2k - T2c; - ri[0] = TA + T1f; - ii[0] = T2c + T2k; - ri[WS(rs, 3)] = T28 - T2b; - ii[WS(rs, 3)] = T2l + T2m; - ri[WS(rs, 9)] = T28 + T2b; - ii[WS(rs, 9)] = T2m - T2l; - } - { - E T1m, T1K, T2p, T2y, T2s, T2x, T1t, T1L, T1B, T1N, T1W, T25, T22, T26, T1I; - E T1O; - { - E T1g, T2n, T2q, T1n; - T1g = FNMS(KP500000000, Te, T1); - T1m = FNMS(KP866025403, T1l, T1g); - T1K = FMA(KP866025403, T1l, T1g); - T2n = FNMS(KP500000000, T2h, T2i); - T2p = FMA(KP866025403, T2o, T2n); - T2y = FNMS(KP866025403, T2o, T2n); - T2q = FNMS(KP500000000, T2f, T2e); - T2s = FMA(KP866025403, T2r, T2q); - T2x = FNMS(KP866025403, T2r, T2q); - T1n = FNMS(KP500000000, Ty, Tl); - T1t = FNMS(KP866025403, T1s, T1n); - T1L = FMA(KP866025403, T1s, T1n); - } - { - E T1v, T1U, T20, T1C; - T1v = FNMS(KP500000000, TT, TG); - T1B = FNMS(KP866025403, T1A, T1v); - T1N = FMA(KP866025403, T1A, T1v); - T1U = FNMS(KP500000000, T1T, T1S); - T1W = FMA(KP866025403, T1V, T1U); - T25 = FNMS(KP866025403, T1V, T1U); - T20 = FNMS(KP500000000, T1Z, T1Y); - T22 = FMA(KP866025403, T21, T20); - T26 = FNMS(KP866025403, T21, T20); - T1C = FNMS(KP500000000, T1d, T10); - T1I = FNMS(KP866025403, T1H, T1C); - T1O = FMA(KP866025403, T1H, T1C); - } - { - E T1u, T1J, T2z, T2A; - T1u = T1m + T1t; - T1J = T1B + T1I; - ri[WS(rs, 2)] = T1u - T1J; - ri[WS(rs, 8)] = T1u + T1J; - T2z = T2x + T2y; - T2A = T25 + T26; - ii[WS(rs, 2)] = T2z - T2A; - ii[WS(rs, 8)] = T2A + T2z; - } - { - E T1M, T1P, T2v, T2w; - T1M = T1K + T1L; - T1P = T1N + T1O; - ri[WS(rs, 10)] = T1M - T1P; - ri[WS(rs, 4)] = T1M + T1P; - T2v = T1W + T22; - T2w = T2s + T2p; - ii[WS(rs, 4)] = T2v + T2w; - ii[WS(rs, 10)] = T2w - T2v; - } - { - E T1Q, T23, T2t, T2u; - T1Q = T1K - T1L; - T23 = T1W - T22; - ri[WS(rs, 7)] = T1Q - T23; - ri[WS(rs, 1)] = T1Q + T23; - T2t = T2p - T2s; - T2u = T1N - T1O; - ii[WS(rs, 1)] = T2t - T2u; - ii[WS(rs, 7)] = T2u + T2t; - } - { - E T24, T27, T2B, T2C; - T24 = T1m - T1t; - T27 = T25 - T26; - ri[WS(rs, 11)] = T24 - T27; - ri[WS(rs, 5)] = T24 + T27; - T2B = T2y - T2x; - T2C = T1B - T1I; - ii[WS(rs, 5)] = T2B - T2C; - ii[WS(rs, 11)] = T2C + T2B; - } - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_FULL, 0, 12 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 12, "t1_12", twinstr, &GENUS, { 72, 22, 46, 0 }, 0, 0, 0 }; - -void X(codelet_t1_12) (planner *p) { - X(kdft_dit_register) (p, t1_12, &desc); -} -#else - -/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 12 -name t1_12 -include dft/scalar/t.h */ - -/* - * This function contains 118 FP additions, 60 FP multiplications, - * (or, 88 additions, 30 multiplications, 30 fused multiply/add), - * 47 stack variables, 2 constants, and 48 memory accesses - */ -#include "dft/scalar/t.h" - -static void t1_12(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - DK(KP500000000, +0.500000000000000000000000000000000000000000000); - DK(KP866025403, +0.866025403784438646763723170752936183471402627); - { - INT m; - for (m = mb, W = W + (mb * 22); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 22, MAKE_VOLATILE_STRIDE(24, rs)) { - E T1, T1W, T18, T21, Tc, T15, T1V, T22, TR, T1E, T1o, T1D, T12, T1l, T1F; - E T1G, Ti, T1S, T1d, T24, Tt, T1a, T1T, T25, TA, T1z, T1j, T1y, TL, T1g; - E T1A, T1B; - { - E T6, T16, Tb, T17; - T1 = ri[0]; - T1W = ii[0]; - { - E T3, T5, T2, T4; - T3 = ri[WS(rs, 4)]; - T5 = ii[WS(rs, 4)]; - T2 = W[6]; - T4 = W[7]; - T6 = FMA(T2, T3, T4 * T5); - T16 = FNMS(T4, T3, T2 * T5); - } - { - E T8, Ta, T7, T9; - T8 = ri[WS(rs, 8)]; - Ta = ii[WS(rs, 8)]; - T7 = W[14]; - T9 = W[15]; - Tb = FMA(T7, T8, T9 * Ta); - T17 = FNMS(T9, T8, T7 * Ta); - } - T18 = KP866025403 * (T16 - T17); - T21 = KP866025403 * (Tb - T6); - Tc = T6 + Tb; - T15 = FNMS(KP500000000, Tc, T1); - T1V = T16 + T17; - T22 = FNMS(KP500000000, T1V, T1W); - } - { - E T11, T1n, TW, T1m; - { - E TO, TQ, TN, TP; - TO = ri[WS(rs, 9)]; - TQ = ii[WS(rs, 9)]; - TN = W[16]; - TP = W[17]; - TR = FMA(TN, TO, TP * TQ); - T1E = FNMS(TP, TO, TN * TQ); - } - { - E TY, T10, TX, TZ; - TY = ri[WS(rs, 5)]; - T10 = ii[WS(rs, 5)]; - TX = W[8]; - TZ = W[9]; - T11 = FMA(TX, TY, TZ * T10); - T1n = FNMS(TZ, TY, TX * T10); - } - { - E TT, TV, TS, TU; - TT = ri[WS(rs, 1)]; - TV = ii[WS(rs, 1)]; - TS = W[0]; - TU = W[1]; - TW = FMA(TS, TT, TU * TV); - T1m = FNMS(TU, TT, TS * TV); - } - T1o = KP866025403 * (T1m - T1n); - T1D = KP866025403 * (T11 - TW); - T12 = TW + T11; - T1l = FNMS(KP500000000, T12, TR); - T1F = T1m + T1n; - T1G = FNMS(KP500000000, T1F, T1E); - } - { - E Ts, T1c, Tn, T1b; - { - E Tf, Th, Te, Tg; - Tf = ri[WS(rs, 6)]; - Th = ii[WS(rs, 6)]; - Te = W[10]; - Tg = W[11]; - Ti = FMA(Te, Tf, Tg * Th); - T1S = FNMS(Tg, Tf, Te * Th); - } - { - E Tp, Tr, To, Tq; - Tp = ri[WS(rs, 2)]; - Tr = ii[WS(rs, 2)]; - To = W[2]; - Tq = W[3]; - Ts = FMA(To, Tp, Tq * Tr); - T1c = FNMS(Tq, Tp, To * Tr); - } - { - E Tk, Tm, Tj, Tl; - Tk = ri[WS(rs, 10)]; - Tm = ii[WS(rs, 10)]; - Tj = W[18]; - Tl = W[19]; - Tn = FMA(Tj, Tk, Tl * Tm); - T1b = FNMS(Tl, Tk, Tj * Tm); - } - T1d = KP866025403 * (T1b - T1c); - T24 = KP866025403 * (Ts - Tn); - Tt = Tn + Ts; - T1a = FNMS(KP500000000, Tt, Ti); - T1T = T1b + T1c; - T25 = FNMS(KP500000000, T1T, T1S); - } - { - E TK, T1i, TF, T1h; - { - E Tx, Tz, Tw, Ty; - Tx = ri[WS(rs, 3)]; - Tz = ii[WS(rs, 3)]; - Tw = W[4]; - Ty = W[5]; - TA = FMA(Tw, Tx, Ty * Tz); - T1z = FNMS(Ty, Tx, Tw * Tz); - } - { - E TH, TJ, TG, TI; - TH = ri[WS(rs, 11)]; - TJ = ii[WS(rs, 11)]; - TG = W[20]; - TI = W[21]; - TK = FMA(TG, TH, TI * TJ); - T1i = FNMS(TI, TH, TG * TJ); - } - { - E TC, TE, TB, TD; - TC = ri[WS(rs, 7)]; - TE = ii[WS(rs, 7)]; - TB = W[12]; - TD = W[13]; - TF = FMA(TB, TC, TD * TE); - T1h = FNMS(TD, TC, TB * TE); - } - T1j = KP866025403 * (T1h - T1i); - T1y = KP866025403 * (TK - TF); - TL = TF + TK; - T1g = FNMS(KP500000000, TL, TA); - T1A = T1h + T1i; - T1B = FNMS(KP500000000, T1A, T1z); - } - { - E Tv, T1N, T1Y, T20, T14, T1Z, T1Q, T1R; - { - E Td, Tu, T1U, T1X; - Td = T1 + Tc; - Tu = Ti + Tt; - Tv = Td + Tu; - T1N = Td - Tu; - T1U = T1S + T1T; - T1X = T1V + T1W; - T1Y = T1U + T1X; - T20 = T1X - T1U; - } - { - E TM, T13, T1O, T1P; - TM = TA + TL; - T13 = TR + T12; - T14 = TM + T13; - T1Z = TM - T13; - T1O = T1z + T1A; - T1P = T1E + T1F; - T1Q = T1O - T1P; - T1R = T1O + T1P; - } - ri[WS(rs, 6)] = Tv - T14; - ii[WS(rs, 6)] = T1Y - T1R; - ri[0] = Tv + T14; - ii[0] = T1R + T1Y; - ri[WS(rs, 3)] = T1N - T1Q; - ii[WS(rs, 3)] = T1Z + T20; - ri[WS(rs, 9)] = T1N + T1Q; - ii[WS(rs, 9)] = T20 - T1Z; - } - { - E T1t, T1x, T27, T2a, T1w, T28, T1I, T29; - { - E T1r, T1s, T23, T26; - T1r = T15 + T18; - T1s = T1a + T1d; - T1t = T1r + T1s; - T1x = T1r - T1s; - T23 = T21 + T22; - T26 = T24 + T25; - T27 = T23 - T26; - T2a = T26 + T23; - } - { - E T1u, T1v, T1C, T1H; - T1u = T1g + T1j; - T1v = T1l + T1o; - T1w = T1u + T1v; - T28 = T1u - T1v; - T1C = T1y + T1B; - T1H = T1D + T1G; - T1I = T1C - T1H; - T29 = T1C + T1H; - } - ri[WS(rs, 10)] = T1t - T1w; - ii[WS(rs, 10)] = T2a - T29; - ri[WS(rs, 4)] = T1t + T1w; - ii[WS(rs, 4)] = T29 + T2a; - ri[WS(rs, 7)] = T1x - T1I; - ii[WS(rs, 7)] = T28 + T27; - ri[WS(rs, 1)] = T1x + T1I; - ii[WS(rs, 1)] = T27 - T28; - } - { - E T1f, T1J, T2d, T2f, T1q, T2g, T1M, T2e; - { - E T19, T1e, T2b, T2c; - T19 = T15 - T18; - T1e = T1a - T1d; - T1f = T19 + T1e; - T1J = T19 - T1e; - T2b = T25 - T24; - T2c = T22 - T21; - T2d = T2b + T2c; - T2f = T2c - T2b; - } - { - E T1k, T1p, T1K, T1L; - T1k = T1g - T1j; - T1p = T1l - T1o; - T1q = T1k + T1p; - T2g = T1k - T1p; - T1K = T1B - T1y; - T1L = T1G - T1D; - T1M = T1K - T1L; - T2e = T1K + T1L; - } - ri[WS(rs, 2)] = T1f - T1q; - ii[WS(rs, 2)] = T2d - T2e; - ri[WS(rs, 8)] = T1f + T1q; - ii[WS(rs, 8)] = T2e + T2d; - ri[WS(rs, 11)] = T1J - T1M; - ii[WS(rs, 11)] = T2g + T2f; - ri[WS(rs, 5)] = T1J + T1M; - ii[WS(rs, 5)] = T2f - T2g; - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_FULL, 0, 12 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 12, "t1_12", twinstr, &GENUS, { 88, 30, 30, 0 }, 0, 0, 0 }; - -void X(codelet_t1_12) (planner *p) { - X(kdft_dit_register) (p, t1_12, &desc); -} -#endif diff --git a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t1_15.c b/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t1_15.c deleted file mode 100644 index 4ec15464..00000000 --- a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t1_15.c +++ /dev/null @@ -1,816 +0,0 @@ -/* - * Copyright (c) 2003, 2007-14 Matteo Frigo - * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - */ - -/* This file was automatically generated --- DO NOT EDIT */ -/* Generated on Tue Sep 14 10:44:28 EDT 2021 */ - -#include "dft/codelet-dft.h" - -#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA) - -/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 15 -name t1_15 -include dft/scalar/t.h */ - -/* - * This function contains 184 FP additions, 140 FP multiplications, - * (or, 72 additions, 28 multiplications, 112 fused multiply/add), - * 51 stack variables, 6 constants, and 60 memory accesses - */ -#include "dft/scalar/t.h" - -static void t1_15(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - DK(KP951056516, +0.951056516295153572116439333379382143405698634); - DK(KP559016994, +0.559016994374947424102293417182819058860154590); - DK(KP250000000, +0.250000000000000000000000000000000000000000000); - DK(KP618033988, +0.618033988749894848204586834365638117720309180); - DK(KP866025403, +0.866025403784438646763723170752936183471402627); - DK(KP500000000, +0.500000000000000000000000000000000000000000000); - { - INT m; - for (m = mb, W = W + (mb * 28); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 28, MAKE_VOLATILE_STRIDE(30, rs)) { - E T1, T3j, T1G, T3u, Te, T1B, T3i, T3t, T1y, T2i, T2a, T2M, T37, T2V, Tz; - E T2e, T1O, T2t, T39, T2X, TT, T2f, T1V, T2z, T3a, T2Y, T1e, T2h, T23, T2G; - E T36, T2U; - { - E T7, T1D, Td, T1F; - T1 = ri[0]; - T3j = ii[0]; - { - E T3, T6, T4, T1C, T2, T5; - T3 = ri[WS(rs, 5)]; - T6 = ii[WS(rs, 5)]; - T2 = W[8]; - T4 = T2 * T3; - T1C = T2 * T6; - T5 = W[9]; - T7 = FMA(T5, T6, T4); - T1D = FNMS(T5, T3, T1C); - } - { - E T9, Tc, Ta, T1E, T8, Tb; - T9 = ri[WS(rs, 10)]; - Tc = ii[WS(rs, 10)]; - T8 = W[18]; - Ta = T8 * T9; - T1E = T8 * Tc; - Tb = W[19]; - Td = FMA(Tb, Tc, Ta); - T1F = FNMS(Tb, T9, T1E); - } - T1G = T1D - T1F; - T3u = Td - T7; - Te = T7 + Td; - T1B = FNMS(KP500000000, Te, T1); - T3i = T1D + T1F; - T3t = FNMS(KP500000000, T3i, T3j); - } - { - E T1k, T2I, T1w, T28, T1q, T26; - { - E T1g, T1j, T1h, T2H, T1f, T1i; - T1g = ri[WS(rs, 9)]; - T1j = ii[WS(rs, 9)]; - T1f = W[16]; - T1h = T1f * T1g; - T2H = T1f * T1j; - T1i = W[17]; - T1k = FMA(T1i, T1j, T1h); - T2I = FNMS(T1i, T1g, T2H); - } - { - E T1s, T1v, T1t, T27, T1r, T1u; - T1s = ri[WS(rs, 4)]; - T1v = ii[WS(rs, 4)]; - T1r = W[6]; - T1t = T1r * T1s; - T27 = T1r * T1v; - T1u = W[7]; - T1w = FMA(T1u, T1v, T1t); - T28 = FNMS(T1u, T1s, T27); - } - { - E T1m, T1p, T1n, T25, T1l, T1o; - T1m = ri[WS(rs, 14)]; - T1p = ii[WS(rs, 14)]; - T1l = W[26]; - T1n = T1l * T1m; - T25 = T1l * T1p; - T1o = W[27]; - T1q = FMA(T1o, T1p, T1n); - T26 = FNMS(T1o, T1m, T25); - } - { - E T29, T1x, T24, T2L, T2J, T2K; - T29 = T26 - T28; - T1x = T1q + T1w; - T24 = FNMS(KP500000000, T1x, T1k); - T1y = T1k + T1x; - T2i = FMA(KP866025403, T29, T24); - T2a = FNMS(KP866025403, T29, T24); - T2L = T1w - T1q; - T2J = T26 + T28; - T2K = FNMS(KP500000000, T2J, T2I); - T2M = FMA(KP866025403, T2L, T2K); - T37 = T2I + T2J; - T2V = FNMS(KP866025403, T2L, T2K); - } - } - { - E Tl, T2p, Tx, T1M, Tr, T1K; - { - E Th, Tk, Ti, T2o, Tg, Tj; - Th = ri[WS(rs, 3)]; - Tk = ii[WS(rs, 3)]; - Tg = W[4]; - Ti = Tg * Th; - T2o = Tg * Tk; - Tj = W[5]; - Tl = FMA(Tj, Tk, Ti); - T2p = FNMS(Tj, Th, T2o); - } - { - E Tt, Tw, Tu, T1L, Ts, Tv; - Tt = ri[WS(rs, 13)]; - Tw = ii[WS(rs, 13)]; - Ts = W[24]; - Tu = Ts * Tt; - T1L = Ts * Tw; - Tv = W[25]; - Tx = FMA(Tv, Tw, Tu); - T1M = FNMS(Tv, Tt, T1L); - } - { - E Tn, Tq, To, T1J, Tm, Tp; - Tn = ri[WS(rs, 8)]; - Tq = ii[WS(rs, 8)]; - Tm = W[14]; - To = Tm * Tn; - T1J = Tm * Tq; - Tp = W[15]; - Tr = FMA(Tp, Tq, To); - T1K = FNMS(Tp, Tn, T1J); - } - { - E T1N, Ty, T1I, T2s, T2q, T2r; - T1N = T1K - T1M; - Ty = Tr + Tx; - T1I = FNMS(KP500000000, Ty, Tl); - Tz = Tl + Ty; - T2e = FMA(KP866025403, T1N, T1I); - T1O = FNMS(KP866025403, T1N, T1I); - T2s = Tx - Tr; - T2q = T1K + T1M; - T2r = FNMS(KP500000000, T2q, T2p); - T2t = FMA(KP866025403, T2s, T2r); - T39 = T2p + T2q; - T2X = FNMS(KP866025403, T2s, T2r); - } - } - { - E TF, T2v, TR, T1T, TL, T1R; - { - E TB, TE, TC, T2u, TA, TD; - TB = ri[WS(rs, 12)]; - TE = ii[WS(rs, 12)]; - TA = W[22]; - TC = TA * TB; - T2u = TA * TE; - TD = W[23]; - TF = FMA(TD, TE, TC); - T2v = FNMS(TD, TB, T2u); - } - { - E TN, TQ, TO, T1S, TM, TP; - TN = ri[WS(rs, 7)]; - TQ = ii[WS(rs, 7)]; - TM = W[12]; - TO = TM * TN; - T1S = TM * TQ; - TP = W[13]; - TR = FMA(TP, TQ, TO); - T1T = FNMS(TP, TN, T1S); - } - { - E TH, TK, TI, T1Q, TG, TJ; - TH = ri[WS(rs, 2)]; - TK = ii[WS(rs, 2)]; - TG = W[2]; - TI = TG * TH; - T1Q = TG * TK; - TJ = W[3]; - TL = FMA(TJ, TK, TI); - T1R = FNMS(TJ, TH, T1Q); - } - { - E T1U, TS, T1P, T2y, T2w, T2x; - T1U = T1R - T1T; - TS = TL + TR; - T1P = FNMS(KP500000000, TS, TF); - TT = TF + TS; - T2f = FMA(KP866025403, T1U, T1P); - T1V = FNMS(KP866025403, T1U, T1P); - T2y = TR - TL; - T2w = T1R + T1T; - T2x = FNMS(KP500000000, T2w, T2v); - T2z = FMA(KP866025403, T2y, T2x); - T3a = T2v + T2w; - T2Y = FNMS(KP866025403, T2y, T2x); - } - } - { - E T10, T2C, T1c, T21, T16, T1Z; - { - E TW, TZ, TX, T2B, TV, TY; - TW = ri[WS(rs, 6)]; - TZ = ii[WS(rs, 6)]; - TV = W[10]; - TX = TV * TW; - T2B = TV * TZ; - TY = W[11]; - T10 = FMA(TY, TZ, TX); - T2C = FNMS(TY, TW, T2B); - } - { - E T18, T1b, T19, T20, T17, T1a; - T18 = ri[WS(rs, 1)]; - T1b = ii[WS(rs, 1)]; - T17 = W[0]; - T19 = T17 * T18; - T20 = T17 * T1b; - T1a = W[1]; - T1c = FMA(T1a, T1b, T19); - T21 = FNMS(T1a, T18, T20); - } - { - E T12, T15, T13, T1Y, T11, T14; - T12 = ri[WS(rs, 11)]; - T15 = ii[WS(rs, 11)]; - T11 = W[20]; - T13 = T11 * T12; - T1Y = T11 * T15; - T14 = W[21]; - T16 = FMA(T14, T15, T13); - T1Z = FNMS(T14, T12, T1Y); - } - { - E T22, T1d, T1X, T2F, T2D, T2E; - T22 = T1Z - T21; - T1d = T16 + T1c; - T1X = FNMS(KP500000000, T1d, T10); - T1e = T10 + T1d; - T2h = FMA(KP866025403, T22, T1X); - T23 = FNMS(KP866025403, T22, T1X); - T2F = T1c - T16; - T2D = T1Z + T21; - T2E = FNMS(KP500000000, T2D, T2C); - T2G = FMA(KP866025403, T2F, T2E); - T36 = T2C + T2D; - T2U = FNMS(KP866025403, T2F, T2E); - } - } - { - E T3c, T3e, Tf, T1A, T33, T34, T3d, T35; - { - E T38, T3b, TU, T1z; - T38 = T36 - T37; - T3b = T39 - T3a; - T3c = FNMS(KP618033988, T3b, T38); - T3e = FMA(KP618033988, T38, T3b); - Tf = T1 + Te; - TU = Tz + TT; - T1z = T1e + T1y; - T1A = TU + T1z; - T33 = FNMS(KP250000000, T1A, Tf); - T34 = TU - T1z; - } - ri[0] = Tf + T1A; - T3d = FMA(KP559016994, T34, T33); - ri[WS(rs, 9)] = FNMS(KP951056516, T3e, T3d); - ri[WS(rs, 6)] = FMA(KP951056516, T3e, T3d); - T35 = FNMS(KP559016994, T34, T33); - ri[WS(rs, 12)] = FNMS(KP951056516, T3c, T35); - ri[WS(rs, 3)] = FMA(KP951056516, T3c, T35); - } - { - E T3q, T3s, T3k, T3h, T3l, T3m, T3r, T3n; - { - E T3o, T3p, T3f, T3g; - T3o = T1e - T1y; - T3p = Tz - TT; - T3q = FNMS(KP618033988, T3p, T3o); - T3s = FMA(KP618033988, T3o, T3p); - T3k = T3i + T3j; - T3f = T39 + T3a; - T3g = T36 + T37; - T3h = T3f + T3g; - T3l = FNMS(KP250000000, T3h, T3k); - T3m = T3f - T3g; - } - ii[0] = T3h + T3k; - T3r = FMA(KP559016994, T3m, T3l); - ii[WS(rs, 6)] = FNMS(KP951056516, T3s, T3r); - ii[WS(rs, 9)] = FMA(KP951056516, T3s, T3r); - T3n = FNMS(KP559016994, T3m, T3l); - ii[WS(rs, 3)] = FNMS(KP951056516, T3q, T3n); - ii[WS(rs, 12)] = FMA(KP951056516, T3q, T3n); - } - { - E T30, T32, T1H, T2c, T2R, T2S, T31, T2T; - { - E T2W, T2Z, T1W, T2b; - T2W = T2U - T2V; - T2Z = T2X - T2Y; - T30 = FNMS(KP618033988, T2Z, T2W); - T32 = FMA(KP618033988, T2W, T2Z); - T1H = FNMS(KP866025403, T1G, T1B); - T1W = T1O + T1V; - T2b = T23 + T2a; - T2c = T1W + T2b; - T2R = FNMS(KP250000000, T2c, T1H); - T2S = T1W - T2b; - } - ri[WS(rs, 5)] = T1H + T2c; - T31 = FMA(KP559016994, T2S, T2R); - ri[WS(rs, 14)] = FNMS(KP951056516, T32, T31); - ri[WS(rs, 11)] = FMA(KP951056516, T32, T31); - T2T = FNMS(KP559016994, T2S, T2R); - ri[WS(rs, 2)] = FNMS(KP951056516, T30, T2T); - ri[WS(rs, 8)] = FMA(KP951056516, T30, T2T); - } - { - E T3Q, T3S, T3H, T3K, T3L, T3M, T3R, T3N; - { - E T3O, T3P, T3I, T3J; - T3O = T23 - T2a; - T3P = T1O - T1V; - T3Q = FNMS(KP618033988, T3P, T3O); - T3S = FMA(KP618033988, T3O, T3P); - T3H = FNMS(KP866025403, T3u, T3t); - T3I = T2X + T2Y; - T3J = T2U + T2V; - T3K = T3I + T3J; - T3L = FNMS(KP250000000, T3K, T3H); - T3M = T3I - T3J; - } - ii[WS(rs, 5)] = T3K + T3H; - T3R = FMA(KP559016994, T3M, T3L); - ii[WS(rs, 11)] = FNMS(KP951056516, T3S, T3R); - ii[WS(rs, 14)] = FMA(KP951056516, T3S, T3R); - T3N = FNMS(KP559016994, T3M, T3L); - ii[WS(rs, 2)] = FMA(KP951056516, T3Q, T3N); - ii[WS(rs, 8)] = FNMS(KP951056516, T3Q, T3N); - } - { - E T3E, T3G, T3v, T3y, T3z, T3A, T3F, T3B; - { - E T3C, T3D, T3w, T3x; - T3C = T2e - T2f; - T3D = T2h - T2i; - T3E = FMA(KP618033988, T3D, T3C); - T3G = FNMS(KP618033988, T3C, T3D); - T3v = FMA(KP866025403, T3u, T3t); - T3w = T2t + T2z; - T3x = T2G + T2M; - T3y = T3w + T3x; - T3z = FNMS(KP250000000, T3y, T3v); - T3A = T3w - T3x; - } - ii[WS(rs, 10)] = T3y + T3v; - T3F = FNMS(KP559016994, T3A, T3z); - ii[WS(rs, 7)] = FMA(KP951056516, T3G, T3F); - ii[WS(rs, 13)] = FNMS(KP951056516, T3G, T3F); - T3B = FMA(KP559016994, T3A, T3z); - ii[WS(rs, 1)] = FNMS(KP951056516, T3E, T3B); - ii[WS(rs, 4)] = FMA(KP951056516, T3E, T3B); - } - { - E T2O, T2Q, T2d, T2k, T2l, T2m, T2P, T2n; - { - E T2A, T2N, T2g, T2j; - T2A = T2t - T2z; - T2N = T2G - T2M; - T2O = FMA(KP618033988, T2N, T2A); - T2Q = FNMS(KP618033988, T2A, T2N); - T2d = FMA(KP866025403, T1G, T1B); - T2g = T2e + T2f; - T2j = T2h + T2i; - T2k = T2g + T2j; - T2l = FNMS(KP250000000, T2k, T2d); - T2m = T2g - T2j; - } - ri[WS(rs, 10)] = T2d + T2k; - T2P = FNMS(KP559016994, T2m, T2l); - ri[WS(rs, 7)] = FNMS(KP951056516, T2Q, T2P); - ri[WS(rs, 13)] = FMA(KP951056516, T2Q, T2P); - T2n = FMA(KP559016994, T2m, T2l); - ri[WS(rs, 4)] = FNMS(KP951056516, T2O, T2n); - ri[WS(rs, 1)] = FMA(KP951056516, T2O, T2n); - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_FULL, 0, 15 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 15, "t1_15", twinstr, &GENUS, { 72, 28, 112, 0 }, 0, 0, 0 }; - -void X(codelet_t1_15) (planner *p) { - X(kdft_dit_register) (p, t1_15, &desc); -} -#else - -/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 15 -name t1_15 -include dft/scalar/t.h */ - -/* - * This function contains 184 FP additions, 112 FP multiplications, - * (or, 128 additions, 56 multiplications, 56 fused multiply/add), - * 65 stack variables, 6 constants, and 60 memory accesses - */ -#include "dft/scalar/t.h" - -static void t1_15(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - DK(KP587785252, +0.587785252292473129168705954639072768597652438); - DK(KP951056516, +0.951056516295153572116439333379382143405698634); - DK(KP250000000, +0.250000000000000000000000000000000000000000000); - DK(KP559016994, +0.559016994374947424102293417182819058860154590); - DK(KP500000000, +0.500000000000000000000000000000000000000000000); - DK(KP866025403, +0.866025403784438646763723170752936183471402627); - { - INT m; - for (m = mb, W = W + (mb * 28); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 28, MAKE_VOLATILE_STRIDE(30, rs)) { - E T1q, T34, Td, T1n, T2S, T35, T13, T1k, T1l, T2E, T2F, T2O, T1H, T1T, T2k; - E T2t, T2f, T2s, T1M, T1U, Tu, TL, TM, T2H, T2I, T2N, T1w, T1Q, T29, T2w; - E T24, T2v, T1B, T1R; - { - E T1, T2R, T6, T1o, Tb, T1p, Tc, T2Q; - T1 = ri[0]; - T2R = ii[0]; - { - E T3, T5, T2, T4; - T3 = ri[WS(rs, 5)]; - T5 = ii[WS(rs, 5)]; - T2 = W[8]; - T4 = W[9]; - T6 = FMA(T2, T3, T4 * T5); - T1o = FNMS(T4, T3, T2 * T5); - } - { - E T8, Ta, T7, T9; - T8 = ri[WS(rs, 10)]; - Ta = ii[WS(rs, 10)]; - T7 = W[18]; - T9 = W[19]; - Tb = FMA(T7, T8, T9 * Ta); - T1p = FNMS(T9, T8, T7 * Ta); - } - T1q = KP866025403 * (T1o - T1p); - T34 = KP866025403 * (Tb - T6); - Tc = T6 + Tb; - Td = T1 + Tc; - T1n = FNMS(KP500000000, Tc, T1); - T2Q = T1o + T1p; - T2S = T2Q + T2R; - T35 = FNMS(KP500000000, T2Q, T2R); - } - { - E TR, T2c, T18, T2h, TW, T1E, T11, T1F, T12, T2d, T1d, T1J, T1i, T1K, T1j; - E T2i; - { - E TO, TQ, TN, TP; - TO = ri[WS(rs, 6)]; - TQ = ii[WS(rs, 6)]; - TN = W[10]; - TP = W[11]; - TR = FMA(TN, TO, TP * TQ); - T2c = FNMS(TP, TO, TN * TQ); - } - { - E T15, T17, T14, T16; - T15 = ri[WS(rs, 9)]; - T17 = ii[WS(rs, 9)]; - T14 = W[16]; - T16 = W[17]; - T18 = FMA(T14, T15, T16 * T17); - T2h = FNMS(T16, T15, T14 * T17); - } - { - E TT, TV, TS, TU; - TT = ri[WS(rs, 11)]; - TV = ii[WS(rs, 11)]; - TS = W[20]; - TU = W[21]; - TW = FMA(TS, TT, TU * TV); - T1E = FNMS(TU, TT, TS * TV); - } - { - E TY, T10, TX, TZ; - TY = ri[WS(rs, 1)]; - T10 = ii[WS(rs, 1)]; - TX = W[0]; - TZ = W[1]; - T11 = FMA(TX, TY, TZ * T10); - T1F = FNMS(TZ, TY, TX * T10); - } - T12 = TW + T11; - T2d = T1E + T1F; - { - E T1a, T1c, T19, T1b; - T1a = ri[WS(rs, 14)]; - T1c = ii[WS(rs, 14)]; - T19 = W[26]; - T1b = W[27]; - T1d = FMA(T19, T1a, T1b * T1c); - T1J = FNMS(T1b, T1a, T19 * T1c); - } - { - E T1f, T1h, T1e, T1g; - T1f = ri[WS(rs, 4)]; - T1h = ii[WS(rs, 4)]; - T1e = W[6]; - T1g = W[7]; - T1i = FMA(T1e, T1f, T1g * T1h); - T1K = FNMS(T1g, T1f, T1e * T1h); - } - T1j = T1d + T1i; - T2i = T1J + T1K; - { - E T1D, T1G, T2g, T2j; - T13 = TR + T12; - T1k = T18 + T1j; - T1l = T13 + T1k; - T2E = T2c + T2d; - T2F = T2h + T2i; - T2O = T2E + T2F; - T1D = FNMS(KP500000000, T12, TR); - T1G = KP866025403 * (T1E - T1F); - T1H = T1D - T1G; - T1T = T1D + T1G; - T2g = KP866025403 * (T1i - T1d); - T2j = FNMS(KP500000000, T2i, T2h); - T2k = T2g + T2j; - T2t = T2j - T2g; - { - E T2b, T2e, T1I, T1L; - T2b = KP866025403 * (T11 - TW); - T2e = FNMS(KP500000000, T2d, T2c); - T2f = T2b + T2e; - T2s = T2e - T2b; - T1I = FNMS(KP500000000, T1j, T18); - T1L = KP866025403 * (T1J - T1K); - T1M = T1I - T1L; - T1U = T1I + T1L; - } - } - } - { - E Ti, T21, Tz, T26, Tn, T1t, Ts, T1u, Tt, T22, TE, T1y, TJ, T1z, TK; - E T27; - { - E Tf, Th, Te, Tg; - Tf = ri[WS(rs, 3)]; - Th = ii[WS(rs, 3)]; - Te = W[4]; - Tg = W[5]; - Ti = FMA(Te, Tf, Tg * Th); - T21 = FNMS(Tg, Tf, Te * Th); - } - { - E Tw, Ty, Tv, Tx; - Tw = ri[WS(rs, 12)]; - Ty = ii[WS(rs, 12)]; - Tv = W[22]; - Tx = W[23]; - Tz = FMA(Tv, Tw, Tx * Ty); - T26 = FNMS(Tx, Tw, Tv * Ty); - } - { - E Tk, Tm, Tj, Tl; - Tk = ri[WS(rs, 8)]; - Tm = ii[WS(rs, 8)]; - Tj = W[14]; - Tl = W[15]; - Tn = FMA(Tj, Tk, Tl * Tm); - T1t = FNMS(Tl, Tk, Tj * Tm); - } - { - E Tp, Tr, To, Tq; - Tp = ri[WS(rs, 13)]; - Tr = ii[WS(rs, 13)]; - To = W[24]; - Tq = W[25]; - Ts = FMA(To, Tp, Tq * Tr); - T1u = FNMS(Tq, Tp, To * Tr); - } - Tt = Tn + Ts; - T22 = T1t + T1u; - { - E TB, TD, TA, TC; - TB = ri[WS(rs, 2)]; - TD = ii[WS(rs, 2)]; - TA = W[2]; - TC = W[3]; - TE = FMA(TA, TB, TC * TD); - T1y = FNMS(TC, TB, TA * TD); - } - { - E TG, TI, TF, TH; - TG = ri[WS(rs, 7)]; - TI = ii[WS(rs, 7)]; - TF = W[12]; - TH = W[13]; - TJ = FMA(TF, TG, TH * TI); - T1z = FNMS(TH, TG, TF * TI); - } - TK = TE + TJ; - T27 = T1y + T1z; - { - E T1s, T1v, T25, T28; - Tu = Ti + Tt; - TL = Tz + TK; - TM = Tu + TL; - T2H = T21 + T22; - T2I = T26 + T27; - T2N = T2H + T2I; - T1s = FNMS(KP500000000, Tt, Ti); - T1v = KP866025403 * (T1t - T1u); - T1w = T1s - T1v; - T1Q = T1s + T1v; - T25 = KP866025403 * (TJ - TE); - T28 = FNMS(KP500000000, T27, T26); - T29 = T25 + T28; - T2w = T28 - T25; - { - E T20, T23, T1x, T1A; - T20 = KP866025403 * (Ts - Tn); - T23 = FNMS(KP500000000, T22, T21); - T24 = T20 + T23; - T2v = T23 - T20; - T1x = FNMS(KP500000000, TK, Tz); - T1A = KP866025403 * (T1y - T1z); - T1B = T1x - T1A; - T1R = T1x + T1A; - } - } - } - { - E T2C, T1m, T2B, T2K, T2M, T2G, T2J, T2L, T2D; - T2C = KP559016994 * (TM - T1l); - T1m = TM + T1l; - T2B = FNMS(KP250000000, T1m, Td); - T2G = T2E - T2F; - T2J = T2H - T2I; - T2K = FNMS(KP587785252, T2J, KP951056516 * T2G); - T2M = FMA(KP951056516, T2J, KP587785252 * T2G); - ri[0] = Td + T1m; - T2L = T2C + T2B; - ri[WS(rs, 9)] = T2L - T2M; - ri[WS(rs, 6)] = T2L + T2M; - T2D = T2B - T2C; - ri[WS(rs, 12)] = T2D - T2K; - ri[WS(rs, 3)] = T2D + T2K; - } - { - E T2U, T2P, T2T, T2Y, T30, T2W, T2X, T2Z, T2V; - T2U = KP559016994 * (T2N - T2O); - T2P = T2N + T2O; - T2T = FNMS(KP250000000, T2P, T2S); - T2W = T13 - T1k; - T2X = Tu - TL; - T2Y = FNMS(KP587785252, T2X, KP951056516 * T2W); - T30 = FMA(KP951056516, T2X, KP587785252 * T2W); - ii[0] = T2P + T2S; - T2Z = T2U + T2T; - ii[WS(rs, 6)] = T2Z - T30; - ii[WS(rs, 9)] = T30 + T2Z; - T2V = T2T - T2U; - ii[WS(rs, 3)] = T2V - T2Y; - ii[WS(rs, 12)] = T2Y + T2V; - } - { - E T2y, T2A, T1r, T1O, T2p, T2q, T2z, T2r; - { - E T2u, T2x, T1C, T1N; - T2u = T2s - T2t; - T2x = T2v - T2w; - T2y = FNMS(KP587785252, T2x, KP951056516 * T2u); - T2A = FMA(KP951056516, T2x, KP587785252 * T2u); - T1r = T1n - T1q; - T1C = T1w + T1B; - T1N = T1H + T1M; - T1O = T1C + T1N; - T2p = FNMS(KP250000000, T1O, T1r); - T2q = KP559016994 * (T1C - T1N); - } - ri[WS(rs, 5)] = T1r + T1O; - T2z = T2q + T2p; - ri[WS(rs, 14)] = T2z - T2A; - ri[WS(rs, 11)] = T2z + T2A; - T2r = T2p - T2q; - ri[WS(rs, 2)] = T2r - T2y; - ri[WS(rs, 8)] = T2r + T2y; - } - { - E T3h, T3q, T3i, T3l, T3m, T3n, T3p, T3o; - { - E T3f, T3g, T3j, T3k; - T3f = T1H - T1M; - T3g = T1w - T1B; - T3h = FNMS(KP587785252, T3g, KP951056516 * T3f); - T3q = FMA(KP951056516, T3g, KP587785252 * T3f); - T3i = T35 - T34; - T3j = T2v + T2w; - T3k = T2s + T2t; - T3l = T3j + T3k; - T3m = FNMS(KP250000000, T3l, T3i); - T3n = KP559016994 * (T3j - T3k); - } - ii[WS(rs, 5)] = T3l + T3i; - T3p = T3n + T3m; - ii[WS(rs, 11)] = T3p - T3q; - ii[WS(rs, 14)] = T3q + T3p; - T3o = T3m - T3n; - ii[WS(rs, 2)] = T3h + T3o; - ii[WS(rs, 8)] = T3o - T3h; - } - { - E T3c, T3d, T36, T37, T33, T38, T3e, T39; - { - E T3a, T3b, T31, T32; - T3a = T1Q - T1R; - T3b = T1T - T1U; - T3c = FMA(KP951056516, T3a, KP587785252 * T3b); - T3d = FNMS(KP587785252, T3a, KP951056516 * T3b); - T36 = T34 + T35; - T31 = T24 + T29; - T32 = T2f + T2k; - T37 = T31 + T32; - T33 = KP559016994 * (T31 - T32); - T38 = FNMS(KP250000000, T37, T36); - } - ii[WS(rs, 10)] = T37 + T36; - T3e = T38 - T33; - ii[WS(rs, 7)] = T3d + T3e; - ii[WS(rs, 13)] = T3e - T3d; - T39 = T33 + T38; - ii[WS(rs, 1)] = T39 - T3c; - ii[WS(rs, 4)] = T3c + T39; - } - { - E T2m, T2o, T1P, T1W, T1X, T1Y, T2n, T1Z; - { - E T2a, T2l, T1S, T1V; - T2a = T24 - T29; - T2l = T2f - T2k; - T2m = FMA(KP951056516, T2a, KP587785252 * T2l); - T2o = FNMS(KP587785252, T2a, KP951056516 * T2l); - T1P = T1n + T1q; - T1S = T1Q + T1R; - T1V = T1T + T1U; - T1W = T1S + T1V; - T1X = KP559016994 * (T1S - T1V); - T1Y = FNMS(KP250000000, T1W, T1P); - } - ri[WS(rs, 10)] = T1P + T1W; - T2n = T1Y - T1X; - ri[WS(rs, 7)] = T2n - T2o; - ri[WS(rs, 13)] = T2n + T2o; - T1Z = T1X + T1Y; - ri[WS(rs, 4)] = T1Z - T2m; - ri[WS(rs, 1)] = T1Z + T2m; - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_FULL, 0, 15 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 15, "t1_15", twinstr, &GENUS, { 128, 56, 56, 0 }, 0, 0, 0 }; - -void X(codelet_t1_15) (planner *p) { - X(kdft_dit_register) (p, t1_15, &desc); -} -#endif diff --git a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t1_16.c b/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t1_16.c deleted file mode 100644 index 9520624f..00000000 --- a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t1_16.c +++ /dev/null @@ -1,796 +0,0 @@ -/* - * Copyright (c) 2003, 2007-14 Matteo Frigo - * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - */ - -/* This file was automatically generated --- DO NOT EDIT */ -/* Generated on Tue Sep 14 10:44:28 EDT 2021 */ - -#include "dft/codelet-dft.h" - -#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA) - -/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 16 -name t1_16 -include dft/scalar/t.h */ - -/* - * This function contains 174 FP additions, 100 FP multiplications, - * (or, 104 additions, 30 multiplications, 70 fused multiply/add), - * 60 stack variables, 3 constants, and 64 memory accesses - */ -#include "dft/scalar/t.h" - -static void t1_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - DK(KP923879532, +0.923879532511286756128183189396788286822416626); - DK(KP414213562, +0.414213562373095048801688724209698078569671875); - DK(KP707106781, +0.707106781186547524400844362104849039284835938); - { - INT m; - for (m = mb, W = W + (mb * 30); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 30, MAKE_VOLATILE_STRIDE(32, rs)) { - E T8, T3z, T1I, T3o, T1s, T35, T2o, T2r, T1F, T36, T2p, T2w, Tl, T3A, T1N; - E T3k, Tz, T2V, T1T, T1U, T11, T30, T29, T2c, T1e, T31, T2a, T2h, TM, T2W; - E T1W, T21; - { - E T1, T3n, T3, T6, T4, T3l, T2, T7, T3m, T5; - T1 = ri[0]; - T3n = ii[0]; - T3 = ri[WS(rs, 8)]; - T6 = ii[WS(rs, 8)]; - T2 = W[14]; - T4 = T2 * T3; - T3l = T2 * T6; - T5 = W[15]; - T7 = FMA(T5, T6, T4); - T3m = FNMS(T5, T3, T3l); - T8 = T1 + T7; - T3z = T3n - T3m; - T1I = T1 - T7; - T3o = T3m + T3n; - } - { - E T1h, T1k, T1i, T2k, T1n, T1q, T1o, T2m, T1g, T1m; - T1h = ri[WS(rs, 15)]; - T1k = ii[WS(rs, 15)]; - T1g = W[28]; - T1i = T1g * T1h; - T2k = T1g * T1k; - T1n = ri[WS(rs, 7)]; - T1q = ii[WS(rs, 7)]; - T1m = W[12]; - T1o = T1m * T1n; - T2m = T1m * T1q; - { - E T1l, T2l, T1r, T2n, T1j, T1p; - T1j = W[29]; - T1l = FMA(T1j, T1k, T1i); - T2l = FNMS(T1j, T1h, T2k); - T1p = W[13]; - T1r = FMA(T1p, T1q, T1o); - T2n = FNMS(T1p, T1n, T2m); - T1s = T1l + T1r; - T35 = T2l + T2n; - T2o = T2l - T2n; - T2r = T1l - T1r; - } - } - { - E T1u, T1x, T1v, T2s, T1A, T1D, T1B, T2u, T1t, T1z; - T1u = ri[WS(rs, 3)]; - T1x = ii[WS(rs, 3)]; - T1t = W[4]; - T1v = T1t * T1u; - T2s = T1t * T1x; - T1A = ri[WS(rs, 11)]; - T1D = ii[WS(rs, 11)]; - T1z = W[20]; - T1B = T1z * T1A; - T2u = T1z * T1D; - { - E T1y, T2t, T1E, T2v, T1w, T1C; - T1w = W[5]; - T1y = FMA(T1w, T1x, T1v); - T2t = FNMS(T1w, T1u, T2s); - T1C = W[21]; - T1E = FMA(T1C, T1D, T1B); - T2v = FNMS(T1C, T1A, T2u); - T1F = T1y + T1E; - T36 = T2t + T2v; - T2p = T1y - T1E; - T2w = T2t - T2v; - } - } - { - E Ta, Td, Tb, T1J, Tg, Tj, Th, T1L, T9, Tf; - Ta = ri[WS(rs, 4)]; - Td = ii[WS(rs, 4)]; - T9 = W[6]; - Tb = T9 * Ta; - T1J = T9 * Td; - Tg = ri[WS(rs, 12)]; - Tj = ii[WS(rs, 12)]; - Tf = W[22]; - Th = Tf * Tg; - T1L = Tf * Tj; - { - E Te, T1K, Tk, T1M, Tc, Ti; - Tc = W[7]; - Te = FMA(Tc, Td, Tb); - T1K = FNMS(Tc, Ta, T1J); - Ti = W[23]; - Tk = FMA(Ti, Tj, Th); - T1M = FNMS(Ti, Tg, T1L); - Tl = Te + Tk; - T3A = Te - Tk; - T1N = T1K - T1M; - T3k = T1K + T1M; - } - } - { - E To, Tr, Tp, T1P, Tu, Tx, Tv, T1R, Tn, Tt; - To = ri[WS(rs, 2)]; - Tr = ii[WS(rs, 2)]; - Tn = W[2]; - Tp = Tn * To; - T1P = Tn * Tr; - Tu = ri[WS(rs, 10)]; - Tx = ii[WS(rs, 10)]; - Tt = W[18]; - Tv = Tt * Tu; - T1R = Tt * Tx; - { - E Ts, T1Q, Ty, T1S, Tq, Tw; - Tq = W[3]; - Ts = FMA(Tq, Tr, Tp); - T1Q = FNMS(Tq, To, T1P); - Tw = W[19]; - Ty = FMA(Tw, Tx, Tv); - T1S = FNMS(Tw, Tu, T1R); - Tz = Ts + Ty; - T2V = T1Q + T1S; - T1T = T1Q - T1S; - T1U = Ts - Ty; - } - } - { - E TQ, TT, TR, T25, TW, TZ, TX, T27, TP, TV; - TQ = ri[WS(rs, 1)]; - TT = ii[WS(rs, 1)]; - TP = W[0]; - TR = TP * TQ; - T25 = TP * TT; - TW = ri[WS(rs, 9)]; - TZ = ii[WS(rs, 9)]; - TV = W[16]; - TX = TV * TW; - T27 = TV * TZ; - { - E TU, T26, T10, T28, TS, TY; - TS = W[1]; - TU = FMA(TS, TT, TR); - T26 = FNMS(TS, TQ, T25); - TY = W[17]; - T10 = FMA(TY, TZ, TX); - T28 = FNMS(TY, TW, T27); - T11 = TU + T10; - T30 = T26 + T28; - T29 = T26 - T28; - T2c = TU - T10; - } - } - { - E T13, T16, T14, T2d, T19, T1c, T1a, T2f, T12, T18; - T13 = ri[WS(rs, 5)]; - T16 = ii[WS(rs, 5)]; - T12 = W[8]; - T14 = T12 * T13; - T2d = T12 * T16; - T19 = ri[WS(rs, 13)]; - T1c = ii[WS(rs, 13)]; - T18 = W[24]; - T1a = T18 * T19; - T2f = T18 * T1c; - { - E T17, T2e, T1d, T2g, T15, T1b; - T15 = W[9]; - T17 = FMA(T15, T16, T14); - T2e = FNMS(T15, T13, T2d); - T1b = W[25]; - T1d = FMA(T1b, T1c, T1a); - T2g = FNMS(T1b, T19, T2f); - T1e = T17 + T1d; - T31 = T2e + T2g; - T2a = T17 - T1d; - T2h = T2e - T2g; - } - } - { - E TB, TE, TC, T1X, TH, TK, TI, T1Z, TA, TG; - TB = ri[WS(rs, 14)]; - TE = ii[WS(rs, 14)]; - TA = W[26]; - TC = TA * TB; - T1X = TA * TE; - TH = ri[WS(rs, 6)]; - TK = ii[WS(rs, 6)]; - TG = W[10]; - TI = TG * TH; - T1Z = TG * TK; - { - E TF, T1Y, TL, T20, TD, TJ; - TD = W[27]; - TF = FMA(TD, TE, TC); - T1Y = FNMS(TD, TB, T1X); - TJ = W[11]; - TL = FMA(TJ, TK, TI); - T20 = FNMS(TJ, TH, T1Z); - TM = TF + TL; - T2W = T1Y + T20; - T1W = TF - TL; - T21 = T1Y - T20; - } - } - { - E TO, T3e, T3q, T3s, T1H, T3r, T3h, T3i; - { - E Tm, TN, T3j, T3p; - Tm = T8 + Tl; - TN = Tz + TM; - TO = Tm + TN; - T3e = Tm - TN; - T3j = T2V + T2W; - T3p = T3k + T3o; - T3q = T3j + T3p; - T3s = T3p - T3j; - } - { - E T1f, T1G, T3f, T3g; - T1f = T11 + T1e; - T1G = T1s + T1F; - T1H = T1f + T1G; - T3r = T1G - T1f; - T3f = T30 + T31; - T3g = T35 + T36; - T3h = T3f - T3g; - T3i = T3f + T3g; - } - ri[WS(rs, 8)] = TO - T1H; - ii[WS(rs, 8)] = T3q - T3i; - ri[0] = TO + T1H; - ii[0] = T3i + T3q; - ri[WS(rs, 12)] = T3e - T3h; - ii[WS(rs, 12)] = T3s - T3r; - ri[WS(rs, 4)] = T3e + T3h; - ii[WS(rs, 4)] = T3r + T3s; - } - { - E T2Y, T3a, T3v, T3x, T33, T3b, T38, T3c; - { - E T2U, T2X, T3t, T3u; - T2U = T8 - Tl; - T2X = T2V - T2W; - T2Y = T2U + T2X; - T3a = T2U - T2X; - T3t = TM - Tz; - T3u = T3o - T3k; - T3v = T3t + T3u; - T3x = T3u - T3t; - } - { - E T2Z, T32, T34, T37; - T2Z = T11 - T1e; - T32 = T30 - T31; - T33 = T2Z + T32; - T3b = T32 - T2Z; - T34 = T1s - T1F; - T37 = T35 - T36; - T38 = T34 - T37; - T3c = T34 + T37; - } - { - E T39, T3w, T3d, T3y; - T39 = T33 + T38; - ri[WS(rs, 10)] = FNMS(KP707106781, T39, T2Y); - ri[WS(rs, 2)] = FMA(KP707106781, T39, T2Y); - T3w = T3b + T3c; - ii[WS(rs, 2)] = FMA(KP707106781, T3w, T3v); - ii[WS(rs, 10)] = FNMS(KP707106781, T3w, T3v); - T3d = T3b - T3c; - ri[WS(rs, 14)] = FNMS(KP707106781, T3d, T3a); - ri[WS(rs, 6)] = FMA(KP707106781, T3d, T3a); - T3y = T38 - T33; - ii[WS(rs, 6)] = FMA(KP707106781, T3y, T3x); - ii[WS(rs, 14)] = FNMS(KP707106781, T3y, T3x); - } - } - { - E T1O, T3B, T3H, T2E, T23, T3C, T2O, T2S, T2H, T3I, T2j, T2B, T2L, T2R, T2y; - E T2C; - { - E T1V, T22, T2b, T2i; - T1O = T1I - T1N; - T3B = T3z - T3A; - T3H = T3A + T3z; - T2E = T1I + T1N; - T1V = T1T - T1U; - T22 = T1W + T21; - T23 = T1V - T22; - T3C = T1V + T22; - { - E T2M, T2N, T2F, T2G; - T2M = T2r + T2w; - T2N = T2o - T2p; - T2O = FNMS(KP414213562, T2N, T2M); - T2S = FMA(KP414213562, T2M, T2N); - T2F = T1U + T1T; - T2G = T1W - T21; - T2H = T2F + T2G; - T3I = T2G - T2F; - } - T2b = T29 + T2a; - T2i = T2c - T2h; - T2j = FMA(KP414213562, T2i, T2b); - T2B = FNMS(KP414213562, T2b, T2i); - { - E T2J, T2K, T2q, T2x; - T2J = T2c + T2h; - T2K = T29 - T2a; - T2L = FMA(KP414213562, T2K, T2J); - T2R = FNMS(KP414213562, T2J, T2K); - T2q = T2o + T2p; - T2x = T2r - T2w; - T2y = FNMS(KP414213562, T2x, T2q); - T2C = FMA(KP414213562, T2q, T2x); - } - } - { - E T24, T2z, T3J, T3K; - T24 = FMA(KP707106781, T23, T1O); - T2z = T2j - T2y; - ri[WS(rs, 11)] = FNMS(KP923879532, T2z, T24); - ri[WS(rs, 3)] = FMA(KP923879532, T2z, T24); - T3J = FMA(KP707106781, T3I, T3H); - T3K = T2C - T2B; - ii[WS(rs, 3)] = FMA(KP923879532, T3K, T3J); - ii[WS(rs, 11)] = FNMS(KP923879532, T3K, T3J); - } - { - E T2A, T2D, T3L, T3M; - T2A = FNMS(KP707106781, T23, T1O); - T2D = T2B + T2C; - ri[WS(rs, 7)] = FNMS(KP923879532, T2D, T2A); - ri[WS(rs, 15)] = FMA(KP923879532, T2D, T2A); - T3L = FNMS(KP707106781, T3I, T3H); - T3M = T2j + T2y; - ii[WS(rs, 7)] = FNMS(KP923879532, T3M, T3L); - ii[WS(rs, 15)] = FMA(KP923879532, T3M, T3L); - } - { - E T2I, T2P, T3D, T3E; - T2I = FMA(KP707106781, T2H, T2E); - T2P = T2L + T2O; - ri[WS(rs, 9)] = FNMS(KP923879532, T2P, T2I); - ri[WS(rs, 1)] = FMA(KP923879532, T2P, T2I); - T3D = FMA(KP707106781, T3C, T3B); - T3E = T2R + T2S; - ii[WS(rs, 1)] = FMA(KP923879532, T3E, T3D); - ii[WS(rs, 9)] = FNMS(KP923879532, T3E, T3D); - } - { - E T2Q, T2T, T3F, T3G; - T2Q = FNMS(KP707106781, T2H, T2E); - T2T = T2R - T2S; - ri[WS(rs, 13)] = FNMS(KP923879532, T2T, T2Q); - ri[WS(rs, 5)] = FMA(KP923879532, T2T, T2Q); - T3F = FNMS(KP707106781, T3C, T3B); - T3G = T2O - T2L; - ii[WS(rs, 5)] = FMA(KP923879532, T3G, T3F); - ii[WS(rs, 13)] = FNMS(KP923879532, T3G, T3F); - } - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_FULL, 0, 16 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 16, "t1_16", twinstr, &GENUS, { 104, 30, 70, 0 }, 0, 0, 0 }; - -void X(codelet_t1_16) (planner *p) { - X(kdft_dit_register) (p, t1_16, &desc); -} -#else - -/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 16 -name t1_16 -include dft/scalar/t.h */ - -/* - * This function contains 174 FP additions, 84 FP multiplications, - * (or, 136 additions, 46 multiplications, 38 fused multiply/add), - * 52 stack variables, 3 constants, and 64 memory accesses - */ -#include "dft/scalar/t.h" - -static void t1_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - DK(KP382683432, +0.382683432365089771728459984030398866761344562); - DK(KP923879532, +0.923879532511286756128183189396788286822416626); - DK(KP707106781, +0.707106781186547524400844362104849039284835938); - { - INT m; - for (m = mb, W = W + (mb * 30); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 30, MAKE_VOLATILE_STRIDE(32, rs)) { - E T7, T37, T1t, T2U, Ti, T38, T1w, T2R, Tu, T2s, T1C, T2c, TF, T2t, T1H; - E T2d, T1f, T1q, T2B, T2C, T2D, T2E, T1Z, T2j, T24, T2k, TS, T13, T2w, T2x; - E T2y, T2z, T1O, T2g, T1T, T2h; - { - E T1, T2T, T6, T2S; - T1 = ri[0]; - T2T = ii[0]; - { - E T3, T5, T2, T4; - T3 = ri[WS(rs, 8)]; - T5 = ii[WS(rs, 8)]; - T2 = W[14]; - T4 = W[15]; - T6 = FMA(T2, T3, T4 * T5); - T2S = FNMS(T4, T3, T2 * T5); - } - T7 = T1 + T6; - T37 = T2T - T2S; - T1t = T1 - T6; - T2U = T2S + T2T; - } - { - E Tc, T1u, Th, T1v; - { - E T9, Tb, T8, Ta; - T9 = ri[WS(rs, 4)]; - Tb = ii[WS(rs, 4)]; - T8 = W[6]; - Ta = W[7]; - Tc = FMA(T8, T9, Ta * Tb); - T1u = FNMS(Ta, T9, T8 * Tb); - } - { - E Te, Tg, Td, Tf; - Te = ri[WS(rs, 12)]; - Tg = ii[WS(rs, 12)]; - Td = W[22]; - Tf = W[23]; - Th = FMA(Td, Te, Tf * Tg); - T1v = FNMS(Tf, Te, Td * Tg); - } - Ti = Tc + Th; - T38 = Tc - Th; - T1w = T1u - T1v; - T2R = T1u + T1v; - } - { - E To, T1y, Tt, T1z, T1A, T1B; - { - E Tl, Tn, Tk, Tm; - Tl = ri[WS(rs, 2)]; - Tn = ii[WS(rs, 2)]; - Tk = W[2]; - Tm = W[3]; - To = FMA(Tk, Tl, Tm * Tn); - T1y = FNMS(Tm, Tl, Tk * Tn); - } - { - E Tq, Ts, Tp, Tr; - Tq = ri[WS(rs, 10)]; - Ts = ii[WS(rs, 10)]; - Tp = W[18]; - Tr = W[19]; - Tt = FMA(Tp, Tq, Tr * Ts); - T1z = FNMS(Tr, Tq, Tp * Ts); - } - Tu = To + Tt; - T2s = T1y + T1z; - T1A = T1y - T1z; - T1B = To - Tt; - T1C = T1A - T1B; - T2c = T1B + T1A; - } - { - E Tz, T1E, TE, T1F, T1D, T1G; - { - E Tw, Ty, Tv, Tx; - Tw = ri[WS(rs, 14)]; - Ty = ii[WS(rs, 14)]; - Tv = W[26]; - Tx = W[27]; - Tz = FMA(Tv, Tw, Tx * Ty); - T1E = FNMS(Tx, Tw, Tv * Ty); - } - { - E TB, TD, TA, TC; - TB = ri[WS(rs, 6)]; - TD = ii[WS(rs, 6)]; - TA = W[10]; - TC = W[11]; - TE = FMA(TA, TB, TC * TD); - T1F = FNMS(TC, TB, TA * TD); - } - TF = Tz + TE; - T2t = T1E + T1F; - T1D = Tz - TE; - T1G = T1E - T1F; - T1H = T1D + T1G; - T2d = T1D - T1G; - } - { - E T19, T20, T1p, T1X, T1e, T21, T1k, T1W; - { - E T16, T18, T15, T17; - T16 = ri[WS(rs, 15)]; - T18 = ii[WS(rs, 15)]; - T15 = W[28]; - T17 = W[29]; - T19 = FMA(T15, T16, T17 * T18); - T20 = FNMS(T17, T16, T15 * T18); - } - { - E T1m, T1o, T1l, T1n; - T1m = ri[WS(rs, 11)]; - T1o = ii[WS(rs, 11)]; - T1l = W[20]; - T1n = W[21]; - T1p = FMA(T1l, T1m, T1n * T1o); - T1X = FNMS(T1n, T1m, T1l * T1o); - } - { - E T1b, T1d, T1a, T1c; - T1b = ri[WS(rs, 7)]; - T1d = ii[WS(rs, 7)]; - T1a = W[12]; - T1c = W[13]; - T1e = FMA(T1a, T1b, T1c * T1d); - T21 = FNMS(T1c, T1b, T1a * T1d); - } - { - E T1h, T1j, T1g, T1i; - T1h = ri[WS(rs, 3)]; - T1j = ii[WS(rs, 3)]; - T1g = W[4]; - T1i = W[5]; - T1k = FMA(T1g, T1h, T1i * T1j); - T1W = FNMS(T1i, T1h, T1g * T1j); - } - T1f = T19 + T1e; - T1q = T1k + T1p; - T2B = T1f - T1q; - T2C = T20 + T21; - T2D = T1W + T1X; - T2E = T2C - T2D; - { - E T1V, T1Y, T22, T23; - T1V = T19 - T1e; - T1Y = T1W - T1X; - T1Z = T1V - T1Y; - T2j = T1V + T1Y; - T22 = T20 - T21; - T23 = T1k - T1p; - T24 = T22 + T23; - T2k = T22 - T23; - } - } - { - E TM, T1K, T12, T1R, TR, T1L, TX, T1Q; - { - E TJ, TL, TI, TK; - TJ = ri[WS(rs, 1)]; - TL = ii[WS(rs, 1)]; - TI = W[0]; - TK = W[1]; - TM = FMA(TI, TJ, TK * TL); - T1K = FNMS(TK, TJ, TI * TL); - } - { - E TZ, T11, TY, T10; - TZ = ri[WS(rs, 13)]; - T11 = ii[WS(rs, 13)]; - TY = W[24]; - T10 = W[25]; - T12 = FMA(TY, TZ, T10 * T11); - T1R = FNMS(T10, TZ, TY * T11); - } - { - E TO, TQ, TN, TP; - TO = ri[WS(rs, 9)]; - TQ = ii[WS(rs, 9)]; - TN = W[16]; - TP = W[17]; - TR = FMA(TN, TO, TP * TQ); - T1L = FNMS(TP, TO, TN * TQ); - } - { - E TU, TW, TT, TV; - TU = ri[WS(rs, 5)]; - TW = ii[WS(rs, 5)]; - TT = W[8]; - TV = W[9]; - TX = FMA(TT, TU, TV * TW); - T1Q = FNMS(TV, TU, TT * TW); - } - TS = TM + TR; - T13 = TX + T12; - T2w = TS - T13; - T2x = T1K + T1L; - T2y = T1Q + T1R; - T2z = T2x - T2y; - { - E T1M, T1N, T1P, T1S; - T1M = T1K - T1L; - T1N = TX - T12; - T1O = T1M + T1N; - T2g = T1M - T1N; - T1P = TM - TR; - T1S = T1Q - T1R; - T1T = T1P - T1S; - T2h = T1P + T1S; - } - } - { - E T1J, T27, T3g, T3i, T26, T3h, T2a, T3d; - { - E T1x, T1I, T3e, T3f; - T1x = T1t - T1w; - T1I = KP707106781 * (T1C - T1H); - T1J = T1x + T1I; - T27 = T1x - T1I; - T3e = KP707106781 * (T2d - T2c); - T3f = T38 + T37; - T3g = T3e + T3f; - T3i = T3f - T3e; - } - { - E T1U, T25, T28, T29; - T1U = FMA(KP923879532, T1O, KP382683432 * T1T); - T25 = FNMS(KP923879532, T24, KP382683432 * T1Z); - T26 = T1U + T25; - T3h = T25 - T1U; - T28 = FNMS(KP923879532, T1T, KP382683432 * T1O); - T29 = FMA(KP382683432, T24, KP923879532 * T1Z); - T2a = T28 - T29; - T3d = T28 + T29; - } - ri[WS(rs, 11)] = T1J - T26; - ii[WS(rs, 11)] = T3g - T3d; - ri[WS(rs, 3)] = T1J + T26; - ii[WS(rs, 3)] = T3d + T3g; - ri[WS(rs, 15)] = T27 - T2a; - ii[WS(rs, 15)] = T3i - T3h; - ri[WS(rs, 7)] = T27 + T2a; - ii[WS(rs, 7)] = T3h + T3i; - } - { - E T2v, T2H, T32, T34, T2G, T33, T2K, T2Z; - { - E T2r, T2u, T30, T31; - T2r = T7 - Ti; - T2u = T2s - T2t; - T2v = T2r + T2u; - T2H = T2r - T2u; - T30 = TF - Tu; - T31 = T2U - T2R; - T32 = T30 + T31; - T34 = T31 - T30; - } - { - E T2A, T2F, T2I, T2J; - T2A = T2w + T2z; - T2F = T2B - T2E; - T2G = KP707106781 * (T2A + T2F); - T33 = KP707106781 * (T2F - T2A); - T2I = T2z - T2w; - T2J = T2B + T2E; - T2K = KP707106781 * (T2I - T2J); - T2Z = KP707106781 * (T2I + T2J); - } - ri[WS(rs, 10)] = T2v - T2G; - ii[WS(rs, 10)] = T32 - T2Z; - ri[WS(rs, 2)] = T2v + T2G; - ii[WS(rs, 2)] = T2Z + T32; - ri[WS(rs, 14)] = T2H - T2K; - ii[WS(rs, 14)] = T34 - T33; - ri[WS(rs, 6)] = T2H + T2K; - ii[WS(rs, 6)] = T33 + T34; - } - { - E T2f, T2n, T3a, T3c, T2m, T3b, T2q, T35; - { - E T2b, T2e, T36, T39; - T2b = T1t + T1w; - T2e = KP707106781 * (T2c + T2d); - T2f = T2b + T2e; - T2n = T2b - T2e; - T36 = KP707106781 * (T1C + T1H); - T39 = T37 - T38; - T3a = T36 + T39; - T3c = T39 - T36; - } - { - E T2i, T2l, T2o, T2p; - T2i = FMA(KP382683432, T2g, KP923879532 * T2h); - T2l = FNMS(KP382683432, T2k, KP923879532 * T2j); - T2m = T2i + T2l; - T3b = T2l - T2i; - T2o = FNMS(KP382683432, T2h, KP923879532 * T2g); - T2p = FMA(KP923879532, T2k, KP382683432 * T2j); - T2q = T2o - T2p; - T35 = T2o + T2p; - } - ri[WS(rs, 9)] = T2f - T2m; - ii[WS(rs, 9)] = T3a - T35; - ri[WS(rs, 1)] = T2f + T2m; - ii[WS(rs, 1)] = T35 + T3a; - ri[WS(rs, 13)] = T2n - T2q; - ii[WS(rs, 13)] = T3c - T3b; - ri[WS(rs, 5)] = T2n + T2q; - ii[WS(rs, 5)] = T3b + T3c; - } - { - E TH, T2L, T2W, T2Y, T1s, T2X, T2O, T2P; - { - E Tj, TG, T2Q, T2V; - Tj = T7 + Ti; - TG = Tu + TF; - TH = Tj + TG; - T2L = Tj - TG; - T2Q = T2s + T2t; - T2V = T2R + T2U; - T2W = T2Q + T2V; - T2Y = T2V - T2Q; - } - { - E T14, T1r, T2M, T2N; - T14 = TS + T13; - T1r = T1f + T1q; - T1s = T14 + T1r; - T2X = T1r - T14; - T2M = T2x + T2y; - T2N = T2C + T2D; - T2O = T2M - T2N; - T2P = T2M + T2N; - } - ri[WS(rs, 8)] = TH - T1s; - ii[WS(rs, 8)] = T2W - T2P; - ri[0] = TH + T1s; - ii[0] = T2P + T2W; - ri[WS(rs, 12)] = T2L - T2O; - ii[WS(rs, 12)] = T2Y - T2X; - ri[WS(rs, 4)] = T2L + T2O; - ii[WS(rs, 4)] = T2X + T2Y; - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_FULL, 0, 16 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 16, "t1_16", twinstr, &GENUS, { 136, 46, 38, 0 }, 0, 0, 0 }; - -void X(codelet_t1_16) (planner *p) { - X(kdft_dit_register) (p, t1_16, &desc); -} -#endif diff --git a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t1_20.c b/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t1_20.c deleted file mode 100644 index 6bd86f34..00000000 --- a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t1_20.c +++ /dev/null @@ -1,1050 +0,0 @@ -/* - * Copyright (c) 2003, 2007-14 Matteo Frigo - * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - */ - -/* This file was automatically generated --- DO NOT EDIT */ -/* Generated on Tue Sep 14 10:44:29 EDT 2021 */ - -#include "dft/codelet-dft.h" - -#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA) - -/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 20 -name t1_20 -include dft/scalar/t.h */ - -/* - * This function contains 246 FP additions, 148 FP multiplications, - * (or, 136 additions, 38 multiplications, 110 fused multiply/add), - * 61 stack variables, 4 constants, and 80 memory accesses - */ -#include "dft/scalar/t.h" - -static void t1_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - DK(KP951056516, +0.951056516295153572116439333379382143405698634); - DK(KP559016994, +0.559016994374947424102293417182819058860154590); - DK(KP250000000, +0.250000000000000000000000000000000000000000000); - DK(KP618033988, +0.618033988749894848204586834365638117720309180); - { - INT m; - for (m = mb, W = W + (mb * 38); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 38, MAKE_VOLATILE_STRIDE(40, rs)) { - E T8, T4N, T2i, T4r, Tl, T4O, T2n, T4n, TN, T2b, T40, T4b, T2v, T3v, T3i; - E T3F, T27, T2f, T3W, T4f, T2R, T3z, T3a, T3J, T1G, T2e, T3T, T4e, T2K, T3y; - E T33, T3I, T1e, T2c, T43, T4c, T2C, T3w, T3p, T3G; - { - E T1, T4q, T3, T6, T4, T4o, T2, T7, T4p, T5; - T1 = ri[0]; - T4q = ii[0]; - T3 = ri[WS(rs, 10)]; - T6 = ii[WS(rs, 10)]; - T2 = W[18]; - T4 = T2 * T3; - T4o = T2 * T6; - T5 = W[19]; - T7 = FMA(T5, T6, T4); - T4p = FNMS(T5, T3, T4o); - T8 = T1 + T7; - T4N = T4q - T4p; - T2i = T1 - T7; - T4r = T4p + T4q; - } - { - E Ta, Td, Tb, T2j, Tg, Tj, Th, T2l, T9, Tf; - Ta = ri[WS(rs, 5)]; - Td = ii[WS(rs, 5)]; - T9 = W[8]; - Tb = T9 * Ta; - T2j = T9 * Td; - Tg = ri[WS(rs, 15)]; - Tj = ii[WS(rs, 15)]; - Tf = W[28]; - Th = Tf * Tg; - T2l = Tf * Tj; - { - E Te, T2k, Tk, T2m, Tc, Ti; - Tc = W[9]; - Te = FMA(Tc, Td, Tb); - T2k = FNMS(Tc, Ta, T2j); - Ti = W[29]; - Tk = FMA(Ti, Tj, Th); - T2m = FNMS(Ti, Tg, T2l); - Tl = Te + Tk; - T4O = Te - Tk; - T2n = T2k - T2m; - T4n = T2k + T2m; - } - } - { - E Ts, T3d, TL, T2t, Ty, T3f, TF, T2r; - { - E To, Tr, Tp, T3c, Tn, Tq; - To = ri[WS(rs, 4)]; - Tr = ii[WS(rs, 4)]; - Tn = W[6]; - Tp = Tn * To; - T3c = Tn * Tr; - Tq = W[7]; - Ts = FMA(Tq, Tr, Tp); - T3d = FNMS(Tq, To, T3c); - } - { - E TH, TK, TI, T2s, TG, TJ; - TH = ri[WS(rs, 19)]; - TK = ii[WS(rs, 19)]; - TG = W[36]; - TI = TG * TH; - T2s = TG * TK; - TJ = W[37]; - TL = FMA(TJ, TK, TI); - T2t = FNMS(TJ, TH, T2s); - } - { - E Tu, Tx, Tv, T3e, Tt, Tw; - Tu = ri[WS(rs, 14)]; - Tx = ii[WS(rs, 14)]; - Tt = W[26]; - Tv = Tt * Tu; - T3e = Tt * Tx; - Tw = W[27]; - Ty = FMA(Tw, Tx, Tv); - T3f = FNMS(Tw, Tu, T3e); - } - { - E TB, TE, TC, T2q, TA, TD; - TB = ri[WS(rs, 9)]; - TE = ii[WS(rs, 9)]; - TA = W[16]; - TC = TA * TB; - T2q = TA * TE; - TD = W[17]; - TF = FMA(TD, TE, TC); - T2r = FNMS(TD, TB, T2q); - } - { - E Tz, TM, T3Y, T3Z; - Tz = Ts + Ty; - TM = TF + TL; - TN = Tz - TM; - T2b = Tz + TM; - T3Y = T3d + T3f; - T3Z = T2r + T2t; - T40 = T3Y - T3Z; - T4b = T3Y + T3Z; - } - { - E T2p, T2u, T3g, T3h; - T2p = Ts - Ty; - T2u = T2r - T2t; - T2v = T2p - T2u; - T3v = T2p + T2u; - T3g = T3d - T3f; - T3h = TF - TL; - T3i = T3g + T3h; - T3F = T3g - T3h; - } - } - { - E T1M, T35, T25, T2P, T1S, T37, T1Z, T2N; - { - E T1I, T1L, T1J, T34, T1H, T1K; - T1I = ri[WS(rs, 12)]; - T1L = ii[WS(rs, 12)]; - T1H = W[22]; - T1J = T1H * T1I; - T34 = T1H * T1L; - T1K = W[23]; - T1M = FMA(T1K, T1L, T1J); - T35 = FNMS(T1K, T1I, T34); - } - { - E T21, T24, T22, T2O, T20, T23; - T21 = ri[WS(rs, 7)]; - T24 = ii[WS(rs, 7)]; - T20 = W[12]; - T22 = T20 * T21; - T2O = T20 * T24; - T23 = W[13]; - T25 = FMA(T23, T24, T22); - T2P = FNMS(T23, T21, T2O); - } - { - E T1O, T1R, T1P, T36, T1N, T1Q; - T1O = ri[WS(rs, 2)]; - T1R = ii[WS(rs, 2)]; - T1N = W[2]; - T1P = T1N * T1O; - T36 = T1N * T1R; - T1Q = W[3]; - T1S = FMA(T1Q, T1R, T1P); - T37 = FNMS(T1Q, T1O, T36); - } - { - E T1V, T1Y, T1W, T2M, T1U, T1X; - T1V = ri[WS(rs, 17)]; - T1Y = ii[WS(rs, 17)]; - T1U = W[32]; - T1W = T1U * T1V; - T2M = T1U * T1Y; - T1X = W[33]; - T1Z = FMA(T1X, T1Y, T1W); - T2N = FNMS(T1X, T1V, T2M); - } - { - E T1T, T26, T3U, T3V; - T1T = T1M + T1S; - T26 = T1Z + T25; - T27 = T1T - T26; - T2f = T1T + T26; - T3U = T35 + T37; - T3V = T2N + T2P; - T3W = T3U - T3V; - T4f = T3U + T3V; - } - { - E T2L, T2Q, T38, T39; - T2L = T1M - T1S; - T2Q = T2N - T2P; - T2R = T2L - T2Q; - T3z = T2L + T2Q; - T38 = T35 - T37; - T39 = T1Z - T25; - T3a = T38 + T39; - T3J = T38 - T39; - } - } - { - E T1l, T2Y, T1E, T2I, T1r, T30, T1y, T2G; - { - E T1h, T1k, T1i, T2X, T1g, T1j; - T1h = ri[WS(rs, 8)]; - T1k = ii[WS(rs, 8)]; - T1g = W[14]; - T1i = T1g * T1h; - T2X = T1g * T1k; - T1j = W[15]; - T1l = FMA(T1j, T1k, T1i); - T2Y = FNMS(T1j, T1h, T2X); - } - { - E T1A, T1D, T1B, T2H, T1z, T1C; - T1A = ri[WS(rs, 3)]; - T1D = ii[WS(rs, 3)]; - T1z = W[4]; - T1B = T1z * T1A; - T2H = T1z * T1D; - T1C = W[5]; - T1E = FMA(T1C, T1D, T1B); - T2I = FNMS(T1C, T1A, T2H); - } - { - E T1n, T1q, T1o, T2Z, T1m, T1p; - T1n = ri[WS(rs, 18)]; - T1q = ii[WS(rs, 18)]; - T1m = W[34]; - T1o = T1m * T1n; - T2Z = T1m * T1q; - T1p = W[35]; - T1r = FMA(T1p, T1q, T1o); - T30 = FNMS(T1p, T1n, T2Z); - } - { - E T1u, T1x, T1v, T2F, T1t, T1w; - T1u = ri[WS(rs, 13)]; - T1x = ii[WS(rs, 13)]; - T1t = W[24]; - T1v = T1t * T1u; - T2F = T1t * T1x; - T1w = W[25]; - T1y = FMA(T1w, T1x, T1v); - T2G = FNMS(T1w, T1u, T2F); - } - { - E T1s, T1F, T3R, T3S; - T1s = T1l + T1r; - T1F = T1y + T1E; - T1G = T1s - T1F; - T2e = T1s + T1F; - T3R = T2Y + T30; - T3S = T2G + T2I; - T3T = T3R - T3S; - T4e = T3R + T3S; - } - { - E T2E, T2J, T31, T32; - T2E = T1l - T1r; - T2J = T2G - T2I; - T2K = T2E - T2J; - T3y = T2E + T2J; - T31 = T2Y - T30; - T32 = T1y - T1E; - T33 = T31 + T32; - T3I = T31 - T32; - } - } - { - E TT, T3k, T1c, T2A, TZ, T3m, T16, T2y; - { - E TP, TS, TQ, T3j, TO, TR; - TP = ri[WS(rs, 16)]; - TS = ii[WS(rs, 16)]; - TO = W[30]; - TQ = TO * TP; - T3j = TO * TS; - TR = W[31]; - TT = FMA(TR, TS, TQ); - T3k = FNMS(TR, TP, T3j); - } - { - E T18, T1b, T19, T2z, T17, T1a; - T18 = ri[WS(rs, 11)]; - T1b = ii[WS(rs, 11)]; - T17 = W[20]; - T19 = T17 * T18; - T2z = T17 * T1b; - T1a = W[21]; - T1c = FMA(T1a, T1b, T19); - T2A = FNMS(T1a, T18, T2z); - } - { - E TV, TY, TW, T3l, TU, TX; - TV = ri[WS(rs, 6)]; - TY = ii[WS(rs, 6)]; - TU = W[10]; - TW = TU * TV; - T3l = TU * TY; - TX = W[11]; - TZ = FMA(TX, TY, TW); - T3m = FNMS(TX, TV, T3l); - } - { - E T12, T15, T13, T2x, T11, T14; - T12 = ri[WS(rs, 1)]; - T15 = ii[WS(rs, 1)]; - T11 = W[0]; - T13 = T11 * T12; - T2x = T11 * T15; - T14 = W[1]; - T16 = FMA(T14, T15, T13); - T2y = FNMS(T14, T12, T2x); - } - { - E T10, T1d, T41, T42; - T10 = TT + TZ; - T1d = T16 + T1c; - T1e = T10 - T1d; - T2c = T10 + T1d; - T41 = T3k + T3m; - T42 = T2y + T2A; - T43 = T41 - T42; - T4c = T41 + T42; - } - { - E T2w, T2B, T3n, T3o; - T2w = TT - TZ; - T2B = T2y - T2A; - T2C = T2w - T2B; - T3w = T2w + T2B; - T3n = T3k - T3m; - T3o = T16 - T1c; - T3p = T3n + T3o; - T3G = T3n - T3o; - } - } - { - E T45, T47, Tm, T29, T3O, T3P, T46, T3Q; - { - E T3X, T44, T1f, T28; - T3X = T3T - T3W; - T44 = T40 - T43; - T45 = FNMS(KP618033988, T44, T3X); - T47 = FMA(KP618033988, T3X, T44); - Tm = T8 - Tl; - T1f = TN + T1e; - T28 = T1G + T27; - T29 = T1f + T28; - T3O = FNMS(KP250000000, T29, Tm); - T3P = T1f - T28; - } - ri[WS(rs, 10)] = Tm + T29; - T46 = FMA(KP559016994, T3P, T3O); - ri[WS(rs, 14)] = FNMS(KP951056516, T47, T46); - ri[WS(rs, 6)] = FMA(KP951056516, T47, T46); - T3Q = FNMS(KP559016994, T3P, T3O); - ri[WS(rs, 2)] = FNMS(KP951056516, T45, T3Q); - ri[WS(rs, 18)] = FMA(KP951056516, T45, T3Q); - } - { - E T4K, T4M, T4B, T4E, T4F, T4G, T4L, T4H; - { - E T4I, T4J, T4C, T4D; - T4I = T1G - T27; - T4J = TN - T1e; - T4K = FNMS(KP618033988, T4J, T4I); - T4M = FMA(KP618033988, T4I, T4J); - T4B = T4r - T4n; - T4C = T40 + T43; - T4D = T3T + T3W; - T4E = T4C + T4D; - T4F = FNMS(KP250000000, T4E, T4B); - T4G = T4C - T4D; - } - ii[WS(rs, 10)] = T4E + T4B; - T4L = FMA(KP559016994, T4G, T4F); - ii[WS(rs, 6)] = FNMS(KP951056516, T4M, T4L); - ii[WS(rs, 14)] = FMA(KP951056516, T4M, T4L); - T4H = FNMS(KP559016994, T4G, T4F); - ii[WS(rs, 2)] = FMA(KP951056516, T4K, T4H); - ii[WS(rs, 18)] = FNMS(KP951056516, T4K, T4H); - } - { - E T4h, T4j, T2a, T2h, T48, T49, T4i, T4a; - { - E T4d, T4g, T2d, T2g; - T4d = T4b - T4c; - T4g = T4e - T4f; - T4h = FMA(KP618033988, T4g, T4d); - T4j = FNMS(KP618033988, T4d, T4g); - T2a = T8 + Tl; - T2d = T2b + T2c; - T2g = T2e + T2f; - T2h = T2d + T2g; - T48 = FNMS(KP250000000, T2h, T2a); - T49 = T2d - T2g; - } - ri[0] = T2a + T2h; - T4i = FNMS(KP559016994, T49, T48); - ri[WS(rs, 12)] = FNMS(KP951056516, T4j, T4i); - ri[WS(rs, 8)] = FMA(KP951056516, T4j, T4i); - T4a = FMA(KP559016994, T49, T48); - ri[WS(rs, 4)] = FNMS(KP951056516, T4h, T4a); - ri[WS(rs, 16)] = FMA(KP951056516, T4h, T4a); - } - { - E T4y, T4A, T4s, T4m, T4t, T4u, T4z, T4v; - { - E T4w, T4x, T4k, T4l; - T4w = T2b - T2c; - T4x = T2e - T2f; - T4y = FMA(KP618033988, T4x, T4w); - T4A = FNMS(KP618033988, T4w, T4x); - T4s = T4n + T4r; - T4k = T4b + T4c; - T4l = T4e + T4f; - T4m = T4k + T4l; - T4t = FNMS(KP250000000, T4m, T4s); - T4u = T4k - T4l; - } - ii[0] = T4m + T4s; - T4z = FNMS(KP559016994, T4u, T4t); - ii[WS(rs, 8)] = FNMS(KP951056516, T4A, T4z); - ii[WS(rs, 12)] = FMA(KP951056516, T4A, T4z); - T4v = FMA(KP559016994, T4u, T4t); - ii[WS(rs, 4)] = FMA(KP951056516, T4y, T4v); - ii[WS(rs, 16)] = FNMS(KP951056516, T4y, T4v); - } - { - E T3r, T3t, T2o, T2T, T2U, T2V, T3s, T2W; - { - E T3b, T3q, T2D, T2S; - T3b = T33 - T3a; - T3q = T3i - T3p; - T3r = FNMS(KP618033988, T3q, T3b); - T3t = FMA(KP618033988, T3b, T3q); - T2o = T2i - T2n; - T2D = T2v + T2C; - T2S = T2K + T2R; - T2T = T2D + T2S; - T2U = FNMS(KP250000000, T2T, T2o); - T2V = T2D - T2S; - } - ri[WS(rs, 15)] = T2o + T2T; - T3s = FMA(KP559016994, T2V, T2U); - ri[WS(rs, 11)] = FMA(KP951056516, T3t, T3s); - ri[WS(rs, 19)] = FNMS(KP951056516, T3t, T3s); - T2W = FNMS(KP559016994, T2V, T2U); - ri[WS(rs, 3)] = FMA(KP951056516, T3r, T2W); - ri[WS(rs, 7)] = FNMS(KP951056516, T3r, T2W); - } - { - E T5a, T5c, T51, T54, T55, T56, T5b, T57; - { - E T58, T59, T52, T53; - T58 = T2K - T2R; - T59 = T2v - T2C; - T5a = FNMS(KP618033988, T59, T58); - T5c = FMA(KP618033988, T58, T59); - T51 = T4O + T4N; - T52 = T3i + T3p; - T53 = T33 + T3a; - T54 = T52 + T53; - T55 = FNMS(KP250000000, T54, T51); - T56 = T52 - T53; - } - ii[WS(rs, 15)] = T54 + T51; - T5b = FMA(KP559016994, T56, T55); - ii[WS(rs, 11)] = FNMS(KP951056516, T5c, T5b); - ii[WS(rs, 19)] = FMA(KP951056516, T5c, T5b); - T57 = FNMS(KP559016994, T56, T55); - ii[WS(rs, 3)] = FNMS(KP951056516, T5a, T57); - ii[WS(rs, 7)] = FMA(KP951056516, T5a, T57); - } - { - E T3L, T3N, T3u, T3B, T3C, T3D, T3M, T3E; - { - E T3H, T3K, T3x, T3A; - T3H = T3F - T3G; - T3K = T3I - T3J; - T3L = FMA(KP618033988, T3K, T3H); - T3N = FNMS(KP618033988, T3H, T3K); - T3u = T2i + T2n; - T3x = T3v + T3w; - T3A = T3y + T3z; - T3B = T3x + T3A; - T3C = FNMS(KP250000000, T3B, T3u); - T3D = T3x - T3A; - } - ri[WS(rs, 5)] = T3u + T3B; - T3M = FNMS(KP559016994, T3D, T3C); - ri[WS(rs, 13)] = FMA(KP951056516, T3N, T3M); - ri[WS(rs, 17)] = FNMS(KP951056516, T3N, T3M); - T3E = FMA(KP559016994, T3D, T3C); - ri[WS(rs, 1)] = FMA(KP951056516, T3L, T3E); - ri[WS(rs, 9)] = FNMS(KP951056516, T3L, T3E); - } - { - E T4Y, T50, T4P, T4S, T4T, T4U, T4Z, T4V; - { - E T4W, T4X, T4Q, T4R; - T4W = T3v - T3w; - T4X = T3y - T3z; - T4Y = FMA(KP618033988, T4X, T4W); - T50 = FNMS(KP618033988, T4W, T4X); - T4P = T4N - T4O; - T4Q = T3F + T3G; - T4R = T3I + T3J; - T4S = T4Q + T4R; - T4T = FNMS(KP250000000, T4S, T4P); - T4U = T4Q - T4R; - } - ii[WS(rs, 5)] = T4S + T4P; - T4Z = FNMS(KP559016994, T4U, T4T); - ii[WS(rs, 13)] = FNMS(KP951056516, T50, T4Z); - ii[WS(rs, 17)] = FMA(KP951056516, T50, T4Z); - T4V = FMA(KP559016994, T4U, T4T); - ii[WS(rs, 1)] = FNMS(KP951056516, T4Y, T4V); - ii[WS(rs, 9)] = FMA(KP951056516, T4Y, T4V); - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_FULL, 0, 20 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 20, "t1_20", twinstr, &GENUS, { 136, 38, 110, 0 }, 0, 0, 0 }; - -void X(codelet_t1_20) (planner *p) { - X(kdft_dit_register) (p, t1_20, &desc); -} -#else - -/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 20 -name t1_20 -include dft/scalar/t.h */ - -/* - * This function contains 246 FP additions, 124 FP multiplications, - * (or, 184 additions, 62 multiplications, 62 fused multiply/add), - * 85 stack variables, 4 constants, and 80 memory accesses - */ -#include "dft/scalar/t.h" - -static void t1_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - DK(KP587785252, +0.587785252292473129168705954639072768597652438); - DK(KP951056516, +0.951056516295153572116439333379382143405698634); - DK(KP250000000, +0.250000000000000000000000000000000000000000000); - DK(KP559016994, +0.559016994374947424102293417182819058860154590); - { - INT m; - for (m = mb, W = W + (mb * 38); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 38, MAKE_VOLATILE_STRIDE(40, rs)) { - E Tj, T1R, T4g, T4p, T2q, T37, T3Q, T42, T1r, T1O, T1P, T3i, T3l, T44, T3D; - E T3E, T3K, T1V, T1W, T1X, T23, T28, T4r, T2W, T2X, T4c, T33, T34, T35, T2G; - E T2L, T2M, TG, T13, T14, T3p, T3s, T43, T3A, T3B, T3J, T1S, T1T, T1U, T2e; - E T2j, T4q, T2T, T2U, T4b, T30, T31, T32, T2v, T2A, T2B; - { - E T1, T3O, T6, T3N, Tc, T2n, Th, T2o; - T1 = ri[0]; - T3O = ii[0]; - { - E T3, T5, T2, T4; - T3 = ri[WS(rs, 10)]; - T5 = ii[WS(rs, 10)]; - T2 = W[18]; - T4 = W[19]; - T6 = FMA(T2, T3, T4 * T5); - T3N = FNMS(T4, T3, T2 * T5); - } - { - E T9, Tb, T8, Ta; - T9 = ri[WS(rs, 5)]; - Tb = ii[WS(rs, 5)]; - T8 = W[8]; - Ta = W[9]; - Tc = FMA(T8, T9, Ta * Tb); - T2n = FNMS(Ta, T9, T8 * Tb); - } - { - E Te, Tg, Td, Tf; - Te = ri[WS(rs, 15)]; - Tg = ii[WS(rs, 15)]; - Td = W[28]; - Tf = W[29]; - Th = FMA(Td, Te, Tf * Tg); - T2o = FNMS(Tf, Te, Td * Tg); - } - { - E T7, Ti, T4e, T4f; - T7 = T1 + T6; - Ti = Tc + Th; - Tj = T7 - Ti; - T1R = T7 + Ti; - T4e = T3O - T3N; - T4f = Tc - Th; - T4g = T4e - T4f; - T4p = T4f + T4e; - } - { - E T2m, T2p, T3M, T3P; - T2m = T1 - T6; - T2p = T2n - T2o; - T2q = T2m - T2p; - T37 = T2m + T2p; - T3M = T2n + T2o; - T3P = T3N + T3O; - T3Q = T3M + T3P; - T42 = T3P - T3M; - } - } - { - E T1f, T3g, T21, T2C, T1N, T3k, T27, T2K, T1q, T3h, T22, T2F, T1C, T3j, T26; - E T2H; - { - E T19, T1Z, T1e, T20; - { - E T16, T18, T15, T17; - T16 = ri[WS(rs, 8)]; - T18 = ii[WS(rs, 8)]; - T15 = W[14]; - T17 = W[15]; - T19 = FMA(T15, T16, T17 * T18); - T1Z = FNMS(T17, T16, T15 * T18); - } - { - E T1b, T1d, T1a, T1c; - T1b = ri[WS(rs, 18)]; - T1d = ii[WS(rs, 18)]; - T1a = W[34]; - T1c = W[35]; - T1e = FMA(T1a, T1b, T1c * T1d); - T20 = FNMS(T1c, T1b, T1a * T1d); - } - T1f = T19 + T1e; - T3g = T1Z + T20; - T21 = T1Z - T20; - T2C = T19 - T1e; - } - { - E T1H, T2I, T1M, T2J; - { - E T1E, T1G, T1D, T1F; - T1E = ri[WS(rs, 17)]; - T1G = ii[WS(rs, 17)]; - T1D = W[32]; - T1F = W[33]; - T1H = FMA(T1D, T1E, T1F * T1G); - T2I = FNMS(T1F, T1E, T1D * T1G); - } - { - E T1J, T1L, T1I, T1K; - T1J = ri[WS(rs, 7)]; - T1L = ii[WS(rs, 7)]; - T1I = W[12]; - T1K = W[13]; - T1M = FMA(T1I, T1J, T1K * T1L); - T2J = FNMS(T1K, T1J, T1I * T1L); - } - T1N = T1H + T1M; - T3k = T2I + T2J; - T27 = T1H - T1M; - T2K = T2I - T2J; - } - { - E T1k, T2D, T1p, T2E; - { - E T1h, T1j, T1g, T1i; - T1h = ri[WS(rs, 13)]; - T1j = ii[WS(rs, 13)]; - T1g = W[24]; - T1i = W[25]; - T1k = FMA(T1g, T1h, T1i * T1j); - T2D = FNMS(T1i, T1h, T1g * T1j); - } - { - E T1m, T1o, T1l, T1n; - T1m = ri[WS(rs, 3)]; - T1o = ii[WS(rs, 3)]; - T1l = W[4]; - T1n = W[5]; - T1p = FMA(T1l, T1m, T1n * T1o); - T2E = FNMS(T1n, T1m, T1l * T1o); - } - T1q = T1k + T1p; - T3h = T2D + T2E; - T22 = T1k - T1p; - T2F = T2D - T2E; - } - { - E T1w, T24, T1B, T25; - { - E T1t, T1v, T1s, T1u; - T1t = ri[WS(rs, 12)]; - T1v = ii[WS(rs, 12)]; - T1s = W[22]; - T1u = W[23]; - T1w = FMA(T1s, T1t, T1u * T1v); - T24 = FNMS(T1u, T1t, T1s * T1v); - } - { - E T1y, T1A, T1x, T1z; - T1y = ri[WS(rs, 2)]; - T1A = ii[WS(rs, 2)]; - T1x = W[2]; - T1z = W[3]; - T1B = FMA(T1x, T1y, T1z * T1A); - T25 = FNMS(T1z, T1y, T1x * T1A); - } - T1C = T1w + T1B; - T3j = T24 + T25; - T26 = T24 - T25; - T2H = T1w - T1B; - } - T1r = T1f - T1q; - T1O = T1C - T1N; - T1P = T1r + T1O; - T3i = T3g - T3h; - T3l = T3j - T3k; - T44 = T3i + T3l; - T3D = T3g + T3h; - T3E = T3j + T3k; - T3K = T3D + T3E; - T1V = T1f + T1q; - T1W = T1C + T1N; - T1X = T1V + T1W; - T23 = T21 + T22; - T28 = T26 + T27; - T4r = T23 + T28; - T2W = T21 - T22; - T2X = T26 - T27; - T4c = T2W + T2X; - T33 = T2C + T2F; - T34 = T2H + T2K; - T35 = T33 + T34; - T2G = T2C - T2F; - T2L = T2H - T2K; - T2M = T2G + T2L; - } - { - E Tu, T3n, T2c, T2r, T12, T3r, T2i, T2z, TF, T3o, T2d, T2u, TR, T3q, T2h; - E T2w; - { - E To, T2a, Tt, T2b; - { - E Tl, Tn, Tk, Tm; - Tl = ri[WS(rs, 4)]; - Tn = ii[WS(rs, 4)]; - Tk = W[6]; - Tm = W[7]; - To = FMA(Tk, Tl, Tm * Tn); - T2a = FNMS(Tm, Tl, Tk * Tn); - } - { - E Tq, Ts, Tp, Tr; - Tq = ri[WS(rs, 14)]; - Ts = ii[WS(rs, 14)]; - Tp = W[26]; - Tr = W[27]; - Tt = FMA(Tp, Tq, Tr * Ts); - T2b = FNMS(Tr, Tq, Tp * Ts); - } - Tu = To + Tt; - T3n = T2a + T2b; - T2c = T2a - T2b; - T2r = To - Tt; - } - { - E TW, T2x, T11, T2y; - { - E TT, TV, TS, TU; - TT = ri[WS(rs, 1)]; - TV = ii[WS(rs, 1)]; - TS = W[0]; - TU = W[1]; - TW = FMA(TS, TT, TU * TV); - T2x = FNMS(TU, TT, TS * TV); - } - { - E TY, T10, TX, TZ; - TY = ri[WS(rs, 11)]; - T10 = ii[WS(rs, 11)]; - TX = W[20]; - TZ = W[21]; - T11 = FMA(TX, TY, TZ * T10); - T2y = FNMS(TZ, TY, TX * T10); - } - T12 = TW + T11; - T3r = T2x + T2y; - T2i = TW - T11; - T2z = T2x - T2y; - } - { - E Tz, T2s, TE, T2t; - { - E Tw, Ty, Tv, Tx; - Tw = ri[WS(rs, 9)]; - Ty = ii[WS(rs, 9)]; - Tv = W[16]; - Tx = W[17]; - Tz = FMA(Tv, Tw, Tx * Ty); - T2s = FNMS(Tx, Tw, Tv * Ty); - } - { - E TB, TD, TA, TC; - TB = ri[WS(rs, 19)]; - TD = ii[WS(rs, 19)]; - TA = W[36]; - TC = W[37]; - TE = FMA(TA, TB, TC * TD); - T2t = FNMS(TC, TB, TA * TD); - } - TF = Tz + TE; - T3o = T2s + T2t; - T2d = Tz - TE; - T2u = T2s - T2t; - } - { - E TL, T2f, TQ, T2g; - { - E TI, TK, TH, TJ; - TI = ri[WS(rs, 16)]; - TK = ii[WS(rs, 16)]; - TH = W[30]; - TJ = W[31]; - TL = FMA(TH, TI, TJ * TK); - T2f = FNMS(TJ, TI, TH * TK); - } - { - E TN, TP, TM, TO; - TN = ri[WS(rs, 6)]; - TP = ii[WS(rs, 6)]; - TM = W[10]; - TO = W[11]; - TQ = FMA(TM, TN, TO * TP); - T2g = FNMS(TO, TN, TM * TP); - } - TR = TL + TQ; - T3q = T2f + T2g; - T2h = T2f - T2g; - T2w = TL - TQ; - } - TG = Tu - TF; - T13 = TR - T12; - T14 = TG + T13; - T3p = T3n - T3o; - T3s = T3q - T3r; - T43 = T3p + T3s; - T3A = T3n + T3o; - T3B = T3q + T3r; - T3J = T3A + T3B; - T1S = Tu + TF; - T1T = TR + T12; - T1U = T1S + T1T; - T2e = T2c + T2d; - T2j = T2h + T2i; - T4q = T2e + T2j; - T2T = T2c - T2d; - T2U = T2h - T2i; - T4b = T2T + T2U; - T30 = T2r + T2u; - T31 = T2w + T2z; - T32 = T30 + T31; - T2v = T2r - T2u; - T2A = T2w - T2z; - T2B = T2v + T2A; - } - { - E T3e, T1Q, T3d, T3u, T3w, T3m, T3t, T3v, T3f; - T3e = KP559016994 * (T14 - T1P); - T1Q = T14 + T1P; - T3d = FNMS(KP250000000, T1Q, Tj); - T3m = T3i - T3l; - T3t = T3p - T3s; - T3u = FNMS(KP587785252, T3t, KP951056516 * T3m); - T3w = FMA(KP951056516, T3t, KP587785252 * T3m); - ri[WS(rs, 10)] = Tj + T1Q; - T3v = T3e + T3d; - ri[WS(rs, 14)] = T3v - T3w; - ri[WS(rs, 6)] = T3v + T3w; - T3f = T3d - T3e; - ri[WS(rs, 2)] = T3f - T3u; - ri[WS(rs, 18)] = T3f + T3u; - } - { - E T47, T45, T46, T41, T4a, T3Z, T40, T49, T48; - T47 = KP559016994 * (T43 - T44); - T45 = T43 + T44; - T46 = FNMS(KP250000000, T45, T42); - T3Z = T1r - T1O; - T40 = TG - T13; - T41 = FNMS(KP587785252, T40, KP951056516 * T3Z); - T4a = FMA(KP951056516, T40, KP587785252 * T3Z); - ii[WS(rs, 10)] = T45 + T42; - T49 = T47 + T46; - ii[WS(rs, 6)] = T49 - T4a; - ii[WS(rs, 14)] = T4a + T49; - T48 = T46 - T47; - ii[WS(rs, 2)] = T41 + T48; - ii[WS(rs, 18)] = T48 - T41; - } - { - E T3x, T1Y, T3y, T3G, T3I, T3C, T3F, T3H, T3z; - T3x = KP559016994 * (T1U - T1X); - T1Y = T1U + T1X; - T3y = FNMS(KP250000000, T1Y, T1R); - T3C = T3A - T3B; - T3F = T3D - T3E; - T3G = FMA(KP951056516, T3C, KP587785252 * T3F); - T3I = FNMS(KP587785252, T3C, KP951056516 * T3F); - ri[0] = T1R + T1Y; - T3H = T3y - T3x; - ri[WS(rs, 12)] = T3H - T3I; - ri[WS(rs, 8)] = T3H + T3I; - T3z = T3x + T3y; - ri[WS(rs, 4)] = T3z - T3G; - ri[WS(rs, 16)] = T3z + T3G; - } - { - E T3U, T3L, T3V, T3T, T3Y, T3R, T3S, T3X, T3W; - T3U = KP559016994 * (T3J - T3K); - T3L = T3J + T3K; - T3V = FNMS(KP250000000, T3L, T3Q); - T3R = T1S - T1T; - T3S = T1V - T1W; - T3T = FMA(KP951056516, T3R, KP587785252 * T3S); - T3Y = FNMS(KP587785252, T3R, KP951056516 * T3S); - ii[0] = T3L + T3Q; - T3X = T3V - T3U; - ii[WS(rs, 8)] = T3X - T3Y; - ii[WS(rs, 12)] = T3Y + T3X; - T3W = T3U + T3V; - ii[WS(rs, 4)] = T3T + T3W; - ii[WS(rs, 16)] = T3W - T3T; - } - { - E T2P, T2N, T2O, T2l, T2R, T29, T2k, T2S, T2Q; - T2P = KP559016994 * (T2B - T2M); - T2N = T2B + T2M; - T2O = FNMS(KP250000000, T2N, T2q); - T29 = T23 - T28; - T2k = T2e - T2j; - T2l = FNMS(KP587785252, T2k, KP951056516 * T29); - T2R = FMA(KP951056516, T2k, KP587785252 * T29); - ri[WS(rs, 15)] = T2q + T2N; - T2S = T2P + T2O; - ri[WS(rs, 11)] = T2R + T2S; - ri[WS(rs, 19)] = T2S - T2R; - T2Q = T2O - T2P; - ri[WS(rs, 3)] = T2l + T2Q; - ri[WS(rs, 7)] = T2Q - T2l; - } - { - E T4u, T4s, T4t, T4y, T4A, T4w, T4x, T4z, T4v; - T4u = KP559016994 * (T4q - T4r); - T4s = T4q + T4r; - T4t = FNMS(KP250000000, T4s, T4p); - T4w = T2G - T2L; - T4x = T2v - T2A; - T4y = FNMS(KP587785252, T4x, KP951056516 * T4w); - T4A = FMA(KP951056516, T4x, KP587785252 * T4w); - ii[WS(rs, 15)] = T4s + T4p; - T4z = T4u + T4t; - ii[WS(rs, 11)] = T4z - T4A; - ii[WS(rs, 19)] = T4A + T4z; - T4v = T4t - T4u; - ii[WS(rs, 3)] = T4v - T4y; - ii[WS(rs, 7)] = T4y + T4v; - } - { - E T36, T38, T39, T2Z, T3b, T2V, T2Y, T3c, T3a; - T36 = KP559016994 * (T32 - T35); - T38 = T32 + T35; - T39 = FNMS(KP250000000, T38, T37); - T2V = T2T - T2U; - T2Y = T2W - T2X; - T2Z = FMA(KP951056516, T2V, KP587785252 * T2Y); - T3b = FNMS(KP587785252, T2V, KP951056516 * T2Y); - ri[WS(rs, 5)] = T37 + T38; - T3c = T39 - T36; - ri[WS(rs, 13)] = T3b + T3c; - ri[WS(rs, 17)] = T3c - T3b; - T3a = T36 + T39; - ri[WS(rs, 1)] = T2Z + T3a; - ri[WS(rs, 9)] = T3a - T2Z; - } - { - E T4d, T4h, T4i, T4m, T4o, T4k, T4l, T4n, T4j; - T4d = KP559016994 * (T4b - T4c); - T4h = T4b + T4c; - T4i = FNMS(KP250000000, T4h, T4g); - T4k = T30 - T31; - T4l = T33 - T34; - T4m = FMA(KP951056516, T4k, KP587785252 * T4l); - T4o = FNMS(KP587785252, T4k, KP951056516 * T4l); - ii[WS(rs, 5)] = T4h + T4g; - T4n = T4i - T4d; - ii[WS(rs, 13)] = T4n - T4o; - ii[WS(rs, 17)] = T4o + T4n; - T4j = T4d + T4i; - ii[WS(rs, 1)] = T4j - T4m; - ii[WS(rs, 9)] = T4m + T4j; - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_FULL, 0, 20 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 20, "t1_20", twinstr, &GENUS, { 184, 62, 62, 0 }, 0, 0, 0 }; - -void X(codelet_t1_20) (planner *p) { - X(kdft_dit_register) (p, t1_20, &desc); -} -#endif diff --git a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t1_25.c b/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t1_25.c deleted file mode 100644 index 78327e8c..00000000 --- a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t1_25.c +++ /dev/null @@ -1,1572 +0,0 @@ -/* - * Copyright (c) 2003, 2007-14 Matteo Frigo - * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - */ - -/* This file was automatically generated --- DO NOT EDIT */ -/* Generated on Tue Sep 14 10:44:30 EDT 2021 */ - -#include "dft/codelet-dft.h" - -#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA) - -/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 25 -name t1_25 -include dft/scalar/t.h */ - -/* - * This function contains 400 FP additions, 364 FP multiplications, - * (or, 84 additions, 48 multiplications, 316 fused multiply/add), - * 138 stack variables, 47 constants, and 100 memory accesses - */ -#include "dft/scalar/t.h" - -static void t1_25(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - DK(KP860541664, +0.860541664367944677098261680920518816412804187); - DK(KP560319534, +0.560319534973832390111614715371676131169633784); - DK(KP681693190, +0.681693190061530575150324149145440022633095390); - DK(KP949179823, +0.949179823508441261575555465843363271711583843); - DK(KP557913902, +0.557913902031834264187699648465567037992437152); - DK(KP249506682, +0.249506682107067890488084201715862638334226305); - DK(KP614372930, +0.614372930789563808870829930444362096004872855); - DK(KP621716863, +0.621716863012209892444754556304102309693593202); - DK(KP998026728, +0.998026728428271561952336806863450553336905220); - DK(KP906616052, +0.906616052148196230441134447086066874408359177); - DK(KP845997307, +0.845997307939530944175097360758058292389769300); - DK(KP968479752, +0.968479752739016373193524836781420152702090879); - DK(KP994076283, +0.994076283785401014123185814696322018529298887); - DK(KP772036680, +0.772036680810363904029489473607579825330539880); - DK(KP734762448, +0.734762448793050413546343770063151342619912334); - DK(KP062914667, +0.062914667253649757225485955897349402364686947); - DK(KP943557151, +0.943557151597354104399655195398983005179443399); - DK(KP803003575, +0.803003575438660414833440593570376004635464850); - DK(KP554608978, +0.554608978404018097464974850792216217022558774); - DK(KP248028675, +0.248028675328619457762448260696444630363259177); - DK(KP525970792, +0.525970792408939708442463226536226366643874659); - DK(KP726211448, +0.726211448929902658173535992263577167607493062); - DK(KP833417178, +0.833417178328688677408962550243238843138996060); - DK(KP921177326, +0.921177326965143320250447435415066029359282231); - DK(KP541454447, +0.541454447536312777046285590082819509052033189); - DK(KP242145790, +0.242145790282157779872542093866183953459003101); - DK(KP683113946, +0.683113946453479238701949862233725244439656928); - DK(KP559154169, +0.559154169276087864842202529084232643714075927); - DK(KP992114701, +0.992114701314477831049793042785778521453036709); - DK(KP968583161, +0.968583161128631119490168375464735813836012403); - DK(KP851038619, +0.851038619207379630836264138867114231259902550); - DK(KP912018591, +0.912018591466481957908415381764119056233607330); - DK(KP912575812, +0.912575812670962425556968549836277086778922727); - DK(KP470564281, +0.470564281212251493087595091036643380879947982); - DK(KP827271945, +0.827271945972475634034355757144307982555673741); - DK(KP126329378, +0.126329378446108174786050455341811215027378105); - DK(KP904730450, +0.904730450839922351881287709692877908104763647); - DK(KP831864738, +0.831864738706457140726048799369896829771167132); - DK(KP871714437, +0.871714437527667770979999223229522602943903653); - DK(KP549754652, +0.549754652192770074288023275540779861653779767); - DK(KP634619297, +0.634619297544148100711287640319130485732531031); - DK(KP939062505, +0.939062505817492352556001843133229685779824606); - DK(KP256756360, +0.256756360367726783319498520922669048172391148); - DK(KP951056516, +0.951056516295153572116439333379382143405698634); - DK(KP559016994, +0.559016994374947424102293417182819058860154590); - DK(KP250000000, +0.250000000000000000000000000000000000000000000); - DK(KP618033988, +0.618033988749894848204586834365638117720309180); - { - INT m; - for (m = mb, W = W + (mb * 48); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 48, MAKE_VOLATILE_STRIDE(50, rs)) { - E T1, T6X, T3Y, T5G, T7c, T7C, Ts, T3L, T3M, T6W, T77, T78, T4P, T5T, T4W; - E T5Q, T2G, T5S, T4M, T3G, T5P, T4T, T45, T65, T4c, T68, T11, T64, T42, T2Z; - E T67, T49, T4k, T61, T4r, T5Y, T1z, T5X, T4o, T3d, T60, T4h, T4A, T5M, T4H; - E T5J, T28, T5L, T4x, T3s, T5I, T4E; - { - E T7, T3P, Tq, T3W, Tk, T3U, Td, T3R; - T1 = ri[0]; - T6X = ii[0]; - { - E T3, T6, T4, T3O, T2, T5; - T3 = ri[WS(rs, 5)]; - T6 = ii[WS(rs, 5)]; - T2 = W[8]; - T4 = T2 * T3; - T3O = T2 * T6; - T5 = W[9]; - T7 = FMA(T5, T6, T4); - T3P = FNMS(T5, T3, T3O); - } - { - E Tm, Tp, Tn, T3V, Tl, To; - Tm = ri[WS(rs, 15)]; - Tp = ii[WS(rs, 15)]; - Tl = W[28]; - Tn = Tl * Tm; - T3V = Tl * Tp; - To = W[29]; - Tq = FMA(To, Tp, Tn); - T3W = FNMS(To, Tm, T3V); - } - { - E Tg, Tj, Th, T3T, Tf, Ti; - Tg = ri[WS(rs, 10)]; - Tj = ii[WS(rs, 10)]; - Tf = W[18]; - Th = Tf * Tg; - T3T = Tf * Tj; - Ti = W[19]; - Tk = FMA(Ti, Tj, Th); - T3U = FNMS(Ti, Tg, T3T); - } - { - E T9, Tc, Ta, T3Q, T8, Tb; - T9 = ri[WS(rs, 20)]; - Tc = ii[WS(rs, 20)]; - T8 = W[38]; - Ta = T8 * T9; - T3Q = T8 * Tc; - Tb = W[39]; - Td = FMA(Tb, Tc, Ta); - T3R = FNMS(Tb, T9, T3Q); - } - { - E T3S, T3X, T7a, T7b; - T3S = T3P - T3R; - T3X = T3U - T3W; - T3Y = FMA(KP618033988, T3X, T3S); - T5G = FNMS(KP618033988, T3S, T3X); - T7a = T7 - Td; - T7b = Tk - Tq; - T7c = FMA(KP618033988, T7b, T7a); - T7C = FNMS(KP618033988, T7a, T7b); - } - { - E Te, Tr, T6U, T6V; - Te = T7 + Td; - Tr = Tk + Tq; - Ts = Te + Tr; - T3L = FNMS(KP250000000, Ts, T1); - T3M = Te - Tr; - T6U = T3P + T3R; - T6V = T3U + T3W; - T6W = T6U + T6V; - T77 = FNMS(KP250000000, T6W, T6X); - T78 = T6U - T6V; - } - } - { - E T2e, T3u, T2x, T3B, T2D, T3D, T2E, T3E, T2k, T3w, T2q, T3y, T2r, T3z; - { - E T2a, T2d, T2b, T3t, T29, T2c; - T2a = ri[WS(rs, 3)]; - T2d = ii[WS(rs, 3)]; - T29 = W[4]; - T2b = T29 * T2a; - T3t = T29 * T2d; - T2c = W[5]; - T2e = FMA(T2c, T2d, T2b); - T3u = FNMS(T2c, T2a, T3t); - } - { - E T2t, T2w, T2u, T3A, T2z, T2C, T2A, T3C, T2s, T2y, T2v, T2B; - T2t = ri[WS(rs, 13)]; - T2w = ii[WS(rs, 13)]; - T2s = W[24]; - T2u = T2s * T2t; - T3A = T2s * T2w; - T2z = ri[WS(rs, 18)]; - T2C = ii[WS(rs, 18)]; - T2y = W[34]; - T2A = T2y * T2z; - T3C = T2y * T2C; - T2v = W[25]; - T2x = FMA(T2v, T2w, T2u); - T3B = FNMS(T2v, T2t, T3A); - T2B = W[35]; - T2D = FMA(T2B, T2C, T2A); - T3D = FNMS(T2B, T2z, T3C); - T2E = T2x + T2D; - T3E = T3B + T3D; - } - { - E T2g, T2j, T2h, T3v, T2m, T2p, T2n, T3x, T2f, T2l, T2i, T2o; - T2g = ri[WS(rs, 8)]; - T2j = ii[WS(rs, 8)]; - T2f = W[14]; - T2h = T2f * T2g; - T3v = T2f * T2j; - T2m = ri[WS(rs, 23)]; - T2p = ii[WS(rs, 23)]; - T2l = W[44]; - T2n = T2l * T2m; - T3x = T2l * T2p; - T2i = W[15]; - T2k = FMA(T2i, T2j, T2h); - T3w = FNMS(T2i, T2g, T3v); - T2o = W[45]; - T2q = FMA(T2o, T2p, T2n); - T3y = FNMS(T2o, T2m, T3x); - T2r = T2k + T2q; - T3z = T3w + T3y; - } - { - E T4N, T4O, T4U, T4V; - T4N = T3y - T3w; - T4O = T3D - T3B; - T4P = FMA(KP618033988, T4O, T4N); - T5T = FNMS(KP618033988, T4N, T4O); - T4U = T2k - T2q; - T4V = T2x - T2D; - T4W = FMA(KP618033988, T4V, T4U); - T5Q = FNMS(KP618033988, T4U, T4V); - } - { - E T4L, T2F, T4K, T4S, T3F, T4R; - T4L = T2E - T2r; - T2F = T2r + T2E; - T4K = FNMS(KP250000000, T2F, T2e); - T2G = T2e + T2F; - T5S = FMA(KP559016994, T4L, T4K); - T4M = FNMS(KP559016994, T4L, T4K); - T4S = T3E - T3z; - T3F = T3z + T3E; - T4R = FNMS(KP250000000, T3F, T3u); - T3G = T3u + T3F; - T5P = FMA(KP559016994, T4S, T4R); - T4T = FNMS(KP559016994, T4S, T4R); - } - } - { - E Tz, T2N, TS, T2U, TY, T2W, TZ, T2X, TF, T2P, TL, T2R, TM, T2S; - { - E Tv, Ty, Tw, T2M, Tu, Tx; - Tv = ri[WS(rs, 1)]; - Ty = ii[WS(rs, 1)]; - Tu = W[0]; - Tw = Tu * Tv; - T2M = Tu * Ty; - Tx = W[1]; - Tz = FMA(Tx, Ty, Tw); - T2N = FNMS(Tx, Tv, T2M); - } - { - E TO, TR, TP, T2T, TU, TX, TV, T2V, TN, TT, TQ, TW; - TO = ri[WS(rs, 11)]; - TR = ii[WS(rs, 11)]; - TN = W[20]; - TP = TN * TO; - T2T = TN * TR; - TU = ri[WS(rs, 16)]; - TX = ii[WS(rs, 16)]; - TT = W[30]; - TV = TT * TU; - T2V = TT * TX; - TQ = W[21]; - TS = FMA(TQ, TR, TP); - T2U = FNMS(TQ, TO, T2T); - TW = W[31]; - TY = FMA(TW, TX, TV); - T2W = FNMS(TW, TU, T2V); - TZ = TS + TY; - T2X = T2U + T2W; - } - { - E TB, TE, TC, T2O, TH, TK, TI, T2Q, TA, TG, TD, TJ; - TB = ri[WS(rs, 6)]; - TE = ii[WS(rs, 6)]; - TA = W[10]; - TC = TA * TB; - T2O = TA * TE; - TH = ri[WS(rs, 21)]; - TK = ii[WS(rs, 21)]; - TG = W[40]; - TI = TG * TH; - T2Q = TG * TK; - TD = W[11]; - TF = FMA(TD, TE, TC); - T2P = FNMS(TD, TB, T2O); - TJ = W[41]; - TL = FMA(TJ, TK, TI); - T2R = FNMS(TJ, TH, T2Q); - TM = TF + TL; - T2S = T2P + T2R; - } - { - E T43, T44, T4a, T4b; - T43 = T2P - T2R; - T44 = T2W - T2U; - T45 = FNMS(KP618033988, T44, T43); - T65 = FMA(KP618033988, T43, T44); - T4a = TL - TF; - T4b = TY - TS; - T4c = FMA(KP618033988, T4b, T4a); - T68 = FNMS(KP618033988, T4a, T4b); - } - { - E T41, T10, T40, T48, T2Y, T47; - T41 = TM - TZ; - T10 = TM + TZ; - T40 = FNMS(KP250000000, T10, Tz); - T11 = Tz + T10; - T64 = FNMS(KP559016994, T41, T40); - T42 = FMA(KP559016994, T41, T40); - T48 = T2S - T2X; - T2Y = T2S + T2X; - T47 = FNMS(KP250000000, T2Y, T2N); - T2Z = T2N + T2Y; - T67 = FNMS(KP559016994, T48, T47); - T49 = FMA(KP559016994, T48, T47); - } - } - { - E T17, T31, T1q, T38, T1w, T3a, T1x, T3b, T1d, T33, T1j, T35, T1k, T36; - { - E T13, T16, T14, T30, T12, T15; - T13 = ri[WS(rs, 4)]; - T16 = ii[WS(rs, 4)]; - T12 = W[6]; - T14 = T12 * T13; - T30 = T12 * T16; - T15 = W[7]; - T17 = FMA(T15, T16, T14); - T31 = FNMS(T15, T13, T30); - } - { - E T1m, T1p, T1n, T37, T1s, T1v, T1t, T39, T1l, T1r, T1o, T1u; - T1m = ri[WS(rs, 14)]; - T1p = ii[WS(rs, 14)]; - T1l = W[26]; - T1n = T1l * T1m; - T37 = T1l * T1p; - T1s = ri[WS(rs, 19)]; - T1v = ii[WS(rs, 19)]; - T1r = W[36]; - T1t = T1r * T1s; - T39 = T1r * T1v; - T1o = W[27]; - T1q = FMA(T1o, T1p, T1n); - T38 = FNMS(T1o, T1m, T37); - T1u = W[37]; - T1w = FMA(T1u, T1v, T1t); - T3a = FNMS(T1u, T1s, T39); - T1x = T1q + T1w; - T3b = T38 + T3a; - } - { - E T19, T1c, T1a, T32, T1f, T1i, T1g, T34, T18, T1e, T1b, T1h; - T19 = ri[WS(rs, 9)]; - T1c = ii[WS(rs, 9)]; - T18 = W[16]; - T1a = T18 * T19; - T32 = T18 * T1c; - T1f = ri[WS(rs, 24)]; - T1i = ii[WS(rs, 24)]; - T1e = W[46]; - T1g = T1e * T1f; - T34 = T1e * T1i; - T1b = W[17]; - T1d = FMA(T1b, T1c, T1a); - T33 = FNMS(T1b, T19, T32); - T1h = W[47]; - T1j = FMA(T1h, T1i, T1g); - T35 = FNMS(T1h, T1f, T34); - T1k = T1d + T1j; - T36 = T33 + T35; - } - { - E T4i, T4j, T4p, T4q; - T4i = T1j - T1d; - T4j = T1w - T1q; - T4k = FMA(KP618033988, T4j, T4i); - T61 = FNMS(KP618033988, T4i, T4j); - T4p = T35 - T33; - T4q = T3a - T38; - T4r = FMA(KP618033988, T4q, T4p); - T5Y = FNMS(KP618033988, T4p, T4q); - } - { - E T4n, T1y, T4m, T4g, T3c, T4f; - T4n = T1k - T1x; - T1y = T1k + T1x; - T4m = FNMS(KP250000000, T1y, T17); - T1z = T17 + T1y; - T5X = FNMS(KP559016994, T4n, T4m); - T4o = FMA(KP559016994, T4n, T4m); - T4g = T3b - T36; - T3c = T36 + T3b; - T4f = FNMS(KP250000000, T3c, T31); - T3d = T31 + T3c; - T60 = FMA(KP559016994, T4g, T4f); - T4h = FNMS(KP559016994, T4g, T4f); - } - } - { - E T1G, T3g, T1Z, T3n, T25, T3p, T26, T3q, T1M, T3i, T1S, T3k, T1T, T3l; - { - E T1C, T1F, T1D, T3f, T1B, T1E; - T1C = ri[WS(rs, 2)]; - T1F = ii[WS(rs, 2)]; - T1B = W[2]; - T1D = T1B * T1C; - T3f = T1B * T1F; - T1E = W[3]; - T1G = FMA(T1E, T1F, T1D); - T3g = FNMS(T1E, T1C, T3f); - } - { - E T1V, T1Y, T1W, T3m, T21, T24, T22, T3o, T1U, T20, T1X, T23; - T1V = ri[WS(rs, 12)]; - T1Y = ii[WS(rs, 12)]; - T1U = W[22]; - T1W = T1U * T1V; - T3m = T1U * T1Y; - T21 = ri[WS(rs, 17)]; - T24 = ii[WS(rs, 17)]; - T20 = W[32]; - T22 = T20 * T21; - T3o = T20 * T24; - T1X = W[23]; - T1Z = FMA(T1X, T1Y, T1W); - T3n = FNMS(T1X, T1V, T3m); - T23 = W[33]; - T25 = FMA(T23, T24, T22); - T3p = FNMS(T23, T21, T3o); - T26 = T1Z + T25; - T3q = T3n + T3p; - } - { - E T1I, T1L, T1J, T3h, T1O, T1R, T1P, T3j, T1H, T1N, T1K, T1Q; - T1I = ri[WS(rs, 7)]; - T1L = ii[WS(rs, 7)]; - T1H = W[12]; - T1J = T1H * T1I; - T3h = T1H * T1L; - T1O = ri[WS(rs, 22)]; - T1R = ii[WS(rs, 22)]; - T1N = W[42]; - T1P = T1N * T1O; - T3j = T1N * T1R; - T1K = W[13]; - T1M = FMA(T1K, T1L, T1J); - T3i = FNMS(T1K, T1I, T3h); - T1Q = W[43]; - T1S = FMA(T1Q, T1R, T1P); - T3k = FNMS(T1Q, T1O, T3j); - T1T = T1M + T1S; - T3l = T3i + T3k; - } - { - E T4y, T4z, T4F, T4G; - T4y = T3k - T3i; - T4z = T3n - T3p; - T4A = FNMS(KP618033988, T4z, T4y); - T5M = FMA(KP618033988, T4y, T4z); - T4F = T1S - T1M; - T4G = T25 - T1Z; - T4H = FMA(KP618033988, T4G, T4F); - T5J = FNMS(KP618033988, T4F, T4G); - } - { - E T4w, T27, T4v, T4D, T3r, T4C; - T4w = T26 - T1T; - T27 = T1T + T26; - T4v = FNMS(KP250000000, T27, T1G); - T28 = T1G + T27; - T5L = FMA(KP559016994, T4w, T4v); - T4x = FNMS(KP559016994, T4w, T4v); - T4D = T3q - T3l; - T3r = T3l + T3q; - T4C = FNMS(KP250000000, T3r, T3g); - T3s = T3g + T3r; - T5I = FMA(KP559016994, T4D, T4C); - T4E = FNMS(KP559016994, T4D, T4C); - } - } - { - E T3I, T3K, Tt, T2I, T2J, T2K, T3J, T2L; - { - E T3e, T3H, T1A, T2H; - T3e = T2Z - T3d; - T3H = T3s - T3G; - T3I = FMA(KP618033988, T3H, T3e); - T3K = FNMS(KP618033988, T3e, T3H); - Tt = T1 + Ts; - T1A = T11 + T1z; - T2H = T28 + T2G; - T2I = T1A + T2H; - T2J = FNMS(KP250000000, T2I, Tt); - T2K = T1A - T2H; - } - ri[0] = Tt + T2I; - T3J = FNMS(KP559016994, T2K, T2J); - ri[WS(rs, 10)] = FNMS(KP951056516, T3K, T3J); - ri[WS(rs, 15)] = FMA(KP951056516, T3K, T3J); - T2L = FMA(KP559016994, T2K, T2J); - ri[WS(rs, 20)] = FNMS(KP951056516, T3I, T2L); - ri[WS(rs, 5)] = FMA(KP951056516, T3I, T2L); - } - { - E T74, T76, T6Y, T6T, T6Z, T70, T75, T71; - { - E T72, T73, T6R, T6S; - T72 = T11 - T1z; - T73 = T28 - T2G; - T74 = FMA(KP618033988, T73, T72); - T76 = FNMS(KP618033988, T72, T73); - T6Y = T6W + T6X; - T6R = T2Z + T3d; - T6S = T3s + T3G; - T6T = T6R + T6S; - T6Z = FNMS(KP250000000, T6T, T6Y); - T70 = T6R - T6S; - } - ii[0] = T6T + T6Y; - T75 = FNMS(KP559016994, T70, T6Z); - ii[WS(rs, 10)] = FMA(KP951056516, T76, T75); - ii[WS(rs, 15)] = FNMS(KP951056516, T76, T75); - T71 = FMA(KP559016994, T70, T6Z); - ii[WS(rs, 5)] = FNMS(KP951056516, T74, T71); - ii[WS(rs, 20)] = FMA(KP951056516, T74, T71); - } - { - E T3Z, T5d, T7d, T7p, T56, T59, T7l, T7k, T7e, T7f, T7g, T4u, T4Z, T50, T5y; - E T5B, T7x, T7w, T7q, T7r, T7s, T5k, T5r, T5s, T3N, T79; - T3N = FMA(KP559016994, T3M, T3L); - T3Z = FMA(KP951056516, T3Y, T3N); - T5d = FNMS(KP951056516, T3Y, T3N); - T79 = FMA(KP559016994, T78, T77); - T7d = FNMS(KP951056516, T7c, T79); - T7p = FMA(KP951056516, T7c, T79); - { - E T4e, T54, T4Y, T58, T4t, T55, T4J, T57; - { - E T46, T4d, T4Q, T4X; - T46 = FMA(KP951056516, T45, T42); - T4d = FMA(KP951056516, T4c, T49); - T4e = FMA(KP256756360, T4d, T46); - T54 = FNMS(KP256756360, T46, T4d); - T4Q = FNMS(KP951056516, T4P, T4M); - T4X = FNMS(KP951056516, T4W, T4T); - T4Y = FMA(KP939062505, T4X, T4Q); - T58 = FNMS(KP939062505, T4Q, T4X); - } - { - E T4l, T4s, T4B, T4I; - T4l = FMA(KP951056516, T4k, T4h); - T4s = FNMS(KP951056516, T4r, T4o); - T4t = FMA(KP634619297, T4s, T4l); - T55 = FNMS(KP634619297, T4l, T4s); - T4B = FNMS(KP951056516, T4A, T4x); - T4I = FMA(KP951056516, T4H, T4E); - T4J = FMA(KP549754652, T4I, T4B); - T57 = FNMS(KP549754652, T4B, T4I); - } - T56 = FMA(KP871714437, T55, T54); - T59 = FNMS(KP831864738, T58, T57); - T7l = FNMS(KP831864738, T4Y, T4J); - T7k = FNMS(KP871714437, T4t, T4e); - T7e = FNMS(KP871714437, T55, T54); - T7f = FMA(KP831864738, T58, T57); - T7g = FMA(KP904730450, T7f, T7e); - T4u = FMA(KP871714437, T4t, T4e); - T4Z = FMA(KP831864738, T4Y, T4J); - T50 = FMA(KP904730450, T4Z, T4u); - } - { - E T5g, T5z, T5q, T5x, T5j, T5A, T5n, T5w; - { - E T5e, T5f, T5o, T5p; - T5e = FMA(KP951056516, T4P, T4M); - T5f = FMA(KP951056516, T4W, T4T); - T5g = FNMS(KP126329378, T5f, T5e); - T5z = FMA(KP126329378, T5e, T5f); - T5o = FNMS(KP951056516, T4k, T4h); - T5p = FMA(KP951056516, T4r, T4o); - T5q = FMA(KP827271945, T5p, T5o); - T5x = FNMS(KP827271945, T5o, T5p); - } - { - E T5h, T5i, T5l, T5m; - T5h = FNMS(KP951056516, T4H, T4E); - T5i = FMA(KP951056516, T4A, T4x); - T5j = FNMS(KP470564281, T5i, T5h); - T5A = FMA(KP470564281, T5h, T5i); - T5l = FNMS(KP951056516, T4c, T49); - T5m = FNMS(KP951056516, T45, T42); - T5n = FMA(KP634619297, T5m, T5l); - T5w = FNMS(KP634619297, T5l, T5m); - } - T5y = FMA(KP912575812, T5x, T5w); - T5B = FNMS(KP912018591, T5A, T5z); - T7x = FMA(KP912018591, T5j, T5g); - T7w = FMA(KP912575812, T5q, T5n); - T7q = FMA(KP912018591, T5A, T5z); - T7r = FNMS(KP912575812, T5x, T5w); - T7s = FMA(KP851038619, T7r, T7q); - T5k = FNMS(KP912018591, T5j, T5g); - T5r = FNMS(KP912575812, T5q, T5n); - T5s = FNMS(KP851038619, T5r, T5k); - } - ri[WS(rs, 1)] = FMA(KP968583161, T50, T3Z); - ii[WS(rs, 1)] = FMA(KP968583161, T7g, T7d); - ri[WS(rs, 4)] = FNMS(KP992114701, T5s, T5d); - ii[WS(rs, 4)] = FNMS(KP992114701, T7s, T7p); - { - E T5a, T5c, T53, T5b, T51, T52; - T5a = FMA(KP559154169, T59, T56); - T5c = FNMS(KP683113946, T56, T59); - T51 = FNMS(KP242145790, T50, T3Z); - T52 = FNMS(KP904730450, T4Z, T4u); - T53 = FMA(KP541454447, T52, T51); - T5b = FNMS(KP541454447, T52, T51); - ri[WS(rs, 6)] = FMA(KP921177326, T5a, T53); - ri[WS(rs, 16)] = FMA(KP833417178, T5c, T5b); - ri[WS(rs, 21)] = FNMS(KP921177326, T5a, T53); - ri[WS(rs, 11)] = FNMS(KP833417178, T5c, T5b); - } - { - E T7m, T7o, T7j, T7n, T7h, T7i; - T7m = FMA(KP559154169, T7l, T7k); - T7o = FNMS(KP683113946, T7k, T7l); - T7h = FNMS(KP242145790, T7g, T7d); - T7i = FNMS(KP904730450, T7f, T7e); - T7j = FMA(KP541454447, T7i, T7h); - T7n = FNMS(KP541454447, T7i, T7h); - ii[WS(rs, 6)] = FNMS(KP921177326, T7m, T7j); - ii[WS(rs, 16)] = FNMS(KP833417178, T7o, T7n); - ii[WS(rs, 21)] = FMA(KP921177326, T7m, T7j); - ii[WS(rs, 11)] = FMA(KP833417178, T7o, T7n); - } - { - E T5C, T5E, T5v, T5D, T5t, T5u; - T5C = FNMS(KP726211448, T5B, T5y); - T5E = FMA(KP525970792, T5y, T5B); - T5t = FMA(KP248028675, T5s, T5d); - T5u = FMA(KP851038619, T5r, T5k); - T5v = FMA(KP554608978, T5u, T5t); - T5D = FNMS(KP554608978, T5u, T5t); - ri[WS(rs, 9)] = FNMS(KP803003575, T5C, T5v); - ri[WS(rs, 19)] = FMA(KP943557151, T5E, T5D); - ri[WS(rs, 24)] = FMA(KP803003575, T5C, T5v); - ri[WS(rs, 14)] = FNMS(KP943557151, T5E, T5D); - } - { - E T7y, T7A, T7v, T7z, T7t, T7u; - T7y = FMA(KP726211448, T7x, T7w); - T7A = FNMS(KP525970792, T7w, T7x); - T7t = FMA(KP248028675, T7s, T7p); - T7u = FNMS(KP851038619, T7r, T7q); - T7v = FMA(KP554608978, T7u, T7t); - T7z = FNMS(KP554608978, T7u, T7t); - ii[WS(rs, 9)] = FNMS(KP803003575, T7y, T7v); - ii[WS(rs, 19)] = FNMS(KP943557151, T7A, T7z); - ii[WS(rs, 24)] = FMA(KP803003575, T7y, T7v); - ii[WS(rs, 14)] = FMA(KP943557151, T7A, T7z); - } - } - { - E T5H, T6p, T7D, T7P, T6i, T6l, T7X, T7W, T7Q, T7R, T7S, T5W, T6b, T6c, T6K; - E T6N, T7L, T7K, T7E, T7F, T7G, T6w, T6D, T6E, T5F, T7B; - T5F = FNMS(KP559016994, T3M, T3L); - T5H = FMA(KP951056516, T5G, T5F); - T6p = FNMS(KP951056516, T5G, T5F); - T7B = FNMS(KP559016994, T78, T77); - T7D = FMA(KP951056516, T7C, T7B); - T7P = FNMS(KP951056516, T7C, T7B); - { - E T5O, T6j, T6a, T6h, T5V, T6k, T63, T6g; - { - E T5K, T5N, T66, T69; - T5K = FMA(KP951056516, T5J, T5I); - T5N = FMA(KP951056516, T5M, T5L); - T5O = FMA(KP062914667, T5N, T5K); - T6j = FNMS(KP062914667, T5K, T5N); - T66 = FNMS(KP951056516, T65, T64); - T69 = FMA(KP951056516, T68, T67); - T6a = FMA(KP939062505, T69, T66); - T6h = FNMS(KP939062505, T66, T69); - } - { - E T5R, T5U, T5Z, T62; - T5R = FNMS(KP951056516, T5Q, T5P); - T5U = FNMS(KP951056516, T5T, T5S); - T5V = FNMS(KP827271945, T5U, T5R); - T6k = FMA(KP827271945, T5R, T5U); - T5Z = FNMS(KP951056516, T5Y, T5X); - T62 = FMA(KP951056516, T61, T60); - T63 = FNMS(KP126329378, T62, T5Z); - T6g = FMA(KP126329378, T5Z, T62); - } - T6i = FMA(KP734762448, T6h, T6g); - T6l = FNMS(KP772036680, T6k, T6j); - T7X = FNMS(KP772036680, T5V, T5O); - T7W = FMA(KP734762448, T6a, T63); - T7Q = FMA(KP772036680, T6k, T6j); - T7R = FNMS(KP734762448, T6h, T6g); - T7S = FMA(KP994076283, T7R, T7Q); - T5W = FMA(KP772036680, T5V, T5O); - T6b = FNMS(KP734762448, T6a, T63); - T6c = FNMS(KP994076283, T6b, T5W); - } - { - E T6s, T6L, T6C, T6J, T6v, T6M, T6z, T6I; - { - E T6q, T6r, T6A, T6B; - T6q = FMA(KP951056516, T5Q, T5P); - T6r = FMA(KP951056516, T5T, T5S); - T6s = FMA(KP062914667, T6r, T6q); - T6L = FNMS(KP062914667, T6q, T6r); - T6A = FMA(KP951056516, T65, T64); - T6B = FNMS(KP951056516, T68, T67); - T6C = FMA(KP549754652, T6B, T6A); - T6J = FNMS(KP549754652, T6A, T6B); - } - { - E T6t, T6u, T6x, T6y; - T6t = FNMS(KP951056516, T5J, T5I); - T6u = FNMS(KP951056516, T5M, T5L); - T6v = FMA(KP634619297, T6u, T6t); - T6M = FNMS(KP634619297, T6t, T6u); - T6x = FNMS(KP951056516, T61, T60); - T6y = FMA(KP951056516, T5Y, T5X); - T6z = FNMS(KP470564281, T6y, T6x); - T6I = FMA(KP470564281, T6x, T6y); - } - T6K = FMA(KP968479752, T6J, T6I); - T6N = FNMS(KP845997307, T6M, T6L); - T7L = FNMS(KP845997307, T6v, T6s); - T7K = FNMS(KP968479752, T6C, T6z); - T7E = FMA(KP845997307, T6M, T6L); - T7F = FNMS(KP968479752, T6J, T6I); - T7G = FMA(KP906616052, T7F, T7E); - T6w = FMA(KP845997307, T6v, T6s); - T6D = FMA(KP968479752, T6C, T6z); - T6E = FMA(KP906616052, T6D, T6w); - } - ri[WS(rs, 3)] = FMA(KP998026728, T6c, T5H); - ii[WS(rs, 3)] = FNMS(KP998026728, T7S, T7P); - ri[WS(rs, 2)] = FMA(KP998026728, T6E, T6p); - ii[WS(rs, 2)] = FNMS(KP998026728, T7G, T7D); - { - E T6m, T6o, T6f, T6n, T6d, T6e; - T6m = FNMS(KP621716863, T6l, T6i); - T6o = FMA(KP614372930, T6i, T6l); - T6d = FNMS(KP249506682, T6c, T5H); - T6e = FMA(KP994076283, T6b, T5W); - T6f = FNMS(KP557913902, T6e, T6d); - T6n = FMA(KP557913902, T6e, T6d); - ri[WS(rs, 23)] = FNMS(KP943557151, T6m, T6f); - ri[WS(rs, 13)] = FMA(KP949179823, T6o, T6n); - ri[WS(rs, 8)] = FMA(KP943557151, T6m, T6f); - ri[WS(rs, 18)] = FNMS(KP949179823, T6o, T6n); - } - { - E T7Y, T80, T7V, T7Z, T7T, T7U; - T7Y = FMA(KP621716863, T7X, T7W); - T80 = FNMS(KP614372930, T7W, T7X); - T7T = FMA(KP249506682, T7S, T7P); - T7U = FNMS(KP994076283, T7R, T7Q); - T7V = FMA(KP557913902, T7U, T7T); - T7Z = FNMS(KP557913902, T7U, T7T); - ii[WS(rs, 8)] = FNMS(KP943557151, T7Y, T7V); - ii[WS(rs, 18)] = FNMS(KP949179823, T80, T7Z); - ii[WS(rs, 23)] = FMA(KP943557151, T7Y, T7V); - ii[WS(rs, 13)] = FMA(KP949179823, T80, T7Z); - } - { - E T6O, T6Q, T6H, T6P, T6F, T6G; - T6O = FMA(KP681693190, T6N, T6K); - T6Q = FNMS(KP560319534, T6K, T6N); - T6F = FNMS(KP249506682, T6E, T6p); - T6G = FNMS(KP906616052, T6D, T6w); - T6H = FNMS(KP557913902, T6G, T6F); - T6P = FMA(KP557913902, T6G, T6F); - ri[WS(rs, 22)] = FNMS(KP860541664, T6O, T6H); - ri[WS(rs, 17)] = FMA(KP949179823, T6Q, T6P); - ri[WS(rs, 7)] = FMA(KP860541664, T6O, T6H); - ri[WS(rs, 12)] = FNMS(KP949179823, T6Q, T6P); - } - { - E T7M, T7O, T7J, T7N, T7H, T7I; - T7M = FMA(KP681693190, T7L, T7K); - T7O = FNMS(KP560319534, T7K, T7L); - T7H = FMA(KP249506682, T7G, T7D); - T7I = FNMS(KP906616052, T7F, T7E); - T7J = FMA(KP557913902, T7I, T7H); - T7N = FNMS(KP557913902, T7I, T7H); - ii[WS(rs, 7)] = FMA(KP860541664, T7M, T7J); - ii[WS(rs, 17)] = FMA(KP949179823, T7O, T7N); - ii[WS(rs, 22)] = FNMS(KP860541664, T7M, T7J); - ii[WS(rs, 12)] = FNMS(KP949179823, T7O, T7N); - } - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_FULL, 0, 25 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 25, "t1_25", twinstr, &GENUS, { 84, 48, 316, 0 }, 0, 0, 0 }; - -void X(codelet_t1_25) (planner *p) { - X(kdft_dit_register) (p, t1_25, &desc); -} -#else - -/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 25 -name t1_25 -include dft/scalar/t.h */ - -/* - * This function contains 400 FP additions, 280 FP multiplications, - * (or, 260 additions, 140 multiplications, 140 fused multiply/add), - * 101 stack variables, 20 constants, and 100 memory accesses - */ -#include "dft/scalar/t.h" - -static void t1_25(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - DK(KP998026728, +0.998026728428271561952336806863450553336905220); - DK(KP062790519, +0.062790519529313376076178224565631133122484832); - DK(KP425779291, +0.425779291565072648862502445744251703979973042); - DK(KP904827052, +0.904827052466019527713668647932697593970413911); - DK(KP992114701, +0.992114701314477831049793042785778521453036709); - DK(KP125333233, +0.125333233564304245373118759816508793942918247); - DK(KP637423989, +0.637423989748689710176712811676016195434917298); - DK(KP770513242, +0.770513242775789230803009636396177847271667672); - DK(KP684547105, +0.684547105928688673732283357621209269889519233); - DK(KP728968627, +0.728968627421411523146730319055259111372571664); - DK(KP481753674, +0.481753674101715274987191502872129653528542010); - DK(KP876306680, +0.876306680043863587308115903922062583399064238); - DK(KP844327925, +0.844327925502015078548558063966681505381659241); - DK(KP535826794, +0.535826794978996618271308767867639978063575346); - DK(KP248689887, +0.248689887164854788242283746006447968417567406); - DK(KP968583161, +0.968583161128631119490168375464735813836012403); - DK(KP587785252, +0.587785252292473129168705954639072768597652438); - DK(KP951056516, +0.951056516295153572116439333379382143405698634); - DK(KP250000000, +0.250000000000000000000000000000000000000000000); - DK(KP559016994, +0.559016994374947424102293417182819058860154590); - { - INT m; - for (m = mb, W = W + (mb * 48); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 48, MAKE_VOLATILE_STRIDE(50, rs)) { - E T1, T6b, T2l, T6o, To, T2m, T6a, T6p, T6t, T6S, T2u, T4I, T2i, T60, T3O; - E T5D, T4r, T58, T3Z, T5C, T4q, T5b, TS, T5W, T2G, T5s, T4g, T4M, T2R, T5t; - E T4h, T4P, T1l, T5X, T33, T5w, T4j, T4W, T3e, T5v, T4k, T4T, T1P, T5Z, T3r; - E T5z, T4o, T51, T3C, T5A, T4n, T54; - { - E T6, T2o, Tb, T2p, Tc, T68, Th, T2r, Tm, T2s, Tn, T69; - T1 = ri[0]; - T6b = ii[0]; - { - E T3, T5, T2, T4; - T3 = ri[WS(rs, 5)]; - T5 = ii[WS(rs, 5)]; - T2 = W[8]; - T4 = W[9]; - T6 = FMA(T2, T3, T4 * T5); - T2o = FNMS(T4, T3, T2 * T5); - } - { - E T8, Ta, T7, T9; - T8 = ri[WS(rs, 20)]; - Ta = ii[WS(rs, 20)]; - T7 = W[38]; - T9 = W[39]; - Tb = FMA(T7, T8, T9 * Ta); - T2p = FNMS(T9, T8, T7 * Ta); - } - Tc = T6 + Tb; - T68 = T2o + T2p; - { - E Te, Tg, Td, Tf; - Te = ri[WS(rs, 10)]; - Tg = ii[WS(rs, 10)]; - Td = W[18]; - Tf = W[19]; - Th = FMA(Td, Te, Tf * Tg); - T2r = FNMS(Tf, Te, Td * Tg); - } - { - E Tj, Tl, Ti, Tk; - Tj = ri[WS(rs, 15)]; - Tl = ii[WS(rs, 15)]; - Ti = W[28]; - Tk = W[29]; - Tm = FMA(Ti, Tj, Tk * Tl); - T2s = FNMS(Tk, Tj, Ti * Tl); - } - Tn = Th + Tm; - T69 = T2r + T2s; - T2l = KP559016994 * (Tc - Tn); - T6o = KP559016994 * (T68 - T69); - To = Tc + Tn; - T2m = FNMS(KP250000000, To, T1); - T6a = T68 + T69; - T6p = FNMS(KP250000000, T6a, T6b); - { - E T6r, T6s, T2q, T2t; - T6r = T6 - Tb; - T6s = Th - Tm; - T6t = FMA(KP951056516, T6r, KP587785252 * T6s); - T6S = FNMS(KP587785252, T6r, KP951056516 * T6s); - T2q = T2o - T2p; - T2t = T2r - T2s; - T2u = FMA(KP951056516, T2q, KP587785252 * T2t); - T4I = FNMS(KP587785252, T2q, KP951056516 * T2t); - } - } - { - E T1U, T3S, T3J, T3M, T3X, T3W, T3P, T3Q, T3T, T25, T2g, T2h; - { - E T1R, T1T, T1Q, T1S; - T1R = ri[WS(rs, 3)]; - T1T = ii[WS(rs, 3)]; - T1Q = W[4]; - T1S = W[5]; - T1U = FMA(T1Q, T1R, T1S * T1T); - T3S = FNMS(T1S, T1R, T1Q * T1T); - } - { - E T1Z, T3H, T2f, T3L, T24, T3I, T2a, T3K; - { - E T1W, T1Y, T1V, T1X; - T1W = ri[WS(rs, 8)]; - T1Y = ii[WS(rs, 8)]; - T1V = W[14]; - T1X = W[15]; - T1Z = FMA(T1V, T1W, T1X * T1Y); - T3H = FNMS(T1X, T1W, T1V * T1Y); - } - { - E T2c, T2e, T2b, T2d; - T2c = ri[WS(rs, 18)]; - T2e = ii[WS(rs, 18)]; - T2b = W[34]; - T2d = W[35]; - T2f = FMA(T2b, T2c, T2d * T2e); - T3L = FNMS(T2d, T2c, T2b * T2e); - } - { - E T21, T23, T20, T22; - T21 = ri[WS(rs, 23)]; - T23 = ii[WS(rs, 23)]; - T20 = W[44]; - T22 = W[45]; - T24 = FMA(T20, T21, T22 * T23); - T3I = FNMS(T22, T21, T20 * T23); - } - { - E T27, T29, T26, T28; - T27 = ri[WS(rs, 13)]; - T29 = ii[WS(rs, 13)]; - T26 = W[24]; - T28 = W[25]; - T2a = FMA(T26, T27, T28 * T29); - T3K = FNMS(T28, T27, T26 * T29); - } - T3J = T3H - T3I; - T3M = T3K - T3L; - T3X = T2a - T2f; - T3W = T1Z - T24; - T3P = T3H + T3I; - T3Q = T3K + T3L; - T3T = T3P + T3Q; - T25 = T1Z + T24; - T2g = T2a + T2f; - T2h = T25 + T2g; - } - T2i = T1U + T2h; - T60 = T3S + T3T; - { - E T3N, T57, T3G, T56, T3E, T3F; - T3N = FMA(KP951056516, T3J, KP587785252 * T3M); - T57 = FNMS(KP587785252, T3J, KP951056516 * T3M); - T3E = KP559016994 * (T25 - T2g); - T3F = FNMS(KP250000000, T2h, T1U); - T3G = T3E + T3F; - T56 = T3F - T3E; - T3O = T3G + T3N; - T5D = T56 + T57; - T4r = T3G - T3N; - T58 = T56 - T57; - } - { - E T3Y, T59, T3V, T5a, T3R, T3U; - T3Y = FMA(KP951056516, T3W, KP587785252 * T3X); - T59 = FNMS(KP587785252, T3W, KP951056516 * T3X); - T3R = KP559016994 * (T3P - T3Q); - T3U = FNMS(KP250000000, T3T, T3S); - T3V = T3R + T3U; - T5a = T3U - T3R; - T3Z = T3V - T3Y; - T5C = T5a - T59; - T4q = T3Y + T3V; - T5b = T59 + T5a; - } - } - { - E Tu, T2K, T2B, T2E, T2P, T2O, T2H, T2I, T2L, TF, TQ, TR; - { - E Tr, Tt, Tq, Ts; - Tr = ri[WS(rs, 1)]; - Tt = ii[WS(rs, 1)]; - Tq = W[0]; - Ts = W[1]; - Tu = FMA(Tq, Tr, Ts * Tt); - T2K = FNMS(Ts, Tr, Tq * Tt); - } - { - E Tz, T2z, TP, T2D, TE, T2A, TK, T2C; - { - E Tw, Ty, Tv, Tx; - Tw = ri[WS(rs, 6)]; - Ty = ii[WS(rs, 6)]; - Tv = W[10]; - Tx = W[11]; - Tz = FMA(Tv, Tw, Tx * Ty); - T2z = FNMS(Tx, Tw, Tv * Ty); - } - { - E TM, TO, TL, TN; - TM = ri[WS(rs, 16)]; - TO = ii[WS(rs, 16)]; - TL = W[30]; - TN = W[31]; - TP = FMA(TL, TM, TN * TO); - T2D = FNMS(TN, TM, TL * TO); - } - { - E TB, TD, TA, TC; - TB = ri[WS(rs, 21)]; - TD = ii[WS(rs, 21)]; - TA = W[40]; - TC = W[41]; - TE = FMA(TA, TB, TC * TD); - T2A = FNMS(TC, TB, TA * TD); - } - { - E TH, TJ, TG, TI; - TH = ri[WS(rs, 11)]; - TJ = ii[WS(rs, 11)]; - TG = W[20]; - TI = W[21]; - TK = FMA(TG, TH, TI * TJ); - T2C = FNMS(TI, TH, TG * TJ); - } - T2B = T2z - T2A; - T2E = T2C - T2D; - T2P = TK - TP; - T2O = Tz - TE; - T2H = T2z + T2A; - T2I = T2C + T2D; - T2L = T2H + T2I; - TF = Tz + TE; - TQ = TK + TP; - TR = TF + TQ; - } - TS = Tu + TR; - T5W = T2K + T2L; - { - E T2F, T4L, T2y, T4K, T2w, T2x; - T2F = FMA(KP951056516, T2B, KP587785252 * T2E); - T4L = FNMS(KP587785252, T2B, KP951056516 * T2E); - T2w = KP559016994 * (TF - TQ); - T2x = FNMS(KP250000000, TR, Tu); - T2y = T2w + T2x; - T4K = T2x - T2w; - T2G = T2y + T2F; - T5s = T4K + T4L; - T4g = T2y - T2F; - T4M = T4K - T4L; - } - { - E T2Q, T4N, T2N, T4O, T2J, T2M; - T2Q = FMA(KP951056516, T2O, KP587785252 * T2P); - T4N = FNMS(KP587785252, T2O, KP951056516 * T2P); - T2J = KP559016994 * (T2H - T2I); - T2M = FNMS(KP250000000, T2L, T2K); - T2N = T2J + T2M; - T4O = T2M - T2J; - T2R = T2N - T2Q; - T5t = T4O - T4N; - T4h = T2Q + T2N; - T4P = T4N + T4O; - } - } - { - E TX, T37, T2Y, T31, T3c, T3b, T34, T35, T38, T18, T1j, T1k; - { - E TU, TW, TT, TV; - TU = ri[WS(rs, 4)]; - TW = ii[WS(rs, 4)]; - TT = W[6]; - TV = W[7]; - TX = FMA(TT, TU, TV * TW); - T37 = FNMS(TV, TU, TT * TW); - } - { - E T12, T2W, T1i, T30, T17, T2X, T1d, T2Z; - { - E TZ, T11, TY, T10; - TZ = ri[WS(rs, 9)]; - T11 = ii[WS(rs, 9)]; - TY = W[16]; - T10 = W[17]; - T12 = FMA(TY, TZ, T10 * T11); - T2W = FNMS(T10, TZ, TY * T11); - } - { - E T1f, T1h, T1e, T1g; - T1f = ri[WS(rs, 19)]; - T1h = ii[WS(rs, 19)]; - T1e = W[36]; - T1g = W[37]; - T1i = FMA(T1e, T1f, T1g * T1h); - T30 = FNMS(T1g, T1f, T1e * T1h); - } - { - E T14, T16, T13, T15; - T14 = ri[WS(rs, 24)]; - T16 = ii[WS(rs, 24)]; - T13 = W[46]; - T15 = W[47]; - T17 = FMA(T13, T14, T15 * T16); - T2X = FNMS(T15, T14, T13 * T16); - } - { - E T1a, T1c, T19, T1b; - T1a = ri[WS(rs, 14)]; - T1c = ii[WS(rs, 14)]; - T19 = W[26]; - T1b = W[27]; - T1d = FMA(T19, T1a, T1b * T1c); - T2Z = FNMS(T1b, T1a, T19 * T1c); - } - T2Y = T2W - T2X; - T31 = T2Z - T30; - T3c = T1d - T1i; - T3b = T12 - T17; - T34 = T2W + T2X; - T35 = T2Z + T30; - T38 = T34 + T35; - T18 = T12 + T17; - T1j = T1d + T1i; - T1k = T18 + T1j; - } - T1l = TX + T1k; - T5X = T37 + T38; - { - E T32, T4V, T2V, T4U, T2T, T2U; - T32 = FMA(KP951056516, T2Y, KP587785252 * T31); - T4V = FNMS(KP587785252, T2Y, KP951056516 * T31); - T2T = KP559016994 * (T18 - T1j); - T2U = FNMS(KP250000000, T1k, TX); - T2V = T2T + T2U; - T4U = T2U - T2T; - T33 = T2V + T32; - T5w = T4U + T4V; - T4j = T2V - T32; - T4W = T4U - T4V; - } - { - E T3d, T4R, T3a, T4S, T36, T39; - T3d = FMA(KP951056516, T3b, KP587785252 * T3c); - T4R = FNMS(KP587785252, T3b, KP951056516 * T3c); - T36 = KP559016994 * (T34 - T35); - T39 = FNMS(KP250000000, T38, T37); - T3a = T36 + T39; - T4S = T39 - T36; - T3e = T3a - T3d; - T5v = T4S - T4R; - T4k = T3d + T3a; - T4T = T4R + T4S; - } - } - { - E T1r, T3v, T3m, T3p, T3A, T3z, T3s, T3t, T3w, T1C, T1N, T1O; - { - E T1o, T1q, T1n, T1p; - T1o = ri[WS(rs, 2)]; - T1q = ii[WS(rs, 2)]; - T1n = W[2]; - T1p = W[3]; - T1r = FMA(T1n, T1o, T1p * T1q); - T3v = FNMS(T1p, T1o, T1n * T1q); - } - { - E T1w, T3k, T1M, T3o, T1B, T3l, T1H, T3n; - { - E T1t, T1v, T1s, T1u; - T1t = ri[WS(rs, 7)]; - T1v = ii[WS(rs, 7)]; - T1s = W[12]; - T1u = W[13]; - T1w = FMA(T1s, T1t, T1u * T1v); - T3k = FNMS(T1u, T1t, T1s * T1v); - } - { - E T1J, T1L, T1I, T1K; - T1J = ri[WS(rs, 17)]; - T1L = ii[WS(rs, 17)]; - T1I = W[32]; - T1K = W[33]; - T1M = FMA(T1I, T1J, T1K * T1L); - T3o = FNMS(T1K, T1J, T1I * T1L); - } - { - E T1y, T1A, T1x, T1z; - T1y = ri[WS(rs, 22)]; - T1A = ii[WS(rs, 22)]; - T1x = W[42]; - T1z = W[43]; - T1B = FMA(T1x, T1y, T1z * T1A); - T3l = FNMS(T1z, T1y, T1x * T1A); - } - { - E T1E, T1G, T1D, T1F; - T1E = ri[WS(rs, 12)]; - T1G = ii[WS(rs, 12)]; - T1D = W[22]; - T1F = W[23]; - T1H = FMA(T1D, T1E, T1F * T1G); - T3n = FNMS(T1F, T1E, T1D * T1G); - } - T3m = T3k - T3l; - T3p = T3n - T3o; - T3A = T1H - T1M; - T3z = T1w - T1B; - T3s = T3k + T3l; - T3t = T3n + T3o; - T3w = T3s + T3t; - T1C = T1w + T1B; - T1N = T1H + T1M; - T1O = T1C + T1N; - } - T1P = T1r + T1O; - T5Z = T3v + T3w; - { - E T3q, T50, T3j, T4Z, T3h, T3i; - T3q = FMA(KP951056516, T3m, KP587785252 * T3p); - T50 = FNMS(KP587785252, T3m, KP951056516 * T3p); - T3h = KP559016994 * (T1C - T1N); - T3i = FNMS(KP250000000, T1O, T1r); - T3j = T3h + T3i; - T4Z = T3i - T3h; - T3r = T3j + T3q; - T5z = T4Z + T50; - T4o = T3j - T3q; - T51 = T4Z - T50; - } - { - E T3B, T52, T3y, T53, T3u, T3x; - T3B = FMA(KP951056516, T3z, KP587785252 * T3A); - T52 = FNMS(KP587785252, T3z, KP951056516 * T3A); - T3u = KP559016994 * (T3s - T3t); - T3x = FNMS(KP250000000, T3w, T3v); - T3y = T3u + T3x; - T53 = T3x - T3u; - T3C = T3y - T3B; - T5A = T53 - T52; - T4n = T3B + T3y; - T54 = T52 + T53; - } - } - { - E T62, T64, Tp, T2k, T5T, T5U, T63, T5V; - { - E T5Y, T61, T1m, T2j; - T5Y = T5W - T5X; - T61 = T5Z - T60; - T62 = FMA(KP951056516, T5Y, KP587785252 * T61); - T64 = FNMS(KP587785252, T5Y, KP951056516 * T61); - Tp = T1 + To; - T1m = TS + T1l; - T2j = T1P + T2i; - T2k = T1m + T2j; - T5T = KP559016994 * (T1m - T2j); - T5U = FNMS(KP250000000, T2k, Tp); - } - ri[0] = Tp + T2k; - T63 = T5U - T5T; - ri[WS(rs, 10)] = T63 - T64; - ri[WS(rs, 15)] = T63 + T64; - T5V = T5T + T5U; - ri[WS(rs, 20)] = T5V - T62; - ri[WS(rs, 5)] = T5V + T62; - } - { - E T6i, T6j, T6c, T67, T6d, T6e, T6k, T6f; - { - E T6g, T6h, T65, T66; - T6g = TS - T1l; - T6h = T1P - T2i; - T6i = FMA(KP951056516, T6g, KP587785252 * T6h); - T6j = FNMS(KP587785252, T6g, KP951056516 * T6h); - T6c = T6a + T6b; - T65 = T5W + T5X; - T66 = T5Z + T60; - T67 = T65 + T66; - T6d = KP559016994 * (T65 - T66); - T6e = FNMS(KP250000000, T67, T6c); - } - ii[0] = T67 + T6c; - T6k = T6e - T6d; - ii[WS(rs, 10)] = T6j + T6k; - ii[WS(rs, 15)] = T6k - T6j; - T6f = T6d + T6e; - ii[WS(rs, 5)] = T6f - T6i; - ii[WS(rs, 20)] = T6i + T6f; - } - { - E T2v, T4f, T6u, T6G, T42, T6z, T43, T6y, T4A, T6H, T4D, T6F, T4u, T6L, T4v; - E T6K, T48, T6v, T4b, T6n, T2n, T6q; - T2n = T2l + T2m; - T2v = T2n + T2u; - T4f = T2n - T2u; - T6q = T6o + T6p; - T6u = T6q - T6t; - T6G = T6t + T6q; - { - E T2S, T3f, T3g, T3D, T40, T41; - T2S = FMA(KP968583161, T2G, KP248689887 * T2R); - T3f = FMA(KP535826794, T33, KP844327925 * T3e); - T3g = T2S + T3f; - T3D = FMA(KP876306680, T3r, KP481753674 * T3C); - T40 = FMA(KP728968627, T3O, KP684547105 * T3Z); - T41 = T3D + T40; - T42 = T3g + T41; - T6z = T3D - T40; - T43 = KP559016994 * (T3g - T41); - T6y = T2S - T3f; - } - { - E T4y, T4z, T6D, T4B, T4C, T6E; - T4y = FNMS(KP844327925, T4g, KP535826794 * T4h); - T4z = FNMS(KP637423989, T4k, KP770513242 * T4j); - T6D = T4y + T4z; - T4B = FMA(KP125333233, T4r, KP992114701 * T4q); - T4C = FMA(KP904827052, T4o, KP425779291 * T4n); - T6E = T4C + T4B; - T4A = T4y - T4z; - T6H = KP559016994 * (T6D + T6E); - T4D = T4B - T4C; - T6F = T6D - T6E; - } - { - E T4i, T4l, T4m, T4p, T4s, T4t; - T4i = FMA(KP535826794, T4g, KP844327925 * T4h); - T4l = FMA(KP637423989, T4j, KP770513242 * T4k); - T4m = T4i - T4l; - T4p = FNMS(KP425779291, T4o, KP904827052 * T4n); - T4s = FNMS(KP992114701, T4r, KP125333233 * T4q); - T4t = T4p + T4s; - T4u = T4m + T4t; - T6L = T4p - T4s; - T4v = KP559016994 * (T4m - T4t); - T6K = T4i + T4l; - } - { - E T46, T47, T6l, T49, T4a, T6m; - T46 = FNMS(KP248689887, T2G, KP968583161 * T2R); - T47 = FNMS(KP844327925, T33, KP535826794 * T3e); - T6l = T46 + T47; - T49 = FNMS(KP481753674, T3r, KP876306680 * T3C); - T4a = FNMS(KP684547105, T3O, KP728968627 * T3Z); - T6m = T49 + T4a; - T48 = T46 - T47; - T6v = KP559016994 * (T6l - T6m); - T4b = T49 - T4a; - T6n = T6l + T6m; - } - ri[WS(rs, 1)] = T2v + T42; - ii[WS(rs, 1)] = T6n + T6u; - ri[WS(rs, 4)] = T4f + T4u; - ii[WS(rs, 4)] = T6F + T6G; - { - E T4c, T4e, T45, T4d, T44; - T4c = FMA(KP951056516, T48, KP587785252 * T4b); - T4e = FNMS(KP587785252, T48, KP951056516 * T4b); - T44 = FNMS(KP250000000, T42, T2v); - T45 = T43 + T44; - T4d = T44 - T43; - ri[WS(rs, 21)] = T45 - T4c; - ri[WS(rs, 16)] = T4d + T4e; - ri[WS(rs, 6)] = T45 + T4c; - ri[WS(rs, 11)] = T4d - T4e; - } - { - E T6A, T6B, T6x, T6C, T6w; - T6A = FMA(KP951056516, T6y, KP587785252 * T6z); - T6B = FNMS(KP587785252, T6y, KP951056516 * T6z); - T6w = FNMS(KP250000000, T6n, T6u); - T6x = T6v + T6w; - T6C = T6w - T6v; - ii[WS(rs, 6)] = T6x - T6A; - ii[WS(rs, 16)] = T6C - T6B; - ii[WS(rs, 21)] = T6A + T6x; - ii[WS(rs, 11)] = T6B + T6C; - } - { - E T4E, T4G, T4x, T4F, T4w; - T4E = FMA(KP951056516, T4A, KP587785252 * T4D); - T4G = FNMS(KP587785252, T4A, KP951056516 * T4D); - T4w = FNMS(KP250000000, T4u, T4f); - T4x = T4v + T4w; - T4F = T4w - T4v; - ri[WS(rs, 24)] = T4x - T4E; - ri[WS(rs, 19)] = T4F + T4G; - ri[WS(rs, 9)] = T4x + T4E; - ri[WS(rs, 14)] = T4F - T4G; - } - { - E T6M, T6N, T6J, T6O, T6I; - T6M = FMA(KP951056516, T6K, KP587785252 * T6L); - T6N = FNMS(KP587785252, T6K, KP951056516 * T6L); - T6I = FNMS(KP250000000, T6F, T6G); - T6J = T6H + T6I; - T6O = T6I - T6H; - ii[WS(rs, 9)] = T6J - T6M; - ii[WS(rs, 19)] = T6O - T6N; - ii[WS(rs, 24)] = T6M + T6J; - ii[WS(rs, 14)] = T6N + T6O; - } - } - { - E T4J, T5r, T6U, T76, T5e, T6Z, T5f, T6Y, T5M, T77, T5P, T75, T5G, T7b, T5H; - E T7a, T5k, T6V, T5n, T6R, T4H, T6T; - T4H = T2m - T2l; - T4J = T4H - T4I; - T5r = T4H + T4I; - T6T = T6p - T6o; - T6U = T6S + T6T; - T76 = T6T - T6S; - { - E T4Q, T4X, T4Y, T55, T5c, T5d; - T4Q = FMA(KP876306680, T4M, KP481753674 * T4P); - T4X = FNMS(KP425779291, T4W, KP904827052 * T4T); - T4Y = T4Q + T4X; - T55 = FMA(KP535826794, T51, KP844327925 * T54); - T5c = FMA(KP062790519, T58, KP998026728 * T5b); - T5d = T55 + T5c; - T5e = T4Y + T5d; - T6Z = T55 - T5c; - T5f = KP559016994 * (T4Y - T5d); - T6Y = T4Q - T4X; - } - { - E T5K, T5L, T73, T5N, T5O, T74; - T5K = FNMS(KP684547105, T5s, KP728968627 * T5t); - T5L = FMA(KP125333233, T5w, KP992114701 * T5v); - T73 = T5K - T5L; - T5N = FNMS(KP998026728, T5z, KP062790519 * T5A); - T5O = FMA(KP770513242, T5D, KP637423989 * T5C); - T74 = T5N - T5O; - T5M = T5K + T5L; - T77 = KP559016994 * (T73 - T74); - T5P = T5N + T5O; - T75 = T73 + T74; - } - { - E T5u, T5x, T5y, T5B, T5E, T5F; - T5u = FMA(KP728968627, T5s, KP684547105 * T5t); - T5x = FNMS(KP992114701, T5w, KP125333233 * T5v); - T5y = T5u + T5x; - T5B = FMA(KP062790519, T5z, KP998026728 * T5A); - T5E = FNMS(KP637423989, T5D, KP770513242 * T5C); - T5F = T5B + T5E; - T5G = T5y + T5F; - T7b = T5B - T5E; - T5H = KP559016994 * (T5y - T5F); - T7a = T5u - T5x; - } - { - E T5i, T5j, T6P, T5l, T5m, T6Q; - T5i = FNMS(KP481753674, T4M, KP876306680 * T4P); - T5j = FMA(KP904827052, T4W, KP425779291 * T4T); - T6P = T5i - T5j; - T5l = FNMS(KP844327925, T51, KP535826794 * T54); - T5m = FNMS(KP998026728, T58, KP062790519 * T5b); - T6Q = T5l + T5m; - T5k = T5i + T5j; - T6V = KP559016994 * (T6P - T6Q); - T5n = T5l - T5m; - T6R = T6P + T6Q; - } - ri[WS(rs, 2)] = T4J + T5e; - ii[WS(rs, 2)] = T6R + T6U; - ri[WS(rs, 3)] = T5r + T5G; - ii[WS(rs, 3)] = T75 + T76; - { - E T5o, T5q, T5h, T5p, T5g; - T5o = FMA(KP951056516, T5k, KP587785252 * T5n); - T5q = FNMS(KP587785252, T5k, KP951056516 * T5n); - T5g = FNMS(KP250000000, T5e, T4J); - T5h = T5f + T5g; - T5p = T5g - T5f; - ri[WS(rs, 22)] = T5h - T5o; - ri[WS(rs, 17)] = T5p + T5q; - ri[WS(rs, 7)] = T5h + T5o; - ri[WS(rs, 12)] = T5p - T5q; - } - { - E T70, T71, T6X, T72, T6W; - T70 = FMA(KP951056516, T6Y, KP587785252 * T6Z); - T71 = FNMS(KP587785252, T6Y, KP951056516 * T6Z); - T6W = FNMS(KP250000000, T6R, T6U); - T6X = T6V + T6W; - T72 = T6W - T6V; - ii[WS(rs, 7)] = T6X - T70; - ii[WS(rs, 17)] = T72 - T71; - ii[WS(rs, 22)] = T70 + T6X; - ii[WS(rs, 12)] = T71 + T72; - } - { - E T5Q, T5S, T5J, T5R, T5I; - T5Q = FMA(KP951056516, T5M, KP587785252 * T5P); - T5S = FNMS(KP587785252, T5M, KP951056516 * T5P); - T5I = FNMS(KP250000000, T5G, T5r); - T5J = T5H + T5I; - T5R = T5I - T5H; - ri[WS(rs, 23)] = T5J - T5Q; - ri[WS(rs, 18)] = T5R + T5S; - ri[WS(rs, 8)] = T5J + T5Q; - ri[WS(rs, 13)] = T5R - T5S; - } - { - E T7c, T7d, T79, T7e, T78; - T7c = FMA(KP951056516, T7a, KP587785252 * T7b); - T7d = FNMS(KP587785252, T7a, KP951056516 * T7b); - T78 = FNMS(KP250000000, T75, T76); - T79 = T77 + T78; - T7e = T78 - T77; - ii[WS(rs, 8)] = T79 - T7c; - ii[WS(rs, 18)] = T7e - T7d; - ii[WS(rs, 23)] = T7c + T79; - ii[WS(rs, 13)] = T7d + T7e; - } - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_FULL, 0, 25 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 25, "t1_25", twinstr, &GENUS, { 260, 140, 140, 0 }, 0, 0, 0 }; - -void X(codelet_t1_25) (planner *p) { - X(kdft_dit_register) (p, t1_25, &desc); -} -#endif diff --git a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t1_32.c b/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t1_32.c deleted file mode 100644 index d42496ae..00000000 --- a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t1_32.c +++ /dev/null @@ -1,1809 +0,0 @@ -/* - * Copyright (c) 2003, 2007-14 Matteo Frigo - * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - */ - -/* This file was automatically generated --- DO NOT EDIT */ -/* Generated on Tue Sep 14 10:44:28 EDT 2021 */ - -#include "dft/codelet-dft.h" - -#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA) - -/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 32 -name t1_32 -include dft/scalar/t.h */ - -/* - * This function contains 434 FP additions, 260 FP multiplications, - * (or, 236 additions, 62 multiplications, 198 fused multiply/add), - * 102 stack variables, 7 constants, and 128 memory accesses - */ -#include "dft/scalar/t.h" - -static void t1_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - DK(KP980785280, +0.980785280403230449126182236134239036973933731); - DK(KP831469612, +0.831469612302545237078788377617905756738560812); - DK(KP198912367, +0.198912367379658006911597622644676228597850501); - DK(KP668178637, +0.668178637919298919997757686523080761552472251); - DK(KP923879532, +0.923879532511286756128183189396788286822416626); - DK(KP414213562, +0.414213562373095048801688724209698078569671875); - DK(KP707106781, +0.707106781186547524400844362104849039284835938); - { - INT m; - for (m = mb, W = W + (mb * 62); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 62, MAKE_VOLATILE_STRIDE(64, rs)) { - E T8, T8x, T3w, T87, Tl, T8y, T3B, T83, Tz, T6F, T3J, T5T, TM, T6G, T3Q; - E T5U, T11, T1e, T6M, T6J, T6K, T6L, T3Z, T5X, T46, T5Y, T1s, T1F, T6O, T6P; - E T6Q, T6R, T4e, T60, T4l, T61, T32, T7b, T78, T7N, T54, T6f, T5r, T6c, T29; - E T70, T6X, T7I, T4v, T68, T4S, T65, T3t, T79, T7e, T7O, T5b, T5s, T5i, T5t; - E T2A, T6Y, T73, T7J, T4C, T4T, T4J, T4U; - { - E T1, T86, T3, T6, T4, T84, T2, T7, T85, T5; - T1 = ri[0]; - T86 = ii[0]; - T3 = ri[WS(rs, 16)]; - T6 = ii[WS(rs, 16)]; - T2 = W[30]; - T4 = T2 * T3; - T84 = T2 * T6; - T5 = W[31]; - T7 = FMA(T5, T6, T4); - T85 = FNMS(T5, T3, T84); - T8 = T1 + T7; - T8x = T86 - T85; - T3w = T1 - T7; - T87 = T85 + T86; - } - { - E Ta, Td, Tb, T3x, Tg, Tj, Th, T3z, T9, Tf; - Ta = ri[WS(rs, 8)]; - Td = ii[WS(rs, 8)]; - T9 = W[14]; - Tb = T9 * Ta; - T3x = T9 * Td; - Tg = ri[WS(rs, 24)]; - Tj = ii[WS(rs, 24)]; - Tf = W[46]; - Th = Tf * Tg; - T3z = Tf * Tj; - { - E Te, T3y, Tk, T3A, Tc, Ti; - Tc = W[15]; - Te = FMA(Tc, Td, Tb); - T3y = FNMS(Tc, Ta, T3x); - Ti = W[47]; - Tk = FMA(Ti, Tj, Th); - T3A = FNMS(Ti, Tg, T3z); - Tl = Te + Tk; - T8y = Te - Tk; - T3B = T3y - T3A; - T83 = T3y + T3A; - } - } - { - E Ts, T3F, Ty, T3H, T3D, T3I; - { - E To, Tr, Tp, T3E, Tn, Tq; - To = ri[WS(rs, 4)]; - Tr = ii[WS(rs, 4)]; - Tn = W[6]; - Tp = Tn * To; - T3E = Tn * Tr; - Tq = W[7]; - Ts = FMA(Tq, Tr, Tp); - T3F = FNMS(Tq, To, T3E); - } - { - E Tu, Tx, Tv, T3G, Tt, Tw; - Tu = ri[WS(rs, 20)]; - Tx = ii[WS(rs, 20)]; - Tt = W[38]; - Tv = Tt * Tu; - T3G = Tt * Tx; - Tw = W[39]; - Ty = FMA(Tw, Tx, Tv); - T3H = FNMS(Tw, Tu, T3G); - } - Tz = Ts + Ty; - T6F = T3F + T3H; - T3D = Ts - Ty; - T3I = T3F - T3H; - T3J = T3D + T3I; - T5T = T3I - T3D; - } - { - E TF, T3M, TL, T3O, T3K, T3P; - { - E TB, TE, TC, T3L, TA, TD; - TB = ri[WS(rs, 28)]; - TE = ii[WS(rs, 28)]; - TA = W[54]; - TC = TA * TB; - T3L = TA * TE; - TD = W[55]; - TF = FMA(TD, TE, TC); - T3M = FNMS(TD, TB, T3L); - } - { - E TH, TK, TI, T3N, TG, TJ; - TH = ri[WS(rs, 12)]; - TK = ii[WS(rs, 12)]; - TG = W[22]; - TI = TG * TH; - T3N = TG * TK; - TJ = W[23]; - TL = FMA(TJ, TK, TI); - T3O = FNMS(TJ, TH, T3N); - } - TM = TF + TL; - T6G = T3M + T3O; - T3K = TF - TL; - T3P = T3M - T3O; - T3Q = T3K - T3P; - T5U = T3K + T3P; - } - { - E TU, T3U, T1d, T44, T10, T3W, T17, T42; - { - E TQ, TT, TR, T3T, TP, TS; - TQ = ri[WS(rs, 2)]; - TT = ii[WS(rs, 2)]; - TP = W[2]; - TR = TP * TQ; - T3T = TP * TT; - TS = W[3]; - TU = FMA(TS, TT, TR); - T3U = FNMS(TS, TQ, T3T); - } - { - E T19, T1c, T1a, T43, T18, T1b; - T19 = ri[WS(rs, 26)]; - T1c = ii[WS(rs, 26)]; - T18 = W[50]; - T1a = T18 * T19; - T43 = T18 * T1c; - T1b = W[51]; - T1d = FMA(T1b, T1c, T1a); - T44 = FNMS(T1b, T19, T43); - } - { - E TW, TZ, TX, T3V, TV, TY; - TW = ri[WS(rs, 18)]; - TZ = ii[WS(rs, 18)]; - TV = W[34]; - TX = TV * TW; - T3V = TV * TZ; - TY = W[35]; - T10 = FMA(TY, TZ, TX); - T3W = FNMS(TY, TW, T3V); - } - { - E T13, T16, T14, T41, T12, T15; - T13 = ri[WS(rs, 10)]; - T16 = ii[WS(rs, 10)]; - T12 = W[18]; - T14 = T12 * T13; - T41 = T12 * T16; - T15 = W[19]; - T17 = FMA(T15, T16, T14); - T42 = FNMS(T15, T13, T41); - } - T11 = TU + T10; - T1e = T17 + T1d; - T6M = T11 - T1e; - T6J = T3U + T3W; - T6K = T42 + T44; - T6L = T6J - T6K; - { - E T3X, T3Y, T40, T45; - T3X = T3U - T3W; - T3Y = T17 - T1d; - T3Z = T3X - T3Y; - T5X = T3X + T3Y; - T40 = TU - T10; - T45 = T42 - T44; - T46 = T40 + T45; - T5Y = T40 - T45; - } - } - { - E T1l, T49, T1E, T4j, T1r, T4b, T1y, T4h; - { - E T1h, T1k, T1i, T48, T1g, T1j; - T1h = ri[WS(rs, 30)]; - T1k = ii[WS(rs, 30)]; - T1g = W[58]; - T1i = T1g * T1h; - T48 = T1g * T1k; - T1j = W[59]; - T1l = FMA(T1j, T1k, T1i); - T49 = FNMS(T1j, T1h, T48); - } - { - E T1A, T1D, T1B, T4i, T1z, T1C; - T1A = ri[WS(rs, 22)]; - T1D = ii[WS(rs, 22)]; - T1z = W[42]; - T1B = T1z * T1A; - T4i = T1z * T1D; - T1C = W[43]; - T1E = FMA(T1C, T1D, T1B); - T4j = FNMS(T1C, T1A, T4i); - } - { - E T1n, T1q, T1o, T4a, T1m, T1p; - T1n = ri[WS(rs, 14)]; - T1q = ii[WS(rs, 14)]; - T1m = W[26]; - T1o = T1m * T1n; - T4a = T1m * T1q; - T1p = W[27]; - T1r = FMA(T1p, T1q, T1o); - T4b = FNMS(T1p, T1n, T4a); - } - { - E T1u, T1x, T1v, T4g, T1t, T1w; - T1u = ri[WS(rs, 6)]; - T1x = ii[WS(rs, 6)]; - T1t = W[10]; - T1v = T1t * T1u; - T4g = T1t * T1x; - T1w = W[11]; - T1y = FMA(T1w, T1x, T1v); - T4h = FNMS(T1w, T1u, T4g); - } - T1s = T1l + T1r; - T1F = T1y + T1E; - T6O = T1s - T1F; - T6P = T49 + T4b; - T6Q = T4h + T4j; - T6R = T6P - T6Q; - { - E T4c, T4d, T4f, T4k; - T4c = T49 - T4b; - T4d = T1y - T1E; - T4e = T4c - T4d; - T60 = T4c + T4d; - T4f = T1l - T1r; - T4k = T4h - T4j; - T4l = T4f + T4k; - T61 = T4f - T4k; - } - } - { - E T2H, T4Z, T30, T5p, T2N, T51, T2U, T5n; - { - E T2D, T2G, T2E, T4Y, T2C, T2F; - T2D = ri[WS(rs, 31)]; - T2G = ii[WS(rs, 31)]; - T2C = W[60]; - T2E = T2C * T2D; - T4Y = T2C * T2G; - T2F = W[61]; - T2H = FMA(T2F, T2G, T2E); - T4Z = FNMS(T2F, T2D, T4Y); - } - { - E T2W, T2Z, T2X, T5o, T2V, T2Y; - T2W = ri[WS(rs, 23)]; - T2Z = ii[WS(rs, 23)]; - T2V = W[44]; - T2X = T2V * T2W; - T5o = T2V * T2Z; - T2Y = W[45]; - T30 = FMA(T2Y, T2Z, T2X); - T5p = FNMS(T2Y, T2W, T5o); - } - { - E T2J, T2M, T2K, T50, T2I, T2L; - T2J = ri[WS(rs, 15)]; - T2M = ii[WS(rs, 15)]; - T2I = W[28]; - T2K = T2I * T2J; - T50 = T2I * T2M; - T2L = W[29]; - T2N = FMA(T2L, T2M, T2K); - T51 = FNMS(T2L, T2J, T50); - } - { - E T2Q, T2T, T2R, T5m, T2P, T2S; - T2Q = ri[WS(rs, 7)]; - T2T = ii[WS(rs, 7)]; - T2P = W[12]; - T2R = T2P * T2Q; - T5m = T2P * T2T; - T2S = W[13]; - T2U = FMA(T2S, T2T, T2R); - T5n = FNMS(T2S, T2Q, T5m); - } - { - E T2O, T31, T76, T77; - T2O = T2H + T2N; - T31 = T2U + T30; - T32 = T2O + T31; - T7b = T2O - T31; - T76 = T4Z + T51; - T77 = T5n + T5p; - T78 = T76 - T77; - T7N = T76 + T77; - } - { - E T52, T53, T5l, T5q; - T52 = T4Z - T51; - T53 = T2U - T30; - T54 = T52 - T53; - T6f = T52 + T53; - T5l = T2H - T2N; - T5q = T5n - T5p; - T5r = T5l + T5q; - T6c = T5l - T5q; - } - } - { - E T1O, T4q, T27, T4Q, T1U, T4s, T21, T4O; - { - E T1K, T1N, T1L, T4p, T1J, T1M; - T1K = ri[WS(rs, 1)]; - T1N = ii[WS(rs, 1)]; - T1J = W[0]; - T1L = T1J * T1K; - T4p = T1J * T1N; - T1M = W[1]; - T1O = FMA(T1M, T1N, T1L); - T4q = FNMS(T1M, T1K, T4p); - } - { - E T23, T26, T24, T4P, T22, T25; - T23 = ri[WS(rs, 25)]; - T26 = ii[WS(rs, 25)]; - T22 = W[48]; - T24 = T22 * T23; - T4P = T22 * T26; - T25 = W[49]; - T27 = FMA(T25, T26, T24); - T4Q = FNMS(T25, T23, T4P); - } - { - E T1Q, T1T, T1R, T4r, T1P, T1S; - T1Q = ri[WS(rs, 17)]; - T1T = ii[WS(rs, 17)]; - T1P = W[32]; - T1R = T1P * T1Q; - T4r = T1P * T1T; - T1S = W[33]; - T1U = FMA(T1S, T1T, T1R); - T4s = FNMS(T1S, T1Q, T4r); - } - { - E T1X, T20, T1Y, T4N, T1W, T1Z; - T1X = ri[WS(rs, 9)]; - T20 = ii[WS(rs, 9)]; - T1W = W[16]; - T1Y = T1W * T1X; - T4N = T1W * T20; - T1Z = W[17]; - T21 = FMA(T1Z, T20, T1Y); - T4O = FNMS(T1Z, T1X, T4N); - } - { - E T1V, T28, T6V, T6W; - T1V = T1O + T1U; - T28 = T21 + T27; - T29 = T1V + T28; - T70 = T1V - T28; - T6V = T4q + T4s; - T6W = T4O + T4Q; - T6X = T6V - T6W; - T7I = T6V + T6W; - } - { - E T4t, T4u, T4M, T4R; - T4t = T4q - T4s; - T4u = T21 - T27; - T4v = T4t - T4u; - T68 = T4t + T4u; - T4M = T1O - T1U; - T4R = T4O - T4Q; - T4S = T4M + T4R; - T65 = T4M - T4R; - } - } - { - E T38, T56, T3r, T5g, T3e, T58, T3l, T5e; - { - E T34, T37, T35, T55, T33, T36; - T34 = ri[WS(rs, 3)]; - T37 = ii[WS(rs, 3)]; - T33 = W[4]; - T35 = T33 * T34; - T55 = T33 * T37; - T36 = W[5]; - T38 = FMA(T36, T37, T35); - T56 = FNMS(T36, T34, T55); - } - { - E T3n, T3q, T3o, T5f, T3m, T3p; - T3n = ri[WS(rs, 11)]; - T3q = ii[WS(rs, 11)]; - T3m = W[20]; - T3o = T3m * T3n; - T5f = T3m * T3q; - T3p = W[21]; - T3r = FMA(T3p, T3q, T3o); - T5g = FNMS(T3p, T3n, T5f); - } - { - E T3a, T3d, T3b, T57, T39, T3c; - T3a = ri[WS(rs, 19)]; - T3d = ii[WS(rs, 19)]; - T39 = W[36]; - T3b = T39 * T3a; - T57 = T39 * T3d; - T3c = W[37]; - T3e = FMA(T3c, T3d, T3b); - T58 = FNMS(T3c, T3a, T57); - } - { - E T3h, T3k, T3i, T5d, T3g, T3j; - T3h = ri[WS(rs, 27)]; - T3k = ii[WS(rs, 27)]; - T3g = W[52]; - T3i = T3g * T3h; - T5d = T3g * T3k; - T3j = W[53]; - T3l = FMA(T3j, T3k, T3i); - T5e = FNMS(T3j, T3h, T5d); - } - { - E T3f, T3s, T7c, T7d; - T3f = T38 + T3e; - T3s = T3l + T3r; - T3t = T3f + T3s; - T79 = T3s - T3f; - T7c = T56 + T58; - T7d = T5e + T5g; - T7e = T7c - T7d; - T7O = T7c + T7d; - } - { - E T59, T5a, T5c, T5h; - T59 = T56 - T58; - T5a = T38 - T3e; - T5b = T59 - T5a; - T5s = T5a + T59; - T5c = T3l - T3r; - T5h = T5e - T5g; - T5i = T5c + T5h; - T5t = T5c - T5h; - } - } - { - E T2f, T4x, T2y, T4H, T2l, T4z, T2s, T4F; - { - E T2b, T2e, T2c, T4w, T2a, T2d; - T2b = ri[WS(rs, 5)]; - T2e = ii[WS(rs, 5)]; - T2a = W[8]; - T2c = T2a * T2b; - T4w = T2a * T2e; - T2d = W[9]; - T2f = FMA(T2d, T2e, T2c); - T4x = FNMS(T2d, T2b, T4w); - } - { - E T2u, T2x, T2v, T4G, T2t, T2w; - T2u = ri[WS(rs, 13)]; - T2x = ii[WS(rs, 13)]; - T2t = W[24]; - T2v = T2t * T2u; - T4G = T2t * T2x; - T2w = W[25]; - T2y = FMA(T2w, T2x, T2v); - T4H = FNMS(T2w, T2u, T4G); - } - { - E T2h, T2k, T2i, T4y, T2g, T2j; - T2h = ri[WS(rs, 21)]; - T2k = ii[WS(rs, 21)]; - T2g = W[40]; - T2i = T2g * T2h; - T4y = T2g * T2k; - T2j = W[41]; - T2l = FMA(T2j, T2k, T2i); - T4z = FNMS(T2j, T2h, T4y); - } - { - E T2o, T2r, T2p, T4E, T2n, T2q; - T2o = ri[WS(rs, 29)]; - T2r = ii[WS(rs, 29)]; - T2n = W[56]; - T2p = T2n * T2o; - T4E = T2n * T2r; - T2q = W[57]; - T2s = FMA(T2q, T2r, T2p); - T4F = FNMS(T2q, T2o, T4E); - } - { - E T2m, T2z, T71, T72; - T2m = T2f + T2l; - T2z = T2s + T2y; - T2A = T2m + T2z; - T6Y = T2z - T2m; - T71 = T4x + T4z; - T72 = T4F + T4H; - T73 = T71 - T72; - T7J = T71 + T72; - } - { - E T4A, T4B, T4D, T4I; - T4A = T4x - T4z; - T4B = T2f - T2l; - T4C = T4A - T4B; - T4T = T4B + T4A; - T4D = T2s - T2y; - T4I = T4F - T4H; - T4J = T4D + T4I; - T4U = T4D - T4I; - } - } - { - E TO, T7C, T7Z, T80, T89, T8e, T1H, T8d, T3v, T8b, T7L, T7T, T7Q, T7U, T7F; - E T81; - { - E Tm, TN, T7X, T7Y; - Tm = T8 + Tl; - TN = Tz + TM; - TO = Tm + TN; - T7C = Tm - TN; - T7X = T7I + T7J; - T7Y = T7N + T7O; - T7Z = T7X - T7Y; - T80 = T7X + T7Y; - } - { - E T82, T88, T1f, T1G; - T82 = T6F + T6G; - T88 = T83 + T87; - T89 = T82 + T88; - T8e = T88 - T82; - T1f = T11 + T1e; - T1G = T1s + T1F; - T1H = T1f + T1G; - T8d = T1G - T1f; - } - { - E T2B, T3u, T7H, T7K; - T2B = T29 + T2A; - T3u = T32 + T3t; - T3v = T2B + T3u; - T8b = T3u - T2B; - T7H = T29 - T2A; - T7K = T7I - T7J; - T7L = T7H + T7K; - T7T = T7K - T7H; - } - { - E T7M, T7P, T7D, T7E; - T7M = T32 - T3t; - T7P = T7N - T7O; - T7Q = T7M - T7P; - T7U = T7M + T7P; - T7D = T6J + T6K; - T7E = T6P + T6Q; - T7F = T7D - T7E; - T81 = T7D + T7E; - } - { - E T1I, T8a, T7W, T8c; - T1I = TO + T1H; - ri[WS(rs, 16)] = T1I - T3v; - ri[0] = T1I + T3v; - T8a = T81 + T89; - ii[0] = T80 + T8a; - ii[WS(rs, 16)] = T8a - T80; - T7W = TO - T1H; - ri[WS(rs, 24)] = T7W - T7Z; - ri[WS(rs, 8)] = T7W + T7Z; - T8c = T89 - T81; - ii[WS(rs, 8)] = T8b + T8c; - ii[WS(rs, 24)] = T8c - T8b; - } - { - E T7G, T7R, T8f, T8g; - T7G = T7C + T7F; - T7R = T7L + T7Q; - ri[WS(rs, 20)] = FNMS(KP707106781, T7R, T7G); - ri[WS(rs, 4)] = FMA(KP707106781, T7R, T7G); - T8f = T8d + T8e; - T8g = T7T + T7U; - ii[WS(rs, 4)] = FMA(KP707106781, T8g, T8f); - ii[WS(rs, 20)] = FNMS(KP707106781, T8g, T8f); - } - { - E T7S, T7V, T8h, T8i; - T7S = T7C - T7F; - T7V = T7T - T7U; - ri[WS(rs, 28)] = FNMS(KP707106781, T7V, T7S); - ri[WS(rs, 12)] = FMA(KP707106781, T7V, T7S); - T8h = T8e - T8d; - T8i = T7Q - T7L; - ii[WS(rs, 12)] = FMA(KP707106781, T8i, T8h); - ii[WS(rs, 28)] = FNMS(KP707106781, T8i, T8h); - } - } - { - E T6I, T7m, T7w, T7A, T8l, T8r, T6T, T8m, T75, T7j, T7p, T8s, T7t, T7z, T7g; - E T7k; - { - E T6E, T6H, T7u, T7v; - T6E = T8 - Tl; - T6H = T6F - T6G; - T6I = T6E - T6H; - T7m = T6E + T6H; - T7u = T7b + T7e; - T7v = T78 + T79; - T7w = FNMS(KP414213562, T7v, T7u); - T7A = FMA(KP414213562, T7u, T7v); - } - { - E T8j, T8k, T6N, T6S; - T8j = TM - Tz; - T8k = T87 - T83; - T8l = T8j + T8k; - T8r = T8k - T8j; - T6N = T6L - T6M; - T6S = T6O + T6R; - T6T = T6N - T6S; - T8m = T6N + T6S; - } - { - E T6Z, T74, T7n, T7o; - T6Z = T6X - T6Y; - T74 = T70 - T73; - T75 = FMA(KP414213562, T74, T6Z); - T7j = FNMS(KP414213562, T6Z, T74); - T7n = T6M + T6L; - T7o = T6O - T6R; - T7p = T7n + T7o; - T8s = T7o - T7n; - } - { - E T7r, T7s, T7a, T7f; - T7r = T70 + T73; - T7s = T6X + T6Y; - T7t = FMA(KP414213562, T7s, T7r); - T7z = FNMS(KP414213562, T7r, T7s); - T7a = T78 - T79; - T7f = T7b - T7e; - T7g = FNMS(KP414213562, T7f, T7a); - T7k = FMA(KP414213562, T7a, T7f); - } - { - E T6U, T7h, T8t, T8u; - T6U = FMA(KP707106781, T6T, T6I); - T7h = T75 - T7g; - ri[WS(rs, 22)] = FNMS(KP923879532, T7h, T6U); - ri[WS(rs, 6)] = FMA(KP923879532, T7h, T6U); - T8t = FMA(KP707106781, T8s, T8r); - T8u = T7k - T7j; - ii[WS(rs, 6)] = FMA(KP923879532, T8u, T8t); - ii[WS(rs, 22)] = FNMS(KP923879532, T8u, T8t); - } - { - E T7i, T7l, T8v, T8w; - T7i = FNMS(KP707106781, T6T, T6I); - T7l = T7j + T7k; - ri[WS(rs, 14)] = FNMS(KP923879532, T7l, T7i); - ri[WS(rs, 30)] = FMA(KP923879532, T7l, T7i); - T8v = FNMS(KP707106781, T8s, T8r); - T8w = T75 + T7g; - ii[WS(rs, 14)] = FNMS(KP923879532, T8w, T8v); - ii[WS(rs, 30)] = FMA(KP923879532, T8w, T8v); - } - { - E T7q, T7x, T8n, T8o; - T7q = FMA(KP707106781, T7p, T7m); - T7x = T7t + T7w; - ri[WS(rs, 18)] = FNMS(KP923879532, T7x, T7q); - ri[WS(rs, 2)] = FMA(KP923879532, T7x, T7q); - T8n = FMA(KP707106781, T8m, T8l); - T8o = T7z + T7A; - ii[WS(rs, 2)] = FMA(KP923879532, T8o, T8n); - ii[WS(rs, 18)] = FNMS(KP923879532, T8o, T8n); - } - { - E T7y, T7B, T8p, T8q; - T7y = FNMS(KP707106781, T7p, T7m); - T7B = T7z - T7A; - ri[WS(rs, 26)] = FNMS(KP923879532, T7B, T7y); - ri[WS(rs, 10)] = FMA(KP923879532, T7B, T7y); - T8p = FNMS(KP707106781, T8m, T8l); - T8q = T7w - T7t; - ii[WS(rs, 10)] = FMA(KP923879532, T8q, T8p); - ii[WS(rs, 26)] = FNMS(KP923879532, T8q, T8p); - } - } - { - E T3S, T5C, T4n, T8C, T8B, T8H, T5F, T8I, T5w, T5Q, T5A, T5M, T4X, T5P, T5z; - E T5J; - { - E T3C, T3R, T5D, T5E; - T3C = T3w + T3B; - T3R = T3J + T3Q; - T3S = FNMS(KP707106781, T3R, T3C); - T5C = FMA(KP707106781, T3R, T3C); - { - E T47, T4m, T8z, T8A; - T47 = FNMS(KP414213562, T46, T3Z); - T4m = FMA(KP414213562, T4l, T4e); - T4n = T47 - T4m; - T8C = T47 + T4m; - T8z = T8x - T8y; - T8A = T5T + T5U; - T8B = FMA(KP707106781, T8A, T8z); - T8H = FNMS(KP707106781, T8A, T8z); - } - T5D = FMA(KP414213562, T3Z, T46); - T5E = FNMS(KP414213562, T4e, T4l); - T5F = T5D + T5E; - T8I = T5E - T5D; - { - E T5k, T5L, T5v, T5K, T5j, T5u; - T5j = T5b + T5i; - T5k = FNMS(KP707106781, T5j, T54); - T5L = FMA(KP707106781, T5j, T54); - T5u = T5s + T5t; - T5v = FNMS(KP707106781, T5u, T5r); - T5K = FMA(KP707106781, T5u, T5r); - T5w = FNMS(KP668178637, T5v, T5k); - T5Q = FMA(KP198912367, T5K, T5L); - T5A = FMA(KP668178637, T5k, T5v); - T5M = FNMS(KP198912367, T5L, T5K); - } - { - E T4L, T5I, T4W, T5H, T4K, T4V; - T4K = T4C + T4J; - T4L = FNMS(KP707106781, T4K, T4v); - T5I = FMA(KP707106781, T4K, T4v); - T4V = T4T + T4U; - T4W = FNMS(KP707106781, T4V, T4S); - T5H = FMA(KP707106781, T4V, T4S); - T4X = FMA(KP668178637, T4W, T4L); - T5P = FNMS(KP198912367, T5H, T5I); - T5z = FNMS(KP668178637, T4L, T4W); - T5J = FMA(KP198912367, T5I, T5H); - } - } - { - E T4o, T5x, T8J, T8K; - T4o = FMA(KP923879532, T4n, T3S); - T5x = T4X - T5w; - ri[WS(rs, 21)] = FNMS(KP831469612, T5x, T4o); - ri[WS(rs, 5)] = FMA(KP831469612, T5x, T4o); - T8J = FMA(KP923879532, T8I, T8H); - T8K = T5A - T5z; - ii[WS(rs, 5)] = FMA(KP831469612, T8K, T8J); - ii[WS(rs, 21)] = FNMS(KP831469612, T8K, T8J); - } - { - E T5y, T5B, T8L, T8M; - T5y = FNMS(KP923879532, T4n, T3S); - T5B = T5z + T5A; - ri[WS(rs, 13)] = FNMS(KP831469612, T5B, T5y); - ri[WS(rs, 29)] = FMA(KP831469612, T5B, T5y); - T8L = FNMS(KP923879532, T8I, T8H); - T8M = T4X + T5w; - ii[WS(rs, 13)] = FNMS(KP831469612, T8M, T8L); - ii[WS(rs, 29)] = FMA(KP831469612, T8M, T8L); - } - { - E T5G, T5N, T8D, T8E; - T5G = FMA(KP923879532, T5F, T5C); - T5N = T5J + T5M; - ri[WS(rs, 17)] = FNMS(KP980785280, T5N, T5G); - ri[WS(rs, 1)] = FMA(KP980785280, T5N, T5G); - T8D = FMA(KP923879532, T8C, T8B); - T8E = T5P + T5Q; - ii[WS(rs, 1)] = FMA(KP980785280, T8E, T8D); - ii[WS(rs, 17)] = FNMS(KP980785280, T8E, T8D); - } - { - E T5O, T5R, T8F, T8G; - T5O = FNMS(KP923879532, T5F, T5C); - T5R = T5P - T5Q; - ri[WS(rs, 25)] = FNMS(KP980785280, T5R, T5O); - ri[WS(rs, 9)] = FMA(KP980785280, T5R, T5O); - T8F = FNMS(KP923879532, T8C, T8B); - T8G = T5M - T5J; - ii[WS(rs, 9)] = FMA(KP980785280, T8G, T8F); - ii[WS(rs, 25)] = FNMS(KP980785280, T8G, T8F); - } - } - { - E T5W, T6o, T63, T8W, T8P, T8V, T6r, T8Q, T6i, T6C, T6m, T6y, T6b, T6B, T6l; - E T6v; - { - E T5S, T5V, T6p, T6q; - T5S = T3w - T3B; - T5V = T5T - T5U; - T5W = FMA(KP707106781, T5V, T5S); - T6o = FNMS(KP707106781, T5V, T5S); - { - E T5Z, T62, T8N, T8O; - T5Z = FMA(KP414213562, T5Y, T5X); - T62 = FNMS(KP414213562, T61, T60); - T63 = T5Z - T62; - T8W = T5Z + T62; - T8N = T8y + T8x; - T8O = T3Q - T3J; - T8P = FMA(KP707106781, T8O, T8N); - T8V = FNMS(KP707106781, T8O, T8N); - } - T6p = FNMS(KP414213562, T5X, T5Y); - T6q = FMA(KP414213562, T60, T61); - T6r = T6p + T6q; - T8Q = T6q - T6p; - { - E T6e, T6x, T6h, T6w, T6d, T6g; - T6d = T5i - T5b; - T6e = FNMS(KP707106781, T6d, T6c); - T6x = FMA(KP707106781, T6d, T6c); - T6g = T5s - T5t; - T6h = FNMS(KP707106781, T6g, T6f); - T6w = FMA(KP707106781, T6g, T6f); - T6i = FNMS(KP668178637, T6h, T6e); - T6C = FMA(KP198912367, T6w, T6x); - T6m = FMA(KP668178637, T6e, T6h); - T6y = FNMS(KP198912367, T6x, T6w); - } - { - E T67, T6u, T6a, T6t, T66, T69; - T66 = T4J - T4C; - T67 = FNMS(KP707106781, T66, T65); - T6u = FMA(KP707106781, T66, T65); - T69 = T4T - T4U; - T6a = FNMS(KP707106781, T69, T68); - T6t = FMA(KP707106781, T69, T68); - T6b = FMA(KP668178637, T6a, T67); - T6B = FNMS(KP198912367, T6t, T6u); - T6l = FNMS(KP668178637, T67, T6a); - T6v = FMA(KP198912367, T6u, T6t); - } - } - { - E T64, T6j, T8R, T8S; - T64 = FMA(KP923879532, T63, T5W); - T6j = T6b + T6i; - ri[WS(rs, 19)] = FNMS(KP831469612, T6j, T64); - ri[WS(rs, 3)] = FMA(KP831469612, T6j, T64); - T8R = FMA(KP923879532, T8Q, T8P); - T8S = T6l + T6m; - ii[WS(rs, 3)] = FMA(KP831469612, T8S, T8R); - ii[WS(rs, 19)] = FNMS(KP831469612, T8S, T8R); - } - { - E T6k, T6n, T8T, T8U; - T6k = FNMS(KP923879532, T63, T5W); - T6n = T6l - T6m; - ri[WS(rs, 27)] = FNMS(KP831469612, T6n, T6k); - ri[WS(rs, 11)] = FMA(KP831469612, T6n, T6k); - T8T = FNMS(KP923879532, T8Q, T8P); - T8U = T6i - T6b; - ii[WS(rs, 11)] = FMA(KP831469612, T8U, T8T); - ii[WS(rs, 27)] = FNMS(KP831469612, T8U, T8T); - } - { - E T6s, T6z, T8X, T8Y; - T6s = FNMS(KP923879532, T6r, T6o); - T6z = T6v - T6y; - ri[WS(rs, 23)] = FNMS(KP980785280, T6z, T6s); - ri[WS(rs, 7)] = FMA(KP980785280, T6z, T6s); - T8X = FNMS(KP923879532, T8W, T8V); - T8Y = T6C - T6B; - ii[WS(rs, 7)] = FMA(KP980785280, T8Y, T8X); - ii[WS(rs, 23)] = FNMS(KP980785280, T8Y, T8X); - } - { - E T6A, T6D, T8Z, T90; - T6A = FMA(KP923879532, T6r, T6o); - T6D = T6B + T6C; - ri[WS(rs, 15)] = FNMS(KP980785280, T6D, T6A); - ri[WS(rs, 31)] = FMA(KP980785280, T6D, T6A); - T8Z = FMA(KP923879532, T8W, T8V); - T90 = T6v + T6y; - ii[WS(rs, 15)] = FNMS(KP980785280, T90, T8Z); - ii[WS(rs, 31)] = FMA(KP980785280, T90, T8Z); - } - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_FULL, 0, 32 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 32, "t1_32", twinstr, &GENUS, { 236, 62, 198, 0 }, 0, 0, 0 }; - -void X(codelet_t1_32) (planner *p) { - X(kdft_dit_register) (p, t1_32, &desc); -} -#else - -/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 32 -name t1_32 -include dft/scalar/t.h */ - -/* - * This function contains 434 FP additions, 208 FP multiplications, - * (or, 340 additions, 114 multiplications, 94 fused multiply/add), - * 96 stack variables, 7 constants, and 128 memory accesses - */ -#include "dft/scalar/t.h" - -static void t1_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - DK(KP195090322, +0.195090322016128267848284868477022240927691618); - DK(KP980785280, +0.980785280403230449126182236134239036973933731); - DK(KP555570233, +0.555570233019602224742830813948532874374937191); - DK(KP831469612, +0.831469612302545237078788377617905756738560812); - DK(KP382683432, +0.382683432365089771728459984030398866761344562); - DK(KP923879532, +0.923879532511286756128183189396788286822416626); - DK(KP707106781, +0.707106781186547524400844362104849039284835938); - { - INT m; - for (m = mb, W = W + (mb * 62); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 62, MAKE_VOLATILE_STRIDE(64, rs)) { - E Tj, T5F, T7C, T7Q, T35, T4T, T78, T7m, T1Q, T61, T5Y, T6J, T3K, T59, T41; - E T56, T2B, T67, T6e, T6O, T4b, T5d, T4s, T5g, TG, T7l, T5I, T73, T3a, T4U; - E T3f, T4V, T14, T5N, T5M, T6E, T3m, T4Y, T3r, T4Z, T1r, T5P, T5S, T6F, T3x; - E T51, T3C, T52, T2d, T5Z, T64, T6K, T3V, T57, T44, T5a, T2Y, T6f, T6a, T6P; - E T4m, T5h, T4v, T5e; - { - E T1, T76, T6, T75, Tc, T32, Th, T33; - T1 = ri[0]; - T76 = ii[0]; - { - E T3, T5, T2, T4; - T3 = ri[WS(rs, 16)]; - T5 = ii[WS(rs, 16)]; - T2 = W[30]; - T4 = W[31]; - T6 = FMA(T2, T3, T4 * T5); - T75 = FNMS(T4, T3, T2 * T5); - } - { - E T9, Tb, T8, Ta; - T9 = ri[WS(rs, 8)]; - Tb = ii[WS(rs, 8)]; - T8 = W[14]; - Ta = W[15]; - Tc = FMA(T8, T9, Ta * Tb); - T32 = FNMS(Ta, T9, T8 * Tb); - } - { - E Te, Tg, Td, Tf; - Te = ri[WS(rs, 24)]; - Tg = ii[WS(rs, 24)]; - Td = W[46]; - Tf = W[47]; - Th = FMA(Td, Te, Tf * Tg); - T33 = FNMS(Tf, Te, Td * Tg); - } - { - E T7, Ti, T7A, T7B; - T7 = T1 + T6; - Ti = Tc + Th; - Tj = T7 + Ti; - T5F = T7 - Ti; - T7A = T76 - T75; - T7B = Tc - Th; - T7C = T7A - T7B; - T7Q = T7B + T7A; - } - { - E T31, T34, T74, T77; - T31 = T1 - T6; - T34 = T32 - T33; - T35 = T31 - T34; - T4T = T31 + T34; - T74 = T32 + T33; - T77 = T75 + T76; - T78 = T74 + T77; - T7m = T77 - T74; - } - } - { - E T1y, T3G, T1O, T3Z, T1D, T3H, T1J, T3Y; - { - E T1v, T1x, T1u, T1w; - T1v = ri[WS(rs, 1)]; - T1x = ii[WS(rs, 1)]; - T1u = W[0]; - T1w = W[1]; - T1y = FMA(T1u, T1v, T1w * T1x); - T3G = FNMS(T1w, T1v, T1u * T1x); - } - { - E T1L, T1N, T1K, T1M; - T1L = ri[WS(rs, 25)]; - T1N = ii[WS(rs, 25)]; - T1K = W[48]; - T1M = W[49]; - T1O = FMA(T1K, T1L, T1M * T1N); - T3Z = FNMS(T1M, T1L, T1K * T1N); - } - { - E T1A, T1C, T1z, T1B; - T1A = ri[WS(rs, 17)]; - T1C = ii[WS(rs, 17)]; - T1z = W[32]; - T1B = W[33]; - T1D = FMA(T1z, T1A, T1B * T1C); - T3H = FNMS(T1B, T1A, T1z * T1C); - } - { - E T1G, T1I, T1F, T1H; - T1G = ri[WS(rs, 9)]; - T1I = ii[WS(rs, 9)]; - T1F = W[16]; - T1H = W[17]; - T1J = FMA(T1F, T1G, T1H * T1I); - T3Y = FNMS(T1H, T1G, T1F * T1I); - } - { - E T1E, T1P, T5W, T5X; - T1E = T1y + T1D; - T1P = T1J + T1O; - T1Q = T1E + T1P; - T61 = T1E - T1P; - T5W = T3G + T3H; - T5X = T3Y + T3Z; - T5Y = T5W - T5X; - T6J = T5W + T5X; - } - { - E T3I, T3J, T3X, T40; - T3I = T3G - T3H; - T3J = T1J - T1O; - T3K = T3I + T3J; - T59 = T3I - T3J; - T3X = T1y - T1D; - T40 = T3Y - T3Z; - T41 = T3X - T40; - T56 = T3X + T40; - } - } - { - E T2j, T4o, T2z, T49, T2o, T4p, T2u, T48; - { - E T2g, T2i, T2f, T2h; - T2g = ri[WS(rs, 31)]; - T2i = ii[WS(rs, 31)]; - T2f = W[60]; - T2h = W[61]; - T2j = FMA(T2f, T2g, T2h * T2i); - T4o = FNMS(T2h, T2g, T2f * T2i); - } - { - E T2w, T2y, T2v, T2x; - T2w = ri[WS(rs, 23)]; - T2y = ii[WS(rs, 23)]; - T2v = W[44]; - T2x = W[45]; - T2z = FMA(T2v, T2w, T2x * T2y); - T49 = FNMS(T2x, T2w, T2v * T2y); - } - { - E T2l, T2n, T2k, T2m; - T2l = ri[WS(rs, 15)]; - T2n = ii[WS(rs, 15)]; - T2k = W[28]; - T2m = W[29]; - T2o = FMA(T2k, T2l, T2m * T2n); - T4p = FNMS(T2m, T2l, T2k * T2n); - } - { - E T2r, T2t, T2q, T2s; - T2r = ri[WS(rs, 7)]; - T2t = ii[WS(rs, 7)]; - T2q = W[12]; - T2s = W[13]; - T2u = FMA(T2q, T2r, T2s * T2t); - T48 = FNMS(T2s, T2r, T2q * T2t); - } - { - E T2p, T2A, T6c, T6d; - T2p = T2j + T2o; - T2A = T2u + T2z; - T2B = T2p + T2A; - T67 = T2p - T2A; - T6c = T4o + T4p; - T6d = T48 + T49; - T6e = T6c - T6d; - T6O = T6c + T6d; - } - { - E T47, T4a, T4q, T4r; - T47 = T2j - T2o; - T4a = T48 - T49; - T4b = T47 - T4a; - T5d = T47 + T4a; - T4q = T4o - T4p; - T4r = T2u - T2z; - T4s = T4q + T4r; - T5g = T4q - T4r; - } - } - { - E To, T36, TE, T3d, Tt, T37, Tz, T3c; - { - E Tl, Tn, Tk, Tm; - Tl = ri[WS(rs, 4)]; - Tn = ii[WS(rs, 4)]; - Tk = W[6]; - Tm = W[7]; - To = FMA(Tk, Tl, Tm * Tn); - T36 = FNMS(Tm, Tl, Tk * Tn); - } - { - E TB, TD, TA, TC; - TB = ri[WS(rs, 12)]; - TD = ii[WS(rs, 12)]; - TA = W[22]; - TC = W[23]; - TE = FMA(TA, TB, TC * TD); - T3d = FNMS(TC, TB, TA * TD); - } - { - E Tq, Ts, Tp, Tr; - Tq = ri[WS(rs, 20)]; - Ts = ii[WS(rs, 20)]; - Tp = W[38]; - Tr = W[39]; - Tt = FMA(Tp, Tq, Tr * Ts); - T37 = FNMS(Tr, Tq, Tp * Ts); - } - { - E Tw, Ty, Tv, Tx; - Tw = ri[WS(rs, 28)]; - Ty = ii[WS(rs, 28)]; - Tv = W[54]; - Tx = W[55]; - Tz = FMA(Tv, Tw, Tx * Ty); - T3c = FNMS(Tx, Tw, Tv * Ty); - } - { - E Tu, TF, T5G, T5H; - Tu = To + Tt; - TF = Tz + TE; - TG = Tu + TF; - T7l = TF - Tu; - T5G = T36 + T37; - T5H = T3c + T3d; - T5I = T5G - T5H; - T73 = T5G + T5H; - } - { - E T38, T39, T3b, T3e; - T38 = T36 - T37; - T39 = To - Tt; - T3a = T38 - T39; - T4U = T39 + T38; - T3b = Tz - TE; - T3e = T3c - T3d; - T3f = T3b + T3e; - T4V = T3b - T3e; - } - } - { - E TM, T3i, T12, T3p, TR, T3j, TX, T3o; - { - E TJ, TL, TI, TK; - TJ = ri[WS(rs, 2)]; - TL = ii[WS(rs, 2)]; - TI = W[2]; - TK = W[3]; - TM = FMA(TI, TJ, TK * TL); - T3i = FNMS(TK, TJ, TI * TL); - } - { - E TZ, T11, TY, T10; - TZ = ri[WS(rs, 26)]; - T11 = ii[WS(rs, 26)]; - TY = W[50]; - T10 = W[51]; - T12 = FMA(TY, TZ, T10 * T11); - T3p = FNMS(T10, TZ, TY * T11); - } - { - E TO, TQ, TN, TP; - TO = ri[WS(rs, 18)]; - TQ = ii[WS(rs, 18)]; - TN = W[34]; - TP = W[35]; - TR = FMA(TN, TO, TP * TQ); - T3j = FNMS(TP, TO, TN * TQ); - } - { - E TU, TW, TT, TV; - TU = ri[WS(rs, 10)]; - TW = ii[WS(rs, 10)]; - TT = W[18]; - TV = W[19]; - TX = FMA(TT, TU, TV * TW); - T3o = FNMS(TV, TU, TT * TW); - } - { - E TS, T13, T5K, T5L; - TS = TM + TR; - T13 = TX + T12; - T14 = TS + T13; - T5N = TS - T13; - T5K = T3i + T3j; - T5L = T3o + T3p; - T5M = T5K - T5L; - T6E = T5K + T5L; - } - { - E T3k, T3l, T3n, T3q; - T3k = T3i - T3j; - T3l = TX - T12; - T3m = T3k + T3l; - T4Y = T3k - T3l; - T3n = TM - TR; - T3q = T3o - T3p; - T3r = T3n - T3q; - T4Z = T3n + T3q; - } - } - { - E T19, T3t, T1p, T3A, T1e, T3u, T1k, T3z; - { - E T16, T18, T15, T17; - T16 = ri[WS(rs, 30)]; - T18 = ii[WS(rs, 30)]; - T15 = W[58]; - T17 = W[59]; - T19 = FMA(T15, T16, T17 * T18); - T3t = FNMS(T17, T16, T15 * T18); - } - { - E T1m, T1o, T1l, T1n; - T1m = ri[WS(rs, 22)]; - T1o = ii[WS(rs, 22)]; - T1l = W[42]; - T1n = W[43]; - T1p = FMA(T1l, T1m, T1n * T1o); - T3A = FNMS(T1n, T1m, T1l * T1o); - } - { - E T1b, T1d, T1a, T1c; - T1b = ri[WS(rs, 14)]; - T1d = ii[WS(rs, 14)]; - T1a = W[26]; - T1c = W[27]; - T1e = FMA(T1a, T1b, T1c * T1d); - T3u = FNMS(T1c, T1b, T1a * T1d); - } - { - E T1h, T1j, T1g, T1i; - T1h = ri[WS(rs, 6)]; - T1j = ii[WS(rs, 6)]; - T1g = W[10]; - T1i = W[11]; - T1k = FMA(T1g, T1h, T1i * T1j); - T3z = FNMS(T1i, T1h, T1g * T1j); - } - { - E T1f, T1q, T5Q, T5R; - T1f = T19 + T1e; - T1q = T1k + T1p; - T1r = T1f + T1q; - T5P = T1f - T1q; - T5Q = T3t + T3u; - T5R = T3z + T3A; - T5S = T5Q - T5R; - T6F = T5Q + T5R; - } - { - E T3v, T3w, T3y, T3B; - T3v = T3t - T3u; - T3w = T1k - T1p; - T3x = T3v + T3w; - T51 = T3v - T3w; - T3y = T19 - T1e; - T3B = T3z - T3A; - T3C = T3y - T3B; - T52 = T3y + T3B; - } - } - { - E T1V, T3R, T20, T3S, T3Q, T3T, T26, T3M, T2b, T3N, T3L, T3O; - { - E T1S, T1U, T1R, T1T; - T1S = ri[WS(rs, 5)]; - T1U = ii[WS(rs, 5)]; - T1R = W[8]; - T1T = W[9]; - T1V = FMA(T1R, T1S, T1T * T1U); - T3R = FNMS(T1T, T1S, T1R * T1U); - } - { - E T1X, T1Z, T1W, T1Y; - T1X = ri[WS(rs, 21)]; - T1Z = ii[WS(rs, 21)]; - T1W = W[40]; - T1Y = W[41]; - T20 = FMA(T1W, T1X, T1Y * T1Z); - T3S = FNMS(T1Y, T1X, T1W * T1Z); - } - T3Q = T1V - T20; - T3T = T3R - T3S; - { - E T23, T25, T22, T24; - T23 = ri[WS(rs, 29)]; - T25 = ii[WS(rs, 29)]; - T22 = W[56]; - T24 = W[57]; - T26 = FMA(T22, T23, T24 * T25); - T3M = FNMS(T24, T23, T22 * T25); - } - { - E T28, T2a, T27, T29; - T28 = ri[WS(rs, 13)]; - T2a = ii[WS(rs, 13)]; - T27 = W[24]; - T29 = W[25]; - T2b = FMA(T27, T28, T29 * T2a); - T3N = FNMS(T29, T28, T27 * T2a); - } - T3L = T26 - T2b; - T3O = T3M - T3N; - { - E T21, T2c, T62, T63; - T21 = T1V + T20; - T2c = T26 + T2b; - T2d = T21 + T2c; - T5Z = T2c - T21; - T62 = T3R + T3S; - T63 = T3M + T3N; - T64 = T62 - T63; - T6K = T62 + T63; - } - { - E T3P, T3U, T42, T43; - T3P = T3L - T3O; - T3U = T3Q + T3T; - T3V = KP707106781 * (T3P - T3U); - T57 = KP707106781 * (T3U + T3P); - T42 = T3T - T3Q; - T43 = T3L + T3O; - T44 = KP707106781 * (T42 - T43); - T5a = KP707106781 * (T42 + T43); - } - } - { - E T2G, T4c, T2L, T4d, T4e, T4f, T2R, T4i, T2W, T4j, T4h, T4k; - { - E T2D, T2F, T2C, T2E; - T2D = ri[WS(rs, 3)]; - T2F = ii[WS(rs, 3)]; - T2C = W[4]; - T2E = W[5]; - T2G = FMA(T2C, T2D, T2E * T2F); - T4c = FNMS(T2E, T2D, T2C * T2F); - } - { - E T2I, T2K, T2H, T2J; - T2I = ri[WS(rs, 19)]; - T2K = ii[WS(rs, 19)]; - T2H = W[36]; - T2J = W[37]; - T2L = FMA(T2H, T2I, T2J * T2K); - T4d = FNMS(T2J, T2I, T2H * T2K); - } - T4e = T4c - T4d; - T4f = T2G - T2L; - { - E T2O, T2Q, T2N, T2P; - T2O = ri[WS(rs, 27)]; - T2Q = ii[WS(rs, 27)]; - T2N = W[52]; - T2P = W[53]; - T2R = FMA(T2N, T2O, T2P * T2Q); - T4i = FNMS(T2P, T2O, T2N * T2Q); - } - { - E T2T, T2V, T2S, T2U; - T2T = ri[WS(rs, 11)]; - T2V = ii[WS(rs, 11)]; - T2S = W[20]; - T2U = W[21]; - T2W = FMA(T2S, T2T, T2U * T2V); - T4j = FNMS(T2U, T2T, T2S * T2V); - } - T4h = T2R - T2W; - T4k = T4i - T4j; - { - E T2M, T2X, T68, T69; - T2M = T2G + T2L; - T2X = T2R + T2W; - T2Y = T2M + T2X; - T6f = T2X - T2M; - T68 = T4c + T4d; - T69 = T4i + T4j; - T6a = T68 - T69; - T6P = T68 + T69; - } - { - E T4g, T4l, T4t, T4u; - T4g = T4e - T4f; - T4l = T4h + T4k; - T4m = KP707106781 * (T4g - T4l); - T5h = KP707106781 * (T4g + T4l); - T4t = T4h - T4k; - T4u = T4f + T4e; - T4v = KP707106781 * (T4t - T4u); - T5e = KP707106781 * (T4u + T4t); - } - } - { - E T1t, T6X, T7a, T7c, T30, T7b, T70, T71; - { - E TH, T1s, T72, T79; - TH = Tj + TG; - T1s = T14 + T1r; - T1t = TH + T1s; - T6X = TH - T1s; - T72 = T6E + T6F; - T79 = T73 + T78; - T7a = T72 + T79; - T7c = T79 - T72; - } - { - E T2e, T2Z, T6Y, T6Z; - T2e = T1Q + T2d; - T2Z = T2B + T2Y; - T30 = T2e + T2Z; - T7b = T2Z - T2e; - T6Y = T6J + T6K; - T6Z = T6O + T6P; - T70 = T6Y - T6Z; - T71 = T6Y + T6Z; - } - ri[WS(rs, 16)] = T1t - T30; - ii[WS(rs, 16)] = T7a - T71; - ri[0] = T1t + T30; - ii[0] = T71 + T7a; - ri[WS(rs, 24)] = T6X - T70; - ii[WS(rs, 24)] = T7c - T7b; - ri[WS(rs, 8)] = T6X + T70; - ii[WS(rs, 8)] = T7b + T7c; - } - { - E T6H, T6T, T7g, T7i, T6M, T6U, T6R, T6V; - { - E T6D, T6G, T7e, T7f; - T6D = Tj - TG; - T6G = T6E - T6F; - T6H = T6D + T6G; - T6T = T6D - T6G; - T7e = T1r - T14; - T7f = T78 - T73; - T7g = T7e + T7f; - T7i = T7f - T7e; - } - { - E T6I, T6L, T6N, T6Q; - T6I = T1Q - T2d; - T6L = T6J - T6K; - T6M = T6I + T6L; - T6U = T6L - T6I; - T6N = T2B - T2Y; - T6Q = T6O - T6P; - T6R = T6N - T6Q; - T6V = T6N + T6Q; - } - { - E T6S, T7d, T6W, T7h; - T6S = KP707106781 * (T6M + T6R); - ri[WS(rs, 20)] = T6H - T6S; - ri[WS(rs, 4)] = T6H + T6S; - T7d = KP707106781 * (T6U + T6V); - ii[WS(rs, 4)] = T7d + T7g; - ii[WS(rs, 20)] = T7g - T7d; - T6W = KP707106781 * (T6U - T6V); - ri[WS(rs, 28)] = T6T - T6W; - ri[WS(rs, 12)] = T6T + T6W; - T7h = KP707106781 * (T6R - T6M); - ii[WS(rs, 12)] = T7h + T7i; - ii[WS(rs, 28)] = T7i - T7h; - } - } - { - E T5J, T7n, T7t, T6n, T5U, T7k, T6x, T6B, T6q, T7s, T66, T6k, T6u, T6A, T6h; - E T6l; - { - E T5O, T5T, T60, T65; - T5J = T5F - T5I; - T7n = T7l + T7m; - T7t = T7m - T7l; - T6n = T5F + T5I; - T5O = T5M - T5N; - T5T = T5P + T5S; - T5U = KP707106781 * (T5O - T5T); - T7k = KP707106781 * (T5O + T5T); - { - E T6v, T6w, T6o, T6p; - T6v = T67 + T6a; - T6w = T6e + T6f; - T6x = FNMS(KP382683432, T6w, KP923879532 * T6v); - T6B = FMA(KP923879532, T6w, KP382683432 * T6v); - T6o = T5N + T5M; - T6p = T5P - T5S; - T6q = KP707106781 * (T6o + T6p); - T7s = KP707106781 * (T6p - T6o); - } - T60 = T5Y - T5Z; - T65 = T61 - T64; - T66 = FMA(KP923879532, T60, KP382683432 * T65); - T6k = FNMS(KP923879532, T65, KP382683432 * T60); - { - E T6s, T6t, T6b, T6g; - T6s = T5Y + T5Z; - T6t = T61 + T64; - T6u = FMA(KP382683432, T6s, KP923879532 * T6t); - T6A = FNMS(KP382683432, T6t, KP923879532 * T6s); - T6b = T67 - T6a; - T6g = T6e - T6f; - T6h = FNMS(KP923879532, T6g, KP382683432 * T6b); - T6l = FMA(KP382683432, T6g, KP923879532 * T6b); - } - } - { - E T5V, T6i, T7r, T7u; - T5V = T5J + T5U; - T6i = T66 + T6h; - ri[WS(rs, 22)] = T5V - T6i; - ri[WS(rs, 6)] = T5V + T6i; - T7r = T6k + T6l; - T7u = T7s + T7t; - ii[WS(rs, 6)] = T7r + T7u; - ii[WS(rs, 22)] = T7u - T7r; - } - { - E T6j, T6m, T7v, T7w; - T6j = T5J - T5U; - T6m = T6k - T6l; - ri[WS(rs, 30)] = T6j - T6m; - ri[WS(rs, 14)] = T6j + T6m; - T7v = T6h - T66; - T7w = T7t - T7s; - ii[WS(rs, 14)] = T7v + T7w; - ii[WS(rs, 30)] = T7w - T7v; - } - { - E T6r, T6y, T7j, T7o; - T6r = T6n + T6q; - T6y = T6u + T6x; - ri[WS(rs, 18)] = T6r - T6y; - ri[WS(rs, 2)] = T6r + T6y; - T7j = T6A + T6B; - T7o = T7k + T7n; - ii[WS(rs, 2)] = T7j + T7o; - ii[WS(rs, 18)] = T7o - T7j; - } - { - E T6z, T6C, T7p, T7q; - T6z = T6n - T6q; - T6C = T6A - T6B; - ri[WS(rs, 26)] = T6z - T6C; - ri[WS(rs, 10)] = T6z + T6C; - T7p = T6x - T6u; - T7q = T7n - T7k; - ii[WS(rs, 10)] = T7p + T7q; - ii[WS(rs, 26)] = T7q - T7p; - } - } - { - E T3h, T4D, T7R, T7X, T3E, T7O, T4N, T4R, T46, T4A, T4G, T7W, T4K, T4Q, T4x; - E T4B, T3g, T7P; - T3g = KP707106781 * (T3a - T3f); - T3h = T35 - T3g; - T4D = T35 + T3g; - T7P = KP707106781 * (T4V - T4U); - T7R = T7P + T7Q; - T7X = T7Q - T7P; - { - E T3s, T3D, T4L, T4M; - T3s = FNMS(KP923879532, T3r, KP382683432 * T3m); - T3D = FMA(KP382683432, T3x, KP923879532 * T3C); - T3E = T3s - T3D; - T7O = T3s + T3D; - T4L = T4b + T4m; - T4M = T4s + T4v; - T4N = FNMS(KP555570233, T4M, KP831469612 * T4L); - T4R = FMA(KP831469612, T4M, KP555570233 * T4L); - } - { - E T3W, T45, T4E, T4F; - T3W = T3K - T3V; - T45 = T41 - T44; - T46 = FMA(KP980785280, T3W, KP195090322 * T45); - T4A = FNMS(KP980785280, T45, KP195090322 * T3W); - T4E = FMA(KP923879532, T3m, KP382683432 * T3r); - T4F = FNMS(KP923879532, T3x, KP382683432 * T3C); - T4G = T4E + T4F; - T7W = T4F - T4E; - } - { - E T4I, T4J, T4n, T4w; - T4I = T3K + T3V; - T4J = T41 + T44; - T4K = FMA(KP555570233, T4I, KP831469612 * T4J); - T4Q = FNMS(KP555570233, T4J, KP831469612 * T4I); - T4n = T4b - T4m; - T4w = T4s - T4v; - T4x = FNMS(KP980785280, T4w, KP195090322 * T4n); - T4B = FMA(KP195090322, T4w, KP980785280 * T4n); - } - { - E T3F, T4y, T7V, T7Y; - T3F = T3h + T3E; - T4y = T46 + T4x; - ri[WS(rs, 23)] = T3F - T4y; - ri[WS(rs, 7)] = T3F + T4y; - T7V = T4A + T4B; - T7Y = T7W + T7X; - ii[WS(rs, 7)] = T7V + T7Y; - ii[WS(rs, 23)] = T7Y - T7V; - } - { - E T4z, T4C, T7Z, T80; - T4z = T3h - T3E; - T4C = T4A - T4B; - ri[WS(rs, 31)] = T4z - T4C; - ri[WS(rs, 15)] = T4z + T4C; - T7Z = T4x - T46; - T80 = T7X - T7W; - ii[WS(rs, 15)] = T7Z + T80; - ii[WS(rs, 31)] = T80 - T7Z; - } - { - E T4H, T4O, T7N, T7S; - T4H = T4D + T4G; - T4O = T4K + T4N; - ri[WS(rs, 19)] = T4H - T4O; - ri[WS(rs, 3)] = T4H + T4O; - T7N = T4Q + T4R; - T7S = T7O + T7R; - ii[WS(rs, 3)] = T7N + T7S; - ii[WS(rs, 19)] = T7S - T7N; - } - { - E T4P, T4S, T7T, T7U; - T4P = T4D - T4G; - T4S = T4Q - T4R; - ri[WS(rs, 27)] = T4P - T4S; - ri[WS(rs, 11)] = T4P + T4S; - T7T = T4N - T4K; - T7U = T7R - T7O; - ii[WS(rs, 11)] = T7T + T7U; - ii[WS(rs, 27)] = T7U - T7T; - } - } - { - E T4X, T5p, T7D, T7J, T54, T7y, T5z, T5D, T5c, T5m, T5s, T7I, T5w, T5C, T5j; - E T5n, T4W, T7z; - T4W = KP707106781 * (T4U + T4V); - T4X = T4T - T4W; - T5p = T4T + T4W; - T7z = KP707106781 * (T3a + T3f); - T7D = T7z + T7C; - T7J = T7C - T7z; - { - E T50, T53, T5x, T5y; - T50 = FNMS(KP382683432, T4Z, KP923879532 * T4Y); - T53 = FMA(KP923879532, T51, KP382683432 * T52); - T54 = T50 - T53; - T7y = T50 + T53; - T5x = T5d + T5e; - T5y = T5g + T5h; - T5z = FNMS(KP195090322, T5y, KP980785280 * T5x); - T5D = FMA(KP195090322, T5x, KP980785280 * T5y); - } - { - E T58, T5b, T5q, T5r; - T58 = T56 - T57; - T5b = T59 - T5a; - T5c = FMA(KP555570233, T58, KP831469612 * T5b); - T5m = FNMS(KP831469612, T58, KP555570233 * T5b); - T5q = FMA(KP382683432, T4Y, KP923879532 * T4Z); - T5r = FNMS(KP382683432, T51, KP923879532 * T52); - T5s = T5q + T5r; - T7I = T5r - T5q; - } - { - E T5u, T5v, T5f, T5i; - T5u = T56 + T57; - T5v = T59 + T5a; - T5w = FMA(KP980785280, T5u, KP195090322 * T5v); - T5C = FNMS(KP195090322, T5u, KP980785280 * T5v); - T5f = T5d - T5e; - T5i = T5g - T5h; - T5j = FNMS(KP831469612, T5i, KP555570233 * T5f); - T5n = FMA(KP831469612, T5f, KP555570233 * T5i); - } - { - E T55, T5k, T7H, T7K; - T55 = T4X + T54; - T5k = T5c + T5j; - ri[WS(rs, 21)] = T55 - T5k; - ri[WS(rs, 5)] = T55 + T5k; - T7H = T5m + T5n; - T7K = T7I + T7J; - ii[WS(rs, 5)] = T7H + T7K; - ii[WS(rs, 21)] = T7K - T7H; - } - { - E T5l, T5o, T7L, T7M; - T5l = T4X - T54; - T5o = T5m - T5n; - ri[WS(rs, 29)] = T5l - T5o; - ri[WS(rs, 13)] = T5l + T5o; - T7L = T5j - T5c; - T7M = T7J - T7I; - ii[WS(rs, 13)] = T7L + T7M; - ii[WS(rs, 29)] = T7M - T7L; - } - { - E T5t, T5A, T7x, T7E; - T5t = T5p + T5s; - T5A = T5w + T5z; - ri[WS(rs, 17)] = T5t - T5A; - ri[WS(rs, 1)] = T5t + T5A; - T7x = T5C + T5D; - T7E = T7y + T7D; - ii[WS(rs, 1)] = T7x + T7E; - ii[WS(rs, 17)] = T7E - T7x; - } - { - E T5B, T5E, T7F, T7G; - T5B = T5p - T5s; - T5E = T5C - T5D; - ri[WS(rs, 25)] = T5B - T5E; - ri[WS(rs, 9)] = T5B + T5E; - T7F = T5z - T5w; - T7G = T7D - T7y; - ii[WS(rs, 9)] = T7F + T7G; - ii[WS(rs, 25)] = T7G - T7F; - } - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_FULL, 0, 32 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 32, "t1_32", twinstr, &GENUS, { 340, 114, 94, 0 }, 0, 0, 0 }; - -void X(codelet_t1_32) (planner *p) { - X(kdft_dit_register) (p, t1_32, &desc); -} -#endif diff --git a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t1_64.c b/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t1_64.c deleted file mode 100644 index 836512b3..00000000 --- a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t1_64.c +++ /dev/null @@ -1,4105 +0,0 @@ -/* - * Copyright (c) 2003, 2007-14 Matteo Frigo - * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - */ - -/* This file was automatically generated --- DO NOT EDIT */ -/* Generated on Tue Sep 14 10:44:29 EDT 2021 */ - -#include "dft/codelet-dft.h" - -#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA) - -/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 64 -name t1_64 -include dft/scalar/t.h */ - -/* - * This function contains 1038 FP additions, 644 FP multiplications, - * (or, 520 additions, 126 multiplications, 518 fused multiply/add), - * 190 stack variables, 15 constants, and 256 memory accesses - */ -#include "dft/scalar/t.h" - -static void t1_64(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - DK(KP995184726, +0.995184726672196886244836953109479921575474869); - DK(KP773010453, +0.773010453362736960810906609758469800971041293); - DK(KP956940335, +0.956940335732208864935797886980269969482849206); - DK(KP881921264, +0.881921264348355029712756863660388349508442621); - DK(KP098491403, +0.098491403357164253077197521291327432293052451); - DK(KP820678790, +0.820678790828660330972281985331011598767386482); - DK(KP303346683, +0.303346683607342391675883946941299872384187453); - DK(KP534511135, +0.534511135950791641089685961295362908582039528); - DK(KP980785280, +0.980785280403230449126182236134239036973933731); - DK(KP831469612, +0.831469612302545237078788377617905756738560812); - DK(KP198912367, +0.198912367379658006911597622644676228597850501); - DK(KP668178637, +0.668178637919298919997757686523080761552472251); - DK(KP923879532, +0.923879532511286756128183189396788286822416626); - DK(KP707106781, +0.707106781186547524400844362104849039284835938); - DK(KP414213562, +0.414213562373095048801688724209698078569671875); - { - INT m; - for (m = mb, W = W + (mb * 126); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 126, MAKE_VOLATILE_STRIDE(128, rs)) { - E Tm, TeM, TjR, Tkl, T7e, TcA, TiV, Tjm, T1G, TeW, TeZ, Ths, T7Q, TcJ, T7X; - E TcI, T29, Tf8, Tf5, Thv, T87, TcN, T8u, TcQ, T5K, Tg9, TfU, ThS, Taq, Tdm; - E Tbj, Tdx, TN, Tjl, TeP, TiP, T7l, TcB, T7s, TcC, T1f, TeR, TeU, Thr, T7B; - E TcG, T7I, TcF, T32, Tfj, Tfg, ThB, T8G, TcU, T93, TcX, T3X, TfI, Tft, ThH; - E T9h, Td3, Taa, Tde, T2A, Tf6, Tfb, Thw, T8m, TcR, T8x, TcO, T3t, Tfh, Tfm; - E ThC, T8V, TcY, T96, TcV, T4o, Tfu, TfL, ThI, T9w, Tdf, Tad, Td4, T6b, TfV; - E Tgc, ThT, TaF, Tdy, Tbm, Tdn, T4Q, ThN, TfA, TfN, Ta1, Tdh, Taf, Td8, T5h; - E ThO, TfF, TfO, T9M, Tdi, Tag, Tdb, T6D, ThY, Tg1, Tge, Tba, TdA, Tbo, Tdr; - E T74, ThZ, Tg6, Tgf, TaV, TdB, Tbp, Tdu; - { - E T1, TiT, T7, TiS, Te, T7a, Tk, T7c; - T1 = ri[0]; - TiT = ii[0]; - { - E T3, T6, T4, TiR, T2, T5; - T3 = ri[WS(rs, 32)]; - T6 = ii[WS(rs, 32)]; - T2 = W[62]; - T4 = T2 * T3; - TiR = T2 * T6; - T5 = W[63]; - T7 = FMA(T5, T6, T4); - TiS = FNMS(T5, T3, TiR); - } - { - E Ta, Td, Tb, T79, T9, Tc; - Ta = ri[WS(rs, 16)]; - Td = ii[WS(rs, 16)]; - T9 = W[30]; - Tb = T9 * Ta; - T79 = T9 * Td; - Tc = W[31]; - Te = FMA(Tc, Td, Tb); - T7a = FNMS(Tc, Ta, T79); - } - { - E Tg, Tj, Th, T7b, Tf, Ti; - Tg = ri[WS(rs, 48)]; - Tj = ii[WS(rs, 48)]; - Tf = W[94]; - Th = Tf * Tg; - T7b = Tf * Tj; - Ti = W[95]; - Tk = FMA(Ti, Tj, Th); - T7c = FNMS(Ti, Tg, T7b); - } - { - E T8, Tl, TjP, TjQ; - T8 = T1 + T7; - Tl = Te + Tk; - Tm = T8 + Tl; - TeM = T8 - Tl; - TjP = TiT - TiS; - TjQ = Te - Tk; - TjR = TjP - TjQ; - Tkl = TjQ + TjP; - } - { - E T78, T7d, TiQ, TiU; - T78 = T1 - T7; - T7d = T7a - T7c; - T7e = T78 - T7d; - TcA = T78 + T7d; - TiQ = T7a + T7c; - TiU = TiS + TiT; - TiV = TiQ + TiU; - Tjm = TiU - TiQ; - } - } - { - E T1l, T7L, T1E, T7V, T1r, T7N, T1y, T7T; - { - E T1h, T1k, T1i, T7K, T1g, T1j; - T1h = ri[WS(rs, 60)]; - T1k = ii[WS(rs, 60)]; - T1g = W[118]; - T1i = T1g * T1h; - T7K = T1g * T1k; - T1j = W[119]; - T1l = FMA(T1j, T1k, T1i); - T7L = FNMS(T1j, T1h, T7K); - } - { - E T1A, T1D, T1B, T7U, T1z, T1C; - T1A = ri[WS(rs, 44)]; - T1D = ii[WS(rs, 44)]; - T1z = W[86]; - T1B = T1z * T1A; - T7U = T1z * T1D; - T1C = W[87]; - T1E = FMA(T1C, T1D, T1B); - T7V = FNMS(T1C, T1A, T7U); - } - { - E T1n, T1q, T1o, T7M, T1m, T1p; - T1n = ri[WS(rs, 28)]; - T1q = ii[WS(rs, 28)]; - T1m = W[54]; - T1o = T1m * T1n; - T7M = T1m * T1q; - T1p = W[55]; - T1r = FMA(T1p, T1q, T1o); - T7N = FNMS(T1p, T1n, T7M); - } - { - E T1u, T1x, T1v, T7S, T1t, T1w; - T1u = ri[WS(rs, 12)]; - T1x = ii[WS(rs, 12)]; - T1t = W[22]; - T1v = T1t * T1u; - T7S = T1t * T1x; - T1w = W[23]; - T1y = FMA(T1w, T1x, T1v); - T7T = FNMS(T1w, T1u, T7S); - } - { - E T1s, T1F, TeX, TeY; - T1s = T1l + T1r; - T1F = T1y + T1E; - T1G = T1s + T1F; - TeW = T1s - T1F; - TeX = T7L + T7N; - TeY = T7T + T7V; - TeZ = TeX - TeY; - Ths = TeX + TeY; - } - { - E T7O, T7P, T7R, T7W; - T7O = T7L - T7N; - T7P = T1y - T1E; - T7Q = T7O + T7P; - TcJ = T7O - T7P; - T7R = T1l - T1r; - T7W = T7T - T7V; - T7X = T7R - T7W; - TcI = T7R + T7W; - } - } - { - E T1O, T82, T27, T8s, T1U, T84, T21, T8q; - { - E T1K, T1N, T1L, T81, T1J, T1M; - T1K = ri[WS(rs, 2)]; - T1N = ii[WS(rs, 2)]; - T1J = W[2]; - T1L = T1J * T1K; - T81 = T1J * T1N; - T1M = W[3]; - T1O = FMA(T1M, T1N, T1L); - T82 = FNMS(T1M, T1K, T81); - } - { - E T23, T26, T24, T8r, T22, T25; - T23 = ri[WS(rs, 50)]; - T26 = ii[WS(rs, 50)]; - T22 = W[98]; - T24 = T22 * T23; - T8r = T22 * T26; - T25 = W[99]; - T27 = FMA(T25, T26, T24); - T8s = FNMS(T25, T23, T8r); - } - { - E T1Q, T1T, T1R, T83, T1P, T1S; - T1Q = ri[WS(rs, 34)]; - T1T = ii[WS(rs, 34)]; - T1P = W[66]; - T1R = T1P * T1Q; - T83 = T1P * T1T; - T1S = W[67]; - T1U = FMA(T1S, T1T, T1R); - T84 = FNMS(T1S, T1Q, T83); - } - { - E T1X, T20, T1Y, T8p, T1W, T1Z; - T1X = ri[WS(rs, 18)]; - T20 = ii[WS(rs, 18)]; - T1W = W[34]; - T1Y = T1W * T1X; - T8p = T1W * T20; - T1Z = W[35]; - T21 = FMA(T1Z, T20, T1Y); - T8q = FNMS(T1Z, T1X, T8p); - } - { - E T1V, T28, Tf3, Tf4; - T1V = T1O + T1U; - T28 = T21 + T27; - T29 = T1V + T28; - Tf8 = T1V - T28; - Tf3 = T82 + T84; - Tf4 = T8q + T8s; - Tf5 = Tf3 - Tf4; - Thv = Tf3 + Tf4; - } - { - E T85, T86, T8o, T8t; - T85 = T82 - T84; - T86 = T21 - T27; - T87 = T85 + T86; - TcN = T85 - T86; - T8o = T1O - T1U; - T8t = T8q - T8s; - T8u = T8o - T8t; - TcQ = T8o + T8t; - } - } - { - E T5p, Tal, T5I, Tbh, T5v, Tan, T5C, Tbf; - { - E T5l, T5o, T5m, Tak, T5k, T5n; - T5l = ri[WS(rs, 63)]; - T5o = ii[WS(rs, 63)]; - T5k = W[124]; - T5m = T5k * T5l; - Tak = T5k * T5o; - T5n = W[125]; - T5p = FMA(T5n, T5o, T5m); - Tal = FNMS(T5n, T5l, Tak); - } - { - E T5E, T5H, T5F, Tbg, T5D, T5G; - T5E = ri[WS(rs, 47)]; - T5H = ii[WS(rs, 47)]; - T5D = W[92]; - T5F = T5D * T5E; - Tbg = T5D * T5H; - T5G = W[93]; - T5I = FMA(T5G, T5H, T5F); - Tbh = FNMS(T5G, T5E, Tbg); - } - { - E T5r, T5u, T5s, Tam, T5q, T5t; - T5r = ri[WS(rs, 31)]; - T5u = ii[WS(rs, 31)]; - T5q = W[60]; - T5s = T5q * T5r; - Tam = T5q * T5u; - T5t = W[61]; - T5v = FMA(T5t, T5u, T5s); - Tan = FNMS(T5t, T5r, Tam); - } - { - E T5y, T5B, T5z, Tbe, T5x, T5A; - T5y = ri[WS(rs, 15)]; - T5B = ii[WS(rs, 15)]; - T5x = W[28]; - T5z = T5x * T5y; - Tbe = T5x * T5B; - T5A = W[29]; - T5C = FMA(T5A, T5B, T5z); - Tbf = FNMS(T5A, T5y, Tbe); - } - { - E T5w, T5J, TfS, TfT; - T5w = T5p + T5v; - T5J = T5C + T5I; - T5K = T5w + T5J; - Tg9 = T5w - T5J; - TfS = Tal + Tan; - TfT = Tbf + Tbh; - TfU = TfS - TfT; - ThS = TfS + TfT; - } - { - E Tao, Tap, Tbd, Tbi; - Tao = Tal - Tan; - Tap = T5C - T5I; - Taq = Tao + Tap; - Tdm = Tao - Tap; - Tbd = T5p - T5v; - Tbi = Tbf - Tbh; - Tbj = Tbd - Tbi; - Tdx = Tbd + Tbi; - } - } - { - E Ts, T7g, TL, T7q, Ty, T7i, TF, T7o; - { - E To, Tr, Tp, T7f, Tn, Tq; - To = ri[WS(rs, 8)]; - Tr = ii[WS(rs, 8)]; - Tn = W[14]; - Tp = Tn * To; - T7f = Tn * Tr; - Tq = W[15]; - Ts = FMA(Tq, Tr, Tp); - T7g = FNMS(Tq, To, T7f); - } - { - E TH, TK, TI, T7p, TG, TJ; - TH = ri[WS(rs, 24)]; - TK = ii[WS(rs, 24)]; - TG = W[46]; - TI = TG * TH; - T7p = TG * TK; - TJ = W[47]; - TL = FMA(TJ, TK, TI); - T7q = FNMS(TJ, TH, T7p); - } - { - E Tu, Tx, Tv, T7h, Tt, Tw; - Tu = ri[WS(rs, 40)]; - Tx = ii[WS(rs, 40)]; - Tt = W[78]; - Tv = Tt * Tu; - T7h = Tt * Tx; - Tw = W[79]; - Ty = FMA(Tw, Tx, Tv); - T7i = FNMS(Tw, Tu, T7h); - } - { - E TB, TE, TC, T7n, TA, TD; - TB = ri[WS(rs, 56)]; - TE = ii[WS(rs, 56)]; - TA = W[110]; - TC = TA * TB; - T7n = TA * TE; - TD = W[111]; - TF = FMA(TD, TE, TC); - T7o = FNMS(TD, TB, T7n); - } - { - E Tz, TM, TeN, TeO; - Tz = Ts + Ty; - TM = TF + TL; - TN = Tz + TM; - Tjl = TM - Tz; - TeN = T7g + T7i; - TeO = T7o + T7q; - TeP = TeN - TeO; - TiP = TeN + TeO; - } - { - E T7j, T7k, T7m, T7r; - T7j = T7g - T7i; - T7k = Ts - Ty; - T7l = T7j - T7k; - TcB = T7k + T7j; - T7m = TF - TL; - T7r = T7o - T7q; - T7s = T7m + T7r; - TcC = T7m - T7r; - } - } - { - E TU, T7w, T1d, T7G, T10, T7y, T17, T7E; - { - E TQ, TT, TR, T7v, TP, TS; - TQ = ri[WS(rs, 4)]; - TT = ii[WS(rs, 4)]; - TP = W[6]; - TR = TP * TQ; - T7v = TP * TT; - TS = W[7]; - TU = FMA(TS, TT, TR); - T7w = FNMS(TS, TQ, T7v); - } - { - E T19, T1c, T1a, T7F, T18, T1b; - T19 = ri[WS(rs, 52)]; - T1c = ii[WS(rs, 52)]; - T18 = W[102]; - T1a = T18 * T19; - T7F = T18 * T1c; - T1b = W[103]; - T1d = FMA(T1b, T1c, T1a); - T7G = FNMS(T1b, T19, T7F); - } - { - E TW, TZ, TX, T7x, TV, TY; - TW = ri[WS(rs, 36)]; - TZ = ii[WS(rs, 36)]; - TV = W[70]; - TX = TV * TW; - T7x = TV * TZ; - TY = W[71]; - T10 = FMA(TY, TZ, TX); - T7y = FNMS(TY, TW, T7x); - } - { - E T13, T16, T14, T7D, T12, T15; - T13 = ri[WS(rs, 20)]; - T16 = ii[WS(rs, 20)]; - T12 = W[38]; - T14 = T12 * T13; - T7D = T12 * T16; - T15 = W[39]; - T17 = FMA(T15, T16, T14); - T7E = FNMS(T15, T13, T7D); - } - { - E T11, T1e, TeS, TeT; - T11 = TU + T10; - T1e = T17 + T1d; - T1f = T11 + T1e; - TeR = T11 - T1e; - TeS = T7w + T7y; - TeT = T7E + T7G; - TeU = TeS - TeT; - Thr = TeS + TeT; - } - { - E T7z, T7A, T7C, T7H; - T7z = T7w - T7y; - T7A = T17 - T1d; - T7B = T7z + T7A; - TcG = T7z - T7A; - T7C = TU - T10; - T7H = T7E - T7G; - T7I = T7C - T7H; - TcF = T7C + T7H; - } - } - { - E T2H, T8B, T30, T91, T2N, T8D, T2U, T8Z; - { - E T2D, T2G, T2E, T8A, T2C, T2F; - T2D = ri[WS(rs, 62)]; - T2G = ii[WS(rs, 62)]; - T2C = W[122]; - T2E = T2C * T2D; - T8A = T2C * T2G; - T2F = W[123]; - T2H = FMA(T2F, T2G, T2E); - T8B = FNMS(T2F, T2D, T8A); - } - { - E T2W, T2Z, T2X, T90, T2V, T2Y; - T2W = ri[WS(rs, 46)]; - T2Z = ii[WS(rs, 46)]; - T2V = W[90]; - T2X = T2V * T2W; - T90 = T2V * T2Z; - T2Y = W[91]; - T30 = FMA(T2Y, T2Z, T2X); - T91 = FNMS(T2Y, T2W, T90); - } - { - E T2J, T2M, T2K, T8C, T2I, T2L; - T2J = ri[WS(rs, 30)]; - T2M = ii[WS(rs, 30)]; - T2I = W[58]; - T2K = T2I * T2J; - T8C = T2I * T2M; - T2L = W[59]; - T2N = FMA(T2L, T2M, T2K); - T8D = FNMS(T2L, T2J, T8C); - } - { - E T2Q, T2T, T2R, T8Y, T2P, T2S; - T2Q = ri[WS(rs, 14)]; - T2T = ii[WS(rs, 14)]; - T2P = W[26]; - T2R = T2P * T2Q; - T8Y = T2P * T2T; - T2S = W[27]; - T2U = FMA(T2S, T2T, T2R); - T8Z = FNMS(T2S, T2Q, T8Y); - } - { - E T2O, T31, Tfe, Tff; - T2O = T2H + T2N; - T31 = T2U + T30; - T32 = T2O + T31; - Tfj = T2O - T31; - Tfe = T8B + T8D; - Tff = T8Z + T91; - Tfg = Tfe - Tff; - ThB = Tfe + Tff; - } - { - E T8E, T8F, T8X, T92; - T8E = T8B - T8D; - T8F = T2U - T30; - T8G = T8E + T8F; - TcU = T8E - T8F; - T8X = T2H - T2N; - T92 = T8Z - T91; - T93 = T8X - T92; - TcX = T8X + T92; - } - } - { - E T3C, T9c, T3V, Ta8, T3I, T9e, T3P, Ta6; - { - E T3y, T3B, T3z, T9b, T3x, T3A; - T3y = ri[WS(rs, 1)]; - T3B = ii[WS(rs, 1)]; - T3x = W[0]; - T3z = T3x * T3y; - T9b = T3x * T3B; - T3A = W[1]; - T3C = FMA(T3A, T3B, T3z); - T9c = FNMS(T3A, T3y, T9b); - } - { - E T3R, T3U, T3S, Ta7, T3Q, T3T; - T3R = ri[WS(rs, 49)]; - T3U = ii[WS(rs, 49)]; - T3Q = W[96]; - T3S = T3Q * T3R; - Ta7 = T3Q * T3U; - T3T = W[97]; - T3V = FMA(T3T, T3U, T3S); - Ta8 = FNMS(T3T, T3R, Ta7); - } - { - E T3E, T3H, T3F, T9d, T3D, T3G; - T3E = ri[WS(rs, 33)]; - T3H = ii[WS(rs, 33)]; - T3D = W[64]; - T3F = T3D * T3E; - T9d = T3D * T3H; - T3G = W[65]; - T3I = FMA(T3G, T3H, T3F); - T9e = FNMS(T3G, T3E, T9d); - } - { - E T3L, T3O, T3M, Ta5, T3K, T3N; - T3L = ri[WS(rs, 17)]; - T3O = ii[WS(rs, 17)]; - T3K = W[32]; - T3M = T3K * T3L; - Ta5 = T3K * T3O; - T3N = W[33]; - T3P = FMA(T3N, T3O, T3M); - Ta6 = FNMS(T3N, T3L, Ta5); - } - { - E T3J, T3W, Tfr, Tfs; - T3J = T3C + T3I; - T3W = T3P + T3V; - T3X = T3J + T3W; - TfI = T3J - T3W; - Tfr = T9c + T9e; - Tfs = Ta6 + Ta8; - Tft = Tfr - Tfs; - ThH = Tfr + Tfs; - } - { - E T9f, T9g, Ta4, Ta9; - T9f = T9c - T9e; - T9g = T3P - T3V; - T9h = T9f + T9g; - Td3 = T9f - T9g; - Ta4 = T3C - T3I; - Ta9 = Ta6 - Ta8; - Taa = Ta4 - Ta9; - Tde = Ta4 + Ta9; - } - } - { - E T2f, T8a, T2y, T8j, T2l, T8c, T2s, T8h; - { - E T2b, T2e, T2c, T89, T2a, T2d; - T2b = ri[WS(rs, 10)]; - T2e = ii[WS(rs, 10)]; - T2a = W[18]; - T2c = T2a * T2b; - T89 = T2a * T2e; - T2d = W[19]; - T2f = FMA(T2d, T2e, T2c); - T8a = FNMS(T2d, T2b, T89); - } - { - E T2u, T2x, T2v, T8i, T2t, T2w; - T2u = ri[WS(rs, 26)]; - T2x = ii[WS(rs, 26)]; - T2t = W[50]; - T2v = T2t * T2u; - T8i = T2t * T2x; - T2w = W[51]; - T2y = FMA(T2w, T2x, T2v); - T8j = FNMS(T2w, T2u, T8i); - } - { - E T2h, T2k, T2i, T8b, T2g, T2j; - T2h = ri[WS(rs, 42)]; - T2k = ii[WS(rs, 42)]; - T2g = W[82]; - T2i = T2g * T2h; - T8b = T2g * T2k; - T2j = W[83]; - T2l = FMA(T2j, T2k, T2i); - T8c = FNMS(T2j, T2h, T8b); - } - { - E T2o, T2r, T2p, T8g, T2n, T2q; - T2o = ri[WS(rs, 58)]; - T2r = ii[WS(rs, 58)]; - T2n = W[114]; - T2p = T2n * T2o; - T8g = T2n * T2r; - T2q = W[115]; - T2s = FMA(T2q, T2r, T2p); - T8h = FNMS(T2q, T2o, T8g); - } - { - E T2m, T2z, Tf9, Tfa; - T2m = T2f + T2l; - T2z = T2s + T2y; - T2A = T2m + T2z; - Tf6 = T2z - T2m; - Tf9 = T8a + T8c; - Tfa = T8h + T8j; - Tfb = Tf9 - Tfa; - Thw = Tf9 + Tfa; - { - E T8e, T8w, T8l, T8v; - { - E T88, T8d, T8f, T8k; - T88 = T2f - T2l; - T8d = T8a - T8c; - T8e = T88 + T8d; - T8w = T8d - T88; - T8f = T2s - T2y; - T8k = T8h - T8j; - T8l = T8f - T8k; - T8v = T8f + T8k; - } - T8m = T8e - T8l; - TcR = T8e + T8l; - T8x = T8v - T8w; - TcO = T8w + T8v; - } - } - } - { - E T38, T8J, T3r, T8S, T3e, T8L, T3l, T8Q; - { - E T34, T37, T35, T8I, T33, T36; - T34 = ri[WS(rs, 6)]; - T37 = ii[WS(rs, 6)]; - T33 = W[10]; - T35 = T33 * T34; - T8I = T33 * T37; - T36 = W[11]; - T38 = FMA(T36, T37, T35); - T8J = FNMS(T36, T34, T8I); - } - { - E T3n, T3q, T3o, T8R, T3m, T3p; - T3n = ri[WS(rs, 22)]; - T3q = ii[WS(rs, 22)]; - T3m = W[42]; - T3o = T3m * T3n; - T8R = T3m * T3q; - T3p = W[43]; - T3r = FMA(T3p, T3q, T3o); - T8S = FNMS(T3p, T3n, T8R); - } - { - E T3a, T3d, T3b, T8K, T39, T3c; - T3a = ri[WS(rs, 38)]; - T3d = ii[WS(rs, 38)]; - T39 = W[74]; - T3b = T39 * T3a; - T8K = T39 * T3d; - T3c = W[75]; - T3e = FMA(T3c, T3d, T3b); - T8L = FNMS(T3c, T3a, T8K); - } - { - E T3h, T3k, T3i, T8P, T3g, T3j; - T3h = ri[WS(rs, 54)]; - T3k = ii[WS(rs, 54)]; - T3g = W[106]; - T3i = T3g * T3h; - T8P = T3g * T3k; - T3j = W[107]; - T3l = FMA(T3j, T3k, T3i); - T8Q = FNMS(T3j, T3h, T8P); - } - { - E T3f, T3s, Tfk, Tfl; - T3f = T38 + T3e; - T3s = T3l + T3r; - T3t = T3f + T3s; - Tfh = T3s - T3f; - Tfk = T8J + T8L; - Tfl = T8Q + T8S; - Tfm = Tfk - Tfl; - ThC = Tfk + Tfl; - { - E T8N, T95, T8U, T94; - { - E T8H, T8M, T8O, T8T; - T8H = T38 - T3e; - T8M = T8J - T8L; - T8N = T8H + T8M; - T95 = T8M - T8H; - T8O = T3l - T3r; - T8T = T8Q - T8S; - T8U = T8O - T8T; - T94 = T8O + T8T; - } - T8V = T8N - T8U; - TcY = T8N + T8U; - T96 = T94 - T95; - TcV = T95 + T94; - } - } - } - { - E T43, T9k, T4m, T9t, T49, T9m, T4g, T9r; - { - E T3Z, T42, T40, T9j, T3Y, T41; - T3Z = ri[WS(rs, 9)]; - T42 = ii[WS(rs, 9)]; - T3Y = W[16]; - T40 = T3Y * T3Z; - T9j = T3Y * T42; - T41 = W[17]; - T43 = FMA(T41, T42, T40); - T9k = FNMS(T41, T3Z, T9j); - } - { - E T4i, T4l, T4j, T9s, T4h, T4k; - T4i = ri[WS(rs, 25)]; - T4l = ii[WS(rs, 25)]; - T4h = W[48]; - T4j = T4h * T4i; - T9s = T4h * T4l; - T4k = W[49]; - T4m = FMA(T4k, T4l, T4j); - T9t = FNMS(T4k, T4i, T9s); - } - { - E T45, T48, T46, T9l, T44, T47; - T45 = ri[WS(rs, 41)]; - T48 = ii[WS(rs, 41)]; - T44 = W[80]; - T46 = T44 * T45; - T9l = T44 * T48; - T47 = W[81]; - T49 = FMA(T47, T48, T46); - T9m = FNMS(T47, T45, T9l); - } - { - E T4c, T4f, T4d, T9q, T4b, T4e; - T4c = ri[WS(rs, 57)]; - T4f = ii[WS(rs, 57)]; - T4b = W[112]; - T4d = T4b * T4c; - T9q = T4b * T4f; - T4e = W[113]; - T4g = FMA(T4e, T4f, T4d); - T9r = FNMS(T4e, T4c, T9q); - } - { - E T4a, T4n, TfJ, TfK; - T4a = T43 + T49; - T4n = T4g + T4m; - T4o = T4a + T4n; - Tfu = T4n - T4a; - TfJ = T9k + T9m; - TfK = T9r + T9t; - TfL = TfJ - TfK; - ThI = TfJ + TfK; - { - E T9o, Tac, T9v, Tab; - { - E T9i, T9n, T9p, T9u; - T9i = T43 - T49; - T9n = T9k - T9m; - T9o = T9i + T9n; - Tac = T9n - T9i; - T9p = T4g - T4m; - T9u = T9r - T9t; - T9v = T9p - T9u; - Tab = T9p + T9u; - } - T9w = T9o - T9v; - Tdf = T9o + T9v; - Tad = Tab - Tac; - Td4 = Tac + Tab; - } - } - } - { - E T5Q, Tat, T69, TaC, T5W, Tav, T63, TaA; - { - E T5M, T5P, T5N, Tas, T5L, T5O; - T5M = ri[WS(rs, 7)]; - T5P = ii[WS(rs, 7)]; - T5L = W[12]; - T5N = T5L * T5M; - Tas = T5L * T5P; - T5O = W[13]; - T5Q = FMA(T5O, T5P, T5N); - Tat = FNMS(T5O, T5M, Tas); - } - { - E T65, T68, T66, TaB, T64, T67; - T65 = ri[WS(rs, 23)]; - T68 = ii[WS(rs, 23)]; - T64 = W[44]; - T66 = T64 * T65; - TaB = T64 * T68; - T67 = W[45]; - T69 = FMA(T67, T68, T66); - TaC = FNMS(T67, T65, TaB); - } - { - E T5S, T5V, T5T, Tau, T5R, T5U; - T5S = ri[WS(rs, 39)]; - T5V = ii[WS(rs, 39)]; - T5R = W[76]; - T5T = T5R * T5S; - Tau = T5R * T5V; - T5U = W[77]; - T5W = FMA(T5U, T5V, T5T); - Tav = FNMS(T5U, T5S, Tau); - } - { - E T5Z, T62, T60, Taz, T5Y, T61; - T5Z = ri[WS(rs, 55)]; - T62 = ii[WS(rs, 55)]; - T5Y = W[108]; - T60 = T5Y * T5Z; - Taz = T5Y * T62; - T61 = W[109]; - T63 = FMA(T61, T62, T60); - TaA = FNMS(T61, T5Z, Taz); - } - { - E T5X, T6a, Tga, Tgb; - T5X = T5Q + T5W; - T6a = T63 + T69; - T6b = T5X + T6a; - TfV = T6a - T5X; - Tga = Tat + Tav; - Tgb = TaA + TaC; - Tgc = Tga - Tgb; - ThT = Tga + Tgb; - { - E Tax, Tbl, TaE, Tbk; - { - E Tar, Taw, Tay, TaD; - Tar = T5Q - T5W; - Taw = Tat - Tav; - Tax = Tar + Taw; - Tbl = Taw - Tar; - Tay = T63 - T69; - TaD = TaA - TaC; - TaE = Tay - TaD; - Tbk = Tay + TaD; - } - TaF = Tax - TaE; - Tdy = Tax + TaE; - Tbm = Tbk - Tbl; - Tdn = Tbl + Tbk; - } - } - } - { - E T4v, T9V, T4O, T9R, T4B, T9X, T4I, T9P; - { - E T4r, T4u, T4s, T9U, T4q, T4t; - T4r = ri[WS(rs, 5)]; - T4u = ii[WS(rs, 5)]; - T4q = W[8]; - T4s = T4q * T4r; - T9U = T4q * T4u; - T4t = W[9]; - T4v = FMA(T4t, T4u, T4s); - T9V = FNMS(T4t, T4r, T9U); - } - { - E T4K, T4N, T4L, T9Q, T4J, T4M; - T4K = ri[WS(rs, 53)]; - T4N = ii[WS(rs, 53)]; - T4J = W[104]; - T4L = T4J * T4K; - T9Q = T4J * T4N; - T4M = W[105]; - T4O = FMA(T4M, T4N, T4L); - T9R = FNMS(T4M, T4K, T9Q); - } - { - E T4x, T4A, T4y, T9W, T4w, T4z; - T4x = ri[WS(rs, 37)]; - T4A = ii[WS(rs, 37)]; - T4w = W[72]; - T4y = T4w * T4x; - T9W = T4w * T4A; - T4z = W[73]; - T4B = FMA(T4z, T4A, T4y); - T9X = FNMS(T4z, T4x, T9W); - } - { - E T4E, T4H, T4F, T9O, T4D, T4G; - T4E = ri[WS(rs, 21)]; - T4H = ii[WS(rs, 21)]; - T4D = W[40]; - T4F = T4D * T4E; - T9O = T4D * T4H; - T4G = W[41]; - T4I = FMA(T4G, T4H, T4F); - T9P = FNMS(T4G, T4E, T9O); - } - { - E T4C, T4P, Tfz, Tfw, Tfx, Tfy; - T4C = T4v + T4B; - T4P = T4I + T4O; - Tfz = T4C - T4P; - Tfw = T9V + T9X; - Tfx = T9P + T9R; - Tfy = Tfw - Tfx; - T4Q = T4C + T4P; - ThN = Tfw + Tfx; - TfA = Tfy - Tfz; - TfN = Tfz + Tfy; - } - { - E T9T, Td7, Ta0, Td6; - { - E T9N, T9S, T9Y, T9Z; - T9N = T4v - T4B; - T9S = T9P - T9R; - T9T = T9N - T9S; - Td7 = T9N + T9S; - T9Y = T9V - T9X; - T9Z = T4I - T4O; - Ta0 = T9Y + T9Z; - Td6 = T9Y - T9Z; - } - Ta1 = FNMS(KP414213562, Ta0, T9T); - Tdh = FMA(KP414213562, Td6, Td7); - Taf = FMA(KP414213562, T9T, Ta0); - Td8 = FNMS(KP414213562, Td7, Td6); - } - } - { - E T4W, T9G, T5f, T9C, T52, T9I, T59, T9A; - { - E T4S, T4V, T4T, T9F, T4R, T4U; - T4S = ri[WS(rs, 61)]; - T4V = ii[WS(rs, 61)]; - T4R = W[120]; - T4T = T4R * T4S; - T9F = T4R * T4V; - T4U = W[121]; - T4W = FMA(T4U, T4V, T4T); - T9G = FNMS(T4U, T4S, T9F); - } - { - E T5b, T5e, T5c, T9B, T5a, T5d; - T5b = ri[WS(rs, 45)]; - T5e = ii[WS(rs, 45)]; - T5a = W[88]; - T5c = T5a * T5b; - T9B = T5a * T5e; - T5d = W[89]; - T5f = FMA(T5d, T5e, T5c); - T9C = FNMS(T5d, T5b, T9B); - } - { - E T4Y, T51, T4Z, T9H, T4X, T50; - T4Y = ri[WS(rs, 29)]; - T51 = ii[WS(rs, 29)]; - T4X = W[56]; - T4Z = T4X * T4Y; - T9H = T4X * T51; - T50 = W[57]; - T52 = FMA(T50, T51, T4Z); - T9I = FNMS(T50, T4Y, T9H); - } - { - E T55, T58, T56, T9z, T54, T57; - T55 = ri[WS(rs, 13)]; - T58 = ii[WS(rs, 13)]; - T54 = W[24]; - T56 = T54 * T55; - T9z = T54 * T58; - T57 = W[25]; - T59 = FMA(T57, T58, T56); - T9A = FNMS(T57, T55, T9z); - } - { - E T53, T5g, TfB, TfC, TfD, TfE; - T53 = T4W + T52; - T5g = T59 + T5f; - TfB = T53 - T5g; - TfC = T9G + T9I; - TfD = T9A + T9C; - TfE = TfC - TfD; - T5h = T53 + T5g; - ThO = TfC + TfD; - TfF = TfB + TfE; - TfO = TfB - TfE; - } - { - E T9E, Tda, T9L, Td9; - { - E T9y, T9D, T9J, T9K; - T9y = T4W - T52; - T9D = T9A - T9C; - T9E = T9y - T9D; - Tda = T9y + T9D; - T9J = T9G - T9I; - T9K = T59 - T5f; - T9L = T9J + T9K; - Td9 = T9J - T9K; - } - T9M = FMA(KP414213562, T9L, T9E); - Tdi = FNMS(KP414213562, Td9, Tda); - Tag = FNMS(KP414213562, T9E, T9L); - Tdb = FMA(KP414213562, Tda, Td9); - } - } - { - E T6i, Tb4, T6B, Tb0, T6o, Tb6, T6v, TaY; - { - E T6e, T6h, T6f, Tb3, T6d, T6g; - T6e = ri[WS(rs, 3)]; - T6h = ii[WS(rs, 3)]; - T6d = W[4]; - T6f = T6d * T6e; - Tb3 = T6d * T6h; - T6g = W[5]; - T6i = FMA(T6g, T6h, T6f); - Tb4 = FNMS(T6g, T6e, Tb3); - } - { - E T6x, T6A, T6y, TaZ, T6w, T6z; - T6x = ri[WS(rs, 51)]; - T6A = ii[WS(rs, 51)]; - T6w = W[100]; - T6y = T6w * T6x; - TaZ = T6w * T6A; - T6z = W[101]; - T6B = FMA(T6z, T6A, T6y); - Tb0 = FNMS(T6z, T6x, TaZ); - } - { - E T6k, T6n, T6l, Tb5, T6j, T6m; - T6k = ri[WS(rs, 35)]; - T6n = ii[WS(rs, 35)]; - T6j = W[68]; - T6l = T6j * T6k; - Tb5 = T6j * T6n; - T6m = W[69]; - T6o = FMA(T6m, T6n, T6l); - Tb6 = FNMS(T6m, T6k, Tb5); - } - { - E T6r, T6u, T6s, TaX, T6q, T6t; - T6r = ri[WS(rs, 19)]; - T6u = ii[WS(rs, 19)]; - T6q = W[36]; - T6s = T6q * T6r; - TaX = T6q * T6u; - T6t = W[37]; - T6v = FMA(T6t, T6u, T6s); - TaY = FNMS(T6t, T6r, TaX); - } - { - E T6p, T6C, Tg0, TfX, TfY, TfZ; - T6p = T6i + T6o; - T6C = T6v + T6B; - Tg0 = T6p - T6C; - TfX = Tb4 + Tb6; - TfY = TaY + Tb0; - TfZ = TfX - TfY; - T6D = T6p + T6C; - ThY = TfX + TfY; - Tg1 = TfZ - Tg0; - Tge = Tg0 + TfZ; - } - { - E Tb2, Tdq, Tb9, Tdp; - { - E TaW, Tb1, Tb7, Tb8; - TaW = T6i - T6o; - Tb1 = TaY - Tb0; - Tb2 = TaW - Tb1; - Tdq = TaW + Tb1; - Tb7 = Tb4 - Tb6; - Tb8 = T6v - T6B; - Tb9 = Tb7 + Tb8; - Tdp = Tb7 - Tb8; - } - Tba = FNMS(KP414213562, Tb9, Tb2); - TdA = FMA(KP414213562, Tdp, Tdq); - Tbo = FMA(KP414213562, Tb2, Tb9); - Tdr = FNMS(KP414213562, Tdq, Tdp); - } - } - { - E T6J, TaP, T72, TaL, T6P, TaR, T6W, TaJ; - { - E T6F, T6I, T6G, TaO, T6E, T6H; - T6F = ri[WS(rs, 59)]; - T6I = ii[WS(rs, 59)]; - T6E = W[116]; - T6G = T6E * T6F; - TaO = T6E * T6I; - T6H = W[117]; - T6J = FMA(T6H, T6I, T6G); - TaP = FNMS(T6H, T6F, TaO); - } - { - E T6Y, T71, T6Z, TaK, T6X, T70; - T6Y = ri[WS(rs, 43)]; - T71 = ii[WS(rs, 43)]; - T6X = W[84]; - T6Z = T6X * T6Y; - TaK = T6X * T71; - T70 = W[85]; - T72 = FMA(T70, T71, T6Z); - TaL = FNMS(T70, T6Y, TaK); - } - { - E T6L, T6O, T6M, TaQ, T6K, T6N; - T6L = ri[WS(rs, 27)]; - T6O = ii[WS(rs, 27)]; - T6K = W[52]; - T6M = T6K * T6L; - TaQ = T6K * T6O; - T6N = W[53]; - T6P = FMA(T6N, T6O, T6M); - TaR = FNMS(T6N, T6L, TaQ); - } - { - E T6S, T6V, T6T, TaI, T6R, T6U; - T6S = ri[WS(rs, 11)]; - T6V = ii[WS(rs, 11)]; - T6R = W[20]; - T6T = T6R * T6S; - TaI = T6R * T6V; - T6U = W[21]; - T6W = FMA(T6U, T6V, T6T); - TaJ = FNMS(T6U, T6S, TaI); - } - { - E T6Q, T73, Tg2, Tg3, Tg4, Tg5; - T6Q = T6J + T6P; - T73 = T6W + T72; - Tg2 = T6Q - T73; - Tg3 = TaP + TaR; - Tg4 = TaJ + TaL; - Tg5 = Tg3 - Tg4; - T74 = T6Q + T73; - ThZ = Tg3 + Tg4; - Tg6 = Tg2 + Tg5; - Tgf = Tg2 - Tg5; - } - { - E TaN, Tdt, TaU, Tds; - { - E TaH, TaM, TaS, TaT; - TaH = T6J - T6P; - TaM = TaJ - TaL; - TaN = TaH - TaM; - Tdt = TaH + TaM; - TaS = TaP - TaR; - TaT = T6W - T72; - TaU = TaS + TaT; - Tds = TaS - TaT; - } - TaV = FMA(KP414213562, TaU, TaN); - TdB = FNMS(KP414213562, Tds, Tdt); - Tbp = FNMS(KP414213562, TaN, TaU); - Tdu = FMA(KP414213562, Tdt, Tds); - } - } - { - E T1I, Tio, T3v, Tj1, TiX, Tj2, Tir, TiN, T76, TiK, TiC, TiG, T5j, TiJ, Tix; - E TiF; - { - E TO, T1H, Tip, Tiq; - TO = Tm + TN; - T1H = T1f + T1G; - T1I = TO + T1H; - Tio = TO - T1H; - { - E T2B, T3u, TiO, TiW; - T2B = T29 + T2A; - T3u = T32 + T3t; - T3v = T2B + T3u; - Tj1 = T3u - T2B; - TiO = Thr + Ths; - TiW = TiP + TiV; - TiX = TiO + TiW; - Tj2 = TiW - TiO; - } - Tip = Thv + Thw; - Tiq = ThB + ThC; - Tir = Tip - Tiq; - TiN = Tip + Tiq; - { - E T6c, T75, Tiy, Tiz, TiA, TiB; - T6c = T5K + T6b; - T75 = T6D + T74; - Tiy = T6c - T75; - Tiz = ThS + ThT; - TiA = ThY + ThZ; - TiB = Tiz - TiA; - T76 = T6c + T75; - TiK = Tiz + TiA; - TiC = Tiy - TiB; - TiG = Tiy + TiB; - } - { - E T4p, T5i, Tit, Tiu, Tiv, Tiw; - T4p = T3X + T4o; - T5i = T4Q + T5h; - Tit = T4p - T5i; - Tiu = ThH + ThI; - Tiv = ThN + ThO; - Tiw = Tiu - Tiv; - T5j = T4p + T5i; - TiJ = Tiu + Tiv; - Tix = Tit + Tiw; - TiF = Tiw - Tit; - } - } - { - E T3w, T77, TiM, TiY; - T3w = T1I + T3v; - T77 = T5j + T76; - ri[WS(rs, 32)] = T3w - T77; - ri[0] = T3w + T77; - TiM = TiJ + TiK; - TiY = TiN + TiX; - ii[0] = TiM + TiY; - ii[WS(rs, 32)] = TiY - TiM; - } - { - E Tis, TiD, Tj3, Tj4; - Tis = Tio + Tir; - TiD = Tix + TiC; - ri[WS(rs, 40)] = FNMS(KP707106781, TiD, Tis); - ri[WS(rs, 8)] = FMA(KP707106781, TiD, Tis); - Tj3 = Tj1 + Tj2; - Tj4 = TiF + TiG; - ii[WS(rs, 8)] = FMA(KP707106781, Tj4, Tj3); - ii[WS(rs, 40)] = FNMS(KP707106781, Tj4, Tj3); - } - { - E TiE, TiH, Tj5, Tj6; - TiE = Tio - Tir; - TiH = TiF - TiG; - ri[WS(rs, 56)] = FNMS(KP707106781, TiH, TiE); - ri[WS(rs, 24)] = FMA(KP707106781, TiH, TiE); - Tj5 = Tj2 - Tj1; - Tj6 = TiC - Tix; - ii[WS(rs, 24)] = FMA(KP707106781, Tj6, Tj5); - ii[WS(rs, 56)] = FNMS(KP707106781, Tj6, Tj5); - } - { - E TiI, TiL, TiZ, Tj0; - TiI = T1I - T3v; - TiL = TiJ - TiK; - ri[WS(rs, 48)] = TiI - TiL; - ri[WS(rs, 16)] = TiI + TiL; - TiZ = T76 - T5j; - Tj0 = TiX - TiN; - ii[WS(rs, 16)] = TiZ + Tj0; - ii[WS(rs, 48)] = Tj0 - TiZ; - } - } - { - E Thu, Ti8, Tj9, Tjf, ThF, Tjg, Tib, Tja, ThR, Til, Ti5, Tif, Ti2, Tim, Ti6; - E Tii; - { - E Thq, Tht, Tj7, Tj8; - Thq = Tm - TN; - Tht = Thr - Ths; - Thu = Thq - Tht; - Ti8 = Thq + Tht; - Tj7 = T1G - T1f; - Tj8 = TiV - TiP; - Tj9 = Tj7 + Tj8; - Tjf = Tj8 - Tj7; - } - { - E Thz, Ti9, ThE, Tia; - { - E Thx, Thy, ThA, ThD; - Thx = Thv - Thw; - Thy = T29 - T2A; - Thz = Thx - Thy; - Ti9 = Thy + Thx; - ThA = T32 - T3t; - ThD = ThB - ThC; - ThE = ThA + ThD; - Tia = ThA - ThD; - } - ThF = Thz - ThE; - Tjg = Tia - Ti9; - Tib = Ti9 + Tia; - Tja = Thz + ThE; - } - { - E ThL, Tie, ThQ, Tid; - { - E ThJ, ThK, ThM, ThP; - ThJ = ThH - ThI; - ThK = T5h - T4Q; - ThL = ThJ - ThK; - Tie = ThJ + ThK; - ThM = T3X - T4o; - ThP = ThN - ThO; - ThQ = ThM - ThP; - Tid = ThM + ThP; - } - ThR = FMA(KP414213562, ThQ, ThL); - Til = FNMS(KP414213562, Tid, Tie); - Ti5 = FNMS(KP414213562, ThL, ThQ); - Tif = FMA(KP414213562, Tie, Tid); - } - { - E ThW, Tih, Ti1, Tig; - { - E ThU, ThV, ThX, Ti0; - ThU = ThS - ThT; - ThV = T74 - T6D; - ThW = ThU - ThV; - Tih = ThU + ThV; - ThX = T5K - T6b; - Ti0 = ThY - ThZ; - Ti1 = ThX - Ti0; - Tig = ThX + Ti0; - } - Ti2 = FNMS(KP414213562, Ti1, ThW); - Tim = FMA(KP414213562, Tig, Tih); - Ti6 = FMA(KP414213562, ThW, Ti1); - Tii = FNMS(KP414213562, Tih, Tig); - } - { - E ThG, Ti3, Tjh, Tji; - ThG = FMA(KP707106781, ThF, Thu); - Ti3 = ThR - Ti2; - ri[WS(rs, 44)] = FNMS(KP923879532, Ti3, ThG); - ri[WS(rs, 12)] = FMA(KP923879532, Ti3, ThG); - Tjh = FMA(KP707106781, Tjg, Tjf); - Tji = Ti6 - Ti5; - ii[WS(rs, 12)] = FMA(KP923879532, Tji, Tjh); - ii[WS(rs, 44)] = FNMS(KP923879532, Tji, Tjh); - } - { - E Ti4, Ti7, Tjj, Tjk; - Ti4 = FNMS(KP707106781, ThF, Thu); - Ti7 = Ti5 + Ti6; - ri[WS(rs, 28)] = FNMS(KP923879532, Ti7, Ti4); - ri[WS(rs, 60)] = FMA(KP923879532, Ti7, Ti4); - Tjj = FNMS(KP707106781, Tjg, Tjf); - Tjk = ThR + Ti2; - ii[WS(rs, 28)] = FNMS(KP923879532, Tjk, Tjj); - ii[WS(rs, 60)] = FMA(KP923879532, Tjk, Tjj); - } - { - E Tic, Tij, Tjb, Tjc; - Tic = FMA(KP707106781, Tib, Ti8); - Tij = Tif + Tii; - ri[WS(rs, 36)] = FNMS(KP923879532, Tij, Tic); - ri[WS(rs, 4)] = FMA(KP923879532, Tij, Tic); - Tjb = FMA(KP707106781, Tja, Tj9); - Tjc = Til + Tim; - ii[WS(rs, 4)] = FMA(KP923879532, Tjc, Tjb); - ii[WS(rs, 36)] = FNMS(KP923879532, Tjc, Tjb); - } - { - E Tik, Tin, Tjd, Tje; - Tik = FNMS(KP707106781, Tib, Ti8); - Tin = Til - Tim; - ri[WS(rs, 52)] = FNMS(KP923879532, Tin, Tik); - ri[WS(rs, 20)] = FMA(KP923879532, Tin, Tik); - Tjd = FNMS(KP707106781, Tja, Tj9); - Tje = Tii - Tif; - ii[WS(rs, 20)] = FMA(KP923879532, Tje, Tjd); - ii[WS(rs, 52)] = FNMS(KP923879532, Tje, Tjd); - } - } - { - E Tf2, TjJ, Tgo, TjD, TgI, Tjv, Tha, Tjp, Tfp, Tjw, Tgr, Tjq, Th4, Tho, Th8; - E Thk, TfR, TgB, Tgl, Tgv, TgP, TjK, Thd, TjE, TgX, Thn, Th7, Thh, Tgi, TgC; - E Tgm, Tgy; - { - E TeQ, TjB, Tf1, TjC, TeV, Tf0; - TeQ = TeM + TeP; - TjB = Tjm - Tjl; - TeV = TeR + TeU; - Tf0 = TeW - TeZ; - Tf1 = TeV + Tf0; - TjC = Tf0 - TeV; - Tf2 = FNMS(KP707106781, Tf1, TeQ); - TjJ = FNMS(KP707106781, TjC, TjB); - Tgo = FMA(KP707106781, Tf1, TeQ); - TjD = FMA(KP707106781, TjC, TjB); - } - { - E TgE, Tjn, TgH, Tjo, TgF, TgG; - TgE = TeM - TeP; - Tjn = Tjl + Tjm; - TgF = TeU - TeR; - TgG = TeW + TeZ; - TgH = TgF - TgG; - Tjo = TgF + TgG; - TgI = FMA(KP707106781, TgH, TgE); - Tjv = FNMS(KP707106781, Tjo, Tjn); - Tha = FNMS(KP707106781, TgH, TgE); - Tjp = FMA(KP707106781, Tjo, Tjn); - } - { - E Tfd, Tgp, Tfo, Tgq; - { - E Tf7, Tfc, Tfi, Tfn; - Tf7 = Tf5 + Tf6; - Tfc = Tf8 + Tfb; - Tfd = FNMS(KP414213562, Tfc, Tf7); - Tgp = FMA(KP414213562, Tf7, Tfc); - Tfi = Tfg + Tfh; - Tfn = Tfj + Tfm; - Tfo = FMA(KP414213562, Tfn, Tfi); - Tgq = FNMS(KP414213562, Tfi, Tfn); - } - Tfp = Tfd - Tfo; - Tjw = Tgq - Tgp; - Tgr = Tgp + Tgq; - Tjq = Tfd + Tfo; - } - { - E Th0, Thj, Th3, Thi; - { - E TgY, TgZ, Th1, Th2; - TgY = Tg9 - Tgc; - TgZ = Tg6 - Tg1; - Th0 = FNMS(KP707106781, TgZ, TgY); - Thj = FMA(KP707106781, TgZ, TgY); - Th1 = TfU - TfV; - Th2 = Tge - Tgf; - Th3 = FNMS(KP707106781, Th2, Th1); - Thi = FMA(KP707106781, Th2, Th1); - } - Th4 = FNMS(KP668178637, Th3, Th0); - Tho = FMA(KP198912367, Thi, Thj); - Th8 = FMA(KP668178637, Th0, Th3); - Thk = FNMS(KP198912367, Thj, Thi); - } - { - E TfH, Tgu, TfQ, Tgt; - { - E Tfv, TfG, TfM, TfP; - Tfv = Tft + Tfu; - TfG = TfA + TfF; - TfH = FNMS(KP707106781, TfG, Tfv); - Tgu = FMA(KP707106781, TfG, Tfv); - TfM = TfI + TfL; - TfP = TfN + TfO; - TfQ = FNMS(KP707106781, TfP, TfM); - Tgt = FMA(KP707106781, TfP, TfM); - } - TfR = FMA(KP668178637, TfQ, TfH); - TgB = FNMS(KP198912367, Tgt, Tgu); - Tgl = FNMS(KP668178637, TfH, TfQ); - Tgv = FMA(KP198912367, Tgu, Tgt); - } - { - E TgL, Thb, TgO, Thc; - { - E TgJ, TgK, TgM, TgN; - TgJ = Tf5 - Tf6; - TgK = Tf8 - Tfb; - TgL = FMA(KP414213562, TgK, TgJ); - Thb = FNMS(KP414213562, TgJ, TgK); - TgM = Tfg - Tfh; - TgN = Tfj - Tfm; - TgO = FNMS(KP414213562, TgN, TgM); - Thc = FMA(KP414213562, TgM, TgN); - } - TgP = TgL - TgO; - TjK = TgL + TgO; - Thd = Thb + Thc; - TjE = Thc - Thb; - } - { - E TgT, Thg, TgW, Thf; - { - E TgR, TgS, TgU, TgV; - TgR = TfI - TfL; - TgS = TfF - TfA; - TgT = FNMS(KP707106781, TgS, TgR); - Thg = FMA(KP707106781, TgS, TgR); - TgU = Tft - Tfu; - TgV = TfN - TfO; - TgW = FNMS(KP707106781, TgV, TgU); - Thf = FMA(KP707106781, TgV, TgU); - } - TgX = FMA(KP668178637, TgW, TgT); - Thn = FNMS(KP198912367, Thf, Thg); - Th7 = FNMS(KP668178637, TgT, TgW); - Thh = FMA(KP198912367, Thg, Thf); - } - { - E Tg8, Tgx, Tgh, Tgw; - { - E TfW, Tg7, Tgd, Tgg; - TfW = TfU + TfV; - Tg7 = Tg1 + Tg6; - Tg8 = FNMS(KP707106781, Tg7, TfW); - Tgx = FMA(KP707106781, Tg7, TfW); - Tgd = Tg9 + Tgc; - Tgg = Tge + Tgf; - Tgh = FNMS(KP707106781, Tgg, Tgd); - Tgw = FMA(KP707106781, Tgg, Tgd); - } - Tgi = FNMS(KP668178637, Tgh, Tg8); - TgC = FMA(KP198912367, Tgw, Tgx); - Tgm = FMA(KP668178637, Tg8, Tgh); - Tgy = FNMS(KP198912367, Tgx, Tgw); - } - { - E Tfq, Tgj, Tjx, Tjy; - Tfq = FMA(KP923879532, Tfp, Tf2); - Tgj = TfR - Tgi; - ri[WS(rs, 42)] = FNMS(KP831469612, Tgj, Tfq); - ri[WS(rs, 10)] = FMA(KP831469612, Tgj, Tfq); - Tjx = FMA(KP923879532, Tjw, Tjv); - Tjy = Tgm - Tgl; - ii[WS(rs, 10)] = FMA(KP831469612, Tjy, Tjx); - ii[WS(rs, 42)] = FNMS(KP831469612, Tjy, Tjx); - } - { - E Tgk, Tgn, Tjz, TjA; - Tgk = FNMS(KP923879532, Tfp, Tf2); - Tgn = Tgl + Tgm; - ri[WS(rs, 26)] = FNMS(KP831469612, Tgn, Tgk); - ri[WS(rs, 58)] = FMA(KP831469612, Tgn, Tgk); - Tjz = FNMS(KP923879532, Tjw, Tjv); - TjA = TfR + Tgi; - ii[WS(rs, 26)] = FNMS(KP831469612, TjA, Tjz); - ii[WS(rs, 58)] = FMA(KP831469612, TjA, Tjz); - } - { - E Tgs, Tgz, Tjr, Tjs; - Tgs = FMA(KP923879532, Tgr, Tgo); - Tgz = Tgv + Tgy; - ri[WS(rs, 34)] = FNMS(KP980785280, Tgz, Tgs); - ri[WS(rs, 2)] = FMA(KP980785280, Tgz, Tgs); - Tjr = FMA(KP923879532, Tjq, Tjp); - Tjs = TgB + TgC; - ii[WS(rs, 2)] = FMA(KP980785280, Tjs, Tjr); - ii[WS(rs, 34)] = FNMS(KP980785280, Tjs, Tjr); - } - { - E TgA, TgD, Tjt, Tju; - TgA = FNMS(KP923879532, Tgr, Tgo); - TgD = TgB - TgC; - ri[WS(rs, 50)] = FNMS(KP980785280, TgD, TgA); - ri[WS(rs, 18)] = FMA(KP980785280, TgD, TgA); - Tjt = FNMS(KP923879532, Tjq, Tjp); - Tju = Tgy - Tgv; - ii[WS(rs, 18)] = FMA(KP980785280, Tju, Tjt); - ii[WS(rs, 50)] = FNMS(KP980785280, Tju, Tjt); - } - { - E TgQ, Th5, TjF, TjG; - TgQ = FMA(KP923879532, TgP, TgI); - Th5 = TgX + Th4; - ri[WS(rs, 38)] = FNMS(KP831469612, Th5, TgQ); - ri[WS(rs, 6)] = FMA(KP831469612, Th5, TgQ); - TjF = FMA(KP923879532, TjE, TjD); - TjG = Th7 + Th8; - ii[WS(rs, 6)] = FMA(KP831469612, TjG, TjF); - ii[WS(rs, 38)] = FNMS(KP831469612, TjG, TjF); - } - { - E Th6, Th9, TjH, TjI; - Th6 = FNMS(KP923879532, TgP, TgI); - Th9 = Th7 - Th8; - ri[WS(rs, 54)] = FNMS(KP831469612, Th9, Th6); - ri[WS(rs, 22)] = FMA(KP831469612, Th9, Th6); - TjH = FNMS(KP923879532, TjE, TjD); - TjI = Th4 - TgX; - ii[WS(rs, 22)] = FMA(KP831469612, TjI, TjH); - ii[WS(rs, 54)] = FNMS(KP831469612, TjI, TjH); - } - { - E The, Thl, TjL, TjM; - The = FNMS(KP923879532, Thd, Tha); - Thl = Thh - Thk; - ri[WS(rs, 46)] = FNMS(KP980785280, Thl, The); - ri[WS(rs, 14)] = FMA(KP980785280, Thl, The); - TjL = FNMS(KP923879532, TjK, TjJ); - TjM = Tho - Thn; - ii[WS(rs, 14)] = FMA(KP980785280, TjM, TjL); - ii[WS(rs, 46)] = FNMS(KP980785280, TjM, TjL); - } - { - E Thm, Thp, TjN, TjO; - Thm = FMA(KP923879532, Thd, Tha); - Thp = Thn + Tho; - ri[WS(rs, 30)] = FNMS(KP980785280, Thp, Thm); - ri[WS(rs, 62)] = FMA(KP980785280, Thp, Thm); - TjN = FMA(KP923879532, TjK, TjJ); - TjO = Thh + Thk; - ii[WS(rs, 30)] = FNMS(KP980785280, TjO, TjN); - ii[WS(rs, 62)] = FMA(KP980785280, TjO, TjN); - } - } - { - E T99, Tkw, TbB, Tkq, Taj, TbL, Tbv, TbF, Tce, Tcy, Tci, Tcu, Tc7, Tcx, Tch; - E Tcr, TbZ, TkK, Tcn, TkE, Tbs, TbM, Tbw, TbI, T80, TkD, TkJ, Tby, TbS, Tkp; - E Tkv, Tck; - { - E T8z, Tbz, T98, TbA; - { - E T8n, T8y, T8W, T97; - T8n = FNMS(KP707106781, T8m, T87); - T8y = FNMS(KP707106781, T8x, T8u); - T8z = FNMS(KP668178637, T8y, T8n); - Tbz = FMA(KP668178637, T8n, T8y); - T8W = FNMS(KP707106781, T8V, T8G); - T97 = FNMS(KP707106781, T96, T93); - T98 = FMA(KP668178637, T97, T8W); - TbA = FNMS(KP668178637, T8W, T97); - } - T99 = T8z - T98; - Tkw = TbA - Tbz; - TbB = Tbz + TbA; - Tkq = T8z + T98; - } - { - E Ta3, TbE, Tai, TbD; - { - E T9x, Ta2, Tae, Tah; - T9x = FNMS(KP707106781, T9w, T9h); - Ta2 = T9M - Ta1; - Ta3 = FNMS(KP923879532, Ta2, T9x); - TbE = FMA(KP923879532, Ta2, T9x); - Tae = FNMS(KP707106781, Tad, Taa); - Tah = Taf - Tag; - Tai = FNMS(KP923879532, Tah, Tae); - TbD = FMA(KP923879532, Tah, Tae); - } - Taj = FMA(KP534511135, Tai, Ta3); - TbL = FNMS(KP303346683, TbD, TbE); - Tbv = FNMS(KP534511135, Ta3, Tai); - TbF = FMA(KP303346683, TbE, TbD); - } - { - E Tca, Tct, Tcd, Tcs; - { - E Tc8, Tc9, Tcb, Tcc; - Tc8 = FMA(KP707106781, Tbm, Tbj); - Tc9 = Tba + TaV; - Tca = FNMS(KP923879532, Tc9, Tc8); - Tct = FMA(KP923879532, Tc9, Tc8); - Tcb = FMA(KP707106781, TaF, Taq); - Tcc = Tbo + Tbp; - Tcd = FNMS(KP923879532, Tcc, Tcb); - Tcs = FMA(KP923879532, Tcc, Tcb); - } - Tce = FNMS(KP820678790, Tcd, Tca); - Tcy = FMA(KP098491403, Tcs, Tct); - Tci = FMA(KP820678790, Tca, Tcd); - Tcu = FNMS(KP098491403, Tct, Tcs); - } - { - E Tc3, Tcq, Tc6, Tcp; - { - E Tc1, Tc2, Tc4, Tc5; - Tc1 = FMA(KP707106781, Tad, Taa); - Tc2 = Ta1 + T9M; - Tc3 = FNMS(KP923879532, Tc2, Tc1); - Tcq = FMA(KP923879532, Tc2, Tc1); - Tc4 = FMA(KP707106781, T9w, T9h); - Tc5 = Taf + Tag; - Tc6 = FNMS(KP923879532, Tc5, Tc4); - Tcp = FMA(KP923879532, Tc5, Tc4); - } - Tc7 = FMA(KP820678790, Tc6, Tc3); - Tcx = FNMS(KP098491403, Tcp, Tcq); - Tch = FNMS(KP820678790, Tc3, Tc6); - Tcr = FMA(KP098491403, Tcq, Tcp); - } - { - E TbV, Tcl, TbY, Tcm; - { - E TbT, TbU, TbW, TbX; - TbT = FMA(KP707106781, T8m, T87); - TbU = FMA(KP707106781, T8x, T8u); - TbV = FMA(KP198912367, TbU, TbT); - Tcl = FNMS(KP198912367, TbT, TbU); - TbW = FMA(KP707106781, T8V, T8G); - TbX = FMA(KP707106781, T96, T93); - TbY = FNMS(KP198912367, TbX, TbW); - Tcm = FMA(KP198912367, TbW, TbX); - } - TbZ = TbV - TbY; - TkK = TbV + TbY; - Tcn = Tcl + Tcm; - TkE = Tcm - Tcl; - } - { - E Tbc, TbH, Tbr, TbG; - { - E TaG, Tbb, Tbn, Tbq; - TaG = FNMS(KP707106781, TaF, Taq); - Tbb = TaV - Tba; - Tbc = FNMS(KP923879532, Tbb, TaG); - TbH = FMA(KP923879532, Tbb, TaG); - Tbn = FNMS(KP707106781, Tbm, Tbj); - Tbq = Tbo - Tbp; - Tbr = FNMS(KP923879532, Tbq, Tbn); - TbG = FMA(KP923879532, Tbq, Tbn); - } - Tbs = FNMS(KP534511135, Tbr, Tbc); - TbM = FMA(KP303346683, TbG, TbH); - Tbw = FMA(KP534511135, Tbc, Tbr); - TbI = FNMS(KP303346683, TbH, TbG); - } - { - E T7u, TbO, Tkn, TkB, T7Z, TkC, TbR, Tko, T7t, Tkm; - T7t = T7l - T7s; - T7u = FMA(KP707106781, T7t, T7e); - TbO = FNMS(KP707106781, T7t, T7e); - Tkm = TcC - TcB; - Tkn = FMA(KP707106781, Tkm, Tkl); - TkB = FNMS(KP707106781, Tkm, Tkl); - { - E T7J, T7Y, TbP, TbQ; - T7J = FMA(KP414213562, T7I, T7B); - T7Y = FNMS(KP414213562, T7X, T7Q); - T7Z = T7J - T7Y; - TkC = T7J + T7Y; - TbP = FNMS(KP414213562, T7B, T7I); - TbQ = FMA(KP414213562, T7Q, T7X); - TbR = TbP + TbQ; - Tko = TbQ - TbP; - } - T80 = FNMS(KP923879532, T7Z, T7u); - TkD = FNMS(KP923879532, TkC, TkB); - TkJ = FMA(KP923879532, TkC, TkB); - Tby = FMA(KP923879532, T7Z, T7u); - TbS = FNMS(KP923879532, TbR, TbO); - Tkp = FMA(KP923879532, Tko, Tkn); - Tkv = FNMS(KP923879532, Tko, Tkn); - Tck = FMA(KP923879532, TbR, TbO); - } - { - E T9a, Tbt, Tkx, Tky; - T9a = FMA(KP831469612, T99, T80); - Tbt = Taj - Tbs; - ri[WS(rs, 43)] = FNMS(KP881921264, Tbt, T9a); - ri[WS(rs, 11)] = FMA(KP881921264, Tbt, T9a); - Tkx = FMA(KP831469612, Tkw, Tkv); - Tky = Tbw - Tbv; - ii[WS(rs, 11)] = FMA(KP881921264, Tky, Tkx); - ii[WS(rs, 43)] = FNMS(KP881921264, Tky, Tkx); - } - { - E Tbu, Tbx, Tkz, TkA; - Tbu = FNMS(KP831469612, T99, T80); - Tbx = Tbv + Tbw; - ri[WS(rs, 27)] = FNMS(KP881921264, Tbx, Tbu); - ri[WS(rs, 59)] = FMA(KP881921264, Tbx, Tbu); - Tkz = FNMS(KP831469612, Tkw, Tkv); - TkA = Taj + Tbs; - ii[WS(rs, 27)] = FNMS(KP881921264, TkA, Tkz); - ii[WS(rs, 59)] = FMA(KP881921264, TkA, Tkz); - } - { - E TbC, TbJ, Tkr, Tks; - TbC = FMA(KP831469612, TbB, Tby); - TbJ = TbF + TbI; - ri[WS(rs, 35)] = FNMS(KP956940335, TbJ, TbC); - ri[WS(rs, 3)] = FMA(KP956940335, TbJ, TbC); - Tkr = FMA(KP831469612, Tkq, Tkp); - Tks = TbL + TbM; - ii[WS(rs, 3)] = FMA(KP956940335, Tks, Tkr); - ii[WS(rs, 35)] = FNMS(KP956940335, Tks, Tkr); - } - { - E TbK, TbN, Tkt, Tku; - TbK = FNMS(KP831469612, TbB, Tby); - TbN = TbL - TbM; - ri[WS(rs, 51)] = FNMS(KP956940335, TbN, TbK); - ri[WS(rs, 19)] = FMA(KP956940335, TbN, TbK); - Tkt = FNMS(KP831469612, Tkq, Tkp); - Tku = TbI - TbF; - ii[WS(rs, 19)] = FMA(KP956940335, Tku, Tkt); - ii[WS(rs, 51)] = FNMS(KP956940335, Tku, Tkt); - } - { - E Tc0, Tcf, TkF, TkG; - Tc0 = FMA(KP980785280, TbZ, TbS); - Tcf = Tc7 + Tce; - ri[WS(rs, 39)] = FNMS(KP773010453, Tcf, Tc0); - ri[WS(rs, 7)] = FMA(KP773010453, Tcf, Tc0); - TkF = FMA(KP980785280, TkE, TkD); - TkG = Tch + Tci; - ii[WS(rs, 7)] = FMA(KP773010453, TkG, TkF); - ii[WS(rs, 39)] = FNMS(KP773010453, TkG, TkF); - } - { - E Tcg, Tcj, TkH, TkI; - Tcg = FNMS(KP980785280, TbZ, TbS); - Tcj = Tch - Tci; - ri[WS(rs, 55)] = FNMS(KP773010453, Tcj, Tcg); - ri[WS(rs, 23)] = FMA(KP773010453, Tcj, Tcg); - TkH = FNMS(KP980785280, TkE, TkD); - TkI = Tce - Tc7; - ii[WS(rs, 23)] = FMA(KP773010453, TkI, TkH); - ii[WS(rs, 55)] = FNMS(KP773010453, TkI, TkH); - } - { - E Tco, Tcv, TkL, TkM; - Tco = FNMS(KP980785280, Tcn, Tck); - Tcv = Tcr - Tcu; - ri[WS(rs, 47)] = FNMS(KP995184726, Tcv, Tco); - ri[WS(rs, 15)] = FMA(KP995184726, Tcv, Tco); - TkL = FNMS(KP980785280, TkK, TkJ); - TkM = Tcy - Tcx; - ii[WS(rs, 15)] = FMA(KP995184726, TkM, TkL); - ii[WS(rs, 47)] = FNMS(KP995184726, TkM, TkL); - } - { - E Tcw, Tcz, TkN, TkO; - Tcw = FMA(KP980785280, Tcn, Tck); - Tcz = Tcx + Tcy; - ri[WS(rs, 31)] = FNMS(KP995184726, Tcz, Tcw); - ri[WS(rs, 63)] = FMA(KP995184726, Tcz, Tcw); - TkN = FMA(KP980785280, TkK, TkJ); - TkO = Tcr + Tcu; - ii[WS(rs, 31)] = FNMS(KP995184726, TkO, TkN); - ii[WS(rs, 63)] = FMA(KP995184726, TkO, TkN); - } - } - { - E Td1, Tk2, TdN, TjW, Tdl, TdX, TdH, TdR, Teq, TeK, Teu, TeG, Tej, TeJ, Tet; - E TeD, Teb, Tkg, Tez, Tka, TdE, TdY, TdI, TdU, TcM, Tk9, Tkf, TdK, Te4, TjV; - E Tk1, Tew; - { - E TcT, TdL, Td0, TdM; - { - E TcP, TcS, TcW, TcZ; - TcP = FMA(KP707106781, TcO, TcN); - TcS = FMA(KP707106781, TcR, TcQ); - TcT = FNMS(KP198912367, TcS, TcP); - TdL = FMA(KP198912367, TcP, TcS); - TcW = FMA(KP707106781, TcV, TcU); - TcZ = FMA(KP707106781, TcY, TcX); - Td0 = FMA(KP198912367, TcZ, TcW); - TdM = FNMS(KP198912367, TcW, TcZ); - } - Td1 = TcT - Td0; - Tk2 = TdM - TdL; - TdN = TdL + TdM; - TjW = TcT + Td0; - } - { - E Tdd, TdQ, Tdk, TdP; - { - E Td5, Tdc, Tdg, Tdj; - Td5 = FMA(KP707106781, Td4, Td3); - Tdc = Td8 + Tdb; - Tdd = FNMS(KP923879532, Tdc, Td5); - TdQ = FMA(KP923879532, Tdc, Td5); - Tdg = FMA(KP707106781, Tdf, Tde); - Tdj = Tdh + Tdi; - Tdk = FNMS(KP923879532, Tdj, Tdg); - TdP = FMA(KP923879532, Tdj, Tdg); - } - Tdl = FMA(KP820678790, Tdk, Tdd); - TdX = FNMS(KP098491403, TdP, TdQ); - TdH = FNMS(KP820678790, Tdd, Tdk); - TdR = FMA(KP098491403, TdQ, TdP); - } - { - E Tem, TeF, Tep, TeE; - { - E Tek, Tel, Ten, Teo; - Tek = FNMS(KP707106781, Tdy, Tdx); - Tel = Tdu - Tdr; - Tem = FNMS(KP923879532, Tel, Tek); - TeF = FMA(KP923879532, Tel, Tek); - Ten = FNMS(KP707106781, Tdn, Tdm); - Teo = TdA - TdB; - Tep = FNMS(KP923879532, Teo, Ten); - TeE = FMA(KP923879532, Teo, Ten); - } - Teq = FNMS(KP534511135, Tep, Tem); - TeK = FMA(KP303346683, TeE, TeF); - Teu = FMA(KP534511135, Tem, Tep); - TeG = FNMS(KP303346683, TeF, TeE); - } - { - E Tef, TeC, Tei, TeB; - { - E Ted, Tee, Teg, Teh; - Ted = FNMS(KP707106781, Tdf, Tde); - Tee = Tdb - Td8; - Tef = FNMS(KP923879532, Tee, Ted); - TeC = FMA(KP923879532, Tee, Ted); - Teg = FNMS(KP707106781, Td4, Td3); - Teh = Tdh - Tdi; - Tei = FNMS(KP923879532, Teh, Teg); - TeB = FMA(KP923879532, Teh, Teg); - } - Tej = FMA(KP534511135, Tei, Tef); - TeJ = FNMS(KP303346683, TeB, TeC); - Tet = FNMS(KP534511135, Tef, Tei); - TeD = FMA(KP303346683, TeC, TeB); - } - { - E Te7, Tex, Tea, Tey; - { - E Te5, Te6, Te8, Te9; - Te5 = FNMS(KP707106781, TcO, TcN); - Te6 = FNMS(KP707106781, TcR, TcQ); - Te7 = FMA(KP668178637, Te6, Te5); - Tex = FNMS(KP668178637, Te5, Te6); - Te8 = FNMS(KP707106781, TcV, TcU); - Te9 = FNMS(KP707106781, TcY, TcX); - Tea = FNMS(KP668178637, Te9, Te8); - Tey = FMA(KP668178637, Te8, Te9); - } - Teb = Te7 - Tea; - Tkg = Te7 + Tea; - Tez = Tex + Tey; - Tka = Tey - Tex; - } - { - E Tdw, TdT, TdD, TdS; - { - E Tdo, Tdv, Tdz, TdC; - Tdo = FMA(KP707106781, Tdn, Tdm); - Tdv = Tdr + Tdu; - Tdw = FNMS(KP923879532, Tdv, Tdo); - TdT = FMA(KP923879532, Tdv, Tdo); - Tdz = FMA(KP707106781, Tdy, Tdx); - TdC = TdA + TdB; - TdD = FNMS(KP923879532, TdC, Tdz); - TdS = FMA(KP923879532, TdC, Tdz); - } - TdE = FNMS(KP820678790, TdD, Tdw); - TdY = FMA(KP098491403, TdS, TdT); - TdI = FMA(KP820678790, Tdw, TdD); - TdU = FNMS(KP098491403, TdT, TdS); - } - { - E TcE, Te0, TjT, Tk7, TcL, Tk8, Te3, TjU, TcD, TjS; - TcD = TcB + TcC; - TcE = FMA(KP707106781, TcD, TcA); - Te0 = FNMS(KP707106781, TcD, TcA); - TjS = T7l + T7s; - TjT = FMA(KP707106781, TjS, TjR); - Tk7 = FNMS(KP707106781, TjS, TjR); - { - E TcH, TcK, Te1, Te2; - TcH = FMA(KP414213562, TcG, TcF); - TcK = FNMS(KP414213562, TcJ, TcI); - TcL = TcH + TcK; - Tk8 = TcK - TcH; - Te1 = FNMS(KP414213562, TcF, TcG); - Te2 = FMA(KP414213562, TcI, TcJ); - Te3 = Te1 - Te2; - TjU = Te1 + Te2; - } - TcM = FNMS(KP923879532, TcL, TcE); - Tk9 = FMA(KP923879532, Tk8, Tk7); - Tkf = FNMS(KP923879532, Tk8, Tk7); - TdK = FMA(KP923879532, TcL, TcE); - Te4 = FMA(KP923879532, Te3, Te0); - TjV = FMA(KP923879532, TjU, TjT); - Tk1 = FNMS(KP923879532, TjU, TjT); - Tew = FNMS(KP923879532, Te3, Te0); - } - { - E Td2, TdF, Tk3, Tk4; - Td2 = FMA(KP980785280, Td1, TcM); - TdF = Tdl - TdE; - ri[WS(rs, 41)] = FNMS(KP773010453, TdF, Td2); - ri[WS(rs, 9)] = FMA(KP773010453, TdF, Td2); - Tk3 = FMA(KP980785280, Tk2, Tk1); - Tk4 = TdI - TdH; - ii[WS(rs, 9)] = FMA(KP773010453, Tk4, Tk3); - ii[WS(rs, 41)] = FNMS(KP773010453, Tk4, Tk3); - } - { - E TdG, TdJ, Tk5, Tk6; - TdG = FNMS(KP980785280, Td1, TcM); - TdJ = TdH + TdI; - ri[WS(rs, 25)] = FNMS(KP773010453, TdJ, TdG); - ri[WS(rs, 57)] = FMA(KP773010453, TdJ, TdG); - Tk5 = FNMS(KP980785280, Tk2, Tk1); - Tk6 = Tdl + TdE; - ii[WS(rs, 25)] = FNMS(KP773010453, Tk6, Tk5); - ii[WS(rs, 57)] = FMA(KP773010453, Tk6, Tk5); - } - { - E TdO, TdV, TjX, TjY; - TdO = FMA(KP980785280, TdN, TdK); - TdV = TdR + TdU; - ri[WS(rs, 33)] = FNMS(KP995184726, TdV, TdO); - ri[WS(rs, 1)] = FMA(KP995184726, TdV, TdO); - TjX = FMA(KP980785280, TjW, TjV); - TjY = TdX + TdY; - ii[WS(rs, 1)] = FMA(KP995184726, TjY, TjX); - ii[WS(rs, 33)] = FNMS(KP995184726, TjY, TjX); - } - { - E TdW, TdZ, TjZ, Tk0; - TdW = FNMS(KP980785280, TdN, TdK); - TdZ = TdX - TdY; - ri[WS(rs, 49)] = FNMS(KP995184726, TdZ, TdW); - ri[WS(rs, 17)] = FMA(KP995184726, TdZ, TdW); - TjZ = FNMS(KP980785280, TjW, TjV); - Tk0 = TdU - TdR; - ii[WS(rs, 17)] = FMA(KP995184726, Tk0, TjZ); - ii[WS(rs, 49)] = FNMS(KP995184726, Tk0, TjZ); - } - { - E Tec, Ter, Tkb, Tkc; - Tec = FMA(KP831469612, Teb, Te4); - Ter = Tej + Teq; - ri[WS(rs, 37)] = FNMS(KP881921264, Ter, Tec); - ri[WS(rs, 5)] = FMA(KP881921264, Ter, Tec); - Tkb = FMA(KP831469612, Tka, Tk9); - Tkc = Tet + Teu; - ii[WS(rs, 5)] = FMA(KP881921264, Tkc, Tkb); - ii[WS(rs, 37)] = FNMS(KP881921264, Tkc, Tkb); - } - { - E Tes, Tev, Tkd, Tke; - Tes = FNMS(KP831469612, Teb, Te4); - Tev = Tet - Teu; - ri[WS(rs, 53)] = FNMS(KP881921264, Tev, Tes); - ri[WS(rs, 21)] = FMA(KP881921264, Tev, Tes); - Tkd = FNMS(KP831469612, Tka, Tk9); - Tke = Teq - Tej; - ii[WS(rs, 21)] = FMA(KP881921264, Tke, Tkd); - ii[WS(rs, 53)] = FNMS(KP881921264, Tke, Tkd); - } - { - E TeA, TeH, Tkh, Tki; - TeA = FNMS(KP831469612, Tez, Tew); - TeH = TeD - TeG; - ri[WS(rs, 45)] = FNMS(KP956940335, TeH, TeA); - ri[WS(rs, 13)] = FMA(KP956940335, TeH, TeA); - Tkh = FNMS(KP831469612, Tkg, Tkf); - Tki = TeK - TeJ; - ii[WS(rs, 13)] = FMA(KP956940335, Tki, Tkh); - ii[WS(rs, 45)] = FNMS(KP956940335, Tki, Tkh); - } - { - E TeI, TeL, Tkj, Tkk; - TeI = FMA(KP831469612, Tez, Tew); - TeL = TeJ + TeK; - ri[WS(rs, 29)] = FNMS(KP956940335, TeL, TeI); - ri[WS(rs, 61)] = FMA(KP956940335, TeL, TeI); - Tkj = FMA(KP831469612, Tkg, Tkf); - Tkk = TeD + TeG; - ii[WS(rs, 29)] = FNMS(KP956940335, Tkk, Tkj); - ii[WS(rs, 61)] = FMA(KP956940335, Tkk, Tkj); - } - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_FULL, 0, 64 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 64, "t1_64", twinstr, &GENUS, { 520, 126, 518, 0 }, 0, 0, 0 }; - -void X(codelet_t1_64) (planner *p) { - X(kdft_dit_register) (p, t1_64, &desc); -} -#else - -/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 64 -name t1_64 -include dft/scalar/t.h */ - -/* - * This function contains 1038 FP additions, 500 FP multiplications, - * (or, 808 additions, 270 multiplications, 230 fused multiply/add), - * 176 stack variables, 15 constants, and 256 memory accesses - */ -#include "dft/scalar/t.h" - -static void t1_64(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - DK(KP471396736, +0.471396736825997648556387625905254377657460319); - DK(KP881921264, +0.881921264348355029712756863660388349508442621); - DK(KP290284677, +0.290284677254462367636192375817395274691476278); - DK(KP956940335, +0.956940335732208864935797886980269969482849206); - DK(KP634393284, +0.634393284163645498215171613225493370675687095); - DK(KP773010453, +0.773010453362736960810906609758469800971041293); - DK(KP098017140, +0.098017140329560601994195563888641845861136673); - DK(KP995184726, +0.995184726672196886244836953109479921575474869); - DK(KP555570233, +0.555570233019602224742830813948532874374937191); - DK(KP831469612, +0.831469612302545237078788377617905756738560812); - DK(KP980785280, +0.980785280403230449126182236134239036973933731); - DK(KP195090322, +0.195090322016128267848284868477022240927691618); - DK(KP923879532, +0.923879532511286756128183189396788286822416626); - DK(KP382683432, +0.382683432365089771728459984030398866761344562); - DK(KP707106781, +0.707106781186547524400844362104849039284835938); - { - INT m; - for (m = mb, W = W + (mb * 126); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 126, MAKE_VOLATILE_STRIDE(128, rs)) { - E Tj, TcL, ThT, Tin, T6b, Taz, TgT, Thn, TG, Thm, TcO, TgO, T6m, ThQ, TaC; - E Tim, T14, Tfq, T6y, T9O, TaG, Tc0, TcU, TeE, T1r, Tfr, T6J, T9P, TaJ, Tc1; - E TcZ, TeF, T1Q, T2d, Tfx, Tfu, Tfv, Tfw, T6Q, TaM, Tdb, TeJ, T71, TaQ, T7a; - E TaN, Td6, TeI, T77, TaP, T2B, T2Y, Tfz, TfA, TfB, TfC, T7h, TaW, Tdm, TeM; - E T7s, TaU, T7B, TaX, Tdh, TeL, T7y, TaT, T5j, TfR, Tec, Tf0, TfY, Tgy, T8D; - E Tbl, T8O, Tbx, T9l, Tbm, TdV, TeX, T9i, Tbw, T3M, TfL, TdL, TeQ, TfI, Tgt; - E T7K, Tb2, T7V, Tbe, T8s, Tb3, Tdu, TeT, T8p, Tbd, T4x, TfJ, TdE, TdM, TfO; - E Tgu, T87, T8v, T8i, T8u, Tba, Tbg, Tdz, TdN, Tb7, Tbh, T64, TfZ, Te5, Ted; - E TfU, Tgz, T90, T9o, T9b, T9n, Tbt, Tbz, Te0, Tee, Tbq, TbA; - { - E T1, TgR, T6, TgQ, Tc, T68, Th, T69; - T1 = ri[0]; - TgR = ii[0]; - { - E T3, T5, T2, T4; - T3 = ri[WS(rs, 32)]; - T5 = ii[WS(rs, 32)]; - T2 = W[62]; - T4 = W[63]; - T6 = FMA(T2, T3, T4 * T5); - TgQ = FNMS(T4, T3, T2 * T5); - } - { - E T9, Tb, T8, Ta; - T9 = ri[WS(rs, 16)]; - Tb = ii[WS(rs, 16)]; - T8 = W[30]; - Ta = W[31]; - Tc = FMA(T8, T9, Ta * Tb); - T68 = FNMS(Ta, T9, T8 * Tb); - } - { - E Te, Tg, Td, Tf; - Te = ri[WS(rs, 48)]; - Tg = ii[WS(rs, 48)]; - Td = W[94]; - Tf = W[95]; - Th = FMA(Td, Te, Tf * Tg); - T69 = FNMS(Tf, Te, Td * Tg); - } - { - E T7, Ti, ThR, ThS; - T7 = T1 + T6; - Ti = Tc + Th; - Tj = T7 + Ti; - TcL = T7 - Ti; - ThR = TgR - TgQ; - ThS = Tc - Th; - ThT = ThR - ThS; - Tin = ThS + ThR; - } - { - E T67, T6a, TgP, TgS; - T67 = T1 - T6; - T6a = T68 - T69; - T6b = T67 - T6a; - Taz = T67 + T6a; - TgP = T68 + T69; - TgS = TgQ + TgR; - TgT = TgP + TgS; - Thn = TgS - TgP; - } - } - { - E To, T6c, Tt, T6d, T6e, T6f, Tz, T6i, TE, T6j, T6h, T6k; - { - E Tl, Tn, Tk, Tm; - Tl = ri[WS(rs, 8)]; - Tn = ii[WS(rs, 8)]; - Tk = W[14]; - Tm = W[15]; - To = FMA(Tk, Tl, Tm * Tn); - T6c = FNMS(Tm, Tl, Tk * Tn); - } - { - E Tq, Ts, Tp, Tr; - Tq = ri[WS(rs, 40)]; - Ts = ii[WS(rs, 40)]; - Tp = W[78]; - Tr = W[79]; - Tt = FMA(Tp, Tq, Tr * Ts); - T6d = FNMS(Tr, Tq, Tp * Ts); - } - T6e = T6c - T6d; - T6f = To - Tt; - { - E Tw, Ty, Tv, Tx; - Tw = ri[WS(rs, 56)]; - Ty = ii[WS(rs, 56)]; - Tv = W[110]; - Tx = W[111]; - Tz = FMA(Tv, Tw, Tx * Ty); - T6i = FNMS(Tx, Tw, Tv * Ty); - } - { - E TB, TD, TA, TC; - TB = ri[WS(rs, 24)]; - TD = ii[WS(rs, 24)]; - TA = W[46]; - TC = W[47]; - TE = FMA(TA, TB, TC * TD); - T6j = FNMS(TC, TB, TA * TD); - } - T6h = Tz - TE; - T6k = T6i - T6j; - { - E Tu, TF, TcM, TcN; - Tu = To + Tt; - TF = Tz + TE; - TG = Tu + TF; - Thm = TF - Tu; - TcM = T6c + T6d; - TcN = T6i + T6j; - TcO = TcM - TcN; - TgO = TcM + TcN; - } - { - E T6g, T6l, TaA, TaB; - T6g = T6e - T6f; - T6l = T6h + T6k; - T6m = KP707106781 * (T6g - T6l); - ThQ = KP707106781 * (T6g + T6l); - TaA = T6f + T6e; - TaB = T6h - T6k; - TaC = KP707106781 * (TaA + TaB); - Tim = KP707106781 * (TaB - TaA); - } - } - { - E TS, TcQ, T6q, T6t, T13, TcR, T6r, T6w, T6s, T6x; - { - E TM, T6o, TR, T6p; - { - E TJ, TL, TI, TK; - TJ = ri[WS(rs, 4)]; - TL = ii[WS(rs, 4)]; - TI = W[6]; - TK = W[7]; - TM = FMA(TI, TJ, TK * TL); - T6o = FNMS(TK, TJ, TI * TL); - } - { - E TO, TQ, TN, TP; - TO = ri[WS(rs, 36)]; - TQ = ii[WS(rs, 36)]; - TN = W[70]; - TP = W[71]; - TR = FMA(TN, TO, TP * TQ); - T6p = FNMS(TP, TO, TN * TQ); - } - TS = TM + TR; - TcQ = T6o + T6p; - T6q = T6o - T6p; - T6t = TM - TR; - } - { - E TX, T6u, T12, T6v; - { - E TU, TW, TT, TV; - TU = ri[WS(rs, 20)]; - TW = ii[WS(rs, 20)]; - TT = W[38]; - TV = W[39]; - TX = FMA(TT, TU, TV * TW); - T6u = FNMS(TV, TU, TT * TW); - } - { - E TZ, T11, TY, T10; - TZ = ri[WS(rs, 52)]; - T11 = ii[WS(rs, 52)]; - TY = W[102]; - T10 = W[103]; - T12 = FMA(TY, TZ, T10 * T11); - T6v = FNMS(T10, TZ, TY * T11); - } - T13 = TX + T12; - TcR = T6u + T6v; - T6r = TX - T12; - T6w = T6u - T6v; - } - T14 = TS + T13; - Tfq = TcQ + TcR; - T6s = T6q + T6r; - T6x = T6t - T6w; - T6y = FNMS(KP923879532, T6x, KP382683432 * T6s); - T9O = FMA(KP923879532, T6s, KP382683432 * T6x); - { - E TaE, TaF, TcS, TcT; - TaE = T6q - T6r; - TaF = T6t + T6w; - TaG = FNMS(KP382683432, TaF, KP923879532 * TaE); - Tc0 = FMA(KP382683432, TaE, KP923879532 * TaF); - TcS = TcQ - TcR; - TcT = TS - T13; - TcU = TcS - TcT; - TeE = TcT + TcS; - } - } - { - E T1f, TcW, T6B, T6E, T1q, TcX, T6C, T6H, T6D, T6I; - { - E T19, T6z, T1e, T6A; - { - E T16, T18, T15, T17; - T16 = ri[WS(rs, 60)]; - T18 = ii[WS(rs, 60)]; - T15 = W[118]; - T17 = W[119]; - T19 = FMA(T15, T16, T17 * T18); - T6z = FNMS(T17, T16, T15 * T18); - } - { - E T1b, T1d, T1a, T1c; - T1b = ri[WS(rs, 28)]; - T1d = ii[WS(rs, 28)]; - T1a = W[54]; - T1c = W[55]; - T1e = FMA(T1a, T1b, T1c * T1d); - T6A = FNMS(T1c, T1b, T1a * T1d); - } - T1f = T19 + T1e; - TcW = T6z + T6A; - T6B = T6z - T6A; - T6E = T19 - T1e; - } - { - E T1k, T6F, T1p, T6G; - { - E T1h, T1j, T1g, T1i; - T1h = ri[WS(rs, 12)]; - T1j = ii[WS(rs, 12)]; - T1g = W[22]; - T1i = W[23]; - T1k = FMA(T1g, T1h, T1i * T1j); - T6F = FNMS(T1i, T1h, T1g * T1j); - } - { - E T1m, T1o, T1l, T1n; - T1m = ri[WS(rs, 44)]; - T1o = ii[WS(rs, 44)]; - T1l = W[86]; - T1n = W[87]; - T1p = FMA(T1l, T1m, T1n * T1o); - T6G = FNMS(T1n, T1m, T1l * T1o); - } - T1q = T1k + T1p; - TcX = T6F + T6G; - T6C = T1k - T1p; - T6H = T6F - T6G; - } - T1r = T1f + T1q; - Tfr = TcW + TcX; - T6D = T6B + T6C; - T6I = T6E - T6H; - T6J = FMA(KP382683432, T6D, KP923879532 * T6I); - T9P = FNMS(KP923879532, T6D, KP382683432 * T6I); - { - E TaH, TaI, TcV, TcY; - TaH = T6B - T6C; - TaI = T6E + T6H; - TaJ = FMA(KP923879532, TaH, KP382683432 * TaI); - Tc1 = FNMS(KP382683432, TaH, KP923879532 * TaI); - TcV = T1f - T1q; - TcY = TcW - TcX; - TcZ = TcV + TcY; - TeF = TcV - TcY; - } - } - { - E T1y, T6M, T1D, T6N, T1E, Td2, T1J, T74, T1O, T75, T1P, Td3, T21, Td8, T6W; - E T6Z, T2c, Td9, T6R, T6U; - { - E T1v, T1x, T1u, T1w; - T1v = ri[WS(rs, 2)]; - T1x = ii[WS(rs, 2)]; - T1u = W[2]; - T1w = W[3]; - T1y = FMA(T1u, T1v, T1w * T1x); - T6M = FNMS(T1w, T1v, T1u * T1x); - } - { - E T1A, T1C, T1z, T1B; - T1A = ri[WS(rs, 34)]; - T1C = ii[WS(rs, 34)]; - T1z = W[66]; - T1B = W[67]; - T1D = FMA(T1z, T1A, T1B * T1C); - T6N = FNMS(T1B, T1A, T1z * T1C); - } - T1E = T1y + T1D; - Td2 = T6M + T6N; - { - E T1G, T1I, T1F, T1H; - T1G = ri[WS(rs, 18)]; - T1I = ii[WS(rs, 18)]; - T1F = W[34]; - T1H = W[35]; - T1J = FMA(T1F, T1G, T1H * T1I); - T74 = FNMS(T1H, T1G, T1F * T1I); - } - { - E T1L, T1N, T1K, T1M; - T1L = ri[WS(rs, 50)]; - T1N = ii[WS(rs, 50)]; - T1K = W[98]; - T1M = W[99]; - T1O = FMA(T1K, T1L, T1M * T1N); - T75 = FNMS(T1M, T1L, T1K * T1N); - } - T1P = T1J + T1O; - Td3 = T74 + T75; - { - E T1V, T6X, T20, T6Y; - { - E T1S, T1U, T1R, T1T; - T1S = ri[WS(rs, 10)]; - T1U = ii[WS(rs, 10)]; - T1R = W[18]; - T1T = W[19]; - T1V = FMA(T1R, T1S, T1T * T1U); - T6X = FNMS(T1T, T1S, T1R * T1U); - } - { - E T1X, T1Z, T1W, T1Y; - T1X = ri[WS(rs, 42)]; - T1Z = ii[WS(rs, 42)]; - T1W = W[82]; - T1Y = W[83]; - T20 = FMA(T1W, T1X, T1Y * T1Z); - T6Y = FNMS(T1Y, T1X, T1W * T1Z); - } - T21 = T1V + T20; - Td8 = T6X + T6Y; - T6W = T1V - T20; - T6Z = T6X - T6Y; - } - { - E T26, T6S, T2b, T6T; - { - E T23, T25, T22, T24; - T23 = ri[WS(rs, 58)]; - T25 = ii[WS(rs, 58)]; - T22 = W[114]; - T24 = W[115]; - T26 = FMA(T22, T23, T24 * T25); - T6S = FNMS(T24, T23, T22 * T25); - } - { - E T28, T2a, T27, T29; - T28 = ri[WS(rs, 26)]; - T2a = ii[WS(rs, 26)]; - T27 = W[50]; - T29 = W[51]; - T2b = FMA(T27, T28, T29 * T2a); - T6T = FNMS(T29, T28, T27 * T2a); - } - T2c = T26 + T2b; - Td9 = T6S + T6T; - T6R = T26 - T2b; - T6U = T6S - T6T; - } - T1Q = T1E + T1P; - T2d = T21 + T2c; - Tfx = T1Q - T2d; - Tfu = Td2 + Td3; - Tfv = Td8 + Td9; - Tfw = Tfu - Tfv; - { - E T6O, T6P, Td7, Tda; - T6O = T6M - T6N; - T6P = T1J - T1O; - T6Q = T6O + T6P; - TaM = T6O - T6P; - Td7 = T1E - T1P; - Tda = Td8 - Td9; - Tdb = Td7 - Tda; - TeJ = Td7 + Tda; - } - { - E T6V, T70, T78, T79; - T6V = T6R - T6U; - T70 = T6W + T6Z; - T71 = KP707106781 * (T6V - T70); - TaQ = KP707106781 * (T70 + T6V); - T78 = T6Z - T6W; - T79 = T6R + T6U; - T7a = KP707106781 * (T78 - T79); - TaN = KP707106781 * (T78 + T79); - } - { - E Td4, Td5, T73, T76; - Td4 = Td2 - Td3; - Td5 = T2c - T21; - Td6 = Td4 - Td5; - TeI = Td4 + Td5; - T73 = T1y - T1D; - T76 = T74 - T75; - T77 = T73 - T76; - TaP = T73 + T76; - } - } - { - E T2j, T7d, T2o, T7e, T2p, Tdd, T2u, T7v, T2z, T7w, T2A, Tde, T2M, Tdj, T7n; - E T7q, T2X, Tdk, T7i, T7l; - { - E T2g, T2i, T2f, T2h; - T2g = ri[WS(rs, 62)]; - T2i = ii[WS(rs, 62)]; - T2f = W[122]; - T2h = W[123]; - T2j = FMA(T2f, T2g, T2h * T2i); - T7d = FNMS(T2h, T2g, T2f * T2i); - } - { - E T2l, T2n, T2k, T2m; - T2l = ri[WS(rs, 30)]; - T2n = ii[WS(rs, 30)]; - T2k = W[58]; - T2m = W[59]; - T2o = FMA(T2k, T2l, T2m * T2n); - T7e = FNMS(T2m, T2l, T2k * T2n); - } - T2p = T2j + T2o; - Tdd = T7d + T7e; - { - E T2r, T2t, T2q, T2s; - T2r = ri[WS(rs, 14)]; - T2t = ii[WS(rs, 14)]; - T2q = W[26]; - T2s = W[27]; - T2u = FMA(T2q, T2r, T2s * T2t); - T7v = FNMS(T2s, T2r, T2q * T2t); - } - { - E T2w, T2y, T2v, T2x; - T2w = ri[WS(rs, 46)]; - T2y = ii[WS(rs, 46)]; - T2v = W[90]; - T2x = W[91]; - T2z = FMA(T2v, T2w, T2x * T2y); - T7w = FNMS(T2x, T2w, T2v * T2y); - } - T2A = T2u + T2z; - Tde = T7v + T7w; - { - E T2G, T7o, T2L, T7p; - { - E T2D, T2F, T2C, T2E; - T2D = ri[WS(rs, 6)]; - T2F = ii[WS(rs, 6)]; - T2C = W[10]; - T2E = W[11]; - T2G = FMA(T2C, T2D, T2E * T2F); - T7o = FNMS(T2E, T2D, T2C * T2F); - } - { - E T2I, T2K, T2H, T2J; - T2I = ri[WS(rs, 38)]; - T2K = ii[WS(rs, 38)]; - T2H = W[74]; - T2J = W[75]; - T2L = FMA(T2H, T2I, T2J * T2K); - T7p = FNMS(T2J, T2I, T2H * T2K); - } - T2M = T2G + T2L; - Tdj = T7o + T7p; - T7n = T2G - T2L; - T7q = T7o - T7p; - } - { - E T2R, T7j, T2W, T7k; - { - E T2O, T2Q, T2N, T2P; - T2O = ri[WS(rs, 54)]; - T2Q = ii[WS(rs, 54)]; - T2N = W[106]; - T2P = W[107]; - T2R = FMA(T2N, T2O, T2P * T2Q); - T7j = FNMS(T2P, T2O, T2N * T2Q); - } - { - E T2T, T2V, T2S, T2U; - T2T = ri[WS(rs, 22)]; - T2V = ii[WS(rs, 22)]; - T2S = W[42]; - T2U = W[43]; - T2W = FMA(T2S, T2T, T2U * T2V); - T7k = FNMS(T2U, T2T, T2S * T2V); - } - T2X = T2R + T2W; - Tdk = T7j + T7k; - T7i = T2R - T2W; - T7l = T7j - T7k; - } - T2B = T2p + T2A; - T2Y = T2M + T2X; - Tfz = T2B - T2Y; - TfA = Tdd + Tde; - TfB = Tdj + Tdk; - TfC = TfA - TfB; - { - E T7f, T7g, Tdi, Tdl; - T7f = T7d - T7e; - T7g = T2u - T2z; - T7h = T7f + T7g; - TaW = T7f - T7g; - Tdi = T2p - T2A; - Tdl = Tdj - Tdk; - Tdm = Tdi - Tdl; - TeM = Tdi + Tdl; - } - { - E T7m, T7r, T7z, T7A; - T7m = T7i - T7l; - T7r = T7n + T7q; - T7s = KP707106781 * (T7m - T7r); - TaU = KP707106781 * (T7r + T7m); - T7z = T7q - T7n; - T7A = T7i + T7l; - T7B = KP707106781 * (T7z - T7A); - TaX = KP707106781 * (T7z + T7A); - } - { - E Tdf, Tdg, T7u, T7x; - Tdf = Tdd - Tde; - Tdg = T2X - T2M; - Tdh = Tdf - Tdg; - TeL = Tdf + Tdg; - T7u = T2j - T2o; - T7x = T7v - T7w; - T7y = T7u - T7x; - TaT = T7u + T7x; - } - } - { - E T4D, T9e, T4I, T9f, T4J, Te8, T4O, T8A, T4T, T8B, T4U, Te9, T56, TdS, T8G; - E T8H, T5h, TdT, T8J, T8M; - { - E T4A, T4C, T4z, T4B; - T4A = ri[WS(rs, 63)]; - T4C = ii[WS(rs, 63)]; - T4z = W[124]; - T4B = W[125]; - T4D = FMA(T4z, T4A, T4B * T4C); - T9e = FNMS(T4B, T4A, T4z * T4C); - } - { - E T4F, T4H, T4E, T4G; - T4F = ri[WS(rs, 31)]; - T4H = ii[WS(rs, 31)]; - T4E = W[60]; - T4G = W[61]; - T4I = FMA(T4E, T4F, T4G * T4H); - T9f = FNMS(T4G, T4F, T4E * T4H); - } - T4J = T4D + T4I; - Te8 = T9e + T9f; - { - E T4L, T4N, T4K, T4M; - T4L = ri[WS(rs, 15)]; - T4N = ii[WS(rs, 15)]; - T4K = W[28]; - T4M = W[29]; - T4O = FMA(T4K, T4L, T4M * T4N); - T8A = FNMS(T4M, T4L, T4K * T4N); - } - { - E T4Q, T4S, T4P, T4R; - T4Q = ri[WS(rs, 47)]; - T4S = ii[WS(rs, 47)]; - T4P = W[92]; - T4R = W[93]; - T4T = FMA(T4P, T4Q, T4R * T4S); - T8B = FNMS(T4R, T4Q, T4P * T4S); - } - T4U = T4O + T4T; - Te9 = T8A + T8B; - { - E T50, T8E, T55, T8F; - { - E T4X, T4Z, T4W, T4Y; - T4X = ri[WS(rs, 7)]; - T4Z = ii[WS(rs, 7)]; - T4W = W[12]; - T4Y = W[13]; - T50 = FMA(T4W, T4X, T4Y * T4Z); - T8E = FNMS(T4Y, T4X, T4W * T4Z); - } - { - E T52, T54, T51, T53; - T52 = ri[WS(rs, 39)]; - T54 = ii[WS(rs, 39)]; - T51 = W[76]; - T53 = W[77]; - T55 = FMA(T51, T52, T53 * T54); - T8F = FNMS(T53, T52, T51 * T54); - } - T56 = T50 + T55; - TdS = T8E + T8F; - T8G = T8E - T8F; - T8H = T50 - T55; - } - { - E T5b, T8K, T5g, T8L; - { - E T58, T5a, T57, T59; - T58 = ri[WS(rs, 55)]; - T5a = ii[WS(rs, 55)]; - T57 = W[108]; - T59 = W[109]; - T5b = FMA(T57, T58, T59 * T5a); - T8K = FNMS(T59, T58, T57 * T5a); - } - { - E T5d, T5f, T5c, T5e; - T5d = ri[WS(rs, 23)]; - T5f = ii[WS(rs, 23)]; - T5c = W[44]; - T5e = W[45]; - T5g = FMA(T5c, T5d, T5e * T5f); - T8L = FNMS(T5e, T5d, T5c * T5f); - } - T5h = T5b + T5g; - TdT = T8K + T8L; - T8J = T5b - T5g; - T8M = T8K - T8L; - } - { - E T4V, T5i, Tea, Teb; - T4V = T4J + T4U; - T5i = T56 + T5h; - T5j = T4V + T5i; - TfR = T4V - T5i; - Tea = Te8 - Te9; - Teb = T5h - T56; - Tec = Tea - Teb; - Tf0 = Tea + Teb; - } - { - E TfW, TfX, T8z, T8C; - TfW = Te8 + Te9; - TfX = TdS + TdT; - TfY = TfW - TfX; - Tgy = TfW + TfX; - T8z = T4D - T4I; - T8C = T8A - T8B; - T8D = T8z - T8C; - Tbl = T8z + T8C; - } - { - E T8I, T8N, T9j, T9k; - T8I = T8G - T8H; - T8N = T8J + T8M; - T8O = KP707106781 * (T8I - T8N); - Tbx = KP707106781 * (T8I + T8N); - T9j = T8J - T8M; - T9k = T8H + T8G; - T9l = KP707106781 * (T9j - T9k); - Tbm = KP707106781 * (T9k + T9j); - } - { - E TdR, TdU, T9g, T9h; - TdR = T4J - T4U; - TdU = TdS - TdT; - TdV = TdR - TdU; - TeX = TdR + TdU; - T9g = T9e - T9f; - T9h = T4O - T4T; - T9i = T9g + T9h; - Tbw = T9g - T9h; - } - } - { - E T36, T7G, T3b, T7H, T3c, Tdq, T3h, T8m, T3m, T8n, T3n, Tdr, T3z, TdI, T7Q; - E T7T, T3K, TdJ, T7L, T7O; - { - E T33, T35, T32, T34; - T33 = ri[WS(rs, 1)]; - T35 = ii[WS(rs, 1)]; - T32 = W[0]; - T34 = W[1]; - T36 = FMA(T32, T33, T34 * T35); - T7G = FNMS(T34, T33, T32 * T35); - } - { - E T38, T3a, T37, T39; - T38 = ri[WS(rs, 33)]; - T3a = ii[WS(rs, 33)]; - T37 = W[64]; - T39 = W[65]; - T3b = FMA(T37, T38, T39 * T3a); - T7H = FNMS(T39, T38, T37 * T3a); - } - T3c = T36 + T3b; - Tdq = T7G + T7H; - { - E T3e, T3g, T3d, T3f; - T3e = ri[WS(rs, 17)]; - T3g = ii[WS(rs, 17)]; - T3d = W[32]; - T3f = W[33]; - T3h = FMA(T3d, T3e, T3f * T3g); - T8m = FNMS(T3f, T3e, T3d * T3g); - } - { - E T3j, T3l, T3i, T3k; - T3j = ri[WS(rs, 49)]; - T3l = ii[WS(rs, 49)]; - T3i = W[96]; - T3k = W[97]; - T3m = FMA(T3i, T3j, T3k * T3l); - T8n = FNMS(T3k, T3j, T3i * T3l); - } - T3n = T3h + T3m; - Tdr = T8m + T8n; - { - E T3t, T7R, T3y, T7S; - { - E T3q, T3s, T3p, T3r; - T3q = ri[WS(rs, 9)]; - T3s = ii[WS(rs, 9)]; - T3p = W[16]; - T3r = W[17]; - T3t = FMA(T3p, T3q, T3r * T3s); - T7R = FNMS(T3r, T3q, T3p * T3s); - } - { - E T3v, T3x, T3u, T3w; - T3v = ri[WS(rs, 41)]; - T3x = ii[WS(rs, 41)]; - T3u = W[80]; - T3w = W[81]; - T3y = FMA(T3u, T3v, T3w * T3x); - T7S = FNMS(T3w, T3v, T3u * T3x); - } - T3z = T3t + T3y; - TdI = T7R + T7S; - T7Q = T3t - T3y; - T7T = T7R - T7S; - } - { - E T3E, T7M, T3J, T7N; - { - E T3B, T3D, T3A, T3C; - T3B = ri[WS(rs, 57)]; - T3D = ii[WS(rs, 57)]; - T3A = W[112]; - T3C = W[113]; - T3E = FMA(T3A, T3B, T3C * T3D); - T7M = FNMS(T3C, T3B, T3A * T3D); - } - { - E T3G, T3I, T3F, T3H; - T3G = ri[WS(rs, 25)]; - T3I = ii[WS(rs, 25)]; - T3F = W[48]; - T3H = W[49]; - T3J = FMA(T3F, T3G, T3H * T3I); - T7N = FNMS(T3H, T3G, T3F * T3I); - } - T3K = T3E + T3J; - TdJ = T7M + T7N; - T7L = T3E - T3J; - T7O = T7M - T7N; - } - { - E T3o, T3L, TdH, TdK; - T3o = T3c + T3n; - T3L = T3z + T3K; - T3M = T3o + T3L; - TfL = T3o - T3L; - TdH = T3c - T3n; - TdK = TdI - TdJ; - TdL = TdH - TdK; - TeQ = TdH + TdK; - } - { - E TfG, TfH, T7I, T7J; - TfG = Tdq + Tdr; - TfH = TdI + TdJ; - TfI = TfG - TfH; - Tgt = TfG + TfH; - T7I = T7G - T7H; - T7J = T3h - T3m; - T7K = T7I + T7J; - Tb2 = T7I - T7J; - } - { - E T7P, T7U, T8q, T8r; - T7P = T7L - T7O; - T7U = T7Q + T7T; - T7V = KP707106781 * (T7P - T7U); - Tbe = KP707106781 * (T7U + T7P); - T8q = T7T - T7Q; - T8r = T7L + T7O; - T8s = KP707106781 * (T8q - T8r); - Tb3 = KP707106781 * (T8q + T8r); - } - { - E Tds, Tdt, T8l, T8o; - Tds = Tdq - Tdr; - Tdt = T3K - T3z; - Tdu = Tds - Tdt; - TeT = Tds + Tdt; - T8l = T36 - T3b; - T8o = T8m - T8n; - T8p = T8l - T8o; - Tbd = T8l + T8o; - } - } - { - E T3X, TdB, T8a, T8d, T4v, Tdx, T80, T85, T48, TdC, T8b, T8g, T4k, Tdw, T7X; - E T84; - { - E T3R, T88, T3W, T89; - { - E T3O, T3Q, T3N, T3P; - T3O = ri[WS(rs, 5)]; - T3Q = ii[WS(rs, 5)]; - T3N = W[8]; - T3P = W[9]; - T3R = FMA(T3N, T3O, T3P * T3Q); - T88 = FNMS(T3P, T3O, T3N * T3Q); - } - { - E T3T, T3V, T3S, T3U; - T3T = ri[WS(rs, 37)]; - T3V = ii[WS(rs, 37)]; - T3S = W[72]; - T3U = W[73]; - T3W = FMA(T3S, T3T, T3U * T3V); - T89 = FNMS(T3U, T3T, T3S * T3V); - } - T3X = T3R + T3W; - TdB = T88 + T89; - T8a = T88 - T89; - T8d = T3R - T3W; - } - { - E T4p, T7Y, T4u, T7Z; - { - E T4m, T4o, T4l, T4n; - T4m = ri[WS(rs, 13)]; - T4o = ii[WS(rs, 13)]; - T4l = W[24]; - T4n = W[25]; - T4p = FMA(T4l, T4m, T4n * T4o); - T7Y = FNMS(T4n, T4m, T4l * T4o); - } - { - E T4r, T4t, T4q, T4s; - T4r = ri[WS(rs, 45)]; - T4t = ii[WS(rs, 45)]; - T4q = W[88]; - T4s = W[89]; - T4u = FMA(T4q, T4r, T4s * T4t); - T7Z = FNMS(T4s, T4r, T4q * T4t); - } - T4v = T4p + T4u; - Tdx = T7Y + T7Z; - T80 = T7Y - T7Z; - T85 = T4p - T4u; - } - { - E T42, T8e, T47, T8f; - { - E T3Z, T41, T3Y, T40; - T3Z = ri[WS(rs, 21)]; - T41 = ii[WS(rs, 21)]; - T3Y = W[40]; - T40 = W[41]; - T42 = FMA(T3Y, T3Z, T40 * T41); - T8e = FNMS(T40, T3Z, T3Y * T41); - } - { - E T44, T46, T43, T45; - T44 = ri[WS(rs, 53)]; - T46 = ii[WS(rs, 53)]; - T43 = W[104]; - T45 = W[105]; - T47 = FMA(T43, T44, T45 * T46); - T8f = FNMS(T45, T44, T43 * T46); - } - T48 = T42 + T47; - TdC = T8e + T8f; - T8b = T42 - T47; - T8g = T8e - T8f; - } - { - E T4e, T82, T4j, T83; - { - E T4b, T4d, T4a, T4c; - T4b = ri[WS(rs, 61)]; - T4d = ii[WS(rs, 61)]; - T4a = W[120]; - T4c = W[121]; - T4e = FMA(T4a, T4b, T4c * T4d); - T82 = FNMS(T4c, T4b, T4a * T4d); - } - { - E T4g, T4i, T4f, T4h; - T4g = ri[WS(rs, 29)]; - T4i = ii[WS(rs, 29)]; - T4f = W[56]; - T4h = W[57]; - T4j = FMA(T4f, T4g, T4h * T4i); - T83 = FNMS(T4h, T4g, T4f * T4i); - } - T4k = T4e + T4j; - Tdw = T82 + T83; - T7X = T4e - T4j; - T84 = T82 - T83; - } - { - E T49, T4w, TdA, TdD; - T49 = T3X + T48; - T4w = T4k + T4v; - T4x = T49 + T4w; - TfJ = T4w - T49; - TdA = T3X - T48; - TdD = TdB - TdC; - TdE = TdA + TdD; - TdM = TdD - TdA; - } - { - E TfM, TfN, T81, T86; - TfM = TdB + TdC; - TfN = Tdw + Tdx; - TfO = TfM - TfN; - Tgu = TfM + TfN; - T81 = T7X - T80; - T86 = T84 + T85; - T87 = FNMS(KP923879532, T86, KP382683432 * T81); - T8v = FMA(KP382683432, T86, KP923879532 * T81); - } - { - E T8c, T8h, Tb8, Tb9; - T8c = T8a + T8b; - T8h = T8d - T8g; - T8i = FMA(KP923879532, T8c, KP382683432 * T8h); - T8u = FNMS(KP923879532, T8h, KP382683432 * T8c); - Tb8 = T8a - T8b; - Tb9 = T8d + T8g; - Tba = FMA(KP382683432, Tb8, KP923879532 * Tb9); - Tbg = FNMS(KP382683432, Tb9, KP923879532 * Tb8); - } - { - E Tdv, Tdy, Tb5, Tb6; - Tdv = T4k - T4v; - Tdy = Tdw - Tdx; - Tdz = Tdv - Tdy; - TdN = Tdv + Tdy; - Tb5 = T7X + T80; - Tb6 = T84 - T85; - Tb7 = FNMS(KP382683432, Tb6, KP923879532 * Tb5); - Tbh = FMA(KP923879532, Tb6, KP382683432 * Tb5); - } - } - { - E T5u, TdW, T8S, T8V, T62, Te3, T94, T99, T5F, TdX, T8T, T8Y, T5R, Te2, T93; - E T96; - { - E T5o, T8Q, T5t, T8R; - { - E T5l, T5n, T5k, T5m; - T5l = ri[WS(rs, 3)]; - T5n = ii[WS(rs, 3)]; - T5k = W[4]; - T5m = W[5]; - T5o = FMA(T5k, T5l, T5m * T5n); - T8Q = FNMS(T5m, T5l, T5k * T5n); - } - { - E T5q, T5s, T5p, T5r; - T5q = ri[WS(rs, 35)]; - T5s = ii[WS(rs, 35)]; - T5p = W[68]; - T5r = W[69]; - T5t = FMA(T5p, T5q, T5r * T5s); - T8R = FNMS(T5r, T5q, T5p * T5s); - } - T5u = T5o + T5t; - TdW = T8Q + T8R; - T8S = T8Q - T8R; - T8V = T5o - T5t; - } - { - E T5W, T97, T61, T98; - { - E T5T, T5V, T5S, T5U; - T5T = ri[WS(rs, 11)]; - T5V = ii[WS(rs, 11)]; - T5S = W[20]; - T5U = W[21]; - T5W = FMA(T5S, T5T, T5U * T5V); - T97 = FNMS(T5U, T5T, T5S * T5V); - } - { - E T5Y, T60, T5X, T5Z; - T5Y = ri[WS(rs, 43)]; - T60 = ii[WS(rs, 43)]; - T5X = W[84]; - T5Z = W[85]; - T61 = FMA(T5X, T5Y, T5Z * T60); - T98 = FNMS(T5Z, T5Y, T5X * T60); - } - T62 = T5W + T61; - Te3 = T97 + T98; - T94 = T5W - T61; - T99 = T97 - T98; - } - { - E T5z, T8W, T5E, T8X; - { - E T5w, T5y, T5v, T5x; - T5w = ri[WS(rs, 19)]; - T5y = ii[WS(rs, 19)]; - T5v = W[36]; - T5x = W[37]; - T5z = FMA(T5v, T5w, T5x * T5y); - T8W = FNMS(T5x, T5w, T5v * T5y); - } - { - E T5B, T5D, T5A, T5C; - T5B = ri[WS(rs, 51)]; - T5D = ii[WS(rs, 51)]; - T5A = W[100]; - T5C = W[101]; - T5E = FMA(T5A, T5B, T5C * T5D); - T8X = FNMS(T5C, T5B, T5A * T5D); - } - T5F = T5z + T5E; - TdX = T8W + T8X; - T8T = T5z - T5E; - T8Y = T8W - T8X; - } - { - E T5L, T91, T5Q, T92; - { - E T5I, T5K, T5H, T5J; - T5I = ri[WS(rs, 59)]; - T5K = ii[WS(rs, 59)]; - T5H = W[116]; - T5J = W[117]; - T5L = FMA(T5H, T5I, T5J * T5K); - T91 = FNMS(T5J, T5I, T5H * T5K); - } - { - E T5N, T5P, T5M, T5O; - T5N = ri[WS(rs, 27)]; - T5P = ii[WS(rs, 27)]; - T5M = W[52]; - T5O = W[53]; - T5Q = FMA(T5M, T5N, T5O * T5P); - T92 = FNMS(T5O, T5N, T5M * T5P); - } - T5R = T5L + T5Q; - Te2 = T91 + T92; - T93 = T91 - T92; - T96 = T5L - T5Q; - } - { - E T5G, T63, Te1, Te4; - T5G = T5u + T5F; - T63 = T5R + T62; - T64 = T5G + T63; - TfZ = T63 - T5G; - Te1 = T5R - T62; - Te4 = Te2 - Te3; - Te5 = Te1 + Te4; - Ted = Te1 - Te4; - } - { - E TfS, TfT, T8U, T8Z; - TfS = TdW + TdX; - TfT = Te2 + Te3; - TfU = TfS - TfT; - Tgz = TfS + TfT; - T8U = T8S + T8T; - T8Z = T8V - T8Y; - T90 = FNMS(KP923879532, T8Z, KP382683432 * T8U); - T9o = FMA(KP923879532, T8U, KP382683432 * T8Z); - } - { - E T95, T9a, Tbr, Tbs; - T95 = T93 + T94; - T9a = T96 - T99; - T9b = FMA(KP382683432, T95, KP923879532 * T9a); - T9n = FNMS(KP923879532, T95, KP382683432 * T9a); - Tbr = T93 - T94; - Tbs = T96 + T99; - Tbt = FMA(KP923879532, Tbr, KP382683432 * Tbs); - Tbz = FNMS(KP382683432, Tbr, KP923879532 * Tbs); - } - { - E TdY, TdZ, Tbo, Tbp; - TdY = TdW - TdX; - TdZ = T5u - T5F; - Te0 = TdY - TdZ; - Tee = TdZ + TdY; - Tbo = T8S - T8T; - Tbp = T8V + T8Y; - Tbq = FNMS(KP382683432, Tbp, KP923879532 * Tbo); - TbA = FMA(KP382683432, Tbo, KP923879532 * Tbp); - } - } - { - E T1t, Tgn, TgK, TgL, TgV, Th1, T30, Th0, T66, TgX, Tgw, TgE, TgB, TgF, Tgq; - E TgM; - { - E TH, T1s, TgI, TgJ; - TH = Tj + TG; - T1s = T14 + T1r; - T1t = TH + T1s; - Tgn = TH - T1s; - TgI = Tgt + Tgu; - TgJ = Tgy + Tgz; - TgK = TgI - TgJ; - TgL = TgI + TgJ; - } - { - E TgN, TgU, T2e, T2Z; - TgN = Tfq + Tfr; - TgU = TgO + TgT; - TgV = TgN + TgU; - Th1 = TgU - TgN; - T2e = T1Q + T2d; - T2Z = T2B + T2Y; - T30 = T2e + T2Z; - Th0 = T2Z - T2e; - } - { - E T4y, T65, Tgs, Tgv; - T4y = T3M + T4x; - T65 = T5j + T64; - T66 = T4y + T65; - TgX = T65 - T4y; - Tgs = T3M - T4x; - Tgv = Tgt - Tgu; - Tgw = Tgs + Tgv; - TgE = Tgv - Tgs; - } - { - E Tgx, TgA, Tgo, Tgp; - Tgx = T5j - T64; - TgA = Tgy - Tgz; - TgB = Tgx - TgA; - TgF = Tgx + TgA; - Tgo = Tfu + Tfv; - Tgp = TfA + TfB; - Tgq = Tgo - Tgp; - TgM = Tgo + Tgp; - } - { - E T31, TgW, TgH, TgY; - T31 = T1t + T30; - ri[WS(rs, 32)] = T31 - T66; - ri[0] = T31 + T66; - TgW = TgM + TgV; - ii[0] = TgL + TgW; - ii[WS(rs, 32)] = TgW - TgL; - TgH = T1t - T30; - ri[WS(rs, 48)] = TgH - TgK; - ri[WS(rs, 16)] = TgH + TgK; - TgY = TgV - TgM; - ii[WS(rs, 16)] = TgX + TgY; - ii[WS(rs, 48)] = TgY - TgX; - } - { - E Tgr, TgC, TgZ, Th2; - Tgr = Tgn + Tgq; - TgC = KP707106781 * (Tgw + TgB); - ri[WS(rs, 40)] = Tgr - TgC; - ri[WS(rs, 8)] = Tgr + TgC; - TgZ = KP707106781 * (TgE + TgF); - Th2 = Th0 + Th1; - ii[WS(rs, 8)] = TgZ + Th2; - ii[WS(rs, 40)] = Th2 - TgZ; - } - { - E TgD, TgG, Th3, Th4; - TgD = Tgn - Tgq; - TgG = KP707106781 * (TgE - TgF); - ri[WS(rs, 56)] = TgD - TgG; - ri[WS(rs, 24)] = TgD + TgG; - Th3 = KP707106781 * (TgB - Tgw); - Th4 = Th1 - Th0; - ii[WS(rs, 24)] = Th3 + Th4; - ii[WS(rs, 56)] = Th4 - Th3; - } - } - { - E Tft, Tg7, Tgh, Tgl, Th9, Thf, TfE, Th6, TfQ, Tg4, Tga, The, Tge, Tgk, Tg1; - E Tg5; - { - E Tfp, Tfs, Tgf, Tgg; - Tfp = Tj - TG; - Tfs = Tfq - Tfr; - Tft = Tfp - Tfs; - Tg7 = Tfp + Tfs; - Tgf = TfR + TfU; - Tgg = TfY + TfZ; - Tgh = FNMS(KP382683432, Tgg, KP923879532 * Tgf); - Tgl = FMA(KP923879532, Tgg, KP382683432 * Tgf); - } - { - E Th7, Th8, Tfy, TfD; - Th7 = T1r - T14; - Th8 = TgT - TgO; - Th9 = Th7 + Th8; - Thf = Th8 - Th7; - Tfy = Tfw - Tfx; - TfD = Tfz + TfC; - TfE = KP707106781 * (Tfy - TfD); - Th6 = KP707106781 * (Tfy + TfD); - } - { - E TfK, TfP, Tg8, Tg9; - TfK = TfI - TfJ; - TfP = TfL - TfO; - TfQ = FMA(KP923879532, TfK, KP382683432 * TfP); - Tg4 = FNMS(KP923879532, TfP, KP382683432 * TfK); - Tg8 = Tfx + Tfw; - Tg9 = Tfz - TfC; - Tga = KP707106781 * (Tg8 + Tg9); - The = KP707106781 * (Tg9 - Tg8); - } - { - E Tgc, Tgd, TfV, Tg0; - Tgc = TfI + TfJ; - Tgd = TfL + TfO; - Tge = FMA(KP382683432, Tgc, KP923879532 * Tgd); - Tgk = FNMS(KP382683432, Tgd, KP923879532 * Tgc); - TfV = TfR - TfU; - Tg0 = TfY - TfZ; - Tg1 = FNMS(KP923879532, Tg0, KP382683432 * TfV); - Tg5 = FMA(KP382683432, Tg0, KP923879532 * TfV); - } - { - E TfF, Tg2, Thd, Thg; - TfF = Tft + TfE; - Tg2 = TfQ + Tg1; - ri[WS(rs, 44)] = TfF - Tg2; - ri[WS(rs, 12)] = TfF + Tg2; - Thd = Tg4 + Tg5; - Thg = The + Thf; - ii[WS(rs, 12)] = Thd + Thg; - ii[WS(rs, 44)] = Thg - Thd; - } - { - E Tg3, Tg6, Thh, Thi; - Tg3 = Tft - TfE; - Tg6 = Tg4 - Tg5; - ri[WS(rs, 60)] = Tg3 - Tg6; - ri[WS(rs, 28)] = Tg3 + Tg6; - Thh = Tg1 - TfQ; - Thi = Thf - The; - ii[WS(rs, 28)] = Thh + Thi; - ii[WS(rs, 60)] = Thi - Thh; - } - { - E Tgb, Tgi, Th5, Tha; - Tgb = Tg7 + Tga; - Tgi = Tge + Tgh; - ri[WS(rs, 36)] = Tgb - Tgi; - ri[WS(rs, 4)] = Tgb + Tgi; - Th5 = Tgk + Tgl; - Tha = Th6 + Th9; - ii[WS(rs, 4)] = Th5 + Tha; - ii[WS(rs, 36)] = Tha - Th5; - } - { - E Tgj, Tgm, Thb, Thc; - Tgj = Tg7 - Tga; - Tgm = Tgk - Tgl; - ri[WS(rs, 52)] = Tgj - Tgm; - ri[WS(rs, 20)] = Tgj + Tgm; - Thb = Tgh - Tge; - Thc = Th9 - Th6; - ii[WS(rs, 20)] = Thb + Thc; - ii[WS(rs, 52)] = Thc - Thb; - } - } - { - E Td1, Ten, Tdo, ThA, ThD, ThJ, Teq, ThI, Teh, TeB, Tel, Tex, TdQ, TeA, Tek; - E Teu; - { - E TcP, Td0, Teo, Tep; - TcP = TcL - TcO; - Td0 = KP707106781 * (TcU - TcZ); - Td1 = TcP - Td0; - Ten = TcP + Td0; - { - E Tdc, Tdn, ThB, ThC; - Tdc = FNMS(KP923879532, Tdb, KP382683432 * Td6); - Tdn = FMA(KP382683432, Tdh, KP923879532 * Tdm); - Tdo = Tdc - Tdn; - ThA = Tdc + Tdn; - ThB = KP707106781 * (TeF - TeE); - ThC = Thn - Thm; - ThD = ThB + ThC; - ThJ = ThC - ThB; - } - Teo = FMA(KP923879532, Td6, KP382683432 * Tdb); - Tep = FNMS(KP923879532, Tdh, KP382683432 * Tdm); - Teq = Teo + Tep; - ThI = Tep - Teo; - { - E Te7, Tev, Teg, Tew, Te6, Tef; - Te6 = KP707106781 * (Te0 - Te5); - Te7 = TdV - Te6; - Tev = TdV + Te6; - Tef = KP707106781 * (Ted - Tee); - Teg = Tec - Tef; - Tew = Tec + Tef; - Teh = FNMS(KP980785280, Teg, KP195090322 * Te7); - TeB = FMA(KP831469612, Tew, KP555570233 * Tev); - Tel = FMA(KP195090322, Teg, KP980785280 * Te7); - Tex = FNMS(KP555570233, Tew, KP831469612 * Tev); - } - { - E TdG, Tes, TdP, Tet, TdF, TdO; - TdF = KP707106781 * (Tdz - TdE); - TdG = Tdu - TdF; - Tes = Tdu + TdF; - TdO = KP707106781 * (TdM - TdN); - TdP = TdL - TdO; - Tet = TdL + TdO; - TdQ = FMA(KP980785280, TdG, KP195090322 * TdP); - TeA = FNMS(KP555570233, Tet, KP831469612 * Tes); - Tek = FNMS(KP980785280, TdP, KP195090322 * TdG); - Teu = FMA(KP555570233, Tes, KP831469612 * Tet); - } - } - { - E Tdp, Tei, ThH, ThK; - Tdp = Td1 + Tdo; - Tei = TdQ + Teh; - ri[WS(rs, 46)] = Tdp - Tei; - ri[WS(rs, 14)] = Tdp + Tei; - ThH = Tek + Tel; - ThK = ThI + ThJ; - ii[WS(rs, 14)] = ThH + ThK; - ii[WS(rs, 46)] = ThK - ThH; - } - { - E Tej, Tem, ThL, ThM; - Tej = Td1 - Tdo; - Tem = Tek - Tel; - ri[WS(rs, 62)] = Tej - Tem; - ri[WS(rs, 30)] = Tej + Tem; - ThL = Teh - TdQ; - ThM = ThJ - ThI; - ii[WS(rs, 30)] = ThL + ThM; - ii[WS(rs, 62)] = ThM - ThL; - } - { - E Ter, Tey, Thz, ThE; - Ter = Ten + Teq; - Tey = Teu + Tex; - ri[WS(rs, 38)] = Ter - Tey; - ri[WS(rs, 6)] = Ter + Tey; - Thz = TeA + TeB; - ThE = ThA + ThD; - ii[WS(rs, 6)] = Thz + ThE; - ii[WS(rs, 38)] = ThE - Thz; - } - { - E Tez, TeC, ThF, ThG; - Tez = Ten - Teq; - TeC = TeA - TeB; - ri[WS(rs, 54)] = Tez - TeC; - ri[WS(rs, 22)] = Tez + TeC; - ThF = Tex - Teu; - ThG = ThD - ThA; - ii[WS(rs, 22)] = ThF + ThG; - ii[WS(rs, 54)] = ThG - ThF; - } - } - { - E TeH, Tf9, TeO, Thk, Thp, Thv, Tfc, Thu, Tf3, Tfn, Tf7, Tfj, TeW, Tfm, Tf6; - E Tfg; - { - E TeD, TeG, Tfa, Tfb; - TeD = TcL + TcO; - TeG = KP707106781 * (TeE + TeF); - TeH = TeD - TeG; - Tf9 = TeD + TeG; - { - E TeK, TeN, Thl, Tho; - TeK = FNMS(KP382683432, TeJ, KP923879532 * TeI); - TeN = FMA(KP923879532, TeL, KP382683432 * TeM); - TeO = TeK - TeN; - Thk = TeK + TeN; - Thl = KP707106781 * (TcU + TcZ); - Tho = Thm + Thn; - Thp = Thl + Tho; - Thv = Tho - Thl; - } - Tfa = FMA(KP382683432, TeI, KP923879532 * TeJ); - Tfb = FNMS(KP382683432, TeL, KP923879532 * TeM); - Tfc = Tfa + Tfb; - Thu = Tfb - Tfa; - { - E TeZ, Tfh, Tf2, Tfi, TeY, Tf1; - TeY = KP707106781 * (Tee + Ted); - TeZ = TeX - TeY; - Tfh = TeX + TeY; - Tf1 = KP707106781 * (Te0 + Te5); - Tf2 = Tf0 - Tf1; - Tfi = Tf0 + Tf1; - Tf3 = FNMS(KP831469612, Tf2, KP555570233 * TeZ); - Tfn = FMA(KP195090322, Tfh, KP980785280 * Tfi); - Tf7 = FMA(KP831469612, TeZ, KP555570233 * Tf2); - Tfj = FNMS(KP195090322, Tfi, KP980785280 * Tfh); - } - { - E TeS, Tfe, TeV, Tff, TeR, TeU; - TeR = KP707106781 * (TdE + Tdz); - TeS = TeQ - TeR; - Tfe = TeQ + TeR; - TeU = KP707106781 * (TdM + TdN); - TeV = TeT - TeU; - Tff = TeT + TeU; - TeW = FMA(KP555570233, TeS, KP831469612 * TeV); - Tfm = FNMS(KP195090322, Tfe, KP980785280 * Tff); - Tf6 = FNMS(KP831469612, TeS, KP555570233 * TeV); - Tfg = FMA(KP980785280, Tfe, KP195090322 * Tff); - } - } - { - E TeP, Tf4, Tht, Thw; - TeP = TeH + TeO; - Tf4 = TeW + Tf3; - ri[WS(rs, 42)] = TeP - Tf4; - ri[WS(rs, 10)] = TeP + Tf4; - Tht = Tf6 + Tf7; - Thw = Thu + Thv; - ii[WS(rs, 10)] = Tht + Thw; - ii[WS(rs, 42)] = Thw - Tht; - } - { - E Tf5, Tf8, Thx, Thy; - Tf5 = TeH - TeO; - Tf8 = Tf6 - Tf7; - ri[WS(rs, 58)] = Tf5 - Tf8; - ri[WS(rs, 26)] = Tf5 + Tf8; - Thx = Tf3 - TeW; - Thy = Thv - Thu; - ii[WS(rs, 26)] = Thx + Thy; - ii[WS(rs, 58)] = Thy - Thx; - } - { - E Tfd, Tfk, Thj, Thq; - Tfd = Tf9 + Tfc; - Tfk = Tfg + Tfj; - ri[WS(rs, 34)] = Tfd - Tfk; - ri[WS(rs, 2)] = Tfd + Tfk; - Thj = Tfm + Tfn; - Thq = Thk + Thp; - ii[WS(rs, 2)] = Thj + Thq; - ii[WS(rs, 34)] = Thq - Thj; - } - { - E Tfl, Tfo, Thr, Ths; - Tfl = Tf9 - Tfc; - Tfo = Tfm - Tfn; - ri[WS(rs, 50)] = Tfl - Tfo; - ri[WS(rs, 18)] = Tfl + Tfo; - Thr = Tfj - Tfg; - Ths = Thp - Thk; - ii[WS(rs, 18)] = Thr + Ths; - ii[WS(rs, 50)] = Ths - Thr; - } - } - { - E T6L, T9x, TiD, TiJ, T7E, TiI, T9A, TiA, T8y, T9K, T9u, T9E, T9r, T9L, T9v; - E T9H; - { - E T6n, T6K, TiB, TiC; - T6n = T6b - T6m; - T6K = T6y - T6J; - T6L = T6n - T6K; - T9x = T6n + T6K; - TiB = T9P - T9O; - TiC = Tin - Tim; - TiD = TiB + TiC; - TiJ = TiC - TiB; - } - { - E T7c, T9y, T7D, T9z; - { - E T72, T7b, T7t, T7C; - T72 = T6Q - T71; - T7b = T77 - T7a; - T7c = FNMS(KP980785280, T7b, KP195090322 * T72); - T9y = FMA(KP980785280, T72, KP195090322 * T7b); - T7t = T7h - T7s; - T7C = T7y - T7B; - T7D = FMA(KP195090322, T7t, KP980785280 * T7C); - T9z = FNMS(KP980785280, T7t, KP195090322 * T7C); - } - T7E = T7c - T7D; - TiI = T9z - T9y; - T9A = T9y + T9z; - TiA = T7c + T7D; - } - { - E T8k, T9C, T8x, T9D; - { - E T7W, T8j, T8t, T8w; - T7W = T7K - T7V; - T8j = T87 - T8i; - T8k = T7W - T8j; - T9C = T7W + T8j; - T8t = T8p - T8s; - T8w = T8u - T8v; - T8x = T8t - T8w; - T9D = T8t + T8w; - } - T8y = FMA(KP995184726, T8k, KP098017140 * T8x); - T9K = FNMS(KP634393284, T9D, KP773010453 * T9C); - T9u = FNMS(KP995184726, T8x, KP098017140 * T8k); - T9E = FMA(KP634393284, T9C, KP773010453 * T9D); - } - { - E T9d, T9F, T9q, T9G; - { - E T8P, T9c, T9m, T9p; - T8P = T8D - T8O; - T9c = T90 - T9b; - T9d = T8P - T9c; - T9F = T8P + T9c; - T9m = T9i - T9l; - T9p = T9n - T9o; - T9q = T9m - T9p; - T9G = T9m + T9p; - } - T9r = FNMS(KP995184726, T9q, KP098017140 * T9d); - T9L = FMA(KP773010453, T9G, KP634393284 * T9F); - T9v = FMA(KP098017140, T9q, KP995184726 * T9d); - T9H = FNMS(KP634393284, T9G, KP773010453 * T9F); - } - { - E T7F, T9s, TiH, TiK; - T7F = T6L + T7E; - T9s = T8y + T9r; - ri[WS(rs, 47)] = T7F - T9s; - ri[WS(rs, 15)] = T7F + T9s; - TiH = T9u + T9v; - TiK = TiI + TiJ; - ii[WS(rs, 15)] = TiH + TiK; - ii[WS(rs, 47)] = TiK - TiH; - } - { - E T9t, T9w, TiL, TiM; - T9t = T6L - T7E; - T9w = T9u - T9v; - ri[WS(rs, 63)] = T9t - T9w; - ri[WS(rs, 31)] = T9t + T9w; - TiL = T9r - T8y; - TiM = TiJ - TiI; - ii[WS(rs, 31)] = TiL + TiM; - ii[WS(rs, 63)] = TiM - TiL; - } - { - E T9B, T9I, Tiz, TiE; - T9B = T9x + T9A; - T9I = T9E + T9H; - ri[WS(rs, 39)] = T9B - T9I; - ri[WS(rs, 7)] = T9B + T9I; - Tiz = T9K + T9L; - TiE = TiA + TiD; - ii[WS(rs, 7)] = Tiz + TiE; - ii[WS(rs, 39)] = TiE - Tiz; - } - { - E T9J, T9M, TiF, TiG; - T9J = T9x - T9A; - T9M = T9K - T9L; - ri[WS(rs, 55)] = T9J - T9M; - ri[WS(rs, 23)] = T9J + T9M; - TiF = T9H - T9E; - TiG = TiD - TiA; - ii[WS(rs, 23)] = TiF + TiG; - ii[WS(rs, 55)] = TiG - TiF; - } - } - { - E TaL, TbJ, Ti9, Tif, Tb0, Tie, TbM, Ti6, Tbk, TbW, TbG, TbQ, TbD, TbX, TbH; - E TbT; - { - E TaD, TaK, Ti7, Ti8; - TaD = Taz - TaC; - TaK = TaG - TaJ; - TaL = TaD - TaK; - TbJ = TaD + TaK; - Ti7 = Tc1 - Tc0; - Ti8 = ThT - ThQ; - Ti9 = Ti7 + Ti8; - Tif = Ti8 - Ti7; - } - { - E TaS, TbK, TaZ, TbL; - { - E TaO, TaR, TaV, TaY; - TaO = TaM - TaN; - TaR = TaP - TaQ; - TaS = FNMS(KP831469612, TaR, KP555570233 * TaO); - TbK = FMA(KP555570233, TaR, KP831469612 * TaO); - TaV = TaT - TaU; - TaY = TaW - TaX; - TaZ = FMA(KP831469612, TaV, KP555570233 * TaY); - TbL = FNMS(KP831469612, TaY, KP555570233 * TaV); - } - Tb0 = TaS - TaZ; - Tie = TbL - TbK; - TbM = TbK + TbL; - Ti6 = TaS + TaZ; - } - { - E Tbc, TbO, Tbj, TbP; - { - E Tb4, Tbb, Tbf, Tbi; - Tb4 = Tb2 - Tb3; - Tbb = Tb7 - Tba; - Tbc = Tb4 - Tbb; - TbO = Tb4 + Tbb; - Tbf = Tbd - Tbe; - Tbi = Tbg - Tbh; - Tbj = Tbf - Tbi; - TbP = Tbf + Tbi; - } - Tbk = FMA(KP956940335, Tbc, KP290284677 * Tbj); - TbW = FNMS(KP471396736, TbP, KP881921264 * TbO); - TbG = FNMS(KP956940335, Tbj, KP290284677 * Tbc); - TbQ = FMA(KP471396736, TbO, KP881921264 * TbP); - } - { - E Tbv, TbR, TbC, TbS; - { - E Tbn, Tbu, Tby, TbB; - Tbn = Tbl - Tbm; - Tbu = Tbq - Tbt; - Tbv = Tbn - Tbu; - TbR = Tbn + Tbu; - Tby = Tbw - Tbx; - TbB = Tbz - TbA; - TbC = Tby - TbB; - TbS = Tby + TbB; - } - TbD = FNMS(KP956940335, TbC, KP290284677 * Tbv); - TbX = FMA(KP881921264, TbS, KP471396736 * TbR); - TbH = FMA(KP290284677, TbC, KP956940335 * Tbv); - TbT = FNMS(KP471396736, TbS, KP881921264 * TbR); - } - { - E Tb1, TbE, Tid, Tig; - Tb1 = TaL + Tb0; - TbE = Tbk + TbD; - ri[WS(rs, 45)] = Tb1 - TbE; - ri[WS(rs, 13)] = Tb1 + TbE; - Tid = TbG + TbH; - Tig = Tie + Tif; - ii[WS(rs, 13)] = Tid + Tig; - ii[WS(rs, 45)] = Tig - Tid; - } - { - E TbF, TbI, Tih, Tii; - TbF = TaL - Tb0; - TbI = TbG - TbH; - ri[WS(rs, 61)] = TbF - TbI; - ri[WS(rs, 29)] = TbF + TbI; - Tih = TbD - Tbk; - Tii = Tif - Tie; - ii[WS(rs, 29)] = Tih + Tii; - ii[WS(rs, 61)] = Tii - Tih; - } - { - E TbN, TbU, Ti5, Tia; - TbN = TbJ + TbM; - TbU = TbQ + TbT; - ri[WS(rs, 37)] = TbN - TbU; - ri[WS(rs, 5)] = TbN + TbU; - Ti5 = TbW + TbX; - Tia = Ti6 + Ti9; - ii[WS(rs, 5)] = Ti5 + Tia; - ii[WS(rs, 37)] = Tia - Ti5; - } - { - E TbV, TbY, Tib, Tic; - TbV = TbJ - TbM; - TbY = TbW - TbX; - ri[WS(rs, 53)] = TbV - TbY; - ri[WS(rs, 21)] = TbV + TbY; - Tib = TbT - TbQ; - Tic = Ti9 - Ti6; - ii[WS(rs, 21)] = Tib + Tic; - ii[WS(rs, 53)] = Tic - Tib; - } - } - { - E Tc3, Tcv, ThV, Ti1, Tca, Ti0, Tcy, ThO, Tci, TcI, Tcs, TcC, Tcp, TcJ, Tct; - E TcF; - { - E TbZ, Tc2, ThP, ThU; - TbZ = Taz + TaC; - Tc2 = Tc0 + Tc1; - Tc3 = TbZ - Tc2; - Tcv = TbZ + Tc2; - ThP = TaG + TaJ; - ThU = ThQ + ThT; - ThV = ThP + ThU; - Ti1 = ThU - ThP; - } - { - E Tc6, Tcw, Tc9, Tcx; - { - E Tc4, Tc5, Tc7, Tc8; - Tc4 = TaM + TaN; - Tc5 = TaP + TaQ; - Tc6 = FNMS(KP195090322, Tc5, KP980785280 * Tc4); - Tcw = FMA(KP980785280, Tc5, KP195090322 * Tc4); - Tc7 = TaT + TaU; - Tc8 = TaW + TaX; - Tc9 = FMA(KP195090322, Tc7, KP980785280 * Tc8); - Tcx = FNMS(KP195090322, Tc8, KP980785280 * Tc7); - } - Tca = Tc6 - Tc9; - Ti0 = Tcx - Tcw; - Tcy = Tcw + Tcx; - ThO = Tc6 + Tc9; - } - { - E Tce, TcA, Tch, TcB; - { - E Tcc, Tcd, Tcf, Tcg; - Tcc = Tbd + Tbe; - Tcd = Tba + Tb7; - Tce = Tcc - Tcd; - TcA = Tcc + Tcd; - Tcf = Tb2 + Tb3; - Tcg = Tbg + Tbh; - Tch = Tcf - Tcg; - TcB = Tcf + Tcg; - } - Tci = FMA(KP634393284, Tce, KP773010453 * Tch); - TcI = FNMS(KP098017140, TcA, KP995184726 * TcB); - Tcs = FNMS(KP773010453, Tce, KP634393284 * Tch); - TcC = FMA(KP995184726, TcA, KP098017140 * TcB); - } - { - E Tcl, TcD, Tco, TcE; - { - E Tcj, Tck, Tcm, Tcn; - Tcj = Tbl + Tbm; - Tck = TbA + Tbz; - Tcl = Tcj - Tck; - TcD = Tcj + Tck; - Tcm = Tbw + Tbx; - Tcn = Tbq + Tbt; - Tco = Tcm - Tcn; - TcE = Tcm + Tcn; - } - Tcp = FNMS(KP773010453, Tco, KP634393284 * Tcl); - TcJ = FMA(KP098017140, TcD, KP995184726 * TcE); - Tct = FMA(KP773010453, Tcl, KP634393284 * Tco); - TcF = FNMS(KP098017140, TcE, KP995184726 * TcD); - } - { - E Tcb, Tcq, ThZ, Ti2; - Tcb = Tc3 + Tca; - Tcq = Tci + Tcp; - ri[WS(rs, 41)] = Tcb - Tcq; - ri[WS(rs, 9)] = Tcb + Tcq; - ThZ = Tcs + Tct; - Ti2 = Ti0 + Ti1; - ii[WS(rs, 9)] = ThZ + Ti2; - ii[WS(rs, 41)] = Ti2 - ThZ; - } - { - E Tcr, Tcu, Ti3, Ti4; - Tcr = Tc3 - Tca; - Tcu = Tcs - Tct; - ri[WS(rs, 57)] = Tcr - Tcu; - ri[WS(rs, 25)] = Tcr + Tcu; - Ti3 = Tcp - Tci; - Ti4 = Ti1 - Ti0; - ii[WS(rs, 25)] = Ti3 + Ti4; - ii[WS(rs, 57)] = Ti4 - Ti3; - } - { - E Tcz, TcG, ThN, ThW; - Tcz = Tcv + Tcy; - TcG = TcC + TcF; - ri[WS(rs, 33)] = Tcz - TcG; - ri[WS(rs, 1)] = Tcz + TcG; - ThN = TcI + TcJ; - ThW = ThO + ThV; - ii[WS(rs, 1)] = ThN + ThW; - ii[WS(rs, 33)] = ThW - ThN; - } - { - E TcH, TcK, ThX, ThY; - TcH = Tcv - Tcy; - TcK = TcI - TcJ; - ri[WS(rs, 49)] = TcH - TcK; - ri[WS(rs, 17)] = TcH + TcK; - ThX = TcF - TcC; - ThY = ThV - ThO; - ii[WS(rs, 17)] = ThX + ThY; - ii[WS(rs, 49)] = ThY - ThX; - } - } - { - E T9R, Taj, Tip, Tiv, T9Y, Tiu, Tam, Tik, Ta6, Taw, Tag, Taq, Tad, Tax, Tah; - E Tat; - { - E T9N, T9Q, Til, Tio; - T9N = T6b + T6m; - T9Q = T9O + T9P; - T9R = T9N - T9Q; - Taj = T9N + T9Q; - Til = T6y + T6J; - Tio = Tim + Tin; - Tip = Til + Tio; - Tiv = Tio - Til; - } - { - E T9U, Tak, T9X, Tal; - { - E T9S, T9T, T9V, T9W; - T9S = T6Q + T71; - T9T = T77 + T7a; - T9U = FNMS(KP555570233, T9T, KP831469612 * T9S); - Tak = FMA(KP555570233, T9S, KP831469612 * T9T); - T9V = T7h + T7s; - T9W = T7y + T7B; - T9X = FMA(KP831469612, T9V, KP555570233 * T9W); - Tal = FNMS(KP555570233, T9V, KP831469612 * T9W); - } - T9Y = T9U - T9X; - Tiu = Tal - Tak; - Tam = Tak + Tal; - Tik = T9U + T9X; - } - { - E Ta2, Tao, Ta5, Tap; - { - E Ta0, Ta1, Ta3, Ta4; - Ta0 = T8p + T8s; - Ta1 = T8i + T87; - Ta2 = Ta0 - Ta1; - Tao = Ta0 + Ta1; - Ta3 = T7K + T7V; - Ta4 = T8u + T8v; - Ta5 = Ta3 - Ta4; - Tap = Ta3 + Ta4; - } - Ta6 = FMA(KP471396736, Ta2, KP881921264 * Ta5); - Taw = FNMS(KP290284677, Tao, KP956940335 * Tap); - Tag = FNMS(KP881921264, Ta2, KP471396736 * Ta5); - Taq = FMA(KP956940335, Tao, KP290284677 * Tap); - } - { - E Ta9, Tar, Tac, Tas; - { - E Ta7, Ta8, Taa, Tab; - Ta7 = T8D + T8O; - Ta8 = T9o + T9n; - Ta9 = Ta7 - Ta8; - Tar = Ta7 + Ta8; - Taa = T9i + T9l; - Tab = T90 + T9b; - Tac = Taa - Tab; - Tas = Taa + Tab; - } - Tad = FNMS(KP881921264, Tac, KP471396736 * Ta9); - Tax = FMA(KP290284677, Tar, KP956940335 * Tas); - Tah = FMA(KP881921264, Ta9, KP471396736 * Tac); - Tat = FNMS(KP290284677, Tas, KP956940335 * Tar); - } - { - E T9Z, Tae, Tit, Tiw; - T9Z = T9R + T9Y; - Tae = Ta6 + Tad; - ri[WS(rs, 43)] = T9Z - Tae; - ri[WS(rs, 11)] = T9Z + Tae; - Tit = Tag + Tah; - Tiw = Tiu + Tiv; - ii[WS(rs, 11)] = Tit + Tiw; - ii[WS(rs, 43)] = Tiw - Tit; - } - { - E Taf, Tai, Tix, Tiy; - Taf = T9R - T9Y; - Tai = Tag - Tah; - ri[WS(rs, 59)] = Taf - Tai; - ri[WS(rs, 27)] = Taf + Tai; - Tix = Tad - Ta6; - Tiy = Tiv - Tiu; - ii[WS(rs, 27)] = Tix + Tiy; - ii[WS(rs, 59)] = Tiy - Tix; - } - { - E Tan, Tau, Tij, Tiq; - Tan = Taj + Tam; - Tau = Taq + Tat; - ri[WS(rs, 35)] = Tan - Tau; - ri[WS(rs, 3)] = Tan + Tau; - Tij = Taw + Tax; - Tiq = Tik + Tip; - ii[WS(rs, 3)] = Tij + Tiq; - ii[WS(rs, 35)] = Tiq - Tij; - } - { - E Tav, Tay, Tir, Tis; - Tav = Taj - Tam; - Tay = Taw - Tax; - ri[WS(rs, 51)] = Tav - Tay; - ri[WS(rs, 19)] = Tav + Tay; - Tir = Tat - Taq; - Tis = Tip - Tik; - ii[WS(rs, 19)] = Tir + Tis; - ii[WS(rs, 51)] = Tis - Tir; - } - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_FULL, 0, 64 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 64, "t1_64", twinstr, &GENUS, { 808, 270, 230, 0 }, 0, 0, 0 }; - -void X(codelet_t1_64) (planner *p) { - X(kdft_dit_register) (p, t1_64, &desc); -} -#endif diff --git a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t1_7.c b/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t1_7.c deleted file mode 100644 index d112ce85..00000000 --- a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t1_7.c +++ /dev/null @@ -1,354 +0,0 @@ -/* - * Copyright (c) 2003, 2007-14 Matteo Frigo - * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - */ - -/* This file was automatically generated --- DO NOT EDIT */ -/* Generated on Tue Sep 14 10:44:27 EDT 2021 */ - -#include "dft/codelet-dft.h" - -#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA) - -/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 7 -name t1_7 -include dft/scalar/t.h */ - -/* - * This function contains 72 FP additions, 66 FP multiplications, - * (or, 18 additions, 12 multiplications, 54 fused multiply/add), - * 37 stack variables, 6 constants, and 28 memory accesses - */ -#include "dft/scalar/t.h" - -static void t1_7(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - DK(KP974927912, +0.974927912181823607018131682993931217232785801); - DK(KP900968867, +0.900968867902419126236102319507445051165919162); - DK(KP801937735, +0.801937735804838252472204639014890102331838324); - DK(KP554958132, +0.554958132087371191422194871006410481067288862); - DK(KP692021471, +0.692021471630095869627814897002069140197260599); - DK(KP356895867, +0.356895867892209443894399510021300583399127187); - { - INT m; - for (m = mb, W = W + (mb * 12); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 12, MAKE_VOLATILE_STRIDE(14, rs)) { - E T1, T1c, Te, T1h, TR, T19, Tr, T1g, TM, T1a, TE, T1i, TW, T1b; - T1 = ri[0]; - T1c = ii[0]; - { - E T3, T6, T4, TN, T9, Tc, Ta, TP, T2, T8; - T3 = ri[WS(rs, 1)]; - T6 = ii[WS(rs, 1)]; - T2 = W[0]; - T4 = T2 * T3; - TN = T2 * T6; - T9 = ri[WS(rs, 6)]; - Tc = ii[WS(rs, 6)]; - T8 = W[10]; - Ta = T8 * T9; - TP = T8 * Tc; - { - E T7, TO, Td, TQ, T5, Tb; - T5 = W[1]; - T7 = FMA(T5, T6, T4); - TO = FNMS(T5, T3, TN); - Tb = W[11]; - Td = FMA(Tb, Tc, Ta); - TQ = FNMS(Tb, T9, TP); - Te = T7 + Td; - T1h = Td - T7; - TR = TO - TQ; - T19 = TO + TQ; - } - } - { - E Tg, Tj, Th, TI, Tm, Tp, Tn, TK, Tf, Tl; - Tg = ri[WS(rs, 2)]; - Tj = ii[WS(rs, 2)]; - Tf = W[2]; - Th = Tf * Tg; - TI = Tf * Tj; - Tm = ri[WS(rs, 5)]; - Tp = ii[WS(rs, 5)]; - Tl = W[8]; - Tn = Tl * Tm; - TK = Tl * Tp; - { - E Tk, TJ, Tq, TL, Ti, To; - Ti = W[3]; - Tk = FMA(Ti, Tj, Th); - TJ = FNMS(Ti, Tg, TI); - To = W[9]; - Tq = FMA(To, Tp, Tn); - TL = FNMS(To, Tm, TK); - Tr = Tk + Tq; - T1g = Tq - Tk; - TM = TJ - TL; - T1a = TJ + TL; - } - } - { - E Tt, Tw, Tu, TS, Tz, TC, TA, TU, Ts, Ty; - Tt = ri[WS(rs, 3)]; - Tw = ii[WS(rs, 3)]; - Ts = W[4]; - Tu = Ts * Tt; - TS = Ts * Tw; - Tz = ri[WS(rs, 4)]; - TC = ii[WS(rs, 4)]; - Ty = W[6]; - TA = Ty * Tz; - TU = Ty * TC; - { - E Tx, TT, TD, TV, Tv, TB; - Tv = W[5]; - Tx = FMA(Tv, Tw, Tu); - TT = FNMS(Tv, Tt, TS); - TB = W[7]; - TD = FMA(TB, TC, TA); - TV = FNMS(TB, Tz, TU); - TE = Tx + TD; - T1i = TD - Tx; - TW = TT - TV; - T1b = TT + TV; - } - } - ri[0] = T1 + Te + Tr + TE; - ii[0] = T19 + T1a + T1b + T1c; - { - E TG, TY, TF, TX, TH; - TF = FNMS(KP356895867, Tr, Te); - TG = FNMS(KP692021471, TF, TE); - TX = FMA(KP554958132, TW, TR); - TY = FMA(KP801937735, TX, TM); - TH = FNMS(KP900968867, TG, T1); - ri[WS(rs, 6)] = FNMS(KP974927912, TY, TH); - ri[WS(rs, 1)] = FMA(KP974927912, TY, TH); - } - { - E T1e, T1k, T1d, T1j, T1f; - T1d = FNMS(KP356895867, T1a, T19); - T1e = FNMS(KP692021471, T1d, T1b); - T1j = FMA(KP554958132, T1i, T1h); - T1k = FMA(KP801937735, T1j, T1g); - T1f = FNMS(KP900968867, T1e, T1c); - ii[WS(rs, 1)] = FMA(KP974927912, T1k, T1f); - ii[WS(rs, 6)] = FNMS(KP974927912, T1k, T1f); - } - { - E T10, T13, TZ, T12, T11; - TZ = FNMS(KP356895867, Te, TE); - T10 = FNMS(KP692021471, TZ, Tr); - T12 = FMA(KP554958132, TM, TW); - T13 = FNMS(KP801937735, T12, TR); - T11 = FNMS(KP900968867, T10, T1); - ri[WS(rs, 5)] = FNMS(KP974927912, T13, T11); - ri[WS(rs, 2)] = FMA(KP974927912, T13, T11); - } - { - E T1m, T1p, T1l, T1o, T1n; - T1l = FNMS(KP356895867, T19, T1b); - T1m = FNMS(KP692021471, T1l, T1a); - T1o = FMA(KP554958132, T1g, T1i); - T1p = FNMS(KP801937735, T1o, T1h); - T1n = FNMS(KP900968867, T1m, T1c); - ii[WS(rs, 2)] = FMA(KP974927912, T1p, T1n); - ii[WS(rs, 5)] = FNMS(KP974927912, T1p, T1n); - } - { - E T15, T18, T14, T17, T16; - T14 = FNMS(KP356895867, TE, Tr); - T15 = FNMS(KP692021471, T14, Te); - T17 = FNMS(KP554958132, TR, TM); - T18 = FNMS(KP801937735, T17, TW); - T16 = FNMS(KP900968867, T15, T1); - ri[WS(rs, 4)] = FNMS(KP974927912, T18, T16); - ri[WS(rs, 3)] = FMA(KP974927912, T18, T16); - } - { - E T1r, T1u, T1q, T1t, T1s; - T1q = FNMS(KP356895867, T1b, T1a); - T1r = FNMS(KP692021471, T1q, T19); - T1t = FNMS(KP554958132, T1h, T1g); - T1u = FNMS(KP801937735, T1t, T1i); - T1s = FNMS(KP900968867, T1r, T1c); - ii[WS(rs, 3)] = FMA(KP974927912, T1u, T1s); - ii[WS(rs, 4)] = FNMS(KP974927912, T1u, T1s); - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_FULL, 0, 7 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 7, "t1_7", twinstr, &GENUS, { 18, 12, 54, 0 }, 0, 0, 0 }; - -void X(codelet_t1_7) (planner *p) { - X(kdft_dit_register) (p, t1_7, &desc); -} -#else - -/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 7 -name t1_7 -include dft/scalar/t.h */ - -/* - * This function contains 72 FP additions, 60 FP multiplications, - * (or, 36 additions, 24 multiplications, 36 fused multiply/add), - * 29 stack variables, 6 constants, and 28 memory accesses - */ -#include "dft/scalar/t.h" - -static void t1_7(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - DK(KP222520933, +0.222520933956314404288902564496794759466355569); - DK(KP900968867, +0.900968867902419126236102319507445051165919162); - DK(KP623489801, +0.623489801858733530525004884004239810632274731); - DK(KP433883739, +0.433883739117558120475768332848358754609990728); - DK(KP781831482, +0.781831482468029808708444526674057750232334519); - DK(KP974927912, +0.974927912181823607018131682993931217232785801); - { - INT m; - for (m = mb, W = W + (mb * 12); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 12, MAKE_VOLATILE_STRIDE(14, rs)) { - E T1, TR, Tc, TS, TC, TO, Tn, TT, TI, TP, Ty, TU, TF, TQ; - T1 = ri[0]; - TR = ii[0]; - { - E T6, TA, Tb, TB; - { - E T3, T5, T2, T4; - T3 = ri[WS(rs, 1)]; - T5 = ii[WS(rs, 1)]; - T2 = W[0]; - T4 = W[1]; - T6 = FMA(T2, T3, T4 * T5); - TA = FNMS(T4, T3, T2 * T5); - } - { - E T8, Ta, T7, T9; - T8 = ri[WS(rs, 6)]; - Ta = ii[WS(rs, 6)]; - T7 = W[10]; - T9 = W[11]; - Tb = FMA(T7, T8, T9 * Ta); - TB = FNMS(T9, T8, T7 * Ta); - } - Tc = T6 + Tb; - TS = Tb - T6; - TC = TA - TB; - TO = TA + TB; - } - { - E Th, TG, Tm, TH; - { - E Te, Tg, Td, Tf; - Te = ri[WS(rs, 2)]; - Tg = ii[WS(rs, 2)]; - Td = W[2]; - Tf = W[3]; - Th = FMA(Td, Te, Tf * Tg); - TG = FNMS(Tf, Te, Td * Tg); - } - { - E Tj, Tl, Ti, Tk; - Tj = ri[WS(rs, 5)]; - Tl = ii[WS(rs, 5)]; - Ti = W[8]; - Tk = W[9]; - Tm = FMA(Ti, Tj, Tk * Tl); - TH = FNMS(Tk, Tj, Ti * Tl); - } - Tn = Th + Tm; - TT = Tm - Th; - TI = TG - TH; - TP = TG + TH; - } - { - E Ts, TD, Tx, TE; - { - E Tp, Tr, To, Tq; - Tp = ri[WS(rs, 3)]; - Tr = ii[WS(rs, 3)]; - To = W[4]; - Tq = W[5]; - Ts = FMA(To, Tp, Tq * Tr); - TD = FNMS(Tq, Tp, To * Tr); - } - { - E Tu, Tw, Tt, Tv; - Tu = ri[WS(rs, 4)]; - Tw = ii[WS(rs, 4)]; - Tt = W[6]; - Tv = W[7]; - Tx = FMA(Tt, Tu, Tv * Tw); - TE = FNMS(Tv, Tu, Tt * Tw); - } - Ty = Ts + Tx; - TU = Tx - Ts; - TF = TD - TE; - TQ = TD + TE; - } - ri[0] = T1 + Tc + Tn + Ty; - ii[0] = TO + TP + TQ + TR; - { - E TJ, Tz, TX, TY; - TJ = FNMS(KP781831482, TF, KP974927912 * TC) - (KP433883739 * TI); - Tz = FMA(KP623489801, Ty, T1) + FNMA(KP900968867, Tn, KP222520933 * Tc); - ri[WS(rs, 5)] = Tz - TJ; - ri[WS(rs, 2)] = Tz + TJ; - TX = FNMS(KP781831482, TU, KP974927912 * TS) - (KP433883739 * TT); - TY = FMA(KP623489801, TQ, TR) + FNMA(KP900968867, TP, KP222520933 * TO); - ii[WS(rs, 2)] = TX + TY; - ii[WS(rs, 5)] = TY - TX; - } - { - E TL, TK, TV, TW; - TL = FMA(KP781831482, TC, KP974927912 * TI) + (KP433883739 * TF); - TK = FMA(KP623489801, Tc, T1) + FNMA(KP900968867, Ty, KP222520933 * Tn); - ri[WS(rs, 6)] = TK - TL; - ri[WS(rs, 1)] = TK + TL; - TV = FMA(KP781831482, TS, KP974927912 * TT) + (KP433883739 * TU); - TW = FMA(KP623489801, TO, TR) + FNMA(KP900968867, TQ, KP222520933 * TP); - ii[WS(rs, 1)] = TV + TW; - ii[WS(rs, 6)] = TW - TV; - } - { - E TN, TM, TZ, T10; - TN = FMA(KP433883739, TC, KP974927912 * TF) - (KP781831482 * TI); - TM = FMA(KP623489801, Tn, T1) + FNMA(KP222520933, Ty, KP900968867 * Tc); - ri[WS(rs, 4)] = TM - TN; - ri[WS(rs, 3)] = TM + TN; - TZ = FMA(KP433883739, TS, KP974927912 * TU) - (KP781831482 * TT); - T10 = FMA(KP623489801, TP, TR) + FNMA(KP222520933, TQ, KP900968867 * TO); - ii[WS(rs, 3)] = TZ + T10; - ii[WS(rs, 4)] = T10 - TZ; - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_FULL, 0, 7 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 7, "t1_7", twinstr, &GENUS, { 36, 24, 36, 0 }, 0, 0, 0 }; - -void X(codelet_t1_7) (planner *p) { - X(kdft_dit_register) (p, t1_7, &desc); -} -#endif diff --git a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t1_8.c b/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t1_8.c deleted file mode 100644 index 232a6ca0..00000000 --- a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t1_8.c +++ /dev/null @@ -1,376 +0,0 @@ -/* - * Copyright (c) 2003, 2007-14 Matteo Frigo - * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - */ - -/* This file was automatically generated --- DO NOT EDIT */ -/* Generated on Tue Sep 14 10:44:27 EDT 2021 */ - -#include "dft/codelet-dft.h" - -#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA) - -/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 8 -name t1_8 -include dft/scalar/t.h */ - -/* - * This function contains 66 FP additions, 36 FP multiplications, - * (or, 44 additions, 14 multiplications, 22 fused multiply/add), - * 34 stack variables, 1 constants, and 32 memory accesses - */ -#include "dft/scalar/t.h" - -static void t1_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - DK(KP707106781, +0.707106781186547524400844362104849039284835938); - { - INT m; - for (m = mb, W = W + (mb * 14); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 14, MAKE_VOLATILE_STRIDE(16, rs)) { - E T1, T1m, T7, T1l, Tk, TS, Te, TQ, TF, T14, TL, T16, T12, T17, Ts; - E TX, Ty, TZ, TV, T10; - T1 = ri[0]; - T1m = ii[0]; - { - E T3, T6, T4, T1k, T2, T5; - T3 = ri[WS(rs, 4)]; - T6 = ii[WS(rs, 4)]; - T2 = W[6]; - T4 = T2 * T3; - T1k = T2 * T6; - T5 = W[7]; - T7 = FMA(T5, T6, T4); - T1l = FNMS(T5, T3, T1k); - } - { - E Tg, Tj, Th, TR, Tf, Ti; - Tg = ri[WS(rs, 6)]; - Tj = ii[WS(rs, 6)]; - Tf = W[10]; - Th = Tf * Tg; - TR = Tf * Tj; - Ti = W[11]; - Tk = FMA(Ti, Tj, Th); - TS = FNMS(Ti, Tg, TR); - } - { - E Ta, Td, Tb, TP, T9, Tc; - Ta = ri[WS(rs, 2)]; - Td = ii[WS(rs, 2)]; - T9 = W[2]; - Tb = T9 * Ta; - TP = T9 * Td; - Tc = W[3]; - Te = FMA(Tc, Td, Tb); - TQ = FNMS(Tc, Ta, TP); - } - { - E TB, TE, TC, T13, TH, TK, TI, T15, TA, TG, TD, TJ; - TB = ri[WS(rs, 7)]; - TE = ii[WS(rs, 7)]; - TA = W[12]; - TC = TA * TB; - T13 = TA * TE; - TH = ri[WS(rs, 3)]; - TK = ii[WS(rs, 3)]; - TG = W[4]; - TI = TG * TH; - T15 = TG * TK; - TD = W[13]; - TF = FMA(TD, TE, TC); - T14 = FNMS(TD, TB, T13); - TJ = W[5]; - TL = FMA(TJ, TK, TI); - T16 = FNMS(TJ, TH, T15); - T12 = TF - TL; - T17 = T14 - T16; - } - { - E To, Tr, Tp, TW, Tu, Tx, Tv, TY, Tn, Tt, Tq, Tw; - To = ri[WS(rs, 1)]; - Tr = ii[WS(rs, 1)]; - Tn = W[0]; - Tp = Tn * To; - TW = Tn * Tr; - Tu = ri[WS(rs, 5)]; - Tx = ii[WS(rs, 5)]; - Tt = W[8]; - Tv = Tt * Tu; - TY = Tt * Tx; - Tq = W[1]; - Ts = FMA(Tq, Tr, Tp); - TX = FNMS(Tq, To, TW); - Tw = W[9]; - Ty = FMA(Tw, Tx, Tv); - TZ = FNMS(Tw, Tu, TY); - TV = Ts - Ty; - T10 = TX - TZ; - } - { - E TU, T1a, T1t, T1v, T19, T1w, T1d, T1u; - { - E TO, TT, T1r, T1s; - TO = T1 - T7; - TT = TQ - TS; - TU = TO + TT; - T1a = TO - TT; - T1r = T1m - T1l; - T1s = Te - Tk; - T1t = T1r - T1s; - T1v = T1s + T1r; - } - { - E T11, T18, T1b, T1c; - T11 = TV + T10; - T18 = T12 - T17; - T19 = T11 + T18; - T1w = T18 - T11; - T1b = T10 - TV; - T1c = T12 + T17; - T1d = T1b - T1c; - T1u = T1b + T1c; - } - ri[WS(rs, 5)] = FNMS(KP707106781, T19, TU); - ii[WS(rs, 5)] = FNMS(KP707106781, T1u, T1t); - ri[WS(rs, 1)] = FMA(KP707106781, T19, TU); - ii[WS(rs, 1)] = FMA(KP707106781, T1u, T1t); - ri[WS(rs, 7)] = FNMS(KP707106781, T1d, T1a); - ii[WS(rs, 7)] = FNMS(KP707106781, T1w, T1v); - ri[WS(rs, 3)] = FMA(KP707106781, T1d, T1a); - ii[WS(rs, 3)] = FMA(KP707106781, T1w, T1v); - } - { - E Tm, T1e, T1o, T1q, TN, T1p, T1h, T1i; - { - E T8, Tl, T1j, T1n; - T8 = T1 + T7; - Tl = Te + Tk; - Tm = T8 + Tl; - T1e = T8 - Tl; - T1j = TQ + TS; - T1n = T1l + T1m; - T1o = T1j + T1n; - T1q = T1n - T1j; - } - { - E Tz, TM, T1f, T1g; - Tz = Ts + Ty; - TM = TF + TL; - TN = Tz + TM; - T1p = TM - Tz; - T1f = TX + TZ; - T1g = T14 + T16; - T1h = T1f - T1g; - T1i = T1f + T1g; - } - ri[WS(rs, 4)] = Tm - TN; - ii[WS(rs, 4)] = T1o - T1i; - ri[0] = Tm + TN; - ii[0] = T1i + T1o; - ri[WS(rs, 6)] = T1e - T1h; - ii[WS(rs, 6)] = T1q - T1p; - ri[WS(rs, 2)] = T1e + T1h; - ii[WS(rs, 2)] = T1p + T1q; - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_FULL, 0, 8 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 8, "t1_8", twinstr, &GENUS, { 44, 14, 22, 0 }, 0, 0, 0 }; - -void X(codelet_t1_8) (planner *p) { - X(kdft_dit_register) (p, t1_8, &desc); -} -#else - -/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 8 -name t1_8 -include dft/scalar/t.h */ - -/* - * This function contains 66 FP additions, 32 FP multiplications, - * (or, 52 additions, 18 multiplications, 14 fused multiply/add), - * 28 stack variables, 1 constants, and 32 memory accesses - */ -#include "dft/scalar/t.h" - -static void t1_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - DK(KP707106781, +0.707106781186547524400844362104849039284835938); - { - INT m; - for (m = mb, W = W + (mb * 14); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 14, MAKE_VOLATILE_STRIDE(16, rs)) { - E T7, T1e, TH, T19, TF, T13, TR, TU, Ti, T1f, TK, T16, Tu, T12, TM; - E TP; - { - E T1, T18, T6, T17; - T1 = ri[0]; - T18 = ii[0]; - { - E T3, T5, T2, T4; - T3 = ri[WS(rs, 4)]; - T5 = ii[WS(rs, 4)]; - T2 = W[6]; - T4 = W[7]; - T6 = FMA(T2, T3, T4 * T5); - T17 = FNMS(T4, T3, T2 * T5); - } - T7 = T1 + T6; - T1e = T18 - T17; - TH = T1 - T6; - T19 = T17 + T18; - } - { - E Tz, TS, TE, TT; - { - E Tw, Ty, Tv, Tx; - Tw = ri[WS(rs, 7)]; - Ty = ii[WS(rs, 7)]; - Tv = W[12]; - Tx = W[13]; - Tz = FMA(Tv, Tw, Tx * Ty); - TS = FNMS(Tx, Tw, Tv * Ty); - } - { - E TB, TD, TA, TC; - TB = ri[WS(rs, 3)]; - TD = ii[WS(rs, 3)]; - TA = W[4]; - TC = W[5]; - TE = FMA(TA, TB, TC * TD); - TT = FNMS(TC, TB, TA * TD); - } - TF = Tz + TE; - T13 = TS + TT; - TR = Tz - TE; - TU = TS - TT; - } - { - E Tc, TI, Th, TJ; - { - E T9, Tb, T8, Ta; - T9 = ri[WS(rs, 2)]; - Tb = ii[WS(rs, 2)]; - T8 = W[2]; - Ta = W[3]; - Tc = FMA(T8, T9, Ta * Tb); - TI = FNMS(Ta, T9, T8 * Tb); - } - { - E Te, Tg, Td, Tf; - Te = ri[WS(rs, 6)]; - Tg = ii[WS(rs, 6)]; - Td = W[10]; - Tf = W[11]; - Th = FMA(Td, Te, Tf * Tg); - TJ = FNMS(Tf, Te, Td * Tg); - } - Ti = Tc + Th; - T1f = Tc - Th; - TK = TI - TJ; - T16 = TI + TJ; - } - { - E To, TN, Tt, TO; - { - E Tl, Tn, Tk, Tm; - Tl = ri[WS(rs, 1)]; - Tn = ii[WS(rs, 1)]; - Tk = W[0]; - Tm = W[1]; - To = FMA(Tk, Tl, Tm * Tn); - TN = FNMS(Tm, Tl, Tk * Tn); - } - { - E Tq, Ts, Tp, Tr; - Tq = ri[WS(rs, 5)]; - Ts = ii[WS(rs, 5)]; - Tp = W[8]; - Tr = W[9]; - Tt = FMA(Tp, Tq, Tr * Ts); - TO = FNMS(Tr, Tq, Tp * Ts); - } - Tu = To + Tt; - T12 = TN + TO; - TM = To - Tt; - TP = TN - TO; - } - { - E Tj, TG, T1b, T1c; - Tj = T7 + Ti; - TG = Tu + TF; - ri[WS(rs, 4)] = Tj - TG; - ri[0] = Tj + TG; - { - E T15, T1a, T11, T14; - T15 = T12 + T13; - T1a = T16 + T19; - ii[0] = T15 + T1a; - ii[WS(rs, 4)] = T1a - T15; - T11 = T7 - Ti; - T14 = T12 - T13; - ri[WS(rs, 6)] = T11 - T14; - ri[WS(rs, 2)] = T11 + T14; - } - T1b = TF - Tu; - T1c = T19 - T16; - ii[WS(rs, 2)] = T1b + T1c; - ii[WS(rs, 6)] = T1c - T1b; - { - E TX, T1g, T10, T1d, TY, TZ; - TX = TH - TK; - T1g = T1e - T1f; - TY = TP - TM; - TZ = TR + TU; - T10 = KP707106781 * (TY - TZ); - T1d = KP707106781 * (TY + TZ); - ri[WS(rs, 7)] = TX - T10; - ii[WS(rs, 5)] = T1g - T1d; - ri[WS(rs, 3)] = TX + T10; - ii[WS(rs, 1)] = T1d + T1g; - } - { - E TL, T1i, TW, T1h, TQ, TV; - TL = TH + TK; - T1i = T1f + T1e; - TQ = TM + TP; - TV = TR - TU; - TW = KP707106781 * (TQ + TV); - T1h = KP707106781 * (TV - TQ); - ri[WS(rs, 5)] = TL - TW; - ii[WS(rs, 7)] = T1i - T1h; - ri[WS(rs, 1)] = TL + TW; - ii[WS(rs, 3)] = T1h + T1i; - } - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_FULL, 0, 8 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 8, "t1_8", twinstr, &GENUS, { 52, 18, 14, 0 }, 0, 0, 0 }; - -void X(codelet_t1_8) (planner *p) { - X(kdft_dit_register) (p, t1_8, &desc); -} -#endif diff --git a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t1_9.c b/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t1_9.c deleted file mode 100644 index 805fa06b..00000000 --- a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t1_9.c +++ /dev/null @@ -1,487 +0,0 @@ -/* - * Copyright (c) 2003, 2007-14 Matteo Frigo - * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - */ - -/* This file was automatically generated --- DO NOT EDIT */ -/* Generated on Tue Sep 14 10:44:27 EDT 2021 */ - -#include "dft/codelet-dft.h" - -#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA) - -/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 9 -name t1_9 -include dft/scalar/t.h */ - -/* - * This function contains 96 FP additions, 88 FP multiplications, - * (or, 24 additions, 16 multiplications, 72 fused multiply/add), - * 55 stack variables, 10 constants, and 36 memory accesses - */ -#include "dft/scalar/t.h" - -static void t1_9(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - DK(KP852868531, +0.852868531952443209628250963940074071936020296); - DK(KP492403876, +0.492403876506104029683371512294761506835321626); - DK(KP984807753, +0.984807753012208059366743024589523013670643252); - DK(KP954188894, +0.954188894138671133499268364187245676532219158); - DK(KP363970234, +0.363970234266202361351047882776834043890471784); - DK(KP777861913, +0.777861913430206160028177977318626690410586096); - DK(KP839099631, +0.839099631177280011763127298123181364687434283); - DK(KP176326980, +0.176326980708464973471090386868618986121633062); - DK(KP866025403, +0.866025403784438646763723170752936183471402627); - DK(KP500000000, +0.500000000000000000000000000000000000000000000); - { - INT m; - for (m = mb, W = W + (mb * 16); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 16, MAKE_VOLATILE_STRIDE(18, rs)) { - E T1, T1R, Te, T1W, T10, T1Q, T1l, T1r, Ty, T1p, Tl, T1o, T1g, T1q, T1a; - E T1d, TS, T18, TF, T13, T19, T1c; - T1 = ri[0]; - T1R = ii[0]; - { - E T3, T6, T4, TW, T9, Tc, Ta, TY, T2, T8; - T3 = ri[WS(rs, 3)]; - T6 = ii[WS(rs, 3)]; - T2 = W[4]; - T4 = T2 * T3; - TW = T2 * T6; - T9 = ri[WS(rs, 6)]; - Tc = ii[WS(rs, 6)]; - T8 = W[10]; - Ta = T8 * T9; - TY = T8 * Tc; - { - E T7, TX, Td, TZ, T5, Tb; - T5 = W[5]; - T7 = FMA(T5, T6, T4); - TX = FNMS(T5, T3, TW); - Tb = W[11]; - Td = FMA(Tb, Tc, Ta); - TZ = FNMS(Tb, T9, TY); - Te = T7 + Td; - T1W = Td - T7; - T10 = TX - TZ; - T1Q = TX + TZ; - } - } - { - E Th, Tk, Ti, T1n, Tx, T1i, Tr, T1k, Tg, Tj; - Th = ri[WS(rs, 1)]; - Tk = ii[WS(rs, 1)]; - Tg = W[0]; - Ti = Tg * Th; - T1n = Tg * Tk; - { - E Tt, Tw, Tu, T1h, Ts, Tv; - Tt = ri[WS(rs, 7)]; - Tw = ii[WS(rs, 7)]; - Ts = W[12]; - Tu = Ts * Tt; - T1h = Ts * Tw; - Tv = W[13]; - Tx = FMA(Tv, Tw, Tu); - T1i = FNMS(Tv, Tt, T1h); - } - { - E Tn, Tq, To, T1j, Tm, Tp; - Tn = ri[WS(rs, 4)]; - Tq = ii[WS(rs, 4)]; - Tm = W[6]; - To = Tm * Tn; - T1j = Tm * Tq; - Tp = W[7]; - Tr = FMA(Tp, Tq, To); - T1k = FNMS(Tp, Tn, T1j); - } - T1l = T1i - T1k; - T1r = Tr - Tx; - Ty = Tr + Tx; - T1p = T1k + T1i; - Tj = W[1]; - Tl = FMA(Tj, Tk, Ti); - T1o = FNMS(Tj, Th, T1n); - T1g = FNMS(KP500000000, Ty, Tl); - T1q = FNMS(KP500000000, T1p, T1o); - } - { - E TB, TE, TC, T12, TR, T17, TL, T15, TA, TD; - TB = ri[WS(rs, 2)]; - TE = ii[WS(rs, 2)]; - TA = W[2]; - TC = TA * TB; - T12 = TA * TE; - { - E TN, TQ, TO, T16, TM, TP; - TN = ri[WS(rs, 8)]; - TQ = ii[WS(rs, 8)]; - TM = W[14]; - TO = TM * TN; - T16 = TM * TQ; - TP = W[15]; - TR = FMA(TP, TQ, TO); - T17 = FNMS(TP, TN, T16); - } - { - E TH, TK, TI, T14, TG, TJ; - TH = ri[WS(rs, 5)]; - TK = ii[WS(rs, 5)]; - TG = W[8]; - TI = TG * TH; - T14 = TG * TK; - TJ = W[9]; - TL = FMA(TJ, TK, TI); - T15 = FNMS(TJ, TH, T14); - } - T1a = TR - TL; - T1d = T15 - T17; - TS = TL + TR; - T18 = T15 + T17; - TD = W[3]; - TF = FMA(TD, TE, TC); - T13 = FNMS(TD, TB, T12); - T19 = FNMS(KP500000000, T18, T13); - T1c = FNMS(KP500000000, TS, TF); - } - { - E Tf, T1S, TU, T1U, T1O, T1P, T1L, T1T; - Tf = T1 + Te; - T1S = T1Q + T1R; - { - E Tz, TT, T1M, T1N; - Tz = Tl + Ty; - TT = TF + TS; - TU = Tz + TT; - T1U = TT - Tz; - T1M = T1o + T1p; - T1N = T13 + T18; - T1O = T1M - T1N; - T1P = T1M + T1N; - } - ri[0] = Tf + TU; - ii[0] = T1P + T1S; - T1L = FNMS(KP500000000, TU, Tf); - ri[WS(rs, 6)] = FNMS(KP866025403, T1O, T1L); - ri[WS(rs, 3)] = FMA(KP866025403, T1O, T1L); - T1T = FNMS(KP500000000, T1P, T1S); - ii[WS(rs, 3)] = FMA(KP866025403, T1U, T1T); - ii[WS(rs, 6)] = FNMS(KP866025403, T1U, T1T); - } - { - E T11, T1z, T1X, T21, T1f, T1w, T1t, T1x, T1u, T1Y, T1C, T1I, T1F, T1J, T1G; - E T22, TV, T1V; - TV = FNMS(KP500000000, Te, T1); - T11 = FMA(KP866025403, T10, TV); - T1z = FNMS(KP866025403, T10, TV); - T1V = FNMS(KP500000000, T1Q, T1R); - T1X = FMA(KP866025403, T1W, T1V); - T21 = FNMS(KP866025403, T1W, T1V); - { - E T1b, T1e, T1m, T1s; - T1b = FMA(KP866025403, T1a, T19); - T1e = FMA(KP866025403, T1d, T1c); - T1f = FMA(KP176326980, T1e, T1b); - T1w = FNMS(KP176326980, T1b, T1e); - T1m = FNMS(KP866025403, T1l, T1g); - T1s = FNMS(KP866025403, T1r, T1q); - T1t = FMA(KP839099631, T1s, T1m); - T1x = FNMS(KP839099631, T1m, T1s); - } - T1u = FMA(KP777861913, T1t, T1f); - T1Y = FNMS(KP777861913, T1x, T1w); - { - E T1A, T1B, T1D, T1E; - T1A = FMA(KP866025403, T1r, T1q); - T1B = FMA(KP866025403, T1l, T1g); - T1C = FMA(KP176326980, T1B, T1A); - T1I = FNMS(KP176326980, T1A, T1B); - T1D = FNMS(KP866025403, T1d, T1c); - T1E = FNMS(KP866025403, T1a, T19); - T1F = FNMS(KP363970234, T1E, T1D); - T1J = FMA(KP363970234, T1D, T1E); - } - T1G = FNMS(KP954188894, T1F, T1C); - T22 = FMA(KP954188894, T1J, T1I); - ri[WS(rs, 1)] = FMA(KP984807753, T1u, T11); - ii[WS(rs, 1)] = FNMS(KP984807753, T1Y, T1X); - ri[WS(rs, 2)] = FMA(KP984807753, T1G, T1z); - ii[WS(rs, 2)] = FNMS(KP984807753, T22, T21); - { - E T1v, T1y, T1Z, T20; - T1v = FNMS(KP492403876, T1u, T11); - T1y = FMA(KP777861913, T1x, T1w); - ri[WS(rs, 4)] = FMA(KP852868531, T1y, T1v); - ri[WS(rs, 7)] = FNMS(KP852868531, T1y, T1v); - T1Z = FMA(KP492403876, T1Y, T1X); - T20 = FNMS(KP777861913, T1t, T1f); - ii[WS(rs, 4)] = FMA(KP852868531, T20, T1Z); - ii[WS(rs, 7)] = FNMS(KP852868531, T20, T1Z); - } - { - E T1H, T1K, T23, T24; - T1H = FNMS(KP492403876, T1G, T1z); - T1K = FNMS(KP954188894, T1J, T1I); - ri[WS(rs, 5)] = FNMS(KP852868531, T1K, T1H); - ri[WS(rs, 8)] = FMA(KP852868531, T1K, T1H); - T23 = FMA(KP492403876, T22, T21); - T24 = FMA(KP954188894, T1F, T1C); - ii[WS(rs, 5)] = FNMS(KP852868531, T24, T23); - ii[WS(rs, 8)] = FMA(KP852868531, T24, T23); - } - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_FULL, 0, 9 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 9, "t1_9", twinstr, &GENUS, { 24, 16, 72, 0 }, 0, 0, 0 }; - -void X(codelet_t1_9) (planner *p) { - X(kdft_dit_register) (p, t1_9, &desc); -} -#else - -/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 9 -name t1_9 -include dft/scalar/t.h */ - -/* - * This function contains 96 FP additions, 72 FP multiplications, - * (or, 60 additions, 36 multiplications, 36 fused multiply/add), - * 41 stack variables, 8 constants, and 36 memory accesses - */ -#include "dft/scalar/t.h" - -static void t1_9(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - DK(KP939692620, +0.939692620785908384054109277324731469936208134); - DK(KP342020143, +0.342020143325668733044099614682259580763083368); - DK(KP984807753, +0.984807753012208059366743024589523013670643252); - DK(KP173648177, +0.173648177666930348851716626769314796000375677); - DK(KP642787609, +0.642787609686539326322643409907263432907559884); - DK(KP766044443, +0.766044443118978035202392650555416673935832457); - DK(KP500000000, +0.500000000000000000000000000000000000000000000); - DK(KP866025403, +0.866025403784438646763723170752936183471402627); - { - INT m; - for (m = mb, W = W + (mb * 16); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 16, MAKE_VOLATILE_STRIDE(18, rs)) { - E T1, T1B, TQ, T1G, Tc, TN, T1A, T1H, TL, T1x, T17, T1o, T1c, T1n, Tu; - E T1w, TW, T1k, T11, T1l; - { - E T6, TO, Tb, TP; - T1 = ri[0]; - T1B = ii[0]; - { - E T3, T5, T2, T4; - T3 = ri[WS(rs, 3)]; - T5 = ii[WS(rs, 3)]; - T2 = W[4]; - T4 = W[5]; - T6 = FMA(T2, T3, T4 * T5); - TO = FNMS(T4, T3, T2 * T5); - } - { - E T8, Ta, T7, T9; - T8 = ri[WS(rs, 6)]; - Ta = ii[WS(rs, 6)]; - T7 = W[10]; - T9 = W[11]; - Tb = FMA(T7, T8, T9 * Ta); - TP = FNMS(T9, T8, T7 * Ta); - } - TQ = KP866025403 * (TO - TP); - T1G = KP866025403 * (Tb - T6); - Tc = T6 + Tb; - TN = FNMS(KP500000000, Tc, T1); - T1A = TO + TP; - T1H = FNMS(KP500000000, T1A, T1B); - } - { - E Tz, T19, TE, T14, TJ, T15, TK, T1a; - { - E Tw, Ty, Tv, Tx; - Tw = ri[WS(rs, 2)]; - Ty = ii[WS(rs, 2)]; - Tv = W[2]; - Tx = W[3]; - Tz = FMA(Tv, Tw, Tx * Ty); - T19 = FNMS(Tx, Tw, Tv * Ty); - } - { - E TB, TD, TA, TC; - TB = ri[WS(rs, 5)]; - TD = ii[WS(rs, 5)]; - TA = W[8]; - TC = W[9]; - TE = FMA(TA, TB, TC * TD); - T14 = FNMS(TC, TB, TA * TD); - } - { - E TG, TI, TF, TH; - TG = ri[WS(rs, 8)]; - TI = ii[WS(rs, 8)]; - TF = W[14]; - TH = W[15]; - TJ = FMA(TF, TG, TH * TI); - T15 = FNMS(TH, TG, TF * TI); - } - TK = TE + TJ; - T1a = T14 + T15; - TL = Tz + TK; - T1x = T19 + T1a; - { - E T13, T16, T18, T1b; - T13 = FNMS(KP500000000, TK, Tz); - T16 = KP866025403 * (T14 - T15); - T17 = T13 + T16; - T1o = T13 - T16; - T18 = KP866025403 * (TJ - TE); - T1b = FNMS(KP500000000, T1a, T19); - T1c = T18 + T1b; - T1n = T1b - T18; - } - } - { - E Ti, TY, Tn, TT, Ts, TU, Tt, TZ; - { - E Tf, Th, Te, Tg; - Tf = ri[WS(rs, 1)]; - Th = ii[WS(rs, 1)]; - Te = W[0]; - Tg = W[1]; - Ti = FMA(Te, Tf, Tg * Th); - TY = FNMS(Tg, Tf, Te * Th); - } - { - E Tk, Tm, Tj, Tl; - Tk = ri[WS(rs, 4)]; - Tm = ii[WS(rs, 4)]; - Tj = W[6]; - Tl = W[7]; - Tn = FMA(Tj, Tk, Tl * Tm); - TT = FNMS(Tl, Tk, Tj * Tm); - } - { - E Tp, Tr, To, Tq; - Tp = ri[WS(rs, 7)]; - Tr = ii[WS(rs, 7)]; - To = W[12]; - Tq = W[13]; - Ts = FMA(To, Tp, Tq * Tr); - TU = FNMS(Tq, Tp, To * Tr); - } - Tt = Tn + Ts; - TZ = TT + TU; - Tu = Ti + Tt; - T1w = TY + TZ; - { - E TS, TV, TX, T10; - TS = FNMS(KP500000000, Tt, Ti); - TV = KP866025403 * (TT - TU); - TW = TS + TV; - T1k = TS - TV; - TX = KP866025403 * (Ts - Tn); - T10 = FNMS(KP500000000, TZ, TY); - T11 = TX + T10; - T1l = T10 - TX; - } - } - { - E T1y, Td, TM, T1v; - T1y = KP866025403 * (T1w - T1x); - Td = T1 + Tc; - TM = Tu + TL; - T1v = FNMS(KP500000000, TM, Td); - ri[0] = Td + TM; - ri[WS(rs, 3)] = T1v + T1y; - ri[WS(rs, 6)] = T1v - T1y; - } - { - E T1D, T1z, T1C, T1E; - T1D = KP866025403 * (TL - Tu); - T1z = T1w + T1x; - T1C = T1A + T1B; - T1E = FNMS(KP500000000, T1z, T1C); - ii[0] = T1z + T1C; - ii[WS(rs, 6)] = T1E - T1D; - ii[WS(rs, 3)] = T1D + T1E; - } - { - E TR, T1I, T1e, T1J, T1i, T1F, T1f, T1K; - TR = TN + TQ; - T1I = T1G + T1H; - { - E T12, T1d, T1g, T1h; - T12 = FMA(KP766044443, TW, KP642787609 * T11); - T1d = FMA(KP173648177, T17, KP984807753 * T1c); - T1e = T12 + T1d; - T1J = KP866025403 * (T1d - T12); - T1g = FNMS(KP642787609, TW, KP766044443 * T11); - T1h = FNMS(KP984807753, T17, KP173648177 * T1c); - T1i = KP866025403 * (T1g - T1h); - T1F = T1g + T1h; - } - ri[WS(rs, 1)] = TR + T1e; - ii[WS(rs, 1)] = T1F + T1I; - T1f = FNMS(KP500000000, T1e, TR); - ri[WS(rs, 7)] = T1f - T1i; - ri[WS(rs, 4)] = T1f + T1i; - T1K = FNMS(KP500000000, T1F, T1I); - ii[WS(rs, 4)] = T1J + T1K; - ii[WS(rs, 7)] = T1K - T1J; - } - { - E T1j, T1M, T1q, T1N, T1u, T1L, T1r, T1O; - T1j = TN - TQ; - T1M = T1H - T1G; - { - E T1m, T1p, T1s, T1t; - T1m = FMA(KP173648177, T1k, KP984807753 * T1l); - T1p = FNMS(KP939692620, T1o, KP342020143 * T1n); - T1q = T1m + T1p; - T1N = KP866025403 * (T1p - T1m); - T1s = FNMS(KP984807753, T1k, KP173648177 * T1l); - T1t = FMA(KP342020143, T1o, KP939692620 * T1n); - T1u = KP866025403 * (T1s + T1t); - T1L = T1s - T1t; - } - ri[WS(rs, 2)] = T1j + T1q; - ii[WS(rs, 2)] = T1L + T1M; - T1r = FNMS(KP500000000, T1q, T1j); - ri[WS(rs, 8)] = T1r - T1u; - ri[WS(rs, 5)] = T1r + T1u; - T1O = FNMS(KP500000000, T1L, T1M); - ii[WS(rs, 5)] = T1N + T1O; - ii[WS(rs, 8)] = T1O - T1N; - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_FULL, 0, 9 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 9, "t1_9", twinstr, &GENUS, { 60, 36, 36, 0 }, 0, 0, 0 }; - -void X(codelet_t1_9) (planner *p) { - X(kdft_dit_register) (p, t1_9, &desc); -} -#endif diff --git a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t2_10.c b/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t2_10.c deleted file mode 100644 index ba0f0205..00000000 --- a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t2_10.c +++ /dev/null @@ -1,509 +0,0 @@ -/* - * Copyright (c) 2003, 2007-14 Matteo Frigo - * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - */ - -/* This file was automatically generated --- DO NOT EDIT */ -/* Generated on Tue Sep 14 10:44:37 EDT 2021 */ - -#include "dft/codelet-dft.h" - -#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA) - -/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 10 -name t2_10 -include dft/scalar/t.h */ - -/* - * This function contains 114 FP additions, 94 FP multiplications, - * (or, 48 additions, 28 multiplications, 66 fused multiply/add), - * 63 stack variables, 4 constants, and 40 memory accesses - */ -#include "dft/scalar/t.h" - -static void t2_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - DK(KP951056516, +0.951056516295153572116439333379382143405698634); - DK(KP559016994, +0.559016994374947424102293417182819058860154590); - DK(KP618033988, +0.618033988749894848204586834365638117720309180); - DK(KP250000000, +0.250000000000000000000000000000000000000000000); - { - INT m; - for (m = mb, W = W + (mb * 6); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 6, MAKE_VOLATILE_STRIDE(20, rs)) { - E T2, T3, T8, Tc, T5, T6, Tl, T7, TB, TF, T12, TY, To, Ts, Tw; - E Tb, Td, Th; - { - E TA, TX, TE, T11, Ta, T4; - T2 = W[0]; - T3 = W[2]; - T4 = T2 * T3; - T8 = W[4]; - TA = T2 * T8; - TX = T3 * T8; - Tc = W[5]; - TE = T2 * Tc; - T11 = T3 * Tc; - T5 = W[1]; - T6 = W[3]; - Ta = T2 * T6; - Tl = FMA(T5, T6, T4); - T7 = FNMS(T5, T6, T4); - TB = FMA(T5, Tc, TA); - TF = FNMS(T5, T8, TE); - T12 = FNMS(T6, T8, T11); - TY = FMA(T6, Tc, TX); - { - E Tr, Tv, T9, Tg; - Tr = Tl * T8; - Tv = Tl * Tc; - To = FNMS(T5, T3, Ta); - Ts = FMA(To, Tc, Tr); - Tw = FNMS(To, T8, Tv); - T9 = T7 * T8; - Tg = T7 * Tc; - Tb = FMA(T5, T3, Ta); - Td = FMA(Tb, Tc, T9); - Th = FNMS(Tb, T8, Tg); - } - } - { - E Tk, T1c, T24, T2d, TW, T19, T1a, T1P, T1Q, T1Z, T1g, T1h, T1i, T1C, T1H; - E T2f, Tz, TM, TN, T1S, T1T, T1Y, T1d, T1e, T1f, T1r, T1w, T2e; - { - E T1, T23, Te, Tf, Ti, T21, Tj, T22; - T1 = ri[0]; - T23 = ii[0]; - Te = ri[WS(rs, 5)]; - Tf = Td * Te; - Ti = ii[WS(rs, 5)]; - T21 = Td * Ti; - Tj = FMA(Th, Ti, Tf); - Tk = T1 - Tj; - T1c = T1 + Tj; - T22 = FNMS(Th, Te, T21); - T24 = T22 + T23; - T2d = T23 - T22; - } - { - E TR, T1z, T18, T1G, TV, T1B, T14, T1E; - { - E TO, TP, TQ, T1y; - TO = ri[WS(rs, 4)]; - TP = T7 * TO; - TQ = ii[WS(rs, 4)]; - T1y = T7 * TQ; - TR = FMA(Tb, TQ, TP); - T1z = FNMS(Tb, TO, T1y); - } - { - E T15, T16, T17, T1F; - T15 = ri[WS(rs, 1)]; - T16 = T2 * T15; - T17 = ii[WS(rs, 1)]; - T1F = T2 * T17; - T18 = FMA(T5, T17, T16); - T1G = FNMS(T5, T15, T1F); - } - { - E TS, TT, TU, T1A; - TS = ri[WS(rs, 9)]; - TT = T8 * TS; - TU = ii[WS(rs, 9)]; - T1A = T8 * TU; - TV = FMA(Tc, TU, TT); - T1B = FNMS(Tc, TS, T1A); - } - { - E TZ, T10, T13, T1D; - TZ = ri[WS(rs, 6)]; - T10 = TY * TZ; - T13 = ii[WS(rs, 6)]; - T1D = TY * T13; - T14 = FMA(T12, T13, T10); - T1E = FNMS(T12, TZ, T1D); - } - TW = TR - TV; - T19 = T14 - T18; - T1a = TW + T19; - T1P = T1z + T1B; - T1Q = T1E + T1G; - T1Z = T1P + T1Q; - T1g = TR + TV; - T1h = T14 + T18; - T1i = T1g + T1h; - T1C = T1z - T1B; - T1H = T1E - T1G; - T2f = T1C + T1H; - } - { - E Tq, T1o, TL, T1v, Ty, T1q, TH, T1t; - { - E Tm, Tn, Tp, T1n; - Tm = ri[WS(rs, 2)]; - Tn = Tl * Tm; - Tp = ii[WS(rs, 2)]; - T1n = Tl * Tp; - Tq = FMA(To, Tp, Tn); - T1o = FNMS(To, Tm, T1n); - } - { - E TI, TJ, TK, T1u; - TI = ri[WS(rs, 3)]; - TJ = T3 * TI; - TK = ii[WS(rs, 3)]; - T1u = T3 * TK; - TL = FMA(T6, TK, TJ); - T1v = FNMS(T6, TI, T1u); - } - { - E Tt, Tu, Tx, T1p; - Tt = ri[WS(rs, 7)]; - Tu = Ts * Tt; - Tx = ii[WS(rs, 7)]; - T1p = Ts * Tx; - Ty = FMA(Tw, Tx, Tu); - T1q = FNMS(Tw, Tt, T1p); - } - { - E TC, TD, TG, T1s; - TC = ri[WS(rs, 8)]; - TD = TB * TC; - TG = ii[WS(rs, 8)]; - T1s = TB * TG; - TH = FMA(TF, TG, TD); - T1t = FNMS(TF, TC, T1s); - } - Tz = Tq - Ty; - TM = TH - TL; - TN = Tz + TM; - T1S = T1o + T1q; - T1T = T1t + T1v; - T1Y = T1S + T1T; - T1d = Tq + Ty; - T1e = TH + TL; - T1f = T1d + T1e; - T1r = T1o - T1q; - T1w = T1t - T1v; - T2e = T1r + T1w; - } - { - E T1l, T1b, T1k, T1J, T1L, T1x, T1I, T1K, T1m; - T1l = TN - T1a; - T1b = TN + T1a; - T1k = FNMS(KP250000000, T1b, Tk); - T1x = T1r - T1w; - T1I = T1C - T1H; - T1J = FMA(KP618033988, T1I, T1x); - T1L = FNMS(KP618033988, T1x, T1I); - ri[WS(rs, 5)] = Tk + T1b; - T1K = FNMS(KP559016994, T1l, T1k); - ri[WS(rs, 7)] = FNMS(KP951056516, T1L, T1K); - ri[WS(rs, 3)] = FMA(KP951056516, T1L, T1K); - T1m = FMA(KP559016994, T1l, T1k); - ri[WS(rs, 9)] = FNMS(KP951056516, T1J, T1m); - ri[WS(rs, 1)] = FMA(KP951056516, T1J, T1m); - } - { - E T2i, T2g, T2h, T2m, T2o, T2k, T2l, T2n, T2j; - T2i = T2e - T2f; - T2g = T2e + T2f; - T2h = FNMS(KP250000000, T2g, T2d); - T2k = Tz - TM; - T2l = TW - T19; - T2m = FMA(KP618033988, T2l, T2k); - T2o = FNMS(KP618033988, T2k, T2l); - ii[WS(rs, 5)] = T2g + T2d; - T2n = FNMS(KP559016994, T2i, T2h); - ii[WS(rs, 3)] = FNMS(KP951056516, T2o, T2n); - ii[WS(rs, 7)] = FMA(KP951056516, T2o, T2n); - T2j = FMA(KP559016994, T2i, T2h); - ii[WS(rs, 1)] = FNMS(KP951056516, T2m, T2j); - ii[WS(rs, 9)] = FMA(KP951056516, T2m, T2j); - } - { - E T1N, T1j, T1M, T1V, T1X, T1R, T1U, T1W, T1O; - T1N = T1f - T1i; - T1j = T1f + T1i; - T1M = FNMS(KP250000000, T1j, T1c); - T1R = T1P - T1Q; - T1U = T1S - T1T; - T1V = FNMS(KP618033988, T1U, T1R); - T1X = FMA(KP618033988, T1R, T1U); - ri[0] = T1c + T1j; - T1W = FMA(KP559016994, T1N, T1M); - ri[WS(rs, 4)] = FNMS(KP951056516, T1X, T1W); - ri[WS(rs, 6)] = FMA(KP951056516, T1X, T1W); - T1O = FNMS(KP559016994, T1N, T1M); - ri[WS(rs, 2)] = FNMS(KP951056516, T1V, T1O); - ri[WS(rs, 8)] = FMA(KP951056516, T1V, T1O); - } - { - E T26, T20, T25, T2a, T2c, T28, T29, T2b, T27; - T26 = T1Y - T1Z; - T20 = T1Y + T1Z; - T25 = FNMS(KP250000000, T20, T24); - T28 = T1g - T1h; - T29 = T1d - T1e; - T2a = FNMS(KP618033988, T29, T28); - T2c = FMA(KP618033988, T28, T29); - ii[0] = T20 + T24; - T2b = FMA(KP559016994, T26, T25); - ii[WS(rs, 4)] = FMA(KP951056516, T2c, T2b); - ii[WS(rs, 6)] = FNMS(KP951056516, T2c, T2b); - T27 = FNMS(KP559016994, T26, T25); - ii[WS(rs, 2)] = FMA(KP951056516, T2a, T27); - ii[WS(rs, 8)] = FNMS(KP951056516, T2a, T27); - } - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_CEXP, 0, 1 }, - { TW_CEXP, 0, 3 }, - { TW_CEXP, 0, 9 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 10, "t2_10", twinstr, &GENUS, { 48, 28, 66, 0 }, 0, 0, 0 }; - -void X(codelet_t2_10) (planner *p) { - X(kdft_dit_register) (p, t2_10, &desc); -} -#else - -/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 10 -name t2_10 -include dft/scalar/t.h */ - -/* - * This function contains 114 FP additions, 80 FP multiplications, - * (or, 76 additions, 42 multiplications, 38 fused multiply/add), - * 63 stack variables, 4 constants, and 40 memory accesses - */ -#include "dft/scalar/t.h" - -static void t2_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - DK(KP587785252, +0.587785252292473129168705954639072768597652438); - DK(KP951056516, +0.951056516295153572116439333379382143405698634); - DK(KP250000000, +0.250000000000000000000000000000000000000000000); - DK(KP559016994, +0.559016994374947424102293417182819058860154590); - { - INT m; - for (m = mb, W = W + (mb * 6); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 6, MAKE_VOLATILE_STRIDE(20, rs)) { - E T2, T5, T3, T6, T8, Tm, Tc, Tk, T9, Td, Te, TM, TO, Tg, Tp; - E Tv, Tx, Tr; - { - E T4, Tb, T7, Ta; - T2 = W[0]; - T5 = W[1]; - T3 = W[2]; - T6 = W[3]; - T4 = T2 * T3; - Tb = T5 * T3; - T7 = T5 * T6; - Ta = T2 * T6; - T8 = T4 - T7; - Tm = Ta - Tb; - Tc = Ta + Tb; - Tk = T4 + T7; - T9 = W[4]; - Td = W[5]; - Te = FMA(T8, T9, Tc * Td); - TM = FMA(T3, T9, T6 * Td); - TO = FNMS(T6, T9, T3 * Td); - Tg = FNMS(Tc, T9, T8 * Td); - Tp = FMA(Tk, T9, Tm * Td); - Tv = FMA(T2, T9, T5 * Td); - Tx = FNMS(T5, T9, T2 * Td); - Tr = FNMS(Tm, T9, Tk * Td); - } - { - E Tj, T1S, TX, T1G, TL, TU, TV, T1s, T1t, T1C, T11, T12, T13, T1h, T1k; - E T1Q, Tu, TD, TE, T1v, T1w, T1B, TY, TZ, T10, T1a, T1d, T1P; - { - E T1, T1F, Ti, T1E, Tf, Th; - T1 = ri[0]; - T1F = ii[0]; - Tf = ri[WS(rs, 5)]; - Th = ii[WS(rs, 5)]; - Ti = FMA(Te, Tf, Tg * Th); - T1E = FNMS(Tg, Tf, Te * Th); - Tj = T1 - Ti; - T1S = T1F - T1E; - TX = T1 + Ti; - T1G = T1E + T1F; - } - { - E TH, T1f, TT, T1j, TK, T1g, TQ, T1i; - { - E TF, TG, TR, TS; - TF = ri[WS(rs, 4)]; - TG = ii[WS(rs, 4)]; - TH = FMA(T8, TF, Tc * TG); - T1f = FNMS(Tc, TF, T8 * TG); - TR = ri[WS(rs, 1)]; - TS = ii[WS(rs, 1)]; - TT = FMA(T2, TR, T5 * TS); - T1j = FNMS(T5, TR, T2 * TS); - } - { - E TI, TJ, TN, TP; - TI = ri[WS(rs, 9)]; - TJ = ii[WS(rs, 9)]; - TK = FMA(T9, TI, Td * TJ); - T1g = FNMS(Td, TI, T9 * TJ); - TN = ri[WS(rs, 6)]; - TP = ii[WS(rs, 6)]; - TQ = FMA(TM, TN, TO * TP); - T1i = FNMS(TO, TN, TM * TP); - } - TL = TH - TK; - TU = TQ - TT; - TV = TL + TU; - T1s = T1f + T1g; - T1t = T1i + T1j; - T1C = T1s + T1t; - T11 = TH + TK; - T12 = TQ + TT; - T13 = T11 + T12; - T1h = T1f - T1g; - T1k = T1i - T1j; - T1Q = T1h + T1k; - } - { - E To, T18, TC, T1c, Tt, T19, Tz, T1b; - { - E Tl, Tn, TA, TB; - Tl = ri[WS(rs, 2)]; - Tn = ii[WS(rs, 2)]; - To = FMA(Tk, Tl, Tm * Tn); - T18 = FNMS(Tm, Tl, Tk * Tn); - TA = ri[WS(rs, 3)]; - TB = ii[WS(rs, 3)]; - TC = FMA(T3, TA, T6 * TB); - T1c = FNMS(T6, TA, T3 * TB); - } - { - E Tq, Ts, Tw, Ty; - Tq = ri[WS(rs, 7)]; - Ts = ii[WS(rs, 7)]; - Tt = FMA(Tp, Tq, Tr * Ts); - T19 = FNMS(Tr, Tq, Tp * Ts); - Tw = ri[WS(rs, 8)]; - Ty = ii[WS(rs, 8)]; - Tz = FMA(Tv, Tw, Tx * Ty); - T1b = FNMS(Tx, Tw, Tv * Ty); - } - Tu = To - Tt; - TD = Tz - TC; - TE = Tu + TD; - T1v = T18 + T19; - T1w = T1b + T1c; - T1B = T1v + T1w; - TY = To + Tt; - TZ = Tz + TC; - T10 = TY + TZ; - T1a = T18 - T19; - T1d = T1b - T1c; - T1P = T1a + T1d; - } - { - E T15, TW, T16, T1m, T1o, T1e, T1l, T1n, T17; - T15 = KP559016994 * (TE - TV); - TW = TE + TV; - T16 = FNMS(KP250000000, TW, Tj); - T1e = T1a - T1d; - T1l = T1h - T1k; - T1m = FMA(KP951056516, T1e, KP587785252 * T1l); - T1o = FNMS(KP587785252, T1e, KP951056516 * T1l); - ri[WS(rs, 5)] = Tj + TW; - T1n = T16 - T15; - ri[WS(rs, 7)] = T1n - T1o; - ri[WS(rs, 3)] = T1n + T1o; - T17 = T15 + T16; - ri[WS(rs, 9)] = T17 - T1m; - ri[WS(rs, 1)] = T17 + T1m; - } - { - E T1R, T1T, T1U, T1Y, T20, T1W, T1X, T1Z, T1V; - T1R = KP559016994 * (T1P - T1Q); - T1T = T1P + T1Q; - T1U = FNMS(KP250000000, T1T, T1S); - T1W = Tu - TD; - T1X = TL - TU; - T1Y = FMA(KP951056516, T1W, KP587785252 * T1X); - T20 = FNMS(KP587785252, T1W, KP951056516 * T1X); - ii[WS(rs, 5)] = T1T + T1S; - T1Z = T1U - T1R; - ii[WS(rs, 3)] = T1Z - T20; - ii[WS(rs, 7)] = T20 + T1Z; - T1V = T1R + T1U; - ii[WS(rs, 1)] = T1V - T1Y; - ii[WS(rs, 9)] = T1Y + T1V; - } - { - E T1q, T14, T1p, T1y, T1A, T1u, T1x, T1z, T1r; - T1q = KP559016994 * (T10 - T13); - T14 = T10 + T13; - T1p = FNMS(KP250000000, T14, TX); - T1u = T1s - T1t; - T1x = T1v - T1w; - T1y = FNMS(KP587785252, T1x, KP951056516 * T1u); - T1A = FMA(KP951056516, T1x, KP587785252 * T1u); - ri[0] = TX + T14; - T1z = T1q + T1p; - ri[WS(rs, 4)] = T1z - T1A; - ri[WS(rs, 6)] = T1z + T1A; - T1r = T1p - T1q; - ri[WS(rs, 2)] = T1r - T1y; - ri[WS(rs, 8)] = T1r + T1y; - } - { - E T1L, T1D, T1K, T1J, T1N, T1H, T1I, T1O, T1M; - T1L = KP559016994 * (T1B - T1C); - T1D = T1B + T1C; - T1K = FNMS(KP250000000, T1D, T1G); - T1H = T11 - T12; - T1I = TY - TZ; - T1J = FNMS(KP587785252, T1I, KP951056516 * T1H); - T1N = FMA(KP951056516, T1I, KP587785252 * T1H); - ii[0] = T1D + T1G; - T1O = T1L + T1K; - ii[WS(rs, 4)] = T1N + T1O; - ii[WS(rs, 6)] = T1O - T1N; - T1M = T1K - T1L; - ii[WS(rs, 2)] = T1J + T1M; - ii[WS(rs, 8)] = T1M - T1J; - } - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_CEXP, 0, 1 }, - { TW_CEXP, 0, 3 }, - { TW_CEXP, 0, 9 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 10, "t2_10", twinstr, &GENUS, { 76, 42, 38, 0 }, 0, 0, 0 }; - -void X(codelet_t2_10) (planner *p) { - X(kdft_dit_register) (p, t2_10, &desc); -} -#endif diff --git a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t2_16.c b/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t2_16.c deleted file mode 100644 index 823760ab..00000000 --- a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t2_16.c +++ /dev/null @@ -1,836 +0,0 @@ -/* - * Copyright (c) 2003, 2007-14 Matteo Frigo - * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - */ - -/* This file was automatically generated --- DO NOT EDIT */ -/* Generated on Tue Sep 14 10:44:32 EDT 2021 */ - -#include "dft/codelet-dft.h" - -#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA) - -/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -name t2_16 -include dft/scalar/t.h */ - -/* - * This function contains 196 FP additions, 134 FP multiplications, - * (or, 104 additions, 42 multiplications, 92 fused multiply/add), - * 90 stack variables, 3 constants, and 64 memory accesses - */ -#include "dft/scalar/t.h" - -static void t2_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - DK(KP923879532, +0.923879532511286756128183189396788286822416626); - DK(KP414213562, +0.414213562373095048801688724209698078569671875); - DK(KP707106781, +0.707106781186547524400844362104849039284835938); - { - INT m; - for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) { - E T2, Tf, TM, TO, T3, T6, T5, Th, Tz, Ti, T7, TZ, TT, Tq, TW; - E Tb, Tu, TP, TI, TF, TC, T1z, T1O, T1D, T1L, Tm, T1f, T1p, T1j, T1m; - { - E TN, TS, T4, Tp, Ta, Tt, Tl, Tg; - T2 = W[0]; - Tf = W[2]; - Tg = T2 * Tf; - TM = W[6]; - TN = T2 * TM; - TO = W[7]; - TS = T2 * TO; - T3 = W[4]; - T4 = T2 * T3; - Tp = Tf * T3; - T6 = W[5]; - Ta = T2 * T6; - Tt = Tf * T6; - T5 = W[1]; - Th = W[3]; - Tl = T2 * Th; - Tz = FMA(T5, Th, Tg); - Ti = FNMS(T5, Th, Tg); - T7 = FMA(T5, T6, T4); - TZ = FNMS(Th, T3, Tt); - TT = FNMS(T5, TM, TS); - Tq = FNMS(Th, T6, Tp); - TW = FMA(Th, T6, Tp); - Tb = FNMS(T5, T3, Ta); - Tu = FMA(Th, T3, Tt); - TP = FMA(T5, TO, TN); - TI = FMA(T5, T3, Ta); - TF = FNMS(T5, T6, T4); - { - E T1y, T1C, T1e, T1i; - T1y = Tz * T3; - T1C = Tz * T6; - TC = FNMS(T5, Tf, Tl); - T1z = FMA(TC, T6, T1y); - T1O = FMA(TC, T3, T1C); - T1D = FNMS(TC, T3, T1C); - T1L = FNMS(TC, T6, T1y); - T1e = Ti * T3; - T1i = Ti * T6; - Tm = FMA(T5, Tf, Tl); - T1f = FMA(Tm, T6, T1e); - T1p = FMA(Tm, T3, T1i); - T1j = FNMS(Tm, T3, T1i); - T1m = FNMS(Tm, T6, T1e); - } - } - { - E Te, T1U, T3A, T3L, T1G, T2D, T2A, T3h, T1R, T2B, T2I, T3i, Tx, T3M, T1Z; - E T3w, TL, T26, T25, T37, T1d, T2o, T2l, T3c, T1s, T2m, T2t, T3d, T12, T28; - E T2d, T38; - { - E T1, T3z, T8, T9, Tc, T3x, Td, T3y; - T1 = ri[0]; - T3z = ii[0]; - T8 = ri[WS(rs, 8)]; - T9 = T7 * T8; - Tc = ii[WS(rs, 8)]; - T3x = T7 * Tc; - Td = FMA(Tb, Tc, T9); - Te = T1 + Td; - T1U = T1 - Td; - T3y = FNMS(Tb, T8, T3x); - T3A = T3y + T3z; - T3L = T3z - T3y; - } - { - E T1u, T1v, T1w, T2w, T1A, T1B, T1E, T2y; - T1u = ri[WS(rs, 15)]; - T1v = TM * T1u; - T1w = ii[WS(rs, 15)]; - T2w = TM * T1w; - T1A = ri[WS(rs, 7)]; - T1B = T1z * T1A; - T1E = ii[WS(rs, 7)]; - T2y = T1z * T1E; - { - E T1x, T1F, T2x, T2z; - T1x = FMA(TO, T1w, T1v); - T1F = FMA(T1D, T1E, T1B); - T1G = T1x + T1F; - T2D = T1x - T1F; - T2x = FNMS(TO, T1u, T2w); - T2z = FNMS(T1D, T1A, T2y); - T2A = T2x - T2z; - T3h = T2x + T2z; - } - } - { - E T1H, T1I, T1J, T2E, T1M, T1N, T1P, T2G; - T1H = ri[WS(rs, 3)]; - T1I = Tf * T1H; - T1J = ii[WS(rs, 3)]; - T2E = Tf * T1J; - T1M = ri[WS(rs, 11)]; - T1N = T1L * T1M; - T1P = ii[WS(rs, 11)]; - T2G = T1L * T1P; - { - E T1K, T1Q, T2F, T2H; - T1K = FMA(Th, T1J, T1I); - T1Q = FMA(T1O, T1P, T1N); - T1R = T1K + T1Q; - T2B = T1K - T1Q; - T2F = FNMS(Th, T1H, T2E); - T2H = FNMS(T1O, T1M, T2G); - T2I = T2F - T2H; - T3i = T2F + T2H; - } - } - { - E Tj, Tk, Tn, T1V, Tr, Ts, Tv, T1X; - Tj = ri[WS(rs, 4)]; - Tk = Ti * Tj; - Tn = ii[WS(rs, 4)]; - T1V = Ti * Tn; - Tr = ri[WS(rs, 12)]; - Ts = Tq * Tr; - Tv = ii[WS(rs, 12)]; - T1X = Tq * Tv; - { - E To, Tw, T1W, T1Y; - To = FMA(Tm, Tn, Tk); - Tw = FMA(Tu, Tv, Ts); - Tx = To + Tw; - T3M = To - Tw; - T1W = FNMS(Tm, Tj, T1V); - T1Y = FNMS(Tu, Tr, T1X); - T1Z = T1W - T1Y; - T3w = T1W + T1Y; - } - } - { - E TA, TB, TD, T21, TG, TH, TJ, T23; - TA = ri[WS(rs, 2)]; - TB = Tz * TA; - TD = ii[WS(rs, 2)]; - T21 = Tz * TD; - TG = ri[WS(rs, 10)]; - TH = TF * TG; - TJ = ii[WS(rs, 10)]; - T23 = TF * TJ; - { - E TE, TK, T22, T24; - TE = FMA(TC, TD, TB); - TK = FMA(TI, TJ, TH); - TL = TE + TK; - T26 = TE - TK; - T22 = FNMS(TC, TA, T21); - T24 = FNMS(TI, TG, T23); - T25 = T22 - T24; - T37 = T22 + T24; - } - } - { - E T15, T16, T17, T2h, T19, T1a, T1b, T2j; - T15 = ri[WS(rs, 1)]; - T16 = T2 * T15; - T17 = ii[WS(rs, 1)]; - T2h = T2 * T17; - T19 = ri[WS(rs, 9)]; - T1a = T3 * T19; - T1b = ii[WS(rs, 9)]; - T2j = T3 * T1b; - { - E T18, T1c, T2i, T2k; - T18 = FMA(T5, T17, T16); - T1c = FMA(T6, T1b, T1a); - T1d = T18 + T1c; - T2o = T18 - T1c; - T2i = FNMS(T5, T15, T2h); - T2k = FNMS(T6, T19, T2j); - T2l = T2i - T2k; - T3c = T2i + T2k; - } - } - { - E T1g, T1h, T1k, T2p, T1n, T1o, T1q, T2r; - T1g = ri[WS(rs, 5)]; - T1h = T1f * T1g; - T1k = ii[WS(rs, 5)]; - T2p = T1f * T1k; - T1n = ri[WS(rs, 13)]; - T1o = T1m * T1n; - T1q = ii[WS(rs, 13)]; - T2r = T1m * T1q; - { - E T1l, T1r, T2q, T2s; - T1l = FMA(T1j, T1k, T1h); - T1r = FMA(T1p, T1q, T1o); - T1s = T1l + T1r; - T2m = T1l - T1r; - T2q = FNMS(T1j, T1g, T2p); - T2s = FNMS(T1p, T1n, T2r); - T2t = T2q - T2s; - T3d = T2q + T2s; - } - } - { - E TQ, TR, TU, T29, TX, TY, T10, T2b; - TQ = ri[WS(rs, 14)]; - TR = TP * TQ; - TU = ii[WS(rs, 14)]; - T29 = TP * TU; - TX = ri[WS(rs, 6)]; - TY = TW * TX; - T10 = ii[WS(rs, 6)]; - T2b = TW * T10; - { - E TV, T11, T2a, T2c; - TV = FMA(TT, TU, TR); - T11 = FMA(TZ, T10, TY); - T12 = TV + T11; - T28 = TV - T11; - T2a = FNMS(TT, TQ, T29); - T2c = FNMS(TZ, TX, T2b); - T2d = T2a - T2c; - T38 = T2a + T2c; - } - } - { - E T14, T3q, T3C, T3E, T1T, T3D, T3t, T3u; - { - E Ty, T13, T3v, T3B; - Ty = Te + Tx; - T13 = TL + T12; - T14 = Ty + T13; - T3q = Ty - T13; - T3v = T37 + T38; - T3B = T3w + T3A; - T3C = T3v + T3B; - T3E = T3B - T3v; - } - { - E T1t, T1S, T3r, T3s; - T1t = T1d + T1s; - T1S = T1G + T1R; - T1T = T1t + T1S; - T3D = T1S - T1t; - T3r = T3c + T3d; - T3s = T3h + T3i; - T3t = T3r - T3s; - T3u = T3r + T3s; - } - ri[WS(rs, 8)] = T14 - T1T; - ii[WS(rs, 8)] = T3C - T3u; - ri[0] = T14 + T1T; - ii[0] = T3u + T3C; - ri[WS(rs, 12)] = T3q - T3t; - ii[WS(rs, 12)] = T3E - T3D; - ri[WS(rs, 4)] = T3q + T3t; - ii[WS(rs, 4)] = T3D + T3E; - } - { - E T3a, T3m, T3H, T3J, T3f, T3n, T3k, T3o; - { - E T36, T39, T3F, T3G; - T36 = Te - Tx; - T39 = T37 - T38; - T3a = T36 + T39; - T3m = T36 - T39; - T3F = T12 - TL; - T3G = T3A - T3w; - T3H = T3F + T3G; - T3J = T3G - T3F; - } - { - E T3b, T3e, T3g, T3j; - T3b = T1d - T1s; - T3e = T3c - T3d; - T3f = T3b + T3e; - T3n = T3e - T3b; - T3g = T1G - T1R; - T3j = T3h - T3i; - T3k = T3g - T3j; - T3o = T3g + T3j; - } - { - E T3l, T3I, T3p, T3K; - T3l = T3f + T3k; - ri[WS(rs, 10)] = FNMS(KP707106781, T3l, T3a); - ri[WS(rs, 2)] = FMA(KP707106781, T3l, T3a); - T3I = T3n + T3o; - ii[WS(rs, 2)] = FMA(KP707106781, T3I, T3H); - ii[WS(rs, 10)] = FNMS(KP707106781, T3I, T3H); - T3p = T3n - T3o; - ri[WS(rs, 14)] = FNMS(KP707106781, T3p, T3m); - ri[WS(rs, 6)] = FMA(KP707106781, T3p, T3m); - T3K = T3k - T3f; - ii[WS(rs, 6)] = FMA(KP707106781, T3K, T3J); - ii[WS(rs, 14)] = FNMS(KP707106781, T3K, T3J); - } - } - { - E T20, T3N, T3T, T2Q, T2f, T3O, T30, T34, T2T, T3U, T2v, T2N, T2X, T33, T2K; - E T2O; - { - E T27, T2e, T2n, T2u; - T20 = T1U - T1Z; - T3N = T3L - T3M; - T3T = T3M + T3L; - T2Q = T1U + T1Z; - T27 = T25 - T26; - T2e = T28 + T2d; - T2f = T27 - T2e; - T3O = T27 + T2e; - { - E T2Y, T2Z, T2R, T2S; - T2Y = T2D + T2I; - T2Z = T2A - T2B; - T30 = FNMS(KP414213562, T2Z, T2Y); - T34 = FMA(KP414213562, T2Y, T2Z); - T2R = T26 + T25; - T2S = T28 - T2d; - T2T = T2R + T2S; - T3U = T2S - T2R; - } - T2n = T2l + T2m; - T2u = T2o - T2t; - T2v = FMA(KP414213562, T2u, T2n); - T2N = FNMS(KP414213562, T2n, T2u); - { - E T2V, T2W, T2C, T2J; - T2V = T2o + T2t; - T2W = T2l - T2m; - T2X = FMA(KP414213562, T2W, T2V); - T33 = FNMS(KP414213562, T2V, T2W); - T2C = T2A + T2B; - T2J = T2D - T2I; - T2K = FNMS(KP414213562, T2J, T2C); - T2O = FMA(KP414213562, T2C, T2J); - } - } - { - E T2g, T2L, T3V, T3W; - T2g = FMA(KP707106781, T2f, T20); - T2L = T2v - T2K; - ri[WS(rs, 11)] = FNMS(KP923879532, T2L, T2g); - ri[WS(rs, 3)] = FMA(KP923879532, T2L, T2g); - T3V = FMA(KP707106781, T3U, T3T); - T3W = T2O - T2N; - ii[WS(rs, 3)] = FMA(KP923879532, T3W, T3V); - ii[WS(rs, 11)] = FNMS(KP923879532, T3W, T3V); - } - { - E T2M, T2P, T3X, T3Y; - T2M = FNMS(KP707106781, T2f, T20); - T2P = T2N + T2O; - ri[WS(rs, 7)] = FNMS(KP923879532, T2P, T2M); - ri[WS(rs, 15)] = FMA(KP923879532, T2P, T2M); - T3X = FNMS(KP707106781, T3U, T3T); - T3Y = T2v + T2K; - ii[WS(rs, 7)] = FNMS(KP923879532, T3Y, T3X); - ii[WS(rs, 15)] = FMA(KP923879532, T3Y, T3X); - } - { - E T2U, T31, T3P, T3Q; - T2U = FMA(KP707106781, T2T, T2Q); - T31 = T2X + T30; - ri[WS(rs, 9)] = FNMS(KP923879532, T31, T2U); - ri[WS(rs, 1)] = FMA(KP923879532, T31, T2U); - T3P = FMA(KP707106781, T3O, T3N); - T3Q = T33 + T34; - ii[WS(rs, 1)] = FMA(KP923879532, T3Q, T3P); - ii[WS(rs, 9)] = FNMS(KP923879532, T3Q, T3P); - } - { - E T32, T35, T3R, T3S; - T32 = FNMS(KP707106781, T2T, T2Q); - T35 = T33 - T34; - ri[WS(rs, 13)] = FNMS(KP923879532, T35, T32); - ri[WS(rs, 5)] = FMA(KP923879532, T35, T32); - T3R = FNMS(KP707106781, T3O, T3N); - T3S = T30 - T2X; - ii[WS(rs, 5)] = FMA(KP923879532, T3S, T3R); - ii[WS(rs, 13)] = FNMS(KP923879532, T3S, T3R); - } - } - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_CEXP, 0, 1 }, - { TW_CEXP, 0, 3 }, - { TW_CEXP, 0, 9 }, - { TW_CEXP, 0, 15 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 16, "t2_16", twinstr, &GENUS, { 104, 42, 92, 0 }, 0, 0, 0 }; - -void X(codelet_t2_16) (planner *p) { - X(kdft_dit_register) (p, t2_16, &desc); -} -#else - -/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -name t2_16 -include dft/scalar/t.h */ - -/* - * This function contains 196 FP additions, 108 FP multiplications, - * (or, 156 additions, 68 multiplications, 40 fused multiply/add), - * 82 stack variables, 3 constants, and 64 memory accesses - */ -#include "dft/scalar/t.h" - -static void t2_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - DK(KP382683432, +0.382683432365089771728459984030398866761344562); - DK(KP923879532, +0.923879532511286756128183189396788286822416626); - DK(KP707106781, +0.707106781186547524400844362104849039284835938); - { - INT m; - for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) { - E T2, T5, Tg, Ti, Tk, To, TE, TC, T6, T3, T8, TW, TJ, Tt, TU; - E Tc, Tx, TH, TN, TO, TP, TR, T1f, T1k, T1b, T1i, T1y, T1H, T1u, T1F; - { - E T7, Tv, Ta, Ts, T4, Tw, Tb, Tr; - { - E Th, Tn, Tj, Tm; - T2 = W[0]; - T5 = W[1]; - Tg = W[2]; - Ti = W[3]; - Th = T2 * Tg; - Tn = T5 * Tg; - Tj = T5 * Ti; - Tm = T2 * Ti; - Tk = Th - Tj; - To = Tm + Tn; - TE = Tm - Tn; - TC = Th + Tj; - T6 = W[5]; - T7 = T5 * T6; - Tv = Tg * T6; - Ta = T2 * T6; - Ts = Ti * T6; - T3 = W[4]; - T4 = T2 * T3; - Tw = Ti * T3; - Tb = T5 * T3; - Tr = Tg * T3; - } - T8 = T4 + T7; - TW = Tv - Tw; - TJ = Ta + Tb; - Tt = Tr - Ts; - TU = Tr + Ts; - Tc = Ta - Tb; - Tx = Tv + Tw; - TH = T4 - T7; - TN = W[6]; - TO = W[7]; - TP = FMA(T2, TN, T5 * TO); - TR = FNMS(T5, TN, T2 * TO); - { - E T1d, T1e, T19, T1a; - T1d = Tk * T6; - T1e = To * T3; - T1f = T1d - T1e; - T1k = T1d + T1e; - T19 = Tk * T3; - T1a = To * T6; - T1b = T19 + T1a; - T1i = T19 - T1a; - } - { - E T1w, T1x, T1s, T1t; - T1w = TC * T6; - T1x = TE * T3; - T1y = T1w - T1x; - T1H = T1w + T1x; - T1s = TC * T3; - T1t = TE * T6; - T1u = T1s + T1t; - T1F = T1s - T1t; - } - } - { - E Tf, T3r, T1N, T3e, TA, T3s, T1Q, T3b, TM, T2M, T1W, T2w, TZ, T2N, T21; - E T2x, T1B, T1K, T2V, T2W, T2X, T2Y, T2j, T2D, T2o, T2E, T18, T1n, T2Q, T2R; - E T2S, T2T, T28, T2A, T2d, T2B; - { - E T1, T3d, Te, T3c, T9, Td; - T1 = ri[0]; - T3d = ii[0]; - T9 = ri[WS(rs, 8)]; - Td = ii[WS(rs, 8)]; - Te = FMA(T8, T9, Tc * Td); - T3c = FNMS(Tc, T9, T8 * Td); - Tf = T1 + Te; - T3r = T3d - T3c; - T1N = T1 - Te; - T3e = T3c + T3d; - } - { - E Tq, T1O, Tz, T1P; - { - E Tl, Tp, Tu, Ty; - Tl = ri[WS(rs, 4)]; - Tp = ii[WS(rs, 4)]; - Tq = FMA(Tk, Tl, To * Tp); - T1O = FNMS(To, Tl, Tk * Tp); - Tu = ri[WS(rs, 12)]; - Ty = ii[WS(rs, 12)]; - Tz = FMA(Tt, Tu, Tx * Ty); - T1P = FNMS(Tx, Tu, Tt * Ty); - } - TA = Tq + Tz; - T3s = Tq - Tz; - T1Q = T1O - T1P; - T3b = T1O + T1P; - } - { - E TG, T1S, TL, T1T, T1U, T1V; - { - E TD, TF, TI, TK; - TD = ri[WS(rs, 2)]; - TF = ii[WS(rs, 2)]; - TG = FMA(TC, TD, TE * TF); - T1S = FNMS(TE, TD, TC * TF); - TI = ri[WS(rs, 10)]; - TK = ii[WS(rs, 10)]; - TL = FMA(TH, TI, TJ * TK); - T1T = FNMS(TJ, TI, TH * TK); - } - TM = TG + TL; - T2M = T1S + T1T; - T1U = T1S - T1T; - T1V = TG - TL; - T1W = T1U - T1V; - T2w = T1V + T1U; - } - { - E TT, T1Y, TY, T1Z, T1X, T20; - { - E TQ, TS, TV, TX; - TQ = ri[WS(rs, 14)]; - TS = ii[WS(rs, 14)]; - TT = FMA(TP, TQ, TR * TS); - T1Y = FNMS(TR, TQ, TP * TS); - TV = ri[WS(rs, 6)]; - TX = ii[WS(rs, 6)]; - TY = FMA(TU, TV, TW * TX); - T1Z = FNMS(TW, TV, TU * TX); - } - TZ = TT + TY; - T2N = T1Y + T1Z; - T1X = TT - TY; - T20 = T1Y - T1Z; - T21 = T1X + T20; - T2x = T1X - T20; - } - { - E T1r, T2k, T1J, T2h, T1A, T2l, T1E, T2g; - { - E T1p, T1q, T1G, T1I; - T1p = ri[WS(rs, 15)]; - T1q = ii[WS(rs, 15)]; - T1r = FMA(TN, T1p, TO * T1q); - T2k = FNMS(TO, T1p, TN * T1q); - T1G = ri[WS(rs, 11)]; - T1I = ii[WS(rs, 11)]; - T1J = FMA(T1F, T1G, T1H * T1I); - T2h = FNMS(T1H, T1G, T1F * T1I); - } - { - E T1v, T1z, T1C, T1D; - T1v = ri[WS(rs, 7)]; - T1z = ii[WS(rs, 7)]; - T1A = FMA(T1u, T1v, T1y * T1z); - T2l = FNMS(T1y, T1v, T1u * T1z); - T1C = ri[WS(rs, 3)]; - T1D = ii[WS(rs, 3)]; - T1E = FMA(Tg, T1C, Ti * T1D); - T2g = FNMS(Ti, T1C, Tg * T1D); - } - T1B = T1r + T1A; - T1K = T1E + T1J; - T2V = T1B - T1K; - T2W = T2k + T2l; - T2X = T2g + T2h; - T2Y = T2W - T2X; - { - E T2f, T2i, T2m, T2n; - T2f = T1r - T1A; - T2i = T2g - T2h; - T2j = T2f - T2i; - T2D = T2f + T2i; - T2m = T2k - T2l; - T2n = T1E - T1J; - T2o = T2m + T2n; - T2E = T2m - T2n; - } - } - { - E T14, T24, T1m, T2b, T17, T25, T1h, T2a; - { - E T12, T13, T1j, T1l; - T12 = ri[WS(rs, 1)]; - T13 = ii[WS(rs, 1)]; - T14 = FMA(T2, T12, T5 * T13); - T24 = FNMS(T5, T12, T2 * T13); - T1j = ri[WS(rs, 13)]; - T1l = ii[WS(rs, 13)]; - T1m = FMA(T1i, T1j, T1k * T1l); - T2b = FNMS(T1k, T1j, T1i * T1l); - } - { - E T15, T16, T1c, T1g; - T15 = ri[WS(rs, 9)]; - T16 = ii[WS(rs, 9)]; - T17 = FMA(T3, T15, T6 * T16); - T25 = FNMS(T6, T15, T3 * T16); - T1c = ri[WS(rs, 5)]; - T1g = ii[WS(rs, 5)]; - T1h = FMA(T1b, T1c, T1f * T1g); - T2a = FNMS(T1f, T1c, T1b * T1g); - } - T18 = T14 + T17; - T1n = T1h + T1m; - T2Q = T18 - T1n; - T2R = T24 + T25; - T2S = T2a + T2b; - T2T = T2R - T2S; - { - E T26, T27, T29, T2c; - T26 = T24 - T25; - T27 = T1h - T1m; - T28 = T26 + T27; - T2A = T26 - T27; - T29 = T14 - T17; - T2c = T2a - T2b; - T2d = T29 - T2c; - T2B = T29 + T2c; - } - } - { - E T23, T2r, T3A, T3C, T2q, T3B, T2u, T3x; - { - E T1R, T22, T3y, T3z; - T1R = T1N - T1Q; - T22 = KP707106781 * (T1W - T21); - T23 = T1R + T22; - T2r = T1R - T22; - T3y = KP707106781 * (T2x - T2w); - T3z = T3s + T3r; - T3A = T3y + T3z; - T3C = T3z - T3y; - } - { - E T2e, T2p, T2s, T2t; - T2e = FMA(KP923879532, T28, KP382683432 * T2d); - T2p = FNMS(KP923879532, T2o, KP382683432 * T2j); - T2q = T2e + T2p; - T3B = T2p - T2e; - T2s = FNMS(KP923879532, T2d, KP382683432 * T28); - T2t = FMA(KP382683432, T2o, KP923879532 * T2j); - T2u = T2s - T2t; - T3x = T2s + T2t; - } - ri[WS(rs, 11)] = T23 - T2q; - ii[WS(rs, 11)] = T3A - T3x; - ri[WS(rs, 3)] = T23 + T2q; - ii[WS(rs, 3)] = T3x + T3A; - ri[WS(rs, 15)] = T2r - T2u; - ii[WS(rs, 15)] = T3C - T3B; - ri[WS(rs, 7)] = T2r + T2u; - ii[WS(rs, 7)] = T3B + T3C; - } - { - E T2P, T31, T3m, T3o, T30, T3n, T34, T3j; - { - E T2L, T2O, T3k, T3l; - T2L = Tf - TA; - T2O = T2M - T2N; - T2P = T2L + T2O; - T31 = T2L - T2O; - T3k = TZ - TM; - T3l = T3e - T3b; - T3m = T3k + T3l; - T3o = T3l - T3k; - } - { - E T2U, T2Z, T32, T33; - T2U = T2Q + T2T; - T2Z = T2V - T2Y; - T30 = KP707106781 * (T2U + T2Z); - T3n = KP707106781 * (T2Z - T2U); - T32 = T2T - T2Q; - T33 = T2V + T2Y; - T34 = KP707106781 * (T32 - T33); - T3j = KP707106781 * (T32 + T33); - } - ri[WS(rs, 10)] = T2P - T30; - ii[WS(rs, 10)] = T3m - T3j; - ri[WS(rs, 2)] = T2P + T30; - ii[WS(rs, 2)] = T3j + T3m; - ri[WS(rs, 14)] = T31 - T34; - ii[WS(rs, 14)] = T3o - T3n; - ri[WS(rs, 6)] = T31 + T34; - ii[WS(rs, 6)] = T3n + T3o; - } - { - E T2z, T2H, T3u, T3w, T2G, T3v, T2K, T3p; - { - E T2v, T2y, T3q, T3t; - T2v = T1N + T1Q; - T2y = KP707106781 * (T2w + T2x); - T2z = T2v + T2y; - T2H = T2v - T2y; - T3q = KP707106781 * (T1W + T21); - T3t = T3r - T3s; - T3u = T3q + T3t; - T3w = T3t - T3q; - } - { - E T2C, T2F, T2I, T2J; - T2C = FMA(KP382683432, T2A, KP923879532 * T2B); - T2F = FNMS(KP382683432, T2E, KP923879532 * T2D); - T2G = T2C + T2F; - T3v = T2F - T2C; - T2I = FNMS(KP382683432, T2B, KP923879532 * T2A); - T2J = FMA(KP923879532, T2E, KP382683432 * T2D); - T2K = T2I - T2J; - T3p = T2I + T2J; - } - ri[WS(rs, 9)] = T2z - T2G; - ii[WS(rs, 9)] = T3u - T3p; - ri[WS(rs, 1)] = T2z + T2G; - ii[WS(rs, 1)] = T3p + T3u; - ri[WS(rs, 13)] = T2H - T2K; - ii[WS(rs, 13)] = T3w - T3v; - ri[WS(rs, 5)] = T2H + T2K; - ii[WS(rs, 5)] = T3v + T3w; - } - { - E T11, T35, T3g, T3i, T1M, T3h, T38, T39; - { - E TB, T10, T3a, T3f; - TB = Tf + TA; - T10 = TM + TZ; - T11 = TB + T10; - T35 = TB - T10; - T3a = T2M + T2N; - T3f = T3b + T3e; - T3g = T3a + T3f; - T3i = T3f - T3a; - } - { - E T1o, T1L, T36, T37; - T1o = T18 + T1n; - T1L = T1B + T1K; - T1M = T1o + T1L; - T3h = T1L - T1o; - T36 = T2R + T2S; - T37 = T2W + T2X; - T38 = T36 - T37; - T39 = T36 + T37; - } - ri[WS(rs, 8)] = T11 - T1M; - ii[WS(rs, 8)] = T3g - T39; - ri[0] = T11 + T1M; - ii[0] = T39 + T3g; - ri[WS(rs, 12)] = T35 - T38; - ii[WS(rs, 12)] = T3i - T3h; - ri[WS(rs, 4)] = T35 + T38; - ii[WS(rs, 4)] = T3h + T3i; - } - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_CEXP, 0, 1 }, - { TW_CEXP, 0, 3 }, - { TW_CEXP, 0, 9 }, - { TW_CEXP, 0, 15 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 16, "t2_16", twinstr, &GENUS, { 156, 68, 40, 0 }, 0, 0, 0 }; - -void X(codelet_t2_16) (planner *p) { - X(kdft_dit_register) (p, t2_16, &desc); -} -#endif diff --git a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t2_20.c b/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t2_20.c deleted file mode 100644 index 0239b29e..00000000 --- a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t2_20.c +++ /dev/null @@ -1,1097 +0,0 @@ -/* - * Copyright (c) 2003, 2007-14 Matteo Frigo - * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - */ - -/* This file was automatically generated --- DO NOT EDIT */ -/* Generated on Tue Sep 14 10:44:38 EDT 2021 */ - -#include "dft/codelet-dft.h" - -#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA) - -/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 20 -name t2_20 -include dft/scalar/t.h */ - -/* - * This function contains 276 FP additions, 198 FP multiplications, - * (or, 136 additions, 58 multiplications, 140 fused multiply/add), - * 95 stack variables, 4 constants, and 80 memory accesses - */ -#include "dft/scalar/t.h" - -static void t2_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - DK(KP951056516, +0.951056516295153572116439333379382143405698634); - DK(KP559016994, +0.559016994374947424102293417182819058860154590); - DK(KP250000000, +0.250000000000000000000000000000000000000000000); - DK(KP618033988, +0.618033988749894848204586834365638117720309180); - { - INT m; - for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(40, rs)) { - E T2, Th, Tf, T6, T5, Ti, Tl, T1n, T3, Tt, Tv, T7, T17, T1L, T24; - E Tb, T13, T1P, T21, T1b, T1D, T1A, T1H, T1f, TA, Tw, Tq, Tm, TK, T1S; - E TO, T1p, T1q, T1u, T2n, T2k, T2h, T2d; - { - E Tk, Ta, T1e, T4, T1a, Tj, T12, T1G, T16, T1K, Tg, Tz; - T2 = W[0]; - Th = W[3]; - Tf = W[2]; - Tg = T2 * Tf; - Tk = T2 * Th; - T6 = W[5]; - Ta = T2 * T6; - T1e = Tf * T6; - T5 = W[1]; - Ti = FNMS(T5, Th, Tg); - Tl = FMA(T5, Tf, Tk); - T1n = FMA(T5, Th, Tg); - T3 = W[4]; - T4 = T2 * T3; - T1a = Tf * T3; - Tj = Ti * T3; - Tt = W[6]; - T12 = Tf * Tt; - T1G = T2 * Tt; - Tv = W[7]; - T16 = Tf * Tv; - T1K = T2 * Tv; - T7 = FNMS(T5, T6, T4); - T17 = FNMS(Th, Tt, T16); - T1L = FNMS(T5, Tt, T1K); - T24 = FMA(Th, T3, T1e); - Tb = FMA(T5, T3, Ta); - T13 = FMA(Th, Tv, T12); - T1P = FNMS(Tl, T6, Tj); - T21 = FNMS(Th, T6, T1a); - T1b = FMA(Th, T6, T1a); - T1D = FNMS(T5, T3, Ta); - T1A = FMA(T5, T6, T4); - T1H = FMA(T5, Tv, T1G); - T1f = FNMS(Th, T3, T1e); - Tz = Ti * Tv; - TA = FNMS(Tl, Tt, Tz); - { - E Tu, Tp, TJ, TN; - Tu = Ti * Tt; - Tw = FMA(Tl, Tv, Tu); - Tp = Ti * T6; - Tq = FNMS(Tl, T3, Tp); - Tm = FMA(Tl, T6, Tj); - TJ = Tm * Tt; - TN = Tm * Tv; - TK = FMA(Tq, Tv, TJ); - T1S = FMA(Tl, T3, Tp); - TO = FNMS(Tq, Tt, TN); - { - E T1o, T2g, T1t, T2c; - T1o = T1n * T3; - T2g = T1n * Tv; - T1t = T1n * T6; - T2c = T1n * Tt; - T1p = FNMS(T5, Tf, Tk); - T1q = FNMS(T1p, T6, T1o); - T1u = FMA(T1p, T3, T1t); - T2n = FNMS(T1p, T3, T1t); - T2k = FMA(T1p, T6, T1o); - T2h = FNMS(T1p, Tt, T2g); - T2d = FMA(T1p, Tv, T2c); - } - } - } - { - E Te, T2C, T4L, T57, TD, T58, T2H, T4H, T11, T2v, T4k, T4v, T2P, T3P, T3C; - E T3Z, T2r, T2z, T4g, T4z, T3b, T3T, T3u, T43, T20, T2y, T4d, T4y, T34, T3S; - E T3n, T42, T1y, T2w, T4n, T4w, T2W, T3Q, T3J, T40; - { - E T1, T4K, T8, T9, Tc, T4I, Td, T4J; - T1 = ri[0]; - T4K = ii[0]; - T8 = ri[WS(rs, 10)]; - T9 = T7 * T8; - Tc = ii[WS(rs, 10)]; - T4I = T7 * Tc; - Td = FMA(Tb, Tc, T9); - Te = T1 + Td; - T2C = T1 - Td; - T4J = FNMS(Tb, T8, T4I); - T4L = T4J + T4K; - T57 = T4K - T4J; - } - { - E Tn, To, Tr, T2D, Tx, Ty, TB, T2F; - Tn = ri[WS(rs, 5)]; - To = Tm * Tn; - Tr = ii[WS(rs, 5)]; - T2D = Tm * Tr; - Tx = ri[WS(rs, 15)]; - Ty = Tw * Tx; - TB = ii[WS(rs, 15)]; - T2F = Tw * TB; - { - E Ts, TC, T2E, T2G; - Ts = FMA(Tq, Tr, To); - TC = FMA(TA, TB, Ty); - TD = Ts + TC; - T58 = Ts - TC; - T2E = FNMS(Tq, Tn, T2D); - T2G = FNMS(TA, Tx, T2F); - T2H = T2E - T2G; - T4H = T2E + T2G; - } - } - { - E TI, T3x, TZ, T2N, TQ, T3z, TV, T2L; - { - E TF, TG, TH, T3w; - TF = ri[WS(rs, 4)]; - TG = Ti * TF; - TH = ii[WS(rs, 4)]; - T3w = Ti * TH; - TI = FMA(Tl, TH, TG); - T3x = FNMS(Tl, TF, T3w); - } - { - E TW, TX, TY, T2M; - TW = ri[WS(rs, 19)]; - TX = Tt * TW; - TY = ii[WS(rs, 19)]; - T2M = Tt * TY; - TZ = FMA(Tv, TY, TX); - T2N = FNMS(Tv, TW, T2M); - } - { - E TL, TM, TP, T3y; - TL = ri[WS(rs, 14)]; - TM = TK * TL; - TP = ii[WS(rs, 14)]; - T3y = TK * TP; - TQ = FMA(TO, TP, TM); - T3z = FNMS(TO, TL, T3y); - } - { - E TS, TT, TU, T2K; - TS = ri[WS(rs, 9)]; - TT = T3 * TS; - TU = ii[WS(rs, 9)]; - T2K = T3 * TU; - TV = FMA(T6, TU, TT); - T2L = FNMS(T6, TS, T2K); - } - { - E TR, T10, T4i, T4j; - TR = TI + TQ; - T10 = TV + TZ; - T11 = TR - T10; - T2v = TR + T10; - T4i = T3x + T3z; - T4j = T2L + T2N; - T4k = T4i - T4j; - T4v = T4i + T4j; - } - { - E T2J, T2O, T3A, T3B; - T2J = TI - TQ; - T2O = T2L - T2N; - T2P = T2J - T2O; - T3P = T2J + T2O; - T3A = T3x - T3z; - T3B = TV - TZ; - T3C = T3A + T3B; - T3Z = T3A - T3B; - } - } - { - E T26, T3p, T2p, T39, T2a, T3r, T2j, T37; - { - E T22, T23, T25, T3o; - T22 = ri[WS(rs, 12)]; - T23 = T21 * T22; - T25 = ii[WS(rs, 12)]; - T3o = T21 * T25; - T26 = FMA(T24, T25, T23); - T3p = FNMS(T24, T22, T3o); - } - { - E T2l, T2m, T2o, T38; - T2l = ri[WS(rs, 7)]; - T2m = T2k * T2l; - T2o = ii[WS(rs, 7)]; - T38 = T2k * T2o; - T2p = FMA(T2n, T2o, T2m); - T39 = FNMS(T2n, T2l, T38); - } - { - E T27, T28, T29, T3q; - T27 = ri[WS(rs, 2)]; - T28 = T1n * T27; - T29 = ii[WS(rs, 2)]; - T3q = T1n * T29; - T2a = FMA(T1p, T29, T28); - T3r = FNMS(T1p, T27, T3q); - } - { - E T2e, T2f, T2i, T36; - T2e = ri[WS(rs, 17)]; - T2f = T2d * T2e; - T2i = ii[WS(rs, 17)]; - T36 = T2d * T2i; - T2j = FMA(T2h, T2i, T2f); - T37 = FNMS(T2h, T2e, T36); - } - { - E T2b, T2q, T4e, T4f; - T2b = T26 + T2a; - T2q = T2j + T2p; - T2r = T2b - T2q; - T2z = T2b + T2q; - T4e = T3p + T3r; - T4f = T37 + T39; - T4g = T4e - T4f; - T4z = T4e + T4f; - } - { - E T35, T3a, T3s, T3t; - T35 = T26 - T2a; - T3a = T37 - T39; - T3b = T35 - T3a; - T3T = T35 + T3a; - T3s = T3p - T3r; - T3t = T2j - T2p; - T3u = T3s + T3t; - T43 = T3s - T3t; - } - } - { - E T1F, T3i, T1Y, T32, T1N, T3k, T1U, T30; - { - E T1B, T1C, T1E, T3h; - T1B = ri[WS(rs, 8)]; - T1C = T1A * T1B; - T1E = ii[WS(rs, 8)]; - T3h = T1A * T1E; - T1F = FMA(T1D, T1E, T1C); - T3i = FNMS(T1D, T1B, T3h); - } - { - E T1V, T1W, T1X, T31; - T1V = ri[WS(rs, 3)]; - T1W = Tf * T1V; - T1X = ii[WS(rs, 3)]; - T31 = Tf * T1X; - T1Y = FMA(Th, T1X, T1W); - T32 = FNMS(Th, T1V, T31); - } - { - E T1I, T1J, T1M, T3j; - T1I = ri[WS(rs, 18)]; - T1J = T1H * T1I; - T1M = ii[WS(rs, 18)]; - T3j = T1H * T1M; - T1N = FMA(T1L, T1M, T1J); - T3k = FNMS(T1L, T1I, T3j); - } - { - E T1Q, T1R, T1T, T2Z; - T1Q = ri[WS(rs, 13)]; - T1R = T1P * T1Q; - T1T = ii[WS(rs, 13)]; - T2Z = T1P * T1T; - T1U = FMA(T1S, T1T, T1R); - T30 = FNMS(T1S, T1Q, T2Z); - } - { - E T1O, T1Z, T4b, T4c; - T1O = T1F + T1N; - T1Z = T1U + T1Y; - T20 = T1O - T1Z; - T2y = T1O + T1Z; - T4b = T3i + T3k; - T4c = T30 + T32; - T4d = T4b - T4c; - T4y = T4b + T4c; - } - { - E T2Y, T33, T3l, T3m; - T2Y = T1F - T1N; - T33 = T30 - T32; - T34 = T2Y - T33; - T3S = T2Y + T33; - T3l = T3i - T3k; - T3m = T1U - T1Y; - T3n = T3l + T3m; - T42 = T3l - T3m; - } - } - { - E T19, T3E, T1w, T2U, T1h, T3G, T1m, T2S; - { - E T14, T15, T18, T3D; - T14 = ri[WS(rs, 16)]; - T15 = T13 * T14; - T18 = ii[WS(rs, 16)]; - T3D = T13 * T18; - T19 = FMA(T17, T18, T15); - T3E = FNMS(T17, T14, T3D); - } - { - E T1r, T1s, T1v, T2T; - T1r = ri[WS(rs, 11)]; - T1s = T1q * T1r; - T1v = ii[WS(rs, 11)]; - T2T = T1q * T1v; - T1w = FMA(T1u, T1v, T1s); - T2U = FNMS(T1u, T1r, T2T); - } - { - E T1c, T1d, T1g, T3F; - T1c = ri[WS(rs, 6)]; - T1d = T1b * T1c; - T1g = ii[WS(rs, 6)]; - T3F = T1b * T1g; - T1h = FMA(T1f, T1g, T1d); - T3G = FNMS(T1f, T1c, T3F); - } - { - E T1j, T1k, T1l, T2R; - T1j = ri[WS(rs, 1)]; - T1k = T2 * T1j; - T1l = ii[WS(rs, 1)]; - T2R = T2 * T1l; - T1m = FMA(T5, T1l, T1k); - T2S = FNMS(T5, T1j, T2R); - } - { - E T1i, T1x, T4l, T4m; - T1i = T19 + T1h; - T1x = T1m + T1w; - T1y = T1i - T1x; - T2w = T1i + T1x; - T4l = T3E + T3G; - T4m = T2S + T2U; - T4n = T4l - T4m; - T4w = T4l + T4m; - } - { - E T2Q, T2V, T3H, T3I; - T2Q = T19 - T1h; - T2V = T2S - T2U; - T2W = T2Q - T2V; - T3Q = T2Q + T2V; - T3H = T3E - T3G; - T3I = T1m - T1w; - T3J = T3H + T3I; - T40 = T3H - T3I; - } - } - { - E T4p, T4r, TE, T2t, T48, T49, T4q, T4a; - { - E T4h, T4o, T1z, T2s; - T4h = T4d - T4g; - T4o = T4k - T4n; - T4p = FNMS(KP618033988, T4o, T4h); - T4r = FMA(KP618033988, T4h, T4o); - TE = Te - TD; - T1z = T11 + T1y; - T2s = T20 + T2r; - T2t = T1z + T2s; - T48 = FNMS(KP250000000, T2t, TE); - T49 = T1z - T2s; - } - ri[WS(rs, 10)] = TE + T2t; - T4q = FMA(KP559016994, T49, T48); - ri[WS(rs, 14)] = FNMS(KP951056516, T4r, T4q); - ri[WS(rs, 6)] = FMA(KP951056516, T4r, T4q); - T4a = FNMS(KP559016994, T49, T48); - ri[WS(rs, 2)] = FNMS(KP951056516, T4p, T4a); - ri[WS(rs, 18)] = FMA(KP951056516, T4p, T4a); - } - { - E T54, T56, T4V, T4Y, T4Z, T50, T55, T51; - { - E T52, T53, T4W, T4X; - T52 = T20 - T2r; - T53 = T11 - T1y; - T54 = FNMS(KP618033988, T53, T52); - T56 = FMA(KP618033988, T52, T53); - T4V = T4L - T4H; - T4W = T4k + T4n; - T4X = T4d + T4g; - T4Y = T4W + T4X; - T4Z = FNMS(KP250000000, T4Y, T4V); - T50 = T4W - T4X; - } - ii[WS(rs, 10)] = T4Y + T4V; - T55 = FMA(KP559016994, T50, T4Z); - ii[WS(rs, 6)] = FNMS(KP951056516, T56, T55); - ii[WS(rs, 14)] = FMA(KP951056516, T56, T55); - T51 = FNMS(KP559016994, T50, T4Z); - ii[WS(rs, 2)] = FMA(KP951056516, T54, T51); - ii[WS(rs, 18)] = FNMS(KP951056516, T54, T51); - } - { - E T4B, T4D, T2u, T2B, T4s, T4t, T4C, T4u; - { - E T4x, T4A, T2x, T2A; - T4x = T4v - T4w; - T4A = T4y - T4z; - T4B = FMA(KP618033988, T4A, T4x); - T4D = FNMS(KP618033988, T4x, T4A); - T2u = Te + TD; - T2x = T2v + T2w; - T2A = T2y + T2z; - T2B = T2x + T2A; - T4s = FNMS(KP250000000, T2B, T2u); - T4t = T2x - T2A; - } - ri[0] = T2u + T2B; - T4C = FNMS(KP559016994, T4t, T4s); - ri[WS(rs, 12)] = FNMS(KP951056516, T4D, T4C); - ri[WS(rs, 8)] = FMA(KP951056516, T4D, T4C); - T4u = FMA(KP559016994, T4t, T4s); - ri[WS(rs, 4)] = FNMS(KP951056516, T4B, T4u); - ri[WS(rs, 16)] = FMA(KP951056516, T4B, T4u); - } - { - E T4S, T4U, T4M, T4G, T4N, T4O, T4T, T4P; - { - E T4Q, T4R, T4E, T4F; - T4Q = T2v - T2w; - T4R = T2y - T2z; - T4S = FMA(KP618033988, T4R, T4Q); - T4U = FNMS(KP618033988, T4Q, T4R); - T4M = T4H + T4L; - T4E = T4v + T4w; - T4F = T4y + T4z; - T4G = T4E + T4F; - T4N = FNMS(KP250000000, T4G, T4M); - T4O = T4E - T4F; - } - ii[0] = T4G + T4M; - T4T = FNMS(KP559016994, T4O, T4N); - ii[WS(rs, 8)] = FNMS(KP951056516, T4U, T4T); - ii[WS(rs, 12)] = FMA(KP951056516, T4U, T4T); - T4P = FMA(KP559016994, T4O, T4N); - ii[WS(rs, 4)] = FMA(KP951056516, T4S, T4P); - ii[WS(rs, 16)] = FNMS(KP951056516, T4S, T4P); - } - { - E T3L, T3N, T2I, T3d, T3e, T3f, T3M, T3g; - { - E T3v, T3K, T2X, T3c; - T3v = T3n - T3u; - T3K = T3C - T3J; - T3L = FNMS(KP618033988, T3K, T3v); - T3N = FMA(KP618033988, T3v, T3K); - T2I = T2C - T2H; - T2X = T2P + T2W; - T3c = T34 + T3b; - T3d = T2X + T3c; - T3e = FNMS(KP250000000, T3d, T2I); - T3f = T2X - T3c; - } - ri[WS(rs, 15)] = T2I + T3d; - T3M = FMA(KP559016994, T3f, T3e); - ri[WS(rs, 11)] = FMA(KP951056516, T3N, T3M); - ri[WS(rs, 19)] = FNMS(KP951056516, T3N, T3M); - T3g = FNMS(KP559016994, T3f, T3e); - ri[WS(rs, 3)] = FMA(KP951056516, T3L, T3g); - ri[WS(rs, 7)] = FNMS(KP951056516, T3L, T3g); - } - { - E T5u, T5w, T5l, T5o, T5p, T5q, T5v, T5r; - { - E T5s, T5t, T5m, T5n; - T5s = T34 - T3b; - T5t = T2P - T2W; - T5u = FNMS(KP618033988, T5t, T5s); - T5w = FMA(KP618033988, T5s, T5t); - T5l = T58 + T57; - T5m = T3C + T3J; - T5n = T3n + T3u; - T5o = T5m + T5n; - T5p = FNMS(KP250000000, T5o, T5l); - T5q = T5m - T5n; - } - ii[WS(rs, 15)] = T5o + T5l; - T5v = FMA(KP559016994, T5q, T5p); - ii[WS(rs, 11)] = FNMS(KP951056516, T5w, T5v); - ii[WS(rs, 19)] = FMA(KP951056516, T5w, T5v); - T5r = FNMS(KP559016994, T5q, T5p); - ii[WS(rs, 3)] = FNMS(KP951056516, T5u, T5r); - ii[WS(rs, 7)] = FMA(KP951056516, T5u, T5r); - } - { - E T45, T47, T3O, T3V, T3W, T3X, T46, T3Y; - { - E T41, T44, T3R, T3U; - T41 = T3Z - T40; - T44 = T42 - T43; - T45 = FMA(KP618033988, T44, T41); - T47 = FNMS(KP618033988, T41, T44); - T3O = T2C + T2H; - T3R = T3P + T3Q; - T3U = T3S + T3T; - T3V = T3R + T3U; - T3W = FNMS(KP250000000, T3V, T3O); - T3X = T3R - T3U; - } - ri[WS(rs, 5)] = T3O + T3V; - T46 = FNMS(KP559016994, T3X, T3W); - ri[WS(rs, 13)] = FMA(KP951056516, T47, T46); - ri[WS(rs, 17)] = FNMS(KP951056516, T47, T46); - T3Y = FMA(KP559016994, T3X, T3W); - ri[WS(rs, 1)] = FMA(KP951056516, T45, T3Y); - ri[WS(rs, 9)] = FNMS(KP951056516, T45, T3Y); - } - { - E T5i, T5k, T59, T5c, T5d, T5e, T5j, T5f; - { - E T5g, T5h, T5a, T5b; - T5g = T3P - T3Q; - T5h = T3S - T3T; - T5i = FMA(KP618033988, T5h, T5g); - T5k = FNMS(KP618033988, T5g, T5h); - T59 = T57 - T58; - T5a = T3Z + T40; - T5b = T42 + T43; - T5c = T5a + T5b; - T5d = FNMS(KP250000000, T5c, T59); - T5e = T5a - T5b; - } - ii[WS(rs, 5)] = T5c + T59; - T5j = FNMS(KP559016994, T5e, T5d); - ii[WS(rs, 13)] = FNMS(KP951056516, T5k, T5j); - ii[WS(rs, 17)] = FMA(KP951056516, T5k, T5j); - T5f = FMA(KP559016994, T5e, T5d); - ii[WS(rs, 1)] = FNMS(KP951056516, T5i, T5f); - ii[WS(rs, 9)] = FMA(KP951056516, T5i, T5f); - } - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_CEXP, 0, 1 }, - { TW_CEXP, 0, 3 }, - { TW_CEXP, 0, 9 }, - { TW_CEXP, 0, 19 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 20, "t2_20", twinstr, &GENUS, { 136, 58, 140, 0 }, 0, 0, 0 }; - -void X(codelet_t2_20) (planner *p) { - X(kdft_dit_register) (p, t2_20, &desc); -} -#else - -/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 20 -name t2_20 -include dft/scalar/t.h */ - -/* - * This function contains 276 FP additions, 164 FP multiplications, - * (or, 204 additions, 92 multiplications, 72 fused multiply/add), - * 123 stack variables, 4 constants, and 80 memory accesses - */ -#include "dft/scalar/t.h" - -static void t2_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - DK(KP587785252, +0.587785252292473129168705954639072768597652438); - DK(KP951056516, +0.951056516295153572116439333379382143405698634); - DK(KP250000000, +0.250000000000000000000000000000000000000000000); - DK(KP559016994, +0.559016994374947424102293417182819058860154590); - { - INT m; - for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(40, rs)) { - E T2, T5, Tg, Ti, Tk, To, T1h, T1f, T6, T3, T8, T14, T1Q, Tc, T1O; - E T1v, T18, T1t, T1n, T24, T1j, T22, Tq, Tu, T1E, T1G, Tx, Ty, Tz, TJ; - E T1Z, TB, T1X, T1A, TZ, TL, T1y, TX; - { - E T7, T16, Ta, T13, T4, T17, Tb, T12; - { - E Th, Tn, Tj, Tm; - T2 = W[0]; - T5 = W[1]; - Tg = W[2]; - Ti = W[3]; - Th = T2 * Tg; - Tn = T5 * Tg; - Tj = T5 * Ti; - Tm = T2 * Ti; - Tk = Th - Tj; - To = Tm + Tn; - T1h = Tm - Tn; - T1f = Th + Tj; - T6 = W[5]; - T7 = T5 * T6; - T16 = Tg * T6; - Ta = T2 * T6; - T13 = Ti * T6; - T3 = W[4]; - T4 = T2 * T3; - T17 = Ti * T3; - Tb = T5 * T3; - T12 = Tg * T3; - } - T8 = T4 - T7; - T14 = T12 + T13; - T1Q = T16 + T17; - Tc = Ta + Tb; - T1O = T12 - T13; - T1v = Ta - Tb; - T18 = T16 - T17; - T1t = T4 + T7; - { - E T1l, T1m, T1g, T1i; - T1l = T1f * T6; - T1m = T1h * T3; - T1n = T1l + T1m; - T24 = T1l - T1m; - T1g = T1f * T3; - T1i = T1h * T6; - T1j = T1g - T1i; - T22 = T1g + T1i; - { - E Tl, Tp, Ts, Tt; - Tl = Tk * T3; - Tp = To * T6; - Tq = Tl + Tp; - Ts = Tk * T6; - Tt = To * T3; - Tu = Ts - Tt; - T1E = Tl - Tp; - T1G = Ts + Tt; - Tx = W[6]; - Ty = W[7]; - Tz = FMA(Tk, Tx, To * Ty); - TJ = FMA(Tq, Tx, Tu * Ty); - T1Z = FNMS(T1h, Tx, T1f * Ty); - TB = FNMS(To, Tx, Tk * Ty); - T1X = FMA(T1f, Tx, T1h * Ty); - T1A = FNMS(T5, Tx, T2 * Ty); - TZ = FNMS(Ti, Tx, Tg * Ty); - TL = FNMS(Tu, Tx, Tq * Ty); - T1y = FMA(T2, Tx, T5 * Ty); - TX = FMA(Tg, Tx, Ti * Ty); - } - } - } - { - E TF, T2b, T4A, T4J, T2K, T3r, T4a, T4m, T1N, T28, T29, T3C, T3F, T4o, T3X; - E T3Y, T44, T2f, T2g, T2h, T2n, T2s, T4L, T3g, T3h, T4w, T3n, T3o, T3p, T30; - E T35, T36, TW, T1r, T1s, T3J, T3M, T4n, T3U, T3V, T43, T2c, T2d, T2e, T2y; - E T2D, T4K, T3d, T3e, T4v, T3k, T3l, T3m, T2P, T2U, T2V; - { - E T1, T48, Te, T47, Tw, T2H, TD, T2I, T9, Td; - T1 = ri[0]; - T48 = ii[0]; - T9 = ri[WS(rs, 10)]; - Td = ii[WS(rs, 10)]; - Te = FMA(T8, T9, Tc * Td); - T47 = FNMS(Tc, T9, T8 * Td); - { - E Tr, Tv, TA, TC; - Tr = ri[WS(rs, 5)]; - Tv = ii[WS(rs, 5)]; - Tw = FMA(Tq, Tr, Tu * Tv); - T2H = FNMS(Tu, Tr, Tq * Tv); - TA = ri[WS(rs, 15)]; - TC = ii[WS(rs, 15)]; - TD = FMA(Tz, TA, TB * TC); - T2I = FNMS(TB, TA, Tz * TC); - } - { - E Tf, TE, T4y, T4z; - Tf = T1 + Te; - TE = Tw + TD; - TF = Tf - TE; - T2b = Tf + TE; - T4y = T48 - T47; - T4z = Tw - TD; - T4A = T4y - T4z; - T4J = T4z + T4y; - } - { - E T2G, T2J, T46, T49; - T2G = T1 - Te; - T2J = T2H - T2I; - T2K = T2G - T2J; - T3r = T2G + T2J; - T46 = T2H + T2I; - T49 = T47 + T48; - T4a = T46 + T49; - T4m = T49 - T46; - } - } - { - E T1D, T3A, T2l, T2W, T27, T3E, T2r, T34, T1M, T3B, T2m, T2Z, T1W, T3D, T2q; - E T31; - { - E T1x, T2j, T1C, T2k; - { - E T1u, T1w, T1z, T1B; - T1u = ri[WS(rs, 8)]; - T1w = ii[WS(rs, 8)]; - T1x = FMA(T1t, T1u, T1v * T1w); - T2j = FNMS(T1v, T1u, T1t * T1w); - T1z = ri[WS(rs, 18)]; - T1B = ii[WS(rs, 18)]; - T1C = FMA(T1y, T1z, T1A * T1B); - T2k = FNMS(T1A, T1z, T1y * T1B); - } - T1D = T1x + T1C; - T3A = T2j + T2k; - T2l = T2j - T2k; - T2W = T1x - T1C; - } - { - E T21, T32, T26, T33; - { - E T1Y, T20, T23, T25; - T1Y = ri[WS(rs, 17)]; - T20 = ii[WS(rs, 17)]; - T21 = FMA(T1X, T1Y, T1Z * T20); - T32 = FNMS(T1Z, T1Y, T1X * T20); - T23 = ri[WS(rs, 7)]; - T25 = ii[WS(rs, 7)]; - T26 = FMA(T22, T23, T24 * T25); - T33 = FNMS(T24, T23, T22 * T25); - } - T27 = T21 + T26; - T3E = T32 + T33; - T2r = T21 - T26; - T34 = T32 - T33; - } - { - E T1I, T2X, T1L, T2Y; - { - E T1F, T1H, T1J, T1K; - T1F = ri[WS(rs, 13)]; - T1H = ii[WS(rs, 13)]; - T1I = FMA(T1E, T1F, T1G * T1H); - T2X = FNMS(T1G, T1F, T1E * T1H); - T1J = ri[WS(rs, 3)]; - T1K = ii[WS(rs, 3)]; - T1L = FMA(Tg, T1J, Ti * T1K); - T2Y = FNMS(Ti, T1J, Tg * T1K); - } - T1M = T1I + T1L; - T3B = T2X + T2Y; - T2m = T1I - T1L; - T2Z = T2X - T2Y; - } - { - E T1S, T2o, T1V, T2p; - { - E T1P, T1R, T1T, T1U; - T1P = ri[WS(rs, 12)]; - T1R = ii[WS(rs, 12)]; - T1S = FMA(T1O, T1P, T1Q * T1R); - T2o = FNMS(T1Q, T1P, T1O * T1R); - T1T = ri[WS(rs, 2)]; - T1U = ii[WS(rs, 2)]; - T1V = FMA(T1f, T1T, T1h * T1U); - T2p = FNMS(T1h, T1T, T1f * T1U); - } - T1W = T1S + T1V; - T3D = T2o + T2p; - T2q = T2o - T2p; - T31 = T1S - T1V; - } - T1N = T1D - T1M; - T28 = T1W - T27; - T29 = T1N + T28; - T3C = T3A - T3B; - T3F = T3D - T3E; - T4o = T3C + T3F; - T3X = T3A + T3B; - T3Y = T3D + T3E; - T44 = T3X + T3Y; - T2f = T1D + T1M; - T2g = T1W + T27; - T2h = T2f + T2g; - T2n = T2l + T2m; - T2s = T2q + T2r; - T4L = T2n + T2s; - T3g = T2l - T2m; - T3h = T2q - T2r; - T4w = T3g + T3h; - T3n = T2W + T2Z; - T3o = T31 + T34; - T3p = T3n + T3o; - T30 = T2W - T2Z; - T35 = T31 - T34; - T36 = T30 + T35; - } - { - E TO, T3H, T2w, T2L, T1q, T3L, T2C, T2T, TV, T3I, T2x, T2O, T1b, T3K, T2B; - E T2Q; - { - E TI, T2u, TN, T2v; - { - E TG, TH, TK, TM; - TG = ri[WS(rs, 4)]; - TH = ii[WS(rs, 4)]; - TI = FMA(Tk, TG, To * TH); - T2u = FNMS(To, TG, Tk * TH); - TK = ri[WS(rs, 14)]; - TM = ii[WS(rs, 14)]; - TN = FMA(TJ, TK, TL * TM); - T2v = FNMS(TL, TK, TJ * TM); - } - TO = TI + TN; - T3H = T2u + T2v; - T2w = T2u - T2v; - T2L = TI - TN; - } - { - E T1e, T2R, T1p, T2S; - { - E T1c, T1d, T1k, T1o; - T1c = ri[WS(rs, 1)]; - T1d = ii[WS(rs, 1)]; - T1e = FMA(T2, T1c, T5 * T1d); - T2R = FNMS(T5, T1c, T2 * T1d); - T1k = ri[WS(rs, 11)]; - T1o = ii[WS(rs, 11)]; - T1p = FMA(T1j, T1k, T1n * T1o); - T2S = FNMS(T1n, T1k, T1j * T1o); - } - T1q = T1e + T1p; - T3L = T2R + T2S; - T2C = T1e - T1p; - T2T = T2R - T2S; - } - { - E TR, T2M, TU, T2N; - { - E TP, TQ, TS, TT; - TP = ri[WS(rs, 9)]; - TQ = ii[WS(rs, 9)]; - TR = FMA(T3, TP, T6 * TQ); - T2M = FNMS(T6, TP, T3 * TQ); - TS = ri[WS(rs, 19)]; - TT = ii[WS(rs, 19)]; - TU = FMA(Tx, TS, Ty * TT); - T2N = FNMS(Ty, TS, Tx * TT); - } - TV = TR + TU; - T3I = T2M + T2N; - T2x = TR - TU; - T2O = T2M - T2N; - } - { - E T11, T2z, T1a, T2A; - { - E TY, T10, T15, T19; - TY = ri[WS(rs, 16)]; - T10 = ii[WS(rs, 16)]; - T11 = FMA(TX, TY, TZ * T10); - T2z = FNMS(TZ, TY, TX * T10); - T15 = ri[WS(rs, 6)]; - T19 = ii[WS(rs, 6)]; - T1a = FMA(T14, T15, T18 * T19); - T2A = FNMS(T18, T15, T14 * T19); - } - T1b = T11 + T1a; - T3K = T2z + T2A; - T2B = T2z - T2A; - T2Q = T11 - T1a; - } - TW = TO - TV; - T1r = T1b - T1q; - T1s = TW + T1r; - T3J = T3H - T3I; - T3M = T3K - T3L; - T4n = T3J + T3M; - T3U = T3H + T3I; - T3V = T3K + T3L; - T43 = T3U + T3V; - T2c = TO + TV; - T2d = T1b + T1q; - T2e = T2c + T2d; - T2y = T2w + T2x; - T2D = T2B + T2C; - T4K = T2y + T2D; - T3d = T2w - T2x; - T3e = T2B - T2C; - T4v = T3d + T3e; - T3k = T2L + T2O; - T3l = T2Q + T2T; - T3m = T3k + T3l; - T2P = T2L - T2O; - T2U = T2Q - T2T; - T2V = T2P + T2U; - } - { - E T3y, T2a, T3x, T3O, T3Q, T3G, T3N, T3P, T3z; - T3y = KP559016994 * (T1s - T29); - T2a = T1s + T29; - T3x = FNMS(KP250000000, T2a, TF); - T3G = T3C - T3F; - T3N = T3J - T3M; - T3O = FNMS(KP587785252, T3N, KP951056516 * T3G); - T3Q = FMA(KP951056516, T3N, KP587785252 * T3G); - ri[WS(rs, 10)] = TF + T2a; - T3P = T3y + T3x; - ri[WS(rs, 14)] = T3P - T3Q; - ri[WS(rs, 6)] = T3P + T3Q; - T3z = T3x - T3y; - ri[WS(rs, 2)] = T3z - T3O; - ri[WS(rs, 18)] = T3z + T3O; - } - { - E T4r, T4p, T4q, T4l, T4u, T4j, T4k, T4t, T4s; - T4r = KP559016994 * (T4n - T4o); - T4p = T4n + T4o; - T4q = FNMS(KP250000000, T4p, T4m); - T4j = T1N - T28; - T4k = TW - T1r; - T4l = FNMS(KP587785252, T4k, KP951056516 * T4j); - T4u = FMA(KP951056516, T4k, KP587785252 * T4j); - ii[WS(rs, 10)] = T4p + T4m; - T4t = T4r + T4q; - ii[WS(rs, 6)] = T4t - T4u; - ii[WS(rs, 14)] = T4u + T4t; - T4s = T4q - T4r; - ii[WS(rs, 2)] = T4l + T4s; - ii[WS(rs, 18)] = T4s - T4l; - } - { - E T3R, T2i, T3S, T40, T42, T3W, T3Z, T41, T3T; - T3R = KP559016994 * (T2e - T2h); - T2i = T2e + T2h; - T3S = FNMS(KP250000000, T2i, T2b); - T3W = T3U - T3V; - T3Z = T3X - T3Y; - T40 = FMA(KP951056516, T3W, KP587785252 * T3Z); - T42 = FNMS(KP587785252, T3W, KP951056516 * T3Z); - ri[0] = T2b + T2i; - T41 = T3S - T3R; - ri[WS(rs, 12)] = T41 - T42; - ri[WS(rs, 8)] = T41 + T42; - T3T = T3R + T3S; - ri[WS(rs, 4)] = T3T - T40; - ri[WS(rs, 16)] = T3T + T40; - } - { - E T4e, T45, T4f, T4d, T4i, T4b, T4c, T4h, T4g; - T4e = KP559016994 * (T43 - T44); - T45 = T43 + T44; - T4f = FNMS(KP250000000, T45, T4a); - T4b = T2c - T2d; - T4c = T2f - T2g; - T4d = FMA(KP951056516, T4b, KP587785252 * T4c); - T4i = FNMS(KP587785252, T4b, KP951056516 * T4c); - ii[0] = T45 + T4a; - T4h = T4f - T4e; - ii[WS(rs, 8)] = T4h - T4i; - ii[WS(rs, 12)] = T4i + T4h; - T4g = T4e + T4f; - ii[WS(rs, 4)] = T4d + T4g; - ii[WS(rs, 16)] = T4g - T4d; - } - { - E T39, T37, T38, T2F, T3b, T2t, T2E, T3c, T3a; - T39 = KP559016994 * (T2V - T36); - T37 = T2V + T36; - T38 = FNMS(KP250000000, T37, T2K); - T2t = T2n - T2s; - T2E = T2y - T2D; - T2F = FNMS(KP587785252, T2E, KP951056516 * T2t); - T3b = FMA(KP951056516, T2E, KP587785252 * T2t); - ri[WS(rs, 15)] = T2K + T37; - T3c = T39 + T38; - ri[WS(rs, 11)] = T3b + T3c; - ri[WS(rs, 19)] = T3c - T3b; - T3a = T38 - T39; - ri[WS(rs, 3)] = T2F + T3a; - ri[WS(rs, 7)] = T3a - T2F; - } - { - E T4O, T4M, T4N, T4S, T4U, T4Q, T4R, T4T, T4P; - T4O = KP559016994 * (T4K - T4L); - T4M = T4K + T4L; - T4N = FNMS(KP250000000, T4M, T4J); - T4Q = T30 - T35; - T4R = T2P - T2U; - T4S = FNMS(KP587785252, T4R, KP951056516 * T4Q); - T4U = FMA(KP951056516, T4R, KP587785252 * T4Q); - ii[WS(rs, 15)] = T4M + T4J; - T4T = T4O + T4N; - ii[WS(rs, 11)] = T4T - T4U; - ii[WS(rs, 19)] = T4U + T4T; - T4P = T4N - T4O; - ii[WS(rs, 3)] = T4P - T4S; - ii[WS(rs, 7)] = T4S + T4P; - } - { - E T3q, T3s, T3t, T3j, T3v, T3f, T3i, T3w, T3u; - T3q = KP559016994 * (T3m - T3p); - T3s = T3m + T3p; - T3t = FNMS(KP250000000, T3s, T3r); - T3f = T3d - T3e; - T3i = T3g - T3h; - T3j = FMA(KP951056516, T3f, KP587785252 * T3i); - T3v = FNMS(KP587785252, T3f, KP951056516 * T3i); - ri[WS(rs, 5)] = T3r + T3s; - T3w = T3t - T3q; - ri[WS(rs, 13)] = T3v + T3w; - ri[WS(rs, 17)] = T3w - T3v; - T3u = T3q + T3t; - ri[WS(rs, 1)] = T3j + T3u; - ri[WS(rs, 9)] = T3u - T3j; - } - { - E T4x, T4B, T4C, T4G, T4I, T4E, T4F, T4H, T4D; - T4x = KP559016994 * (T4v - T4w); - T4B = T4v + T4w; - T4C = FNMS(KP250000000, T4B, T4A); - T4E = T3k - T3l; - T4F = T3n - T3o; - T4G = FMA(KP951056516, T4E, KP587785252 * T4F); - T4I = FNMS(KP587785252, T4E, KP951056516 * T4F); - ii[WS(rs, 5)] = T4B + T4A; - T4H = T4C - T4x; - ii[WS(rs, 13)] = T4H - T4I; - ii[WS(rs, 17)] = T4I + T4H; - T4D = T4x + T4C; - ii[WS(rs, 1)] = T4D - T4G; - ii[WS(rs, 9)] = T4G + T4D; - } - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_CEXP, 0, 1 }, - { TW_CEXP, 0, 3 }, - { TW_CEXP, 0, 9 }, - { TW_CEXP, 0, 19 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 20, "t2_20", twinstr, &GENUS, { 204, 92, 72, 0 }, 0, 0, 0 }; - -void X(codelet_t2_20) (planner *p) { - X(kdft_dit_register) (p, t2_20, &desc); -} -#endif diff --git a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t2_32.c b/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t2_32.c deleted file mode 100644 index bacad329..00000000 --- a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t2_32.c +++ /dev/null @@ -1,1893 +0,0 @@ -/* - * Copyright (c) 2003, 2007-14 Matteo Frigo - * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - */ - -/* This file was automatically generated --- DO NOT EDIT */ -/* Generated on Tue Sep 14 10:44:32 EDT 2021 */ - -#include "dft/codelet-dft.h" - -#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA) - -/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 32 -name t2_32 -include dft/scalar/t.h */ - -/* - * This function contains 488 FP additions, 350 FP multiplications, - * (or, 236 additions, 98 multiplications, 252 fused multiply/add), - * 164 stack variables, 7 constants, and 128 memory accesses - */ -#include "dft/scalar/t.h" - -static void t2_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - DK(KP980785280, +0.980785280403230449126182236134239036973933731); - DK(KP831469612, +0.831469612302545237078788377617905756738560812); - DK(KP198912367, +0.198912367379658006911597622644676228597850501); - DK(KP668178637, +0.668178637919298919997757686523080761552472251); - DK(KP923879532, +0.923879532511286756128183189396788286822416626); - DK(KP414213562, +0.414213562373095048801688724209698078569671875); - DK(KP707106781, +0.707106781186547524400844362104849039284835938); - { - INT m; - for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) { - E T2, T8, T3, T6, Te, Ti, T5, T7, TJ, Tb, TM, Tc, Ts, T23, T1w; - E T19, TA, TE, T1s, T1N, T1o, T1C, T1F, T1K, T15, T11, T2F, T31, T2J, T34; - E T3f, T3z, T3j, T3C, Tw, T3M, T3Q, T1z, T2s, T2w, T1d, T3n, T3r, T26, T2T; - E T2X, Th, TR, TP, Td, Tj, TW, Tn, TS, T1U, T2b, T29, T1R, T1V, T2g; - E T1Z, T2c; - { - E Tz, T1n, T10, TD, T1r, T14, T9, T1Q, Tv, T1c; - { - E T4, T18, Ta, Tr; - T2 = W[0]; - T8 = W[4]; - T3 = W[2]; - T6 = W[3]; - T4 = T2 * T3; - T18 = T3 * T8; - Ta = T2 * T6; - Tr = T2 * T8; - Te = W[6]; - Tz = T3 * Te; - T1n = T8 * Te; - T10 = T2 * Te; - Ti = W[7]; - TD = T3 * Ti; - T1r = T8 * Ti; - T14 = T2 * Ti; - T5 = W[1]; - T7 = FMA(T5, T6, T4); - TJ = FNMS(T5, T6, T4); - T9 = T7 * T8; - T1Q = TJ * T8; - Tb = FNMS(T5, T3, Ta); - TM = FMA(T5, T3, Ta); - Tc = W[5]; - Tv = T2 * Tc; - T1c = T3 * Tc; - Ts = FMA(T5, Tc, Tr); - T23 = FMA(T6, Tc, T18); - T1w = FNMS(T5, Tc, Tr); - T19 = FNMS(T6, Tc, T18); - } - TA = FMA(T6, Ti, Tz); - TE = FNMS(T6, Te, TD); - T1s = FNMS(Tc, Te, T1r); - T1N = FMA(T6, Te, TD); - T1o = FMA(Tc, Ti, T1n); - T1C = FMA(T5, Ti, T10); - T1F = FNMS(T5, Te, T14); - T1K = FNMS(T6, Ti, Tz); - T15 = FMA(T5, Te, T14); - T11 = FNMS(T5, Ti, T10); - { - E T2E, T2I, T2S, T2W; - T2E = T7 * Te; - T2F = FMA(Tb, Ti, T2E); - T31 = FNMS(Tb, Ti, T2E); - T2I = T7 * Ti; - T2J = FNMS(Tb, Te, T2I); - T34 = FMA(Tb, Te, T2I); - { - E T3e, T3i, T3L, T3P; - T3e = TJ * Te; - T3f = FNMS(TM, Ti, T3e); - T3z = FMA(TM, Ti, T3e); - T3i = TJ * Ti; - T3j = FMA(TM, Te, T3i); - T3C = FNMS(TM, Te, T3i); - T3L = Ts * Te; - T3P = Ts * Ti; - Tw = FNMS(T5, T8, Tv); - T3M = FMA(Tw, Ti, T3L); - T3Q = FNMS(Tw, Te, T3P); - } - { - E T2r, T2v, T3m, T3q; - T2r = T1w * Te; - T2v = T1w * Ti; - T1z = FMA(T5, T8, Tv); - T2s = FMA(T1z, Ti, T2r); - T2w = FNMS(T1z, Te, T2v); - T3m = T19 * Te; - T3q = T19 * Ti; - T1d = FMA(T6, T8, T1c); - T3n = FMA(T1d, Ti, T3m); - T3r = FNMS(T1d, Te, T3q); - } - T2S = T23 * Te; - T2W = T23 * Ti; - T26 = FNMS(T6, T8, T1c); - T2T = FMA(T26, Ti, T2S); - T2X = FNMS(T26, Te, T2W); - { - E TQ, TV, Tf, Tm, Tg; - Tg = T7 * Tc; - Th = FMA(Tb, T8, Tg); - TR = FNMS(Tb, T8, Tg); - TP = FMA(Tb, Tc, T9); - TQ = TP * Te; - TV = TP * Ti; - Td = FNMS(Tb, Tc, T9); - Tf = Td * Te; - Tm = Td * Ti; - Tj = FMA(Th, Ti, Tf); - TW = FNMS(TR, Te, TV); - Tn = FNMS(Th, Te, Tm); - TS = FMA(TR, Ti, TQ); - } - { - E T2a, T2f, T1S, T1Y, T1T; - T1T = TJ * Tc; - T1U = FMA(TM, T8, T1T); - T2b = FNMS(TM, T8, T1T); - T29 = FMA(TM, Tc, T1Q); - T2a = T29 * Te; - T2f = T29 * Ti; - T1R = FNMS(TM, Tc, T1Q); - T1S = T1R * Te; - T1Y = T1R * Ti; - T1V = FMA(T1U, Ti, T1S); - T2g = FNMS(T2b, Te, T2f); - T1Z = FNMS(T1U, Te, T1Y); - T2c = FMA(T2b, Ti, T2a); - } - } - } - { - E Tq, T46, T8H, T97, TH, T98, T4b, T8D, TZ, T7f, T4j, T6t, T1g, T7g, T4q; - E T6u, T1v, T1I, T7m, T7j, T7k, T7l, T4z, T6x, T4G, T6y, T22, T2j, T7o, T7p; - E T7q, T7r, T4O, T6A, T4V, T6B, T3G, T7L, T7I, T8n, T5E, T6P, T61, T6M, T2N; - E T7A, T7x, T8i, T55, T6I, T5s, T6F, T43, T7J, T7O, T8o, T5L, T62, T5S, T63; - E T3c, T7y, T7D, T8j, T5c, T5t, T5j, T5u; - { - E T1, T8G, Tk, Tl, To, T8E, Tp, T8F; - T1 = ri[0]; - T8G = ii[0]; - Tk = ri[WS(rs, 16)]; - Tl = Tj * Tk; - To = ii[WS(rs, 16)]; - T8E = Tj * To; - Tp = FMA(Tn, To, Tl); - Tq = T1 + Tp; - T46 = T1 - Tp; - T8F = FNMS(Tn, Tk, T8E); - T8H = T8F + T8G; - T97 = T8G - T8F; - } - { - E Tt, Tu, Tx, T47, TB, TC, TF, T49; - Tt = ri[WS(rs, 8)]; - Tu = Ts * Tt; - Tx = ii[WS(rs, 8)]; - T47 = Ts * Tx; - TB = ri[WS(rs, 24)]; - TC = TA * TB; - TF = ii[WS(rs, 24)]; - T49 = TA * TF; - { - E Ty, TG, T48, T4a; - Ty = FMA(Tw, Tx, Tu); - TG = FMA(TE, TF, TC); - TH = Ty + TG; - T98 = Ty - TG; - T48 = FNMS(Tw, Tt, T47); - T4a = FNMS(TE, TB, T49); - T4b = T48 - T4a; - T8D = T48 + T4a; - } - } - { - E TO, T4f, TY, T4h, T4d, T4i; - { - E TK, TL, TN, T4e; - TK = ri[WS(rs, 4)]; - TL = TJ * TK; - TN = ii[WS(rs, 4)]; - T4e = TJ * TN; - TO = FMA(TM, TN, TL); - T4f = FNMS(TM, TK, T4e); - } - { - E TT, TU, TX, T4g; - TT = ri[WS(rs, 20)]; - TU = TS * TT; - TX = ii[WS(rs, 20)]; - T4g = TS * TX; - TY = FMA(TW, TX, TU); - T4h = FNMS(TW, TT, T4g); - } - TZ = TO + TY; - T7f = T4f + T4h; - T4d = TO - TY; - T4i = T4f - T4h; - T4j = T4d + T4i; - T6t = T4i - T4d; - } - { - E T17, T4m, T1f, T4o, T4k, T4p; - { - E T12, T13, T16, T4l; - T12 = ri[WS(rs, 28)]; - T13 = T11 * T12; - T16 = ii[WS(rs, 28)]; - T4l = T11 * T16; - T17 = FMA(T15, T16, T13); - T4m = FNMS(T15, T12, T4l); - } - { - E T1a, T1b, T1e, T4n; - T1a = ri[WS(rs, 12)]; - T1b = T19 * T1a; - T1e = ii[WS(rs, 12)]; - T4n = T19 * T1e; - T1f = FMA(T1d, T1e, T1b); - T4o = FNMS(T1d, T1a, T4n); - } - T1g = T17 + T1f; - T7g = T4m + T4o; - T4k = T17 - T1f; - T4p = T4m - T4o; - T4q = T4k - T4p; - T6u = T4k + T4p; - } - { - E T1m, T4u, T1H, T4E, T1u, T4w, T1B, T4C; - { - E T1j, T1k, T1l, T4t; - T1j = ri[WS(rs, 2)]; - T1k = T7 * T1j; - T1l = ii[WS(rs, 2)]; - T4t = T7 * T1l; - T1m = FMA(Tb, T1l, T1k); - T4u = FNMS(Tb, T1j, T4t); - } - { - E T1D, T1E, T1G, T4D; - T1D = ri[WS(rs, 26)]; - T1E = T1C * T1D; - T1G = ii[WS(rs, 26)]; - T4D = T1C * T1G; - T1H = FMA(T1F, T1G, T1E); - T4E = FNMS(T1F, T1D, T4D); - } - { - E T1p, T1q, T1t, T4v; - T1p = ri[WS(rs, 18)]; - T1q = T1o * T1p; - T1t = ii[WS(rs, 18)]; - T4v = T1o * T1t; - T1u = FMA(T1s, T1t, T1q); - T4w = FNMS(T1s, T1p, T4v); - } - { - E T1x, T1y, T1A, T4B; - T1x = ri[WS(rs, 10)]; - T1y = T1w * T1x; - T1A = ii[WS(rs, 10)]; - T4B = T1w * T1A; - T1B = FMA(T1z, T1A, T1y); - T4C = FNMS(T1z, T1x, T4B); - } - T1v = T1m + T1u; - T1I = T1B + T1H; - T7m = T1v - T1I; - T7j = T4u + T4w; - T7k = T4C + T4E; - T7l = T7j - T7k; - { - E T4x, T4y, T4A, T4F; - T4x = T4u - T4w; - T4y = T1B - T1H; - T4z = T4x - T4y; - T6x = T4x + T4y; - T4A = T1m - T1u; - T4F = T4C - T4E; - T4G = T4A + T4F; - T6y = T4A - T4F; - } - } - { - E T1P, T4J, T2i, T4T, T21, T4L, T28, T4R; - { - E T1L, T1M, T1O, T4I; - T1L = ri[WS(rs, 30)]; - T1M = T1K * T1L; - T1O = ii[WS(rs, 30)]; - T4I = T1K * T1O; - T1P = FMA(T1N, T1O, T1M); - T4J = FNMS(T1N, T1L, T4I); - } - { - E T2d, T2e, T2h, T4S; - T2d = ri[WS(rs, 22)]; - T2e = T2c * T2d; - T2h = ii[WS(rs, 22)]; - T4S = T2c * T2h; - T2i = FMA(T2g, T2h, T2e); - T4T = FNMS(T2g, T2d, T4S); - } - { - E T1W, T1X, T20, T4K; - T1W = ri[WS(rs, 14)]; - T1X = T1V * T1W; - T20 = ii[WS(rs, 14)]; - T4K = T1V * T20; - T21 = FMA(T1Z, T20, T1X); - T4L = FNMS(T1Z, T1W, T4K); - } - { - E T24, T25, T27, T4Q; - T24 = ri[WS(rs, 6)]; - T25 = T23 * T24; - T27 = ii[WS(rs, 6)]; - T4Q = T23 * T27; - T28 = FMA(T26, T27, T25); - T4R = FNMS(T26, T24, T4Q); - } - T22 = T1P + T21; - T2j = T28 + T2i; - T7o = T22 - T2j; - T7p = T4J + T4L; - T7q = T4R + T4T; - T7r = T7p - T7q; - { - E T4M, T4N, T4P, T4U; - T4M = T4J - T4L; - T4N = T28 - T2i; - T4O = T4M - T4N; - T6A = T4M + T4N; - T4P = T1P - T21; - T4U = T4R - T4T; - T4V = T4P + T4U; - T6B = T4P - T4U; - } - } - { - E T3l, T5z, T3E, T5Z, T3t, T5B, T3y, T5X; - { - E T3g, T3h, T3k, T5y; - T3g = ri[WS(rs, 31)]; - T3h = T3f * T3g; - T3k = ii[WS(rs, 31)]; - T5y = T3f * T3k; - T3l = FMA(T3j, T3k, T3h); - T5z = FNMS(T3j, T3g, T5y); - } - { - E T3A, T3B, T3D, T5Y; - T3A = ri[WS(rs, 23)]; - T3B = T3z * T3A; - T3D = ii[WS(rs, 23)]; - T5Y = T3z * T3D; - T3E = FMA(T3C, T3D, T3B); - T5Z = FNMS(T3C, T3A, T5Y); - } - { - E T3o, T3p, T3s, T5A; - T3o = ri[WS(rs, 15)]; - T3p = T3n * T3o; - T3s = ii[WS(rs, 15)]; - T5A = T3n * T3s; - T3t = FMA(T3r, T3s, T3p); - T5B = FNMS(T3r, T3o, T5A); - } - { - E T3v, T3w, T3x, T5W; - T3v = ri[WS(rs, 7)]; - T3w = TP * T3v; - T3x = ii[WS(rs, 7)]; - T5W = TP * T3x; - T3y = FMA(TR, T3x, T3w); - T5X = FNMS(TR, T3v, T5W); - } - { - E T3u, T3F, T7G, T7H; - T3u = T3l + T3t; - T3F = T3y + T3E; - T3G = T3u + T3F; - T7L = T3u - T3F; - T7G = T5z + T5B; - T7H = T5X + T5Z; - T7I = T7G - T7H; - T8n = T7G + T7H; - } - { - E T5C, T5D, T5V, T60; - T5C = T5z - T5B; - T5D = T3y - T3E; - T5E = T5C - T5D; - T6P = T5C + T5D; - T5V = T3l - T3t; - T60 = T5X - T5Z; - T61 = T5V + T60; - T6M = T5V - T60; - } - } - { - E T2q, T50, T2L, T5q, T2y, T52, T2D, T5o; - { - E T2n, T2o, T2p, T4Z; - T2n = ri[WS(rs, 1)]; - T2o = T2 * T2n; - T2p = ii[WS(rs, 1)]; - T4Z = T2 * T2p; - T2q = FMA(T5, T2p, T2o); - T50 = FNMS(T5, T2n, T4Z); - } - { - E T2G, T2H, T2K, T5p; - T2G = ri[WS(rs, 25)]; - T2H = T2F * T2G; - T2K = ii[WS(rs, 25)]; - T5p = T2F * T2K; - T2L = FMA(T2J, T2K, T2H); - T5q = FNMS(T2J, T2G, T5p); - } - { - E T2t, T2u, T2x, T51; - T2t = ri[WS(rs, 17)]; - T2u = T2s * T2t; - T2x = ii[WS(rs, 17)]; - T51 = T2s * T2x; - T2y = FMA(T2w, T2x, T2u); - T52 = FNMS(T2w, T2t, T51); - } - { - E T2A, T2B, T2C, T5n; - T2A = ri[WS(rs, 9)]; - T2B = T8 * T2A; - T2C = ii[WS(rs, 9)]; - T5n = T8 * T2C; - T2D = FMA(Tc, T2C, T2B); - T5o = FNMS(Tc, T2A, T5n); - } - { - E T2z, T2M, T7v, T7w; - T2z = T2q + T2y; - T2M = T2D + T2L; - T2N = T2z + T2M; - T7A = T2z - T2M; - T7v = T50 + T52; - T7w = T5o + T5q; - T7x = T7v - T7w; - T8i = T7v + T7w; - } - { - E T53, T54, T5m, T5r; - T53 = T50 - T52; - T54 = T2D - T2L; - T55 = T53 - T54; - T6I = T53 + T54; - T5m = T2q - T2y; - T5r = T5o - T5q; - T5s = T5m + T5r; - T6F = T5m - T5r; - } - } - { - E T3K, T5G, T41, T5Q, T3S, T5I, T3X, T5O; - { - E T3H, T3I, T3J, T5F; - T3H = ri[WS(rs, 3)]; - T3I = T3 * T3H; - T3J = ii[WS(rs, 3)]; - T5F = T3 * T3J; - T3K = FMA(T6, T3J, T3I); - T5G = FNMS(T6, T3H, T5F); - } - { - E T3Y, T3Z, T40, T5P; - T3Y = ri[WS(rs, 11)]; - T3Z = Td * T3Y; - T40 = ii[WS(rs, 11)]; - T5P = Td * T40; - T41 = FMA(Th, T40, T3Z); - T5Q = FNMS(Th, T3Y, T5P); - } - { - E T3N, T3O, T3R, T5H; - T3N = ri[WS(rs, 19)]; - T3O = T3M * T3N; - T3R = ii[WS(rs, 19)]; - T5H = T3M * T3R; - T3S = FMA(T3Q, T3R, T3O); - T5I = FNMS(T3Q, T3N, T5H); - } - { - E T3U, T3V, T3W, T5N; - T3U = ri[WS(rs, 27)]; - T3V = Te * T3U; - T3W = ii[WS(rs, 27)]; - T5N = Te * T3W; - T3X = FMA(Ti, T3W, T3V); - T5O = FNMS(Ti, T3U, T5N); - } - { - E T3T, T42, T7M, T7N; - T3T = T3K + T3S; - T42 = T3X + T41; - T43 = T3T + T42; - T7J = T42 - T3T; - T7M = T5G + T5I; - T7N = T5O + T5Q; - T7O = T7M - T7N; - T8o = T7M + T7N; - } - { - E T5J, T5K, T5M, T5R; - T5J = T5G - T5I; - T5K = T3K - T3S; - T5L = T5J - T5K; - T62 = T5K + T5J; - T5M = T3X - T41; - T5R = T5O - T5Q; - T5S = T5M + T5R; - T63 = T5M - T5R; - } - } - { - E T2R, T57, T3a, T5h, T2Z, T59, T36, T5f; - { - E T2O, T2P, T2Q, T56; - T2O = ri[WS(rs, 5)]; - T2P = T29 * T2O; - T2Q = ii[WS(rs, 5)]; - T56 = T29 * T2Q; - T2R = FMA(T2b, T2Q, T2P); - T57 = FNMS(T2b, T2O, T56); - } - { - E T37, T38, T39, T5g; - T37 = ri[WS(rs, 13)]; - T38 = T1R * T37; - T39 = ii[WS(rs, 13)]; - T5g = T1R * T39; - T3a = FMA(T1U, T39, T38); - T5h = FNMS(T1U, T37, T5g); - } - { - E T2U, T2V, T2Y, T58; - T2U = ri[WS(rs, 21)]; - T2V = T2T * T2U; - T2Y = ii[WS(rs, 21)]; - T58 = T2T * T2Y; - T2Z = FMA(T2X, T2Y, T2V); - T59 = FNMS(T2X, T2U, T58); - } - { - E T32, T33, T35, T5e; - T32 = ri[WS(rs, 29)]; - T33 = T31 * T32; - T35 = ii[WS(rs, 29)]; - T5e = T31 * T35; - T36 = FMA(T34, T35, T33); - T5f = FNMS(T34, T32, T5e); - } - { - E T30, T3b, T7B, T7C; - T30 = T2R + T2Z; - T3b = T36 + T3a; - T3c = T30 + T3b; - T7y = T3b - T30; - T7B = T57 + T59; - T7C = T5f + T5h; - T7D = T7B - T7C; - T8j = T7B + T7C; - } - { - E T5a, T5b, T5d, T5i; - T5a = T57 - T59; - T5b = T2R - T2Z; - T5c = T5a - T5b; - T5t = T5b + T5a; - T5d = T36 - T3a; - T5i = T5f - T5h; - T5j = T5d + T5i; - T5u = T5d - T5i; - } - } - { - E T1i, T8c, T8z, T8A, T8J, T8O, T2l, T8N, T45, T8L, T8l, T8t, T8q, T8u, T8f; - E T8B; - { - E TI, T1h, T8x, T8y; - TI = Tq + TH; - T1h = TZ + T1g; - T1i = TI + T1h; - T8c = TI - T1h; - T8x = T8i + T8j; - T8y = T8n + T8o; - T8z = T8x - T8y; - T8A = T8x + T8y; - } - { - E T8C, T8I, T1J, T2k; - T8C = T7f + T7g; - T8I = T8D + T8H; - T8J = T8C + T8I; - T8O = T8I - T8C; - T1J = T1v + T1I; - T2k = T22 + T2j; - T2l = T1J + T2k; - T8N = T2k - T1J; - } - { - E T3d, T44, T8h, T8k; - T3d = T2N + T3c; - T44 = T3G + T43; - T45 = T3d + T44; - T8L = T44 - T3d; - T8h = T2N - T3c; - T8k = T8i - T8j; - T8l = T8h + T8k; - T8t = T8k - T8h; - } - { - E T8m, T8p, T8d, T8e; - T8m = T3G - T43; - T8p = T8n - T8o; - T8q = T8m - T8p; - T8u = T8m + T8p; - T8d = T7j + T7k; - T8e = T7p + T7q; - T8f = T8d - T8e; - T8B = T8d + T8e; - } - { - E T2m, T8K, T8w, T8M; - T2m = T1i + T2l; - ri[WS(rs, 16)] = T2m - T45; - ri[0] = T2m + T45; - T8K = T8B + T8J; - ii[0] = T8A + T8K; - ii[WS(rs, 16)] = T8K - T8A; - T8w = T1i - T2l; - ri[WS(rs, 24)] = T8w - T8z; - ri[WS(rs, 8)] = T8w + T8z; - T8M = T8J - T8B; - ii[WS(rs, 8)] = T8L + T8M; - ii[WS(rs, 24)] = T8M - T8L; - } - { - E T8g, T8r, T8P, T8Q; - T8g = T8c + T8f; - T8r = T8l + T8q; - ri[WS(rs, 20)] = FNMS(KP707106781, T8r, T8g); - ri[WS(rs, 4)] = FMA(KP707106781, T8r, T8g); - T8P = T8N + T8O; - T8Q = T8t + T8u; - ii[WS(rs, 4)] = FMA(KP707106781, T8Q, T8P); - ii[WS(rs, 20)] = FNMS(KP707106781, T8Q, T8P); - } - { - E T8s, T8v, T8R, T8S; - T8s = T8c - T8f; - T8v = T8t - T8u; - ri[WS(rs, 28)] = FNMS(KP707106781, T8v, T8s); - ri[WS(rs, 12)] = FMA(KP707106781, T8v, T8s); - T8R = T8O - T8N; - T8S = T8q - T8l; - ii[WS(rs, 12)] = FMA(KP707106781, T8S, T8R); - ii[WS(rs, 28)] = FNMS(KP707106781, T8S, T8R); - } - } - { - E T7i, T7W, T86, T8a, T8V, T91, T7t, T8W, T7F, T7T, T7Z, T92, T83, T89, T7Q; - E T7U; - { - E T7e, T7h, T84, T85; - T7e = Tq - TH; - T7h = T7f - T7g; - T7i = T7e - T7h; - T7W = T7e + T7h; - T84 = T7L + T7O; - T85 = T7I + T7J; - T86 = FNMS(KP414213562, T85, T84); - T8a = FMA(KP414213562, T84, T85); - } - { - E T8T, T8U, T7n, T7s; - T8T = T1g - TZ; - T8U = T8H - T8D; - T8V = T8T + T8U; - T91 = T8U - T8T; - T7n = T7l - T7m; - T7s = T7o + T7r; - T7t = T7n - T7s; - T8W = T7n + T7s; - } - { - E T7z, T7E, T7X, T7Y; - T7z = T7x - T7y; - T7E = T7A - T7D; - T7F = FMA(KP414213562, T7E, T7z); - T7T = FNMS(KP414213562, T7z, T7E); - T7X = T7m + T7l; - T7Y = T7o - T7r; - T7Z = T7X + T7Y; - T92 = T7Y - T7X; - } - { - E T81, T82, T7K, T7P; - T81 = T7A + T7D; - T82 = T7x + T7y; - T83 = FMA(KP414213562, T82, T81); - T89 = FNMS(KP414213562, T81, T82); - T7K = T7I - T7J; - T7P = T7L - T7O; - T7Q = FNMS(KP414213562, T7P, T7K); - T7U = FMA(KP414213562, T7K, T7P); - } - { - E T7u, T7R, T93, T94; - T7u = FMA(KP707106781, T7t, T7i); - T7R = T7F - T7Q; - ri[WS(rs, 22)] = FNMS(KP923879532, T7R, T7u); - ri[WS(rs, 6)] = FMA(KP923879532, T7R, T7u); - T93 = FMA(KP707106781, T92, T91); - T94 = T7U - T7T; - ii[WS(rs, 6)] = FMA(KP923879532, T94, T93); - ii[WS(rs, 22)] = FNMS(KP923879532, T94, T93); - } - { - E T7S, T7V, T95, T96; - T7S = FNMS(KP707106781, T7t, T7i); - T7V = T7T + T7U; - ri[WS(rs, 14)] = FNMS(KP923879532, T7V, T7S); - ri[WS(rs, 30)] = FMA(KP923879532, T7V, T7S); - T95 = FNMS(KP707106781, T92, T91); - T96 = T7F + T7Q; - ii[WS(rs, 14)] = FNMS(KP923879532, T96, T95); - ii[WS(rs, 30)] = FMA(KP923879532, T96, T95); - } - { - E T80, T87, T8X, T8Y; - T80 = FMA(KP707106781, T7Z, T7W); - T87 = T83 + T86; - ri[WS(rs, 18)] = FNMS(KP923879532, T87, T80); - ri[WS(rs, 2)] = FMA(KP923879532, T87, T80); - T8X = FMA(KP707106781, T8W, T8V); - T8Y = T89 + T8a; - ii[WS(rs, 2)] = FMA(KP923879532, T8Y, T8X); - ii[WS(rs, 18)] = FNMS(KP923879532, T8Y, T8X); - } - { - E T88, T8b, T8Z, T90; - T88 = FNMS(KP707106781, T7Z, T7W); - T8b = T89 - T8a; - ri[WS(rs, 26)] = FNMS(KP923879532, T8b, T88); - ri[WS(rs, 10)] = FMA(KP923879532, T8b, T88); - T8Z = FNMS(KP707106781, T8W, T8V); - T90 = T86 - T83; - ii[WS(rs, 10)] = FMA(KP923879532, T90, T8Z); - ii[WS(rs, 26)] = FNMS(KP923879532, T90, T8Z); - } - } - { - E T4s, T6c, T4X, T9c, T9b, T9h, T6f, T9i, T66, T6q, T6a, T6m, T5x, T6p, T69; - E T6j; - { - E T4c, T4r, T6d, T6e; - T4c = T46 + T4b; - T4r = T4j + T4q; - T4s = FNMS(KP707106781, T4r, T4c); - T6c = FMA(KP707106781, T4r, T4c); - { - E T4H, T4W, T99, T9a; - T4H = FNMS(KP414213562, T4G, T4z); - T4W = FMA(KP414213562, T4V, T4O); - T4X = T4H - T4W; - T9c = T4H + T4W; - T99 = T97 - T98; - T9a = T6t + T6u; - T9b = FMA(KP707106781, T9a, T99); - T9h = FNMS(KP707106781, T9a, T99); - } - T6d = FMA(KP414213562, T4z, T4G); - T6e = FNMS(KP414213562, T4O, T4V); - T6f = T6d + T6e; - T9i = T6e - T6d; - { - E T5U, T6l, T65, T6k, T5T, T64; - T5T = T5L + T5S; - T5U = FNMS(KP707106781, T5T, T5E); - T6l = FMA(KP707106781, T5T, T5E); - T64 = T62 + T63; - T65 = FNMS(KP707106781, T64, T61); - T6k = FMA(KP707106781, T64, T61); - T66 = FNMS(KP668178637, T65, T5U); - T6q = FMA(KP198912367, T6k, T6l); - T6a = FMA(KP668178637, T5U, T65); - T6m = FNMS(KP198912367, T6l, T6k); - } - { - E T5l, T6i, T5w, T6h, T5k, T5v; - T5k = T5c + T5j; - T5l = FNMS(KP707106781, T5k, T55); - T6i = FMA(KP707106781, T5k, T55); - T5v = T5t + T5u; - T5w = FNMS(KP707106781, T5v, T5s); - T6h = FMA(KP707106781, T5v, T5s); - T5x = FMA(KP668178637, T5w, T5l); - T6p = FNMS(KP198912367, T6h, T6i); - T69 = FNMS(KP668178637, T5l, T5w); - T6j = FMA(KP198912367, T6i, T6h); - } - } - { - E T4Y, T67, T9j, T9k; - T4Y = FMA(KP923879532, T4X, T4s); - T67 = T5x - T66; - ri[WS(rs, 21)] = FNMS(KP831469612, T67, T4Y); - ri[WS(rs, 5)] = FMA(KP831469612, T67, T4Y); - T9j = FMA(KP923879532, T9i, T9h); - T9k = T6a - T69; - ii[WS(rs, 5)] = FMA(KP831469612, T9k, T9j); - ii[WS(rs, 21)] = FNMS(KP831469612, T9k, T9j); - } - { - E T68, T6b, T9l, T9m; - T68 = FNMS(KP923879532, T4X, T4s); - T6b = T69 + T6a; - ri[WS(rs, 13)] = FNMS(KP831469612, T6b, T68); - ri[WS(rs, 29)] = FMA(KP831469612, T6b, T68); - T9l = FNMS(KP923879532, T9i, T9h); - T9m = T5x + T66; - ii[WS(rs, 13)] = FNMS(KP831469612, T9m, T9l); - ii[WS(rs, 29)] = FMA(KP831469612, T9m, T9l); - } - { - E T6g, T6n, T9d, T9e; - T6g = FMA(KP923879532, T6f, T6c); - T6n = T6j + T6m; - ri[WS(rs, 17)] = FNMS(KP980785280, T6n, T6g); - ri[WS(rs, 1)] = FMA(KP980785280, T6n, T6g); - T9d = FMA(KP923879532, T9c, T9b); - T9e = T6p + T6q; - ii[WS(rs, 1)] = FMA(KP980785280, T9e, T9d); - ii[WS(rs, 17)] = FNMS(KP980785280, T9e, T9d); - } - { - E T6o, T6r, T9f, T9g; - T6o = FNMS(KP923879532, T6f, T6c); - T6r = T6p - T6q; - ri[WS(rs, 25)] = FNMS(KP980785280, T6r, T6o); - ri[WS(rs, 9)] = FMA(KP980785280, T6r, T6o); - T9f = FNMS(KP923879532, T9c, T9b); - T9g = T6m - T6j; - ii[WS(rs, 9)] = FMA(KP980785280, T9g, T9f); - ii[WS(rs, 25)] = FNMS(KP980785280, T9g, T9f); - } - } - { - E T6w, T6Y, T6D, T9w, T9p, T9v, T71, T9q, T6S, T7c, T6W, T78, T6L, T7b, T6V; - E T75; - { - E T6s, T6v, T6Z, T70; - T6s = T46 - T4b; - T6v = T6t - T6u; - T6w = FMA(KP707106781, T6v, T6s); - T6Y = FNMS(KP707106781, T6v, T6s); - { - E T6z, T6C, T9n, T9o; - T6z = FMA(KP414213562, T6y, T6x); - T6C = FNMS(KP414213562, T6B, T6A); - T6D = T6z - T6C; - T9w = T6z + T6C; - T9n = T98 + T97; - T9o = T4q - T4j; - T9p = FMA(KP707106781, T9o, T9n); - T9v = FNMS(KP707106781, T9o, T9n); - } - T6Z = FNMS(KP414213562, T6x, T6y); - T70 = FMA(KP414213562, T6A, T6B); - T71 = T6Z + T70; - T9q = T70 - T6Z; - { - E T6O, T77, T6R, T76, T6N, T6Q; - T6N = T5S - T5L; - T6O = FNMS(KP707106781, T6N, T6M); - T77 = FMA(KP707106781, T6N, T6M); - T6Q = T62 - T63; - T6R = FNMS(KP707106781, T6Q, T6P); - T76 = FMA(KP707106781, T6Q, T6P); - T6S = FNMS(KP668178637, T6R, T6O); - T7c = FMA(KP198912367, T76, T77); - T6W = FMA(KP668178637, T6O, T6R); - T78 = FNMS(KP198912367, T77, T76); - } - { - E T6H, T74, T6K, T73, T6G, T6J; - T6G = T5j - T5c; - T6H = FNMS(KP707106781, T6G, T6F); - T74 = FMA(KP707106781, T6G, T6F); - T6J = T5t - T5u; - T6K = FNMS(KP707106781, T6J, T6I); - T73 = FMA(KP707106781, T6J, T6I); - T6L = FMA(KP668178637, T6K, T6H); - T7b = FNMS(KP198912367, T73, T74); - T6V = FNMS(KP668178637, T6H, T6K); - T75 = FMA(KP198912367, T74, T73); - } - } - { - E T6E, T6T, T9r, T9s; - T6E = FMA(KP923879532, T6D, T6w); - T6T = T6L + T6S; - ri[WS(rs, 19)] = FNMS(KP831469612, T6T, T6E); - ri[WS(rs, 3)] = FMA(KP831469612, T6T, T6E); - T9r = FMA(KP923879532, T9q, T9p); - T9s = T6V + T6W; - ii[WS(rs, 3)] = FMA(KP831469612, T9s, T9r); - ii[WS(rs, 19)] = FNMS(KP831469612, T9s, T9r); - } - { - E T6U, T6X, T9t, T9u; - T6U = FNMS(KP923879532, T6D, T6w); - T6X = T6V - T6W; - ri[WS(rs, 27)] = FNMS(KP831469612, T6X, T6U); - ri[WS(rs, 11)] = FMA(KP831469612, T6X, T6U); - T9t = FNMS(KP923879532, T9q, T9p); - T9u = T6S - T6L; - ii[WS(rs, 11)] = FMA(KP831469612, T9u, T9t); - ii[WS(rs, 27)] = FNMS(KP831469612, T9u, T9t); - } - { - E T72, T79, T9x, T9y; - T72 = FNMS(KP923879532, T71, T6Y); - T79 = T75 - T78; - ri[WS(rs, 23)] = FNMS(KP980785280, T79, T72); - ri[WS(rs, 7)] = FMA(KP980785280, T79, T72); - T9x = FNMS(KP923879532, T9w, T9v); - T9y = T7c - T7b; - ii[WS(rs, 7)] = FMA(KP980785280, T9y, T9x); - ii[WS(rs, 23)] = FNMS(KP980785280, T9y, T9x); - } - { - E T7a, T7d, T9z, T9A; - T7a = FMA(KP923879532, T71, T6Y); - T7d = T7b + T7c; - ri[WS(rs, 15)] = FNMS(KP980785280, T7d, T7a); - ri[WS(rs, 31)] = FMA(KP980785280, T7d, T7a); - T9z = FMA(KP923879532, T9w, T9v); - T9A = T75 + T78; - ii[WS(rs, 15)] = FNMS(KP980785280, T9A, T9z); - ii[WS(rs, 31)] = FMA(KP980785280, T9A, T9z); - } - } - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_CEXP, 0, 1 }, - { TW_CEXP, 0, 3 }, - { TW_CEXP, 0, 9 }, - { TW_CEXP, 0, 27 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 32, "t2_32", twinstr, &GENUS, { 236, 98, 252, 0 }, 0, 0, 0 }; - -void X(codelet_t2_32) (planner *p) { - X(kdft_dit_register) (p, t2_32, &desc); -} -#else - -/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 32 -name t2_32 -include dft/scalar/t.h */ - -/* - * This function contains 488 FP additions, 280 FP multiplications, - * (or, 376 additions, 168 multiplications, 112 fused multiply/add), - * 158 stack variables, 7 constants, and 128 memory accesses - */ -#include "dft/scalar/t.h" - -static void t2_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - DK(KP195090322, +0.195090322016128267848284868477022240927691618); - DK(KP980785280, +0.980785280403230449126182236134239036973933731); - DK(KP555570233, +0.555570233019602224742830813948532874374937191); - DK(KP831469612, +0.831469612302545237078788377617905756738560812); - DK(KP382683432, +0.382683432365089771728459984030398866761344562); - DK(KP923879532, +0.923879532511286756128183189396788286822416626); - DK(KP707106781, +0.707106781186547524400844362104849039284835938); - { - INT m; - for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) { - E T2, T5, T3, T6, T8, TM, TO, Td, T9, Te, Th, Tl, TD, TH, T1y; - E T1H, T15, T1A, T11, T1F, T1n, T1p, T2q, T2I, T2u, T2K, T2V, T3b, T2Z, T3d; - E Tu, Ty, T3l, T3n, T1t, T1v, T2f, T2h, T1a, T1e, T32, T34, T1W, T1Y, T2C; - E T2E, Tg, TR, Tk, TS, Tm, TV, To, TT, T1M, T21, T1P, T22, T1Q, T25; - E T1S, T23; - { - E Ts, T1d, Tx, T18, Tt, T1c, Tw, T19, TB, T14, TG, TZ, TC, T13, TF; - E T10; - { - E T4, Tc, T7, Tb; - T2 = W[0]; - T5 = W[1]; - T3 = W[2]; - T6 = W[3]; - T4 = T2 * T3; - Tc = T5 * T3; - T7 = T5 * T6; - Tb = T2 * T6; - T8 = T4 + T7; - TM = T4 - T7; - TO = Tb + Tc; - Td = Tb - Tc; - T9 = W[4]; - Ts = T2 * T9; - T1d = T6 * T9; - Tx = T5 * T9; - T18 = T3 * T9; - Te = W[5]; - Tt = T5 * Te; - T1c = T3 * Te; - Tw = T2 * Te; - T19 = T6 * Te; - Th = W[6]; - TB = T3 * Th; - T14 = T5 * Th; - TG = T6 * Th; - TZ = T2 * Th; - Tl = W[7]; - TC = T6 * Tl; - T13 = T2 * Tl; - TF = T3 * Tl; - T10 = T5 * Tl; - } - TD = TB + TC; - TH = TF - TG; - T1y = TZ + T10; - T1H = TF + TG; - T15 = T13 + T14; - T1A = T13 - T14; - T11 = TZ - T10; - T1F = TB - TC; - T1n = FMA(T9, Th, Te * Tl); - T1p = FNMS(Te, Th, T9 * Tl); - { - E T2o, T2p, T2s, T2t; - T2o = T8 * Th; - T2p = Td * Tl; - T2q = T2o + T2p; - T2I = T2o - T2p; - T2s = T8 * Tl; - T2t = Td * Th; - T2u = T2s - T2t; - T2K = T2s + T2t; - } - { - E T2T, T2U, T2X, T2Y; - T2T = TM * Th; - T2U = TO * Tl; - T2V = T2T - T2U; - T3b = T2T + T2U; - T2X = TM * Tl; - T2Y = TO * Th; - T2Z = T2X + T2Y; - T3d = T2X - T2Y; - Tu = Ts + Tt; - Ty = Tw - Tx; - T3l = FMA(Tu, Th, Ty * Tl); - T3n = FNMS(Ty, Th, Tu * Tl); - } - T1t = Ts - Tt; - T1v = Tw + Tx; - T2f = FMA(T1t, Th, T1v * Tl); - T2h = FNMS(T1v, Th, T1t * Tl); - T1a = T18 - T19; - T1e = T1c + T1d; - T32 = FMA(T1a, Th, T1e * Tl); - T34 = FNMS(T1e, Th, T1a * Tl); - T1W = T18 + T19; - T1Y = T1c - T1d; - T2C = FMA(T1W, Th, T1Y * Tl); - T2E = FNMS(T1Y, Th, T1W * Tl); - { - E Ta, Tf, Ti, Tj; - Ta = T8 * T9; - Tf = Td * Te; - Tg = Ta - Tf; - TR = Ta + Tf; - Ti = T8 * Te; - Tj = Td * T9; - Tk = Ti + Tj; - TS = Ti - Tj; - } - Tm = FMA(Tg, Th, Tk * Tl); - TV = FNMS(TS, Th, TR * Tl); - To = FNMS(Tk, Th, Tg * Tl); - TT = FMA(TR, Th, TS * Tl); - { - E T1K, T1L, T1N, T1O; - T1K = TM * T9; - T1L = TO * Te; - T1M = T1K - T1L; - T21 = T1K + T1L; - T1N = TM * Te; - T1O = TO * T9; - T1P = T1N + T1O; - T22 = T1N - T1O; - } - T1Q = FMA(T1M, Th, T1P * Tl); - T25 = FNMS(T22, Th, T21 * Tl); - T1S = FNMS(T1P, Th, T1M * Tl); - T23 = FMA(T21, Th, T22 * Tl); - } - { - E TL, T6f, T8c, T8q, T3F, T5t, T7I, T7W, T2y, T6B, T6y, T7j, T4k, T5J, T4B; - E T5G, T3h, T6H, T6O, T7o, T4L, T5N, T52, T5Q, T1i, T7V, T6i, T7D, T3K, T5u; - E T3P, T5v, T1E, T6n, T6m, T7e, T3W, T5y, T41, T5z, T29, T6p, T6s, T7f, T47; - E T5B, T4c, T5C, T2R, T6z, T6E, T7k, T4v, T5H, T4E, T5K, T3y, T6P, T6K, T7p; - E T4W, T5R, T55, T5O; - { - E T1, T7G, Tq, T7F, TA, T3C, TJ, T3D, Tn, Tp; - T1 = ri[0]; - T7G = ii[0]; - Tn = ri[WS(rs, 16)]; - Tp = ii[WS(rs, 16)]; - Tq = FMA(Tm, Tn, To * Tp); - T7F = FNMS(To, Tn, Tm * Tp); - { - E Tv, Tz, TE, TI; - Tv = ri[WS(rs, 8)]; - Tz = ii[WS(rs, 8)]; - TA = FMA(Tu, Tv, Ty * Tz); - T3C = FNMS(Ty, Tv, Tu * Tz); - TE = ri[WS(rs, 24)]; - TI = ii[WS(rs, 24)]; - TJ = FMA(TD, TE, TH * TI); - T3D = FNMS(TH, TE, TD * TI); - } - { - E Tr, TK, T8a, T8b; - Tr = T1 + Tq; - TK = TA + TJ; - TL = Tr + TK; - T6f = Tr - TK; - T8a = T7G - T7F; - T8b = TA - TJ; - T8c = T8a - T8b; - T8q = T8b + T8a; - } - { - E T3B, T3E, T7E, T7H; - T3B = T1 - Tq; - T3E = T3C - T3D; - T3F = T3B - T3E; - T5t = T3B + T3E; - T7E = T3C + T3D; - T7H = T7F + T7G; - T7I = T7E + T7H; - T7W = T7H - T7E; - } - } - { - E T2e, T4g, T2w, T4z, T2j, T4h, T2n, T4y; - { - E T2c, T2d, T2r, T2v; - T2c = ri[WS(rs, 1)]; - T2d = ii[WS(rs, 1)]; - T2e = FMA(T2, T2c, T5 * T2d); - T4g = FNMS(T5, T2c, T2 * T2d); - T2r = ri[WS(rs, 25)]; - T2v = ii[WS(rs, 25)]; - T2w = FMA(T2q, T2r, T2u * T2v); - T4z = FNMS(T2u, T2r, T2q * T2v); - } - { - E T2g, T2i, T2l, T2m; - T2g = ri[WS(rs, 17)]; - T2i = ii[WS(rs, 17)]; - T2j = FMA(T2f, T2g, T2h * T2i); - T4h = FNMS(T2h, T2g, T2f * T2i); - T2l = ri[WS(rs, 9)]; - T2m = ii[WS(rs, 9)]; - T2n = FMA(T9, T2l, Te * T2m); - T4y = FNMS(Te, T2l, T9 * T2m); - } - { - E T2k, T2x, T6w, T6x; - T2k = T2e + T2j; - T2x = T2n + T2w; - T2y = T2k + T2x; - T6B = T2k - T2x; - T6w = T4g + T4h; - T6x = T4y + T4z; - T6y = T6w - T6x; - T7j = T6w + T6x; - } - { - E T4i, T4j, T4x, T4A; - T4i = T4g - T4h; - T4j = T2n - T2w; - T4k = T4i + T4j; - T5J = T4i - T4j; - T4x = T2e - T2j; - T4A = T4y - T4z; - T4B = T4x - T4A; - T5G = T4x + T4A; - } - } - { - E T31, T4Y, T3f, T4J, T36, T4Z, T3a, T4I; - { - E T2W, T30, T3c, T3e; - T2W = ri[WS(rs, 31)]; - T30 = ii[WS(rs, 31)]; - T31 = FMA(T2V, T2W, T2Z * T30); - T4Y = FNMS(T2Z, T2W, T2V * T30); - T3c = ri[WS(rs, 23)]; - T3e = ii[WS(rs, 23)]; - T3f = FMA(T3b, T3c, T3d * T3e); - T4J = FNMS(T3d, T3c, T3b * T3e); - } - { - E T33, T35, T38, T39; - T33 = ri[WS(rs, 15)]; - T35 = ii[WS(rs, 15)]; - T36 = FMA(T32, T33, T34 * T35); - T4Z = FNMS(T34, T33, T32 * T35); - T38 = ri[WS(rs, 7)]; - T39 = ii[WS(rs, 7)]; - T3a = FMA(TR, T38, TS * T39); - T4I = FNMS(TS, T38, TR * T39); - } - { - E T37, T3g, T6M, T6N; - T37 = T31 + T36; - T3g = T3a + T3f; - T3h = T37 + T3g; - T6H = T37 - T3g; - T6M = T4Y + T4Z; - T6N = T4I + T4J; - T6O = T6M - T6N; - T7o = T6M + T6N; - } - { - E T4H, T4K, T50, T51; - T4H = T31 - T36; - T4K = T4I - T4J; - T4L = T4H - T4K; - T5N = T4H + T4K; - T50 = T4Y - T4Z; - T51 = T3a - T3f; - T52 = T50 + T51; - T5Q = T50 - T51; - } - } - { - E TQ, T3G, T1g, T3N, TX, T3H, T17, T3M; - { - E TN, TP, T1b, T1f; - TN = ri[WS(rs, 4)]; - TP = ii[WS(rs, 4)]; - TQ = FMA(TM, TN, TO * TP); - T3G = FNMS(TO, TN, TM * TP); - T1b = ri[WS(rs, 12)]; - T1f = ii[WS(rs, 12)]; - T1g = FMA(T1a, T1b, T1e * T1f); - T3N = FNMS(T1e, T1b, T1a * T1f); - } - { - E TU, TW, T12, T16; - TU = ri[WS(rs, 20)]; - TW = ii[WS(rs, 20)]; - TX = FMA(TT, TU, TV * TW); - T3H = FNMS(TV, TU, TT * TW); - T12 = ri[WS(rs, 28)]; - T16 = ii[WS(rs, 28)]; - T17 = FMA(T11, T12, T15 * T16); - T3M = FNMS(T15, T12, T11 * T16); - } - { - E TY, T1h, T6g, T6h; - TY = TQ + TX; - T1h = T17 + T1g; - T1i = TY + T1h; - T7V = T1h - TY; - T6g = T3G + T3H; - T6h = T3M + T3N; - T6i = T6g - T6h; - T7D = T6g + T6h; - } - { - E T3I, T3J, T3L, T3O; - T3I = T3G - T3H; - T3J = TQ - TX; - T3K = T3I - T3J; - T5u = T3J + T3I; - T3L = T17 - T1g; - T3O = T3M - T3N; - T3P = T3L + T3O; - T5v = T3L - T3O; - } - } - { - E T1m, T3S, T1C, T3Z, T1r, T3T, T1x, T3Y; - { - E T1k, T1l, T1z, T1B; - T1k = ri[WS(rs, 2)]; - T1l = ii[WS(rs, 2)]; - T1m = FMA(T8, T1k, Td * T1l); - T3S = FNMS(Td, T1k, T8 * T1l); - T1z = ri[WS(rs, 26)]; - T1B = ii[WS(rs, 26)]; - T1C = FMA(T1y, T1z, T1A * T1B); - T3Z = FNMS(T1A, T1z, T1y * T1B); - } - { - E T1o, T1q, T1u, T1w; - T1o = ri[WS(rs, 18)]; - T1q = ii[WS(rs, 18)]; - T1r = FMA(T1n, T1o, T1p * T1q); - T3T = FNMS(T1p, T1o, T1n * T1q); - T1u = ri[WS(rs, 10)]; - T1w = ii[WS(rs, 10)]; - T1x = FMA(T1t, T1u, T1v * T1w); - T3Y = FNMS(T1v, T1u, T1t * T1w); - } - { - E T1s, T1D, T6k, T6l; - T1s = T1m + T1r; - T1D = T1x + T1C; - T1E = T1s + T1D; - T6n = T1s - T1D; - T6k = T3S + T3T; - T6l = T3Y + T3Z; - T6m = T6k - T6l; - T7e = T6k + T6l; - } - { - E T3U, T3V, T3X, T40; - T3U = T3S - T3T; - T3V = T1x - T1C; - T3W = T3U + T3V; - T5y = T3U - T3V; - T3X = T1m - T1r; - T40 = T3Y - T3Z; - T41 = T3X - T40; - T5z = T3X + T40; - } - } - { - E T1J, T43, T27, T4a, T1U, T44, T20, T49; - { - E T1G, T1I, T24, T26; - T1G = ri[WS(rs, 30)]; - T1I = ii[WS(rs, 30)]; - T1J = FMA(T1F, T1G, T1H * T1I); - T43 = FNMS(T1H, T1G, T1F * T1I); - T24 = ri[WS(rs, 22)]; - T26 = ii[WS(rs, 22)]; - T27 = FMA(T23, T24, T25 * T26); - T4a = FNMS(T25, T24, T23 * T26); - } - { - E T1R, T1T, T1X, T1Z; - T1R = ri[WS(rs, 14)]; - T1T = ii[WS(rs, 14)]; - T1U = FMA(T1Q, T1R, T1S * T1T); - T44 = FNMS(T1S, T1R, T1Q * T1T); - T1X = ri[WS(rs, 6)]; - T1Z = ii[WS(rs, 6)]; - T20 = FMA(T1W, T1X, T1Y * T1Z); - T49 = FNMS(T1Y, T1X, T1W * T1Z); - } - { - E T1V, T28, T6q, T6r; - T1V = T1J + T1U; - T28 = T20 + T27; - T29 = T1V + T28; - T6p = T1V - T28; - T6q = T43 + T44; - T6r = T49 + T4a; - T6s = T6q - T6r; - T7f = T6q + T6r; - } - { - E T45, T46, T48, T4b; - T45 = T43 - T44; - T46 = T20 - T27; - T47 = T45 + T46; - T5B = T45 - T46; - T48 = T1J - T1U; - T4b = T49 - T4a; - T4c = T48 - T4b; - T5C = T48 + T4b; - } - } - { - E T2B, T4r, T2G, T4s, T4q, T4t, T2M, T4m, T2P, T4n, T4l, T4o; - { - E T2z, T2A, T2D, T2F; - T2z = ri[WS(rs, 5)]; - T2A = ii[WS(rs, 5)]; - T2B = FMA(T21, T2z, T22 * T2A); - T4r = FNMS(T22, T2z, T21 * T2A); - T2D = ri[WS(rs, 21)]; - T2F = ii[WS(rs, 21)]; - T2G = FMA(T2C, T2D, T2E * T2F); - T4s = FNMS(T2E, T2D, T2C * T2F); - } - T4q = T2B - T2G; - T4t = T4r - T4s; - { - E T2J, T2L, T2N, T2O; - T2J = ri[WS(rs, 29)]; - T2L = ii[WS(rs, 29)]; - T2M = FMA(T2I, T2J, T2K * T2L); - T4m = FNMS(T2K, T2J, T2I * T2L); - T2N = ri[WS(rs, 13)]; - T2O = ii[WS(rs, 13)]; - T2P = FMA(T1M, T2N, T1P * T2O); - T4n = FNMS(T1P, T2N, T1M * T2O); - } - T4l = T2M - T2P; - T4o = T4m - T4n; - { - E T2H, T2Q, T6C, T6D; - T2H = T2B + T2G; - T2Q = T2M + T2P; - T2R = T2H + T2Q; - T6z = T2Q - T2H; - T6C = T4r + T4s; - T6D = T4m + T4n; - T6E = T6C - T6D; - T7k = T6C + T6D; - } - { - E T4p, T4u, T4C, T4D; - T4p = T4l - T4o; - T4u = T4q + T4t; - T4v = KP707106781 * (T4p - T4u); - T5H = KP707106781 * (T4u + T4p); - T4C = T4t - T4q; - T4D = T4l + T4o; - T4E = KP707106781 * (T4C - T4D); - T5K = KP707106781 * (T4C + T4D); - } - } - { - E T3k, T4M, T3p, T4N, T4O, T4P, T3t, T4S, T3w, T4T, T4R, T4U; - { - E T3i, T3j, T3m, T3o; - T3i = ri[WS(rs, 3)]; - T3j = ii[WS(rs, 3)]; - T3k = FMA(T3, T3i, T6 * T3j); - T4M = FNMS(T6, T3i, T3 * T3j); - T3m = ri[WS(rs, 19)]; - T3o = ii[WS(rs, 19)]; - T3p = FMA(T3l, T3m, T3n * T3o); - T4N = FNMS(T3n, T3m, T3l * T3o); - } - T4O = T4M - T4N; - T4P = T3k - T3p; - { - E T3r, T3s, T3u, T3v; - T3r = ri[WS(rs, 27)]; - T3s = ii[WS(rs, 27)]; - T3t = FMA(Th, T3r, Tl * T3s); - T4S = FNMS(Tl, T3r, Th * T3s); - T3u = ri[WS(rs, 11)]; - T3v = ii[WS(rs, 11)]; - T3w = FMA(Tg, T3u, Tk * T3v); - T4T = FNMS(Tk, T3u, Tg * T3v); - } - T4R = T3t - T3w; - T4U = T4S - T4T; - { - E T3q, T3x, T6I, T6J; - T3q = T3k + T3p; - T3x = T3t + T3w; - T3y = T3q + T3x; - T6P = T3x - T3q; - T6I = T4M + T4N; - T6J = T4S + T4T; - T6K = T6I - T6J; - T7p = T6I + T6J; - } - { - E T4Q, T4V, T53, T54; - T4Q = T4O - T4P; - T4V = T4R + T4U; - T4W = KP707106781 * (T4Q - T4V); - T5R = KP707106781 * (T4Q + T4V); - T53 = T4R - T4U; - T54 = T4P + T4O; - T55 = KP707106781 * (T53 - T54); - T5O = KP707106781 * (T54 + T53); - } - } - { - E T2b, T7x, T7K, T7M, T3A, T7L, T7A, T7B; - { - E T1j, T2a, T7C, T7J; - T1j = TL + T1i; - T2a = T1E + T29; - T2b = T1j + T2a; - T7x = T1j - T2a; - T7C = T7e + T7f; - T7J = T7D + T7I; - T7K = T7C + T7J; - T7M = T7J - T7C; - } - { - E T2S, T3z, T7y, T7z; - T2S = T2y + T2R; - T3z = T3h + T3y; - T3A = T2S + T3z; - T7L = T3z - T2S; - T7y = T7j + T7k; - T7z = T7o + T7p; - T7A = T7y - T7z; - T7B = T7y + T7z; - } - ri[WS(rs, 16)] = T2b - T3A; - ii[WS(rs, 16)] = T7K - T7B; - ri[0] = T2b + T3A; - ii[0] = T7B + T7K; - ri[WS(rs, 24)] = T7x - T7A; - ii[WS(rs, 24)] = T7M - T7L; - ri[WS(rs, 8)] = T7x + T7A; - ii[WS(rs, 8)] = T7L + T7M; - } - { - E T7h, T7t, T7Q, T7S, T7m, T7u, T7r, T7v; - { - E T7d, T7g, T7O, T7P; - T7d = TL - T1i; - T7g = T7e - T7f; - T7h = T7d + T7g; - T7t = T7d - T7g; - T7O = T29 - T1E; - T7P = T7I - T7D; - T7Q = T7O + T7P; - T7S = T7P - T7O; - } - { - E T7i, T7l, T7n, T7q; - T7i = T2y - T2R; - T7l = T7j - T7k; - T7m = T7i + T7l; - T7u = T7l - T7i; - T7n = T3h - T3y; - T7q = T7o - T7p; - T7r = T7n - T7q; - T7v = T7n + T7q; - } - { - E T7s, T7N, T7w, T7R; - T7s = KP707106781 * (T7m + T7r); - ri[WS(rs, 20)] = T7h - T7s; - ri[WS(rs, 4)] = T7h + T7s; - T7N = KP707106781 * (T7u + T7v); - ii[WS(rs, 4)] = T7N + T7Q; - ii[WS(rs, 20)] = T7Q - T7N; - T7w = KP707106781 * (T7u - T7v); - ri[WS(rs, 28)] = T7t - T7w; - ri[WS(rs, 12)] = T7t + T7w; - T7R = KP707106781 * (T7r - T7m); - ii[WS(rs, 12)] = T7R + T7S; - ii[WS(rs, 28)] = T7S - T7R; - } - } - { - E T6j, T7X, T83, T6X, T6u, T7U, T77, T7b, T70, T82, T6G, T6U, T74, T7a, T6R; - E T6V; - { - E T6o, T6t, T6A, T6F; - T6j = T6f - T6i; - T7X = T7V + T7W; - T83 = T7W - T7V; - T6X = T6f + T6i; - T6o = T6m - T6n; - T6t = T6p + T6s; - T6u = KP707106781 * (T6o - T6t); - T7U = KP707106781 * (T6o + T6t); - { - E T75, T76, T6Y, T6Z; - T75 = T6H + T6K; - T76 = T6O + T6P; - T77 = FNMS(KP382683432, T76, KP923879532 * T75); - T7b = FMA(KP923879532, T76, KP382683432 * T75); - T6Y = T6n + T6m; - T6Z = T6p - T6s; - T70 = KP707106781 * (T6Y + T6Z); - T82 = KP707106781 * (T6Z - T6Y); - } - T6A = T6y - T6z; - T6F = T6B - T6E; - T6G = FMA(KP923879532, T6A, KP382683432 * T6F); - T6U = FNMS(KP923879532, T6F, KP382683432 * T6A); - { - E T72, T73, T6L, T6Q; - T72 = T6y + T6z; - T73 = T6B + T6E; - T74 = FMA(KP382683432, T72, KP923879532 * T73); - T7a = FNMS(KP382683432, T73, KP923879532 * T72); - T6L = T6H - T6K; - T6Q = T6O - T6P; - T6R = FNMS(KP923879532, T6Q, KP382683432 * T6L); - T6V = FMA(KP382683432, T6Q, KP923879532 * T6L); - } - } - { - E T6v, T6S, T81, T84; - T6v = T6j + T6u; - T6S = T6G + T6R; - ri[WS(rs, 22)] = T6v - T6S; - ri[WS(rs, 6)] = T6v + T6S; - T81 = T6U + T6V; - T84 = T82 + T83; - ii[WS(rs, 6)] = T81 + T84; - ii[WS(rs, 22)] = T84 - T81; - } - { - E T6T, T6W, T85, T86; - T6T = T6j - T6u; - T6W = T6U - T6V; - ri[WS(rs, 30)] = T6T - T6W; - ri[WS(rs, 14)] = T6T + T6W; - T85 = T6R - T6G; - T86 = T83 - T82; - ii[WS(rs, 14)] = T85 + T86; - ii[WS(rs, 30)] = T86 - T85; - } - { - E T71, T78, T7T, T7Y; - T71 = T6X + T70; - T78 = T74 + T77; - ri[WS(rs, 18)] = T71 - T78; - ri[WS(rs, 2)] = T71 + T78; - T7T = T7a + T7b; - T7Y = T7U + T7X; - ii[WS(rs, 2)] = T7T + T7Y; - ii[WS(rs, 18)] = T7Y - T7T; - } - { - E T79, T7c, T7Z, T80; - T79 = T6X - T70; - T7c = T7a - T7b; - ri[WS(rs, 26)] = T79 - T7c; - ri[WS(rs, 10)] = T79 + T7c; - T7Z = T77 - T74; - T80 = T7X - T7U; - ii[WS(rs, 10)] = T7Z + T80; - ii[WS(rs, 26)] = T80 - T7Z; - } - } - { - E T3R, T5d, T8r, T8x, T4e, T8o, T5n, T5r, T4G, T5a, T5g, T8w, T5k, T5q, T57; - E T5b, T3Q, T8p; - T3Q = KP707106781 * (T3K - T3P); - T3R = T3F - T3Q; - T5d = T3F + T3Q; - T8p = KP707106781 * (T5v - T5u); - T8r = T8p + T8q; - T8x = T8q - T8p; - { - E T42, T4d, T5l, T5m; - T42 = FNMS(KP923879532, T41, KP382683432 * T3W); - T4d = FMA(KP382683432, T47, KP923879532 * T4c); - T4e = T42 - T4d; - T8o = T42 + T4d; - T5l = T4L + T4W; - T5m = T52 + T55; - T5n = FNMS(KP555570233, T5m, KP831469612 * T5l); - T5r = FMA(KP831469612, T5m, KP555570233 * T5l); - } - { - E T4w, T4F, T5e, T5f; - T4w = T4k - T4v; - T4F = T4B - T4E; - T4G = FMA(KP980785280, T4w, KP195090322 * T4F); - T5a = FNMS(KP980785280, T4F, KP195090322 * T4w); - T5e = FMA(KP923879532, T3W, KP382683432 * T41); - T5f = FNMS(KP923879532, T47, KP382683432 * T4c); - T5g = T5e + T5f; - T8w = T5f - T5e; - } - { - E T5i, T5j, T4X, T56; - T5i = T4k + T4v; - T5j = T4B + T4E; - T5k = FMA(KP555570233, T5i, KP831469612 * T5j); - T5q = FNMS(KP555570233, T5j, KP831469612 * T5i); - T4X = T4L - T4W; - T56 = T52 - T55; - T57 = FNMS(KP980785280, T56, KP195090322 * T4X); - T5b = FMA(KP195090322, T56, KP980785280 * T4X); - } - { - E T4f, T58, T8v, T8y; - T4f = T3R + T4e; - T58 = T4G + T57; - ri[WS(rs, 23)] = T4f - T58; - ri[WS(rs, 7)] = T4f + T58; - T8v = T5a + T5b; - T8y = T8w + T8x; - ii[WS(rs, 7)] = T8v + T8y; - ii[WS(rs, 23)] = T8y - T8v; - } - { - E T59, T5c, T8z, T8A; - T59 = T3R - T4e; - T5c = T5a - T5b; - ri[WS(rs, 31)] = T59 - T5c; - ri[WS(rs, 15)] = T59 + T5c; - T8z = T57 - T4G; - T8A = T8x - T8w; - ii[WS(rs, 15)] = T8z + T8A; - ii[WS(rs, 31)] = T8A - T8z; - } - { - E T5h, T5o, T8n, T8s; - T5h = T5d + T5g; - T5o = T5k + T5n; - ri[WS(rs, 19)] = T5h - T5o; - ri[WS(rs, 3)] = T5h + T5o; - T8n = T5q + T5r; - T8s = T8o + T8r; - ii[WS(rs, 3)] = T8n + T8s; - ii[WS(rs, 19)] = T8s - T8n; - } - { - E T5p, T5s, T8t, T8u; - T5p = T5d - T5g; - T5s = T5q - T5r; - ri[WS(rs, 27)] = T5p - T5s; - ri[WS(rs, 11)] = T5p + T5s; - T8t = T5n - T5k; - T8u = T8r - T8o; - ii[WS(rs, 11)] = T8t + T8u; - ii[WS(rs, 27)] = T8u - T8t; - } - } - { - E T5x, T5Z, T8d, T8j, T5E, T88, T69, T6d, T5M, T5W, T62, T8i, T66, T6c, T5T; - E T5X, T5w, T89; - T5w = KP707106781 * (T5u + T5v); - T5x = T5t - T5w; - T5Z = T5t + T5w; - T89 = KP707106781 * (T3K + T3P); - T8d = T89 + T8c; - T8j = T8c - T89; - { - E T5A, T5D, T67, T68; - T5A = FNMS(KP382683432, T5z, KP923879532 * T5y); - T5D = FMA(KP923879532, T5B, KP382683432 * T5C); - T5E = T5A - T5D; - T88 = T5A + T5D; - T67 = T5N + T5O; - T68 = T5Q + T5R; - T69 = FNMS(KP195090322, T68, KP980785280 * T67); - T6d = FMA(KP195090322, T67, KP980785280 * T68); - } - { - E T5I, T5L, T60, T61; - T5I = T5G - T5H; - T5L = T5J - T5K; - T5M = FMA(KP555570233, T5I, KP831469612 * T5L); - T5W = FNMS(KP831469612, T5I, KP555570233 * T5L); - T60 = FMA(KP382683432, T5y, KP923879532 * T5z); - T61 = FNMS(KP382683432, T5B, KP923879532 * T5C); - T62 = T60 + T61; - T8i = T61 - T60; - } - { - E T64, T65, T5P, T5S; - T64 = T5G + T5H; - T65 = T5J + T5K; - T66 = FMA(KP980785280, T64, KP195090322 * T65); - T6c = FNMS(KP195090322, T64, KP980785280 * T65); - T5P = T5N - T5O; - T5S = T5Q - T5R; - T5T = FNMS(KP831469612, T5S, KP555570233 * T5P); - T5X = FMA(KP831469612, T5P, KP555570233 * T5S); - } - { - E T5F, T5U, T8h, T8k; - T5F = T5x + T5E; - T5U = T5M + T5T; - ri[WS(rs, 21)] = T5F - T5U; - ri[WS(rs, 5)] = T5F + T5U; - T8h = T5W + T5X; - T8k = T8i + T8j; - ii[WS(rs, 5)] = T8h + T8k; - ii[WS(rs, 21)] = T8k - T8h; - } - { - E T5V, T5Y, T8l, T8m; - T5V = T5x - T5E; - T5Y = T5W - T5X; - ri[WS(rs, 29)] = T5V - T5Y; - ri[WS(rs, 13)] = T5V + T5Y; - T8l = T5T - T5M; - T8m = T8j - T8i; - ii[WS(rs, 13)] = T8l + T8m; - ii[WS(rs, 29)] = T8m - T8l; - } - { - E T63, T6a, T87, T8e; - T63 = T5Z + T62; - T6a = T66 + T69; - ri[WS(rs, 17)] = T63 - T6a; - ri[WS(rs, 1)] = T63 + T6a; - T87 = T6c + T6d; - T8e = T88 + T8d; - ii[WS(rs, 1)] = T87 + T8e; - ii[WS(rs, 17)] = T8e - T87; - } - { - E T6b, T6e, T8f, T8g; - T6b = T5Z - T62; - T6e = T6c - T6d; - ri[WS(rs, 25)] = T6b - T6e; - ri[WS(rs, 9)] = T6b + T6e; - T8f = T69 - T66; - T8g = T8d - T88; - ii[WS(rs, 9)] = T8f + T8g; - ii[WS(rs, 25)] = T8g - T8f; - } - } - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_CEXP, 0, 1 }, - { TW_CEXP, 0, 3 }, - { TW_CEXP, 0, 9 }, - { TW_CEXP, 0, 27 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 32, "t2_32", twinstr, &GENUS, { 376, 168, 112, 0 }, 0, 0, 0 }; - -void X(codelet_t2_32) (planner *p) { - X(kdft_dit_register) (p, t2_32, &desc); -} -#endif diff --git a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t2_4.c b/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t2_4.c deleted file mode 100644 index dde37585..00000000 --- a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t2_4.c +++ /dev/null @@ -1,200 +0,0 @@ -/* - * Copyright (c) 2003, 2007-14 Matteo Frigo - * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - */ - -/* This file was automatically generated --- DO NOT EDIT */ -/* Generated on Tue Sep 14 10:44:32 EDT 2021 */ - -#include "dft/codelet-dft.h" - -#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA) - -/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 4 -name t2_4 -include dft/scalar/t.h */ - -/* - * This function contains 24 FP additions, 16 FP multiplications, - * (or, 16 additions, 8 multiplications, 8 fused multiply/add), - * 21 stack variables, 0 constants, and 16 memory accesses - */ -#include "dft/scalar/t.h" - -static void t2_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - { - INT m; - for (m = mb, W = W + (mb * 4); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 4, MAKE_VOLATILE_STRIDE(8, rs)) { - E T2, T6, T3, T5, T7, Tb, T4, Ta; - T2 = W[0]; - T6 = W[3]; - T3 = W[2]; - T4 = T2 * T3; - Ta = T2 * T6; - T5 = W[1]; - T7 = FMA(T5, T6, T4); - Tb = FNMS(T5, T3, Ta); - { - E T1, Tx, Td, Tw, Ti, Tq, Tm, Ts; - T1 = ri[0]; - Tx = ii[0]; - { - E T8, T9, Tc, Tv; - T8 = ri[WS(rs, 2)]; - T9 = T7 * T8; - Tc = ii[WS(rs, 2)]; - Tv = T7 * Tc; - Td = FMA(Tb, Tc, T9); - Tw = FNMS(Tb, T8, Tv); - } - { - E Tf, Tg, Th, Tp; - Tf = ri[WS(rs, 1)]; - Tg = T2 * Tf; - Th = ii[WS(rs, 1)]; - Tp = T2 * Th; - Ti = FMA(T5, Th, Tg); - Tq = FNMS(T5, Tf, Tp); - } - { - E Tj, Tk, Tl, Tr; - Tj = ri[WS(rs, 3)]; - Tk = T3 * Tj; - Tl = ii[WS(rs, 3)]; - Tr = T3 * Tl; - Tm = FMA(T6, Tl, Tk); - Ts = FNMS(T6, Tj, Tr); - } - { - E Te, Tn, Tu, Ty; - Te = T1 + Td; - Tn = Ti + Tm; - ri[WS(rs, 2)] = Te - Tn; - ri[0] = Te + Tn; - Tu = Tq + Ts; - Ty = Tw + Tx; - ii[0] = Tu + Ty; - ii[WS(rs, 2)] = Ty - Tu; - } - { - E To, Tt, Tz, TA; - To = T1 - Td; - Tt = Tq - Ts; - ri[WS(rs, 3)] = To - Tt; - ri[WS(rs, 1)] = To + Tt; - Tz = Tx - Tw; - TA = Ti - Tm; - ii[WS(rs, 1)] = Tz - TA; - ii[WS(rs, 3)] = TA + Tz; - } - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_CEXP, 0, 1 }, - { TW_CEXP, 0, 3 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 4, "t2_4", twinstr, &GENUS, { 16, 8, 8, 0 }, 0, 0, 0 }; - -void X(codelet_t2_4) (planner *p) { - X(kdft_dit_register) (p, t2_4, &desc); -} -#else - -/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 4 -name t2_4 -include dft/scalar/t.h */ - -/* - * This function contains 24 FP additions, 16 FP multiplications, - * (or, 16 additions, 8 multiplications, 8 fused multiply/add), - * 21 stack variables, 0 constants, and 16 memory accesses - */ -#include "dft/scalar/t.h" - -static void t2_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - { - INT m; - for (m = mb, W = W + (mb * 4); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 4, MAKE_VOLATILE_STRIDE(8, rs)) { - E T2, T4, T3, T5, T6, T8; - T2 = W[0]; - T4 = W[1]; - T3 = W[2]; - T5 = W[3]; - T6 = FMA(T2, T3, T4 * T5); - T8 = FNMS(T4, T3, T2 * T5); - { - E T1, Tp, Ta, To, Te, Tk, Th, Tl, T7, T9; - T1 = ri[0]; - Tp = ii[0]; - T7 = ri[WS(rs, 2)]; - T9 = ii[WS(rs, 2)]; - Ta = FMA(T6, T7, T8 * T9); - To = FNMS(T8, T7, T6 * T9); - { - E Tc, Td, Tf, Tg; - Tc = ri[WS(rs, 1)]; - Td = ii[WS(rs, 1)]; - Te = FMA(T2, Tc, T4 * Td); - Tk = FNMS(T4, Tc, T2 * Td); - Tf = ri[WS(rs, 3)]; - Tg = ii[WS(rs, 3)]; - Th = FMA(T3, Tf, T5 * Tg); - Tl = FNMS(T5, Tf, T3 * Tg); - } - { - E Tb, Ti, Tn, Tq; - Tb = T1 + Ta; - Ti = Te + Th; - ri[WS(rs, 2)] = Tb - Ti; - ri[0] = Tb + Ti; - Tn = Tk + Tl; - Tq = To + Tp; - ii[0] = Tn + Tq; - ii[WS(rs, 2)] = Tq - Tn; - } - { - E Tj, Tm, Tr, Ts; - Tj = T1 - Ta; - Tm = Tk - Tl; - ri[WS(rs, 3)] = Tj - Tm; - ri[WS(rs, 1)] = Tj + Tm; - Tr = Tp - To; - Ts = Te - Th; - ii[WS(rs, 1)] = Tr - Ts; - ii[WS(rs, 3)] = Ts + Tr; - } - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_CEXP, 0, 1 }, - { TW_CEXP, 0, 3 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 4, "t2_4", twinstr, &GENUS, { 16, 8, 8, 0 }, 0, 0, 0 }; - -void X(codelet_t2_4) (planner *p) { - X(kdft_dit_register) (p, t2_4, &desc); -} -#endif diff --git a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t2_5.c b/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t2_5.c deleted file mode 100644 index c015427d..00000000 --- a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t2_5.c +++ /dev/null @@ -1,264 +0,0 @@ -/* - * Copyright (c) 2003, 2007-14 Matteo Frigo - * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - */ - -/* This file was automatically generated --- DO NOT EDIT */ -/* Generated on Tue Sep 14 10:44:37 EDT 2021 */ - -#include "dft/codelet-dft.h" - -#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA) - -/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 5 -name t2_5 -include dft/scalar/t.h */ - -/* - * This function contains 44 FP additions, 40 FP multiplications, - * (or, 14 additions, 10 multiplications, 30 fused multiply/add), - * 38 stack variables, 4 constants, and 20 memory accesses - */ -#include "dft/scalar/t.h" - -static void t2_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - DK(KP951056516, +0.951056516295153572116439333379382143405698634); - DK(KP559016994, +0.559016994374947424102293417182819058860154590); - DK(KP618033988, +0.618033988749894848204586834365638117720309180); - DK(KP250000000, +0.250000000000000000000000000000000000000000000); - { - INT m; - for (m = mb, W = W + (mb * 4); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 4, MAKE_VOLATILE_STRIDE(10, rs)) { - E T2, Ta, T8, T5, Tb, Tm, Tf, Tj, T9, Te; - T2 = W[0]; - Ta = W[3]; - T8 = W[2]; - T9 = T2 * T8; - Te = T2 * Ta; - T5 = W[1]; - Tb = FNMS(T5, Ta, T9); - Tm = FNMS(T5, T8, Te); - Tf = FMA(T5, T8, Te); - Tj = FMA(T5, Ta, T9); - { - E T1, TO, T7, Th, Ti, Tz, TB, TL, To, Ts, Tt, TE, TG, TM; - T1 = ri[0]; - TO = ii[0]; - { - E T3, T4, T6, Ty, Tc, Td, Tg, TA; - T3 = ri[WS(rs, 1)]; - T4 = T2 * T3; - T6 = ii[WS(rs, 1)]; - Ty = T2 * T6; - Tc = ri[WS(rs, 4)]; - Td = Tb * Tc; - Tg = ii[WS(rs, 4)]; - TA = Tb * Tg; - T7 = FMA(T5, T6, T4); - Th = FMA(Tf, Tg, Td); - Ti = T7 + Th; - Tz = FNMS(T5, T3, Ty); - TB = FNMS(Tf, Tc, TA); - TL = Tz + TB; - } - { - E Tk, Tl, Tn, TD, Tp, Tq, Tr, TF; - Tk = ri[WS(rs, 2)]; - Tl = Tj * Tk; - Tn = ii[WS(rs, 2)]; - TD = Tj * Tn; - Tp = ri[WS(rs, 3)]; - Tq = T8 * Tp; - Tr = ii[WS(rs, 3)]; - TF = T8 * Tr; - To = FMA(Tm, Tn, Tl); - Ts = FMA(Ta, Tr, Tq); - Tt = To + Ts; - TE = FNMS(Tm, Tk, TD); - TG = FNMS(Ta, Tp, TF); - TM = TE + TG; - } - { - E Tw, Tu, Tv, TI, TK, TC, TH, TJ, Tx; - Tw = Ti - Tt; - Tu = Ti + Tt; - Tv = FNMS(KP250000000, Tu, T1); - TC = Tz - TB; - TH = TE - TG; - TI = FMA(KP618033988, TH, TC); - TK = FNMS(KP618033988, TC, TH); - ri[0] = T1 + Tu; - TJ = FNMS(KP559016994, Tw, Tv); - ri[WS(rs, 2)] = FNMS(KP951056516, TK, TJ); - ri[WS(rs, 3)] = FMA(KP951056516, TK, TJ); - Tx = FMA(KP559016994, Tw, Tv); - ri[WS(rs, 4)] = FNMS(KP951056516, TI, Tx); - ri[WS(rs, 1)] = FMA(KP951056516, TI, Tx); - } - { - E TQ, TN, TP, TU, TW, TS, TT, TV, TR; - TQ = TL - TM; - TN = TL + TM; - TP = FNMS(KP250000000, TN, TO); - TS = T7 - Th; - TT = To - Ts; - TU = FMA(KP618033988, TT, TS); - TW = FNMS(KP618033988, TS, TT); - ii[0] = TN + TO; - TV = FNMS(KP559016994, TQ, TP); - ii[WS(rs, 2)] = FMA(KP951056516, TW, TV); - ii[WS(rs, 3)] = FNMS(KP951056516, TW, TV); - TR = FMA(KP559016994, TQ, TP); - ii[WS(rs, 1)] = FNMS(KP951056516, TU, TR); - ii[WS(rs, 4)] = FMA(KP951056516, TU, TR); - } - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_CEXP, 0, 1 }, - { TW_CEXP, 0, 3 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 5, "t2_5", twinstr, &GENUS, { 14, 10, 30, 0 }, 0, 0, 0 }; - -void X(codelet_t2_5) (planner *p) { - X(kdft_dit_register) (p, t2_5, &desc); -} -#else - -/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 5 -name t2_5 -include dft/scalar/t.h */ - -/* - * This function contains 44 FP additions, 32 FP multiplications, - * (or, 30 additions, 18 multiplications, 14 fused multiply/add), - * 37 stack variables, 4 constants, and 20 memory accesses - */ -#include "dft/scalar/t.h" - -static void t2_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - DK(KP250000000, +0.250000000000000000000000000000000000000000000); - DK(KP559016994, +0.559016994374947424102293417182819058860154590); - DK(KP587785252, +0.587785252292473129168705954639072768597652438); - DK(KP951056516, +0.951056516295153572116439333379382143405698634); - { - INT m; - for (m = mb, W = W + (mb * 4); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 4, MAKE_VOLATILE_STRIDE(10, rs)) { - E T2, T4, T7, T9, Tb, Tl, Tf, Tj; - { - E T8, Te, Ta, Td; - T2 = W[0]; - T4 = W[1]; - T7 = W[2]; - T9 = W[3]; - T8 = T2 * T7; - Te = T4 * T7; - Ta = T4 * T9; - Td = T2 * T9; - Tb = T8 - Ta; - Tl = Td - Te; - Tf = Td + Te; - Tj = T8 + Ta; - } - { - E T1, TI, Ty, TB, TN, TM, TF, TG, TH, Ti, Tr, Ts; - T1 = ri[0]; - TI = ii[0]; - { - E T6, Tw, Tq, TA, Th, Tx, Tn, Tz; - { - E T3, T5, To, Tp; - T3 = ri[WS(rs, 1)]; - T5 = ii[WS(rs, 1)]; - T6 = FMA(T2, T3, T4 * T5); - Tw = FNMS(T4, T3, T2 * T5); - To = ri[WS(rs, 3)]; - Tp = ii[WS(rs, 3)]; - Tq = FMA(T7, To, T9 * Tp); - TA = FNMS(T9, To, T7 * Tp); - } - { - E Tc, Tg, Tk, Tm; - Tc = ri[WS(rs, 4)]; - Tg = ii[WS(rs, 4)]; - Th = FMA(Tb, Tc, Tf * Tg); - Tx = FNMS(Tf, Tc, Tb * Tg); - Tk = ri[WS(rs, 2)]; - Tm = ii[WS(rs, 2)]; - Tn = FMA(Tj, Tk, Tl * Tm); - Tz = FNMS(Tl, Tk, Tj * Tm); - } - Ty = Tw - Tx; - TB = Tz - TA; - TN = Tn - Tq; - TM = T6 - Th; - TF = Tw + Tx; - TG = Tz + TA; - TH = TF + TG; - Ti = T6 + Th; - Tr = Tn + Tq; - Ts = Ti + Tr; - } - ri[0] = T1 + Ts; - ii[0] = TH + TI; - { - E TC, TE, Tv, TD, Tt, Tu; - TC = FMA(KP951056516, Ty, KP587785252 * TB); - TE = FNMS(KP587785252, Ty, KP951056516 * TB); - Tt = KP559016994 * (Ti - Tr); - Tu = FNMS(KP250000000, Ts, T1); - Tv = Tt + Tu; - TD = Tu - Tt; - ri[WS(rs, 4)] = Tv - TC; - ri[WS(rs, 3)] = TD + TE; - ri[WS(rs, 1)] = Tv + TC; - ri[WS(rs, 2)] = TD - TE; - } - { - E TO, TP, TL, TQ, TJ, TK; - TO = FMA(KP951056516, TM, KP587785252 * TN); - TP = FNMS(KP587785252, TM, KP951056516 * TN); - TJ = KP559016994 * (TF - TG); - TK = FNMS(KP250000000, TH, TI); - TL = TJ + TK; - TQ = TK - TJ; - ii[WS(rs, 1)] = TL - TO; - ii[WS(rs, 3)] = TQ - TP; - ii[WS(rs, 4)] = TO + TL; - ii[WS(rs, 2)] = TP + TQ; - } - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_CEXP, 0, 1 }, - { TW_CEXP, 0, 3 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 5, "t2_5", twinstr, &GENUS, { 30, 18, 14, 0 }, 0, 0, 0 }; - -void X(codelet_t2_5) (planner *p) { - X(kdft_dit_register) (p, t2_5, &desc); -} -#endif diff --git a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t2_64.c b/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t2_64.c deleted file mode 100644 index 408bde94..00000000 --- a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t2_64.c +++ /dev/null @@ -1,4243 +0,0 @@ -/* - * Copyright (c) 2003, 2007-14 Matteo Frigo - * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - */ - -/* This file was automatically generated --- DO NOT EDIT */ -/* Generated on Tue Sep 14 10:44:33 EDT 2021 */ - -#include "dft/codelet-dft.h" - -#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA) - -/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 64 -name t2_64 -include dft/scalar/t.h */ - -/* - * This function contains 1154 FP additions, 840 FP multiplications, - * (or, 520 additions, 206 multiplications, 634 fused multiply/add), - * 316 stack variables, 15 constants, and 256 memory accesses - */ -#include "dft/scalar/t.h" - -static void t2_64(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - DK(KP995184726, +0.995184726672196886244836953109479921575474869); - DK(KP773010453, +0.773010453362736960810906609758469800971041293); - DK(KP956940335, +0.956940335732208864935797886980269969482849206); - DK(KP881921264, +0.881921264348355029712756863660388349508442621); - DK(KP098491403, +0.098491403357164253077197521291327432293052451); - DK(KP820678790, +0.820678790828660330972281985331011598767386482); - DK(KP303346683, +0.303346683607342391675883946941299872384187453); - DK(KP534511135, +0.534511135950791641089685961295362908582039528); - DK(KP980785280, +0.980785280403230449126182236134239036973933731); - DK(KP831469612, +0.831469612302545237078788377617905756738560812); - DK(KP198912367, +0.198912367379658006911597622644676228597850501); - DK(KP668178637, +0.668178637919298919997757686523080761552472251); - DK(KP923879532, +0.923879532511286756128183189396788286822416626); - DK(KP707106781, +0.707106781186547524400844362104849039284835938); - DK(KP414213562, +0.414213562373095048801688724209698078569671875); - { - INT m; - for (m = mb, W = W + (mb * 10); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 10, MAKE_VOLATILE_STRIDE(128, rs)) { - E T2, T3, Tc, T8, Te, T5, T6, Tr, T7, TJ, T14, T3d, T3i, TG, T10; - E T3a, T3g, TL, TP, Tb, Td, T17, Tt, Tu, T1i, Ti, T2U, T1t, T7B, T5O; - E T3N, T3U, T1I, T3G, T3R, T79, T1x, T3D, T2l, T3X, T2d, T1M, T4B, T4x, T4T; - E T2h, T29, T5s, T81, T5w, T7X, T7N, T7h, T64, T6a, T6e, T7l, T60, T7R, T5A; - E T6h, T6J, T7o, T5E, T6k, T6N, T7r, T2X, T6t, T6x, TO, TK, TQ, T7c, TU; - E T2x, T2u, T2y, T7E, T2C, T4b, T48, T4c, T5R, T4g, T3m, T3j, T3n, T4W, T3r; - E Tx, Ty, TC, T1Z, T23, T4s, T4p, T70, T6W, T19, T41, T44, T1a, T1e, T35; - E T31, T59, T55, T1k, T1R, T1V, T1l, T1p, T2Q, T2N, T8i, T8e, Th, T4E, T4H; - E Tj, Tn, T3A, T3w, T5n, T5j; - { - E T1H, Tg, Tw, T1s, T2g, TH, T2t, T47, T3h, T28, T4w, T3M, T2c, T4A, T3Q; - E T1w, T2k, T1L, T5r, T80; - { - E TI, T13, TF, TZ, Ta, T4, T9, Ts; - T2 = W[0]; - T3 = W[2]; - T4 = T2 * T3; - Tc = W[5]; - TI = T3 * Tc; - T13 = T2 * Tc; - T8 = W[4]; - Te = W[6]; - TF = T3 * T8; - T1H = T8 * Te; - TZ = T2 * T8; - T5 = W[1]; - T6 = W[3]; - Ta = T2 * T6; - Tr = FMA(T5, T6, T4); - T7 = FNMS(T5, T6, T4); - Tg = T7 * Tc; - Tw = Tr * Tc; - T1s = T3 * Te; - T2g = T2 * Te; - TJ = FMA(T6, T8, TI); - T14 = FNMS(T5, T8, T13); - T3d = FMA(T5, T8, T13); - T3i = FNMS(T6, T8, TI); - TG = FNMS(T6, Tc, TF); - TH = TG * Te; - T10 = FMA(T5, Tc, TZ); - T2t = T10 * Te; - T3a = FNMS(T5, Tc, TZ); - T47 = T3a * Te; - T3g = FMA(T6, Tc, TF); - T3h = T3g * Te; - TL = W[8]; - T28 = T3 * TL; - T4w = T8 * TL; - T3M = T2 * TL; - TP = W[9]; - T2c = T3 * TP; - T4A = T8 * TP; - T3Q = T2 * TP; - T9 = T7 * T8; - Tb = FMA(T5, T3, Ta); - Td = FMA(Tb, Tc, T9); - T17 = FNMS(Tb, Tc, T9); - Ts = Tr * T8; - Tt = FNMS(T5, T3, Ta); - Tu = FNMS(Tt, Tc, Ts); - T1i = FMA(Tt, Tc, Ts); - Ti = W[7]; - T1w = T3 * Ti; - T2k = T2 * Ti; - T1L = T8 * Ti; - T2U = FMA(Tc, Ti, T1H); - } - T1t = FMA(T6, Ti, T1s); - T7B = FNMS(T14, Ti, T2t); - T5O = FNMS(T3d, Ti, T47); - T3N = FMA(T5, TP, T3M); - T3U = FNMS(T6, Ti, T1s); - T1I = FNMS(Tc, Ti, T1H); - T3G = FNMS(T5, Te, T2k); - T3R = FNMS(T5, TL, T3Q); - T79 = FNMS(TJ, Ti, TH); - T1x = FNMS(T6, Te, T1w); - T3D = FMA(T5, Ti, T2g); - T2l = FMA(T5, Te, T2k); - T3X = FMA(T6, Te, T1w); - T2d = FNMS(T6, TL, T2c); - T1M = FMA(Tc, Te, T1L); - T4B = FNMS(Tc, TL, T4A); - T4x = FMA(Tc, TP, T4w); - T4T = FNMS(T3i, Ti, T3h); - T2h = FNMS(T5, Ti, T2g); - T29 = FMA(T6, TP, T28); - T5r = T3g * TL; - T5s = FMA(T3i, TP, T5r); - T80 = T7 * TP; - T81 = FNMS(Tb, TL, T80); - { - E T5v, T7W, T7M, T7g, T63; - T5v = T3g * TP; - T5w = FNMS(T3i, TL, T5v); - T7W = T7 * TL; - T7X = FMA(Tb, TP, T7W); - T7M = TG * TL; - T7N = FMA(TJ, TP, T7M); - T7g = T10 * TL; - T7h = FMA(T14, TP, T7g); - T63 = T3a * TP; - T64 = FNMS(T3d, TL, T63); - } - { - E T69, T6d, T7k, T5Z, T7Q, T5z; - T69 = Tr * TL; - T6a = FMA(Tt, TP, T69); - T6d = Tr * TP; - T6e = FNMS(Tt, TL, T6d); - T7k = T10 * TP; - T7l = FNMS(T14, TL, T7k); - T5Z = T3a * TL; - T60 = FMA(T3d, TP, T5Z); - T7Q = TG * TP; - T7R = FNMS(TJ, TL, T7Q); - T5z = Tr * Te; - T5A = FMA(Tt, Ti, T5z); - T6h = FNMS(Tt, Ti, T5z); - } - { - E T6I, T5D, T6M, T6s, T6w; - T6I = T7 * Te; - T6J = FNMS(Tb, Ti, T6I); - T7o = FMA(Tb, Ti, T6I); - T5D = Tr * Ti; - T5E = FNMS(Tt, Te, T5D); - T6k = FMA(Tt, Te, T5D); - T6M = T7 * Ti; - T6N = FMA(Tb, Te, T6M); - T7r = FNMS(Tb, Te, T6M); - T6s = T2U * TL; - T6w = T2U * TP; - T2X = FNMS(Tc, Te, T1L); - T6t = FMA(T2X, TP, T6s); - T6x = FNMS(T2X, TL, T6w); - { - E TN, TM, TT, T2w, T2v, T2B; - TN = TG * Ti; - TO = FNMS(TJ, Te, TN); - TK = FMA(TJ, Ti, TH); - TM = TK * TL; - TT = TK * TP; - TQ = FMA(TO, TP, TM); - T7c = FMA(TJ, Te, TN); - TU = FNMS(TO, TL, TT); - T2w = T10 * Ti; - T2x = FNMS(T14, Te, T2w); - T2u = FMA(T14, Ti, T2t); - T2v = T2u * TL; - T2B = T2u * TP; - T2y = FMA(T2x, TP, T2v); - T7E = FMA(T14, Te, T2w); - T2C = FNMS(T2x, TL, T2B); - } - } - { - E T4a, T49, T4f, T3l, T3k, T3q; - T4a = T3a * Ti; - T4b = FNMS(T3d, Te, T4a); - T48 = FMA(T3d, Ti, T47); - T49 = T48 * TL; - T4f = T48 * TP; - T4c = FMA(T4b, TP, T49); - T5R = FMA(T3d, Te, T4a); - T4g = FNMS(T4b, TL, T4f); - T3l = T3g * Ti; - T3m = FNMS(T3i, Te, T3l); - T3j = FMA(T3i, Ti, T3h); - T3k = T3j * TL; - T3q = T3j * TP; - T3n = FMA(T3m, TP, T3k); - T4W = FMA(T3i, Te, T3l); - T3r = FNMS(T3m, TL, T3q); - { - E T1Y, T22, Tv, TB, T6Z, T6V; - T1Y = Tu * TL; - T22 = Tu * TP; - Tv = Tu * Te; - TB = Tu * Ti; - Tx = FMA(Tt, T8, Tw); - Ty = FMA(Tx, Ti, Tv); - TC = FNMS(Tx, Te, TB); - T1Z = FMA(Tx, TP, T1Y); - T23 = FNMS(Tx, TL, T22); - T4s = FMA(Tx, Te, TB); - T4p = FNMS(Tx, Ti, Tv); - T6Z = Ty * TP; - T70 = FNMS(TC, TL, T6Z); - T6V = Ty * TL; - T6W = FMA(TC, TP, T6V); - } - } - { - E T30, T34, T18, T1d, T58, T54; - T30 = T17 * TL; - T34 = T17 * TP; - T18 = T17 * Te; - T1d = T17 * Ti; - T19 = FMA(Tb, T8, Tg); - T41 = FMA(T19, Ti, T18); - T44 = FNMS(T19, Te, T1d); - T1a = FNMS(T19, Ti, T18); - T1e = FMA(T19, Te, T1d); - T35 = FNMS(T19, TL, T34); - T31 = FMA(T19, TP, T30); - T58 = T41 * TP; - T59 = FNMS(T44, TL, T58); - T54 = T41 * TL; - T55 = FMA(T44, TP, T54); - } - { - E T1j, T1o, T1Q, T1U, T8h, T8d; - T1j = T1i * TL; - T1o = T1i * TP; - T1Q = T1i * Te; - T1U = T1i * Ti; - T1k = FNMS(Tt, T8, Tw); - T1R = FMA(T1k, Ti, T1Q); - T1V = FNMS(T1k, Te, T1U); - T1l = FMA(T1k, TP, T1j); - T1p = FNMS(T1k, TL, T1o); - T2Q = FMA(T1k, Te, T1U); - T2N = FNMS(T1k, Ti, T1Q); - T8h = T1R * TP; - T8i = FNMS(T1V, TL, T8h); - T8d = T1R * TL; - T8e = FMA(T1V, TP, T8d); - } - { - E T3v, T3z, Tf, Tm, T5m, T5i; - T3v = Td * TL; - T3z = Td * TP; - Tf = Td * Te; - Tm = Td * Ti; - Th = FNMS(Tb, T8, Tg); - T4E = FMA(Th, Ti, Tf); - T4H = FNMS(Th, Te, Tm); - Tj = FNMS(Th, Ti, Tf); - Tn = FMA(Th, Te, Tm); - T3A = FNMS(Th, TL, T3z); - T3w = FMA(Th, TP, T3v); - T5m = T4E * TP; - T5n = FNMS(T4H, TL, T5m); - T5i = T4E * TL; - T5j = FMA(T4H, TP, T5i); - } - } - { - E TY, Tg4, Tl9, TlD, T8w, TdS, Tkd, TkE, T2G, Tge, Tgh, TiK, T98, Te1, T9f; - E Te0, T39, Tgq, Tgn, TiN, T9p, Te5, T9M, Te8, T74, Thr, Thc, Tja, TbI, TeE; - E TcB, TeP, T1B, TkD, Tg7, Tk7, T8D, TdT, T8K, TdU, T27, Tg9, Tgc, TiJ, T8T; - E TdY, T90, TdX, T4k, TgB, Tgy, TiT, T9Y, Tec, Tal, Tef, T5d, Th0, TgL, TiZ; - E Taz, Tel, Tbs, Tew, T3K, Tgo, Tgt, TiO, T9E, Te9, T9P, Te6, T4L, Tgz, TgE; - E TiU, Tad, Teg, Tao, Ted, T5I, TgM, Th3, Tj0, TaO, Tex, Tbv, Tem, T7v, Thd; - E Thu, Tjb, TbX, TeQ, TcE, TeF, T68, Tj5, TgS, Th5, Tbj, Tez, Tbx, Teq, T6B; - E Tj6, TgX, Th6, Tb4, TeA, Tby, Tet, T7V, Tjg, Thj, Thw, Tcs, TeS, TcG, TeJ; - E T8m, Tjh, Tho, Thx, Tcd, TeT, TcH, TeM; - { - E T1, Tkb, Tp, Tka, TE, T8s, TW, T8u; - T1 = ri[0]; - Tkb = ii[0]; - { - E Tk, Tl, To, Tk9; - Tk = ri[WS(rs, 32)]; - Tl = Tj * Tk; - To = ii[WS(rs, 32)]; - Tk9 = Tj * To; - Tp = FMA(Tn, To, Tl); - Tka = FNMS(Tn, Tk, Tk9); - } - { - E Tz, TA, TD, T8r; - Tz = ri[WS(rs, 16)]; - TA = Ty * Tz; - TD = ii[WS(rs, 16)]; - T8r = Ty * TD; - TE = FMA(TC, TD, TA); - T8s = FNMS(TC, Tz, T8r); - } - { - E TR, TS, TV, T8t; - TR = ri[WS(rs, 48)]; - TS = TQ * TR; - TV = ii[WS(rs, 48)]; - T8t = TQ * TV; - TW = FMA(TU, TV, TS); - T8u = FNMS(TU, TR, T8t); - } - { - E Tq, TX, Tl7, Tl8; - Tq = T1 + Tp; - TX = TE + TW; - TY = Tq + TX; - Tg4 = Tq - TX; - Tl7 = Tkb - Tka; - Tl8 = TE - TW; - Tl9 = Tl7 - Tl8; - TlD = Tl8 + Tl7; - } - { - E T8q, T8v, Tk8, Tkc; - T8q = T1 - Tp; - T8v = T8s - T8u; - T8w = T8q - T8v; - TdS = T8q + T8v; - Tk8 = T8s + T8u; - Tkc = Tka + Tkb; - Tkd = Tk8 + Tkc; - TkE = Tkc - Tk8; - } - } - { - E T2f, T93, T2E, T9d, T2n, T95, T2s, T9b; - { - E T2a, T2b, T2e, T92; - T2a = ri[WS(rs, 60)]; - T2b = T29 * T2a; - T2e = ii[WS(rs, 60)]; - T92 = T29 * T2e; - T2f = FMA(T2d, T2e, T2b); - T93 = FNMS(T2d, T2a, T92); - } - { - E T2z, T2A, T2D, T9c; - T2z = ri[WS(rs, 44)]; - T2A = T2y * T2z; - T2D = ii[WS(rs, 44)]; - T9c = T2y * T2D; - T2E = FMA(T2C, T2D, T2A); - T9d = FNMS(T2C, T2z, T9c); - } - { - E T2i, T2j, T2m, T94; - T2i = ri[WS(rs, 28)]; - T2j = T2h * T2i; - T2m = ii[WS(rs, 28)]; - T94 = T2h * T2m; - T2n = FMA(T2l, T2m, T2j); - T95 = FNMS(T2l, T2i, T94); - } - { - E T2p, T2q, T2r, T9a; - T2p = ri[WS(rs, 12)]; - T2q = TG * T2p; - T2r = ii[WS(rs, 12)]; - T9a = TG * T2r; - T2s = FMA(TJ, T2r, T2q); - T9b = FNMS(TJ, T2p, T9a); - } - { - E T2o, T2F, Tgf, Tgg; - T2o = T2f + T2n; - T2F = T2s + T2E; - T2G = T2o + T2F; - Tge = T2o - T2F; - Tgf = T93 + T95; - Tgg = T9b + T9d; - Tgh = Tgf - Tgg; - TiK = Tgf + Tgg; - } - { - E T96, T97, T99, T9e; - T96 = T93 - T95; - T97 = T2s - T2E; - T98 = T96 + T97; - Te1 = T96 - T97; - T99 = T2f - T2n; - T9e = T9b - T9d; - T9f = T99 - T9e; - Te0 = T99 + T9e; - } - } - { - E T2M, T9k, T37, T9K, T2S, T9m, T2Z, T9I; - { - E T2J, T2K, T2L, T9j; - T2J = ri[WS(rs, 2)]; - T2K = Tr * T2J; - T2L = ii[WS(rs, 2)]; - T9j = Tr * T2L; - T2M = FMA(Tt, T2L, T2K); - T9k = FNMS(Tt, T2J, T9j); - } - { - E T32, T33, T36, T9J; - T32 = ri[WS(rs, 50)]; - T33 = T31 * T32; - T36 = ii[WS(rs, 50)]; - T9J = T31 * T36; - T37 = FMA(T35, T36, T33); - T9K = FNMS(T35, T32, T9J); - } - { - E T2O, T2P, T2R, T9l; - T2O = ri[WS(rs, 34)]; - T2P = T2N * T2O; - T2R = ii[WS(rs, 34)]; - T9l = T2N * T2R; - T2S = FMA(T2Q, T2R, T2P); - T9m = FNMS(T2Q, T2O, T9l); - } - { - E T2V, T2W, T2Y, T9H; - T2V = ri[WS(rs, 18)]; - T2W = T2U * T2V; - T2Y = ii[WS(rs, 18)]; - T9H = T2U * T2Y; - T2Z = FMA(T2X, T2Y, T2W); - T9I = FNMS(T2X, T2V, T9H); - } - { - E T2T, T38, Tgl, Tgm; - T2T = T2M + T2S; - T38 = T2Z + T37; - T39 = T2T + T38; - Tgq = T2T - T38; - Tgl = T9k + T9m; - Tgm = T9I + T9K; - Tgn = Tgl - Tgm; - TiN = Tgl + Tgm; - } - { - E T9n, T9o, T9G, T9L; - T9n = T9k - T9m; - T9o = T2Z - T37; - T9p = T9n + T9o; - Te5 = T9n - T9o; - T9G = T2M - T2S; - T9L = T9I - T9K; - T9M = T9G - T9L; - Te8 = T9G + T9L; - } - } - { - E T6H, TbD, T72, Tcz, T6P, TbF, T6U, Tcx; - { - E T6E, T6F, T6G, TbC; - T6E = ri[WS(rs, 63)]; - T6F = TL * T6E; - T6G = ii[WS(rs, 63)]; - TbC = TL * T6G; - T6H = FMA(TP, T6G, T6F); - TbD = FNMS(TP, T6E, TbC); - } - { - E T6X, T6Y, T71, Tcy; - T6X = ri[WS(rs, 47)]; - T6Y = T6W * T6X; - T71 = ii[WS(rs, 47)]; - Tcy = T6W * T71; - T72 = FMA(T70, T71, T6Y); - Tcz = FNMS(T70, T6X, Tcy); - } - { - E T6K, T6L, T6O, TbE; - T6K = ri[WS(rs, 31)]; - T6L = T6J * T6K; - T6O = ii[WS(rs, 31)]; - TbE = T6J * T6O; - T6P = FMA(T6N, T6O, T6L); - TbF = FNMS(T6N, T6K, TbE); - } - { - E T6R, T6S, T6T, Tcw; - T6R = ri[WS(rs, 15)]; - T6S = TK * T6R; - T6T = ii[WS(rs, 15)]; - Tcw = TK * T6T; - T6U = FMA(TO, T6T, T6S); - Tcx = FNMS(TO, T6R, Tcw); - } - { - E T6Q, T73, Tha, Thb; - T6Q = T6H + T6P; - T73 = T6U + T72; - T74 = T6Q + T73; - Thr = T6Q - T73; - Tha = TbD + TbF; - Thb = Tcx + Tcz; - Thc = Tha - Thb; - Tja = Tha + Thb; - } - { - E TbG, TbH, Tcv, TcA; - TbG = TbD - TbF; - TbH = T6U - T72; - TbI = TbG + TbH; - TeE = TbG - TbH; - Tcv = T6H - T6P; - TcA = Tcx - Tcz; - TcB = Tcv - TcA; - TeP = Tcv + TcA; - } - } - { - E T16, T8y, T1z, T8I, T1g, T8A, T1r, T8G; - { - E T11, T12, T15, T8x; - T11 = ri[WS(rs, 8)]; - T12 = T10 * T11; - T15 = ii[WS(rs, 8)]; - T8x = T10 * T15; - T16 = FMA(T14, T15, T12); - T8y = FNMS(T14, T11, T8x); - } - { - E T1u, T1v, T1y, T8H; - T1u = ri[WS(rs, 24)]; - T1v = T1t * T1u; - T1y = ii[WS(rs, 24)]; - T8H = T1t * T1y; - T1z = FMA(T1x, T1y, T1v); - T8I = FNMS(T1x, T1u, T8H); - } - { - E T1b, T1c, T1f, T8z; - T1b = ri[WS(rs, 40)]; - T1c = T1a * T1b; - T1f = ii[WS(rs, 40)]; - T8z = T1a * T1f; - T1g = FMA(T1e, T1f, T1c); - T8A = FNMS(T1e, T1b, T8z); - } - { - E T1m, T1n, T1q, T8F; - T1m = ri[WS(rs, 56)]; - T1n = T1l * T1m; - T1q = ii[WS(rs, 56)]; - T8F = T1l * T1q; - T1r = FMA(T1p, T1q, T1n); - T8G = FNMS(T1p, T1m, T8F); - } - { - E T1h, T1A, Tg5, Tg6; - T1h = T16 + T1g; - T1A = T1r + T1z; - T1B = T1h + T1A; - TkD = T1A - T1h; - Tg5 = T8y + T8A; - Tg6 = T8G + T8I; - Tg7 = Tg5 - Tg6; - Tk7 = Tg5 + Tg6; - } - { - E T8B, T8C, T8E, T8J; - T8B = T8y - T8A; - T8C = T16 - T1g; - T8D = T8B - T8C; - TdT = T8C + T8B; - T8E = T1r - T1z; - T8J = T8G - T8I; - T8K = T8E + T8J; - TdU = T8E - T8J; - } - } - { - E T1G, T8O, T25, T8Y, T1O, T8Q, T1X, T8W; - { - E T1D, T1E, T1F, T8N; - T1D = ri[WS(rs, 4)]; - T1E = T7 * T1D; - T1F = ii[WS(rs, 4)]; - T8N = T7 * T1F; - T1G = FMA(Tb, T1F, T1E); - T8O = FNMS(Tb, T1D, T8N); - } - { - E T20, T21, T24, T8X; - T20 = ri[WS(rs, 52)]; - T21 = T1Z * T20; - T24 = ii[WS(rs, 52)]; - T8X = T1Z * T24; - T25 = FMA(T23, T24, T21); - T8Y = FNMS(T23, T20, T8X); - } - { - E T1J, T1K, T1N, T8P; - T1J = ri[WS(rs, 36)]; - T1K = T1I * T1J; - T1N = ii[WS(rs, 36)]; - T8P = T1I * T1N; - T1O = FMA(T1M, T1N, T1K); - T8Q = FNMS(T1M, T1J, T8P); - } - { - E T1S, T1T, T1W, T8V; - T1S = ri[WS(rs, 20)]; - T1T = T1R * T1S; - T1W = ii[WS(rs, 20)]; - T8V = T1R * T1W; - T1X = FMA(T1V, T1W, T1T); - T8W = FNMS(T1V, T1S, T8V); - } - { - E T1P, T26, Tga, Tgb; - T1P = T1G + T1O; - T26 = T1X + T25; - T27 = T1P + T26; - Tg9 = T1P - T26; - Tga = T8O + T8Q; - Tgb = T8W + T8Y; - Tgc = Tga - Tgb; - TiJ = Tga + Tgb; - } - { - E T8R, T8S, T8U, T8Z; - T8R = T8O - T8Q; - T8S = T1X - T25; - T8T = T8R + T8S; - TdY = T8R - T8S; - T8U = T1G - T1O; - T8Z = T8W - T8Y; - T90 = T8U - T8Z; - TdX = T8U + T8Z; - } - } - { - E T3T, T9T, T4i, Taj, T3Z, T9V, T46, Tah; - { - E T3O, T3P, T3S, T9S; - T3O = ri[WS(rs, 62)]; - T3P = T3N * T3O; - T3S = ii[WS(rs, 62)]; - T9S = T3N * T3S; - T3T = FMA(T3R, T3S, T3P); - T9T = FNMS(T3R, T3O, T9S); - } - { - E T4d, T4e, T4h, Tai; - T4d = ri[WS(rs, 46)]; - T4e = T4c * T4d; - T4h = ii[WS(rs, 46)]; - Tai = T4c * T4h; - T4i = FMA(T4g, T4h, T4e); - Taj = FNMS(T4g, T4d, Tai); - } - { - E T3V, T3W, T3Y, T9U; - T3V = ri[WS(rs, 30)]; - T3W = T3U * T3V; - T3Y = ii[WS(rs, 30)]; - T9U = T3U * T3Y; - T3Z = FMA(T3X, T3Y, T3W); - T9V = FNMS(T3X, T3V, T9U); - } - { - E T42, T43, T45, Tag; - T42 = ri[WS(rs, 14)]; - T43 = T41 * T42; - T45 = ii[WS(rs, 14)]; - Tag = T41 * T45; - T46 = FMA(T44, T45, T43); - Tah = FNMS(T44, T42, Tag); - } - { - E T40, T4j, Tgw, Tgx; - T40 = T3T + T3Z; - T4j = T46 + T4i; - T4k = T40 + T4j; - TgB = T40 - T4j; - Tgw = T9T + T9V; - Tgx = Tah + Taj; - Tgy = Tgw - Tgx; - TiT = Tgw + Tgx; - } - { - E T9W, T9X, Taf, Tak; - T9W = T9T - T9V; - T9X = T46 - T4i; - T9Y = T9W + T9X; - Tec = T9W - T9X; - Taf = T3T - T3Z; - Tak = Tah - Taj; - Tal = Taf - Tak; - Tef = Taf + Tak; - } - } - { - E T4S, Tau, T5b, Tbq, T4Y, Taw, T53, Tbo; - { - E T4P, T4Q, T4R, Tat; - T4P = ri[WS(rs, 1)]; - T4Q = T2 * T4P; - T4R = ii[WS(rs, 1)]; - Tat = T2 * T4R; - T4S = FMA(T5, T4R, T4Q); - Tau = FNMS(T5, T4P, Tat); - } - { - E T56, T57, T5a, Tbp; - T56 = ri[WS(rs, 49)]; - T57 = T55 * T56; - T5a = ii[WS(rs, 49)]; - Tbp = T55 * T5a; - T5b = FMA(T59, T5a, T57); - Tbq = FNMS(T59, T56, Tbp); - } - { - E T4U, T4V, T4X, Tav; - T4U = ri[WS(rs, 33)]; - T4V = T4T * T4U; - T4X = ii[WS(rs, 33)]; - Tav = T4T * T4X; - T4Y = FMA(T4W, T4X, T4V); - Taw = FNMS(T4W, T4U, Tav); - } - { - E T50, T51, T52, Tbn; - T50 = ri[WS(rs, 17)]; - T51 = T48 * T50; - T52 = ii[WS(rs, 17)]; - Tbn = T48 * T52; - T53 = FMA(T4b, T52, T51); - Tbo = FNMS(T4b, T50, Tbn); - } - { - E T4Z, T5c, TgJ, TgK; - T4Z = T4S + T4Y; - T5c = T53 + T5b; - T5d = T4Z + T5c; - Th0 = T4Z - T5c; - TgJ = Tau + Taw; - TgK = Tbo + Tbq; - TgL = TgJ - TgK; - TiZ = TgJ + TgK; - } - { - E Tax, Tay, Tbm, Tbr; - Tax = Tau - Taw; - Tay = T53 - T5b; - Taz = Tax + Tay; - Tel = Tax - Tay; - Tbm = T4S - T4Y; - Tbr = Tbo - Tbq; - Tbs = Tbm - Tbr; - Tew = Tbm + Tbr; - } - } - { - E T3f, T9s, T3I, T9B, T3t, T9u, T3C, T9z; - { - E T3b, T3c, T3e, T9r; - T3b = ri[WS(rs, 10)]; - T3c = T3a * T3b; - T3e = ii[WS(rs, 10)]; - T9r = T3a * T3e; - T3f = FMA(T3d, T3e, T3c); - T9s = FNMS(T3d, T3b, T9r); - } - { - E T3E, T3F, T3H, T9A; - T3E = ri[WS(rs, 26)]; - T3F = T3D * T3E; - T3H = ii[WS(rs, 26)]; - T9A = T3D * T3H; - T3I = FMA(T3G, T3H, T3F); - T9B = FNMS(T3G, T3E, T9A); - } - { - E T3o, T3p, T3s, T9t; - T3o = ri[WS(rs, 42)]; - T3p = T3n * T3o; - T3s = ii[WS(rs, 42)]; - T9t = T3n * T3s; - T3t = FMA(T3r, T3s, T3p); - T9u = FNMS(T3r, T3o, T9t); - } - { - E T3x, T3y, T3B, T9y; - T3x = ri[WS(rs, 58)]; - T3y = T3w * T3x; - T3B = ii[WS(rs, 58)]; - T9y = T3w * T3B; - T3C = FMA(T3A, T3B, T3y); - T9z = FNMS(T3A, T3x, T9y); - } - { - E T3u, T3J, Tgr, Tgs; - T3u = T3f + T3t; - T3J = T3C + T3I; - T3K = T3u + T3J; - Tgo = T3J - T3u; - Tgr = T9s + T9u; - Tgs = T9z + T9B; - Tgt = Tgr - Tgs; - TiO = Tgr + Tgs; - { - E T9w, T9O, T9D, T9N; - { - E T9q, T9v, T9x, T9C; - T9q = T3f - T3t; - T9v = T9s - T9u; - T9w = T9q + T9v; - T9O = T9v - T9q; - T9x = T3C - T3I; - T9C = T9z - T9B; - T9D = T9x - T9C; - T9N = T9x + T9C; - } - T9E = T9w - T9D; - Te9 = T9w + T9D; - T9P = T9N - T9O; - Te6 = T9O + T9N; - } - } - } - { - E T4o, Ta1, T4J, Taa, T4u, Ta3, T4D, Ta8; - { - E T4l, T4m, T4n, Ta0; - T4l = ri[WS(rs, 6)]; - T4m = T3g * T4l; - T4n = ii[WS(rs, 6)]; - Ta0 = T3g * T4n; - T4o = FMA(T3i, T4n, T4m); - Ta1 = FNMS(T3i, T4l, Ta0); - } - { - E T4F, T4G, T4I, Ta9; - T4F = ri[WS(rs, 22)]; - T4G = T4E * T4F; - T4I = ii[WS(rs, 22)]; - Ta9 = T4E * T4I; - T4J = FMA(T4H, T4I, T4G); - Taa = FNMS(T4H, T4F, Ta9); - } - { - E T4q, T4r, T4t, Ta2; - T4q = ri[WS(rs, 38)]; - T4r = T4p * T4q; - T4t = ii[WS(rs, 38)]; - Ta2 = T4p * T4t; - T4u = FMA(T4s, T4t, T4r); - Ta3 = FNMS(T4s, T4q, Ta2); - } - { - E T4y, T4z, T4C, Ta7; - T4y = ri[WS(rs, 54)]; - T4z = T4x * T4y; - T4C = ii[WS(rs, 54)]; - Ta7 = T4x * T4C; - T4D = FMA(T4B, T4C, T4z); - Ta8 = FNMS(T4B, T4y, Ta7); - } - { - E T4v, T4K, TgC, TgD; - T4v = T4o + T4u; - T4K = T4D + T4J; - T4L = T4v + T4K; - Tgz = T4K - T4v; - TgC = Ta1 + Ta3; - TgD = Ta8 + Taa; - TgE = TgC - TgD; - TiU = TgC + TgD; - { - E Ta5, Tan, Tac, Tam; - { - E T9Z, Ta4, Ta6, Tab; - T9Z = T4o - T4u; - Ta4 = Ta1 - Ta3; - Ta5 = T9Z + Ta4; - Tan = Ta4 - T9Z; - Ta6 = T4D - T4J; - Tab = Ta8 - Taa; - Tac = Ta6 - Tab; - Tam = Ta6 + Tab; - } - Tad = Ta5 - Tac; - Teg = Ta5 + Tac; - Tao = Tam - Tan; - Ted = Tan + Tam; - } - } - } - { - E T5h, TaC, T5G, TaL, T5p, TaE, T5y, TaJ; - { - E T5e, T5f, T5g, TaB; - T5e = ri[WS(rs, 9)]; - T5f = T8 * T5e; - T5g = ii[WS(rs, 9)]; - TaB = T8 * T5g; - T5h = FMA(Tc, T5g, T5f); - TaC = FNMS(Tc, T5e, TaB); - } - { - E T5B, T5C, T5F, TaK; - T5B = ri[WS(rs, 25)]; - T5C = T5A * T5B; - T5F = ii[WS(rs, 25)]; - TaK = T5A * T5F; - T5G = FMA(T5E, T5F, T5C); - TaL = FNMS(T5E, T5B, TaK); - } - { - E T5k, T5l, T5o, TaD; - T5k = ri[WS(rs, 41)]; - T5l = T5j * T5k; - T5o = ii[WS(rs, 41)]; - TaD = T5j * T5o; - T5p = FMA(T5n, T5o, T5l); - TaE = FNMS(T5n, T5k, TaD); - } - { - E T5t, T5u, T5x, TaI; - T5t = ri[WS(rs, 57)]; - T5u = T5s * T5t; - T5x = ii[WS(rs, 57)]; - TaI = T5s * T5x; - T5y = FMA(T5w, T5x, T5u); - TaJ = FNMS(T5w, T5t, TaI); - } - { - E T5q, T5H, Th1, Th2; - T5q = T5h + T5p; - T5H = T5y + T5G; - T5I = T5q + T5H; - TgM = T5H - T5q; - Th1 = TaC + TaE; - Th2 = TaJ + TaL; - Th3 = Th1 - Th2; - Tj0 = Th1 + Th2; - { - E TaG, Tbu, TaN, Tbt; - { - E TaA, TaF, TaH, TaM; - TaA = T5h - T5p; - TaF = TaC - TaE; - TaG = TaA + TaF; - Tbu = TaF - TaA; - TaH = T5y - T5G; - TaM = TaJ - TaL; - TaN = TaH - TaM; - Tbt = TaH + TaM; - } - TaO = TaG - TaN; - Tex = TaG + TaN; - Tbv = Tbt - Tbu; - Tem = Tbu + Tbt; - } - } - } - { - E T78, TbL, T7t, TbU, T7e, TbN, T7n, TbS; - { - E T75, T76, T77, TbK; - T75 = ri[WS(rs, 7)]; - T76 = T1i * T75; - T77 = ii[WS(rs, 7)]; - TbK = T1i * T77; - T78 = FMA(T1k, T77, T76); - TbL = FNMS(T1k, T75, TbK); - } - { - E T7p, T7q, T7s, TbT; - T7p = ri[WS(rs, 23)]; - T7q = T7o * T7p; - T7s = ii[WS(rs, 23)]; - TbT = T7o * T7s; - T7t = FMA(T7r, T7s, T7q); - TbU = FNMS(T7r, T7p, TbT); - } - { - E T7a, T7b, T7d, TbM; - T7a = ri[WS(rs, 39)]; - T7b = T79 * T7a; - T7d = ii[WS(rs, 39)]; - TbM = T79 * T7d; - T7e = FMA(T7c, T7d, T7b); - TbN = FNMS(T7c, T7a, TbM); - } - { - E T7i, T7j, T7m, TbR; - T7i = ri[WS(rs, 55)]; - T7j = T7h * T7i; - T7m = ii[WS(rs, 55)]; - TbR = T7h * T7m; - T7n = FMA(T7l, T7m, T7j); - TbS = FNMS(T7l, T7i, TbR); - } - { - E T7f, T7u, Ths, Tht; - T7f = T78 + T7e; - T7u = T7n + T7t; - T7v = T7f + T7u; - Thd = T7u - T7f; - Ths = TbL + TbN; - Tht = TbS + TbU; - Thu = Ths - Tht; - Tjb = Ths + Tht; - { - E TbP, TcD, TbW, TcC; - { - E TbJ, TbO, TbQ, TbV; - TbJ = T78 - T7e; - TbO = TbL - TbN; - TbP = TbJ + TbO; - TcD = TbO - TbJ; - TbQ = T7n - T7t; - TbV = TbS - TbU; - TbW = TbQ - TbV; - TcC = TbQ + TbV; - } - TbX = TbP - TbW; - TeQ = TbP + TbW; - TcE = TcC - TcD; - TeF = TcD + TcC; - } - } - } - { - E T5N, Tbd, T66, Tb9, T5T, Tbf, T5Y, Tb7; - { - E T5K, T5L, T5M, Tbc; - T5K = ri[WS(rs, 5)]; - T5L = Td * T5K; - T5M = ii[WS(rs, 5)]; - Tbc = Td * T5M; - T5N = FMA(Th, T5M, T5L); - Tbd = FNMS(Th, T5K, Tbc); - } - { - E T61, T62, T65, Tb8; - T61 = ri[WS(rs, 53)]; - T62 = T60 * T61; - T65 = ii[WS(rs, 53)]; - Tb8 = T60 * T65; - T66 = FMA(T64, T65, T62); - Tb9 = FNMS(T64, T61, Tb8); - } - { - E T5P, T5Q, T5S, Tbe; - T5P = ri[WS(rs, 37)]; - T5Q = T5O * T5P; - T5S = ii[WS(rs, 37)]; - Tbe = T5O * T5S; - T5T = FMA(T5R, T5S, T5Q); - Tbf = FNMS(T5R, T5P, Tbe); - } - { - E T5V, T5W, T5X, Tb6; - T5V = ri[WS(rs, 21)]; - T5W = T3j * T5V; - T5X = ii[WS(rs, 21)]; - Tb6 = T3j * T5X; - T5Y = FMA(T3m, T5X, T5W); - Tb7 = FNMS(T3m, T5V, Tb6); - } - { - E T5U, T67, TgR, TgO, TgP, TgQ; - T5U = T5N + T5T; - T67 = T5Y + T66; - TgR = T5U - T67; - TgO = Tbd + Tbf; - TgP = Tb7 + Tb9; - TgQ = TgO - TgP; - T68 = T5U + T67; - Tj5 = TgO + TgP; - TgS = TgQ - TgR; - Th5 = TgR + TgQ; - } - { - E Tbb, Tep, Tbi, Teo; - { - E Tb5, Tba, Tbg, Tbh; - Tb5 = T5N - T5T; - Tba = Tb7 - Tb9; - Tbb = Tb5 - Tba; - Tep = Tb5 + Tba; - Tbg = Tbd - Tbf; - Tbh = T5Y - T66; - Tbi = Tbg + Tbh; - Teo = Tbg - Tbh; - } - Tbj = FNMS(KP414213562, Tbi, Tbb); - Tez = FMA(KP414213562, Teo, Tep); - Tbx = FMA(KP414213562, Tbb, Tbi); - Teq = FNMS(KP414213562, Tep, Teo); - } - } - { - E T6g, TaY, T6z, TaU, T6m, Tb0, T6r, TaS; - { - E T6b, T6c, T6f, TaX; - T6b = ri[WS(rs, 61)]; - T6c = T6a * T6b; - T6f = ii[WS(rs, 61)]; - TaX = T6a * T6f; - T6g = FMA(T6e, T6f, T6c); - TaY = FNMS(T6e, T6b, TaX); - } - { - E T6u, T6v, T6y, TaT; - T6u = ri[WS(rs, 45)]; - T6v = T6t * T6u; - T6y = ii[WS(rs, 45)]; - TaT = T6t * T6y; - T6z = FMA(T6x, T6y, T6v); - TaU = FNMS(T6x, T6u, TaT); - } - { - E T6i, T6j, T6l, TaZ; - T6i = ri[WS(rs, 29)]; - T6j = T6h * T6i; - T6l = ii[WS(rs, 29)]; - TaZ = T6h * T6l; - T6m = FMA(T6k, T6l, T6j); - Tb0 = FNMS(T6k, T6i, TaZ); - } - { - E T6o, T6p, T6q, TaR; - T6o = ri[WS(rs, 13)]; - T6p = T17 * T6o; - T6q = ii[WS(rs, 13)]; - TaR = T17 * T6q; - T6r = FMA(T19, T6q, T6p); - TaS = FNMS(T19, T6o, TaR); - } - { - E T6n, T6A, TgT, TgU, TgV, TgW; - T6n = T6g + T6m; - T6A = T6r + T6z; - TgT = T6n - T6A; - TgU = TaY + Tb0; - TgV = TaS + TaU; - TgW = TgU - TgV; - T6B = T6n + T6A; - Tj6 = TgU + TgV; - TgX = TgT + TgW; - Th6 = TgT - TgW; - } - { - E TaW, Tes, Tb3, Ter; - { - E TaQ, TaV, Tb1, Tb2; - TaQ = T6g - T6m; - TaV = TaS - TaU; - TaW = TaQ - TaV; - Tes = TaQ + TaV; - Tb1 = TaY - Tb0; - Tb2 = T6r - T6z; - Tb3 = Tb1 + Tb2; - Ter = Tb1 - Tb2; - } - Tb4 = FMA(KP414213562, Tb3, TaW); - TeA = FNMS(KP414213562, Ter, Tes); - Tby = FNMS(KP414213562, TaW, Tb3); - Tet = FMA(KP414213562, Tes, Ter); - } - } - { - E T7A, Tcm, T7T, Tci, T7G, Tco, T7L, Tcg; - { - E T7x, T7y, T7z, Tcl; - T7x = ri[WS(rs, 3)]; - T7y = T3 * T7x; - T7z = ii[WS(rs, 3)]; - Tcl = T3 * T7z; - T7A = FMA(T6, T7z, T7y); - Tcm = FNMS(T6, T7x, Tcl); - } - { - E T7O, T7P, T7S, Tch; - T7O = ri[WS(rs, 51)]; - T7P = T7N * T7O; - T7S = ii[WS(rs, 51)]; - Tch = T7N * T7S; - T7T = FMA(T7R, T7S, T7P); - Tci = FNMS(T7R, T7O, Tch); - } - { - E T7C, T7D, T7F, Tcn; - T7C = ri[WS(rs, 35)]; - T7D = T7B * T7C; - T7F = ii[WS(rs, 35)]; - Tcn = T7B * T7F; - T7G = FMA(T7E, T7F, T7D); - Tco = FNMS(T7E, T7C, Tcn); - } - { - E T7I, T7J, T7K, Tcf; - T7I = ri[WS(rs, 19)]; - T7J = T2u * T7I; - T7K = ii[WS(rs, 19)]; - Tcf = T2u * T7K; - T7L = FMA(T2x, T7K, T7J); - Tcg = FNMS(T2x, T7I, Tcf); - } - { - E T7H, T7U, Thi, Thf, Thg, Thh; - T7H = T7A + T7G; - T7U = T7L + T7T; - Thi = T7H - T7U; - Thf = Tcm + Tco; - Thg = Tcg + Tci; - Thh = Thf - Thg; - T7V = T7H + T7U; - Tjg = Thf + Thg; - Thj = Thh - Thi; - Thw = Thi + Thh; - } - { - E Tck, TeI, Tcr, TeH; - { - E Tce, Tcj, Tcp, Tcq; - Tce = T7A - T7G; - Tcj = Tcg - Tci; - Tck = Tce - Tcj; - TeI = Tce + Tcj; - Tcp = Tcm - Tco; - Tcq = T7L - T7T; - Tcr = Tcp + Tcq; - TeH = Tcp - Tcq; - } - Tcs = FNMS(KP414213562, Tcr, Tck); - TeS = FMA(KP414213562, TeH, TeI); - TcG = FMA(KP414213562, Tck, Tcr); - TeJ = FNMS(KP414213562, TeI, TeH); - } - } - { - E T83, Tc7, T8k, Tc3, T87, Tc9, T8c, Tc1; - { - E T7Y, T7Z, T82, Tc6; - T7Y = ri[WS(rs, 59)]; - T7Z = T7X * T7Y; - T82 = ii[WS(rs, 59)]; - Tc6 = T7X * T82; - T83 = FMA(T81, T82, T7Z); - Tc7 = FNMS(T81, T7Y, Tc6); - } - { - E T8f, T8g, T8j, Tc2; - T8f = ri[WS(rs, 43)]; - T8g = T8e * T8f; - T8j = ii[WS(rs, 43)]; - Tc2 = T8e * T8j; - T8k = FMA(T8i, T8j, T8g); - Tc3 = FNMS(T8i, T8f, Tc2); - } - { - E T84, T85, T86, Tc8; - T84 = ri[WS(rs, 27)]; - T85 = Te * T84; - T86 = ii[WS(rs, 27)]; - Tc8 = Te * T86; - T87 = FMA(Ti, T86, T85); - Tc9 = FNMS(Ti, T84, Tc8); - } - { - E T89, T8a, T8b, Tc0; - T89 = ri[WS(rs, 11)]; - T8a = Tu * T89; - T8b = ii[WS(rs, 11)]; - Tc0 = Tu * T8b; - T8c = FMA(Tx, T8b, T8a); - Tc1 = FNMS(Tx, T89, Tc0); - } - { - E T88, T8l, Thk, Thl, Thm, Thn; - T88 = T83 + T87; - T8l = T8c + T8k; - Thk = T88 - T8l; - Thl = Tc7 + Tc9; - Thm = Tc1 + Tc3; - Thn = Thl - Thm; - T8m = T88 + T8l; - Tjh = Thl + Thm; - Tho = Thk + Thn; - Thx = Thk - Thn; - } - { - E Tc5, TeL, Tcc, TeK; - { - E TbZ, Tc4, Tca, Tcb; - TbZ = T83 - T87; - Tc4 = Tc1 - Tc3; - Tc5 = TbZ - Tc4; - TeL = TbZ + Tc4; - Tca = Tc7 - Tc9; - Tcb = T8c - T8k; - Tcc = Tca + Tcb; - TeK = Tca - Tcb; - } - Tcd = FMA(KP414213562, Tcc, Tc5); - TeT = FNMS(KP414213562, TeK, TeL); - TcH = FNMS(KP414213562, Tc5, Tcc); - TeM = FMA(KP414213562, TeL, TeK); - } - } - { - E T2I, TjG, T4N, Tkj, Tkf, Tkk, TjJ, Tk5, T8o, Tk2, TjU, TjY, T6D, Tk1, TjP; - E TjX; - { - E T1C, T2H, TjH, TjI; - T1C = TY + T1B; - T2H = T27 + T2G; - T2I = T1C + T2H; - TjG = T1C - T2H; - { - E T3L, T4M, Tk6, Tke; - T3L = T39 + T3K; - T4M = T4k + T4L; - T4N = T3L + T4M; - Tkj = T4M - T3L; - Tk6 = TiJ + TiK; - Tke = Tk7 + Tkd; - Tkf = Tk6 + Tke; - Tkk = Tke - Tk6; - } - TjH = TiN + TiO; - TjI = TiT + TiU; - TjJ = TjH - TjI; - Tk5 = TjH + TjI; - { - E T7w, T8n, TjQ, TjR, TjS, TjT; - T7w = T74 + T7v; - T8n = T7V + T8m; - TjQ = T7w - T8n; - TjR = Tja + Tjb; - TjS = Tjg + Tjh; - TjT = TjR - TjS; - T8o = T7w + T8n; - Tk2 = TjR + TjS; - TjU = TjQ - TjT; - TjY = TjQ + TjT; - } - { - E T5J, T6C, TjL, TjM, TjN, TjO; - T5J = T5d + T5I; - T6C = T68 + T6B; - TjL = T5J - T6C; - TjM = TiZ + Tj0; - TjN = Tj5 + Tj6; - TjO = TjM - TjN; - T6D = T5J + T6C; - Tk1 = TjM + TjN; - TjP = TjL + TjO; - TjX = TjO - TjL; - } - } - { - E T4O, T8p, Tk4, Tkg; - T4O = T2I + T4N; - T8p = T6D + T8o; - ri[WS(rs, 32)] = T4O - T8p; - ri[0] = T4O + T8p; - Tk4 = Tk1 + Tk2; - Tkg = Tk5 + Tkf; - ii[0] = Tk4 + Tkg; - ii[WS(rs, 32)] = Tkg - Tk4; - } - { - E TjK, TjV, Tkl, Tkm; - TjK = TjG + TjJ; - TjV = TjP + TjU; - ri[WS(rs, 40)] = FNMS(KP707106781, TjV, TjK); - ri[WS(rs, 8)] = FMA(KP707106781, TjV, TjK); - Tkl = Tkj + Tkk; - Tkm = TjX + TjY; - ii[WS(rs, 8)] = FMA(KP707106781, Tkm, Tkl); - ii[WS(rs, 40)] = FNMS(KP707106781, Tkm, Tkl); - } - { - E TjW, TjZ, Tkn, Tko; - TjW = TjG - TjJ; - TjZ = TjX - TjY; - ri[WS(rs, 56)] = FNMS(KP707106781, TjZ, TjW); - ri[WS(rs, 24)] = FMA(KP707106781, TjZ, TjW); - Tkn = Tkk - Tkj; - Tko = TjU - TjP; - ii[WS(rs, 24)] = FMA(KP707106781, Tko, Tkn); - ii[WS(rs, 56)] = FNMS(KP707106781, Tko, Tkn); - } - { - E Tk0, Tk3, Tkh, Tki; - Tk0 = T2I - T4N; - Tk3 = Tk1 - Tk2; - ri[WS(rs, 48)] = Tk0 - Tk3; - ri[WS(rs, 16)] = Tk0 + Tk3; - Tkh = T8o - T6D; - Tki = Tkf - Tk5; - ii[WS(rs, 16)] = Tkh + Tki; - ii[WS(rs, 48)] = Tki - Tkh; - } - } - { - E TiM, Tjq, Tkr, Tkx, TiX, Tky, Tjt, Tks, Tj9, TjD, Tjn, Tjx, Tjk, TjE, Tjo; - E TjA; - { - E TiI, TiL, Tkp, Tkq; - TiI = TY - T1B; - TiL = TiJ - TiK; - TiM = TiI - TiL; - Tjq = TiI + TiL; - Tkp = T2G - T27; - Tkq = Tkd - Tk7; - Tkr = Tkp + Tkq; - Tkx = Tkq - Tkp; - } - { - E TiR, Tjr, TiW, Tjs; - { - E TiP, TiQ, TiS, TiV; - TiP = TiN - TiO; - TiQ = T39 - T3K; - TiR = TiP - TiQ; - Tjr = TiQ + TiP; - TiS = T4k - T4L; - TiV = TiT - TiU; - TiW = TiS + TiV; - Tjs = TiS - TiV; - } - TiX = TiR - TiW; - Tky = Tjs - Tjr; - Tjt = Tjr + Tjs; - Tks = TiR + TiW; - } - { - E Tj3, Tjw, Tj8, Tjv; - { - E Tj1, Tj2, Tj4, Tj7; - Tj1 = TiZ - Tj0; - Tj2 = T6B - T68; - Tj3 = Tj1 - Tj2; - Tjw = Tj1 + Tj2; - Tj4 = T5d - T5I; - Tj7 = Tj5 - Tj6; - Tj8 = Tj4 - Tj7; - Tjv = Tj4 + Tj7; - } - Tj9 = FMA(KP414213562, Tj8, Tj3); - TjD = FNMS(KP414213562, Tjv, Tjw); - Tjn = FNMS(KP414213562, Tj3, Tj8); - Tjx = FMA(KP414213562, Tjw, Tjv); - } - { - E Tje, Tjz, Tjj, Tjy; - { - E Tjc, Tjd, Tjf, Tji; - Tjc = Tja - Tjb; - Tjd = T8m - T7V; - Tje = Tjc - Tjd; - Tjz = Tjc + Tjd; - Tjf = T74 - T7v; - Tji = Tjg - Tjh; - Tjj = Tjf - Tji; - Tjy = Tjf + Tji; - } - Tjk = FNMS(KP414213562, Tjj, Tje); - TjE = FMA(KP414213562, Tjy, Tjz); - Tjo = FMA(KP414213562, Tje, Tjj); - TjA = FNMS(KP414213562, Tjz, Tjy); - } - { - E TiY, Tjl, Tkz, TkA; - TiY = FMA(KP707106781, TiX, TiM); - Tjl = Tj9 - Tjk; - ri[WS(rs, 44)] = FNMS(KP923879532, Tjl, TiY); - ri[WS(rs, 12)] = FMA(KP923879532, Tjl, TiY); - Tkz = FMA(KP707106781, Tky, Tkx); - TkA = Tjo - Tjn; - ii[WS(rs, 12)] = FMA(KP923879532, TkA, Tkz); - ii[WS(rs, 44)] = FNMS(KP923879532, TkA, Tkz); - } - { - E Tjm, Tjp, TkB, TkC; - Tjm = FNMS(KP707106781, TiX, TiM); - Tjp = Tjn + Tjo; - ri[WS(rs, 28)] = FNMS(KP923879532, Tjp, Tjm); - ri[WS(rs, 60)] = FMA(KP923879532, Tjp, Tjm); - TkB = FNMS(KP707106781, Tky, Tkx); - TkC = Tj9 + Tjk; - ii[WS(rs, 28)] = FNMS(KP923879532, TkC, TkB); - ii[WS(rs, 60)] = FMA(KP923879532, TkC, TkB); - } - { - E Tju, TjB, Tkt, Tku; - Tju = FMA(KP707106781, Tjt, Tjq); - TjB = Tjx + TjA; - ri[WS(rs, 36)] = FNMS(KP923879532, TjB, Tju); - ri[WS(rs, 4)] = FMA(KP923879532, TjB, Tju); - Tkt = FMA(KP707106781, Tks, Tkr); - Tku = TjD + TjE; - ii[WS(rs, 4)] = FMA(KP923879532, Tku, Tkt); - ii[WS(rs, 36)] = FNMS(KP923879532, Tku, Tkt); - } - { - E TjC, TjF, Tkv, Tkw; - TjC = FNMS(KP707106781, Tjt, Tjq); - TjF = TjD - TjE; - ri[WS(rs, 52)] = FNMS(KP923879532, TjF, TjC); - ri[WS(rs, 20)] = FMA(KP923879532, TjF, TjC); - Tkv = FNMS(KP707106781, Tks, Tkr); - Tkw = TjA - Tjx; - ii[WS(rs, 20)] = FMA(KP923879532, Tkw, Tkv); - ii[WS(rs, 52)] = FNMS(KP923879532, Tkw, Tkv); - } - } - { - E Tgk, Tl1, ThG, TkV, Ti0, TkN, Tis, TkH, TgH, TkO, ThJ, TkI, Tim, TiG, Tiq; - E TiC, Th9, ThT, ThD, ThN, Ti7, Tl2, Tiv, TkW, Tif, TiF, Tip, Tiz, ThA, ThU; - E ThE, ThQ; - { - E Tg8, TkT, Tgj, TkU, Tgd, Tgi; - Tg8 = Tg4 + Tg7; - TkT = TkE - TkD; - Tgd = Tg9 + Tgc; - Tgi = Tge - Tgh; - Tgj = Tgd + Tgi; - TkU = Tgi - Tgd; - Tgk = FNMS(KP707106781, Tgj, Tg8); - Tl1 = FNMS(KP707106781, TkU, TkT); - ThG = FMA(KP707106781, Tgj, Tg8); - TkV = FMA(KP707106781, TkU, TkT); - } - { - E ThW, TkF, ThZ, TkG, ThX, ThY; - ThW = Tg4 - Tg7; - TkF = TkD + TkE; - ThX = Tgc - Tg9; - ThY = Tge + Tgh; - ThZ = ThX - ThY; - TkG = ThX + ThY; - Ti0 = FMA(KP707106781, ThZ, ThW); - TkN = FNMS(KP707106781, TkG, TkF); - Tis = FNMS(KP707106781, ThZ, ThW); - TkH = FMA(KP707106781, TkG, TkF); - } - { - E Tgv, ThH, TgG, ThI; - { - E Tgp, Tgu, TgA, TgF; - Tgp = Tgn + Tgo; - Tgu = Tgq + Tgt; - Tgv = FNMS(KP414213562, Tgu, Tgp); - ThH = FMA(KP414213562, Tgp, Tgu); - TgA = Tgy + Tgz; - TgF = TgB + TgE; - TgG = FMA(KP414213562, TgF, TgA); - ThI = FNMS(KP414213562, TgA, TgF); - } - TgH = Tgv - TgG; - TkO = ThI - ThH; - ThJ = ThH + ThI; - TkI = Tgv + TgG; - } - { - E Tii, TiB, Til, TiA; - { - E Tig, Tih, Tij, Tik; - Tig = Thr - Thu; - Tih = Tho - Thj; - Tii = FNMS(KP707106781, Tih, Tig); - TiB = FMA(KP707106781, Tih, Tig); - Tij = Thc - Thd; - Tik = Thw - Thx; - Til = FNMS(KP707106781, Tik, Tij); - TiA = FMA(KP707106781, Tik, Tij); - } - Tim = FNMS(KP668178637, Til, Tii); - TiG = FMA(KP198912367, TiA, TiB); - Tiq = FMA(KP668178637, Tii, Til); - TiC = FNMS(KP198912367, TiB, TiA); - } - { - E TgZ, ThM, Th8, ThL; - { - E TgN, TgY, Th4, Th7; - TgN = TgL + TgM; - TgY = TgS + TgX; - TgZ = FNMS(KP707106781, TgY, TgN); - ThM = FMA(KP707106781, TgY, TgN); - Th4 = Th0 + Th3; - Th7 = Th5 + Th6; - Th8 = FNMS(KP707106781, Th7, Th4); - ThL = FMA(KP707106781, Th7, Th4); - } - Th9 = FMA(KP668178637, Th8, TgZ); - ThT = FNMS(KP198912367, ThL, ThM); - ThD = FNMS(KP668178637, TgZ, Th8); - ThN = FMA(KP198912367, ThM, ThL); - } - { - E Ti3, Tit, Ti6, Tiu; - { - E Ti1, Ti2, Ti4, Ti5; - Ti1 = Tgn - Tgo; - Ti2 = Tgq - Tgt; - Ti3 = FMA(KP414213562, Ti2, Ti1); - Tit = FNMS(KP414213562, Ti1, Ti2); - Ti4 = Tgy - Tgz; - Ti5 = TgB - TgE; - Ti6 = FNMS(KP414213562, Ti5, Ti4); - Tiu = FMA(KP414213562, Ti4, Ti5); - } - Ti7 = Ti3 - Ti6; - Tl2 = Ti3 + Ti6; - Tiv = Tit + Tiu; - TkW = Tiu - Tit; - } - { - E Tib, Tiy, Tie, Tix; - { - E Ti9, Tia, Tic, Tid; - Ti9 = Th0 - Th3; - Tia = TgX - TgS; - Tib = FNMS(KP707106781, Tia, Ti9); - Tiy = FMA(KP707106781, Tia, Ti9); - Tic = TgL - TgM; - Tid = Th5 - Th6; - Tie = FNMS(KP707106781, Tid, Tic); - Tix = FMA(KP707106781, Tid, Tic); - } - Tif = FMA(KP668178637, Tie, Tib); - TiF = FNMS(KP198912367, Tix, Tiy); - Tip = FNMS(KP668178637, Tib, Tie); - Tiz = FMA(KP198912367, Tiy, Tix); - } - { - E Thq, ThP, Thz, ThO; - { - E The, Thp, Thv, Thy; - The = Thc + Thd; - Thp = Thj + Tho; - Thq = FNMS(KP707106781, Thp, The); - ThP = FMA(KP707106781, Thp, The); - Thv = Thr + Thu; - Thy = Thw + Thx; - Thz = FNMS(KP707106781, Thy, Thv); - ThO = FMA(KP707106781, Thy, Thv); - } - ThA = FNMS(KP668178637, Thz, Thq); - ThU = FMA(KP198912367, ThO, ThP); - ThE = FMA(KP668178637, Thq, Thz); - ThQ = FNMS(KP198912367, ThP, ThO); - } - { - E TgI, ThB, TkP, TkQ; - TgI = FMA(KP923879532, TgH, Tgk); - ThB = Th9 - ThA; - ri[WS(rs, 42)] = FNMS(KP831469612, ThB, TgI); - ri[WS(rs, 10)] = FMA(KP831469612, ThB, TgI); - TkP = FMA(KP923879532, TkO, TkN); - TkQ = ThE - ThD; - ii[WS(rs, 10)] = FMA(KP831469612, TkQ, TkP); - ii[WS(rs, 42)] = FNMS(KP831469612, TkQ, TkP); - } - { - E ThC, ThF, TkR, TkS; - ThC = FNMS(KP923879532, TgH, Tgk); - ThF = ThD + ThE; - ri[WS(rs, 26)] = FNMS(KP831469612, ThF, ThC); - ri[WS(rs, 58)] = FMA(KP831469612, ThF, ThC); - TkR = FNMS(KP923879532, TkO, TkN); - TkS = Th9 + ThA; - ii[WS(rs, 26)] = FNMS(KP831469612, TkS, TkR); - ii[WS(rs, 58)] = FMA(KP831469612, TkS, TkR); - } - { - E ThK, ThR, TkJ, TkK; - ThK = FMA(KP923879532, ThJ, ThG); - ThR = ThN + ThQ; - ri[WS(rs, 34)] = FNMS(KP980785280, ThR, ThK); - ri[WS(rs, 2)] = FMA(KP980785280, ThR, ThK); - TkJ = FMA(KP923879532, TkI, TkH); - TkK = ThT + ThU; - ii[WS(rs, 2)] = FMA(KP980785280, TkK, TkJ); - ii[WS(rs, 34)] = FNMS(KP980785280, TkK, TkJ); - } - { - E ThS, ThV, TkL, TkM; - ThS = FNMS(KP923879532, ThJ, ThG); - ThV = ThT - ThU; - ri[WS(rs, 50)] = FNMS(KP980785280, ThV, ThS); - ri[WS(rs, 18)] = FMA(KP980785280, ThV, ThS); - TkL = FNMS(KP923879532, TkI, TkH); - TkM = ThQ - ThN; - ii[WS(rs, 18)] = FMA(KP980785280, TkM, TkL); - ii[WS(rs, 50)] = FNMS(KP980785280, TkM, TkL); - } - { - E Ti8, Tin, TkX, TkY; - Ti8 = FMA(KP923879532, Ti7, Ti0); - Tin = Tif + Tim; - ri[WS(rs, 38)] = FNMS(KP831469612, Tin, Ti8); - ri[WS(rs, 6)] = FMA(KP831469612, Tin, Ti8); - TkX = FMA(KP923879532, TkW, TkV); - TkY = Tip + Tiq; - ii[WS(rs, 6)] = FMA(KP831469612, TkY, TkX); - ii[WS(rs, 38)] = FNMS(KP831469612, TkY, TkX); - } - { - E Tio, Tir, TkZ, Tl0; - Tio = FNMS(KP923879532, Ti7, Ti0); - Tir = Tip - Tiq; - ri[WS(rs, 54)] = FNMS(KP831469612, Tir, Tio); - ri[WS(rs, 22)] = FMA(KP831469612, Tir, Tio); - TkZ = FNMS(KP923879532, TkW, TkV); - Tl0 = Tim - Tif; - ii[WS(rs, 22)] = FMA(KP831469612, Tl0, TkZ); - ii[WS(rs, 54)] = FNMS(KP831469612, Tl0, TkZ); - } - { - E Tiw, TiD, Tl3, Tl4; - Tiw = FNMS(KP923879532, Tiv, Tis); - TiD = Tiz - TiC; - ri[WS(rs, 46)] = FNMS(KP980785280, TiD, Tiw); - ri[WS(rs, 14)] = FMA(KP980785280, TiD, Tiw); - Tl3 = FNMS(KP923879532, Tl2, Tl1); - Tl4 = TiG - TiF; - ii[WS(rs, 14)] = FMA(KP980785280, Tl4, Tl3); - ii[WS(rs, 46)] = FNMS(KP980785280, Tl4, Tl3); - } - { - E TiE, TiH, Tl5, Tl6; - TiE = FMA(KP923879532, Tiv, Tis); - TiH = TiF + TiG; - ri[WS(rs, 30)] = FNMS(KP980785280, TiH, TiE); - ri[WS(rs, 62)] = FMA(KP980785280, TiH, TiE); - Tl5 = FMA(KP923879532, Tl2, Tl1); - Tl6 = Tiz + TiC; - ii[WS(rs, 30)] = FNMS(KP980785280, Tl6, Tl5); - ii[WS(rs, 62)] = FMA(KP980785280, Tl6, Tl5); - } - } - { - E Tar, TlO, TcT, TlI, TbB, Td3, TcN, TcX, Tdw, TdQ, TdA, TdM, Tdp, TdP, Tdz; - E TdJ, Tdh, Tm2, TdF, TlW, TcK, Td4, TcO, Td0, T9i, TlV, Tm1, TcQ, Tda, TlH; - E TlN, TdC; - { - E T9R, TcR, Taq, TcS; - { - E T9F, T9Q, Tae, Tap; - T9F = FNMS(KP707106781, T9E, T9p); - T9Q = FNMS(KP707106781, T9P, T9M); - T9R = FNMS(KP668178637, T9Q, T9F); - TcR = FMA(KP668178637, T9F, T9Q); - Tae = FNMS(KP707106781, Tad, T9Y); - Tap = FNMS(KP707106781, Tao, Tal); - Taq = FMA(KP668178637, Tap, Tae); - TcS = FNMS(KP668178637, Tae, Tap); - } - Tar = T9R - Taq; - TlO = TcS - TcR; - TcT = TcR + TcS; - TlI = T9R + Taq; - } - { - E Tbl, TcW, TbA, TcV; - { - E TaP, Tbk, Tbw, Tbz; - TaP = FNMS(KP707106781, TaO, Taz); - Tbk = Tb4 - Tbj; - Tbl = FNMS(KP923879532, Tbk, TaP); - TcW = FMA(KP923879532, Tbk, TaP); - Tbw = FNMS(KP707106781, Tbv, Tbs); - Tbz = Tbx - Tby; - TbA = FNMS(KP923879532, Tbz, Tbw); - TcV = FMA(KP923879532, Tbz, Tbw); - } - TbB = FMA(KP534511135, TbA, Tbl); - Td3 = FNMS(KP303346683, TcV, TcW); - TcN = FNMS(KP534511135, Tbl, TbA); - TcX = FMA(KP303346683, TcW, TcV); - } - { - E Tds, TdL, Tdv, TdK; - { - E Tdq, Tdr, Tdt, Tdu; - Tdq = FMA(KP707106781, TcE, TcB); - Tdr = Tcs + Tcd; - Tds = FNMS(KP923879532, Tdr, Tdq); - TdL = FMA(KP923879532, Tdr, Tdq); - Tdt = FMA(KP707106781, TbX, TbI); - Tdu = TcG + TcH; - Tdv = FNMS(KP923879532, Tdu, Tdt); - TdK = FMA(KP923879532, Tdu, Tdt); - } - Tdw = FNMS(KP820678790, Tdv, Tds); - TdQ = FMA(KP098491403, TdK, TdL); - TdA = FMA(KP820678790, Tds, Tdv); - TdM = FNMS(KP098491403, TdL, TdK); - } - { - E Tdl, TdI, Tdo, TdH; - { - E Tdj, Tdk, Tdm, Tdn; - Tdj = FMA(KP707106781, Tbv, Tbs); - Tdk = Tbj + Tb4; - Tdl = FNMS(KP923879532, Tdk, Tdj); - TdI = FMA(KP923879532, Tdk, Tdj); - Tdm = FMA(KP707106781, TaO, Taz); - Tdn = Tbx + Tby; - Tdo = FNMS(KP923879532, Tdn, Tdm); - TdH = FMA(KP923879532, Tdn, Tdm); - } - Tdp = FMA(KP820678790, Tdo, Tdl); - TdP = FNMS(KP098491403, TdH, TdI); - Tdz = FNMS(KP820678790, Tdl, Tdo); - TdJ = FMA(KP098491403, TdI, TdH); - } - { - E Tdd, TdD, Tdg, TdE; - { - E Tdb, Tdc, Tde, Tdf; - Tdb = FMA(KP707106781, T9E, T9p); - Tdc = FMA(KP707106781, T9P, T9M); - Tdd = FMA(KP198912367, Tdc, Tdb); - TdD = FNMS(KP198912367, Tdb, Tdc); - Tde = FMA(KP707106781, Tad, T9Y); - Tdf = FMA(KP707106781, Tao, Tal); - Tdg = FNMS(KP198912367, Tdf, Tde); - TdE = FMA(KP198912367, Tde, Tdf); - } - Tdh = Tdd - Tdg; - Tm2 = Tdd + Tdg; - TdF = TdD + TdE; - TlW = TdE - TdD; - } - { - E Tcu, TcZ, TcJ, TcY; - { - E TbY, Tct, TcF, TcI; - TbY = FNMS(KP707106781, TbX, TbI); - Tct = Tcd - Tcs; - Tcu = FNMS(KP923879532, Tct, TbY); - TcZ = FMA(KP923879532, Tct, TbY); - TcF = FNMS(KP707106781, TcE, TcB); - TcI = TcG - TcH; - TcJ = FNMS(KP923879532, TcI, TcF); - TcY = FMA(KP923879532, TcI, TcF); - } - TcK = FNMS(KP534511135, TcJ, Tcu); - Td4 = FMA(KP303346683, TcY, TcZ); - TcO = FMA(KP534511135, Tcu, TcJ); - Td0 = FNMS(KP303346683, TcZ, TcY); - } - { - E T8M, Td6, TlF, TlT, T9h, TlU, Td9, TlG, T8L, TlE; - T8L = T8D - T8K; - T8M = FMA(KP707106781, T8L, T8w); - Td6 = FNMS(KP707106781, T8L, T8w); - TlE = TdU - TdT; - TlF = FMA(KP707106781, TlE, TlD); - TlT = FNMS(KP707106781, TlE, TlD); - { - E T91, T9g, Td7, Td8; - T91 = FMA(KP414213562, T90, T8T); - T9g = FNMS(KP414213562, T9f, T98); - T9h = T91 - T9g; - TlU = T91 + T9g; - Td7 = FNMS(KP414213562, T8T, T90); - Td8 = FMA(KP414213562, T98, T9f); - Td9 = Td7 + Td8; - TlG = Td8 - Td7; - } - T9i = FNMS(KP923879532, T9h, T8M); - TlV = FNMS(KP923879532, TlU, TlT); - Tm1 = FMA(KP923879532, TlU, TlT); - TcQ = FMA(KP923879532, T9h, T8M); - Tda = FNMS(KP923879532, Td9, Td6); - TlH = FMA(KP923879532, TlG, TlF); - TlN = FNMS(KP923879532, TlG, TlF); - TdC = FMA(KP923879532, Td9, Td6); - } - { - E Tas, TcL, TlP, TlQ; - Tas = FMA(KP831469612, Tar, T9i); - TcL = TbB - TcK; - ri[WS(rs, 43)] = FNMS(KP881921264, TcL, Tas); - ri[WS(rs, 11)] = FMA(KP881921264, TcL, Tas); - TlP = FMA(KP831469612, TlO, TlN); - TlQ = TcO - TcN; - ii[WS(rs, 11)] = FMA(KP881921264, TlQ, TlP); - ii[WS(rs, 43)] = FNMS(KP881921264, TlQ, TlP); - } - { - E TcM, TcP, TlR, TlS; - TcM = FNMS(KP831469612, Tar, T9i); - TcP = TcN + TcO; - ri[WS(rs, 27)] = FNMS(KP881921264, TcP, TcM); - ri[WS(rs, 59)] = FMA(KP881921264, TcP, TcM); - TlR = FNMS(KP831469612, TlO, TlN); - TlS = TbB + TcK; - ii[WS(rs, 27)] = FNMS(KP881921264, TlS, TlR); - ii[WS(rs, 59)] = FMA(KP881921264, TlS, TlR); - } - { - E TcU, Td1, TlJ, TlK; - TcU = FMA(KP831469612, TcT, TcQ); - Td1 = TcX + Td0; - ri[WS(rs, 35)] = FNMS(KP956940335, Td1, TcU); - ri[WS(rs, 3)] = FMA(KP956940335, Td1, TcU); - TlJ = FMA(KP831469612, TlI, TlH); - TlK = Td3 + Td4; - ii[WS(rs, 3)] = FMA(KP956940335, TlK, TlJ); - ii[WS(rs, 35)] = FNMS(KP956940335, TlK, TlJ); - } - { - E Td2, Td5, TlL, TlM; - Td2 = FNMS(KP831469612, TcT, TcQ); - Td5 = Td3 - Td4; - ri[WS(rs, 51)] = FNMS(KP956940335, Td5, Td2); - ri[WS(rs, 19)] = FMA(KP956940335, Td5, Td2); - TlL = FNMS(KP831469612, TlI, TlH); - TlM = Td0 - TcX; - ii[WS(rs, 19)] = FMA(KP956940335, TlM, TlL); - ii[WS(rs, 51)] = FNMS(KP956940335, TlM, TlL); - } - { - E Tdi, Tdx, TlX, TlY; - Tdi = FMA(KP980785280, Tdh, Tda); - Tdx = Tdp + Tdw; - ri[WS(rs, 39)] = FNMS(KP773010453, Tdx, Tdi); - ri[WS(rs, 7)] = FMA(KP773010453, Tdx, Tdi); - TlX = FMA(KP980785280, TlW, TlV); - TlY = Tdz + TdA; - ii[WS(rs, 7)] = FMA(KP773010453, TlY, TlX); - ii[WS(rs, 39)] = FNMS(KP773010453, TlY, TlX); - } - { - E Tdy, TdB, TlZ, Tm0; - Tdy = FNMS(KP980785280, Tdh, Tda); - TdB = Tdz - TdA; - ri[WS(rs, 55)] = FNMS(KP773010453, TdB, Tdy); - ri[WS(rs, 23)] = FMA(KP773010453, TdB, Tdy); - TlZ = FNMS(KP980785280, TlW, TlV); - Tm0 = Tdw - Tdp; - ii[WS(rs, 23)] = FMA(KP773010453, Tm0, TlZ); - ii[WS(rs, 55)] = FNMS(KP773010453, Tm0, TlZ); - } - { - E TdG, TdN, Tm3, Tm4; - TdG = FNMS(KP980785280, TdF, TdC); - TdN = TdJ - TdM; - ri[WS(rs, 47)] = FNMS(KP995184726, TdN, TdG); - ri[WS(rs, 15)] = FMA(KP995184726, TdN, TdG); - Tm3 = FNMS(KP980785280, Tm2, Tm1); - Tm4 = TdQ - TdP; - ii[WS(rs, 15)] = FMA(KP995184726, Tm4, Tm3); - ii[WS(rs, 47)] = FNMS(KP995184726, Tm4, Tm3); - } - { - E TdO, TdR, Tm5, Tm6; - TdO = FMA(KP980785280, TdF, TdC); - TdR = TdP + TdQ; - ri[WS(rs, 31)] = FNMS(KP995184726, TdR, TdO); - ri[WS(rs, 63)] = FMA(KP995184726, TdR, TdO); - Tm5 = FMA(KP980785280, Tm2, Tm1); - Tm6 = TdJ + TdM; - ii[WS(rs, 31)] = FNMS(KP995184726, Tm6, Tm5); - ii[WS(rs, 63)] = FMA(KP995184726, Tm6, Tm5); - } - } - { - E Tej, Tlk, Tf5, Tle, TeD, Tff, TeZ, Tf9, TfI, Tg2, TfM, TfY, TfB, Tg1, TfL; - E TfV, Tft, Tly, TfR, Tls, TeW, Tfg, Tf0, Tfc, Te4, Tlr, Tlx, Tf2, Tfm, Tld; - E Tlj, TfO; - { - E Teb, Tf3, Tei, Tf4; - { - E Te7, Tea, Tee, Teh; - Te7 = FMA(KP707106781, Te6, Te5); - Tea = FMA(KP707106781, Te9, Te8); - Teb = FNMS(KP198912367, Tea, Te7); - Tf3 = FMA(KP198912367, Te7, Tea); - Tee = FMA(KP707106781, Ted, Tec); - Teh = FMA(KP707106781, Teg, Tef); - Tei = FMA(KP198912367, Teh, Tee); - Tf4 = FNMS(KP198912367, Tee, Teh); - } - Tej = Teb - Tei; - Tlk = Tf4 - Tf3; - Tf5 = Tf3 + Tf4; - Tle = Teb + Tei; - } - { - E Tev, Tf8, TeC, Tf7; - { - E Ten, Teu, Tey, TeB; - Ten = FMA(KP707106781, Tem, Tel); - Teu = Teq + Tet; - Tev = FNMS(KP923879532, Teu, Ten); - Tf8 = FMA(KP923879532, Teu, Ten); - Tey = FMA(KP707106781, Tex, Tew); - TeB = Tez + TeA; - TeC = FNMS(KP923879532, TeB, Tey); - Tf7 = FMA(KP923879532, TeB, Tey); - } - TeD = FMA(KP820678790, TeC, Tev); - Tff = FNMS(KP098491403, Tf7, Tf8); - TeZ = FNMS(KP820678790, Tev, TeC); - Tf9 = FMA(KP098491403, Tf8, Tf7); - } - { - E TfE, TfX, TfH, TfW; - { - E TfC, TfD, TfF, TfG; - TfC = FNMS(KP707106781, TeQ, TeP); - TfD = TeM - TeJ; - TfE = FNMS(KP923879532, TfD, TfC); - TfX = FMA(KP923879532, TfD, TfC); - TfF = FNMS(KP707106781, TeF, TeE); - TfG = TeS - TeT; - TfH = FNMS(KP923879532, TfG, TfF); - TfW = FMA(KP923879532, TfG, TfF); - } - TfI = FNMS(KP534511135, TfH, TfE); - Tg2 = FMA(KP303346683, TfW, TfX); - TfM = FMA(KP534511135, TfE, TfH); - TfY = FNMS(KP303346683, TfX, TfW); - } - { - E Tfx, TfU, TfA, TfT; - { - E Tfv, Tfw, Tfy, Tfz; - Tfv = FNMS(KP707106781, Tex, Tew); - Tfw = Tet - Teq; - Tfx = FNMS(KP923879532, Tfw, Tfv); - TfU = FMA(KP923879532, Tfw, Tfv); - Tfy = FNMS(KP707106781, Tem, Tel); - Tfz = Tez - TeA; - TfA = FNMS(KP923879532, Tfz, Tfy); - TfT = FMA(KP923879532, Tfz, Tfy); - } - TfB = FMA(KP534511135, TfA, Tfx); - Tg1 = FNMS(KP303346683, TfT, TfU); - TfL = FNMS(KP534511135, Tfx, TfA); - TfV = FMA(KP303346683, TfU, TfT); - } - { - E Tfp, TfP, Tfs, TfQ; - { - E Tfn, Tfo, Tfq, Tfr; - Tfn = FNMS(KP707106781, Te6, Te5); - Tfo = FNMS(KP707106781, Te9, Te8); - Tfp = FMA(KP668178637, Tfo, Tfn); - TfP = FNMS(KP668178637, Tfn, Tfo); - Tfq = FNMS(KP707106781, Ted, Tec); - Tfr = FNMS(KP707106781, Teg, Tef); - Tfs = FNMS(KP668178637, Tfr, Tfq); - TfQ = FMA(KP668178637, Tfq, Tfr); - } - Tft = Tfp - Tfs; - Tly = Tfp + Tfs; - TfR = TfP + TfQ; - Tls = TfQ - TfP; - } - { - E TeO, Tfb, TeV, Tfa; - { - E TeG, TeN, TeR, TeU; - TeG = FMA(KP707106781, TeF, TeE); - TeN = TeJ + TeM; - TeO = FNMS(KP923879532, TeN, TeG); - Tfb = FMA(KP923879532, TeN, TeG); - TeR = FMA(KP707106781, TeQ, TeP); - TeU = TeS + TeT; - TeV = FNMS(KP923879532, TeU, TeR); - Tfa = FMA(KP923879532, TeU, TeR); - } - TeW = FNMS(KP820678790, TeV, TeO); - Tfg = FMA(KP098491403, Tfa, Tfb); - Tf0 = FMA(KP820678790, TeO, TeV); - Tfc = FNMS(KP098491403, Tfb, Tfa); - } - { - E TdW, Tfi, Tlb, Tlp, Te3, Tlq, Tfl, Tlc, TdV, Tla; - TdV = TdT + TdU; - TdW = FMA(KP707106781, TdV, TdS); - Tfi = FNMS(KP707106781, TdV, TdS); - Tla = T8D + T8K; - Tlb = FMA(KP707106781, Tla, Tl9); - Tlp = FNMS(KP707106781, Tla, Tl9); - { - E TdZ, Te2, Tfj, Tfk; - TdZ = FMA(KP414213562, TdY, TdX); - Te2 = FNMS(KP414213562, Te1, Te0); - Te3 = TdZ + Te2; - Tlq = Te2 - TdZ; - Tfj = FNMS(KP414213562, TdX, TdY); - Tfk = FMA(KP414213562, Te0, Te1); - Tfl = Tfj - Tfk; - Tlc = Tfj + Tfk; - } - Te4 = FNMS(KP923879532, Te3, TdW); - Tlr = FMA(KP923879532, Tlq, Tlp); - Tlx = FNMS(KP923879532, Tlq, Tlp); - Tf2 = FMA(KP923879532, Te3, TdW); - Tfm = FMA(KP923879532, Tfl, Tfi); - Tld = FMA(KP923879532, Tlc, Tlb); - Tlj = FNMS(KP923879532, Tlc, Tlb); - TfO = FNMS(KP923879532, Tfl, Tfi); - } - { - E Tek, TeX, Tll, Tlm; - Tek = FMA(KP980785280, Tej, Te4); - TeX = TeD - TeW; - ri[WS(rs, 41)] = FNMS(KP773010453, TeX, Tek); - ri[WS(rs, 9)] = FMA(KP773010453, TeX, Tek); - Tll = FMA(KP980785280, Tlk, Tlj); - Tlm = Tf0 - TeZ; - ii[WS(rs, 9)] = FMA(KP773010453, Tlm, Tll); - ii[WS(rs, 41)] = FNMS(KP773010453, Tlm, Tll); - } - { - E TeY, Tf1, Tln, Tlo; - TeY = FNMS(KP980785280, Tej, Te4); - Tf1 = TeZ + Tf0; - ri[WS(rs, 25)] = FNMS(KP773010453, Tf1, TeY); - ri[WS(rs, 57)] = FMA(KP773010453, Tf1, TeY); - Tln = FNMS(KP980785280, Tlk, Tlj); - Tlo = TeD + TeW; - ii[WS(rs, 25)] = FNMS(KP773010453, Tlo, Tln); - ii[WS(rs, 57)] = FMA(KP773010453, Tlo, Tln); - } - { - E Tf6, Tfd, Tlf, Tlg; - Tf6 = FMA(KP980785280, Tf5, Tf2); - Tfd = Tf9 + Tfc; - ri[WS(rs, 33)] = FNMS(KP995184726, Tfd, Tf6); - ri[WS(rs, 1)] = FMA(KP995184726, Tfd, Tf6); - Tlf = FMA(KP980785280, Tle, Tld); - Tlg = Tff + Tfg; - ii[WS(rs, 1)] = FMA(KP995184726, Tlg, Tlf); - ii[WS(rs, 33)] = FNMS(KP995184726, Tlg, Tlf); - } - { - E Tfe, Tfh, Tlh, Tli; - Tfe = FNMS(KP980785280, Tf5, Tf2); - Tfh = Tff - Tfg; - ri[WS(rs, 49)] = FNMS(KP995184726, Tfh, Tfe); - ri[WS(rs, 17)] = FMA(KP995184726, Tfh, Tfe); - Tlh = FNMS(KP980785280, Tle, Tld); - Tli = Tfc - Tf9; - ii[WS(rs, 17)] = FMA(KP995184726, Tli, Tlh); - ii[WS(rs, 49)] = FNMS(KP995184726, Tli, Tlh); - } - { - E Tfu, TfJ, Tlt, Tlu; - Tfu = FMA(KP831469612, Tft, Tfm); - TfJ = TfB + TfI; - ri[WS(rs, 37)] = FNMS(KP881921264, TfJ, Tfu); - ri[WS(rs, 5)] = FMA(KP881921264, TfJ, Tfu); - Tlt = FMA(KP831469612, Tls, Tlr); - Tlu = TfL + TfM; - ii[WS(rs, 5)] = FMA(KP881921264, Tlu, Tlt); - ii[WS(rs, 37)] = FNMS(KP881921264, Tlu, Tlt); - } - { - E TfK, TfN, Tlv, Tlw; - TfK = FNMS(KP831469612, Tft, Tfm); - TfN = TfL - TfM; - ri[WS(rs, 53)] = FNMS(KP881921264, TfN, TfK); - ri[WS(rs, 21)] = FMA(KP881921264, TfN, TfK); - Tlv = FNMS(KP831469612, Tls, Tlr); - Tlw = TfI - TfB; - ii[WS(rs, 21)] = FMA(KP881921264, Tlw, Tlv); - ii[WS(rs, 53)] = FNMS(KP881921264, Tlw, Tlv); - } - { - E TfS, TfZ, Tlz, TlA; - TfS = FNMS(KP831469612, TfR, TfO); - TfZ = TfV - TfY; - ri[WS(rs, 45)] = FNMS(KP956940335, TfZ, TfS); - ri[WS(rs, 13)] = FMA(KP956940335, TfZ, TfS); - Tlz = FNMS(KP831469612, Tly, Tlx); - TlA = Tg2 - Tg1; - ii[WS(rs, 13)] = FMA(KP956940335, TlA, Tlz); - ii[WS(rs, 45)] = FNMS(KP956940335, TlA, Tlz); - } - { - E Tg0, Tg3, TlB, TlC; - Tg0 = FMA(KP831469612, TfR, TfO); - Tg3 = Tg1 + Tg2; - ri[WS(rs, 29)] = FNMS(KP956940335, Tg3, Tg0); - ri[WS(rs, 61)] = FMA(KP956940335, Tg3, Tg0); - TlB = FMA(KP831469612, Tly, Tlx); - TlC = TfV + TfY; - ii[WS(rs, 29)] = FNMS(KP956940335, TlC, TlB); - ii[WS(rs, 61)] = FMA(KP956940335, TlC, TlB); - } - } - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_CEXP, 0, 1 }, - { TW_CEXP, 0, 3 }, - { TW_CEXP, 0, 9 }, - { TW_CEXP, 0, 27 }, - { TW_CEXP, 0, 63 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 64, "t2_64", twinstr, &GENUS, { 520, 206, 634, 0 }, 0, 0, 0 }; - -void X(codelet_t2_64) (planner *p) { - X(kdft_dit_register) (p, t2_64, &desc); -} -#else - -/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 64 -name t2_64 -include dft/scalar/t.h */ - -/* - * This function contains 1154 FP additions, 660 FP multiplications, - * (or, 880 additions, 386 multiplications, 274 fused multiply/add), - * 302 stack variables, 15 constants, and 256 memory accesses - */ -#include "dft/scalar/t.h" - -static void t2_64(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - DK(KP471396736, +0.471396736825997648556387625905254377657460319); - DK(KP881921264, +0.881921264348355029712756863660388349508442621); - DK(KP290284677, +0.290284677254462367636192375817395274691476278); - DK(KP956940335, +0.956940335732208864935797886980269969482849206); - DK(KP634393284, +0.634393284163645498215171613225493370675687095); - DK(KP773010453, +0.773010453362736960810906609758469800971041293); - DK(KP098017140, +0.098017140329560601994195563888641845861136673); - DK(KP995184726, +0.995184726672196886244836953109479921575474869); - DK(KP555570233, +0.555570233019602224742830813948532874374937191); - DK(KP831469612, +0.831469612302545237078788377617905756738560812); - DK(KP980785280, +0.980785280403230449126182236134239036973933731); - DK(KP195090322, +0.195090322016128267848284868477022240927691618); - DK(KP923879532, +0.923879532511286756128183189396788286822416626); - DK(KP382683432, +0.382683432365089771728459984030398866761344562); - DK(KP707106781, +0.707106781186547524400844362104849039284835938); - { - INT m; - for (m = mb, W = W + (mb * 10); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 10, MAKE_VOLATILE_STRIDE(128, rs)) { - E T2, T5, T3, T6, Te, T9, TP, T3e, T1e, T39, T3c, TT, T1a, T37, T8; - E Tw, Td, Ty, Tm, Th, T1C, T3K, T1V, T3x, T3I, T1G, T1R, T3v, T2m, T2q; - E T5Y, T6u, T53, T5B, T62, T6w, T57, T5D, T2V, T2X, Tg, TE, T3Y, T3V, T3j; - E Tl, TA, T3g, T1j, T1t, TV, T2C, T2z, T1u, TZ, T1h, To, T1p, T6j, T6H; - E Ts, T1l, T6l, T6F, T2P, T4b, T4x, T5i, T2R, T49, T4z, T5g, TG, T4k, T4m; - E TK, T21, T3O, T3Q, T25, TW, T10, T11, T79, T6X, T5M, T6b, T1v, T30, T69; - E T77, T13, T2F, T2D, T6p, T6O, T1x, T2a, T2f, T6V, T28, T6r, T2h, T6Q, T32; - E T5K, T5w, T4G, T4Q, T3m, T4h, T4I, T5y, T3k, T4f, T41, T4S, T4Y, T3q, T3D; - E T3F, T5r, T3s, T4W, T3Z, T5p; - { - E Ta, Tj, Tx, TC, Tf, Tk, Tz, TD, T1B, T1E, T2o, T2l, T1T, T1Q, T1A; - E T1F, T2p, T2k, T1U, T1P; - { - E T4, T1d, T19, Tb, T1c, T7, Tc, T18, TR, TO, TS, TN; - T2 = W[0]; - T5 = W[1]; - T3 = W[2]; - T6 = W[3]; - Te = W[5]; - T9 = W[4]; - T4 = T2 * T3; - T1d = T5 * T9; - T19 = T5 * Te; - Tb = T2 * T6; - T1c = T2 * Te; - T7 = T5 * T6; - Tc = T5 * T3; - T18 = T2 * T9; - TR = T3 * Te; - TO = T6 * Te; - TS = T6 * T9; - TN = T3 * T9; - TP = TN - TO; - T3e = TR - TS; - T1e = T1c - T1d; - T39 = T1c + T1d; - T3c = TN + TO; - TT = TR + TS; - T1a = T18 + T19; - T37 = T18 - T19; - T8 = T4 - T7; - Ta = T8 * T9; - Tj = T8 * Te; - Tw = T4 + T7; - Tx = Tw * T9; - TC = Tw * Te; - Td = Tb + Tc; - Tf = Td * Te; - Tk = Td * T9; - Ty = Tb - Tc; - Tz = Ty * Te; - TD = Ty * T9; - Tm = W[7]; - T1B = T6 * Tm; - T1E = T3 * Tm; - T2o = T2 * Tm; - T2l = T5 * Tm; - T1T = T9 * Tm; - T1Q = Te * Tm; - Th = W[6]; - T1A = T3 * Th; - T1F = T6 * Th; - T2p = T5 * Th; - T2k = T2 * Th; - T1U = Te * Th; - T1P = T9 * Th; - } - T1C = T1A + T1B; - T3K = T1E + T1F; - T1V = T1T + T1U; - T3x = T2o - T2p; - T3I = T1A - T1B; - T1G = T1E - T1F; - T1R = T1P - T1Q; - { - E T5W, T5X, T55, T56; - T3v = T2k + T2l; - T2m = T2k - T2l; - T2q = T2o + T2p; - T5W = T8 * Th; - T5X = Td * Tm; - T5Y = T5W - T5X; - T6u = T5W + T5X; - { - E T51, T52, T60, T61; - T51 = Tw * Th; - T52 = Ty * Tm; - T53 = T51 + T52; - T5B = T51 - T52; - T60 = T8 * Tm; - T61 = Td * Th; - T62 = T60 + T61; - T6w = T60 - T61; - } - T55 = Tw * Tm; - T56 = Ty * Th; - T57 = T55 - T56; - T5D = T55 + T56; - { - E Ti, Tq, TF, TJ, T3W, T3X, T3T, T3U, T3h, T3i, Tn, Tr, TB, TI, T3d; - E T3f, T1k, T1o, T1Z, T23, TQ, TU, T2A, T2B, T2x, T2y, T20, T24, TX, TY; - E T1i, T1n; - T2V = T1P + T1Q; - T2X = T1T - T1U; - Tg = Ta + Tf; - Ti = Tg * Th; - Tq = Tg * Tm; - TE = TC + TD; - TF = TE * Tm; - TJ = TE * Th; - T3W = T37 * Tm; - T3X = T39 * Th; - T3Y = T3W - T3X; - T3T = T37 * Th; - T3U = T39 * Tm; - T3V = T3T + T3U; - T3h = T3c * Tm; - T3i = T3e * Th; - T3j = T3h - T3i; - Tl = Tj - Tk; - Tn = Tl * Tm; - Tr = Tl * Th; - TA = Tx - Tz; - TB = TA * Th; - TI = TA * Tm; - T3d = T3c * Th; - T3f = T3e * Tm; - T3g = T3d + T3f; - T1j = Tj + Tk; - T1k = T1j * Tm; - T1o = T1j * Th; - T1t = Tx + Tz; - T1Z = T1t * Th; - T23 = T1t * Tm; - TQ = TP * Th; - TU = TT * Tm; - TV = TQ + TU; - T2A = T1a * Tm; - T2B = T1e * Th; - T2C = T2A - T2B; - T2x = T1a * Th; - T2y = T1e * Tm; - T2z = T2x + T2y; - T1u = TC - TD; - T20 = T1u * Tm; - T24 = T1u * Th; - TX = TP * Tm; - TY = TT * Th; - TZ = TX - TY; - T1h = Ta - Tf; - T1i = T1h * Th; - T1n = T1h * Tm; - To = Ti - Tn; - T1p = T1n + T1o; - T6j = TQ - TU; - T6H = T2A + T2B; - Ts = Tq + Tr; - T1l = T1i - T1k; - T6l = TX + TY; - T6F = T2x - T2y; - T2P = T1Z - T20; - T4b = TI + TJ; - T4x = T3d - T3f; - T5i = T3W + T3X; - T2R = T23 + T24; - T49 = TB - TF; - T4z = T3h + T3i; - T5g = T3T - T3U; - TG = TB + TF; - T4k = Ti + Tn; - T4m = Tq - Tr; - TK = TI - TJ; - T21 = T1Z + T20; - T3O = T1i + T1k; - T3Q = T1n - T1o; - T25 = T23 - T24; - TW = W[8]; - T10 = W[9]; - T11 = FMA(TV, TW, TZ * T10); - T79 = FNMS(T25, TW, T21 * T10); - T6X = FNMS(Td, TW, T8 * T10); - T5M = FNMS(T2X, TW, T2V * T10); - T6b = FNMS(TK, TW, TG * T10); - T1v = FMA(T1t, TW, T1u * T10); - T30 = FMA(T1h, TW, T1j * T10); - T69 = FMA(TG, TW, TK * T10); - T77 = FMA(T21, TW, T25 * T10); - T13 = FNMS(TZ, TW, TV * T10); - T2F = FNMS(T2C, TW, T2z * T10); - T2D = FMA(T2z, TW, T2C * T10); - T6p = FMA(T1a, TW, T1e * T10); - T6O = FMA(TP, TW, TT * T10); - T1x = FNMS(T1u, TW, T1t * T10); - T2a = FNMS(TE, TW, TA * T10); - T2f = FMA(T3, TW, T6 * T10); - T6V = FMA(T8, TW, Td * T10); - T28 = FMA(TA, TW, TE * T10); - T6r = FNMS(T1e, TW, T1a * T10); - T2h = FNMS(T6, TW, T3 * T10); - T6Q = FNMS(TT, TW, TP * T10); - T32 = FNMS(T1j, TW, T1h * T10); - T5K = FMA(T2V, TW, T2X * T10); - T5w = FMA(Tw, TW, Ty * T10); - T4G = FMA(T3O, TW, T3Q * T10); - T4Q = FMA(T4k, TW, T4m * T10); - T3m = FNMS(T3j, TW, T3g * T10); - T4h = FNMS(Te, TW, T9 * T10); - T4I = FNMS(T3Q, TW, T3O * T10); - T5y = FNMS(Ty, TW, Tw * T10); - T3k = FMA(T3g, TW, T3j * T10); - T4f = FMA(T9, TW, Te * T10); - T41 = FNMS(T3Y, TW, T3V * T10); - T4S = FNMS(T4m, TW, T4k * T10); - T4Y = FNMS(T3e, TW, T3c * T10); - T3q = FMA(Tg, TW, Tl * T10); - T3D = FMA(T2, TW, T5 * T10); - T3F = FNMS(T5, TW, T2 * T10); - T5r = FNMS(T39, TW, T37 * T10); - T3s = FNMS(Tl, TW, Tg * T10); - T4W = FMA(T3c, TW, T3e * T10); - T3Z = FMA(T3V, TW, T3Y * T10); - T5p = FMA(T37, TW, T39 * T10); - } - } - } - { - E T17, TdV, Tj3, Tjx, T7l, TbJ, Ti3, Tix, T1K, Tiw, TdY, ThY, T7w, Tj0, TbM; - E Tjw, T2e, TgA, T7I, TaY, TbQ, Tda, Te4, TfO, T2J, TgB, T7T, TaZ, TbT, Tdb; - E Te9, TfP, T36, T3B, TgH, TgE, TgF, TgG, T80, TbW, Tel, TfT, T8b, Tc0, T8k; - E TbX, Teg, TfS, T8h, TbZ, T45, T4q, TgJ, TgK, TgL, TgM, T8r, Tc6, Tew, TfW; - E T8C, Tc4, T8L, Tc7, Ter, TfV, T8I, Tc3, T6B, Th1, Tfm, Tga, Th8, ThI, T9N; - E Tcv, T9Y, TcH, Tav, Tcw, Tf5, Tg7, Tas, TcG, T5c, TgV, TeV, Tg0, TgS, ThD; - E T8U, Tcc, T95, Tco, T9C, Tcd, TeE, Tg3, T9z, Tcn, T5R, TgT, TeO, TeW, TgY; - E ThE, T9h, T9F, T9s, T9E, Tck, Tcq, TeJ, TeX, Tch, Tcr, T7e, Th9, Tff, Tfn; - E Th4, ThJ, Taa, Tay, Tal, Tax, TcD, TcJ, Tfa, Tfo, TcA, TcK; - { - E T1, Ti1, Tu, Ti0, TM, T7i, T15, T7j, Tp, Tt; - T1 = ri[0]; - Ti1 = ii[0]; - Tp = ri[WS(rs, 32)]; - Tt = ii[WS(rs, 32)]; - Tu = FMA(To, Tp, Ts * Tt); - Ti0 = FNMS(Ts, Tp, To * Tt); - { - E TH, TL, T12, T14; - TH = ri[WS(rs, 16)]; - TL = ii[WS(rs, 16)]; - TM = FMA(TG, TH, TK * TL); - T7i = FNMS(TK, TH, TG * TL); - T12 = ri[WS(rs, 48)]; - T14 = ii[WS(rs, 48)]; - T15 = FMA(T11, T12, T13 * T14); - T7j = FNMS(T13, T12, T11 * T14); - } - { - E Tv, T16, Tj1, Tj2; - Tv = T1 + Tu; - T16 = TM + T15; - T17 = Tv + T16; - TdV = Tv - T16; - Tj1 = Ti1 - Ti0; - Tj2 = TM - T15; - Tj3 = Tj1 - Tj2; - Tjx = Tj2 + Tj1; - } - { - E T7h, T7k, ThZ, Ti2; - T7h = T1 - Tu; - T7k = T7i - T7j; - T7l = T7h - T7k; - TbJ = T7h + T7k; - ThZ = T7i + T7j; - Ti2 = Ti0 + Ti1; - Ti3 = ThZ + Ti2; - Tix = Ti2 - ThZ; - } - } - { - E T1g, T7m, T1r, T7n, T7o, T7p, T1z, T7s, T1I, T7t, T7r, T7u; - { - E T1b, T1f, T1m, T1q; - T1b = ri[WS(rs, 8)]; - T1f = ii[WS(rs, 8)]; - T1g = FMA(T1a, T1b, T1e * T1f); - T7m = FNMS(T1e, T1b, T1a * T1f); - T1m = ri[WS(rs, 40)]; - T1q = ii[WS(rs, 40)]; - T1r = FMA(T1l, T1m, T1p * T1q); - T7n = FNMS(T1p, T1m, T1l * T1q); - } - T7o = T7m - T7n; - T7p = T1g - T1r; - { - E T1w, T1y, T1D, T1H; - T1w = ri[WS(rs, 56)]; - T1y = ii[WS(rs, 56)]; - T1z = FMA(T1v, T1w, T1x * T1y); - T7s = FNMS(T1x, T1w, T1v * T1y); - T1D = ri[WS(rs, 24)]; - T1H = ii[WS(rs, 24)]; - T1I = FMA(T1C, T1D, T1G * T1H); - T7t = FNMS(T1G, T1D, T1C * T1H); - } - T7r = T1z - T1I; - T7u = T7s - T7t; - { - E T1s, T1J, TdW, TdX; - T1s = T1g + T1r; - T1J = T1z + T1I; - T1K = T1s + T1J; - Tiw = T1J - T1s; - TdW = T7m + T7n; - TdX = T7s + T7t; - TdY = TdW - TdX; - ThY = TdW + TdX; - } - { - E T7q, T7v, TbK, TbL; - T7q = T7o - T7p; - T7v = T7r + T7u; - T7w = KP707106781 * (T7q - T7v); - Tj0 = KP707106781 * (T7q + T7v); - TbK = T7p + T7o; - TbL = T7r - T7u; - TbM = KP707106781 * (TbK + TbL); - Tjw = KP707106781 * (TbL - TbK); - } - } - { - E T1Y, Te0, T7A, T7D, T2d, Te1, T7B, T7G, T7C, T7H; - { - E T1O, T7y, T1X, T7z; - { - E T1M, T1N, T1S, T1W; - T1M = ri[WS(rs, 4)]; - T1N = ii[WS(rs, 4)]; - T1O = FMA(T8, T1M, Td * T1N); - T7y = FNMS(Td, T1M, T8 * T1N); - T1S = ri[WS(rs, 36)]; - T1W = ii[WS(rs, 36)]; - T1X = FMA(T1R, T1S, T1V * T1W); - T7z = FNMS(T1V, T1S, T1R * T1W); - } - T1Y = T1O + T1X; - Te0 = T7y + T7z; - T7A = T7y - T7z; - T7D = T1O - T1X; - } - { - E T27, T7E, T2c, T7F; - { - E T22, T26, T29, T2b; - T22 = ri[WS(rs, 20)]; - T26 = ii[WS(rs, 20)]; - T27 = FMA(T21, T22, T25 * T26); - T7E = FNMS(T25, T22, T21 * T26); - T29 = ri[WS(rs, 52)]; - T2b = ii[WS(rs, 52)]; - T2c = FMA(T28, T29, T2a * T2b); - T7F = FNMS(T2a, T29, T28 * T2b); - } - T2d = T27 + T2c; - Te1 = T7E + T7F; - T7B = T27 - T2c; - T7G = T7E - T7F; - } - T2e = T1Y + T2d; - TgA = Te0 + Te1; - T7C = T7A + T7B; - T7H = T7D - T7G; - T7I = FNMS(KP923879532, T7H, KP382683432 * T7C); - TaY = FMA(KP923879532, T7C, KP382683432 * T7H); - { - E TbO, TbP, Te2, Te3; - TbO = T7A - T7B; - TbP = T7D + T7G; - TbQ = FNMS(KP382683432, TbP, KP923879532 * TbO); - Tda = FMA(KP382683432, TbO, KP923879532 * TbP); - Te2 = Te0 - Te1; - Te3 = T1Y - T2d; - Te4 = Te2 - Te3; - TfO = Te3 + Te2; - } - } - { - E T2t, Te6, T7L, T7O, T2I, Te7, T7M, T7R, T7N, T7S; - { - E T2j, T7J, T2s, T7K; - { - E T2g, T2i, T2n, T2r; - T2g = ri[WS(rs, 60)]; - T2i = ii[WS(rs, 60)]; - T2j = FMA(T2f, T2g, T2h * T2i); - T7J = FNMS(T2h, T2g, T2f * T2i); - T2n = ri[WS(rs, 28)]; - T2r = ii[WS(rs, 28)]; - T2s = FMA(T2m, T2n, T2q * T2r); - T7K = FNMS(T2q, T2n, T2m * T2r); - } - T2t = T2j + T2s; - Te6 = T7J + T7K; - T7L = T7J - T7K; - T7O = T2j - T2s; - } - { - E T2w, T7P, T2H, T7Q; - { - E T2u, T2v, T2E, T2G; - T2u = ri[WS(rs, 12)]; - T2v = ii[WS(rs, 12)]; - T2w = FMA(TP, T2u, TT * T2v); - T7P = FNMS(TT, T2u, TP * T2v); - T2E = ri[WS(rs, 44)]; - T2G = ii[WS(rs, 44)]; - T2H = FMA(T2D, T2E, T2F * T2G); - T7Q = FNMS(T2F, T2E, T2D * T2G); - } - T2I = T2w + T2H; - Te7 = T7P + T7Q; - T7M = T2w - T2H; - T7R = T7P - T7Q; - } - T2J = T2t + T2I; - TgB = Te6 + Te7; - T7N = T7L + T7M; - T7S = T7O - T7R; - T7T = FMA(KP382683432, T7N, KP923879532 * T7S); - TaZ = FNMS(KP923879532, T7N, KP382683432 * T7S); - { - E TbR, TbS, Te5, Te8; - TbR = T7L - T7M; - TbS = T7O + T7R; - TbT = FMA(KP923879532, TbR, KP382683432 * TbS); - Tdb = FNMS(KP382683432, TbR, KP923879532 * TbS); - Te5 = T2t - T2I; - Te8 = Te6 - Te7; - Te9 = Te5 + Te8; - TfP = Te5 - Te8; - } - } - { - E T2O, T7W, T2T, T7X, T2U, Tec, T2Z, T8e, T34, T8f, T35, Ted, T3p, Tei, T86; - E T89, T3A, Tej, T81, T84; - { - E T2M, T2N, T2Q, T2S; - T2M = ri[WS(rs, 2)]; - T2N = ii[WS(rs, 2)]; - T2O = FMA(Tw, T2M, Ty * T2N); - T7W = FNMS(Ty, T2M, Tw * T2N); - T2Q = ri[WS(rs, 34)]; - T2S = ii[WS(rs, 34)]; - T2T = FMA(T2P, T2Q, T2R * T2S); - T7X = FNMS(T2R, T2Q, T2P * T2S); - } - T2U = T2O + T2T; - Tec = T7W + T7X; - { - E T2W, T2Y, T31, T33; - T2W = ri[WS(rs, 18)]; - T2Y = ii[WS(rs, 18)]; - T2Z = FMA(T2V, T2W, T2X * T2Y); - T8e = FNMS(T2X, T2W, T2V * T2Y); - T31 = ri[WS(rs, 50)]; - T33 = ii[WS(rs, 50)]; - T34 = FMA(T30, T31, T32 * T33); - T8f = FNMS(T32, T31, T30 * T33); - } - T35 = T2Z + T34; - Ted = T8e + T8f; - { - E T3b, T87, T3o, T88; - { - E T38, T3a, T3l, T3n; - T38 = ri[WS(rs, 10)]; - T3a = ii[WS(rs, 10)]; - T3b = FMA(T37, T38, T39 * T3a); - T87 = FNMS(T39, T38, T37 * T3a); - T3l = ri[WS(rs, 42)]; - T3n = ii[WS(rs, 42)]; - T3o = FMA(T3k, T3l, T3m * T3n); - T88 = FNMS(T3m, T3l, T3k * T3n); - } - T3p = T3b + T3o; - Tei = T87 + T88; - T86 = T3b - T3o; - T89 = T87 - T88; - } - { - E T3u, T82, T3z, T83; - { - E T3r, T3t, T3w, T3y; - T3r = ri[WS(rs, 58)]; - T3t = ii[WS(rs, 58)]; - T3u = FMA(T3q, T3r, T3s * T3t); - T82 = FNMS(T3s, T3r, T3q * T3t); - T3w = ri[WS(rs, 26)]; - T3y = ii[WS(rs, 26)]; - T3z = FMA(T3v, T3w, T3x * T3y); - T83 = FNMS(T3x, T3w, T3v * T3y); - } - T3A = T3u + T3z; - Tej = T82 + T83; - T81 = T3u - T3z; - T84 = T82 - T83; - } - T36 = T2U + T35; - T3B = T3p + T3A; - TgH = T36 - T3B; - TgE = Tec + Ted; - TgF = Tei + Tej; - TgG = TgE - TgF; - { - E T7Y, T7Z, Teh, Tek; - T7Y = T7W - T7X; - T7Z = T2Z - T34; - T80 = T7Y + T7Z; - TbW = T7Y - T7Z; - Teh = T2U - T35; - Tek = Tei - Tej; - Tel = Teh - Tek; - TfT = Teh + Tek; - } - { - E T85, T8a, T8i, T8j; - T85 = T81 - T84; - T8a = T86 + T89; - T8b = KP707106781 * (T85 - T8a); - Tc0 = KP707106781 * (T8a + T85); - T8i = T89 - T86; - T8j = T81 + T84; - T8k = KP707106781 * (T8i - T8j); - TbX = KP707106781 * (T8i + T8j); - } - { - E Tee, Tef, T8d, T8g; - Tee = Tec - Ted; - Tef = T3A - T3p; - Teg = Tee - Tef; - TfS = Tee + Tef; - T8d = T2O - T2T; - T8g = T8e - T8f; - T8h = T8d - T8g; - TbZ = T8d + T8g; - } - } - { - E T3H, T8n, T3M, T8o, T3N, Ten, T3S, T8F, T43, T8G, T44, Teo, T4e, Tet, T8x; - E T8A, T4p, Teu, T8s, T8v; - { - E T3E, T3G, T3J, T3L; - T3E = ri[WS(rs, 62)]; - T3G = ii[WS(rs, 62)]; - T3H = FMA(T3D, T3E, T3F * T3G); - T8n = FNMS(T3F, T3E, T3D * T3G); - T3J = ri[WS(rs, 30)]; - T3L = ii[WS(rs, 30)]; - T3M = FMA(T3I, T3J, T3K * T3L); - T8o = FNMS(T3K, T3J, T3I * T3L); - } - T3N = T3H + T3M; - Ten = T8n + T8o; - { - E T3P, T3R, T40, T42; - T3P = ri[WS(rs, 14)]; - T3R = ii[WS(rs, 14)]; - T3S = FMA(T3O, T3P, T3Q * T3R); - T8F = FNMS(T3Q, T3P, T3O * T3R); - T40 = ri[WS(rs, 46)]; - T42 = ii[WS(rs, 46)]; - T43 = FMA(T3Z, T40, T41 * T42); - T8G = FNMS(T41, T40, T3Z * T42); - } - T44 = T3S + T43; - Teo = T8F + T8G; - { - E T48, T8y, T4d, T8z; - { - E T46, T47, T4a, T4c; - T46 = ri[WS(rs, 6)]; - T47 = ii[WS(rs, 6)]; - T48 = FMA(T3c, T46, T3e * T47); - T8y = FNMS(T3e, T46, T3c * T47); - T4a = ri[WS(rs, 38)]; - T4c = ii[WS(rs, 38)]; - T4d = FMA(T49, T4a, T4b * T4c); - T8z = FNMS(T4b, T4a, T49 * T4c); - } - T4e = T48 + T4d; - Tet = T8y + T8z; - T8x = T48 - T4d; - T8A = T8y - T8z; - } - { - E T4j, T8t, T4o, T8u; - { - E T4g, T4i, T4l, T4n; - T4g = ri[WS(rs, 54)]; - T4i = ii[WS(rs, 54)]; - T4j = FMA(T4f, T4g, T4h * T4i); - T8t = FNMS(T4h, T4g, T4f * T4i); - T4l = ri[WS(rs, 22)]; - T4n = ii[WS(rs, 22)]; - T4o = FMA(T4k, T4l, T4m * T4n); - T8u = FNMS(T4m, T4l, T4k * T4n); - } - T4p = T4j + T4o; - Teu = T8t + T8u; - T8s = T4j - T4o; - T8v = T8t - T8u; - } - T45 = T3N + T44; - T4q = T4e + T4p; - TgJ = T45 - T4q; - TgK = Ten + Teo; - TgL = Tet + Teu; - TgM = TgK - TgL; - { - E T8p, T8q, Tes, Tev; - T8p = T8n - T8o; - T8q = T3S - T43; - T8r = T8p + T8q; - Tc6 = T8p - T8q; - Tes = T3N - T44; - Tev = Tet - Teu; - Tew = Tes - Tev; - TfW = Tes + Tev; - } - { - E T8w, T8B, T8J, T8K; - T8w = T8s - T8v; - T8B = T8x + T8A; - T8C = KP707106781 * (T8w - T8B); - Tc4 = KP707106781 * (T8B + T8w); - T8J = T8A - T8x; - T8K = T8s + T8v; - T8L = KP707106781 * (T8J - T8K); - Tc7 = KP707106781 * (T8J + T8K); - } - { - E Tep, Teq, T8E, T8H; - Tep = Ten - Teo; - Teq = T4p - T4e; - Ter = Tep - Teq; - TfV = Tep + Teq; - T8E = T3H - T3M; - T8H = T8F - T8G; - T8I = T8E - T8H; - Tc3 = T8E + T8H; - } - } - { - E T5V, Tao, T64, Tap, T65, Tfi, T68, T9K, T6d, T9L, T6e, Tfj, T6o, Tf2, T9Q; - E T9R, T6z, Tf3, T9T, T9W; - { - E T5T, T5U, T5Z, T63; - T5T = ri[WS(rs, 63)]; - T5U = ii[WS(rs, 63)]; - T5V = FMA(TW, T5T, T10 * T5U); - Tao = FNMS(T10, T5T, TW * T5U); - T5Z = ri[WS(rs, 31)]; - T63 = ii[WS(rs, 31)]; - T64 = FMA(T5Y, T5Z, T62 * T63); - Tap = FNMS(T62, T5Z, T5Y * T63); - } - T65 = T5V + T64; - Tfi = Tao + Tap; - { - E T66, T67, T6a, T6c; - T66 = ri[WS(rs, 15)]; - T67 = ii[WS(rs, 15)]; - T68 = FMA(TV, T66, TZ * T67); - T9K = FNMS(TZ, T66, TV * T67); - T6a = ri[WS(rs, 47)]; - T6c = ii[WS(rs, 47)]; - T6d = FMA(T69, T6a, T6b * T6c); - T9L = FNMS(T6b, T6a, T69 * T6c); - } - T6e = T68 + T6d; - Tfj = T9K + T9L; - { - E T6i, T9O, T6n, T9P; - { - E T6g, T6h, T6k, T6m; - T6g = ri[WS(rs, 7)]; - T6h = ii[WS(rs, 7)]; - T6i = FMA(T1t, T6g, T1u * T6h); - T9O = FNMS(T1u, T6g, T1t * T6h); - T6k = ri[WS(rs, 39)]; - T6m = ii[WS(rs, 39)]; - T6n = FMA(T6j, T6k, T6l * T6m); - T9P = FNMS(T6l, T6k, T6j * T6m); - } - T6o = T6i + T6n; - Tf2 = T9O + T9P; - T9Q = T9O - T9P; - T9R = T6i - T6n; - } - { - E T6t, T9U, T6y, T9V; - { - E T6q, T6s, T6v, T6x; - T6q = ri[WS(rs, 55)]; - T6s = ii[WS(rs, 55)]; - T6t = FMA(T6p, T6q, T6r * T6s); - T9U = FNMS(T6r, T6q, T6p * T6s); - T6v = ri[WS(rs, 23)]; - T6x = ii[WS(rs, 23)]; - T6y = FMA(T6u, T6v, T6w * T6x); - T9V = FNMS(T6w, T6v, T6u * T6x); - } - T6z = T6t + T6y; - Tf3 = T9U + T9V; - T9T = T6t - T6y; - T9W = T9U - T9V; - } - { - E T6f, T6A, Tfk, Tfl; - T6f = T65 + T6e; - T6A = T6o + T6z; - T6B = T6f + T6A; - Th1 = T6f - T6A; - Tfk = Tfi - Tfj; - Tfl = T6z - T6o; - Tfm = Tfk - Tfl; - Tga = Tfk + Tfl; - } - { - E Th6, Th7, T9J, T9M; - Th6 = Tfi + Tfj; - Th7 = Tf2 + Tf3; - Th8 = Th6 - Th7; - ThI = Th6 + Th7; - T9J = T5V - T64; - T9M = T9K - T9L; - T9N = T9J - T9M; - Tcv = T9J + T9M; - } - { - E T9S, T9X, Tat, Tau; - T9S = T9Q - T9R; - T9X = T9T + T9W; - T9Y = KP707106781 * (T9S - T9X); - TcH = KP707106781 * (T9S + T9X); - Tat = T9T - T9W; - Tau = T9R + T9Q; - Tav = KP707106781 * (Tat - Tau); - Tcw = KP707106781 * (Tau + Tat); - } - { - E Tf1, Tf4, Taq, Tar; - Tf1 = T65 - T6e; - Tf4 = Tf2 - Tf3; - Tf5 = Tf1 - Tf4; - Tg7 = Tf1 + Tf4; - Taq = Tao - Tap; - Tar = T68 - T6d; - Tas = Taq + Tar; - TcG = Taq - Tar; - } - } - { - E T4w, T8Q, T4B, T8R, T4C, TeA, T4F, T9w, T4K, T9x, T4L, TeB, T4V, TeS, T90; - E T93, T5a, TeT, T8V, T8Y; - { - E T4u, T4v, T4y, T4A; - T4u = ri[WS(rs, 1)]; - T4v = ii[WS(rs, 1)]; - T4w = FMA(T2, T4u, T5 * T4v); - T8Q = FNMS(T5, T4u, T2 * T4v); - T4y = ri[WS(rs, 33)]; - T4A = ii[WS(rs, 33)]; - T4B = FMA(T4x, T4y, T4z * T4A); - T8R = FNMS(T4z, T4y, T4x * T4A); - } - T4C = T4w + T4B; - TeA = T8Q + T8R; - { - E T4D, T4E, T4H, T4J; - T4D = ri[WS(rs, 17)]; - T4E = ii[WS(rs, 17)]; - T4F = FMA(T3V, T4D, T3Y * T4E); - T9w = FNMS(T3Y, T4D, T3V * T4E); - T4H = ri[WS(rs, 49)]; - T4J = ii[WS(rs, 49)]; - T4K = FMA(T4G, T4H, T4I * T4J); - T9x = FNMS(T4I, T4H, T4G * T4J); - } - T4L = T4F + T4K; - TeB = T9w + T9x; - { - E T4P, T91, T4U, T92; - { - E T4N, T4O, T4R, T4T; - T4N = ri[WS(rs, 9)]; - T4O = ii[WS(rs, 9)]; - T4P = FMA(T9, T4N, Te * T4O); - T91 = FNMS(Te, T4N, T9 * T4O); - T4R = ri[WS(rs, 41)]; - T4T = ii[WS(rs, 41)]; - T4U = FMA(T4Q, T4R, T4S * T4T); - T92 = FNMS(T4S, T4R, T4Q * T4T); - } - T4V = T4P + T4U; - TeS = T91 + T92; - T90 = T4P - T4U; - T93 = T91 - T92; - } - { - E T50, T8W, T59, T8X; - { - E T4X, T4Z, T54, T58; - T4X = ri[WS(rs, 57)]; - T4Z = ii[WS(rs, 57)]; - T50 = FMA(T4W, T4X, T4Y * T4Z); - T8W = FNMS(T4Y, T4X, T4W * T4Z); - T54 = ri[WS(rs, 25)]; - T58 = ii[WS(rs, 25)]; - T59 = FMA(T53, T54, T57 * T58); - T8X = FNMS(T57, T54, T53 * T58); - } - T5a = T50 + T59; - TeT = T8W + T8X; - T8V = T50 - T59; - T8Y = T8W - T8X; - } - { - E T4M, T5b, TeR, TeU; - T4M = T4C + T4L; - T5b = T4V + T5a; - T5c = T4M + T5b; - TgV = T4M - T5b; - TeR = T4C - T4L; - TeU = TeS - TeT; - TeV = TeR - TeU; - Tg0 = TeR + TeU; - } - { - E TgQ, TgR, T8S, T8T; - TgQ = TeA + TeB; - TgR = TeS + TeT; - TgS = TgQ - TgR; - ThD = TgQ + TgR; - T8S = T8Q - T8R; - T8T = T4F - T4K; - T8U = T8S + T8T; - Tcc = T8S - T8T; - } - { - E T8Z, T94, T9A, T9B; - T8Z = T8V - T8Y; - T94 = T90 + T93; - T95 = KP707106781 * (T8Z - T94); - Tco = KP707106781 * (T94 + T8Z); - T9A = T93 - T90; - T9B = T8V + T8Y; - T9C = KP707106781 * (T9A - T9B); - Tcd = KP707106781 * (T9A + T9B); - } - { - E TeC, TeD, T9v, T9y; - TeC = TeA - TeB; - TeD = T5a - T4V; - TeE = TeC - TeD; - Tg3 = TeC + TeD; - T9v = T4w - T4B; - T9y = T9w - T9x; - T9z = T9v - T9y; - Tcn = T9v + T9y; - } - } - { - E T5l, TeL, T9k, T9n, T5P, TeH, T9a, T9f, T5u, TeM, T9l, T9q, T5G, TeG, T97; - E T9e; - { - E T5f, T9i, T5k, T9j; - { - E T5d, T5e, T5h, T5j; - T5d = ri[WS(rs, 5)]; - T5e = ii[WS(rs, 5)]; - T5f = FMA(Tg, T5d, Tl * T5e); - T9i = FNMS(Tl, T5d, Tg * T5e); - T5h = ri[WS(rs, 37)]; - T5j = ii[WS(rs, 37)]; - T5k = FMA(T5g, T5h, T5i * T5j); - T9j = FNMS(T5i, T5h, T5g * T5j); - } - T5l = T5f + T5k; - TeL = T9i + T9j; - T9k = T9i - T9j; - T9n = T5f - T5k; - } - { - E T5J, T98, T5O, T99; - { - E T5H, T5I, T5L, T5N; - T5H = ri[WS(rs, 13)]; - T5I = ii[WS(rs, 13)]; - T5J = FMA(T1h, T5H, T1j * T5I); - T98 = FNMS(T1j, T5H, T1h * T5I); - T5L = ri[WS(rs, 45)]; - T5N = ii[WS(rs, 45)]; - T5O = FMA(T5K, T5L, T5M * T5N); - T99 = FNMS(T5M, T5L, T5K * T5N); - } - T5P = T5J + T5O; - TeH = T98 + T99; - T9a = T98 - T99; - T9f = T5J - T5O; - } - { - E T5o, T9o, T5t, T9p; - { - E T5m, T5n, T5q, T5s; - T5m = ri[WS(rs, 21)]; - T5n = ii[WS(rs, 21)]; - T5o = FMA(T3g, T5m, T3j * T5n); - T9o = FNMS(T3j, T5m, T3g * T5n); - T5q = ri[WS(rs, 53)]; - T5s = ii[WS(rs, 53)]; - T5t = FMA(T5p, T5q, T5r * T5s); - T9p = FNMS(T5r, T5q, T5p * T5s); - } - T5u = T5o + T5t; - TeM = T9o + T9p; - T9l = T5o - T5t; - T9q = T9o - T9p; - } - { - E T5A, T9c, T5F, T9d; - { - E T5x, T5z, T5C, T5E; - T5x = ri[WS(rs, 61)]; - T5z = ii[WS(rs, 61)]; - T5A = FMA(T5w, T5x, T5y * T5z); - T9c = FNMS(T5y, T5x, T5w * T5z); - T5C = ri[WS(rs, 29)]; - T5E = ii[WS(rs, 29)]; - T5F = FMA(T5B, T5C, T5D * T5E); - T9d = FNMS(T5D, T5C, T5B * T5E); - } - T5G = T5A + T5F; - TeG = T9c + T9d; - T97 = T5A - T5F; - T9e = T9c - T9d; - } - { - E T5v, T5Q, TeK, TeN; - T5v = T5l + T5u; - T5Q = T5G + T5P; - T5R = T5v + T5Q; - TgT = T5Q - T5v; - TeK = T5l - T5u; - TeN = TeL - TeM; - TeO = TeK + TeN; - TeW = TeN - TeK; - } - { - E TgW, TgX, T9b, T9g; - TgW = TeL + TeM; - TgX = TeG + TeH; - TgY = TgW - TgX; - ThE = TgW + TgX; - T9b = T97 - T9a; - T9g = T9e + T9f; - T9h = FNMS(KP923879532, T9g, KP382683432 * T9b); - T9F = FMA(KP382683432, T9g, KP923879532 * T9b); - } - { - E T9m, T9r, Tci, Tcj; - T9m = T9k + T9l; - T9r = T9n - T9q; - T9s = FMA(KP923879532, T9m, KP382683432 * T9r); - T9E = FNMS(KP923879532, T9r, KP382683432 * T9m); - Tci = T9k - T9l; - Tcj = T9n + T9q; - Tck = FMA(KP382683432, Tci, KP923879532 * Tcj); - Tcq = FNMS(KP382683432, Tcj, KP923879532 * Tci); - } - { - E TeF, TeI, Tcf, Tcg; - TeF = T5G - T5P; - TeI = TeG - TeH; - TeJ = TeF - TeI; - TeX = TeF + TeI; - Tcf = T97 + T9a; - Tcg = T9e - T9f; - Tch = FNMS(KP382683432, Tcg, KP923879532 * Tcf); - Tcr = FMA(KP923879532, Tcg, KP382683432 * Tcf); - } - } - { - E T6K, Tf6, Ta2, Ta5, T7c, Tfd, Tae, Taj, T6T, Tf7, Ta3, Ta8, T73, Tfc, Tad; - E Tag; - { - E T6E, Ta0, T6J, Ta1; - { - E T6C, T6D, T6G, T6I; - T6C = ri[WS(rs, 3)]; - T6D = ii[WS(rs, 3)]; - T6E = FMA(T3, T6C, T6 * T6D); - Ta0 = FNMS(T6, T6C, T3 * T6D); - T6G = ri[WS(rs, 35)]; - T6I = ii[WS(rs, 35)]; - T6J = FMA(T6F, T6G, T6H * T6I); - Ta1 = FNMS(T6H, T6G, T6F * T6I); - } - T6K = T6E + T6J; - Tf6 = Ta0 + Ta1; - Ta2 = Ta0 - Ta1; - Ta5 = T6E - T6J; - } - { - E T76, Tah, T7b, Tai; - { - E T74, T75, T78, T7a; - T74 = ri[WS(rs, 11)]; - T75 = ii[WS(rs, 11)]; - T76 = FMA(TA, T74, TE * T75); - Tah = FNMS(TE, T74, TA * T75); - T78 = ri[WS(rs, 43)]; - T7a = ii[WS(rs, 43)]; - T7b = FMA(T77, T78, T79 * T7a); - Tai = FNMS(T79, T78, T77 * T7a); - } - T7c = T76 + T7b; - Tfd = Tah + Tai; - Tae = T76 - T7b; - Taj = Tah - Tai; - } - { - E T6N, Ta6, T6S, Ta7; - { - E T6L, T6M, T6P, T6R; - T6L = ri[WS(rs, 19)]; - T6M = ii[WS(rs, 19)]; - T6N = FMA(T2z, T6L, T2C * T6M); - Ta6 = FNMS(T2C, T6L, T2z * T6M); - T6P = ri[WS(rs, 51)]; - T6R = ii[WS(rs, 51)]; - T6S = FMA(T6O, T6P, T6Q * T6R); - Ta7 = FNMS(T6Q, T6P, T6O * T6R); - } - T6T = T6N + T6S; - Tf7 = Ta6 + Ta7; - Ta3 = T6N - T6S; - Ta8 = Ta6 - Ta7; - } - { - E T6Z, Tab, T72, Tac; - { - E T6W, T6Y, T70, T71; - T6W = ri[WS(rs, 59)]; - T6Y = ii[WS(rs, 59)]; - T6Z = FMA(T6V, T6W, T6X * T6Y); - Tab = FNMS(T6X, T6W, T6V * T6Y); - T70 = ri[WS(rs, 27)]; - T71 = ii[WS(rs, 27)]; - T72 = FMA(Th, T70, Tm * T71); - Tac = FNMS(Tm, T70, Th * T71); - } - T73 = T6Z + T72; - Tfc = Tab + Tac; - Tad = Tab - Tac; - Tag = T6Z - T72; - } - { - E T6U, T7d, Tfb, Tfe; - T6U = T6K + T6T; - T7d = T73 + T7c; - T7e = T6U + T7d; - Th9 = T7d - T6U; - Tfb = T73 - T7c; - Tfe = Tfc - Tfd; - Tff = Tfb + Tfe; - Tfn = Tfb - Tfe; - } - { - E Th2, Th3, Ta4, Ta9; - Th2 = Tf6 + Tf7; - Th3 = Tfc + Tfd; - Th4 = Th2 - Th3; - ThJ = Th2 + Th3; - Ta4 = Ta2 + Ta3; - Ta9 = Ta5 - Ta8; - Taa = FNMS(KP923879532, Ta9, KP382683432 * Ta4); - Tay = FMA(KP923879532, Ta4, KP382683432 * Ta9); - } - { - E Taf, Tak, TcB, TcC; - Taf = Tad + Tae; - Tak = Tag - Taj; - Tal = FMA(KP382683432, Taf, KP923879532 * Tak); - Tax = FNMS(KP923879532, Taf, KP382683432 * Tak); - TcB = Tad - Tae; - TcC = Tag + Taj; - TcD = FMA(KP923879532, TcB, KP382683432 * TcC); - TcJ = FNMS(KP382683432, TcB, KP923879532 * TcC); - } - { - E Tf8, Tf9, Tcy, Tcz; - Tf8 = Tf6 - Tf7; - Tf9 = T6K - T6T; - Tfa = Tf8 - Tf9; - Tfo = Tf9 + Tf8; - Tcy = Ta2 - Ta3; - Tcz = Ta5 + Ta8; - TcA = FNMS(KP382683432, Tcz, KP923879532 * Tcy); - TcK = FMA(KP382683432, Tcy, KP923879532 * Tcz); - } - } - { - E T2L, Thx, ThU, ThV, Ti5, Tib, T4s, Tia, T7g, Ti7, ThG, ThO, ThL, ThP, ThA; - E ThW; - { - E T1L, T2K, ThS, ThT; - T1L = T17 + T1K; - T2K = T2e + T2J; - T2L = T1L + T2K; - Thx = T1L - T2K; - ThS = ThD + ThE; - ThT = ThI + ThJ; - ThU = ThS - ThT; - ThV = ThS + ThT; - } - { - E ThX, Ti4, T3C, T4r; - ThX = TgA + TgB; - Ti4 = ThY + Ti3; - Ti5 = ThX + Ti4; - Tib = Ti4 - ThX; - T3C = T36 + T3B; - T4r = T45 + T4q; - T4s = T3C + T4r; - Tia = T4r - T3C; - } - { - E T5S, T7f, ThC, ThF; - T5S = T5c + T5R; - T7f = T6B + T7e; - T7g = T5S + T7f; - Ti7 = T7f - T5S; - ThC = T5c - T5R; - ThF = ThD - ThE; - ThG = ThC + ThF; - ThO = ThF - ThC; - } - { - E ThH, ThK, Thy, Thz; - ThH = T6B - T7e; - ThK = ThI - ThJ; - ThL = ThH - ThK; - ThP = ThH + ThK; - Thy = TgE + TgF; - Thz = TgK + TgL; - ThA = Thy - Thz; - ThW = Thy + Thz; - } - { - E T4t, Ti6, ThR, Ti8; - T4t = T2L + T4s; - ri[WS(rs, 32)] = T4t - T7g; - ri[0] = T4t + T7g; - Ti6 = ThW + Ti5; - ii[0] = ThV + Ti6; - ii[WS(rs, 32)] = Ti6 - ThV; - ThR = T2L - T4s; - ri[WS(rs, 48)] = ThR - ThU; - ri[WS(rs, 16)] = ThR + ThU; - Ti8 = Ti5 - ThW; - ii[WS(rs, 16)] = Ti7 + Ti8; - ii[WS(rs, 48)] = Ti8 - Ti7; - } - { - E ThB, ThM, Ti9, Tic; - ThB = Thx + ThA; - ThM = KP707106781 * (ThG + ThL); - ri[WS(rs, 40)] = ThB - ThM; - ri[WS(rs, 8)] = ThB + ThM; - Ti9 = KP707106781 * (ThO + ThP); - Tic = Tia + Tib; - ii[WS(rs, 8)] = Ti9 + Tic; - ii[WS(rs, 40)] = Tic - Ti9; - } - { - E ThN, ThQ, Tid, Tie; - ThN = Thx - ThA; - ThQ = KP707106781 * (ThO - ThP); - ri[WS(rs, 56)] = ThN - ThQ; - ri[WS(rs, 24)] = ThN + ThQ; - Tid = KP707106781 * (ThL - ThG); - Tie = Tib - Tia; - ii[WS(rs, 24)] = Tid + Tie; - ii[WS(rs, 56)] = Tie - Tid; - } - } - { - E TgD, Thh, Thr, Thv, Tij, Tip, TgO, Tig, Th0, The, Thk, Tio, Tho, Thu, Thb; - E Thf; - { - E Tgz, TgC, Thp, Thq; - Tgz = T17 - T1K; - TgC = TgA - TgB; - TgD = Tgz - TgC; - Thh = Tgz + TgC; - Thp = Th1 + Th4; - Thq = Th8 + Th9; - Thr = FNMS(KP382683432, Thq, KP923879532 * Thp); - Thv = FMA(KP923879532, Thq, KP382683432 * Thp); - } - { - E Tih, Tii, TgI, TgN; - Tih = T2J - T2e; - Tii = Ti3 - ThY; - Tij = Tih + Tii; - Tip = Tii - Tih; - TgI = TgG - TgH; - TgN = TgJ + TgM; - TgO = KP707106781 * (TgI - TgN); - Tig = KP707106781 * (TgI + TgN); - } - { - E TgU, TgZ, Thi, Thj; - TgU = TgS - TgT; - TgZ = TgV - TgY; - Th0 = FMA(KP923879532, TgU, KP382683432 * TgZ); - The = FNMS(KP923879532, TgZ, KP382683432 * TgU); - Thi = TgH + TgG; - Thj = TgJ - TgM; - Thk = KP707106781 * (Thi + Thj); - Tio = KP707106781 * (Thj - Thi); - } - { - E Thm, Thn, Th5, Tha; - Thm = TgS + TgT; - Thn = TgV + TgY; - Tho = FMA(KP382683432, Thm, KP923879532 * Thn); - Thu = FNMS(KP382683432, Thn, KP923879532 * Thm); - Th5 = Th1 - Th4; - Tha = Th8 - Th9; - Thb = FNMS(KP923879532, Tha, KP382683432 * Th5); - Thf = FMA(KP382683432, Tha, KP923879532 * Th5); - } - { - E TgP, Thc, Tin, Tiq; - TgP = TgD + TgO; - Thc = Th0 + Thb; - ri[WS(rs, 44)] = TgP - Thc; - ri[WS(rs, 12)] = TgP + Thc; - Tin = The + Thf; - Tiq = Tio + Tip; - ii[WS(rs, 12)] = Tin + Tiq; - ii[WS(rs, 44)] = Tiq - Tin; - } - { - E Thd, Thg, Tir, Tis; - Thd = TgD - TgO; - Thg = The - Thf; - ri[WS(rs, 60)] = Thd - Thg; - ri[WS(rs, 28)] = Thd + Thg; - Tir = Thb - Th0; - Tis = Tip - Tio; - ii[WS(rs, 28)] = Tir + Tis; - ii[WS(rs, 60)] = Tis - Tir; - } - { - E Thl, Ths, Tif, Tik; - Thl = Thh + Thk; - Ths = Tho + Thr; - ri[WS(rs, 36)] = Thl - Ths; - ri[WS(rs, 4)] = Thl + Ths; - Tif = Thu + Thv; - Tik = Tig + Tij; - ii[WS(rs, 4)] = Tif + Tik; - ii[WS(rs, 36)] = Tik - Tif; - } - { - E Tht, Thw, Til, Tim; - Tht = Thh - Thk; - Thw = Thu - Thv; - ri[WS(rs, 52)] = Tht - Thw; - ri[WS(rs, 20)] = Tht + Thw; - Til = Thr - Tho; - Tim = Tij - Tig; - ii[WS(rs, 20)] = Til + Tim; - ii[WS(rs, 52)] = Tim - Til; - } - } - { - E Teb, Tfx, Tey, TiK, TiN, TiT, TfA, TiS, Tfr, TfL, Tfv, TfH, Tf0, TfK, Tfu; - E TfE; - { - E TdZ, Tea, Tfy, Tfz; - TdZ = TdV - TdY; - Tea = KP707106781 * (Te4 - Te9); - Teb = TdZ - Tea; - Tfx = TdZ + Tea; - { - E Tem, Tex, TiL, TiM; - Tem = FNMS(KP923879532, Tel, KP382683432 * Teg); - Tex = FMA(KP382683432, Ter, KP923879532 * Tew); - Tey = Tem - Tex; - TiK = Tem + Tex; - TiL = KP707106781 * (TfP - TfO); - TiM = Tix - Tiw; - TiN = TiL + TiM; - TiT = TiM - TiL; - } - Tfy = FMA(KP923879532, Teg, KP382683432 * Tel); - Tfz = FNMS(KP923879532, Ter, KP382683432 * Tew); - TfA = Tfy + Tfz; - TiS = Tfz - Tfy; - { - E Tfh, TfF, Tfq, TfG, Tfg, Tfp; - Tfg = KP707106781 * (Tfa - Tff); - Tfh = Tf5 - Tfg; - TfF = Tf5 + Tfg; - Tfp = KP707106781 * (Tfn - Tfo); - Tfq = Tfm - Tfp; - TfG = Tfm + Tfp; - Tfr = FNMS(KP980785280, Tfq, KP195090322 * Tfh); - TfL = FMA(KP831469612, TfG, KP555570233 * TfF); - Tfv = FMA(KP195090322, Tfq, KP980785280 * Tfh); - TfH = FNMS(KP555570233, TfG, KP831469612 * TfF); - } - { - E TeQ, TfC, TeZ, TfD, TeP, TeY; - TeP = KP707106781 * (TeJ - TeO); - TeQ = TeE - TeP; - TfC = TeE + TeP; - TeY = KP707106781 * (TeW - TeX); - TeZ = TeV - TeY; - TfD = TeV + TeY; - Tf0 = FMA(KP980785280, TeQ, KP195090322 * TeZ); - TfK = FNMS(KP555570233, TfD, KP831469612 * TfC); - Tfu = FNMS(KP980785280, TeZ, KP195090322 * TeQ); - TfE = FMA(KP555570233, TfC, KP831469612 * TfD); - } - } - { - E Tez, Tfs, TiR, TiU; - Tez = Teb + Tey; - Tfs = Tf0 + Tfr; - ri[WS(rs, 46)] = Tez - Tfs; - ri[WS(rs, 14)] = Tez + Tfs; - TiR = Tfu + Tfv; - TiU = TiS + TiT; - ii[WS(rs, 14)] = TiR + TiU; - ii[WS(rs, 46)] = TiU - TiR; - } - { - E Tft, Tfw, TiV, TiW; - Tft = Teb - Tey; - Tfw = Tfu - Tfv; - ri[WS(rs, 62)] = Tft - Tfw; - ri[WS(rs, 30)] = Tft + Tfw; - TiV = Tfr - Tf0; - TiW = TiT - TiS; - ii[WS(rs, 30)] = TiV + TiW; - ii[WS(rs, 62)] = TiW - TiV; - } - { - E TfB, TfI, TiJ, TiO; - TfB = Tfx + TfA; - TfI = TfE + TfH; - ri[WS(rs, 38)] = TfB - TfI; - ri[WS(rs, 6)] = TfB + TfI; - TiJ = TfK + TfL; - TiO = TiK + TiN; - ii[WS(rs, 6)] = TiJ + TiO; - ii[WS(rs, 38)] = TiO - TiJ; - } - { - E TfJ, TfM, TiP, TiQ; - TfJ = Tfx - TfA; - TfM = TfK - TfL; - ri[WS(rs, 54)] = TfJ - TfM; - ri[WS(rs, 22)] = TfJ + TfM; - TiP = TfH - TfE; - TiQ = TiN - TiK; - ii[WS(rs, 22)] = TiP + TiQ; - ii[WS(rs, 54)] = TiQ - TiP; - } - } - { - E TfR, Tgj, TfY, Tiu, Tiz, TiF, Tgm, TiE, Tgd, Tgx, Tgh, Tgt, Tg6, Tgw, Tgg; - E Tgq; - { - E TfN, TfQ, Tgk, Tgl; - TfN = TdV + TdY; - TfQ = KP707106781 * (TfO + TfP); - TfR = TfN - TfQ; - Tgj = TfN + TfQ; - { - E TfU, TfX, Tiv, Tiy; - TfU = FNMS(KP382683432, TfT, KP923879532 * TfS); - TfX = FMA(KP923879532, TfV, KP382683432 * TfW); - TfY = TfU - TfX; - Tiu = TfU + TfX; - Tiv = KP707106781 * (Te4 + Te9); - Tiy = Tiw + Tix; - Tiz = Tiv + Tiy; - TiF = Tiy - Tiv; - } - Tgk = FMA(KP382683432, TfS, KP923879532 * TfT); - Tgl = FNMS(KP382683432, TfV, KP923879532 * TfW); - Tgm = Tgk + Tgl; - TiE = Tgl - Tgk; - { - E Tg9, Tgr, Tgc, Tgs, Tg8, Tgb; - Tg8 = KP707106781 * (Tfo + Tfn); - Tg9 = Tg7 - Tg8; - Tgr = Tg7 + Tg8; - Tgb = KP707106781 * (Tfa + Tff); - Tgc = Tga - Tgb; - Tgs = Tga + Tgb; - Tgd = FNMS(KP831469612, Tgc, KP555570233 * Tg9); - Tgx = FMA(KP195090322, Tgr, KP980785280 * Tgs); - Tgh = FMA(KP831469612, Tg9, KP555570233 * Tgc); - Tgt = FNMS(KP195090322, Tgs, KP980785280 * Tgr); - } - { - E Tg2, Tgo, Tg5, Tgp, Tg1, Tg4; - Tg1 = KP707106781 * (TeO + TeJ); - Tg2 = Tg0 - Tg1; - Tgo = Tg0 + Tg1; - Tg4 = KP707106781 * (TeW + TeX); - Tg5 = Tg3 - Tg4; - Tgp = Tg3 + Tg4; - Tg6 = FMA(KP555570233, Tg2, KP831469612 * Tg5); - Tgw = FNMS(KP195090322, Tgo, KP980785280 * Tgp); - Tgg = FNMS(KP831469612, Tg2, KP555570233 * Tg5); - Tgq = FMA(KP980785280, Tgo, KP195090322 * Tgp); - } - } - { - E TfZ, Tge, TiD, TiG; - TfZ = TfR + TfY; - Tge = Tg6 + Tgd; - ri[WS(rs, 42)] = TfZ - Tge; - ri[WS(rs, 10)] = TfZ + Tge; - TiD = Tgg + Tgh; - TiG = TiE + TiF; - ii[WS(rs, 10)] = TiD + TiG; - ii[WS(rs, 42)] = TiG - TiD; - } - { - E Tgf, Tgi, TiH, TiI; - Tgf = TfR - TfY; - Tgi = Tgg - Tgh; - ri[WS(rs, 58)] = Tgf - Tgi; - ri[WS(rs, 26)] = Tgf + Tgi; - TiH = Tgd - Tg6; - TiI = TiF - TiE; - ii[WS(rs, 26)] = TiH + TiI; - ii[WS(rs, 58)] = TiI - TiH; - } - { - E Tgn, Tgu, Tit, TiA; - Tgn = Tgj + Tgm; - Tgu = Tgq + Tgt; - ri[WS(rs, 34)] = Tgn - Tgu; - ri[WS(rs, 2)] = Tgn + Tgu; - Tit = Tgw + Tgx; - TiA = Tiu + Tiz; - ii[WS(rs, 2)] = Tit + TiA; - ii[WS(rs, 34)] = TiA - Tit; - } - { - E Tgv, Tgy, TiB, TiC; - Tgv = Tgj - Tgm; - Tgy = Tgw - Tgx; - ri[WS(rs, 50)] = Tgv - Tgy; - ri[WS(rs, 18)] = Tgv + Tgy; - TiB = Tgt - Tgq; - TiC = Tiz - Tiu; - ii[WS(rs, 18)] = TiB + TiC; - ii[WS(rs, 50)] = TiC - TiB; - } - } - { - E T7V, TaH, TjN, TjT, T8O, TjS, TaK, TjK, T9I, TaU, TaE, TaO, TaB, TaV, TaF; - E TaR; - { - E T7x, T7U, TjL, TjM; - T7x = T7l - T7w; - T7U = T7I - T7T; - T7V = T7x - T7U; - TaH = T7x + T7U; - TjL = TaZ - TaY; - TjM = Tjx - Tjw; - TjN = TjL + TjM; - TjT = TjM - TjL; - } - { - E T8m, TaI, T8N, TaJ; - { - E T8c, T8l, T8D, T8M; - T8c = T80 - T8b; - T8l = T8h - T8k; - T8m = FNMS(KP980785280, T8l, KP195090322 * T8c); - TaI = FMA(KP980785280, T8c, KP195090322 * T8l); - T8D = T8r - T8C; - T8M = T8I - T8L; - T8N = FMA(KP195090322, T8D, KP980785280 * T8M); - TaJ = FNMS(KP980785280, T8D, KP195090322 * T8M); - } - T8O = T8m - T8N; - TjS = TaJ - TaI; - TaK = TaI + TaJ; - TjK = T8m + T8N; - } - { - E T9u, TaM, T9H, TaN; - { - E T96, T9t, T9D, T9G; - T96 = T8U - T95; - T9t = T9h - T9s; - T9u = T96 - T9t; - TaM = T96 + T9t; - T9D = T9z - T9C; - T9G = T9E - T9F; - T9H = T9D - T9G; - TaN = T9D + T9G; - } - T9I = FMA(KP995184726, T9u, KP098017140 * T9H); - TaU = FNMS(KP634393284, TaN, KP773010453 * TaM); - TaE = FNMS(KP995184726, T9H, KP098017140 * T9u); - TaO = FMA(KP634393284, TaM, KP773010453 * TaN); - } - { - E Tan, TaP, TaA, TaQ; - { - E T9Z, Tam, Taw, Taz; - T9Z = T9N - T9Y; - Tam = Taa - Tal; - Tan = T9Z - Tam; - TaP = T9Z + Tam; - Taw = Tas - Tav; - Taz = Tax - Tay; - TaA = Taw - Taz; - TaQ = Taw + Taz; - } - TaB = FNMS(KP995184726, TaA, KP098017140 * Tan); - TaV = FMA(KP773010453, TaQ, KP634393284 * TaP); - TaF = FMA(KP098017140, TaA, KP995184726 * Tan); - TaR = FNMS(KP634393284, TaQ, KP773010453 * TaP); - } - { - E T8P, TaC, TjR, TjU; - T8P = T7V + T8O; - TaC = T9I + TaB; - ri[WS(rs, 47)] = T8P - TaC; - ri[WS(rs, 15)] = T8P + TaC; - TjR = TaE + TaF; - TjU = TjS + TjT; - ii[WS(rs, 15)] = TjR + TjU; - ii[WS(rs, 47)] = TjU - TjR; - } - { - E TaD, TaG, TjV, TjW; - TaD = T7V - T8O; - TaG = TaE - TaF; - ri[WS(rs, 63)] = TaD - TaG; - ri[WS(rs, 31)] = TaD + TaG; - TjV = TaB - T9I; - TjW = TjT - TjS; - ii[WS(rs, 31)] = TjV + TjW; - ii[WS(rs, 63)] = TjW - TjV; - } - { - E TaL, TaS, TjJ, TjO; - TaL = TaH + TaK; - TaS = TaO + TaR; - ri[WS(rs, 39)] = TaL - TaS; - ri[WS(rs, 7)] = TaL + TaS; - TjJ = TaU + TaV; - TjO = TjK + TjN; - ii[WS(rs, 7)] = TjJ + TjO; - ii[WS(rs, 39)] = TjO - TjJ; - } - { - E TaT, TaW, TjP, TjQ; - TaT = TaH - TaK; - TaW = TaU - TaV; - ri[WS(rs, 55)] = TaT - TaW; - ri[WS(rs, 23)] = TaT + TaW; - TjP = TaR - TaO; - TjQ = TjN - TjK; - ii[WS(rs, 23)] = TjP + TjQ; - ii[WS(rs, 55)] = TjQ - TjP; - } - } - { - E TbV, TcT, Tjj, Tjp, Tca, Tjo, TcW, Tjg, Tcu, Td6, TcQ, Td0, TcN, Td7, TcR; - E Td3; - { - E TbN, TbU, Tjh, Tji; - TbN = TbJ - TbM; - TbU = TbQ - TbT; - TbV = TbN - TbU; - TcT = TbN + TbU; - Tjh = Tdb - Tda; - Tji = Tj3 - Tj0; - Tjj = Tjh + Tji; - Tjp = Tji - Tjh; - } - { - E Tc2, TcU, Tc9, TcV; - { - E TbY, Tc1, Tc5, Tc8; - TbY = TbW - TbX; - Tc1 = TbZ - Tc0; - Tc2 = FNMS(KP831469612, Tc1, KP555570233 * TbY); - TcU = FMA(KP555570233, Tc1, KP831469612 * TbY); - Tc5 = Tc3 - Tc4; - Tc8 = Tc6 - Tc7; - Tc9 = FMA(KP831469612, Tc5, KP555570233 * Tc8); - TcV = FNMS(KP831469612, Tc8, KP555570233 * Tc5); - } - Tca = Tc2 - Tc9; - Tjo = TcV - TcU; - TcW = TcU + TcV; - Tjg = Tc2 + Tc9; - } - { - E Tcm, TcY, Tct, TcZ; - { - E Tce, Tcl, Tcp, Tcs; - Tce = Tcc - Tcd; - Tcl = Tch - Tck; - Tcm = Tce - Tcl; - TcY = Tce + Tcl; - Tcp = Tcn - Tco; - Tcs = Tcq - Tcr; - Tct = Tcp - Tcs; - TcZ = Tcp + Tcs; - } - Tcu = FMA(KP956940335, Tcm, KP290284677 * Tct); - Td6 = FNMS(KP471396736, TcZ, KP881921264 * TcY); - TcQ = FNMS(KP956940335, Tct, KP290284677 * Tcm); - Td0 = FMA(KP471396736, TcY, KP881921264 * TcZ); - } - { - E TcF, Td1, TcM, Td2; - { - E Tcx, TcE, TcI, TcL; - Tcx = Tcv - Tcw; - TcE = TcA - TcD; - TcF = Tcx - TcE; - Td1 = Tcx + TcE; - TcI = TcG - TcH; - TcL = TcJ - TcK; - TcM = TcI - TcL; - Td2 = TcI + TcL; - } - TcN = FNMS(KP956940335, TcM, KP290284677 * TcF); - Td7 = FMA(KP881921264, Td2, KP471396736 * Td1); - TcR = FMA(KP290284677, TcM, KP956940335 * TcF); - Td3 = FNMS(KP471396736, Td2, KP881921264 * Td1); - } - { - E Tcb, TcO, Tjn, Tjq; - Tcb = TbV + Tca; - TcO = Tcu + TcN; - ri[WS(rs, 45)] = Tcb - TcO; - ri[WS(rs, 13)] = Tcb + TcO; - Tjn = TcQ + TcR; - Tjq = Tjo + Tjp; - ii[WS(rs, 13)] = Tjn + Tjq; - ii[WS(rs, 45)] = Tjq - Tjn; - } - { - E TcP, TcS, Tjr, Tjs; - TcP = TbV - Tca; - TcS = TcQ - TcR; - ri[WS(rs, 61)] = TcP - TcS; - ri[WS(rs, 29)] = TcP + TcS; - Tjr = TcN - Tcu; - Tjs = Tjp - Tjo; - ii[WS(rs, 29)] = Tjr + Tjs; - ii[WS(rs, 61)] = Tjs - Tjr; - } - { - E TcX, Td4, Tjf, Tjk; - TcX = TcT + TcW; - Td4 = Td0 + Td3; - ri[WS(rs, 37)] = TcX - Td4; - ri[WS(rs, 5)] = TcX + Td4; - Tjf = Td6 + Td7; - Tjk = Tjg + Tjj; - ii[WS(rs, 5)] = Tjf + Tjk; - ii[WS(rs, 37)] = Tjk - Tjf; - } - { - E Td5, Td8, Tjl, Tjm; - Td5 = TcT - TcW; - Td8 = Td6 - Td7; - ri[WS(rs, 53)] = Td5 - Td8; - ri[WS(rs, 21)] = Td5 + Td8; - Tjl = Td3 - Td0; - Tjm = Tjj - Tjg; - ii[WS(rs, 21)] = Tjl + Tjm; - ii[WS(rs, 53)] = Tjm - Tjl; - } - } - { - E Tdd, TdF, Tj5, Tjb, Tdk, Tja, TdI, TiY, Tds, TdS, TdC, TdM, Tdz, TdT, TdD; - E TdP; - { - E Td9, Tdc, TiZ, Tj4; - Td9 = TbJ + TbM; - Tdc = Tda + Tdb; - Tdd = Td9 - Tdc; - TdF = Td9 + Tdc; - TiZ = TbQ + TbT; - Tj4 = Tj0 + Tj3; - Tj5 = TiZ + Tj4; - Tjb = Tj4 - TiZ; - } - { - E Tdg, TdG, Tdj, TdH; - { - E Tde, Tdf, Tdh, Tdi; - Tde = TbW + TbX; - Tdf = TbZ + Tc0; - Tdg = FNMS(KP195090322, Tdf, KP980785280 * Tde); - TdG = FMA(KP980785280, Tdf, KP195090322 * Tde); - Tdh = Tc3 + Tc4; - Tdi = Tc6 + Tc7; - Tdj = FMA(KP195090322, Tdh, KP980785280 * Tdi); - TdH = FNMS(KP195090322, Tdi, KP980785280 * Tdh); - } - Tdk = Tdg - Tdj; - Tja = TdH - TdG; - TdI = TdG + TdH; - TiY = Tdg + Tdj; - } - { - E Tdo, TdK, Tdr, TdL; - { - E Tdm, Tdn, Tdp, Tdq; - Tdm = Tcn + Tco; - Tdn = Tck + Tch; - Tdo = Tdm - Tdn; - TdK = Tdm + Tdn; - Tdp = Tcc + Tcd; - Tdq = Tcq + Tcr; - Tdr = Tdp - Tdq; - TdL = Tdp + Tdq; - } - Tds = FMA(KP634393284, Tdo, KP773010453 * Tdr); - TdS = FNMS(KP098017140, TdK, KP995184726 * TdL); - TdC = FNMS(KP773010453, Tdo, KP634393284 * Tdr); - TdM = FMA(KP995184726, TdK, KP098017140 * TdL); - } - { - E Tdv, TdN, Tdy, TdO; - { - E Tdt, Tdu, Tdw, Tdx; - Tdt = Tcv + Tcw; - Tdu = TcK + TcJ; - Tdv = Tdt - Tdu; - TdN = Tdt + Tdu; - Tdw = TcG + TcH; - Tdx = TcA + TcD; - Tdy = Tdw - Tdx; - TdO = Tdw + Tdx; - } - Tdz = FNMS(KP773010453, Tdy, KP634393284 * Tdv); - TdT = FMA(KP098017140, TdN, KP995184726 * TdO); - TdD = FMA(KP773010453, Tdv, KP634393284 * Tdy); - TdP = FNMS(KP098017140, TdO, KP995184726 * TdN); - } - { - E Tdl, TdA, Tj9, Tjc; - Tdl = Tdd + Tdk; - TdA = Tds + Tdz; - ri[WS(rs, 41)] = Tdl - TdA; - ri[WS(rs, 9)] = Tdl + TdA; - Tj9 = TdC + TdD; - Tjc = Tja + Tjb; - ii[WS(rs, 9)] = Tj9 + Tjc; - ii[WS(rs, 41)] = Tjc - Tj9; - } - { - E TdB, TdE, Tjd, Tje; - TdB = Tdd - Tdk; - TdE = TdC - TdD; - ri[WS(rs, 57)] = TdB - TdE; - ri[WS(rs, 25)] = TdB + TdE; - Tjd = Tdz - Tds; - Tje = Tjb - Tja; - ii[WS(rs, 25)] = Tjd + Tje; - ii[WS(rs, 57)] = Tje - Tjd; - } - { - E TdJ, TdQ, TiX, Tj6; - TdJ = TdF + TdI; - TdQ = TdM + TdP; - ri[WS(rs, 33)] = TdJ - TdQ; - ri[WS(rs, 1)] = TdJ + TdQ; - TiX = TdS + TdT; - Tj6 = TiY + Tj5; - ii[WS(rs, 1)] = TiX + Tj6; - ii[WS(rs, 33)] = Tj6 - TiX; - } - { - E TdR, TdU, Tj7, Tj8; - TdR = TdF - TdI; - TdU = TdS - TdT; - ri[WS(rs, 49)] = TdR - TdU; - ri[WS(rs, 17)] = TdR + TdU; - Tj7 = TdP - TdM; - Tj8 = Tj5 - TiY; - ii[WS(rs, 17)] = Tj7 + Tj8; - ii[WS(rs, 49)] = Tj8 - Tj7; - } - } - { - E Tb1, Tbt, Tjz, TjF, Tb8, TjE, Tbw, Tju, Tbg, TbG, Tbq, TbA, Tbn, TbH, Tbr; - E TbD; - { - E TaX, Tb0, Tjv, Tjy; - TaX = T7l + T7w; - Tb0 = TaY + TaZ; - Tb1 = TaX - Tb0; - Tbt = TaX + Tb0; - Tjv = T7I + T7T; - Tjy = Tjw + Tjx; - Tjz = Tjv + Tjy; - TjF = Tjy - Tjv; - } - { - E Tb4, Tbu, Tb7, Tbv; - { - E Tb2, Tb3, Tb5, Tb6; - Tb2 = T80 + T8b; - Tb3 = T8h + T8k; - Tb4 = FNMS(KP555570233, Tb3, KP831469612 * Tb2); - Tbu = FMA(KP555570233, Tb2, KP831469612 * Tb3); - Tb5 = T8r + T8C; - Tb6 = T8I + T8L; - Tb7 = FMA(KP831469612, Tb5, KP555570233 * Tb6); - Tbv = FNMS(KP555570233, Tb5, KP831469612 * Tb6); - } - Tb8 = Tb4 - Tb7; - TjE = Tbv - Tbu; - Tbw = Tbu + Tbv; - Tju = Tb4 + Tb7; - } - { - E Tbc, Tby, Tbf, Tbz; - { - E Tba, Tbb, Tbd, Tbe; - Tba = T9z + T9C; - Tbb = T9s + T9h; - Tbc = Tba - Tbb; - Tby = Tba + Tbb; - Tbd = T8U + T95; - Tbe = T9E + T9F; - Tbf = Tbd - Tbe; - Tbz = Tbd + Tbe; - } - Tbg = FMA(KP471396736, Tbc, KP881921264 * Tbf); - TbG = FNMS(KP290284677, Tby, KP956940335 * Tbz); - Tbq = FNMS(KP881921264, Tbc, KP471396736 * Tbf); - TbA = FMA(KP956940335, Tby, KP290284677 * Tbz); - } - { - E Tbj, TbB, Tbm, TbC; - { - E Tbh, Tbi, Tbk, Tbl; - Tbh = T9N + T9Y; - Tbi = Tay + Tax; - Tbj = Tbh - Tbi; - TbB = Tbh + Tbi; - Tbk = Tas + Tav; - Tbl = Taa + Tal; - Tbm = Tbk - Tbl; - TbC = Tbk + Tbl; - } - Tbn = FNMS(KP881921264, Tbm, KP471396736 * Tbj); - TbH = FMA(KP290284677, TbB, KP956940335 * TbC); - Tbr = FMA(KP881921264, Tbj, KP471396736 * Tbm); - TbD = FNMS(KP290284677, TbC, KP956940335 * TbB); - } - { - E Tb9, Tbo, TjD, TjG; - Tb9 = Tb1 + Tb8; - Tbo = Tbg + Tbn; - ri[WS(rs, 43)] = Tb9 - Tbo; - ri[WS(rs, 11)] = Tb9 + Tbo; - TjD = Tbq + Tbr; - TjG = TjE + TjF; - ii[WS(rs, 11)] = TjD + TjG; - ii[WS(rs, 43)] = TjG - TjD; - } - { - E Tbp, Tbs, TjH, TjI; - Tbp = Tb1 - Tb8; - Tbs = Tbq - Tbr; - ri[WS(rs, 59)] = Tbp - Tbs; - ri[WS(rs, 27)] = Tbp + Tbs; - TjH = Tbn - Tbg; - TjI = TjF - TjE; - ii[WS(rs, 27)] = TjH + TjI; - ii[WS(rs, 59)] = TjI - TjH; - } - { - E Tbx, TbE, Tjt, TjA; - Tbx = Tbt + Tbw; - TbE = TbA + TbD; - ri[WS(rs, 35)] = Tbx - TbE; - ri[WS(rs, 3)] = Tbx + TbE; - Tjt = TbG + TbH; - TjA = Tju + Tjz; - ii[WS(rs, 3)] = Tjt + TjA; - ii[WS(rs, 35)] = TjA - Tjt; - } - { - E TbF, TbI, TjB, TjC; - TbF = Tbt - Tbw; - TbI = TbG - TbH; - ri[WS(rs, 51)] = TbF - TbI; - ri[WS(rs, 19)] = TbF + TbI; - TjB = TbD - TbA; - TjC = Tjz - Tju; - ii[WS(rs, 19)] = TjB + TjC; - ii[WS(rs, 51)] = TjC - TjB; - } - } - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_CEXP, 0, 1 }, - { TW_CEXP, 0, 3 }, - { TW_CEXP, 0, 9 }, - { TW_CEXP, 0, 27 }, - { TW_CEXP, 0, 63 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 64, "t2_64", twinstr, &GENUS, { 880, 386, 274, 0 }, 0, 0, 0 }; - -void X(codelet_t2_64) (planner *p) { - X(kdft_dit_register) (p, t2_64, &desc); -} -#endif diff --git a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t2_8.c b/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t2_8.c deleted file mode 100644 index fab59597..00000000 --- a/rpg_cpp/thirdparty/fftw-3.3.10/dft/scalar/codelets/t2_8.c +++ /dev/null @@ -1,390 +0,0 @@ -/* - * Copyright (c) 2003, 2007-14 Matteo Frigo - * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - */ - -/* This file was automatically generated --- DO NOT EDIT */ -/* Generated on Tue Sep 14 10:44:32 EDT 2021 */ - -#include "dft/codelet-dft.h" - -#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA) - -/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -name t2_8 -include dft/scalar/t.h */ - -/* - * This function contains 74 FP additions, 50 FP multiplications, - * (or, 44 additions, 20 multiplications, 30 fused multiply/add), - * 48 stack variables, 1 constants, and 32 memory accesses - */ -#include "dft/scalar/t.h" - -static void t2_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - DK(KP707106781, +0.707106781186547524400844362104849039284835938); - { - INT m; - for (m = mb, W = W + (mb * 6); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) { - E T2, T3, Tl, Tn, T5, T6, Tf, T7, Ts, Tb, To, Ti, TC, TG; - { - E T4, Tm, Tr, Ta, TB, TF; - T2 = W[0]; - T3 = W[2]; - T4 = T2 * T3; - Tl = W[4]; - Tm = T2 * Tl; - Tn = W[5]; - Tr = T2 * Tn; - T5 = W[1]; - T6 = W[3]; - Ta = T2 * T6; - Tf = FMA(T5, T6, T4); - T7 = FNMS(T5, T6, T4); - Ts = FNMS(T5, Tl, Tr); - Tb = FMA(T5, T3, Ta); - To = FMA(T5, Tn, Tm); - TB = Tf * Tl; - TF = Tf * Tn; - Ti = FNMS(T5, T3, Ta); - TC = FMA(Ti, Tn, TB); - TG = FNMS(Ti, Tl, TF); - } - { - E T1, T1s, Td, T1r, Tu, TY, Tk, TW, TN, TR, T18, T1a, T1c, T1d, TA; - E TI, T11, T13, T15, T16; - T1 = ri[0]; - T1s = ii[0]; - { - E T8, T9, Tc, T1q; - T8 = ri[WS(rs, 4)]; - T9 = T7 * T8; - Tc = ii[WS(rs, 4)]; - T1q = T7 * Tc; - Td = FMA(Tb, Tc, T9); - T1r = FNMS(Tb, T8, T1q); - } - { - E Tp, Tq, Tt, TX; - Tp = ri[WS(rs, 6)]; - Tq = To * Tp; - Tt = ii[WS(rs, 6)]; - TX = To * Tt; - Tu = FMA(Ts, Tt, Tq); - TY = FNMS(Ts, Tp, TX); - } - { - E Tg, Th, Tj, TV; - Tg = ri[WS(rs, 2)]; - Th = Tf * Tg; - Tj = ii[WS(rs, 2)]; - TV = Tf * Tj; - Tk = FMA(Ti, Tj, Th); - TW = FNMS(Ti, Tg, TV); - } - { - E TK, TL, TM, T19, TO, TP, TQ, T1b; - TK = ri[WS(rs, 7)]; - TL = Tl * TK; - TM = ii[WS(rs, 7)]; - T19 = Tl * TM; - TO = ri[WS(rs, 3)]; - TP = T3 * TO; - TQ = ii[WS(rs, 3)]; - T1b = T3 * TQ; - TN = FMA(Tn, TM, TL); - TR = FMA(T6, TQ, TP); - T18 = TN - TR; - T1a = FNMS(Tn, TK, T19); - T1c = FNMS(T6, TO, T1b); - T1d = T1a - T1c; - } - { - E Tx, Ty, Tz, T12, TD, TE, TH, T14; - Tx = ri[WS(rs, 1)]; - Ty = T2 * Tx; - Tz = ii[WS(rs, 1)]; - T12 = T2 * Tz; - TD = ri[WS(rs, 5)]; - TE = TC * TD; - TH = ii[WS(rs, 5)]; - T14 = TC * TH; - TA = FMA(T5, Tz, Ty); - TI = FMA(TG, TH, TE); - T11 = TA - TI; - T13 = FNMS(T5, Tx, T12); - T15 = FNMS(TG, TD, T14); - T16 = T13 - T15; - } - { - E T10, T1g, T1z, T1B, T1f, T1C, T1j, T1A; - { - E TU, TZ, T1x, T1y; - TU = T1 - Td; - TZ = TW - TY; - T10 = TU + TZ; - T1g = TU - TZ; - T1x = T1s - T1r; - T1y = Tk - Tu; - T1z = T1x - T1y; - T1B = T1y + T1x; - } - { - E T17, T1e, T1h, T1i; - T17 = T11 + T16; - T1e = T18 - T1d; - T1f = T17 + T1e; - T1C = T1e - T17; - T1h = T16 - T11; - T1i = T18 + T1d; - T1j = T1h - T1i; - T1A = T1h + T1i; - } - ri[WS(rs, 5)] = FNMS(KP707106781, T1f, T10); - ii[WS(rs, 5)] = FNMS(KP707106781, T1A, T1z); - ri[WS(rs, 1)] = FMA(KP707106781, T1f, T10); - ii[WS(rs, 1)] = FMA(KP707106781, T1A, T1z); - ri[WS(rs, 7)] = FNMS(KP707106781, T1j, T1g); - ii[WS(rs, 7)] = FNMS(KP707106781, T1C, T1B); - ri[WS(rs, 3)] = FMA(KP707106781, T1j, T1g); - ii[WS(rs, 3)] = FMA(KP707106781, T1C, T1B); - } - { - E Tw, T1k, T1u, T1w, TT, T1v, T1n, T1o; - { - E Te, Tv, T1p, T1t; - Te = T1 + Td; - Tv = Tk + Tu; - Tw = Te + Tv; - T1k = Te - Tv; - T1p = TW + TY; - T1t = T1r + T1s; - T1u = T1p + T1t; - T1w = T1t - T1p; - } - { - E TJ, TS, T1l, T1m; - TJ = TA + TI; - TS = TN + TR; - TT = TJ + TS; - T1v = TS - TJ; - T1l = T13 + T15; - T1m = T1a + T1c; - T1n = T1l - T1m; - T1o = T1l + T1m; - } - ri[WS(rs, 4)] = Tw - TT; - ii[WS(rs, 4)] = T1u - T1o; - ri[0] = Tw + TT; - ii[0] = T1o + T1u; - ri[WS(rs, 6)] = T1k - T1n; - ii[WS(rs, 6)] = T1w - T1v; - ri[WS(rs, 2)] = T1k + T1n; - ii[WS(rs, 2)] = T1v + T1w; - } - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_CEXP, 0, 1 }, - { TW_CEXP, 0, 3 }, - { TW_CEXP, 0, 7 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 8, "t2_8", twinstr, &GENUS, { 44, 20, 30, 0 }, 0, 0, 0 }; - -void X(codelet_t2_8) (planner *p) { - X(kdft_dit_register) (p, t2_8, &desc); -} -#else - -/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -name t2_8 -include dft/scalar/t.h */ - -/* - * This function contains 74 FP additions, 44 FP multiplications, - * (or, 56 additions, 26 multiplications, 18 fused multiply/add), - * 42 stack variables, 1 constants, and 32 memory accesses - */ -#include "dft/scalar/t.h" - -static void t2_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) -{ - DK(KP707106781, +0.707106781186547524400844362104849039284835938); - { - INT m; - for (m = mb, W = W + (mb * 6); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) { - E T2, T5, T3, T6, T8, Tc, Tg, Ti, Tl, Tm, Tn, Tz, Tp, Tx; - { - E T4, Tb, T7, Ta; - T2 = W[0]; - T5 = W[1]; - T3 = W[2]; - T6 = W[3]; - T4 = T2 * T3; - Tb = T5 * T3; - T7 = T5 * T6; - Ta = T2 * T6; - T8 = T4 - T7; - Tc = Ta + Tb; - Tg = T4 + T7; - Ti = Ta - Tb; - Tl = W[4]; - Tm = W[5]; - Tn = FMA(T2, Tl, T5 * Tm); - Tz = FNMS(Ti, Tl, Tg * Tm); - Tp = FNMS(T5, Tl, T2 * Tm); - Tx = FMA(Tg, Tl, Ti * Tm); - } - { - E Tf, T1i, TL, T1d, TJ, T17, TV, TY, Ts, T1j, TO, T1a, TC, T16, TQ; - E TT; - { - E T1, T1c, Te, T1b, T9, Td; - T1 = ri[0]; - T1c = ii[0]; - T9 = ri[WS(rs, 4)]; - Td = ii[WS(rs, 4)]; - Te = FMA(T8, T9, Tc * Td); - T1b = FNMS(Tc, T9, T8 * Td); - Tf = T1 + Te; - T1i = T1c - T1b; - TL = T1 - Te; - T1d = T1b + T1c; - } - { - E TF, TW, TI, TX; - { - E TD, TE, TG, TH; - TD = ri[WS(rs, 7)]; - TE = ii[WS(rs, 7)]; - TF = FMA(Tl, TD, Tm * TE); - TW = FNMS(Tm, TD, Tl * TE); - TG = ri[WS(rs, 3)]; - TH = ii[WS(rs, 3)]; - TI = FMA(T3, TG, T6 * TH); - TX = FNMS(T6, TG, T3 * TH); - } - TJ = TF + TI; - T17 = TW + TX; - TV = TF - TI; - TY = TW - TX; - } - { - E Tk, TM, Tr, TN; - { - E Th, Tj, To, Tq; - Th = ri[WS(rs, 2)]; - Tj = ii[WS(rs, 2)]; - Tk = FMA(Tg, Th, Ti * Tj); - TM = FNMS(Ti, Th, Tg * Tj); - To = ri[WS(rs, 6)]; - Tq = ii[WS(rs, 6)]; - Tr = FMA(Tn, To, Tp * Tq); - TN = FNMS(Tp, To, Tn * Tq); - } - Ts = Tk + Tr; - T1j = Tk - Tr; - TO = TM - TN; - T1a = TM + TN; - } - { - E Tw, TR, TB, TS; - { - E Tu, Tv, Ty, TA; - Tu = ri[WS(rs, 1)]; - Tv = ii[WS(rs, 1)]; - Tw = FMA(T2, Tu, T5 * Tv); - TR = FNMS(T5, Tu, T2 * Tv); - Ty = ri[WS(rs, 5)]; - TA = ii[WS(rs, 5)]; - TB = FMA(Tx, Ty, Tz * TA); - TS = FNMS(Tz, Ty, Tx * TA); - } - TC = Tw + TB; - T16 = TR + TS; - TQ = Tw - TB; - TT = TR - TS; - } - { - E Tt, TK, T1f, T1g; - Tt = Tf + Ts; - TK = TC + TJ; - ri[WS(rs, 4)] = Tt - TK; - ri[0] = Tt + TK; - { - E T19, T1e, T15, T18; - T19 = T16 + T17; - T1e = T1a + T1d; - ii[0] = T19 + T1e; - ii[WS(rs, 4)] = T1e - T19; - T15 = Tf - Ts; - T18 = T16 - T17; - ri[WS(rs, 6)] = T15 - T18; - ri[WS(rs, 2)] = T15 + T18; - } - T1f = TJ - TC; - T1g = T1d - T1a; - ii[WS(rs, 2)] = T1f + T1g; - ii[WS(rs, 6)] = T1g - T1f; - { - E T11, T1k, T14, T1h, T12, T13; - T11 = TL - TO; - T1k = T1i - T1j; - T12 = TT - TQ; - T13 = TV + TY; - T14 = KP707106781 * (T12 - T13); - T1h = KP707106781 * (T12 + T13); - ri[WS(rs, 7)] = T11 - T14; - ii[WS(rs, 5)] = T1k - T1h; - ri[WS(rs, 3)] = T11 + T14; - ii[WS(rs, 1)] = T1h + T1k; - } - { - E TP, T1m, T10, T1l, TU, TZ; - TP = TL + TO; - T1m = T1j + T1i; - TU = TQ + TT; - TZ = TV - TY; - T10 = KP707106781 * (TU + TZ); - T1l = KP707106781 * (TZ - TU); - ri[WS(rs, 5)] = TP - T10; - ii[WS(rs, 7)] = T1m - T1l; - ri[WS(rs, 1)] = TP + T10; - ii[WS(rs, 3)] = T1l + T1m; - } - } - } - } - } -} - -static const tw_instr twinstr[] = { - { TW_CEXP, 0, 1 }, - { TW_CEXP, 0, 3 }, - { TW_CEXP, 0, 7 }, - { TW_NEXT, 1, 0 } -}; - -static const ct_desc desc = { 8, "t2_8", twinstr, &GENUS, { 56, 26, 18, 0 }, 0, 0, 0 }; - -void X(codelet_t2_8) (planner *p) { - X(kdft_dit_register) (p, t2_8, &desc); -} -#endif diff --git a/rpg_cpp/thirdparty/fftw-3.3.10/simd-support/neon.c b/rpg_cpp/thirdparty/fftw-3.3.10/simd-support/neon.c deleted file mode 100644 index 196959ca..00000000 --- a/rpg_cpp/thirdparty/fftw-3.3.10/simd-support/neon.c +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) 2003, 2007-14 Matteo Frigo - * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - */ - - -#include "kernel/ifftw.h" - -#if HAVE_NEON - -/* check for an environment where signals are known to work */ -#if defined(unix) || defined(linux) - # include - # include - - static jmp_buf jb; - - static void sighandler(int x) - { - UNUSED(x); - longjmp(jb, 1); - } - - static int really_have_neon(void) - { - void (*oldsig)(int); - oldsig = signal(SIGILL, sighandler); - if (setjmp(jb)) { - signal(SIGILL, oldsig); - return 0; - } else { - /* paranoia: encode the instruction in binary because the - assembler may not recognize it without -mfpu=neon */ - /*asm volatile ("vand q0, q0, q0");*/ - asm volatile (".long 0xf2000150"); - signal(SIGILL, oldsig); - return 1; - } - } - - int X(have_simd_neon)(void) - { - static int init = 0, res; - - if (!init) { - res = really_have_neon(); - init = 1; - } - return res; - } - - -#else -/* don't know how to autodetect NEON; assume it is present */ - int X(have_simd_neon)(void) - { - return 1; - } -#endif - -#endif diff --git a/rpg_cpp/thirdparty/fftw-3.3.10/stamp-h1 b/rpg_cpp/thirdparty/fftw-3.3.10/stamp-h1 deleted file mode 100644 index 4547fe1b..00000000 --- a/rpg_cpp/thirdparty/fftw-3.3.10/stamp-h1 +++ /dev/null @@ -1 +0,0 @@ -timestamp for config.h diff --git a/setup.py b/setup.py index a9548dbd..957223fd 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ def change_permissions_recursive(path, mode): setup( name ='designer2', - version ='0.0.32', + version ='0.0.33', author ='Benjamin Ades-Aron', author_email ='benjamin.ades-aron@nyulangone.org', url ='https://github.com/badesar1/designer_v2_dev.git',