@@ -394,14 +394,12 @@ end
394
394
Base. Cartesian. @nexprs $ W c ->
395
395
A11_c = vfnmadd_fast (U_ki, vload (spc, (static (c - 1 ), nk)), A11_c)
396
396
end
397
- # Base.Cartesian.@nexprs $W c -> @show A11_c
398
397
# solve AU wants us to transpose
399
398
# We then have column-major multiplies
400
399
# take A[(u-1)*W,u*W), [0,W)]
401
400
X = VectorizationBase. transpose_vecunroll (
402
401
VecUnroll (Base. Cartesian. @ntuple $ W A11)
403
402
)
404
- # @show X
405
403
C_u = solve_AU (X, spu, n, $ (Val (UNIT)))
406
404
vstore! (spc, C_u, $ (Unroll{2 ,1 ,W,1 ,W,zero (UInt),1 })(($ z, n)))
407
405
end
@@ -1018,26 +1016,42 @@ end
1018
1016
end
1019
1017
end
1020
1018
end
1021
-
1022
- # spc = spa / spu
1023
- # spc' = (spu' \ spa')'
1024
- # This is ldiv
1025
- function rdiv_U! (
1019
# Inlined entry point that forwards to `_ldiv_L!`.
#
# The three strided pointers (`spc`, `spa`, `spu`) are bundled into one tuple.
# `_ldiv_L!` receives the tuple's type together with the flattened contents
# produced by `LoopVectorization.flatten_to_tuple`, and rebuilds the pointers
# on its side with `LoopVectorization.reassemble_tuple`.
#
# `M` and `N` are passed through unchanged; `UNIT` is forwarded as a fresh
# `Val(UNIT)`.
# NOTE(review): presumably `M`/`N` are the problem dimensions and `UNIT` flags a
# unit diagonal -- confirm against `_ldiv_L!`.
@inline function rdiv_U!(
    spc::AbstractStridedPointer{T,2,2},
    spa::AbstractStridedPointer{T,2,2},
    spu::AbstractStridedPointer{T,2,2},
    M,
    N,
    ::Val{UNIT},
) where {T,UNIT}
    pointers = (spc, spa, spu)
    flattened = LoopVectorization.flatten_to_tuple(pointers)
    return _ldiv_L!(M, N, Val(UNIT), typeof(pointers), flattened...)
end
1036
+
1037
+ # spc = spa / spu
1038
+ # spc = (spu' \ spa')'
1039
+ # This is ldiv
1040
+ function _ldiv_L! (
1041
+ M,
1042
+ N,
1043
+ :: Val{UNIT} ,
1044
+ :: Type{Args} ,
1045
+ args:: Vararg{Any,K}
1046
+ ) where {UNIT,Args,K}
1047
+ spc, spa, spu = LoopVectorization. reassemble_tuple (Args, args)
1048
+ T = eltype (spc)
1033
1049
WS = pick_vector_width (T)
1034
1050
W = Int (WS)
1035
1051
UF = unroll_factor (WS)
1036
1052
WU = UF * WS
1037
- MU = UF > 1 ? M : 0
1038
1053
Nd, Nr = VectorizationBase. vdivrem (N, WS)
1039
1054
m = 0
1040
- # @show M,N
1041
1055
# m, no remainder
1042
1056
while m < M - WS + 1
1043
1057
n = Nr # non factor of W remainder
0 commit comments