Skip to content

Commit 24578f5

Browse files
committed
Merge branch 'staticallycompileableldiv'
2 parents 64f1b07 + 4739359 commit 24578f5

File tree

1 file changed

+23
-9
lines changed

1 file changed

+23
-9
lines changed

src/TriangularSolve.jl

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -394,14 +394,12 @@ end
394394
Base.Cartesian.@nexprs $W c ->
395395
A11_c = vfnmadd_fast(U_ki, vload(spc, (static(c - 1), nk)), A11_c)
396396
end
397-
# Base.Cartesian.@nexprs $W c -> @show A11_c
398397
# solve AU wants us to transpose
399398
# We then have column-major multiplies
400399
# take A[(u-1)*W,u*W), [0,W)]
401400
X = VectorizationBase.transpose_vecunroll(
402401
VecUnroll(Base.Cartesian.@ntuple $W A11)
403402
)
404-
# @show X
405403
C_u = solve_AU(X, spu, n, $(Val(UNIT)))
406404
vstore!(spc, C_u, $(Unroll{2,1,W,1,W,zero(UInt),1})(($z, n)))
407405
end
@@ -1018,26 +1016,42 @@ end
10181016
end
10191017
end
10201018
end
1021-
1022-
# spc = spa / spu
1023-
# spc' = (spu' \ spa')'
1024-
# This is ldiv
1025-
function rdiv_U!(
1019+
# Thin, inlined entry point for the right-division kernel on strided pointers.
# It bundles `spc`, `spa` and `spu` into a tuple and forwards to `_ldiv_L!`,
# passing `M`, `N`, `Val(UNIT)`, the tuple's type, and the tuple's contents
# flattened via `LoopVectorization.flatten_to_tuple` and splatted as varargs.
# NOTE(review): the flatten here / `reassemble_tuple` in `_ldiv_L!` round trip
# presumably exists so the non-inlined kernel only receives flattened
# arguments (the merged branch name suggests static-compilation support) —
# confirm against the commit history.
@inline function rdiv_U!(
10261020
spc::AbstractStridedPointer{T,2,2},
10271021
spa::AbstractStridedPointer{T,2,2},
10281022
spu::AbstractStridedPointer{T,2,2},
10291023
M,
10301024
N,
10311025
::Val{UNIT}
10321026
) where {T,UNIT}
1027+
# Keep the pointers together so their combined type can be passed alongside
# the flattened values.
tup = (spc, spa, spu)
1028+
_ldiv_L!(
1029+
M,
1030+
N,
1031+
Val(UNIT),
1032+
typeof(tup),
1033+
LoopVectorization.flatten_to_tuple(tup)...
1034+
)
1035+
end
1036+
1037+
# spc = spa / spu
1038+
# spc' = (spu' \ spa')'
1039+
# i.e. the right division is evaluated as a left division (ldiv) on the transposed operands
1040+
function _ldiv_L!(
1041+
M,
1042+
N,
1043+
::Val{UNIT},
1044+
::Type{Args},
1045+
args::Vararg{Any,K}
1046+
) where {UNIT,Args,K}
1047+
spc, spa, spu = LoopVectorization.reassemble_tuple(Args, args)
1048+
T = eltype(spc)
10331049
WS = pick_vector_width(T)
10341050
W = Int(WS)
10351051
UF = unroll_factor(WS)
10361052
WU = UF * WS
1037-
MU = UF > 1 ? M : 0
10381053
Nd, Nr = VectorizationBase.vdivrem(N, WS)
10391054
m = 0
1040-
# @show M,N
10411055
# m, no remainder
10421056
while m < M - WS + 1
10431057
n = Nr # non factor of W remainder

0 commit comments

Comments
 (0)