
Commit 5392f6d

LoongArch64: Fixed LASX version of cscal and zscal

1 parent b2117bb commit 5392f6d
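
Background for the change: cscal/zscal compute x[i] := alpha * x[i] over a complex vector. The kernel previously dispatched the alpha_r == 0 / alpha_i == 0 special cases to dedicated zero-store and real-only loops; the patch collapses those cases and adds a DUMMY2 flag (read from the stack) that forces the full complex-multiply path when the kernel is called from the c/zscal interface. A minimal C sketch of the semantics as read from the diff — `cscal_ref` is an illustrative name, not OpenBLAS's actual kernel interface, and the NaN/Inf rationale in the comment is an inference from the branch structure:

#include <stddef.h>

/* Reference-semantics sketch: scale n complex elements by (alpha_r + i*alpha_i).
 * dummy2 mirrors the flag the kernel loads via `ld.d DUMMY2, $sp, 0`: when it
 * is nonzero the zero-alpha shortcut is skipped, presumably so that NaN/Inf
 * already present in x propagate through the multiply instead of being
 * silently overwritten with zeros. */
static void cscal_ref(size_t n, float alpha_r, float alpha_i,
                      float *x, ptrdiff_t incx, int dummy2)
{
    for (size_t i = 0; i < n; i++, x += 2 * incx) {
        if (alpha_r == 0.0f && alpha_i == 0.0f && !dummy2) {
            x[0] = 0.0f;                         /* fast path (.L15/.L27/.L995) */
            x[1] = 0.0f;
        } else {
            float re = x[0], im = x[1];
            x[0] = alpha_r * re - alpha_i * im;  /* full multiply (.L17/.L25/.L998) */
            x[1] = alpha_r * im + alpha_i * re;
        }
    }
}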

File tree: 1 file changed, +61 −183 lines

kernel/loongarch64/cscal_lasx.S (+61 −183)
@@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ALPHAI $f1
 #define X      $r7
 #define INCX   $r8
+#define DUMMY2 $r9

 #define I      $r12
 #define TEMP   $r13
@@ -65,6 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 bge $r0, N, .L999
 bge $r0, INCX, .L999
+ld.d DUMMY2, $sp, 0
 li.d TEMP, 1
 movgr2fr.d a1, $r0
 FFINT a1, a1
@@ -86,24 +88,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 bne INCX, TEMP, .L22

+/////// INCX == 1 ////////
 .L11:
-bge $r0, I, .L997
 CMPEQ $fcc0, ALPHAR, a1
 CMPEQ $fcc1, ALPHAI, a1
-bceqz $fcc0, .L13
-b .L14
-.align 3
+bge $r0, I, .L19
+/////// INCX == 1 && N >= 4 ////////
+bnez DUMMY2, .L17 // if DUMMY2 == 1, called from c/zscal.

-.L13:
-bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0
-b .L113 //alpha_r != 0.0 && alpha_i == 0.0
+bceqz $fcc0, .L17

-.L14:
-bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0
-b .L111 //alpha_r == 0.0 && alpha_i == 0.0
-.align 3
+bceqz $fcc1, .L17

-.L111: //alpha_r == 0.0 && alpha_i == 0.0
+.L15: //alpha_r == 0.0 && alpha_i == 0.0
 xvst VXZ, X, 0 * SIZE
 #ifdef DOUBLE
 xvst VXZ, X, 4 * SIZE
@@ -113,41 +110,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 addi.d X, X, 16 * SIZE
 #endif
 addi.d I, I, -1
-blt $r0, I, .L111
-b .L997
+blt $r0, I, .L15
+b .L19
 .align 3

-.L113: //alpha_r != 0.0 && alpha_i == 0.0
-xvld VX0, X, 0 * SIZE
-#ifdef DOUBLE
-xvld VX1, X, 4 * SIZE
-xvpickev.d x1, VX1, VX0
-xvpickod.d x2, VX1, VX0
-xvfmul.d x3, VXAR, x1
-xvfmul.d x4, VXAR, x2
-xvilvl.d VX2, x4, x3
-xvilvh.d VX3, x4, x3
-xvst VX2, X, 0 * SIZE
-xvst VX3, X, 4 * SIZE
-addi.d X, X, 8 * SIZE
-#else
-xvld VX1, X, 8 * SIZE
-xvpickev.w x1, VX1, VX0
-xvpickod.w x2, VX1, VX0
-xvfmul.s x3, VXAR, x1
-xvfmul.s x4, VXAR, x2
-xvilvl.w VX2, x4, x3
-xvilvh.w VX3, x4, x3
-xvst VX2, X, 0 * SIZE
-xvst VX3, X, 8 * SIZE
-addi.d X, X, 16 * SIZE
-#endif
-addi.d I, I, -1
-blt $r0, I, .L113
-b .L997
-.align 3
-
-.L114: //alpha_r != 0.0 && alpha_i != 0.0
+.L17:
 xvld VX0, X, 0 * SIZE
 #ifdef DOUBLE
 xvld VX1, X, 4 * SIZE
@@ -177,29 +144,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 addi.d X, X, 16 * SIZE
 #endif
 addi.d I, I, -1
-blt $r0, I, .L114
-b .L997
+blt $r0, I, .L17
+b .L19
+.align 3
+
+/////// INCX == 1 && N < 8 ///////
+.L19:
+#ifdef DOUBLE
+andi I, N, 3
+#else
+andi I, N, 7
+#endif
+beqz I, .L999
+bnez DUMMY2, .L998 // if DUMMY2 == 1, called from c/zscal.
+
+bceqz $fcc0, .L998
+
+bceqz $fcc1, .L998
+
+b .L995 // alpha_r == 0.0 && alpha_i == 0.0
 .align 3

+/////// INCX != 1 ////////
 .L22:
-bge $r0, I, .L997
-move XX, X
 CMPEQ $fcc0, ALPHAR, a1
 CMPEQ $fcc1, ALPHAI, a1
-bceqz $fcc0, .L23
-b .L24
-.align 3
-
-.L23:
-bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0
-b .L223 //alpha_r != 0.0 && alpha_i == 0.0
+move XX, X
+bge $r0, I, .L29
+bnez DUMMY2, .L25 // if DUMMY2 == 1, called from c/zscal.
+bceqz $fcc0, .L25

-.L24:
-bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0
-b .L221 //alpha_r == 0.0 && alpha_i == 0.0
-.align 3
+bceqz $fcc1, .L25

-.L221: //alpha_r == 0.0 && alpha_i == 0.0
+.L27: //alpha_r == 0.0 && alpha_i == 0.0
 #ifdef DOUBLE
 xvstelm.d VXZ, X, 0, 0
 xvstelm.d VXZ, X, 1 * SIZE, 0
@@ -239,122 +216,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 add.d X, X, INCX
 addi.d I, I, -1
-blt $r0, I, .L221
-b .L997
+blt $r0, I, .L27
+b .L29
 .align 3

-.L223: //alpha_r != 0.0 && alpha_i == 0.0
-#ifdef DOUBLE
-ld.d t1, X, 0 * SIZE
-ld.d t2, X, 1 * SIZE
-add.d X, X, INCX
-ld.d t3, X, 0 * SIZE
-ld.d t4, X, 1 * SIZE
-add.d X, X, INCX
-xvinsgr2vr.d x1, t1, 0
-xvinsgr2vr.d x2, t2, 0
-xvinsgr2vr.d x1, t3, 1
-xvinsgr2vr.d x2, t4, 1
-ld.d t1, X, 0 * SIZE
-ld.d t2, X, 1 * SIZE
-add.d X, X, INCX
-ld.d t3, X, 0 * SIZE
-ld.d t4, X, 1 * SIZE
-xvinsgr2vr.d x1, t1, 2
-xvinsgr2vr.d x2, t2, 2
-xvinsgr2vr.d x1, t3, 3
-xvinsgr2vr.d x2, t4, 3
-add.d X, X, INCX
-
-xvfmul.d x3, VXAR, x1
-xvfmul.d x4, VXAR, x2
-addi.d I, I, -1
-xvstelm.d x3, XX, 0 * SIZE, 0
-xvstelm.d x4, XX, 1 * SIZE, 0
-add.d XX, XX, INCX
-xvstelm.d x3, XX, 0 * SIZE, 1
-xvstelm.d x4, XX, 1 * SIZE, 1
-add.d XX, XX, INCX
-xvstelm.d x3, XX, 0 * SIZE, 2
-xvstelm.d x4, XX, 1 * SIZE, 2
-add.d XX, XX, INCX
-xvstelm.d x3, XX, 0 * SIZE, 3
-xvstelm.d x4, XX, 1 * SIZE, 3
-#else
-ld.w t1, X, 0 * SIZE
-ld.w t2, X, 1 * SIZE
-add.d X, X, INCX
-ld.w t3, X, 0 * SIZE
-ld.w t4, X, 1 * SIZE
-add.d X, X, INCX
-xvinsgr2vr.w x1, t1, 0
-xvinsgr2vr.w x2, t2, 0
-xvinsgr2vr.w x1, t3, 1
-xvinsgr2vr.w x2, t4, 1
-ld.w t1, X, 0 * SIZE
-ld.w t2, X, 1 * SIZE
-add.d X, X, INCX
-ld.w t3, X, 0 * SIZE
-ld.w t4, X, 1 * SIZE
-xvinsgr2vr.w x1, t1, 2
-xvinsgr2vr.w x2, t2, 2
-xvinsgr2vr.w x1, t3, 3
-xvinsgr2vr.w x2, t4, 3
-add.d X, X, INCX
-ld.w t1, X, 0 * SIZE
-ld.w t2, X, 1 * SIZE
-add.d X, X, INCX
-ld.w t3, X, 0 * SIZE
-ld.w t4, X, 1 * SIZE
-add.d X, X, INCX
-xvinsgr2vr.w x1, t1, 4
-xvinsgr2vr.w x2, t2, 4
-xvinsgr2vr.w x1, t3, 5
-xvinsgr2vr.w x2, t4, 5
-ld.w t1, X, 0 * SIZE
-ld.w t2, X, 1 * SIZE
-add.d X, X, INCX
-ld.w t3, X, 0 * SIZE
-ld.w t4, X, 1 * SIZE
-xvinsgr2vr.w x1, t1, 6
-xvinsgr2vr.w x2, t2, 6
-xvinsgr2vr.w x1, t3, 7
-xvinsgr2vr.w x2, t4, 7
-add.d X, X, INCX
-
-xvfmul.s x3, VXAR, x1
-xvfmul.s x4, VXAR, x2
-addi.d I, I, -1
-xvstelm.w x3, XX, 0 * SIZE, 0
-xvstelm.w x4, XX, 1 * SIZE, 0
-add.d XX, XX, INCX
-xvstelm.w x3, XX, 0 * SIZE, 1
-xvstelm.w x4, XX, 1 * SIZE, 1
-add.d XX, XX, INCX
-xvstelm.w x3, XX, 0 * SIZE, 2
-xvstelm.w x4, XX, 1 * SIZE, 2
-add.d XX, XX, INCX
-xvstelm.w x3, XX, 0 * SIZE, 3
-xvstelm.w x4, XX, 1 * SIZE, 3
-add.d XX, XX, INCX
-xvstelm.w x3, XX, 0 * SIZE, 4
-xvstelm.w x4, XX, 1 * SIZE, 4
-add.d XX, XX, INCX
-xvstelm.w x3, XX, 0 * SIZE, 5
-xvstelm.w x4, XX, 1 * SIZE, 5
-add.d XX, XX, INCX
-xvstelm.w x3, XX, 0 * SIZE, 6
-xvstelm.w x4, XX, 1 * SIZE, 6
-add.d XX, XX, INCX
-xvstelm.w x3, XX, 0 * SIZE, 7
-xvstelm.w x4, XX, 1 * SIZE, 7
-#endif
-add.d XX, XX, INCX
-blt $r0, I, .L223
-b .L997
-.align 3
-
-.L224: //alpha_r != 0.0 && alpha_i != 0.0
+.L25:
 #ifdef DOUBLE
 ld.d t1, X, 0 * SIZE
 ld.d t2, X, 1 * SIZE
@@ -465,19 +331,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 xvstelm.w x4, XX, 1 * SIZE, 7
 #endif
 add.d XX, XX, INCX
-blt $r0, I, .L224
-b .L997
+blt $r0, I, .L25
+b .L29
 .align 3

-.L997:
+/////// INCX != 1 && N < 8 ///////
+.L29:
 #ifdef DOUBLE
-andi I, N, 3
+andi I, N, 3
 #else
-andi I, N, 7
+andi I, N, 7
 #endif
-bge $r0, I, .L999
-.align 3
+beqz I, .L999
+bnez DUMMY2, .L998 // if DUMMY2 == 1, called from c/zscal.

+bceqz $fcc0, .L998
+
+bceqz $fcc1, .L998
+
+.L995: // alpha_r == 0.0 && alpha_i == 0.0
+ST a1, X, 0 * SIZE
+ST a1, X, 1 * SIZE
+addi.d I, I, -1
+add.d X, X, INCX
+blt $r0, I, .L995
+b .L999
 .L998:
 LD a1, X, 0 * SIZE
 LD a2, X, 1 * SIZE
@@ -490,7 +368,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ST s2, X, 1 * SIZE
 add.d X, X, INCX
 blt $r0, I, .L998
-.align 3
+b .L999

 .L999:
 move $r4, $r12
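
The practical effect of the DUMMY2 gate is easiest to see on data containing NaN. A short usage example of the `cscal_ref` sketch given above (again an illustration, not the library's actual test code):

#include <math.h>
#include <stdio.h>

int main(void)
{
    /* Two complex elements each; the second has a NaN real part. */
    float x[4] = { 1.0f, 2.0f, NAN, 3.0f };
    float y[4] = { 1.0f, 2.0f, NAN, 3.0f };

    cscal_ref(2, 0.0f, 0.0f, x, 1, 0);  /* shortcut allowed: x becomes all zeros */
    cscal_ref(2, 0.0f, 0.0f, y, 1, 1);  /* forced multiply: NaN survives in y[2], y[3] */

    printf("x = %g %g %g %g\n", x[0], x[1], x[2], x[3]);
    printf("y = %g %g %g %g\n", y[0], y[1], y[2], y[3]);
    return 0;
}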
