Skip to content

Commit 1573c82

Browse files
authored
Merge pull request #1069 from angsch/use-gemmtr
Use GEMMTR for SY/HE linear updates
2 parents 6061809 + 09cb849 commit 1573c82

16 files changed

+136
-676
lines changed

SRC/clahef.f

+10-45
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,7 @@ SUBROUTINE CLAHEF( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW,
200200
PARAMETER ( EIGHT = 8.0E+0, SEVTEN = 17.0E+0 )
201201
* ..
202202
* .. Local Scalars ..
203-
INTEGER IMAX, J, JB, JJ, JMAX, JP, K, KK, KKW, KP,
203+
INTEGER IMAX, J, JJ, JMAX, JP, K, KK, KKW, KP,
204204
$ KSTEP, KW
205205
REAL ABSAKK, ALPHA, COLMAX, R1, ROWMAX, T
206206
COMPLEX D11, D21, D22, Z
@@ -211,7 +211,7 @@ SUBROUTINE CLAHEF( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW,
211211
EXTERNAL LSAME, ICAMAX
212212
* ..
213213
* .. External Subroutines ..
214-
EXTERNAL CCOPY, CGEMM, CGEMV, CLACGV, CSSCAL,
214+
EXTERNAL CCOPY, CGEMMTR, CGEMV, CLACGV, CSSCAL,
215215
$ CSWAP
216216
* ..
217217
* .. Intrinsic Functions ..
@@ -552,28 +552,11 @@ SUBROUTINE CLAHEF( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW,
552552
*
553553
* A11 := A11 - U12*D*U12**H = A11 - U12*W**H
554554
*
555-
* computing blocks of NB columns at a time (note that conjg(W) is
556-
* actually stored)
555+
* (note that conjg(W) is actually stored)
557556
*
558-
DO 50 J = ( ( K-1 ) / NB )*NB + 1, 1, -NB
559-
JB = MIN( NB, K-J+1 )
560-
*
561-
* Update the upper triangle of the diagonal block
562-
*
563-
DO 40 JJ = J, J + JB - 1
564-
A( JJ, JJ ) = REAL( A( JJ, JJ ) )
565-
CALL CGEMV( 'No transpose', JJ-J+1, N-K, -CONE,
566-
$ A( J, K+1 ), LDA, W( JJ, KW+1 ), LDW, CONE,
567-
$ A( J, JJ ), 1 )
568-
A( JJ, JJ ) = REAL( A( JJ, JJ ) )
569-
40 CONTINUE
570-
*
571-
* Update the rectangular superdiagonal block
572-
*
573-
CALL CGEMM( 'No transpose', 'Transpose', J-1, JB, N-K,
574-
$ -CONE, A( 1, K+1 ), LDA, W( J, KW+1 ), LDW,
575-
$ CONE, A( 1, J ), LDA )
576-
50 CONTINUE
557+
CALL CGEMMTR( 'Upper', 'No transpose', 'Transpose', K, N-K,
558+
$ -CONE, A( 1, K+1 ), LDA, W( 1, KW+1 ), LDW,
559+
$ CONE, A( 1, 1 ), LDA )
577560
*
578561
* Put U12 in standard form by partially undoing the interchanges
579562
* in of rows in columns k+1:n looping backwards from k+1 to n
@@ -916,29 +899,11 @@ SUBROUTINE CLAHEF( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW,
916899
*
917900
* A22 := A22 - L21*D*L21**H = A22 - L21*W**H
918901
*
919-
* computing blocks of NB columns at a time (note that conjg(W) is
920-
* actually stored)
921-
*
922-
DO 110 J = K, N, NB
923-
JB = MIN( NB, N-J+1 )
924-
*
925-
* Update the lower triangle of the diagonal block
926-
*
927-
DO 100 JJ = J, J + JB - 1
928-
A( JJ, JJ ) = REAL( A( JJ, JJ ) )
929-
CALL CGEMV( 'No transpose', J+JB-JJ, K-1, -CONE,
930-
$ A( JJ, 1 ), LDA, W( JJ, 1 ), LDW, CONE,
931-
$ A( JJ, JJ ), 1 )
932-
A( JJ, JJ ) = REAL( A( JJ, JJ ) )
933-
100 CONTINUE
934-
*
935-
* Update the rectangular subdiagonal block
902+
* (note that conjg(W) is actually stored)
936903
*
937-
IF( J+JB.LE.N )
938-
$ CALL CGEMM( 'No transpose', 'Transpose', N-J-JB+1, JB,
939-
$ K-1, -CONE, A( J+JB, 1 ), LDA, W( J, 1 ),
940-
$ LDW, CONE, A( J+JB, J ), LDA )
941-
110 CONTINUE
904+
CALL CGEMMTR( 'Lower', 'No transpose', 'Transpose', N-K+1,
905+
$ K-1, -CONE, A( K, 1 ), LDA, W( K, 1 ), LDW,
906+
$ CONE, A( K, K ), LDA )
942907
*
943908
* Put L21 in standard form by partially undoing the interchanges
944909
* of rows in columns 1:k-1 looping backwards from k-1 to 1

SRC/clahef_rk.f

+9-45
Original file line numberDiff line numberDiff line change
@@ -286,7 +286,7 @@ SUBROUTINE CLAHEF_RK( UPLO, N, NB, KB, A, LDA, E, IPIV, W, LDW,
286286
* ..
287287
* .. Local Scalars ..
288288
LOGICAL DONE
289-
INTEGER IMAX, ITEMP, II, J, JB, JJ, JMAX, K, KK, KKW,
289+
INTEGER IMAX, ITEMP, II, J, JMAX, K, KK, KKW,
290290
$ KP, KSTEP, KW, P
291291
REAL ABSAKK, ALPHA, COLMAX, STEMP, R1, ROWMAX, T,
292292
$ SFMIN
@@ -755,29 +755,11 @@ SUBROUTINE CLAHEF_RK( UPLO, N, NB, KB, A, LDA, E, IPIV, W, LDW,
755755
*
756756
* A11 := A11 - U12*D*U12**H = A11 - U12*W**H
757757
*
758-
* computing blocks of NB columns at a time (note that conjg(W) is
759-
* actually stored)
758+
* (note that conjg(W) is actually stored)
760759
*
761-
DO 50 J = ( ( K-1 ) / NB )*NB + 1, 1, -NB
762-
JB = MIN( NB, K-J+1 )
763-
*
764-
* Update the upper triangle of the diagonal block
765-
*
766-
DO 40 JJ = J, J + JB - 1
767-
A( JJ, JJ ) = REAL( A( JJ, JJ ) )
768-
CALL CGEMV( 'No transpose', JJ-J+1, N-K, -CONE,
769-
$ A( J, K+1 ), LDA, W( JJ, KW+1 ), LDW, CONE,
770-
$ A( J, JJ ), 1 )
771-
A( JJ, JJ ) = REAL( A( JJ, JJ ) )
772-
40 CONTINUE
773-
*
774-
* Update the rectangular superdiagonal block
775-
*
776-
IF( J.GE.2 )
777-
$ CALL CGEMM( 'No transpose', 'Transpose', J-1, JB, N-K,
778-
$ -CONE, A( 1, K+1 ), LDA, W( J, KW+1 ), LDW,
779-
$ CONE, A( 1, J ), LDA )
780-
50 CONTINUE
760+
CALL CGEMMTR( 'Upper', 'No transpose', 'Transpose', K, N-K,
761+
$ -CONE, A( 1, K+1 ), LDA, W( 1, KW+1 ), LDW,
762+
$ CONE, A( 1, 1 ), LDA )
781763
*
782764
* Set KB to the number of columns factorized
783765
*
@@ -1203,29 +1185,11 @@ SUBROUTINE CLAHEF_RK( UPLO, N, NB, KB, A, LDA, E, IPIV, W, LDW,
12031185
*
12041186
* A22 := A22 - L21*D*L21**H = A22 - L21*W**H
12051187
*
1206-
* computing blocks of NB columns at a time (note that conjg(W) is
1207-
* actually stored)
1208-
*
1209-
DO 110 J = K, N, NB
1210-
JB = MIN( NB, N-J+1 )
1211-
*
1212-
* Update the lower triangle of the diagonal block
1213-
*
1214-
DO 100 JJ = J, J + JB - 1
1215-
A( JJ, JJ ) = REAL( A( JJ, JJ ) )
1216-
CALL CGEMV( 'No transpose', J+JB-JJ, K-1, -CONE,
1217-
$ A( JJ, 1 ), LDA, W( JJ, 1 ), LDW, CONE,
1218-
$ A( JJ, JJ ), 1 )
1219-
A( JJ, JJ ) = REAL( A( JJ, JJ ) )
1220-
100 CONTINUE
1221-
*
1222-
* Update the rectangular subdiagonal block
1188+
* (note that conjg(W) is actually stored)
12231189
*
1224-
IF( J+JB.LE.N )
1225-
$ CALL CGEMM( 'No transpose', 'Transpose', N-J-JB+1, JB,
1226-
$ K-1, -CONE, A( J+JB, 1 ), LDA, W( J, 1 ),
1227-
$ LDW, CONE, A( J+JB, J ), LDA )
1228-
110 CONTINUE
1190+
CALL CGEMMTR( 'Lower', 'No transpose', 'Transpose', N-K+1,
1191+
$ K-1, -CONE, A( K, 1 ), LDA, W( K, 1 ), LDW,
1192+
$ CONE, A( K, K ), LDA )
12291193
*
12301194
* Set KB to the number of columns factorized
12311195
*

SRC/clasyf.f

+8-41
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,7 @@ SUBROUTINE CLASYF( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW,
200200
PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) )
201201
* ..
202202
* .. Local Scalars ..
203-
INTEGER IMAX, J, JB, JJ, JMAX, JP, K, KK, KKW, KP,
203+
INTEGER IMAX, J, JJ, JMAX, JP, K, KK, KKW, KP,
204204
$ KSTEP, KW
205205
REAL ABSAKK, ALPHA, COLMAX, ROWMAX
206206
COMPLEX D11, D21, D22, R1, T, Z
@@ -211,7 +211,7 @@ SUBROUTINE CLASYF( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW,
211211
EXTERNAL LSAME, ICAMAX
212212
* ..
213213
* .. External Subroutines ..
214-
EXTERNAL CCOPY, CGEMM, CGEMV, CSCAL, CSWAP
214+
EXTERNAL CCOPY, CGEMMTR, CGEMV, CSCAL, CSWAP
215215
* ..
216216
* .. Intrinsic Functions ..
217217
INTRINSIC ABS, AIMAG, MAX, MIN, REAL, SQRT
@@ -482,25 +482,9 @@ SUBROUTINE CLASYF( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW,
482482
*
483483
* A11 := A11 - U12*D*U12**T = A11 - U12*W**T
484484
*
485-
* computing blocks of NB columns at a time
486-
*
487-
DO 50 J = ( ( K-1 ) / NB )*NB + 1, 1, -NB
488-
JB = MIN( NB, K-J+1 )
489-
*
490-
* Update the upper triangle of the diagonal block
491-
*
492-
DO 40 JJ = J, J + JB - 1
493-
CALL CGEMV( 'No transpose', JJ-J+1, N-K, -CONE,
494-
$ A( J, K+1 ), LDA, W( JJ, KW+1 ), LDW, CONE,
495-
$ A( J, JJ ), 1 )
496-
40 CONTINUE
497-
*
498-
* Update the rectangular superdiagonal block
499-
*
500-
CALL CGEMM( 'No transpose', 'Transpose', J-1, JB, N-K,
501-
$ -CONE, A( 1, K+1 ), LDA, W( J, KW+1 ), LDW,
502-
$ CONE, A( 1, J ), LDA )
503-
50 CONTINUE
485+
CALL CGEMMTR( 'Upper', 'No transpose', 'Transpose', K, N-K,
486+
$ -CONE, A( 1, K+1 ), LDA, W( 1, KW+1 ), LDW,
487+
$ CONE, A( 1, 1 ), LDA )
504488
*
505489
* Put U12 in standard form by partially undoing the interchanges
506490
* in columns k+1:n looping backwards from k+1 to n
@@ -778,26 +762,9 @@ SUBROUTINE CLASYF( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW,
778762
*
779763
* A22 := A22 - L21*D*L21**T = A22 - L21*W**T
780764
*
781-
* computing blocks of NB columns at a time
782-
*
783-
DO 110 J = K, N, NB
784-
JB = MIN( NB, N-J+1 )
785-
*
786-
* Update the lower triangle of the diagonal block
787-
*
788-
DO 100 JJ = J, J + JB - 1
789-
CALL CGEMV( 'No transpose', J+JB-JJ, K-1, -CONE,
790-
$ A( JJ, 1 ), LDA, W( JJ, 1 ), LDW, CONE,
791-
$ A( JJ, JJ ), 1 )
792-
100 CONTINUE
793-
*
794-
* Update the rectangular subdiagonal block
795-
*
796-
IF( J+JB.LE.N )
797-
$ CALL CGEMM( 'No transpose', 'Transpose', N-J-JB+1, JB,
798-
$ K-1, -CONE, A( J+JB, 1 ), LDA, W( J, 1 ),
799-
$ LDW, CONE, A( J+JB, J ), LDA )
800-
110 CONTINUE
765+
CALL CGEMMTR( 'Lower', 'No transpose', 'Transpose', N-K+1,
766+
$ K-1, -CONE, A( K, 1 ), LDA, W( K, 1 ), LDW,
767+
$ CONE, A( K, K ), LDA )
801768
*
802769
* Put L21 in standard form by partially undoing the interchanges
803770
* of rows in columns 1:k-1 looping backwards from k-1 to 1

SRC/clasyf_rk.f

+7-41
Original file line numberDiff line numberDiff line change
@@ -298,7 +298,7 @@ SUBROUTINE CLASYF_RK( UPLO, N, NB, KB, A, LDA, E, IPIV, W, LDW,
298298
EXTERNAL LSAME, ICAMAX, SLAMCH
299299
* ..
300300
* .. External Subroutines ..
301-
EXTERNAL CCOPY, CGEMM, CGEMV, CSCAL, CSWAP
301+
EXTERNAL CCOPY, CGEMMTR, CGEMV, CSCAL, CSWAP
302302
* ..
303303
* .. Intrinsic Functions ..
304304
INTRINSIC ABS, AIMAG, MAX, MIN, REAL, SQRT
@@ -627,26 +627,9 @@ SUBROUTINE CLASYF_RK( UPLO, N, NB, KB, A, LDA, E, IPIV, W, LDW,
627627
*
628628
* A11 := A11 - U12*D*U12**T = A11 - U12*W**T
629629
*
630-
* computing blocks of NB columns at a time
631-
*
632-
DO 50 J = ( ( K-1 ) / NB )*NB + 1, 1, -NB
633-
JB = MIN( NB, K-J+1 )
634-
*
635-
* Update the upper triangle of the diagonal block
636-
*
637-
DO 40 JJ = J, J + JB - 1
638-
CALL CGEMV( 'No transpose', JJ-J+1, N-K, -CONE,
639-
$ A( J, K+1 ), LDA, W( JJ, KW+1 ), LDW, CONE,
640-
$ A( J, JJ ), 1 )
641-
40 CONTINUE
642-
*
643-
* Update the rectangular superdiagonal block
644-
*
645-
IF( J.GE.2 )
646-
$ CALL CGEMM( 'No transpose', 'Transpose', J-1, JB,
647-
$ N-K, -CONE, A( 1, K+1 ), LDA, W( J, KW+1 ),
648-
$ LDW, CONE, A( 1, J ), LDA )
649-
50 CONTINUE
630+
CALL CGEMMTR( 'Upper', 'No transpose', 'Transpose', K, N-K,
631+
$ -CONE, A( 1, K+1 ), LDA, W( 1, KW+1 ), LDW,
632+
$ CONE, A( 1, 1 ), LDA )
650633
*
651634
* Set KB to the number of columns factorized
652635
*
@@ -945,26 +928,9 @@ SUBROUTINE CLASYF_RK( UPLO, N, NB, KB, A, LDA, E, IPIV, W, LDW,
945928
*
946929
* A22 := A22 - L21*D*L21**T = A22 - L21*W**T
947930
*
948-
* computing blocks of NB columns at a time
949-
*
950-
DO 110 J = K, N, NB
951-
JB = MIN( NB, N-J+1 )
952-
*
953-
* Update the lower triangle of the diagonal block
954-
*
955-
DO 100 JJ = J, J + JB - 1
956-
CALL CGEMV( 'No transpose', J+JB-JJ, K-1, -CONE,
957-
$ A( JJ, 1 ), LDA, W( JJ, 1 ), LDW, CONE,
958-
$ A( JJ, JJ ), 1 )
959-
100 CONTINUE
960-
*
961-
* Update the rectangular subdiagonal block
962-
*
963-
IF( J+JB.LE.N )
964-
$ CALL CGEMM( 'No transpose', 'Transpose', N-J-JB+1, JB,
965-
$ K-1, -CONE, A( J+JB, 1 ), LDA, W( J, 1 ),
966-
$ LDW, CONE, A( J+JB, J ), LDA )
967-
110 CONTINUE
931+
CALL CGEMMTR( 'Lower', 'No transpose', 'Transpose', N-K+1,
932+
$ K-1, -CONE, A( K, 1 ), LDA, W( K, 1 ), LDW,
933+
$ CONE, A( K, K ), LDA )
968934
*
969935
* Set KB to the number of columns factorized
970936
*

SRC/clasyf_rook.f

+10-40
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ SUBROUTINE CLASYF_ROOK( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW,
208208
* ..
209209
* .. Local Scalars ..
210210
LOGICAL DONE
211-
INTEGER IMAX, ITEMP, J, JB, JJ, JMAX, JP1, JP2, K, KK,
211+
INTEGER IMAX, ITEMP, J, JJ, JMAX, JP1, JP2, K, KK,
212212
$ KW, KKW, KP, KSTEP, P, II
213213
REAL ABSAKK, ALPHA, COLMAX, ROWMAX, STEMP, SFMIN
214214
COMPLEX D11, D12, D21, D22, R1, T, Z
@@ -220,7 +220,7 @@ SUBROUTINE CLASYF_ROOK( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW,
220220
EXTERNAL LSAME, ICAMAX, SLAMCH
221221
* ..
222222
* .. External Subroutines ..
223-
EXTERNAL CCOPY, CGEMM, CGEMV, CSCAL, CSWAP
223+
EXTERNAL CCOPY, CGEMMTR, CGEMV, CSCAL, CSWAP
224224
* ..
225225
* .. Intrinsic Functions ..
226226
INTRINSIC ABS, MAX, MIN, SQRT, AIMAG, REAL
@@ -525,26 +525,11 @@ SUBROUTINE CLASYF_ROOK( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW,
525525
*
526526
* A11 := A11 - U12*D*U12**T = A11 - U12*W**T
527527
*
528-
* computing blocks of NB columns at a time
528+
* (note that conjg(W) is actually stored)
529529
*
530-
DO 50 J = ( ( K-1 ) / NB )*NB + 1, 1, -NB
531-
JB = MIN( NB, K-J+1 )
532-
*
533-
* Update the upper triangle of the diagonal block
534-
*
535-
DO 40 JJ = J, J + JB - 1
536-
CALL CGEMV( 'No transpose', JJ-J+1, N-K, -CONE,
537-
$ A( J, K+1 ), LDA, W( JJ, KW+1 ), LDW, CONE,
538-
$ A( J, JJ ), 1 )
539-
40 CONTINUE
540-
*
541-
* Update the rectangular superdiagonal block
542-
*
543-
IF( J.GE.2 )
544-
$ CALL CGEMM( 'No transpose', 'Transpose', J-1, JB,
545-
$ N-K, -CONE, A( 1, K+1 ), LDA, W( J, KW+1 ), LDW,
546-
$ CONE, A( 1, J ), LDA )
547-
50 CONTINUE
530+
CALL CGEMMTR( 'Upper', 'No transpose', 'Transpose', K, N-K,
531+
$ -CONE, A( 1, K+1 ), LDA, W( 1, KW+1 ), LDW,
532+
$ CONE, A( 1, 1 ), LDA )
548533
*
549534
* Put U12 in standard form by partially undoing the interchanges
550535
* in columns k+1:n
@@ -846,26 +831,11 @@ SUBROUTINE CLASYF_ROOK( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW,
846831
*
847832
* A22 := A22 - L21*D*L21**T = A22 - L21*W**T
848833
*
849-
* computing blocks of NB columns at a time
850-
*
851-
DO 110 J = K, N, NB
852-
JB = MIN( NB, N-J+1 )
853-
*
854-
* Update the lower triangle of the diagonal block
855-
*
856-
DO 100 JJ = J, J + JB - 1
857-
CALL CGEMV( 'No transpose', J+JB-JJ, K-1, -CONE,
858-
$ A( JJ, 1 ), LDA, W( JJ, 1 ), LDW, CONE,
859-
$ A( JJ, JJ ), 1 )
860-
100 CONTINUE
861-
*
862-
* Update the rectangular subdiagonal block
834+
* (note that conjg(W) is actually stored)
863835
*
864-
IF( J+JB.LE.N )
865-
$ CALL CGEMM( 'No transpose', 'Transpose', N-J-JB+1, JB,
866-
$ K-1, -CONE, A( J+JB, 1 ), LDA, W( J, 1 ), LDW,
867-
$ CONE, A( J+JB, J ), LDA )
868-
110 CONTINUE
836+
CALL CGEMMTR( 'Lower', 'No transpose', 'Transpose', N-K+1,
837+
$ K-1, -CONE, A( K, 1 ), LDA, W( K, 1 ), LDW,
838+
$ CONE, A( K, K ), LDA )
869839
*
870840
* Put L21 in standard form by partially undoing the interchanges
871841
* in columns 1:k-1

0 commit comments

Comments
 (0)