@@ -417,9 +417,9 @@ void quark_skein512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t
417
417
uint32_t nounce = (g_nonceVector != NULL ) ? g_nonceVector[thread] : (startNounce + thread);
418
418
419
419
uint32_t hashPosition = nounce - startNounce;
420
- uint64_t *inpHash = &g_hash[hashPosition * 8U ];
420
+ uint2 *inpHash = ( uint2 *) ( &g_hash[hashPosition * 8U ]) ;
421
421
422
- // Initialisierung
422
+ // Init
423
423
h0 = vectorize (0x4903ADFF749C51CEull );
424
424
h1 = vectorize (0x0D95DE399746DF03ull );
425
425
h2 = vectorize (0x8FD1934127C79BCEull );
@@ -433,17 +433,19 @@ void quark_skein512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t
433
433
// 1st Round -> etype = 480, ptr = 64, bcount = 0, data = msg
434
434
#pragma unroll 8
435
435
for (int i = 0 ; i < 8 ; i++)
436
- p[i] = vectorize ( inpHash[i]) ;
436
+ p[i] = inpHash[i];
437
437
438
- t0 = vectorize (64 ); // ptr
439
- // t1 = vectorize(480ull << 55); // etype
440
- t1 = vectorize (0xf000000000000000ULL );
438
+ t0 = make_uint2 (0x40 , 0 ); // 64
439
+ t1 = vectorize (0xf000000000000000ULL ); // 480ull << 55 (etype)
441
440
442
441
// #if CUDA_VERSION >= 7000
443
442
// doesn't really affect x11 performance.
444
443
__threadfence ();
445
444
// #endif
446
- TFBIG_KINIT_UI2 (h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2);
445
+ // TFBIG_KINIT_UI2(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2);
446
+ h8 = vectorize (0xcab2076d98173ec4ULL );
447
+ t2 = vectorize (0xf000000000000040ULL );
448
+
447
449
TFBIG_4e_UI2 (0 );
448
450
TFBIG_4o_UI2 (1 );
449
451
TFBIG_4e_UI2 (2 );
@@ -464,23 +466,22 @@ void quark_skein512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t
464
466
TFBIG_4o_UI2 (17 );
465
467
TFBIG_ADDKEY_UI2 (p[0 ], p[1 ], p[2 ], p[3 ], p[4 ], p[5 ], p[6 ], p[7 ], h, t, 18 );
466
468
467
- h0 = vectorize ( inpHash[0 ]) ^ p[0 ];
468
- h1 = vectorize ( inpHash[1 ]) ^ p[1 ];
469
- h2 = vectorize ( inpHash[2 ]) ^ p[2 ];
470
- h3 = vectorize ( inpHash[3 ]) ^ p[3 ];
471
- h4 = vectorize ( inpHash[4 ]) ^ p[4 ];
472
- h5 = vectorize ( inpHash[5 ]) ^ p[5 ];
473
- h6 = vectorize ( inpHash[6 ]) ^ p[6 ];
474
- h7 = vectorize ( inpHash[7 ]) ^ p[7 ];
469
+ h0 = inpHash[0 ] ^ p[0 ];
470
+ h1 = inpHash[1 ] ^ p[1 ];
471
+ h2 = inpHash[2 ] ^ p[2 ];
472
+ h3 = inpHash[3 ] ^ p[3 ];
473
+ h4 = inpHash[4 ] ^ p[4 ];
474
+ h5 = inpHash[5 ] ^ p[5 ];
475
+ h6 = inpHash[6 ] ^ p[6 ];
476
+ h7 = inpHash[7 ] ^ p[7 ];
475
477
476
478
// 2nd Round -> etype = 510, ptr = 8, bcount = 0, data = 0
477
479
#pragma unroll 8
478
480
for (int i=0 ; i<8 ; i++)
479
481
p[i] = vectorize (0 );
480
482
481
- t0 = vectorize (8 ); // ptr
482
- // t1 = vectorize(510ull << 55); // etype
483
- t1 = vectorize (0xff00000000000000ULL );
483
+ t0 = make_uint2 (0x8 , 0 );
484
+ t1 = vectorize (0xff00000000000000ULL ); // etype
484
485
485
486
TFBIG_KINIT_UI2 (h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2);
486
487
TFBIG_4e_UI2 (0 );
@@ -526,7 +527,7 @@ void quark_skein512_gpu_hash_64_sm3(uint32_t threads, uint32_t startNounce, uint
526
527
uint32_t nounce = (g_nonceVector != NULL ) ? g_nonceVector[thread] : (startNounce + thread);
527
528
528
529
uint32_t hashPosition = nounce - startNounce;
529
- uint64_t *inpHash = &g_hash[hashPosition * 8 ];
530
+ uint64_t *inpHash = &g_hash[hashPosition * 8U ];
530
531
531
532
// Init
532
533
h0 = 0x4903ADFF749C51CEull ;
@@ -538,16 +539,18 @@ void quark_skein512_gpu_hash_64_sm3(uint32_t threads, uint32_t startNounce, uint
538
539
h6 = 0x991112C71A75B523ull ;
539
540
h7 = 0xAE18A40B660FCC33ull ;
540
541
541
- // 1. Runde -> etype = 480, ptr = 64, bcount = 0, data = msg
542
+ // 1st Round -> etype = 480, ptr = 64, bcount = 0, data = msg
542
543
#pragma unroll 8
543
544
for (int i=0 ; i<8 ; i++)
544
545
p[i] = inpHash[i];
545
546
546
- t0 = 64 ; // ptr
547
- // t1 = 480ull << 55; // etype
548
- t1 = 0xf000000000000000ULL ;
547
+ t0 = 0x40 ; // 64.
548
+ t1 = 0xf000000000000000ULL ; // 480ull << 55 (etype)
549
+
550
+ // TFBIG_KINIT(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2);
551
+ h8 = 0xcab2076d98173ec4ULL ;
552
+ t2 = 0xf000000000000040ULL ;
549
553
550
- TFBIG_KINIT (h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2);
551
554
TFBIG_4e (0 );
552
555
TFBIG_4o (1 );
553
556
TFBIG_4e (2 );
@@ -577,14 +580,15 @@ void quark_skein512_gpu_hash_64_sm3(uint32_t threads, uint32_t startNounce, uint
577
580
h6 = inpHash[6 ] ^ p[6 ];
578
581
h7 = inpHash[7 ] ^ p[7 ];
579
582
580
- // 2. Runde -> etype = 510, ptr = 8, bcount = 0, data = 0
583
+ // 2nd Round -> etype = 510, ptr = 8, bcount = 0, data = 0
581
584
#pragma unroll 8
582
585
for (int i=0 ; i<8 ; i++)
583
586
p[i] = 0ull ;
584
587
585
588
t0 = 8 ; // ptr
586
589
t1 = 510ull << 55 ; // etype
587
590
TFBIG_KINIT (h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2);
591
+
588
592
TFBIG_4e (0 );
589
593
TFBIG_4o (1 );
590
594
TFBIG_4e (2 );
@@ -606,7 +610,7 @@ void quark_skein512_gpu_hash_64_sm3(uint32_t threads, uint32_t startNounce, uint
606
610
TFBIG_ADDKEY (p[0 ], p[1 ], p[2 ], p[3 ], p[4 ], p[5 ], p[6 ], p[7 ], h, t, 18 );
607
611
608
612
// output
609
- uint64_t *outpHash = &g_hash[hashPosition * 8 ];
613
+ uint64_t *outpHash = &g_hash[hashPosition * 8U ];
610
614
611
615
#pragma unroll 8
612
616
for (int i=0 ; i<8 ; i++)
@@ -633,8 +637,6 @@ void skein512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *outp
633
637
h6 = vectorize (c_PaddedMessage80[16 ]);
634
638
h7 = vectorize (c_PaddedMessage80[17 ]);
635
639
636
- t2 = vectorize (c_PaddedMessage80[18 ]);
637
-
638
640
uint32_t nonce = swap ? cuda_swab32 (startNounce + thread) : startNounce + thread;
639
641
uint2 nonce2 = make_uint2 (_LODWORD (c_PaddedMessage80[9 ]), nonce);
640
642
@@ -646,9 +648,13 @@ void skein512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *outp
646
648
for (int i = 2 ; i < 8 ; i++)
647
649
p[i] = vectorize (0ull );
648
650
649
- t0 = vectorize ( 0x50ull );
651
+ t0 = make_uint2 ( 0x50 , 0 );
650
652
t1 = vectorize (0xB000000000000000ull );
651
- TFBIG_KINIT_UI2 (h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2);
653
+
654
+ // TFBIG_KINIT_UI2(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2);
655
+ h8 = vectorize (c_PaddedMessage80[18 ]);
656
+ t2 = vectorize (0xB000000000000050ull ); // t0 ^ t1
657
+
652
658
TFBIG_4e_UI2 (0 );
653
659
TFBIG_4o_UI2 (1 );
654
660
TFBIG_4e_UI2 (2 );
@@ -669,7 +675,7 @@ void skein512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *outp
669
675
TFBIG_4o_UI2 (17 );
670
676
TFBIG_ADDKEY_UI2 (p[0 ], p[1 ], p[2 ], p[3 ], p[4 ], p[5 ], p[6 ], p[7 ], h, t, 18 );
671
677
672
- uint64_t *outpHash = &output64[thread * 8 ];
678
+ uint64_t *outpHash = &output64[thread * 8U ];
673
679
outpHash[0 ] = c_PaddedMessage80[8 ] ^ devectorize (p[0 ]);
674
680
outpHash[1 ] = devectorize (nonce2 ^ p[1 ]);
675
681
#pragma unroll
@@ -684,23 +690,22 @@ void skein512_gpu_hash_80_sm3(uint32_t threads, uint32_t startNounce, uint64_t *
684
690
uint32_t thread = (blockDim .x * blockIdx .x + threadIdx .x );
685
691
if (thread < threads)
686
692
{
687
- uint64_t h0, h1, h2, h3, h4, h5, h6, h7, h8;
688
- uint64_t t0, t1, t2;
689
-
690
693
// Init
691
- h0 = 0x4903ADFF749C51CEull ;
692
- h1 = 0x0D95DE399746DF03ull ;
693
- h2 = 0x8FD1934127C79BCEull ;
694
- h3 = 0x9A255629FF352CB1ull ;
695
- h4 = 0x5DB62599DF6CA7B0ull ;
696
- h5 = 0xEABE394CA9D5C3F4ull ;
697
- h6 = 0x991112C71A75B523ull ;
698
- h7 = 0xAE18A40B660FCC33ull ;
699
-
700
- t0 = 64 ; // ptr
701
- // t1 = vectorize(0xE0ull << 55); // etype
702
- t1 = 0x7000000000000000ull ;
703
- TFBIG_KINIT (h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2);
694
+ uint64_t h0 = 0x4903ADFF749C51CEull ;
695
+ uint64_t h1 = 0x0D95DE399746DF03ull ;
696
+ uint64_t h2 = 0x8FD1934127C79BCEull ;
697
+ uint64_t h3 = 0x9A255629FF352CB1ull ;
698
+ uint64_t h4 = 0x5DB62599DF6CA7B0ull ;
699
+ uint64_t h5 = 0xEABE394CA9D5C3F4ull ;
700
+ uint64_t h6 = 0x991112C71A75B523ull ;
701
+ uint64_t h7 = 0xAE18A40B660FCC33ull ;
702
+
703
+ uint64_t t0 = 0x40 ; // ptr = 64.
704
+ uint64_t t1 = 0x7000000000000000ull ; // 0xE0ull << 55 // etype
705
+
706
+ // TFBIG_KINIT(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2);
707
+ uint64_t t2 = 0x7000000000000040ull ;
708
+ uint64_t h8 = 0xcab2076d98173ec4ull ;
704
709
705
710
uint64_t p[8 ];
706
711
#pragma unroll 8
@@ -745,12 +750,15 @@ void skein512_gpu_hash_80_sm3(uint32_t threads, uint32_t startNounce, uint64_t *
745
750
746
751
#pragma unroll
747
752
for (int i = 2 ; i < 8 ; i++)
748
- p[i] = 0ull ;
753
+ p[i] = 0 ;
749
754
750
- t0 = 0x50ull ; // SPH_T64 (bcount << 6) + (sph_u64)( extra) ;
755
+ t0 = 0x50 ; // (bcount << 6) + extra;
751
756
t1 = 0xB000000000000000ull ; // (bcount >> 58) + ((sph_u64)(etype) << 55);
752
757
753
- TFBIG_KINIT (h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2);
758
+ // TFBIG_KINIT(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2);
759
+ t2 = 0xB000000000000050ull ;
760
+ h8 = c_PaddedMessage80[18 ];
761
+
754
762
TFBIG_4e (0 );
755
763
TFBIG_4o (1 );
756
764
TFBIG_4e (2 );
@@ -773,7 +781,7 @@ void skein512_gpu_hash_80_sm3(uint32_t threads, uint32_t startNounce, uint64_t *
773
781
774
782
// skein_big_close 2nd loop -> etype = 0x1fe, ptr = 8, bcount = 0
775
783
// output
776
- uint64_t *outpHash = &output64[thread * 8 ];
784
+ uint64_t *outpHash = &output64[thread * 8U ];
777
785
outpHash[0 ] = c_PaddedMessage80[8 ] ^ p[0 ];
778
786
outpHash[1 ] = nonce64 ^ p[1 ];
779
787
#pragma unroll
@@ -788,11 +796,10 @@ void skein512_gpu_hash_close(uint32_t threads, uint32_t startNounce, uint64_t *g
788
796
uint32_t thread = (blockDim .x * blockIdx .x + threadIdx .x );
789
797
if (thread < threads)
790
798
{
791
- uint2 t0 = vectorize ( 8 ); // extra
799
+ uint2 t0 = make_uint2 ( 0x8 , 0 ); // extra
792
800
uint2 t1 = vectorize (0xFF00000000000000ull ); // etype
793
- uint2 t2 = vectorize (0xB000000000000050ull );
794
801
795
- uint64_t *state = &g_hash[thread * 8 ];
802
+ uint64_t *state = &g_hash[thread * 8U ];
796
803
uint2 h0 = vectorize (state[0 ]);
797
804
uint2 h1 = vectorize (state[1 ]);
798
805
uint2 h2 = vectorize (state[2 ]);
@@ -801,7 +808,8 @@ void skein512_gpu_hash_close(uint32_t threads, uint32_t startNounce, uint64_t *g
801
808
uint2 h5 = vectorize (state[5 ]);
802
809
uint2 h6 = vectorize (state[6 ]);
803
810
uint2 h7 = vectorize (state[7 ]);
804
- uint2 h8;
811
+
812
+ uint2 h8, t2;
805
813
TFBIG_KINIT_UI2 (h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2);
806
814
807
815
uint2 p[8 ] = { 0 };
@@ -841,9 +849,8 @@ void skein512_gpu_hash_close_sm3(uint32_t threads, uint32_t startNounce, uint64_
841
849
{
842
850
uint64_t t0 = 8ull ; // extra
843
851
uint64_t t1 = 0xFF00000000000000ull ; // etype
844
- uint64_t t2 = 0xB000000000000050ull ;
845
852
846
- uint64_t *state = &g_hash[thread * 8 ];
853
+ uint64_t *state = &g_hash[thread * 8U ];
847
854
848
855
uint64_t h0 = state[0 ];
849
856
uint64_t h1 = state[1 ];
@@ -853,7 +860,7 @@ void skein512_gpu_hash_close_sm3(uint32_t threads, uint32_t startNounce, uint64_
853
860
uint64_t h5 = state[5 ];
854
861
uint64_t h6 = state[6 ];
855
862
uint64_t h7 = state[7 ];
856
- uint64_t h8;
863
+ uint64_t h8, t2 ;
857
864
TFBIG_KINIT (h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2);
858
865
859
866
uint64_t p[8 ] = { 0 };
@@ -971,7 +978,10 @@ static void skein512_precalc_80(uint64_t* message)
971
978
message[16 ] = message[6 ] ^ p[6 ];
972
979
message[17 ] = message[7 ] ^ p[7 ];
973
980
974
- message[18 ] = t2;
981
+ // h8
982
+ message[18 ] = 0x1BD11BDAA9FC1A22ULL ;
983
+ for (int i=10 ; i<18 ; i++)
984
+ message[18 ] ^= message[i];
975
985
}
976
986
977
987
__host__
0 commit comments