#include <map>

#include <cuda_runtime.h>
+ #include <cuda_helper.h>
+
#include "miner.h"

#include "salsa_kernel.h"

#define TEXWIDTH 32768
#define THREADS_PER_WU 4  // four threads per hash

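+ // CUDA 9.0 deprecates the implicit-mask __shfl intrinsic in favour of __shfl_sync,
+ // which takes an explicit mask of participating lanes (here: all 32 lanes of the warp).
+ // The __shfl2 wrapper keeps the kernels building on older toolkits as well; e.g. a
+ // broadcast from lane 0 is written int v = __shfl2(val, 0); on either code path.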
+ #if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300
+ #define __shfl2(var, srcLane) __shfl_sync(0xFFFFFFFFu, var, srcLane)
+ #else
+ #define __shfl2 __shfl
+ #endif
+
typedef enum
{
	ANDERSEN,
@@ -57,12 +65,12 @@ static __host__ __device__ uint4& operator += (uint4& left, const uint4& right)
	return left;
}

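// lane-wise warp shuffle of a uint4: each 32-bit component is read from lane target_thread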
- static __device__ uint4 __shfl(const uint4 bx, int target_thread) {
+ static __device__ uint4 shfl4(const uint4 bx, int target_thread) {
	return make_uint4(
-		__shfl((int)bx.x, target_thread),
-		__shfl((int)bx.y, target_thread),
-		__shfl((int)bx.z, target_thread),
-		__shfl((int)bx.w, target_thread)
+		__shfl2((int)bx.x, target_thread),
+		__shfl2((int)bx.y, target_thread),
+		__shfl2((int)bx.z, target_thread),
+		__shfl2((int)bx.w, target_thread)
	);
}

@@ -97,8 +105,8 @@ void write_keys_direct(const uint4 &b, const uint4 &bx, uint32_t start)

	if (SCHEME == ANDERSEN) {
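		// ANDERSEN scheme: grab the partner lane's (+4) bx and scratch offset, then let
		// threads with bit 2 of their lane index set swap which of the two 16-byte stores they issue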
		int target_thread = (threadIdx.x + 4)%32;
-		uint4 t= b, t2= __shfl(bx, target_thread);
-		int t2_start = __shfl((int)start, target_thread) + 4;
+		uint4 t = b, t2 = shfl4(bx, target_thread);
+		int t2_start = __shfl2((int)start, target_thread) + 4;
		bool c = (threadIdx.x & 0x4);
		*((uint4 *)(&scratch[c ? t2_start : start])) = (c ? t2 : t);
		*((uint4 *)(&scratch[c ? start : t2_start])) = (c ? t : t2);
@@ -115,7 +123,7 @@ void read_keys_direct(uint4 &b, uint4 &bx, uint32_t start)

	if (TEX_DIM == 0) scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32];
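	// TEX_DIM selects the scratchpad read path: 0 = plain global loads, 1 = 1D texture fetches, 2 = 2D texture fetches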
	if (SCHEME == ANDERSEN) {
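		// reverse of the interleaved write: load both halves, swap them back based on bit 2,
		// then shuffle bx back by 4 lanes ((threadIdx.x + 28) % 32 is lane - 4)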
-		int t2_start = __shfl((int)start, (threadIdx.x + 4)%32) + 4;
+		int t2_start = __shfl2((int)start, (threadIdx.x + 4)%32) + 4;
		if (TEX_DIM > 0) { start /= 4; t2_start /= 4; }
		bool c = (threadIdx.x & 0x4);
		if (TEX_DIM == 0) {
@@ -129,7 +137,7 @@ void read_keys_direct(uint4 &b, uint4 &bx, uint32_t start)
			bx = tex2D(texRef2D_4_V, 0.5f + ((c ? start : t2_start)%TEXWIDTH), 0.5f + ((c ? start : t2_start)/TEXWIDTH));
		}
		uint4 tmp = b; b = (c ? bx : b); bx = (c ? tmp : bx);
-		bx = __shfl(bx, (threadIdx.x + 28)%32);
+		bx = shfl4(bx, (threadIdx.x + 28)%32);
	} else {
		if (TEX_DIM == 0) b = *((uint4 *)(&scratch[start]));
		else if (TEX_DIM == 1) b = tex1Dfetch(texRef1D_4_V, start/4);
@@ -149,14 +157,15 @@ void primary_order_shuffle(uint4 &b, uint4 &bx)
	int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3);
	int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3);
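	// x1..x3 address the other lanes of this thread's 4-lane group (rotations by 1, 2 and 3 within the group)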

-	b.w = __shfl((int)b.w, x1);
-	b.z = __shfl((int)b.z, x2);
-	b.y = __shfl((int)b.y, x3);
+	b.w = __shfl2((int)b.w, x1);
+	b.z = __shfl2((int)b.z, x2);
+	b.y = __shfl2((int)b.y, x3);
+
	uint32_t tmp = b.y; b.y = b.w; b.w = tmp;

-	bx.w = __shfl((int)bx.w, x1);
-	bx.z = __shfl((int)bx.z, x2);
-	bx.y = __shfl((int)bx.y, x3);
+	bx.w = __shfl2((int)bx.w, x1);
+	bx.z = __shfl2((int)bx.z, x2);
+	bx.y = __shfl2((int)bx.y, x3);
	tmp = bx.y; bx.y = bx.w; bx.w = tmp;
}

@@ -318,9 +327,9 @@ void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x
		/* Unclear if this optimization is needed: These are ordered based
		 * upon the dependencies needed in the later xors. Compiler should be
		 * able to figure this out, but might as well give it a hand. */
-		x.y = __shfl((int)x.y, x3);
-		x.w = __shfl((int)x.w, x1);
-		x.z = __shfl((int)x.z, x2);
+		x.y = __shfl2((int)x.y, x3);
+		x.w = __shfl2((int)x.w, x1);
+		x.z = __shfl2((int)x.z, x2);

		/* The next XOR_ROTATE_ADDS could be written to be a copy-paste of the first,
		 * but the register targets are rewritten here to swap x[1] and x[3] so that
@@ -333,9 +342,9 @@ void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x
		XOR_ROTATE_ADD(x.y, x.z, x.w, 13);
		XOR_ROTATE_ADD(x.x, x.y, x.z, 18);

-		x.w = __shfl((int)x.w, x3);
-		x.y = __shfl((int)x.y, x1);
-		x.z = __shfl((int)x.z, x2);
+		x.w = __shfl2((int)x.w, x3);
+		x.y = __shfl2((int)x.y, x1);
+		x.z = __shfl2((int)x.z, x2);
	}

	b += x;
@@ -352,18 +361,18 @@ void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x
		XOR_ROTATE_ADD(x.w, x.z, x.y, 13);
		XOR_ROTATE_ADD(x.x, x.w, x.z, 18);

-		x.y = __shfl((int)x.y, x3);
-		x.w = __shfl((int)x.w, x1);
-		x.z = __shfl((int)x.z, x2);
+		x.y = __shfl2((int)x.y, x3);
+		x.w = __shfl2((int)x.w, x1);
+		x.z = __shfl2((int)x.z, x2);

		XOR_ROTATE_ADD(x.w, x.x, x.y, 7);
		XOR_ROTATE_ADD(x.z, x.w, x.x, 9);
		XOR_ROTATE_ADD(x.y, x.z, x.w, 13);
		XOR_ROTATE_ADD(x.x, x.y, x.z, 18);

-		x.w = __shfl((int)x.w, x3);
-		x.y = __shfl((int)x.y, x1);
-		x.z = __shfl((int)x.z, x2);
+		x.w = __shfl2((int)x.w, x3);
+		x.y = __shfl2((int)x.y, x1);
+		x.z = __shfl2((int)x.z, x2);
	}

	// At the end of these iterations, the data is in primary order again.
@@ -407,19 +416,19 @@ void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int
		CHACHA_PRIMITIVE(x.x, x.w, x.y, 8)
		CHACHA_PRIMITIVE(x.z, x.y, x.w, 7)

-		x.y = __shfl((int)x.y, x1);
-		x.z = __shfl((int)x.z, x2);
-		x.w = __shfl((int)x.w, x3);
+		x.y = __shfl2((int)x.y, x1);
+		x.z = __shfl2((int)x.z, x2);
+		x.w = __shfl2((int)x.w, x3);

		// Diagonal Mixing phase of chacha
		CHACHA_PRIMITIVE(x.x, x.w, x.y, 16)
		CHACHA_PRIMITIVE(x.z, x.y, x.w, 12)
		CHACHA_PRIMITIVE(x.x, x.w, x.y, 8)
		CHACHA_PRIMITIVE(x.z, x.y, x.w, 7)

-		x.y = __shfl((int)x.y, x3);
-		x.z = __shfl((int)x.z, x2);
-		x.w = __shfl((int)x.w, x1);
+		x.y = __shfl2((int)x.y, x3);
+		x.z = __shfl2((int)x.z, x2);
+		x.w = __shfl2((int)x.w, x1);
	}

	b += x;
@@ -436,19 +445,19 @@ void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int
		CHACHA_PRIMITIVE(x.x, x.w, x.y, 8)
		CHACHA_PRIMITIVE(x.z, x.y, x.w, 7)

-		x.y = __shfl((int)x.y, x1);
-		x.z = __shfl((int)x.z, x2);
-		x.w = __shfl((int)x.w, x3);
+		x.y = __shfl2((int)x.y, x1);
+		x.z = __shfl2((int)x.z, x2);
+		x.w = __shfl2((int)x.w, x3);

		// Diagonal Mixing phase of chacha
		CHACHA_PRIMITIVE(x.x, x.w, x.y, 16)
		CHACHA_PRIMITIVE(x.z, x.y, x.w, 12)
		CHACHA_PRIMITIVE(x.x, x.w, x.y, 8)
		CHACHA_PRIMITIVE(x.z, x.y, x.w, 7)

-		x.y = __shfl((int)x.y, x3);
-		x.z = __shfl((int)x.z, x2);
-		x.w = __shfl((int)x.w, x1);
+		x.y = __shfl2((int)x.y, x3);
+		x.z = __shfl2((int)x.z, x2);
+		x.w = __shfl2((int)x.w, x1);
	}

#undef CHACHA_PRIMITIVE
@@ -572,7 +581,7 @@ void kepler_scrypt_core_kernelB(uint32_t *d_odata, int begin, int end)
	} else load_key<ALGO>(d_odata, b, bx);

	for (int i = begin; i < end; i++) {
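		// scrypt's data-dependent lookup: broadcast bx.x from the group leader lane
		// (threadIdx.x & 0x1c) and mask with c_N_1 to pick the scratchpad entry j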
-		int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
+		int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
		uint4 t, tx; read_keys_direct<SCHEME, TEX_DIM>(t, tx, start+32*j);
		b ^= t; bx ^= tx;
		block_mixer<ALGO>(b, bx, x1, x2, x3);
@@ -604,15 +613,15 @@ void kepler_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsign
	{
		// better divergent thread handling submitted by nVidia engineers, but
		// supposedly this does not run with the ANDERSEN memory access scheme
-		int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
+		int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
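		// LOOKUP_GAP compression: only every LOOKUP_GAP-th entry is stored, so entry j is
		// rebuilt from the stored entry at j/LOOKUP_GAP by re-running the mixer j%LOOKUP_GAP times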
		int pos = j/LOOKUP_GAP;
		int loop = -1;
		uint4 t, tx;

		int i = begin;
		while (i < end) {
			if (loop==-1) {
-				j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
+				j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
				pos = j/LOOKUP_GAP;
				loop = j-pos*LOOKUP_GAP;
				read_keys_direct<SCHEME,TEX_DIM>(t, tx, start+32*pos);
@@ -634,7 +643,7 @@ void kepler_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsign
		// this is my original implementation, now used with the ANDERSEN
		// memory access scheme only.
		for (int i = begin; i < end; i++) {
-			int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
+			int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
			int pos = j/LOOKUP_GAP, loop = j-pos*LOOKUP_GAP;
			uint4 t, tx; read_keys_direct<SCHEME,TEX_DIM>(t, tx, start+32*pos);
			while (loop--) block_mixer<ALGO>(t, tx, x1, x2, x3);
@@ -644,7 +653,7 @@ void kepler_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsign
	}

	// for (int i = begin; i < end; i++) {
-	//	int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
+	//	int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
	//	int pos = j/LOOKUP_GAP, loop = j-pos*LOOKUP_GAP;
	//	uint4 t, tx; read_keys_direct<SCHEME,TEX_DIM>(t, tx, start+32*pos);
	//	while(loop--) block_mixer<ALGO>(t, tx, x1, x2, x3);