Commit 3761774
cuda: get rid of cuda 9 mask warnings
1 parent f1a7de4

9 files changed: +200 -150 lines

Makefile.am (+3 -2)

@@ -111,9 +111,10 @@ endif
 #ccminer_LDADD += -lsodium
 ccminer_LDADD += -lcuda
 
-nvcc_ARCH = -gencode=arch=compute_50,code=\"sm_50,compute_50\"
-
+nvcc_ARCH :=
+#nvcc_ARCH += -gencode=arch=compute_61,code=\"sm_61,compute_61\"
 nvcc_ARCH += -gencode=arch=compute_52,code=\"sm_52,compute_52\"
+nvcc_ARCH += -gencode=arch=compute_50,code=\"sm_50,compute_50\"
 #nvcc_ARCH += -gencode=arch=compute_35,code=\"sm_35,compute_35\"
 #nvcc_ARCH += -gencode=arch=compute_30,code=\"sm_30,compute_30\"
 
configure.ac (+1 -1)

@@ -1,4 +1,4 @@
-AC_INIT([ccminer], [2.2.4], [], [ccminer], [http://github.com/tpruvot/ccminer])
+AC_INIT([ccminer], [2.2.5], [], [ccminer], [http://github.com/tpruvot/ccminer])
 
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM

cuda_helper.h (+10 -0)

@@ -669,4 +669,14 @@ static uint2 SHR2(uint2 a, int offset)
 #endif
 }
 
+// CUDA 9+ deprecated functions warnings (new mask param)
+#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300
+#undef __shfl
+#define __shfl(var, srcLane, width) __shfl_sync(0xFFFFFFFFu, var, srcLane, width)
+#undef __shfl_up
+#define __shfl_up(var, delta, width) __shfl_up_sync(0xFFFFFFFF, var, delta, width)
+#undef __any
+#define __any(p) __any_sync(0xFFFFFFFFu, p)
+#endif
+
 #endif // #ifndef CUDA_HELPER_H
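What the new block buys: legacy call sites compile unchanged on CUDA 9+, because each deprecated intrinsic is remapped to its *_sync replacement with a full 0xFFFFFFFF warp mask. A minimal sketch of the effect at a call site (illustrative only, not part of the commit; the kernel name is made up, and the full mask is only safe because the whole warp is converged here):

    #include "cuda_helper.h"

    // launch as a single warp, e.g. broadcast_demo<<<1, 32>>>(d_out);
    __global__ void broadcast_demo(int *out)
    {
        int v = threadIdx.x;
        // expands to __shfl_sync(0xFFFFFFFFu, v, 0, 32) on CUDA 9+,
        // and stays the plain __shfl builtin on older toolkits
        v = __shfl(v, 0, 32);
        // likewise __any(p) becomes __any_sync(0xFFFFFFFFu, p)
        if (__any(v == 0))
            out[threadIdx.x] = v;
    }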

equi/cuda_equi.cu (+11 -2)

@@ -65,7 +65,8 @@
 #define __CUDA_ARCH__ 520
 uint32_t __byte_perm(uint32_t x, uint32_t y, uint32_t z);
 uint32_t __byte_perm(uint32_t x, uint32_t y, uint32_t z);
-uint32_t __shfl(uint32_t x, uint32_t y, uint32_t z);
+uint32_t __shfl2(uint32_t x, uint32_t y);
+uint32_t __shfl_sync(uint32_t mask, uint32_t x, uint32_t y);
 uint32_t atomicExch(uint32_t *x, uint32_t y);
 uint32_t atomicAdd(uint32_t *x, uint32_t y);
 void __syncthreads(void);

@@ -79,6 +80,14 @@ u32 umin(const u32, const u32);
 u32 umax(const u32, const u32);
 #endif
 
+#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300
+#define __shfl2(var, srcLane) __shfl_sync(0xFFFFFFFFu, var, srcLane)
+#undef __any
+#define __any(p) __any_sync(0xFFFFFFFFu, p)
+#else
+#define __shfl2 __shfl
+#endif
+
 typedef u32 proof[PROOFSIZE];
 
 struct __align__(32) slot {

@@ -1844,7 +1853,7 @@ __global__ void digit_last_wdc(equi<RB, SM>* eq)
     }
 #if __CUDA_ARCH__ >= 300
     // all threads get the value from lane 0
-    soli = __shfl(soli, 0);
+    soli = __shfl2(soli, 0);
 #else
     __syncthreads();
     soli = eq->edata.srealcont.nsols;
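The call-site change is mechanical: __shfl2 drops the width argument and, on CUDA 9+, supplies the full warp mask; the extra prototypes keep the __INTELLISENSE__ stub block in sync with the new names. A hedged sketch of the lane-0 broadcast pattern that digit_last_wdc relies on (kernel and buffer names are invented for illustration; assumes a converged warp):

    // launch as a single warp, e.g. broadcast_nsols<<<1, 32>>>(d_nsols, d_out);
    __global__ void broadcast_nsols(const uint32_t *d_nsols, uint32_t *d_out)
    {
        uint32_t soli = 0;
        if ((threadIdx.x & 31) == 0)
            soli = d_nsols[0];       // one lane reads the counter
        soli = __shfl2(soli, 0);     // all threads get the value from lane 0
        d_out[threadIdx.x] = soli;
    }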

lyra2/cuda_lyra2_vectors.h (+28 -22)

@@ -16,6 +16,12 @@
 #define __shfl(x, y, z) (x)
 #endif
 
+#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300
+#define __shfl2(var, srcLane) __shfl_sync(0xFFFFFFFFu, var, srcLane)
+#else
+#define __shfl2 __shfl
+#endif
+
 #if __CUDA_ARCH__ < 320 && !defined(__ldg4)
 #define __ldg4(x) (*(x))
 #endif

@@ -89,7 +95,7 @@ typedef struct __align__(16) uint28 {
 typedef uint2x4 uint28; /* name deprecated */
 
 typedef struct __builtin_align__(32) uint48 {
-    uint4 s0,s1;
+    uint4 s0,s1;
 } uint48;
 
 typedef struct __builtin_align__(128) uint4x16{

@@ -368,10 +374,10 @@ static __forceinline__ __device__ void operator^= (ulonglong2to8 &a, const ulong
 
 static __forceinline__ __device__ void operator+= (uint4 &a, uint4 b) { a = a + b; }
 static __forceinline__ __device__ void operator+= (uchar4 &a, uchar4 b) { a = a + b; }
-static __forceinline__ __device__ __host__ void operator+= (uint8 &a, const uint8 &b) { a = a + b; }
-static __forceinline__ __device__ __host__ void operator+= (uint16 &a, const uint16 &b) { a = a + b; }
-static __forceinline__ __device__ void operator+= (uint2_16 &a, const uint2_16 &b) { a = a + b; }
-static __forceinline__ __device__ void operator^= (uint2_16 &a, const uint2_16 &b) { a = a + b; }
+static __forceinline__ __device__ __host__ void operator+= (uint8 &a, const uint8 &b) { a = a + b; }
+static __forceinline__ __device__ __host__ void operator+= (uint16 &a, const uint16 &b) { a = a + b; }
+static __forceinline__ __device__ void operator+= (uint2_16 &a, const uint2_16 &b) { a = a + b; }
+static __forceinline__ __device__ void operator^= (uint2_16 &a, const uint2_16 &b) { a = a + b; }
 
 static __forceinline__ __device__ void operator+= (ulong8 &a, const ulong8 &b) { a = a + b; }
 static __forceinline__ __device__ void operator+= (ulonglong16 &a, const ulonglong16 &b) { a = a + b; }

@@ -551,14 +557,14 @@ static __device__ __forceinline__ uint28 shuffle4(const uint28 &var, int lane)
 {
 #if __CUDA_ARCH__ >= 300
     uint28 res;
-    res.x.x = __shfl(var.x.x, lane);
-    res.x.y = __shfl(var.x.y, lane);
-    res.y.x = __shfl(var.y.x, lane);
-    res.y.y = __shfl(var.y.y, lane);
-    res.z.x = __shfl(var.z.x, lane);
-    res.z.y = __shfl(var.z.y, lane);
-    res.w.x = __shfl(var.w.x, lane);
-    res.w.y = __shfl(var.w.y, lane);
+    res.x.x = __shfl2(var.x.x, lane);
+    res.x.y = __shfl2(var.x.y, lane);
+    res.y.x = __shfl2(var.y.x, lane);
+    res.y.y = __shfl2(var.y.y, lane);
+    res.z.x = __shfl2(var.z.x, lane);
+    res.z.y = __shfl2(var.z.y, lane);
+    res.w.x = __shfl2(var.w.x, lane);
+    res.w.y = __shfl2(var.w.y, lane);
     return res;
 #else
     return var;

@@ -569,22 +575,22 @@ static __device__ __forceinline__ ulonglong4 shuffle4(ulonglong4 var, int lane)
 {
 #if __CUDA_ARCH__ >= 300
     ulonglong4 res;
-    uint2 temp;
+    uint2 temp;
     temp = vectorize(var.x);
-    temp.x = __shfl(temp.x, lane);
-    temp.y = __shfl(temp.y, lane);
+    temp.x = __shfl2(temp.x, lane);
+    temp.y = __shfl2(temp.y, lane);
     res.x = devectorize(temp);
     temp = vectorize(var.y);
-    temp.x = __shfl(temp.x, lane);
-    temp.y = __shfl(temp.y, lane);
+    temp.x = __shfl2(temp.x, lane);
+    temp.y = __shfl2(temp.y, lane);
     res.y = devectorize(temp);
     temp = vectorize(var.z);
-    temp.x = __shfl(temp.x, lane);
-    temp.y = __shfl(temp.y, lane);
+    temp.x = __shfl2(temp.x, lane);
+    temp.y = __shfl2(temp.y, lane);
     res.z = devectorize(temp);
     temp = vectorize(var.w);
-    temp.x = __shfl(temp.x, lane);
-    temp.y = __shfl(temp.y, lane);
+    temp.x = __shfl2(temp.x, lane);
+    temp.y = __shfl2(temp.y, lane);
     res.w = devectorize(temp);
     return res;
 #else
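shuffle4() moves 64-bit words as two 32-bit halves because warp shuffles operate on 32-bit registers in the older toolkits this code still targets. A standalone sketch of that split (vectorize2/devectorize2 below are assumed stand-ins for the repo's vectorize/devectorize helpers; __shfl2 as defined above):

    __device__ __forceinline__ uint2 vectorize2(uint64_t v) {
        return make_uint2((uint32_t)v, (uint32_t)(v >> 32));
    }
    __device__ __forceinline__ uint64_t devectorize2(uint2 v) {
        return ((uint64_t)v.y << 32) | v.x;
    }
    __device__ uint64_t shfl64(uint64_t v, int lane) {
        uint2 t = vectorize2(v);
        t.x = __shfl2(t.x, lane);    // low 32 bits
        t.y = __shfl2(t.y, lane);    // high 32 bits
        return devectorize2(t);
    }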

scrypt/kepler_kernel.cu (+53 -44)

@@ -10,6 +10,8 @@
 #include <map>
 
 #include <cuda_runtime.h>
+#include <cuda_helper.h>
+
 #include "miner.h"
 
 #include "salsa_kernel.h"

@@ -18,6 +20,12 @@
 #define TEXWIDTH 32768
 #define THREADS_PER_WU 4 // four threads per hash
 
+#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300
+#define __shfl2(var, srcLane) __shfl_sync(0xFFFFFFFFu, var, srcLane)
+#else
+#define __shfl2 __shfl
+#endif
+
 typedef enum
 {
     ANDERSEN,

@@ -57,12 +65,12 @@ static __host__ __device__ uint4& operator += (uint4& left, const uint4& right)
     return left;
 }
 
-static __device__ uint4 __shfl(const uint4 bx, int target_thread) {
+static __device__ uint4 shfl4(const uint4 bx, int target_thread) {
     return make_uint4(
-        __shfl((int)bx.x, target_thread),
-        __shfl((int)bx.y, target_thread),
-        __shfl((int)bx.z, target_thread),
-        __shfl((int)bx.w, target_thread)
+        __shfl2((int)bx.x, target_thread),
+        __shfl2((int)bx.y, target_thread),
+        __shfl2((int)bx.z, target_thread),
+        __shfl2((int)bx.w, target_thread)
     );
 }

@@ -97,8 +105,8 @@ void write_keys_direct(const uint4 &b, const uint4 &bx, uint32_t start)
 
     if (SCHEME == ANDERSEN) {
         int target_thread = (threadIdx.x + 4)%32;
-        uint4 t=b, t2=__shfl(bx, target_thread);
-        int t2_start = __shfl((int)start, target_thread) + 4;
+        uint4 t = b, t2 = shfl4(bx, target_thread);
+        int t2_start = __shfl2((int)start, target_thread) + 4;
         bool c = (threadIdx.x & 0x4);
         *((uint4 *)(&scratch[c ? t2_start : start])) = (c ? t2 : t);
         *((uint4 *)(&scratch[c ? start : t2_start])) = (c ? t : t2);

@@ -115,7 +123,7 @@ void read_keys_direct(uint4 &b, uint4 &bx, uint32_t start)
 
     if (TEX_DIM == 0) scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32];
     if (SCHEME == ANDERSEN) {
-        int t2_start = __shfl((int)start, (threadIdx.x + 4)%32) + 4;
+        int t2_start = __shfl2((int)start, (threadIdx.x + 4)%32) + 4;
         if (TEX_DIM > 0) { start /= 4; t2_start /= 4; }
         bool c = (threadIdx.x & 0x4);
         if (TEX_DIM == 0) {

@@ -129,7 +137,7 @@ void read_keys_direct(uint4 &b, uint4 &bx, uint32_t start)
             bx = tex2D(texRef2D_4_V, 0.5f + ((c ? start : t2_start)%TEXWIDTH), 0.5f + ((c ? start : t2_start)/TEXWIDTH));
         }
         uint4 tmp = b; b = (c ? bx : b); bx = (c ? tmp : bx);
-        bx = __shfl(bx, (threadIdx.x + 28)%32);
+        bx = shfl4(bx, (threadIdx.x + 28)%32);
     } else {
         if (TEX_DIM == 0) b = *((uint4 *)(&scratch[start]));
         else if (TEX_DIM == 1) b = tex1Dfetch(texRef1D_4_V, start/4);

@@ -149,14 +157,15 @@ void primary_order_shuffle(uint4 &b, uint4 &bx)
     int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3);
    int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3);
 
-    b.w = __shfl((int)b.w, x1);
-    b.z = __shfl((int)b.z, x2);
-    b.y = __shfl((int)b.y, x3);
+    b.w = __shfl2((int)b.w, x1);
+    b.z = __shfl2((int)b.z, x2);
+    b.y = __shfl2((int)b.y, x3);
+
     uint32_t tmp = b.y; b.y = b.w; b.w = tmp;
 
-    bx.w = __shfl((int)bx.w, x1);
-    bx.z = __shfl((int)bx.z, x2);
-    bx.y = __shfl((int)bx.y, x3);
+    bx.w = __shfl2((int)bx.w, x1);
+    bx.z = __shfl2((int)bx.z, x2);
+    bx.y = __shfl2((int)bx.y, x3);
     tmp = bx.y; bx.y = bx.w; bx.w = tmp;
 }

@@ -318,9 +327,9 @@ void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x
         /* Unclear if this optimization is needed: These are ordered based
          * upon the dependencies needed in the later xors. Compiler should be
          * able to figure this out, but might as well give it a hand. */
-        x.y = __shfl((int)x.y, x3);
-        x.w = __shfl((int)x.w, x1);
-        x.z = __shfl((int)x.z, x2);
+        x.y = __shfl2((int)x.y, x3);
+        x.w = __shfl2((int)x.w, x1);
+        x.z = __shfl2((int)x.z, x2);
 
         /* The next XOR_ROTATE_ADDS could be written to be a copy-paste of the first,
          * but the register targets are rewritten here to swap x[1] and x[3] so that

@@ -333,9 +342,9 @@ void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x
         XOR_ROTATE_ADD(x.y, x.z, x.w, 13);
         XOR_ROTATE_ADD(x.x, x.y, x.z, 18);
 
-        x.w = __shfl((int)x.w, x3);
-        x.y = __shfl((int)x.y, x1);
-        x.z = __shfl((int)x.z, x2);
+        x.w = __shfl2((int)x.w, x3);
+        x.y = __shfl2((int)x.y, x1);
+        x.z = __shfl2((int)x.z, x2);
     }
 
     b += x;

@@ -352,18 +361,18 @@ void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x
         XOR_ROTATE_ADD(x.w, x.z, x.y, 13);
         XOR_ROTATE_ADD(x.x, x.w, x.z, 18);
 
-        x.y = __shfl((int)x.y, x3);
-        x.w = __shfl((int)x.w, x1);
-        x.z = __shfl((int)x.z, x2);
+        x.y = __shfl2((int)x.y, x3);
+        x.w = __shfl2((int)x.w, x1);
+        x.z = __shfl2((int)x.z, x2);
 
         XOR_ROTATE_ADD(x.w, x.x, x.y, 7);
         XOR_ROTATE_ADD(x.z, x.w, x.x, 9);
         XOR_ROTATE_ADD(x.y, x.z, x.w, 13);
         XOR_ROTATE_ADD(x.x, x.y, x.z, 18);
 
-        x.w = __shfl((int)x.w, x3);
-        x.y = __shfl((int)x.y, x1);
-        x.z = __shfl((int)x.z, x2);
+        x.w = __shfl2((int)x.w, x3);
+        x.y = __shfl2((int)x.y, x1);
+        x.z = __shfl2((int)x.z, x2);
     }
 
     // At the end of these iterations, the data is in primary order again.

@@ -407,19 +416,19 @@ void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int
         CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
         CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
 
-        x.y = __shfl((int)x.y, x1);
-        x.z = __shfl((int)x.z, x2);
-        x.w = __shfl((int)x.w, x3);
+        x.y = __shfl2((int)x.y, x1);
+        x.z = __shfl2((int)x.z, x2);
+        x.w = __shfl2((int)x.w, x3);
 
         // Diagonal Mixing phase of chacha
         CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16)
         CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12)
         CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
         CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
 
-        x.y = __shfl((int)x.y, x3);
-        x.z = __shfl((int)x.z, x2);
-        x.w = __shfl((int)x.w, x1);
+        x.y = __shfl2((int)x.y, x3);
+        x.z = __shfl2((int)x.z, x2);
+        x.w = __shfl2((int)x.w, x1);
     }
 
     b += x;

@@ -436,19 +445,19 @@ void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int
         CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
         CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
 
-        x.y = __shfl((int)x.y, x1);
-        x.z = __shfl((int)x.z, x2);
-        x.w = __shfl((int)x.w, x3);
+        x.y = __shfl2((int)x.y, x1);
+        x.z = __shfl2((int)x.z, x2);
+        x.w = __shfl2((int)x.w, x3);
 
         // Diagonal Mixing phase of chacha
         CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16)
         CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12)
         CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
         CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
 
-        x.y = __shfl((int)x.y, x3);
-        x.z = __shfl((int)x.z, x2);
-        x.w = __shfl((int)x.w, x1);
+        x.y = __shfl2((int)x.y, x3);
+        x.z = __shfl2((int)x.z, x2);
+        x.w = __shfl2((int)x.w, x1);
     }
 
     #undef CHACHA_PRIMITIVE

@@ -572,7 +581,7 @@ void kepler_scrypt_core_kernelB(uint32_t *d_odata, int begin, int end)
     } else load_key<ALGO>(d_odata, b, bx);
 
     for (int i = begin; i < end; i++) {
-        int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
+        int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
         uint4 t, tx; read_keys_direct<SCHEME, TEX_DIM>(t, tx, start+32*j);
         b ^= t; bx ^= tx;
         block_mixer<ALGO>(b, bx, x1, x2, x3);

@@ -604,15 +613,15 @@ void kepler_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsign
 {
     // better divergent thread handling submitted by nVidia engineers, but
    // supposedly this does not run with the ANDERSEN memory access scheme
-    int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
+    int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
     int pos = j/LOOKUP_GAP;
     int loop = -1;
     uint4 t, tx;
 
     int i = begin;
     while(i < end) {
         if (loop==-1) {
-            j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
+            j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
             pos = j/LOOKUP_GAP;
             loop = j-pos*LOOKUP_GAP;
             read_keys_direct<SCHEME,TEX_DIM>(t, tx, start+32*pos);

@@ -634,7 +643,7 @@ void kepler_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsign
         // this is my original implementation, now used with the ANDERSEN
         // memory access scheme only.
         for (int i = begin; i < end; i++) {
-            int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
+            int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
             int pos = j/LOOKUP_GAP, loop = j-pos*LOOKUP_GAP;
             uint4 t, tx; read_keys_direct<SCHEME,TEX_DIM>(t, tx, start+32*pos);
             while(loop--) block_mixer<ALGO>(t, tx, x1, x2, x3);

@@ -644,7 +653,7 @@ void kepler_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsign
     }
 
     //for (int i = begin; i < end; i++) {
-    //  int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
+    //  int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
     //  int pos = j/LOOKUP_GAP, loop = j-pos*LOOKUP_GAP;
     //  uint4 t, tx; read_keys_direct<SCHEME,TEX_DIM>(t, tx, start+32*pos);
     //  while(loop--) block_mixer<ALGO>(t, tx, x1, x2, x3);
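The rename matters here: this file had been overloading the __shfl builtin itself with a uint4 version. Once cuda_helper.h may remap __shfl to a macro on CUDA 9+ (and the builtin is deprecated), that overload can no longer be declared, so it becomes shfl4 and shuffles each 32-bit component explicitly. A sketch of the helper in use (illustrative kernel, assumed to live in the same translation unit as shfl4; the lane arithmetic mirrors write_keys_direct):

    __global__ void rotate_keys_demo(uint4 *keys)
    {
        const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x;
        uint4 bx = keys[tid];
        // fetch the uint4 held by the lane four positions ahead, the same
        // (threadIdx.x + 4) % 32 pattern used in write_keys_direct()
        bx = shfl4(bx, (threadIdx.x + 4) % 32);
        keys[tid] = bx;
    }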
