30#ifndef REMORA_KERNELS_GPU_RANDOM_HPP
31#define REMORA_KERNELS_GPU_RANDOM_HPP
33#include <boost/compute/kernel.hpp>
34#include <boost/compute/utility/program_cache.hpp>
38namespace remora{
namespace bindings{
40inline boost::compute::kernel get_random_kernel32(std::string
const& kernelname, boost::compute::context
const& ctx){
72 "#define SKEIN_KS_PARITY_32 0x1BD11BDA\n"
73 "enum r123_enum_threefry32x4 {\n"
74 " R_32x4_0_0=10, R_32x4_0_1=26,\n"
75 " R_32x4_1_0=11, R_32x4_1_1=21,\n"
76 " R_32x4_2_0=13, R_32x4_2_1=27,\n"
77 " R_32x4_3_0=23, R_32x4_3_1= 5,\n"
78 " R_32x4_4_0= 6, R_32x4_4_1=20,\n"
79 " R_32x4_5_0=17, R_32x4_5_1=11,\n"
80 " R_32x4_6_0=25, R_32x4_6_1=10,\n"
81 " R_32x4_7_0=18, R_32x4_7_1=20\n"
84 "static uint RotL_32(uint x, uint N){\n"
85 " return (x << (N & 31)) | (x >> ((32-N) & 31));\n"
88 "uint4 rng4_32(uint4 X, uint4 k){ \n"
91 " ks.s4 = SKEIN_KS_PARITY_32;\n"
98 " X.s0+= X.s1; X.s1= RotL_32(X.s1,R_32x4_0_0); X.s1^= X.s0;\n"
99 " X.s2+= X.s3; X.s3= RotL_32(X.s3,R_32x4_0_1); X.s3^= X.s2;\n"
100 " X.s0+= X.s3; X.s3= RotL_32(X.s3,R_32x4_1_0); X.s3^= X.s0;\n"
101 " X.s2+= X.s1; X.s1= RotL_32(X.s1,R_32x4_1_1); X.s1^= X.s2;\n"
102 " X.s0+= X.s1; X.s1= RotL_32(X.s1,R_32x4_2_0); X.s1^= X.s0;\n"
103 " X.s2+= X.s3; X.s3= RotL_32(X.s3,R_32x4_2_1); X.s3^= X.s2;\n"
104 " X.s0+= X.s3; X.s3= RotL_32(X.s3,R_32x4_3_0); X.s3^= X.s0;\n"
105 " X.s2+= X.s1; X.s1= RotL_32(X.s1,R_32x4_3_1); X.s1^= X.s2;\n"
109 " X.s0+= X.s1; X.s1= RotL_32(X.s1,R_32x4_4_0); X.s1^= X.s0;\n"
110 " X.s2+= X.s3; X.s3= RotL_32(X.s3,R_32x4_4_1); X.s3^= X.s2;\n"
111 " X.s0+= X.s3; X.s3= RotL_32(X.s3,R_32x4_5_0); X.s3^= X.s0;\n"
112 " X.s2+= X.s1; X.s1= RotL_32(X.s1,R_32x4_5_1); X.s1^= X.s2;\n"
113 " X.s0+= X.s1; X.s1= RotL_32(X.s1,R_32x4_6_0); X.s1^= X.s0;\n"
114 " X.s2+= X.s3; X.s3= RotL_32(X.s3,R_32x4_6_1); X.s3^= X.s2;\n"
115 " X.s0+= X.s3; X.s3= RotL_32(X.s3,R_32x4_7_0); X.s3^= X.s0;\n"
116 " X.s2+= X.s1; X.s1= RotL_32(X.s1,R_32x4_7_1); X.s1^= X.s2;\n"
120 " X.s0+= X.s1; X.s1= RotL_32(X.s1,R_32x4_0_0); X.s1^= X.s0;\n"
121 " X.s2+= X.s3; X.s3= RotL_32(X.s3,R_32x4_0_1); X.s3^= X.s2;\n"
122 " X.s0+= X.s3; X.s3= RotL_32(X.s3,R_32x4_1_0); X.s3^= X.s0;\n"
123 " X.s2+= X.s1; X.s1= RotL_32(X.s1,R_32x4_1_1); X.s1^= X.s2;\n"
124 " X.s0+= X.s1; X.s1= RotL_32(X.s1,R_32x4_2_0); X.s1^= X.s0;\n"
125 " X.s2+= X.s3; X.s3= RotL_32(X.s3,R_32x4_2_1); X.s3^= X.s2;\n"
126 " X.s0+= X.s3; X.s3= RotL_32(X.s3,R_32x4_3_0); X.s3^= X.s0;\n"
127 " X.s2+= X.s1; X.s1= RotL_32(X.s1,R_32x4_3_1); X.s1^= X.s2;\n"
133 "#define MAX_RANDOM_32 0xffffffff\n"
134 "float4 uniform4_32(float low, float high, uint4 x){\n"
135 " float4 z = convert_float4(x) / MAX_RANDOM_32;\n"
136 " return nextafter(low + z * (high - low), low);\n"
138 "float4 normal4_32(float mean, float stddev, uint4 x){\n"
139 " float4 u = uniform4_32(0,1,x);\n"
140 " float2 r = sqrt(-2 * log(u.lo));\n"
143 " phi.lo = sincos(2*(float)M_PI * u.hi,&z);\n"
145 " return mean + stddev * r.s0011 * phi;\n"
147 "uint4 discrete4_32(uint4 key, uint4 ctr, uint N, uint4 inc) {\n"
148 " uint max_valid = (MAX_RANDOM_32 / N) * N;\n"
149 " int num_valid = 0;\n"
151 " while(num_valid < 4){\n"
152 " uint4 x = rng4_32(ctr, key);\n"
153 " if(x.s0 < max_valid){\n"
154 " res[num_valid] = x.s0 % N;\n"
157 " if(num_valid < 4 && x.s1 < max_valid){\n"
158 " res[num_valid] = x.s1 % N;\n"
161 " if(num_valid < 4 && x.s2 < max_valid){\n"
162 " res[num_valid] = x.s2 % N;\n"
165 " if(num_valid < 4 && x.s3 < max_valid){\n"
166 " res[num_valid] = x.s3 % N;\n"
171 " return (uint4)(res[0],res[1],res[2],res[3]);\n"
174 "__kernel void generate_uniform32(__global float *res, uint4 key, float low, float high, ulong offset, ulong cols, ulong leading, ulong stride) {\n"
176 " ctr.s0 = get_global_id(0) & 0x0000ffff;\n"
177 " ctr.s2 = get_global_id(0) >> 32;\n"
178 " ctr.s1 = get_global_id(1) & 0x0000ffff;\n"
179 " ctr.s3 = get_global_id(1) >> 32;\n"
180 " uint4 x = rng4_32(ctr, key);\n"
181 " float4 u = uniform4_32(low, high, x);\n"
182 " size_t col = 4 * get_global_id(1);\n"
183 " size_t pos = offset + stride * (get_global_id(0) * cols + col);\n"
184 " res[pos] = u.s0;\n"
185 " if(col < cols) res[pos + stride] = u.s1;\n"
186 " if(col + 1 < cols) res[pos +2 * stride] = u.s2;\n"
187 " if(col + 2 < cols) res[pos +3 * stride] = u.s3;\n"
189 "__kernel void generate_normal32(__global float *res, uint4 key, float mean, float stddev, ulong offset, ulong cols, ulong leading, ulong stride) {\n"
191 " ctr.s0 = get_global_id(0) & 0x0000ffff;\n"
192 " ctr.s2 = get_global_id(0) >> 32;\n"
193 " ctr.s1 = get_global_id(1) & 0x0000ffff;\n"
194 " ctr.s3 = get_global_id(1) >> 32;\n"
195 " uint4 x = rng4_32(ctr, key);\n"
196 " float4 u = normal4_32(mean, stddev, x);\n"
197 " size_t col = 4 * get_global_id(1);\n"
198 " size_t pos = offset + stride * (get_global_id(0) * cols + col);\n"
199 " res[pos] = u.s0;\n"
200 " if(col < cols) res[pos + stride] = u.s1;\n"
201 " if(col + 1 < cols) res[pos +2 * stride] = u.s2;\n"
202 " if(col + 2 < cols) res[pos +3 * stride] = u.s3;\n"
204 "__kernel void generate_discrete_float_int32(__global float* res, uint4 key, int low, int high, ulong offset, ulong cols, ulong leading, ulong stride) {\n"
205 " uint N = high - low + 1;\n"
207 " ctr.s0 = get_global_id(0) & 0x0000ffff;\n"
208 " ctr.s2 = get_global_id(0) >> 32;\n"
209 " ctr.s1 = get_global_id(1) & 0x0000ffff;\n"
210 " ctr.s3 = get_global_id(1) >> 32;\n"
211 " uint4 inc = (uint4)(0,get_global_size(1) & 0x0000ffff, 0, get_global_size(1) >> 32);\n"
212 " uint4 z = discrete4_32(key,ctr,N,inc);\n"
213 " size_t col = 4 * get_global_id(1);\n"
214 " size_t pos = offset + stride * (get_global_id(0) * cols + col);\n"
215 " int valid_required = min((size_t)4, cols - col);\n"
216 " res[pos] = low + convert_float(z.s0)\n;"
217 " if(1 < valid_required) res[pos+stride] = low + convert_float(z.s1)\n;"
218 " if(2 < valid_required) res[pos+2*stride] = low + convert_float(z.s2)\n;"
219 " if(3 < valid_required) res[pos+3*stride] = low + convert_float(z.s2)\n;"
221 auto program = boost::compute::program_cache::get_global_cache(ctx)->get_or_build(
"remora_random_program32",
"", source, ctx);
222 return program.create_kernel(kernelname+
"32");
224template<
class Rng,
class T>
225void run_random_kernel(std::string
const& kernelname, Rng& rng, boost::compute::command_queue& queue,
226 boost::compute::buffer& buffer, uint64_t offset, uint64_t major, uint64_t minor, uint64_t leading_dimension, uint64_t stride,
229 static_assert(
sizeof(T) == 4,
"only 32 bit random number types are currently supported");
232 auto k = get_random_kernel32(kernelname,queue.get_context());
234 std::uniform_int_distribution<uint32_t> dist(0,0xffffffff);
235 static const std::size_t vectorSize = 4;
236 uint32_t key[vectorSize] = {dist(rng),dist(rng),dist(rng),dist(rng)};
238 k.set_arg(0,buffer.get());
239 k.set_arg(1,
sizeof(key), key);
240 k.set_arg(2,
sizeof(arg1), &arg1);
241 k.set_arg(3,
sizeof(arg2), &arg2);
242 k.set_arg(4, 8, &offset);
243 k.set_arg(5, 8, &minor);
244 k.set_arg(6, 8, &leading_dimension);
245 k.set_arg(7, 8, &stride);
247 std::size_t global_work_size[2] = {major,(minor + vectorSize-1) / vectorSize};
248 queue.enqueue_nd_range_kernel(k, 2,
nullptr, global_work_size,
nullptr);
251template<
class V,
class Rng>
253 vector_expression<V, gpu_tag>& v,
255 typename V::value_type mean,
256 typename V::value_type variance
258 auto storage = v().raw_storage();
259 run_random_kernel(
"generate_normal", rng, v().queue(), storage.buffer, storage.offset, 1,v().size(), 1, storage.stride, mean, std::sqrt(variance));
262template<
class M,
class Rng>
264 matrix_expression<M, gpu_tag>& m,
266 typename M::value_type mean,
267 typename M::value_type variance
269 auto storage = m().raw_storage();
270 std::size_t major = M::orientation::index_M(m().size1(), m().size2());
271 std::size_t minor = M::orientation::index_m(m().size1(), m().size2());
272 run_random_kernel(
"generate_normal", rng, m().queue(), storage.buffer, storage.offset, major, minor,storage.leading_dimension, 1, mean, std::sqrt(variance));
275template<
class V,
class Rng>
276void generate_uniform(
277 vector_expression<V, gpu_tag>& v,
279 typename V::value_type low,
280 typename V::value_type high
283 auto storage = v().raw_storage();
284 run_random_kernel(
"generate_uniform", rng, v().queue(), storage.buffer, storage.offset, 1, v().size(), 1, storage.stride, low, high);
287template<
class M,
class Rng>
288void generate_uniform(
289 matrix_expression<M, gpu_tag>& m,
291 typename M::value_type low,
292 typename M::value_type high
294 auto storage = m().raw_storage();
295 std::size_t major = M::orientation::index_M(m().size1(), m().size2());
296 std::size_t minor = M::orientation::index_m(m().size1(), m().size2());
297 run_random_kernel(
"generate_uniform", rng, m().queue(), storage.buffer, storage.offset, major, minor, storage.leading_dimension, 1,low, high);
300template<
class V,
class Rng>
301void generate_discrete(
302 vector_expression<V, gpu_tag>& v,
307 auto storage = v().raw_storage();
308 run_random_kernel(
"generate_discrete_float_int", rng, v().queue(), storage.buffer, storage.offset, 1, v().size(), 1, storage.stride, low, high);
311template<
class M,
class Rng>
312void generate_discrete(
313 matrix_expression<M, gpu_tag>& m,
318 auto storage = m().raw_storage();
319 std::size_t major = M::orientation::index_M(m().size1(), m().size2());
320 std::size_t minor = M::orientation::index_m(m().size1(), m().size2());
321 run_random_kernel(
"generate_discrete_float_int", rng, m().queue(), storage.buffer, storage.offset, major, minor,storage.leading_dimension, 1, low, high);