random.hpp
Go to the documentation of this file.
1/*!
2 *
3 *
4 * \brief Generation of random variates on opencl devices
5 *
6 * \author O. Krause
7 * \date 2017
8 *
9 *
10 * \par Copyright 1995-2015 Shark Development Team
11 *
12 * <BR><HR>
13 * This file is part of Shark.
14 * <http://image.diku.dk/shark/>
15 *
16 * Shark is free software: you can redistribute it and/or modify
17 * it under the terms of the GNU Lesser General Public License as published
18 * by the Free Software Foundation, either version 3 of the License, or
19 * (at your option) any later version.
20 *
21 * Shark is distributed in the hope that it will be useful,
22 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 * GNU Lesser General Public License for more details.
25 *
26 * You should have received a copy of the GNU Lesser General Public License
27 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28 *
29 */
30#ifndef REMORA_KERNELS_GPU_RANDOM_HPP
31#define REMORA_KERNELS_GPU_RANDOM_HPP
32
33#include <boost/compute/kernel.hpp>
34#include <boost/compute/utility/program_cache.hpp>
35#include <random>
36#include <cstdint>
37
38namespace remora{ namespace bindings{
39
40inline boost::compute::kernel get_random_kernel32(std::string const& kernelname, boost::compute::context const& ctx){
41 const char source[] =
42 //rng4_32 is based on the implementation of the threefry algorithm by:
43 // Copyright 2010-2012, D. E. Shaw Research.
44 // All rights reserved.
45
46 // Redistribution and use in source and binary forms, with or without
47 // modification, are permitted provided that the following conditions are
48 // met:
49
50 // * Redistributions of source code must retain the above copyright
51 // notice, this list of conditions, and the following disclaimer.
52
53 // * Redistributions in binary form must reproduce the above copyright
54 // notice, this list of conditions, and the following disclaimer in the
55 // documentation and/or other materials provided with the distribution.
56
57 // * Neither the name of D. E. Shaw Research nor the names of its
58 // contributors may be used to endorse or promote products derived from
59 // this software without specific prior written permission.
60
61 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
62 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
63 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
64 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
65 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
66 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
67 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
68 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
69 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
70 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
71 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
72 "#define SKEIN_KS_PARITY_32 0x1BD11BDA\n"
73 "enum r123_enum_threefry32x4 {\n"
74 " R_32x4_0_0=10, R_32x4_0_1=26,\n"
75 " R_32x4_1_0=11, R_32x4_1_1=21,\n"
76 " R_32x4_2_0=13, R_32x4_2_1=27,\n"
77 " R_32x4_3_0=23, R_32x4_3_1= 5,\n"
78 " R_32x4_4_0= 6, R_32x4_4_1=20,\n"
79 " R_32x4_5_0=17, R_32x4_5_1=11,\n"
80 " R_32x4_6_0=25, R_32x4_6_1=10,\n"
81 " R_32x4_7_0=18, R_32x4_7_1=20\n"
82 "};\n"
83
84 "static uint RotL_32(uint x, uint N){\n"
85 " return (x << (N & 31)) | (x >> ((32-N) & 31));\n"
86 "}\n"
87
88 "uint4 rng4_32(uint4 X, uint4 k){ \n"
89 " uint8 ks;\n"
90 " ks.lo = k;\n"
91 " ks.s4 = SKEIN_KS_PARITY_32;\n"
92 " ks.s4 ^= k.s0;\n"
93 " ks.s4 ^= k.s1;\n"
94 " ks.s4 ^= k.s2;\n"
95 " ks.s4 ^= k.s3;\n"
96
97 " X += ks.lo;\n"
98 " X.s0+= X.s1; X.s1= RotL_32(X.s1,R_32x4_0_0); X.s1^= X.s0;\n"
99 " X.s2+= X.s3; X.s3= RotL_32(X.s3,R_32x4_0_1); X.s3^= X.s2;\n"
100 " X.s0+= X.s3; X.s3= RotL_32(X.s3,R_32x4_1_0); X.s3^= X.s0;\n"
101 " X.s2+= X.s1; X.s1= RotL_32(X.s1,R_32x4_1_1); X.s1^= X.s2;\n"
102 " X.s0+= X.s1; X.s1= RotL_32(X.s1,R_32x4_2_0); X.s1^= X.s0;\n"
103 " X.s2+= X.s3; X.s3= RotL_32(X.s3,R_32x4_2_1); X.s3^= X.s2;\n"
104 " X.s0+= X.s3; X.s3= RotL_32(X.s3,R_32x4_3_0); X.s3^= X.s0;\n"
105 " X.s2+= X.s1; X.s1= RotL_32(X.s1,R_32x4_3_1); X.s1^= X.s2;\n"
106 " X += ks.s1234;\n"
107 " X.s3 += 1;\n"
108
109 " X.s0+= X.s1; X.s1= RotL_32(X.s1,R_32x4_4_0); X.s1^= X.s0;\n"
110 " X.s2+= X.s3; X.s3= RotL_32(X.s3,R_32x4_4_1); X.s3^= X.s2;\n"
111 " X.s0+= X.s3; X.s3= RotL_32(X.s3,R_32x4_5_0); X.s3^= X.s0;\n"
112 " X.s2+= X.s1; X.s1= RotL_32(X.s1,R_32x4_5_1); X.s1^= X.s2;\n"
113 " X.s0+= X.s1; X.s1= RotL_32(X.s1,R_32x4_6_0); X.s1^= X.s0;\n"
114 " X.s2+= X.s3; X.s3= RotL_32(X.s3,R_32x4_6_1); X.s3^= X.s2;\n"
115 " X.s0+= X.s3; X.s3= RotL_32(X.s3,R_32x4_7_0); X.s3^= X.s0;\n"
116 " X.s2+= X.s1; X.s1= RotL_32(X.s1,R_32x4_7_1); X.s1^= X.s2;\n"
117 " X += ks.s2340;\n"
118 " X.s3 += 2;\n"
119
120 " X.s0+= X.s1; X.s1= RotL_32(X.s1,R_32x4_0_0); X.s1^= X.s0;\n"
121 " X.s2+= X.s3; X.s3= RotL_32(X.s3,R_32x4_0_1); X.s3^= X.s2;\n"
122 " X.s0+= X.s3; X.s3= RotL_32(X.s3,R_32x4_1_0); X.s3^= X.s0;\n"
123 " X.s2+= X.s1; X.s1= RotL_32(X.s1,R_32x4_1_1); X.s1^= X.s2;\n"
124 " X.s0+= X.s1; X.s1= RotL_32(X.s1,R_32x4_2_0); X.s1^= X.s0;\n"
125 " X.s2+= X.s3; X.s3= RotL_32(X.s3,R_32x4_2_1); X.s3^= X.s2;\n"
126 " X.s0+= X.s3; X.s3= RotL_32(X.s3,R_32x4_3_0); X.s3^= X.s0;\n"
127 " X.s2+= X.s1; X.s1= RotL_32(X.s1,R_32x4_3_1); X.s1^= X.s2;\n"
128 " X += ks.s3401;\n"
129 " X.s3 += 3;\n"
130 " return X;\n"
131 "}\n"
132 //source for internal routines to generate random numbers
133 "#define MAX_RANDOM_32 0xffffffff\n"
134 "float4 uniform4_32(float low, float high, uint4 x){\n"
135 " float4 z = convert_float4(x) / MAX_RANDOM_32;\n"
136 " return nextafter(low + z * (high - low), low);\n"
137 "}\n"
138 "float4 normal4_32(float mean, float stddev, uint4 x){\n"
139 " float4 u = uniform4_32(0,1,x);\n"
140 " float2 r = sqrt(-2 * log(u.lo));\n"
141 " float4 phi;\n"
142 " float2 z;\n"
143 " phi.lo = sincos(2*(float)M_PI * u.hi,&z);\n"
144 " phi.hi = z;"
145 " return mean + stddev * r.s0011 * phi;\n"
146 "}\n"
147 "uint4 discrete4_32(uint4 key, uint4 ctr, uint N, uint4 inc) {\n"
148 " uint max_valid = (MAX_RANDOM_32 / N) * N;\n"
149 " int num_valid = 0;\n"
150 " uint res[4];"
151 " while(num_valid < 4){\n"
152 " uint4 x = rng4_32(ctr, key);\n"
153 " if(x.s0 < max_valid){\n"
154 " res[num_valid] = x.s0 % N;\n"
155 " ++num_valid;\n"
156 " }\n"
157 " if(num_valid < 4 && x.s1 < max_valid){\n"
158 " res[num_valid] = x.s1 % N;\n"
159 " ++num_valid;\n"
160 " }\n"
161 " if(num_valid < 4 && x.s2 < max_valid){\n"
162 " res[num_valid] = x.s2 % N;\n"
163 " ++num_valid;\n"
164 " }\n"
165 " if(num_valid < 4 && x.s3 < max_valid){\n"
166 " res[num_valid] = x.s3 % N;\n"
167 " ++num_valid;\n"
168 " }\n"
169 " ctr += inc;\n"
170 " }\n"
171 " return (uint4)(res[0],res[1],res[2],res[3]);\n"
172 "}\n"
173 //kernel routines
174 "__kernel void generate_uniform32(__global float *res, uint4 key, float low, float high, ulong offset, ulong cols, ulong leading, ulong stride) {\n"
175 " uint4 ctr;\n"
176 " ctr.s0 = get_global_id(0) & 0x0000ffff;\n"
177 " ctr.s2 = get_global_id(0) >> 32;\n"
178 " ctr.s1 = get_global_id(1) & 0x0000ffff;\n"
179 " ctr.s3 = get_global_id(1) >> 32;\n"
180 " uint4 x = rng4_32(ctr, key);\n"
181 " float4 u = uniform4_32(low, high, x);\n"
182 " size_t col = 4 * get_global_id(1);\n"
183 " size_t pos = offset + stride * (get_global_id(0) * cols + col);\n"
184 " res[pos] = u.s0;\n"
185 " if(col < cols) res[pos + stride] = u.s1;\n"
186 " if(col + 1 < cols) res[pos +2 * stride] = u.s2;\n"
187 " if(col + 2 < cols) res[pos +3 * stride] = u.s3;\n"
188 "}\n"
189 "__kernel void generate_normal32(__global float *res, uint4 key, float mean, float stddev, ulong offset, ulong cols, ulong leading, ulong stride) {\n"
190 " uint4 ctr;\n"
191 " ctr.s0 = get_global_id(0) & 0x0000ffff;\n"
192 " ctr.s2 = get_global_id(0) >> 32;\n"
193 " ctr.s1 = get_global_id(1) & 0x0000ffff;\n"
194 " ctr.s3 = get_global_id(1) >> 32;\n"
195 " uint4 x = rng4_32(ctr, key);\n"
196 " float4 u = normal4_32(mean, stddev, x);\n"
197 " size_t col = 4 * get_global_id(1);\n"
198 " size_t pos = offset + stride * (get_global_id(0) * cols + col);\n"
199 " res[pos] = u.s0;\n"
200 " if(col < cols) res[pos + stride] = u.s1;\n"
201 " if(col + 1 < cols) res[pos +2 * stride] = u.s2;\n"
202 " if(col + 2 < cols) res[pos +3 * stride] = u.s3;\n"
203 "}\n"
204 "__kernel void generate_discrete_float_int32(__global float* res, uint4 key, int low, int high, ulong offset, ulong cols, ulong leading, ulong stride) {\n"
205 " uint N = high - low + 1;\n"
206 " uint4 ctr;\n"
207 " ctr.s0 = get_global_id(0) & 0x0000ffff;\n"
208 " ctr.s2 = get_global_id(0) >> 32;\n"
209 " ctr.s1 = get_global_id(1) & 0x0000ffff;\n"
210 " ctr.s3 = get_global_id(1) >> 32;\n"
211 " uint4 inc = (uint4)(0,get_global_size(1) & 0x0000ffff, 0, get_global_size(1) >> 32);\n"
212 " uint4 z = discrete4_32(key,ctr,N,inc);\n"
213 " size_t col = 4 * get_global_id(1);\n"
214 " size_t pos = offset + stride * (get_global_id(0) * cols + col);\n"
215 " int valid_required = min((size_t)4, cols - col);\n"
216 " res[pos] = low + convert_float(z.s0)\n;"
217 " if(1 < valid_required) res[pos+stride] = low + convert_float(z.s1)\n;"
218 " if(2 < valid_required) res[pos+2*stride] = low + convert_float(z.s2)\n;"
219 " if(3 < valid_required) res[pos+3*stride] = low + convert_float(z.s2)\n;"
220 "}\n";
221 auto program = boost::compute::program_cache::get_global_cache(ctx)->get_or_build("remora_random_program32", "", source, ctx);
222 return program.create_kernel(kernelname+"32");
223}
224template<class Rng, class T>
225void run_random_kernel(std::string const& kernelname, Rng& rng, boost::compute::command_queue& queue,
226 boost::compute::buffer& buffer, uint64_t offset, uint64_t major, uint64_t minor, uint64_t leading_dimension, uint64_t stride,
227 T arg1, T arg2
228){
229 static_assert(sizeof(T) == 4, "only 32 bit random number types are currently supported");
230 if(sizeof(T) == 4){
231 //get kernel from program
232 auto k = get_random_kernel32(kernelname,queue.get_context());
233 //seed key from rng (this is the only possible entropy source for the rng!)
234 std::uniform_int_distribution<uint32_t> dist(0,0xffffffff);
235 static const std::size_t vectorSize = 4;
236 uint32_t key[vectorSize] = {dist(rng),dist(rng),dist(rng),dist(rng)};
237
238 k.set_arg(0,buffer.get());
239 k.set_arg(1, sizeof(key), key);
240 k.set_arg(2, sizeof(arg1), &arg1);
241 k.set_arg(3, sizeof(arg2), &arg2);
242 k.set_arg(4, 8, &offset);
243 k.set_arg(5, 8, &minor);
244 k.set_arg(6, 8, &leading_dimension);
245 k.set_arg(7, 8, &stride);
246
247 std::size_t global_work_size[2] = {major,(minor + vectorSize-1) / vectorSize};
248 queue.enqueue_nd_range_kernel(k, 2,nullptr, global_work_size, nullptr);
249 }
250}
251template<class V, class Rng>
252void generate_normal(
253 vector_expression<V, gpu_tag>& v,
254 Rng& rng,
255 typename V::value_type mean,
256 typename V::value_type variance
257) {
258 auto storage = v().raw_storage();
259 run_random_kernel("generate_normal", rng, v().queue(), storage.buffer, storage.offset, 1,v().size(), 1, storage.stride, mean, std::sqrt(variance));
260}
261
262template<class M, class Rng>
263void generate_normal(
264 matrix_expression<M, gpu_tag>& m,
265 Rng& rng,
266 typename M::value_type mean,
267 typename M::value_type variance
268) {
269 auto storage = m().raw_storage();
270 std::size_t major = M::orientation::index_M(m().size1(), m().size2());
271 std::size_t minor = M::orientation::index_m(m().size1(), m().size2());
272 run_random_kernel("generate_normal", rng, m().queue(), storage.buffer, storage.offset, major, minor,storage.leading_dimension, 1, mean, std::sqrt(variance));
273}
274
275template<class V, class Rng>
276void generate_uniform(
277 vector_expression<V, gpu_tag>& v,
278 Rng& rng,
279 typename V::value_type low,
280 typename V::value_type high
281) {
282
283 auto storage = v().raw_storage();
284 run_random_kernel("generate_uniform", rng, v().queue(), storage.buffer, storage.offset, 1, v().size(), 1, storage.stride, low, high);
285}
286
287template<class M, class Rng>
288void generate_uniform(
289 matrix_expression<M, gpu_tag>& m,
290 Rng& rng,
291 typename M::value_type low,
292 typename M::value_type high
293) {
294 auto storage = m().raw_storage();
295 std::size_t major = M::orientation::index_M(m().size1(), m().size2());
296 std::size_t minor = M::orientation::index_m(m().size1(), m().size2());
297 run_random_kernel("generate_uniform", rng, m().queue(), storage.buffer, storage.offset, major, minor, storage.leading_dimension, 1,low, high);
298}
299
300template<class V, class Rng>
301void generate_discrete(
302 vector_expression<V, gpu_tag>& v,
303 Rng& rng,
304 int low,
305 int high
306) {
307 auto storage = v().raw_storage();
308 run_random_kernel("generate_discrete_float_int", rng, v().queue(), storage.buffer, storage.offset, 1, v().size(), 1, storage.stride, low, high);
309}
310
311template<class M, class Rng>
312void generate_discrete(
313 matrix_expression<M, gpu_tag>& m,
314 Rng& rng,
315 int low,
316 int high
317) {
318 auto storage = m().raw_storage();
319 std::size_t major = M::orientation::index_M(m().size1(), m().size2());
320 std::size_t minor = M::orientation::index_m(m().size1(), m().size2());
321 run_random_kernel("generate_discrete_float_int", rng, m().queue(), storage.buffer, storage.offset, major, minor,storage.leading_dimension, 1, low, high);
322}
323
324}}
325#endif