#ifndef PCL_GPU_KINFU_CUDA_UTILS_HPP_
#define PCL_GPU_KINFU_CUDA_UTILS_HPP_

#include <cuda.h>    // CUDA_VERSION
#include <limits.h>  // SHRT_MAX

namespace pcl
{
namespace device
{

template<class T>
__device__ __host__ __forceinline__ void
swap(T& a, T& b)
{
  T c(a); a = b; b = c;
}
template<typename T> struct numeric_limits;

template<> struct numeric_limits<float>
{
  __device__ __forceinline__ static float quiet_NaN() { return __int_as_float(0x7fffffff); /*CUDART_NAN_F*/ };
  __device__ __forceinline__ static float epsilon()   { return 1.192092896e-07f; /*FLT_EPSILON*/ };
  __device__ __forceinline__ static float min()       { return 1.175494351e-38f; /*FLT_MIN*/ };
  __device__ __forceinline__ static float max()       { return 3.402823466e+38f; /*FLT_MAX*/ };
};

template<> struct numeric_limits<short>
{
  __device__ __forceinline__ static short max() { return SHRT_MAX; };
};
__device__ __forceinline__ float
dot(const float3& v1, const float3& v2)
{
  return v1.x * v2.x + v1.y * v2.y + v1.z * v2.z;
}
__device__ __forceinline__ float3&
operator+=(float3& vec, const float& v)
{
  vec.x += v;  vec.y += v;  vec.z += v;
  return vec;
}

__device__ __forceinline__ float3
operator+(const float3& v1, const float3& v2)
{
  return make_float3(v1.x + v2.x, v1.y + v2.y, v1.z + v2.z);
}

__device__ __forceinline__ float3&
operator*=(float3& vec, const float& v)
{
  vec.x *= v;  vec.y *= v;  vec.z *= v;
  return vec;
}

__device__ __forceinline__ float3
operator-(const float3& v1, const float3& v2)
{
  return make_float3(v1.x - v2.x, v1.y - v2.y, v1.z - v2.z);
}

__device__ __forceinline__ float3
operator*(const float3& v1, const float& v)
{
  return make_float3(v1.x * v, v1.y * v, v1.z * v);
}

__device__ __forceinline__ float
norm(const float3& v)
{
  return sqrt(dot(v, v));
}

__device__ __forceinline__ float3
normalized(const float3& v)
{
  return v * rsqrt(dot(v, v));
}
__device__ __host__ __forceinline__ float3
cross(const float3& v1, const float3& v2)
{
  return make_float3(v1.y * v2.z - v1.z * v2.y, v1.z * v2.x - v1.x * v2.z, v1.x * v2.y - v1.y * v2.x);
}
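// Illustrative sketch (not part of the original header): the float3 helpers above
// compose in the usual way, e.g. the unit normal of a triangle (a, b, c):
//
//   __device__ __forceinline__ float3
//   triangleNormal(const float3& a, const float3& b, const float3& c)
//   {
//     return normalized(cross(b - a, c - a));
//   }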
__device__ __forceinline__ void
computeRoots2(const float& b, const float& c, float3& roots)
{
  roots.x = 0.f;
  float d = b * b - 4.f * c;
  if (d < 0.f) // no real roots! This should not happen for a symmetric matrix.
    d = 0.f;

  float sd = sqrtf(d);

  roots.z = 0.5f * (b + sd);
  roots.y = 0.5f * (b - sd);
}
__device__ __forceinline__ void
computeRoots3(float c0, float c1, float c2, float3& roots)
{
  if (fabsf(c0) < numeric_limits<float>::epsilon()) // one root is 0 -> quadratic equation
    computeRoots2(c2, c1, roots);
  else
  {
    const float s_inv3 = 1.f / 3.f;
    const float s_sqrt3 = sqrtf(3.f);
    // Parameters used in classifying the roots and solving the cubic in closed form.
    float c2_over_3 = c2 * s_inv3;
    float a_over_3 = (c1 - c2 * c2_over_3) * s_inv3;
    if (a_over_3 > 0.f)
      a_over_3 = 0.f;

    float half_b = 0.5f * (c0 + c2_over_3 * (2.f * c2_over_3 * c2_over_3 - c1));

    float q = half_b * half_b + a_over_3 * a_over_3 * a_over_3;
    if (q > 0.f)
      q = 0.f;

    // Compute the eigenvalues by solving for the roots of the polynomial.
    float rho = sqrtf(-a_over_3);
    float theta = atan2f(sqrtf(-q), half_b) * s_inv3;
    float cos_theta = __cosf(theta);
    float sin_theta = __sinf(theta);
    roots.x = c2_over_3 + 2.f * rho * cos_theta;
    roots.y = c2_over_3 - rho * (cos_theta + s_sqrt3 * sin_theta);
    roots.z = c2_over_3 - rho * (cos_theta - s_sqrt3 * sin_theta);

    // Sort in increasing order.
    if (roots.x >= roots.y)
      swap(roots.x, roots.y);

    if (roots.y >= roots.z)
    {
      swap(roots.y, roots.z);

      if (roots.x >= roots.y)
        swap(roots.x, roots.y);
    }

    if (roots.x <= 0) // eigenvalues of a symmetric positive semi-definite matrix cannot be negative
      computeRoots2(c2, c1, roots);
  }
}
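// Illustrative sketch (not part of the original header): computeRoots3 returns, in
// ascending order, the roots of  x^3 - c2*x^2 + c1*x - c0 = 0, assuming all three
// roots are real (the symmetric-matrix case). For example, (x-1)(x-2)(x-4) gives
// c2 = 7, c1 = 14, c0 = 8:
//
//   float3 r;
//   computeRoots3(8.f, 14.f, 7.f, r);   // r ~= (1, 2, 4)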
struct Eigen33
{
public:
  template<int Rows>
  struct MiniMat
  {
    float3 data[Rows];
    __device__ __host__ __forceinline__ float3& operator[](int i) { return data[i]; }
    __device__ __host__ __forceinline__ const float3& operator[](int i) const { return data[i]; }
  };
  typedef MiniMat<3> Mat33;
  typedef MiniMat<4> Mat43;
  static __forceinline__ __device__ float3
  unitOrthogonal(const float3& src)
  {
    float3 perp;
    // Unless x and y are both close to zero, take (-y, x, 0) and normalize it.
    if (!isMuchSmallerThan(src.x, src.z) || !isMuchSmallerThan(src.y, src.z))
    {
      float invnm = rsqrtf(src.x * src.x + src.y * src.y);
      perp.x = -src.y * invnm;  perp.y = src.x * invnm;  perp.z = 0.0f;
    }
    else // src is close to the z-axis, so cross it with (1, 0, 0) instead.
    {
      float invnm = rsqrtf(src.z * src.z + src.y * src.y);
      perp.x = 0.0f;  perp.y = -src.z * invnm;  perp.z = src.y * invnm;
    }
    return perp;
  }
  __device__ __forceinline__
  Eigen33(volatile float* mat_pkg_arg) : mat_pkg(mat_pkg_arg) {}
  __device__ __forceinline__ void
  compute(Mat33& tmp, Mat33& vec_tmp, Mat33& evecs, float3& evals)
  {
    // Scale the matrix so its entries are in [-1,1]. The scaling is applied
    // only when at least one matrix entry has magnitude larger than 1.
    float max01 = fmaxf( fabsf(mat_pkg[0]), fabsf(mat_pkg[1]) );
    float max23 = fmaxf( fabsf(mat_pkg[2]), fabsf(mat_pkg[3]) );
    float max45 = fmaxf( fabsf(mat_pkg[4]), fabsf(mat_pkg[5]) );
    float m0123 = fmaxf( max01, max23);
    float scale = fmaxf( max45, m0123);

    if (scale <= numeric_limits<float>::min())
      scale = 1.f;

    mat_pkg[0] /= scale;  mat_pkg[1] /= scale;  mat_pkg[2] /= scale;
    mat_pkg[3] /= scale;  mat_pkg[4] /= scale;  mat_pkg[5] /= scale;
    // The characteristic equation is x^3 - c2*x^2 + c1*x - c0 = 0. Its roots are
    // the eigenvalues, all real-valued because the matrix is symmetric.
    float c0 = m00() * m11() * m22()
        + 2.f * m01() * m02() * m12()
        - m00() * m12() * m12()
        - m11() * m02() * m02()
        - m22() * m01() * m01();
    float c1 = m00() * m11() - m01() * m01()
        + m00() * m22() - m02() * m02()
        + m11() * m22() - m12() * m12();
    float c2 = m00() + m11() + m22();

    computeRoots3(c0, c1, c2, evals);
    if (evals.z - evals.x <= numeric_limits<float>::epsilon())
    {
      // All three eigenvalues are (nearly) equal: any orthonormal basis will do.
      evecs[0] = make_float3(1.f, 0.f, 0.f);
      evecs[1] = make_float3(0.f, 1.f, 0.f);
      evecs[2] = make_float3(0.f, 0.f, 1.f);
    }
    else if (evals.y - evals.x <= numeric_limits<float>::epsilon())
    {
      // First and second eigenvalues are (nearly) equal.
      tmp[0] = row0();  tmp[1] = row1();  tmp[2] = row2();
      tmp[0].x -= evals.z;  tmp[1].y -= evals.z;  tmp[2].z -= evals.z;

      vec_tmp[0] = cross(tmp[0], tmp[1]);
      vec_tmp[1] = cross(tmp[0], tmp[2]);
      vec_tmp[2] = cross(tmp[1], tmp[2]);

      float len1 = dot(vec_tmp[0], vec_tmp[0]);
      float len2 = dot(vec_tmp[1], vec_tmp[1]);
      float len3 = dot(vec_tmp[2], vec_tmp[2]);

      if (len1 >= len2 && len1 >= len3)
        evecs[2] = vec_tmp[0] * rsqrtf(len1);
      else if (len2 >= len1 && len2 >= len3)
        evecs[2] = vec_tmp[1] * rsqrtf(len2);
      else
        evecs[2] = vec_tmp[2] * rsqrtf(len3);

      evecs[1] = unitOrthogonal(evecs[2]);
      evecs[0] = cross(evecs[1], evecs[2]);
    }
    else if (evals.z - evals.y <= numeric_limits<float>::epsilon())
    {
      // Second and third eigenvalues are (nearly) equal.
      tmp[0] = row0();  tmp[1] = row1();  tmp[2] = row2();
      tmp[0].x -= evals.x;  tmp[1].y -= evals.x;  tmp[2].z -= evals.x;

      vec_tmp[0] = cross(tmp[0], tmp[1]);
      vec_tmp[1] = cross(tmp[0], tmp[2]);
      vec_tmp[2] = cross(tmp[1], tmp[2]);

      float len1 = dot(vec_tmp[0], vec_tmp[0]);
      float len2 = dot(vec_tmp[1], vec_tmp[1]);
      float len3 = dot(vec_tmp[2], vec_tmp[2]);

      if (len1 >= len2 && len1 >= len3)
        evecs[0] = vec_tmp[0] * rsqrtf(len1);
      else if (len2 >= len1 && len2 >= len3)
        evecs[0] = vec_tmp[1] * rsqrtf(len2);
      else
        evecs[0] = vec_tmp[2] * rsqrtf(len3);

      evecs[1] = unitOrthogonal(evecs[0]);
      evecs[2] = cross(evecs[0], evecs[1]);
    }
    else
    {
      // General case: all three eigenvalues are distinct.
      tmp[0] = row0();  tmp[1] = row1();  tmp[2] = row2();
      tmp[0].x -= evals.z;  tmp[1].y -= evals.z;  tmp[2].z -= evals.z;

      vec_tmp[0] = cross(tmp[0], tmp[1]);
      vec_tmp[1] = cross(tmp[0], tmp[2]);
      vec_tmp[2] = cross(tmp[1], tmp[2]);

      float len1 = dot(vec_tmp[0], vec_tmp[0]);
      float len2 = dot(vec_tmp[1], vec_tmp[1]);
      float len3 = dot(vec_tmp[2], vec_tmp[2]);

      float mmax[3];

      unsigned int min_el = 2;
      unsigned int max_el = 2;
      if (len1 >= len2 && len1 >= len3)
      {
        mmax[2] = len1;
        evecs[2] = vec_tmp[0] * rsqrtf(len1);
      }
      else if (len2 >= len1 && len2 >= len3)
      {
        mmax[2] = len2;
        evecs[2] = vec_tmp[1] * rsqrtf(len2);
      }
      else
      {
        mmax[2] = len3;
        evecs[2] = vec_tmp[2] * rsqrtf(len3);
      }

      tmp[0] = row0();  tmp[1] = row1();  tmp[2] = row2();
      tmp[0].x -= evals.y;  tmp[1].y -= evals.y;  tmp[2].z -= evals.y;

      vec_tmp[0] = cross(tmp[0], tmp[1]);
      vec_tmp[1] = cross(tmp[0], tmp[2]);
      vec_tmp[2] = cross(tmp[1], tmp[2]);

      len1 = dot(vec_tmp[0], vec_tmp[0]);
      len2 = dot(vec_tmp[1], vec_tmp[1]);
      len3 = dot(vec_tmp[2], vec_tmp[2]);

      if (len1 >= len2 && len1 >= len3)
      {
        mmax[1] = len1;
        evecs[1] = vec_tmp[0] * rsqrtf(len1);
        min_el = len1 <= mmax[min_el] ? 1 : min_el;
        max_el = len1  > mmax[max_el] ? 1 : max_el;
      }
      else if (len2 >= len1 && len2 >= len3)
      {
        mmax[1] = len2;
        evecs[1] = vec_tmp[1] * rsqrtf(len2);
        min_el = len2 <= mmax[min_el] ? 1 : min_el;
        max_el = len2  > mmax[max_el] ? 1 : max_el;
      }
      else
      {
        mmax[1] = len3;
        evecs[1] = vec_tmp[2] * rsqrtf(len3);
        min_el = len3 <= mmax[min_el] ? 1 : min_el;
        max_el = len3  > mmax[max_el] ? 1 : max_el;
      }

      tmp[0] = row0();  tmp[1] = row1();  tmp[2] = row2();
      tmp[0].x -= evals.x;  tmp[1].y -= evals.x;  tmp[2].z -= evals.x;

      vec_tmp[0] = cross(tmp[0], tmp[1]);
      vec_tmp[1] = cross(tmp[0], tmp[2]);
      vec_tmp[2] = cross(tmp[1], tmp[2]);

      len1 = dot(vec_tmp[0], vec_tmp[0]);
      len2 = dot(vec_tmp[1], vec_tmp[1]);
      len3 = dot(vec_tmp[2], vec_tmp[2]);

      if (len1 >= len2 && len1 >= len3)
      {
        mmax[0] = len1;
        evecs[0] = vec_tmp[0] * rsqrtf(len1);
        min_el = len3 <= mmax[min_el] ? 0 : min_el;
        max_el = len3  > mmax[max_el] ? 0 : max_el;
      }
      else if (len2 >= len1 && len2 >= len3)
      {
        mmax[0] = len2;
        evecs[0] = vec_tmp[1] * rsqrtf(len2);
        min_el = len3 <= mmax[min_el] ? 0 : min_el;
        max_el = len3  > mmax[max_el] ? 0 : max_el;
      }
      else
      {
        mmax[0] = len3;
        evecs[0] = vec_tmp[2] * rsqrtf(len3);
        min_el = len3 <= mmax[min_el] ? 0 : min_el;
        max_el = len3  > mmax[max_el] ? 0 : max_el;
      }

      unsigned mid_el = 3 - min_el - max_el;
      evecs[min_el] = normalized( cross( evecs[(min_el+1) % 3], evecs[(min_el+2) % 3] ) );
      evecs[mid_el] = normalized( cross( evecs[(mid_el+1) % 3], evecs[(mid_el+2) % 3] ) );
    }
    // Rescale back to the original size.
    evals *= scale;
  }

private:
  // Packed upper triangle of the symmetric 3x3 matrix: [m00 m01 m02 m11 m12 m22].
  volatile float* mat_pkg;
  __device__ __forceinline__ float m00() const { return mat_pkg[0]; }
  __device__ __forceinline__ float m01() const { return mat_pkg[1]; }
  __device__ __forceinline__ float m02() const { return mat_pkg[2]; }
  __device__ __forceinline__ float m10() const { return mat_pkg[1]; }
  __device__ __forceinline__ float m11() const { return mat_pkg[3]; }
  __device__ __forceinline__ float m12() const { return mat_pkg[4]; }
  __device__ __forceinline__ float m20() const { return mat_pkg[2]; }
  __device__ __forceinline__ float m21() const { return mat_pkg[4]; }
  __device__ __forceinline__ float m22() const { return mat_pkg[5]; }

  __device__ __forceinline__ float3 row0() const { return make_float3( m00(), m01(), m02() ); }
  __device__ __forceinline__ float3 row1() const { return make_float3( m10(), m11(), m12() ); }
  __device__ __forceinline__ float3 row2() const { return make_float3( m20(), m21(), m22() ); }
  __device__ __forceinline__ static bool
  isMuchSmallerThan(float x, float y)
  {
    // Copied from <eigen>/include/Eigen/src/Core/NumTraits.h
    const float prec_sqr = numeric_limits<float>::epsilon() * numeric_limits<float>::epsilon();
    return x * x <= prec_sqr * y * y;
  }
};
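// Illustrative sketch (not part of the original header): Eigen33 diagonalizes a
// symmetric 3x3 matrix packed as its upper triangle, e.g. a point-neighbourhood
// covariance when estimating a surface normal. Variable names are hypothetical:
//
//   volatile float cov[6] = { /* xx, xy, xz, yy, yz, zz */ };
//   Eigen33 eigen33(cov);
//   Eigen33::Mat33 tmp, vec_tmp, evecs;
//   float3 evals;
//   eigen33.compute(tmp, vec_tmp, evecs, evals);   // evals.x <= evals.y <= evals.z
//   float3 normal = normalized(evecs[0]);          // eigenvector of the smallest eigenvalue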
struct Block
{
  static __device__ __forceinline__ unsigned int
  stride() { return blockDim.x * blockDim.y * blockDim.z; }

  static __device__ __forceinline__ int
  flattenedThreadId() { return threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x; }
  template<int CTA_SIZE, typename T, class BinOp>
  static __device__ __forceinline__ void
  reduce(volatile T* buffer, BinOp op)
  {
    int tid = flattenedThreadId();
    T val = buffer[tid];

    if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
    if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
    if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
    if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }

    if (tid < 32)
    {
      if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
      if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
      if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid +  8]); }
      if (CTA_SIZE >=  8) { buffer[tid] = val = op(val, buffer[tid +  4]); }
      if (CTA_SIZE >=  4) { buffer[tid] = val = op(val, buffer[tid +  2]); }
      if (CTA_SIZE >=  2) { buffer[tid] = val = op(val, buffer[tid +  1]); }
    }
  }
  template<int CTA_SIZE, typename T, class BinOp>
  static __device__ __forceinline__ T
  reduce(volatile T* buffer, T init, BinOp op)
  {
    int tid = flattenedThreadId();
    T val = buffer[tid] = init;
    __syncthreads();

    if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
    if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
    if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
    if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }

    if (tid < 32)
    {
      if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
      if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
      if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid +  8]); }
      if (CTA_SIZE >=  8) { buffer[tid] = val = op(val, buffer[tid +  4]); }
      if (CTA_SIZE >=  4) { buffer[tid] = val = op(val, buffer[tid +  2]); }
      if (CTA_SIZE >=  2) { buffer[tid] = val = op(val, buffer[tid +  1]); }
    }
    __syncthreads();
    return buffer[0];
  }
};
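// Illustrative sketch (not part of the original header): a block-wide sum with
// Block::reduce. The kernel name, block size and functor are hypothetical; the
// shared buffer must hold one element per thread and CTA_SIZE must match blockDim.
//
//   struct plus { __device__ float operator()(float a, float b) const { return a + b; } };
//
//   __global__ void sumKernel(const float* in, float* out)   // launched with 256 threads
//   {
//     __shared__ volatile float buf[256];
//     float total = Block::reduce<256>(buf, in[blockIdx.x * 256 + threadIdx.x], plus());
//     if (threadIdx.x == 0)
//       out[blockIdx.x] = total;
//   }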
struct Warp
{
  enum
  {
    LOG_WARP_SIZE = 5,
    WARP_SIZE = 1 << LOG_WARP_SIZE
  };

  /** \brief Returns the warp lane ID of the calling thread. */
  static __device__ __forceinline__ unsigned int
  laneId()
  {
    unsigned int ret;
    asm("mov.u32 %0, %laneid;" : "=r"(ret) );
    return ret;
  }
  static __device__ __forceinline__ unsigned int
  id()
  {
    int tid = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
    return tid >> LOG_WARP_SIZE;
  }
  static __device__ __forceinline__ int
  laneMaskLt()
  {
#if (__CUDA_ARCH__ >= 200)
    unsigned int ret;
    asm("mov.u32 %0, %lanemask_lt;" : "=r"(ret) );
    return ret;
#else
    return 0xFFFFFFFF >> (32 - laneId());
#endif
  }

  static __device__ __forceinline__ int
  binaryExclScan(int ballot_mask)
  {
    return __popc(Warp::laneMaskLt() & ballot_mask);
  }
};
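// Illustrative sketch (not part of the original header): laneMaskLt/binaryExclScan
// support warp-level stream compaction. Assumes CUDA 9+ (__ballot_sync); the
// variables is_valid, item, warp_base and out are hypothetical:
//
//   unsigned int vote = __ballot_sync(__activemask(), is_valid);
//   int offset = Warp::binaryExclScan(vote);   // lower-numbered lanes with is_valid set
//   if (is_valid)
//     out[warp_base + offset] = item;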
struct Emulation
{
  static __device__ __forceinline__ int
  warp_reduce(volatile int* ptr, const unsigned int tid)
  {
    const unsigned int lane = tid & 31; // index of the thread within its warp (0..31)

    if (lane < 16)
    {
      int partial = ptr[tid];

      ptr[tid] = partial = partial + ptr[tid + 16];
      ptr[tid] = partial = partial + ptr[tid +  8];
      ptr[tid] = partial = partial + ptr[tid +  4];
      ptr[tid] = partial = partial + ptr[tid +  2];
      ptr[tid] = partial = partial + ptr[tid +  1];
    }
    return ptr[tid - lane];
  }
  static __forceinline__ __device__ int
  Ballot(int predicate, volatile int* cta_buffer)
  {
#if CUDA_VERSION >= 9000
    return __ballot_sync(__activemask(), predicate);
#elif __CUDA_ARCH__ >= 200
    return __ballot(predicate);
#else
    int tid = Block::flattenedThreadId();
    cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
    return warp_reduce(cta_buffer, tid);
#endif
  }
  static __forceinline__ __device__ bool
  All(int predicate, volatile int* cta_buffer)
  {
#if CUDA_VERSION >= 9000
    return __all_sync(__activemask(), predicate);
#elif __CUDA_ARCH__ >= 200
    return __all(predicate);
#else
    int tid = Block::flattenedThreadId();
    cta_buffer[tid] = predicate ? 1 : 0;
    return warp_reduce(cta_buffer, tid) == 32;
#endif
  }
};

} // namespace device
} // namespace pcl

#endif /* PCL_GPU_KINFU_CUDA_UTILS_HPP_ */