#ifndef __UTIL_SSEI_H__
#define __UTIL_SSEI_H__

CCL_NAMESPACE_BEGIN

#ifdef __KERNEL_SSE2__
__forceinline const __m128i cast(const __m128 &a)
{
  return _mm_castps_si128(a);
}
__forceinline const ssei operator-(const ssei &a)
{
  return _mm_sub_epi32(_mm_setzero_si128(), a.m128);
}
# if defined(__KERNEL_SSSE3__)
__forceinline const ssei abs(const ssei &a)
{
  return _mm_abs_epi32(a.m128);
}
# endif
__forceinline const ssei operator+(const ssei &a, const ssei &b)
{
  return _mm_add_epi32(a.m128, b.m128);
}

__forceinline const ssei operator-(const ssei &a, const ssei &b)
{
  return _mm_sub_epi32(a.m128, b.m128);
}
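/* Usage sketch (assumes the ssei(int, int, int, int) constructor defined
 * earlier in this header): per-lane arithmetic works like scalar arithmetic,
 * four elements at a time.
 *
 *   const ssei a(1, 2, 3, 4);
 *   const ssei b(10, 20, 30, 40);
 *   const ssei sum = a + b;   // (11, 22, 33, 44)
 *   const ssei diff = b - a;  // (9, 18, 27, 36)
 */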
# if defined(__KERNEL_SSE41__)
__forceinline const ssei operator*(const ssei &a, const ssei &b)
{
  return _mm_mullo_epi32(a.m128, b.m128);
}
# endif
__forceinline const ssei operator&(const ssei &a, const ssei &b)
{
  return _mm_and_si128(a.m128, b.m128);
}

__forceinline const ssei operator|(const ssei &a, const ssei &b)
{
  return _mm_or_si128(a.m128, b.m128);
}

__forceinline const ssei operator^(const ssei &a, const ssei &b)
{
  return _mm_xor_si128(a.m128, b.m128);
}
__forceinline const ssei operator<<(const ssei &a, const int32_t n)
{
  return _mm_slli_epi32(a.m128, n);
}
__forceinline const ssei operator>>(const ssei &a, const int32_t n)
{
  return _mm_srai_epi32(a.m128, n);
}
__forceinline const ssei andnot(const ssei &a, const ssei &b)
{
  return _mm_andnot_si128(a.m128, b.m128);
}
__forceinline const ssei andnot(const sseb &a, const ssei &b)
{
  return _mm_andnot_si128(cast(a.m128), b.m128);
}
__forceinline const ssei andnot(const ssei &a, const sseb &b)
{
  return _mm_andnot_si128(a.m128, cast(b.m128));
}
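/* Usage sketch (assumes the ssei(int, int, int, int) constructor):
 * andnot(a, b) computes (~a & b) per lane, which is handy for clearing
 * already-handled lanes from a working bit set.
 *
 *   const ssei bits(0xF, 0xF, 0xF, 0xF);
 *   const ssei done(0x3, 0x0, 0xF, 0x1);
 *   const ssei remaining = andnot(done, bits);  // (0xC, 0xF, 0x0, 0xE)
 */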
__forceinline const ssei sra(const ssei &a, const int32_t b)
{
  return _mm_srai_epi32(a.m128, b);
}
__forceinline const ssei srl(const ssei &a, const int32_t b)
{
  return _mm_srli_epi32(a.m128, b);
}
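/* Usage sketch (assumes the broadcasting ssei(int) constructor):
 * sra() shifts in copies of the sign bit, srl() shifts in zeros, so they
 * only differ for negative lanes.
 *
 *   sra(ssei(-8), 1);  // -4 in every lane (arithmetic shift)
 *   srl(ssei(-8), 1);  // 0x7FFFFFFC in every lane (logical shift)
 */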
# if defined(__KERNEL_SSE41__)
__forceinline const ssei min(const ssei &a, const ssei &b)
{
  return _mm_min_epi32(a.m128, b.m128);
}
__forceinline const ssei min(const ssei &a, const int32_t b)
{
  return min(a, ssei(b));
}
__forceinline const ssei min(const int32_t a, const ssei &b)
{
  return min(ssei(a), b);
}
__forceinline const ssei max(const ssei &a, const ssei &b)
{
  return _mm_max_epi32(a.m128, b.m128);
}
__forceinline const ssei max(const ssei &a, const int32_t b)
{
  return max(a, ssei(b));
}
__forceinline const ssei max(const int32_t a, const ssei &b)
{
  return max(ssei(a), b);
}
# endif
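/* Usage sketch (these overloads are only available on SSE4.1-capable builds;
 * assumes the ssei(int, int, int, int) constructor): clamping each lane to a
 * range with min()/max().
 *
 *   const ssei v(-5, 3, 12, 200);
 *   const ssei clamped = min(max(v, 0), 100);  // (0, 3, 12, 100)
 */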
__forceinline const sseb operator==(const ssei &a, const ssei &b)
{
  return _mm_castsi128_ps(_mm_cmpeq_epi32(a.m128, b.m128));
}

__forceinline const sseb operator<(const ssei &a, const ssei &b)
{
  return _mm_castsi128_ps(_mm_cmplt_epi32(a.m128, b.m128));
}

__forceinline const sseb operator>(const ssei &a, const ssei &b)
{
  return _mm_castsi128_ps(_mm_cmpgt_epi32(a.m128, b.m128));
}
__forceinline const ssei select(const sseb &m, const ssei &t, const ssei &f)
{
# ifdef __KERNEL_SSE41__
  return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m));
# else
  return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f));
# endif
}
__forceinline const ssei select(const int mask, const ssei &t, const ssei &f)
{
# if defined(__KERNEL_SSE41__) && \
    ((!defined(__clang__) && !defined(_MSC_VER)) || defined(__INTEL_COMPILER))
  return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask));
# else
  return select(sseb(mask), t, f);
# endif
}
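/* Usage sketch: branch-free per-lane choice by combining one of the
 * comparison operators above with select(); assumes the
 * ssei(int, int, int, int) constructor.
 *
 *   const ssei a(1, 7, 3, 9);
 *   const ssei b(4, 2, 8, 5);
 *   const ssei lo = select(a < b, a, b);  // per-lane minimum: (1, 2, 3, 5)
 */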
__forceinline ssei unpacklo(const ssei &a, const ssei &b)
{
  return _mm_unpacklo_epi32(a, b);
}
__forceinline ssei unpackhi(const ssei &a, const ssei &b)
{
  return _mm_unpackhi_epi32(a, b);
}
template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const ssei shuffle(const ssei &a)
{
# ifdef __KERNEL_NEON__
  int32x4_t result = shuffle_neon<int32x4_t, i0, i1, i2, i3>(vreinterpretq_s32_m128i(a));
  return vreinterpretq_m128i_s32(result);
# else
  return _mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0));
# endif
}
template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const ssei shuffle(const ssei &a, const ssei &b)
{
# ifdef __KERNEL_NEON__
  int32x4_t result = shuffle_neon<int32x4_t, i0, i1, i2, i3>(vreinterpretq_s32_m128i(a),
                                                             vreinterpretq_s32_m128i(b));
  return vreinterpretq_m128i_s32(result);
# else
  return _mm_castps_si128(
      _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
# endif
}
template<size_t i0> __forceinline const ssei shuffle(const ssei &b)
{
  return shuffle<i0, i0, i0, i0>(b);
}
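/* Usage sketch (assumes the ssei(int, int, int, int) constructor):
 * shuffle<> permutes lanes with compile-time indices, where result lane k
 * takes its value from source lane ik.
 *
 *   const ssei v(10, 20, 30, 40);
 *   shuffle<3, 2, 1, 0>(v);  // reversed: (40, 30, 20, 10)
 *   shuffle<2>(v);           // broadcast lane 2: (30, 30, 30, 30)
 */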
# if defined(__KERNEL_SSE41__)
template<size_t src> __forceinline int extract(const ssei &b)
{
  return _mm_extract_epi32(b, src);
}
template<size_t dst> __forceinline const ssei insert(const ssei &a, const int32_t b)
{
  return _mm_insert_epi32(a, b, dst);
}
# endif
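/* Usage sketch (SSE4.1 path; assumes the ssei(int, int, int, int)
 * constructor): scalar access to individual lanes.
 *
 *   const ssei v(10, 20, 30, 40);
 *   const int x = extract<2>(v);      // 30
 *   const ssei w = insert<0>(v, 99);  // (99, 20, 30, 40)
 */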
# if defined(__KERNEL_SSE41__)
__forceinline const ssei vreduce_min(const ssei &v)
{
  ssei h = min(shuffle<1, 0, 3, 2>(v), v);
  return min(shuffle<2, 3, 0, 1>(h), h);
}
__forceinline const ssei vreduce_max(const ssei &v)
{
  ssei h = max(shuffle<1, 0, 3, 2>(v), v);
  return max(shuffle<2, 3, 0, 1>(h), h);
}
__forceinline const ssei vreduce_add(const ssei &v)
{
  ssei h = shuffle<1, 0, 3, 2>(v) + v;
  return shuffle<2, 3, 0, 1>(h) + h;
}

__forceinline int reduce_min(const ssei &v)
{
# ifdef __KERNEL_NEON__
  return vminvq_s32(vreinterpretq_s32_m128i(v));
# else
  return extract<0>(vreduce_min(v));
# endif
}
__forceinline int reduce_max(const ssei &v)
{
# ifdef __KERNEL_NEON__
  return vmaxvq_s32(vreinterpretq_s32_m128i(v));
# else
  return extract<0>(vreduce_max(v));
# endif
}
__forceinline int reduce_add(const ssei &v)
{
# ifdef __KERNEL_NEON__
  return vaddvq_s32(vreinterpretq_s32_m128i(v));
# else
  return extract<0>(vreduce_add(v));
# endif
}
__forceinline uint32_t select_min(const sseb &valid, const ssei &v)
{
  const ssei a = select(valid, v, ssei((int)pos_inf));
  return __bsf(movemask(valid & (a == vreduce_min(a))));
}
__forceinline uint32_t select_max(const sseb &valid, const ssei &v)
{
  const ssei a = select(valid, v, ssei((int)neg_inf));
  return __bsf(movemask(valid & (a == vreduce_max(a))));
}
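/* Usage sketch (assumes the ssei(int, int, int, int) constructor and an
 * sseb constructed from four bools): index of the smallest element among
 * the lanes flagged by `valid`.
 *
 *   const ssei v(7, 3, 9, 5);
 *   const sseb valid(true, false, true, true);
 *   select_min(valid, v);  // 3, the index of the lane holding 5
 */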
# else

__forceinline int ssei_min(int a, int b)
{
  return (a < b) ? a : b;
}
__forceinline int ssei_max(int a, int b)
{
  return (a > b) ? a : b;
}
__forceinline int reduce_min(const ssei &v)
{
  return ssei_min(ssei_min(v[0], v[1]), ssei_min(v[2], v[3]));
}
__forceinline int reduce_max(const ssei &v)
{
  return ssei_max(ssei_max(v[0], v[1]), ssei_max(v[2], v[3]));
}
__forceinline int reduce_add(const ssei &v)
{
  return v[0] + v[1] + v[2] + v[3];
}

# endif
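/* Usage sketch (assumes the ssei(int, int, int, int) constructor): the
 * horizontal reductions collapse the four lanes into a single scalar.
 *
 *   const ssei v(4, 1, 7, 2);
 *   reduce_min(v);  // 1
 *   reduce_max(v);  // 7
 *   reduce_add(v);  // 14
 */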
__forceinline ssei load4i(const void *const a)
{
  return _mm_load_si128((__m128i *)a);
}
__forceinline void store4i(void *ptr, const ssei &v)
{
  _mm_store_si128((__m128i *)ptr, v);
}
__forceinline void storeu4i(void *ptr, const ssei &v)
{
  _mm_storeu_si128((__m128i *)ptr, v);
}
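/* Usage sketch (assumes the ccl_align() alignment macro used elsewhere in
 * Cycles and the ssei(int, int, int, int) constructor): store4i()/load4i()
 * require 16-byte aligned memory, storeu4i() does not.
 *
 *   ccl_align(16) int buffer[4];
 *   store4i(buffer, ssei(1, 2, 3, 4));
 *   const ssei v = load4i(buffer);
 */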
__forceinline void store4i(const sseb &mask, void *ptr, const ssei &i)
{
# if defined(__KERNEL_AVX__)
  _mm_maskstore_ps((float *)ptr, (__m128i)mask, _mm_castsi128_ps(i));
# else
  *(ssei *)ptr = select(mask, i, *(ssei *)ptr);
# endif
}
__forceinline ssei load4i_nt(void *ptr)
{
# if defined(__KERNEL_SSE41__)
  return _mm_stream_load_si128((__m128i *)ptr);
# else
  return _mm_load_si128((__m128i *)ptr);
# endif
}
__forceinline void store4i_nt(void *ptr, const ssei &v)
{
# if defined(__KERNEL_SSE41__)
  _mm_stream_ps((float *)ptr, _mm_castsi128_ps(v));
# else
  _mm_store_si128((__m128i *)ptr, v);
# endif
}
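/* Usage sketch (assumes the ccl_align() alignment macro): the *_nt variants
 * hint that the data will not be re-read soon, so on SSE4.1 builds the write
 * can stream past the cache instead of polluting it.
 *
 *   ccl_align(16) int results[4];
 *   store4i_nt(results, v);
 */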
ccl_device_inline void print_ssei(const char *label, const ssei &a)
{
  printf("%s: %d %d %d %d\n", label, a[0], a[1], a[2], a[3]);
}

#endif

CCL_NAMESPACE_END

#endif /* __UTIL_SSEI_H__ */