Blender  V3.3
math_matrix.h
Go to the documentation of this file.
1 /* SPDX-License-Identifier: Apache-2.0
2  * Copyright 2011-2022 Blender Foundation */
3 
4 #ifndef __UTIL_MATH_MATRIX_H__
5 #define __UTIL_MATH_MATRIX_H__
6 
8 
9 #define MAT(A, size, row, col) A[(row) * (size) + (col)]
10 
11 /* Variants that use a constant stride on GPUS. */
12 #ifdef __KERNEL_GPU__
13 # define MATS(A, n, r, c, s) A[((r) * (n) + (c)) * (s)]
14 /* Element access when only the lower-triangular elements are stored. */
15 # define MATHS(A, r, c, s) A[((r) * ((r) + 1) / 2 + (c)) * (s)]
16 # define VECS(V, i, s) V[(i) * (s)]
17 #else
18 # define MATS(A, n, r, c, s) MAT(A, n, r, c)
19 # define MATHS(A, r, c, s) A[(r) * ((r) + 1) / 2 + (c)]
20 # define VECS(V, i, s) V[i]
21 #endif
22 
23 /* Zeroing helpers. */
24 
26 {
27  for (int i = 0; i < n; i++) {
28  v[i] = 0.0f;
29  }
30 }
31 
33 {
34  for (int row = 0; row < n; row++) {
35  for (int col = 0; col <= row; col++) {
36  MAT(A, n, row, col) = 0.0f;
37  }
38  }
39 }
40 
41 /* Elementary vector operations. */
42 
44  ccl_private const float *ccl_restrict b,
45  int n)
46 {
47  for (int i = 0; i < n; i++) {
48  a[i] += b[i];
49  }
50 }
51 
53  ccl_private const float *ccl_restrict b,
54  int n)
55 {
56  for (int i = 0; i < n; i++) {
57  a[i] *= b[i];
58  }
59 }
60 
62  ccl_private const float *ccl_restrict b,
63  int astride,
64  int n)
65 {
66  for (int i = 0; i < n; i++) {
67  a[i * astride] *= b[i];
68  }
69 }
70 
72 {
73  for (int i = 0; i < n; i++) {
74  a[i] *= b;
75  }
76 }
77 
79  ccl_private const float *ccl_restrict b,
80  int n)
81 {
82  for (int i = 0; i < n; i++) {
83  a[i] = max(a[i], b[i]);
84  }
85 }
86 
88 {
89  for (int i = 0; i < n; i++) {
90  v[i] += w * x[i];
91  }
92 }
93 
95  ccl_global float3 *v, int n, ccl_private float *x, float3 w, int stride)
96 {
97  for (int i = 0; i < n; i++) {
98  ccl_global float *elem = (ccl_global float *)(v + i * stride);
99  atomic_add_and_fetch_float(elem + 0, w.x * x[i]);
100  atomic_add_and_fetch_float(elem + 1, w.y * x[i]);
101  atomic_add_and_fetch_float(elem + 2, w.z * x[i]);
102  }
103 }
104 
105 /* Elementary matrix operations.
106  * NOTE: TriMatrix refers to a square matrix that is symmetric,
107  * and therefore its upper-triangular part isn't stored. */
108 
110  int n,
111  float val,
112  int stride)
113 {
114  for (int row = 0; row < n; row++) {
115  MATHS(A, row, row, stride) += val;
116  }
117 }
118 
119 /* Add Gramian matrix of v to A.
120  * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j]. */
122  int n,
123  ccl_private const float *ccl_restrict v,
124  float weight)
125 {
126  for (int row = 0; row < n; row++) {
127  for (int col = 0; col <= row; col++) {
128  MAT(A, n, row, col) += v[row] * v[col] * weight;
129  }
130  }
131 }
132 
133 /* Add Gramian matrix of v to A.
134  * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j]. */
136  ccl_global float *A, int n, ccl_private const float *ccl_restrict v, float weight, int stride)
137 {
138  for (int row = 0; row < n; row++) {
139  for (int col = 0; col <= row; col++) {
140  atomic_add_and_fetch_float(&MATHS(A, row, col, stride), v[row] * v[col] * weight);
141  }
142  }
143 }
144 
146  int n,
147  ccl_private const float *ccl_restrict v,
148  float weight)
149 {
150  for (int row = 0; row < n; row++) {
151  for (int col = 0; col <= row; col++) {
152  atomic_add_and_fetch_float(&MATHS(A, row, col, 1), v[row] * v[col] * weight);
153  }
154  }
155 }
156 
157 /* Transpose matrix A in place. */
159 {
160  for (int i = 0; i < n; i++) {
161  for (int j = 0; j < i; j++) {
162  float temp = MATS(A, n, i, j, stride);
163  MATS(A, n, i, j, stride) = MATS(A, n, j, i, stride);
164  MATS(A, n, j, i, stride) = temp;
165  }
166  }
167 }
168 
169 /* Solvers for matrix problems */
170 
171 /* In-place Cholesky-Banachiewicz decomposition of the square, positive-definite matrix A
172  * into a lower triangular matrix L so that A = L*L^T. A is being overwritten by L.
173  * Also, only the lower triangular part of A is ever accessed. */
175 {
176  for (int row = 0; row < n; row++) {
177  for (int col = 0; col <= row; col++) {
178  float sum_col = MATHS(A, row, col, stride);
179  for (int k = 0; k < col; k++) {
180  sum_col -= MATHS(A, row, k, stride) * MATHS(A, col, k, stride);
181  }
182  if (row == col) {
183  sum_col = sqrtf(max(sum_col, 0.0f));
184  }
185  else {
186  sum_col /= MATHS(A, col, col, stride);
187  }
188  MATHS(A, row, col, stride) = sum_col;
189  }
190  }
191 }
192 
193 /* Solve A*S=y for S given A and y,
194  * where A is symmetrical positive-semi-definite and both inputs are destroyed in the process.
195  *
196  * We can apply Cholesky decomposition to find a lower triangular L so that L*Lt = A.
197  * With that we get (L*Lt)*S = L*(Lt*S) = L*b = y, defining b as Lt*S.
198  * Since L is lower triangular, finding b is relatively easy since y is known.
199  * Then, the remaining problem is Lt*S = b, which again can be solved easily.
200  *
201  * This is useful for solving the normal equation S=inv(Xt*W*X)*Xt*W*y, since Xt*W*X is
202  * symmetrical positive-semidefinite by construction,
203  * so we can just use this function with A=Xt*W*X and y=Xt*W*y. */
206  int n,
207  int stride)
208 {
209  /* Since the first entry of the design row is always 1, the upper-left element of XtWX is a good
210  * heuristic for the amount of pixels considered (with weighting),
211  * therefore the amount of correction is scaled based on it. */
212  math_trimatrix_add_diagonal(A, n, 3e-7f * A[0], stride); /* Improve the numerical stability. */
213  math_trimatrix_cholesky(A, n, stride); /* Replace A with L so that L*Lt = A. */
214 
215  /* Use forward substitution to solve L*b = y, replacing y by b. */
216  for (int row = 0; row < n; row++) {
217  float3 sum = VECS(y, row, stride);
218  for (int col = 0; col < row; col++)
219  sum -= MATHS(A, row, col, stride) * VECS(y, col, stride);
220  VECS(y, row, stride) = sum / MATHS(A, row, row, stride);
221  }
222 
223  /* Use backward substitution to solve Lt*S = b, replacing b by S. */
224  for (int row = n - 1; row >= 0; row--) {
225  float3 sum = VECS(y, row, stride);
226  for (int col = row + 1; col < n; col++)
227  sum -= MATHS(A, col, row, stride) * VECS(y, col, stride);
228  VECS(y, row, stride) = sum / MATHS(A, row, row, stride);
229  }
230 }
231 
232 /* Perform the Jacobi Eigenvalue Method on matrix A.
233  * A is assumed to be a symmetrical matrix, therefore only the lower-triangular part is ever
234  * accessed. The algorithm overwrites the contents of A.
235  *
236  * After returning, A will be overwritten with D, which is (almost) diagonal,
237  * and V will contain the eigenvectors of the original A in its rows (!),
238  * so that A = V^T*D*V. Therefore, the diagonal elements of D are the (sorted) eigenvalues of A.
239  */
241  ccl_global float *V,
242  int n,
243  int v_stride)
244 {
245  const float singular_epsilon = 1e-9f;
246 
247  for (int row = 0; row < n; row++) {
248  for (int col = 0; col < n; col++) {
249  MATS(V, n, row, col, v_stride) = (col == row) ? 1.0f : 0.0f;
250  }
251  }
252 
253  for (int sweep = 0; sweep < 8; sweep++) {
254  float off_diagonal = 0.0f;
255  for (int row = 1; row < n; row++) {
256  for (int col = 0; col < row; col++) {
257  off_diagonal += fabsf(MAT(A, n, row, col));
258  }
259  }
260  if (off_diagonal < 1e-7f) {
261  /* The matrix has nearly reached diagonal form.
262  * Since the eigenvalues are only used to determine truncation, their exact values aren't
263  * required - a relative error of a few ULPs won't matter at all. */
264  break;
265  }
266 
267  /* Set the threshold for the small element rotation skip in the first sweep:
268  * Skip all elements that are less than a tenth of the average off-diagonal element. */
269  float threshold = 0.2f * off_diagonal / (n * n);
270 
271  for (int row = 1; row < n; row++) {
272  for (int col = 0; col < row; col++) {
273  /* Perform a Jacobi rotation on this element that reduces it to zero. */
274  float element = MAT(A, n, row, col);
275  float abs_element = fabsf(element);
276 
277  /* If we're in a later sweep and the element already is very small,
278  * just set it to zero and skip the rotation. */
279  if (sweep > 3 && abs_element <= singular_epsilon * fabsf(MAT(A, n, row, row)) &&
280  abs_element <= singular_epsilon * fabsf(MAT(A, n, col, col))) {
281  MAT(A, n, row, col) = 0.0f;
282  continue;
283  }
284 
285  if (element == 0.0f) {
286  continue;
287  }
288 
289  /* If we're in one of the first sweeps and the element is smaller than the threshold,
290  * skip it. */
291  if (sweep < 3 && (abs_element < threshold)) {
292  continue;
293  }
294 
295  /* Determine rotation: The rotation is characterized by its angle phi - or,
296  * in the actual implementation, sin(phi) and cos(phi).
297  * To find those, we first compute their ratio - that might be unstable if the angle
298  * approaches 90°, so there's a fallback for that case.
299  * Then, we compute sin(phi) and cos(phi) themselves. */
300  float singular_diff = MAT(A, n, row, row) - MAT(A, n, col, col);
301  float ratio;
302  if (abs_element > singular_epsilon * fabsf(singular_diff)) {
303  float cot_2phi = 0.5f * singular_diff / element;
304  ratio = 1.0f / (fabsf(cot_2phi) + sqrtf(1.0f + cot_2phi * cot_2phi));
305  if (cot_2phi < 0.0f)
306  ratio = -ratio; /* Copy sign. */
307  }
308  else {
309  ratio = element / singular_diff;
310  }
311 
312  float c = 1.0f / sqrtf(1.0f + ratio * ratio);
313  float s = ratio * c;
314  /* To improve numerical stability by avoiding cancellation, the update equations are
315  * reformulized to use sin(phi) and tan(phi/2) instead. */
316  float tan_phi_2 = s / (1.0f + c);
317 
318  /* Update the singular values in the diagonal. */
319  float singular_delta = ratio * element;
320  MAT(A, n, row, row) += singular_delta;
321  MAT(A, n, col, col) -= singular_delta;
322 
323  /* Set the element itself to zero. */
324  MAT(A, n, row, col) = 0.0f;
325 
326  /* Perform the actual rotations on the matrices. */
327 #define ROT(M, r1, c1, r2, c2, stride) \
328  { \
329  float M1 = MATS(M, n, r1, c1, stride); \
330  float M2 = MATS(M, n, r2, c2, stride); \
331  MATS(M, n, r1, c1, stride) -= s * (M2 + tan_phi_2 * M1); \
332  MATS(M, n, r2, c2, stride) += s * (M1 - tan_phi_2 * M2); \
333  }
334 
335  /* Split into three parts to ensure correct accesses since we only store the
336  * lower-triangular part of A. */
337  for (int i = 0; i < col; i++)
338  ROT(A, col, i, row, i, 1);
339  for (int i = col + 1; i < row; i++)
340  ROT(A, i, col, row, i, 1);
341  for (int i = row + 1; i < n; i++)
342  ROT(A, i, col, i, row, 1);
343 
344  for (int i = 0; i < n; i++)
345  ROT(V, col, i, row, i, v_stride);
346 #undef ROT
347  }
348  }
349  }
350 
351  /* Sort eigenvalues and the associated eigenvectors. */
352  for (int i = 0; i < n - 1; i++) {
353  float v = MAT(A, n, i, i);
354  int k = i;
355  for (int j = i; j < n; j++) {
356  if (MAT(A, n, j, j) >= v) {
357  v = MAT(A, n, j, j);
358  k = j;
359  }
360  }
361  if (k != i) {
362  /* Swap eigenvalues. */
363  MAT(A, n, k, k) = MAT(A, n, i, i);
364  MAT(A, n, i, i) = v;
365  /* Swap eigenvectors. */
366  for (int j = 0; j < n; j++) {
367  float v = MATS(V, n, i, j, v_stride);
368  MATS(V, n, i, j, v_stride) = MATS(V, n, k, j, v_stride);
369  MATS(V, n, k, j, v_stride) = v;
370  }
371  }
372  }
373 }
374 
375 #ifdef __KERNEL_SSE3__
376 ccl_device_inline void math_vector_zero_sse(float4 *A, int n)
377 {
378  for (int i = 0; i < n; i++) {
379  A[i] = zero_float4();
380  }
381 }
382 
383 ccl_device_inline void math_matrix_zero_sse(float4 *A, int n)
384 {
385  for (int row = 0; row < n; row++) {
386  for (int col = 0; col <= row; col++) {
387  MAT(A, n, row, col) = zero_float4();
388  }
389  }
390 }
391 
392 /* Add Gramian matrix of v to A.
393  * The Gramian matrix of v is v^T*v, so element (i,j) is v[i]*v[j]. */
394 ccl_device_inline void math_matrix_add_gramian_sse(float4 *A,
395  int n,
396  const float4 *ccl_restrict v,
397  float4 weight)
398 {
399  for (int row = 0; row < n; row++) {
400  for (int col = 0; col <= row; col++) {
401  MAT(A, n, row, col) = MAT(A, n, row, col) + v[row] * v[col] * weight;
402  }
403  }
404 }
405 
406 ccl_device_inline void math_vector_add_sse(float4 *V, int n, const float4 *ccl_restrict a)
407 {
408  for (int i = 0; i < n; i++) {
409  V[i] += a[i];
410  }
411 }
412 
413 ccl_device_inline void math_vector_mul_sse(float4 *V, int n, const float4 *ccl_restrict a)
414 {
415  for (int i = 0; i < n; i++) {
416  V[i] *= a[i];
417  }
418 }
419 
420 ccl_device_inline void math_vector_max_sse(float4 *a, const float4 *ccl_restrict b, int n)
421 {
422  for (int i = 0; i < n; i++) {
423  a[i] = max(a[i], b[i]);
424  }
425 }
426 
427 ccl_device_inline void math_matrix_hsum(float *A, int n, const float4 *ccl_restrict B)
428 {
429  for (int row = 0; row < n; row++) {
430  for (int col = 0; col <= row; col++) {
431  MAT(A, n, row, col) = reduce_add(MAT(B, n, row, col))[0];
432  }
433  }
434 }
435 #endif
436 
437 #undef MAT
438 
440 
441 #endif /* __UTIL_MATH_MATRIX_H__ */
_GL_VOID GLfloat value _GL_VOID_RET _GL_VOID const GLuint GLboolean *residences _GL_BOOL_RET _GL_VOID GLsizei GLfloat GLfloat GLfloat GLfloat const GLubyte *bitmap _GL_VOID_RET _GL_VOID GLenum const void *lists _GL_VOID_RET _GL_VOID const GLdouble *equation _GL_VOID_RET _GL_VOID GLdouble GLdouble blue _GL_VOID_RET _GL_VOID GLfloat GLfloat blue _GL_VOID_RET _GL_VOID GLint GLint blue _GL_VOID_RET _GL_VOID GLshort GLshort blue _GL_VOID_RET _GL_VOID GLubyte GLubyte blue _GL_VOID_RET _GL_VOID GLuint GLuint blue _GL_VOID_RET _GL_VOID GLushort GLushort blue _GL_VOID_RET _GL_VOID GLbyte GLbyte GLbyte alpha _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble alpha _GL_VOID_RET _GL_VOID GLfloat GLfloat GLfloat alpha _GL_VOID_RET _GL_VOID GLint GLint GLint alpha _GL_VOID_RET _GL_VOID GLshort GLshort GLshort alpha _GL_VOID_RET _GL_VOID GLubyte GLubyte GLubyte alpha _GL_VOID_RET _GL_VOID GLuint GLuint GLuint alpha _GL_VOID_RET _GL_VOID GLushort GLushort GLushort alpha _GL_VOID_RET _GL_VOID GLenum mode _GL_VOID_RET _GL_VOID GLint y
_GL_VOID GLfloat value _GL_VOID_RET _GL_VOID const GLuint GLboolean *residences _GL_BOOL_RET _GL_VOID GLsizei GLfloat GLfloat GLfloat GLfloat const GLubyte *bitmap _GL_VOID_RET _GL_VOID GLenum const void *lists _GL_VOID_RET _GL_VOID const GLdouble *equation _GL_VOID_RET _GL_VOID GLdouble GLdouble blue _GL_VOID_RET _GL_VOID GLfloat GLfloat blue _GL_VOID_RET _GL_VOID GLint GLint blue _GL_VOID_RET _GL_VOID GLshort GLshort blue _GL_VOID_RET _GL_VOID GLubyte GLubyte blue _GL_VOID_RET _GL_VOID GLuint GLuint blue _GL_VOID_RET _GL_VOID GLushort GLushort blue _GL_VOID_RET _GL_VOID GLbyte GLbyte GLbyte alpha _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble alpha _GL_VOID_RET _GL_VOID GLfloat GLfloat GLfloat alpha _GL_VOID_RET _GL_VOID GLint GLint GLint alpha _GL_VOID_RET _GL_VOID GLshort GLshort GLshort alpha _GL_VOID_RET _GL_VOID GLubyte GLubyte GLubyte alpha _GL_VOID_RET _GL_VOID GLuint GLuint GLuint alpha _GL_VOID_RET _GL_VOID GLushort GLushort GLushort alpha _GL_VOID_RET _GL_VOID GLenum mode _GL_VOID_RET _GL_VOID GLint GLsizei GLsizei GLenum type _GL_VOID_RET _GL_VOID GLsizei GLenum GLenum const void *pixels _GL_VOID_RET _GL_VOID const void *pointer _GL_VOID_RET _GL_VOID GLdouble v _GL_VOID_RET _GL_VOID GLfloat v _GL_VOID_RET _GL_VOID GLint GLint i2 _GL_VOID_RET _GL_VOID GLint j _GL_VOID_RET _GL_VOID GLfloat param _GL_VOID_RET _GL_VOID GLint param _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble GLdouble GLdouble zFar _GL_VOID_RET _GL_UINT GLdouble *equation _GL_VOID_RET _GL_VOID GLenum GLint *params _GL_VOID_RET _GL_VOID GLenum GLfloat *v _GL_VOID_RET _GL_VOID GLenum GLfloat *params _GL_VOID_RET _GL_VOID GLfloat *values _GL_VOID_RET _GL_VOID GLushort *values _GL_VOID_RET _GL_VOID GLenum GLfloat *params _GL_VOID_RET _GL_VOID GLenum GLdouble *params _GL_VOID_RET _GL_VOID GLenum GLint *params _GL_VOID_RET _GL_VOID GLsizei stride
float float4[4]
#define atomic_add_and_fetch_float(p, x)
Definition: atomic.h:12
__forceinline int reduce_add(const avxi &v)
Definition: avxi.h:696
ATTR_WARN_UNUSED_RESULT const void * element
ATTR_WARN_UNUSED_RESULT const BMVert * v
#define A
SIMD_FORCE_INLINE const btScalar & w() const
Return the w value.
Definition: btQuadWord.h:119
static T sum(const btAlignedObjectArray< T > &items)
#define ccl_restrict
Definition: cuda/compat.h:50
#define ccl_device
Definition: cuda/compat.h:32
#define ccl_private
Definition: cuda/compat.h:48
#define ccl_device_inline
Definition: cuda/compat.h:34
#define ccl_global
Definition: cuda/compat.h:43
#define CCL_NAMESPACE_END
Definition: cuda/compat.h:9
uint col
ccl_gpu_kernel_postfix ccl_global float int int int int float threshold
ccl_device_inline float4 zero_float4()
Definition: math_float4.h:92
#define MAT(A, size, row, col)
Definition: math_matrix.h:9
ccl_device_inline void math_vec3_add(ccl_private float3 *v, int n, ccl_private float *x, float3 w)
Definition: math_matrix.h:87
ccl_device void math_trimatrix_cholesky(ccl_global float *A, int n, int stride)
Definition: math_matrix.h:174
ccl_device_inline void math_matrix_transpose(ccl_global float *A, int n, int stride)
Definition: math_matrix.h:158
ccl_device_inline void math_vector_add(ccl_private float *a, ccl_private const float *ccl_restrict b, int n)
Definition: math_matrix.h:43
ccl_device_inline void math_vec3_add_strided(ccl_global float3 *v, int n, ccl_private float *x, float3 w, int stride)
Definition: math_matrix.h:94
ccl_device_inline void math_trimatrix_add_diagonal(ccl_global float *A, int n, float val, int stride)
Definition: math_matrix.h:109
ccl_device_inline void math_matrix_zero(ccl_private float *A, int n)
Definition: math_matrix.h:32
ccl_device_inline void math_vector_mul(ccl_private float *a, ccl_private const float *ccl_restrict b, int n)
Definition: math_matrix.h:52
#define MATHS(A, r, c, s)
Definition: math_matrix.h:19
ccl_device_inline void math_trimatrix_add_gramian_strided(ccl_global float *A, int n, ccl_private const float *ccl_restrict v, float weight, int stride)
Definition: math_matrix.h:135
#define ROT(M, r1, c1, r2, c2, stride)
ccl_device_inline void math_vector_max(ccl_private float *a, ccl_private const float *ccl_restrict b, int n)
Definition: math_matrix.h:78
ccl_device_inline void math_vector_scale(ccl_private float *a, float b, int n)
Definition: math_matrix.h:71
ccl_device_inline void math_trimatrix_vec3_solve(ccl_global float *A, ccl_global float3 *y, int n, int stride)
Definition: math_matrix.h:204
#define MATS(A, n, r, c, s)
Definition: math_matrix.h:18
ccl_device void math_matrix_jacobi_eigendecomposition(ccl_private float *A, ccl_global float *V, int n, int v_stride)
Definition: math_matrix.h:240
#define VECS(V, i, s)
Definition: math_matrix.h:20
ccl_device_inline void math_vector_mul_strided(ccl_global float *a, ccl_private const float *ccl_restrict b, int astride, int n)
Definition: math_matrix.h:61
ccl_device_inline void math_matrix_add_gramian(ccl_private float *A, int n, ccl_private const float *ccl_restrict v, float weight)
Definition: math_matrix.h:121
ccl_device_inline void math_trimatrix_add_gramian(ccl_global float *A, int n, ccl_private const float *ccl_restrict v, float weight)
Definition: math_matrix.h:145
ccl_device_inline void math_vector_zero(ccl_private float *v, int n)
Definition: math_matrix.h:25
#define B
#define fabsf(x)
Definition: metal/compat.h:219
#define sqrtf(x)
Definition: metal/compat.h:243
static unsigned c
Definition: RandGen.cpp:83
static unsigned a[3]
Definition: RandGen.cpp:78
static const pxr::TfToken b("b", pxr::TfToken::Immortal)
float max
CCL_NAMESPACE_BEGIN struct Window V