libflame  revision_anchor
Functions | Variables
bl1_dotv2axpyv2b.c File Reference

(r)

Functions

void bl1_sdotv2axpyv2b (int n, float *a1, int inc_a1, float *a2, int inc_a2, float *x, int inc_x, float *kappa1, float *kappa2, float *rho1, float *rho2, float *w, int inc_w)
 
void bl1_ddotv2axpyv2b (int n, double *a1, int inc_a1, double *a2, int inc_a2, double *x, int inc_x, double *kappa1, double *kappa2, double *rho1, double *rho2, double *w, int inc_w)
 
 if (inc_a1 !=1||inc_a2 !=1||inc_x !=1||inc_w !=1)
 
 for (i=0;i< n_run;++i)
 
 if (n_left > 0)
 
void bl1_cdotv2axpyv2b (int n, scomplex *a1, int inc_a1, scomplex *a2, int inc_a2, scomplex *x, int inc_x, scomplex *kappa1, scomplex *kappa2, scomplex *rho1, scomplex *rho2, scomplex *w, int inc_w)
 
void bl1_zdotv2axpyv2b (int n, dcomplex *a1, int inc_a1, dcomplex *a2, int inc_a2, dcomplex *x, int inc_x, dcomplex *kappa1, dcomplex *kappa2, dcomplex *rho1, dcomplex *rho2, dcomplex *w, int inc_w)
 

Variables

double *restrict alpha2
 
double *restrict chi1 = x
 
double *restrict omega1 = w
 
double kappa1_c = *kappa1
 
double kappa2_c = *kappa2
 
double rho1_c
 
double rho2_c
 
int i
 
int n_pre
 
int n_run
 
int n_left
 
*rho1 = rho1_c
 
*rho2 = rho2_c
 
 alpha1 = a1
 
rho1_c.real = 0.0
 
rho1_c.imag = 0.0
 

Function Documentation

◆ bl1_cdotv2axpyv2b()

void bl1_cdotv2axpyv2b ( int  n,
scomplex *  a1,
int  inc_a1,
scomplex *  a2,
int  inc_a2,
scomplex *  x,
int  inc_x,
scomplex *  kappa1,
scomplex *  kappa2,
scomplex *  rho1,
scomplex *  rho2,
scomplex *  w,
int  inc_w 
)

References bl1_abort().

326 {
327  bl1_abort();
328 }
void bl1_abort(void)
Definition: bl1_abort.c:13

◆ bl1_ddotv2axpyv2b()

void bl1_ddotv2axpyv2b ( int  n,
double *  a1,
int  inc_a1,
double *  a2,
int  inc_a2,
double *  x,
int  inc_x,
double *  kappa1,
double *  kappa2,
double *  rho1,
double *  rho2,
double *  w,
int  inc_w 
)

References alpha1, alpha1_c, alpha2, alpha2_c, bl1_abort(), chi1, v2df_t::d, i, kappa1_c, kappa2_c, n_left, n_pre, n_run, omega1, rho1_c, rho2_c, and v2df_t::v.

Referenced by FLA_Fused_Ahx_Ax_opd_var1(), and FLA_Fused_UZhu_ZUhu_opd_var1().

46 {
47  double* restrict alpha1;
48  double* restrict alpha2;
49  double* restrict chi1;
50  double* restrict omega1;
51  double rho1_c;
52  double rho2_c;
53  int i;
54 
55  int n_pre;
56  int n_run;
57  int n_left;
58 
59  v2df_t k1v, rho1v;
60  v2df_t k2v, rho2v;
61  v2df_t a11v, a12v, x1v, w1v;
62  v2df_t a21v, a22v, x2v, w2v;
63 
64  if ( inc_a1 != 1 ||
65  inc_a2 != 1 ||
66  inc_x != 1 ||
67  inc_w != 1 ) bl1_abort();
68 
69  n_pre = 0;
70  if ( ( unsigned long ) a1 % 16 != 0 )
71  {
72  if ( ( unsigned long ) a2 % 16 == 0 ||
73  ( unsigned long ) x % 16 == 0 ||
74  ( unsigned long ) w % 16 == 0 ) bl1_abort();
75 
76  n_pre = 1;
77  }
78 
79  n_run = ( n - n_pre ) / 4;
80  n_left = ( n - n_pre ) % 4;
81 
82  alpha1 = a1;
83  alpha2 = a2;
84  chi1 = x;
85  omega1 = w;
86 
87  rho1_c = 0.0;
88  rho2_c = 0.0;
89 
90  if ( n_pre == 1 )
91  {
92  double kappa1_c = *kappa1;
93  double kappa2_c = *kappa2;
94  double alpha1_c = *alpha1;
95  double alpha2_c = *alpha2;
96  double chi1_c = *chi1;
97  double omega1_c = *omega1;
98 
99  rho1_c += alpha1_c * chi1_c;
100  omega1_c += kappa1_c * alpha1_c;
101 
102  rho2_c += alpha2_c * chi1_c;
103  omega1_c += kappa2_c * alpha2_c;
104 
105  *omega1 = omega1_c;
106 
107  alpha1 += inc_a1;
108  alpha2 += inc_a2;
109  chi1 += inc_x;
110  omega1 += inc_w;
111  }
112 
113  rho1v.v = _mm_setzero_pd();
114  rho2v.v = _mm_setzero_pd();
115 
116  k1v.v = _mm_loaddup_pd( ( double* )kappa1 );
117  k2v.v = _mm_loaddup_pd( ( double* )kappa2 );
118 
119  for ( i = 0; i < n_run; ++i )
120  {
121  a11v.v = _mm_load_pd( ( double* )alpha1 );
122  a12v.v = _mm_load_pd( ( double* )alpha2 );
123  x1v.v = _mm_load_pd( ( double* )chi1 );
124  w1v.v = _mm_load_pd( ( double* )omega1 );
125 
126  rho1v.v += a11v.v * x1v.v;
127  w1v.v += k1v.v * a11v.v;
128 
129  rho2v.v += a12v.v * x1v.v;
130  w1v.v += k2v.v * a12v.v;
131 
132  _mm_store_pd( ( double* )omega1, w1v.v );
133 
134  a21v.v = _mm_load_pd( ( double* )(alpha1 + 2) );
135  a22v.v = _mm_load_pd( ( double* )(alpha2 + 2) );
136  x2v.v = _mm_load_pd( ( double* )(chi1 + 2) );
137  w2v.v = _mm_load_pd( ( double* )(omega1 + 2) );
138 
139  rho1v.v += a21v.v * x2v.v;
140  w2v.v += k1v.v * a21v.v;
141 
142  rho2v.v += a22v.v * x2v.v;
143  w2v.v += k2v.v * a22v.v;
144 
145  _mm_store_pd( ( double* )(omega1 + 2), w2v.v );
146 
147  alpha1 += 4;
148  alpha2 += 4;
149  chi1 += 4;
150  omega1 += 4;
151  }
152 
153  if ( n_left > 0 )
154  {
155  for ( i = 0; i < n_left; ++i )
156  {
157  double kappa1_c = *kappa1;
158  double kappa2_c = *kappa2;
159  double alpha1_c = *alpha1;
160  double alpha2_c = *alpha2;
161  double chi1_c = *chi1;
162  double omega1_c = *omega1;
163 
164  rho1_c += alpha1_c * chi1_c;
165  omega1_c += kappa1_c * alpha1_c;
166 
167  rho2_c += alpha2_c * chi1_c;
168  omega1_c += kappa2_c * alpha2_c;
169 
170  *omega1 = omega1_c;
171 
172  alpha1 += inc_a1;
173  alpha2 += inc_a2;
174  chi1 += inc_x;
175  omega1 += inc_w;
176  }
177  }
178 
179  rho1_c += rho1v.d[0] + rho1v.d[1];
180  rho2_c += rho2v.d[0] + rho2v.d[1];
181 
182  *rho1 = rho1_c;
183  *rho2 = rho2_c;
184 }
double rho2_c
Definition: bl1_dotv2axpyv2b.c:194
double *restrict chi1
Definition: bl1_dotv2axpyv2b.c:189
double d[2]
Definition: blis_type_defs.h:119
* rho1
Definition: bl1_dotv2axpyv2b.c:311
double alpha1_c
Definition: bl1_axpyv2b.c:144
int n_run
Definition: bl1_dotv2axpyv2b.c:198
double *restrict alpha2
Definition: bl1_dotv2axpyv2b.c:186
int n_pre
Definition: bl1_dotv2axpyv2b.c:197
double rho1_c
Definition: bl1_dotv2axpyv2b.c:193
double kappa1_c
Definition: bl1_dotv2axpyv2b.c:191
double alpha2_c
Definition: bl1_axpyv2b.c:145
__m128d v
Definition: blis_type_defs.h:118
double *restrict omega1
Definition: bl1_dotv2axpyv2b.c:190
double kappa2_c
Definition: bl1_dotv2axpyv2b.c:192
Definition: blis_type_defs.h:116
int i
Definition: bl1_dotv2axpyv2b.c:195
int n_left
Definition: bl1_dotv2axpyv2b.c:199
* rho2
Definition: bl1_dotv2axpyv2b.c:312
alpha1
Definition: bl1_dotv2axpyv2b.c:456
void bl1_abort(void)
Definition: bl1_abort.c:13

◆ bl1_sdotv2axpyv2b()

void bl1_sdotv2axpyv2b ( int  n,
float *  a1,
int  inc_a1,
float *  a2,
int  inc_a2,
float *  x,
int  inc_x,
float *  kappa1,
float *  kappa2,
float *  rho1,
float *  rho2,
float *  w,
int  inc_w 
)

References bl1_abort().

31 {
32  bl1_abort();
33 }
void bl1_abort(void)
Definition: bl1_abort.c:13

◆ bl1_zdotv2axpyv2b()

void bl1_zdotv2axpyv2b ( int  n,
dcomplex *  a1,
int  inc_a1,
dcomplex *  a2,
int  inc_a2,
dcomplex *  x,
int  inc_x,
dcomplex *  kappa1,
dcomplex *  kappa2,
dcomplex *  rho1,
dcomplex *  rho2,
dcomplex *  w,
int  inc_w 
)

References alpha1, alpha2, bl1_abort(), chi1, i, kappa1_c, kappa2_c, omega1, rho1_c, rho2_c, and v2df_t::v.

Referenced by FLA_Fused_Ahx_Ax_opz_var1().

341 {
342  dcomplex* restrict alpha1;
343  dcomplex* restrict alpha2;
344  dcomplex* restrict chi1;
345  dcomplex* restrict omega1;
346  int i;
347 
348  v2df_t kappa1v, kappa1rv;
349  v2df_t kappa2v, kappa2rv;
350  v2df_t rho1v;
351  v2df_t rho2v;
352  v2df_t a11v, a12v;
353  v2df_t a21v, a22v;
354  v2df_t x1v, x1rv;
355  v2df_t w1v;
356  v2df_t acbc, bdad;
357  v2df_t adac, bcbd;
358 
359  if ( inc_a1 != 1 ||
360  inc_a2 != 1 ||
361  inc_x != 1 ||
362  inc_w != 1 ) bl1_abort();
363 
364  alpha1 = a1;
365  alpha2 = a2;
366  chi1 = x;
367  omega1 = w;
368 
369  rho1v.v = _mm_setzero_pd();
370  rho2v.v = _mm_setzero_pd();
371 
372  kappa1v.v = _mm_load_pd( ( double* )kappa1 );
373  kappa1rv.v = _mm_shuffle_pd( kappa1v.v, kappa1v.v, _MM_SHUFFLE2 (0,1) );
374  kappa2v.v = _mm_load_pd( ( double* )kappa2 );
375  kappa2rv.v = _mm_shuffle_pd( kappa2v.v, kappa2v.v, _MM_SHUFFLE2 (0,1) );
376 
377  for ( i = 0; i < n; ++i )
378  {
379  //dcomplex omega1_c = *omega1;
380  w1v.v = _mm_load_pd( ( double* )omega1 );
381 
382  //dcomplex chi1_c = *chi1;
383  x1v.v = _mm_load_pd( ( double* )chi1 );
384 
385 
386  //dcomplex alpha1_c = *alpha1;
387  a11v.v = _mm_loaddup_pd( ( double* )&(alpha1->real) );
388  a12v.v = _mm_loaddup_pd( ( double* )&(alpha1->imag) );
389 
390  //rho1_c.real += alpha1_c.real * chi1_c.real - -alpha1_c.imag * chi1_c.imag;
391  //rho1_c.imag += alpha1_c.real * chi1_c.imag + -alpha1_c.imag * chi1_c.real;
392  x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );
393  adac.v = a11v.v * x1rv.v;
394  bcbd.v = a12v.v * x1v.v;
395  rho1v.v = rho1v.v + _mm_addsub_pd( adac.v, bcbd.v );
396 
397  //omega1_c.real += kappa1_c.real * alpha1_c.real - kappa1_c.imag * alpha1_c.imag;
398  //omega1_c.imag += kappa1_c.real * alpha1_c.imag + kappa1_c.imag * alpha1_c.real;
399  acbc.v = kappa1v.v * a11v.v;
400  bdad.v = kappa1rv.v * a12v.v;
401  w1v.v += _mm_addsub_pd( acbc.v, bdad.v );
402 
403 
404  //dcomplex alpha2_c = *alpha2;
405  a21v.v = _mm_loaddup_pd( ( double* )&(alpha2->real) );
406  a22v.v = _mm_loaddup_pd( ( double* )&(alpha2->imag) );
407 
408  //rho2_c.real += alpha2_c.real * chi1_c.real - -alpha2_c.imag * chi1_c.imag;
409  //rho2_c.imag += alpha2_c.real * chi1_c.imag + -alpha2_c.imag * chi1_c.real;
410  x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );
411  adac.v = a21v.v * x1rv.v;
412  bcbd.v = a22v.v * x1v.v;
413  rho2v.v = rho2v.v + _mm_addsub_pd( adac.v, bcbd.v );
414 
415  //omega1_c.real += kappa2_c.real * alpha2_c.real - kappa2_c.imag * alpha2_c.imag;
416  //omega1_c.imag += kappa2_c.real * alpha2_c.imag + kappa2_c.imag * alpha2_c.real;
417  acbc.v = kappa2v.v * a21v.v;
418  bdad.v = kappa2rv.v * a22v.v;
419  w1v.v += _mm_addsub_pd( acbc.v, bdad.v );
420 
421 
422  //*omega1 = omega1_c;
423  _mm_store_pd( ( double* )omega1, w1v.v );
424 
425 
426  //alpha1 += inc_a1;
427  //alpha2 += inc_a2;
428  //chi1 += inc_x;
429  //omega1 += inc_w;
430  alpha1 += 1;
431  alpha2 += 1;
432  chi1 += 1;
433  omega1 += 1;
434  }
435 
436  rho1v.v = _mm_shuffle_pd( rho1v.v, rho1v.v, _MM_SHUFFLE2 (0,1) );
437  rho2v.v = _mm_shuffle_pd( rho2v.v, rho2v.v, _MM_SHUFFLE2 (0,1) );
438 
439  //*rho1 = rho1_c;
440  //*rho2 = rho2_c;
441  _mm_store_pd( ( double* )rho1, rho1v.v );
442  _mm_store_pd( ( double* )rho2, rho2v.v );
443 }
double *restrict chi1
Definition: bl1_dotv2axpyv2b.c:189
double *restrict alpha2
Definition: bl1_dotv2axpyv2b.c:186
__m128d v
Definition: blis_type_defs.h:118
double *restrict omega1
Definition: bl1_dotv2axpyv2b.c:190
Definition: blis_type_defs.h:116
int i
Definition: bl1_dotv2axpyv2b.c:195
alpha1
Definition: bl1_dotv2axpyv2b.c:456
void bl1_abort(void)
Definition: bl1_abort.c:13
Definition: blis_type_defs.h:137

◆ for()

for ( i = 0; i < n_run; ++i )

References alpha1, alpha2, chi1, and omega1.

252  {
253  double alpha11_c = *alpha1;
254  double alpha21_c = *(alpha1 + 1);
255  double alpha12_c = *alpha2;
256  double alpha22_c = *(alpha2 + 1);
257  double chi1_c = *chi1;
258  double chi2_c = *(chi1 + 1);
259  double omega1_c = *omega1;
260  double omega2_c = *(omega1 + 1);
261 
262  // rho1 += conj(alpha1) * chi1;
263  rho1_c += alpha11_c * chi1_c;
264  rho1_c += alpha21_c * chi2_c;
265 
266  // omega1 += kappa1 * alpha1;
267  omega1_c += kappa1_c * alpha11_c;
268  omega2_c += kappa1_c * alpha21_c;
269 
270  // rho2 += conj(alpha2) * chi1;
271  rho2_c += alpha12_c * chi1_c;
272  rho2_c += alpha22_c * chi2_c;
273 
274  // omega1 += kappa2 * alpha2;
275  omega1_c += kappa2_c * alpha12_c;
276  omega2_c += kappa2_c * alpha22_c;
277 
278  *omega1 = omega1_c;
279  *(omega1 + 1) = omega2_c;
280 
281  alpha1 += 2*inc_a1;
282  alpha2 += 2*inc_a2;
283  chi1 += 2*inc_x;
284  omega1 += 2*inc_w;
285  }
double rho2_c
Definition: bl1_dotv2axpyv2b.c:194
double *restrict chi1
Definition: bl1_dotv2axpyv2b.c:189
double *restrict alpha2
Definition: bl1_dotv2axpyv2b.c:186
double rho1_c
Definition: bl1_dotv2axpyv2b.c:193
double kappa1_c
Definition: bl1_dotv2axpyv2b.c:191
double *restrict omega1
Definition: bl1_dotv2axpyv2b.c:190
double kappa2_c
Definition: bl1_dotv2axpyv2b.c:192
alpha1
Definition: bl1_dotv2axpyv2b.c:456

◆ if() [1/2]

if ( inc_a1 != 1 || inc_a2 != 1 || inc_x != 1 || inc_w != 1 )
231  {
232  double alpha1_c = *alpha1;
233  double alpha2_c = *alpha2;
234  double chi1_c = *chi1;
235  double omega1_c = *omega1;
236 
237  rho1_c += alpha1_c * chi1_c;
238  omega1_c += kappa1_c * alpha1_c;
239 
240  rho2_c += alpha2_c * chi1_c;
241  omega1_c += kappa2_c * alpha2_c;
242 
243  *omega1 = omega1_c;
244 
245  alpha1 += inc_a1;
246  alpha2 += inc_a2;
247  chi1 += inc_x;
248  omega1 += inc_w;
249  }
double rho2_c
Definition: bl1_dotv2axpyv2b.c:194
double *restrict chi1
Definition: bl1_dotv2axpyv2b.c:189
double alpha1_c
Definition: bl1_axpyv2b.c:144
double *restrict alpha2
Definition: bl1_dotv2axpyv2b.c:186
double rho1_c
Definition: bl1_dotv2axpyv2b.c:193
double kappa1_c
Definition: bl1_dotv2axpyv2b.c:191
double alpha2_c
Definition: bl1_axpyv2b.c:145
double *restrict omega1
Definition: bl1_dotv2axpyv2b.c:190
double kappa2_c
Definition: bl1_dotv2axpyv2b.c:192
alpha1
Definition: bl1_dotv2axpyv2b.c:456

◆ if() [2/2]

if ( n_left > 0 )

References alpha1, alpha1_c, alpha2, alpha2_c, chi1, i, n_left, and omega1.

288  {
289  for ( i = 0; i < n_left; ++i )
290  {
291  double alpha1_c = *alpha1;
292  double alpha2_c = *alpha2;
293  double chi1_c = *chi1;
294  double omega1_c = *omega1;
295 
296  rho1_c += alpha1_c * chi1_c;
297  omega1_c += kappa1_c * alpha1_c;
298 
299  rho2_c += alpha2_c * chi1_c;
300  omega1_c += kappa2_c * alpha2_c;
301 
302  *omega1 = omega1_c;
303 
304  alpha1 += inc_a1;
305  alpha2 += inc_a2;
306  chi1 += inc_x;
307  omega1 += inc_w;
308  }
309  }
double rho2_c
Definition: bl1_dotv2axpyv2b.c:194
double *restrict chi1
Definition: bl1_dotv2axpyv2b.c:189
double alpha1_c
Definition: bl1_axpyv2b.c:144
double *restrict alpha2
Definition: bl1_dotv2axpyv2b.c:186
double rho1_c
Definition: bl1_dotv2axpyv2b.c:193
double kappa1_c
Definition: bl1_dotv2axpyv2b.c:191
double alpha2_c
Definition: bl1_axpyv2b.c:145
double *restrict omega1
Definition: bl1_dotv2axpyv2b.c:190
double kappa2_c
Definition: bl1_dotv2axpyv2b.c:192
int i
Definition: bl1_dotv2axpyv2b.c:195
int n_left
Definition: bl1_dotv2axpyv2b.c:199
alpha1
Definition: bl1_dotv2axpyv2b.c:456

Variable Documentation

◆ alpha1

alpha1 = a1

◆ alpha2

dcomplex *restrict alpha2

◆ chi1

chi1 = x

◆ i

int i

◆ imag

rho2_c.imag = 0.0

◆ kappa1_c

dcomplex kappa1_c = *kappa1

◆ kappa2_c

dcomplex kappa2_c = *kappa2

◆ n_left

int n_left

Referenced by bl1_ddotv2axpyv2b(), and if().

◆ n_pre

int n_pre

Referenced by bl1_ddotv2axpyv2b().

◆ n_run

int n_run

Referenced by bl1_ddotv2axpyv2b().

◆ omega1

omega1 = w

◆ real

rho2_c.real = 0.0

◆ rho1

* rho1 = rho1_c

◆ rho1_c

dcomplex rho1_c

◆ rho2

* rho2 = rho2_c

◆ rho2_c

dcomplex rho2_c