libflame  revision_anchor
Functions | Variables
bl1_dotsv2.c File Reference

(r)

Functions

void bl1_sdotsv2 (conj1_t conjxy, int n, float *x, int inc_x, float *y, int inc_y, float *z, int inc_z, float *beta, float *rho_xz, float *rho_yz)
 
void bl1_ddotsv2 (conj1_t conjxy, int n, double *x, int inc_x, double *y, int inc_y, double *z, int inc_z, double *beta, double *rho_xz, double *rho_yz)
 
 if (inc_x !=1||inc_y !=1||inc_z !=1)
 
 for (i=0;i< n_run;++i)
 
 if (n_left > 0)
 
void bl1_cdotsv2 (conj1_t conjxy, int n, scomplex *x, int inc_x, scomplex *y, int inc_y, scomplex *z, int inc_z, scomplex *beta, scomplex *rho_xz, scomplex *rho_yz)
 
void bl1_zdotsv2 (conj1_t conjxy, int n, dcomplex *x, int inc_x, dcomplex *y, int inc_y, dcomplex *z, int inc_z, dcomplex *beta, dcomplex *rho_xz, dcomplex *rho_yz)
 
 if (bl1_is_conj(conjxy))
 
 bl1_zscals (beta, rho_yz)
 

Variables

double *restrict y1
 
double *restrict z1 = z
 
double rho1 = rho1_c
 
double rho2 = rho2_c
 
double x1c
 
double y1c
 
double z1c
 
double x2c
 
double y2c
 
double z2c
 
int i
 
int n_pre
 
int n_run
 
int n_left
 
*rho_xz = *beta * *rho_xz + rho1
 
*rho_yz = *beta * *rho_yz + rho2
 
 x1 = x
 
rho1.real = 0.0
 
rho1.imag = 0.0
 
 else
 

Function Documentation

◆ bl1_cdotsv2()

void bl1_cdotsv2 ( conj1_t  conjxy,
int  n,
scomplex *  x,
int  inc_x,
scomplex *  y,
int  inc_y,
scomplex *  z,
int  inc_z,
scomplex *  beta,
scomplex *  rho_xz,
scomplex *  rho_yz 
)

References bl1_abort().

243 {
244  bl1_abort();
245 }
void bl1_abort(void)
Definition: bl1_abort.c:13

◆ bl1_ddotsv2()

void bl1_ddotsv2 ( conj1_t  conjxy,
int  n,
double *  x,
int  inc_x,
double *  y,
int  inc_y,
double *  z,
int  inc_z,
double *  beta,
double *  rho_xz,
double *  rho_yz 
)

References bl1_abort(), v2df_t::d, i, n_left, n_pre, n_run, rho1, rho2, v2df_t::v, x1, x1c, y1, y1c, z1, and z1c.

Referenced by FLA_Fused_Ahx_Axpy_Ax_opd_var1(), FLA_Fused_Gerc2_Ahx_Axpy_Ax_opd_var1(), FLA_Fused_Uhu_Yhu_Zhu_opd_var1(), FLA_Fused_UYx_ZVx_opd_var1(), and FLA_Fused_UZhu_ZUhu_opd_var1().

44 {
45  double* restrict x1;
46  double* restrict y1;
47  double* restrict z1;
48  double rho1, rho2;
49  double x1c, y1c, z1c;
50  int i;
51 
52  int n_pre;
53  int n_run;
54  int n_left;
55 
56  v2df_t rho1v, rho2v;
57  v2df_t x1v, y1v, z1v;
58  v2df_t x2v, y2v, z2v;
59 
60  if ( inc_x != 1 ||
61  inc_y != 1 ||
62  inc_z != 1 ) bl1_abort();
63 
64  n_pre = 0;
65  if ( ( unsigned long ) z % 16 != 0 )
66  {
67  if ( ( unsigned long ) x % 16 == 0 ||
68  ( unsigned long ) y % 16 == 0 ) bl1_abort();
69 
70  n_pre = 1;
71  }
72 
73  n_run = ( n - n_pre ) / 4;
74  n_left = ( n - n_pre ) % 4;
75 
76  x1 = x;
77  y1 = y;
78  z1 = z;
79 
80  rho1 = 0.0;
81  rho2 = 0.0;
82 
83  if ( n_pre == 1 )
84  {
85  x1c = *x1;
86  y1c = *y1;
87  z1c = *z1;
88 
89  rho1 += x1c * z1c;
90  rho2 += y1c * z1c;
91 
92  x1 += inc_x;
93  y1 += inc_y;
94  z1 += inc_z;
95  }
96 
97  rho1v.v = _mm_setzero_pd();
98  rho2v.v = _mm_setzero_pd();
99 
100  for ( i = 0; i < n_run; ++i )
101  {
102  x1v.v = _mm_load_pd( ( double* )x1 );
103  y1v.v = _mm_load_pd( ( double* )y1 );
104  z1v.v = _mm_load_pd( ( double* )z1 );
105 
106  x2v.v = _mm_load_pd( ( double* )(x1 + 2) );
107  y2v.v = _mm_load_pd( ( double* )(y1 + 2) );
108  z2v.v = _mm_load_pd( ( double* )(z1 + 2) );
109 
110  rho1v.v += x1v.v * z1v.v;
111  rho2v.v += y1v.v * z1v.v;
112 
113  rho1v.v += x2v.v * z2v.v;
114  rho2v.v += y2v.v * z2v.v;
115 
116  x1 += 4;
117  y1 += 4;
118  z1 += 4;
119  }
120 
121  rho1 += rho1v.d[0] + rho1v.d[1];
122  rho2 += rho2v.d[0] + rho2v.d[1];
123 
124  if ( n_left > 0 )
125  {
126  for ( i = 0; i < n_left; ++i )
127  {
128  x1c = *x1;
129  y1c = *y1;
130  z1c = *z1;
131 
132  rho1 += x1c * z1c;
133  rho2 += y1c * z1c;
134 
135  x1 += inc_x;
136  y1 += inc_y;
137  z1 += inc_z;
138  }
139  }
140 
141  *rho_xz = *beta * *rho_xz + rho1;
142  *rho_yz = *beta * *rho_yz + rho2;
143 }
double x1c
Definition: bl1_dotsv2.c:150
double rho1
Definition: bl1_dotsv2.c:149
double y1c
Definition: bl1_dotsv2.c:150
int i
Definition: bl1_dotsv2.c:152
double d[2]
Definition: blis_type_defs.h:119
double z1c
Definition: bl1_dotsv2.c:150
* rho_yz
Definition: bl1_dotsv2.c:230
int n_left
Definition: bl1_dotsv2.c:156
__m128d v
Definition: blis_type_defs.h:118
* rho_xz
Definition: bl1_dotsv2.c:229
double rho2
Definition: bl1_dotsv2.c:149
double *restrict z1
Definition: bl1_dotsv2.c:148
Definition: blis_type_defs.h:116
x1
Definition: bl1_dotsv2.c:374
int n_pre
Definition: bl1_dotsv2.c:154
int n_run
Definition: bl1_dotsv2.c:155
double *restrict y1
Definition: bl1_dotsv2.c:145
void bl1_abort(void)
Definition: bl1_abort.c:13

◆ bl1_sdotsv2()

void bl1_sdotsv2 ( conj1_t  conjxy,
int  n,
float *  x,
int  inc_x,
float *  y,
int  inc_y,
float *  z,
int  inc_z,
float *  beta,
float *  rho_xz,
float *  rho_yz 
)

References bl1_abort().

30 {
31  bl1_abort();
32 }
void bl1_abort(void)
Definition: bl1_abort.c:13

◆ bl1_zdotsv2()

void bl1_zdotsv2 ( conj1_t  conjxy,
int  n,
dcomplex *  x,
int  inc_x,
dcomplex *  y,
int  inc_y,
dcomplex *  z,
int  inc_z,
dcomplex *  beta,
dcomplex *  rho_xz,
dcomplex *  rho_yz 
)

References bl1_is_conj(), v2df_t::d, i, dcomplex::imag, dcomplex::real, rho1, rho2, v2df_t::v, x1, x1c, y1, y1c, z1, and z1c.

Referenced by FLA_Fused_Ahx_Axpy_Ax_opz_var1(), FLA_Fused_Gerc2_Ahx_Axpy_Ax_opz_var1(), and FLA_Fused_UYx_ZVx_opz_var1().

257 {
258  dcomplex* restrict x1;
259  dcomplex* restrict y1;
260  dcomplex* restrict z1;
261  int i;
262  v2df_t r1v, rho1v;
263  v2df_t r2v, rho2v;
264  v2df_t z11v, z12v;
265  v2df_t x1v, x1rv;
266  v2df_t y1v, y1rv;
267 
268  x1 = x;
269  y1 = y;
270  z1 = z;
271 
272  rho1v.v = _mm_setzero_pd();
273  rho2v.v = _mm_setzero_pd();
274 
275  if ( bl1_is_conj( conjxy ) )
276  {
277  v2df_t bcac, adbd;
278 
279  for ( i = 0; i < n; ++i )
280  {
281  z11v.v = _mm_loaddup_pd( ( double* )&(z1->real) );
282  z12v.v = _mm_loaddup_pd( ( double* )&(z1->imag) );
283 
284  x1v.v = _mm_load_pd( ( double* )x1 );
285  x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );
286  bcac.v = x1rv.v * z11v.v;
287  adbd.v = x1v.v * z12v.v;
288  rho1v.v = rho1v.v + _mm_addsub_pd( bcac.v, adbd.v );
289 
290  y1v.v = _mm_load_pd( ( double* )y1 );
291  y1rv.v = _mm_shuffle_pd( y1v.v, y1v.v, _MM_SHUFFLE2 (0,1) );
292  bcac.v = y1rv.v * z11v.v;
293  adbd.v = y1v.v * z12v.v;
294  rho2v.v = rho2v.v + _mm_addsub_pd( bcac.v, adbd.v );
295 
296  x1 += inc_x;
297  y1 += inc_y;
298  z1 += inc_z;
299  }
300 
301  rho1v.v = _mm_shuffle_pd( rho1v.v, rho1v.v, _MM_SHUFFLE2 (0,1) );
302  rho2v.v = _mm_shuffle_pd( rho2v.v, rho2v.v, _MM_SHUFFLE2 (0,1) );
303 
304  rho1v.d[1] = -rho1v.d[1];
305  rho2v.d[1] = -rho2v.d[1];
306  }
307  else
308  {
309  v2df_t cada, dbcb;
310 
311  for ( i = 0; i < n; ++i )
312  {
313  z11v.v = _mm_loaddup_pd( ( double* )&(z1->real) );
314  z12v.v = _mm_loaddup_pd( ( double* )&(z1->imag) );
315 
316  x1v.v = _mm_load_pd( ( double* )x1 );
317  x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );
318  cada.v = x1v.v * z11v.v;
319  dbcb.v = x1rv.v * z12v.v;
320  rho1v.v = rho1v.v + _mm_addsub_pd( cada.v, dbcb.v );
321 
322  y1v.v = _mm_load_pd( ( double* )y1 );
323  y1rv.v = _mm_shuffle_pd( y1v.v, y1v.v, _MM_SHUFFLE2 (0,1) );
324  cada.v = y1v.v * z11v.v;
325  dbcb.v = y1rv.v * z12v.v;
326  rho2v.v = rho2v.v + _mm_addsub_pd( cada.v, dbcb.v );
327 
328  x1 += inc_x;
329  y1 += inc_y;
330  z1 += inc_z;
331  }
332  }
333 
334  //bl1_zscals( beta, rho_xz );
335  //bl1_zscals( beta, rho_yz );
336  {
337  v2df_t ab, ba, cc, dd, acbc, bdad;
338 
339  ab.v = _mm_load_pd( ( double* )beta );
340  ba.v = _mm_shuffle_pd( ab.v, ab.v, _MM_SHUFFLE2 (0,1) );
341 
342  cc.v = _mm_loaddup_pd( ( double* )&(rho_xz->real) );
343  dd.v = _mm_loaddup_pd( ( double* )&(rho_xz->imag) );
344  acbc.v = ab.v * cc.v;
345  bdad.v = ba.v * dd.v;
346  r1v.v = _mm_addsub_pd( acbc.v, bdad.v );
347 
348  cc.v = _mm_loaddup_pd( ( double* )&(rho_yz->real) );
349  dd.v = _mm_loaddup_pd( ( double* )&(rho_yz->imag) );
350  acbc.v = ab.v * cc.v;
351  bdad.v = ba.v * dd.v;
352  r2v.v = _mm_addsub_pd( acbc.v, bdad.v );
353  }
354 
355  //rho_xz->real = rho_xz->real + rho1.real;
356  //rho_xz->imag = rho_xz->imag + rho1.imag;
357  rho1v.v = r1v.v + rho1v.v;
358  _mm_store_pd( ( double* )rho_xz, rho1v.v );
359 
360  //rho_yz->real = rho_yz->real + rho2.real;
361  //rho_yz->imag = rho_yz->imag + rho2.imag;
362  rho2v.v = r2v.v + rho2v.v;
363  _mm_store_pd( ( double* )rho_yz, rho2v.v );
364 }
int bl1_is_conj(conj1_t conj)
Definition: bl1_is.c:42
double imag
Definition: blis_type_defs.h:139
int i
Definition: bl1_dotsv2.c:152
double d[2]
Definition: blis_type_defs.h:119
double real
Definition: blis_type_defs.h:139
__m128d v
Definition: blis_type_defs.h:118
double *restrict z1
Definition: bl1_dotsv2.c:148
Definition: blis_type_defs.h:116
x1
Definition: bl1_dotsv2.c:374
double *restrict y1
Definition: bl1_dotsv2.c:145
Definition: blis_type_defs.h:137

◆ bl1_zscals()

bl1_zscals ( beta  ,
rho_yz   
)

◆ for()

for ( i = 0; i < n_run; ++i )

References x1, y1, z1, and z2c.

196  {
197  x1c = *x1;
198  x2c = *(x1 + 1);
199  y1c = *y1;
200  y2c = *(y1 + 1);
201  z1c = *z1;
202  z2c = *(z1 + 1);
203 
204  rho1 += x1c * z1c + x2c * z2c;
205  rho2 += y1c * z1c + y2c * z2c;
206 
207  x1 += 2*inc_x;
208  y1 += 2*inc_y;
209  z1 += 2*inc_z;
210  }
double x1c
Definition: bl1_dotsv2.c:150
double rho1
Definition: bl1_dotsv2.c:149
double y1c
Definition: bl1_dotsv2.c:150
double z1c
Definition: bl1_dotsv2.c:150
double x2c
Definition: bl1_dotsv2.c:151
double rho2
Definition: bl1_dotsv2.c:149
double *restrict z1
Definition: bl1_dotsv2.c:148
x1
Definition: bl1_dotsv2.c:374
double y2c
Definition: bl1_dotsv2.c:151
double *restrict y1
Definition: bl1_dotsv2.c:145
double z2c
Definition: bl1_dotsv2.c:151

◆ if() [1/3]

if ( inc_x != 1 || inc_y != 1 || inc_z != 1 )
182  {
183  x1c = *x1;
184  y1c = *y1;
185  z1c = *z1;
186 
187  rho1 += x1c * z1c;
188  rho2 += y1c * z1c;
189 
190  x1 += inc_x;
191  y1 += inc_y;
192  z1 += inc_z;
193  }
double x1c
Definition: bl1_dotsv2.c:150
double rho1
Definition: bl1_dotsv2.c:149
double y1c
Definition: bl1_dotsv2.c:150
double z1c
Definition: bl1_dotsv2.c:150
double rho2
Definition: bl1_dotsv2.c:149
double *restrict z1
Definition: bl1_dotsv2.c:148
x1
Definition: bl1_dotsv2.c:374
double *restrict y1
Definition: bl1_dotsv2.c:145

◆ if() [2/3]

if ( n_left > 0 )

References i, n_left, x1, y1, z1, and z1c.

213  {
214  for ( i = 0; i < n_left; ++i )
215  {
216  x1c = *x1;
217  y1c = *y1;
218  z1c = *z1;
219 
220  rho1 += x1c * z1c;
221  rho2 += y1c * z1c;
222 
223  x1 += inc_x;
224  y1 += inc_y;
225  z1 += inc_z;
226  }
227  }
double x1c
Definition: bl1_dotsv2.c:150
double rho1
Definition: bl1_dotsv2.c:149
double y1c
Definition: bl1_dotsv2.c:150
int i
Definition: bl1_dotsv2.c:152
double z1c
Definition: bl1_dotsv2.c:150
int n_left
Definition: bl1_dotsv2.c:156
double rho2
Definition: bl1_dotsv2.c:149
double *restrict z1
Definition: bl1_dotsv2.c:148
x1
Definition: bl1_dotsv2.c:374
double *restrict y1
Definition: bl1_dotsv2.c:145

◆ if() [3/3]

if ( bl1_is_conj(conjxy)  )

References i, dcomplex::imag, dcomplex::real, x1, y1, and z1.

382  {
383  for ( i = 0; i < n; ++i )
384  {
385  x1c = *x1;
386  y1c = *y1;
387  z1c = *z1;
388 
389  rho1.real += x1c.real * z1c.real - -x1c.imag * z1c.imag;
390  rho1.imag += x1c.real * z1c.imag + -x1c.imag * z1c.real;
391 
392  rho2.real += y1c.real * z1c.real - -y1c.imag * z1c.imag;
393  rho2.imag += y1c.real * z1c.imag + -y1c.imag * z1c.real;
394 
395  x1 += inc_x;
396  y1 += inc_y;
397  z1 += inc_z;
398  }
399  }
double x1c
Definition: bl1_dotsv2.c:150
double rho1
Definition: bl1_dotsv2.c:149
double y1c
Definition: bl1_dotsv2.c:150
int i
Definition: bl1_dotsv2.c:152
double z1c
Definition: bl1_dotsv2.c:150
double rho2
Definition: bl1_dotsv2.c:149
double *restrict z1
Definition: bl1_dotsv2.c:148
x1
Definition: bl1_dotsv2.c:374
double *restrict y1
Definition: bl1_dotsv2.c:145

Variable Documentation

◆ else

else
Initial value:
{
for ( i = 0; i < n; ++i )
{
x1c = *x1;
y1c = *y1;
z1c = *z1;
rho1.real += x1c.real * z1c.real - x1c.imag * z1c.imag;
rho1.imag += x1c.real * z1c.imag + x1c.imag * z1c.real;
rho2.real += y1c.real * z1c.real - y1c.imag * z1c.imag;
rho2.imag += y1c.real * z1c.imag + y1c.imag * z1c.real;
x1 += inc_x;
y1 += inc_y;
z1 += inc_z;
}
}
bl1_zscals( beta, rho_xz )
double x1c
Definition: bl1_dotsv2.c:150
double rho1
Definition: bl1_dotsv2.c:149
double y1c
Definition: bl1_dotsv2.c:150
int i
Definition: bl1_dotsv2.c:152
double z1c
Definition: bl1_dotsv2.c:150
* rho_xz
Definition: bl1_dotsv2.c:229
double rho2
Definition: bl1_dotsv2.c:149
double *restrict z1
Definition: bl1_dotsv2.c:148
x1
Definition: bl1_dotsv2.c:374
double *restrict y1
Definition: bl1_dotsv2.c:145
bl1_zscals(beta, rho_yz)

◆ i

int i

Referenced by bl1_ddotsv2(), bl1_zdotsv2(), and if().

◆ imag

rho_yz imag = 0.0

◆ n_left

int n_left

Referenced by bl1_ddotsv2(), and if().

◆ n_pre

int n_pre

Referenced by bl1_ddotsv2().

◆ n_run

int n_run

Referenced by bl1_ddotsv2().

◆ real

rho_yz real = 0.0

◆ rho1

* rho1 = rho1_c

◆ rho2

* rho2 = rho2_c

Referenced by bl1_ddotsv2(), and bl1_zdotsv2().

◆ rho_xz

* rho_xz = *beta * *rho_xz + rho1

◆ rho_yz

* rho_yz = *beta * *rho_yz + rho2

◆ x1

x1 = x

◆ x1c

dcomplex x1c

Referenced by bl1_ddotsv2(), and bl1_zdotsv2().

◆ x2c

double x2c

◆ y1

dcomplex *restrict y1

◆ y1c

dcomplex y1c

Referenced by bl1_ddotsv2(), and bl1_zdotsv2().

◆ y2c

double y2c

◆ z1

dcomplex *restrict z1 = z

◆ z1c

dcomplex z1c

Referenced by bl1_ddotsv2(), bl1_zdotsv2(), and if().

◆ z2c

double z2c

Referenced by for().