#if FLA_VECTOR_INTRINSIC_TYPE == FLA_NO_INTRINSICS

#define MAC_Apply_G_mx4s_ass MAC_Apply_G_mx4s_ops
#define MAC_Apply_G_mx4s_asd MAC_Apply_G_mx4s_opd
#define MAC_Apply_G_mx4s_asc MAC_Apply_G_mx4s_opc
#define MAC_Apply_G_mx4s_asz MAC_Apply_G_mx4s_opz

#elif FLA_VECTOR_INTRINSIC_TYPE == FLA_SSE_INTRINSICS

#define MAC_Apply_G_mx4s_ass( m_A, \
  int n_iter32 = m_A / ( 4 * 8 ); \
  int n_left32 = m_A % ( 4 * 8 ); \
  int n_iter4 = n_left32 / ( 4 * 1 ); \
  int n_left = n_left32 % ( 4 * 1 ); \
  const int step_a1 = inc_a1 * 4; \
  const int step_a2 = inc_a2 * 4; \
  const int step_a3 = inc_a3 * 4; \
  const int step_a4 = inc_a4 * 4; \
  float* restrict alpha1 = a1; \
  float* restrict alpha2 = a2; \
  float* restrict alpha3 = a3; \
  float* restrict alpha4 = a4; \
  v4sf_t a1v, a2v, a3v, a4v; \
  v4sf_t b1v, b2v, b3v, b4v; \
  v4sf_t g23_k1v, s23_k1v; \
  v4sf_t g34_k1v, s34_k1v; \
  v4sf_t g12_k2v, s12_k2v; \
  v4sf_t g23_k2v, s23_k2v; \
  v4sf_t t1v, t2v, t3v; \
  g23_k1v.v = _mm_load1_ps( gamma23_k1 ); \
  s23_k1v.v = _mm_load1_ps( sigma23_k1 ); \
  g34_k1v.v = _mm_load1_ps( gamma34_k1 ); \
  s34_k1v.v = _mm_load1_ps( sigma34_k1 ); \
  g12_k2v.v = _mm_load1_ps( gamma12_k2 ); \
  s12_k2v.v = _mm_load1_ps( sigma12_k2 ); \
  g23_k2v.v = _mm_load1_ps( gamma23_k2 ); \
  s23_k2v.v = _mm_load1_ps( sigma23_k2 ); \
  for ( i = 0; i < n_iter32; ++i ) \
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    a4v.v = _mm_load_ps( ( float* )alpha4 ); \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
    _mm_store_ps( ( float* )alpha4, a4v.v ); \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
    b4v.v = _mm_load_ps( ( float* )alpha4 ); \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    b1v.v = _mm_load_ps( ( float* )alpha1 ); \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
    _mm_store_ps( ( float* )alpha4, b4v.v ); \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
    _mm_store_ps( ( float* )alpha1, b1v.v ); \
    a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
    _mm_store_ps( ( float* )alpha2, b2v.v ); \
    a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
    a4v.v = _mm_load_ps( ( float* )alpha4 ); \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
    _mm_store_ps( ( float* )alpha3, b3v.v ); \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
    _mm_store_ps( ( float* )alpha4, a4v.v ); \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
    b4v.v = _mm_load_ps( ( float* )alpha4 ); \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    b1v.v = _mm_load_ps( ( float* )alpha1 ); \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
    _mm_store_ps( ( float* )alpha4, b4v.v ); \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
    _mm_store_ps( ( float* )alpha1, b1v.v ); \
    a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a3) ); \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
    _mm_store_ps( ( float* )alpha2, b2v.v ); \
    a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
    a4v.v = _mm_load_ps( ( float* )alpha4 ); \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
    _mm_store_ps( ( float* )alpha3, b3v.v ); \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
    _mm_store_ps( ( float* )alpha4, a4v.v ); \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
    b4v.v = _mm_load_ps( ( float* )alpha4 ); \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    b1v.v = _mm_load_ps( ( float* )alpha1 ); \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
    _mm_store_ps( ( float* )alpha4, b4v.v ); \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
    _mm_store_ps( ( float* )alpha1, b1v.v ); \
    a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
    _mm_store_ps( ( float* )alpha2, b2v.v ); \
    a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
    a4v.v = _mm_load_ps( ( float* )alpha4 ); \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
    _mm_store_ps( ( float* )alpha3, b3v.v ); \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
    _mm_store_ps( ( float* )alpha4, a4v.v ); \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
    b4v.v = _mm_load_ps( ( float* )alpha4 ); \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    b1v.v = _mm_load_ps( ( float* )alpha1 ); \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
    _mm_store_ps( ( float* )alpha4, b4v.v ); \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
    _mm_store_ps( ( float* )alpha1, b1v.v ); \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
    _mm_store_ps( ( float* )alpha2, b2v.v ); \
    _mm_store_ps( ( float* )alpha3, b3v.v ); \
  for ( i = 0; i < n_iter4; ++i ) \
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    a4v.v = _mm_load_ps( ( float* )alpha4 ); \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
    _mm_store_ps( ( float* )alpha4, a4v.v ); \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
  for ( i = 0; i < n_left; ++i ) \
    float ga23_k1 = *gamma23_k1; \
    float si23_k1 = *sigma23_k1; \
    float ga34_k1 = *gamma34_k1; \
    float si34_k1 = *sigma34_k1; \
    float ga12_k2 = *gamma12_k2; \
    float si12_k2 = *sigma12_k2; \
    float ga23_k2 = *gamma23_k2; \
    float si23_k2 = *sigma23_k2; \
    *alpha2 = temp2 * ga23_k1 + temp3 * si23_k1; \
    *alpha3 = temp3 * ga23_k1 - temp2 * si23_k1; \
    *alpha3 = temp3 * ga34_k1 + temp4 * si34_k1; \
    *alpha4 = temp4 * ga34_k1 - temp3 * si34_k1; \
    *alpha1 = temp1 * ga12_k2 + temp2 * si12_k2; \
    *alpha2 = temp2 * ga12_k2 - temp1 * si12_k2; \
    *alpha2 = temp2 * ga23_k2 + temp3 * si23_k2; \
    *alpha3 = temp3 * ga23_k2 - temp2 * si23_k2; \

#define MAC_Apply_G_mx4s_asd( m_A, \
  int n_iter16 = m_A / ( 2 * 8 ); \
  int n_left16 = m_A % ( 2 * 8 ); \
  int n_iter2 = n_left16 / ( 2 * 1 ); \
  int n_left = n_left16 % ( 2 * 1 ); \
  const int step_a1 = inc_a1 * 2; \
  const int step_a2 = inc_a2 * 2; \
  const int step_a3 = inc_a3 * 2; \
  const int step_a4 = inc_a4 * 2; \
  double* restrict alpha1 = a1; \
  double* restrict alpha2 = a2; \
  double* restrict alpha3 = a3; \
  double* restrict alpha4 = a4; \
  v2df_t a1v, a2v, a3v, a4v; \
  v2df_t b1v, b2v, b3v, b4v; \
  v2df_t g23_k1v, s23_k1v; \
  v2df_t g34_k1v, s34_k1v; \
  v2df_t g12_k2v, s12_k2v; \
  v2df_t g23_k2v, s23_k2v; \
  v2df_t t1v, t2v, t3v; \
  g23_k1v.v = _mm_loaddup_pd( gamma23_k1 ); \
  s23_k1v.v = _mm_loaddup_pd( sigma23_k1 ); \
  g34_k1v.v = _mm_loaddup_pd( gamma34_k1 ); \
  s34_k1v.v = _mm_loaddup_pd( sigma34_k1 ); \
  g12_k2v.v = _mm_loaddup_pd( gamma12_k2 ); \
  s12_k2v.v = _mm_loaddup_pd( sigma12_k2 ); \
  g23_k2v.v = _mm_loaddup_pd( gamma23_k2 ); \
  s23_k2v.v = _mm_loaddup_pd( sigma23_k2 ); \
  for ( i = 0; i < n_iter16; ++i ) \
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    a4v.v = _mm_load_pd( ( double* )alpha4 ); \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
    _mm_store_pd( ( double* )alpha4, a4v.v ); \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
    b4v.v = _mm_load_pd( ( double* )alpha4 ); \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    b1v.v = _mm_load_pd( ( double* )alpha1 ); \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
    _mm_store_pd( ( double* )alpha4, b4v.v ); \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
    _mm_store_pd( ( double* )alpha1, b1v.v ); \
    a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
    _mm_store_pd( ( double* )alpha2, b2v.v ); \
    a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
    a4v.v = _mm_load_pd( ( double* )alpha4 ); \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
    _mm_store_pd( ( double* )alpha3, b3v.v ); \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
    _mm_store_pd( ( double* )alpha4, a4v.v ); \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
    b4v.v = _mm_load_pd( ( double* )alpha4 ); \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    b1v.v = _mm_load_pd( ( double* )alpha1 ); \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
    _mm_store_pd( ( double* )alpha4, b4v.v ); \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
    _mm_store_pd( ( double* )alpha1, b1v.v ); \
    a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a3) ); \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
    _mm_store_pd( ( double* )alpha2, b2v.v ); \
    a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
    a4v.v = _mm_load_pd( ( double* )alpha4 ); \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
    _mm_store_pd( ( double* )alpha3, b3v.v ); \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
    _mm_store_pd( ( double* )alpha4, a4v.v ); \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
    b4v.v = _mm_load_pd( ( double* )alpha4 ); \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    b1v.v = _mm_load_pd( ( double* )alpha1 ); \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
    _mm_store_pd( ( double* )alpha4, b4v.v ); \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
    _mm_store_pd( ( double* )alpha1, b1v.v ); \
    a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
    _mm_store_pd( ( double* )alpha2, b2v.v ); \
    a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
    a4v.v = _mm_load_pd( ( double* )alpha4 ); \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
    _mm_store_pd( ( double* )alpha3, b3v.v ); \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
    _mm_store_pd( ( double* )alpha4, a4v.v ); \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
    b4v.v = _mm_load_pd( ( double* )alpha4 ); \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    b1v.v = _mm_load_pd( ( double* )alpha1 ); \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
    _mm_store_pd( ( double* )alpha4, b4v.v ); \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
    _mm_store_pd( ( double* )alpha1, b1v.v ); \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
    _mm_store_pd( ( double* )alpha2, b2v.v ); \
    _mm_store_pd( ( double* )alpha3, b3v.v ); \
  for ( i = 0; i < n_iter2; ++i ) \
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    a4v.v = _mm_load_pd( ( double* )alpha4 ); \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
    _mm_store_pd( ( double* )alpha4, a4v.v ); \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    double ga23_k1 = *gamma23_k1; \
    double si23_k1 = *sigma23_k1; \
    double ga34_k1 = *gamma34_k1; \
    double si34_k1 = *sigma34_k1; \
    double ga12_k2 = *gamma12_k2; \
    double si12_k2 = *sigma12_k2; \
    double ga23_k2 = *gamma23_k2; \
    double si23_k2 = *sigma23_k2; \
    *alpha2 = temp2 * ga23_k1 + temp3 * si23_k1; \
    *alpha3 = temp3 * ga23_k1 - temp2 * si23_k1; \
    *alpha3 = temp3 * ga34_k1 + temp4 * si34_k1; \
    *alpha4 = temp4 * ga34_k1 - temp3 * si34_k1; \
    *alpha1 = temp1 * ga12_k2 + temp2 * si12_k2; \
    *alpha2 = temp2 * ga12_k2 - temp1 * si12_k2; \
    *alpha2 = temp2 * ga23_k2 + temp3 * si23_k2; \
    *alpha3 = temp3 * ga23_k2 - temp2 * si23_k2; \

#define MAC_Apply_G_mx4s_asc( m_A, \
  int n_iter16 = m_A / ( 2 * 8 ); \
  int n_left16 = m_A % ( 2 * 8 ); \
  int n_iter2 = n_left16 / ( 2 * 1 ); \
  int n_left = n_left16 % ( 2 * 1 ); \
  const int step_a1 = inc_a1 * 2; \
  const int step_a2 = inc_a2 * 2; \
  const int step_a3 = inc_a3 * 2; \
  const int step_a4 = inc_a4 * 2; \
  scomplex* restrict alpha1 = a1; \
  scomplex* restrict alpha2 = a2; \
  scomplex* restrict alpha3 = a3; \
  scomplex* restrict alpha4 = a4; \
  v4sf_t a1v, a2v, a3v, a4v; \
  v4sf_t b1v, b2v, b3v, b4v; \
  v4sf_t g23_k1v, s23_k1v; \
  v4sf_t g34_k1v, s34_k1v; \
  v4sf_t g12_k2v, s12_k2v; \
  v4sf_t g23_k2v, s23_k2v; \
  v4sf_t t1v, t2v, t3v; \
  g23_k1v.v = _mm_load1_ps( gamma23_k1 ); \
  s23_k1v.v = _mm_load1_ps( sigma23_k1 ); \
  g34_k1v.v = _mm_load1_ps( gamma34_k1 ); \
  s34_k1v.v = _mm_load1_ps( sigma34_k1 ); \
  g12_k2v.v = _mm_load1_ps( gamma12_k2 ); \
  s12_k2v.v = _mm_load1_ps( sigma12_k2 ); \
  g23_k2v.v = _mm_load1_ps( gamma23_k2 ); \
  s23_k2v.v = _mm_load1_ps( sigma23_k2 ); \
  for ( i = 0; i < n_iter16; ++i ) \
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    a4v.v = _mm_load_ps( ( float* )alpha4 ); \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
    _mm_store_ps( ( float* )alpha4, a4v.v ); \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
    b4v.v = _mm_load_ps( ( float* )alpha4 ); \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    b1v.v = _mm_load_ps( ( float* )alpha1 ); \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
    _mm_store_ps( ( float* )alpha4, b4v.v ); \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
    _mm_store_ps( ( float* )alpha1, b1v.v ); \
    a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
    _mm_store_ps( ( float* )alpha2, b2v.v ); \
    a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
    a4v.v = _mm_load_ps( ( float* )alpha4 ); \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
    _mm_store_ps( ( float* )alpha3, b3v.v ); \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
    _mm_store_ps( ( float* )alpha4, a4v.v ); \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
    b4v.v = _mm_load_ps( ( float* )alpha4 ); \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    b1v.v = _mm_load_ps( ( float* )alpha1 ); \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
    _mm_store_ps( ( float* )alpha4, b4v.v ); \
    alpha4 += step_a4; \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
    _mm_store_ps( ( float* )alpha1, b1v.v ); \
    alpha1 += step_a1; \
    a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a3) ); \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
    _mm_store_ps( ( float* )alpha2, b2v.v ); \
    alpha2 += step_a2; \
    a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
    a4v.v = _mm_load_ps( ( float* )alpha4 ); \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
    _mm_store_ps( ( float* )alpha3, b3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
    _mm_store_ps( ( float* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
    b4v.v = _mm_load_ps( ( float* )alpha4 ); \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    b1v.v = _mm_load_ps( ( float* )alpha1 ); \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
    _mm_store_ps( ( float* )alpha4, b4v.v ); \
    alpha4 += step_a4; \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
    _mm_store_ps( ( float* )alpha1, b1v.v ); \
    alpha1 += step_a1; \
    a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
    _mm_store_ps( ( float* )alpha2, b2v.v ); \
    alpha2 += step_a2; \
    a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
    a4v.v = _mm_load_ps( ( float* )alpha4 ); \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
    _mm_store_ps( ( float* )alpha3, b3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
    _mm_store_ps( ( float* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
    b4v.v = _mm_load_ps( ( float* )alpha4 ); \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    b1v.v = _mm_load_ps( ( float* )alpha1 ); \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
    _mm_store_ps( ( float* )alpha4, b4v.v ); \
    alpha4 += step_a4; \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
    _mm_store_ps( ( float* )alpha1, b1v.v ); \
    alpha1 += step_a1; \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
    _mm_store_ps( ( float* )alpha2, b2v.v ); \
    alpha2 += step_a2; \
    _mm_store_ps( ( float* )alpha3, b3v.v ); \
    alpha3 += step_a3; \
  for ( i = 0; i < n_iter2; ++i ) \
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    a4v.v = _mm_load_ps( ( float* )alpha4 ); \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
    _mm_store_ps( ( float* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
  if ( n_left == 1 ) \
    float ga23_k1 = *gamma23_k1; \
    float si23_k1 = *sigma23_k1; \
    float ga34_k1 = *gamma34_k1; \
    float si34_k1 = *sigma34_k1; \
    float ga12_k2 = *gamma12_k2; \
    float si12_k2 = *sigma12_k2; \
    float ga23_k2 = *gamma23_k2; \
    float si23_k2 = *sigma23_k2; \
    alpha2->real = temp2.real * ga23_k1 + temp3.real * si23_k1; \
    alpha3->real = temp3.real * ga23_k1 - temp2.real * si23_k1; \
    alpha2->imag = temp2.imag * ga23_k1 + temp3.imag * si23_k1; \
    alpha3->imag = temp3.imag * ga23_k1 - temp2.imag * si23_k1; \
    alpha3->real = temp3.real * ga34_k1 + temp4.real * si34_k1; \
    alpha4->real = temp4.real * ga34_k1 - temp3.real * si34_k1; \
    alpha3->imag = temp3.imag * ga34_k1 + temp4.imag * si34_k1; \
    alpha4->imag = temp4.imag * ga34_k1 - temp3.imag * si34_k1; \
    alpha1->real = temp1.real * ga12_k2 + temp2.real * si12_k2; \
    alpha2->real = temp2.real * ga12_k2 - temp1.real * si12_k2; \
    alpha1->imag = temp1.imag * ga12_k2 + temp2.imag * si12_k2; \
    alpha2->imag = temp2.imag * ga12_k2 - temp1.imag * si12_k2; \
    alpha2->real = temp2.real * ga23_k2 + temp3.real * si23_k2; \
    alpha3->real = temp3.real * ga23_k2 - temp2.real * si23_k2; \
    alpha2->imag = temp2.imag * ga23_k2 + temp3.imag * si23_k2; \
    alpha3->imag = temp3.imag * ga23_k2 - temp2.imag * si23_k2; \

#define MAC_Apply_G_mx4s_asz( m_A, \
  int n_iter = m_A / 8; \
  int n_left = m_A % 8; \
  const int step_a1 = inc_a1 * 1; \
  const int step_a2 = inc_a2 * 1; \
  const int step_a3 = inc_a3 * 1; \
  const int step_a4 = inc_a4 * 1; \
  dcomplex* restrict alpha1 = a1; \
  dcomplex* restrict alpha2 = a2; \
  dcomplex* restrict alpha3 = a3; \
  dcomplex* restrict alpha4 = a4; \
  v2df_t a1v, a2v, a3v, a4v; \
  v2df_t b1v, b2v, b3v, b4v; \
  v2df_t g23_k1v, s23_k1v; \
  v2df_t g34_k1v, s34_k1v; \
  v2df_t g12_k2v, s12_k2v; \
  v2df_t g23_k2v, s23_k2v; \
  v2df_t t1v, t2v, t3v; \
  g23_k1v.v = _mm_loaddup_pd( gamma23_k1 ); \
  s23_k1v.v = _mm_loaddup_pd( sigma23_k1 ); \
  g34_k1v.v = _mm_loaddup_pd( gamma34_k1 ); \
  s34_k1v.v = _mm_loaddup_pd( sigma34_k1 ); \
  g12_k2v.v = _mm_loaddup_pd( gamma12_k2 ); \
  s12_k2v.v = _mm_loaddup_pd( sigma12_k2 ); \
  g23_k2v.v = _mm_loaddup_pd( gamma23_k2 ); \
  s23_k2v.v = _mm_loaddup_pd( sigma23_k2 ); \
  for ( i = 0; i < n_iter; ++i ) \
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    a4v.v = _mm_load_pd( ( double* )alpha4 ); \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
    _mm_store_pd( ( double* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
    b4v.v = _mm_load_pd( ( double* )alpha4 ); \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    b1v.v = _mm_load_pd( ( double* )alpha1 ); \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
    _mm_store_pd( ( double* )alpha4, b4v.v ); \
    alpha4 += step_a4; \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
    _mm_store_pd( ( double* )alpha1, b1v.v ); \
    alpha1 += step_a1; \
    a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
    _mm_store_pd( ( double* )alpha2, b2v.v ); \
    alpha2 += step_a2; \
    a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
    a4v.v = _mm_load_pd( ( double* )alpha4 ); \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
    _mm_store_pd( ( double* )alpha3, b3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
    _mm_store_pd( ( double* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
    b4v.v = _mm_load_pd( ( double* )alpha4 ); \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    b1v.v = _mm_load_pd( ( double* )alpha1 ); \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
    _mm_store_pd( ( double* )alpha4, b4v.v ); \
    alpha4 += step_a4; \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
    _mm_store_pd( ( double* )alpha1, b1v.v ); \
    alpha1 += step_a1; \
    a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a3) ); \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
    _mm_store_pd( ( double* )alpha2, b2v.v ); \
    alpha2 += step_a2; \
    a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
    a4v.v = _mm_load_pd( ( double* )alpha4 ); \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
    _mm_store_pd( ( double* )alpha3, b3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
    _mm_store_pd( ( double* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
    b4v.v = _mm_load_pd( ( double* )alpha4 ); \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    b1v.v = _mm_load_pd( ( double* )alpha1 ); \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
    _mm_store_pd( ( double* )alpha4, b4v.v ); \
    alpha4 += step_a4; \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
    _mm_store_pd( ( double* )alpha1, b1v.v ); \
    alpha1 += step_a1; \
    a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
    _mm_store_pd( ( double* )alpha2, b2v.v ); \
    alpha2 += step_a2; \
    a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
    a4v.v = _mm_load_pd( ( double* )alpha4 ); \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
    _mm_store_pd( ( double* )alpha3, b3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
    _mm_store_pd( ( double* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
    b4v.v = _mm_load_pd( ( double* )alpha4 ); \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    b1v.v = _mm_load_pd( ( double* )alpha1 ); \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
    _mm_store_pd( ( double* )alpha4, b4v.v ); \
    alpha4 += step_a4; \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
    _mm_store_pd( ( double* )alpha1, b1v.v ); \
    alpha1 += step_a1; \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
    _mm_store_pd( ( double* )alpha2, b2v.v ); \
    alpha2 += step_a2; \
    _mm_store_pd( ( double* )alpha3, b3v.v ); \
    alpha3 += step_a3; \
  for ( i = 0; i < n_left; ++i ) \
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    a4v.v = _mm_load_pd( ( double* )alpha4 ); \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
    _mm_store_pd( ( double* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
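
The four macros above differ only in element type and SSE register width; per row, each applies the same cascade of four Givens rotations to columns a1 through a4. The scalar sketch below is illustrative only: it is not part of this header, the function name apply_G_mx4s_ref is invented for the example, and unit column stride is assumed in place of the inc_a1..inc_a4 arguments. It shows the rotation order implied by the vector code and the scalar cleanup loops: G23 and G34 from iteration k1, followed by G12 and G23 from iteration k2.

/* Illustrative scalar reference (an assumption, not part of libflame):
   applies the same two-sweep rotation cascade as the macros above,
   one row at a time, to columns a1..a4 stored with unit stride. */
static void apply_G_mx4s_ref( int m_A,
                              float gamma23_k1, float sigma23_k1,
                              float gamma34_k1, float sigma34_k1,
                              float gamma12_k2, float sigma12_k2,
                              float gamma23_k2, float sigma23_k2,
                              float* a1, float* a2, float* a3, float* a4 )
{
    int i;
    for ( i = 0; i < m_A; ++i )
    {
        float temp1 = a1[ i ];
        float temp2 = a2[ i ];
        float temp3 = a3[ i ];
        float temp4 = a4[ i ];

        /* G23 from iteration k1: rotate columns 2 and 3. */
        a2[ i ] = temp2 * gamma23_k1 + temp3 * sigma23_k1;
        a3[ i ] = temp3 * gamma23_k1 - temp2 * sigma23_k1;

        /* G34 from iteration k1: rotate columns 3 and 4. */
        temp3   = a3[ i ];
        a3[ i ] = temp3 * gamma34_k1 + temp4 * sigma34_k1;
        a4[ i ] = temp4 * gamma34_k1 - temp3 * sigma34_k1;

        /* G12 from iteration k2: rotate columns 1 and 2. */
        temp2   = a2[ i ];
        a1[ i ] = temp1 * gamma12_k2 + temp2 * sigma12_k2;
        a2[ i ] = temp2 * gamma12_k2 - temp1 * sigma12_k2;

        /* G23 from iteration k2: rotate columns 2 and 3 again. */
        temp2   = a2[ i ];
        temp3   = a3[ i ];
        a2[ i ] = temp2 * gamma23_k2 + temp3 * sigma23_k2;
        a3[ i ] = temp3 * gamma23_k2 - temp2 * sigma23_k2;
    }
}

The a*v/b*v register pairs and the interleaved loads and stores in the macros are an unrolled, software-pipelined form of this same per-row update, processing four rows (float), two rows (double/scomplex), or one dcomplex row per vector operation.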