Go to the documentation of this file. 12 #if FLA_VECTOR_INTRINSIC_TYPE == FLA_NO_INTRINSICS 14 #define MAC_Apply_G_mx3b_ass MAC_Apply_G_mx3b_ops 15 #define MAC_Apply_G_mx3b_asd MAC_Apply_G_mx3b_opd 16 #define MAC_Apply_G_mx3b_asc MAC_Apply_G_mx3b_opc 17 #define MAC_Apply_G_mx3b_asz MAC_Apply_G_mx3b_opz 19 #elif FLA_VECTOR_INTRINSIC_TYPE == FLA_SSE_INTRINSICS 21 #define MAC_Apply_G_mx3b_ass( m_A, \ 30 int n_iter32 = m_A / ( 4 * 8 ); \ 31 int n_left32 = m_A % ( 4 * 8 ); \ 32 int n_iter4 = n_left32 / ( 4 * 1 ); \ 33 int n_left = n_left32 % ( 4 * 1 ); \ 36 const int step_a1 = inc_a1 * 4; \ 37 const int step_a2 = inc_a2 * 4; \ 38 const int step_a3 = inc_a3 * 4; \ 40 float* restrict alpha1 = a1; \ 41 float* restrict alpha2 = a2; \ 42 float* restrict alpha3 = a3; \ 44 v4sf_t a1v, a2v, a3v; \ 49 g12v.v = _mm_load1_ps( gamma12 ); \ 50 s12v.v = _mm_load1_ps( sigma12 ); \ 51 g23v.v = _mm_load1_ps( gamma23 ); \ 52 s23v.v = _mm_load1_ps( sigma23 ); \ 54 for ( i = 0; i < n_iter32; ++i ) \ 57 a2v.v = _mm_load_ps( ( float* )alpha2 ); \ 58 a3v.v = _mm_load_ps( ( float* )alpha3 ); \ 61 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \ 62 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \ 64 _mm_store_ps( ( float* )alpha3, a3v.v ); \ 66 a1v.v = _mm_load_ps( ( float* )alpha1 ); \ 69 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \ 70 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \ 72 _mm_store_ps( ( float* )alpha1, a1v.v ); \ 74 _mm_store_ps( ( float* )alpha2, a2v.v ); \ 77 a2v.v = _mm_load_ps( ( float* )alpha2 ); \ 78 a3v.v = _mm_load_ps( ( float* )alpha3 ); \ 81 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \ 82 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \ 84 _mm_store_ps( ( float* )alpha3, a3v.v ); \ 86 a1v.v = _mm_load_ps( ( float* )alpha1 ); \ 89 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \ 90 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \ 92 _mm_store_ps( ( float* )alpha1, a1v.v ); \ 94 _mm_store_ps( ( float* )alpha2, a2v.v ); \ 97 a2v.v = _mm_load_ps( ( float* )alpha2 ); \ 98 a3v.v = _mm_load_ps( ( float* )alpha3 ); \ 101 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \ 102 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \ 104 _mm_store_ps( ( float* )alpha3, a3v.v ); \ 106 a1v.v = _mm_load_ps( ( float* )alpha1 ); \ 109 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \ 110 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \ 112 _mm_store_ps( ( float* )alpha1, a1v.v ); \ 114 _mm_store_ps( ( float* )alpha2, a2v.v ); \ 117 a2v.v = _mm_load_ps( ( float* )alpha2 ); \ 118 a3v.v = _mm_load_ps( ( float* )alpha3 ); \ 121 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \ 122 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \ 124 _mm_store_ps( ( float* )alpha3, a3v.v ); \ 126 a1v.v = _mm_load_ps( ( float* )alpha1 ); \ 129 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \ 130 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \ 132 _mm_store_ps( ( float* )alpha1, a1v.v ); \ 134 _mm_store_ps( ( float* )alpha2, a2v.v ); \ 137 a2v.v = _mm_load_ps( ( float* )alpha2 ); \ 138 a3v.v = _mm_load_ps( ( float* )alpha3 ); \ 141 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \ 142 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \ 144 _mm_store_ps( ( float* )alpha3, a3v.v ); \ 146 a1v.v = _mm_load_ps( ( float* )alpha1 ); \ 149 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \ 150 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \ 152 _mm_store_ps( ( float* )alpha1, a1v.v ); \ 154 _mm_store_ps( ( float* )alpha2, a2v.v ); \ 157 a2v.v = _mm_load_ps( ( float* )alpha2 ); \ 158 a3v.v = _mm_load_ps( ( float* )alpha3 ); \ 161 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \ 162 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \ 164 _mm_store_ps( ( float* )alpha3, a3v.v ); \ 166 a1v.v = _mm_load_ps( ( float* )alpha1 ); \ 169 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \ 170 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \ 172 _mm_store_ps( ( float* )alpha1, a1v.v ); \ 174 _mm_store_ps( ( float* )alpha2, a2v.v ); \ 177 a2v.v = _mm_load_ps( ( float* )alpha2 ); \ 178 a3v.v = _mm_load_ps( ( float* )alpha3 ); \ 181 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \ 182 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \ 184 _mm_store_ps( ( float* )alpha3, a3v.v ); \ 186 a1v.v = _mm_load_ps( ( float* )alpha1 ); \ 189 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \ 190 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \ 192 _mm_store_ps( ( float* )alpha1, a1v.v ); \ 194 _mm_store_ps( ( float* )alpha2, a2v.v ); \ 197 a2v.v = _mm_load_ps( ( float* )alpha2 ); \ 198 a3v.v = _mm_load_ps( ( float* )alpha3 ); \ 201 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \ 202 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \ 204 _mm_store_ps( ( float* )alpha3, a3v.v ); \ 206 a1v.v = _mm_load_ps( ( float* )alpha1 ); \ 209 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \ 210 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \ 212 _mm_store_ps( ( float* )alpha1, a1v.v ); \ 214 _mm_store_ps( ( float* )alpha2, a2v.v ); \ 218 for ( i = 0; i < n_iter4; ++i ) \ 221 a2v.v = _mm_load_ps( ( float* )alpha2 ); \ 222 a3v.v = _mm_load_ps( ( float* )alpha3 ); \ 225 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \ 226 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \ 228 _mm_store_ps( ( float* )alpha3, a3v.v ); \ 230 a1v.v = _mm_load_ps( ( float* )alpha1 ); \ 233 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \ 234 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \ 236 _mm_store_ps( ( float* )alpha1, a1v.v ); \ 238 _mm_store_ps( ( float* )alpha2, a2v.v ); \ 242 for ( i = 0; i < n_left; ++i ) \ 244 float ga12 = *gamma12; \ 245 float si12 = *sigma12; \ 246 float ga23 = *gamma23; \ 247 float si23 = *sigma23; \ 255 *alpha2 = temp2 * ga23 + temp3 * si23; \ 256 *alpha3 = temp3 * ga23 - temp2 * si23; \ 261 *alpha1 = temp1 * ga12 + temp2 * si12; \ 262 *alpha2 = temp2 * ga12 - temp1 * si12; \ 270 #define MAC_Apply_G_mx3b_asd( m_A, \ 279 int n_iter16 = m_A / ( 2 * 8 ); \ 280 int n_left16 = m_A % ( 2 * 8 ); \ 281 int n_iter2 = n_left16 / ( 2 * 1 ); \ 282 int n_left = n_left16 % ( 2 * 1 ); \ 285 const int step_a1 = inc_a1 * 2; \ 286 const int step_a2 = inc_a2 * 2; \ 287 const int step_a3 = inc_a3 * 2; \ 289 double* restrict alpha1 = a1; \ 290 double* restrict alpha2 = a2; \ 291 double* restrict alpha3 = a3; \ 293 v2df_t a1v, a2v, a3v; \ 298 g12v.v = _mm_loaddup_pd( gamma12 ); \ 299 s12v.v = _mm_loaddup_pd( sigma12 ); \ 300 g23v.v = _mm_loaddup_pd( gamma23 ); \ 301 s23v.v = _mm_loaddup_pd( sigma23 ); \ 303 for ( i = 0; i < n_iter16; ++i ) \ 306 a2v.v = _mm_load_pd( ( double* )alpha2 ); \ 307 a3v.v = _mm_load_pd( ( double* )alpha3 ); \ 310 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \ 311 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \ 313 _mm_store_pd( ( double* )alpha3, a3v.v ); \ 315 a1v.v = _mm_load_pd( ( double* )alpha1 ); \ 318 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \ 319 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \ 321 _mm_store_pd( ( double* )alpha1, a1v.v ); \ 323 _mm_store_pd( ( double* )alpha2, a2v.v ); \ 326 a2v.v = _mm_load_pd( ( double* )alpha2 ); \ 327 a3v.v = _mm_load_pd( ( double* )alpha3 ); \ 330 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \ 331 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \ 333 _mm_store_pd( ( double* )alpha3, a3v.v ); \ 335 a1v.v = _mm_load_pd( ( double* )alpha1 ); \ 338 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \ 339 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \ 341 _mm_store_pd( ( double* )alpha1, a1v.v ); \ 343 _mm_store_pd( ( double* )alpha2, a2v.v ); \ 346 a2v.v = _mm_load_pd( ( double* )alpha2 ); \ 347 a3v.v = _mm_load_pd( ( double* )alpha3 ); \ 350 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \ 351 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \ 353 _mm_store_pd( ( double* )alpha3, a3v.v ); \ 355 a1v.v = _mm_load_pd( ( double* )alpha1 ); \ 358 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \ 359 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \ 361 _mm_store_pd( ( double* )alpha1, a1v.v ); \ 363 _mm_store_pd( ( double* )alpha2, a2v.v ); \ 366 a2v.v = _mm_load_pd( ( double* )alpha2 ); \ 367 a3v.v = _mm_load_pd( ( double* )alpha3 ); \ 370 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \ 371 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \ 373 _mm_store_pd( ( double* )alpha3, a3v.v ); \ 375 a1v.v = _mm_load_pd( ( double* )alpha1 ); \ 378 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \ 379 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \ 381 _mm_store_pd( ( double* )alpha1, a1v.v ); \ 383 _mm_store_pd( ( double* )alpha2, a2v.v ); \ 386 a2v.v = _mm_load_pd( ( double* )alpha2 ); \ 387 a3v.v = _mm_load_pd( ( double* )alpha3 ); \ 390 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \ 391 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \ 393 _mm_store_pd( ( double* )alpha3, a3v.v ); \ 395 a1v.v = _mm_load_pd( ( double* )alpha1 ); \ 398 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \ 399 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \ 401 _mm_store_pd( ( double* )alpha1, a1v.v ); \ 403 _mm_store_pd( ( double* )alpha2, a2v.v ); \ 406 a2v.v = _mm_load_pd( ( double* )alpha2 ); \ 407 a3v.v = _mm_load_pd( ( double* )alpha3 ); \ 410 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \ 411 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \ 413 _mm_store_pd( ( double* )alpha3, a3v.v ); \ 415 a1v.v = _mm_load_pd( ( double* )alpha1 ); \ 418 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \ 419 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \ 421 _mm_store_pd( ( double* )alpha1, a1v.v ); \ 423 _mm_store_pd( ( double* )alpha2, a2v.v ); \ 426 a2v.v = _mm_load_pd( ( double* )alpha2 ); \ 427 a3v.v = _mm_load_pd( ( double* )alpha3 ); \ 430 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \ 431 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \ 433 _mm_store_pd( ( double* )alpha3, a3v.v ); \ 435 a1v.v = _mm_load_pd( ( double* )alpha1 ); \ 438 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \ 439 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \ 441 _mm_store_pd( ( double* )alpha1, a1v.v ); \ 443 _mm_store_pd( ( double* )alpha2, a2v.v ); \ 446 a2v.v = _mm_load_pd( ( double* )alpha2 ); \ 447 a3v.v = _mm_load_pd( ( double* )alpha3 ); \ 450 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \ 451 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \ 453 _mm_store_pd( ( double* )alpha3, a3v.v ); \ 455 a1v.v = _mm_load_pd( ( double* )alpha1 ); \ 458 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \ 459 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \ 461 _mm_store_pd( ( double* )alpha1, a1v.v ); \ 463 _mm_store_pd( ( double* )alpha2, a2v.v ); \ 467 for ( i = 0; i < n_iter2; ++i ) \ 470 a2v.v = _mm_load_pd( ( double* )alpha2 ); \ 471 a3v.v = _mm_load_pd( ( double* )alpha3 ); \ 474 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \ 475 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \ 477 _mm_store_pd( ( double* )alpha3, a3v.v ); \ 479 a1v.v = _mm_load_pd( ( double* )alpha1 ); \ 482 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \ 483 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \ 485 _mm_store_pd( ( double* )alpha1, a1v.v ); \ 487 _mm_store_pd( ( double* )alpha2, a2v.v ); \ 493 double ga12 = *gamma12; \ 494 double si12 = *sigma12; \ 495 double ga23 = *gamma23; \ 496 double si23 = *sigma23; \ 504 *alpha2 = temp2 * ga23 + temp3 * si23; \ 505 *alpha3 = temp3 * ga23 - temp2 * si23; \ 510 *alpha1 = temp1 * ga12 + temp2 * si12; \ 511 *alpha2 = temp2 * ga12 - temp1 * si12; \ 515 #define MAC_Apply_G_mx3b_asc( m_A, \ 524 int n_iter16 = m_A / ( 2 * 8 ); \ 525 int n_left16 = m_A % ( 2 * 8 ); \ 526 int n_iter2 = n_left16 / ( 2 * 1 ); \ 527 int n_left = n_left16 % ( 2 * 1 ); \ 530 const int step_a1 = inc_a1 * 2; \ 531 const int step_a2 = inc_a2 * 2; \ 532 const int step_a3 = inc_a3 * 2; \ 534 scomplex* restrict alpha1 = a1; \ 535 scomplex* restrict alpha2 = a2; \ 536 scomplex* restrict alpha3 = a3; \ 538 v4sf_t a1v, a2v, a3v; \ 543 g12v.v = _mm_load1_ps( gamma12 ); \ 544 s12v.v = _mm_load1_ps( sigma12 ); \ 545 g23v.v = _mm_load1_ps( gamma23 ); \ 546 s23v.v = _mm_load1_ps( sigma23 ); \ 548 for ( i = 0; i < n_iter16; ++i ) \ 551 a2v.v = _mm_load_ps( ( float* )alpha2 ); \ 552 a3v.v = _mm_load_ps( ( float* )alpha3 ); \ 555 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \ 556 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \ 558 _mm_store_ps( ( float* )alpha3, a3v.v ); \ 560 a1v.v = _mm_load_ps( ( float* )alpha1 ); \ 563 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \ 564 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \ 566 _mm_store_ps( ( float* )alpha1, a1v.v ); \ 568 _mm_store_ps( ( float* )alpha2, a2v.v ); \ 571 a2v.v = _mm_load_ps( ( float* )alpha2 ); \ 572 a3v.v = _mm_load_ps( ( float* )alpha3 ); \ 575 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \ 576 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \ 578 _mm_store_ps( ( float* )alpha3, a3v.v ); \ 580 a1v.v = _mm_load_ps( ( float* )alpha1 ); \ 583 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \ 584 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \ 586 _mm_store_ps( ( float* )alpha1, a1v.v ); \ 588 _mm_store_ps( ( float* )alpha2, a2v.v ); \ 591 a2v.v = _mm_load_ps( ( float* )alpha2 ); \ 592 a3v.v = _mm_load_ps( ( float* )alpha3 ); \ 595 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \ 596 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \ 598 _mm_store_ps( ( float* )alpha3, a3v.v ); \ 600 a1v.v = _mm_load_ps( ( float* )alpha1 ); \ 603 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \ 604 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \ 606 _mm_store_ps( ( float* )alpha1, a1v.v ); \ 608 _mm_store_ps( ( float* )alpha2, a2v.v ); \ 611 a2v.v = _mm_load_ps( ( float* )alpha2 ); \ 612 a3v.v = _mm_load_ps( ( float* )alpha3 ); \ 615 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \ 616 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \ 618 _mm_store_ps( ( float* )alpha3, a3v.v ); \ 620 a1v.v = _mm_load_ps( ( float* )alpha1 ); \ 623 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \ 624 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \ 626 _mm_store_ps( ( float* )alpha1, a1v.v ); \ 628 _mm_store_ps( ( float* )alpha2, a2v.v ); \ 631 a2v.v = _mm_load_ps( ( float* )alpha2 ); \ 632 a3v.v = _mm_load_ps( ( float* )alpha3 ); \ 635 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \ 636 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \ 638 _mm_store_ps( ( float* )alpha3, a3v.v ); \ 640 a1v.v = _mm_load_ps( ( float* )alpha1 ); \ 643 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \ 644 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \ 646 _mm_store_ps( ( float* )alpha1, a1v.v ); \ 648 _mm_store_ps( ( float* )alpha2, a2v.v ); \ 651 a2v.v = _mm_load_ps( ( float* )alpha2 ); \ 652 a3v.v = _mm_load_ps( ( float* )alpha3 ); \ 655 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \ 656 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \ 658 _mm_store_ps( ( float* )alpha3, a3v.v ); \ 660 a1v.v = _mm_load_ps( ( float* )alpha1 ); \ 663 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \ 664 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \ 666 _mm_store_ps( ( float* )alpha1, a1v.v ); \ 668 _mm_store_ps( ( float* )alpha2, a2v.v ); \ 671 a2v.v = _mm_load_ps( ( float* )alpha2 ); \ 672 a3v.v = _mm_load_ps( ( float* )alpha3 ); \ 675 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \ 676 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \ 678 _mm_store_ps( ( float* )alpha3, a3v.v ); \ 680 a1v.v = _mm_load_ps( ( float* )alpha1 ); \ 683 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \ 684 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \ 686 _mm_store_ps( ( float* )alpha1, a1v.v ); \ 688 _mm_store_ps( ( float* )alpha2, a2v.v ); \ 691 a2v.v = _mm_load_ps( ( float* )alpha2 ); \ 692 a3v.v = _mm_load_ps( ( float* )alpha3 ); \ 695 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \ 696 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \ 698 _mm_store_ps( ( float* )alpha3, a3v.v ); \ 700 a1v.v = _mm_load_ps( ( float* )alpha1 ); \ 703 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \ 704 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \ 706 _mm_store_ps( ( float* )alpha1, a1v.v ); \ 708 _mm_store_ps( ( float* )alpha2, a2v.v ); \ 712 for ( i = 0; i < n_iter2; ++i ) \ 715 a2v.v = _mm_load_ps( ( float* )alpha2 ); \ 716 a3v.v = _mm_load_ps( ( float* )alpha3 ); \ 719 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \ 720 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \ 722 _mm_store_ps( ( float* )alpha3, a3v.v ); \ 724 a1v.v = _mm_load_ps( ( float* )alpha1 ); \ 727 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \ 728 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \ 730 _mm_store_ps( ( float* )alpha1, a1v.v ); \ 732 _mm_store_ps( ( float* )alpha2, a2v.v ); \ 738 float ga12 = *gamma12; \ 739 float si12 = *sigma12; \ 740 float ga23 = *gamma23; \ 741 float si23 = *sigma23; \ 749 alpha1->real = temp1.real * ga12 + temp2.real * si12; \ 750 alpha2->real = temp2.real * ga12 - temp1.real * si12; \ 752 alpha1->imag = temp1.imag * ga12 + temp2.imag * si12; \ 753 alpha2->imag = temp2.imag * ga12 - temp1.imag * si12; \ 758 alpha2->real = temp2.real * ga23 + temp3.real * si23; \ 759 alpha3->real = temp3.real * ga23 - temp2.real * si23; \ 761 alpha2->imag = temp2.imag * ga23 + temp3.imag * si23; \ 762 alpha3->imag = temp3.imag * ga23 - temp2.imag * si23; \ 766 #define MAC_Apply_G_mx3b_asz( m_A, \ 775 int n_iter = m_A / 8; \ 776 int n_left = m_A % 8; \ 779 const int step_a1 = inc_a1 * 1; \ 780 const int step_a2 = inc_a2 * 1; \ 781 const int step_a3 = inc_a3 * 1; \ 783 dcomplex* restrict alpha1 = a1; \ 784 dcomplex* restrict alpha2 = a2; \ 785 dcomplex* restrict alpha3 = a3; \ 787 v2df_t a1v, a2v, a3v; \ 792 g12v.v = _mm_loaddup_pd( gamma12 ); \ 793 s12v.v = _mm_loaddup_pd( sigma12 ); \ 794 g23v.v = _mm_loaddup_pd( gamma23 ); \ 795 s23v.v = _mm_loaddup_pd( sigma23 ); \ 797 for ( i = 0; i < n_iter; ++i ) \ 800 a2v.v = _mm_load_pd( ( double* )alpha2 ); \ 801 a3v.v = _mm_load_pd( ( double* )alpha3 ); \ 804 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \ 805 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \ 807 _mm_store_pd( ( double* )alpha3, a3v.v ); \ 809 a1v.v = _mm_load_pd( ( double* )alpha1 ); \ 812 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \ 813 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \ 815 _mm_store_pd( ( double* )alpha1, a1v.v ); \ 817 _mm_store_pd( ( double* )alpha2, a2v.v ); \ 820 a2v.v = _mm_load_pd( ( double* )alpha2 ); \ 821 a3v.v = _mm_load_pd( ( double* )alpha3 ); \ 824 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \ 825 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \ 827 _mm_store_pd( ( double* )alpha3, a3v.v ); \ 829 a1v.v = _mm_load_pd( ( double* )alpha1 ); \ 832 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \ 833 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \ 835 _mm_store_pd( ( double* )alpha1, a1v.v ); \ 837 _mm_store_pd( ( double* )alpha2, a2v.v ); \ 840 a2v.v = _mm_load_pd( ( double* )alpha2 ); \ 841 a3v.v = _mm_load_pd( ( double* )alpha3 ); \ 844 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \ 845 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \ 847 _mm_store_pd( ( double* )alpha3, a3v.v ); \ 849 a1v.v = _mm_load_pd( ( double* )alpha1 ); \ 852 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \ 853 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \ 855 _mm_store_pd( ( double* )alpha1, a1v.v ); \ 857 _mm_store_pd( ( double* )alpha2, a2v.v ); \ 860 a2v.v = _mm_load_pd( ( double* )alpha2 ); \ 861 a3v.v = _mm_load_pd( ( double* )alpha3 ); \ 864 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \ 865 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \ 867 _mm_store_pd( ( double* )alpha3, a3v.v ); \ 869 a1v.v = _mm_load_pd( ( double* )alpha1 ); \ 872 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \ 873 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \ 875 _mm_store_pd( ( double* )alpha1, a1v.v ); \ 877 _mm_store_pd( ( double* )alpha2, a2v.v ); \ 880 a2v.v = _mm_load_pd( ( double* )alpha2 ); \ 881 a3v.v = _mm_load_pd( ( double* )alpha3 ); \ 884 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \ 885 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \ 887 _mm_store_pd( ( double* )alpha3, a3v.v ); \ 889 a1v.v = _mm_load_pd( ( double* )alpha1 ); \ 892 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \ 893 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \ 895 _mm_store_pd( ( double* )alpha1, a1v.v ); \ 897 _mm_store_pd( ( double* )alpha2, a2v.v ); \ 900 a2v.v = _mm_load_pd( ( double* )alpha2 ); \ 901 a3v.v = _mm_load_pd( ( double* )alpha3 ); \ 904 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \ 905 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \ 907 _mm_store_pd( ( double* )alpha3, a3v.v ); \ 909 a1v.v = _mm_load_pd( ( double* )alpha1 ); \ 912 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \ 913 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \ 915 _mm_store_pd( ( double* )alpha1, a1v.v ); \ 917 _mm_store_pd( ( double* )alpha2, a2v.v ); \ 920 a2v.v = _mm_load_pd( ( double* )alpha2 ); \ 921 a3v.v = _mm_load_pd( ( double* )alpha3 ); \ 924 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \ 925 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \ 927 _mm_store_pd( ( double* )alpha3, a3v.v ); \ 929 a1v.v = _mm_load_pd( ( double* )alpha1 ); \ 932 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \ 933 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \ 935 _mm_store_pd( ( double* )alpha1, a1v.v ); \ 937 _mm_store_pd( ( double* )alpha2, a2v.v ); \ 940 a2v.v = _mm_load_pd( ( double* )alpha2 ); \ 941 a3v.v = _mm_load_pd( ( double* )alpha3 ); \ 944 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \ 945 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \ 947 _mm_store_pd( ( double* )alpha3, a3v.v ); \ 949 a1v.v = _mm_load_pd( ( double* )alpha1 ); \ 952 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \ 953 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \ 955 _mm_store_pd( ( double* )alpha1, a1v.v ); \ 957 _mm_store_pd( ( double* )alpha2, a2v.v ); \ 961 for ( i = 0; i < n_left; ++i ) \ 964 a2v.v = _mm_load_pd( ( double* )alpha2 ); \ 965 a3v.v = _mm_load_pd( ( double* )alpha3 ); \ 968 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \ 969 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \ 971 _mm_store_pd( ( double* )alpha3, a3v.v ); \ 973 a1v.v = _mm_load_pd( ( double* )alpha1 ); \ 976 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \ 977 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \ 979 _mm_store_pd( ( double* )alpha1, a1v.v ); \ 981 _mm_store_pd( ( double* )alpha2, a2v.v ); \