#if FLA_VECTOR_INTRINSIC_TYPE == FLA_NO_INTRINSICS

#define MAC_Apply_G_mx3_ass MAC_Apply_G_mx3_ops
#define MAC_Apply_G_mx3_asd MAC_Apply_G_mx3_opd
#define MAC_Apply_G_mx3_asc MAC_Apply_G_mx3_opc
#define MAC_Apply_G_mx3_asz MAC_Apply_G_mx3_opz

#elif FLA_VECTOR_INTRINSIC_TYPE == FLA_SSE_INTRINSICS

#define MAC_Apply_G_mx3_ass( m_A, \
                             gamma12, \
                             sigma12, \
                             gamma23, \
                             sigma23, \
                             a1, inc_a1, \
                             a2, inc_a2, \
                             a3, inc_a3 ) \
{ \
    int              n_iter32 = m_A / ( 4 * 8 ); \
    int              n_left32 = m_A % ( 4 * 8 ); \
    int              n_iter4  = n_left32 / ( 4 * 1 ); \
    int              n_left   = n_left32 % ( 4 * 1 ); \
    int              i; \
\
    const int        step_a1 = inc_a1 * 4; \
    const int        step_a2 = inc_a2 * 4; \
    const int        step_a3 = inc_a3 * 4; \
\
    float*  restrict alpha1 = a1; \
    float*  restrict alpha2 = a2; \
    float*  restrict alpha3 = a3; \
\
    v4sf_t           a1v, a2v, a3v; \
    v4sf_t           g12v, s12v; \
    v4sf_t           g23v, s23v; \
    v4sf_t           t1v, t2v; \
\
    g12v.v = _mm_load1_ps( gamma12 ); \
    s12v.v = _mm_load1_ps( sigma12 ); \
    g23v.v = _mm_load1_ps( gamma23 ); \
    s23v.v = _mm_load1_ps( sigma23 ); \
\
    /* Main loop: each iteration applies G12 to (a1,a2) and then G23 to */ \
    /* (a2,a3), four rows at a time, via 8 identical unrolled copies.   */ \
    for ( i = 0; i < n_iter32; ++i ) \
    { \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
    } \
\
    /* Fringe loop: remaining groups of 4 rows, one group per iteration. */ \
    for ( i = 0; i < n_iter4; ++i ) \
    { \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
    } \
\
    /* Scalar cleanup for the final m_A % 4 rows. */ \
    for ( i = 0; i < n_left; ++i ) \
    { \
        float ga12 = *gamma12; \
        float si12 = *sigma12; \
        float ga23 = *gamma23; \
        float si23 = *sigma23; \
        float temp1; \
        float temp2; \
        float temp3; \
\
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        *alpha1 = temp1 * ga12 + temp2 * si12; \
        *alpha2 = temp2 * ga12 - temp1 * si12; \
\
        temp2 = *alpha2; \
        temp3 = *alpha3; \
\
        *alpha2 = temp2 * ga23 + temp3 * si23; \
        *alpha3 = temp3 * ga23 - temp2 * si23; \
\
        alpha1 += inc_a1; \
        alpha2 += inc_a2; \
        alpha3 += inc_a3; \
    } \
}
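/*
   For reference, the kernel above and the three below all compute the same
   two-rotation update per row; a plain-C sketch follows (the function name
   and signature are illustrative only, not part of libflame's API):

       static void apply_g_mx3_row( float ga12, float si12,
                                    float ga23, float si23,
                                    float* a1, float* a2, float* a3 )
       {
           float t1 = *a1;
           *a1 = t1  * ga12 + *a2 * si12;   // G12 acts on columns (a1,a2)
           *a2 = *a2 * ga12 - t1  * si12;
           float t2 = *a2;
           *a2 = t2  * ga23 + *a3 * si23;   // G23 acts on columns (a2,a3)
           *a3 = *a3 * ga23 - t2  * si23;
       }

   The SSE variants perform exactly this update on 4 floats, 2 doubles,
   2 scomplex, or 1 dcomplex at a time, using the v4sf_t/v2df_t vector
   unions defined elsewhere in libflame's headers.
*/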
#define MAC_Apply_G_mx3_asd( m_A, \
                             gamma12, \
                             sigma12, \
                             gamma23, \
                             sigma23, \
                             a1, inc_a1, \
                             a2, inc_a2, \
                             a3, inc_a3 ) \
{ \
    int              n_iter16 = m_A / ( 2 * 8 ); \
    int              n_left16 = m_A % ( 2 * 8 ); \
    int              n_iter2  = n_left16 / ( 2 * 1 ); \
    int              n_left   = n_left16 % ( 2 * 1 ); \
    int              i; \
\
    const int        step_a1 = inc_a1 * 2; \
    const int        step_a2 = inc_a2 * 2; \
    const int        step_a3 = inc_a3 * 2; \
\
    double* restrict alpha1 = a1; \
    double* restrict alpha2 = a2; \
    double* restrict alpha3 = a3; \
\
    v2df_t           a1v, a2v, a3v; \
    v2df_t           g12v, s12v; \
    v2df_t           g23v, s23v; \
    v2df_t           t1v, t2v; \
\
    g12v.v = _mm_loaddup_pd( gamma12 ); \
    s12v.v = _mm_loaddup_pd( sigma12 ); \
    g23v.v = _mm_loaddup_pd( gamma23 ); \
    s23v.v = _mm_loaddup_pd( sigma23 ); \
\
    /* Main loop: 8 identical unrolled copies of the two-rotation */ \
    /* update, two rows per copy. */ \
    for ( i = 0; i < n_iter16; ++i ) \
    { \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
    } \
\
    /* Fringe loop: remaining pairs of rows, one pair per iteration. */ \
    for ( i = 0; i < n_iter2; ++i ) \
    { \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
    } \
\
    /* Scalar cleanup for the final row, if m_A is odd. */ \
    if ( n_left == 1 ) \
    { \
        double ga12 = *gamma12; \
        double si12 = *sigma12; \
        double ga23 = *gamma23; \
        double si23 = *sigma23; \
        double temp1; \
        double temp2; \
        double temp3; \
\
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        *alpha1 = temp1 * ga12 + temp2 * si12; \
        *alpha2 = temp2 * ga12 - temp1 * si12; \
\
        temp2 = *alpha2; \
        temp3 = *alpha3; \
\
        *alpha2 = temp2 * ga23 + temp3 * si23; \
        *alpha3 = temp3 * ga23 - temp2 * si23; \
    } \
}
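/*
   How the asd partitioning covers all m_A rows, for example m_A = 37:
   n_iter16 = 37 / 16 = 2   (2 x 16 = 32 rows through the unrolled loop),
   n_left16 = 37 % 16 = 5,
   n_iter2  =  5 /  2 = 2   (2 x  2 =  4 rows through the fringe loop),
   n_left   =  5 %  2 = 1   (1 final row through the scalar branch),
   for a total of 32 + 4 + 1 = 37 rows. The ass kernel above partitions
   the same way with vector length 4 instead of 2.
*/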
#define MAC_Apply_G_mx3_asc( m_A, \
                             gamma12, \
                             sigma12, \
                             gamma23, \
                             sigma23, \
                             a1, inc_a1, \
                             a2, inc_a2, \
                             a3, inc_a3 ) \
{ \
    int                n_iter16 = m_A / ( 2 * 8 ); \
    int                n_left16 = m_A % ( 2 * 8 ); \
    int                n_iter2  = n_left16 / ( 2 * 1 ); \
    int                n_left   = n_left16 % ( 2 * 1 ); \
    int                i; \
\
    const int          step_a1 = inc_a1 * 2; \
    const int          step_a2 = inc_a2 * 2; \
    const int          step_a3 = inc_a3 * 2; \
\
    scomplex* restrict alpha1 = a1; \
    scomplex* restrict alpha2 = a2; \
    scomplex* restrict alpha3 = a3; \
\
    v4sf_t             a1v, a2v, a3v; \
    v4sf_t             g12v, s12v; \
    v4sf_t             g23v, s23v; \
    v4sf_t             t1v, t2v; \
\
    g12v.v = _mm_load1_ps( gamma12 ); \
    s12v.v = _mm_load1_ps( sigma12 ); \
    g23v.v = _mm_load1_ps( gamma23 ); \
    s23v.v = _mm_load1_ps( sigma23 ); \
\
    /* Main loop: 8 identical unrolled copies of the two-rotation */ \
    /* update, two scomplex rows per copy. */ \
    for ( i = 0; i < n_iter16; ++i ) \
    { \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
    } \
\
    /* Fringe loop: remaining pairs of scomplex rows. */ \
    for ( i = 0; i < n_iter2; ++i ) \
    { \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
    } \
\
    /* Scalar cleanup for the final scomplex row, if m_A is odd. */ \
    if ( n_left == 1 ) \
    { \
        float ga12 = *gamma12; \
        float si12 = *sigma12; \
        float ga23 = *gamma23; \
        float si23 = *sigma23; \
        scomplex temp1; \
        scomplex temp2; \
        scomplex temp3; \
\
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        alpha1->real = temp1.real * ga12 + temp2.real * si12; \
        alpha2->real = temp2.real * ga12 - temp1.real * si12; \
\
        alpha1->imag = temp1.imag * ga12 + temp2.imag * si12; \
        alpha2->imag = temp2.imag * ga12 - temp1.imag * si12; \
\
        temp2 = *alpha2; \
        temp3 = *alpha3; \
\
        alpha2->real = temp2.real * ga23 + temp3.real * si23; \
        alpha3->real = temp3.real * ga23 - temp2.real * si23; \
\
        alpha2->imag = temp2.imag * ga23 + temp3.imag * si23; \
        alpha3->imag = temp3.imag * ga23 - temp2.imag * si23; \
    } \
}
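/*
   Note on the complex kernels: the rotation scalars gamma and sigma are
   real, so a complex column can be updated with the same arithmetic as a
   real one. Each SSE register holds interleaved { real, imag, ... } data,
   and broadcasting gamma/sigma to every lane scales the real and imaginary
   parts identically; only the element counts change (2 scomplex per __m128
   in asc, 1 dcomplex per __m128d in asz below). The interleaving of the
   alpha3 loads and alpha1 stores with the rotation arithmetic appears
   intended to hide load/store latency behind the multiplies.
*/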
#define MAC_Apply_G_mx3_asz( m_A, \
                             gamma12, \
                             sigma12, \
                             gamma23, \
                             sigma23, \
                             a1, inc_a1, \
                             a2, inc_a2, \
                             a3, inc_a3 ) \
{ \
    int                n_iter = m_A / 8; \
    int                n_left = m_A % 8; \
    int                i; \
\
    const int          step_a1 = inc_a1 * 1; \
    const int          step_a2 = inc_a2 * 1; \
    const int          step_a3 = inc_a3 * 1; \
\
    dcomplex* restrict alpha1 = a1; \
    dcomplex* restrict alpha2 = a2; \
    dcomplex* restrict alpha3 = a3; \
\
    v2df_t             a1v, a2v, a3v; \
    v2df_t             g12v, s12v; \
    v2df_t             g23v, s23v; \
    v2df_t             t1v, t2v; \
\
    g12v.v = _mm_loaddup_pd( gamma12 ); \
    s12v.v = _mm_loaddup_pd( sigma12 ); \
    g23v.v = _mm_loaddup_pd( gamma23 ); \
    s23v.v = _mm_loaddup_pd( sigma23 ); \
\
    /* Main loop: 8 identical unrolled copies, one dcomplex row per copy. */ \
    for ( i = 0; i < n_iter; ++i ) \
    { \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha2 += step_a2; \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha2 += step_a2; \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha2 += step_a2; \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha2 += step_a2; \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha2 += step_a2; \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha2 += step_a2; \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha2 += step_a2; \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha2 += step_a2; \
        alpha3 += step_a3; \
    } \
\
    /* Fringe loop: one dcomplex row per iteration (still vectorized, */ \
    /* since a single dcomplex fills an entire __m128d register). */ \
    for ( i = 0; i < n_left; ++i ) \
    { \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
    } \
}
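/*
   A minimal usage sketch (hypothetical caller; the real drivers are the
   FLA_Apply_G_* routines in libflame). buff_A points to an m x 3
   column-major panel with column stride cs_A and unit row stride; the
   columns must satisfy SSE alignment, since the kernels use aligned
   loads and stores (_mm_load_pd/_mm_store_pd):

       double    gamma12 = 0.8, sigma12 = 0.6;  // cosine/sine of G12
       double    gamma23 = 1.0, sigma23 = 0.0;  // G23 here is the identity
       dcomplex* a1 = buff_A + 0 * cs_A; \
       dcomplex* a2 = buff_A + 1 * cs_A; \
       dcomplex* a3 = buff_A + 2 * cs_A; \

       MAC_Apply_G_mx3_asz( m_A, &gamma12, &sigma12, &gamma23, &sigma23,
                            a1, 1, a2, 1, a3, 1 );
*/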
#endif