/* Macros for applying a single Givens rotation, given by the real scalars
   gamma12 and sigma12, to a pair of m_A-element vectors a1 and a2:
   a1 := gamma12 * a1 + sigma12 * a2;  a2 := gamma12 * a2 - sigma12 * a1. */

#if FLA_VECTOR_INTRINSIC_TYPE == FLA_NO_INTRINSICS

/* Without vector intrinsics, the SSE ("as") kernels alias the plain C ("op") kernels. */
#define MAC_Apply_G_mx2_ass MAC_Apply_G_mx2_ops
#define MAC_Apply_G_mx2_asd MAC_Apply_G_mx2_opd
#define MAC_Apply_G_mx2_asc MAC_Apply_G_mx2_opc
#define MAC_Apply_G_mx2_asz MAC_Apply_G_mx2_opz

#elif FLA_VECTOR_INTRINSIC_TYPE == FLA_SSE_INTRINSICS

/* Single-precision real kernel: four floats per __m128, main loop unrolled 8x
   (32 rows per iteration), followed by a 4-row SSE cleanup loop and a scalar
   remainder loop. */
#define MAC_Apply_G_mx2_ass( m_A, \
                             gamma12, \
                             sigma12, \
                             a1, inc_a1, \
                             a2, inc_a2 ) \
{ \
    int              n_iter32 = m_A / ( 4 * 8 ); \
    int              n_left32 = m_A % ( 4 * 8 ); \
    int              n_iter4  = n_left32 / ( 4 * 1 ); \
    int              n_left   = n_left32 % ( 4 * 1 ); \
    int              i; \
\
    const int        step_a1 = inc_a1 * 4; \
    const int        step_a2 = inc_a2 * 4; \
\
    float*  restrict alpha1 = a1; \
    float*  restrict alpha2 = a2; \
\
    v4sf_t           a1v, a2v; \
    v4sf_t           g12v, s12v; \
    v4sf_t           t1v; \
\
    g12v.v = _mm_load1_ps( gamma12 ); \
    s12v.v = _mm_load1_ps( sigma12 ); \
\
    for ( i = 0; i < n_iter32; ++i ) \
    { \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    for ( i = 0; i < n_iter4; ++i ) \
    { \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    for ( i = 0; i < n_left; ++i ) \
    { \
        float ga12  = *gamma12; \
        float si12  = *sigma12; \
        float temp1 = *alpha1; \
        float temp2 = *alpha2; \
\
        *alpha1 = temp1 * ga12 + temp2 * si12; \
        *alpha2 = temp2 * ga12 - temp1 * si12; \
\
        alpha1 += inc_a1; \
        alpha2 += inc_a2; \
    } \
}
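For reference, the update performed by MAC_Apply_G_mx2_ass above (in its SSE-unrolled loops as well as its scalar tail) is the same Givens-rotation update applied one row at a time. The following stand-alone sketch is illustrative only; the function name apply_g_mx2_ref_s is not part of libflame.

static void apply_g_mx2_ref_s( int m_A, const float* gamma12, const float* sigma12,
                               float* a1, int inc_a1,
                               float* a2, int inc_a2 )
{
    float ga12 = *gamma12;
    float si12 = *sigma12;
    int   i;

    for ( i = 0; i < m_A; ++i )
    {
        /* Rotate the i-th entries of the two columns. */
        float temp1 = a1[ i * inc_a1 ];
        float temp2 = a2[ i * inc_a2 ];

        a1[ i * inc_a1 ] = temp1 * ga12 + temp2 * si12;
        a2[ i * inc_a2 ] = temp2 * ga12 - temp1 * si12;
    }
}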
/* Double-precision real kernel: two doubles per __m128d, main loop unrolled 8x
   (16 rows per iteration), followed by a 2-row SSE cleanup loop and a scalar
   remainder (at most one row). */
#define MAC_Apply_G_mx2_asd( m_A, \
                             gamma12, \
                             sigma12, \
                             a1, inc_a1, \
                             a2, inc_a2 ) \
{ \
    int              n_iter16 = m_A / ( 2 * 8 ); \
    int              n_left16 = m_A % ( 2 * 8 ); \
    int              n_iter2  = n_left16 / ( 2 * 1 ); \
    int              n_left   = n_left16 % ( 2 * 1 ); \
    int              i; \
\
    const int        step_a1 = inc_a1 * 2; \
    const int        step_a2 = inc_a2 * 2; \
\
    double* restrict alpha1 = a1; \
    double* restrict alpha2 = a2; \
\
    v2df_t           a1v, a2v; \
    v2df_t           g12v, s12v; \
    v2df_t           t1v; \
\
    g12v.v = _mm_loaddup_pd( gamma12 ); \
    s12v.v = _mm_loaddup_pd( sigma12 ); \
\
    for ( i = 0; i < n_iter16; ++i ) \
    { \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    for ( i = 0; i < n_iter2; ++i ) \
    { \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    if ( n_left == 1 ) \
    { \
        double ga12  = *gamma12; \
        double si12  = *sigma12; \
        double temp1 = *alpha1; \
        double temp2 = *alpha2; \
\
        *alpha1 = temp1 * ga12 + temp2 * si12; \
        *alpha2 = temp2 * ga12 - temp1 * si12; \
    } \
}
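A sketch of how a caller might invoke the double-precision macro above on two adjacent columns of a column-major matrix. The wrapper name and its arguments are hypothetical; because the macro uses the aligned _mm_load_pd/_mm_store_pd intrinsics, a1 and a2 are assumed to be 16-byte aligned, and gamma/sigma are passed by address to match the macro's pointer parameters.

static void apply_givens_to_column_pair_d( int m, double gamma, double sigma,
                                           double* a1, double* a2 )
{
    /* a1 and a2 are m-element columns stored contiguously (unit row stride). */
    MAC_Apply_G_mx2_asd( m, &gamma, &sigma,
                         a1, 1,
                         a2, 1 );
}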
/* Single-precision complex kernel: gamma12 and sigma12 are real, so the data
   is treated as packed floats; one __m128 holds two scomplex elements. Main
   loop unrolled 8x (16 rows per iteration), then a 2-row SSE cleanup loop and
   a scalar remainder (at most one row). */
#define MAC_Apply_G_mx2_asc( m_A, \
                             gamma12, \
                             sigma12, \
                             a1, inc_a1, \
                             a2, inc_a2 ) \
{ \
    int                n_iter16 = m_A / ( 2 * 8 ); \
    int                n_left16 = m_A % ( 2 * 8 ); \
    int                n_iter2  = n_left16 / ( 2 * 1 ); \
    int                n_left   = n_left16 % ( 2 * 1 ); \
    int                i; \
\
    const int          step_a1 = inc_a1 * 2; \
    const int          step_a2 = inc_a2 * 2; \
\
    scomplex* restrict alpha1 = a1; \
    scomplex* restrict alpha2 = a2; \
\
    v4sf_t             a1v, a2v; \
    v4sf_t             g12v, s12v; \
    v4sf_t             t1v; \
\
    g12v.v = _mm_load1_ps( gamma12 ); \
    s12v.v = _mm_load1_ps( sigma12 ); \
\
    for ( i = 0; i < n_iter16; ++i ) \
    { \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    for ( i = 0; i < n_iter2; ++i ) \
    { \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    if ( n_left == 1 ) \
    { \
        float    ga12  = *gamma12; \
        float    si12  = *sigma12; \
        scomplex temp1 = *alpha1; \
        scomplex temp2 = *alpha2; \
\
        alpha1->real = temp1.real * ga12 + temp2.real * si12; \
        alpha2->real = temp2.real * ga12 - temp1.real * si12; \
\
        alpha1->imag = temp1.imag * ga12 + temp2.imag * si12; \
        alpha2->imag = temp2.imag * ga12 - temp1.imag * si12; \
    } \
}
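Both complex kernels work only because the rotation coefficients are real: gamma12 and sigma12 scale the real and imaginary parts identically, so the buffers can be reinterpreted as packed floats or doubles. A small layout sketch, assuming libflame's scomplex is a struct of two floats and that the SSE intrinsics header used by the kernels above is already in scope (the unaligned load is used here only to keep the snippet self-contained):

static void scomplex_layout_demo( void )
{
    scomplex x[ 2 ] = { { 1.0f, 2.0f }, { 3.0f, 4.0f } };
    __m128   v      = _mm_loadu_ps( ( float* )x );

    /* v = { x[0].real, x[0].imag, x[1].real, x[1].imag }: one __m128 covers two
       scomplex entries (hence step_a1 = inc_a1 * 2 above), whereas one __m128d
       covers a single dcomplex (hence inc * 1 in the kernel below). */
    ( void )v;
}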
/* Double-precision complex kernel: one __m128d holds exactly one dcomplex
   (real and imaginary parts), so the pointers advance by inc * 1 and the main
   loop covers 8 rows per iteration; the remainder loop is also SSE. */
#define MAC_Apply_G_mx2_asz( m_A, \
                             gamma12, \
                             sigma12, \
                             a1, inc_a1, \
                             a2, inc_a2 ) \
{ \
    int                n_iter = m_A / 8; \
    int                n_left = m_A % 8; \
    int                i; \
\
    const int          step_a1 = inc_a1 * 1; \
    const int          step_a2 = inc_a2 * 1; \
\
    dcomplex* restrict alpha1 = a1; \
    dcomplex* restrict alpha2 = a2; \
\
    v2df_t             a1v, a2v; \
    v2df_t             g12v, s12v; \
    v2df_t             t1v; \
\
    g12v.v = _mm_loaddup_pd( gamma12 ); \
    s12v.v = _mm_loaddup_pd( sigma12 ); \
\
    for ( i = 0; i < n_iter; ++i ) \
    { \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    for ( i = 0; i < n_left; ++i ) \
    { \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \