libflame — FLA_Apply_G_mx4s_opt.h
/*

   Copyright (C) 2014, The University of Texas at Austin

   This file is part of libflame and is available under the 3-Clause
   BSD license, which can be found in the LICENSE file at the top-level
   directory, or at http://opensource.org/licenses/BSD-3-Clause

*/

11 #define MAC_Apply_G_mx4s_ops( m_A, \
12  gamma23_k1, \
13  sigma23_k1, \
14  gamma34_k1, \
15  sigma34_k1, \
16  gamma12_k2, \
17  sigma12_k2, \
18  gamma23_k2, \
19  sigma23_k2, \
20  a1, inc_a1, \
21  a2, inc_a2, \
22  a3, inc_a3, \
23  a4, inc_a4 ) \
24 { \
25  float ga23_k1 = *gamma23_k1; \
26  float si23_k1 = *sigma23_k1; \
27  float ga34_k1 = *gamma34_k1; \
28  float si34_k1 = *sigma34_k1; \
29  float ga12_k2 = *gamma12_k2; \
30  float si12_k2 = *sigma12_k2; \
31  float ga23_k2 = *gamma23_k2; \
32  float si23_k2 = *sigma23_k2; \
33  float* restrict alpha1 = a1; \
34  float* restrict alpha2 = a2; \
35  float* restrict alpha3 = a3; \
36  float* restrict alpha4 = a4; \
37  float temp1; \
38  float temp2; \
39  float temp3; \
40  float temp4; \
41  int i; \
42 \
43  for ( i = 0; i < m_A; ++i ) \
44  { \
45  temp2 = *alpha2; \
46  temp3 = *alpha3; \
47 \
48  *alpha2 = temp2 * ga23_k1 + temp3 * si23_k1; \
49  *alpha3 = temp3 * ga23_k1 - temp2 * si23_k1; \
50 \
51  temp3 = *alpha3; \
52  temp4 = *alpha4; \
53 \
54  *alpha3 = temp3 * ga34_k1 + temp4 * si34_k1; \
55  *alpha4 = temp4 * ga34_k1 - temp3 * si34_k1; \
56 \
57  temp1 = *alpha1; \
58  temp2 = *alpha2; \
59 \
60  *alpha1 = temp1 * ga12_k2 + temp2 * si12_k2; \
61  *alpha2 = temp2 * ga12_k2 - temp1 * si12_k2; \
62 \
63  temp2 = *alpha2; \
64  temp3 = *alpha3; \
65 \
66  *alpha2 = temp2 * ga23_k2 + temp3 * si23_k2; \
67  *alpha3 = temp3 * ga23_k2 - temp2 * si23_k2; \
68 \
69  alpha1 += inc_a1; \
70  alpha2 += inc_a2; \
71  alpha3 += inc_a3; \
72  alpha4 += inc_a4; \
73  } \
74 }
75 
76 #define MAC_Apply_G_mx4s_opc( m_A, \
77  gamma23_k1, \
78  sigma23_k1, \
79  gamma34_k1, \
80  sigma34_k1, \
81  gamma12_k2, \
82  sigma12_k2, \
83  gamma23_k2, \
84  sigma23_k2, \
85  a1, inc_a1, \
86  a2, inc_a2, \
87  a3, inc_a3, \
88  a4, inc_a4 ) \
89 { \
90  float ga23_k1 = *gamma23_k1; \
91  float si23_k1 = *sigma23_k1; \
92  float ga34_k1 = *gamma34_k1; \
93  float si34_k1 = *sigma34_k1; \
94  float ga12_k2 = *gamma12_k2; \
95  float si12_k2 = *sigma12_k2; \
96  float ga23_k2 = *gamma23_k2; \
97  float si23_k2 = *sigma23_k2; \
98  scomplex* restrict alpha1 = a1; \
99  scomplex* restrict alpha2 = a2; \
100  scomplex* restrict alpha3 = a3; \
101  scomplex* restrict alpha4 = a4; \
102  scomplex temp1; \
103  scomplex temp2; \
104  scomplex temp3; \
105  scomplex temp4; \
106  int i; \
107 \
108  for ( i = 0; i < m_A; ++i ) \
109  { \
110 \
111  temp2 = *alpha2; \
112  temp3 = *alpha3; \
113 \
114  alpha2->real = temp2.real * ga23_k1 + temp3.real * si23_k1; \
115  alpha2->imag = temp2.imag * ga23_k1 + temp3.imag * si23_k1; \
116 \
117  alpha3->real = temp3.real * ga23_k1 - temp2.real * si23_k1; \
118  alpha3->imag = temp3.imag * ga23_k1 - temp2.imag * si23_k1; \
119 \
120  temp3 = *alpha3; \
121  temp4 = *alpha4; \
122 \
123  alpha3->real = temp3.real * ga34_k1 + temp4.real * si34_k1; \
124  alpha3->imag = temp3.imag * ga34_k1 + temp4.imag * si34_k1; \
125 \
126  alpha4->real = temp4.real * ga34_k1 - temp3.real * si34_k1; \
127  alpha4->imag = temp4.imag * ga34_k1 - temp3.imag * si34_k1; \
128 \
129  temp1 = *alpha1; \
130  temp2 = *alpha2; \
131 \
132  alpha1->real = temp1.real * ga12_k2 + temp2.real * si12_k2; \
133  alpha1->imag = temp1.imag * ga12_k2 + temp2.imag * si12_k2; \
134 \
135  alpha2->real = temp2.real * ga12_k2 - temp1.real * si12_k2; \
136  alpha2->imag = temp2.imag * ga12_k2 - temp1.imag * si12_k2; \
137 \
138  temp2 = *alpha2; \
139  temp3 = *alpha3; \
140 \
141  alpha2->real = temp2.real * ga23_k2 + temp3.real * si23_k2; \
142  alpha2->imag = temp2.imag * ga23_k2 + temp3.imag * si23_k2; \
143 \
144  alpha3->real = temp3.real * ga23_k2 - temp2.real * si23_k2; \
145  alpha3->imag = temp3.imag * ga23_k2 - temp2.imag * si23_k2; \
146 \
147  alpha1 += inc_a1; \
148  alpha2 += inc_a2; \
149  alpha3 += inc_a3; \
150  alpha4 += inc_a4; \
151  } \
152 }
153 
154 #define MAC_Apply_G_mx4s_opd( m_A, \
155  gamma23_k1, \
156  sigma23_k1, \
157  gamma34_k1, \
158  sigma34_k1, \
159  gamma12_k2, \
160  sigma12_k2, \
161  gamma23_k2, \
162  sigma23_k2, \
163  a1, inc_a1, \
164  a2, inc_a2, \
165  a3, inc_a3, \
166  a4, inc_a4 ) \
167 { \
168  double ga23_k1 = *gamma23_k1; \
169  double si23_k1 = *sigma23_k1; \
170  double ga34_k1 = *gamma34_k1; \
171  double si34_k1 = *sigma34_k1; \
172  double ga12_k2 = *gamma12_k2; \
173  double si12_k2 = *sigma12_k2; \
174  double ga23_k2 = *gamma23_k2; \
175  double si23_k2 = *sigma23_k2; \
176  double* restrict alpha1 = a1; \
177  double* restrict alpha2 = a2; \
178  double* restrict alpha3 = a3; \
179  double* restrict alpha4 = a4; \
180  double temp1; \
181  double temp2; \
182  double temp3; \
183  double temp4; \
184  int i; \
185 \
186  for ( i = 0; i < m_A; ++i ) \
187  { \
188  temp2 = *alpha2; \
189  temp3 = *alpha3; \
190 \
191  *alpha2 = temp2 * ga23_k1 + temp3 * si23_k1; \
192  *alpha3 = temp3 * ga23_k1 - temp2 * si23_k1; \
193 \
194  temp3 = *alpha3; \
195  temp4 = *alpha4; \
196 \
197  *alpha3 = temp3 * ga34_k1 + temp4 * si34_k1; \
198  *alpha4 = temp4 * ga34_k1 - temp3 * si34_k1; \
199 \
200  temp1 = *alpha1; \
201  temp2 = *alpha2; \
202 \
203  *alpha1 = temp1 * ga12_k2 + temp2 * si12_k2; \
204  *alpha2 = temp2 * ga12_k2 - temp1 * si12_k2; \
205 \
206  temp2 = *alpha2; \
207  temp3 = *alpha3; \
208 \
209  *alpha2 = temp2 * ga23_k2 + temp3 * si23_k2; \
210  *alpha3 = temp3 * ga23_k2 - temp2 * si23_k2; \
211 \
212  alpha1 += inc_a1; \
213  alpha2 += inc_a2; \
214  alpha3 += inc_a3; \
215  alpha4 += inc_a4; \
216  } \
217 }
218 
219 #define MAC_Apply_G_mx4s_opz( m_A, \
220  gamma23_k1, \
221  sigma23_k1, \
222  gamma34_k1, \
223  sigma34_k1, \
224  gamma12_k2, \
225  sigma12_k2, \
226  gamma23_k2, \
227  sigma23_k2, \
228  a1, inc_a1, \
229  a2, inc_a2, \
230  a3, inc_a3, \
231  a4, inc_a4 ) \
232 { \
233  double ga23_k1 = *gamma23_k1; \
234  double si23_k1 = *sigma23_k1; \
235  double ga34_k1 = *gamma34_k1; \
236  double si34_k1 = *sigma34_k1; \
237  double ga12_k2 = *gamma12_k2; \
238  double si12_k2 = *sigma12_k2; \
239  double ga23_k2 = *gamma23_k2; \
240  double si23_k2 = *sigma23_k2; \
241  dcomplex* restrict alpha1 = a1; \
242  dcomplex* restrict alpha2 = a2; \
243  dcomplex* restrict alpha3 = a3; \
244  dcomplex* restrict alpha4 = a4; \
245  dcomplex temp1; \
246  dcomplex temp2; \
247  dcomplex temp3; \
248  dcomplex temp4; \
249  int i; \
250 \
251  for ( i = 0; i < m_A; ++i ) \
252  { \
253 \
254  temp2 = *alpha2; \
255  temp3 = *alpha3; \
256 \
257  alpha2->real = temp2.real * ga23_k1 + temp3.real * si23_k1; \
258  alpha2->imag = temp2.imag * ga23_k1 + temp3.imag * si23_k1; \
259 \
260  alpha3->real = temp3.real * ga23_k1 - temp2.real * si23_k1; \
261  alpha3->imag = temp3.imag * ga23_k1 - temp2.imag * si23_k1; \
262 \
263  temp3 = *alpha3; \
264  temp4 = *alpha4; \
265 \
266  alpha3->real = temp3.real * ga34_k1 + temp4.real * si34_k1; \
267  alpha3->imag = temp3.imag * ga34_k1 + temp4.imag * si34_k1; \
268 \
269  alpha4->real = temp4.real * ga34_k1 - temp3.real * si34_k1; \
270  alpha4->imag = temp4.imag * ga34_k1 - temp3.imag * si34_k1; \
271 \
272  temp1 = *alpha1; \
273  temp2 = *alpha2; \
274 \
275  alpha1->real = temp1.real * ga12_k2 + temp2.real * si12_k2; \
276  alpha1->imag = temp1.imag * ga12_k2 + temp2.imag * si12_k2; \
277 \
278  alpha2->real = temp2.real * ga12_k2 - temp1.real * si12_k2; \
279  alpha2->imag = temp2.imag * ga12_k2 - temp1.imag * si12_k2; \
280 \
281  temp2 = *alpha2; \
282  temp3 = *alpha3; \
283 \
284  alpha2->real = temp2.real * ga23_k2 + temp3.real * si23_k2; \
285  alpha2->imag = temp2.imag * ga23_k2 + temp3.imag * si23_k2; \
286 \
287  alpha3->real = temp3.real * ga23_k2 - temp2.real * si23_k2; \
288  alpha3->imag = temp3.imag * ga23_k2 - temp2.imag * si23_k2; \
289 \
290  alpha1 += inc_a1; \
291  alpha2 += inc_a2; \
292  alpha3 += inc_a3; \
293  alpha4 += inc_a4; \
294  } \
295 }
296