/*
 * LALPulsar 7.1.1.1-eeff03c
 * gc_hotloop_sse2.h
 * (Source listing; see the generated documentation of this file.)
 */
/*
 * Forward declarations of the three kernels defined further down in this file.
 * Each one is declared "static inline" and tagged with the GCC "hot" attribute.
 *
 *  gc_hotloop                 adds cgrid2F into fgrid2F and bumps the per-bin
 *                             number count fgridnc against TwoFthreshold
 *  gc_hotloop_no_nc           adds cgrid2F into fgrid2F only
 *  gc_hotloop_2Fmax_tracking  adds cgrid2F into fgrid2F and records the per-bin
 *                             maximum value and the index k at which it occurred
 */
static inline void gc_hotloop( REAL4 *fgrid2F, REAL4 *cgrid2F, UCHAR *fgridnc, REAL4 TwoFthreshold, UINT4 length ) __attribute__( ( hot ) );
static inline void gc_hotloop_no_nc( REAL4 *fgrid2F, REAL4 *cgrid2F, UINT4 length ) __attribute__( ( hot ) );
static inline void gc_hotloop_2Fmax_tracking( REAL4 *fgrid2F, REAL4 *fgrid2Fmax, UINT4 *fgrid2FmaxIdx, REAL4 *cgrid2F, UINT4 k, UINT4 length ) __attribute__( ( hot ) );

/*
 * Platform shim providing ALRealloc() / ALFree().
 *
 *  - Apple:  ALRealloc -> LALRealloc, ALFree -> LALFree
 *  - MinGW:  ALRealloc(p,s) -> __mingw_aligned_realloc(p,s,16),
 *            ALFree -> __mingw_aligned_free
 *  - other:  ALRealloc is the posix_memalign()-based helper below (16-byte
 *            alignment), ALFree -> free
 *
 * NOTE(review): presumably these allocate the buffers that the kernels in this
 * file access with aligned MOVAPS loads/stores -- confirm against the callers.
 */
#ifdef __APPLE__

/* Apple's gcc aligns ok */
#define ALRealloc LALRealloc
#define ALFree LALFree

#elif defined (__MINGW32__)

extern void *__mingw_aligned_realloc( void *ptr, size_t size, size_t align );
#define ALRealloc(p,s) __mingw_aligned_realloc(p,s,16)
#define ALFree __mingw_aligned_free

#else // neither APPLE nor MinGW

#include <stdlib.h>

#define ALFree free

static void *ALRealloc( void *ptr, size_t size );

/*
 * Release ptr and return a fresh block of size bytes aligned to 16 bytes.
 *
 * Unlike realloc(), the previous contents are NOT preserved: in our case there
 * is no need to keep the data, so we can simply do a free() and an aligned
 * allocation.  The old block is released even when the new allocation fails.
 *
 * ptr   block previously returned by ALRealloc(), or NULL
 * size  number of bytes requested
 *
 * Returns the new block, or NULL if posix_memalign() reports an error.
 */
void *ALRealloc( void *ptr, size_t size )
{
  void *aligned = NULL;

  /* free(NULL) is a no-op, so no guard is needed */
  free( ptr );

  /* use a separate out-variable so the freed pointer is never touched again */
  if ( posix_memalign( &aligned, 16, size ) ) {
    return NULL;
  }
  return aligned;
}

#endif

/**
 * Add the coarse-grid values of segment k to the fine-grid sums and keep track,
 * per bin, of the largest single-segment value seen so far and of the segment
 * index at which it occurred.
 *
 * For k == 0 the three fine-grid arrays are simply initialised from cgrid2F
 * (sum = max = cgrid2F, index = 0); this assumes the function is called in
 * ascending order of segments.
 *
 * For k > 0, for every bin i:
 *   fgrid2F[i] += cgrid2F[i];
 *   if ( cgrid2F[i] >= fgrid2Fmax[i] ) { fgrid2Fmax[i] = cgrid2F[i]; fgrid2FmaxIdx[i] = k; }
 * A tie therefore moves the index to the later segment.
 *
 * The bulk of the bins is handled 16 at a time, either by hand-written SSE
 * assembly or, when EXP_NO_ASM is defined, by an equivalent C loop; the
 * remaining bins are handled by a scalar loop.  All three paths select the new
 * maximum and the new index from the same comparison.
 *
 * The assembly path uses aligned loads/stores (MOVAPS) on fgrid2F, fgrid2Fmax
 * and fgrid2FmaxIdx, so those pointers must be 16-byte aligned; cgrid2F is
 * read with MOVUPS and may be unaligned.
 *
 * \param fgrid2F        [in,out] fine-grid sums, length entries
 * \param fgrid2Fmax     [in,out] per-bin maximum over the segments seen so far
 * \param fgrid2FmaxIdx  [in,out] per-bin segment index of that maximum
 * \param cgrid2F        [in]     values of segment k, length entries
 * \param k              [in]     segment index
 * \param length         [in]     number of bins
 */
void gc_hotloop_2Fmax_tracking( REAL4 *fgrid2F, REAL4 *fgrid2Fmax, UINT4 *fgrid2FmaxIdx, REAL4 *cgrid2F, UINT4 k, UINT4 length )
{

  UINT4 ifreq_fg;
  int newMax;

#ifndef EXP_NO_ASM
  /* constant vectors for the assembly path: four copies of k, and an all-ones mask */
  UINT4 VIIII[4] __attribute__( ( aligned( 16 ) ) ) = { k, k, k, k };
  UINT4 V1111[4] __attribute__( ( aligned( 16 ) ) ) = { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff};
#endif

  /* if this is the first segment (seg 0), then all we need to do is copy the cg vector into the fg
   * vector and initialize the loudest segment data to also the first segment. This assumes
   * we are calling this function in ascending order of segments */

  if ( k == 0 ) {
    memcpy( fgrid2F, cgrid2F, sizeof( REAL4 )*length );
    memcpy( fgrid2Fmax, cgrid2F, sizeof( REAL4 )*length );
    memset( fgrid2FmaxIdx, 0, sizeof( UINT4 )*length );

    return;
  }

  /* main loop: 16 bins per pass; every branch advances all four pointers by 16 */
  for ( ifreq_fg = 0 ; ifreq_fg + 16 < length; ifreq_fg += 16 ) {
#ifdef EXP_NO_ASM

#pragma ivdep
    for ( int j = 0 ; j < 16; j++ ) {
      fgrid2F[0] += cgrid2F[0] ;

      /* one comparison drives both the value and the index update */
      newMax = ( cgrid2F[0] >= fgrid2Fmax[0] );
      fgrid2Fmax[0] = newMax ? cgrid2F[0] : fgrid2Fmax[0];
      fgrid2FmaxIdx[0] = newMax ? k : fgrid2FmaxIdx[0];
      fgrid2F++;
      cgrid2F++;
      fgrid2Fmax++;
      fgrid2FmaxIdx++;
    }

#else

    /* xmm0 = {k,k,k,k}, xmm1 = all-ones; each group of four bins then does:
     *   mask = ( max <= cg );  sum += cg;
     *   max  = ( cg & mask ) | ( max & ~mask );
     *   idx  = ( k  & mask ) | ( idx & ~mask );                              */
    __asm __volatile(
      "MOVAPS %[Vk],%%xmm0 \n\t"
      "MOVAPS %[V1],%%xmm1 \n\t"
      /* iteration 0,1,2,3 */
      "MOVUPS (%[cg2F]),%%xmm2 \n\t" /* load coarse grid values, possibly unaligned */
      "MOVAPS (%[fg2F]),%%xmm3 \n\t"
      "MOVAPS (%[fg2Fmax]),%%xmm4 \n\t"
      "MOVAPS (%[fg2FmaxIdx]),%%xmm5 \n\t"
      /* create mask with comparison result of former max 2F */
      "MOVAPS %%xmm4,%%xmm7 \n\t"
      "CMPLEPS %%xmm2,%%xmm7 \n\t" /* -1 if previous 2Fmax is <= coarse grid value */
      /* summing */
      "ADDPS %%xmm2,%%xmm3 \n\t" /* Add four coarse grid 2F values to fine grid sums */
      "MOVAPS %%xmm3,(%[fg2F]) \n\t" /* store 4 values in fine grid 2F sum array */
      "MOVAPS %%xmm0,%%xmm6 \n\t"
      "ANDPS %%xmm7,%%xmm2 \n\t"
      "ANDPS %%xmm7,%%xmm6 \n\t"

      /* negate the bitmask */
      "XORPS %%xmm1,%%xmm7 \n\t"
      "ANDPS %%xmm7,%%xmm4 \n\t"
      "ANDPS %%xmm7,%%xmm5 \n\t"
      /*get the new entries for the max 2F values by ORing the masked values */
      "ORPS %%xmm2,%%xmm4 \n\t"
      "ORPS %%xmm6,%%xmm5 \n\t"
      /* write back */
      "MOVAPS %%xmm4,(%[fg2Fmax]) \n\t"
      "MOVAPS %%xmm5,(%[fg2FmaxIdx]) \n\t"

      /* iteration 4,5,6,7 */

      "MOVUPS 0x10(%[cg2F]),%%xmm2 \n\t" /* load coarse grid values, possibly unaligned */
      "MOVAPS 0x10(%[fg2F]),%%xmm3 \n\t"
      "MOVAPS 0x10(%[fg2Fmax]),%%xmm4 \n\t"
      "MOVAPS 0x10(%[fg2FmaxIdx]),%%xmm5 \n\t"
      /* create mask with comparison result of former max 2F */
      "MOVAPS %%xmm4,%%xmm7 \n\t"
      "CMPLEPS %%xmm2,%%xmm7 \n\t" /* -1 if previous 2Fmax is <= coarse grid value */
      /* summing */
      "ADDPS %%xmm2,%%xmm3 \n\t" /* Add four coarse grid 2F values to fine grid sums */
      "MOVAPS %%xmm3,0x10(%[fg2F]) \n\t" /* store 4 values in fine grid 2F sum array */
      "MOVAPS %%xmm0,%%xmm6 \n\t"
      "ANDPS %%xmm7,%%xmm2 \n\t"
      "ANDPS %%xmm7,%%xmm6 \n\t"

      /* negate the bitmask */
      "XORPS %%xmm1,%%xmm7 \n\t"
      "ANDPS %%xmm7,%%xmm4 \n\t"
      "ANDPS %%xmm7,%%xmm5 \n\t"
      /*get the new entries for the max 2F values by ORing the masked values */
      "ORPS %%xmm2,%%xmm4 \n\t"
      "ORPS %%xmm6,%%xmm5 \n\t"
      /* write back */
      "MOVAPS %%xmm4,0x10(%[fg2Fmax]) \n\t"
      "MOVAPS %%xmm5,0x10(%[fg2FmaxIdx]) \n\t"

      /* iteration 8,9,10,11 */

      "MOVUPS 0x20(%[cg2F]),%%xmm2 \n\t" /* load coarse grid values, possibly unaligned */
      "MOVAPS 0x20(%[fg2F]),%%xmm3 \n\t"
      "MOVAPS 0x20(%[fg2Fmax]),%%xmm4 \n\t"
      "MOVAPS 0x20(%[fg2FmaxIdx]),%%xmm5 \n\t"
      /* create mask with comparison result of former max 2F */
      "MOVAPS %%xmm4,%%xmm7 \n\t"
      "CMPLEPS %%xmm2,%%xmm7 \n\t" /* -1 if previous 2Fmax is <= coarse grid value */
      /* summing */
      "ADDPS %%xmm2,%%xmm3 \n\t" /* Add four coarse grid 2F values to fine grid sums */
      "MOVAPS %%xmm3,0x20(%[fg2F]) \n\t" /* store 4 values in fine grid 2F sum array */
      "MOVAPS %%xmm0,%%xmm6 \n\t"
      "ANDPS %%xmm7,%%xmm2 \n\t"
      "ANDPS %%xmm7,%%xmm6 \n\t"

      /* negate the bitmask */
      "XORPS %%xmm1,%%xmm7 \n\t"
      "ANDPS %%xmm7,%%xmm4 \n\t"
      "ANDPS %%xmm7,%%xmm5 \n\t"
      /*get the new entries for the max 2F values by ORing the masked values */
      "ORPS %%xmm2,%%xmm4 \n\t"
      "ORPS %%xmm6,%%xmm5 \n\t"
      /* write back */
      "MOVAPS %%xmm4,0x20(%[fg2Fmax]) \n\t"
      "MOVAPS %%xmm5,0x20(%[fg2FmaxIdx]) \n\t"

      /* iteration 12,13,14,15 */

      "MOVUPS 0x30(%[cg2F]),%%xmm2 \n\t" /* load coarse grid values, possibly unaligned */
      "MOVAPS 0x30(%[fg2F]),%%xmm3 \n\t"
      "MOVAPS 0x30(%[fg2Fmax]),%%xmm4 \n\t"
      "MOVAPS 0x30(%[fg2FmaxIdx]),%%xmm5 \n\t"
      /* create mask with comparison result of former max 2F */
      "MOVAPS %%xmm4,%%xmm7 \n\t"
      "CMPLEPS %%xmm2,%%xmm7 \n\t" /* -1 if previous 2Fmax is <= coarse grid value */
      /* summing */
      "ADDPS %%xmm2,%%xmm3 \n\t" /* Add four coarse grid 2F values to fine grid sums */
      "MOVAPS %%xmm3,0x30(%[fg2F]) \n\t" /* store 4 values in fine grid 2F sum array */
      "MOVAPS %%xmm0,%%xmm6 \n\t"
      "ANDPS %%xmm7,%%xmm2 \n\t"
      "ANDPS %%xmm7,%%xmm6 \n\t"

      /* negate the bitmask */
      "XORPS %%xmm1,%%xmm7 \n\t"
      "ANDPS %%xmm7,%%xmm4 \n\t"
      "ANDPS %%xmm7,%%xmm5 \n\t"
      /*get the new entries for the max 2F values by ORing the masked values */
      "ORPS %%xmm2,%%xmm4 \n\t"
      "ORPS %%xmm6,%%xmm5 \n\t"
      /* write back */
      "MOVAPS %%xmm4,0x30(%[fg2Fmax]) \n\t"
      "MOVAPS %%xmm5,0x30(%[fg2FmaxIdx]) \n\t"

      /* ---------------------------------------------------*/
      :
      /* output */

      :
      /* input */
      [cg2F] "r"( cgrid2F ),
      [fg2F] "r"( fgrid2F )

      ,

      [fg2Fmax] "r"( fgrid2Fmax ),
      [fg2FmaxIdx] "r"( fgrid2FmaxIdx ),

      [Vk] "m"( VIIII[0] ),
      [V1] "m"( V1111[0] )

      : /* clobbered */
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory"

    ) ;

    fgrid2F += 16;
    cgrid2F += 16;
    fgrid2Fmax += 16;
    fgrid2FmaxIdx += 16;

#endif // EXP_No_ASM

  }

  /* take care of remaining iterations, length modulo 16 */
  for ( ; ifreq_fg < length; ifreq_fg++ ) {

    fgrid2F[0] += cgrid2F[0] ;

    /* same mask-select semantics as the vector path above */
    newMax = ( cgrid2F[0] >= fgrid2Fmax[0] );
    fgrid2Fmax[0] = newMax ? cgrid2F[0] : fgrid2Fmax[0];
    fgrid2FmaxIdx[0] = newMax ? k : fgrid2FmaxIdx[0];
    fgrid2F++;
    cgrid2F++;
    fgrid2Fmax++;
    fgrid2FmaxIdx++;

  } /* for( ifreq_fg = 0; ifreq_fg < finegrid.freqlength; ifreq_fg++ ) { */

}

/**
 * Add the coarse-grid values to the fine-grid sums and increment the per-bin
 * number count for bins whose coarse-grid value passes the threshold.
 *
 * The bulk of the bins is handled 16 at a time, either by hand-written SSE/SSE2
 * assembly or, when EXP_NO_ASM is defined, by an equivalent C loop; the
 * remaining bins are handled by a scalar loop.  In every branch the three
 * pointers are advanced by exactly 16 per pass of the main loop.
 *
 * The assembly path uses aligned loads/stores (MOVAPS) on fgrid2F and fgridnc,
 * so those pointers must be 16-byte aligned; cgrid2F is read with MOVUPS and
 * may be unaligned.
 *
 * NOTE(review): the C paths count a bin when TwoFthreshold < cgrid2F[i], while
 * the assembly path uses CMPLEPS, i.e. TwoFthreshold <= cgrid2F[i].  The two
 * differ only for values exactly equal to the threshold; left unchanged here.
 *
 * \param fgrid2F        [in,out] fine-grid sums, length entries
 * \param cgrid2F        [in]     coarse-grid values, length entries
 * \param fgridnc        [in,out] per-bin number count (unsigned bytes)
 * \param TwoFthreshold  [in]     threshold for the number count
 * \param length         [in]     number of bins
 */
void gc_hotloop( REAL4 *fgrid2F, REAL4 *cgrid2F, UCHAR *fgridnc, REAL4 TwoFthreshold, UINT4 length )
{
  UINT4 ifreq_fg;

#ifndef EXP_NO_ASM
  /* four copies of the threshold for the packed comparison in the assembly path */
  REAL4 VTTTT[4] __attribute__( ( aligned( 16 ) ) ) = { TwoFthreshold, TwoFthreshold, TwoFthreshold, TwoFthreshold };
#endif

  for ( ifreq_fg = 0 ; ifreq_fg + 16 < length; ifreq_fg += 16 ) {
    /* unrolled loop (16 iterations of original loop) */
#ifdef EXP_NO_ASM

    /* index by j and leave the pointers alone: they are advanced once, below,
     * for both branches (advancing them here as well would skip 16 bins) */
#pragma ivdep
    for ( int j = 0 ; j < 16; j++ ) {
      fgrid2F[j] += cgrid2F[j] ;

      fgridnc[j] += ( TwoFthreshold < cgrid2F[j] );
    }

#else
    __asm __volatile(
      "MOVUPS (%[cg2F]),%%xmm2 \n\t" /* load coarse grid values, possibly unaligned */
      "MOVAPS (%[fg2F]),%%xmm3 \n\t"

      "MOVAPS %[Vthresh2F],%%xmm7 \n\t"

      "MOVUPS 0x10(%[cg2F]),%%xmm4 \n\t"
      "MOVUPS 0x20(%[cg2F]),%%xmm5 \n\t"
      "MOVUPS 0x30(%[cg2F]),%%xmm6 \n\t"

      /* Loop iterations 1...4 */

      "ADDPS %%xmm2,%%xmm3 \n\t" /* Add four coarse grid 2F values to fine grid sums */

      "MOVAPS (%[fgnc]),%%xmm1 \n\t" /* vector of 16 (!) number count values (unsigned bytes) */

      "MOVAPS %%xmm3,(%[fg2F]) \n\t" /* store 4 values in fine grid 2F sum array */

      "MOVAPS %%xmm7,%%xmm3 \n\t"
      "CMPLEPS %%xmm2,%%xmm3 \n\t" /* compare the four coarse grid 2F values to four */
      /* copies of threshold value in parallel */
      /* result is a vector of 4 integer values: */
      /* -1 if TwoFthreshold <= cgrid2F[i] */
      /* 0 otherwise */
      /* (saved in xmm3 for later processing) */

      /* Loop iterations 5...8 (same as above) */

      "MOVAPS 0x10(%[fg2F]),%%xmm2 \n\t"
      "ADDPS %%xmm4,%%xmm2 \n\t"
      "MOVAPS %%xmm2,0x10(%[fg2F]) \n\t"

      "MOVAPS %%xmm7,%%xmm0 \n\t"
      "CMPLEPS %%xmm4,%%xmm0 \n\t"

      "PACKSSDW %%xmm0,%%xmm3 \n\t" /* combine two vectors of 4 double words (0/-1) */
      /* to a vector of 8 words of 0/-1 in %%xmm3 */

      /* Loop iterations 9...12 (same as above) */

      "MOVAPS 0x20(%[fg2F]),%%xmm4 \n\t"
      "ADDPS %%xmm5,%%xmm4 \n\t"
      "MOVAPS %%xmm4,0x20(%[fg2F]) \n\t"

      "MOVAPS %%xmm7,%%xmm4 \n\t"
      "CMPLEPS %%xmm5,%%xmm4 \n\t"

      /* Loop iterations 13...16 (same as above) */

      "MOVAPS 0x30(%[fg2F]),%%xmm2 \n\t"
      "ADDPS %%xmm6,%%xmm2 \n\t"
      "MOVAPS %%xmm2,0x30(%[fg2F]) \n\t"

      "MOVAPS %%xmm7,%%xmm0 \n\t"
      "CMPLEPS %%xmm6,%%xmm0 \n\t"

      "PACKSSDW %%xmm0,%%xmm4 \n\t" /* 8 words of 0/-1 in %%xmm4 */

      "PACKSSWB %%xmm4,%%xmm3 \n\t" /* 16 bytes of 0/-1 in %%xmm3 */

      "PSUBB %%xmm3, %%xmm1 \n\t" /* subtracting vector from number count vector */
      "MOVAPS %%xmm1,(%[fgnc]) \n\t" /* to increment number count if threshold reached */

      /* ---------------------------------------------------*/
      :
      /* output */

      :
      /* input */
      [cg2F] "r"( cgrid2F ),
      [fg2F] "r"( fgrid2F )

      ,

      [fgnc] "r"( fgridnc ),

      [Vthresh2F] "m"( VTTTT[0] )

      : /* clobbered */
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory"

    ) ;

#endif // EXP_No_ASM
    fgrid2F += 16;
    cgrid2F += 16;
    fgridnc += 16;

  }
  /* take care of remaining iterations, length modulo 16 */
  for ( ; ifreq_fg < length; ifreq_fg++ ) {
    fgrid2F[0] += cgrid2F[0] ;
    fgridnc[0] += ( TwoFthreshold < cgrid2F[0] );
    fgridnc++;
    fgrid2F++;
    cgrid2F++;
  } /* for( ifreq_fg = 0; ifreq_fg < finegrid.freqlength; ifreq_fg++ ) { */

}

/**
 * Add the coarse-grid values to the fine-grid sums (no number count).
 *
 * The bulk of the bins is handled 16 at a time, either by hand-written SSE
 * assembly or, when EXP_NO_ASM is defined, by an equivalent C loop; the
 * remaining bins are handled by a scalar loop.  In every branch the two
 * pointers are advanced by exactly 16 per pass of the main loop.
 *
 * The assembly path uses aligned loads/stores (MOVAPS) on fgrid2F, so that
 * pointer must be 16-byte aligned; cgrid2F is read with MOVUPS and may be
 * unaligned.
 *
 * \param fgrid2F  [in,out] fine-grid sums, length entries
 * \param cgrid2F  [in]     coarse-grid values, length entries
 * \param length   [in]     number of bins
 */
void gc_hotloop_no_nc( REAL4 *fgrid2F, REAL4 *cgrid2F, UINT4 length )
{
  UINT4 ifreq_fg;

  for ( ifreq_fg = 0 ; ifreq_fg + 16 < length; ifreq_fg += 16 ) {
    /* unrolled loop (16 iterations of original loop) */
#ifdef EXP_NO_ASM

    /* index by j and leave the pointers alone: they are advanced once, below,
     * for both branches (advancing them here as well would skip 16 bins) */
#pragma ivdep
    for ( int j = 0 ; j < 16; j++ ) {
      fgrid2F[j] += cgrid2F[j] ;
    }

#else
    __asm __volatile(
      "MOVUPS (%[cg2F]),%%xmm2 \n\t" /* load coarse grid values, possibly unaligned */
      "MOVAPS (%[fg2F]),%%xmm3 \n\t"
      "MOVUPS 0x10(%[cg2F]),%%xmm4 \n\t"
      "MOVUPS 0x20(%[cg2F]),%%xmm5 \n\t"
      "MOVUPS 0x30(%[cg2F]),%%xmm6 \n\t"

      /* Loop iterations 1...4 */

      "ADDPS %%xmm2,%%xmm3 \n\t" /* Add four coarse grid 2F values to fine grid sums */
      "MOVAPS %%xmm3,(%[fg2F]) \n\t" /* store 4 values in fine grid 2F sum array */

      /* Loop iterations 5...8 (same as above) */

      "MOVAPS 0x10(%[fg2F]),%%xmm2 \n\t"
      "ADDPS %%xmm4,%%xmm2 \n\t"
      "MOVAPS %%xmm2,0x10(%[fg2F]) \n\t"

      /* Loop iterations 9...12 (same as above) */

      "MOVAPS 0x20(%[fg2F]),%%xmm4 \n\t"
      "ADDPS %%xmm5,%%xmm4 \n\t"
      "MOVAPS %%xmm4,0x20(%[fg2F]) \n\t"

      /* Loop iterations 13...16 (same as above) */

      "MOVAPS 0x30(%[fg2F]),%%xmm2 \n\t"
      "ADDPS %%xmm6,%%xmm2 \n\t"
      "MOVAPS %%xmm2,0x30(%[fg2F]) \n\t"

      /* ---------------------------------------------------*/
      :
      /* output */

      :
      /* input */
      [cg2F] "r"( cgrid2F ),
      [fg2F] "r"( fgrid2F )

      : /* clobbered */
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory"

    ) ;

#endif // EXP_No_ASM
    fgrid2F += 16;
    cgrid2F += 16;

  }
  /* take care of remaining iterations, length modulo 16 */
  for ( ; ifreq_fg < length; ifreq_fg++ ) {
    fgrid2F[0] += cgrid2F[0] ;
    fgrid2F++;
    cgrid2F++;
  } /* for( ifreq_fg = 0; ifreq_fg < finegrid.freqlength; ifreq_fg++ ) { */

}
int j
ProcessParamsTable * ptr
int k
static void * ALRealloc(void *ptr, size_t size)
static void gc_hotloop_2Fmax_tracking(REAL4 *fgrid2F, REAL4 *fgrid2Fmax, UINT4 *fgrid2FmaxIdx, REAL4 *cgrid2F, UINT4 k, UINT4 length) __attribute__((hot))
static void gc_hotloop(REAL4 *fgrid2F, REAL4 *cgrid2F, UCHAR *fgridnc, REAL4 TwoFthreshold, UINT4 length) __attribute__((hot))
static void gc_hotloop_no_nc(REAL4 *fgrid2F, REAL4 *cgrid2F, UINT4 length) __attribute__((hot))
#define __attribute__(x)
unsigned char UCHAR
uint32_t UINT4
float REAL4
size