/* Platform-dependent binding of the aligned (re)allocation routine.
 * NOTE(review): this is a line-filtered excerpt; the #if that opens this
 * conditional chain and any #else / #endif are not visible here. */
/* First visible branch: ALRealloc is an alias for LALRealloc. */
11#define ALRealloc LALRealloc
14#elif defined (__MINGW32__)
/* MinGW branch: declare the runtime's aligned realloc by hand and map
 * ALRealloc(p,s) onto it with a fixed alignment of 16 bytes; ALFree maps onto
 * the matching aligned free. */
16extern void *__mingw_aligned_realloc(
void *ptr,
size_t size,
size_t align );
17#define ALRealloc(p,s) __mingw_aligned_realloc(p,s,16)
18#define ALFree __mingw_aligned_free
/* Forward declaration of a file-local ALRealloc function (presumably the
 * fallback branch of the conditional above -- TODO confirm). */
26static void *
ALRealloc(
void *ptr,
size_t size );
/* Fragment of the ALRealloc body: request `size` bytes aligned to 16 bytes.
 * posix_memalign() returns non-zero on failure, which enters the branch opened
 * here (its contents are not part of this excerpt).
 * NOTE(review): the result is written into the `ptr` parameter itself, so the
 * incoming pointer value is replaced on success; whether the old buffer is
 * copied or released is not visible in this excerpt. */
35 if ( posix_memalign( &
ptr, 16,
size ) ) {
/* Excerpt of the 2F-sum / 2F-maximum tracking loop.
 * NOTE(review): line-filtered excerpt -- the function header, local
 * declarations (ifreq_fg, newMax, VIIII), the asm statement opener, operand
 * separators, pointer increments and closing braces are not visible here. */
/* 16-byte aligned vector of four all-ones 32-bit words. Presumably bound to
 * the %[V1] asm operand (binding not visible) where XORPS uses it to invert a
 * compare mask. */
50 UINT4 V1111[4]
__attribute__( ( aligned( 16 ) ) ) = { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff};
/* Initialisation path: running sum and running maximum both start as a copy
 * of cgrid2F, and every maximum index starts at 0 (`length` elements each).
 * The condition selecting this path is not visible in this excerpt. */
59 memcpy( fgrid2F, cgrid2F,
sizeof(
REAL4 )*length );
60 memcpy( fgrid2Fmax, cgrid2F,
sizeof(
REAL4 )*length );
61 memset( fgrid2FmaxIdx, 0,
sizeof(
UINT4 )*length );
/* Main loop, 16 elements per iteration.
 * NOTE(review): the bound is `ifreq_fg + 16 < length`, so when exactly 16
 * elements remain they are not processed here but one by one in the tail
 * loop further down. */
68 for ( ifreq_fg = 0 ; ifreq_fg + 16 < length; ifreq_fg += 16 ) {
/* Element-wise form of one 16-element step (presumably the alternative to
 * the SSE asm below; the selecting conditional is not visible). */
72 for (
int j = 0 ;
j < 16;
j++ ) {
/* accumulate the coarse-grid value into the fine-grid sum */
73 fgrid2F[0] += cgrid2F[0] ;
/* newMax is 1 when the coarse value is >= the stored maximum (ties count as
 * a new maximum), otherwise 0 */
75 newMax = ( cgrid2F[0] >= fgrid2Fmax[0] );
76 fgrid2Fmax[0] = fmaxf( fgrid2Fmax[0], cgrid2F[0] );
/* branch-free select: keep the old index when newMax == 0, store k when
 * newMax == 1 */
77 fgrid2FmaxIdx[0] = fgrid2FmaxIdx[0] * ( 1 - newMax ) +
k * newMax;
/* SSE form (AT&T operand order: source first, destination second).
 * Constants kept for the whole asm block: xmm0 = Vk, xmm1 = V1. */
88 "MOVAPS %[Vk],%%xmm0 \n\t"
89 "MOVAPS %[V1],%%xmm1 \n\t"
/* ---- first 16 bytes (offset 0x00) ----
 * xmm2 = cg2F (unaligned load), xmm3 = fg2F, xmm4 = fg2Fmax,
 * xmm5 = fg2FmaxIdx (the three fine-grid loads require 16-byte alignment). */
91 "MOVUPS (%[cg2F]),%%xmm2 \n\t"
92 "MOVAPS (%[fg2F]),%%xmm3 \n\t"
93 "MOVAPS (%[fg2Fmax]),%%xmm4 \n\t"
94 "MOVAPS (%[fg2FmaxIdx]),%%xmm5 \n\t"
/* xmm7 = per-lane mask, all-ones where fg2Fmax <= cg2F (same tie rule as the
 * scalar `>=` above) */
96 "MOVAPS %%xmm4,%%xmm7 \n\t"
97 "CMPLEPS %%xmm2,%%xmm7 \n\t"
/* fg2F += cg2F and store the sum back */
99 "ADDPS %%xmm2,%%xmm3 \n\t"
100 "MOVAPS %%xmm3,(%[fg2F]) \n\t"
/* keep the new value (xmm2) and the new index (xmm6 = copy of Vk) only in
 * lanes where the mask is set */
101 "MOVAPS %%xmm0,%%xmm6 \n\t"
102 "ANDPS %%xmm7,%%xmm2 \n\t"
103 "ANDPS %%xmm7,%%xmm6 \n\t"
/* xmm7 ^= V1 (inverts the mask if V1 is all-ones), then keep the old maximum
 * and old index only in the remaining lanes */
106 "XORPS %%xmm1,%%xmm7 \n\t"
107 "ANDPS %%xmm7,%%xmm4 \n\t"
108 "ANDPS %%xmm7,%%xmm5 \n\t"
/* merge the two disjoint lane selections */
110 "ORPS %%xmm2,%%xmm4 \n\t"
111 "ORPS %%xmm6,%%xmm5 \n\t"
/* store updated maximum and index */
113 "MOVAPS %%xmm4,(%[fg2Fmax]) \n\t"
114 "MOVAPS %%xmm5,(%[fg2FmaxIdx]) \n\t"
/* ---- second 16 bytes (offset 0x10): identical sequence ---- */
121 "MOVUPS 0x10(%[cg2F]),%%xmm2 \n\t"
122 "MOVAPS 0x10(%[fg2F]),%%xmm3 \n\t"
123 "MOVAPS 0x10(%[fg2Fmax]),%%xmm4 \n\t"
124 "MOVAPS 0x10(%[fg2FmaxIdx]),%%xmm5 \n\t"
126 "MOVAPS %%xmm4,%%xmm7 \n\t"
127 "CMPLEPS %%xmm2,%%xmm7 \n\t"
129 "ADDPS %%xmm2,%%xmm3 \n\t"
130 "MOVAPS %%xmm3,0x10(%[fg2F]) \n\t"
131 "MOVAPS %%xmm0,%%xmm6 \n\t"
132 "ANDPS %%xmm7,%%xmm2 \n\t"
133 "ANDPS %%xmm7,%%xmm6 \n\t"
136 "XORPS %%xmm1,%%xmm7 \n\t"
137 "ANDPS %%xmm7,%%xmm4 \n\t"
138 "ANDPS %%xmm7,%%xmm5 \n\t"
140 "ORPS %%xmm2,%%xmm4 \n\t"
141 "ORPS %%xmm6,%%xmm5 \n\t"
143 "MOVAPS %%xmm4,0x10(%[fg2Fmax]) \n\t"
144 "MOVAPS %%xmm5,0x10(%[fg2FmaxIdx]) \n\t"
/* ---- third 16 bytes (offset 0x20): identical sequence ---- */
149 "MOVUPS 0x20(%[cg2F]),%%xmm2 \n\t"
150 "MOVAPS 0x20(%[fg2F]),%%xmm3 \n\t"
151 "MOVAPS 0x20(%[fg2Fmax]),%%xmm4 \n\t"
152 "MOVAPS 0x20(%[fg2FmaxIdx]),%%xmm5 \n\t"
154 "MOVAPS %%xmm4,%%xmm7 \n\t"
155 "CMPLEPS %%xmm2,%%xmm7 \n\t"
157 "ADDPS %%xmm2,%%xmm3 \n\t"
158 "MOVAPS %%xmm3,0x20(%[fg2F]) \n\t"
159 "MOVAPS %%xmm0,%%xmm6 \n\t"
160 "ANDPS %%xmm7,%%xmm2 \n\t"
161 "ANDPS %%xmm7,%%xmm6 \n\t"
164 "XORPS %%xmm1,%%xmm7 \n\t"
165 "ANDPS %%xmm7,%%xmm4 \n\t"
166 "ANDPS %%xmm7,%%xmm5 \n\t"
168 "ORPS %%xmm2,%%xmm4 \n\t"
169 "ORPS %%xmm6,%%xmm5 \n\t"
171 "MOVAPS %%xmm4,0x20(%[fg2Fmax]) \n\t"
172 "MOVAPS %%xmm5,0x20(%[fg2FmaxIdx]) \n\t"
/* ---- fourth 16 bytes (offset 0x30): identical sequence ---- */
178 "MOVUPS 0x30(%[cg2F]),%%xmm2 \n\t"
179 "MOVAPS 0x30(%[fg2F]),%%xmm3 \n\t"
180 "MOVAPS 0x30(%[fg2Fmax]),%%xmm4 \n\t"
181 "MOVAPS 0x30(%[fg2FmaxIdx]),%%xmm5 \n\t"
183 "MOVAPS %%xmm4,%%xmm7 \n\t"
184 "CMPLEPS %%xmm2,%%xmm7 \n\t"
186 "ADDPS %%xmm2,%%xmm3 \n\t"
187 "MOVAPS %%xmm3,0x30(%[fg2F]) \n\t"
188 "MOVAPS %%xmm0,%%xmm6 \n\t"
189 "ANDPS %%xmm7,%%xmm2 \n\t"
190 "ANDPS %%xmm7,%%xmm6 \n\t"
193 "XORPS %%xmm1,%%xmm7 \n\t"
194 "ANDPS %%xmm7,%%xmm4 \n\t"
195 "ANDPS %%xmm7,%%xmm5 \n\t"
197 "ORPS %%xmm2,%%xmm4 \n\t"
198 "ORPS %%xmm6,%%xmm5 \n\t"
200 "MOVAPS %%xmm4,0x30(%[fg2Fmax]) \n\t"
201 "MOVAPS %%xmm5,0x30(%[fg2FmaxIdx]) \n\t"
/* Operand bindings visible in this excerpt: the four array pointers are passed
 * in general-purpose registers ("r"); Vk is a memory operand ("m") starting at
 * VIIII[0]. VIIII is declared outside this excerpt -- presumably four copies
 * of k, TODO confirm. The [V1] binding is likewise not visible. */
211 [cg2F]
"r"( cgrid2F ),
212 [fg2F]
"r"( fgrid2F )
216 [fg2Fmax]
"r"( fgrid2Fmax ),
217 [fg2FmaxIdx]
"r"( fgrid2FmaxIdx ),
220 [Vk]
"m"( VIIII[0] ),
/* Clobbers: all eight XMM registers used above, plus "memory" because the asm
 * writes through the pointer operands. */
225 "xmm0",
"xmm1",
"xmm2",
"xmm3",
"xmm4",
"xmm5",
"xmm6",
"xmm7",
"memory"
/* Tail loop: continues from wherever ifreq_fg stopped and handles the
 * remaining elements one at a time with the same update rule as the
 * element-wise form above. */
241 for ( ; ifreq_fg < length; ifreq_fg++ ) {
243 fgrid2F[0] += cgrid2F[0] ;
245 newMax = ( cgrid2F[0] >= fgrid2Fmax[0] );
246 fgrid2Fmax[0] = fmaxf( fgrid2Fmax[0], cgrid2F[0] );
247 fgrid2FmaxIdx[0] = fgrid2FmaxIdx[0] * ( 1 - newMax ) +
k * newMax;
/* Excerpt of the 2F-sum / number-count loop.
 * NOTE(review): line-filtered excerpt -- the function header, local
 * declarations, the asm statement opener, operand separators, pointer
 * increments and closing braces are not visible here. */
/* 16-byte aligned vector holding TwoFthreshold in all four lanes; passed to
 * the asm below as the %[Vthresh2F] memory operand. */
265 REAL4 VTTTT[4]
__attribute__( ( aligned( 16 ) ) ) = { TwoFthreshold, TwoFthreshold, TwoFthreshold, TwoFthreshold };
/* Main loop, 16 elements per iteration.
 * NOTE(review): the bound is `ifreq_fg + 16 < length`, so when exactly 16
 * elements remain they are handled by the tail loop instead. */
268 for ( ifreq_fg = 0 ; ifreq_fg + 16 < length; ifreq_fg += 16 ) {
/* Element-wise form of one 16-element step (presumably the alternative to
 * the SSE asm below; the selecting conditional is not visible). */
273 for (
int j = 0 ;
j < 16;
j++ ) {
/* accumulate the coarse-grid value into the fine-grid sum */
274 fgrid2F[0] += cgrid2F[0] ;
/* count the element when the coarse value is strictly above the threshold */
276 fgridnc[0] += ( TwoFthreshold < cgrid2F[0] );
/* SSE form (AT&T operand order: source first, destination second).
 * Loads: xmm2 = cg2F[+0x00], xmm3 = fg2F[+0x00], xmm7 = threshold vector,
 * xmm4/xmm5/xmm6 = cg2F[+0x10/+0x20/+0x30]. cg2F loads are unaligned. */
285 "MOVUPS (%[cg2F]),%%xmm2 \n\t"
286 "MOVAPS (%[fg2F]),%%xmm3 \n\t"
288 "MOVAPS %[Vthresh2F],%%xmm7 \n\t"
290 "MOVUPS 0x10(%[cg2F]),%%xmm4 \n\t"
291 "MOVUPS 0x20(%[cg2F]),%%xmm5 \n\t"
292 "MOVUPS 0x30(%[cg2F]),%%xmm6 \n\t"
/* first 16 bytes: fg2F += cg2F */
296 "ADDPS %%xmm2,%%xmm3 \n\t"
/* xmm1 = 16 bytes of counters from fgnc (aligned load) */
298 "MOVAPS (%[fgnc]),%%xmm1 \n\t"
300 "MOVAPS %%xmm3,(%[fg2F]) \n\t"
/* xmm3 = mask, all-ones in lanes where threshold <= cg2F.
 * NOTE(review): CMPLEPS is a less-or-equal compare, whereas the element-wise
 * code uses a strict `TwoFthreshold < cgrid2F[0]`; the two forms differ when a
 * value equals the threshold exactly -- confirm which is intended. */
302 "MOVAPS %%xmm7,%%xmm3 \n\t"
303 "CMPLEPS %%xmm2,%%xmm3 \n\t"
/* second 16 bytes: sum, store, mask into xmm0 */
312 "MOVAPS 0x10(%[fg2F]),%%xmm2 \n\t"
313 "ADDPS %%xmm4,%%xmm2 \n\t"
314 "MOVAPS %%xmm2,0x10(%[fg2F]) \n\t"
316 "MOVAPS %%xmm7,%%xmm0 \n\t"
317 "CMPLEPS %%xmm4,%%xmm0 \n\t"
/* narrow the two 4x32-bit masks (xmm3 low half, xmm0 high half) into eight
 * 16-bit values in xmm3; signed saturation keeps 0 as 0 and -1 as -1 */
319 "PACKSSDW %%xmm0,%%xmm3 \n\t"
/* third 16 bytes: sum, store, mask into xmm4 */
324 "MOVAPS 0x20(%[fg2F]),%%xmm4 \n\t"
325 "ADDPS %%xmm5,%%xmm4 \n\t"
326 "MOVAPS %%xmm4,0x20(%[fg2F]) \n\t"
328 "MOVAPS %%xmm7,%%xmm4 \n\t"
329 "CMPLEPS %%xmm5,%%xmm4 \n\t"
/* fourth 16 bytes: sum, store, mask into xmm0 */
333 "MOVAPS 0x30(%[fg2F]),%%xmm2 \n\t"
334 "ADDPS %%xmm6,%%xmm2 \n\t"
335 "MOVAPS %%xmm2,0x30(%[fg2F]) \n\t"
337 "MOVAPS %%xmm7,%%xmm0 \n\t"
338 "CMPLEPS %%xmm6,%%xmm0 \n\t"
/* xmm4 = eight 16-bit masks for the third and fourth groups */
340 "PACKSSDW %%xmm0,%%xmm4 \n\t"
/* xmm3 = sixteen 8-bit masks (0x00 or 0xFF) in element order */
342 "PACKSSWB %%xmm4,%%xmm3 \n\t"
/* byte-wise counters -= mask; subtracting 0xFF (-1) adds one to each counter
 * whose element passed the compare */
344 "PSUBB %%xmm3, %%xmm1 \n\t"
345 "MOVAPS %%xmm1,(%[fgnc]) \n\t"
/* Operand bindings visible in this excerpt: the three array pointers are
 * passed in general-purpose registers ("r"); the threshold vector is a memory
 * operand ("m") starting at VTTTT[0]. */
353 [cg2F]
"r"( cgrid2F ),
354 [fg2F]
"r"( fgrid2F )
358 [fgnc]
"r"( fgridnc ),
360 [Vthresh2F]
"m"( VTTTT[0] )
/* Clobbers: all eight XMM registers used above, plus "memory" because the asm
 * writes through the pointer operands. */
363 "xmm0",
"xmm1",
"xmm2",
"xmm3",
"xmm4",
"xmm5",
"xmm6",
"xmm7",
"memory"
/* Tail loop: continues from wherever ifreq_fg stopped and handles the
 * remaining elements one at a time (strict `<` threshold test). */
374 for ( ; ifreq_fg < length; ifreq_fg++ ) {
375 fgrid2F[0] += cgrid2F[0] ;
376 fgridnc[0] += ( TwoFthreshold < cgrid2F[0] );
/* Excerpt of the plain 2F-sum loop (no maximum tracking, no number count).
 * NOTE(review): line-filtered excerpt -- the function header, local
 * declarations, the asm statement opener, operand separators, pointer
 * increments and closing braces are not visible here. */
/* Main loop, 16 elements per iteration.
 * NOTE(review): the bound is `ifreq_fg + 16 < length`, so when exactly 16
 * elements remain they are handled by the tail loop instead. */
390 for ( ifreq_fg = 0 ; ifreq_fg + 16 < length; ifreq_fg += 16 ) {
/* Element-wise form of one 16-element step (presumably the alternative to
 * the SSE asm below; the selecting conditional is not visible). */
395 for (
int j = 0 ;
j < 16;
j++ ) {
396 fgrid2F[0] += cgrid2F[0] ;
/* SSE form (AT&T operand order: source first, destination second).
 * Load all four 16-byte groups of cg2F (unaligned) and the first group of
 * fg2F (aligned). */
403 "MOVUPS (%[cg2F]),%%xmm2 \n\t"
404 "MOVAPS (%[fg2F]),%%xmm3 \n\t"
405 "MOVUPS 0x10(%[cg2F]),%%xmm4 \n\t"
406 "MOVUPS 0x20(%[cg2F]),%%xmm5 \n\t"
407 "MOVUPS 0x30(%[cg2F]),%%xmm6 \n\t"
/* offset 0x00: fg2F += cg2F, store */
411 "ADDPS %%xmm2,%%xmm3 \n\t"
412 "MOVAPS %%xmm3,(%[fg2F]) \n\t"
/* offset 0x10: load fg2F, add cg2F, store */
416 "MOVAPS 0x10(%[fg2F]),%%xmm2 \n\t"
417 "ADDPS %%xmm4,%%xmm2 \n\t"
418 "MOVAPS %%xmm2,0x10(%[fg2F]) \n\t"
/* offset 0x20: load fg2F, add cg2F, store */
422 "MOVAPS 0x20(%[fg2F]),%%xmm4 \n\t"
423 "ADDPS %%xmm5,%%xmm4 \n\t"
424 "MOVAPS %%xmm4,0x20(%[fg2F]) \n\t"
/* offset 0x30: load fg2F, add cg2F, store */
428 "MOVAPS 0x30(%[fg2F]),%%xmm2 \n\t"
429 "ADDPS %%xmm6,%%xmm2 \n\t"
430 "MOVAPS %%xmm2,0x30(%[fg2F]) \n\t"
/* Operand bindings visible in this excerpt: both array pointers are passed in
 * general-purpose registers ("r"). */
438 [cg2F]
"r"( cgrid2F ),
439 [fg2F]
"r"( fgrid2F )
/* Clobbers: "memory" because the asm writes through fg2F.
 * NOTE(review): xmm0, xmm1 and xmm7 are listed but not referenced by the
 * instructions visible in this excerpt; the extra entries are harmless. */
442 "xmm0",
"xmm1",
"xmm2",
"xmm3",
"xmm4",
"xmm5",
"xmm6",
"xmm7",
"memory"
/* Tail loop: continues from wherever ifreq_fg stopped and adds the remaining
 * elements one at a time. */
452 for ( ; ifreq_fg < length; ifreq_fg++ ) {
453 fgrid2F[0] += cgrid2F[0] ;
static void * ALRealloc(void *ptr, size_t size)
static void gc_hotloop_2Fmax_tracking(REAL4 *fgrid2F, REAL4 *fgrid2Fmax, UINT4 *fgrid2FmaxIdx, REAL4 *cgrid2F, UINT4 k, UINT4 length) __attribute__((hot))
static void gc_hotloop(REAL4 *fgrid2F, REAL4 *cgrid2F, UCHAR *fgridnc, REAL4 TwoFthreshold, UINT4 length) __attribute__((hot))
static void gc_hotloop_no_nc(REAL4 *fgrid2F, REAL4 *cgrid2F, UINT4 length) __attribute__((hot))