#ifndef vnl_sse_h_
#define vnl_sse_h_
//:
// \file
// \brief Support for Streaming SIMD Extensions (SSE2) to speed up vector arithmetic
//
// This header provides 16-byte aligned allocation helpers and a vnl_sse<T>
// class of low-level kernels (element product, dot product, squared Euclidean
// distance, matrix-vector products, sum, min/max and arg_min/arg_max).  The
// generic implementation is plain scalar C++; the float and double
// specializations below use SSE2 intrinsics when VNL_CONFIG_ENABLE_SSE2 is set.
#include <vcl_compiler.h>
#include <vxl_config.h>
#include <vcl_cfloat.h>

#include <vnl/vnl_config.h>
#include <vnl/vnl_alloc.h>

// Decide whether SSE2 can really be used: it is unavailable on PowerPC Macs,
// and it needs the <emmintrin.h> intrinsics header.
#if VNL_CONFIG_ENABLE_SSE2
# if defined(__APPLE__) && (defined(__ppc__) || defined(__ppc64__))
#  undef VNL_CONFIG_ENABLE_SSE2
#  define VNL_CONFIG_ENABLE_SSE2 0
# elif !VXL_HAS_EMMINTRIN_H
#  error "Required file emmintrin.h for SSE2 not found"
# else
#  include <emmintrin.h>
# endif
#endif

// Compiler-specific macros for forced inlining and 16-byte stack alignment.
#if defined(VCL_GCC)
# define VNL_SSE_FORCE_INLINE inline
# define VNL_SSE_STACK_ALIGNED(x) __attribute__((aligned(x)))
#elif defined VCL_VC || defined VCL_ICC
# define VNL_SSE_FORCE_INLINE __forceinline
# define VNL_SSE_STACK_ALIGNED(x) __declspec(align(x))
#else
# define VNL_SSE_FORCE_INLINE inline
# define VNL_SSE_STACK_ALIGNED(x)
# define VNL_SSE_STACK_STORE(pf) _mm_storeu_##pf // no stack alignment, so use unaligned store (slower!)
#endif

// Select a 16-byte aligned heap allocator.  If SSE2 is disabled or no aligned
// allocator is available, fall back to ordinary allocation and the slower
// unaligned load/store intrinsics.
#if VNL_CONFIG_ENABLE_SSE2 && VXL_HAS_MM_MALLOC
# define VNL_SSE_ALLOC(n,s,a) _mm_malloc(n*s,a)
# define VNL_SSE_FREE(v,n,s) _mm_free(v)
#elif VNL_CONFIG_ENABLE_SSE2 && VXL_HAS_ALIGNED_MALLOC
# include <malloc.h>
# define VNL_SSE_ALLOC(n,s,a) _aligned_malloc(n*s,a)
# define VNL_SSE_FREE(v,n,s) _aligned_free(v)
#elif VNL_CONFIG_ENABLE_SSE2 && VXL_HAS_MINGW_ALIGNED_MALLOC
# include <malloc.h>
# define VNL_SSE_ALLOC(n,s,a) __mingw_aligned_malloc(n*s,a)
# define VNL_SSE_FREE(v,n,s) __mingw_aligned_free(v)
#elif VNL_CONFIG_ENABLE_SSE2 && VXL_HAS_POSIX_MEMALIGN
# include <vcl_cstdlib.h>
# define VNL_SSE_ALLOC(n,s,a) memalign(a,n*s)
# define VNL_SSE_FREE(v,n,s) vcl_free(v)
#else // SSE2 disabled or no memory-alignment support, so use the slower unaligned intrinsics
# define VNL_SSE_HEAP_STORE(pf) _mm_storeu_##pf
# define VNL_SSE_HEAP_LOAD(pf) _mm_loadu_##pf
# if VNL_CONFIG_THREAD_SAFE
#  define VNL_SSE_ALLOC(n,s,a) new char[n*s]
#  define VNL_SSE_FREE(v,n,s) delete [] static_cast<char*>(v)
# else
#  define VNL_SSE_ALLOC(n,s,a) vnl_alloc::allocate((n == 0) ? 8 : (n * s));
#  define VNL_SSE_FREE(v,n,s) if (v) vnl_alloc::deallocate(v, (n == 0) ? 8 : (n * s));
# endif
#endif

// If the stack could be aligned above, stores to stack memory may use the
// aligned intrinsics.
#ifndef VNL_SSE_STACK_STORE
# define VNL_SSE_STACK_STORE(pf) _mm_store_##pf
#endif

// Heap memory from VNL_SSE_ALLOC is 16-byte aligned, so aligned loads/stores
// are used unless the unaligned fallback was selected above.
#ifndef VNL_SSE_HEAP_STORE
# define VNL_SSE_HEAP_STORE(pf) _mm_store_##pf
# define VNL_SSE_HEAP_LOAD(pf) _mm_load_##pf
#endif

//: Allocate n elements of the given element size, 16-byte aligned where supported.
VNL_SSE_FORCE_INLINE void* vnl_sse_alloc(vcl_size_t n, unsigned size)
{
  return VNL_SSE_ALLOC(n,size,16);
}

//: Free memory obtained from vnl_sse_alloc; n and size must match the allocation.
VNL_SSE_FORCE_INLINE void vnl_sse_dealloc(void* mem, vcl_size_t n, unsigned size)
{
  VNL_SSE_FREE(mem,n,size);
}
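
// A minimal usage sketch (illustrative only, not part of this header), assuming
// SSE2 and an aligned allocator are configured so the returned pointer is
// 16-byte aligned:
//
//   double* buf = static_cast<double*>(vnl_sse_alloc(100, sizeof(double)));
//   // ... fill buf and pass it to the vnl_sse<double> kernels below ...
//   vnl_sse_dealloc(buf, 100, sizeof(double));
//
// The same (n, size) pair must be passed to vnl_sse_dealloc, since the
// vnl_alloc fallback needs the block size to return the memory to its pool.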

// In debug builds (NDEBUG not defined), do not force inlining, so the SSE code
// is easier to step through in a debugger.
#ifndef NDEBUG
# undef VNL_SSE_FORCE_INLINE
# define VNL_SSE_FORCE_INLINE
#endif

// SSE2 lacks the integer min/max intrinsics _mm_min_epi32 / _mm_max_epi32
// (those are SSE4.1), so emulate them with a compare-and-select.
#if VNL_CONFIG_ENABLE_SSE2
class vnl_sse_supplement
{
 public:
  //: Per-lane minimum of signed 32-bit integers (SSE2 replacement for _mm_min_epi32).
  static VNL_SSE_FORCE_INLINE __m128i vnl_mm_min_epi32(__m128i a, __m128i b)
  {
    __m128i mask = _mm_cmplt_epi32(a, b); // all-ones where a < b
    a = _mm_and_si128(a, mask);           // keep a where a < b
    b = _mm_andnot_si128(mask, b);        // keep b elsewhere
    a = _mm_or_si128(a, b);               // merge the two selections
    return a;
  }

  //: Per-lane maximum of signed 32-bit integers (SSE2 replacement for _mm_max_epi32).
  static VNL_SSE_FORCE_INLINE __m128i vnl_mm_max_epi32(__m128i a, __m128i b)
  {
    __m128i mask = _mm_cmpgt_epi32(a, b); // all-ones where a > b
    a = _mm_and_si128(a, mask);
    b = _mm_andnot_si128(mask, b);
    a = _mm_or_si128(a, b);
    return a;
  }
};

// Use the emulated intrinsics throughout this header.
#define VNL_MM_MIN_EPI32 vnl_sse_supplement::vnl_mm_min_epi32
#define VNL_MM_MAX_EPI32 vnl_sse_supplement::vnl_mm_max_epi32
#endif // VNL_CONFIG_ENABLE_SSE2
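
// Worked example of the compare-and-select used above, with made-up lane values:
//   a    = {  3, 10, -5, 7 }
//   b    = {  4,  2, -5, 9 }
//   mask = _mm_cmplt_epi32(a,b)  = { ~0, 0, 0, ~0 }
//   (a & mask) | (b & ~mask)     = {  3, 2, -5, 7 }   // per-lane minimum
// When a lane compares equal the mask is 0 and b is selected, which is still
// the correct minimum.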

//: Bottom-level vector/matrix arithmetic kernels.
// This generic version is plain scalar C++ and works for any numeric type T.
// The float and double specializations below replace it with SSE2 code when
// VNL_CONFIG_ENABLE_SSE2 is enabled.
template <class T>
class vnl_sse
{
 public:
  static VNL_SSE_FORCE_INLINE void element_product(const T* x, const T* y, T* r, unsigned n)
  {
    for (unsigned i = 0; i < n; ++i)
      r[i] = x[i] * y[i];
  }

  static VNL_SSE_FORCE_INLINE T dot_product(const T* x, const T* y, unsigned n)
  {
    T sum(0);
    for (unsigned i = 0; i < n; ++i)
      sum += x[i] * y[i];
    return sum;
  }

  static VNL_SSE_FORCE_INLINE T euclid_dist_sq(const T* x, const T* y, unsigned n)
  {
    T sum(0);
#ifdef VCL_VC_6
    // simple indexed loop for Visual C++ 6
    for (unsigned i=0; i<n; ++i)
    {
      const T diff = x[i] - y[i];
      sum += diff*diff;
    }
#else
    // count down through the arrays; the pointers are shifted back one element
    // so that x[n] and y[n] address the last valid entries
    --x;
    --y;
    while (n!=0)
    {
      const T diff = x[n] - y[n];
      sum += diff*diff;
      --n;
    }
#endif
    return sum;
  }

  static VNL_SSE_FORCE_INLINE void vector_x_matrix(const T* v, const T* m, T* r, unsigned rows, unsigned cols)
  {
    for (unsigned int j=0; j<cols; ++j) {
      T som(0);
      for (unsigned int i=0; i<rows; ++i)
        som += (m+i*cols)[j] * v[i];
      r[j] = som;
    }
  }

  static VNL_SSE_FORCE_INLINE void matrix_x_vector(const T* m, const T* v, T* r, unsigned rows, unsigned cols)
  {
    for (unsigned int i=0; i<rows; ++i) {
      T som(0);
      for (unsigned int j=0; j<cols; ++j)
        som += (m+i*cols)[j] * v[j];
      r[i] = som;
    }
  }

  static VNL_SSE_FORCE_INLINE T sum(const T* v, unsigned n)
  {
    T tot(0);
    for (unsigned i = 0; i < n; ++i)
      tot += *v++;
    return tot;
  }

  static VNL_SSE_FORCE_INLINE T max(const T* v, unsigned n)
  {
    if (n==0) return T(0); // the maximum of an empty set is undefined
    T tmp = *v;
    while (--n > 0)
      if (*++v > tmp)
        tmp = *v;
    return tmp;
  }

  static VNL_SSE_FORCE_INLINE T min(const T* v, unsigned n)
  {
    if (n==0) return T(0); // the minimum of an empty set is undefined
    T tmp = *v;
    while (--n > 0)
      if (*++v < tmp)
        tmp = *v;
    return tmp;
  }

  static VNL_SSE_FORCE_INLINE unsigned arg_max(const T* v, unsigned n)
  {
    if (n==0) return unsigned(-1); // the arg_max of an empty set is undefined
    T tmp = *v;
    unsigned idx = 0;
    for (unsigned i=1; i<n; ++i)
      if (*++v > tmp)
        tmp = *v, idx = i;
    return idx;
  }

  static VNL_SSE_FORCE_INLINE unsigned arg_min(const T* v, unsigned n)
  {
    if (n==0) return unsigned(-1); // the arg_min of an empty set is undefined
    T tmp = *v;
    unsigned idx = 0;
    for (unsigned i=1; i<n; ++i)
      if (*++v < tmp)
        tmp = *v, idx = i;
    return idx;
  }
};
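
// A minimal usage sketch (illustrative only; within VXL these kernels are
// normally reached through vnl_vector / vnl_matrix rather than called directly):
//
//   double a[4] = { 1, 2, 3, 4 };
//   double b[4] = { 5, 6, 7, 8 };
//   double d   = vnl_sse<double>::dot_product(a, b, 4); // 1*5+2*6+3*7+4*8 = 70
//   unsigned k = vnl_sse<double>::arg_max(a, 4);        // 3, the index of the largest entry
//
// For the SSE2 specializations below, data allocated with vnl_sse_alloc
// (16-byte aligned) takes the fastest path; several of the double kernels also
// check alignment at run time and fall back to unaligned loads when needed.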

#if VNL_CONFIG_ENABLE_SSE2

//: SSE2 specialization for double-precision arrays.
VCL_DEFINE_SPECIALIZATION
class vnl_sse<double>
{
 public:
  //: Element-wise product r[i] = x[i]*y[i].
  static VNL_SSE_FORCE_INLINE void element_product(const double* x, const double* y, double* r, unsigned n)
  {
    switch (n%4)
    {
      // handle the 1 to 3 trailing elements with scalar SSE ops
      case 3: --n; _mm_store_sd(r+n,_mm_mul_sd(_mm_load_sd(x+n),_mm_load_sd(y+n)));
      case 2: --n; _mm_store_sd(r+n,_mm_mul_sd(_mm_load_sd(x+n),_mm_load_sd(y+n)));
      case 1: --n; _mm_store_sd(r+n,_mm_mul_sd(_mm_load_sd(x+n),_mm_load_sd(y+n)));
      case 0: ;
    }

    // process the remaining multiple of 4, two doubles per instruction;
    // use unaligned loads/stores if any of the pointers is not 16-byte aligned
    if (vcl_ptrdiff_t(x)%16 || vcl_ptrdiff_t(y)%16 || vcl_ptrdiff_t(r)%16)
      for (int i = n-4; i >= 0; i-=4)
      {
        _mm_storeu_pd(r+i,_mm_mul_pd(_mm_loadu_pd(x+i),_mm_loadu_pd(y+i)));
        _mm_storeu_pd(r+i+2,_mm_mul_pd(_mm_loadu_pd(x+i+2),_mm_loadu_pd(y+i+2)));
      }
    else
      for (int i = n-4; i >= 0; i-=4)
      {
        VNL_SSE_HEAP_STORE(pd)(r+i,_mm_mul_pd(VNL_SSE_HEAP_LOAD(pd)(x+i),VNL_SSE_HEAP_LOAD(pd)(y+i)));
        VNL_SSE_HEAP_STORE(pd)(r+i+2,_mm_mul_pd(VNL_SSE_HEAP_LOAD(pd)(x+i+2),VNL_SSE_HEAP_LOAD(pd)(y+i+2)));
      }
  }

  //: Dot product of x and y.
  static VNL_SSE_FORCE_INLINE double dot_product(const double* x, const double* y, unsigned n)
  {
    double ret;
    __m128d sum;
    if (n%2)
    {
      // handle the single odd element with a scalar multiply
      n--; sum = _mm_mul_sd(_mm_load_sd(x+n),_mm_load_sd(y+n));
    }
    else
      sum = _mm_setzero_pd();

    // multiply-accumulate two doubles at a time; use unaligned loads if needed
    if (vcl_ptrdiff_t(x)%16 || vcl_ptrdiff_t(y)%16)
      for (int i = n-2; i >= 0; i-=2)
        sum = _mm_add_pd(_mm_mul_pd(_mm_loadu_pd(x+i), _mm_loadu_pd(y+i)),sum);
    else
      for (int i = n-2; i >= 0; i-=2)
        sum = _mm_add_pd(_mm_mul_pd(VNL_SSE_HEAP_LOAD(pd)(x+i), VNL_SSE_HEAP_LOAD(pd)(y+i)),sum);

    // horizontal add of the two partial sums, then store the low lane
    sum = _mm_add_sd(sum,_mm_unpackhi_pd(sum,_mm_setzero_pd()));
    _mm_store_sd(&ret,sum);
    return ret;
  }
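
// The horizontal-add step above, illustrated: if sum = { lo, hi } holds the two
// partial sums, then
//   _mm_unpackhi_pd(sum, zero) = { hi, 0 }
//   _mm_add_sd(sum, that)      = { lo+hi, hi }
// and _mm_store_sd writes the low lane, i.e. lo+hi, to the result.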

  //: Squared Euclidean distance between x and y.
  static VNL_SSE_FORCE_INLINE double euclid_dist_sq(const double* x, const double* y, unsigned n)
  {
    double ret;
    __m128d sum,a;

    if (n%2)
    {
      // handle the single odd element separately
      n--; a = _mm_sub_sd(_mm_load_sd(x+n),_mm_load_sd(y+n));
      sum = _mm_mul_sd(a,a);
    }
    else
      sum = _mm_setzero_pd();

    // accumulate squared differences two doubles at a time
    if (vcl_ptrdiff_t(x)%16 || vcl_ptrdiff_t(y)%16)
      for ( int i = n-2; i >= 0; i-=2 )
      {
        a = _mm_sub_pd(_mm_loadu_pd(x+i),_mm_loadu_pd(y+i));
        sum = _mm_add_pd(_mm_mul_pd(a,a),sum);
      }
    else
      for ( int i = n-2; i >= 0; i-=2 )
      {
        a = _mm_sub_pd(VNL_SSE_HEAP_LOAD(pd)(x+i),VNL_SSE_HEAP_LOAD(pd)(y+i));
        sum = _mm_add_pd(_mm_mul_pd(a,a),sum);
      }

    // horizontal add of the two partial sums
    sum = _mm_add_sd(sum,_mm_unpackhi_pd(sum,_mm_setzero_pd()));
    _mm_store_sd(&ret,sum);
    return ret;
  }

  //: r = v * M for a rows x cols matrix M (row-major) and a row vector v.
  static VNL_SSE_FORCE_INLINE void vector_x_matrix(const double* v, const double* m, double* r, unsigned rows, unsigned cols)
  {
    __m128d accum, x,y,z,w;

    // split rows into a multiple of 4 plus a remainder, columns into a multiple of 2
    unsigned r_left = rows%4;
    unsigned r_nice = rows - r_left;
    unsigned c_left = cols%2;
    unsigned c_nice = cols - c_left;

    // compute two output columns at a time
    for (unsigned j = 0; j < c_nice; j+=2)
    {
      accum = _mm_setzero_pd();
      unsigned i = 0;
      while ( i < r_nice )
      {
        // load four consecutive vector elements and prefetch the next group
        y = VNL_SSE_HEAP_LOAD(pd)(v+i);
        w = VNL_SSE_HEAP_LOAD(pd)(v+i+2);

        _mm_prefetch((const char*)(v+i+4), _MM_HINT_NTA);

        // broadcast each element into both lanes of its own register
        x = _mm_shuffle_pd(y,y,_MM_SHUFFLE2(0,0));
        y = _mm_shuffle_pd(y,y,_MM_SHUFFLE2(1,1));
        z = _mm_shuffle_pd(w,w,_MM_SHUFFLE2(0,0));
        w = _mm_shuffle_pd(w,w,_MM_SHUFFLE2(1,1));

        // multiply each broadcast element with the matching matrix row pair (columns j, j+1)
        x = _mm_mul_pd(x,_mm_loadu_pd(i++*cols+m+j));
        y = _mm_mul_pd(y,_mm_loadu_pd(i++*cols+m+j));
        z = _mm_mul_pd(z,_mm_loadu_pd(i++*cols+m+j));
        w = _mm_mul_pd(w,_mm_loadu_pd(i++*cols+m+j));

        // accumulate the four partial products
        accum = _mm_add_pd(x,accum);
        accum = _mm_add_pd(y,accum);
        accum = _mm_add_pd(z,accum);
        accum = _mm_add_pd(w,accum);
      }

      // handle the 1 to 3 leftover rows
      switch (r_left)
      {
        case 3: accum = _mm_add_pd(_mm_mul_pd(_mm_load1_pd(v+i),_mm_loadu_pd(m+i*cols+j)), accum); i++;
        case 2: accum = _mm_add_pd(_mm_mul_pd(_mm_load1_pd(v+i),_mm_loadu_pd(m+i*cols+j)), accum); i++;
        case 1: accum = _mm_add_pd(_mm_mul_pd(_mm_load1_pd(v+i),_mm_loadu_pd(m+i*cols+j)), accum);
        case 0: ;
      }

      // write both results with a non-temporal store (r is assumed 16-byte aligned)
      _mm_stream_pd(r+j,accum);
    }

    // compute the final odd column, if any, with scalar ops
    if ( c_left )
    {
      accum = _mm_setzero_pd();
      for (unsigned int i=0; i<rows; ++i)
        accum = _mm_add_sd(_mm_mul_sd(_mm_load_sd(m+i*cols+cols-1),_mm_load_sd(v+i)),accum);
      _mm_store_sd(r+cols-1, accum);
    }
  }
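
// Inner-loop pattern above, illustrated for one pair of output columns (j, j+1):
// each v[i] is broadcast into both lanes and multiplied with the pair
// (m[i][j], m[i][j+1]), so after the loop
//   accum = { sum_i v[i]*m[i][j], sum_i v[i]*m[i][j+1] },
// which is exactly v*M restricted to columns j and j+1.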

  //: r = M * v for a rows x cols matrix M (row-major) and a column vector v.
  static VNL_SSE_FORCE_INLINE void matrix_x_vector(const double* m, const double* v, double* r, unsigned rows, unsigned cols)
  {
    __m128d accum, x,y,mxy1,mxy2;

    // split rows and columns into multiples of 2 plus remainders
    unsigned r_left = rows%2;
    unsigned r_nice = rows - r_left;
    unsigned c_left = cols%2;
    unsigned c_nice = cols - c_left;

    // compute two output rows at a time
    for (unsigned i = 0; i < r_nice; i+=2)
    {
      accum = _mm_setzero_pd();
      const double *r1 = m+i*cols, *r2 = m+(i+1)*cols;
      unsigned j = 0;
      for (; j < c_nice; j+=2)
      {
        // load two consecutive vector elements
        y = VNL_SSE_HEAP_LOAD(pd)(v+j);

        // broadcast each element into both lanes
        x = _mm_shuffle_pd(y,y,_MM_SHUFFLE2(0,0));
        y = _mm_shuffle_pd(y,y,_MM_SHUFFLE2(1,1));

        // load the matching pair of elements from both matrix rows
        mxy1 = _mm_loadu_pd(r1+j);
        mxy2 = _mm_loadu_pd(r2+j);

        // interleave so each product uses one matrix column across both rows
        x = _mm_mul_pd(x,_mm_unpacklo_pd(mxy1,mxy2));
        y = _mm_mul_pd(y,_mm_unpackhi_pd(mxy1,mxy2));

        // accumulate the partial products
        accum = _mm_add_pd(x,accum);
        accum = _mm_add_pd(y,accum);
      }

      // handle an odd trailing column
      if (c_left)
        accum = _mm_add_pd(_mm_mul_pd(_mm_load1_pd(v+j),_mm_set_pd(*(r2+j),*(r1+j))), accum);

      // write both results with a non-temporal store
      _mm_stream_pd(r+i,accum);
    }

    // compute the final odd row, if any, with scalar ops
    if ( r_left )
    {
      accum = _mm_setzero_pd();
      const double* p = m+(rows-1)*cols;
      for (unsigned int j=0; j<cols; ++j)
        accum = _mm_add_sd(_mm_mul_sd(_mm_load_sd(p+j),_mm_load_sd(v+j)),accum);
      _mm_store_sd(r+rows-1, accum);
    }
  }

  //: Sum of the n elements of x.
  static VNL_SSE_FORCE_INLINE double sum(const double* x, unsigned n)
  {
    double ret;
    // pick up the odd element, if any, then add two doubles at a time
    __m128d sum = n%2 ? _mm_load_sd(x+--n) : _mm_setzero_pd();

    for (int i = n-2; i >= 0; i-=2)
      sum = _mm_add_pd(VNL_SSE_HEAP_LOAD(pd)(x+i),sum);

    // horizontal add of the two partial sums
    sum = _mm_add_sd(sum,_mm_unpackhi_pd(sum,_mm_setzero_pd()));
    _mm_store_sd(&ret,sum);
    return ret;
  }

  //: Index of the largest element of x (when the maximum occurs more than once,
  // the index returned may differ from the scalar version).
  static VNL_SSE_FORCE_INLINE unsigned arg_max(const double* x, unsigned n)
  {
    if (n == 1)
      return 0;

    // track a running per-lane maximum and, in parallel, the index where it was found
    __m128d min_double = _mm_set1_pd(- DBL_MAX);
    __m128d max = min_double;
    __m128d input;
    __m128i input_idx = _mm_set_epi32(1, 1, 0, 0);
    __m128i input_idx_increment = _mm_set1_epi32(2);
    __m128i arg_max = _mm_set1_epi32(0);
    // reinterpret the comparison mask as integers so it can gate the index update
    union IsMaxMask
    {
      __m128d m128d;
      __m128i m128i;
    };
    IsMaxMask is_max;

    const int n16 = (n/2) * 2; // largest multiple of 2 not exceeding n

    for (int i=0; i<n16; i+=2)
    {
      input = VNL_SSE_HEAP_LOAD(pd)(x+i);
      max = _mm_max_pd(input, max);
      is_max.m128d = _mm_cmpeq_pd(max, input);
      arg_max = VNL_MM_MAX_EPI32(arg_max, _mm_and_si128(is_max.m128i, input_idx));
      input_idx = _mm_add_epi32(input_idx, input_idx_increment);
    }

    // handle an odd trailing element
    if (n%2)
    {
      input_idx = _mm_set1_epi32(--n);
      input = _mm_load1_pd(x+n);
      max = _mm_max_sd(max, input);
      is_max.m128d = _mm_cmpeq_pd(max, input);
      arg_max = VNL_MM_MAX_EPI32(arg_max, _mm_and_si128(is_max.m128i, input_idx));
    }

    // reduce the two lanes: keep only the index whose lane holds the overall maximum
    is_max.m128d = max;
    max = _mm_max_sd(_mm_unpackhi_pd(max, min_double), max);
    max = _mm_unpacklo_pd(max, max);
    is_max.m128d = _mm_cmpeq_pd(is_max.m128d, max);
    arg_max = _mm_and_si128(is_max.m128i, arg_max);
    arg_max = VNL_MM_MAX_EPI32(arg_max, _mm_unpackhi_epi32(arg_max, _mm_set1_epi32(0)));
    unsigned ret = _mm_cvtsi128_si32(arg_max);
    return ret;
  }
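
// Index tracking above, illustrated for one step with made-up values: suppose
// input = { 7, 2 } at element indices { 4, 5 } and the running max was { 3, 9 }:
//   max     = _mm_max_pd(input, max)   = { 7, 9 }
//   is_max  = _mm_cmpeq_pd(max, input) = { ~0, 0 }       (only lane 0 improved)
//   arg_max = max_epi32(arg_max, is_max & {4,4,5,5})     (lane 0's index becomes 4)
// After the loop the two lanes are reduced by keeping only the index whose lane
// holds the overall maximum.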

  //: Index of the smallest element of x (as with arg_max, ties may resolve
  // differently from the scalar version).
  static VNL_SSE_FORCE_INLINE unsigned arg_min(const double* x, unsigned n)
  {
    if (n == 1)
      return 0;

    // track a running per-lane minimum and the index where it was found
    __m128d max_double = _mm_set1_pd(DBL_MAX);
    __m128d min = max_double;
    __m128d input;
    __m128i input_idx = _mm_set_epi32(1, 1, 0, 0);
    __m128i input_idx_increment = _mm_set1_epi32(2);
    __m128i arg_min = _mm_set1_epi32(0);
    union IsMinMask
    {
      __m128d m128d;
      __m128i m128i;
    };
    IsMinMask is_min;

    const int n16 = (n/2) * 2; // largest multiple of 2 not exceeding n

    for (int i=0; i<n16; i+=2)
    {
      input = VNL_SSE_HEAP_LOAD(pd)(x+i);
      min = _mm_min_pd(input, min);
      is_min.m128d = _mm_cmpeq_pd(min, input);
      arg_min = VNL_MM_MAX_EPI32(arg_min, _mm_and_si128(is_min.m128i, input_idx));
      input_idx = _mm_add_epi32(input_idx, input_idx_increment);
    }

    // handle an odd trailing element
    if (n%2)
    {
      input_idx = _mm_set1_epi32(--n);
      input = _mm_load1_pd(x+n);
      min = _mm_min_sd(min, input);
      is_min.m128d = _mm_cmpeq_pd(min, input);
      arg_min = VNL_MM_MAX_EPI32(arg_min, _mm_and_si128(is_min.m128i, input_idx));
    }

    // reduce the two lanes: keep only the index whose lane holds the overall minimum
    is_min.m128d = min;
    min = _mm_min_sd(_mm_unpackhi_pd(min, max_double), min);
    min = _mm_unpacklo_pd(min, min);
    is_min.m128d = _mm_cmpeq_pd(is_min.m128d, min);
    arg_min = _mm_and_si128(is_min.m128i, arg_min);
    arg_min = VNL_MM_MAX_EPI32(arg_min, _mm_unpackhi_epi32(arg_min, _mm_set1_epi32(0)));
    unsigned ret = _mm_cvtsi128_si32(arg_min);
    return ret;
  }

  //: Largest element of x.
  static VNL_SSE_FORCE_INLINE double max(const double* x, unsigned n)
  {
    double ret;
    __m128d min_double = _mm_set1_pd(- DBL_MAX);
    __m128d max = min_double;

    // handle the odd element, if any
    if (n%2)
      max = _mm_max_sd(max,_mm_load_sd(x+--n));

    // process two doubles at a time
    for (int i=n-2; i>=0; i-=2)
      max = _mm_max_pd(VNL_SSE_HEAP_LOAD(pd)(x+i), max);

    // horizontal max of the two lanes
    max = _mm_max_sd(max,_mm_unpackhi_pd(max,min_double));
    _mm_store_sd(&ret,max);
    return ret;
  }

  //: Smallest element of x.
  static VNL_SSE_FORCE_INLINE double min(const double* x, unsigned n)
  {
    double ret;
    __m128d max_double = _mm_set1_pd(DBL_MAX);
    __m128d min = max_double;

    // handle the odd element, if any
    if (n%2)
      min = _mm_min_sd(min,_mm_load_sd(x+--n));

    // process two doubles at a time
    for (int i=n-2; i>=0; i-=2)
      min = _mm_min_pd(VNL_SSE_HEAP_LOAD(pd)(x+i), min);

    // horizontal min of the two lanes
    min = _mm_min_sd(min,_mm_unpackhi_pd(min,max_double));
    _mm_store_sd(&ret,min);
    return ret;
  }
};

//: SSE2 specialization for single-precision arrays.
// Note: these kernels load the bulk of the data with VNL_SSE_HEAP_LOAD, so when
// an aligned allocator is configured the arrays passed in are expected to be
// 16-byte aligned (as returned by vnl_sse_alloc).
VCL_DEFINE_SPECIALIZATION
class vnl_sse<float>
{
 public:
  //: Element-wise product r[i] = x[i]*y[i].
  static VNL_SSE_FORCE_INLINE void element_product(const float* x, const float* y, float* r, unsigned n)
  {
    switch (n%4)
    {
      // handle the 1 to 3 trailing elements with scalar SSE ops
      case 3: --n; _mm_store_ss(r+n,_mm_mul_ss(_mm_load_ss(x+n),_mm_load_ss(y+n)));
      case 2: --n; _mm_store_ss(r+n,_mm_mul_ss(_mm_load_ss(x+n),_mm_load_ss(y+n)));
      case 1: --n; _mm_store_ss(r+n,_mm_mul_ss(_mm_load_ss(x+n),_mm_load_ss(y+n)));
      case 0: ;
    }

    // process the remaining multiple of 4, four floats per instruction
    for (int i = n-4; i >= 0; i-=4)
      VNL_SSE_HEAP_STORE(ps)(r+i,_mm_mul_ps(VNL_SSE_HEAP_LOAD(ps)(x+i),VNL_SSE_HEAP_LOAD(ps)(y+i)));
  }

  //: Dot product of x and y.
  static VNL_SSE_FORCE_INLINE float dot_product(const float* x, const float* y, unsigned n)
  {
    float ret;
    __m128 sum = _mm_setzero_ps();
    switch (n%4)
    {
      // handle the 1 to 3 trailing elements with scalar SSE ops
      case 3: n--; sum = _mm_mul_ss(_mm_load_ss(x+n), _mm_load_ss(y+n));
      case 2: n--; sum = _mm_add_ss(_mm_mul_ss(_mm_load_ss(x+n), _mm_load_ss(y+n)),sum);
      case 1: n--; sum = _mm_add_ss(_mm_mul_ss(_mm_load_ss(x+n), _mm_load_ss(y+n)),sum);
      case 0: ;
    }

    for (int i = n-4; i >= 0; i-=4)
      sum = _mm_add_ps(_mm_mul_ps(VNL_SSE_HEAP_LOAD(ps)(x+i), VNL_SSE_HEAP_LOAD(ps)(y+i)),sum);

    // horizontal add of the four partial sums into the low lane
    sum = _mm_add_ps(sum,_mm_movehl_ps(_mm_setzero_ps(),sum));
    sum = _mm_add_ss(sum,_mm_shuffle_ps(sum,sum,_MM_SHUFFLE(3,2,1,1)));

    _mm_store_ss(&ret,sum);
    return ret;
  }
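
// The horizontal-add step above, illustrated for four lanes {s0,s1,s2,s3}:
//   _mm_movehl_ps(zero, sum)                      = { s2, s3, 0, 0 }
//   _mm_add_ps(sum, that)                         = { s0+s2, s1+s3, s2, s3 }
//   _mm_shuffle_ps(sum,sum,_MM_SHUFFLE(3,2,1,1))  has s1+s3 in its low lane
//   _mm_add_ss                                    -> low lane = s0+s1+s2+s3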

  //: Squared Euclidean distance between x and y.
  static VNL_SSE_FORCE_INLINE float euclid_dist_sq(const float* x, const float* y, unsigned n)
  {
    float ret;
    __m128 sum,a;
    sum = a = _mm_setzero_ps();
    switch (n%4)
    {
      // pack the 1 to 3 trailing differences into one register, then square them
      case 3: --n; a = _mm_sub_ss(_mm_load_ss(x+n),_mm_load_ss(y+n));
      case 2: --n; a = _mm_shuffle_ps(_mm_sub_ss(_mm_load_ss(x+n),_mm_load_ss(y+n)), a ,_MM_SHUFFLE(1,0,0,1));
      case 1: --n; a = _mm_move_ss(a,_mm_sub_ss(_mm_load_ss(x+n),_mm_load_ss(y+n)));
              sum = _mm_mul_ps(a,a);
      case 0: ;
    }

    // accumulate squared differences four floats at a time
    for ( int i = n-4; i >= 0; i-=4 )
    {
      a = _mm_sub_ps(VNL_SSE_HEAP_LOAD(ps)(x+i),VNL_SSE_HEAP_LOAD(ps)(y+i));
      sum = _mm_add_ps(_mm_mul_ps(a,a),sum);
    }

    // horizontal add of the four partial sums into the low lane
    sum = _mm_add_ps(sum,_mm_movehl_ps(_mm_setzero_ps(),sum));
    sum = _mm_add_ss(sum,_mm_shuffle_ps(sum,sum,_MM_SHUFFLE(3,2,1,1)));

    _mm_store_ss(&ret,sum);
    return ret;
  }

  //: r = v * M for a rows x cols matrix M (row-major) and a row vector v.
  static VNL_SSE_FORCE_INLINE void vector_x_matrix(const float* v, const float* m, float* r, unsigned rows, unsigned cols)
  {
    __m128 accum, x,y,z,w;

    // split rows and columns into multiples of 4 plus remainders
    unsigned r_left = rows%4;
    unsigned r_nice = rows - r_left;
    unsigned c_left = cols%4;
    unsigned c_nice = cols - c_left;

    // compute four output columns at a time
    for (unsigned j = 0; j < c_nice; j+=4)
    {
      accum = _mm_setzero_ps();
      unsigned i = 0;
      while ( i < r_nice )
      {
        // load four consecutive vector elements
        w = VNL_SSE_HEAP_LOAD(ps)(v+i);

        // broadcast each element into all four lanes of its own register
        x = _mm_shuffle_ps(w,w,_MM_SHUFFLE(0,0,0,0));
        y = _mm_shuffle_ps(w,w,_MM_SHUFFLE(1,1,1,1));
        z = _mm_shuffle_ps(w,w,_MM_SHUFFLE(2,2,2,2));
        w = _mm_shuffle_ps(w,w,_MM_SHUFFLE(3,3,3,3));

        // multiply each broadcast element with the matching matrix row quadruple (columns j..j+3)
        x = _mm_mul_ps(x,_mm_loadu_ps(m+i++*cols+j));
        y = _mm_mul_ps(y,_mm_loadu_ps(m+i++*cols+j));
        z = _mm_mul_ps(z,_mm_loadu_ps(m+i++*cols+j));
        w = _mm_mul_ps(w,_mm_loadu_ps(m+i++*cols+j));

        // accumulate the four partial products
        accum = _mm_add_ps(x,accum);
        accum = _mm_add_ps(y,accum);
        accum = _mm_add_ps(z,accum);
        accum = _mm_add_ps(w,accum);
      }

      // handle the 1 to 3 leftover rows
      switch (r_left)
      {
        case 3: accum = _mm_add_ps(_mm_mul_ps(_mm_load1_ps(v+i),_mm_loadu_ps(m+i*cols+j)), accum); i++;
        case 2: accum = _mm_add_ps(_mm_mul_ps(_mm_load1_ps(v+i),_mm_loadu_ps(m+i*cols+j)), accum); i++;
        case 1: accum = _mm_add_ps(_mm_mul_ps(_mm_load1_ps(v+i),_mm_loadu_ps(m+i*cols+j)), accum);
        case 0: ;
      }

      // write all four results with a non-temporal store (r is assumed 16-byte aligned)
      _mm_stream_ps(r+j,accum);
    }

    // compute the remaining 1 to 3 columns with scalar ops
    for (; c_left > 0; --c_left) {
      accum = _mm_setzero_ps();
      for (unsigned int i=0; i<rows; ++i)
        accum = _mm_add_ss(_mm_mul_ss(_mm_load_ss(m+i*cols+cols-c_left), _mm_load_ss(v+i)),accum);
      _mm_store_ss(r+cols-c_left,accum);
    }
  }

  //: r = M * v for a rows x cols matrix M (row-major) and a column vector v.
  static VNL_SSE_FORCE_INLINE void matrix_x_vector(const float* m, const float* v, float* r, unsigned rows, unsigned cols)
  {
    __m128 accum, x,y,z,w,mxy1,mxy2,mzw1,mzw2, mr1,mr2,mr3,mr4;

    // split rows and columns into multiples of 4 plus remainders
    unsigned r_left = rows%4;
    unsigned r_nice = rows - r_left;
    unsigned c_left = cols%4;
    unsigned c_nice = cols - c_left;

    // compute four output rows at a time
    for (unsigned i = 0; i < r_nice; i+=4)
    {
      accum = _mm_setzero_ps();
      const float *r1 = m+i*cols, *r2 = m+(i+1)*cols,
                  *r3 = m+(i+2)*cols, *r4 = m+(i+3)*cols;
      unsigned j = 0;
      for (; j < c_nice; j+=4)
      {
        // load four consecutive vector elements
        w = VNL_SSE_HEAP_LOAD(ps)(v+j);

        // broadcast each element into all four lanes of its own register
        x = _mm_shuffle_ps(w,w,_MM_SHUFFLE(0,0,0,0));
        y = _mm_shuffle_ps(w,w,_MM_SHUFFLE(1,1,1,1));
        z = _mm_shuffle_ps(w,w,_MM_SHUFFLE(2,2,2,2));
        w = _mm_shuffle_ps(w,w,_MM_SHUFFLE(3,3,3,3));

        // load the matching quadruple from the first two matrix rows
        mr1 = _mm_loadu_ps(r1+j);
        mr2 = _mm_loadu_ps(r2+j);

        // interleave rows 1 and 2
        mxy1 = _mm_unpacklo_ps(mr1,mr2);
        mzw1 = _mm_unpackhi_ps(mr1,mr2);

        // load the matching quadruple from the last two matrix rows
        mr3 = _mm_loadu_ps(r3+j);
        mr4 = _mm_loadu_ps(r4+j);

        // interleave rows 3 and 4
        mxy2 = _mm_unpacklo_ps(mr3,mr4);
        mzw2 = _mm_unpackhi_ps(mr3,mr4);

        // finish the 4x4 transpose so each product uses one matrix column
        // across all four rows
#if 1
        __m128 mx = _mm_movelh_ps(mxy1,mxy2);
        x = _mm_mul_ps(x, mx);
        __m128 my = _mm_movehl_ps(mxy2,mxy1);
        y = _mm_mul_ps(y, my);
        __m128 mz = _mm_movelh_ps(mzw1,mzw2);
        z = _mm_mul_ps(z, mz);
        __m128 mw = _mm_movehl_ps(mzw2,mzw1);
        w = _mm_mul_ps(w,mw);
#else
        x = _mm_mul_ps(x,_mm_movelh_ps(mxy1,mxy2));
        y = _mm_mul_ps(y,_mm_movehl_ps(mxy1,mxy2));
        z = _mm_mul_ps(z,_mm_movelh_ps(mzw1,mzw2));
        w = _mm_mul_ps(w,_mm_movehl_ps(mzw1,mzw2));
#endif

        // accumulate the four partial products
        accum = _mm_add_ps(x,accum);
        accum = _mm_add_ps(y,accum);
        accum = _mm_add_ps(z,accum);
        accum = _mm_add_ps(w,accum);
      }

      // handle the 1 to 3 leftover columns
      switch (c_left)
      {
        case 3: accum = _mm_add_ps(_mm_mul_ps(_mm_load1_ps(v+j),_mm_set_ps(*(r4+j),*(r3+j),*(r2+j),*(r1+j))), accum); j++;
        case 2: accum = _mm_add_ps(_mm_mul_ps(_mm_load1_ps(v+j),_mm_set_ps(*(r4+j),*(r3+j),*(r2+j),*(r1+j))), accum); j++;
        case 1: accum = _mm_add_ps(_mm_mul_ps(_mm_load1_ps(v+j),_mm_set_ps(*(r4+j),*(r3+j),*(r2+j),*(r1+j))), accum);
        case 0: ;
      }

      // write all four results with a non-temporal store
      _mm_stream_ps(r+i,accum);
    }

    // compute the remaining 1 to 3 rows with scalar ops
    for (; r_left > 0; --r_left) {
      accum = _mm_setzero_ps();
      const float* p = m+(rows-r_left)*cols;
      for (unsigned int j=0; j<cols; ++j)
        accum = _mm_add_ss(_mm_mul_ss(_mm_load_ss(p+j), _mm_load_ss(v+j)),accum);
      _mm_store_ss(r+rows-r_left,accum);
    }
  }
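
// The unpack/movelh/movehl sequence above is a 4x4 transpose, so that each
// product uses one matrix column across all four rows.  With made-up rows
//   mr1 = { a0 a1 a2 a3 },  mr2 = { b0 b1 b2 b3 },
//   mr3 = { c0 c1 c2 c3 },  mr4 = { d0 d1 d2 d3 },
// the column registers become
//   mx = { a0 b0 c0 d0 },  my = { a1 b1 c1 d1 },
//   mz = { a2 b2 c2 d2 },  mw = { a3 b3 c3 d3 },
// and accum collects v[j]*column_j for the four output rows at once.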

  //: Sum of the n elements of x.
  static VNL_SSE_FORCE_INLINE float sum(const float* x, unsigned n)
  {
    float ret;
    __m128 sum = _mm_setzero_ps();
    switch (n%4)
    {
      // pack the 1 to 3 trailing elements into the partial-sum register
      case 3: sum = _mm_load_ss(x+--n);
      case 2: sum = _mm_shuffle_ps(_mm_load_ss(x+--n), sum ,_MM_SHUFFLE(1,0,0,1));
      case 1: sum = _mm_move_ss(sum,_mm_load_ss(x+--n));
      case 0: ;
    }

    // add four floats at a time
    for (int i = n-4; i >= 0; i-=4)
      sum = _mm_add_ps(VNL_SSE_HEAP_LOAD(ps)(x+i),sum);

    // horizontal add of the four partial sums into the low lane
    sum = _mm_add_ps(sum,_mm_movehl_ps(_mm_setzero_ps(),sum));
    sum = _mm_add_ss(sum,_mm_shuffle_ps(sum,sum,_MM_SHUFFLE(3,2,1,1)));
    _mm_store_ss(&ret,sum);
    return ret;
  }

  //: Largest element of x.
  static VNL_SSE_FORCE_INLINE float max(const float* x, unsigned n)
  {
    float ret;
    __m128 min_float = _mm_set1_ps(- FLT_MAX);
    __m128 max = min_float;
    switch (n%4)
    {
      // pack the 1 to 3 trailing elements into the running-max register
      case 3: max = _mm_load_ss(x+--n);
      case 2: max = _mm_shuffle_ps(_mm_load_ss(x+--n), max ,_MM_SHUFFLE(1,0,0,1));
      case 1: max = _mm_move_ss(max,_mm_load_ss(x+--n));
      case 0: ;
    }

    // process four floats at a time
    for (int i = n-4; i >= 0; i-=4)
      max = _mm_max_ps(VNL_SSE_HEAP_LOAD(ps)(x+i), max);

    // horizontal max of the four lanes into the low lane
    max = _mm_max_ps(max,_mm_movehl_ps(min_float,max));
    max = _mm_max_ss(max,_mm_shuffle_ps(max,max,_MM_SHUFFLE(3,2,1,1)));
    _mm_store_ss(&ret,max);

    return ret;
  }

  //: Smallest element of x.
  static VNL_SSE_FORCE_INLINE float min(const float* x, unsigned n)
  {
    float ret;
    __m128 max_float = _mm_set1_ps(FLT_MAX);
    __m128 min = max_float;

    switch (n%4)
    {
      // fold the 1 to 3 trailing elements into the low lane of the running min
      case 3: min = _mm_min_ss(min,_mm_load_ss(x+--n));
      case 2: min = _mm_min_ss(min,_mm_load_ss(x+--n));
      case 1: min = _mm_min_ss(min,_mm_load_ss(x+--n));
      case 0: ;
    }

    // process four floats at a time
    for (int i = n-4; i >= 0; i-=4)
      min = _mm_min_ps(VNL_SSE_HEAP_LOAD(ps)(x+i), min);

    // horizontal min of the four lanes into the low lane
    min = _mm_min_ps(min,_mm_movehl_ps(max_float,min));
    min = _mm_min_ss(min,_mm_shuffle_ps(min,min,_MM_SHUFFLE(3,2,1,1)));
    _mm_store_ss(&ret,min);

    return ret;
  }

  //: Index of the largest element of x (ties may resolve differently from the
  // scalar version).
  static VNL_SSE_FORCE_INLINE unsigned arg_max(const float* x, unsigned n)
  {
    // track a running per-lane maximum and, in parallel, the index where it was found
    __m128 max = _mm_set1_ps(- FLT_MAX);
    __m128 input;
    __m128i input_idx = _mm_set_epi32(3, 2, 1, 0);
    __m128i input_idx_increment = _mm_set1_epi32(4);
    __m128i arg_max = _mm_set1_epi32(0);
    // reinterpret the comparison mask as integers so it can gate the index update
    union IsMaxMask
    {
      __m128 m128;
      __m128i m128i;
    };
    IsMaxMask is_max;

    const int n16 = (n/4) * 4; // largest multiple of 4 not exceeding n

    for (int i=0; i<n16; i+=4)
    {
      input = VNL_SSE_HEAP_LOAD(ps)(x+i);
      max = _mm_max_ps(input, max);
      is_max.m128 = _mm_cmpeq_ps(max, input);
      arg_max = VNL_MM_MAX_EPI32(arg_max, _mm_and_si128(is_max.m128i, input_idx));
      input_idx = _mm_add_epi32(input_idx, input_idx_increment);
    }

    // handle the 1 to 3 trailing elements one at a time
    int mod = n%4;
    n = n-mod;
    input_idx_increment = _mm_set1_epi32(1);
    while (mod != 0)
    {
      input_idx = _mm_set1_epi32(n);
      input = _mm_load1_ps(x+n);
      max = _mm_max_ps(max, input);
      is_max.m128 = _mm_cmpeq_ps(max, input);
      arg_max = VNL_MM_MAX_EPI32(arg_max, _mm_and_si128(is_max.m128i, input_idx));
      --mod;
      ++n;
      input_idx = _mm_add_epi32(input_idx, input_idx_increment);
    }

    // reduce the four lanes: keep only the index whose lane holds the overall maximum
    is_max.m128 = max;
    max = _mm_max_ps(max, _mm_shuffle_ps(max, max, _MM_SHUFFLE(2,3,0,1)));
    max = _mm_max_ps(max, _mm_movehl_ps(max, max));
    max = _mm_shuffle_ps(max, max, _MM_SHUFFLE(0,0,0,0));
    is_max.m128 = _mm_cmpeq_ps(is_max.m128, max);
    arg_max = _mm_and_si128(is_max.m128i, arg_max);
    arg_max = VNL_MM_MAX_EPI32(arg_max, _mm_unpackhi_epi32(arg_max, _mm_set1_epi32(0)));
    arg_max = VNL_MM_MAX_EPI32(arg_max, _mm_srli_si128(arg_max, 4));
    unsigned ret = _mm_cvtsi128_si32(arg_max);
    return ret;
  }

  //: Index of the smallest element of x (ties may resolve differently from the
  // scalar version).
  static VNL_SSE_FORCE_INLINE unsigned arg_min(const float* x, unsigned n)
  {
    // track a running per-lane minimum and, in parallel, the index where it was found
    __m128 min = _mm_set1_ps(FLT_MAX);
    __m128 input;
    __m128i input_idx = _mm_set_epi32(3, 2, 1, 0);
    __m128i input_idx_increment = _mm_set1_epi32(4);
    __m128i arg_min = _mm_set1_epi32(0);
    union IsMinMask
    {
      __m128 m128;
      __m128i m128i;
    };
    IsMinMask is_min;

    const int n16 = (n/4) * 4; // largest multiple of 4 not exceeding n

    for (int i=0; i<n16; i+=4)
    {
      input = VNL_SSE_HEAP_LOAD(ps)(x+i);
      min = _mm_min_ps(input, min);
      is_min.m128 = _mm_cmpeq_ps(min, input);
      arg_min = VNL_MM_MAX_EPI32(arg_min, _mm_and_si128(is_min.m128i, input_idx));
      input_idx = _mm_add_epi32(input_idx, input_idx_increment);
    }

    // handle the 1 to 3 trailing elements one at a time
    int mod = n%4;
    n = n-mod;
    input_idx_increment = _mm_set1_epi32(1);
    while (mod != 0)
    {
      input_idx = _mm_set1_epi32(n);
      input = _mm_load1_ps(x+n);
      min = _mm_min_ps(min, input);
      is_min.m128 = _mm_cmpeq_ps(min, input);
      arg_min = VNL_MM_MAX_EPI32(arg_min, _mm_and_si128(is_min.m128i, input_idx));
      --mod;
      ++n;
      input_idx = _mm_add_epi32(input_idx, input_idx_increment);
    }

    // reduce the four lanes: keep only the index whose lane holds the overall minimum
    is_min.m128 = min;
    min = _mm_min_ps(min, _mm_shuffle_ps(min, min, _MM_SHUFFLE(2,3,0,1)));
    min = _mm_min_ps(min, _mm_movehl_ps(min, min));
    min = _mm_shuffle_ps(min, min, _MM_SHUFFLE(0,0,0,0));
    is_min.m128 = _mm_cmpeq_ps(is_min.m128, min);
    arg_min = _mm_and_si128(is_min.m128i, arg_min);
    arg_min = VNL_MM_MAX_EPI32(arg_min, _mm_unpackhi_epi32(arg_min, _mm_set1_epi32(0)));
    arg_min = VNL_MM_MAX_EPI32(arg_min, _mm_srli_si128(arg_min, 4));
    unsigned ret = _mm_cvtsi128_si32(arg_min);
    return ret;
  }
};

#endif // VNL_CONFIG_ENABLE_SSE2

#endif // vnl_sse_h_