core/vnl/vnl_sse.h
00001 #ifndef vnl_sse_h_
00002 #define vnl_sse_h_
00003 //:
00004 // \file
00005 // \author Kieran O'Mahony
00006 // \date Sep 2007
00007 // \brief Support for Streaming SIMD Extensions to speed up vector arithmetic
00008 // \verbatim
00009 //  Modifications
00010 //   2009-03-30 Peter Vanroose - Added arg_min() & arg_max() and reimplemented min() & max()
00011 // \endverbatim
00012 
00013 #include <vcl_compiler.h> // for macro decisions based on compiler type
00014 #include <vxl_config.h>   // for checking supported integer data types
00015 #include <vcl_cfloat.h>   // for DBL_MAX and FLT_MAX
00016 
00017 #include <vnl/vnl_config.h> // is SSE enabled
00018 #include <vnl/vnl_alloc.h>  // fallback allocator when no aligned allocation is available
00019 
00020 // some caveats...
00021 // - Due to the way vnl_matrix is represented in memory, 16-byte alignment cannot be guaranteed,
00022 //   so the slower unaligned load intrinsics have to be used for matrices.
00023 // - The GCC 3.4 intrinsics seem to be horrendously slow...
00024 
00025 // - On Mac OS X, in order to support Universal Binaries, we do not consider it a hard
00026 //   error if VNL_CONFIG_ENABLE_SSE2 is true for PowerPC builds. PowerPC obviously does
00027 //   not support SSE2, so we simply redefine it to false.
00028 #if VNL_CONFIG_ENABLE_SSE2
00029 # if defined(__APPLE__) && (defined(__ppc__) || defined(__ppc64__))
00030 #   undef VNL_CONFIG_ENABLE_SSE2
00031 #   define VNL_CONFIG_ENABLE_SSE2 0
00032 # elif !VXL_HAS_EMMINTRIN_H
00033 #   error "Required file emmintrin.h for SSE2 not found"
00034 # else
00035 #   include <emmintrin.h> // sse 2 intrinsics
00036 # endif
00037 #endif
00038 
00039 
00040 // Try to use compiler-specific keywords to force inlining where possible.
00041 // The keyword for aligning stack memory is also compiler dependent.
00042 #if defined(VCL_GCC)
00043 // With attribute always_inline, gcc can give an error if a function
00044 // cannot be inlined, so it is disabled.  Problem seen on 64 bit
00045 // platforms with vcl_vector<vnl_rational>.
00046 # define VNL_SSE_FORCE_INLINE /* __attribute__((always_inline)) */ inline
00047 # define VNL_SSE_STACK_ALIGNED(x)  __attribute__((aligned(x)))
00048 #elif defined VCL_VC || defined VCL_ICC
00049 # define VNL_SSE_FORCE_INLINE __forceinline
00050 # define VNL_SSE_STACK_ALIGNED(x)  __declspec(align(x))
00051 #else
00052 # define VNL_SSE_FORCE_INLINE inline
00053 # define VNL_SSE_STACK_ALIGNED(x)
00054 # define VNL_SSE_STACK_STORE(pf) _mm_storeu_##pf // no stack alignment so use unaligned store (slower!)
00055 #endif
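// Illustrative note (not from the original header): a stack buffer intended for the
// aligned store intrinsics would typically be declared with the alignment macro, e.g.
//
//   VNL_SSE_STACK_ALIGNED(16) double buf[2];        // 16-byte aligned stack array
//   VNL_SSE_STACK_STORE(pd)(buf, _mm_set1_pd(1.0)); // _mm_store_pd, or _mm_storeu_pd in
//                                                   // the unknown-compiler fallback branch
//
// This sketch assumes VNL_CONFIG_ENABLE_SSE2 is on so that the intrinsics exist.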
00056 
00057 
00058 // SSE operates faster on 16-byte aligned memory addresses.
00059 // Check which aligned memory allocation function is supported.
00060 #if VNL_CONFIG_ENABLE_SSE2 && VXL_HAS_MM_MALLOC
00061 # define VNL_SSE_ALLOC(n,s,a) _mm_malloc(n*s,a)
00062 # define VNL_SSE_FREE(v,n,s) _mm_free(v)
00063 #elif VNL_CONFIG_ENABLE_SSE2 && VXL_HAS_ALIGNED_MALLOC
00064 # include <malloc.h>
00065 # define VNL_SSE_ALLOC(n,s,a) _aligned_malloc(n*s,a)
00066 # define VNL_SSE_FREE(v,n,s) _aligned_free(v)
00067 #elif VNL_CONFIG_ENABLE_SSE2 && VXL_HAS_MINGW_ALIGNED_MALLOC
00068 # include <malloc.h>
00069 # define VNL_SSE_ALLOC(n,s,a) __mingw_aligned_malloc(n*s,a)
00070 # define VNL_SSE_FREE(v,n,s) __mingw_aligned_free(v)
00071 #elif VNL_CONFIG_ENABLE_SSE2 && VXL_HAS_POSIX_MEMALIGN
00072 # include <vcl_cstdlib.h>
00073 # define VNL_SSE_ALLOC(n,s,a) memalign(a,n*s)
00074 # define VNL_SSE_FREE(v,n,s) vcl_free(v)
00075 #else // sse2 disabled or could not get memory alignment support, use slower unaligned based intrinsics
00076 # define VNL_SSE_HEAP_STORE(pf) _mm_storeu_##pf
00077 # define VNL_SSE_HEAP_LOAD(pf) _mm_loadu_##pf
00078 # if VNL_CONFIG_THREAD_SAFE
00079 #   define VNL_SSE_ALLOC(n,s,a) new char[n*s]
00080 #   define VNL_SSE_FREE(v,n,s) delete [] static_cast<char*>(v)
00081 # else
00082 #   define VNL_SSE_ALLOC(n,s,a) vnl_alloc::allocate((n == 0) ? 8 : (n * s));
00083 #   define VNL_SSE_FREE(v,n,s) if (v) vnl_alloc::deallocate(v, (n == 0) ? 8 : (n * s));
00084 # endif
00085 #endif
00086 
00087 
00088 // Stack memory can be aligned -> use SSE aligned store
00089 #ifndef VNL_SSE_STACK_STORE
00090 # define VNL_SSE_STACK_STORE(pf) _mm_store_##pf
00091 #endif
00092 
00093 // Heap memory can be aligned -> use SSE aligned load & store
00094 #ifndef VNL_SSE_HEAP_STORE
00095 # define VNL_SSE_HEAP_STORE(pf) _mm_store_##pf
00096 # define VNL_SSE_HEAP_LOAD(pf) _mm_load_##pf
00097 #endif
00098 
00099 //: Custom memory allocation function to force 16 byte alignment of data
00100 VNL_SSE_FORCE_INLINE void* vnl_sse_alloc(vcl_size_t n, unsigned size)
00101 {
00102   return VNL_SSE_ALLOC(n,size,16);
00103 }
00104 
00105 //: Custom memory deallocation function to free 16-byte-aligned data
00106 VNL_SSE_FORCE_INLINE void vnl_sse_dealloc(void* mem, vcl_size_t n, unsigned size)
00107 {
00108   VNL_SSE_FREE(mem,n,size);
00109 }
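// Illustrative usage (not part of the original header): buffers fed to the SSE2
// specialisations below are expected to be 16-byte aligned, which these helpers
// provide whenever an aligned allocator was found above. A minimal sketch:
//
//   vcl_size_t n = 100;
//   double* data = static_cast<double*>(vnl_sse_alloc(n, sizeof(double)));
//   // ... fill data[0..n-1], call e.g. vnl_sse<double>::sum(data, n) ...
//   vnl_sse_dealloc(data, n, sizeof(double));
//
// The same (n, size) pair must be passed to vnl_sse_dealloc, because the
// vnl_alloc fallback needs the block size in order to release it.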
00110 
00111 // avoid inlining when debugging
00112 #ifndef NDEBUG
00113 #undef VNL_SSE_FORCE_INLINE
00114 #define VNL_SSE_FORCE_INLINE
00115 #endif
00116 
00117 #if VNL_CONFIG_ENABLE_SSE2
00118 class vnl_sse_supplement
00119 {
00120 public:
00121   // SSE2 does not have a native _mm_min_epi32 or _mm_max_epi32 (le sigh-- SSE4.1
00122   // provides these).  So, these are substitutes written in SSE2 based off the
00123   // SSEPlus library.
00124   static VNL_SSE_FORCE_INLINE __m128i vnl_mm_min_epi32(__m128i a, __m128i b)
00125   {
00126     __m128i mask  = _mm_cmplt_epi32(a, b);
00127     a = _mm_and_si128(a, mask);
00128     b = _mm_andnot_si128(mask, b);
00129     a = _mm_or_si128(a, b);
00130     return a;
00131   }
00132 
00133   static VNL_SSE_FORCE_INLINE __m128i vnl_mm_max_epi32(__m128i a, __m128i b)
00134   {
00135     __m128i mask  = _mm_cmpgt_epi32(a, b);
00136     a = _mm_and_si128(a, mask);
00137     b = _mm_andnot_si128(mask, b);
00138     a = _mm_or_si128(a, b);
00139     return a;
00140   }
00141 };
00142 // If SSE4.1 is available, these can be replaced by their native
00143 // implementations.
00144 #define VNL_MM_MIN_EPI32 vnl_sse_supplement::vnl_mm_min_epi32
00145 #define VNL_MM_MAX_EPI32 vnl_sse_supplement::vnl_mm_max_epi32
00146 #endif // VNL_CONFIG_ENABLE_SSE2
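// Added note: the two substitutes above use the standard compare-and-blend idiom.
// The comparison yields an all-ones 32-bit mask in each lane where the condition
// holds, and the AND/ANDNOT/OR sequence then picks a or b per lane. A scalar
// sketch of the per-lane semantics (min shown; max is analogous):
//
//   int scalar_min_sketch(int a, int b)   // reference semantics only
//   {
//     int mask = (a < b) ? ~0 : 0;        // _mm_cmplt_epi32
//     return (a & mask) | (b & ~mask);    // _mm_and_si128 / _mm_andnot_si128 / _mm_or_si128
//   }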
00147 
00148 //: Bog standard (no SSE) implementation for non-SSE-enabled hardware and any type which doesn't have a template specialisation.
00149 template <class T>
00150 class vnl_sse
00151 {
00152  public:
00153   static VNL_SSE_FORCE_INLINE void element_product(const T* x, const T* y, T* r, unsigned n)
00154   {
00155     for (unsigned i = 0; i < n; ++i)
00156       r[i] = x[i] * y[i];
00157   }
00158 
00159   static VNL_SSE_FORCE_INLINE T dot_product(const T* x, const T* y, unsigned n)
00160   {
00161     T sum(0);
00162     for (unsigned i = 0; i < n; ++i)
00163       sum += x[i] * y[i];
00164     return sum;
00165   }
00166 
00167   static VNL_SSE_FORCE_INLINE T euclid_dist_sq(const T* x, const T* y, unsigned n)
00168   {
00169     // IMS: Unable to optimise this any further for MSVC compiler
00170     T sum(0);
00171   #ifdef VCL_VC_6
00172     for (unsigned i=0; i<n; ++i)
00173     {
00174       const T diff = x[i] - y[i];
00175       sum += diff*diff;
00176     }
00177   #else
00178     --x;
00179     --y;
00180     while (n!=0)
00181     {
00182       const T diff = x[n] - y[n];
00183       sum += diff*diff;
00184       --n;
00185     }
00186   #endif
00187     return sum;
00188   }
00189 
00190   static VNL_SSE_FORCE_INLINE void vector_x_matrix(const T* v, const T* m, T* r, unsigned rows, unsigned cols)
00191   {
00192     for (unsigned int j=0; j<cols; ++j) {
00193       T som(0);
00194       for (unsigned int i=0; i<rows; ++i)
00195         som += (m+i*cols)[j] * v[i];
00196       r[j] = som;
00197     }
00198   }
00199 
00200   static VNL_SSE_FORCE_INLINE void matrix_x_vector(const T* m, const T* v, T* r, unsigned rows, unsigned cols)
00201   {
00202     for (unsigned int i=0; i<rows; ++i) {
00203       T som(0);
00204       for (unsigned int j=0; j<cols; ++j)
00205         som += (m+i*cols)[j] * v[j];
00206       r[i] = som;
00207     }
00208   }
00209 
00210   static VNL_SSE_FORCE_INLINE T sum(const T* v, unsigned n)
00211   {
00212     T tot(0);
00213     for (unsigned i = 0; i < n; ++i)
00214       tot += *v++;
00215     return tot;
00216   }
00217 
00218   static VNL_SSE_FORCE_INLINE T max(const T* v, unsigned n)
00219   {
00220     if (n==0) return T(0); // the maximum of an empty set is undefined
00221     T tmp = *v;
00222     while (--n > 0)
00223       if (*++v > tmp)
00224         tmp = *v;
00225     return tmp;
00226   }
00227 
00228   static VNL_SSE_FORCE_INLINE T min(const T* v, unsigned n)
00229   {
00230     if (n==0) return T(0); // the minimum of an empty set is undefined
00231     T tmp = *v;
00232     while (--n > 0)
00233       if (*++v < tmp)
00234         tmp = *v;
00235     return tmp;
00236   }
00237 
00238   static VNL_SSE_FORCE_INLINE unsigned arg_max(const T* v, unsigned n)
00239   {
00240     if (n==0) return unsigned(-1); // the maximum of an empty set is undefined
00241     T tmp = *v;
00242     unsigned idx = 0;
00243     for (unsigned i=1; i<n; ++i)
00244       if (*++v > tmp)
00245         tmp = *v, idx = i;
00246     return idx;
00247   }
00248 
00249   static VNL_SSE_FORCE_INLINE unsigned arg_min(const T* v, unsigned n)
00250   {
00251     if (n==0) return unsigned(-1); // the minimum of an empty set is undefined
00252     T tmp = *v;
00253     unsigned idx = 0;
00254     for (unsigned i=1; i<n; ++i)
00255       if (*++v < tmp)
00256         tmp = *v, idx = i;
00257     return idx;
00258   }
00259 };
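// Illustrative usage of the interface (not part of the original header). The SSE2
// specialisations that follow keep the same signatures, so calling code is written
// against vnl_sse<T> regardless of whether SSE2 is enabled:
//
//   const double x[4] = { 1.0, 2.0, 3.0, 4.0 };
//   const double y[4] = { 4.0, 3.0, 2.0, 1.0 };
//   double d  = vnl_sse<double>::dot_product(x, y, 4);    // 20
//   double e2 = vnl_sse<double>::euclid_dist_sq(x, y, 4); // 9+1+1+9 = 20
//   unsigned k = vnl_sse<double>::arg_max(x, 4);          // 3
//
// When SSE2 is enabled and an aligned allocator is in use, some specialised
// routines load with the aligned intrinsics, so in practice the arrays should
// come from vnl_sse_alloc (or otherwise be 16-byte aligned).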
00260 
00261 #if VNL_CONFIG_ENABLE_SSE2
00262 
00263 //: SSE2 implementation for double precision floating point (64 bit)
00264 VCL_DEFINE_SPECIALIZATION
00265 class vnl_sse<double>
00266 {
00267  public:
00268   static VNL_SSE_FORCE_INLINE void element_product(const double* x, const double* y, double* r, unsigned n)
00269   {
00270     switch (n%4)
00271     {
00272       // do scalar (single value) load, multiply and store for end elements
00273       case 3: --n; _mm_store_sd(r+n,_mm_mul_sd(_mm_load_sd(x+n),_mm_load_sd(y+n)));
00274       case 2: --n; _mm_store_sd(r+n,_mm_mul_sd(_mm_load_sd(x+n),_mm_load_sd(y+n)));
00275       case 1: --n; _mm_store_sd(r+n,_mm_mul_sd(_mm_load_sd(x+n),_mm_load_sd(y+n)));
00276       case 0: ;
00277     }
00278 
00279     // load, multiply and store two doubles at a time
00280     // loop unroll to handle 4
00281     if (vcl_ptrdiff_t(x)%16 || vcl_ptrdiff_t(y)%16  || vcl_ptrdiff_t(r)%16)
00282           // unaligned case
00283       for (int i = n-4; i >= 0; i-=4)
00284       {
00285         _mm_storeu_pd(r+i,_mm_mul_pd(_mm_loadu_pd(x+i),_mm_loadu_pd(y+i)));
00286         _mm_storeu_pd(r+i+2,_mm_mul_pd(_mm_loadu_pd(x+i+2),_mm_loadu_pd(y+i+2)));
00287       }
00288     else  // aligned case
00289       for (int i = n-4; i >= 0; i-=4)
00290       {
00291         VNL_SSE_HEAP_STORE(pd)(r+i,_mm_mul_pd(VNL_SSE_HEAP_LOAD(pd)(x+i),VNL_SSE_HEAP_LOAD(pd)(y+i)));
00292         VNL_SSE_HEAP_STORE(pd)(r+i+2,_mm_mul_pd(VNL_SSE_HEAP_LOAD(pd)(x+i+2),VNL_SSE_HEAP_LOAD(pd)(y+i+2)));
00293       }
00294   }
00295 
00296   static VNL_SSE_FORCE_INLINE double dot_product(const double* x, const double* y, unsigned n)
00297   {
00298     double ret;
00299     __m128d sum;
00300     if (n%2)
00301     {
00302       // handle single element at end of odd sized vectors
00303       n--; sum = _mm_mul_sd(_mm_load_sd(x+n),_mm_load_sd(y+n));
00304     }
00305     else
00306       sum = _mm_setzero_pd();
00307 
00308     if (vcl_ptrdiff_t(x)%16 || vcl_ptrdiff_t(y)%16)
00309          // unaligned case
00310       for (int i = n-2; i >= 0; i-=2)
00311         sum = _mm_add_pd(_mm_mul_pd(_mm_loadu_pd(x+i), _mm_loadu_pd(y+i)),sum);
00312     else // aligned case
00313       for (int i = n-2; i >= 0; i-=2)
00314         sum = _mm_add_pd(_mm_mul_pd(VNL_SSE_HEAP_LOAD(pd)(x+i), VNL_SSE_HEAP_LOAD(pd)(y+i)),sum);
00315 
00316     // sum will contain 2 accumulated values, need to add them together
00317     sum = _mm_add_sd(sum,_mm_unpackhi_pd(sum,_mm_setzero_pd()));
00318     _mm_store_sd(&ret,sum);
00319     return ret;
00320   }
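  // Added note on the reduction above: after the loop, sum holds two partial
  // dot products, one per 64-bit lane. _mm_unpackhi_pd moves the high lane into
  // the low lane of a temporary, and _mm_add_sd adds the two lanes, leaving the
  // scalar result in the low lane for _mm_store_sd. The same reduction pattern
  // is used by euclid_dist_sq and sum below.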
00321 
00322   static VNL_SSE_FORCE_INLINE double euclid_dist_sq(const double* x, const double* y, unsigned n)
00323   {
00324     double ret;
00325     __m128d sum,a;
00326 
00327     if (n%2)
00328     {
00329       // handle single element at end of odd sized vectors
00330         n--; a = _mm_sub_sd(_mm_load_sd(x+n),_mm_load_sd(y+n));
00331         sum = _mm_mul_sd(a,a);
00332     }
00333     else
00334       sum = _mm_setzero_pd();
00335 
00336     if (vcl_ptrdiff_t(x)%16 || vcl_ptrdiff_t(y)%16)
00337          // unaligned case
00338       for ( int i = n-2; i >= 0; i-=2 )
00339       {
00340         a = _mm_sub_pd(_mm_loadu_pd(x+i),_mm_loadu_pd(y+i));
00341         sum = _mm_add_pd(_mm_mul_pd(a,a),sum);
00342       }
00343     else // aligned case
00344       for ( int i = n-2; i >= 0; i-=2 )
00345       {
00346         a = _mm_sub_pd(VNL_SSE_HEAP_LOAD(pd)(x+i),VNL_SSE_HEAP_LOAD(pd)(y+i));
00347         sum = _mm_add_pd(_mm_mul_pd(a,a),sum);
00348       }
00349 
00350     // sum will contain 2 accumulated values, need to add them together
00351     sum = _mm_add_sd(sum,_mm_unpackhi_pd(sum,_mm_setzero_pd()));
00352     _mm_store_sd(&ret,sum);
00353     return ret;
00354   }
00355 
00356   static VNL_SSE_FORCE_INLINE void vector_x_matrix(const double* v, const double* m, double* r, unsigned rows, unsigned cols)
00357   {
00358     __m128d accum, x,y,z,w;
00359 
00360     // calculate if there are any left-over rows/columns
00361     unsigned r_left = rows%4;
00362     unsigned r_nice = rows - r_left;
00363     unsigned c_left = cols%2;
00364     unsigned c_nice = cols - c_left;
00365 
00366     // handle 2 matrix columns at a time
00367     for (unsigned j = 0; j < c_nice; j+=2)
00368     {
00369       // handle 4 matrix rows at a time
00370       accum = _mm_setzero_pd();
00371       unsigned i = 0;
00372       while ( i < r_nice )
00373       {
00374         // load vector data so that
00375         // y = (v0,v1) , w = (v2,v3)
00376         y = VNL_SSE_HEAP_LOAD(pd)(v+i);
00377         w = VNL_SSE_HEAP_LOAD(pd)(v+i+2);
00378 
00379         _mm_prefetch((const char*)(v+i+4), _MM_HINT_NTA);
00380 
00381         // after shuffling
00382         // x = (v0, v0)
00383         // y = (v1, v1)
00384         // z = (v2, v2)
00385         // w = (v3, v3)
00386         x = _mm_shuffle_pd(y,y,_MM_SHUFFLE2(0,0));
00387         y = _mm_shuffle_pd(y,y,_MM_SHUFFLE2(1,1));
00388         z = _mm_shuffle_pd(w,w,_MM_SHUFFLE2(0,0));
00389         w = _mm_shuffle_pd(w,w,_MM_SHUFFLE2(1,1));
00390 
00391         // multiply the two matrix columns
00392         // i.e.  x = ( v0 * m00, v0 * m01)
00393         //       y = ( v1 * m10, v1 * m11)
00394         //       z = ( v2 * m20, v2 * m21)
00395         //       w = ( v3 * m30, v3 * m31)
00396         x = _mm_mul_pd(x,_mm_loadu_pd(i++*cols+m+j));
00397         y = _mm_mul_pd(y,_mm_loadu_pd(i++*cols+m+j));
00398         z = _mm_mul_pd(z,_mm_loadu_pd(i++*cols+m+j));
00399         w = _mm_mul_pd(w,_mm_loadu_pd(i++*cols+m+j));
00400 
00401         // now sum both columns
00402         accum = _mm_add_pd(x,accum);
00403         accum = _mm_add_pd(y,accum);
00404         accum = _mm_add_pd(z,accum);
00405         accum = _mm_add_pd(w,accum);
00406 
00407         // accum is now ( v0 * m00 + v1 * m10 + v2 * m20 + v3 * m30,
00408         //                v0 * m01 + v1 * m11 + v2 * m21 + v3 * m31 )
00409       }
00410 
00411       // handle left-over rows
00412       switch (r_left)
00413       {
00414         case 3: accum = _mm_add_pd(_mm_mul_pd(_mm_load1_pd(v+i),_mm_loadu_pd(m+i*cols+j)), accum); i++;
00415         case 2: accum = _mm_add_pd(_mm_mul_pd(_mm_load1_pd(v+i),_mm_loadu_pd(m+i*cols+j)), accum); i++;
00416         case 1: accum = _mm_add_pd(_mm_mul_pd(_mm_load1_pd(v+i),_mm_loadu_pd(m+i*cols+j)), accum);
00417         case 0: ;
00418       }
00419 
00420       // store the 2 values of the result vector
00421       // use stream to avoid polluting the cache
00422       _mm_stream_pd(r+j,accum);
00423     }
00424 
00425     // handle the left over columns
00426     if ( c_left )
00427     {
00428       accum = _mm_setzero_pd();
00429       for (unsigned int i=0; i<rows; ++i)
00430         accum = _mm_add_sd(_mm_mul_sd(_mm_load_sd(m+i*cols+cols-1),_mm_load_sd(v+i)),accum);
00431       _mm_store_sd(r+cols-1, accum);
00432     }
00433   }
00434 
00435   static VNL_SSE_FORCE_INLINE void matrix_x_vector(const double* m, const double* v, double* r, unsigned rows, unsigned cols)
00436   {
00437     __m128d accum, x,y,mxy1,mxy2;
00438 
00439     // calculate if there are any left-over rows/columns
00440     unsigned r_left = rows%2;
00441     unsigned r_nice = rows - r_left;
00442     unsigned c_left = cols%2;
00443     unsigned c_nice = cols - c_left;
00444 
00445     // handle 2 matrix rows at a time
00446     for (unsigned i = 0; i < r_nice; i+=2)
00447     {
00448       // handle 2 matrix columns at a time
00449       accum = _mm_setzero_pd();
00450       const double *r1 = m+i*cols, *r2 = m+(i+1)*cols;
00451       unsigned j = 0;
00452       for (; j < c_nice; j+=2)
00453       {
00454         // load  vector data so that
00455         //  y = (v0, v1)
00456         y = VNL_SSE_HEAP_LOAD(pd)(v+j);
00457 
00458         // shuffle so that
00459         //  x = (v0,v0)   y = (v1,v1)
00460         x = _mm_shuffle_pd(y,y,_MM_SHUFFLE2(0,0));
00461         y = _mm_shuffle_pd(y,y,_MM_SHUFFLE2(1,1));
00462 
00463         // load the matrix data so that
00464         // mxy1 = (m00,m01),  mxy2 = (m10,m11)
00465         mxy1 = _mm_loadu_pd(r1+j);
00466         mxy2 = _mm_loadu_pd(r2+j);
00467 
00468         // unpack matrix data to achieve
00469         //  (v0,v0) * (m00,m10)
00470         //  (v1,v1) * (m01,m11)
00471         x = _mm_mul_pd(x,_mm_unpacklo_pd(mxy1,mxy2));
00472         y = _mm_mul_pd(y,_mm_unpackhi_pd(mxy1,mxy2));
00473 
00474         // now sum the results
00475         accum = _mm_add_pd(x,accum);
00476         accum = _mm_add_pd(y,accum);
00477 
00478         // accum is now ( v0 * m00 + v1 * m01,
00479         //                v0 * m10 + v1 * m11 )
00480       }
00481       // handle the left over columns
00482       if (c_left)
00483         accum = _mm_add_pd(_mm_mul_pd(_mm_load1_pd(v+j),_mm_set_pd(*(r2+j),*(r1+j))), accum);
00484 
00485       // store the 2 values of the result vector
00486       // use stream to avoid polluting the cache
00487       _mm_stream_pd(r+i,accum);
00488     }
00489 
00490     // handle the left over rows
00491     if ( r_left )
00492     {
00493       accum = _mm_setzero_pd();
00494       const double* p = m+(rows-1)*cols;
00495       for (unsigned int j=0; j<cols; ++j)
00496         accum = _mm_add_sd(_mm_mul_sd(_mm_load_sd(p+j),_mm_load_sd(v+j)),accum);
00497       _mm_store_sd(r+rows-1, accum);
00498     }
00499   }
00500 
00501   static VNL_SSE_FORCE_INLINE double sum(const double* x, unsigned n)
00502   {
00503     double ret;
00504     // decision logic for odd sized vectors
00505     __m128d sum = n%2 ? _mm_load_sd(x+--n) : _mm_setzero_pd();
00506 
00507     // sum two elements at a time, sum will contain two running totals
00508     for (int i = n-2; i >= 0; i-=2)
00509       sum = _mm_add_pd(VNL_SSE_HEAP_LOAD(pd)(x+i),sum);
00510 
00511     // sum will contain 2 accumulated values, need to add them together
00512     sum = _mm_add_sd(sum,_mm_unpackhi_pd(sum,_mm_setzero_pd()));
00513     _mm_store_sd(&ret,sum);
00514     return ret;
00515   }
00516 
00517   static VNL_SSE_FORCE_INLINE unsigned arg_max(const double* x, unsigned n)
00518   {
00519     if (n == 1)
00520       return 0;
00521 
00522     __m128d min_double = _mm_set1_pd(- DBL_MAX);
00523     __m128d max = min_double;
00524     __m128d input;
00525     __m128i input_idx = _mm_set_epi32(1, 1, 0, 0);
00526     __m128i input_idx_increment = _mm_set1_epi32(2);
00527     __m128i arg_max = _mm_set1_epi32(0);
00528     union IsMaxMask
00529       {
00530       __m128d m128d;
00531       __m128i m128i;
00532       };
00533     IsMaxMask is_max;
00534 
00535     const int n16 = (n/2) * 2;
00536     // handle two elements at a time, max will contain two max values
00537     for (int i=0; i<n16; i+=2)
00538       {
00539       input = VNL_SSE_HEAP_LOAD(pd)(x+i);
00540       max = _mm_max_pd(input, max);
00541       is_max.m128d = _mm_cmpeq_pd(max, input);
00542       arg_max = VNL_MM_MAX_EPI32(arg_max, _mm_and_si128(is_max.m128i, input_idx));
00543       input_idx = _mm_add_epi32(input_idx, input_idx_increment);
00544       }
00545 
00546     // decision logic for odd sized vectors
00547     if (n%2)
00548       {
00549       input_idx = _mm_set1_epi32(--n);
00550       input = _mm_load1_pd(x+n);
00551       max = _mm_max_sd(max, input);
00552       is_max.m128d = _mm_cmpeq_pd(max, input);
00553       arg_max = VNL_MM_MAX_EPI32(arg_max, _mm_and_si128(is_max.m128i, input_idx));
00554       }
00555 
00556     // need to store the index of the max value
00557     is_max.m128d = max;
00558     max = _mm_max_sd(_mm_unpackhi_pd(max, min_double), max);
00559     max = _mm_unpacklo_pd(max, max);
00560     is_max.m128d = _mm_cmpeq_pd(is_max.m128d, max);
00561     arg_max = _mm_and_si128(is_max.m128i, arg_max);
00562     arg_max = VNL_MM_MAX_EPI32(arg_max, _mm_unpackhi_epi32(arg_max, _mm_set1_epi32(0)));
00563     unsigned ret = _mm_cvtsi128_si32(arg_max);
00564     return ret;
00565   }
00566 
00567   static VNL_SSE_FORCE_INLINE unsigned arg_min(const double* x, unsigned n)
00568   {
00569     if (n == 1)
00570       return 0;
00571 
00572     __m128d max_double = _mm_set1_pd(DBL_MAX);
00573     __m128d min = max_double;
00574     __m128d input;
00575     __m128i input_idx = _mm_set_epi32(1, 1, 0, 0);
00576     __m128i input_idx_increment = _mm_set1_epi32(2);
00577     __m128i arg_min = _mm_set1_epi32(0);
00578     union IsMinMask
00579       {
00580       __m128d m128d;
00581       __m128i m128i;
00582       };
00583     IsMinMask is_min;
00584 
00585     const int n16 = (n/2) * 2;
00586     // handle two elements at a time, min will contain two min values
00587     for (int i=0; i<n16; i+=2)
00588       {
00589       input = VNL_SSE_HEAP_LOAD(pd)(x+i);
00590       min = _mm_min_pd(input, min);
00591       is_min.m128d = _mm_cmpeq_pd(min, input);
00592       arg_min = VNL_MM_MAX_EPI32(arg_min, _mm_and_si128(is_min.m128i, input_idx));
00593       input_idx = _mm_add_epi32(input_idx, input_idx_increment);
00594       }
00595 
00596     // decision logic for odd sized vectors
00597     if (n%2)
00598       {
00599       input_idx = _mm_set1_epi32(--n);
00600       input = _mm_load1_pd(x+n);
00601       min = _mm_min_sd(min, input);
00602       is_min.m128d = _mm_cmpeq_pd(min, input);
00603       arg_min = VNL_MM_MAX_EPI32(arg_min, _mm_and_si128(is_min.m128i, input_idx));
00604       }
00605 
00606     // need to store the index of the min value
00607     is_min.m128d = min;
00608     min = _mm_min_sd(_mm_unpackhi_pd(min, max_double), min);
00609     min = _mm_unpacklo_pd(min, min);
00610     is_min.m128d = _mm_cmpeq_pd(is_min.m128d, min);
00611     arg_min = _mm_and_si128(is_min.m128i, arg_min);
00612     arg_min = VNL_MM_MAX_EPI32(arg_min, _mm_unpackhi_epi32(arg_min, _mm_set1_epi32(0)));
00613     unsigned ret = _mm_cvtsi128_si32(arg_min);
00614     return ret;
00615   }
00616 
00617   static VNL_SSE_FORCE_INLINE double max(const double* x, unsigned n)
00618   {
00619     double ret;
00620     __m128d min_double = _mm_set1_pd(- DBL_MAX);
00621     __m128d max = min_double;
00622 
00623     // decision logic for odd sized vectors
00624     if (n%2)
00625       max = _mm_max_sd(max,_mm_load_sd(x+--n));
00626 
00627     // handle two elements at a time, max will contain two max values
00628     for (int i=n-2; i>=0; i-=2)
00629       max = _mm_max_pd(VNL_SSE_HEAP_LOAD(pd)(x+i), max);
00630 
00631     // need to store max's two values
00632     max = _mm_max_sd(max,_mm_unpackhi_pd(max,min_double));
00633     _mm_store_sd(&ret,max);
00634     return ret;
00635   }
00636 
00637   static VNL_SSE_FORCE_INLINE double min(const double* x, unsigned n)
00638   {
00639     double ret;
00640     __m128d max_double = _mm_set1_pd(DBL_MAX);
00641     __m128d min = max_double;
00642 
00643     // handle last element if odd sized vector
00644     if (n%2)
00645       min = _mm_min_sd(min,_mm_load_sd(x+--n));
00646 
00647     // handle two elements at a time, min will contain two min values
00648     for (int i=n-2; i>=0; i-=2)
00649       min = _mm_min_pd(VNL_SSE_HEAP_LOAD(pd)(x+i), min);
00650 
00651     // need to store min's two values
00652     min = _mm_min_sd(min,_mm_unpackhi_pd(min,max_double));
00653     _mm_store_sd(&ret,min);
00654     return ret;
00655   }
00656 };
00657 
00658 //: SSE2 implementation for single precision floating point (32 bit)
00659 VCL_DEFINE_SPECIALIZATION
00660 class vnl_sse<float>
00661 {
00662  public:
00663   static VNL_SSE_FORCE_INLINE void element_product(const float* x, const float* y, float* r, unsigned n)
00664   {
00665     switch (n%4)
00666     {
00667       // do scalar (single value) load, multiply and store for end elements
00668       case 3: --n; _mm_store_ss(r+n,_mm_mul_ss(_mm_load_ss(x+n),_mm_load_ss(y+n)));
00669       case 2: --n; _mm_store_ss(r+n,_mm_mul_ss(_mm_load_ss(x+n),_mm_load_ss(y+n)));
00670       case 1: --n; _mm_store_ss(r+n,_mm_mul_ss(_mm_load_ss(x+n),_mm_load_ss(y+n)));
00671       case 0: ;
00672     }
00673 
00674     // load, multiply and store four floats at a time
00675     for (int i = n-4; i >= 0; i-=4)
00676       VNL_SSE_HEAP_STORE(ps)(r+i,_mm_mul_ps(VNL_SSE_HEAP_LOAD(ps)(x+i),VNL_SSE_HEAP_LOAD(ps)(y+i)));
00677   }
00678 
00679   static VNL_SSE_FORCE_INLINE float dot_product(const float* x, const float* y, unsigned n)
00680   {
00681     float ret;
00682     __m128 sum = _mm_setzero_ps();
00683     switch (n%4)
00684     {
00685       // handle elements at end of vectors with sizes not divisible by 4
00686       case 3: n--; sum = _mm_mul_ss(_mm_load_ss(x+n), _mm_load_ss(y+n));
00687       case 2: n--; sum = _mm_add_ss(_mm_mul_ss(_mm_load_ss(x+n), _mm_load_ss(y+n)),sum);
00688       case 1: n--; sum = _mm_add_ss(_mm_mul_ss(_mm_load_ss(x+n), _mm_load_ss(y+n)),sum);
00689       case 0: ;
00690     }
00691 
00692     for (int i = n-4; i >= 0; i-=4)
00693       sum = _mm_add_ps(_mm_mul_ps(VNL_SSE_HEAP_LOAD(ps)(x+i), VNL_SSE_HEAP_LOAD(ps)(y+i)),sum);
00694 
00695     // sum will contain 4 accumulated values, need to add them together
00696     sum = _mm_add_ps(sum,_mm_movehl_ps(_mm_setzero_ps(),sum));
00697     sum = _mm_add_ss(sum,_mm_shuffle_ps(sum,sum,_MM_SHUFFLE(3,2,1,1)));
00698 
00699     _mm_store_ss(&ret,sum);
00700     return ret;
00701   }
00702 
00703   static VNL_SSE_FORCE_INLINE float euclid_dist_sq(const float* x, const float* y, unsigned n)
00704   {
00705     float ret;
00706     __m128 sum,a;
00707     sum = a = _mm_setzero_ps();
00708     switch (n%4)
00709     {
00710       // handle elements at end of vectors with sizes not divisible by 4
00711       case 3: --n; a = _mm_sub_ss(_mm_load_ss(x+n),_mm_load_ss(y+n));
00712       case 2: --n; a = _mm_shuffle_ps(_mm_sub_ss(_mm_load_ss(x+n),_mm_load_ss(y+n)), a ,_MM_SHUFFLE(1,0,0,1));
00713       case 1: --n; a = _mm_move_ss(a,_mm_sub_ss(_mm_load_ss(x+n),_mm_load_ss(y+n)));
00714               sum = _mm_mul_ps(a,a);
00715       case 0: ;
00716     }
00717 
00718     for ( int i = n-4; i >= 0; i-=4 )
00719     {
00720       a = _mm_sub_ps(VNL_SSE_HEAP_LOAD(ps)(x+i),VNL_SSE_HEAP_LOAD(ps)(y+i));
00721       sum = _mm_add_ps(_mm_mul_ps(a,a),sum);
00722     }
00723 
00724     // sum will contain 4 accumulated values, need to add them together
00725     sum = _mm_add_ps(sum,_mm_movehl_ps(_mm_setzero_ps(),sum));
00726     sum = _mm_add_ss(sum,_mm_shuffle_ps(sum,sum,_MM_SHUFFLE(3,2,1,1)));
00727 
00728     _mm_store_ss(&ret,sum);
00729     return ret;
00730   }
00731 
00732   static VNL_SSE_FORCE_INLINE void vector_x_matrix(const float* v, const float* m, float* r, unsigned rows, unsigned cols)
00733   {
00734     __m128 accum, x,y,z,w;
00735 
00736     // calculate if there are any left-over rows/columns
00737     unsigned r_left = rows%4;
00738     unsigned r_nice = rows - r_left;
00739     unsigned c_left = cols%4;
00740     unsigned c_nice = cols - c_left;
00741 
00742     // handle 4 matrix columns at a time
00743     for (unsigned j = 0; j < c_nice; j+=4)
00744     {
00745       // handle 4 matrix rows at a time
00746       accum = _mm_setzero_ps();
00747       unsigned i = 0;
00748       while ( i < r_nice )
00749       {
00750         // load vector data so that
00751         // w = (v0,v1,v2,v3)
00752         w = VNL_SSE_HEAP_LOAD(ps)(v+i);
00753 
00754         // after shuffling
00755         // x = (v0, v0, v0, v0)
00756         // y = (v1, v1, v1, v1)
00757         // z = (v2, v2, v2, v2)
00758         // w = (v3, v3, v3, v3)
00759         x = _mm_shuffle_ps(w,w,_MM_SHUFFLE(0,0,0,0));
00760         y = _mm_shuffle_ps(w,w,_MM_SHUFFLE(1,1,1,1));
00761         z = _mm_shuffle_ps(w,w,_MM_SHUFFLE(2,2,2,2));
00762         w = _mm_shuffle_ps(w,w,_MM_SHUFFLE(3,3,3,3));
00763 
00764         // multiply the four matrix columns
00765         // i.e.  x = ( v0 * m00, v0 * m01, v0 * m02, v0 * m03)
00766         //       y = ( v1 * m10, v1 * m11, v1 * m12, v1 * m13)
00767         //       z = ( v2 * m20, v2 * m21, v2 * m22, v2 * m23)
00768         //       w = ( v3 * m30, v3 * m31, v3 * m32, v3 * m33)
00769         x = _mm_mul_ps(x,_mm_loadu_ps(m+i++*cols+j));
00770         y = _mm_mul_ps(y,_mm_loadu_ps(m+i++*cols+j));
00771         z = _mm_mul_ps(z,_mm_loadu_ps(m+i++*cols+j));
00772         w = _mm_mul_ps(w,_mm_loadu_ps(m+i++*cols+j));
00773 
00774         // now sum the four columns
00775         accum = _mm_add_ps(x,accum);
00776         accum = _mm_add_ps(y,accum);
00777         accum = _mm_add_ps(z,accum);
00778         accum = _mm_add_ps(w,accum);
00779 
00780         // accum is now ( v0 * m00 + v1 * m10 + v2 * m20 + v3 * m30,
00781         //                v0 * m01 + v1 * m11 + v2 * m21 + v3 * m31,
00782         //                v0 * m02 + v1 * m12 + v2 * m22 + v3 * m32,
00783         //                v0 * m03 + v1 * m13 + v2 * m23 + v3 * m33 )
00784       }
00785 
00786       // handle left-over rows
00787       switch (r_left)
00788       {
00789         case 3: accum = _mm_add_ps(_mm_mul_ps(_mm_load1_ps(v+i),_mm_loadu_ps(m+i*cols+j)), accum); i++;
00790         case 2: accum = _mm_add_ps(_mm_mul_ps(_mm_load1_ps(v+i),_mm_loadu_ps(m+i*cols+j)), accum); i++;
00791         case 1: accum = _mm_add_ps(_mm_mul_ps(_mm_load1_ps(v+i),_mm_loadu_ps(m+i*cols+j)), accum);
00792         case 0: ;
00793       }
00794 
00795       // store the 4 values of the result vector
00796       // use stream to avoid polluting the cache
00797       _mm_stream_ps(r+j,accum);
00798     }
00799 
00800     // handle the left over columns
00801     for (; c_left > 0; --c_left) {
00802       accum = _mm_setzero_ps();
00803       for (unsigned int i=0; i<rows; ++i)
00804         accum = _mm_add_ss(_mm_mul_ss(_mm_load_ss(m+i*cols+cols-c_left), _mm_load_ss(v+i)),accum);
00805       _mm_store_ss(r+cols-c_left,accum);
00806     }
00807   }
00808 
00809   static VNL_SSE_FORCE_INLINE void matrix_x_vector(const float* m, const float* v, float* r, unsigned rows, unsigned cols)
00810   {
00811     __m128 accum, x,y,z,w,mxy1,mxy2,mzw1,mzw2, mr1,mr2,mr3,mr4;
00812 
00813     // calculate if there are any left-over rows/columns
00814     unsigned r_left = rows%4;
00815     unsigned r_nice = rows - r_left;
00816     unsigned c_left = cols%4;
00817     unsigned c_nice = cols - c_left;
00818 
00819     // handle 4 matrix rows at a time
00820     for (unsigned i = 0; i < r_nice; i+=4)
00821     {
00822       // handle 4 matrix columns at a time
00823       accum = _mm_setzero_ps();
00824       const float *r1 = m+i*cols, *r2 = m+(i+1)*cols,
00825                   *r3 = m+(i+2)*cols, *r4 = m+(i+3)*cols;
00826       unsigned j = 0;
00827       for (; j < c_nice; j+=4)
00828       {
00829         // load  vector data so that
00830         //  w = (v0, v1, v2, v3)
00831         w = VNL_SSE_HEAP_LOAD(ps)(v+j);
00832 
00833         // after shuffling
00834         // x = (v0, v0, v0, v0)
00835         // y = (v1, v1, v1, v1)
00836         // z = (v2, v2, v2, v2)
00837         // w = (v3, v3, v3, v3)
00838         x = _mm_shuffle_ps(w,w,_MM_SHUFFLE(0,0,0,0));
00839         y = _mm_shuffle_ps(w,w,_MM_SHUFFLE(1,1,1,1));
00840         z = _mm_shuffle_ps(w,w,_MM_SHUFFLE(2,2,2,2));
00841         w = _mm_shuffle_ps(w,w,_MM_SHUFFLE(3,3,3,3));
00842 
00843         // load from the first two rows of the matrix
00844         // i.e. mr1 = (m00, m01, m02, m03)
00845         //      mr2 = (m10, m11, m12, m13)
00846         mr1 = _mm_loadu_ps(r1+j);
00847         mr2 = _mm_loadu_ps(r2+j);
00848 
00849         // unpack into xy and zw parts
00850         // i.e mxy1 = (m00, m10, m01, m11)
00851         //     mzw1 = (m02, m12, m03, m13)
00852         mxy1 = _mm_unpacklo_ps(mr1,mr2);
00853         mzw1 = _mm_unpackhi_ps(mr1,mr2);
00854 
00855         // similarly for the next two rows
00856         mr3 = _mm_loadu_ps(r3+j);
00857         mr4 = _mm_loadu_ps(r4+j);
00858 
00859         // unpack into xy and zw parts
00860         // i.e mxy2 = (m20, m30, m21, m31)
00861         //     mzw2 = (m22, m32, m23, m33)
00862         mxy2 = _mm_unpacklo_ps(mr3,mr4);
00863         mzw2 = _mm_unpackhi_ps(mr3,mr4);
00864 
00865         // move around matrix data and multiply so that
00866         // x = (v0,v0,v0,v0) * (m00,m10,m20,m30)
00867         // y = (v1,v1,v1,v1) * (m01,m11,m21,m31)
00868         // z = (v2,v2,v2,v2) * (m02,m12,m22,m32)
00869         // w = (v3,v3,v3,v3) * (m03,m13,m23,m33)
00870 #if 1
00871         __m128 mx = _mm_movelh_ps(mxy1,mxy2);
00872         x = _mm_mul_ps(x, mx);
00873         __m128 my = _mm_movehl_ps(mxy2,mxy1);
00874         y = _mm_mul_ps(y, my);
00875         __m128 mz = _mm_movelh_ps(mzw1,mzw2);
00876         z = _mm_mul_ps(z, mz);
00877         __m128 mw = _mm_movehl_ps(mzw2,mzw1);
00878         w = _mm_mul_ps(w,mw);
00879 #else
00880         x = _mm_mul_ps(x,_mm_movelh_ps(mxy1,mxy2));
00881         y = _mm_mul_ps(y,_mm_movehl_ps(mxy1,mxy2));
00882         z = _mm_mul_ps(z,_mm_movelh_ps(mzw1,mzw2));
00883         w = _mm_mul_ps(w,_mm_movehl_ps(mzw1,mzw2));
00884 #endif // 0
00885 
00886         // now sum the four results
00887         accum = _mm_add_ps(x,accum);
00888         accum = _mm_add_ps(y,accum);
00889         accum = _mm_add_ps(z,accum);
00890         accum = _mm_add_ps(w,accum);
00891 
00892         // accum is now ( v0 * m00 + v1 * m01 + v2 * m02 + v3 * m03,
00893         //               v0 * m10 + v1 * m11 + v2 * m12 + v3 * m13,
00894         //               v0 * m20 + v1 * m21 + v2 * m22 + v3 * m23,
00895         //               v0 * m30 + v1 * m31 + v2 * m32 + v3 * m33 )
00896       }
00897 
00898       // handle the left over columns
00899       switch (c_left)
00900       {
00901         case 3: accum = _mm_add_ps(_mm_mul_ps(_mm_load1_ps(v+j),_mm_set_ps(*(r4+j),*(r3+j),*(r2+j),*(r1+j))), accum); j++;
00902         case 2: accum = _mm_add_ps(_mm_mul_ps(_mm_load1_ps(v+j),_mm_set_ps(*(r4+j),*(r3+j),*(r2+j),*(r1+j))), accum); j++;
00903         case 1: accum = _mm_add_ps(_mm_mul_ps(_mm_load1_ps(v+j),_mm_set_ps(*(r4+j),*(r3+j),*(r2+j),*(r1+j))), accum);
00904         case 0: ;
00905       }
00906       // store the 4 values of the result vector
00907       // use stream to avoid polluting the cache
00908       _mm_stream_ps(r+i,accum);
00909     }
00910 
00911     // handle the left over rows
00912     for (; r_left > 0; --r_left) {
00913       accum = _mm_setzero_ps();
00914       const float* p = m+(rows-r_left)*cols;
00915       for (unsigned int j=0; j<cols; ++j)
00916         accum = _mm_add_ss(_mm_mul_ss(_mm_load_ss(p+j), _mm_load_ss(v+j)),accum);
00917       _mm_store_ss(r+rows-r_left,accum);
00918     }
00919   }
00920 
00921   static VNL_SSE_FORCE_INLINE float sum(const float* x, unsigned n)
00922   {
00923     float ret;
00924     __m128 sum = _mm_setzero_ps();
00925     switch (n%4)
00926     { // handle vector sizes which aren't divisible by 4
00927       case 3: sum = _mm_load_ss(x+--n);
00928       case 2: sum = _mm_shuffle_ps(_mm_load_ss(x+--n), sum ,_MM_SHUFFLE(1,0,0,1));
00929       case 1: sum = _mm_move_ss(sum,_mm_load_ss(x+--n));
00930       case 0: ;
00931     }
00932 
00933     // sum four elements at a time, sum will contain four running totals
00934     for (int i = n-4; i >= 0; i-=4)
00935       sum = _mm_add_ps(VNL_SSE_HEAP_LOAD(ps)(x+i),sum);
00936 
00937     // sum will contain 4 accumulated values, need to add them together
00938     sum = _mm_add_ps(sum,_mm_movehl_ps(_mm_setzero_ps(),sum));
00939     sum = _mm_add_ss(sum,_mm_shuffle_ps(sum,sum,_MM_SHUFFLE(3,2,1,1)));
00940     _mm_store_ss(&ret,sum);
00941     return ret;
00942   }
00943 
00944   static VNL_SSE_FORCE_INLINE float max(const float* x, unsigned n)
00945   {
00946     float ret;
00947     __m128 min_float = _mm_set1_ps(- FLT_MAX);
00948     __m128 max = min_float;
00949     switch (n%4)
00950     { // handle vector sizes which aren't divisible by 4
00951       case 3: max = _mm_load_ss(x+--n);
00952       case 2: max = _mm_shuffle_ps(_mm_load_ss(x+--n), max ,_MM_SHUFFLE(1,0,0,1));
00953       case 1: max = _mm_move_ss(max,_mm_load_ss(x+--n));
00954       case 0: ;
00955     }
00956 
00957     // handle four elements at a time, max will contain four max values
00958     for (int i = n-4; i >= 0; i-=4)
00959       max = _mm_max_ps(VNL_SSE_HEAP_LOAD(ps)(x+i), max);
00960 
00961     // need to compare max's four values
00962     max = _mm_max_ps(max,_mm_movehl_ps(min_float,max));
00963     max = _mm_max_ss(max,_mm_shuffle_ps(max,max,_MM_SHUFFLE(3,2,1,1)));
00964     _mm_store_ss(&ret,max);
00965 
00966     return ret;
00967   }
00968 
00969   static VNL_SSE_FORCE_INLINE float min(const float* x, unsigned n)
00970   {
00971     float ret;
00972     __m128 max_float = _mm_set1_ps(FLT_MAX);
00973     __m128 min = max_float;
00974 
00975     switch (n%4)
00976     { // handle vector sizes which aren't divisible by 4
00977       case 3: min = _mm_min_ss(min,_mm_load_ss(x+--n));
00978       case 2: min = _mm_min_ss(min,_mm_load_ss(x+--n));
00979       case 1: min = _mm_min_ss(min,_mm_load_ss(x+--n));
00980       case 0: ;
00981     }
00982 
00983     // handle four elements at a time, min will contain four min values
00984     for (int i = n-4; i >= 0; i-=4)
00985       min = _mm_min_ps(VNL_SSE_HEAP_LOAD(ps)(x+i), min);
00986 
00987 
00988     // need to compare min's four values
00989     min = _mm_min_ps(min,_mm_movehl_ps(max_float,min));
00990     min = _mm_min_ss(min,_mm_shuffle_ps(min,min,_MM_SHUFFLE(3,2,1,1)));
00991     _mm_store_ss(&ret,min);
00992 
00993     return ret;
00994   }
00995 
00996   static VNL_SSE_FORCE_INLINE unsigned arg_max(const float* x, unsigned n)
00997   {
00998     __m128  max = _mm_set1_ps(- FLT_MAX);
00999     __m128  input;
01000     __m128i input_idx = _mm_set_epi32(3, 2, 1, 0);
01001     __m128i input_idx_increment = _mm_set1_epi32(4);
01002     __m128i arg_max = _mm_set1_epi32(0);
01003     union IsMaxMask
01004       {
01005       __m128  m128;
01006       __m128i m128i;
01007       };
01008     IsMaxMask is_max;
01009 
01010     const int n16 = (n/4) * 4;
01011     // handle four elements at a time, max will contain four max values
01012     for (int i=0; i<n16; i+=4)
01013       {
01014       input = VNL_SSE_HEAP_LOAD(ps)(x+i);
01015       max = _mm_max_ps(input, max);
01016       is_max.m128 = _mm_cmpeq_ps(max, input);
01017       arg_max = VNL_MM_MAX_EPI32(arg_max, _mm_and_si128(is_max.m128i, input_idx));
01018       input_idx = _mm_add_epi32(input_idx, input_idx_increment);
01019       }
01020 
01021     // decision logic for vector sizes which aren't divisible by 4
01022     int mod = n%4;
01023     n = n-mod;
01024     input_idx_increment = _mm_set1_epi32(1);
01025     while (mod != 0)
01026       {
01027       input_idx = _mm_set1_epi32(n);
01028       input = _mm_load1_ps(x+n);
01029       max = _mm_max_ps(max, input);
01030       is_max.m128 = _mm_cmpeq_ps(max, input);
01031       arg_max = VNL_MM_MAX_EPI32(arg_max, _mm_and_si128(is_max.m128i, input_idx));
01032       --mod;
01033       ++n;
01034       input_idx = _mm_add_epi32(input_idx, input_idx_increment);
01035       }
01036 
01037     // need to store the index of the max value
01038     is_max.m128 = max;
01039     max = _mm_max_ps(max, _mm_shuffle_ps(max, max, _MM_SHUFFLE(2,3,0,1)));
01040     max = _mm_max_ps(max, _mm_movehl_ps(max, max));
01041     max = _mm_shuffle_ps(max, max, _MM_SHUFFLE(0,0,0,0));
01042     is_max.m128 = _mm_cmpeq_ps(is_max.m128, max);
01043     arg_max = _mm_and_si128(is_max.m128i, arg_max);
01044     arg_max = VNL_MM_MAX_EPI32(arg_max, _mm_unpackhi_epi32(arg_max, _mm_set1_epi32(0)));
01045     arg_max = VNL_MM_MAX_EPI32(arg_max, _mm_srli_si128(arg_max, 4));
01046     unsigned ret = _mm_cvtsi128_si32(arg_max);
01047     return ret;
01048   }
01049 
01050   static VNL_SSE_FORCE_INLINE unsigned arg_min(const float* x, unsigned n)
01051   {
01052     __m128  min = _mm_set1_ps(FLT_MAX);
01053     __m128  input;
01054     __m128i input_idx = _mm_set_epi32(3, 2, 1, 0);
01055     __m128i input_idx_increment = _mm_set1_epi32(4);
01056     __m128i arg_min = _mm_set1_epi32(0);
01057     union IsMinMask
01058       {
01059       __m128  m128;
01060       __m128i m128i;
01061       };
01062     IsMinMask is_min;
01063 
01064     const int n16 = (n/4) * 4;
01065     // handle four elements at a time, min will contain four min values
01066     for (int i=0; i<n16; i+=4)
01067       {
01068       input = VNL_SSE_HEAP_LOAD(ps)(x+i);
01069       min = _mm_min_ps(input, min);
01070       is_min.m128 = _mm_cmpeq_ps(min, input);
01071       arg_min = VNL_MM_MAX_EPI32(arg_min, _mm_and_si128(is_min.m128i, input_idx));
01072       input_idx = _mm_add_epi32(input_idx, input_idx_increment);
01073       }
01074 
01075     // decision logic for vector sizes which aren't divisible by 4
01076     int mod = n%4;
01077     n = n-mod;
01078     input_idx_increment = _mm_set1_epi32(1);
01079     while (mod != 0)
01080       {
01081       input_idx = _mm_set1_epi32(n);
01082       input = _mm_load1_ps(x+n);
01083       min = _mm_min_ps(min, input);
01084       is_min.m128 = _mm_cmpeq_ps(min, input);
01085       arg_min = VNL_MM_MAX_EPI32(arg_min, _mm_and_si128(is_min.m128i, input_idx));
01086       --mod;
01087       ++n;
01088       input_idx = _mm_add_epi32(input_idx, input_idx_increment);
01089       }
01090 
01091     // need to store the index of the min value
01092     is_min.m128 = min;
01093     min = _mm_min_ps(min, _mm_shuffle_ps(min, min, _MM_SHUFFLE(2,3,0,1)));
01094     min = _mm_min_ps(min, _mm_movehl_ps(min, min));
01095     min = _mm_shuffle_ps(min, min, _MM_SHUFFLE(0,0,0,0));
01096     is_min.m128 = _mm_cmpeq_ps(is_min.m128, min);
01097     arg_min = _mm_and_si128(is_min.m128i, arg_min);
01098     arg_min = VNL_MM_MAX_EPI32(arg_min, _mm_unpackhi_epi32(arg_min, _mm_set1_epi32(0)));
01099     arg_min = VNL_MM_MAX_EPI32(arg_min, _mm_srli_si128(arg_min, 4));
01100     unsigned ret = _mm_cvtsi128_si32(arg_min);
01101     return ret;
01102   }
01103 };
01104 
01105 #endif // VNL_CONFIG_ENABLE_SSE2
01106 
01107 #endif // vnl_sse_h_