contrib/brl/bseg/sdet/sdet_k_means.cxx
Go to the documentation of this file.
00001 // This is brl/bseg/sdet/sdet_k_means.cxx
00002 #include "sdet_k_means.h"
00003 //:
00004 //  \file
00005 
00006 #include <vcl_algorithm.h>
00007 #include <vcl_iostream.h>
00008 #include <vcl_cstdlib.h>
00009 #include <vcl_cassert.h>
00010 
00011 //: Find k cluster centres
00012 // Uses batch k-means clustering.
00013 // If you provide parameter partition, it will return the
00014 // cluster index for each data sample. The number of iterations
00015 // performed is returned.
00016 //
00017 // \par Initial Cluster Centres
00018 // If centres contain the correct number of centres, they will
00019 // be used as the initial centres, If not, and if partition is
00020 // given, and it is the correct size, then this will be used
00021 // to find the initial centres.
00022 //
00023 // \par Degenerate Cases
00024 // If at any point the one of the centres has no data points allocated to it
00025 // the number of centres will be reduced below k. This is most likely to
00026 // happen if you start the function with one or more centre identical, or
00027 // if some of the centres start off outside the convex hull of the data set.
00028 // In particular if you let the function initialise the centres, it will
00029 // occur if any of the first k data samples are identical.
00030 unsigned sdet_k_means(vcl_vector<vnl_vector<double> > &data, unsigned& k,
00031                      vcl_vector<vnl_vector<double> >* cluster_centres,
00032                      vcl_vector<unsigned> * partition //=0
00033                     )
00034 {
00035   unsigned n_data = data.size();
00036   if(n_data==0){
00037     vcl_cout << "no data to process in sdet_k_means\n";
00038     return 0;
00039   }
00040   vcl_vector<vnl_vector<double> > & centres = *cluster_centres;
00041 
00042   vcl_vector<unsigned> * p_partition;
00043 
00044   unsigned didx = 0;
00045 
00046   unsigned  dims = data[didx].size();
00047 
00048   vcl_vector<vnl_vector<double> > sums(k, vnl_vector<double>(dims, 0.0));
00049   vcl_vector<unsigned> nNearest(k,0);
00050   unsigned i;
00051   unsigned iterations =0;
00052 
00053   assert(data.size() >= k);
00054 
00055   bool initialise_from_clusters = false;
00056 
00057   // set up p_partition to point to something sensible
00058   if (partition)
00059   {
00060     p_partition = partition;
00061     if (p_partition->size() != data.size())
00062     {
00063       p_partition->resize(data.size());
00064       vcl_fill(p_partition->begin(), p_partition->end(), 0);
00065     }
00066     else initialise_from_clusters = true;
00067   }
00068   else
00069     p_partition = new vcl_vector<unsigned>(int(data.size()), 0u);
00070 
00071 
00072 // Calculate initial centres
00073 
00074   if (centres.size() != k) // use first k data items as centres
00075   {
00076     centres.resize(k);
00077     for (i=0; i<k; ++i)
00078     {
00079       centres[i] = data[didx];
00080       sums[i] += data[didx];
00081       nNearest[i]++;
00082       didx++;
00083     }
00084   }
00085   else if (initialise_from_clusters)
00086   {                         // calculate centres fro existing
00087     do
00088     {
00089       sums[(*p_partition)[didx] ] += data[didx];
00090       nNearest[(*p_partition)[didx] ]++;
00091     } while (didx++<n_data);
00092     // Calculate centres
00093     for (i=0; i<k; ++i)
00094       centres[i] = sums[i]/nNearest[i];
00095     didx = 0;
00096     vcl_fill(sums.begin(), sums.end(), vnl_vector<double>(dims, 0.0));
00097     vcl_fill(nNearest.begin(), nNearest.end(), 0);
00098   }
00099 
00100   bool changed = true;
00101   while (changed)
00102   {
00103     changed = false;
00104     do
00105     {
00106       unsigned bestCentre = 0;
00107       double bestDist = vnl_vector_ssd(centres[0], data[didx]);
00108       for (i=1; i<k; ++i)
00109       {
00110         double dist = vnl_vector_ssd(centres[i], data[didx]);
00111         if (dist < bestDist)
00112         {
00113             bestDist = dist;
00114             bestCentre = i;
00115         }
00116       }
00117       sums[bestCentre] += data[didx];
00118       nNearest[bestCentre] ++;
00119       if (bestCentre != (*p_partition)[didx])
00120       {
00121         changed = true;
00122         (*p_partition)[didx] = bestCentre;
00123       }
00124     } while (++didx<n_data);
00125 
00126 
00127     // reduce k if any centres have no data items assigned to its cluster.
00128     for (i=0; i<k; ++i)
00129     {
00130       if ( nNearest[i] == 0)
00131       {
00132         k--;
00133         centres.erase(centres.begin()+i);
00134         sums.erase(sums.begin()+i);
00135         nNearest.erase(nNearest.begin()+i);
00136 
00137         for (unsigned j=0; j<p_partition->size(); ++j)
00138         {
00139           assert ((*p_partition)[j] = i);
00140           if ((*p_partition)[j] > i) (*p_partition)[j]--;
00141         }
00142 
00143         changed= true;
00144       }
00145     }
00146 
00147     // Calculate new centres
00148     for (i=0; i<k; ++i)
00149       centres[i] = sums[i]/nNearest[i];
00150 
00151     // and repeat
00152 
00153     didx = 0;
00154     vcl_fill(sums.begin(), sums.end(), vnl_vector<double>(dims, 0.0));
00155     vcl_fill(nNearest.begin(), nNearest.end(), 0);
00156     iterations ++;
00157   }
00158 
00159   if (!partition)
00160     delete p_partition;
00161 
00162   return iterations;
00163 }
00164 
00165 static inline void incXbyYv(vnl_vector<double> *X, const vnl_vector<double> &Y, double v)
00166 {
00167   assert(X->size() == Y.size());
00168   assert(X->size() > 0);
00169   int i = ((int)X->size()) - 1;
00170   double * const pX=X->data_block();
00171   while (i >= 0)
00172   {
00173     pX[i] += Y[i] * v;
00174     i--;
00175   }
00176 }
00177 
00178 //: Find k cluster centres with weighted data
00179 // Uses batch k-means clustering.
00180 // If you provide parameter partition, it will return the
00181 // cluster index for each data sample. The number of iterations
00182 // performed is returned.
00183 //
00184 // \par Initial Cluster Centres
00185 // If centres contain the correct number of centres, they will
00186 // be used as the initial centres, If not, and if partition is
00187 // given, and it is the correct size, then this will be used
00188 // to find the initial centres.
00189 //
00190 // \par Degenerate Cases
00191 // If at any point the one of the centres has no data points allocated to it
00192 // the number of centres will be reduced below k. This is most likely to
00193 // happen if you start the function with one or more centre identical, or
00194 // if some of the centres start off outside the convex hull of the data set.
00195 // In particular if you let the function initialise the centres, it will
00196 // occur if any of the first k data samples are identical.
00197 //
00198 // \par
00199 // The algorithm has been optimised
00200 unsigned sdet_k_means_weighted(vcl_vector<vnl_vector<double> > &data,
00201                                unsigned& k,
00202                                const vcl_vector<double>& wts,
00203                                vcl_vector<vnl_vector<double> >* cluster_centres,
00204                                vcl_vector<unsigned> * partition //=0
00205                                )
00206 {
00207   unsigned n_data = data.size();
00208   if(n_data==0){
00209     vcl_cout << "no data to process in sdet_k_means\n";
00210     return 0;
00211   }
00212   vcl_vector<vnl_vector<double> > & centres = *cluster_centres;
00213 
00214   vcl_vector<unsigned> * p_partition;
00215   unsigned didx = 0;
00216   unsigned  dims = data[didx].size();
00217   vcl_vector<vnl_vector<double> > sums(k, vnl_vector<double>(dims, 0.0));
00218   vcl_vector<double> nNearest(k,0.0);
00219   unsigned i;
00220   unsigned iterations =0;
00221 
00222   bool initialise_from_clusters = false;
00223 
00224   assert(data.size() >= k);
00225   assert(data.size() == wts.size());
00226 
00227   // set up p_partition to point to something sensible
00228   if (partition)
00229   {
00230     p_partition = partition;
00231     if (p_partition->size() != data.size())
00232     {
00233       p_partition->resize(data.size());
00234       vcl_fill(p_partition->begin(), p_partition->end(), 0);
00235     }
00236     else initialise_from_clusters = true;
00237   }
00238   else
00239     p_partition = new vcl_vector<unsigned>(int(data.size()), 0u);
00240 
00241   const vnl_vector<double>  vcl_vector_double_dims_0(dims, 0.0);
00242 
00243 
00244 // Calculate initial centres
00245 
00246   if (centres.size() != k) // use first k non-zero weighted data items as centres
00247   {
00248     centres.resize(k);
00249     for (i=0; i<k; ++i)
00250     {
00251       while (wts[didx] == 0.0) // skip zero weighted data
00252       {
00253 #ifdef NDEBUG
00254         didx++;
00255 #else
00256         if (++didx<n_data)
00257         {
00258           vcl_cerr << "ERROR: sdet_k_means_weighted, while initialising centres from data\n"
00259                    << "Not enough non-zero-weighted data\n";
00260           vcl_abort();
00261         }
00262 #endif //NDEBUG
00263       }
00264       centres[i] = data[didx];
00265       incXbyYv(&sums[i], data[didx], wts[didx]);
00266 
00267       nNearest[i]+= wts[didx];
00268       didx++;
00269     }
00270   }
00271   else if (initialise_from_clusters)
00272   {                         // calculate centres fro existing
00273 
00274     do
00275     {
00276       incXbyYv(&sums[(*p_partition)[didx] ], data[didx], wts[didx]);
00277       nNearest[(*p_partition)[didx] ]+=wts[didx];
00278     } while (++didx<n_data);
00279     // Calculate centres
00280     for (i=0; i<k; ++i)
00281       centres[i] = sums[i]/nNearest[i];
00282     didx = 0;
00283     vcl_fill(sums.begin(), sums.end(), vcl_vector_double_dims_0);
00284     vcl_fill(nNearest.begin(), nNearest.end(), 0.0);
00285   }
00286 
00287   bool changed = true;
00288   while (changed)
00289   {
00290     changed = false;
00291     do
00292     {
00293       const double w = wts[didx];
00294       if (w != 0.0)
00295       {
00296         unsigned bestCentre = 0;
00297         double bestDist = vnl_vector_ssd(centres[0], data[didx]);
00298         for (i=1; i<k; ++i)
00299         {
00300           double dist = vnl_vector_ssd(centres[i], data[didx]);
00301           if (dist < bestDist)
00302           {
00303               bestDist = dist;
00304               bestCentre = i;
00305           }
00306         }
00307         incXbyYv(&sums[bestCentre], data[didx], w);
00308         nNearest[bestCentre] += w;
00309         if (bestCentre != (*p_partition)[didx])
00310         {
00311           changed = true;
00312           (*p_partition)[didx] = bestCentre;
00313         }
00314       }
00315     } while (++didx<n_data);
00316 
00317 
00318     // reduce k if any centres have no data items assigned to its cluster.
00319     for (i=0; i<k; ++i)
00320     {
00321       if ( nNearest[i] == 0.0)
00322       {
00323         k--;
00324         centres.erase(centres.begin()+i);
00325         sums.erase(sums.begin()+i);
00326         nNearest.erase(nNearest.begin()+i);
00327         for (unsigned j=0; j<p_partition->size(); ++j)
00328         {
00329           if (wts[j] != 0.0)
00330           {
00331             assert ((*p_partition)[j]  != i);
00332             if ((*p_partition)[j] > i) (*p_partition)[j]--;
00333           }
00334         }
00335       }
00336     }
00337 
00338     // Calculate new centres
00339     for (i=0; i<k; ++i)
00340       centres[i] = sums[i]/nNearest[i];
00341 
00342     // and repeat
00343     didx = 0;
00344     vcl_fill(sums.begin(), sums.end(), vcl_vector_double_dims_0);
00345     vcl_fill(nNearest.begin(), nNearest.end(), 0.0);
00346     iterations ++;
00347   }
00348 
00349   if (!partition)
00350     delete p_partition;
00351   else // assign all the zero weighted samples to their nearest centres.
00352   {
00353     didx = 0;
00354     do
00355     {
00356       if (wts[didx] == 0.0)
00357       {
00358         unsigned bestCentre = 0;
00359         double bestDist = vnl_vector_ssd(centres[0], data[didx]);
00360         for (i=1; i<k; ++i)
00361         {
00362           double dist = vnl_vector_ssd(centres[i], data[didx]);
00363           if (dist < bestDist)
00364           {
00365             bestDist = dist;
00366             bestCentre = i;
00367           }
00368         }
00369         (*p_partition)[didx] = bestCentre;
00370       }
00371     } while (++didx<n_data);
00372   }
00373 
00374   return iterations;
00375 }
00376