contrib/mul/mbl/mbl_k_means.cxx
Go to the documentation of this file.
00001 // This is mul/mbl/mbl_k_means.cxx
00002 #include "mbl_k_means.h"
00003 //:
00004 //  \file
00005 
00006 #include <vcl_algorithm.h>
00007 #include <vcl_iostream.h>
00008 #include <vcl_cstdlib.h>
00009 #include <vcl_cassert.h>
00010 
00011 //: Find k cluster centres
00012 // Uses batch k-means clustering.
00013 // If you provide parameter partition, it will return the
00014 // cluster index for each data sample. The number of iterations
00015 // performed is returned.
00016 //
00017 // \par Initial Cluster Centres
00018 // If centres contain the correct number of centres, they will
00019 // be used as the initial centres, If not, and if partition is
00020 // given, and it is the correct size, then this will be used
00021 // to find the initial centres.
00022 //
00023 // \par Degenerate Cases
00024 // If at any point the one of the centres has no data points allocated to it
00025 // the number of centres will be reduced below k. This is most likely to
00026 // happen if you start the function with one or more centre identical, or
00027 // if some of the centres start off outside the convex hull of the data set.
00028 // In particular if you let the function initialise the centres, it will
00029 // occur if any of the first k data samples are identical.
00030 unsigned mbl_k_means(mbl_data_wrapper<vnl_vector<double> > &data, unsigned k,
00031                      vcl_vector<vnl_vector<double> >* cluster_centres,
00032                      vcl_vector<unsigned> * partition //=0
00033                     )
00034 {
00035   vcl_vector<vnl_vector<double> > & centres = *cluster_centres;
00036 
00037   vcl_vector<unsigned> * p_partition;
00038   data.reset();
00039   unsigned  dims = data.current().size();
00040   vcl_vector<vnl_vector<double> > sums(k, vnl_vector<double>(dims, 0.0));
00041   vcl_vector<unsigned> nNearest(k,0);
00042   unsigned i;
00043   unsigned iterations =0;
00044 
00045   assert(data.size() >= k);
00046 
00047   bool initialise_from_clusters = false;
00048 
00049   // set up p_partition to point to something sensible
00050   if (partition)
00051   {
00052     p_partition = partition;
00053     if (p_partition->size() != data.size())
00054     {
00055       p_partition->resize(data.size());
00056       vcl_fill(p_partition->begin(), p_partition->end(), 0);
00057     }
00058     else initialise_from_clusters = true;
00059   }
00060   else
00061     p_partition = new vcl_vector<unsigned>(int(data.size()), 0u);
00062 
00063 
00064 // Calculate initial centres
00065 
00066   if (centres.size() != k) // use first k data items as centres
00067   {
00068     centres.resize(k);
00069     for (i=0; i<k; ++i)
00070     {
00071       centres[i] = data.current();
00072       sums[i] += data.current();
00073       nNearest[i]++;
00074       data.next();
00075     }
00076   }
00077   else if (initialise_from_clusters)
00078   {                         // calculate centres fro existing
00079     do
00080     {
00081       sums[(*p_partition)[data.index()] ] += data.current();
00082       nNearest[(*p_partition)[data.index()] ]++;
00083     } while (data.next());
00084     // Calculate centres
00085     for (i=0; i<k; ++i)
00086       centres[i] = sums[i]/nNearest[i];
00087     data.reset();
00088     vcl_fill(sums.begin(), sums.end(), vnl_vector<double>(dims, 0.0));
00089     vcl_fill(nNearest.begin(), nNearest.end(), 0);
00090   }
00091 
00092   bool changed = true;
00093   while (changed)
00094   {
00095     changed = false;
00096     do
00097     {
00098       unsigned bestCentre = 0;
00099       double bestDist = vnl_vector_ssd(centres[0], data.current());
00100       for (i=1; i<k; ++i)
00101       {
00102         double dist = vnl_vector_ssd(centres[i], data.current());
00103         if (dist < bestDist)
00104         {
00105             bestDist = dist;
00106             bestCentre = i;
00107         }
00108       }
00109       sums[bestCentre] += data.current();
00110       nNearest[bestCentre] ++;
00111       if (bestCentre != (*p_partition)[data.index()])
00112       {
00113         changed = true;
00114         (*p_partition)[data.index()] = bestCentre;
00115       }
00116     } while (data.next());
00117 
00118 
00119     // reduce k if any centres have no data items assigned to its cluster.
00120     for (i=0; i<k; ++i)
00121     {
00122       if ( nNearest[i] == 0)
00123       {
00124         k--;
00125         centres.erase(centres.begin()+i);
00126         sums.erase(sums.begin()+i);
00127         nNearest.erase(nNearest.begin()+i);
00128 
00129         for (unsigned j=0; j<p_partition->size(); ++j)
00130         {
00131           assert ((*p_partition)[j] = i);
00132           if ((*p_partition)[j] > i) (*p_partition)[j]--;
00133         }
00134 
00135         changed= true;
00136       }
00137     }
00138 
00139     // Calculate new centres
00140     for (i=0; i<k; ++i)
00141       centres[i] = sums[i]/nNearest[i];
00142 
00143     // and repeat
00144     data.reset();
00145     vcl_fill(sums.begin(), sums.end(), vnl_vector<double>(dims, 0.0));
00146     vcl_fill(nNearest.begin(), nNearest.end(), 0);
00147     iterations ++;
00148   }
00149 
00150   if (!partition)
00151     delete p_partition;
00152 
00153   return iterations;
00154 }
00155 
00156 static inline void incXbyYv(vnl_vector<double> *X, const vnl_vector<double> &Y, double v)
00157 {
00158   assert(X->size() == Y.size());
00159   assert(X->size() > 0);
00160   int i = ((int)X->size()) - 1;
00161   double * const pX=X->data_block();
00162   while (i >= 0)
00163   {
00164     pX[i] += Y[i] * v;
00165     i--;
00166   }
00167 }
00168 
00169 //: Find k cluster centres with weighted data
00170 // Uses batch k-means clustering.
00171 // If you provide parameter partition, it will return the
00172 // cluster index for each data sample. The number of iterations
00173 // performed is returned.
00174 //
00175 // \par Initial Cluster Centres
00176 // If centres contain the correct number of centres, they will
00177 // be used as the initial centres, If not, and if partition is
00178 // given, and it is the correct size, then this will be used
00179 // to find the initial centres.
00180 //
00181 // \par Degenerate Cases
00182 // If at any point the one of the centres has no data points allocated to it
00183 // the number of centres will be reduced below k. This is most likely to
00184 // happen if you start the function with one or more centre identical, or
00185 // if some of the centres start off outside the convex hull of the data set.
00186 // In particular if you let the function initialise the centres, it will
00187 // occur if any of the first k data samples are identical.
00188 //
00189 // \par
00190 // The algorithm has been optimised
00191 unsigned mbl_k_means_weighted(mbl_data_wrapper<vnl_vector<double> > &data, unsigned k,
00192                               const vcl_vector<double>& wts,
00193                               vcl_vector<vnl_vector<double> >* cluster_centres,
00194                               vcl_vector<unsigned> * partition //=0
00195                              )
00196 {
00197   vcl_vector<vnl_vector<double> > & centres = *cluster_centres;
00198 
00199   vcl_vector<unsigned> * p_partition;
00200   data.reset();
00201   unsigned  dims = data.current().size();
00202   vcl_vector<vnl_vector<double> > sums(k, vnl_vector<double>(dims, 0.0));
00203   vcl_vector<double> nNearest(k,0.0);
00204   unsigned i;
00205   unsigned iterations =0;
00206 
00207   bool initialise_from_clusters = false;
00208 
00209   assert(data.size() >= k);
00210   assert(data.size() == wts.size());
00211 
00212   // set up p_partition to point to something sensible
00213   if (partition)
00214   {
00215     p_partition = partition;
00216     if (p_partition->size() != data.size())
00217     {
00218       p_partition->resize(data.size());
00219       vcl_fill(p_partition->begin(), p_partition->end(), 0);
00220     }
00221     else initialise_from_clusters = true;
00222   }
00223   else
00224     p_partition = new vcl_vector<unsigned>(int(data.size()), 0u);
00225 
00226   const vnl_vector<double>  vcl_vector_double_dims_0(dims, 0.0);
00227 
00228 
00229 // Calculate initial centres
00230 
00231   if (centres.size() != k) // use first k non-zero weighted data items as centres
00232   {
00233     centres.resize(k);
00234     for (i=0; i<k; ++i)
00235     {
00236       while (wts[data.index()] == 0.0) // skip zero weighted data
00237       {
00238 #ifdef NDEBUG
00239         data.next();
00240 #else
00241         if (!data.next())
00242         {
00243           vcl_cerr << "ERROR: mbl_k_means_weighted, while initialising centres from data\n"
00244                    << "Not enough non-zero-weighted data\n";
00245           vcl_abort();
00246         }
00247 #endif //NDEBUG
00248       }
00249       centres[i] = data.current();
00250       incXbyYv(&sums[i], data.current(), wts[data.index()]);
00251 
00252       nNearest[i]+= wts[data.index()];
00253       data.next();
00254     }
00255   }
00256   else if (initialise_from_clusters)
00257   {                         // calculate centres fro existing
00258 
00259     do
00260     {
00261       incXbyYv(&sums[(*p_partition)[data.index()] ], data.current(), wts[data.index()]);
00262       nNearest[(*p_partition)[data.index()] ]+=wts[data.index()];
00263     } while (data.next());
00264     // Calculate centres
00265     for (i=0; i<k; ++i)
00266       centres[i] = sums[i]/nNearest[i];
00267     data.reset();
00268     vcl_fill(sums.begin(), sums.end(), vcl_vector_double_dims_0);
00269     vcl_fill(nNearest.begin(), nNearest.end(), 0.0);
00270   }
00271 
00272   bool changed = true;
00273   while (changed)
00274   {
00275     changed = false;
00276     do
00277     {
00278       const double w = wts[data.index()];
00279       if (w != 0.0)
00280       {
00281         unsigned bestCentre = 0;
00282         double bestDist = vnl_vector_ssd(centres[0], data.current());
00283         for (i=1; i<k; ++i)
00284         {
00285           double dist = vnl_vector_ssd(centres[i], data.current());
00286           if (dist < bestDist)
00287           {
00288               bestDist = dist;
00289               bestCentre = i;
00290           }
00291         }
00292         incXbyYv(&sums[bestCentre], data.current(), w);
00293         nNearest[bestCentre] += w;
00294         if (bestCentre != (*p_partition)[data.index()])
00295         {
00296           changed = true;
00297           (*p_partition)[data.index()] = bestCentre;
00298         }
00299       }
00300     } while (data.next());
00301 
00302 
00303     // reduce k if any centres have no data items assigned to its cluster.
00304     for (i=0; i<k; ++i)
00305     {
00306       if ( nNearest[i] == 0.0)
00307       {
00308         k--;
00309         centres.erase(centres.begin()+i);
00310         sums.erase(sums.begin()+i);
00311         nNearest.erase(nNearest.begin()+i);
00312         for (unsigned j=0; j<p_partition->size(); ++j)
00313         {
00314           if (wts[j] != 0.0)
00315           {
00316             assert ((*p_partition)[j]  != i);
00317             if ((*p_partition)[j] > i) (*p_partition)[j]--;
00318           }
00319         }
00320       }
00321     }
00322 
00323     // Calculate new centres
00324     for (i=0; i<k; ++i)
00325       centres[i] = sums[i]/nNearest[i];
00326 
00327     // and repeat
00328     data.reset();
00329     vcl_fill(sums.begin(), sums.end(), vcl_vector_double_dims_0);
00330     vcl_fill(nNearest.begin(), nNearest.end(), 0.0);
00331     iterations ++;
00332   }
00333 
00334   if (!partition)
00335     delete p_partition;
00336   else // assign all the zero weighted samples to their nearest centres.
00337   {
00338     data.reset();
00339     do
00340     {
00341       if (wts[data.index()] == 0.0)
00342       {
00343         unsigned bestCentre = 0;
00344         double bestDist = vnl_vector_ssd(centres[0], data.current());
00345         for (i=1; i<k; ++i)
00346         {
00347           double dist = vnl_vector_ssd(centres[i], data.current());
00348           if (dist < bestDist)
00349           {
00350             bestDist = dist;
00351             bestCentre = i;
00352           }
00353         }
00354         (*p_partition)[data.index()] = bestCentre;
00355       }
00356     } while (data.next());
00357   }
00358 
00359   return iterations;
00360 }