00001
00002 #include "mbl_k_means.h"
00003
00004
00005
00006 #include <vcl_algorithm.h>
00007 #include <vcl_iostream.h>
00008 #include <vcl_cstdlib.h>
00009 #include <vcl_cassert.h>
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030 unsigned mbl_k_means(mbl_data_wrapper<vnl_vector<double> > &data, unsigned k,
00031 vcl_vector<vnl_vector<double> >* cluster_centres,
00032 vcl_vector<unsigned> * partition
00033 )
00034 {
00035 vcl_vector<vnl_vector<double> > & centres = *cluster_centres;
00036
00037 vcl_vector<unsigned> * p_partition;
00038 data.reset();
00039 unsigned dims = data.current().size();
00040 vcl_vector<vnl_vector<double> > sums(k, vnl_vector<double>(dims, 0.0));
00041 vcl_vector<unsigned> nNearest(k,0);
00042 unsigned i;
00043 unsigned iterations =0;
00044
00045 assert(data.size() >= k);
00046
00047 bool initialise_from_clusters = false;
00048
00049
00050 if (partition)
00051 {
00052 p_partition = partition;
00053 if (p_partition->size() != data.size())
00054 {
00055 p_partition->resize(data.size());
00056 vcl_fill(p_partition->begin(), p_partition->end(), 0);
00057 }
00058 else initialise_from_clusters = true;
00059 }
00060 else
00061 p_partition = new vcl_vector<unsigned>(int(data.size()), 0u);
00062
00063
00064
00065
00066 if (centres.size() != k)
00067 {
00068 centres.resize(k);
00069 for (i=0; i<k; ++i)
00070 {
00071 centres[i] = data.current();
00072 sums[i] += data.current();
00073 nNearest[i]++;
00074 data.next();
00075 }
00076 }
00077 else if (initialise_from_clusters)
00078 {
00079 do
00080 {
00081 sums[(*p_partition)[data.index()] ] += data.current();
00082 nNearest[(*p_partition)[data.index()] ]++;
00083 } while (data.next());
00084
00085 for (i=0; i<k; ++i)
00086 centres[i] = sums[i]/nNearest[i];
00087 data.reset();
00088 vcl_fill(sums.begin(), sums.end(), vnl_vector<double>(dims, 0.0));
00089 vcl_fill(nNearest.begin(), nNearest.end(), 0);
00090 }
00091
00092 bool changed = true;
00093 while (changed)
00094 {
00095 changed = false;
00096 do
00097 {
00098 unsigned bestCentre = 0;
00099 double bestDist = vnl_vector_ssd(centres[0], data.current());
00100 for (i=1; i<k; ++i)
00101 {
00102 double dist = vnl_vector_ssd(centres[i], data.current());
00103 if (dist < bestDist)
00104 {
00105 bestDist = dist;
00106 bestCentre = i;
00107 }
00108 }
00109 sums[bestCentre] += data.current();
00110 nNearest[bestCentre] ++;
00111 if (bestCentre != (*p_partition)[data.index()])
00112 {
00113 changed = true;
00114 (*p_partition)[data.index()] = bestCentre;
00115 }
00116 } while (data.next());
00117
00118
00119
00120 for (i=0; i<k; ++i)
00121 {
00122 if ( nNearest[i] == 0)
00123 {
00124 k--;
00125 centres.erase(centres.begin()+i);
00126 sums.erase(sums.begin()+i);
00127 nNearest.erase(nNearest.begin()+i);
00128
00129 for (unsigned j=0; j<p_partition->size(); ++j)
00130 {
00131 assert ((*p_partition)[j] = i);
00132 if ((*p_partition)[j] > i) (*p_partition)[j]--;
00133 }
00134
00135 changed= true;
00136 }
00137 }
00138
00139
00140 for (i=0; i<k; ++i)
00141 centres[i] = sums[i]/nNearest[i];
00142
00143
00144 data.reset();
00145 vcl_fill(sums.begin(), sums.end(), vnl_vector<double>(dims, 0.0));
00146 vcl_fill(nNearest.begin(), nNearest.end(), 0);
00147 iterations ++;
00148 }
00149
00150 if (!partition)
00151 delete p_partition;
00152
00153 return iterations;
00154 }
00155
00156 static inline void incXbyYv(vnl_vector<double> *X, const vnl_vector<double> &Y, double v)
00157 {
00158 assert(X->size() == Y.size());
00159 assert(X->size() > 0);
00160 int i = ((int)X->size()) - 1;
00161 double * const pX=X->data_block();
00162 while (i >= 0)
00163 {
00164 pX[i] += Y[i] * v;
00165 i--;
00166 }
00167 }
00168
00169
00170
00171
00172
00173
00174
00175
00176
00177
00178
00179
00180
00181
00182
00183
00184
00185
00186
00187
00188
00189
00190
00191 unsigned mbl_k_means_weighted(mbl_data_wrapper<vnl_vector<double> > &data, unsigned k,
00192 const vcl_vector<double>& wts,
00193 vcl_vector<vnl_vector<double> >* cluster_centres,
00194 vcl_vector<unsigned> * partition
00195 )
00196 {
00197 vcl_vector<vnl_vector<double> > & centres = *cluster_centres;
00198
00199 vcl_vector<unsigned> * p_partition;
00200 data.reset();
00201 unsigned dims = data.current().size();
00202 vcl_vector<vnl_vector<double> > sums(k, vnl_vector<double>(dims, 0.0));
00203 vcl_vector<double> nNearest(k,0.0);
00204 unsigned i;
00205 unsigned iterations =0;
00206
00207 bool initialise_from_clusters = false;
00208
00209 assert(data.size() >= k);
00210 assert(data.size() == wts.size());
00211
00212
00213 if (partition)
00214 {
00215 p_partition = partition;
00216 if (p_partition->size() != data.size())
00217 {
00218 p_partition->resize(data.size());
00219 vcl_fill(p_partition->begin(), p_partition->end(), 0);
00220 }
00221 else initialise_from_clusters = true;
00222 }
00223 else
00224 p_partition = new vcl_vector<unsigned>(int(data.size()), 0u);
00225
00226 const vnl_vector<double> vcl_vector_double_dims_0(dims, 0.0);
00227
00228
00229
00230
00231 if (centres.size() != k)
00232 {
00233 centres.resize(k);
00234 for (i=0; i<k; ++i)
00235 {
00236 while (wts[data.index()] == 0.0)
00237 {
00238 #ifdef NDEBUG
00239 data.next();
00240 #else
00241 if (!data.next())
00242 {
00243 vcl_cerr << "ERROR: mbl_k_means_weighted, while initialising centres from data\n"
00244 << "Not enough non-zero-weighted data\n";
00245 vcl_abort();
00246 }
00247 #endif //NDEBUG
00248 }
00249 centres[i] = data.current();
00250 incXbyYv(&sums[i], data.current(), wts[data.index()]);
00251
00252 nNearest[i]+= wts[data.index()];
00253 data.next();
00254 }
00255 }
00256 else if (initialise_from_clusters)
00257 {
00258
00259 do
00260 {
00261 incXbyYv(&sums[(*p_partition)[data.index()] ], data.current(), wts[data.index()]);
00262 nNearest[(*p_partition)[data.index()] ]+=wts[data.index()];
00263 } while (data.next());
00264
00265 for (i=0; i<k; ++i)
00266 centres[i] = sums[i]/nNearest[i];
00267 data.reset();
00268 vcl_fill(sums.begin(), sums.end(), vcl_vector_double_dims_0);
00269 vcl_fill(nNearest.begin(), nNearest.end(), 0.0);
00270 }
00271
00272 bool changed = true;
00273 while (changed)
00274 {
00275 changed = false;
00276 do
00277 {
00278 const double w = wts[data.index()];
00279 if (w != 0.0)
00280 {
00281 unsigned bestCentre = 0;
00282 double bestDist = vnl_vector_ssd(centres[0], data.current());
00283 for (i=1; i<k; ++i)
00284 {
00285 double dist = vnl_vector_ssd(centres[i], data.current());
00286 if (dist < bestDist)
00287 {
00288 bestDist = dist;
00289 bestCentre = i;
00290 }
00291 }
00292 incXbyYv(&sums[bestCentre], data.current(), w);
00293 nNearest[bestCentre] += w;
00294 if (bestCentre != (*p_partition)[data.index()])
00295 {
00296 changed = true;
00297 (*p_partition)[data.index()] = bestCentre;
00298 }
00299 }
00300 } while (data.next());
00301
00302
00303
00304 for (i=0; i<k; ++i)
00305 {
00306 if ( nNearest[i] == 0.0)
00307 {
00308 k--;
00309 centres.erase(centres.begin()+i);
00310 sums.erase(sums.begin()+i);
00311 nNearest.erase(nNearest.begin()+i);
00312 for (unsigned j=0; j<p_partition->size(); ++j)
00313 {
00314 if (wts[j] != 0.0)
00315 {
00316 assert ((*p_partition)[j] != i);
00317 if ((*p_partition)[j] > i) (*p_partition)[j]--;
00318 }
00319 }
00320 }
00321 }
00322
00323
00324 for (i=0; i<k; ++i)
00325 centres[i] = sums[i]/nNearest[i];
00326
00327
00328 data.reset();
00329 vcl_fill(sums.begin(), sums.end(), vcl_vector_double_dims_0);
00330 vcl_fill(nNearest.begin(), nNearest.end(), 0.0);
00331 iterations ++;
00332 }
00333
00334 if (!partition)
00335 delete p_partition;
00336 else
00337 {
00338 data.reset();
00339 do
00340 {
00341 if (wts[data.index()] == 0.0)
00342 {
00343 unsigned bestCentre = 0;
00344 double bestDist = vnl_vector_ssd(centres[0], data.current());
00345 for (i=1; i<k; ++i)
00346 {
00347 double dist = vnl_vector_ssd(centres[i], data.current());
00348 if (dist < bestDist)
00349 {
00350 bestDist = dist;
00351 bestCentre = i;
00352 }
00353 }
00354 (*p_partition)[data.index()] = bestCentre;
00355 }
00356 } while (data.next());
00357 }
00358
00359 return iterations;
00360 }