contrib/brl/bbas/bsta/algo/bsta_sample_set.h
Go to the documentation of this file.
00001 // This is brl/bbas/bsta/algo/bsta_sample_set.h
00002 #ifndef bsta_sample_set_h_
00003 #define bsta_sample_set_h_
00004 //:
00005 // \file
00006 // \brief Classes to collect samples
00007 //
00008 // \author Ozge C. Ozcanli
00009 // \date March 04, 2009
00010 //
00011 // \verbatim
00012 //  Modifications
00013 //   (none yet)
00014 // \endverbatim
00015 //
00016 #include <bsta/bsta_parzen_sphere.h>
00017 #include <bsta/bsta_mixture.h>
00018 #include <bsta/bsta_attributes.h>
00019 #include <bsta/bsta_gaussian_sphere.h>
00020 #include <bsta/bsta_gaussian_full.h>
00021 #include <vnl/vnl_vector_fixed.h>
00022 #include <vnl/vnl_matrix_fixed.h>
00023 #include <vcl_fstream.h>
00024 #include <vcl_utility.h>
00025 #include <vcl_iostream.h>
00026 
00027 #define MIN_VAR_  0.0001
00028 
00029 //: A class to hold samples, the window width parameter, weights for each sample, assignments of each sample to modes/cluster centers/classes
00030 //  This class is used by mean-shift and EM algorithms
00031 template <class T, unsigned n>
00032 class bsta_sample_set : public bsta_parzen_sphere<T,n>
00033 {
00034  public:
00035 
00036   typedef typename bsta_parzen_sphere<T,n>::vector_type vector_;
00037 
00038   // Constructor
00039   bsta_sample_set(T bandwidth = T(1)) : bsta_parzen_sphere<T,n>() { this->set_bandwidth(bandwidth); }
00040 
00041   //: Compute the mean in a window around the given pt, the window size is the bandwidth
00042   //  If there are no points within bandwidth of the input pt, \return false
00043   bool mean(vector_ const& pt, vector_& out);
00044 
00045   //: Insert a weighted sample into the distribution
00046   void insert_sample(vector_ const& sample, T weight = T(1.0));
00047 
00048   T weight(unsigned i) const { return weights_[i]; }
00049   vcl_vector<T>& weights() { return weights_; }
00050 
00051   //: one may need to normalize the weights after the insertion is over
00052   void normalize_weights();
00053 
00054   //: must call this method before using the assignment vector
00055   void initialize_assignments();  // initializes each sample's assignment to -1 (null assignment)
00056   vcl_vector<int>& assignments() { return assignments_; }
00057   int assignment(unsigned i) const { return assignments_[i]; }
00058   void set_assignment(unsigned i, int mode) { assignments_[i] = mode; }
00059 
00060   //: compute the mean of a particular assignment/mode/cluster
00061   bool mode_mean(int mode, vector_& out) const;
00062 
00063   typename vcl_vector<typename bsta_parzen_sphere<T,n>::vector_type >::const_iterator samples_begin() const { return bsta_parzen<T,n>::samples_.begin(); }
00064   typename vcl_vector<typename bsta_parzen_sphere<T,n>::vector_type >::const_iterator samples_end() const { return bsta_parzen<T,n>::samples_.end(); }
00065 
00066   typename vcl_vector<T >::const_iterator weights_begin() const { return weights_.begin(); }
00067   typename vcl_vector<T >::const_iterator weights_end() const { return weights_.end(); }
00068 
00069   vcl_vector<int >::const_iterator assignments_begin() const { return assignments_.begin(); }
00070   vcl_vector<int >::const_iterator assignments_end() const { return assignments_.end(); }
00071 
00072   bool check_initializations() const { return bsta_parzen<T,n>::samples_.size() == weights_.size() &&
00073                                         bsta_parzen<T,n>::samples_.size() == assignments_.size(); }
00074 
00075   //: return number of assignments to this mode
00076   int mode_size(int mode) const;
00077 
00078   //: return total weight of assignments to this mode
00079   T mode_weight(int mode) const;
00080 
00081   //: return number of modes in the current assignment vector
00082   unsigned mode_cnt() const;
00083 
00084   //: return total weight of all assignments
00085   T total_weight() const;
00086 
00087  private:
00088   //: hold a vector of weights for each data sample
00089   //  Needs to be set separately with each insert into the data set,
00090   //  otherwise it's set to 1.0 by default at the first call to mean()
00091   vcl_vector<T> weights_;
00092 
00093   vcl_vector<int> assignments_;  // a negative value indicates "null assignment"
00094 };
00095 
00096 //: compute the variance of a particular assignment in a bsta_sample_set
00097 template <class T>
00098 bool bsta_sample_set_variance(const bsta_sample_set<T,1>& set, int mode, T min_var, T& out)
00099 {
00100   typedef typename vcl_vector<T >::const_iterator sit_t;
00101   typedef typename vcl_vector<T >::const_iterator wit_t;
00102   typedef typename vcl_vector<int >::const_iterator ait_t;
00103 
00104   if (!set.check_initializations()) {
00105     vcl_cout << "Error in - bsta_sample_set<T,n>::mean() : assignments not initialized!\n";
00106     return false;
00107   }
00108 
00109   T mv;
00110   set.mode_mean(mode, mv);
00111 
00112   T sum(T(0));
00113   sit_t sit = set.samples_begin();
00114   wit_t wit = set.weights_begin();
00115   ait_t ait = set.assignments_begin();
00116   T nsamp = 0;
00117   for (; sit != set.samples_end(); ++sit, ++wit, ++ait) {
00118     if (*ait != mode)
00119       continue;
00120 
00121     T s = (*sit-mv)*(*sit-mv);
00122     sum += (*wit)*s;
00123     nsamp += (*wit);
00124   }
00125   if (nsamp > 0) {
00126     out = sum / nsamp;
00127     if (out < min_var)
00128       out = min_var;
00129     return true;
00130   }
00131 
00132   return false;
00133 }
00134 
00135 //: compute the variance of a particular assignment in a bsta_sample_set
00136 template <class T, unsigned n>
00137 bool bsta_sample_set_variance(const bsta_sample_set<T,n>& set, int mode, vnl_matrix_fixed<T,n,n>& out)
00138 {
00139   typedef typename vcl_vector<vnl_vector_fixed<T,n> >::const_iterator sit_t;
00140   typedef typename vcl_vector<T >::const_iterator wit_t;
00141   typedef typename vcl_vector<int >::const_iterator ait_t;
00142 
00143   if (!set.check_initializations()) {
00144     vcl_cout << "Error in - bsta_sample_set<T,n>::mean() : assignments not initialized!\n";
00145     return false;
00146   }
00147 
00148   vnl_vector_fixed<T,n> mv;
00149   set.mode_mean(mode, mv);
00150 
00151   vnl_matrix_fixed<T,n,n> sum(T(0));
00152   sit_t sit = set.samples_begin();
00153   wit_t wit = set.weights_begin();
00154   ait_t ait = set.assignments_begin();
00155   T nsamp = 0;
00156   for (; sit != set.samples_end(); ++sit, ++wit, ++ait) {
00157     if (*ait != mode)
00158       continue;
00159 
00160     vnl_vector_fixed<T,n> diff = (*sit)-mv;
00161     sum += (*wit)*outer_product(diff,diff);
00162     nsamp += (*wit);
00163   }
00164   if (nsamp > 0) {
00165     out = sum / nsamp;
00166 
00167     return true;
00168   }
00169 
00170   return false;
00171 }
00172 
00173 //: compute the marginalized 1D sample set distribution from nD set
00174 template <class T, unsigned n>
00175 bool bsta_sample_set_marginalize(const bsta_sample_set<T,n>& set, unsigned component, bsta_sample_set<T,1>& out_set)
00176 {
00177   typedef typename vcl_vector<vnl_vector_fixed<T,n> >::const_iterator sit_t;
00178   typedef typename vcl_vector<T >::const_iterator wit_t;
00179 
00180   if (n <= component)  // if the vector is not as large to have component return false
00181     return false;
00182 
00183   sit_t sit = set.samples_begin();
00184   wit_t wit = set.weights_begin();
00185 
00186   for (; sit != set.samples_end(); ++sit, ++wit) {
00187     out_set.insert_sample((*sit)[component], (*wit));
00188   }
00189 
00190   return true;
00191 }
00192 
00193 template <class T>
00194 bool bsta_sample_set_fit_distribution(const bsta_sample_set<T,1>& set, bsta_mixture<bsta_num_obs<bsta_gaussian_sphere<T,1> > >& out)
00195 {
00196   if (!set.check_initializations()) {
00197     vcl_cout << "Error in - bsta_sample_set<T,n>::mean() : assignments not initialized!\n";
00198     return false;
00199   }
00200 
00201   // compute mean and variance for each mode
00202   unsigned mode_cnt = set.mode_cnt();
00203 
00204   while (out.num_components() != 0) {
00205     out.remove_last();
00206   }
00207 
00208   T total_weight = set.total_weight();
00209   for (unsigned mi = 0; mi < mode_cnt; mi++) {
00210     T meanv;
00211     set.mode_mean(mi, meanv);
00212     T var;
00213     if (!bsta_sample_set_variance(set, mi, T(MIN_VAR_), var))
00214       return false;
00215     T w = set.mode_weight(mi);
00216     bsta_gaussian_sphere<T,1> gauss_d(meanv,var);
00217     bsta_num_obs<bsta_gaussian_sphere<T,1> > gauss(gauss_d, w);
00218     if (!out.insert(gauss, w/total_weight))
00219       return false;
00220   }
00221 
00222   return true;
00223 }
00224 
00225 template <class T, unsigned n>
00226 bool bsta_sample_set_fit_distribution(const bsta_sample_set<T,n>& set, bsta_mixture<bsta_num_obs<bsta_gaussian_full<T,n> > >& out)
00227 {
00228   if (!set.check_initializations()) {
00229     vcl_cout << "Error in - bsta_sample_set<T,n>::mean() : assignments not initialized!\n";
00230     return false;
00231   }
00232 
00233   // compute mean and variance for each mode
00234   unsigned mode_cnt = set.mode_cnt();
00235 
00236   while (out.num_components() != 0) {
00237     out.remove_last();
00238   }
00239 
00240   T total_weight = set.total_weight();
00241   for (unsigned mi = 0; mi < mode_cnt; mi++) {
00242     vnl_vector_fixed<T,n> meanv;
00243     set.mode_mean(mi, meanv);
00244     vnl_matrix_fixed<T,n,n> covar;
00245     if (!bsta_sample_set_variance(set, mi, covar))
00246       return false;
00247     T w = set.mode_weight(mi);
00248     bsta_gaussian_full<T,n> gauss_d(meanv,covar);
00249     bsta_num_obs<bsta_gaussian_full<T,n> > gauss(gauss_d, w);
00250     if (!out.insert(gauss, w/total_weight))
00251       return false;
00252   }
00253 
00254   return true;
00255 }
00256 
00257 //:
00258 //  Total weight is used to normalize the weight of the distribution
00259 //  (bsta_num_obs class contains total weight of samples assigned to this distribution)
00260 template <class T>
00261 T bsta_sample_set_log_likelihood(const bsta_sample_set<T,1>& set, bsta_num_obs<bsta_gaussian_sphere<T,1> >& dist, T total_weight)
00262 {
00263   if (!set.size()) {
00264     vcl_cout << "Error in - bsta_sample_set<T,n>::bsta_sample_set_log_likelihood() : assignments not initialized!\n";
00265     return T(0);
00266   }
00267 
00268   T w = dist.num_observations;
00269   T p_dist = w/total_weight;
00270   T sum = T(0);
00271   for (unsigned i = 0; i < set.size(); i++) {
00272     if (vcl_sqrt(dist.sqr_mahalanobis_dist(set.sample(i))) < 3) {// we don't want zero to be logged
00273       T p = T(vcl_log(dist.prob_density(set.sample(i))));
00274       T pw = T(vcl_log(p_dist));
00275       sum += p + pw;  // log is natural logarithm
00276     }
00277   }
00278 
00279   return sum;
00280 }
00281 
00282 //:
00283 //  Total weight is used to normalize the weight of the distribution
00284 //  (bsta_num_obs class contains total weight of samples assigned to this distribution)
00285 template <class T, unsigned n>
00286 T bsta_sample_set_log_likelihood(const bsta_sample_set<T,n>& set, bsta_num_obs<bsta_gaussian_full<T,n> >& dist, T total_weight)
00287 {
00288   if (!set.size()) {
00289     vcl_cout << "Error in - bsta_sample_set<T,n>::bsta_sample_set_log_likelihood() : assignments not initialized!\n";
00290     return T(0);
00291   }
00292 
00293   T w = dist.num_observations;
00294   T p_dist = w/total_weight;
00295   T sum = T(0);
00296   for (unsigned i = 0; i < set.size(); i++) {
00297     if (vcl_sqrt(dist.sqr_mahalanobis_dist(set.sample(i))) < 3) {// we don't want zero to be logged
00298       T p = T(vcl_log(dist.prob_density(set.sample(i))));
00299       T pw = T(vcl_log(p_dist));
00300       sum += p + pw;  // log is natural logarithm
00301     }
00302   }
00303 
00304   return sum;
00305 }
00306 
00307 //:
00308 //  Total weight is used to normalize the weight of the distribution
00309 //  (bsta_num_obs class contains total weight of samples assigned to this distribution)
00310 template <class T>
00311 T bsta_sample_set_log_likelihood(const bsta_sample_set<T,2>& set, bsta_num_obs<bsta_gaussian_sphere<T,1> >& dist0, T w0, bsta_num_obs<bsta_gaussian_sphere<T,1> >& dist1, T w1, T& w_sum)
00312 {
00313   if (!set.size()) {
00314     vcl_cout << "Error in - bsta_sample_set<T,n>::bsta_sample_set_log_likelihood() : set is empty!\n";
00315     return T(0);
00316   }
00317 
00318   T total_weight = T(0);
00319   w_sum = T(0);
00320   T sum = T(0);
00321   unsigned cnt = 0;
00322   for (unsigned i = 0; i < set.size(); i++) {
00323 
00324     T d0 = dist0.sqr_mahalanobis_dist(set.sample(i)[0]);
00325     T d1 = dist1.sqr_mahalanobis_dist(set.sample(i)[1]);
00326     T d0_sqrt = vcl_sqrt(d0);
00327     T d1_sqrt = vcl_sqrt(d1);
00328 
00329     if (d0_sqrt < 3) {
00330       if (d1_sqrt < 3) {  // if this sample belongs to both of these modes
00331         w_sum += set.weight(i);
00332         cnt++;
00333       } else {
00334         d1 = 9; // make max distance to be 9
00335       }
00336     } else {
00337       d0 = 9;
00338     }
00339 
00340     T p = dist0.dist_prob_density(d0);
00341     p *= w0;
00342     T p0 = T(vcl_log(p));
00343     p = dist1.dist_prob_density(d1);
00344     p *= w1;
00345     T p1 = T(vcl_log(p));
00346 
00347     sum += p0 + p1;  // log is natural logarithm
00348 
00349     total_weight += set.weight(i);
00350   }
00351 
00352   // w_sum is the total weight of all the samples assigned to these two modes w_sum/total_weight becomes the probability of this joint mode
00353   T prior = w_sum/total_weight;
00354   T log_prior = T(vcl_log(prior));
00355   T tot_log_prior = set.size()*log_prior;
00356   sum += tot_log_prior;
00357 
00358   return sum;
00359 }
00360 
00361 
00362 //: a specialized matlab file printer for 2D data
00363 template<class T>
00364 bool bsta_sample_set_print_to_m(const bsta_sample_set<T,2>& set, vcl_ofstream& of)
00365 {
00366   // print samples in different colors according to the assignment
00367   unsigned mode_cnt = set.mode_cnt();
00368 
00369   of << "cmap = colormap(lines(" << mode_cnt << "));\n";
00370 
00371   for (unsigned m = 0; m < mode_cnt; m++) {
00372     vcl_vector<vcl_pair<T,T> > points;
00373     for (unsigned i = 0; i < set.size(); i++) {
00374       if (set.assignment(i) == m)
00375         points.push_back(vcl_pair<T,T>(T(set.sample(i)[0]), T(set.sample(i)[1])));
00376     }
00377     if (points.size() > 0) {
00378       of << "x = [" << points[0].first;
00379       for (unsigned i = 1; i < points.size(); i++) {
00380         of << ", " << points[i].first;
00381       }
00382       of << "];\n";
00383       of << "y = [" << points[0].second;
00384       for (unsigned i = 1; i < points.size(); i++) {
00385         of << ", " << points[i].second;
00386       }
00387       of << "];\n";
00388       of << "h = plot(x,y,'or');\nset(h, 'Color', cmap(" << m+1 << ",:));\n";
00389       of << "hold on\n";
00390     }
00391 
00392     vnl_vector_fixed<T,2> mode;
00393     set.mode_mean(m, mode);
00394     of << "xx = [" << mode[0] << "];\n";
00395     of << "yy = [" << mode[1] << "];\n";
00396     of << "h = plot(xx,yy,'+r');\nset(h, 'Color', cmap(" << m+1 << ",:));\n";
00397     of << "hold on\n";
00398   }
00399 
00400   return true;
00401 }
00402 
00403 //: a specialized matlab file printer to visualize printed distribution
00404 template<class T>
00405 bool bsta_sample_set_dist_print_to_m(const bsta_sample_set<T,2>& set, vcl_ofstream& of)
00406 {
00407   // print samples in different colors according to the assignment
00408   unsigned mode_cnt = set.mode_cnt();
00409 
00410   of << "cmap = colormap(lines(" << mode_cnt << "));\n";
00411 
00412   // find range of data to construct surface properly
00413   T min = T(10000), max = T(0);
00414   for (unsigned i = 0; i < set.size(); i++) {
00415     if (set.sample(i)[0] < min)
00416       min = set.sample(i)[0];
00417     if (set.sample(i)[0] > max)
00418       max = set.sample(i)[0];
00419     if (set.sample(i)[1] < min)
00420       min = set.sample(i)[1];
00421     if (set.sample(i)[1] > max)
00422       max = set.sample(i)[1];
00423   }
00424 
00425   of << "[x,y] = meshgrid(" << min << ":.2:" << max << ", " << min << ":.2:" << max << ");\n";
00426 
00427   T total_weight = set.total_weight();
00428   for (unsigned m = 0; m < mode_cnt; m++) {
00429     vnl_vector_fixed<T,2> mode;
00430     set.mode_mean(m, mode);
00431     of << "mu = [" << mode[0] << ' ' << mode[1] << "];\n";
00432     vnl_matrix_fixed<T,2,2> covar;
00433     if (!bsta_sample_set_variance(set, m, covar))
00434       return false;
00435     of << "sigma = [";
00436     for (unsigned r = 0; r < 2; r++) {
00437       for (unsigned c = 0; c < 2; c++)
00438         of << covar[r][c] << ' ';
00439       if (r == 0)
00440         of << ';';
00441     }
00442     of << "];\n";
00443     of << "X = [x(:) y(:)];\n";
00444     of << "p = " << set.mode_weight(m)/total_weight << "*mvnpdf(X, mu, sigma);\n";
00445     of << "c = " << (m+1) << "*ones(size(x));\n";
00446     of << "surf(x,y,reshape(p,size(x,1),size(x,2)),c);\n";
00447     of << "hold on\n";
00448   }
00449 
00450   return true;
00451 }
00452 
00453 //: a specialized matlab file printer to visualize printed distribution
00454 template<class T>
00455 bool bsta_sample_set_dist_print_to_m(const bsta_sample_set<T,1>& set, vcl_ofstream& of)
00456 {
00457   // print samples in different colors according to the assignment
00458   unsigned mode_cnt = set.mode_cnt();
00459 
00460   of << "cmap = colormap(lines(" << mode_cnt << "));\n";
00461 
00462   // find range of data to construct surface properly
00463   T min = T(10000), max = T(0);
00464   for (unsigned i = 0; i < set.size(); i++) {
00465     if (set.sample(i) < min)
00466       min = set.sample(i);
00467     if (set.sample(i) > max)
00468       max = set.sample(i);
00469   }
00470 
00471   of << "X = [" << min << ":.2:" << max << "]';\n";
00472 
00473   T total_weight = set.total_weight();
00474   for (unsigned m = 0; m < mode_cnt; m++) {
00475     T mode;
00476     set.mode_mean(m, mode);
00477     of << "mu = [" << mode << "];\n";
00478     T var;
00479     if (!bsta_sample_set_variance(set, m, MIN_VAR_, var))
00480       return false;
00481     of << "sigma = [" << var << "];\n";
00482     of << "p = " << set.mode_weight(m)/total_weight << "*mvnpdf(X, mu, sigma);\n";
00483     of << "hh = plot(X,p,'+r');\nset(hh, 'Color', cmap(" << m+1 << ",:));\n";
00484     of << "hold on\n";
00485   }
00486 
00487   return true;
00488 }
00489 
00490 #endif // bsta_sample_set_h_