contrib/mul/mcal/mcal_pca.cxx
Go to the documentation of this file.
00001 #include "mcal_pca.h"
00002 //:
00003 // \file
00004 #include <vcl_cstdlib.h>
00005 #include <vcl_string.h>
00006 #include <vcl_vector.h>
00007 #include <vcl_cmath.h>
00008 #include <vcl_sstream.h>
00009 
00010 #include <vsl/vsl_indent.h>
00011 #include <vsl/vsl_binary_io.h>
00012 #include <vnl/algo/vnl_symmetric_eigensystem.h>
00013 #include <vnl/vnl_math.h>
00014 #include <vul/vul_string.h>
00015 #include <mbl/mbl_parse_block.h>
00016 #include <mbl/mbl_read_props.h>
00017 #include <mbl/mbl_matrix_products.h>
00018 #include <mbl/mbl_exception.h>
00019 
00020 //=======================================================================
00021 // Constructors
00022 //=======================================================================
00023 
00024 mcal_pca::mcal_pca()
00025 {
00026   min_modes_ = 0;
00027   max_modes_ = 9999;
00028   var_prop_ = 0.98;
00029 
00030   max_d_in_memory_ = 1.0e8;
00031   use_chunks_=false;
00032 }
00033 
00034 //=======================================================================
00035 // Destructor
00036 //=======================================================================
00037 
00038 mcal_pca::~mcal_pca()
00039 {
00040 }
00041 
00042 
00043 //: Return the number of modes to retain
00044 unsigned mcal_pca::choose_n_modes(const vnl_vector<double>& evals)
00045 {
00046   if (evals.size()==0) return 0;
00047   const double* v_data = evals.begin();
00048   unsigned n=evals.size();
00049 
00050   if (var_prop_==0.0) return 0;
00051 
00052   // Find last positive eigenvalue
00053   unsigned i=0;
00054   while (i<n && v_data[i]>0) i++;
00055   n = i;
00056 
00057   if (n==0) return 0;
00058 
00059   if ((var_prop_>=1.0) || (var_prop_<=0.0))
00060   {
00061     if (n>max_modes_ /* && max_modes_>=0 */ ) // max_modes_ is unsigned
00062       return max_modes_;
00063     else
00064       return n;
00065   }
00066 
00067   // Compute total variance
00068   double total_var=0.0;
00069   for (i=0;i<n;++i) total_var += v_data[i];
00070   if (total_var==0.0) return 0;
00071 
00072   double target_var = total_var * var_prop_;
00073 
00074   // Find number of modes required to achieve target_var;
00075   double var=0.0;
00076   unsigned n_modes=0;
00077   while (var<target_var && n_modes<n)
00078   { var+=v_data[n_modes]; n_modes++; }
00079 
00080   if (n_modes>max_modes_) return max_modes_;
00081   if (n_modes<min_modes_ && min_modes_<=n) return min_modes_;
00082   return n_modes;
00083 }
00084 
00085 //: Compute eigenvectors assuming fewer dimensions than samples
00086 void mcal_pca::build_evecs_nd_smaller(mbl_data_wrapper<vnl_vector<double> >& data,
00087                                       const vnl_vector<double>& mean,
00088                                       vnl_matrix<double>& evecs,
00089                                       vnl_vector<double>& evals)
00090 {
00091   data.reset();
00092   unsigned int n_samples = data.size();
00093   unsigned int n_dims = data.current().size();
00094 
00095   vnl_matrix<double> S(n_dims,n_dims);
00096   double **s_data = S.data_array();
00097   S.fill(0);
00098 
00099   vnl_vector<double> dv;
00100   for (unsigned int k=0;k<n_samples;++k)
00101   {
00102     dv = data.current();
00103     dv-=mean;
00104     const double* v_data = dv.begin();
00105     for (unsigned int i=0;i<n_dims;++i)
00106     {
00107       double *s_row = s_data[i];
00108       double v_i = v_data[i];
00109       for (unsigned int j=0;j<n_dims;++j)
00110         s_row[j] += v_i*v_data[j];
00111     }
00112 
00113     data.next();
00114   }
00115 
00116   S/=n_samples;
00117 
00118   // Compute eigenvectors and values of S
00119   vnl_matrix<double> EVecs(n_dims,n_dims);
00120   vnl_vector<double> EVals(n_dims);
00121   vnl_symmetric_eigensystem_compute(S,EVecs,EVals);
00122 
00123   // Note: EVecs and EVals are ordered with the smallest first
00124   evals = EVals;
00125   evals.flip();
00126   unsigned n_modes = choose_n_modes(evals);
00127 
00128   // Extract the relevant n_modes from EVecs and EVals
00129   evecs.set_size(n_dims,n_modes);
00130   evals.set_size(n_modes);
00131   double **EV_data = EVecs.data_array();
00132   double **ev_data = evecs.data_array();
00133 
00134   for (unsigned i=0;i<n_dims;++i)
00135   {
00136     // Flip each row
00137     double *EV = EV_data[i]-1;
00138     double *ev = ev_data[i];
00139     for (unsigned j=0;j<n_modes;++j) ev[j]=EV[n_dims-j];
00140   }
00141   for (unsigned j=0;j<n_modes;++j)
00142     evals[j]=EVals[n_dims-1-j];
00143 }
00144 
00145 //: Compute eigenvectors assuming fewer samples than dimensions
00146 void mcal_pca::build_evecs_ns_smaller(mbl_data_wrapper<vnl_vector<double> >& data,
00147                                       const vnl_vector<double>& mean,
00148                                       vnl_matrix<double>& evecs,
00149                                       vnl_vector<double>& evals)
00150 {
00151   data.reset();
00152   // Principle components are generated by computing eigenvectors
00153   // of a n_dims square matrix, DDt
00154   unsigned int n_samples = data.size();
00155   unsigned int n_dims = data.current().size();
00156 
00157   double n_total = n_samples * n_dims;
00158   unsigned int n_chunks = 1 + int(2*n_total/max_d_in_memory_);
00159   double chunk_size = double(n_samples)/n_chunks;
00160 
00161   vnl_matrix<double> DDt(n_samples,n_samples);
00162   vnl_matrix<double> D;
00163   vnl_vector<double> dv;
00164 
00165   static bool message_given_before = false;
00166   if (!use_chunks_ && n_chunks>1 && !message_given_before)
00167   {
00168     message_given_before=true;
00169     vcl_cerr<<"\n"
00170             <<"WARNING - mcal_pca:  The size of the matrix is\n"
00171             <<"  possibly too large for the memory. If the build fails, try\n"
00172             <<"  setting set_use_chunks to true in the mcal_pca.\n\n";
00173   }
00174 
00175 
00176   // If all chunks can be managed in one pass
00177   if (!use_chunks_ || n_chunks<=2 )
00178   {
00179     // Read data into a single matrix
00180     D.set_size(n_samples,n_dims);
00181     for (unsigned int i=0;i<n_samples;i++)
00182     {
00183       dv = data.current();
00184       dv -= mean;
00185       D.set_row(i,dv);
00186       data.next();
00187     }
00188 
00189     mbl_matrix_product_a_bt(DDt,D,D);
00190   }
00191   else
00192   {
00193     vcl_cout<<vsl_indent()<<"mcal_pca:  Constructing DD_t in "
00194             <<n_chunks<<" passes."<<vcl_endl;
00195 
00196     // Set up indices for chunks
00197     vcl_vector<int> start(n_chunks),end(n_chunks);
00198     start[0] = 0;
00199     end[0] = vnl_math_rnd(chunk_size)-1;
00200     for (unsigned int i=1;i<n_chunks;i++)
00201     {
00202       start[i] = end[i-1]+1;
00203       end[i] = vnl_math_rnd((i+1)*chunk_size) -1;
00204     }
00205     end[n_chunks-1] = n_samples-1;
00206 
00207     vnl_matrix<double> D1,D2,DDt1;
00208     for (unsigned int i=0;i<n_chunks;i++)
00209     {
00210       int n = 1+end[i]-start[i];
00211       D1.set_size(n,n_dims);
00212       data.set_index(start[i]);
00213       for (int j=0;j<n;++j)
00214       {
00215         dv=data.current();
00216         dv-=mean;
00217         D1.set_row(j,dv);
00218         data.next();
00219       }
00220 
00221       mbl_matrix_product_a_bt(DDt1,D1,D1);
00222       fillDDt(DDt,DDt1,start[i],end[i],start[i],end[i]);
00223 
00224       for (unsigned int j=i+1;j<n_chunks;j++)
00225       {
00226         //data.chunk(D2,start[j],end[j]);
00227         int n = 1+end[j]-start[j];
00228         D2.set_size(n,n_dims);
00229         data.set_index(start[j]);
00230         for (int k=0;k<n;++k)
00231         {
00232           dv=data.current();
00233           dv-=mean;
00234           D2.set_row(k,dv);
00235           data.next();
00236         }
00237         //subtractMean(D2);
00238 
00239         mbl_matrix_product_a_bt(DDt1,D1,D2);
00240         fillDDt(DDt,DDt1,start[i],end[i],start[j],end[j]);
00241       }
00242     }
00243   }
00244 
00245   DDt/=n_samples;
00246 
00247   // Compute eigenvectors
00248   vnl_matrix<double> EVecs(n_samples,n_samples);
00249   vnl_vector<double> EVals;
00250   evals.set_size(n_samples);
00251   vnl_symmetric_eigensystem_compute(DDt,EVecs,EVals);
00252 
00253   // Arrange that values/vectors ordered with largest first
00254   EVals.flip();
00255   EVecs.fliplr();
00256 
00257   unsigned n_modes = 0;
00258   if (n_samples>=2)
00259     n_modes = choose_n_modes(EVals);
00260 
00261   if (n_modes==0)
00262   {
00263     evals.set_size(0);
00264     evecs.set_size(0,0);
00265     return;
00266   }
00267 
00268   // Retain larges n_modes eigenvalues
00269   evals.set_size(n_modes);
00270   for (unsigned i=0;i<n_modes;++i) evals[i]=EVals[i];
00271 
00272   // Now compute the principal components from PC = D.transpose() * EVecs
00273   evecs.set_size(n_dims,n_modes);
00274   double **EV_data = EVecs.data_array();
00275   double **ev_data = evecs.data_array();
00276   evecs.fill(0);
00277 
00278   double **D_data = D.data_array();
00279   if (!use_chunks_ || n_chunks<=2)
00280   {
00281     for (unsigned int i=0;i<n_samples;++i)
00282     {
00283       double* dv_data = D_data[i];
00284       double* EV_row = EV_data[i];
00285       for (unsigned int j=0;j<n_modes;++j)
00286       {
00287         double EV = EV_row[j];
00288         for (unsigned int k=0;k<n_dims;++k)
00289           ev_data[k][j] += dv_data[k]*EV;
00290       }
00291     }
00292   }
00293   else
00294   {
00295     vnl_vector<double> dv;
00296     data.reset();
00297     for (unsigned int i=0;i<n_samples;++i)
00298     {
00299       dv = data.current();
00300       dv -= mean;
00301       double* dv_data = dv.begin();
00302       double* EV_row = EV_data[i];
00303       for (unsigned int j=0;j<n_modes;++j)
00304       {
00305         double EV = EV_row[j];
00306         for (unsigned int k=0;k<n_dims;++k)
00307           ev_data[k][j] += dv_data[k]*EV;
00308       }
00309 
00310       data.next();
00311     }
00312   }
00313 
00314   // Normalise each column
00315   vnl_vector<double> col_sum(n_modes);
00316   col_sum.fill(0);
00317   double* cs = col_sum.begin();
00318   for (unsigned int k=0;k<n_dims;k++)
00319   {
00320     const double* row = ev_data[k];
00321     for (unsigned int j=0;j<n_modes;++j)
00322       cs[j]+=row[j]*row[j];
00323   }
00324   for (unsigned int j=0;j<n_modes;++j) cs[j]=1.0/vcl_sqrt(cs[j]);
00325   for (unsigned int k=0;k<n_dims;k++)
00326   {
00327     double* row = ev_data[k];
00328     for (unsigned int j=0;j<n_modes;++j)
00329       row[j]*=cs[j];
00330   }
00331 }
00332 
00333 //: Support function for chunks
00334 void mcal_pca::fillDDt(vnl_matrix<double>& DDt, const vnl_matrix<double>& A,
00335                        int rlo, int rhi, int clo, int chi)
00336 {
00337   double **DDt_data = DDt.data_array();
00338   const double *const*A_data = A.data_array();
00339 
00340   for (int r=rlo;r<=rhi;++r)
00341   {
00342     double *DDt_row = DDt_data[r];
00343     const double* A_row = A_data[r-rlo]-clo;
00344     for (int c=clo;c<=chi;c++)
00345     {
00346       DDt_row[c] = A_row[c];
00347       DDt_data[c][r] = A_row[c];    // Transpose
00348     }
00349   }
00350 }
00351 
00352 void mcal_pca::set_mode_choice(unsigned min, unsigned max,
00353                                double var_proportion)
00354 {
00355   min_modes_ = min;
00356   max_modes_ = max;
00357   var_prop_ = var_proportion;
00358 }
00359 
00360 //: Set the choice for the minimum number of model
00361 void mcal_pca::set_min_modes( const unsigned min )
00362 {
00363   min_modes_ = min;
00364 }
00365 
00366 
00367 //: Current lower limit on number of parameters
00368 unsigned mcal_pca::min_modes() const
00369 {
00370   return min_modes_;
00371 }
00372 
00373 void mcal_pca::set_max_modes(unsigned max)
00374 {
00375   max_modes_ = max;
00376 }
00377 
00378 
00379 //: Current upper limit on number of parameters
00380 unsigned mcal_pca::max_modes() const
00381 {
00382   return max_modes_;
00383 }
00384 
00385 //: Define proportion of data variance to explain
00386 void mcal_pca::set_var_prop(double v)
00387 {
00388   var_prop_ = v;
00389 }
00390 
00391 //: Current proportion of data variance to explain
00392 double mcal_pca::var_prop() const
00393 {
00394   return var_prop_;
00395 }
00396 
00397 void mcal_pca::set_max_d_in_memory(double max_n)
00398 {
00399   max_d_in_memory_ = max_n;
00400 }
00401 
00402 //: Set whether we may build in chunks if required
00403 void mcal_pca::set_use_chunks(bool chunks)
00404 {
00405   use_chunks_=chunks;
00406 }
00407 
00408 //: Compute modes of the supplied data relative to the supplied mean
00409 //  Model is x = mean + modes*b,  where b is a vector of weights on each mode.
00410 //  mode_var[i] gives the variance of the data projected onto that mode.
00411 void mcal_pca::build_about_mean(mbl_data_wrapper<vnl_vector<double> >& data,
00412                                 const vnl_vector<double>& mean,
00413                                 vnl_matrix<double>& modes,
00414                                 vnl_vector<double>& mode_var)
00415 {
00416   if (data.size()==0)
00417   {
00418     vcl_cerr<<"mcal_pca::build_about_mean() No samples supplied.\n";
00419     vcl_abort();
00420   }
00421 
00422   data.reset();
00423 
00424   if (data.current().size()==0)
00425   {
00426     vcl_cerr<<"mcal_pca::build_about_mean()\n"
00427             <<"Warning: Samples claim to have zero dimensions.\n"
00428             <<"Constructing empty model.\n";
00429 
00430     modes.set_size(0,0);
00431     mode_var.set_size(0);
00432     return;
00433   }
00434 
00435   // Choose suitable strategy for computing principal components
00436   if (data.current().size()<data.size())
00437     build_evecs_nd_smaller(data,mean,modes,mode_var);
00438   else
00439     build_evecs_ns_smaller(data,mean,modes,mode_var);
00440 }
00441 
00442 
00443 //=======================================================================
00444 // Method: is_a
00445 //=======================================================================
00446 
00447 vcl_string  mcal_pca::is_a() const
00448 {
00449   return vcl_string("mcal_pca");
00450 }
00451 
00452 //=======================================================================
00453 // Method: version_no
00454 //=======================================================================
00455 
00456 short mcal_pca::version_no() const
00457 {
00458   return 1;
00459 }
00460 
00461 //=======================================================================
00462 // Method: clone
00463 //=======================================================================
00464 
00465 mcal_component_analyzer* mcal_pca::clone() const
00466 {
00467   return new mcal_pca(*this);
00468 }
00469 
00470 //=======================================================================
00471 // Method: print
00472 //=======================================================================
00473 
00474 void mcal_pca::print_summary(vcl_ostream& os) const
00475 {
00476   os<<'\n'
00477     <<vsl_indent()<<"min_modes: "<<min_modes_<<'\n'
00478     <<vsl_indent()<<"max_modes: "<<max_modes_<<'\n'
00479     <<vsl_indent()<<"var_prop: "<<var_prop_<<'\n'
00480     <<vsl_indent()<<"max_d_in_memory: "<<max_d_in_memory_<<'\n'
00481     <<vsl_indent()<<"use_chunks: "<<use_chunks_<<'\n';
00482 }
00483 
00484 //=======================================================================
00485 // Method: save
00486 //=======================================================================
00487 
00488 void mcal_pca::b_write(vsl_b_ostream& bfs) const
00489 {
00490   vsl_b_write(bfs,version_no());
00491   vsl_b_write(bfs,min_modes_);
00492   vsl_b_write(bfs,max_modes_);
00493   vsl_b_write(bfs,var_prop_);
00494   vsl_b_write(bfs,max_d_in_memory_);
00495   vsl_b_write(bfs,use_chunks_);
00496 }
00497 
00498 //=======================================================================
00499 // Method: load
00500 //=======================================================================
00501 
00502 void mcal_pca::b_read(vsl_b_istream& bfs)
00503 {
00504   short version;
00505   vsl_b_read(bfs,version);
00506   switch (version)
00507   {
00508     case 1:
00509       vsl_b_read(bfs,min_modes_);
00510       vsl_b_read(bfs,max_modes_);
00511       vsl_b_read(bfs,var_prop_);
00512       vsl_b_read(bfs,max_d_in_memory_);
00513       vsl_b_read(bfs,use_chunks_);
00514       break;
00515     default:
00516       vcl_cerr << "mcal_pca::b_read()\n"
00517                << "Unexpected version number " << version << vcl_endl;
00518       vcl_abort();
00519   }
00520 }
00521 
00522 //=======================================================================
00523 //: Read initialisation settings from a stream.
00524 // Parameters:
00525 // \verbatim
00526 // {
00527 //   min_modes: 0 max_modes: 99 prop_modes: 0.99
00528 //   // Maximum number of doubles to store in memory at once
00529 //   max_d_in_memory: 1e8
00530 //   // Indicate how to build from large amounts of data
00531 //   use_chunks: false
00532 // }
00533 // \endverbatim
00534 // \throw mbl_exception_parse_error if the parse fails.
00535 void mcal_pca::config_from_stream(vcl_istream & is)
00536 {
00537   vcl_string s = mbl_parse_block(is);
00538 
00539   vcl_istringstream ss(s);
00540   mbl_read_props_type props = mbl_read_props_ws(ss);
00541 
00542   {
00543     int m_min =0,m_max=999;
00544     double m_props=0.05;
00545     if (props.find("min_modes")!=props.end())
00546     {
00547       m_min=vul_string_atoi(props["min_modes"]);
00548       props.erase("min_modes");
00549     }
00550 
00551     if (props.find("max_modes")!=props.end())
00552     {
00553       m_max=vul_string_atoi(props["max_modes"]);
00554       props.erase("max_modes");
00555     }
00556 
00557     if (props.find("var_prop")!=props.end())
00558     {
00559       m_props=vul_string_atof(props["var_prop"]);
00560       props.erase("var_prop");
00561     }
00562 
00563     set_mode_choice(m_min,m_max,m_props);
00564   }
00565 
00566   if (props.find("max_d_in_memory")!=props.end())
00567   {
00568     max_d_in_memory_=vul_string_atof(props["max_d_in_memory"]);
00569     props.erase("max_d_in_memory");
00570   }
00571 
00572 
00573   {
00574     if (!props["use_chunks"].empty())
00575     {
00576       use_chunks_ = vul_string_to_bool(props["use_chunks"]);
00577       props.erase("use_chunks");
00578     }
00579     else
00580       use_chunks_=false;
00581 
00582     props.erase("use_chunks");
00583   }
00584 
00585   try
00586   {
00587     mbl_read_props_look_for_unused_props(
00588           "mcal_pca::config_from_stream", props);
00589   }
00590   catch(mbl_exception_unused_props &e)
00591   {
00592     throw mbl_exception_parse_error(e.what());
00593   }
00594 }