00001 #include "mcal_pca.h"
00002
00003
00004 #include <vcl_cstdlib.h>
00005 #include <vcl_string.h>
00006 #include <vcl_vector.h>
00007 #include <vcl_cmath.h>
00008 #include <vcl_sstream.h>
00009
00010 #include <vsl/vsl_indent.h>
00011 #include <vsl/vsl_binary_io.h>
00012 #include <vnl/algo/vnl_symmetric_eigensystem.h>
00013 #include <vnl/vnl_math.h>
00014 #include <vul/vul_string.h>
00015 #include <mbl/mbl_parse_block.h>
00016 #include <mbl/mbl_read_props.h>
00017 #include <mbl/mbl_matrix_products.h>
00018 #include <mbl/mbl_exception.h>
00019
00020
00021
00022
00023
00024 mcal_pca::mcal_pca()
00025 {
00026 min_modes_ = 0;
00027 max_modes_ = 9999;
00028 var_prop_ = 0.98;
00029
00030 max_d_in_memory_ = 1.0e8;
00031 use_chunks_=false;
00032 }
00033
00034
00035
00036
00037
00038 mcal_pca::~mcal_pca()
00039 {
00040 }
00041
00042
00043
00044 unsigned mcal_pca::choose_n_modes(const vnl_vector<double>& evals)
00045 {
00046 if (evals.size()==0) return 0;
00047 const double* v_data = evals.begin();
00048 unsigned n=evals.size();
00049
00050 if (var_prop_==0.0) return 0;
00051
00052
00053 unsigned i=0;
00054 while (i<n && v_data[i]>0) i++;
00055 n = i;
00056
00057 if (n==0) return 0;
00058
00059 if ((var_prop_>=1.0) || (var_prop_<=0.0))
00060 {
00061 if (n>max_modes_ )
00062 return max_modes_;
00063 else
00064 return n;
00065 }
00066
00067
00068 double total_var=0.0;
00069 for (i=0;i<n;++i) total_var += v_data[i];
00070 if (total_var==0.0) return 0;
00071
00072 double target_var = total_var * var_prop_;
00073
00074
00075 double var=0.0;
00076 unsigned n_modes=0;
00077 while (var<target_var && n_modes<n)
00078 { var+=v_data[n_modes]; n_modes++; }
00079
00080 if (n_modes>max_modes_) return max_modes_;
00081 if (n_modes<min_modes_ && min_modes_<=n) return min_modes_;
00082 return n_modes;
00083 }
00084
00085
00086 void mcal_pca::build_evecs_nd_smaller(mbl_data_wrapper<vnl_vector<double> >& data,
00087 const vnl_vector<double>& mean,
00088 vnl_matrix<double>& evecs,
00089 vnl_vector<double>& evals)
00090 {
00091 data.reset();
00092 unsigned int n_samples = data.size();
00093 unsigned int n_dims = data.current().size();
00094
00095 vnl_matrix<double> S(n_dims,n_dims);
00096 double **s_data = S.data_array();
00097 S.fill(0);
00098
00099 vnl_vector<double> dv;
00100 for (unsigned int k=0;k<n_samples;++k)
00101 {
00102 dv = data.current();
00103 dv-=mean;
00104 const double* v_data = dv.begin();
00105 for (unsigned int i=0;i<n_dims;++i)
00106 {
00107 double *s_row = s_data[i];
00108 double v_i = v_data[i];
00109 for (unsigned int j=0;j<n_dims;++j)
00110 s_row[j] += v_i*v_data[j];
00111 }
00112
00113 data.next();
00114 }
00115
00116 S/=n_samples;
00117
00118
00119 vnl_matrix<double> EVecs(n_dims,n_dims);
00120 vnl_vector<double> EVals(n_dims);
00121 vnl_symmetric_eigensystem_compute(S,EVecs,EVals);
00122
00123
00124 evals = EVals;
00125 evals.flip();
00126 unsigned n_modes = choose_n_modes(evals);
00127
00128
00129 evecs.set_size(n_dims,n_modes);
00130 evals.set_size(n_modes);
00131 double **EV_data = EVecs.data_array();
00132 double **ev_data = evecs.data_array();
00133
00134 for (unsigned i=0;i<n_dims;++i)
00135 {
00136
00137 double *EV = EV_data[i]-1;
00138 double *ev = ev_data[i];
00139 for (unsigned j=0;j<n_modes;++j) ev[j]=EV[n_dims-j];
00140 }
00141 for (unsigned j=0;j<n_modes;++j)
00142 evals[j]=EVals[n_dims-1-j];
00143 }
00144
00145
00146 void mcal_pca::build_evecs_ns_smaller(mbl_data_wrapper<vnl_vector<double> >& data,
00147 const vnl_vector<double>& mean,
00148 vnl_matrix<double>& evecs,
00149 vnl_vector<double>& evals)
00150 {
00151 data.reset();
00152
00153
00154 unsigned int n_samples = data.size();
00155 unsigned int n_dims = data.current().size();
00156
00157 double n_total = n_samples * n_dims;
00158 unsigned int n_chunks = 1 + int(2*n_total/max_d_in_memory_);
00159 double chunk_size = double(n_samples)/n_chunks;
00160
00161 vnl_matrix<double> DDt(n_samples,n_samples);
00162 vnl_matrix<double> D;
00163 vnl_vector<double> dv;
00164
00165 static bool message_given_before = false;
00166 if (!use_chunks_ && n_chunks>1 && !message_given_before)
00167 {
00168 message_given_before=true;
00169 vcl_cerr<<"\n"
00170 <<"WARNING - mcal_pca: The size of the matrix is\n"
00171 <<" possibly too large for the memory. If the build fails, try\n"
00172 <<" setting set_use_chunks to true in the mcal_pca.\n\n";
00173 }
00174
00175
00176
00177 if (!use_chunks_ || n_chunks<=2 )
00178 {
00179
00180 D.set_size(n_samples,n_dims);
00181 for (unsigned int i=0;i<n_samples;i++)
00182 {
00183 dv = data.current();
00184 dv -= mean;
00185 D.set_row(i,dv);
00186 data.next();
00187 }
00188
00189 mbl_matrix_product_a_bt(DDt,D,D);
00190 }
00191 else
00192 {
00193 vcl_cout<<vsl_indent()<<"mcal_pca: Constructing DD_t in "
00194 <<n_chunks<<" passes."<<vcl_endl;
00195
00196
00197 vcl_vector<int> start(n_chunks),end(n_chunks);
00198 start[0] = 0;
00199 end[0] = vnl_math_rnd(chunk_size)-1;
00200 for (unsigned int i=1;i<n_chunks;i++)
00201 {
00202 start[i] = end[i-1]+1;
00203 end[i] = vnl_math_rnd((i+1)*chunk_size) -1;
00204 }
00205 end[n_chunks-1] = n_samples-1;
00206
00207 vnl_matrix<double> D1,D2,DDt1;
00208 for (unsigned int i=0;i<n_chunks;i++)
00209 {
00210 int n = 1+end[i]-start[i];
00211 D1.set_size(n,n_dims);
00212 data.set_index(start[i]);
00213 for (int j=0;j<n;++j)
00214 {
00215 dv=data.current();
00216 dv-=mean;
00217 D1.set_row(j,dv);
00218 data.next();
00219 }
00220
00221 mbl_matrix_product_a_bt(DDt1,D1,D1);
00222 fillDDt(DDt,DDt1,start[i],end[i],start[i],end[i]);
00223
00224 for (unsigned int j=i+1;j<n_chunks;j++)
00225 {
00226
00227 int n = 1+end[j]-start[j];
00228 D2.set_size(n,n_dims);
00229 data.set_index(start[j]);
00230 for (int k=0;k<n;++k)
00231 {
00232 dv=data.current();
00233 dv-=mean;
00234 D2.set_row(k,dv);
00235 data.next();
00236 }
00237
00238
00239 mbl_matrix_product_a_bt(DDt1,D1,D2);
00240 fillDDt(DDt,DDt1,start[i],end[i],start[j],end[j]);
00241 }
00242 }
00243 }
00244
00245 DDt/=n_samples;
00246
00247
00248 vnl_matrix<double> EVecs(n_samples,n_samples);
00249 vnl_vector<double> EVals;
00250 evals.set_size(n_samples);
00251 vnl_symmetric_eigensystem_compute(DDt,EVecs,EVals);
00252
00253
00254 EVals.flip();
00255 EVecs.fliplr();
00256
00257 unsigned n_modes = 0;
00258 if (n_samples>=2)
00259 n_modes = choose_n_modes(EVals);
00260
00261 if (n_modes==0)
00262 {
00263 evals.set_size(0);
00264 evecs.set_size(0,0);
00265 return;
00266 }
00267
00268
00269 evals.set_size(n_modes);
00270 for (unsigned i=0;i<n_modes;++i) evals[i]=EVals[i];
00271
00272
00273 evecs.set_size(n_dims,n_modes);
00274 double **EV_data = EVecs.data_array();
00275 double **ev_data = evecs.data_array();
00276 evecs.fill(0);
00277
00278 double **D_data = D.data_array();
00279 if (!use_chunks_ || n_chunks<=2)
00280 {
00281 for (unsigned int i=0;i<n_samples;++i)
00282 {
00283 double* dv_data = D_data[i];
00284 double* EV_row = EV_data[i];
00285 for (unsigned int j=0;j<n_modes;++j)
00286 {
00287 double EV = EV_row[j];
00288 for (unsigned int k=0;k<n_dims;++k)
00289 ev_data[k][j] += dv_data[k]*EV;
00290 }
00291 }
00292 }
00293 else
00294 {
00295 vnl_vector<double> dv;
00296 data.reset();
00297 for (unsigned int i=0;i<n_samples;++i)
00298 {
00299 dv = data.current();
00300 dv -= mean;
00301 double* dv_data = dv.begin();
00302 double* EV_row = EV_data[i];
00303 for (unsigned int j=0;j<n_modes;++j)
00304 {
00305 double EV = EV_row[j];
00306 for (unsigned int k=0;k<n_dims;++k)
00307 ev_data[k][j] += dv_data[k]*EV;
00308 }
00309
00310 data.next();
00311 }
00312 }
00313
00314
00315 vnl_vector<double> col_sum(n_modes);
00316 col_sum.fill(0);
00317 double* cs = col_sum.begin();
00318 for (unsigned int k=0;k<n_dims;k++)
00319 {
00320 const double* row = ev_data[k];
00321 for (unsigned int j=0;j<n_modes;++j)
00322 cs[j]+=row[j]*row[j];
00323 }
00324 for (unsigned int j=0;j<n_modes;++j) cs[j]=1.0/vcl_sqrt(cs[j]);
00325 for (unsigned int k=0;k<n_dims;k++)
00326 {
00327 double* row = ev_data[k];
00328 for (unsigned int j=0;j<n_modes;++j)
00329 row[j]*=cs[j];
00330 }
00331 }
00332
00333
00334 void mcal_pca::fillDDt(vnl_matrix<double>& DDt, const vnl_matrix<double>& A,
00335 int rlo, int rhi, int clo, int chi)
00336 {
00337 double **DDt_data = DDt.data_array();
00338 const double *const*A_data = A.data_array();
00339
00340 for (int r=rlo;r<=rhi;++r)
00341 {
00342 double *DDt_row = DDt_data[r];
00343 const double* A_row = A_data[r-rlo]-clo;
00344 for (int c=clo;c<=chi;c++)
00345 {
00346 DDt_row[c] = A_row[c];
00347 DDt_data[c][r] = A_row[c];
00348 }
00349 }
00350 }
00351
00352 void mcal_pca::set_mode_choice(unsigned min, unsigned max,
00353 double var_proportion)
00354 {
00355 min_modes_ = min;
00356 max_modes_ = max;
00357 var_prop_ = var_proportion;
00358 }
00359
00360
00361 void mcal_pca::set_min_modes( const unsigned min )
00362 {
00363 min_modes_ = min;
00364 }
00365
00366
00367
00368 unsigned mcal_pca::min_modes() const
00369 {
00370 return min_modes_;
00371 }
00372
00373 void mcal_pca::set_max_modes(unsigned max)
00374 {
00375 max_modes_ = max;
00376 }
00377
00378
00379
00380 unsigned mcal_pca::max_modes() const
00381 {
00382 return max_modes_;
00383 }
00384
00385
00386 void mcal_pca::set_var_prop(double v)
00387 {
00388 var_prop_ = v;
00389 }
00390
00391
00392 double mcal_pca::var_prop() const
00393 {
00394 return var_prop_;
00395 }
00396
00397 void mcal_pca::set_max_d_in_memory(double max_n)
00398 {
00399 max_d_in_memory_ = max_n;
00400 }
00401
00402
00403 void mcal_pca::set_use_chunks(bool chunks)
00404 {
00405 use_chunks_=chunks;
00406 }
00407
00408
00409
00410
00411 void mcal_pca::build_about_mean(mbl_data_wrapper<vnl_vector<double> >& data,
00412 const vnl_vector<double>& mean,
00413 vnl_matrix<double>& modes,
00414 vnl_vector<double>& mode_var)
00415 {
00416 if (data.size()==0)
00417 {
00418 vcl_cerr<<"mcal_pca::build_about_mean() No samples supplied.\n";
00419 vcl_abort();
00420 }
00421
00422 data.reset();
00423
00424 if (data.current().size()==0)
00425 {
00426 vcl_cerr<<"mcal_pca::build_about_mean()\n"
00427 <<"Warning: Samples claim to have zero dimensions.\n"
00428 <<"Constructing empty model.\n";
00429
00430 modes.set_size(0,0);
00431 mode_var.set_size(0);
00432 return;
00433 }
00434
00435
00436 if (data.current().size()<data.size())
00437 build_evecs_nd_smaller(data,mean,modes,mode_var);
00438 else
00439 build_evecs_ns_smaller(data,mean,modes,mode_var);
00440 }
00441
00442
00443
00444
00445
00446
00447 vcl_string mcal_pca::is_a() const
00448 {
00449 return vcl_string("mcal_pca");
00450 }
00451
00452
00453
00454
00455
00456 short mcal_pca::version_no() const
00457 {
00458 return 1;
00459 }
00460
00461
00462
00463
00464
00465 mcal_component_analyzer* mcal_pca::clone() const
00466 {
00467 return new mcal_pca(*this);
00468 }
00469
00470
00471
00472
00473
00474 void mcal_pca::print_summary(vcl_ostream& os) const
00475 {
00476 os<<'\n'
00477 <<vsl_indent()<<"min_modes: "<<min_modes_<<'\n'
00478 <<vsl_indent()<<"max_modes: "<<max_modes_<<'\n'
00479 <<vsl_indent()<<"var_prop: "<<var_prop_<<'\n'
00480 <<vsl_indent()<<"max_d_in_memory: "<<max_d_in_memory_<<'\n'
00481 <<vsl_indent()<<"use_chunks: "<<use_chunks_<<'\n';
00482 }
00483
00484
00485
00486
00487
00488 void mcal_pca::b_write(vsl_b_ostream& bfs) const
00489 {
00490 vsl_b_write(bfs,version_no());
00491 vsl_b_write(bfs,min_modes_);
00492 vsl_b_write(bfs,max_modes_);
00493 vsl_b_write(bfs,var_prop_);
00494 vsl_b_write(bfs,max_d_in_memory_);
00495 vsl_b_write(bfs,use_chunks_);
00496 }
00497
00498
00499
00500
00501
00502 void mcal_pca::b_read(vsl_b_istream& bfs)
00503 {
00504 short version;
00505 vsl_b_read(bfs,version);
00506 switch (version)
00507 {
00508 case 1:
00509 vsl_b_read(bfs,min_modes_);
00510 vsl_b_read(bfs,max_modes_);
00511 vsl_b_read(bfs,var_prop_);
00512 vsl_b_read(bfs,max_d_in_memory_);
00513 vsl_b_read(bfs,use_chunks_);
00514 break;
00515 default:
00516 vcl_cerr << "mcal_pca::b_read()\n"
00517 << "Unexpected version number " << version << vcl_endl;
00518 vcl_abort();
00519 }
00520 }
00521
00522
00523
00524
00525
00526
00527
00528
00529
00530
00531
00532
00533
00534
00535 void mcal_pca::config_from_stream(vcl_istream & is)
00536 {
00537 vcl_string s = mbl_parse_block(is);
00538
00539 vcl_istringstream ss(s);
00540 mbl_read_props_type props = mbl_read_props_ws(ss);
00541
00542 {
00543 int m_min =0,m_max=999;
00544 double m_props=0.05;
00545 if (props.find("min_modes")!=props.end())
00546 {
00547 m_min=vul_string_atoi(props["min_modes"]);
00548 props.erase("min_modes");
00549 }
00550
00551 if (props.find("max_modes")!=props.end())
00552 {
00553 m_max=vul_string_atoi(props["max_modes"]);
00554 props.erase("max_modes");
00555 }
00556
00557 if (props.find("var_prop")!=props.end())
00558 {
00559 m_props=vul_string_atof(props["var_prop"]);
00560 props.erase("var_prop");
00561 }
00562
00563 set_mode_choice(m_min,m_max,m_props);
00564 }
00565
00566 if (props.find("max_d_in_memory")!=props.end())
00567 {
00568 max_d_in_memory_=vul_string_atof(props["max_d_in_memory"]);
00569 props.erase("max_d_in_memory");
00570 }
00571
00572
00573 {
00574 if (!props["use_chunks"].empty())
00575 {
00576 use_chunks_ = vul_string_to_bool(props["use_chunks"]);
00577 props.erase("use_chunks");
00578 }
00579 else
00580 use_chunks_=false;
00581
00582 props.erase("use_chunks");
00583 }
00584
00585 try
00586 {
00587 mbl_read_props_look_for_unused_props(
00588 "mcal_pca::config_from_stream", props);
00589 }
00590 catch(mbl_exception_unused_props &e)
00591 {
00592 throw mbl_exception_parse_error(e.what());
00593 }
00594 }