contrib/mul/clsfy/clsfy_adaboost_sorted_builder.cxx
// This is mul/clsfy/clsfy_adaboost_sorted_builder.cxx
#ifdef VCL_NEEDS_PRAGMA_INTERFACE
#pragma implementation
#endif
//:
// \file
// \brief Functions to train classifiers using AdaBoost algorithm
// \author dac
// \date   Fri Mar  1 23:49:39 2002
//
//  Functions to train classifiers using the AdaBoost algorithm.
//  AdaBoost combines a set of (usually simple, weak) classifiers into
//  a more powerful single classifier.  Essentially it selects the
//  classifiers one at a time, choosing the best at each step.
//  Each classifier is trained to distinguish the examples misclassified
//  by the classifiers selected so far.

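// A minimal usage sketch (illustrative only).  The weak builder and data
// wrapper chosen below are assumptions -- any clsfy_builder_1d and any
// mbl_data_wrapper<vnl_vector<double> > should do -- and the setter calls are
// those suggested by the error messages in build(); check the header for the
// exact signatures:
//
//   clsfy_binary_threshold_1d_builder weak_builder;   // example weak learner
//   clsfy_adaboost_sorted_builder builder;
//   builder.set_weak_builder(weak_builder);   // builder for each weak classifier
//   builder.set_max_n_clfrs(50);              // maximum number of boosting rounds
//   builder.set_batch_size(100);              // features sorted per batch
//
//   clsfy_simple_adaboost classifier;
//   mbl_data_array_wrapper<vnl_vector<double> > inputs(training_vectors);
//   double train_error = builder.build(classifier, inputs, 1, outputs);
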
#include "clsfy_adaboost_sorted_builder.h"
#include "clsfy_simple_adaboost.h"
#include "clsfy_builder_1d.h"

#include <vcl_iostream.h>
#include <vcl_cstdlib.h> // for vcl_abort()
#include <vcl_cmath.h>
#include <vcl_ctime.h> // for clock()
#include <vcl_algorithm.h>
#include <vcl_cassert.h>
#include <vbl/vbl_triple.h>
#include <mbl/mbl_file_data_collector.h>
#include <mbl/mbl_data_collector_list.h>

//=======================================================================

clsfy_adaboost_sorted_builder::clsfy_adaboost_sorted_builder()
: save_data_to_disk_(false), bs_(-1), max_n_clfrs_(-1), weak_builder_(0)
{
}

//=======================================================================

clsfy_adaboost_sorted_builder::~clsfy_adaboost_sorted_builder()
{
}


//=======================================================================

bool clsfy_adaboost_sorted_builder::is_class(vcl_string const& s) const
{
  return s == clsfy_adaboost_sorted_builder::is_a() || clsfy_builder_base::is_class(s);
}

//=======================================================================

vcl_string clsfy_adaboost_sorted_builder::is_a() const
{
  return vcl_string("clsfy_adaboost_sorted_builder");
}


//: Build classifier composed of 1d classifiers working on individual vector elements
// Builds an n-component classifier, each component of which is a 1D classifier
// working on a single element of the input vector.
double clsfy_adaboost_sorted_builder::build(clsfy_classifier_base& model,
                                            mbl_data_wrapper<vnl_vector<double> >& inputs,
                                            unsigned /* nClasses */,
                                            const vcl_vector<unsigned> &outputs) const
{
  // N.B. nClasses is ignored; this always builds a binary classifier.

  assert( model.is_class("clsfy_simple_adaboost") );
  clsfy_simple_adaboost &strong_classifier = (clsfy_simple_adaboost&) model;


  // check parameters are OK
  if ( max_n_clfrs_ < 0 )
  {
    vcl_cout<<"Error: clsfy_adaboost_sorted_builder::build\n"
            <<"max_n_clfrs_ = "<<max_n_clfrs_<<" i.e. < 0\n"
            <<"set using set_max_n_clfrs()\n";
    vcl_abort();
  }
  else
  {
    vcl_cout<<"Maximum number of classifiers to be found by AdaBoost = "
            <<max_n_clfrs_<<'\n';
  }

  if ( weak_builder_ == 0 )
  {
    vcl_cout<<"Error: clsfy_adaboost_sorted_builder::build\n"
            <<"weak_builder_ pointer has not been set\n"
            <<"need to provide a builder to build each weak classifier\n"
            <<"set using set_weak_builder()\n";
    vcl_abort();
  }
  else
  {
    vcl_cout<<"Weak learner used by AdaBoost = "
            <<weak_builder_->is_a()<<'\n';
  }

  if ( bs_ < 0 )
  {
    vcl_cout<<"Error: clsfy_adaboost_sorted_builder::build\n"
            <<"bs_ = "<<bs_<<" i.e. < 0\n"
            <<"set using set_batch_size()\n";
    vcl_abort();
  }
  else
  {
    vcl_cout<<"Batch size when sorting data = "<<bs_<<'\n';
  }


  assert(bs_>0);
  assert(bs_!=1);
  assert (max_n_clfrs_ >= 0);

  // First arrange the data in the form
  // vcl_vector< vcl_vector< vbl_triple<double,int,int> > > data
  // + vnl_vector wts
  // then sort all the data once, then build the classifier.
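  //
  // For each feature (vector element) the collected data is a vector of n
  // triples, one per training example:
  //   ( feature value, class label (0 or 1), index of the example )
  // Sorting each of these vectors once up front means the weak builder can
  // work from pre-sorted responses on every boosting round.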

  // number of examples
  unsigned n= inputs.size();
  //vcl_cout<<"n= "<<n<<'\n';

  // Dimensionality of data
  inputs.reset();
  int d = inputs.current().size();

  //need file data wrapper instead of old vector
  //data stored on disk NOT ram
  //vcl_vector< vcl_vector<vbl_triple<double,int,int> > > data(d);

  vcl_string temp_path= "temp.dat";
  mbl_file_data_collector<
      vcl_vector< vbl_triple<double,int,int> >
                         >
              file_collector( temp_path );

  mbl_data_collector_list<
      vcl_vector< vbl_triple<double,int,int> >
                         >
              ram_collector;

  mbl_data_collector<vcl_vector< vbl_triple<double,int,int> >
                         >*  collector;

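  // The choice of collector determines where the sorted per-feature response
  // vectors end up: mbl_file_data_collector streams them to temp_path on disk,
  // whereas mbl_data_collector_list keeps them all in RAM.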
  if (save_data_to_disk_)
  {
    vcl_cout<<"saving data to disk!\n";
    collector= &file_collector;
  }
  else
  {
    //bs_ = n ;
    vcl_cout<<"saving data to ram!\n";
    collector= &ram_collector;
  }

  // Load in and sort the responses for several features at once (a batch of
  // bs_ features per pass); loading and sorting them one at a time is far
  // too slow.

  //int bs= 100; //batch size
  vcl_vector< vcl_vector< vbl_triple<double,int,int> > > vec(bs_);
  vbl_triple<double,int,int> t;

  vcl_cout<<"d= "<<d<<'\n';
  int b=0;
  while ( b<d )
  {
    int r= vcl_min ( bs_, (d-b) );
    assert(r>0);

    vcl_cout<<"sorting weak classifiers = "<<b<<" to "
            <<(b+r)-1<<" of "<<d<<'\n';


    // have to resize all vectors
    for (int i=0; i< bs_; ++i)
      vec[i].resize(0);

    // add data for both classes
    inputs.reset();
    for (unsigned int j=0;j<n;++j)
    {
      for (int i=0; i< r; ++i)
      {
        t.first=inputs.current()[b+i];
        t.second=outputs[j];
        t.third = j;
        vec[i].push_back(t);
      }
      inputs.next();
    }


    for (int i=0; i< r; ++i)
    {
      // sort training data for each individual weak classifier
      assert (vec[i].size() == n);
      assert (n != 0);

      vcl_sort(vec[i].begin(), vec[i].end() );

      // store sorted vector of responses for each individual weak classifier
      collector->record(vec[i]);
    }

    b+=bs_;
  }


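  // At this point each of the d features has had its n responses collected,
  // sorted by value and recorded, in feature order, so the wrapper obtained
  // below is indexed by feature (cf. wrapper.set_index(best_i) further down).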
  mbl_data_wrapper< vcl_vector< vbl_triple<double,int,int> > >&
              wrapper=collector->data_wrapper();


  // now actually apply AdaBoost algorithm
  wrapper.reset();
  assert ( wrapper.current().size() == n );
  assert ( d == (int)wrapper.size() );


  // initialize weights
  unsigned n0=0;
  unsigned n1=0;
  for (unsigned int i=0;i<n;++i)
  {
    if ( outputs[i] == 0 )
      n0++;
    else if ( outputs[i] == 1 )
      n1++;
    else
    {
      vcl_cout<<"Error : clsfy_adaboost_sorted_builder\n"
              <<"unrecognised output value : outputs["<<i<<"]= "
              <<outputs[i]<<'\n';
      vcl_abort();
    }
  }

  assert (n0+n1==n );

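  // Each example of class c starts with weight 0.5/n_c, so each class carries
  // a total weight of 0.5 and the weights sum to 1, however unbalanced the
  // two classes are.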
  vnl_vector<double> wts(n);
  for (unsigned int i=0; i<n; ++i)
  {
    if ( outputs[i] == 0 )
      wts(i)=0.5/n0;
    else if ( outputs[i] == 1 )
      wts(i)=0.5/n1;
    else
    {
      vcl_cout<<"Error : clsfy_adaboost_sorted_builder\n"
              <<"unrecognised output value : outputs["<<i<<"]= "
              <<outputs[i]<<'\n';
      vcl_abort();
    }
  }


  // clear classifier
  // N.B. maybe shouldn't do this if going to build incrementally
  // i.e. by rebuilding the training set from false positives of the
  // current classifier
  strong_classifier.clear();
  strong_classifier.set_n_dims(d);
  // N.B. have to set builder as a member variable elsewhere
  clsfy_classifier_1d* c1d = weak_builder_->new_classifier();
  clsfy_classifier_1d* best_c1d= weak_builder_->new_classifier();
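  // c1d is the scratch classifier rebuilt for every feature in each round;
  // best_c1d keeps a clone of the best one found so far within the round.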

  double beta, alpha;
  long old_time = vcl_clock();
  double tot_time=0;

  for (unsigned int r=0;r<(unsigned)max_n_clfrs_;++r)
  {
    vcl_cout<<"adaboost training round = "<<r<<'\n';

    //vcl_cout<<"wts0= "<<wts0<<"\nwts1= "<<wts1<<'\n';

    int best_i=-1;
    double min_error= 100000;
    wrapper.reset();  // make sure pointing to first data vector
    for (int i=0;i<d;++i)
    {
      const vcl_vector< vbl_triple<double,int,int> >& vec = wrapper.current();

      double error = weak_builder_->build_from_sorted_data(*c1d,&vec[0],wts);
      if (i==0 || error<min_error)
      {
        min_error = error;
        delete best_c1d;
        best_c1d= c1d->clone();
        best_i = i;
      }

      wrapper.next();   // move to next data vector
    }

    assert(best_i != -1);

    vcl_cout<<"best_i= "<<best_i<<'\n'
            <<"min_error= "<<min_error<<'\n';

    if (min_error<1e-10)  // Hooray!
    {
      vcl_cout<<"min_error<1e-10 !!!\n";
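      // With essentially zero training error, beta = err/(1-err) would be ~0
      // and alpha = -log(beta) unbounded, so a large finite vote of log(2n)
      // is assigned to this weak classifier instead.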
      alpha  = vcl_log(2.0*n);   //is this appropriate???
      strong_classifier.add_classifier( best_c1d, alpha, best_i);

      // delete classifiers on heap, because clones taken by strong_classifier
      delete c1d;
      delete best_c1d;
      return clsfy_test_error(strong_classifier, inputs, outputs);
    }


    if (0.5-min_error<1e-10) // Oh dear, no further improvement possible
    {
      vcl_cout<<"min_error => 0.5 !!!\n";
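      // A weighted error of 0.5 means the best weak classifier is doing no
      // better than chance, so further boosting rounds cannot improve the
      // strong classifier; stop and report the training error.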
      beta=1.0;

      // delete classifiers on heap, because clones taken by strong_classifier
      delete c1d;
      delete best_c1d;
      return clsfy_test_error(strong_classifier, inputs, outputs);
    }

    // update the classifier
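    // Standard discrete AdaBoost update:
    //   beta  = err/(1-err)   (< 1, since err < 0.5 at this point)
    //   alpha = -log(beta)    (the vote given to this weak classifier)
    // Correctly classified examples then have their weights multiplied by
    // beta, shifting emphasis onto the examples that are still misclassified.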
    beta = min_error/(1.0-min_error);
    alpha  = -1.0*vcl_log(beta);
    strong_classifier.add_classifier( best_c1d, alpha, best_i);


    // extract the best weak classifier results
    wrapper.set_index(best_i);
    const vcl_vector< vbl_triple<double,int,int> >& vec = wrapper.current();

    // update the wts using the best weak classifier
    for (unsigned int j=0;j<n;++j)
      if ( best_c1d->classify( vec[j].first ) == (unsigned) vec[j].second)
        wts[vec[j].third]*=beta;

    double w_sum= wts.mean()*n;
    wts/=w_sum;
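    // wts.mean()*n is simply the sum of the weights, so this renormalizes
    // them to sum to 1 ready for the next round.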

    long new_time = vcl_clock();

    double dt = (1.0*(new_time-old_time))/CLOCKS_PER_SEC;
    vcl_cout<<"Time for AdaBoost round:      "<<dt<<" secs\n";
    tot_time+=dt;
    vcl_cout<<"Total time for rounds so far: "<<tot_time<<" secs\n";

    old_time = new_time;
  }

  delete c1d;
  delete best_c1d;

  // Does clsfy_test_error balk if there is too much data?  It should be OK:
  // it works through the mbl_data_wrapper one example at a time, so a
  // file-backed wrapper (mbl_file_data_wrapper) is handled as well.
  vcl_cout<<"calculating training error\n";
  return clsfy_test_error(strong_classifier, inputs, outputs);
}


//: Create empty classifier
// Caller is responsible for deletion
clsfy_classifier_base* clsfy_adaboost_sorted_builder::new_classifier() const
{
  return new clsfy_simple_adaboost();
}


//=======================================================================

#if 0
    // required if data stored on the heap is present in this derived class
clsfy_adaboost_sorted_builder::clsfy_adaboost_sorted_builder(const clsfy_adaboost_sorted_builder& new_b):
  data_ptr_(0)
{
  *this = new_b;
}


//=======================================================================

    // required if data stored on the heap is present in this derived class
clsfy_adaboost_sorted_builder& clsfy_adaboost_sorted_builder::operator=(const clsfy_adaboost_sorted_builder& new_b)
{
  if (&new_b==this) return *this;

  // Copy heap member variables.
  delete data_ptr_; data_ptr_=0;

  if (new_b.data_ptr_)
    data_ptr_ = new_b.data_ptr_->clone();

  // Copy normal member variables
  data_ = new_b.data_;

  return *this;
}
#endif // 0

//=======================================================================

clsfy_builder_base* clsfy_adaboost_sorted_builder::clone() const
{
  return new clsfy_adaboost_sorted_builder(*this);
}

//=======================================================================

    // required if data is present in this base class
void clsfy_adaboost_sorted_builder::print_summary(vcl_ostream& /*os*/) const
{
#if 0
  clsfy_builder_base::print_summary(os); // Uncomment this line if it has one.
  vsl_print_summary(os, data_); // Example of data output
#endif

  vcl_cerr << "clsfy_adaboost_sorted_builder::print_summary() NYI\n";
}

//=======================================================================

  // required if data is present in this base class
void clsfy_adaboost_sorted_builder::b_write(vsl_b_ostream& /*bfs*/) const
{
#if 0
  vsl_b_write(bfs, version_no());
  clsfy_builder_base::b_write(bfs);  // Needed if base has any data
  vsl_b_write(bfs, data_);
#endif
  vcl_cerr << "clsfy_adaboost_sorted_builder::b_write() NYI\n";
}

//=======================================================================

  // required if data is present in this base class
void clsfy_adaboost_sorted_builder::b_read(vsl_b_istream& /*bfs*/)
{
  vcl_cerr << "clsfy_adaboost_sorted_builder::b_read() NYI\n";
#if 0
  if (!bfs) return;

  short version;
  vsl_b_read(bfs,version);
  switch (version)
  {
   case 1:
    //clsfy_builder_base::b_read(bfs);  // Needed if base has any data
    vsl_b_read(bfs,data_);
    break;
   default:
    vcl_cerr << "I/O ERROR: vsl_b_read(vsl_b_istream&, clsfy_adaboost_sorted_builder&)\n"
             << "           Unknown version number "<< version << '\n';
    bfs.is().clear(vcl_ios::badbit); // Set an unrecoverable IO error on stream
    return;
  }
#endif // 0
}