1 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0//EN">
3 <link rel="STYLESHEET" href="opencvref.css" charset="ISO-8859-1" type="text/css">
4 <title>Machine Learning Reference</title>
7 <h1>Machine Learning Reference</h1>
9 <p font-size="120%" color="#0000ff"> </p>
12 <!-- <li><a href=#ch_introduction>Introduction</a> -->
13 <li><a href=#ch_introduction>Introduction. Common classes and functions</a>
14 <li><a href=#ch_nbayes>Normal Bayes Classifier</a>
15 <li><a href=#ch_knn>K Nearest Neighbors</a>
16 <li><a href=#ch_svm>Support Vector Machines</a>
17 <li><a href=#ch_dtree>Decision Trees</a>
18 <li><a href=#ch_boosting>Boosting</a>
19 <li><a href=#ch_randomforest>Random Trees</a>
20 <li><a href=#ch_em>Expectation-Maximization</a>
21 <li><a href=#ch_ann>Neural Networks</a>
22 <!-- <li><a href=#ch_cnn>Convolutional Neural Networks</a>
23 <li><a href=#ch_crossvalid>Cross-validation method of accuracy estimation</a> -->
26 <hr><h2><a name="ch_introduction">Introduction. Common classes and functions</a></h2>
28 <!-- *****************************************************************************************
29 *****************************************************************************************
30 ***************************************************************************************** -->
32 <p>The Machine Learning Library (MLL) is a set of classes and functions for statistical classification,
33 regression and clustering of data.</p>
35 <p>Most of the classification and regression algorithms are implemented as C++ classes.
36 As the algorithms have different set of features (like ability to handle missing measurements,
37 or categorical input variables etc.), there is a little common ground between the classes.
38 This common ground is defined by the class <code>CvStatModel</code> that all
39 the other ML classes are derived from.
41 <hr><h3><a name="decl_CvStatModel">CvStatModel</a></h3>
42 <p class="Blurb">Base class for statistical models in ML</p>
47 <i>/* <a href="#decl_CvStatModel_c">CvStatModel</a>(); */</i>
48 <i>/* <a href="#decl_CvStatModel_ct">CvStatModel</a>( const CvMat* train_data ... ); */</i>
50 virtual <a href="#decl_CvStatModel_d">~CvStatModel</a>();
52 virtual void <a href="#decl_CvStatModel_clear">clear()</a>=0;
54 <i>/* virtual bool <a href="#decl_CvStatModel_train">train</a>( const CvMat* train_data, [int tflag,] ..., const CvMat* responses, ...,
55 [const CvMat* var_idx,] ..., [const CvMat* sample_idx,] ...
56 [const CvMat* var_type,] ..., [const CvMat* missing_mask,] <misc_training_alg_params> ... )=0;
59 <i>/* virtual float <a href="#decl_CvStatModel_predict">predict</a>( const CvMat* sample ... ) const=0; */</i>
61 virtual void <a href="#decl_CvStatModel_save">save</a>( const char* filename, const char* name=0 )=0;
62 virtual void <a href="#decl_CvStatModel_load">load</a>( const char* filename, const char* name=0 )=0;
64 virtual void <a href="#decl_CvStatModel_write">write</a>( CvFileStorage* storage, const char* name )=0;
65 virtual void <a href="#decl_CvStatModel_read">read</a>( CvFileStorage* storage, CvFileNode* node )=0;
70 In this declaration some methods are commented off.
71 Actually, these are methods for which there is no unified API
72 (with the exception of the default constructor), however, there are many similarities
73 in the syntax and semantics that are briefly described below in this section,
74 as if they are a part of the base class.
78 <hr><h3><a name="decl_CvStatModel_c">CvStatModel::CvStatModel</a></h3>
79 <p class="Blurb">Default constructor</p>
81 CvStatModel::CvStatModel();
83 <p>Each statistical model class in ML has default constructor without parameters.
84 This constructor is useful for 2-stage model construction, when the default constructor
85 is followed by <a href="#decl_CvStatModel_train">train()</a> or
86 <a href="#decl_CvStatModel_load">load()</a>.
89 <hr><h3><a name="decl_CvStatModel_ct">CvStatModel::CvStatModel(...)</a></h3>
90 <p class="Blurb">Training constructor</p>
92 CvStatModel::CvStatModel( const CvMat* train_data ... );
94 <p>Most ML classes provide single-step construct+train constructor.
95 This constructor is equivalent to the default constructor, followed by the
96 <a href="#decl_CvStatModel_train">train()</a> method with the parameters that
97 passed to the constructor.</p>
100 <hr><h3><a name="decl_CvStatModel_d">CvStatModel::~CvStatModel</a></h3>
101 <p class="Blurb">Virtual destructor</p>
103 CvStatModel::~CvStatModel();
106 The destructor of the base class is declared as virtual,
107 so it is safe to write the following code:
112 model = new CvSVM(... /* SVM params */);
114 model = new CvDTree(... /* Decision tree params */);
119 Normally, the destructor of each derived class does nothing, but calls
120 the overridden method <a href="#decl_CvStatModel_clear">clear()</a> that deallocates all the memory.
124 <hr><h3><a name="decl_CvStatModel_clear">CvStatModel::clear</a></h3>
125 <p class="Blurb">Deallocates memory and resets the model state</p>
127 void CvStatModel::clear();
130 The method <code>clear</code> does the same job as the destructor, i.e. it deallocates all the memory
131 occupied by the class members. But the object itself is not destructed, and it can be reused further.
132 This method is called from the destructor, from the <code>train</code> methods of the derived classes,
133 from the methods <a href="#decl_CvStatModel_load">load()</a>,
134 <a href="#decl_CvStatModel_read">read()</a> etc., or even explicitly by user.
138 <hr><h3><a name="decl_CvStatModel_save">CvStatModel::save</a></h3>
139 <p class="Blurb">Saves the model to file</p>
141 void CvStatModel::save( const char* filename, const char* name=0 );
144 The method <code>save</code> stores the complete model state to the specified XML or YAML file
145 with the specified name or default name (that depends on the particular class).
146 <a href="opencvref_cxcore.htm#cxcore_persistence">Data persistence</a>
147 functionality from cxcore is used.
151 <hr><h3><a name="decl_CvStatModel_load">CvStatModel::load</a></h3>
152 <p class="Blurb">Loads the model from file</p>
154 void CvStatModel::load( const char* filename, const char* name=0 );
157 The method <code>load</code> loads the complete model state with the specified
159 (or default model-dependent name) from the specified XML or YAML file. The
160 previous model state is cleared by <a href="#decl_CvStatModel_clear">clear()</a>.</p>
162 Note that the method is virtual, therefore any model can be loaded using this virtual method.
163 However, unlike the C types of OpenCV that can be loaded using generic
164 <a href="opencvref_cxcore.htm#decl_cvLoad">cvLoad()</a>, in this case the model type
165 must be known anyway, because an empty model, an instance of the appropriate class,
166 must be constructed beforehand.
167 This limitation will be removed in the later ML versions.
171 <hr><h3><a name="decl_CvStatModel_write">CvStatModel::write</a></h3>
172 <p class="Blurb">Writes the model to file storage</p>
174 void CvStatModel::write( CvFileStorage* storage, const char* name );
177 The method <code>write</code> stores the complete model state
179 with the specified name or default name (that depends on the particular class).
180 The method is called by <a href="#decl_CvStatModel_save">save()</a>.
184 <hr><h3><a name="decl_CvStatModel_read">CvStatModel::read</a></h3>
185 <p class="Blurb">Reads the model from file storage</p>
187 void CvStatModel::read( CvFileStorage* storage, CvFileNode* node );
190 The method <code>read</code> restores the complete model state from the specified node
191 of the file storage. The node must be located by user, for example, using
192 the function <a href="opencvref_cxcore.htm#decl_cvGetFileNodeByName">cvGetFileNodeByName()</a>.
193 The method is called by <a href="#decl_CvStatModel_load">load()</a>.
195 <p>The previous model state is cleared by <a href="#decl_CvStatModel_clear">clear()</a>.
199 <hr><h3><a name="decl_CvStatModel_train">CvStatModel::train</a></h3>
200 <p class="Blurb">Trains the model</p>
202 bool CvStatModel::train( const CvMat* train_data, [int tflag,] ..., const CvMat* responses, ...,
203 [const CvMat* var_idx,] ..., [const CvMat* sample_idx,] ...
204 [const CvMat* var_type,] ..., [const CvMat* missing_mask,] <misc_training_alg_params> ... );
207 The method trains the statistical model using a set of input feature vectors and the corresponding
208 output values (responses). Both input and output vectors/values are passed as matrices.
209 By default the input feature vectors are stored as <code>train_data</code> rows,
210 i.e. all the components (features) of a training vector are stored
211 continuously. However, some algorithms can handle the transposed representation,
212 when all values of each particular feature (component/input variable)
213 over the whole input set are stored continuously. If both layouts are supported, the method
214 includes <code>tflag</code> parameter that specifies the orientation:<br>
215 <code>tflag=CV_ROW_SAMPLE</code> means that the feature vectors are stored as rows,<br>
216 <code>tflag=CV_COL_SAMPLE</code> means that the feature vectors are stored as columns.<br>
217 The <code>train_data</code> must have
218 <code>32fC1</code> (32-bit floating-point, single-channel) format.
219 Responses are usually stored in the 1d
220 vector (a row or a column) of <code>32sC1</code> (only in the classification problem) or
221 <code>32fC1</code> format, one value per an input vector (although some algorithms, like various flavors of neural nets,
222 take vector responses).<p>For classification problems the responses are discrete class labels,
223 for regression problems - the responses are values of the function to be approximated.
224 Some algorithms can deal only with classification problems, some - only with regression problems,
225 and some can deal with both problems. In the latter case the type of output variable is either passed as separate parameter,
226 or as a last element of <code>var_type</code> vector:<br>
227 <code>CV_VAR_CATEGORICAL</code> means that the output values are discrete class labels,<br>
228 <code>CV_VAR_ORDERED(=CV_VAR_NUMERICAL)</code> means that the output values are ordered, i.e.
229 2 different values can be compared as numbers, and this is a regression problem<br>
230 The types of input variables can be also specified using <code>var_type</code>.
231 Most algorithms can handle only ordered input variables.
234 Many models in the ML may be trained on a selected feature subset,
235 and/or on a selected sample subset of the training set. To make it easier for user,
236 the method <code>train</code> usually includes <code>var_idx</code>
237 and <code>sample_idx</code> parameters. The former identifies variables (features) of interest,
238 and the latter identifies samples of interest. Both vectors are either integer (<code>32sC1</code>)
239 vectors, i.e. lists of 0-based indices, or 8-bit (<code>8uC1</code>) masks of active variables/samples.
240 User may pass <code>NULL</code> pointers instead of either of the arguments, meaning that
241 all the variables/samples are used for training.
243 <p>Additionally some algorithms can handle missing measurements,
244 that is when certain features of
245 certain training samples have unknown values (for example, they forgot to
246 measure a temperature
247 of patient A on Monday). The parameter <code>missing_mask</code>, 8-bit matrix of the same size as
248 <code>train_data</code>, is used to mark the missed values (non-zero elements of the mask).</p>
249 <p>Usually, the previous model state is cleared by
250 <a href="#decl_CvStatModel_clear">clear()</a> before running the training procedure.
251 However, some algorithms may optionally update the model state with the new
252 training data, instead of resetting it.</p>
255 <hr><h3><a name="decl_CvStatModel_predict">CvStatModel::predict</a></h3>
256 <p class="Blurb">Predicts the response for sample</p>
258 float CvStatModel::predict( const CvMat* sample[, <prediction_params>] ) const;
261 The method is used to predict the response for a new sample. In case of
262 classification the method returns the class label, in case of regression - the
263 output function value. The input sample must have as many components as the
264 <code>train_data</code> passed to <a href="#decl_CvStatModel_train">train</a>
265 contains. If the <code>var_idx</code> parameter is passed to <a href="#decl_CvStatModel_train">train</a>,
266 it is remembered and then is used to extract only the necessary components from the input sample
267 in the method <code>predict</code>.</p>
268 <p>The suffix "const" means that prediction does not affect the internal model state, so
269 the method can be safely called from within different threads.</p>
271 <!-- *****************************************************************************************
272 *****************************************************************************************
273 ***************************************************************************************** -->
275 <hr><h2><a name="ch_nbayes">Normal Bayes Classifier</a></h2>
276 This is a simple classification model assuming that feature vectors from each class
277 are normally distributed (though, not necessarily independently distributed),
278 so the whole data distribution function is assumed to be a Gaussian mixture, one component per a class.
279 Using the training data the algorithm estimates mean vectors and covariation matrices for every class,
280 and then it uses them for prediction.
282 <p><a name="NBC_paper"></a><b>
283 [Fukunaga90] K. Fukunaga.
284 Introduction to Statistical Pattern Recognition. second ed., New York: Academic Press, 1990.
288 <hr><h3><a name="decl_CvNormalBayesClassifier">CvNormalBayesClassifier</a></h3>
289 <p class="Blurb">Bayes classifier for normally distributed data</p>
291 class CvNormalBayesClassifier : public CvStatModel
294 CvNormalBayesClassifier();
295 virtual ~CvNormalBayesClassifier();
297 CvNormalBayesClassifier( const CvMat* _train_data, const CvMat* _responses,
298 const CvMat* _var_idx=0, const CvMat* _sample_idx=0 );
300 virtual bool train( const CvMat* _train_data, const CvMat* _responses,
301 const CvMat* _var_idx = 0, const CvMat* _sample_idx=0, bool update=false );
303 virtual float predict( const CvMat* _samples, CvMat* results=0 ) const;
304 virtual void clear();
306 virtual void save( const char* filename, const char* name=0 );
307 virtual void load( const char* filename, const char* name=0 );
309 virtual void write( CvFileStorage* storage, const char* name );
310 virtual void read( CvFileStorage* storage, CvFileNode* node );
316 <hr><h3><a name="decl_CvNormalBayesClassifier_train">CvNormalBayesClassifier::train</a></h3>
317 <p class="Blurb">Trains the model</p>
319 bool CvNormalBayesClassifier::train( const CvMat* _train_data, const CvMat* _responses,
320 const CvMat* _var_idx = 0, const CvMat* _sample_idx=0, bool update=false );
323 The method trains the Normal Bayes classifier. It follows the conventions of
324 generic <a href="#decl_CvStatModel_train">train</a> "method" with the following limitations:
325 only CV_ROW_SAMPLE data layout is supported; the input variables are all ordered;
326 the output variable is categorical (i.e. elements of <code>_responses</code>
327 must be integer numbers, though the vector may have <code>32fC1</code> type),
328 missing measurements are not supported.</p>
329 <p>In addition, there is <code>update</code> flag that identifies, whether the model should
330 be trained from scratch (<code>update=false</code>) or should be updated using new training data
331 (<code>update=true</code>).
335 <hr><h3><a name="decl_CvNormalBayesClassifier_predict">CvNormalBayesClassifier::predict</a></h3>
336 <p class="Blurb">Predicts the response for sample(s)</p>
338 float CvNormalBayesClassifier::predict( const CvMat* samples, CvMat* results=0 ) const;
341 The method <code>predict</code> estimates the most probable classes for the input vectors.
342 The input vectors (one or more) are stored as rows of the matrix <code>samples</code>.
343 In case of multiple input vectors, there should be output vector <code>results</code>.
344 The predicted class for a single input vector is returned by the method.</p>
346 <!-- *****************************************************************************************
347 *****************************************************************************************
348 ***************************************************************************************** -->
351 <h2><a name="ch_knn">K Nearest Neighbors</a></h2>
354 The algorithm caches all the training samples, and it predicts the response for
355 a new sample by analyzing a certain number (<em>K</em>) of the nearest neighbors
356 of the sample (using voting, calculating weighted sum etc.) The method is
357 sometimes referred to as "learning by example", i.e. for prediction it looks for
359 the feature vector with a known response that is closest to the given vector.
362 <hr><h3><a name="decl_CvKNearest">CvKNearest</a></h3>
363 <p class="Blurb">K Nearest Neighbors model</p>
365 class CvKNearest : public CvStatModel
370 virtual ~CvKNearest();
372 CvKNearest( const CvMat* _train_data, const CvMat* _responses,
373 const CvMat* _sample_idx=0, bool _is_regression=false, int max_k=32 );
375 virtual bool train( const CvMat* _train_data, const CvMat* _responses,
376 const CvMat* _sample_idx=0, bool is_regression=false,
377 int _max_k=32, bool _update_base=false );
379 virtual float find_nearest( const CvMat* _samples, int k, CvMat* results,
380 const float** neighbors=0, CvMat* neighbor_responses=0, CvMat* dist=0 ) const;
382 virtual void clear();
383 int get_max_k() const;
384 int get_var_count() const;
385 int get_sample_count() const;
386 bool is_regression() const;
394 <hr><h3><a name="decl_CvKNearest_train">CvKNearest::train</a></h3>
395 <p class="Blurb">Trains the model</p>
397 bool CvKNearest::train( const CvMat* _train_data, const CvMat* _responses,
398 const CvMat* _sample_idx=0, bool is_regression=false,
399 int _max_k=32, bool _update_base=false );
402 The method trains the K-Nearest model. It follows the conventions of
403 generic <a href="#decl_CvStatModel_train">train</a> "method" with the following limitations:
404 only CV_ROW_SAMPLE data layout is supported, the input variables are all ordered,
405 the output variables can be either categorical (<code>is_regression=false</code>)
406 or ordered (<code>is_regression=true</code>), variable subsets (<code>var_idx</code>) and
407 missing measurements are not supported.</p>
408 <p>The parameter <code>_max_k</code> specifies the number of maximum neighbors that may be
409 passed to the method <a href="#decl_CvKNearest_find_nearest">find_nearest</a>.</p>
410 <p>The parameter <code>_update_base</code> specifies, whether the model is trained from scratch (<code>_update_base=false</code>), or
411 it is updated using the new training data
412 (<code>_update_base=true</code>). In the latter case the parameter <code>_max_k</code>
413 must not be larger than the original value.</p>
416 <hr><h3><a name="decl_CvKNearest_find_nearest">CvKNearest::find_nearest</a></h3>
417 <p class="Blurb">Finds the neighbors for the input vectors</p>
419 float CvKNearest::find_nearest( const CvMat* _samples, int k, CvMat* results=0,
420 const float** neighbors=0, CvMat* neighbor_responses=0, CvMat* dist=0 ) const;
423 For each input vector (which are rows of the matrix <code>_samples</code>)
424 the method finds <code>k&le;get_max_k()</code> nearest neighbors. In case of regression,
425 the predicted result will be a mean value of the particular vector's neighbor responses.
426 In case of classification the class is determined by voting.</p>
428 For custom classification/regression prediction, the method can optionally return
429 pointers to the neighbor vectors themselves (<code>neighbors</code>, array of
430 <code>k*_samples->rows</code> pointers), their corresponding output values
431 (<code>neighbor_responses</code>, a vector of <code>k*_samples->rows</code> elements)
432 and the distances from the input vectors to the neighbors
433 (<code>dist</code>, also a vector of <code>k*_samples->rows</code> elements).</p>
434 <p>For each input vector the neighbors are sorted by their distances to the vector.</p>
435 <p>If only a single input vector is passed, all output matrices are optional and the
436 predicted value is returned by the method.</p>
440 <h3>Example. Classification of 2D samples from a Gaussian mixture with k-nearest classifier</h3>
445 int main( int argc, char** argv )
448 int i, j, k, accuracy;
450 int train_sample_count = 100;
451 CvRNG rng_state = cvRNG(-1);
452 CvMat* trainData = cvCreateMat( train_sample_count, 2, CV_32FC1 );
453 CvMat* trainClasses = cvCreateMat( train_sample_count, 1, CV_32FC1 );
454 IplImage* img = cvCreateImage( cvSize( 500, 500 ), 8, 3 );
456 CvMat sample = cvMat( 1, 2, CV_32FC1, _sample );
459 CvMat trainData1, trainData2, trainClasses1, trainClasses2;
461 // form the training samples
462 cvGetRows( trainData, &trainData1, 0, train_sample_count/2 );
463 cvRandArr( &rng_state, &trainData1, CV_RAND_NORMAL, cvScalar(200,200), cvScalar(50,50) );
465 cvGetRows( trainData, &trainData2, train_sample_count/2, train_sample_count );
466 cvRandArr( &rng_state, &trainData2, CV_RAND_NORMAL, cvScalar(300,300), cvScalar(50,50) );
468 cvGetRows( trainClasses, &trainClasses1, 0, train_sample_count/2 );
469 cvSet( &trainClasses1, cvScalar(1) );
471 cvGetRows( trainClasses, &trainClasses2, train_sample_count/2, train_sample_count );
472 cvSet( &trainClasses2, cvScalar(2) );
475 CvKNearest knn( trainData, trainClasses, 0, false, K );
476 CvMat* nearests = cvCreateMat( 1, K, CV_32FC1);
478 for( i = 0; i < img->height; i++ )
480 for( j = 0; j < img->width; j++ )
482 sample.data.fl[0] = (float)j;
483 sample.data.fl[1] = (float)i;
485 // estimates the response and get the neighbors' labels
486 response = knn.find_nearest(&sample,K,0,0,nearests,0);
488 // compute the number of neighbors representing the majority
489 for( k = 0, accuracy = 0; k < K; k++ )
491 if( nearests->data.fl[k] == response)
494 // highlight the pixel depending on the accuracy (or confidence)
495 cvSet2D( img, i, j, response == 1 ?
496 (accuracy > 5 ? CV_RGB(180,0,0) : CV_RGB(180,120,0)) :
497 (accuracy > 5 ? CV_RGB(0,180,0) : CV_RGB(120,120,0)) );
501 // display the original training samples
502 for( i = 0; i < train_sample_count/2; i++ )
505 pt.x = cvRound(trainData1.data.fl[i*2]);
506 pt.y = cvRound(trainData1.data.fl[i*2+1]);
507 cvCircle( img, pt, 2, CV_RGB(255,0,0), CV_FILLED );
508 pt.x = cvRound(trainData2.data.fl[i*2]);
509 pt.y = cvRound(trainData2.data.fl[i*2+1]);
510 cvCircle( img, pt, 2, CV_RGB(0,255,0), CV_FILLED );
513 cvNamedWindow( "classifier result", 1 );
514 cvShowImage( "classifier result", img );
517 cvReleaseMat( &trainClasses );
518 cvReleaseMat( &trainData );
524 <!-- *****************************************************************************************
525 *****************************************************************************************
526 ***************************************************************************************** -->
527 <hr><h2><a name="ch_svm">Support Vector Machines</a></h2>
529 <p>Originally, support vector machines (SVM) was a technique for building an optimal (in some sense)
530 binary (2-class) classifier. Then the technique has been extended to regression and clustering problems.
531 SVM is a partial case of kernel-based methods,
532 it maps feature vectors into higher-dimensional space using some kernel function, and then it builds
533 an optimal linear discriminating function in this space (or an optimal hyper-plane that fits into
534 the training data, ...). In case of SVM the kernel is not defined explicitly. Instead, a distance
535 between any 2 points in the hyper-space needs to be defined.</p>
536 <p>The solution is optimal in a sense that the margin between the separating hyper-plane and the
537 nearest feature vectors from the both classes (in case of 2-class classifier) is maximal. The
538 feature vectors that are the closest to the hyper-plane are called "support vectors", meaning that
539 the position of other vectors does not affect the hyper-plane (the decision function).</p>
541 <!-- TODO: insert formulae -->
543 <p>There are a lot of good references on SVM. Here are only a few ones to start with.</p>
544 <b>[Burges98] C. Burges. "A tutorial on support vector machines for pattern recognition", Knowledge Discovery and Data
545 Mining 2(2), 1998.</b><br>
546 (available online at <a href="http://citeseer.ist.psu.edu/burges98tutorial.html">http://citeseer.ist.psu.edu/burges98tutorial.html</a>).<br>
547 <b>LIBSVM - A Library for Support Vector Machines. By Chih-Chung Chang and Chih-Jen Lin</b><br>
548 (<a href="http://www.csie.ntu.edu.tw/~cjlin/libsvm/">http://www.csie.ntu.edu.tw/~cjlin/libsvm/</a>)
551 <hr><h3><a name="decl_CvSVM">CvSVM</a></h3>
552 <p class="Blurb">Support Vector Machines</p>
554 class CvSVM : public CvStatModel
558 enum { C_SVC=100, NU_SVC=101, ONE_CLASS=102, EPS_SVR=103, NU_SVR=104 };
561 enum { LINEAR=0, POLY=1, RBF=2, SIGMOID=3 };
564 enum { C=0, GAMMA=1, P=2, NU=3, COEF=4, DEGREE=5 };
569 CvSVM( const CvMat* _train_data, const CvMat* _responses,
570 const CvMat* _var_idx=0, const CvMat* _sample_idx=0,
571 CvSVMParams _params=CvSVMParams() );
573 virtual bool train( const CvMat* _train_data, const CvMat* _responses,
574 const CvMat* _var_idx=0, const CvMat* _sample_idx=0,
575 CvSVMParams _params=CvSVMParams() );
577 virtual bool train_auto( const CvMat* _train_data, const CvMat* _responses,
578 const CvMat* _var_idx, const CvMat* _sample_idx, CvSVMParams _params,
580 CvParamGrid C_grid = get_default_grid(CvSVM::C),
581 CvParamGrid gamma_grid = get_default_grid(CvSVM::GAMMA),
582 CvParamGrid p_grid = get_default_grid(CvSVM::P),
583 CvParamGrid nu_grid = get_default_grid(CvSVM::NU),
584 CvParamGrid coef_grid = get_default_grid(CvSVM::COEF),
585 CvParamGrid degree_grid = get_default_grid(CvSVM::DEGREE) );
587 virtual float predict( const CvMat* _sample ) const;
588 virtual int get_support_vector_count() const;
589 virtual const float* get_support_vector(int i) const;
590 virtual CvSVMParams get_params() const { return params; };
591 virtual void clear();
593 static CvParamGrid get_default_grid( int param_id );
595 virtual void save( const char* filename, const char* name=0 );
596 virtual void load( const char* filename, const char* name=0 );
598 virtual void write( CvFileStorage* storage, const char* name );
599 virtual void read( CvFileStorage* storage, CvFileNode* node );
600 int get_var_count() const { return var_idx ? var_idx->cols : var_all; }
607 <hr><h3><a name="decl_CvSVMParams">CvSVMParams</a></h3>
608 <p class="Blurb">SVM training parameters</p>
613 CvSVMParams( int _svm_type, int _kernel_type,
614 double _degree, double _gamma, double _coef0,
615 double _C, double _nu, double _p,
616 CvMat* _class_weights, CvTermCriteria _term_crit );
620 double degree; // for poly
621 double gamma; // for poly/rbf/sigmoid
622 double coef0; // for poly/sigmoid
624 double C; // for CV_SVM_C_SVC, CV_SVM_EPS_SVR and CV_SVM_NU_SVR
625 double nu; // for CV_SVM_NU_SVC, CV_SVM_ONE_CLASS, and CV_SVM_NU_SVR
626 double p; // for CV_SVM_EPS_SVR
627 CvMat* class_weights; // for CV_SVM_C_SVC
628 CvTermCriteria term_crit; // termination criteria
632 <dt>svm_type<dd>Type of SVM, one of the following types:<br>
633 CvSVM::C_SVC - n-class classification (n>=2), allows imperfect separation of classes with
634 penalty multiplier <code>C</code> for outliers.<br>
635 CvSVM::NU_SVC - n-class classification with possible imperfect separation. Parameter <code>nu</code>
636 (in the range 0..1, the larger the value, the smoother the decision boundary) is used instead of <code>C</code>.<br>
637 CvSVM::ONE_CLASS - one-class SVM. All the training data are from the same class, SVM builds
638 a boundary that separates the class from the rest of the feature space.<br>
639 CvSVM::EPS_SVR - regression. The distance between feature vectors from the training set and
640 the fitting hyper-plane must be less than <code>p</code>. For outliers
641 the penalty multiplier <code>C</code> is used.<br>
642 CvSVM::NU_SVR - regression; <code>nu</code> is used instead of <code>p</code>.
643 <dt>kernel_type<dd>The kernel type, one of the following types:<br>
644 CvSVM::LINEAR - no mapping is done, linear discrimination (or regression) is done in the original feature space.
645 It is the fastest option. <em>d(x,y) = x•y == (x,y)</em><br>
646 CvSVM::POLY - polynomial kernel: <em>d(x,y) = (gamma*(x•y)+coef0)<sup>degree</sup></em><br>
647 CvSVM::RBF - radial-basis-function kernel; a good choice in most cases:
648 <em>d(x,y) = exp(-gamma*|x-y|<sup>2</sup>)</em><br>
649 CvSVM::SIGMOID - sigmoid function is used as a kernel:
650 <em>d(x,y) = tanh(gamma*(x•y)+coef0)</em><br>
651 <dt>degree, gamma, coef0<dd>Parameters of the kernel, see the formulas above.
652 <dt>C, nu, p<dd>Parameters in the generalized SVM optimization problem.
653 <dt>class_weights<dd>Optional weights, assigned to particular classes.
654 They are multiplied by <code>C</code> and thus affect the misclassification penalty for different classes.
655 The larger weight, the larger penalty on misclassification of data from the corresponding class.
656 <dt>term_crit<dd>Termination procedure for iterative SVM training procedure
657 (which solves a partial case of constrained quadratic optimization problem)
659 The structure must be initialized and passed to the training method of <a href="#decl_CvSVM">CvSVM</a>
662 <hr><h3><a name="decl_CvSVM_train">CvSVM::train</a></h3>
663 <p class="Blurb">Trains SVM</p>
665 bool CvSVM::train( const CvMat* _train_data, const CvMat* _responses,
666 const CvMat* _var_idx=0, const CvMat* _sample_idx=0,
667 CvSVMParams _params=CvSVMParams() );
669 The method trains the SVM model. It follows the conventions of
670 generic <a href="#decl_CvStatModel_train">train</a> "method" with the following limitations:
671 only CV_ROW_SAMPLE data layout is supported, the input variables are all ordered,
672 the output variables can be either categorical (<code>_params.svm_type=CvSVM::C_SVC</code> or
673 <code>_params.svm_type=CvSVM::NU_SVC</code>), or ordered
674 (<code>_params.svm_type=CvSVM::EPS_SVR</code> or
675 <code>_params.svm_type=CvSVM::NU_SVR</code>), or not required at all
676 (<code>_params.svm_type=CvSVM::ONE_CLASS</code>),
677 missing measurements are not supported.</p>
678 <p>All the other parameters are gathered in <a href="#decl_CvSVMParams">CvSVMParams</a>
682 <hr><h3><a name="decl_CvSVM_train_auto">CvSVM::train_auto</a></h3>
683 <p class="Blurb">Trains SVM with optimal parameters</p>
685 bool CvSVM::train_auto( const CvMat* _train_data, const CvMat* _responses,
686 const CvMat* _var_idx, const CvMat* _sample_idx,
687 CvSVMParams params, int k_fold = 10,
688 CvParamGrid C_grid = get_default_grid(CvSVM::C),
689 CvParamGrid gamma_grid = get_default_grid(CvSVM::GAMMA),
690 CvParamGrid p_grid = get_default_grid(CvSVM::P),
691 CvParamGrid nu_grid = get_default_grid(CvSVM::NU),
692 CvParamGrid coef_grid = get_default_grid(CvSVM::COEF),
693 CvParamGrid degree_grid = get_default_grid(CvSVM::DEGREE) );
696 <dt>k_fold<dd>Cross-validation parameter. The training set is divided into
697 <em>k_fold</em> subsets, one subset being used to train the model,
698 the others forming the test set. So, the SVM algorithm is executed <em>k_fold</em>
701 The method trains the SVM model automatically by choosing
702 the optimal parameters <em>C</em>, <em>gamma</em>,
703 <em>p</em>, <em>nu</em>, <em>coef0</em>, <em>degree</em> from
704 <a href="#decl_CvSVMParams">CvSVMParams</a>. Optimality here means that
705 the cross-validation estimate of the test set error is minimal.
706 The parameters are iterated by logarithmic grid, for example, the parameter
707 <em>gamma</em> takes the values in the set
708 {<em>gamma_grid</em>.min_val, <em>gamma_grid</em>.min_val*<em>gamma_grid</em>.step,
709 <em>gamma_grid</em>.min_val*<em>gamma_grid</em>.step<sup>2</sup>,...,
710 <em>gamma_grid</em>.min_val*<em>gamma_grid</em>.step<sup><em>n</em></sup>},
711 where <em>n</em> is the maximal index such, that
712 <em>gamma_grid</em>.min_val*<em>gamma_grid</em>.step<sup><em>n</em></sup> <
713 <em>gamma_grid</em>.max_val. So, <em>step</em> must always be greater than 1.
715 <p>If there is no need in optimization in some parameter, the according grid step should be
716 set to any value less or equal to 1. For example, to avoid optimization in
717 <em>gamma</em> one should set <em>gamma_grid</em>.step = 0,
718 <em>gamma_grid</em>.min_val, <em>gamma_grid</em>.max_val being arbitrary numbers.
719 In this case, the value <em>params</em>.gamma will be taken for <em>gamma</em>.
722 And, finally, if the optimization in some parameter is required,
723 but the corresponding grid is unknown, one may call the function
724 <a href="#decl_CvSVM_get_default_grid">CvSVM::get_default_grid</a>.
725 In order to generate a grid, say,
726 for <em>gamma</em>, call <code>CvSVM::get_default_grid(CvSVM::GAMMA)</code>.
729 This function works for the case of classification
730 (<code>params.svm_type=CvSVM::C_SVC</code> or
731 <code>params.svm_type=CvSVM::NU_SVC</code>) as well as for the regression
732 (<code>params.svm_type=CvSVM::EPS_SVR </code> or
733 <code>params.svm_type=CvSVM::NU_SVR</code>). If
734 <code>params.svm_type=CvSVM::ONE_CLASS</code>, no
735 optimization is made and usual SVM with
736 specified in <code>params</code> parameters is executed.
739 <hr><h3><a name="decl_CvSVM_get_default_grid">CvSVM::get_default_grid</a></h3>
740 <p class="Blurb">Generates a grid for SVM parameters</p>
742 CvParamGrid CvSVM::get_default_grid( int param_id );
745 <dt>param_id<dd>Must be one of the following:
746 <code>CvSVM::C, CvSVM::GAMMA, CvSVM::P, CvSVM::NU, CvSVM::COEF, CvSVM::DEGREE</code>.
747 The grid will be generated for the parameter with this ID.
749 <p>The function generates a grid for the specified parameter of the SVM algorithm.
750 The grid may be passed to the function
751 <a href="#decl_CvSVM_train_auto">CvSVM::train_auto</a></p>
753 <hr><h3><a name="decl_CvSVM_get_params">CvSVM::get_params</a></h3>
754 <p class="Blurb">Returns current SVM parameters</p>
756 CvSVMParams CvSVM::get_params() const;
758 <p>This function may be used to get the optimal parameters that were obtained
759 while automatic training <a href="#decl_CvSVM_train_auto">CvSVM::train_auto</a>.</p>
761 <hr><h3><a name="decl_CvSVM_get_support_vector">CvSVM::get_support_vector*</a></h3>
762 <p class="Blurb">Retrieves the number of support vectors and the particular vector</p>
764 int CvSVM::get_support_vector_count() const;
765 const float* CvSVM::get_support_vector(int i) const;
767 <p>The methods can be used to retrieve the set of support vectors.</p>
770 <!-- *****************************************************************************************
771 *****************************************************************************************
772 ***************************************************************************************** -->
774 <hr><h2><a name="ch_dtree">Decision Trees</a></h2>
776 <p>The ML classes discussed in this section implement
777 Classification And Regression Tree algorithms, which are described in
778 <a href="#paper_Breiman84">[Breiman84]</a>.</p>
779 <p>The class <a href="#decl_CvDTree">CvDTree</a> represents a single decision tree that
780 may be used alone, or as a base class in tree ensembles
781 (see <a href=#ch_boosting>Boosting</a> and <a href=#ch_randomforest>Random Trees</a>).</p>
782 <p>Decision tree is a binary tree (i.e. tree where each non-leaf node has exactly 2 child nodes).
783 It can be used either for classification, when
784 each tree leaf is marked with some class label (multiple leaves may have the same label),
785 or for regression, when each tree leaf is also assigned a constant
786 (so the approximation function is piecewise constant).
787 <h3>Predicting with Decision Trees</h3>
788 <p>To reach a leaf node, and thus
789 to obtain a response for the input feature vector, the prediction procedure starts
790 with the root node. From each non-leaf node the procedure goes to the left (i.e. selects the
791 left child node as the next observed node), or to the right based on the value of
792 a certain variable, whose index is stored in the observed node. The variable can be either
793 ordered or categorical. In the first case, the variable value is compared with the certain threshold
794 (which is also stored in the node); if the value is less than the threshold, the
795 procedure goes to the left,
796 otherwise, to the right (for example, if the weight is less than 1 kilogram, the
797 procedure goes to the left,
798 else to the right). And in the second case the discrete variable value is tested, whether it
799 belongs to a certain subset of values (also stored in the node)
800 from a limited set of values the variable could take;
801 if yes, the procedure goes to the left, else - to the right (for example,
802 if the color is green or red, go to the left, else to the right).
803 That is, in each node, a pair of entities (&lt;variable_index&gt;, &lt;decision_rule (threshold/subset)&gt;)
805 This pair is called split (split on the variable #&lt;variable_index&gt;).
806 Once a leaf node is reached, the value assigned to this node is used as the output of prediction procedure.</p>
807 <p>Sometimes, certain features of the input vector are missed (for example, in the darkness
808 it is difficult to determine the object color), and the prediction procedure
809 may get stuck in the certain node (in the mentioned example if the node is split by color).
810 To avoid such situations, decision trees use so-called
811 surrogate splits. That is, in addition to the best "primary" split, every tree node may also
812 be split on one or more other variables with nearly the same results.</p>
814 <h3>Training Decision Trees</h3>
815 <p>The tree is built recursively, starting from the root node. The whole training data (feature
816 vectors and the responses) are used to split the root node. In each node the optimum
817 decision rule (i.e. the best "primary" split) is found based on some criteria (in ML <em>gini</em> "purity" criteria is used
818 for classification, and sum of squared errors is used for regression). Then, if necessary,
820 splits are found that most closely resemble the results of the primary split on
821 the training data; all data are divided using the primary and the surrogate splits
822 (just like it is done in the prediction procedure)
823 between the left and the right child node. Then the procedure recursively splits both left and right
824 nodes etc. At each node the recursive procedure may stop (i.e. stop splitting the node further)
825 in one of the following cases:<br>
827 <li>depth of the tree branch being constructed has reached the specified maximum value.
828 <li>number of training samples in the node is less than the specified threshold, i.e.
829 it is not statistically representative set to split the node further.
830 <li>all the samples in the node belong to the same class (or, in case of regression,
831 the variation is too small).
832 <li>the best split found does not give any noticeable improvement comparing to just a random
836 <p>When the tree is built, it may be pruned using the cross-validation procedure, if needed.
837 That is, some branches of the tree that may lead to the model overfitting are cut off.
838 Normally, this procedure is only applied to standalone decision trees, while tree ensembles
839 usually build small enough trees and use their own protection schemes against overfitting.
842 <h3>Variable importance</h3>
844 Besides the obvious use of decision trees - prediction, the tree can be also
846 for various data analysis.
847 One of the key properties of the constructed decision tree algorithms is that it is possible
848 to compute importance (relative decisive power) of each variable. For example, in a spam
849 filter that uses a set of words occurring in the message as a feature vector, the variable importance
850 rating can be used to determine the most "spam-indicating" words and thus help to keep the dictionary
852 <p>Importance of each variable is computed over all the splits on this variable in the tree, primary
853 and surrogate ones. Thus, to compute variable importance correctly, the surrogate splits must be enabled
854 in the training parameters, even if there is no missing data.</p>
856 <p><a name="paper_Breiman84"><b>[Breiman84]
857 Breiman, L., Friedman, J. Olshen, R. and Stone, C. (1984), "Classification and Regression Trees", Wadsworth.
861 <hr><h3><a name="decl_CvDTreeSplit">CvDTreeSplit</a></h3>
862 <p class="Blurb">Decision tree node split</p>
883 <dt>var_idx<dd>Index of the variable used in the split
884 <dt>inversed<dd>When it equals to 1, the inverse split rule is used
885 (i.e. left and right branches are exchanged in the expressions below)
886 <dt>quality<dd>The split quality, a positive number. It is used to choose the
887 best primary split, then to choose and sort the surrogate splits.
888 After the tree is constructed, it is also used to compute variable importance.
889 <dt>next<dd>Pointer to the next split in the node split list.
890 <dt>subset<dd>Bit array indicating the value subset in case of split on a categorical variable.<br>
891 The rule is: <code>if var_value in subset then next_node&lt;-left else next_node&lt;-right</code>
892 <dt>c<dd>The threshold value in case of split on an ordered variable.<br>
893 The rule is: <code>if var_value &lt; c then next_node&lt;-left else next_node&lt;-right</code>
894 <dt>split_point<dd>Used internally by the training algorithm.
898 <hr><h3><a name="decl_CvDTreeNode">CvDTreeNode</a></h3>
899 <p class="Blurb">Decision tree node</p>
919 <dt>value<dd>The value assigned to the tree node. It is either a class label,
920 or the estimated function value.
921 <dt>class_idx<dd>The assigned to the node
922 normalized class index (to 0..class_count-1 range), it is used internally in classification trees
924 <dt>Tn<dd>The tree index in an ordered sequence of trees. The indices are used during and
925 after the pruning procedure. The root node has the maximum value <code>Tn</code>
926 of the whole tree, child nodes have <code>Tn</code> less than or equal to
927 the parent's <code>Tn</code>,
928 and the nodes with <code>Tn≤<a href="#decl_CvDTree">CvDTree</a>::pruned_tree_idx</code> are not taken
929 into consideration at the prediction stage (the corresponding branches are
930 considered as cut-off), even
931 if they have not been physically deleted from the tree at the pruning stage.
932 <dt>parent, left, right<dd>Pointers to the parent node, left and right child nodes.
933 <dt>split<dd>Pointer to the first (primary) split.
934 <dt>sample_count<dd>The number of samples that fall into the node at the training stage.
935 It is used to resolve the difficult cases - when the variable for the primary split
936 is missing, and all the variables for other surrogate splits are
937 missing too,<br>the sample is
938 directed to the left if <code>left->sample_count>right->sample_count</code> and
939 to the right otherwise.
940 <dt>depth<dd>The node depth, the root node depth is 0, the child nodes depth is the parent's depth + 1.
942 <p>Other numerous fields of <code>CvDTreeNode</code> are used internally at the training stage.</p>
945 <hr><h3><a name="decl_CvDTreeParams">CvDTreeParams</a></h3>
946 <p class="Blurb">Decision tree training parameters</p>
952 int min_sample_count;
956 bool truncate_pruned_tree;
957 float regression_accuracy;
960 CvDTreeParams() : max_categories(10), max_depth(INT_MAX), min_sample_count(10),
961 cv_folds(10), use_surrogates(true), use_1se_rule(true),
962 truncate_pruned_tree(true), regression_accuracy(0.01f), priors(0)
965 CvDTreeParams( int _max_depth, int _min_sample_count,
966 float _regression_accuracy, bool _use_surrogates,
967 int _max_categories, int _cv_folds,
968 bool _use_1se_rule, bool _truncate_pruned_tree,
969 const float* _priors );
973 <dt>max_depth<dd>This parameter specifies the maximum possible depth of the
974 tree. That is the training algorithm attempts to split a node while its depth
975 is less than <code>max_depth</code>. The actual depth
977 be smaller if the other termination criteria are met
978 (see the outline of the training procedure in the beginning of the section),
979 and/or if the tree is pruned.
980 <dt>min_sample_count<dd>A node is not split if the number of samples directed to the node
981 is less than the parameter value.
982 <dt>regression_accuracy<dd>Another stop criteria - only for regression trees. As soon as
983 the estimated node value differs from the node training samples responses
984 by less than the parameter value, the node is not split further.
985 <dt>use_surrogates<dd>If <code>true</code>, surrogate splits are built. Surrogate splits are
986 needed to handle missing measurements and for variable importance estimation.
987 <dt>max_categories<dd>If a discrete variable, on which the training procedure tries to make a split,
988 takes more than <code>max_categories</code> values, the precise best subset
989 estimation may take a very long time (as the algorithm is exponential).
990 Instead, many decision trees engines (including ML) try to find sub-optimal split
991 in this case by clustering all the samples into <code>max_categories</code> clusters
992 (i.e. some categories are merged together).<br>
993 Note that this technique is used only in <code>N(>2)</code>-class classification problems.
994 In case of regression and 2-class classification the optimal split can be found efficiently
995 without employing clustering, thus the parameter is not used in these cases.
996 <dt>cv_folds<dd>If this parameter is >1, the tree is pruned using <code>cv_folds</code>-fold
998 <dt>use_1se_rule<dd>If <code>true</code>, the tree is truncated a bit more by the pruning procedure.
999 That leads to compact, and more resistant to the training data noise, but a bit less
1000 accurate decision tree.
1001 <dt>truncate_pruned_tree<dd>If <code>true</code>, the cut off nodes
1002 (with <code>Tn</code>≤<code>CvDTree::pruned_tree_idx</code>) are physically
1003 removed from the tree. Otherwise they are kept, and by decreasing
1004 <code>CvDTree::pruned_tree_idx</code> (e.g. setting it to -1)
1005 it is still possible to get the results from the original un-pruned
1006 (or pruned less aggressively) tree.
1007 <dt>priors<dd>The array of a priori class probabilities, sorted by the class label value.
1008 The parameter can be used to tune the decision tree preferences toward a certain class.
1009 For example, if users want to detect some rare anomaly occurrence, the training
1010 base will likely contain much more normal cases than anomalies, so
1011 a very good classification
1012 performance will be achieved just by considering every case as normal. To avoid this, the priors
1013 can be specified, where the anomaly probability is artificially increased
1014 (up to 0.5 or even greater), so the weight of the misclassified anomalies
1015 becomes much bigger,
1016 and the tree is adjusted properly.
1017 <p>A note about memory management: the field <code>priors</code>
1018 is a pointer to the array of floats. The array should be allocated by user, and
1019 released just after the <code>CvDTreeParams</code> structure is passed to
1020 <a href="#decl_CvDTreeTrainData">CvDTreeTrainData</a> or
1021 <a href="#decl_CvDTree">CvDTree</a> constructors/methods (as the methods
1022 make a copy of the array).
1025 The structure contains all the decision tree training parameters.
1026 There is a default constructor that initializes all the parameters with the default values
1027 tuned for standalone classification tree. Any of the parameters can be
1029 or the structure may be fully initialized using the advanced variant of the constructor.</p>
1032 <hr><h3><a name="decl_CvDTreeTrainData">CvDTreeTrainData</a></h3>
1033 <p class="Blurb">Decision tree training data and shared data for tree ensembles</p>
1035 struct CvDTreeTrainData
1038 CvDTreeTrainData( const CvMat* _train_data, int _tflag,
1039 const CvMat* _responses, const CvMat* _var_idx=0,
1040 const CvMat* _sample_idx=0, const CvMat* _var_type=0,
1041 const CvMat* _missing_mask=0,
1042 const CvDTreeParams& _params=CvDTreeParams(),
1043 bool _shared=false, bool _add_labels=false );
1044 virtual ~CvDTreeTrainData();
1046 virtual void set_data( const CvMat* _train_data, int _tflag,
1047 const CvMat* _responses, const CvMat* _var_idx=0,
1048 const CvMat* _sample_idx=0, const CvMat* _var_type=0,
1049 const CvMat* _missing_mask=0,
1050 const CvDTreeParams& _params=CvDTreeParams(),
1051 bool _shared=false, bool _add_labels=false,
1052 bool _update_data=false );
1054 virtual void get_vectors( const CvMat* _subsample_idx,
1055 float* values, uchar* missing, float* responses, bool get_class_idx=false );
1057 virtual CvDTreeNode* subsample_data( const CvMat* _subsample_idx );
1059 virtual void write_params( CvFileStorage* fs );
1060 virtual void read_params( CvFileStorage* fs, CvFileNode* node );
1062 // release all the data
1063 virtual void clear();
1065 int get_num_classes() const;
1066 int get_var_type(int vi) const;
1067 int get_work_var_count() const;
1069 virtual int* get_class_labels( CvDTreeNode* n );
1070 virtual float* get_ord_responses( CvDTreeNode* n );
1071 virtual int* get_labels( CvDTreeNode* n );
1072 virtual int* get_cat_var_data( CvDTreeNode* n, int vi );
1073 virtual CvPair32s32f* get_ord_var_data( CvDTreeNode* n, int vi );
1074 virtual int get_child_buf_idx( CvDTreeNode* n );
1076 ////////////////////////////////////
1078 virtual bool set_params( const CvDTreeParams& params );
1079 virtual CvDTreeNode* new_node( CvDTreeNode* parent, int count,
1080 int storage_idx, int offset );
1082 virtual CvDTreeSplit* new_split_ord( int vi, float cmp_val,
1083 int split_point, int inversed, float quality );
1084 virtual CvDTreeSplit* new_split_cat( int vi, float quality );
1085 virtual void free_node_data( CvDTreeNode* node );
1086 virtual void free_train_data();
1087 virtual void free_node( CvDTreeNode* node );
1089 int sample_count, var_all, var_count, max_c_count;
1090 int ord_var_count, cat_var_count;
1091 bool have_labels, have_priors;
1094 int buf_count, buf_size;
1107 CvMat* var_type; // i-th element =
1109 // k>=0 - categorical, see k-th element of cat_* arrays
1112 CvDTreeParams params;
1114 CvMemStorage* tree_storage;
1115 CvMemStorage* temp_storage;
1117 CvDTreeNode* data_root;
1128 This structure is mostly used internally for storing both standalone trees and tree ensembles
1129 efficiently. Basically, it contains 3 types of information:
1131 <li>The training parameters, <a href="#decl_CvDTreeParams">CvDTreeParams</a> instance.
1132 <li>The training data, preprocessed in order to find the best splits more efficiently.
1133 For tree ensembles this preprocessed data is reused by all the trees.
1134 Additionally, the training data characteristics that are shared by
1135 all trees in the ensemble are stored here: variable types,
1136 the number of classes, class label compression map etc.
1137 <li>Buffers, memory storages for tree nodes, splits and other elements of the trees constructed.
1140 There are 2 ways of using this structure.
1141 In simple cases (e.g. standalone tree,
1142 or ready-to-use "black box" tree ensemble from ML, like <a href=#ch_randomforest>Random Trees</a>
1143 or <a href=#ch_boosting>Boosting</a>) there is no need to care or even to know about the structure -
1144 just construct the needed statistical model, train it and use it. The <code>CvDTreeTrainData</code>
1145 structure will be constructed and used internally. However, for custom tree algorithms,
1146 or other sophisticated cases, the structure may be constructed and used explicitly.
1147 The scheme is the following:
1149 <li>The structure is initialized using the default constructor, followed by
1150 <code>set_data</code> (or it is built using the full form of constructor).
1151 The parameter <code>_shared</code> must be set to <code>true</code>.
1152 <li>One or more trees are trained using this data, see the special form of the method
1153 <a href="#decl_CvDTree_train">CvDTree::train</a>.
1154 <li>Finally, the structure can be released only after all the trees using it are released.
1159 <hr><h3><a name="decl_CvDTree">CvDTree</a></h3>
1160 <p class="Blurb">Decision tree</p>
1162 class CvDTree : public CvStatModel
1168 virtual bool train( const CvMat* _train_data, int _tflag,
1169 const CvMat* _responses, const CvMat* _var_idx=0,
1170 const CvMat* _sample_idx=0, const CvMat* _var_type=0,
1171 const CvMat* _missing_mask=0,
1172 CvDTreeParams params=CvDTreeParams() );
1174 virtual bool train( CvDTreeTrainData* _train_data, const CvMat* _subsample_idx );
1176 virtual CvDTreeNode* predict( const CvMat* _sample, const CvMat* _missing_data_mask=0,
1177 bool raw_mode=false ) const;
1178 virtual const CvMat* get_var_importance();
1179 virtual void clear();
1181 virtual void read( CvFileStorage* fs, CvFileNode* node );
1182 virtual void write( CvFileStorage* fs, const char* name );
1184 // special read & write methods for trees in the tree ensembles
1185 virtual void read( CvFileStorage* fs, CvFileNode* node,
1186 CvDTreeTrainData* data );
1187 virtual void write( CvFileStorage* fs );
1189 const CvDTreeNode* get_root() const;
1190 int get_pruned_tree_idx() const;
1191 CvDTreeTrainData* get_data();
1195 virtual bool do_train( const CvMat* _subsample_idx );
1197 virtual void try_split_node( CvDTreeNode* n );
1198 virtual void split_node_data( CvDTreeNode* n );
1199 virtual CvDTreeSplit* find_best_split( CvDTreeNode* n );
1200 virtual CvDTreeSplit* find_split_ord_class( CvDTreeNode* n, int vi );
1201 virtual CvDTreeSplit* find_split_cat_class( CvDTreeNode* n, int vi );
1202 virtual CvDTreeSplit* find_split_ord_reg( CvDTreeNode* n, int vi );
1203 virtual CvDTreeSplit* find_split_cat_reg( CvDTreeNode* n, int vi );
1204 virtual CvDTreeSplit* find_surrogate_split_ord( CvDTreeNode* n, int vi );
1205 virtual CvDTreeSplit* find_surrogate_split_cat( CvDTreeNode* n, int vi );
1206 virtual double calc_node_dir( CvDTreeNode* node );
1207 virtual void complete_node_dir( CvDTreeNode* node );
1208 virtual void cluster_categories( const int* vectors, int vector_count,
1209 int var_count, int* sums, int k, int* cluster_labels );
1211 virtual void calc_node_value( CvDTreeNode* node );
1213 virtual void prune_cv();
1214 virtual double update_tree_rnc( int T, int fold );
1215 virtual int cut_tree( int T, int fold, double min_alpha );
1216 virtual void free_prune_data(bool cut_tree);
1217 virtual void free_tree();
1219 virtual void write_node( CvFileStorage* fs, CvDTreeNode* node );
1220 virtual void write_split( CvFileStorage* fs, CvDTreeSplit* split );
1221 virtual CvDTreeNode* read_node( CvFileStorage* fs, CvFileNode* node, CvDTreeNode* parent );
1222 virtual CvDTreeSplit* read_split( CvFileStorage* fs, CvFileNode* node );
1223 virtual void write_tree_nodes( CvFileStorage* fs );
1224 virtual void read_tree_nodes( CvFileStorage* fs, CvFileNode* node );
1228 int pruned_tree_idx;
1229 CvMat* var_importance;
1231 CvDTreeTrainData* data;
1235 <hr><h3><a name="decl_CvDTree_train">CvDTree::train</a></h3>
1236 <p class="Blurb">Trains decision tree</p>
1238 bool CvDTree::train( const CvMat* _train_data, int _tflag,
1239 const CvMat* _responses, const CvMat* _var_idx=0,
1240 const CvMat* _sample_idx=0, const CvMat* _var_type=0,
1241 const CvMat* _missing_mask=0,
1242 CvDTreeParams params=CvDTreeParams() );
1244 bool CvDTree::train( CvDTreeTrainData* _train_data, const CvMat* _subsample_idx );
1246 There are 2 <code>train</code> methods in <code>CvDTree</code>.<p>
1247 The first method follows the generic <a href="#decl_CvStatModel_train">CvStatModel::train</a>
1248 conventions, it is the most complete form of it. Both data layouts
1249 (<code>_tflag=CV_ROW_SAMPLE</code> and <code>_tflag=CV_COL_SAMPLE</code>) are supported,
1250 as well as sample and variable subsets, missing measurements, arbitrary combinations
1251 of input and output variable types etc. The last parameter contains all
1252 the necessary training parameters, see <a href="#decl_CvDTreeParams">CvDTreeParams</a> description.
1253 <p>The second method <code>train</code> is mostly used for building tree ensembles.
1254 It takes the pre-constructed <a href="#decl_CvDTreeTrainData">CvDTreeTrainData</a> instance and
1255 the optional subset of training set. The indices in <code>_subsample_idx</code> are counted
1256 relatively to the <code>_sample_idx</code>, passed to <code>CvDTreeTrainData</code> constructor.
1257 For example, if <code>_sample_idx=[1, 5, 7, 100]</code>, then
1258 <code>_subsample_idx=[0,3]</code> means that the samples <code>[1, 100]</code> of the original
1259 training set are used.
1263 <hr><h3><a name="decl_CvDTree_predict">CvDTree::predict</a></h3>
1264 <p class="Blurb">Returns the leaf node of decision tree corresponding to the input vector</p>
1266 CvDTreeNode* CvDTree::predict( const CvMat* _sample, const CvMat* _missing_data_mask=0,
1267 bool raw_mode=false ) const;
1269 <p>The method takes the feature vector and the optional missing measurement mask on input,
1270 traverses the decision tree and returns the reached leaf node on output.
1271 The prediction result, either the class label or the estimated function value, may
1272 be retrieved as <code>value</code> field of the <a href="#decl_CvDTreeNode">CvDTreeNode</a>
1273 structure, for example: dtree->predict(sample,mask)->value</p>
1274 <p>The last parameter is normally set to <code>false</code> that implies a regular input.
1275 If it is <code>true</code>, the method assumes that all the values of the discrete input variables
1276 have been already normalized to <code>0..&lt;num_of_categories<sub>i</sub>&gt;-1</code> ranges.
1277 (as the decision tree uses such normalized representation internally). It is useful for faster
1278 prediction with tree ensembles. For ordered input variables the flag is not used.
1282 <h3>Example. Building Tree for Classifying Mushrooms</h3>
1283 <p>See <a href="../../samples/c/mushroom.cpp">mushroom.cpp</a> sample that
1284 demonstrates how to build and use the decision tree.</p>
1286 <!-- *****************************************************************************************
1287 *****************************************************************************************
1288 ***************************************************************************************** -->
1290 <hr><h2><a name="ch_boosting">Boosting</a></h2>
1291 <!--Boosting works by sequentially applying a classification algorthm to reweighted versions
1292 of the training data, and then taking a weighted majority vote of the sequence of
1293 classifiers thus produced.-->
1295 A common machine learning task is supervised learning of the following kind:
1296 Predict the output <var>y</var> for an unseen input sample <var>x</var> given a training set
1297 consisting of input and its desired output. In other words, the goal is to learn
1298 the functional relationship <var>F</var>: <var>y</var> = <var>F</var>(<var>x</var>)
1299 between input <var>x</var> and output <var>y</var>.
1300 Predicting qualitative output is called classification, while predicting
1301 quantitative output is called regression.</p>
1303 Boosting is a powerful learning concept, which provides a solution
1304 to supervised classification learning task.
1305 It combines the performance of many "weak" classifiers
1306 to produce a powerful 'committee' [<a href="#HTF01">HTF01</a>].
1307 A weak classifier is only required to be better
1308 than chance, and thus can be very simple and computationally inexpensive.
1309 Many of them smartly combined, however, result in a strong classifier,
1310 which often outperforms most 'monolithic' strong classifiers such as SVMs and Neural Networks.
1313 Decision trees are the most popular weak classifiers used in boosting schemes.
1314 Often the simplest decision trees with only a single split node per tree (called stumps)
1317 Learning of boosted model is based on <var>N</var> training examples
1318 {(<var>x<sub>i</sub></var>,<var>y<sub>i</sub></var>)}<span class=subs>1</span><span class=sups><var>N</var></span>
1319 with <var>x<sub>i</sub></var> ∈ <var>R<sup>K</sup></var> and
1320 <var>y<sub>i</sub></var> ∈ {−1, +1}.
1321 <var>x<sub>i</sub></var> is a <var>K</var>-component vector.
1322 Each component encodes a feature relevant for the learning task at hand.
1323 The desired two-class output is encoded as −1 and +1.
1326 Different variants of boosting are known such as Discrete Adaboost, Real AdaBoost, LogitBoost,
1327 and Gentle AdaBoost [<a href="#FHT98">FHT98</a>].
1328 All of them are very similar in their overall structure.
1329 Therefore, we will look only at the standard two-class Discrete AdaBoost algorithm
1330 as shown in the box below.
1331 Each sample is initially assigned the same weight (step 2).
1332 Next a weak classifier <var>f<sub>m</sub></var>(<var>x</var>)
1333 is trained on the weighted training data (step 3a).
1334 Its weighted training error and scaling factor <var>c<sub>m</sub></var> is computed (step 3b).
1335 The weights are increased for training samples, which have been misclassified (step 3c).
1336 All weights are then normalized, and the process of finding the next weak classifier continues
1337 for another <var>M</var>-1 times. The final classifier <var>F</var>(<var>x</var>) is the sign
1338 of the weighted sum over the individual weak classifiers (step 4).</p>
1342 <li>Given <var>N</var> examples
1343 {(<var>x<sub>i</sub></var>,<var>y<sub>i</sub></var>)}<span class=subs>1</span><span class=sups><var>N</var></span>
1344 with <var>x<sub>i</sub></var> ∈ <var>R<sup>K</sup></var>,
1345 <var>y<sub>i</sub></var> ∈ {−1, +1}.
1347 <li>Start with weights <var>w<sub>i</sub></var> = 1/<var>N</var>,
1348 <var>i</var> = 1,…,<var>N</var>.
1350 <li>Repeat for <var>m</var> = 1,2,…,<var>M</var>:
1352 <li>Fit the classifier <var>f<sub>m</sub></var>(<var>x</var>) ∈ {−1,1},
1353 using weights <var>w<sub>i</sub></var> on the training data.
1355 <li>Compute err<var><sub>m</sub></var> = <var>E<sub>w</sub></var>
1356 [1<sub>(<var>y</var> ≠ <var>f<sub>m</sub></var>(<var>x</var>))</sub>],
1357 <var>c<sub>m</sub></var> = log((1 − err<var><sub>m</sub></var>)/err<var><sub>m</sub></var>).
1359 <li>Set <var>w<sub>i</sub></var> ← <var>w<sub>i</sub></var>
1360 exp[<var>c<sub>m</sub></var>
1361 1<sub>(<var>y<sub>i</sub></var> ≠ <var>f<sub>m</sub></var>(<var>x<sub>i</sub></var>))</sub>],
1362 <var>i</var> = 1,2,…,<var>N</var>,
1363 and renormalize so that ∑ <var><span class=subs>i</span> w<sub>i</sub></var> = 1.
1367 <li>Output the classifier sign[∑ <span class=subs><var>m</var> = 1</span><span class=sups><var>M</var></span>
1368 <var>c<sub>m</sub> f<sub>m</sub></var>(<var>x</var>)].
1373 Two-class Discrete AdaBoost Algorithm: Training (steps 1 to 3)
1374 and Evaluation (step 4)
1378 <p><b>NOTE. </b>As well as the classical boosting methods, the current implementation
1379 supports 2-class classifiers only. For M>2 classes there is <em>AdaBoost.MH</em> algorithm,
1380 described in [<a href="#FHT98">FHT98</a>], that reduces the problem to the 2-class problem,
1381 yet with much larger training set.</p>
1383 <p>In order to reduce computation time for boosted models without substantially losing
1384 accuracy, the influence trimming technique may be employed.
1385 As the training algorithm proceeds and the number of trees in the ensemble is increased,
1386 a larger number of the training samples are classified correctly
1387 and with increasing confidence; thus those samples receive
1388 smaller weights on the subsequent iterations. Examples with very low relative weight have small impact on training of
1389 the weak classifier. Thus such examples may be excluded during the weak classifier training
1390 without having much effect on the induced classifier. This process is controlled
1391 via the <span class=par>weight_trim_rate</span> parameter.
1392 Only examples with the summary fraction <span class=par>weight_trim_rate</span>
1393 of the total weight mass are used in the weak classifier training.
1394 Note that the weights for <em>all</em> training examples are recomputed at each
1395 training iteration. Examples deleted at a particular iteration may
1396 be used again for learning some of the weak classifiers further
1397 [<a href="#FHT98">FHT98</a>].</p>
1400 <p><a name="HTF01">[HTF01]</a> Hastie, T., Tibshirani, R., Friedman, J. H.
1401 The Elements of Statistical Learning:
1402 Data Mining, Inference, and Prediction. Springer Series in Statistics. 2001.</p>
1403 <p><a name="FHT98">[FHT98]</a> Friedman, J. H., Hastie, T. and Tibshirani, R. Additive Logistic
1404 Regression: a Statistical View of Boosting. Technical Report, Dept. of Statistics, Stanford
1405 University, 1998.</p>
1408 <hr><h3><a name="decl_CvBoostParams">CvBoostParams</a></h3>
1409 <p class="Blurb">Boosting training parameters</p>
1411 struct CvBoostParams : public CvDTreeParams
1416 double weight_trim_rate;
1419 CvBoostParams( int boost_type, int weak_count, double weight_trim_rate,
1420 int max_depth, bool use_surrogates, const float* priors );
1424 <dt>boost_type<dd>Boosting type, one of the following:<br>
1425 <code>CvBoost::DISCRETE</code> - Discrete AdaBoost<br>
1426 <code>CvBoost::REAL</code> - Real AdaBoost<br>
1427 <code>CvBoost::LOGIT</code> - LogitBoost<br>
1428 <code>CvBoost::GENTLE</code> - Gentle AdaBoost<br>
1429 Gentle AdaBoost and Real AdaBoost are often the preferable choices.
1430 <dt>weak_count<dd>The number of weak classifiers to build.
1431 <dt>split_criteria<dd>Splitting criteria, used to choose optimal splits during a weak tree construction:<br>
1432 <code>CvBoost::DEFAULT</code> - Use the default criteria for the particular boosting method, see below.<br>
1433 <code>CvBoost::GINI</code> - Use Gini index. This is default option for Real AdaBoost;
1434 may be also used for Discrete AdaBoost.<br>
1435 <code>CvBoost::MISCLASS</code> - Use misclassification rate.
1436 This is default option for Discrete AdaBoost;
1437 may be also used for Real AdaBoost.<br>
1438 <code>CvBoost::SQERR</code> - Use least squares criteria. This is default and
1439 the only option for LogitBoost and Gentle AdaBoost.<br>
1440 <dt>weight_trim_rate<dd>The weight trimming ratio, within 0..1. See the discussion of it above.
1441 If the parameter is ≤0 or >1, the trimming is not used, all the samples
1442 are used at each iteration. The default value is 0.95.
1445 The structure is derived from <code><a href="#decl_CvDTreeParams">CvDTreeParams</a></code>,
1446 but not all of the decision tree parameters are supported. In particular, cross-validation
1447 is not supported.</p>
1450 <hr><h3><a name="decl_CvBoostTree">CvBoostTree</a></h3>
1451 <p class="Blurb">Weak tree classifier</p>
1453 class CvBoostTree: public CvDTree
1457 virtual ~CvBoostTree();
1459 virtual bool train( CvDTreeTrainData* _train_data,
1460 const CvMat* subsample_idx, CvBoost* ensemble );
1461 virtual void scale( double s );
1462 virtual void read( CvFileStorage* fs, CvFileNode* node,
1463 CvBoost* ensemble, CvDTreeTrainData* _data );
1464 virtual void clear();
1471 <p>The weak classifier, a component of boosted tree classifier
1472 <a href="#decl_CvBoost">CvBoost</a>, is a derivative of <a href="#decl_CvDTree">CvDTree</a>.
1473 Normally, there is no need to use the weak classifiers directly,
1474 however they can be accessed as elements of sequence <code>CvBoost::weak</code>,
1475 retrieved by <a href="#decl_CvBoost_get_weak_predictors">CvBoost::get_weak_predictors</a>.
1477 Note, that in case of LogitBoost and Gentle AdaBoost each weak predictor is a regression
1478 tree, rather than a classification tree. Even in case of Discrete AdaBoost and Real AdaBoost
1479 the <code>CvBoostTree::predict</code> return value (<code>CvDTreeNode::value</code>) is not
1480 the output class label; a negative value "votes" for class #0, a positive - for class #1.
1481 And the votes are weighted. The weight of each individual tree may be increased or decreased using
1482 method <code>CvBoostTree::scale</code>.
1486 <hr><h3><a name="decl_CvBoost">CvBoost</a></h3>
1487 <p class="Blurb">Boosted tree classifier</p>
1489 class CvBoost : public CvStatModel
1493 enum { DISCRETE=0, REAL=1, LOGIT=2, GENTLE=3 };
1495 // Splitting criteria
1496 enum { DEFAULT=0, GINI=1, MISCLASS=3, SQERR=4 };
1501 CvBoost( const CvMat* _train_data, int _tflag,
1502 const CvMat* _responses, const CvMat* _var_idx=0,
1503 const CvMat* _sample_idx=0, const CvMat* _var_type=0,
1504 const CvMat* _missing_mask=0,
1505 CvBoostParams params=CvBoostParams() );
1507 virtual bool train( const CvMat* _train_data, int _tflag,
1508 const CvMat* _responses, const CvMat* _var_idx=0,
1509 const CvMat* _sample_idx=0, const CvMat* _var_type=0,
1510 const CvMat* _missing_mask=0,
1511 CvBoostParams params=CvBoostParams(),
1512 bool update=false );
1514 virtual float predict( const CvMat* _sample, const CvMat* _missing=0,
1515 CvMat* weak_responses=0, CvSlice slice=CV_WHOLE_SEQ,
1516 bool raw_mode=false ) const;
1518 virtual void prune( CvSlice slice );
1520 virtual void clear();
1522 virtual void write( CvFileStorage* storage, const char* name );
1523 virtual void read( CvFileStorage* storage, CvFileNode* node );
1525 CvSeq* get_weak_predictors();
1526 const CvBoostParams& get_params() const;
1530 virtual bool set_params( const CvBoostParams& _params );
1531 virtual void update_weights( CvBoostTree* tree );
1532 virtual void trim_weights();
1533 virtual void write_params( CvFileStorage* fs );
1534 virtual void read_params( CvFileStorage* fs, CvFileNode* node );
1536 CvDTreeTrainData* data;
1537 CvBoostParams params;
1544 <hr><h3><a name="decl_CvBoost_train">CvBoost::train</a></h3>
1545 <p class="Blurb">Trains boosted tree classifier</p>
1547 bool CvBoost::train( const CvMat* _train_data, int _tflag,
1548 const CvMat* _responses, const CvMat* _var_idx=0,
1549 const CvMat* _sample_idx=0, const CvMat* _var_type=0,
1550 const CvMat* _missing_mask=0,
1551 CvBoostParams params=CvBoostParams(),
1552 bool update=false );
1554 The train method follows the common template; the last parameter <code>update</code>
1555 specifies whether the classifier needs to be updated
1556 (i.e. the new weak tree classifiers added to the existing ensemble), or the classifier
1557 needs to be rebuilt from scratch.
1558 The responses must be categorical, i.e. boosted trees cannot be built for regression,
1559 and there should be 2 classes.
1563 <hr><h3><a name="decl_CvBoost_predict">CvBoost::predict</a></h3>
1564 <p class="Blurb">Predicts response for the input sample</p>
1566 float CvBoost::predict( const CvMat* sample, const CvMat* missing=0,
1567 CvMat* weak_responses=0, CvSlice slice=CV_WHOLE_SEQ,
1568 bool raw_mode=false ) const;
1571 <dt>sample<dd>The input sample.
1572 <dt>missing<dd>The optional mask of missing measurements. To handle
1573 missing measurements, the weak classifiers must include
1574 surrogate splits (see <code>CvDTreeParams::use_surrogates</code>).
1575 <dt>weak_responses<dd>The optional output parameter, a floating-point vector,
1576 of responses from each individual weak classifier.
1577 The number of elements in the vector must be equal to
1578 the <code>slice</code> length.
1579 <dt>slice<dd>The continuous subset of the sequence of
1580 weak classifiers to be used for prediction. By default,
1581 all the weak classifiers are used.
1582 <dt>raw_mode<dd>It has the same meaning as in <code>CvDTree::predict</code>.
1583 Normally, it should be set to false.
1586 The method <code>CvBoost::predict</code> runs the sample through the trees
1587 in the ensemble and returns the output class label based on the weighted voting.
1591 <hr><h3><a name="decl_CvBoost_prune">CvBoost::prune</a></h3>
1592 <p class="Blurb">Removes specified weak classifiers</p>
1594 void CvBoost::prune( CvSlice slice );
1596 The method removes the specified weak classifiers from the sequence. Note that
1597 this method should not be confused with the pruning of individual decision trees, which is currently not supported.
1602 <hr><h3><a name="decl_CvBoost_get_weak_predictors">CvBoost::get_weak_predictors</a></h3>
1603 <p class="Blurb">Returns the sequence of weak tree classifiers</p>
1605 CvSeq* CvBoost::get_weak_predictors();
1607 The method returns the sequence of weak classifiers.
1608 Each element of the sequence is a pointer to <code>CvBoostTree</code> class
1609 (or, probably, to some of its derivatives).
1613 <!-- *****************************************************************************************
1614 *****************************************************************************************
1615 ***************************************************************************************** -->
1616 <hr><h2><a name="ch_randomforest">Random Trees</a></h2>
1617 <p>Random trees have been introduced by Leo Breiman and
1618 Adele Cutler: <a href="http://www.stat.berkeley.edu/users/breiman/RandomForests/">
1619 http://www.stat.berkeley.edu/users/breiman/RandomForests/</a>.
1620 The algorithm can deal with both classification and regression problems.
1621 Random trees is a collection (ensemble) of <a href="#ch_dtree">tree
1622 predictors</a> that is called <b>forest</b> further in this section
1623 (the term has been also introduced by L. Breiman).
1624 The classification works as follows:
1625 the random trees classifier takes the input feature vector, classifies
1626 it with every tree in the forest, and outputs the class label that
1627 has got the majority of "votes". In case of regression the classifier response
1628 is the average of responses over all the trees in the forest.
1630 All the trees are trained with the same parameters, but on the different training sets, which
1631 are generated from the original training set using bootstrap procedure:
1632 for each training set we randomly select the same number of vectors
1633 as in the original set (<code>=N</code>). The vectors are chosen with replacement.
1634 That is, some vectors will occur more than once and some will be absent.
1635 At each node of each tree trained, not all the variables are used to find the best split,
1636 but rather a random subset of them. At each node a new subset is generated, however its
1637 size is fixed for all the nodes and all the trees. It is a training parameter,
1638 set to <code>sqrt(<number_of_variables>)</code> by default.
1639 None of the trees built is pruned.
1643 <a name="decl_RTreesOOBerror"></a>
1644 In random trees there is no need for any accuracy estimation procedures, such
1645 as cross-validation or bootstrap, or a separate test set to get an estimate of the
1646 training error. The error is estimated internally during the training.
1647 When the training set for the current tree is drawn by sampling with replacement,
1648 some vectors are left out (so-called <em>oob (out-of-bag) data</em>).
1649 The size of oob data is about <code>N/3</code>. The classification error
1650 is estimated by using this oob-data as follows:
1652 <li>Get a prediction for each vector, which is oob relatively to the i-th tree,
1653 using the very i-th tree.
1654 <li>After all the trees have been trained, for each vector that has ever been oob,
1655 find the class-"winner" for it (i.e. the class that has got the majority of votes
1656 in the trees, where the vector was oob) and compare it to the ground-truth response.
1657 <li>Then the classification error estimate is computed as ratio of
1658 number of misclassified oob vectors to all the vectors in the original data.
1659 In the case of regression the oob-error is computed as
1660 the sum of squared differences between the predicted and the true responses of the oob vectors, divided by the total number of vectors.
1666 <li><a href="http://stat-www.berkeley.edu/users/breiman/wald2002-1.pdf">
1667 Machine Learning, Wald I, July 2002</a></li>
1668 <li><a href="http://stat-www.berkeley.edu/users/breiman/wald2002-2.pdf">
1669 Looking Inside the Black Box, Wald II, July 2002
1671 <li><a href="http://stat-www.berkeley.edu/users/breiman/wald2002-3.pdf">
1672 Software for the Masses, Wald III, July 2002
1674 <li>And other articles from the web-site
1675 <a href="http://www.stat.berkeley.edu/users/breiman/RandomForests/cc_home.htm">
1676 http://www.stat.berkeley.edu/users/breiman/RandomForests/cc_home.htm</a>.</li>
1681 <hr><h3><a name="decl_CvRTParams">CvRTParams</a></h3>
1682 <p class="Blurb">Training Parameters of Random Trees</p>
1684 struct CvRTParams : public CvDTreeParams
1686 bool calc_var_importance;
1688 CvTermCriteria term_crit;
1690 CvRTParams() : CvDTreeParams( 5, 10, 0, false, 10, 0, false, false, 0 ),
1691 calc_var_importance(false), nactive_vars(0)
1693 term_crit = cvTermCriteria( CV_TERMCRIT_ITER+CV_TERMCRIT_EPS, 50, 0.1 );
1696 CvRTParams( int _max_depth, int _min_sample_count,
1697 float _regression_accuracy, bool _use_surrogates,
1698 int _max_categories, const float* _priors,
1699 bool _calc_var_importance,
1700 int _nactive_vars, int max_tree_count,
1701 float forest_accuracy, int termcrit_type );
1705 <dt>calc_var_importance<dd>If it is set, then variable importance
1706 is computed by the training procedure. To retrieve the computed variable importance array,
1707 call the method <code>CvRTrees::get_var_importance()</code>.
1708 <dt>nactive_vars<dd>The number of variables that are randomly selected at each tree
1709 node and that are used to find the best split(s).
1710 <dt>term_crit<dd>Termination criteria for growing the forest:
1711 <code>term_crit.max_iter</code> is the maximum number of trees in the forest
1712 (see also the <code>max_tree_count</code> parameter of the constructor,
1713 by default it is set to 50)<br>
1714 <code>term_crit.epsilon</code> is the sufficient accuracy
1715 (<a href="#decl_RTreesOOBerror">OOB error</a>).
1718 The set of training parameters for the forest is the superset of the training parameters for
1719 a single tree. However, Random trees do not need all the functionality/features of decision trees,
1720 most noticeably, the trees are not pruned, so the cross-validation parameters are not used.
1723 <hr><h3><a name="decl_CvRTrees">CvRTrees</a></h3>
1724 <p class="Blurb">Random Trees</p>
1726 class CvRTrees : public CvStatModel
1730 virtual ~CvRTrees();
1731 virtual bool train( const CvMat* _train_data, int _tflag,
1732 const CvMat* _responses, const CvMat* _var_idx=0,
1733 const CvMat* _sample_idx=0, const CvMat* _var_type=0,
1734 const CvMat* _missing_mask=0,
1735 CvRTParams params=CvRTParams() );
1736 virtual float predict( const CvMat* sample, const CvMat* missing = 0 ) const;
1737 virtual void clear();
1739 virtual const CvMat* get_var_importance();
1740 virtual float get_proximity( const CvMat* sample_1, const CvMat* sample_2 ) const;
1742 virtual void read( CvFileStorage* fs, CvFileNode* node );
1743 virtual void write( CvFileStorage* fs, const char* name );
1745 CvMat* get_active_var_mask();
1748 int get_tree_count() const;
1749 CvForestTree* get_tree(int i) const;
1753 bool grow_forest( const CvTermCriteria term_crit );
1755 // array of the trees of the forest
1756 CvForestTree** trees;
1757 CvDTreeTrainData* data;
1765 <hr><h3><a name="decl_CvRTrees_train">CvRTrees::train</a></h3>
1766 <p class="Blurb">Trains Random Trees model</p>
1768 bool CvRTrees::train( const CvMat* train_data, int tflag,
1769 const CvMat* responses, const CvMat* comp_idx=0,
1770 const CvMat* sample_idx=0, const CvMat* var_type=0,
1771 const CvMat* missing_mask=0,
1772 CvRTParams params=CvRTParams() );
1774 <p>The method <code>CvRTrees::train</code> is very similar to the first form
1775 of <a href="#decl_CvDTree_train">CvDTree::train</a>() and follows
1776 the generic method <a href="#decl_CvStatModel_train">CvStatModel::train</a> conventions.
1777 All the specific to the algorithm training parameters are passed as
1778 <a href="#decl_CvRTParams">CvRTParams</a> instance.
1779 The estimate of the training error (<a href="#decl_RTreesOOBerror">oob-error</a>)
1780 is stored in the protected class member <code>oob_error</code>.</p>
1783 <hr><h3><a name="decl_CvRTrees_predict">CvRTrees::predict</a></h3>
1784 <p class="Blurb">Predicts the output for the input sample</p>
1786 float CvRTrees::predict( const CvMat* sample, const CvMat* missing=0 ) const;
1789 The input parameters of the prediction method are the same as in
1790 <a href="#decl_CvDTree_predict">CvDTree::predict</a>, but the return value type is different.
1791 This method returns the cumulative result from all the trees in the forest
1792 (the class that receives the majority of votes, or the mean of the regression function estimates).
1796 <hr><h3><a name="decl_CvRTrees_get_var_importance">CvRTrees::get_var_importance</a></h3>
1797 <p class="Blurb">Retrieves the variable importance array</p>
1799 const CvMat* CvRTrees::get_var_importance() const;
1801 <p>The method returns the variable importance vector, computed at the training stage
1802 when <code><a href="#decl_CvRTParams">CvRTParams</a>::calc_var_importance</code>
1803 is set. If the training flag is not set, then the <code>NULL</code> pointer is returned.
1804 This is unlike decision trees, where variable importance can be computed anytime after the training.
1809 <hr><h3><a name="decl_CvRTrees_get_proximity">CvRTrees::get_proximity</a></h3>
1810 <p class="Blurb">Retrieves proximity measure between two training samples</p>
1812 float CvRTrees::get_proximity( const CvMat* sample_1, const CvMat* sample_2 ) const;
1814 <p>The method returns proximity measure between any two samples
1815 (the ratio of those trees in the ensemble, in which the samples fall into
1816 the same leaf node, to the total number of the trees).</p>
1818 <hr><h4>Example. Prediction of mushroom goodness using random trees classifier</h4>
1820 #include <float.h>
1821 #include <stdio.h>
1822 #include <ctype.h>
1827 CvStatModel* cls = NULL;
1828 CvFileStorage* storage = cvOpenFileStorage( "Mushroom.xml", NULL,CV_STORAGE_READ );
1829 CvMat* data = (CvMat*)cvReadByName(storage, NULL, "sample", 0 );
1830 CvMat train_data, test_data;
1832 CvMat* missed = NULL;
1833 CvMat* comp_idx = NULL;
1834 CvMat* sample_idx = NULL;
1835 CvMat* type_mask = NULL;
1838 CvRTreesParams params;
1839 CvTreeClassifierTrainParams cart_params;
1840 const int ntrain_samples = 1000;
1841 const int ntest_samples = 1000;
1842 const int nvars = 23;
1844 if(data == NULL || data->cols != nvars)
1846 puts("Error in source data");
1850 cvGetSubRect( data, &train_data, cvRect(0, 0, nvars, ntrain_samples) );
1851 cvGetSubRect( data, &test_data, cvRect(0, ntrain_samples, nvars,
1852 ntrain_samples + ntest_samples) );
1855 cvGetCol( &train_data, &response, resp_col);
1857 /* create missed variable matrix */
1858 missed = cvCreateMat(train_data.rows, train_data.cols, CV_8UC1);
1859 for( i = 0; i < train_data.rows; i++ )
1860 for( j = 0; j < train_data.cols; j++ )
1861 CV_MAT_ELEM(*missed,uchar,i,j) = (uchar)(CV_MAT_ELEM(train_data,float,i,j) < 0);
1863 /* create comp_idx vector */
1864 comp_idx = cvCreateMat(1, train_data.cols-1, CV_32SC1);
1865 for( i = 0; i < train_data.cols; i++ )
1867 if(i<resp_col)CV_MAT_ELEM(*comp_idx,int,0,i) = i;
1868 if(i>resp_col)CV_MAT_ELEM(*comp_idx,int,0,i-1) = i;
1871 /* create sample_idx vector */
1872 sample_idx = cvCreateMat(1, train_data.rows, CV_32SC1);
1873 for( j = i = 0; i < train_data.rows; i++ )
1875 if(CV_MAT_ELEM(response,float,i,0) < 0) continue;
1876 CV_MAT_ELEM(*sample_idx,int,0,j) = i;
1879 sample_idx->cols = j;
1881 /* create type mask */
1882 type_mask = cvCreateMat(1, train_data.cols+1, CV_8UC1);
1883 cvSet( type_mask, cvRealScalar(CV_VAR_CATEGORICAL), 0);
1885 // initialize training parameters
1886 cvSetDefaultParamTreeClassifier((CvStatModelParams*)&cart_params);
1887 cart_params.wrong_feature_as_unknown = 1;
1888 params.tree_params = &cart_params;
1889 params.term_crit.max_iter = 50;
1890 params.term_crit.epsilon = 0.1;
1891 params.term_crit.type = CV_TERMCRIT_ITER|CV_TERMCRIT_EPS;
1893 puts("Random forest results");
1894 cls = cvCreateRTreesClassifier( &train_data, CV_ROW_SAMPLE, &response,
1895 (CvStatModelParams*)& params, comp_idx, sample_idx, type_mask, missed );
1898 CvMat sample = cvMat( 1, nvars, CV_32FC1, test_data.data.fl );
1900 int wrong = 0, total = 0;
1901 cvGetCol( &test_data, &test_resp, resp_col);
1902 for( i = 0; i < ntest_samples; i++, sample.data.fl += nvars )
1904 if( CV_MAT_ELEM(test_resp,float,i,0) >= 0 )
1906 float resp = cls->predict( cls, &sample, NULL );
1907 wrong += (fabs(resp-response.data.fl[i]) > 1e-3 ) ? 1 : 0;
1911 printf( "Test set error = %.2f\n", wrong*100.f/(float)total );
1914 puts("Error forest creation");
1917 cvReleaseMat(&missed);
1918 cvReleaseMat(&sample_idx);
1919 cvReleaseMat(&comp_idx);
1920 cvReleaseMat(&type_mask);
1921 cvReleaseMat(&data);
1922 cvReleaseStatModel(&cls);
1923 cvReleaseFileStorage(&storage);
1929 <!-- *****************************************************************************************
1930 *****************************************************************************************
1931 ***************************************************************************************** -->
1932 <hr><h2><a name="ch_em">Expectation-Maximization</a></h2>
1933 The EM (Expectation-Maximization) algorithm estimates the parameters of the
1934 multivariate probability density function in a form of the Gaussian mixture
1935 distribution with a specified number of mixtures.</p>
1937 Consider the set of the feature vectors {x<sub>1</sub>, x<sub>2</sub>,..., x<sub>N</sub>}:
1938 <code>N</code> vectors from <code>d</code>-dimensional Euclidean space drawn from a Gaussian mixture:
1939 <p align="center"><img src="pics/em1.png"></p>
1940 where <code>m</code> is the number of mixtures, p<sub>k</sub>
1941 is the normal distribution density with the mean <code>a<sub>k</sub></code>
1942 and covariance matrix <code>S<sub>k</sub></code>, <code>π<sub>k</sub></code> is the weight of k-th mixture.
1943 Given the number of mixtures <code>m</code> and the samples <code>{x<sub>i</sub>, i=1..N}</code>
1944 the algorithm finds the <a name="def_EM_MLE">maximum-likelihood estimates (MLE)</a>
1945 of the all the mixture parameters, i.e. <code>a<sub>k</sub></code>, <code>S<sub>k</sub></code> and
1946 <code>π<sub>k</sub></code>:
1947 <p align="center"><img src="pics/em3.png"></p>
1948 EM algorithm is an iterative procedure. Each iteration of it includes two steps.
1949 At the first step (Expectation-step, or E-step), we find a probability <code>p<sub>i,k</sub></code>
1950 (denoted <code>α<sub>i,k</sub></code> in the formula below)
1951 of sample <code>#i</code> to belong to mixture <code>#k</code>
1952 using the currently available mixture parameter estimates:
1953 <p align="center"><img src="pics/em4.png"></p>
1954 At the second step (Maximization-step, or M-step) the mixture parameter estimates
1955 are refined using the computed probabilities:
1956 <p align="center"><img src="pics/em5.png"></p>
1957 Alternatively, the algorithm may start with M-step when initial values for <code>p<sub>i,k</sub></code>
1958 can be provided. Another alternative, when <code>p<sub>i,k</sub></code> are unknown,
1959 is to use a simpler clustering algorithm to pre-cluster the input samples and thus obtain
1960 initial <code>p<sub>i,k</sub></code>. Often (and in ML)
1961 <a href="opencvref_cxcore.htm#decl_cvKMeans2">k-means</a> algorithm is used for that purpose.
1962 <p>One of the main problems that the EM algorithm should deal with is the large number of parameters to estimate.
1963 The majority of the parameters sits in covariation matrices, which are <code>d×d</code> elements each
1964 (where <code>d</code> is the feature space dimensionality). However, in many practical problems
1965 the covariation matrices are close to diagonal, or even to <code>μ<sub>k</sub>*I</code>,
1966 where <code>I</code> is identity matrix and <code>μ<sub>k</sub></code> is mixture-dependent "scale" parameter.
1967 So a robust computation scheme could be to start with the harder constraints on
1968 the covariation matrices and then use the estimated parameters as an input for
1969 a less constrained optimization problem (often a diagonal covariation matrix is already a good enough approximation).
1972 <p><b>References:</b><br>
1974 <b><a name="EM_paper">
1975 [Bilmes98] J. A. Bilmes. A Gentle Tutorial of the EM Algorithm and its Application to Parameter
1976 Estimation for Gaussian Mixture and Hidden Markov Models.
1977 Technical Report TR-97-021,
1978 International Computer Science Institute and Computer Science Division,
1979 University of California at Berkeley, April 1998.</a></b><br>
1982 <hr><h3><a name="decl_CvEMParams">CvEMParams</a></h3>
1983 <p class="Blurb">Parameters of EM algorithm</p>
1987 CvEMParams() : nclusters(10), cov_mat_type(CvEM::COV_MAT_DIAGONAL),
1988 start_step(CvEM::START_AUTO_STEP), probs(0), weights(0), means(0), covs(0)
1990 term_crit=cvTermCriteria( CV_TERMCRIT_ITER+CV_TERMCRIT_EPS, 100, FLT_EPSILON );
1993 CvEMParams( int _nclusters, int _cov_mat_type=1/*CvEM::COV_MAT_DIAGONAL*/,
1994 int _start_step=0/*CvEM::START_AUTO_STEP*/,
1995 CvTermCriteria _term_crit=cvTermCriteria(CV_TERMCRIT_ITER+CV_TERMCRIT_EPS, 100, FLT_EPSILON),
1996 CvMat* _probs=0, CvMat* _weights=0, CvMat* _means=0, CvMat** _covs=0 ) :
1997 nclusters(_nclusters), cov_mat_type(_cov_mat_type), start_step(_start_step),
1998 probs(_probs), weights(_weights), means(_means), covs(_covs), term_crit(_term_crit)
2005 const CvMat* weights;
2008 CvTermCriteria term_crit;
2011 <dt>nclusters<dd>The number of mixtures. Some of EM implementation could determine the optimal
2012 number of mixtures within a specified value range, but that is not the case in ML yet.
2013 <dt>cov_mat_type<dd>The type of the mixture covariation matrices; should be one of the following:<br>
2014 <code>CvEM::COV_MAT_GENERIC</code> - a covariation matrix of each mixture may
2015 be an arbitrary symmetric positive-definite matrix,
2016 so the number of free parameters in each matrix
2017 is about <code>d</code><sup>2</sup>/2. It is not recommended to use this option,
2018 unless there is pretty accurate initial estimation of the parameters and/or
2019 a huge number of training samples.<br>
2020 <code>CvEM::COV_MAT_DIAGONAL</code> - a covariation matrix of each mixture may
2021 be arbitrary diagonal matrix with positive diagonal elements, that is, non-diagonal
2022 elements are forced to be 0's, so the number of free parameters is <code>d</code>
2023 for each matrix. This is the most commonly used option yielding good estimation results.<br>
2024 <code>CvEM::COV_MAT_SPHERICAL</code> - a covariation matrix of each mixture is a scaled
2025 identity matrix, <code>μ<sub>k</sub>*I</code>, so the only parameter to be
2026 estimated is <code>μ<sub>k</sub></code>. The option may be used in special cases,
2027 when the constraint is relevant, or as a first step in the optimization (e.g.
2028 in case when the data is preprocessed with
2029 <a href="opencvref_cxcore.htm#decl_cvCalcPCA">PCA</a>).
2030 The results of such preliminary estimation may be passed again to the optimization
2031 procedure, this time with <code>cov_mat_type=CvEM::COV_MAT_DIAGONAL</code>.
2032 <dt>start_step<dd>The initial step the algorithm starts from; should be one of the following:<br>
2033 <code>CvEM::START_E_STEP</code> - the algorithm starts with E-step. At least,
2034 the initial values of mean vectors, <code>CvEMParams::means</code> must be
2035 passed. Optionally, the user may also provide initial values for weights
2036 (<code>CvEMParams::weights</code>) and/or covariation matrices
2037 (<code>CvEMParams::covs</code>).<br>
2038 <code>CvEM::START_M_STEP</code> - the algorithm starts with M-step.
2039 The initial probabilities <code>p<sub>i,k</sub></code> must be provided.<br>
2040 <code>CvEM::START_AUTO_STEP</code> - No values are required from the user;
2042 the k-means algorithm is used to estimate the initial mixture parameters.
2043 <dt>term_crit<dd>Termination criteria of the procedure. EM algorithm stops either after a certain
2044 number of iterations (<code>term_crit.max_iter</code>),
2045 or when the parameters change too little (no more than <code>term_crit.epsilon</code>)
2046 from iteration to iteration.
2047 <dt>probs<dd>Initial probabilities <code>p<sub>i,k</sub></code>; are used (and must be not NULL) only when <code>start_step=CvEM::START_M_STEP</code>.
2048 <dt>weights<dd>Initial mixture weights <code>π<sub>k</sub></code>; are used (if not NULL) only when <code>start_step=CvEM::START_E_STEP</code>.
2049 <dt>covs<dd>Initial mixture covariation matrices <code>S<sub>k</sub></code>; are used (if not NULL) only when <code>start_step=CvEM::START_E_STEP</code>.
2050 <dt>means<dd>Initial mixture means <code>a<sub>k</sub></code>; are used (and must be not NULL) only when <code>start_step=CvEM::START_E_STEP</code>.
2052 <p>The structure has 2 constructors: the default one represents a rough rule-of-thumb,
2053 with another one it is possible to override a variety of parameters,
2054 from a single number of mixtures (the only essential problem-dependent parameter),
2055 to the initial values for the mixture parameters.</p>
2058 <hr><h3><a name="decl_CvEM">CvEM</a></h3>
2059 <p class="Blurb">EM model</p>
2061 class CV_EXPORTS CvEM : public CvStatModel
2064 // Type of covariation matrices
2065 enum { COV_MAT_SPHERICAL=0, COV_MAT_DIAGONAL=1, COV_MAT_GENERIC=2 };
2068 enum { START_E_STEP=1, START_M_STEP=2, START_AUTO_STEP=0 };
2071 CvEM( const CvMat* samples, const CvMat* sample_idx=0,
2072 CvEMParams params=CvEMParams(), CvMat* labels=0 );
2075 virtual bool train( const CvMat* samples, const CvMat* sample_idx=0,
2076 CvEMParams params=CvEMParams(), CvMat* labels=0 );
2078 virtual float predict( const CvMat* sample, CvMat* probs ) const;
2079 virtual void clear();
2081 int get_nclusters() const { return params.nclusters; }
2082 const CvMat* get_means() const { return means; }
2083 const CvMat** get_covs() const { return covs; }
2084 const CvMat* get_weights() const { return weights; }
2085 const CvMat* get_probs() const { return probs; }
2089 virtual void set_params( const CvEMParams& params,
2090 const CvVectors& train_data );
2091 virtual void init_em( const CvVectors& train_data );
2092 virtual double run_em( const CvVectors& train_data );
2093 virtual void init_auto( const CvVectors& samples );
2094 virtual void kmeans( const CvVectors& train_data, int nclusters,
2095 CvMat* labels, CvTermCriteria criteria,
2096 const CvMat* means );
2098 double log_likelihood;
2105 CvMat* log_weight_div_det;
2106 CvMat* inv_eigen_values;
2107 CvMat** cov_rotate_mats;
2111 <hr><h3><a name="decl_CvEM_train">CvEM::train</a></h3>
2112 <p class="Blurb">Estimates Gaussian mixture parameters from the sample set</p>
2114 bool CvEM::train( const CvMat* samples, const CvMat* sample_idx=0,
2115 CvEMParams params=CvEMParams(), CvMat* labels=0 );
2117 <p>Unlike many of ML models, EM is an unsupervised learning algorithm and it does not
2118 take responses (class labels or the function values) on input. Instead, it computes
2119 <a href="#def_EM_MLE">MLE</a> of Gaussian mixture parameters from the input sample set,
2120 stores all the parameters inside the structure: <code>p<sub>i,k</sub></code> in <code>probs</code>,
2121 <code>a<sub>k</sub></code> in <code>means</code>, <code>S<sub>k</sub></code> in <code>covs[k]</code>,
2122 <code>π<sub>k</sub></code> in <code>weights</code> and
2123 optionally computes the output "class label" for each sample:
2124 <code>labels<sub>i</sub>=arg max<sub>k</sub>(p<sub>i,k</sub>), i=1..N</code>
2125 (i.e. indices of the most-probable mixture for each sample).</p>
2126 <p>The trained model can be used further for prediction, just like any other classifier.
2127 The model trained is similar to the <a href="#ch_nbayes">normal Bayes classifier</a>.</p>
2129 <h4>Example. Clustering random samples of multi-Gaussian distribution using EM</h4>
2132 #include "highgui.h"
2134 int main( int argc, char** argv )
2137 const int N1 = (int)sqrt((double)N);
2138 const CvScalar colors[] = {{{0,0,255}},{{0,255,0}},{{0,255,255}},{{255,255,0}}};
2141 CvRNG rng_state = cvRNG(-1);
2142 CvMat* samples = cvCreateMat( nsamples, 2, CV_32FC1 );
2143 CvMat* labels = cvCreateMat( nsamples, 1, CV_32SC1 );
2144 IplImage* img = cvCreateImage( cvSize( 500, 500 ), 8, 3 );
2146 CvMat sample = cvMat( 1, 2, CV_32FC1, _sample );
2151 cvReshape( samples, samples, 2, 0 );
2152 for( i = 0; i < N; i++ )
2154 CvScalar mean, sigma;
2156 // form the training samples
2157 cvGetRows( samples, &samples_part, i*nsamples/N, (i+1)*nsamples/N );
2158 mean = cvScalar(((i%N1)+1.)*img->width/(N1+1), ((i/N1)+1.)*img->height/(N1+1));
2159 sigma = cvScalar(30,30);
2160 cvRandArr( &rng_state, &samples_part, CV_RAND_NORMAL, mean, sigma );
2162 cvReshape( samples, samples, 1, 0 );
2164 // initialize model's parameters
2166 params.means = NULL;
2167 params.weights = NULL;
2168 params.probs = NULL;
2169 params.nclusters = N;
2170 params.cov_mat_type = CvEM::COV_MAT_SPHERICAL;
2171 params.start_step = CvEM::START_AUTO_STEP;
2172 params.term_crit.max_iter = 10;
2173 params.term_crit.epsilon = 0.1;
2174 params.term_crit.type = CV_TERMCRIT_ITER|CV_TERMCRIT_EPS;
2177 em_model.train( samples, 0, params, labels );
2180 // the piece of code shows how to repeatedly optimize the model
2181 // with less-constrained parameters (COV_MAT_DIAGONAL instead of COV_MAT_SPHERICAL)
2182 // when the output of the first stage is used as input for the second.
2184 params.cov_mat_type = CvEM::COV_MAT_DIAGONAL;
2185 params.start_step = CvEM::START_E_STEP;
2186 params.means = em_model.get_means();
2187 params.covs = (const CvMat**)em_model.get_covs();
2188 params.weights = em_model.get_weights();
2190 em_model2.train( samples, 0, params, labels );
2191 // to use em_model2, replace em_model.predict() with em_model2.predict() below
2193 // classify every image pixel
2195 for( i = 0; i < img->height; i++ )
2197 for( j = 0; j < img->width; j++ )
2199 CvPoint pt = cvPoint(j, i);
2200 sample.data.fl[0] = (float)j;
2201 sample.data.fl[1] = (float)i;
2202 int response = cvRound(em_model.predict( &sample, NULL ));
2203 CvScalar c = colors[response];
2205 cvCircle( img, pt, 1, cvScalar(c.val[0]*0.75,c.val[1]*0.75,c.val[2]*0.75), CV_FILLED );
2209 //draw the clustered samples
2210 for( i = 0; i < nsamples; i++ )
2213 pt.x = cvRound(samples->data.fl[i*2]);
2214 pt.y = cvRound(samples->data.fl[i*2+1]);
2215 cvCircle( img, pt, 1, colors[labels->data.i[i]], CV_FILLED );
2218 cvNamedWindow( "EM-clustering result", 1 );
2219 cvShowImage( "EM-clustering result", img );
2222 cvReleaseMat( &samples );
2223 cvReleaseMat( &labels );
2229 <!-- *****************************************************************************************
2230 *****************************************************************************************
2231 ***************************************************************************************** -->
2232 <hr><h2><a name="ch_ann">Neural Networks</a></h2>
2234 <p>ML implements feed-forward artificial neural networks, more particularly, multi-layer
2235 perceptrons (MLP), the most commonly used type of neural networks.
2237 MLP consists of the input layer, output layer and one or more
2238 hidden layers. Each layer of MLP includes one or more neurons that are
2239 directionally linked with the neurons from the previous and the next layer.
2240 Here is an example of 3-layer perceptron with 3 inputs, 2 outputs and the hidden layer
2241 including 5 neurons:</p>
2242 <p align="center"><img src="pics/mlp_.png"></p>
2244 All the neurons in MLP are similar. Each of them has several input links (i.e.
2245 it takes the output values from several neurons in the previous layer on input) and
2246 several output links (i.e. it passes the response to several neurons in the next layer).
2247 The values retrieved from the previous layer are summed with certain weights, individual for each
2248 neuron, plus the bias term, and the sum is transformed using the activation function <code>f</code> that may be also
2249 different for different neurons. Here is the picture:
2250 <p align="center"><img src="pics/neuron_model.png"></p>
2251 In other words, given the outputs <code>{x<sub>j</sub>}</code> of the layer <code>n</code>,
2252 the outputs <code>{y<sub>i</sub>}</code> of the layer <code>n+1</code> are computed as:
2254 u<sub>i</sub>=sum<sub>j</sub>(w<sup>(n+1)</sup><sub>i,j</sub>*x<sub>j</sub>) + w<sup>(n+1)</sup><sub>i,bias</sub>
2255 y<sub>i</sub>=f(u<sub>i</sub>)
2257 <p>Different activation functions may be used; ML implements 3 standard ones:
2259 <li>Identity function (<code>CvANN_MLP::IDENTITY</code>): <code>f(x)=x</code>
2260 <li>Symmetrical sigmoid (<code>CvANN_MLP::SIGMOID_SYM</code>):
2261 <code>f(x)=β*(1-e<sup>-αx</sup>)/(1+e<sup>-αx</sup>)</code>,
2262 the default choice for MLP; the standard sigmoid with β=1, α=1 is shown below:
2263 <p align="center"><img src="pics/sigmoid_bipolar.png"></p>
2264 <li>Gaussian function (<code>CvANN_MLP::GAUSSIAN</code>):
2265 <code>f(x)=βe<sup>-αx*x</sup></code>,
2266 not completely supported at the moment.
2268 <p>In ML all the neurons have the same activation functions, with the same free parameters
2269 (α, β) that are specified by user and are not altered by the training algorithms.
2271 So the whole trained network works as follows. It takes the feature vector as input; the vector size
2272 is equal to the size of the input layer. Then the values are passed as input to the first hidden layer,
2273 the outputs of the hidden layer are computed using the weights and the activation functions and passed
2274 further downstream, until we compute the output layer.</p>
2275 <p>So, in order to compute the network one needs to know all the weights <code>w<sup>(n+1)</sup><sub>i,j</sub></code>.
2276 The weights are computed by the training algorithm. The algorithm takes a training set: multiple
2277 input vectors with the corresponding output vectors, and iteratively adjusts the weights to try to make
2278 the network give the desired response on the provided input vectors.</p>
2280 The larger the network size (the number of hidden layers and their sizes), the greater the potential
2281 network flexibility, and the error on the training set could be made arbitrarily small. But at the same
2282 time the learned network will also "learn" the noise present in the training set, so the error
2283 on the test set usually starts increasing after the network size reaches some limit.
2284 Besides, larger networks take much longer to train than smaller ones, so it is reasonable to preprocess
2285 the data (using <a href="opencvref_cxcore.htm#decl_cvCalcPCA">PCA</a> or similar technique) and
2286 train a smaller network on only the essential features.</p>
2287 <p>Another feature of the MLP's is their inability to handle categorical data as is, however
2288 there is a workaround. If a certain
2289 feature in the input or output (i.e. in case of <code>n</code>-class classifier for <code>n>2</code>)
2290 layer is categorical and can take <code>M</code> (>2) different values,
2291 it makes sense to represent it as binary tuple of <code>M</code> elements, where <code>i</code>-th
2292 element is <code>1</code> if and only if the feature is equal to the <code>i</code>-th value out of
2293 <code>M</code> possible. It will increase the size of the input/output layer, but will speedup the
2294 training algorithm convergence and at the same time enable "fuzzy" values of such variables, i.e.
2295 a tuple of probabilities instead of a fixed value.</p>
2296 <p>ML implements 2 algorithms for training MLP's. The first is the classical random sequential
2297 <a href="#paper_backprop">back-propagation algorithm</a> and the second (default one) is
2298 batch <a href="#paper_rprop">RPROP algorithm</a></p>
2300 <p><b>References:<br>
2302 <li><a name="paper_backprop" href="http://en.wikipedia.org/wiki/Backpropagation">http://en.wikipedia.org/wiki/Backpropagation</a>.
2303 Wikipedia article about the back-propagation algorithm.
2304 <li>Y. LeCun, L. Bottou, G.B. Orr and K.-R. Muller, "Efficient backprop", in Neural Networks---Tricks of the Trade, Springer Lecture Notes in Computer Sciences 1524, pp.5-50, 1998.
2305 <li><a name="paper_rprop">M. Riedmiller and H. Braun, "A Direct Adaptive Method for Faster Backpropagation Learning: The RPROP Algorithm", Proc. ICNN, San Francisco (1993).</a></b>
2309 <hr><h3><a name="decl_CvANN_MLP_TrainParams">CvANN_MLP_TrainParams</a></h3>
2310 <p class="Blurb">Parameters of MLP training algorithm</p>
2312 struct CvANN_MLP_TrainParams
2314 CvANN_MLP_TrainParams();
2315 CvANN_MLP_TrainParams( CvTermCriteria term_crit, int train_method,
2316 double param1, double param2=0 );
2317 ~CvANN_MLP_TrainParams();
2319 enum { BACKPROP=0, RPROP=1 };
2321 CvTermCriteria term_crit;
2324 // backpropagation parameters
2325 double bp_dw_scale, bp_moment_scale;
2328 double rp_dw0, rp_dw_plus, rp_dw_minus, rp_dw_min, rp_dw_max;
2331 <dt>term_crit<dd>The termination criteria for the training algorithm.
2332 It identifies how many iterations are done by the algorithm
2333 (for sequential backpropagation algorithm the number is multiplied
2334 by the size of the training set) and how much the weights could
2335 change between the iterations to make the algorithm continue.
2336 <dt>train_method<dd>The training algorithm to use;
2337 can be one of <code>CvANN_MLP_TrainParams::BACKPROP</code> (sequential
2338 backpropagation algorithm) or <code>CvANN_MLP_TrainParams::RPROP</code>
2339 (RPROP algorithm, default value).
2340 <dt>bp_dw_scale<dd>(Backpropagation only): The coefficient to multiply the computed weight gradient by.
2341 The recommended value is about <code>0.1</code>. The parameter
2342 can be set via <code>param1</code> of the constructor.
2343 <dt>bp_moment_scale<dd>(Backpropagation only): The coefficient to multiply the difference
2344 between weights on the 2 previous iterations. This parameter
2345 provides some inertia to smooth
2346 the random fluctuations of the weights. It can vary from <code>0</code> (the feature is disabled)
2347 to <code>1</code> and beyond. The value <code>0.1</code> or so is good enough.
2348 The parameter can be set via <code>param2</code> of the constructor.
2349 <dt>rp_dw0<dd>(RPROP only): Initial magnitude of the weight delta. The default value is <code>0.1</code>.
2350 This parameter can be set via <code>param1</code> of the constructor.
2351 <dt>rp_dw_plus<dd>(RPROP only): The increase factor for the weight delta. It must be >1,
2352 default value is <code>1.2</code> that should work well in most cases, according to
2353 the algorithm's author. The parameter can only be changed explicitly by modifying the structure member.
2354 <dt>rp_dw_minus<dd>(RPROP only): The decrease factor for the weight delta. It must be &lt;1,
2355 default value is <code>0.5</code> that should work well in most cases, according to
2356 the algorithm's author. The parameter can only be changed explicitly by modifying the structure member.
2357 <dt>rp_dw_min<dd>(RPROP only): The minimum value of the weight delta.
2358 It must be >0, the default value is <code>FLT_EPSILON</code>.
2359 The parameter can be set via <code>param2</code> of the constructor.
2360 <dt>rp_dw_max<dd>(RPROP only): The maximum value of the weight delta.
2361 It must be >1, the default value is <code>50</code>.
2362 The parameter can only be changed explicitly by modifying the structure member.
2364 <p>The structure has a default constructor that initializes parameters for the <code>RPROP</code> algorithm.
2365 There is also a more advanced constructor to customize the parameters and/or choose the backpropagation algorithm.
2366 Finally, the individual parameters can be adjusted after the structure is created.</p>
2369 <hr><h3><a name="decl_CvANN_MLP">CvANN_MLP</a></h3>
2370 <p class="Blurb">MLP model</p>
2372 class CvANN_MLP : public CvStatModel
2376 CvANN_MLP( const CvMat* _layer_sizes,
2377 int _activ_func=SIGMOID_SYM,
2378 double _f_param1=0, double _f_param2=0 );
2380 virtual ~CvANN_MLP();
2382 virtual void create( const CvMat* _layer_sizes,
2383 int _activ_func=SIGMOID_SYM,
2384 double _f_param1=0, double _f_param2=0 );
2386 virtual int train( const CvMat* _inputs, const CvMat* _outputs,
2387 const CvMat* _sample_weights, const CvMat* _sample_idx=0,
2388 CvANN_MLP_TrainParams _params = CvANN_MLP_TrainParams(),
2390 virtual float predict( const CvMat* _inputs,
2391 CvMat* _outputs ) const;
2393 virtual void clear();
2395 // possible activation functions
2396 enum { IDENTITY = 0, SIGMOID_SYM = 1, GAUSSIAN = 2 };
2398 // available training flags
2399 enum { UPDATE_WEIGHTS = 1, NO_INPUT_SCALE = 2, NO_OUTPUT_SCALE = 4 };
2401 virtual void read( CvFileStorage* fs, CvFileNode* node );
2402 virtual void write( CvFileStorage* storage, const char* name );
2404 int get_layer_count() { return layer_sizes ? layer_sizes->cols : 0; }
2405 const CvMat* get_layer_sizes() { return layer_sizes; }
2409 virtual bool prepare_to_train( const CvMat* _inputs, const CvMat* _outputs,
2410 const CvMat* _sample_weights, const CvMat* _sample_idx,
2411 CvANN_MLP_TrainParams _params,
2412 CvVectors* _ivecs, CvVectors* _ovecs, double** _sw, int _flags );
2414 // sequential random backpropagation
2415 virtual int train_backprop( CvVectors _ivecs, CvVectors _ovecs, const double* _sw );
2418 virtual int train_rprop( CvVectors _ivecs, CvVectors _ovecs, const double* _sw );
2420 virtual void calc_activ_func( CvMat* xf, const double* bias ) const;
2421 virtual void calc_activ_func_deriv( CvMat* xf, CvMat* deriv, const double* bias ) const;
2422 virtual void set_activ_func( int _activ_func=SIGMOID_SYM,
2423 double _f_param1=0, double _f_param2=0 );
2424 virtual void init_weights();
2425 virtual void scale_input( const CvMat* _src, CvMat* _dst ) const;
2426 virtual void scale_output( const CvMat* _src, CvMat* _dst ) const;
2427 virtual void calc_input_scale( const CvVectors* vecs, int flags );
2428 virtual void calc_output_scale( const CvVectors* vecs, int flags );
2430 virtual void write_params( CvFileStorage* fs );
2431 virtual void read_params( CvFileStorage* fs, CvFileNode* node );
2435 CvMat* sample_weights;
2437 double f_param1, f_param2;
2438 double min_val, max_val, min_val1, max_val1;
2440 int max_count, max_buf_sz;
2441 CvANN_MLP_TrainParams params;
2445 <p>Unlike many other models in ML that are constructed and trained at once,
2446 in the MLP model these steps are separated. First, a network with the specified topology
2447 is created using the non-default constructor or the method <a href="#decl_CvANN_MLP_create">create</a>.
2448 All the weights are set to zeros.
2449 Then the network is trained using the set of input and output vectors.
2450 The training procedure can be repeated more than once, i.e. the weights can be adjusted
2451 based on the new training data.</p>
2453 <hr><h3><a name="decl_CvANN_MLP_create">CvANN_MLP::create</a></h3>
2454 <p class="Blurb">Constructs the MLP with the specified topology</p>
2456 void CvANN_MLP::create( const CvMat* _layer_sizes,
2457 int _activ_func=SIGMOID_SYM,
2458 double _f_param1=0, double _f_param2=0 );
2460 <dt>_layer_sizes<dd>The integer vector specifies the number of neurons in each
2461 layer including the input and output layers.
2462 <dt>_activ_func<dd>Specifies the activation function for each neuron; one of
2463 <code>CvANN_MLP::IDENTITY</code>, <code>CvANN_MLP::SIGMOID_SYM</code>
2464 and <code>CvANN_MLP::GAUSSIAN</code>.
2465 <dt>_f_param1, _f_param2<dd>Free parameters of the activation function, α and β,
2466 respectively. See the formulas in the introduction section.
2468 <p>The method creates an MLP network with the specified topology and assigns the same activation
2469 function to all the neurons.</p>
2472 <hr><h3><a name="decl_CvANN_MLP_train">CvANN_MLP::train</a></h3>
2473 <p class="Blurb">Trains/updates MLP</p>
2475 int CvANN_MLP::train( const CvMat* _inputs, const CvMat* _outputs,
2476 const CvMat* _sample_weights, const CvMat* _sample_idx=0,
2477 CvANN_MLP_TrainParams _params = CvANN_MLP_TrainParams(),
2481 <dt>_inputs<dd>A floating-point matrix of input vectors, one vector per row.
2482 <dt>_outputs<dd>A floating-point matrix of the corresponding output vectors, one vector per row.
2483 <dt>_sample_weights<dd>(RPROP only) The optional floating-point vector of weights for each sample.
2484 Some samples may be more important than others for training, e.g. user
2485 may want to increase the weight of certain classes to find the right balance between
2486 hit-rate and false-alarm rate etc.
2487 <dt>_sample_idx<dd>The optional integer vector indicating the samples
2488 (i.e. rows of <code>_inputs</code> and <code>_outputs</code>) that are taken
2490 <dt>_params<dd>The training params.
2491 See <a href="#decl_CvANN_MLP_TrainParams">CvANN_MLP_TrainParams</a> description.
2492 <dt>_flags<dd>The various parameters to control the training algorithm. May be a combination of the following:<br>
2493 <code>UPDATE_WEIGHTS = 1</code> - algorithm updates the network weights, rather than
2494 computes them from scratch (in the latter case the weights are
2496 using <em>Nguyen-Widrow</em> algorithm).<br>
2497 <code>NO_INPUT_SCALE</code> - algorithm does not normalize the input vectors. If this flag is not set,
2498 the training algorithm normalizes each input feature independently, shifting its
2499 mean value to 0 and making the standard deviation <code>=1</code>. If the network
2500 is assumed to be updated frequently, the new training data could be much different from
2501 original one. In this case user should take care of proper normalization.<br>
2502 <code>NO_OUTPUT_SCALE</code> - algorithm does not normalize the output vectors. If the flag is not set,
2503 the training algorithm normalizes each output feature independently, by
2505 it to the certain range depending on the activation function used.<br>
2507 This method applies the specified training algorithm to compute/adjust the network weights.
2508 It returns the number of iterations performed.</p>
2511 <!-- Example of creating of perceptron -->
2512 <!-- <h4>Example. Creating perceptron to compute XOR function</h4>
2514 <img src="pics/xor_net.png"><br>
2529 int main(int argc, char* argv[])
2531 printf("MLL-ANN sample\n");
2533 CvANNNeuralNet *network;
2536 CvANNPerceptronParams perParams;
2538 double actParams[1];
2539 actParams[0] = 1.41;
2541 perParams.activationFunctionID = CV_ANN_ACTIV_SIGMOID;/* Sigmoid activation function */
2542 perParams.activationParams = actParams;/* Activation function parameters */
2543 perParams.biases = 1;/* Use biases */
2545 /* Set train parameters */
2546 CvANNPerceptronTrainParams perTrainParams;
2548 perTrainParams.learnRate = 0.8;
2549 perTrainParams.momentum = 0.3;
2550 perTrainParams.maxEpoch = 5000;
2551 perTrainParams.meanError = 0.0;
2552 perTrainParams.maxError = 0;
2553 perTrainParams.recognizeError = 0.1;
2554 perTrainParams.recognizeRate = 1;//0.99;
2555 perTrainParams.reportStep = 20;/* Each 20-th step report errors */
2557 CvANNClassifierTrainParams trainParams;
2558 trainParams.networkTrainParams = &perTrainParams;
2560 /* -------------- Fill training data --------------- */
2572 double inputData[8] = { 0, 0,
2577 double outputData[4] = { 0,
2582 trainInput = cvMat(4,2,CV_64F,inputData);
2583 trainOutput = cvMat(4,1,CV_64F,outputData);
2585 int layerSizes[3] = {2,8,1};/* Sizes of layers */
2587 perParams.numLayers = 3;/* Number of layers */
2588 perParams.layerSizes = layerSizes;/* array with layer sizes */
2590 network = cvCreateANNPerceptron(&perParams);
2593 cvANNConvertNetworkLinks(network ,CV_ANN_CVT_LINK2MATR);
2595 trainParams.network = network;
2598 CvANNClassifier *ann;
2600 /* Create and train classifier */
2602 ann = (CvANNClassifier *)cvCreateANNClassifier( &trainInput,//CvMat* trainInput,
2604 &trainOutput,//CvMat* trainOutput,
2605 (CvStatModelParams*)&trainParams,//CvStatModelParams* trainParams CV_DEFAULT(0),
2606 0,//CvMat* compIdx CV_DEFAULT(0),
2607 0,//CvMat* sampleIdx CV_DEFAULT(0),
2608 0,//CvMat* typeMask CV_DEFAULT(0),
2609 0/*CvMat* missedMeasurementsMask CV_DEFAULT(0)*/);
2610 printf("Classifier was created\n");
2612 printf("Test trained network\n");
2616 testOutput = cvMat(4,1,CV_64F,testOut);
2618 cvANNPredict( (CvStatModel *)ann, &trainInput, &testOutput);
2621 for( i = 0; i < 4; i++ )
2623 printf("%.0lf XOR %.0lf => %.2lf (Must be %.0lf)\n",inputData[i*2],inputData[i*2+1],testOut[i],outputData[i]);