1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
10 // Intel License Agreement
12 // Copyright (C) 2000, Intel Corporation, all rights reserved.
13 // Third party copyrights are property of their respective owners.
15 // Redistribution and use in source and binary forms, with or without modification,
16 // are permitted provided that the following conditions are met:
18 // * Redistribution's of source code must retain the above copyright notice,
19 // this list of conditions and the following disclaimer.
21 // * Redistribution's in binary form must reproduce the above copyright notice,
22 // this list of conditions and the following disclaimer in the documentation
23 // and/or other materials provided with the distribution.
25 // * The name of Intel Corporation may not be used to endorse or promote products
26 // derived from this software without specific prior written permission.
28 // This software is provided by the copyright holders and contributors "as is" and
29 // any express or implied warranties, including, but not limited to, the implied
30 // warranties of merchantability and fitness for a particular purpose are disclaimed.
31 // In no event shall the Intel Corporation or contributors be liable for any direct,
32 // indirect, incidental, special, exemplary, or consequential damages
33 // (including, but not limited to, procurement of substitute goods or services;
34 // loss of use, data, or profits; or business interruption) however caused
35 // and on any theory of liability, whether in contract, strict liability,
36 // or tort (including negligence or otherwise) arising in any way out of
37 // the use of this software, even if advised of the possibility of such damage.
43 CvForestTree::CvForestTree()
49 CvForestTree::~CvForestTree()
55 bool CvForestTree::train( CvDTreeTrainData* _data,
56 const CvMat* _subsample_idx,
61 CV_FUNCNAME( "CvForestTree::train" );
71 CV_CALL(result = do_train(_subsample_idx));
80 CvForestTree::train( const CvMat*, int, const CvMat*, const CvMat*,
81 const CvMat*, const CvMat*, const CvMat*, CvDTreeParams )
89 CvForestTree::train( CvDTreeTrainData*, const CvMat* )
96 CvDTreeSplit* CvForestTree::find_best_split( CvDTreeNode* node )
99 CvDTreeSplit *best_split = 0, *split = 0, *t;
101 CV_FUNCNAME("CvForestTree::find_best_split");
104 CvMat* active_var_mask = 0;
108 CvRNG* rng = forest->get_rng();
110 active_var_mask = forest->get_active_var_mask();
111 var_count = active_var_mask->cols;
113 CV_ASSERT( var_count == data->var_count );
115 for( vi = 0; vi < var_count; vi++ )
118 int i1 = cvRandInt(rng) % var_count;
119 int i2 = cvRandInt(rng) % var_count;
120 CV_SWAP( active_var_mask->data.ptr[i1],
121 active_var_mask->data.ptr[i2], temp );
124 for( vi = 0; vi < data->var_count; vi++ )
126 int ci = data->var_type->data.i[vi];
127 if( node->num_valid[vi] <= 1
128 || (active_var_mask && !active_var_mask->data.ptr[vi]) )
131 if( data->is_classifier )
134 split = find_split_cat_class( node, vi );
136 split = find_split_ord_class( node, vi );
141 split = find_split_cat_reg( node, vi );
143 split = find_split_ord_reg( node, vi );
148 if( !best_split || best_split->quality < split->quality )
149 CV_SWAP( best_split, split, t );
151 cvSetRemoveByPtr( data->split_heap, split );
161 void CvForestTree::read( CvFileStorage* fs, CvFileNode* fnode, CvRTrees* _forest, CvDTreeTrainData* _data )
163 CvDTree::read( fs, fnode, _data );
168 void CvForestTree::read( CvFileStorage*, CvFileNode* )
173 void CvForestTree::read( CvFileStorage* _fs, CvFileNode* _node,
174 CvDTreeTrainData* _data )
176 CvDTree::read( _fs, _node, _data );
180 //////////////////////////////////////////////////////////////////////////////////////////
182 //////////////////////////////////////////////////////////////////////////////////////////
191 active_var_mask = NULL;
192 var_importance = NULL;
193 rng = cvRNG(0xffffffff);
194 default_model_name = "my_random_trees";
198 void CvRTrees::clear()
201 for( k = 0; k < ntrees; k++ )
208 cvReleaseMat( &active_var_mask );
209 cvReleaseMat( &var_importance );
214 CvRTrees::~CvRTrees()
220 CvMat* CvRTrees::get_active_var_mask()
222 return active_var_mask;
226 CvRNG* CvRTrees::get_rng()
231 bool CvRTrees::train( const CvMat* _train_data, int _tflag,
232 const CvMat* _responses, const CvMat* _var_idx,
233 const CvMat* _sample_idx, const CvMat* _var_type,
234 const CvMat* _missing_mask, CvRTParams params )
238 CV_FUNCNAME("CvRTrees::train");
245 CvDTreeParams tree_params( params.max_depth, params.min_sample_count,
246 params.regression_accuracy, params.use_surrogates, params.max_categories,
247 params.cv_folds, params.use_1se_rule, false, params.priors );
249 data = new CvDTreeTrainData();
250 CV_CALL(data->set_data( _train_data, _tflag, _responses, _var_idx,
251 _sample_idx, _var_type, _missing_mask, tree_params, true));
253 var_count = data->var_count;
254 if( params.nactive_vars > var_count )
255 params.nactive_vars = var_count;
256 else if( params.nactive_vars == 0 )
257 params.nactive_vars = (int)sqrt((double)var_count);
258 else if( params.nactive_vars < 0 )
259 CV_ERROR( CV_StsBadArg, "<nactive_vars> must be non-negative" );
260 params.term_crit = cvCheckTermCriteria( params.term_crit, 0.1, 1000 );
262 // Create mask of active variables at the tree nodes
263 CV_CALL(active_var_mask = cvCreateMat( 1, var_count, CV_8UC1 ));
264 if( params.calc_var_importance )
266 CV_CALL(var_importance = cvCreateMat( 1, var_count, CV_32FC1 ));
267 cvZero(var_importance);
269 { // initialize active variables mask
270 CvMat submask1, submask2;
271 cvGetCols( active_var_mask, &submask1, 0, params.nactive_vars );
272 cvGetCols( active_var_mask, &submask2, params.nactive_vars, var_count );
273 cvSet( &submask1, cvScalar(1) );
277 CV_CALL(result = grow_forest( params.term_crit ));
287 bool CvRTrees::grow_forest( const CvTermCriteria term_crit )
291 CvMat* sample_idx_mask_for_tree = 0;
292 CvMat* sample_idx_for_tree = 0;
294 CvMat* oob_sample_votes = 0;
295 CvMat* oob_responses = 0;
297 float* oob_samples_perm_ptr= 0;
299 float* samples_ptr = 0;
300 uchar* missing_ptr = 0;
301 float* true_resp_ptr = 0;
303 CV_FUNCNAME("CvRTrees::grow_forest");
306 const int max_ntrees = term_crit.max_iter;
307 const double max_oob_err = term_crit.epsilon;
309 const int dims = data->var_count;
310 float maximal_response = 0;
312 // oob_predictions_sum[i] = sum of predicted values for the i-th sample
313 // oob_num_of_predictions[i] = number of summands
314 // (number of predictions for the i-th sample)
315 // initialize these variable to avoid warning C4701
316 CvMat oob_predictions_sum = cvMat( 1, 1, CV_32FC1 );
317 CvMat oob_num_of_predictions = cvMat( 1, 1, CV_32FC1 );
319 nsamples = data->sample_count;
320 nclasses = data->get_num_classes();
322 trees = (CvForestTree**)cvAlloc( sizeof(trees[0])*max_ntrees );
323 memset( trees, 0, sizeof(trees[0])*max_ntrees );
325 if( data->is_classifier )
327 CV_CALL(oob_sample_votes = cvCreateMat( nsamples, nclasses, CV_32SC1 ));
328 cvZero(oob_sample_votes);
332 // oob_responses[0,i] = oob_predictions_sum[i]
333 // = sum of predicted values for the i-th sample
334 // oob_responses[1,i] = oob_num_of_predictions[i]
335 // = number of summands (number of predictions for the i-th sample)
336 CV_CALL(oob_responses = cvCreateMat( 2, nsamples, CV_32FC1 ));
337 cvZero(oob_responses);
338 cvGetRow( oob_responses, &oob_predictions_sum, 0 );
339 cvGetRow( oob_responses, &oob_num_of_predictions, 1 );
341 CV_CALL(sample_idx_mask_for_tree = cvCreateMat( 1, nsamples, CV_8UC1 ));
342 CV_CALL(sample_idx_for_tree = cvCreateMat( 1, nsamples, CV_32SC1 ));
343 CV_CALL(oob_samples_perm_ptr = (float*)cvAlloc( sizeof(float)*nsamples*dims ));
344 CV_CALL(samples_ptr = (float*)cvAlloc( sizeof(float)*nsamples*dims ));
345 CV_CALL(missing_ptr = (uchar*)cvAlloc( sizeof(uchar)*nsamples*dims ));
346 CV_CALL(true_resp_ptr = (float*)cvAlloc( sizeof(float)*nsamples ));
348 CV_CALL(data->get_vectors( 0, samples_ptr, missing_ptr, true_resp_ptr ));
350 double minval, maxval;
351 CvMat responses = cvMat(1, nsamples, CV_32FC1, true_resp_ptr);
352 cvMinMaxLoc( &responses, &minval, &maxval );
353 maximal_response = (float)MAX( MAX( fabs(minval), fabs(maxval) ), 0 );
357 while( ntrees < max_ntrees )
359 int i, oob_samples_count = 0;
360 double ncorrect_responses = 0; // used for estimation of variable importance
361 CvMat sample, missing;
362 CvForestTree* tree = 0;
364 cvZero( sample_idx_mask_for_tree );
365 for( i = 0; i < nsamples; i++ ) //form sample for creation one tree
367 int idx = cvRandInt( &rng ) % nsamples;
368 sample_idx_for_tree->data.i[i] = idx;
369 sample_idx_mask_for_tree->data.ptr[idx] = 0xFF;
372 trees[ntrees] = new CvForestTree();
373 tree = trees[ntrees];
374 CV_CALL(tree->train( data, sample_idx_for_tree, this ));
376 // form array of OOB samples indices and get these samples
377 sample = cvMat( 1, dims, CV_32FC1, samples_ptr );
378 missing = cvMat( 1, dims, CV_8UC1, missing_ptr );
381 for( i = 0; i < nsamples; i++,
382 sample.data.fl += dims, missing.data.ptr += dims )
384 CvDTreeNode* predicted_node = 0;
385 // check if the sample is OOB
386 if( sample_idx_mask_for_tree->data.ptr[i] )
389 // predict oob samples
390 if( !predicted_node )
391 CV_CALL(predicted_node = tree->predict(&sample, &missing, true));
393 if( !data->is_classifier ) //regression
395 double avg_resp, resp = predicted_node->value;
396 oob_predictions_sum.data.fl[i] += (float)resp;
397 oob_num_of_predictions.data.fl[i] += 1;
400 avg_resp = oob_predictions_sum.data.fl[i]/oob_num_of_predictions.data.fl[i];
401 avg_resp -= true_resp_ptr[i];
402 oob_error += avg_resp*avg_resp;
403 resp = (resp - true_resp_ptr[i])/maximal_response;
404 ncorrect_responses += exp( -resp*resp );
406 else //classification
412 cvGetRow(oob_sample_votes, &votes, i);
413 votes.data.i[predicted_node->class_idx]++;
416 cvMinMaxLoc( &votes, 0, 0, 0, &max_loc );
418 prdct_resp = data->cat_map->data.i[max_loc.x];
419 oob_error += (fabs(prdct_resp - true_resp_ptr[i]) < FLT_EPSILON) ? 0 : 1;
421 ncorrect_responses += cvRound(predicted_node->value - true_resp_ptr[i]) == 0;
425 if( oob_samples_count > 0 )
426 oob_error /= (double)oob_samples_count;
428 // estimate variable importance
429 if( var_importance && oob_samples_count > 0 )
433 memcpy( oob_samples_perm_ptr, samples_ptr, dims*nsamples*sizeof(float));
434 for( m = 0; m < dims; m++ )
436 double ncorrect_responses_permuted = 0;
437 // randomly permute values of the m-th variable in the oob samples
438 float* mth_var_ptr = oob_samples_perm_ptr + m;
440 for( i = 0; i < nsamples; i++ )
445 if( sample_idx_mask_for_tree->data.ptr[i] ) //the sample is not OOB
447 i1 = cvRandInt( &rng ) % nsamples;
448 i2 = cvRandInt( &rng ) % nsamples;
449 CV_SWAP( mth_var_ptr[i1*dims], mth_var_ptr[i2*dims], temp );
451 // turn values of (m-1)-th variable, that were permuted
452 // at the previous iteration, untouched
454 oob_samples_perm_ptr[i*dims+m-1] = samples_ptr[i*dims+m-1];
457 // predict "permuted" cases and calculate the number of votes for the
458 // correct class in the variable-m-permuted oob data
459 sample = cvMat( 1, dims, CV_32FC1, oob_samples_perm_ptr );
460 missing = cvMat( 1, dims, CV_8UC1, missing_ptr );
461 for( i = 0; i < nsamples; i++,
462 sample.data.fl += dims, missing.data.ptr += dims )
464 double predct_resp, true_resp;
466 if( sample_idx_mask_for_tree->data.ptr[i] ) //the sample is not OOB
469 predct_resp = tree->predict(&sample, &missing, true)->value;
470 true_resp = true_resp_ptr[i];
471 if( data->is_classifier )
472 ncorrect_responses_permuted += cvRound(true_resp - predct_resp) == 0;
475 true_resp = (true_resp - predct_resp)/maximal_response;
476 ncorrect_responses_permuted += exp( -true_resp*true_resp );
479 var_importance->data.fl[m] += (float)(ncorrect_responses
480 - ncorrect_responses_permuted);
484 if( term_crit.type != CV_TERMCRIT_ITER && oob_error < max_oob_err )
488 CV_CALL(cvConvertScale( var_importance, var_importance, 1./ntrees/nsamples ));
494 cvReleaseMat( &sample_idx_mask_for_tree );
495 cvReleaseMat( &sample_idx_for_tree );
496 cvReleaseMat( &oob_sample_votes );
497 cvReleaseMat( &oob_responses );
499 cvFree( &oob_samples_perm_ptr );
500 cvFree( &samples_ptr );
501 cvFree( &missing_ptr );
502 cvFree( &true_resp_ptr );
508 const CvMat* CvRTrees::get_var_importance()
510 return var_importance;
514 float CvRTrees::get_proximity( const CvMat* sample1, const CvMat* sample2,
515 const CvMat* missing1, const CvMat* missing2 ) const
519 CV_FUNCNAME( "CvRTrees::get_proximity" );
524 for( i = 0; i < ntrees; i++ )
525 result += trees[i]->predict( sample1, missing1 ) ==
526 trees[i]->predict( sample2, missing2 ) ? 1 : 0;
527 result = result/(float)ntrees;
535 float CvRTrees::predict( const CvMat* sample, const CvMat* missing ) const
539 CV_FUNCNAME("CvRTrees::predict");
544 if( nclasses > 0 ) //classification
547 int* votes = (int*)alloca( sizeof(int)*nclasses );
548 memset( votes, 0, sizeof(*votes)*nclasses );
549 for( k = 0; k < ntrees; k++ )
551 CvDTreeNode* predicted_node = trees[k]->predict( sample, missing );
553 int class_idx = predicted_node->class_idx;
554 CV_ASSERT( 0 <= class_idx && class_idx < nclasses );
556 nvotes = ++votes[class_idx];
557 if( nvotes > max_nvotes )
560 result = predicted_node->value;
567 for( k = 0; k < ntrees; k++ )
568 result += trees[k]->predict( sample, missing )->value;
569 result /= (double)ntrees;
574 return (float)result;
578 void CvRTrees::write( CvFileStorage* fs, const char* name )
580 CV_FUNCNAME( "CvRTrees::write" );
586 if( ntrees < 1 || !trees || nsamples < 1 )
587 CV_ERROR( CV_StsBadArg, "Invalid CvRTrees object" );
589 cvStartWriteStruct( fs, name, CV_NODE_MAP, CV_TYPE_NAME_ML_RTREES );
591 cvWriteInt( fs, "nclasses", nclasses );
592 cvWriteInt( fs, "nsamples", nsamples );
593 cvWriteInt( fs, "nactive_vars", (int)cvSum(active_var_mask).val[0] );
594 cvWriteReal( fs, "oob_error", oob_error );
597 cvWrite( fs, "var_importance", var_importance );
599 cvWriteInt( fs, "ntrees", ntrees );
601 CV_CALL(data->write_params( fs ));
603 cvStartWriteStruct( fs, "trees", CV_NODE_SEQ );
605 for( k = 0; k < ntrees; k++ )
607 cvStartWriteStruct( fs, 0, CV_NODE_MAP );
608 CV_CALL( trees[k]->write( fs ));
609 cvEndWriteStruct( fs );
612 cvEndWriteStruct( fs ); //trees
613 cvEndWriteStruct( fs ); //CV_TYPE_NAME_ML_RTREES
619 void CvRTrees::read( CvFileStorage* fs, CvFileNode* fnode )
621 CV_FUNCNAME( "CvRTrees::read" );
625 int nactive_vars, var_count, k;
627 CvFileNode* trees_fnode = 0;
631 nclasses = cvReadIntByName( fs, fnode, "nclasses", -1 );
632 nsamples = cvReadIntByName( fs, fnode, "nsamples" );
633 nactive_vars = cvReadIntByName( fs, fnode, "nactive_vars", -1 );
634 oob_error = cvReadRealByName(fs, fnode, "oob_error", -1 );
635 ntrees = cvReadIntByName( fs, fnode, "ntrees", -1 );
637 var_importance = (CvMat*)cvReadByName( fs, fnode, "var_importance" );
639 if( nclasses < 0 || nsamples <= 0 || nactive_vars < 0 || oob_error < 0 || ntrees <= 0)
640 CV_ERROR( CV_StsParseError, "Some <nclasses>, <nsamples>, <var_count>, "
641 "<nactive_vars>, <oob_error>, <ntrees> of tags are missing" );
645 trees = (CvForestTree**)cvAlloc( sizeof(trees[0])*ntrees );
646 memset( trees, 0, sizeof(trees[0])*ntrees );
648 data = new CvDTreeTrainData();
649 data->read_params( fs, fnode );
652 trees_fnode = cvGetFileNodeByName( fs, fnode, "trees" );
653 if( !trees_fnode || !CV_NODE_IS_SEQ(trees_fnode->tag) )
654 CV_ERROR( CV_StsParseError, "<trees> tag is missing" );
656 cvStartReadSeq( trees_fnode->data.seq, &reader );
657 if( reader.seq->total != ntrees )
658 CV_ERROR( CV_StsParseError,
659 "<ntrees> is not equal to the number of trees saved in file" );
661 for( k = 0; k < ntrees; k++ )
663 trees[k] = new CvForestTree();
664 CV_CALL(trees[k]->read( fs, (CvFileNode*)reader.ptr, this, data ));
665 CV_NEXT_SEQ_ELEM( reader.seq->elem_size, reader );
668 var_count = data->var_count;
669 CV_CALL(active_var_mask = cvCreateMat( 1, var_count, CV_8UC1 ));
671 // initialize active variables mask
672 CvMat submask1, submask2;
673 cvGetCols( active_var_mask, &submask1, 0, nactive_vars );
674 cvGetCols( active_var_mask, &submask2, nactive_vars, var_count );
675 cvSet( &submask1, cvScalar(1) );
683 int CvRTrees::get_tree_count() const
688 CvForestTree* CvRTrees::get_tree(int i) const
690 return (unsigned)i < (unsigned)ntrees ? trees[i] : 0;