1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
10 // Intel License Agreement
12 // Copyright (C) 2000, Intel Corporation, all rights reserved.
13 // Third party copyrights are property of their respective owners.
15 // Redistribution and use in source and binary forms, with or without modification,
16 // are permitted provided that the following conditions are met:
18 // * Redistribution's of source code must retain the above copyright notice,
19 // this list of conditions and the following disclaimer.
21 // * Redistribution's in binary form must reproduce the above copyright notice,
22 // this list of conditions and the following disclaimer in the documentation
23 // and/or other materials provided with the distribution.
25 // * The name of Intel Corporation may not be used to endorse or promote products
26 // derived from this software without specific prior written permission.
28 // This software is provided by the copyright holders and contributors "as is" and
29 // any express or implied warranties, including, but not limited to, the implied
30 // warranties of merchantability and fitness for a particular purpose are disclaimed.
31 // In no event shall the Intel Corporation or contributors be liable for any direct,
32 // indirect, incidental, special, exemplary, or consequential damages
33 // (including, but not limited to, procurement of substitute goods or services;
34 // loss of use, data, or profits; or business interruption) however caused
35 // and on any theory of liability, whether in contract, strict liability,
36 // or tort (including negligence or otherwise) arising in any way out of
37 // the use of this software, even if advised of the possibility of such damage.
//M*/
// Magnitude limit for ordered values: the checks visible in set_data() report an
// error for any ordered input or response value with fabs(val) >= ord_nan.
43 static const float ord_nan = FLT_MAX*0.5f;
// Lower bound for the block size of the memory storages created in this file.
44 static const int min_block_size = 1 << 16;
// Padding added to a computed block size before it is compared with min_block_size.
45 static const int block_size_delta = 1 << 10;
// Default constructor: the visible statements zero the matrix pointers and the two
// memory-storage pointers of the object.
// NOTE(review): this listing omits some source lines (the embedded numbers skip),
// so any further statements of the body are not visible here.
47 CvDTreeTrainData::CvDTreeTrainData()
49 var_idx = var_type = cat_count = cat_ofs = cat_map =
50 priors = priors_mult = counts = buf = direction = split_buf = 0;
51 tree_storage = temp_storage = 0;
// Constructs the object directly from training data: zeroes the matrix and storage
// pointers, then forwards every argument unchanged to set_data().
// NOTE(review): lines between the visible ones are missing from this listing.
57 CvDTreeTrainData::CvDTreeTrainData( const CvMat* _train_data, int _tflag,
58 const CvMat* _responses, const CvMat* _var_idx,
59 const CvMat* _sample_idx, const CvMat* _var_type,
60 const CvMat* _missing_mask, const CvDTreeParams& _params,
61 bool _shared, bool _add_labels )
63 var_idx = var_type = cat_count = cat_ofs = cat_map =
64 priors = priors_mult = counts = buf = direction = split_buf = 0;
65 tree_storage = temp_storage = 0;
// _update_data is not passed here, so set_data() uses whatever default its declaration gives.
67 set_data( _train_data, _tflag, _responses, _var_idx, _sample_idx,
68 _var_type, _missing_mask, _params, _shared, _add_labels );
// Destructor. NOTE(review): its body is not visible in this listing.
72 CvDTreeTrainData::~CvDTreeTrainData()
// Validates and normalizes the training parameters held in the member `params`.
// Out-of-range values are reported through CV_ERROR with CV_StsOutOfRange; values
// that are merely too large/small are silently clamped.
// NOTE(review): the copy from _params into `params`, the return statement and the
// consequent of the cv_folds == 1 test are on lines missing from this listing.
78 bool CvDTreeTrainData::set_params( const CvDTreeParams& _params )
82 CV_FUNCNAME( "CvDTreeTrainData::set_params" );
// max_categories: must be at least 2, capped at 15.
89 if( params.max_categories < 2 )
90 CV_ERROR( CV_StsOutOfRange, "params.max_categories should be >= 2" );
91 params.max_categories = MIN( params.max_categories, 15 );
// max_depth: must be non-negative, capped at 25.
93 if( params.max_depth < 0 )
94 CV_ERROR( CV_StsOutOfRange, "params.max_depth should be >= 0" );
95 params.max_depth = MIN( params.max_depth, 25 );
// min_sample_count: raised to at least 1.
97 params.min_sample_count = MAX(params.min_sample_count,1);
// cv_folds: negative values are rejected.
99 if( params.cv_folds < 0 )
100 CV_ERROR( CV_StsOutOfRange,
101 "params.cv_folds should be =0 (the tree is not pruned) "
102 "or n>0 (tree is pruned using n-fold cross-validation)" );
// Special handling of a single fold; the statement executed here is not visible.
104 if( params.cv_folds == 1 )
// regression_accuracy: must be non-negative.
107 if( params.regression_accuracy < 0 )
108 CV_ERROR( CV_StsOutOfRange, "params.regression_accuracy should be >= 0" );
// "Less than" predicate for sorting pointers by the values they point to.
118 #define CV_CMP_NUM_PTR(a,b) (*(a) < *(b))
// File-local sort routines for arrays of int* and double*, ordered by pointee value.
119 static CV_IMPLEMENT_QSORT_EX( icvSortIntPtr, int*, CV_CMP_NUM_PTR, int )
120 static CV_IMPLEMENT_QSORT_EX( icvSortDblPtr, double*, CV_CMP_NUM_PTR, int )
// "Less than" predicate for elements that expose a .val member.
122 #define CV_CMP_PAIRS(a,b) ((a).val < (b).val)
// File-local sort routine for CvPair32s32f arrays, ordered by .val.
123 static CV_IMPLEMENT_QSORT_EX( icvSortPairs, CvPair32s32f, CV_CMP_PAIRS, int )
// Builds the internal training-set representation from the caller's matrices.
// Visible steps: an optional "update" path that takes over the buffers of a freshly
// built CvDTreeTrainData, parameter and shape validation, allocation of the work
// buffer and memory storages, conversion of every variable and of the responses into
// per-variable arrays attached to data_root, optional cross-validation labels, and
// class priors.
// NOTE(review): this listing omits some source lines (the embedded line numbers skip),
// so braces, else-branches and short statements are missing between the lines below.
125 void CvDTreeTrainData::set_data( const CvMat* _train_data, int _tflag,
126 const CvMat* _responses, const CvMat* _var_idx, const CvMat* _sample_idx,
127 const CvMat* _var_type, const CvMat* _missing_mask, const CvDTreeParams& _params,
128 bool _shared, bool _add_labels, bool _update_data )
// Temporaries released at the end of the function.
130 CvMat* sample_idx = 0;
131 CvMat* var_type0 = 0;
// Temporary object used only by the update path below.
134 CvDTreeTrainData* data = 0;
136 CV_FUNCNAME( "CvDTreeTrainData::set_data" );
140 int sample_all = 0, r_type = 0, cv_n;
141 int total_c_count = 0;
142 int tree_block_size, temp_block_size, max_split_size, nv_size, cv_size = 0;
143 int ds_step, dv_step, ms_step = 0, mv_step = 0; // {data|mask}{sample|var}_step
146 const int *sidx = 0, *vidx = 0;
// Update path: only taken when an update is requested and a root already exists.
148 if( _update_data && data_root )
// Preprocess the new data in a separate object, using the same arguments.
150 data = new CvDTreeTrainData( _train_data, _tflag, _responses, _var_idx,
151 _sample_idx, _var_type, _missing_mask, _params, _shared, _add_labels );
153 // compare new and old train data
// The variable count must match, and var_type / cat_count / cat_map are compared
// through cvNorm(..., CV_C) against FLT_EPSILON.
154 if( !(data->var_count == var_count &&
155 cvNorm( data->var_type, var_type, CV_C ) < FLT_EPSILON &&
156 cvNorm( data->cat_count, cat_count, CV_C ) < FLT_EPSILON &&
157 cvNorm( data->cat_map, cat_map, CV_C ) < FLT_EPSILON) )
158 CV_ERROR( CV_StsBadArg,
159 "The new training data must have the same types and the input and output variables "
160 "and the same categories for categorical variables" );
// Drop the sample-dependent buffers of this object ...
162 cvReleaseMat( &priors );
163 cvReleaseMat( &priors_mult );
164 cvReleaseMat( &buf );
165 cvReleaseMat( &direction );
166 cvReleaseMat( &split_buf );
167 cvReleaseMemStorage( &temp_storage );
// ... and take over the ones from `data`; each source pointer is zeroed after the hand-over.
169 priors = data->priors; data->priors = 0;
170 priors_mult = data->priors_mult; data->priors_mult = 0;
171 buf = data->buf; data->buf = 0;
172 buf_count = data->buf_count; buf_size = data->buf_size;
173 sample_count = data->sample_count;
175 direction = data->direction; data->direction = 0;
176 split_buf = data->split_buf; data->split_buf = 0;
177 temp_storage = data->temp_storage; data->temp_storage = 0;
178 nv_heap = data->nv_heap; cv_heap = data->cv_heap;
// A new root node covering all samples is allocated for the replaced data.
180 data_root = new_node( 0, sample_count, 0, 0 );
// Regular path from here on (how the update path leaves the function is not visible).
189 CV_CALL( set_params( _params ));
191 // check parameter types and sizes
192 CV_CALL( cvCheckTrainData( _train_data, _tflag, _missing_mask, &var_all, &sample_all ));
// Element steps for walking samples (ds/ms) and variables (dv/mv) in the data and mask.
// For CV_ROW_SAMPLE the sample step is the row stride expressed in elements.
193 if( _tflag == CV_ROW_SAMPLE )
195 ds_step = _train_data->step/CV_ELEM_SIZE(_train_data->type);
198 ms_step = _missing_mask->step, mv_step = 1;
// Presumably the else branch (samples not stored as rows): the row stride becomes the variable step.
202 dv_step = _train_data->step/CV_ELEM_SIZE(_train_data->type);
205 mv_step = _missing_mask->step, ms_step = 1;
208 sample_count = sample_all;
// Optional sample subset: the preprocessed index is a 1 x n or n x 1 vector,
// so rows + cols - 1 is its length.
213 CV_CALL( sample_idx = cvPreprocessIndexArray( _sample_idx, sample_all ));
214 sidx = sample_idx->data.i;
215 sample_count = sample_idx->rows + sample_idx->cols - 1;
// Optional variable subset, handled the same way.
220 CV_CALL( var_idx = cvPreprocessIndexArray( _var_idx, var_all ));
221 vidx = var_idx->data.i;
222 var_count = var_idx->rows + var_idx->cols - 1;
// Responses must be a CV_32SC1 or CV_32FC1 vector with one element per sample of the
// full training matrix (sample_all, not the subset size).
225 if( !CV_IS_MAT(_responses) ||
226 (CV_MAT_TYPE(_responses->type) != CV_32SC1 &&
227 CV_MAT_TYPE(_responses->type) != CV_32FC1) ||
228 _responses->rows != 1 && _responses->cols != 1 ||
229 _responses->rows + _responses->cols - 1 != sample_all )
230 CV_ERROR( CV_StsBadArg, "The array of _responses must be an integer or "
231 "floating-point vector containing as many elements as "
232 "the total number of samples in the training data matrix" );
// var_type gets two extra slots beyond the input variables (filled below).
234 CV_CALL( var_type0 = cvPreprocessVarType( _var_type, var_idx, var_all, &r_type ));
235 CV_CALL( var_type = cvCreateMat( 1, var_count+2, CV_32SC1 ));
// A categorical response type makes this a classification problem.
240 is_classifier = r_type == CV_VAR_CATEGORICAL;
242 // step 0. calc the number of categorical vars
// Categorical variables receive the successive values of cat_var_count; ordered ones
// receive the successive values of a decrementing ord_var_count (the counters' initial
// values are set on lines not shown). get_ord_var_data() later decodes an ordered
// variable's index as ~type.
243 for( vi = 0; vi < var_count; vi++ )
245 var_type->data.i[vi] = var_type0->data.ptr[vi] == CV_VAR_CATEGORICAL ?
246 cat_var_count++ : ord_var_count--;
// Bitwise complement of the decremented counter.
249 ord_var_count = ~ord_var_count;
250 cv_n = params.cv_folds;
251 // set the two last elements of var_type array to be able
252 // to locate responses and cross-validation labels using
253 // the corresponding get_* functions.
254 var_type->data.i[var_count] = cat_var_count;
255 var_type->data.i[var_count+1] = cat_var_count+1;
257 // in case of single ordered predictor we need dummy cv_labels
258 // for safe split_node_data() operation
// Precedence: cv_n > 0 || (ord_var_count == 1 && cat_var_count == 0) || _add_labels.
259 have_labels = cv_n > 0 || ord_var_count == 1 && cat_var_count == 0 || _add_labels;
// One buffer row: get_work_var_count() ints per sample plus one more int per ordered
// variable per sample (ordered data is stored as pairs), plus 2 spare ints.
261 buf_size = (ord_var_count + get_work_var_count())*sample_count + 2;
// Three buffer rows in shared mode, two otherwise.
263 buf_count = shared ? 3 : 2;
264 CV_CALL( buf = cvCreateMat( buf_count, buf_size, CV_32SC1 ));
265 CV_CALL( cat_count = cvCreateMat( 1, cat_var_count+1, CV_32SC1 ));
266 CV_CALL( cat_ofs = cvCreateMat( 1, cat_count->cols+1, CV_32SC1 ));
// Initial capacity guess for the category map; it is grown on demand further down.
267 CV_CALL( cat_map = cvCreateMat( 1, cat_count->cols*10 + 128, CV_32SC1 ));
269 // now calculate the maximum size of split,
270 // create memory storage that will keep nodes and splits of the decision tree
271 // allocate root node and the buffer for the whole training data
272 max_split_size = cvAlign(sizeof(CvDTreeSplit) +
273 (MAX(0,sample_count - 33)/32)*sizeof(int),sizeof(void*));
274 tree_block_size = MAX((int)sizeof(CvDTreeNode)*8, max_split_size);
275 tree_block_size = MAX(tree_block_size + block_size_delta, min_block_size);
276 CV_CALL( tree_storage = cvCreateMemStorage( tree_block_size ));
277 CV_CALL( node_heap = cvCreateSet( 0, sizeof(*node_heap), sizeof(CvDTreeNode), tree_storage ));
// Size of one per-node num_valid array: one int per variable, never smaller than a CvSetElem.
279 nv_size = var_count*sizeof(int);
280 nv_size = MAX( nv_size, (int)sizeof(CvSetElem) );
282 temp_block_size = nv_size;
// Cross-validation setup (the enclosing condition is not visible): refuse data sets
// with fewer than cv_n*max(min_sample_count,10) samples.
286 if( sample_count < cv_n*MAX(params.min_sample_count,10) )
287 CV_ERROR( CV_StsOutOfRange,
288 "The many folds in cross-validation for such a small dataset" );
// Per-node CV block: for each fold one int and two doubles (see new_node()).
290 cv_size = cvAlign( cv_n*(sizeof(int) + sizeof(double)*2), sizeof(double) );
291 temp_block_size = MAX(temp_block_size, cv_size);
294 temp_block_size = MAX( temp_block_size + block_size_delta, min_block_size );
295 CV_CALL( temp_storage = cvCreateMemStorage( temp_block_size ));
296 CV_CALL( nv_heap = cvCreateSet( 0, sizeof(*nv_heap), nv_size, temp_storage ));
298 CV_CALL( cv_heap = cvCreateSet( 0, sizeof(*cv_heap), cv_size, temp_storage ));
// Root node covers all selected samples, stored in buffer row 0 at offset 0.
300 CV_CALL( data_root = new_node( 0, sample_count, 0, 0 ));
// Scratch array of pointers used to sort categorical values indirectly.
301 CV_CALL( int_ptr = (int**)cvAlloc( sample_count*sizeof(int_ptr[0]) ));
305 // transform the training data to convenient representation
// The last iteration (vi == var_count) processes the responses.
306 for( vi = 0; vi <= var_count; vi++ )
309 const uchar* mask = 0;
310 int m_step = 0, step;
311 const int* idata = 0;
312 const float* fdata = 0;
315 if( vi < var_count ) // analyze i-th input variable
// vi0 is the variable's index in the original matrix when a variable subset is used.
317 int vi0 = vidx ? vidx[vi] : vi;
318 ci = get_var_type(vi);
319 step = ds_step; m_step = ms_step;
// Exactly one of idata/fdata is set, depending on the element type of the data matrix.
320 if( CV_MAT_TYPE(_train_data->type) == CV_32SC1 )
321 idata = _train_data->data.i + vi0*dv_step;
323 fdata = _train_data->data.fl + vi0*dv_step;
325 mask = _missing_mask->data.ptr + vi0*mv_step;
327 else // analyze _responses
330 step = CV_IS_MAT_CONT(_responses->type) ?
331 1 : _responses->step / CV_ELEM_SIZE(_responses->type);
332 if( CV_MAT_TYPE(_responses->type) == CV_32SC1 )
333 idata = _responses->data.i;
335 fdata = _responses->data.fl;
338 if( vi < var_count && ci >= 0 ||
339 vi == var_count && is_classifier ) // process categorical variable or response
341 int c_count, prev_label;
342 int* c_map, *dst = get_cat_var_data( data_root, vi );
// Copy the raw category values; samples flagged in the mask keep INT_MAX so that
// they sort to the end.
345 for( i = 0; i < sample_count; i++ )
347 int val = INT_MAX, si = sidx ? sidx[i] : i;
348 if( !mask || !mask[si*m_step] )
351 val = idata[si*step];
354 float t = fdata[si*step];
// Error paths for floating-point input; the conditions guarding them are not visible.
358 sprintf( err, "%d-th value of %d-th (categorical) "
359 "variable is not an integer", i, vi );
360 CV_ERROR( CV_StsBadArg, err );
366 sprintf( err, "%d-th value of %d-th (categorical) "
367 "variable is too large", i, vi );
368 CV_ERROR( CV_StsBadArg, err );
373 int_ptr[i] = dst + i;
376 // sort all the values, including the missing measurements
377 // that should all move to the end
378 icvSortIntPtr( int_ptr, sample_count, 0 );
379 //qsort( int_ptr, sample_count, sizeof(int_ptr[0]), icvCmpIntPtr );
// Starts at 1 when there is at least one valid value, else 0.
381 c_count = num_valid > 0;
383 // count the categories
384 for( i = 1; i < num_valid; i++ )
385 c_count += *int_ptr[i] != *int_ptr[i-1];
388 max_c_count = MAX( max_c_count, c_count );
389 cat_count->data.i[ci] = c_count;
390 cat_ofs->data.i[ci] = total_c_count;
392 // resize cat_map if needed
// Grow by at least 1.5x and copy the entries collected so far.
393 if( cat_map->cols < total_c_count + c_count )
396 CV_CALL( cat_map = cvCreateMat( 1,
397 MAX(cat_map->cols*3/2,total_c_count+c_count), CV_32SC1 ));
398 for( i = 0; i < total_c_count; i++ )
399 cat_map->data.i[i] = tmp_map->data.i[i];
400 cvReleaseMat( &tmp_map );
// This variable's slice of the map starts where the previous variables' categories end.
403 c_map = cat_map->data.i + total_c_count;
404 total_c_count += c_count;
406 // compact the class indices and build the map
// ~x never equals x, so the first sorted value always starts a new category.
407 prev_label = ~*int_ptr[0];
// Replace every value in place by its compact category index and record the original
// label in c_map (c_count is re-initialized on a line not shown).
410 for( i = 0; i < num_valid; i++ )
412 int cur_label = *int_ptr[i];
413 if( cur_label != prev_label )
414 c_map[++c_count] = prev_label = cur_label;
415 *int_ptr[i] = c_count;
418 // replace labels for missing values with -1
419 for( ; i < sample_count; i++ )
422 else if( ci < 0 ) // process ordered variable
// Ordered variables are stored as CvPair32s32f records and sorted by .val.
424 CvPair32s32f* dst = get_ord_var_data( data_root, vi );
426 for( i = 0; i < sample_count; i++ )
429 int si = sidx ? sidx[i] : i;
430 if( !mask || !mask[si*m_step] )
433 val = (float)idata[si*step];
435 val = fdata[si*step];
// Values at or beyond ord_nan in magnitude are rejected.
437 if( fabs(val) >= ord_nan )
439 sprintf( err, "%d-th value of %d-th (ordered) "
440 "variable (=%g) is too large", i, vi, val );
441 CV_ERROR( CV_StsBadArg, err );
449 icvSortPairs( dst, sample_count, 0 );
451 else // special case: process ordered response,
452 // it will be stored similarly to categorical vars (i.e. no pairs)
454 float* dst = get_ord_responses( data_root );
456 for( i = 0; i < sample_count; i++ )
459 int si = sidx ? sidx[i] : i;
461 val = (float)idata[si*step];
463 val = fdata[si*step];
465 if( fabs(val) >= ord_nan )
467 sprintf( err, "%d-th value of %d-th (ordered) "
468 "variable (=%g) is out of range", i, vi, val );
469 CV_ERROR( CV_StsBadArg, err );
// An ordered response has no categories; every response counts as valid.
474 cat_count->data.i[cat_var_count] = 0;
475 cat_ofs->data.i[cat_var_count] = total_c_count;
476 num_valid = sample_count;
480 data_root->set_num_valid(vi, num_valid);
// Cross-validation labels (the enclosing condition is not visible).
485 int* dst = get_labels(data_root);
// vi is reused as a fold counter here; the masking below resets it to 0 once it
// reaches cv_n (all-ones mask keeps the value, zero mask clears it).
488 for( i = vi = 0; i < sample_count; i++ )
491 vi &= vi < cv_n ? -1 : 0;
// Shuffle the labels with sample_count random pair swaps.
494 for( i = 0; i < sample_count; i++ )
496 int a = cvRandInt(r) % sample_count;
497 int b = cvRandInt(r) % sample_count;
498 CV_SWAP( dst[a], dst[b], vi );
// Trim the reported width of cat_map to the entries actually used (at least 1).
502 cat_map->cols = MAX( total_c_count, 1 );
// Final split size, now based on the largest category count found above.
504 max_split_size = cvAlign(sizeof(CvDTreeSplit) +
505 (MAX(0,max_c_count - 33)/32)*sizeof(int),sizeof(void*));
506 CV_CALL( split_heap = cvCreateSet( 0, sizeof(*split_heap), max_split_size, tree_storage ));
// Priors are taken from the caller only for classifiers that supplied them.
508 have_priors = is_classifier && params.priors;
511 int m = get_num_classes();
513 CV_CALL( priors = cvCreateMat( 1, m, CV_64F ));
514 for( i = 0; i < m; i++ )
// Without user priors every class gets weight 1.
516 double val = have_priors ? params.priors[i] : 1.;
518 CV_ERROR( CV_StsOutOfRange, "Every class weight should be positive" );
519 priors->data.db[i] = val;
// Scale the priors by 1/sum (sum is accumulated on lines not shown).
525 cvScale( priors, priors, 1./sum );
527 CV_CALL( priors_mult = cvCloneMat( priors ));
528 CV_CALL( counts = cvCreateMat( 1, m, CV_32SC1 ));
// Per-sample scratch arrays.
531 CV_CALL( direction = cvCreateMat( 1, sample_count, CV_8UC1 ));
532 CV_CALL( split_buf = cvCreateMat( 1, sample_count, CV_32SC1 ));
// Release the function-local temporaries.
540 cvReleaseMat( &sample_idx );
541 cvReleaseMat( &var_type0 );
542 cvReleaseMat( &tmp_map );
// Creates and returns a root node for training on a subset of the stored samples.
// With no usable subsample index the node is a copy of data_root; otherwise the
// selected samples (duplicates allowed) are gathered into buffer row 1.
// NOTE(review): several lines of this function are missing from the listing; comments
// below describe only what the visible lines do.
546 CvDTreeNode* CvDTreeTrainData::subsample_data( const CvMat* _subsample_idx )
548 CvDTreeNode* root = 0;
549 CvMat* isubsample_idx = 0;
550 CvMat* subsample_co = 0;
552 CV_FUNCNAME( "CvDTreeTrainData::subsample_data" );
// Reported when no training data is available (the guarding condition is not visible).
557 CV_ERROR( CV_StsError, "No training data has been set" );
560 CV_CALL( isubsample_idx = cvPreprocessIndexArray( _subsample_idx, sample_count ));
562 if( !isubsample_idx )
564 // make a copy of the root node
567 root = new_node( 0, 1, 0, 0 );
// `temp` (declared on a line not shown) supplies the pointers that new_node() allocated
// for this node; they are put back into root here.
570 root->num_valid = temp.num_valid;
571 if( root->num_valid )
// Copy the per-variable valid counts from the original root.
573 for( i = 0; i < var_count; i++ )
574 root->num_valid[i] = data_root->num_valid[i];
576 root->cv_Tn = temp.cv_Tn;
577 root->cv_node_risk = temp.cv_node_risk;
578 root->cv_node_error = temp.cv_node_error;
// Subset path (presumably the else branch).
582 int* sidx = isubsample_idx->data.i;
583 // co - array of count/offset pairs (to handle duplicated values in _subsample_idx)
584 int* co, cur_ofs = 0;
585 int vi, i, total = data_root->sample_count;
586 int count = isubsample_idx->rows + isubsample_idx->cols - 1;
587 int work_var_count = get_work_var_count();
// The subset node lives in buffer row 1 at offset 0.
588 root = new_node( 0, count, 1, 0 );
// Two ints (count, offset) per sample of the full set, zero-initialized.
590 CV_CALL( subsample_co = cvCreateMat( 1, total*2, CV_32SC1 ));
591 cvZero( subsample_co );
592 co = subsample_co->data.i;
// Two passes whose bodies are not visible: one over the selected indices, one over all samples.
593 for( i = 0; i < count; i++ )
595 for( i = 0; i < total; i++ )
// Copy every work variable (inputs, responses and, if present, labels).
606 for( vi = 0; vi < work_var_count; vi++ )
608 int ci = get_var_type(vi);
// Categorical inputs and everything past the inputs are stored as plain int arrays.
610 if( ci >= 0 || vi >= var_count )
612 const int* src = get_cat_var_data( data_root, vi );
613 int* dst = get_cat_var_data( root, vi );
616 for( i = 0; i < count; i++ )
618 int val = src[sidx[i]];
// Non-negative values are counted as valid.
620 num_valid += val >= 0;
624 root->set_num_valid(vi, num_valid);
// Ordered inputs: walk the sorted source pairs and emit count_i copies of each.
628 const CvPair32s32f* src = get_ord_var_data( data_root, vi );
629 CvPair32s32f* dst = get_ord_var_data( root, vi );
630 int j = 0, idx, count_i;
631 int num_valid = data_root->get_num_valid(vi);
// First the valid part of the source ...
633 for( i = 0; i < num_valid; i++ )
639 float val = src[i].val;
640 for( cur_ofs = co[idx*2+1]; count_i > 0; count_i--, j++, cur_ofs++ )
// j now equals the number of valid entries written for this variable.
648 root->set_num_valid(vi, j);
// ... then the remaining source entries.
650 for( ; i < total; i++ )
656 float val = src[i].val;
657 for( cur_ofs = co[idx*2+1]; count_i > 0; count_i--, j++, cur_ofs++ )
// Release the temporaries.
670 cvReleaseMat( &isubsample_idx );
671 cvReleaseMat( &subsample_co );
// Exports stored samples back into dense arrays.
//   _subsample_idx - optional sample selection; preprocessed with cvPreprocessIndexArray().
//   values         - output, written with a stride of var_count per sample.
//   missing        - optional output mask with the same layout as `values`.
//   responses      - output, one value per exported sample.
//   get_class_idx  - for classifiers: write the internal class index instead of the
//                    label looked up through cat_map.
// NOTE(review): several lines of this function are missing from the listing.
677 void CvDTreeTrainData::get_vectors( const CvMat* _subsample_idx,
678 float* values, uchar* missing,
679 float* responses, bool get_class_idx )
681 CvMat* subsample_idx = 0;
682 CvMat* subsample_co = 0;
684 CV_FUNCNAME( "CvDTreeTrainData::get_vectors" );
// Without a subsample index all samples are exported (count == total).
688 int i, vi, total = sample_count, count = total, cur_ofs = 0;
// Subsample path: build the count/offset table `co`, two ints per stored sample.
694 CV_CALL( subsample_idx = cvPreprocessIndexArray( _subsample_idx, sample_count ));
695 sidx = subsample_idx->data.i;
696 CV_CALL( subsample_co = cvCreateMat( 1, sample_count*2, CV_32SC1 ));
697 co = subsample_co->data.i;
698 cvZero( subsample_co );
699 count = subsample_idx->cols + subsample_idx->rows - 1;
700 for( i = 0; i < count; i++ )
702 for( i = 0; i < total; i++ )
704 int count_i = co[i*2];
// Offsets are stored pre-multiplied by var_count, i.e. as positions in `values`.
707 co[i*2+1] = cur_ofs*var_count;
// Start with every entry flagged as missing (any guard on `missing` is not visible).
714 memset( missing, 1, count*var_count );
716 for( vi = 0; vi < var_count; vi++ )
718 int ci = get_var_type(vi);
719 if( ci >= 0 ) // categorical
// Column vi of the output; each sample advances dst by var_count.
721 float* dst = values + vi;
722 uchar* m = missing ? missing + vi : 0;
723 const int* src = get_cat_var_data(data_root, vi);
725 for( i = 0; i < count; i++, dst += var_count )
727 int idx = sidx ? sidx[i] : i;
// Ordered variable: iterate over the valid part of the sorted pairs.
739 float* dst = values + vi;
740 uchar* m = missing ? missing + vi : 0;
741 const CvPair32s32f* src = get_ord_var_data(data_root, vi);
742 int count1 = data_root->get_num_valid(vi);
744 for( i = 0; i < count1; i++ )
// Output position: from the count/offset table when subsampling ...
751 cur_ofs = co[idx*2+1];
// ... otherwise directly from the sample index.
754 cur_ofs = idx*var_count;
757 float val = src[i].val;
758 for( ; count_i > 0; count_i--, cur_ofs += var_count )
// Responses of a classifier: class index, or the label mapped through cat_map.
774 const int* src = get_class_labels(data_root);
775 for( i = 0; i < count; i++ )
777 int idx = sidx ? sidx[i] : i;
778 int val = get_class_idx ? src[idx] :
779 cat_map->data.i[cat_ofs->data.i[cat_var_count]+src[idx]];
780 responses[i] = (float)val;
// Ordered responses are copied as stored.
785 const float* src = get_ord_responses(data_root);
786 for( i = 0; i < count; i++ )
788 int idx = sidx ? sidx[i] : i;
789 responses[i] = src[idx];
// Release the temporaries.
796 cvReleaseMat( &subsample_idx );
797 cvReleaseMat( &subsample_co );
// Allocates a tree node from node_heap and initializes it.
//   parent      - parent node, or 0 for a root (depth 0).
//   count       - number of samples that reach the node.
//   storage_idx - buffer row holding the node's data (stored in buf_idx).
//   offset      - offset of the node's data within that row.
// NOTE(review): some initializations and the return statement are on lines missing
// from this listing.
801 CvDTreeNode* CvDTreeTrainData::new_node( CvDTreeNode* parent, int count,
802 int storage_idx, int offset )
804 CvDTreeNode* node = (CvDTreeNode*)cvSetNew( node_heap );
806 node->sample_count = count;
807 node->depth = parent ? parent->depth + 1 : 0;
808 node->parent = parent;
809 node->left = node->right = 0;
815 node->buf_idx = storage_idx;
816 node->offset = offset;
// Per-variable valid counts come from their own heap (any guard is not visible).
818 node->num_valid = (int*)cvSetNew( nv_heap );
821 node->alpha = node->node_risk = node->tree_risk = node->tree_error = 0.;
822 node->complexity = 0;
// With cross-validation enabled, one cv_heap element is carved into three arrays:
// cv_n ints (cv_Tn), then cv_n doubles (cv_node_risk) starting at the next
// double-aligned address, then cv_n doubles (cv_node_error).
824 if( params.cv_folds > 0 && cv_heap )
826 int cv_n = params.cv_folds;
828 node->cv_Tn = (int*)cvSetNew( cv_heap );
829 node->cv_node_risk = (double*)cvAlignPtr(node->cv_Tn + cv_n, sizeof(double));
830 node->cv_node_error = node->cv_node_risk + cv_n;
// Otherwise (presumably the else branch) the CV pointers are cleared.
836 node->cv_node_risk = 0;
837 node->cv_node_error = 0;
// Allocates a split on an ordered variable from split_heap and stores the threshold,
// split position, inversion flag and quality in it.
// NOTE(review): the use of `vi` and the return statement are on lines missing from
// this listing.
844 CvDTreeSplit* CvDTreeTrainData::new_split_ord( int vi, float cmp_val,
845 int split_point, int inversed, float quality )
847 CvDTreeSplit* split = (CvDTreeSplit*)cvSetNew( split_heap );
849 split->ord.c = cmp_val;
850 split->ord.split_point = split_point;
851 split->inversed = inversed;
852 split->quality = quality;
// Allocates a split on a categorical variable from split_heap with an empty subset.
// NOTE(review): the use of `vi` and the return statement are on lines missing from
// this listing.
859 CvDTreeSplit* CvDTreeTrainData::new_split_cat( int vi, float quality )
861 CvDTreeSplit* split = (CvDTreeSplit*)cvSetNew( split_heap );
// n = number of 32-bit words needed to hold one bit per category (max_c_count bits).
862 int i, n = (max_c_count + 31)/32;
866 split->quality = quality;
// Clear the category bit mask.
867 for( i = 0; i < n; i++ )
868 split->subset[i] = 0;
// Releases a node: its per-node data, its splits (walked through split->next), and
// finally the node itself.
// NOTE(review): the loop construct around the split removal is not visible here.
875 void CvDTreeTrainData::free_node( CvDTreeNode* node )
877 CvDTreeSplit* split = node->split;
878 free_node_data( node );
// Remember the successor before the current split is returned to split_heap.
881 CvDTreeSplit* next = split->next;
882 cvSetRemoveByPtr( split_heap, split );
886 cvSetRemoveByPtr( node_heap, node );
// Returns the node's num_valid array to nv_heap, if the node has one.
// NOTE(review): any statements following the removal are not visible in this listing.
890 void CvDTreeTrainData::free_node_data( CvDTreeNode* node )
892 if( node->num_valid )
894 cvSetRemoveByPtr( nv_heap, node->num_valid );
897 // do not free cv_* fields, as all the cross-validation related data is released at once.
// Releases the buffers that are only needed while training: the counts, work buffer,
// direction and split scratch matrices, and the temporary memory storage.
901 void CvDTreeTrainData::free_train_data()
903 cvReleaseMat( &counts );
904 cvReleaseMat( &buf );
905 cvReleaseMat( &direction );
906 cvReleaseMat( &split_buf );
907 cvReleaseMemStorage( &temp_storage );
// Both heaps were created in temp_storage (see set_data()), so their pointers are reset.
908 cv_heap = nv_heap = 0;
// Resets the object: releases the tree storage and the descriptive matrices, then
// zeroes the heaps, counters and flags.
// NOTE(review): some statements of this function are on lines missing from the listing.
912 void CvDTreeTrainData::clear()
916 cvReleaseMemStorage( &tree_storage );
918 cvReleaseMat( &var_idx );
919 cvReleaseMat( &var_type );
920 cvReleaseMat( &cat_count );
921 cvReleaseMat( &cat_ofs );
922 cvReleaseMat( &cat_map );
923 cvReleaseMat( &priors );
924 cvReleaseMat( &priors_mult );
// These heaps are created in tree_storage (see set_data() / read_params()).
926 node_heap = split_heap = 0;
928 sample_count = var_all = var_count = max_c_count = ord_var_count = cat_var_count = 0;
929 have_labels = have_priors = is_classifier = false;
931 buf_count = buf_size = 0;
// Returns the number of response classes for a classifier (the cat_count entry at
// index cat_var_count), or 0 when the model is not a classifier.
940 int CvDTreeTrainData::get_num_classes() const
942 return is_classifier ? cat_count->data.i[cat_var_count] : 0;
// Returns the var_type entry of variable vi. Callers in this file treat a value >= 0
// as a categorical index and a negative value as an encoded ordered index. No bounds
// check is performed on vi.
946 int CvDTreeTrainData::get_var_type(int vi) const
948 return var_type->data.i[vi];
// Returns the number of per-sample columns kept in the work buffer: the input
// variables, one column for the responses, and one more when have_labels is set.
952 int CvDTreeTrainData::get_work_var_count() const
954 return var_count + 1 + (have_labels ? 1 : 0);
// Returns the pair array of ordered variable vi for node n.
// The ordered index oi is the bitwise complement of the stored variable type; each
// ordered variable occupies 2 ints per sample inside the node's region of buf.
957 CvPair32s32f* CvDTreeTrainData::get_ord_var_data( CvDTreeNode* n, int vi )
959 int oi = ~get_var_type(vi);
// Debug-only check that vi really refers to an ordered variable.
960 assert( 0 <= oi && oi < ord_var_count );
961 return (CvPair32s32f*)(buf->data.i + n->buf_idx*buf->cols +
962 n->offset + oi*n->sample_count*2);
// Returns the node's response column (work variable index var_count) as ints.
966 int* CvDTreeTrainData::get_class_labels( CvDTreeNode* n )
968 return get_cat_var_data( n, var_count );
// Returns the node's response column (work variable index var_count) reinterpreted
// as floats; it is the same storage that get_class_labels() returns.
972 float* CvDTreeTrainData::get_ord_responses( CvDTreeNode* n )
974 return (float*)get_cat_var_data( n, var_count );
// Returns the node's label column (work variable index var_count + 1), or 0 when
// have_labels is not set.
978 int* CvDTreeTrainData::get_labels( CvDTreeNode* n )
980 return have_labels ? get_cat_var_data( n, var_count + 1 ) : 0;
// Returns the int column of categorical-style variable vi for node n.
// Within the node's region of buf the ordered variables come first (2 ints per
// sample each), followed by one int per sample for each categorical-style column ci.
984 int* CvDTreeTrainData::get_cat_var_data( CvDTreeNode* n, int vi )
986 int ci = get_var_type(vi);
// Debug-only check; the upper bound admits the two extra columns past the
// categorical inputs (indices cat_var_count and cat_var_count + 1).
987 assert( 0 <= ci && ci <= cat_var_count + 1 );
988 return buf->data.i + n->buf_idx*buf->cols + n->offset +
989 (ord_var_count*2 + ci)*n->sample_count;
// Computes the buffer row for the children of node n: the next row after the node's
// own, wrapping around to row 1 in shared mode or row 0 otherwise.
// NOTE(review): the return statement is on a line missing from this listing.
993 int CvDTreeTrainData::get_child_buf_idx( CvDTreeNode* n )
995 int idx = n->buf_idx + 1;
996 if( idx >= buf_count )
997 idx = shared ? 1 : 0;
// Serializes the model description and training parameters to a file storage:
// scalar counters, a "training_params" map, the optional variable index, the
// variable types and, when relevant, the category tables.
// NOTE(review): some statements (including conditions guarding a few writes) are on
// lines missing from this listing.
1002 void CvDTreeTrainData::write_params( CvFileStorage* fs )
1004 CV_FUNCNAME( "CvDTreeTrainData::write_params" );
1008 int vi, vcount = var_count;
1010 cvWriteInt( fs, "is_classifier", is_classifier ? 1 : 0 );
1011 cvWriteInt( fs, "var_all", var_all );
1012 cvWriteInt( fs, "var_count", var_count );
1013 cvWriteInt( fs, "ord_var_count", ord_var_count );
1014 cvWriteInt( fs, "cat_var_count", cat_var_count );
// Training parameters are grouped in their own map node.
1016 cvStartWriteStruct( fs, "training_params", CV_NODE_MAP );
1017 cvWriteInt( fs, "use_surrogates", params.use_surrogates ? 1 : 0 );
1021 cvWriteInt( fs, "max_categories", params.max_categories );
1025 cvWriteReal( fs, "regression_accuracy", params.regression_accuracy );
1028 cvWriteInt( fs, "max_depth", params.max_depth );
1029 cvWriteInt( fs, "min_sample_count", params.min_sample_count );
1030 cvWriteInt( fs, "cross_validation_folds", params.cv_folds );
// Pruning options are written only when more than one fold is used.
1032 if( params.cv_folds > 1 )
1034 cvWriteInt( fs, "use_1se_rule", params.use_1se_rule ? 1 : 0 );
1035 cvWriteInt( fs, "truncate_pruned_tree", params.truncate_pruned_tree ? 1 : 0 );
1039 cvWrite( fs, "priors", priors );
1041 cvEndWriteStruct( fs );
1044 cvWrite( fs, "var_idx", var_idx );
// Variable types are written as a flow sequence: 1 for categorical (type >= 0),
// 0 for ordered.
1046 cvStartWriteStruct( fs, "var_type", CV_NODE_SEQ+CV_NODE_FLOW );
1048 for( vi = 0; vi < vcount; vi++ )
1049 cvWriteInt( fs, 0, var_type->data.i[vi] >= 0 );
1051 cvEndWriteStruct( fs );
// Category tables exist only with categorical inputs or a categorical response.
1053 if( cat_count && (cat_var_count > 0 || is_classifier) )
1055 CV_ASSERT( cat_count != 0 );
1056 cvWrite( fs, "cat_count", cat_count );
1057 cvWrite( fs, "cat_map", cat_map );
// Restores the model description written by write_params(): scalar counters, optional
// training parameters, the optional variable index, variable types and category
// tables; then creates the tree storage with its node and split heaps.
// Malformed input is reported through CV_ERROR.
// NOTE(review): several statements are on lines missing from this listing.
1064 void CvDTreeTrainData::read_params( CvFileStorage* fs, CvFileNode* node )
1066 CV_FUNCNAME( "CvDTreeTrainData::read_params" );
1070 CvFileNode *tparams_node, *vartype_node;
1072 int vi, max_split_size, tree_block_size;
1074 is_classifier = (cvReadIntByName( fs, node, "is_classifier" ) != 0);
1075 var_all = cvReadIntByName( fs, node, "var_all" );
// var_count falls back to var_all when it is absent from the file.
1076 var_count = cvReadIntByName( fs, node, "var_count", var_all );
1077 cat_var_count = cvReadIntByName( fs, node, "cat_var_count" );
1078 ord_var_count = cvReadIntByName( fs, node, "ord_var_count" );
1080 tparams_node = cvGetFileNodeByName( fs, node, "training_params" );
1082 if( tparams_node ) // training parameters are not necessary
// use_surrogates defaults to 1 when missing.
1084 params.use_surrogates = cvReadIntByName( fs, tparams_node, "use_surrogates", 1 ) != 0;
1088 params.max_categories = cvReadIntByName( fs, tparams_node, "max_categories" );
1092 params.regression_accuracy =
1093 (float)cvReadRealByName( fs, tparams_node, "regression_accuracy" );
1096 params.max_depth = cvReadIntByName( fs, tparams_node, "max_depth" );
1097 params.min_sample_count = cvReadIntByName( fs, tparams_node, "min_sample_count" );
1098 params.cv_folds = cvReadIntByName( fs, tparams_node, "cross_validation_folds" );
// Pruning options are only stored when more than one fold was used (see write_params()).
1100 if( params.cv_folds > 1 )
1102 params.use_1se_rule = cvReadIntByName( fs, tparams_node, "use_1se_rule" ) != 0;
1103 params.truncate_pruned_tree =
1104 cvReadIntByName( fs, tparams_node, "truncate_pruned_tree" ) != 0;
1107 priors = (CvMat*)cvReadByName( fs, tparams_node, "priors" );
// The condition guarding the following check is not visible in this listing.
1110 if( !CV_IS_MAT(priors) )
1111 CV_ERROR( CV_StsParseError, "priors must stored as a matrix" );
1112 priors_mult = cvCloneMat( priors );
1116 CV_CALL( var_idx = (CvMat*)cvReadByName( fs, node, "var_idx" ));
// var_idx must be a CV_32SC1 vector of var_count elements ...
1119 if( !CV_IS_MAT(var_idx) ||
1120 var_idx->cols != 1 && var_idx->rows != 1 ||
1121 var_idx->cols + var_idx->rows - 1 != var_count ||
1122 CV_MAT_TYPE(var_idx->type) != CV_32SC1 )
1123 CV_ERROR( CV_StsParseError,
1124 "var_idx (if exist) must be valid 1d integer vector containing <var_count> elements" );
// ... whose entries lie in [0, var_all); the unsigned cast also rejects negatives.
1126 for( vi = 0; vi < var_count; vi++ )
1127 if( (unsigned)var_idx->data.i[vi] >= (unsigned)var_all )
1128 CV_ERROR( CV_StsOutOfRange, "some of var_idx elements are out of range" );
1131 ////// read var type
1132 CV_CALL( var_type = cvCreateMat( 1, var_count + 2, CV_32SC1 ));
1136 vartype_node = cvGetFileNodeByName( fs, node, "var_type" );
// A model with a single variable may store var_type as one integer instead of a sequence.
1138 if( vartype_node && CV_NODE_TYPE(vartype_node->tag) == CV_NODE_INT && var_count == 1 )
1139 var_type->data.i[0] = vartype_node->data.i ? cat_var_count++ : ord_var_count--;
// General case: a sequence with exactly var_count entries.
1142 if( !vartype_node || CV_NODE_TYPE(vartype_node->tag) != CV_NODE_SEQ ||
1143 vartype_node->data.seq->total != var_count )
1144 CV_ERROR( CV_StsParseError, "var_type must exist and be a sequence of 0's and 1's" );
1146 cvStartReadSeq( vartype_node->data.seq, &reader );
1148 for( vi = 0; vi < var_count; vi++ )
1150 CvFileNode* n = (CvFileNode*)reader.ptr;
// Any integer other than 0 or 1 has a bit set outside the lowest one and is rejected.
1151 if( CV_NODE_TYPE(n->tag) != CV_NODE_INT || (n->data.i & ~1) )
1152 CV_ERROR( CV_StsParseError, "var_type must exist and be a sequence of 0's and 1's" );
// Same encoding as in set_data(): categorical index counts up, ordered code counts down.
1153 var_type->data.i[vi] = n->data.i ? cat_var_count++ : ord_var_count--;
1154 CV_NEXT_SEQ_ELEM( reader.seq->elem_size, reader );
1157 var_type->data.i[var_count] = cat_var_count;
1159 ord_var_count = ~ord_var_count;
// NOTE(review): each counter is compared with itself here, so this condition is always
// false and the consistency error below can never be raised. The intent is presumably
// to compare the recomputed counters with the values read from the file at the top of
// the function - TODO confirm and keep those values in separate variables.
1160 if( cat_var_count != cat_var_count || ord_var_count != ord_var_count )
1161 CV_ERROR( CV_StsParseError, "var_type is inconsistent with cat_var_count and ord_var_count" );
// Category tables are present only with categorical inputs or a categorical response.
1164 if( cat_var_count > 0 || is_classifier )
1166 int ccount, total_c_count = 0;
1167 CV_CALL( cat_count = (CvMat*)cvReadByName( fs, node, "cat_count" ));
1168 CV_CALL( cat_map = (CvMat*)cvReadByName( fs, node, "cat_map" ));
// Both must be CV_32SC1 vectors; cat_count has one entry per categorical input plus
// one for the response of a classifier.
1170 if( !CV_IS_MAT(cat_count) || !CV_IS_MAT(cat_map) ||
1171 cat_count->cols != 1 && cat_count->rows != 1 ||
1172 CV_MAT_TYPE(cat_count->type) != CV_32SC1 ||
1173 cat_count->cols + cat_count->rows - 1 != cat_var_count + is_classifier ||
1174 cat_map->cols != 1 && cat_map->rows != 1 ||
1175 CV_MAT_TYPE(cat_map->type) != CV_32SC1 )
1176 CV_ERROR( CV_StsParseError,
1177 "Both cat_count and cat_map must exist and be valid 1d integer vectors of an appropriate size" );
1179 ccount = cat_var_count + is_classifier;
// cat_ofs holds the running sum of cat_count, starting at 0.
1181 CV_CALL( cat_ofs = cvCreateMat( 1, ccount + 1, CV_32SC1 ));
1182 cat_ofs->data.i[0] = 0;
1185 for( vi = 0; vi < ccount; vi++ )
1187 int val = cat_count->data.i[vi];
// The range test guarding this error is not visible in this listing.
1189 CV_ERROR( CV_StsOutOfRange, "some of cat_count elements are out of range" );
1190 max_c_count = MAX( max_c_count, val );
1191 cat_ofs->data.i[vi+1] = total_c_count += val;
// cat_map must contain exactly one entry per category.
1194 if( cat_map->cols + cat_map->rows - 1 != total_c_count )
1195 CV_ERROR( CV_StsBadSize,
1196 "cat_map vector length is not equal to the total number of categories in all categorical vars" );
// Storage for the tree itself, sized like in set_data().
1199 max_split_size = cvAlign(sizeof(CvDTreeSplit) +
1200 (MAX(0,max_c_count - 33)/32)*sizeof(int),sizeof(void*));
1202 tree_block_size = MAX((int)sizeof(CvDTreeNode)*8, max_split_size);
1203 tree_block_size = MAX(tree_block_size + block_size_delta, min_block_size);
1204 CV_CALL( tree_storage = cvCreateMemStorage( tree_block_size ));
1205 CV_CALL( node_heap = cvCreateSet( 0, sizeof(node_heap[0]),
1206 sizeof(CvDTreeNode), tree_storage ));
1207 CV_CALL( split_heap = cvCreateSet( 0, sizeof(split_heap[0]),
1208 max_split_size, tree_storage ));
1214 /////////////////////// Decision Tree /////////////////////////
// NOTE(review): orphan statement - the header of the enclosing function is missing
// from this listing (presumably a CvDTree constructor; confirm against the full file).
// Sets the default model name to "my_tree".
1220 default_model_name = "my_tree";
// Resets the tree: releases the variable-importance matrix and marks the pruning
// index as unset (-1).
// NOTE(review): other statements of this function are on lines missing from the listing.
1226 void CvDTree::clear()
1228 cvReleaseMat( &var_importance );
1238 pruned_tree_idx = -1;
// Accessor returning a const node pointer. NOTE(review): its body is not visible in
// this listing.
1248 const CvDTreeNode* CvDTree::get_root() const
// Returns the stored pruning index (clear() resets it to -1).
1254 int CvDTree::get_pruned_tree_idx() const
1256 return pruned_tree_idx;
// Accessor returning a CvDTreeTrainData pointer. NOTE(review): its body is not
// visible in this listing.
1260 CvDTreeTrainData* CvDTree::get_data()
// Trains the tree from raw matrices: builds a private CvDTreeTrainData from the
// arguments (with _shared = false) and runs do_train() on all of its samples.
// NOTE(review): the statements before the allocation and the return statement are on
// lines missing from this listing.
1266 bool CvDTree::train( const CvMat* _train_data, int _tflag,
1267 const CvMat* _responses, const CvMat* _var_idx,
1268 const CvMat* _sample_idx, const CvMat* _var_type,
1269 const CvMat* _missing_mask, CvDTreeParams _params )
1271 bool result = false;
1273 CV_FUNCNAME( "CvDTree::train" );
1278 data = new CvDTreeTrainData( _train_data, _tflag, _responses,
1279 _var_idx, _sample_idx, _var_type,
1280 _missing_mask, _params, false );
// 0 = no subsample index.
1281 CV_CALL( result = do_train(0));
// Trains the tree on already prepared training data, optionally restricted to a
// subsample; the data object is marked as shared before training.
// NOTE(review): the assignment of _data to `data` and the return statement are on
// lines missing from this listing.
1289 bool CvDTree::train( CvDTreeTrainData* _data, const CvMat* _subsample_idx )
1291 bool result = false;
1293 CV_FUNCNAME( "CvDTree::train" );
1299 data->shared = true;
1300 CV_CALL( result = do_train(_subsample_idx));
// Core training routine: obtains the root node for the requested subsample, grows the
// tree recursively, prunes it when cross-validation folds are configured, and frees
// the training buffers.
// NOTE(review): conditions around some of these steps and the return statement are
// on lines missing from this listing.
1308 bool CvDTree::do_train( const CvMat* _subsample_idx )
1310 bool result = false;
1312 CV_FUNCNAME( "CvDTree::do_train" );
1316 root = data->subsample_data( _subsample_idx );
1318 CV_CALL( try_split_node(root));
1320 if( data->params.cv_folds > 0 )
1321 CV_CALL( prune_cv());
1324 data->free_train_data();
// Recursively grows the tree below `node`: computes the node value, checks the
// stopping criteria, finds the best split plus optional surrogate splits, partitions
// the node's data and recurses into both children.
// NOTE(review): several statements (e.g. the assignments to can_split and the early
// return) are on lines missing from this listing.
1334 void CvDTree::try_split_node( CvDTreeNode* node )
1336 CvDTreeSplit* best_split = 0;
1337 int i, n = node->sample_count, vi;
1338 bool can_split = true;
1339 double quality_scale;
1341 calc_node_value( node );
// Stopping criteria: too few samples or maximum depth reached.
1343 if( node->sample_count <= data->params.min_sample_count ||
1344 node->depth >= data->params.max_depth )
1347 if( can_split && data->is_classifier )
1349 // check if we have a "pure" node,
1350 // we assume that cls_count is filled by calc_node_value()
1351 int* cls_count = data->counts->data.i;
1352 int nz = 0, m = data->get_num_classes();
// nz = number of classes that actually occur in this node.
1353 for( i = 0; i < m; i++ )
1354 nz += cls_count[i] != 0;
1355 if( nz == 1 ) // there is only one class
1358 else if( can_split )
// Regression: compare sqrt(node_risk)/n with the requested accuracy.
1360 if( sqrt(node->node_risk)/n < data->params.regression_accuracy )
1366 best_split = find_best_split(node);
1367 // TODO: check the split quality ...
1368 node->split = best_split;
// Leaf: nothing to split, so the per-node training data is released.
1371 if( !can_split || !best_split )
1373 data->free_node_data(node);
// Compute per-sample directions for the primary split; the returned factor scales
// surrogate split qualities.
1377 quality_scale = calc_node_dir( node );
1379 if( data->params.use_surrogates )
1381 // find all the surrogate splits
1382 // and sort them by their similarity to the primary one
1383 for( vi = 0; vi < data->var_count; vi++ )
1385 CvDTreeSplit* split;
1386 int ci = data->get_var_type(vi);
// The primary split's own variable is singled out here (the action is not visible).
1388 if( vi == best_split->var_idx )
// Categorical and ordered variables use different surrogate finders.
1392 split = find_surrogate_split_cat( node, vi );
1394 split = find_surrogate_split_ord( node, vi );
1399 CvDTreeSplit* prev_split = node->split;
1400 split->quality = (float)(split->quality*quality_scale);
// Insert into the list after the primary split, keeping surrogates in order of
// decreasing quality.
1402 while( prev_split->next &&
1403 prev_split->next->quality > split->quality )
1404 prev_split = prev_split->next;
1405 split->next = prev_split->next;
1406 prev_split->next = split;
// Partition the samples between the children and recurse.
1411 split_node_data( node );
1412 try_split_node( node->left );
1413 try_split_node( node->right );
1417 // calculate direction (left(-1),right(1),missing(0))
1418 // for each sample using the best split
1419 // the function returns scale coefficients for surrogate split quality factors.
1420 // the scale is applied to normalize surrogate split quality relatively to the
1421 // best (primary) split quality. That is, if a surrogate split is absolutely
1422 // identical to the primary split, its quality will be set to the maximum value =
1423 // quality of the primary split; otherwise, it will be lower.
1424 // besides, the function computes node->maxlr,
1425 // the minimum possible quality (w/o considering the above mentioned scale)
1426 // for a surrogate split. Surrogate splits with quality not exceeding node->maxlr
1427 // are discarded.
// Evaluates the primary split of `node` and accumulates the (optionally
// prior-weighted) amount of samples sent to the left (L) and to the right (R).
// Stores MAX(L,R) in node->maxlr and returns node->split->quality/(L + R).
1428 double CvDTree::calc_node_dir( CvDTreeNode* node )
// per-sample direction buffer shared through `data`
1430 char* dir = (char*)data->direction->data.ptr;
1431 int i, n = node->sample_count, vi = node->split->var_idx;
// the primary split is never expected to be inversed
1434 assert( !node->split->inversed );
1436 if( data->get_var_type(vi) >= 0 ) // split on categorical var
1438 const int* labels = data->get_cat_var_data(node,vi);
1439 const int* subset = node->split->subset;
1441 if( !data->have_priors )
1443 int sum = 0, sum_abs = 0;
1445 for( i = 0; i < n; i++ )
1447 int idx = labels[i];
// negative category index => direction 0
1448 int d = idx >= 0 ? CV_DTREE_CAT_DIR(idx,subset) : 0;
// with d in {-1,0,1}: sum accumulates R - L, sum_abs accumulates R + L
1449 sum += d; sum_abs += d & 1;
1453 R = (sum_abs + sum) >> 1;
1454 L = (sum_abs - sum) >> 1;
// same accumulation, but each sample is weighted by the prior of its class
1458 const int* responses = data->get_class_labels(node);
1459 const double* priors = data->priors_mult->data.db;
1460 double sum = 0, sum_abs = 0;
1462 for( i = 0; i < n; i++ )
1464 int idx = labels[i];
1465 double w = priors[responses[i]];
1466 int d = idx >= 0 ? CV_DTREE_CAT_DIR(idx,subset) : 0;
1467 sum += d*w; sum_abs += (d & 1)*w;
1471 R = (sum_abs + sum) * 0.5;
1472 L = (sum_abs - sum) * 0.5;
1475 else // split on ordered var
1477 const CvPair32s32f* sorted = data->get_ord_var_data(node,vi);
1478 int split_point = node->split->ord.split_point;
1479 int n1 = node->get_num_valid(vi);
1481 assert( 0 <= split_point && split_point < n1-1 );
1483 if( !data->have_priors )
// sorted entries 0..split_point go left, split_point+1..n1-1 go right
1485 for( i = 0; i <= split_point; i++ )
1486 dir[sorted[i].i] = (char)-1;
1487 for( ; i < n1; i++ )
1488 dir[sorted[i].i] = (char)1;
// direction 0 for the remaining entries (presumably those with a missing value)
1490 dir[sorted[i].i] = (char)0;
// NOTE(review): the loops above send n1 - split_point - 1 samples to the right,
// whereas R is set to n1 - split_point + 1 -- confirm the intended value.
1493 R = n1 - split_point + 1;
// prior-weighted variant of the same assignment
1497 const int* responses = data->get_class_labels(node);
1498 const double* priors = data->priors_mult->data.db;
1501 for( i = 0; i <= split_point; i++ )
1503 int idx = sorted[i].i;
1504 double w = priors[responses[idx]];
1505 dir[idx] = (char)-1;
1509 for( ; i < n1; i++ )
1511 int idx = sorted[i].i;
1512 double w = priors[responses[idx]];
1518 dir[sorted[i].i] = (char)0;
// maxlr = size of the larger side; the return value rescales surrogate qualities
1522 node->maxlr = MAX( L, R );
1523 return node->split->quality/(L + R);
// Tries every variable of `node`, computes a candidate split with the
// finder matching the task (classification / regression) and the variable
// type (categorical / ordered), and keeps the candidate of highest quality.
1527 CvDTreeSplit* CvDTree::find_best_split( CvDTreeNode* node )
1530 CvDTreeSplit *best_split = 0, *split = 0, *t;
1532 for( vi = 0; vi < data->var_count; vi++ )
1534 int ci = data->get_var_type(vi);
// a variable with at most one valid value in the node cannot be split on
1535 if( node->get_num_valid(vi) <= 1 )
1538 if( data->is_classifier )
1541 split = find_split_cat_class( node, vi );
1543 split = find_split_ord_class( node, vi );
1548 split = find_split_cat_reg( node, vi );
1550 split = find_split_ord_reg( node, vi );
// keep the better of the two; after the swap `split` holds the other one
1555 if( !best_split || best_split->quality < split->quality )
1556 CV_SWAP( best_split, split, t );
// give the `split` object back to the split heap
1558 cvSetRemoveByPtr( data->split_heap, split );
// Searches for the best threshold on ordered variable `vi` for a
// classification node. Samples are scanned in sorted order and moved one by
// one from the right side to the left side; every position where two
// consecutive values differ by more than `epsilon` is scored with
// (lsum2*R + rsum2*L)/(L*R). Returns a new ordered split placed halfway
// between the two neighbouring values, or 0 when no position was accepted.
1566 CvDTreeSplit* CvDTree::find_split_ord_class( CvDTreeNode* node, int vi )
// minimal gap between neighbouring values for a threshold to be placed between them
1568 const float epsilon = FLT_EPSILON*2;
1569 const CvPair32s32f* sorted = data->get_ord_var_data(node, vi);
1570 const int* responses = data->get_class_labels(node);
1571 int n = node->sample_count;
1572 int n1 = node->get_num_valid(vi);
1573 int m = data->get_num_classes();
1574 const int* rc0 = data->counts->data.i;
// per-class counters for the left (lc) and right (rc) side of the split
1575 int* lc = (int*)cvStackAlloc(m*sizeof(lc[0]));
1576 int* rc = (int*)cvStackAlloc(m*sizeof(rc[0]));
1578 double lsum2 = 0, rsum2 = 0, best_val = 0;
// priors == 0 when class priors are not in use
1579 const double* priors = data->have_priors ? data->priors_mult->data.db : 0;
1581 // init arrays of class instance counters on both sides of the split
1582 for( i = 0; i < m; i++ )
1588 // compensate for missing values
// (sorted entries n1..n-1 are removed from the right-side counters)
1589 for( i = n1; i < n; i++ )
1590 rc[responses[sorted[i].i]]--;
// unweighted case: rsum2 starts as the sum of squared right-side class counts
1596 for( i = 0; i < m; i++ )
1597 rsum2 += (double)rc[i]*rc[i];
1599 for( i = 0; i < n1 - 1; i++ )
1601 int idx = responses[sorted[i].i];
1604 lv = lc[idx]; rv = rc[idx];
// move sample i from the right side to the left side
1607 lc[idx] = lv + 1; rc[idx] = rv - 1;
1609 if( sorted[i].val + epsilon < sorted[i+1].val )
1611 double val = (lsum2*R + rsum2*L)/((double)L*R);
1612 if( best_val < val )
// prior-weighted case: L and R are sums of class weights instead of counts
1622 double L = 0, R = 0;
1623 for( i = 0; i < m; i++ )
1625 double wv = rc[i]*priors[i];
1630 for( i = 0; i < n1 - 1; i++ )
1632 int idx = responses[sorted[i].i];
1634 double p = priors[idx], p2 = p*p;
1636 lv = lc[idx]; rv = rc[idx];
// incremental update of the weighted sums of squares:
// (lv+1)^2 - lv^2 = 2*lv + 1 and rv^2 - (rv-1)^2 = 2*rv - 1
1637 lsum2 += p2*(lv*2 + 1);
1638 rsum2 -= p2*(rv*2 - 1);
1639 lc[idx] = lv + 1; rc[idx] = rv - 1;
1641 if( sorted[i].val + epsilon < sorted[i+1].val )
1643 double val = (lsum2*R + rsum2*L)/((double)L*R);
1644 if( best_val < val )
// threshold = midpoint between sorted[best_i] and sorted[best_i+1]
1653 return best_i >= 0 ? data->new_split_ord( vi,
1654 (sorted[best_i].val + sorted[best_i+1].val)*0.5f, best_i,
1655 0, (float)best_val ) : 0;
// Groups n integer vectors of length m into k clusters by iterative
// reassignment (at most max_iters rounds, stopping early once no label changes).
//   vectors - n rows of m ints (row i starts at vectors + i*m)
//   csums   - output, k rows of m ints holding the per-cluster sums
//   labels  - output, cluster index assigned to each of the n vectors
1659 void CvDTree::cluster_categories( const int* vectors, int n, int m,
1660 int* csums, int k, int* labels )
1662 // TODO: consider adding priors (class weights) and sample weights to the clustering algorithm
1663 int iters = 0, max_iters = 100;
// NOTE(review): v_weights is indexed by vector (0..n-1) and c_weights by cluster
// (0..k-1), yet c_weights starts at buf + k; when n > k the two ranges overlap.
// `buf + n` looks intended -- confirm.
1665 double* buf = (double*)cvStackAlloc( (n + k)*sizeof(buf[0]) );
1666 double *v_weights = buf, *c_weights = buf + k;
1667 bool modified = true;
1668 CvRNG* r = &data->rng;
1670 // assign labels randomly
1671 for( i = idx = 0; i < n; i++ )
1674 const int* v = vectors + i*m;
// wrap idx back to 0 once it reaches k
1676 idx &= idx < k ? -1 : 0;
1678 // compute weight of each vector
1679 for( j = 0; j < m; j++ )
// weight = reciprocal of the vector's element sum (0 for an all-zero vector)
1681 v_weights[i] = sum ? 1./sum : 0.;
// shuffle the initial labels with n random pairwise swaps
1684 for( i = 0; i < n; i++ )
1686 int i1 = cvRandInt(r) % n;
1687 int i2 = cvRandInt(r) % n;
1688 CV_SWAP( labels[i1], labels[i2], j );
1691 for( iters = 0; iters <= max_iters; iters++ )
// loop over the k clusters and their m components
1694 for( i = 0; i < k; i++ )
1696 for( j = 0; j < m; j++ )
// loop over every vector together with the csums row of its current cluster
1700 for( i = 0; i < n; i++ )
1702 const int* v = vectors + i*m;
1703 int* s = csums + labels[i]*m;
1704 for( j = 0; j < m; j++ )
1708 // exit the loop here, when we have up-to-date csums
1709 if( iters == max_iters || !modified )
1714 // calculate weight of each cluster
1715 for( i = 0; i < k; i++ )
1717 const int* s = csums + i*m;
1719 for( j = 0; j < m; j++ )
1721 c_weights[i] = sum ? 1./sum : 0;
1724 // now for each vector determine the closest cluster
1725 for( i = 0; i < n; i++ )
1727 const int* v = vectors + i*m;
1728 double alpha = v_weights[i];
1729 double min_dist2 = DBL_MAX;
1732 for( idx = 0; idx < k; idx++ )
1734 const int* s = csums + idx*m;
1735 double dist2 = 0., beta = c_weights[idx];
1736 for( j = 0; j < m; j++ )
// component-wise difference between the normalized vector and the normalized cluster sum
1738 double t = v[j]*alpha - s[j]*beta;
1741 if( min_dist2 > dist2 )
// a label change is detected here
1748 if( min_idx != labels[i] )
1750 labels[i] = min_idx;
// Searches for the best subset split on categorical variable `vi` for a
// classification node. Builds the category-by-class count table c_{jk},
// optionally merges categories into at most params.max_categories clusters,
// enumerates candidate subsets, and scores each one with
// (lsum2*R + rsum2*L)/(L*R). The winning subset is encoded as a bit mask in
// the returned split.
1756 CvDTreeSplit* CvDTree::find_split_cat_class( CvDTreeNode* node, int vi )
1758 CvDTreeSplit* split;
1759 const int* labels = data->get_cat_var_data(node, vi);
1760 const int* responses = data->get_class_labels(node);
1761 int ci = data->get_var_type(vi);
1762 int n = node->sample_count;
1763 int m = data->get_num_classes();
// _mi keeps the original number of categories; mi may be reduced by clustering
1764 int _mi = data->cat_count->data.i[ci], mi = _mi;
1765 int* lc = (int*)cvStackAlloc(m*sizeof(lc[0]));
1766 int* rc = (int*)cvStackAlloc(m*sizeof(rc[0]));
// one extra row of m counters in front of _cjk, so that row index -1 is addressable
1767 int* _cjk = (int*)cvStackAlloc(m*(mi+1)*sizeof(_cjk[0]))+m, *cjk = _cjk;
1768 double* c_weights = (double*)cvStackAlloc( mi*sizeof(c_weights[0]) );
1769 int* cluster_labels = 0;
1772 double L = 0, R = 0;
1773 double best_val = 0;
1774 int prevcode = 0, best_subset = -1, subset_i, subset_n, subtract = 0;
1775 const double* priors = data->priors_mult->data.db;
1777 // init array of counters:
1778 // c_{jk} - number of samples that have vi-th input variable = j and response = k.
1779 for( j = -1; j < mi; j++ )
1780 for( k = 0; k < m; k++ )
1783 for( i = 0; i < n; i++ )
// too many categories: merge them into mi clusters first
1792 if( mi > data->params.max_categories )
1794 mi = MIN(data->params.max_categories, n);
1796 cluster_labels = (int*)cvStackAlloc(mi*sizeof(cluster_labels[0]));
1797 cluster_categories( _cjk, _mi, m, cjk, mi, cluster_labels );
// array of pointers into cjk, sorted by icvSortIntPtr.
// NOTE(review): the row stride of 2 used here suggests this path is taken only
// when m == 2 -- confirm against the guarding condition.
1805 int_ptr = (int**)cvStackAlloc( mi*sizeof(int_ptr[0]) );
1806 for( j = 0; j < mi; j++ )
1807 int_ptr[j] = cjk + j*2 + 1;
1808 icvSortIntPtr( int_ptr, mi, 0 );
// column sums of cjk over all categories, one per class
1813 for( k = 0; k < m; k++ )
1816 for( j = 0; j < mi; j++ )
1817 sum += cjk[j*m + k];
// prior-weighted row sums of cjk, one per category
1822 for( j = 0; j < mi; j++ )
1825 for( k = 0; k < m; k++ )
1826 sum += cjk[j*m + k]*priors[k];
1831 for( ; subset_i < subset_n; subset_i++ )
1835 double lsum2 = 0, rsum2 = 0;
// sorted-order case: recover the category index from the pointer offset
1838 idx = (int)(int_ptr[subset_i] - cjk)/2;
// Gray-code case: consecutive subsets differ in exactly one category
1841 int graycode = (subset_i>>1)^subset_i;
1842 int diff = graycode ^ prevcode;
1844 // determine index of the changed bit.
// (the bit position is read from the exponent field of the float representation)
1846 idx = diff >= (1 << 16) ? 16 : 0;
1847 u.f = (float)(((diff >> 16) | diff) & 65535);
1848 idx += (u.i >> 23) - 127;
1849 subtract = graycode < prevcode;
1850 prevcode = graycode;
// categories with (almost) zero weight are tested for here
1854 weight = c_weights[idx];
1855 if( weight < FLT_EPSILON )
// per class: lc[k] += t, rc[k] -= t, and accumulate the weighted sums of squares
1860 for( k = 0; k < m; k++ )
1863 int lval = lc[k] + t;
1864 int rval = rc[k] - t;
1865 double p = priors[k], p2 = p*p;
1866 lsum2 += p2*lval*lval;
1867 rsum2 += p2*rval*rval;
1868 lc[k] = lval; rc[k] = rval;
// per class: lc[k] -= t, rc[k] += t (the opposite move)
1875 for( k = 0; k < m; k++ )
1878 int lval = lc[k] - t;
1879 int rval = rc[k] + t;
1880 double p = priors[k], p2 = p*p;
1881 lsum2 += p2*lval*lval;
1882 rsum2 += p2*rval*rval;
1883 lc[k] = lval; rc[k] = rval;
// score the subset only when both sides carry non-negligible weight
1889 if( L > FLT_EPSILON && R > FLT_EPSILON )
1891 double val = (lsum2*R + rsum2*L)/((double)L*R);
1892 if( best_val < val )
1895 best_subset = subset_i;
// best_subset < 0 means that no acceptable subset was found
1900 if( best_subset < 0 )
1903 split = data->new_split_cat( vi, (float)best_val );
// sorted-order case: the subset is the first best_subset+1 categories in sorted order
1907 for( i = 0; i <= best_subset; i++ )
1909 idx = (int)(int_ptr[i] - cjk) >> 1;
1910 split->subset[idx >> 5] |= 1 << (idx & 31);
// bit-mask case: map every original category through its cluster label (if any)
1915 for( i = 0; i < _mi; i++ )
1917 idx = cluster_labels ? cluster_labels[i] : i;
1918 if( best_subset & (1 << idx) )
1919 split->subset[i >> 5] |= 1 << (i & 31);
// Searches for the best threshold on ordered variable `vi` for a regression
// node. Candidate positions are scored with
// (lsum*lsum*R + rsum*rsum*L)/(L*R), where lsum/rsum are the response sums
// on each side. Returns a new ordered split placed halfway between the two
// neighbouring values, or 0 when no position was accepted.
1927 CvDTreeSplit* CvDTree::find_split_ord_reg( CvDTreeNode* node, int vi )
// minimal gap between neighbouring values for a threshold to be placed between them
1929 const float epsilon = FLT_EPSILON*2;
1930 const CvPair32s32f* sorted = data->get_ord_var_data(node, vi);
1931 const float* responses = data->get_ord_responses(node);
1932 int n = node->sample_count;
1933 int n1 = node->get_num_valid(vi);
// rsum starts as node->value*n
1935 double best_val = 0, lsum = 0, rsum = node->value*n;
1938 // compensate for missing values
1939 for( i = n1; i < n; i++ )
1940 rsum -= responses[sorted[i].i];
1942 // find the optimal split
1943 for( i = 0; i < n1 - 1; i++ )
1945 float t = responses[sorted[i].i];
1950 if( sorted[i].val + epsilon < sorted[i+1].val )
1952 double val = (lsum*lsum*R + rsum*rsum*L)/((double)L*R);
1953 if( best_val < val )
// threshold = midpoint between sorted[best_i] and sorted[best_i+1]
1961 return best_i >= 0 ? data->new_split_ord( vi,
1962 (sorted[best_i].val + sorted[best_i+1].val)*0.5f, best_i,
1963 0, (float)best_val ) : 0;
// Searches for the best subset split on categorical variable `vi` for a
// regression node. Categories are ordered by their average response, and
// every prefix of that ordering is scored with
// (lsum*lsum*R + rsum*rsum*L)/(L*R). The best prefix is encoded as a bit
// mask in the returned split.
1967 CvDTreeSplit* CvDTree::find_split_cat_reg( CvDTreeNode* node, int vi )
1969 CvDTreeSplit* split;
1970 const int* labels = data->get_cat_var_data(node, vi);
1971 const float* responses = data->get_ord_responses(node);
1972 int ci = data->get_var_type(vi);
1973 int n = node->sample_count;
1974 int mi = data->cat_count->data.i[ci];
// both arrays get one extra leading element, so that index -1 is addressable
1975 double* sum = (double*)cvStackAlloc( (mi+1)*sizeof(sum[0]) ) + 1;
1976 int* counts = (int*)cvStackAlloc( (mi+1)*sizeof(counts[0]) ) + 1;
1977 double** sum_ptr = 0;
1978 int i, L = 0, R = 0;
1979 double best_val = 0, lsum = 0, rsum = 0;
1980 int best_subset = -1, subset_i;
1982 for( i = -1; i < mi; i++ )
1983 sum[i] = counts[i] = 0;
1985 // calculate sum response and weight of each category of the input var
1986 for( i = 0; i < n; i++ )
1988 int idx = labels[i];
1989 double s = sum[idx] + responses[i];
1990 int nc = counts[idx] + 1;
1995 // calculate average response in each category
1996 for( i = 0; i < mi; i++ )
// MAX(..,1) avoids a division by zero for empty categories
2000 sum[i] /= MAX(counts[i],1);
2001 sum_ptr[i] = sum + i;
// sort the pointers to the per-category averages
2004 icvSortDblPtr( sum_ptr, mi, 0 );
2006 // revert back to unnormalized sums
2007 // (there should be a very little loss of accuracy)
2008 for( i = 0; i < mi; i++ )
2009 sum[i] *= counts[i];
// scan the categories in sorted order
2011 for( subset_i = 0; subset_i < mi-1; subset_i++ )
2013 int idx = (int)(sum_ptr[subset_i] - sum);
2014 int ni = counts[idx];
2018 double s = sum[idx];
2024 double val = (lsum*lsum*R + rsum*rsum*L)/((double)L*R);
2025 if( best_val < val )
2028 best_subset = subset_i;
// best_subset < 0 means that no acceptable subset was found
2034 if( best_subset < 0 )
2037 split = data->new_split_cat( vi, (float)best_val );
// set the bits of the first best_subset+1 categories in sorted order
2038 for( i = 0; i <= best_subset; i++ )
2040 int idx = (int)(sum_ptr[i] - sum);
2041 split->subset[idx >> 5] |= 1 << (idx & 31);
// Searches for a threshold on ordered variable `vi` that best reproduces the
// directions chosen by the primary split (read from data->direction).
// Both the direct (LL + RR) and the inversed (RL + LR) agreement are
// considered. A split is returned only when its agreement exceeds
// node->maxlr; otherwise the function returns 0.
2048 CvDTreeSplit* CvDTree::find_surrogate_split_ord( CvDTreeNode* node, int vi )
// minimal gap between neighbouring values for a threshold to be placed between them
2050 const float epsilon = FLT_EPSILON*2;
2051 const CvPair32s32f* sorted = data->get_ord_var_data(node, vi);
2052 const char* dir = (char*)data->direction->data.ptr;
2053 int n1 = node->get_num_valid(vi);
2054 // LL - number of samples that both the primary and the surrogate splits send to the left
2055 // LR - ... primary split sends to the left and the surrogate split sends to the right
2056 // RL - ... primary split sends to the right and the surrogate split sends to the left
2057 // RR - ... both send to the right
2058 int i, best_i = -1, best_inversed = 0;
2061 if( !data->have_priors )
2063 int LL = 0, RL = 0, LR, RR;
// the search starts from floor(maxlr), so only better candidates are recorded
2064 int worst_val = cvFloor(node->maxlr), _best_val = worst_val;
2065 int sum = 0, sum_abs = 0;
// accumulate the primary directions over the samples that have a valid value of vi
2067 for( i = 0; i < n1; i++ )
2069 int d = dir[sorted[i].i];
2070 sum += d; sum_abs += d & 1;
2073 // sum_abs = R + L; sum = R - L
2074 RR = (sum_abs + sum) >> 1;
2075 LR = (sum_abs - sum) >> 1;
2077 // initially all the samples are sent to the right by the surrogate split,
2078 // LR of them are sent to the left by primary split, and RR - to the right.
2079 // now iteratively compute LL, LR, RL and RR for every possible surrogate split value.
2080 for( i = 0; i < n1 - 1; i++ )
2082 int d = dir[sorted[i].i];
// candidate that agrees with the primary split directly
2087 if( LL + RR > _best_val && sorted[i].val + epsilon < sorted[i+1].val )
2090 best_i = i; best_inversed = 0;
// candidate that agrees with the primary split after swapping left and right
2096 if( RL + LR > _best_val && sorted[i].val + epsilon < sorted[i+1].val )
2099 best_i = i; best_inversed = 1;
2103 best_val = _best_val;
// prior-weighted variant: the same procedure with sums of class weights
2107 double LL = 0, RL = 0, LR, RR;
2108 double worst_val = node->maxlr;
2109 double sum = 0, sum_abs = 0;
2110 const double* priors = data->priors_mult->data.db;
2111 const int* responses = data->get_class_labels(node);
2112 best_val = worst_val;
2114 for( i = 0; i < n1; i++ )
2116 int idx = sorted[i].i;
2117 double w = priors[responses[idx]];
2119 sum += d*w; sum_abs += (d & 1)*w;
2122 // sum_abs = R + L; sum = R - L
2123 RR = (sum_abs + sum)*0.5;
2124 LR = (sum_abs - sum)*0.5;
2126 // initially all the samples are sent to the right by the surrogate split,
2127 // LR of them are sent to the left by primary split, and RR - to the right.
2128 // now iteratively compute LL, LR, RL and RR for every possible surrogate split value.
2129 for( i = 0; i < n1 - 1; i++ )
2131 int idx = sorted[i].i;
2132 double w = priors[responses[idx]];
2138 if( LL + RR > best_val && sorted[i].val + epsilon < sorted[i+1].val )
2141 best_i = i; best_inversed = 0;
2147 if( RL + LR > best_val && sorted[i].val + epsilon < sorted[i+1].val )
2150 best_i = i; best_inversed = 1;
// a surrogate is created only when it beats node->maxlr
2156 return best_i >= 0 && best_val > node->maxlr ? data->new_split_ord( vi,
2157 (sorted[best_i].val + sorted[best_i+1].val)*0.5f, best_i,
2158 best_inversed, (float)best_val ) : 0;
// Builds a surrogate split on categorical variable `vi`: for every category
// it accumulates how much sample weight the primary split sends left and
// right, then forms the subset from these per-category totals. The split is
// discarded (removed from the split heap, pointer reset to 0) when its
// quality does not exceed node->maxlr or when l_win is 0 or mi.
2162 CvDTreeSplit* CvDTree::find_surrogate_split_cat( CvDTreeNode* node, int vi )
2164 const int* labels = data->get_cat_var_data(node, vi);
2165 const char* dir = (char*)data->direction->data.ptr;
2166 int n = node->sample_count;
2167 // LL - number of samples that both the primary and the surrogate splits send to the left
2168 // LR - ... primary split sends to the left and the surrogate split sends to the right
2169 // RL - ... primary split sends to the right and the surrogate split sends to the left
2170 // RR - ... both send to the right
// the split object is allocated up front, with quality 0
2171 CvDTreeSplit* split = data->new_split_cat( vi, 0 );
2172 int i, mi = data->cat_count->data.i[data->get_var_type(vi)], l_win = 0;
2173 double best_val = 0;
// lc and rc live in one buffer; each has an extra leading element, so that
// index -1 is addressable
2174 double* lc = (double*)cvStackAlloc( (mi+1)*2*sizeof(lc[0]) ) + 1;
2175 double* rc = lc + mi + 1;
2177 for( i = -1; i < mi; i++ )
2180 // for each category calculate the weight of samples
2181 // sent to the left (lc) and to the right (rc) by the primary split
2182 if( !data->have_priors )
// integer accumulators with the same layout as lc/rc
2184 int* _lc = (int*)cvStackAlloc((mi+2)*2*sizeof(_lc[0])) + 1;
2185 int* _rc = _lc + mi + 1;
2187 for( i = -1; i < mi; i++ )
2188 _lc[i] = _rc[i] = 0;
2190 for( i = 0; i < n; i++ )
2192 int idx = labels[i];
// during accumulation _lc holds the sum of d and _rc the sum of (d & 1)
2194 int sum = _lc[idx] + d;
2195 int sum_abs = _rc[idx] + (d & 1);
2196 _lc[idx] = sum; _rc[idx] = sum_abs;
// convert (sum, sum_abs) of each category into left and right counts
2199 for( i = 0; i < mi; i++ )
2202 int sum_abs = _rc[i];
2203 lc[i] = (sum_abs - sum) >> 1;
2204 rc[i] = (sum_abs + sum) >> 1;
// prior-weighted variant: accumulate directly in lc/rc using class weights
2209 const double* priors = data->priors_mult->data.db;
2210 const int* responses = data->get_class_labels(node);
2212 for( i = 0; i < n; i++ )
2214 int idx = labels[i];
2215 double w = priors[responses[i]];
2217 double sum = lc[idx] + d*w;
2218 double sum_abs = rc[idx] + (d & 1)*w;
2219 lc[idx] = sum; rc[idx] = sum_abs;
2222 for( i = 0; i < mi; i++ )
2225 double sum_abs = rc[i];
2226 lc[i] = (sum_abs - sum) * 0.5;
2227 rc[i] = (sum_abs + sum) * 0.5;
2231 // 2. now form the split.
2232 // in each category send all the samples to the same direction as majority
2233 for( i = 0; i < mi; i++ )
2235 double lval = lc[i], rval = rc[i];
// mark category i in the subset bit mask
2238 split->subset[i >> 5] |= 1 << (i & 31);
2246 split->quality = (float)best_val;
// reject the surrogate when it is no better than maxlr or when l_win is 0 or mi
2247 if( split->quality <= node->maxlr || l_win == 0 || l_win == mi )
2248 cvSetRemoveByPtr( data->split_heap, split ), split = 0;
// Computes the value and the risk of `node`, plus the per-fold
// cross-validation risk and error (node->cv_node_risk / node->cv_node_error)
// for each of the params.cv_folds folds. Classification and regression trees
// are handled by separate branches; see the comments inside each branch.
2254 void CvDTree::calc_node_value( CvDTreeNode* node )
2256 int i, j, k, n = node->sample_count, cv_n = data->params.cv_folds;
// cross-validation fold label of every sample in the node
2257 const int* cv_labels = data->get_labels(node);
2259 if( data->is_classifier )
2261 // in case of classification tree:
2262 // * node value is the label of the class that has the largest weight in the node.
2263 // * node risk is the weighted number of misclassified samples,
2264 // * j-th cross-validation fold value and risk are calculated as above,
2265 // but using the samples with cv_labels(*)!=j.
2266 // * j-th cross-validation fold error is calculated as the weighted number of
2267 // misclassified samples with cv_labels(*)==j.
2269 // compute the number of instances of each class
2270 int* cls_count = data->counts->data.i;
2271 const int* responses = data->get_class_labels(node);
2272 int m = data->get_num_classes();
// cv_n rows of m per-class counters, one row per fold
2273 int* cv_cls_count = (int*)cvStackAlloc(m*cv_n*sizeof(cv_cls_count[0]));
2274 double max_val = -1, total_weight = 0;
2276 double* priors = data->priors_mult->data.db;
2278 for( k = 0; k < m; k++ )
// direct per-class count over all samples of the node
2283 for( i = 0; i < n; i++ )
2284 cls_count[responses[i]]++;
// per-fold counting: clear, count by (fold, class), then sum the folds into cls_count
2288 for( j = 0; j < cv_n; j++ )
2289 for( k = 0; k < m; k++ )
2290 cv_cls_count[j*m + k] = 0;
2292 for( i = 0; i < n; i++ )
2294 j = cv_labels[i]; k = responses[i];
2295 cv_cls_count[j*m + k]++;
2298 for( j = 0; j < cv_n; j++ )
2299 for( k = 0; k < m; k++ )
2300 cls_count[k] += cv_cls_count[j*m + k];
// done only at a node without a parent, and only when priors are in use
2303 if( data->have_priors && node->parent == 0 )
2305 // compute priors_mult from priors, take the sample ratio into account.
2307 for( k = 0; k < m; k++ )
2309 int n_k = cls_count[k];
// prior divided by the class count (0 for an empty class)
2310 priors[k] = data->priors->data.db[k]*(n_k ? 1./n_k : 0.);
2314 for( k = 0; k < m; k++ )
// weight of every class; total_weight accumulates their sum
2318 for( k = 0; k < m; k++ )
2320 double val = cls_count[k]*priors[k];
2321 total_weight += val;
// node value = original label of class max_k, looked up through cat_map
2329 node->class_idx = max_k;
2330 node->value = data->cat_map->data.i[
2331 data->cat_ofs->data.i[data->cat_var_count] + max_k];
// risk = total weight minus the stored maximum class weight
2332 node->node_risk = total_weight - max_val;
// the same statistics per fold j: `val` uses the samples outside the fold,
// `val_k` the samples inside the fold
2334 for( j = 0; j < cv_n; j++ )
2336 double sum_k = 0, sum = 0, max_val_k = 0;
2337 max_val = -1; max_k = -1;
2339 for( k = 0; k < m; k++ )
2341 double w = priors[k];
2342 double val_k = cv_cls_count[j*m + k]*w;
2343 double val = cls_count[k]*w - val_k;
2354 node->cv_Tn[j] = INT_MAX;
2355 node->cv_node_risk[j] = sum - max_val;
2356 node->cv_node_error[j] = sum_k - max_val_k;
2361 // in case of regression tree:
2362 // * node value is 1/n*sum_i(Y_i), where Y_i is i-th response,
2363 // n is the number of samples in the node.
2364 // * node risk is the sum of squared errors: sum_i((Y_i - <node_value>)^2)
2365 // * j-th cross-validation fold value and risk are calculated as above,
2366 // but using the samples with cv_labels(*)!=j.
2367 // * j-th cross-validation fold error is calculated
2368 // using samples with cv_labels(*)==j as the test subset:
2369 // error_j = sum_(i,cv_labels(i)==j)((Y_i - <node_value_j>)^2),
2370 // where node_value_j is the node value calculated
2371 // as described in the previous bullet, and summation is done
2372 // over the samples with cv_labels(*)==j.
2374 double sum = 0, sum2 = 0;
2375 const float* values = data->get_ord_responses(node);
2376 double *cv_sum = 0, *cv_sum2 = 0;
2381 for( i = 0; i < n; i++ )
2383 double t = values[i];
// per-fold accumulators: sum, sum of squares and sample count
2390 cv_sum = (double*)cvStackAlloc( cv_n*sizeof(cv_sum[0]) );
2391 cv_sum2 = (double*)cvStackAlloc( cv_n*sizeof(cv_sum2[0]) );
2392 cv_count = (int*)cvStackAlloc( cv_n*sizeof(cv_count[0]) );
2394 for( j = 0; j < cv_n; j++ )
2396 cv_sum[j] = cv_sum2[j] = 0.;
2400 for( i = 0; i < n; i++ )
2403 double t = values[i];
2404 double s = cv_sum[j] + t;
2405 double s2 = cv_sum2[j] + t*t;
2406 int nc = cv_count[j] + 1;
2412 for( j = 0; j < cv_n; j++ )
// sum of squared errors written as sum2 - (sum/n)*sum; the value is the mean
2419 node->node_risk = sum2 - (sum/n)*sum;
2420 node->value = sum/n;
2422 for( j = 0; j < cv_n; j++ )
// s, s2, c describe fold j; si, s2i, ci describe the samples outside fold j
2424 double s = cv_sum[j], si = sum - s;
2425 double s2 = cv_sum2[j], s2i = sum2 - s2;
2426 int c = cv_count[j], ci = n - c;
// r = mean response outside the fold (MAX guards against ci == 0)
2427 double r = si/MAX(ci,1);
2428 node->cv_node_risk[j] = s2i - r*r*ci;
// squared error of predicting r for the samples of fold j
2429 node->cv_node_error[j] = s2 - 2*r*s + c*r*r;
2430 node->cv_Tn[j] = INT_MAX;
// Completes the per-sample direction buffer of `node` for samples the primary
// split could not direct (dir == 0): first the surrogate splits are tried in
// list order, then a default direction d0 is applied. At the end the
// directions are remapped from (-1,1) to (0,1).
2436 void CvDTree::complete_node_dir( CvDTreeNode* node )
2438 int vi, i, n = node->sample_count, nl, nr, d0 = 0, d1 = -1;
// nz = number of samples without a valid value of the primary split variable
2439 int nz = n - node->get_num_valid(node->split->var_idx);
2440 char* dir = (char*)data->direction->data.ptr;
2442 // try to complete direction using surrogate splits
2443 if( nz && data->params.use_surrogates )
// surrogates follow the primary split in the node's split list
2445 CvDTreeSplit* split = node->split->next;
2446 for( ; split != 0 && nz; split = split->next )
// -1 when the surrogate is inversed, 0 otherwise
2448 int inversed_mask = split->inversed ? -1 : 0;
2449 vi = split->var_idx;
2451 if( data->get_var_type(vi) >= 0 ) // split on categorical var
2453 const int* labels = data->get_cat_var_data(node, vi);
2454 const int* subset = split->subset;
2456 for( i = 0; i < n; i++ )
// only samples that are still undirected and have a valid category
2459 if( !dir[i] && (idx = labels[i]) >= 0 )
2461 int d = CV_DTREE_CAT_DIR(idx,subset);
// (d ^ mask) - mask negates d when mask == -1 and keeps it when mask == 0
2462 dir[i] = (char)((d ^ inversed_mask) - inversed_mask);
2468 else // split on ordered var
2470 const CvPair32s32f* sorted = data->get_ord_var_data(node, vi);
2471 int split_point = split->ord.split_point;
2472 int n1 = node->get_num_valid(vi);
// NOTE(review): the bound here is n-1, while calc_node_dir() asserts
// split_point < n1-1 for the same kind of split -- confirm which is intended.
2474 assert( 0 <= split_point && split_point < n-1 );
2476 for( i = 0; i < n1; i++ )
2478 int idx = sorted[i].i;
// entries up to split_point go left, the others go right (before inversion)
2481 int d = i <= split_point ? -1 : 1;
2482 dir[idx] = (char)((d ^ inversed_mask) - inversed_mask);
2491 // find the default direction for the rest
2494 for( i = nr = 0; i < n; i++ )
// d0 = -1 when nl > nr, 1 when nr > nl, 0 on a tie
2497 d0 = nl > nr ? -1 : nr > nl;
2500 // make sure that every sample is directed either to the left or to the right
2501 for( i = 0; i < n; i++ )
2511 dir[i] = (char)d; // remap (-1,1) to (0,1)
// Creates the left and right children of `node` and distributes the node's
// per-variable data between them, following the directions produced by
// complete_node_dir(). Ordered variables are copied so that both halves stay
// sorted; categorical variables, responses and cv labels are split with the
// new_idx relocation table. The parent's node data is released at the end.
2516 void CvDTree::split_node_data( CvDTreeNode* node )
2518 int vi, i, n = node->sample_count, nl, nr;
2519 char* dir = (char*)data->direction->data.ptr;
2520 CvDTreeNode *left = 0, *right = 0;
// relocation table: the new position of every sample inside its child
2521 int* new_idx = data->split_buf->data.i;
2522 int new_buf_idx = data->get_child_buf_idx( node );
2523 int work_var_count = data->get_work_var_count();
2525 // speed things up a little, especially for tree ensembles with lots of small trees:
2526 // do not physically split the input data between the left and right child nodes
2527 // when we are not going to split them further,
2528 // as calc_node_value() does not require input features anyway.
2529 bool split_input_data;
// after this call every dir[i] is expected to be 0 (left) or 1 (right)
2531 complete_node_dir(node);
2533 for( i = nl = nr = 0; i < n; i++ )
2536 // initialize new indices for splitting ordered variables
2537 new_idx[i] = (nl & (d-1)) | (nr & -d); // d ? ri : li
// both children share the buffer index; the right child's offset is advanced
// by (ord_var_count + work_var_count)*nl
2542 node->left = left = data->new_node( node, nl, new_buf_idx, node->offset );
2543 node->right = right = data->new_node( node, nr, new_buf_idx, node->offset +
2544 (data->ord_var_count + work_var_count)*nl );
// input features are copied only if at least one child can still be split
2546 split_input_data = node->depth + 1 < data->params.max_depth &&
2547 (node->left->sample_count > data->params.min_sample_count ||
2548 node->right->sample_count > data->params.min_sample_count);
2550 // split ordered variables, keep both halves sorted.
2551 for( vi = 0; vi < data->var_count; vi++ )
2553 int ci = data->get_var_type(vi);
2554 int n1 = node->get_num_valid(vi);
2555 CvPair32s32f *src, *ldst0, *rdst0, *ldst, *rdst;
2556 CvPair32s32f tl, tr;
// tested for categorical variables (ci >= 0) and for the no-copy case
2558 if( ci >= 0 || !split_input_data )
2561 src = data->get_ord_var_data(node, vi);
2562 ldst0 = ldst = data->get_ord_var_data(left, vi);
2563 rdst0 = rdst = data->get_ord_var_data(right, vi);
// save the elements just past each destination range; they are restored below
2564 tl = ldst0[nl]; tr = rdst0[nr];
2567 for( i = 0; i < n1; i++ )
2570 float val = src[i].val;
// the element is written to both destinations
2573 ldst->i = rdst->i = idx;
2574 ldst->val = rdst->val = val;
2579 left->set_num_valid(vi, (int)(ldst - ldst0));
2580 right->set_num_valid(vi, (int)(rdst - rdst0));
// these entries are stored with the ord_nan value
2588 ldst->i = rdst->i = idx;
2589 ldst->val = rdst->val = ord_nan;
2594 ldst0[nl] = tl; rdst0[nr] = tr;
2597 // split categorical vars, responses and cv_labels using new_idx relocation table
2598 for( vi = 0; vi < work_var_count; vi++ )
2600 int ci = data->get_var_type(vi);
2601 int n1 = node->get_num_valid(vi), nr1 = 0;
2602 int *src, *ldst0, *rdst0, *ldst, *rdst;
// tested for ordered variables (ci < 0) and for input variables in the no-copy case
2605 if( ci < 0 || (vi < data->var_count && !split_input_data) )
2608 src = data->get_cat_var_data(node, vi);
2609 ldst0 = ldst = data->get_cat_var_data(left, vi);
2610 rdst0 = rdst = data->get_cat_var_data(right, vi);
2611 tl = ldst0[nl]; tr = rdst0[nr];
2613 for( i = 0; i < n; i++ )
2617 *ldst = *rdst = val;
// nr1 counts the valid (val >= 0) samples that go to the right child
2620 nr1 += (val >= 0)&d;
// valid counts are set only for indices below data->var_count
2623 if( vi < data->var_count )
2625 left->set_num_valid(vi, n1 - nr1);
2626 right->set_num_valid(vi, nr1);
2629 ldst0[nl] = tl; rdst0[nr] = tr;
2632 // deallocate the parent node data that is not needed anymore
2633 data->free_node_data(node);
// Chooses the pruning level of the trained tree using the cross-validation
// statistics stored in the nodes; the steps are listed below. The selected
// index is stored in pruned_tree_idx, and the pruning data is then freed.
2637 void CvDTree::prune_cv()
2643 // 1. build tree sequence for each cv fold, calculate error_{Tj,beta_k}.
2644 // 2. choose the best tree index (if needed, apply 1SE rule).
2645 // 3. store the best index and cut the branches.
2647 CV_FUNCNAME( "CvDTree::prune_cv" );
2651 int ti, j, tree_count = 0, cv_n = data->params.cv_folds, n = root->sample_count;
2652 // currently, 1SE for regression is not implemented
2653 bool use_1se = data->params.use_1se_rule != 0 && data->is_classifier;
2655 double min_err = 0, min_err_se = 0;
// ab holds one alpha value per tree of the sequence; it starts with 256 columns
2658 CV_CALL( ab = cvCreateMat( 1, 256, CV_64F ));
2660 // build the main tree sequence, calculate alpha's
2663 double min_alpha = update_tree_rnc(tree_count, -1);
2664 if( cut_tree(tree_count, -1, min_alpha) )
// grow ab by a factor of 1.5 when it is full, copying the existing values
2667 if( ab->cols <= tree_count )
2669 CV_CALL( temp = cvCreateMat( 1, ab->cols*3/2, CV_64F ));
2670 for( ti = 0; ti < ab->cols; ti++ )
2671 temp->data.db[ti] = ab->data.db[ti];
2672 cvReleaseMat( &ab );
2677 ab->data.db[tree_count] = min_alpha;
2680 ab->data.db[0] = 0.;
2682 if( tree_count > 0 )
// replace every interior alpha by the geometric mean of itself and its
// successor; the last entry becomes DBL_MAX*0.5
2684 for( ti = 1; ti < tree_count-1; ti++ )
2685 ab->data.db[ti] = sqrt(ab->data.db[ti]*ab->data.db[ti+1]);
2686 ab->data.db[tree_count-1] = DBL_MAX*0.5;
// error table: cv_n rows (folds) by tree_count columns (trees of the sequence)
2688 CV_CALL( err_jk = cvCreateMat( cv_n, tree_count, CV_64F ));
2689 err = err_jk->data.db;
2691 for( j = 0; j < cv_n; j++ )
// build the pruning sequence of fold j and record root->tree_error per tree index tk
2694 for( ; tk < tree_count; tj++ )
2696 double min_alpha = update_tree_rnc(tj, j);
2697 if( cut_tree(tj, j, min_alpha) )
2698 min_alpha = DBL_MAX;
2700 for( ; tk < tree_count; tk++ )
2702 if( ab->data.db[tk] > min_alpha )
2704 err[j*tree_count + tk] = root->tree_error;
// sum the error over all folds for every tree and track the minimum
2709 for( ti = 0; ti < tree_count; ti++ )
2712 for( j = 0; j < cv_n; j++ )
2713 sum_err += err[j*tree_count + ti];
2714 if( ti == 0 || sum_err < min_err )
2719 min_err_se = sqrt( sum_err*(n - sum_err) );
// tested for trees whose error is below min_err + min_err_se
2721 else if( sum_err < min_err + min_err_se )
2726 pruned_tree_idx = min_idx;
// free the pruning data; branches are cut when truncate_pruned_tree is set
2727 free_prune_data(data->params.truncate_pruned_tree != 0);
2731 cvReleaseMat( &err_jk );
2732 cvReleaseMat( &ab );
2733 cvReleaseMat( &temp );
// Walks the tree without recursion and recomputes complexity, tree_risk and
// tree_error of every node for pruning step T, using the main-tree
// statistics when fold < 0 and the statistics of fold `fold` otherwise.
// The alpha of every internal node is updated, and the smallest alpha is
// tracked in min_alpha.
2737 double CvDTree::update_tree_rnc( int T, int fold )
2739 CvDTreeNode* node = root;
2740 double min_alpha = DBL_MAX;
2744 CvDTreeNode* parent;
// Tn of the main tree or of the selected fold
2747 int t = fold >= 0 ? node->cv_Tn[fold] : node->Tn;
// the node is tested for being a leaf at step T: already cut, or no children
2748 if( t <= T || !node->left )
// leaf statistics: complexity 1, risk of the node itself, no error
2750 node->complexity = 1;
2751 node->tree_risk = node->node_risk;
2752 node->tree_error = 0.;
// fold-specific leaf statistics
2755 node->tree_risk = node->cv_node_risk[fold];
2756 node->tree_error = node->cv_node_error[fold];
// climb while coming from a right child, adding that child's totals to its parent
2763 for( parent = node->parent; parent && parent->right == node;
2764 node = parent, parent = parent->parent )
2766 parent->complexity += node->complexity;
2767 parent->tree_risk += node->tree_risk;
2768 parent->tree_error += node->tree_error;
// alpha = (risk of the node itself - risk of its subtree)/(complexity - 1)
2770 parent->alpha = ((fold >= 0 ? parent->cv_node_risk[fold] : parent->node_risk)
2771 - parent->tree_risk)/(parent->complexity - 1);
2772 min_alpha = MIN( min_alpha, parent->alpha );
// initialize the parent's totals from this node, then move to the parent's right child
2778 parent->complexity = node->complexity;
2779 parent->tree_risk = node->tree_risk;
2780 parent->tree_error = node->tree_error;
2781 node = parent->right;
// Walks the tree without recursion for pruning step T and tests each node's
// alpha against min_alpha (with a FLT_EPSILON tolerance). `fold` selects the
// main tree (fold < 0) or a cross-validation fold.
2788 int CvDTree::cut_tree( int T, int fold, double min_alpha )
2790 CvDTreeNode* node = root;
2796 CvDTreeNode* parent;
// Tn of the main tree or of the selected fold
2799 int t = fold >= 0 ? node->cv_Tn[fold] : node->Tn;
// the node is tested for being a leaf at step T: already cut, or no children
2800 if( t <= T || !node->left )
// nodes whose alpha does not exceed the threshold are selected here
2802 if( node->alpha <= min_alpha + FLT_EPSILON )
// record the step T in the fold-specific Tn
2805 node->cv_Tn[fold] = T;
// climb while coming from a right child
2815 for( parent = node->parent; parent && parent->right == node;
2816 node = parent, parent = parent->parent )
// continue with the right child of the parent
2822 node = parent->right;
// Walks the tree without recursion and drops the cross-validation data of
// every node. When `cut_tree` is true, the children of nodes with
// Tn <= pruned_tree_idx are freed. Finally the whole cv heap is cleared.
2829 void CvDTree::free_prune_data(bool cut_tree)
2831 CvDTreeNode* node = root;
2835 CvDTreeNode* parent;
2838 // do not call cvSetRemoveByPtr( cv_heap, node->cv_Tn )
2839 // as we will clear the whole cross-validation heap at the end
2841 node->cv_node_error = node->cv_node_risk = 0;
// climb while coming from a right child
2847 for( parent = node->parent; parent && parent->right == node;
2848 node = parent, parent = parent->parent )
// free both children and detach them from the parent
2850 if( cut_tree && parent->Tn <= pruned_tree_idx )
2852 data->free_node( parent->left );
2853 data->free_node( parent->right );
2854 parent->left = parent->right = 0;
// continue with the right child of the parent
2861 node = parent->right;
2865 cvClearSet( data->cv_heap );
// Releases the nodes of the tree. The work is done only when a root exists
// and the training data object exists and is marked as shared.
2869 void CvDTree::free_tree()
2871 if( root && data && data->shared )
// set pruned_tree_idx to INT_MIN before calling free_prune_data(true)
2873 pruned_tree_idx = INT_MIN;
2874 free_prune_data(true);
2875 data->free_node(root);
// Routes one sample down the tree, choosing a child at every node from the
// node's split list, and stops at a node that has no left child or whose
// Tn <= pruned_tree_idx.
//   _sample            - CV_32FC1 matrix with a single row or a single column;
//                        must hold data->var_all elements for raw input or
//                        data->var_count elements for preprocessed input
//   _missing           - 8-bit mask of the same size as _sample; a non-zero
//                        entry is tested before a variable is used
//                        (presumably optional - the guarding test is not visible)
//   preprocessed_input - when true, var_idx remapping and the categorical
//                        cache are not set up
// NOTE(review): the embedded line numbers jump, so a number of declarations
// (catbuf, m, vtype, vidx, cmap, cofs, dir, ci, c, ...) and the final return
// are not visible in this excerpt.
2881 CvDTreeNode* CvDTree::predict( const CvMat* _sample,
2882 const CvMat* _missing, bool preprocessed_input ) const
2884 CvDTreeNode* result = 0;
2887 CV_FUNCNAME( "CvDTree::predict" );
2891 int i, step, mstep = 0;
2892 const float* sample;
2894 CvDTreeNode* node = root;
2901 CV_ERROR( CV_StsError, "The tree has not been trained yet" );
// cols + rows - 1 is the element count of a 1 x N or N x 1 matrix
2903 if( !CV_IS_MAT(_sample) || CV_MAT_TYPE(_sample->type) != CV_32FC1 ||
2904 _sample->cols != 1 && _sample->rows != 1 ||
2905 _sample->cols + _sample->rows - 1 != data->var_all && !preprocessed_input ||
2906 _sample->cols + _sample->rows - 1 != data->var_count && preprocessed_input )
2907 CV_ERROR( CV_StsBadArg,
2908 "the input sample must be 1d floating-point vector with the same "
2909 "number of elements as the total number of variables used for training" );
2911 sample = _sample->data.fl;
// distance between consecutive elements, in floats: 1 for continuous data,
// otherwise the row step converted from bytes
2912 step = CV_IS_MAT_CONT(_sample->type) ? 1 : _sample->step/sizeof(sample[0]);
2914 if( data->cat_count && !preprocessed_input ) // cache for categorical variables
2916 int n = data->cat_count->cols;
// one int per categorical variable, taken from the stack via cvStackAlloc
2917 catbuf = (int*)cvStackAlloc(n*sizeof(catbuf[0]));
2918 for( i = 0; i < n; i++ )
2924 if( !CV_IS_MAT(_missing) || !CV_IS_MASK_ARR(_missing) ||
2925 !CV_ARE_SIZES_EQ(_missing, _sample) )
2926 CV_ERROR( CV_StsBadArg,
2927 "the missing data mask must be 8-bit vector of the same size as input sample" );
2928 m = _missing->data.ptr;
// same stride logic as for the sample, in mask elements
2929 mstep = CV_IS_MAT_CONT(_missing->type) ? 1 : _missing->step/sizeof(m[0]);
2932 vtype = data->var_type->data.i;
// the var_idx remapping is applied to raw input only
2933 vidx = data->var_idx && !preprocessed_input ? data->var_idx->data.i : 0;
2934 cmap = data->cat_map ? data->cat_map->data.i : 0;
2935 cofs = data->cat_ofs ? data->cat_ofs->data.i : 0;
// descend while the node has a left child and its Tn exceeds pruned_tree_idx
2937 while( node->Tn > pruned_tree_idx && node->left )
2939 CvDTreeSplit* split = node->split;
// walk the node's split list until one of the splits yields a direction
2941 for( ; !dir && split != 0; split = split->next )
2943 int vi = split->var_idx;
// position of the variable inside the sample vector
2945 i = vidx ? vidx[vi] : vi;
2946 float val = sample[i*step];
// a non-zero mask entry for this variable is checked before val is used
2947 if( m && m[i*mstep] )
2949 if( ci < 0 ) // ordered
// dir < 0 selects the left child below, dir > 0 the right one
2950 dir = val <= split->ord.c ? -1 : 1;
2954 if( preprocessed_input )
2961 int a = c = cofs[ci];
2963 int ival = cvRound(val);
2965 CV_ERROR( CV_StsBadArg,
2966 "one of input categorical variable is not an integer" );
// ival is compared against cmap entries - presumably a binary search over
// this variable's slice of cat_map; the loop bounds are not visible here
2971 if( ival < cmap[c] )
2973 else if( ival > cmap[c] )
2979 if( c < 0 || ival != cmap[c] )
// store the index relative to the variable's offset in the per-call cache
2982 catbuf[ci] = c -= cofs[ci];
2985 dir = CV_DTREE_CAT_DIR(c, split->subset);
2988 if( split->inversed )
// direction from child sample counts: left when the left child holds more
// samples, right otherwise (ties go right)
2994 double diff = node->right->sample_count - node->left->sample_count;
2995 dir = diff < 0 ? -1 : 1;
2997 node = dir < 0 ? node->left : node->right;
// Returns the variable-importance vector, computing it on first use.
// The vector is a 1 x data->var_count CV_64F matrix; each split in a visited
// node's split list adds its quality to the entry of the variable it tests,
// and the result is L1-normalized in place.
// NOTE(review): the embedded line numbers jump, so loop exits and some
// declarations (e.g. importance) are not visible in this excerpt.
3008 const CvMat* CvDTree::get_var_importance()
// computed only while var_importance is still null; otherwise the cached
// matrix is returned as is
3010 if( !var_importance )
3012 CvDTreeNode* node = root;
3016 var_importance = cvCreateMat( 1, data->var_count, CV_64F );
3017 cvZero( var_importance );
3018 importance = var_importance->data.db;
3022 CvDTreeNode* parent;
// follow left children downwards
3023 for( ;; node = node->left )
3025 CvDTreeSplit* split = node->split;
// test for a node without a left child or with Tn <= pruned_tree_idx; the
// guarded statement is not visible (presumably the descent stops here)
3027 if( !node->left || node->Tn <= pruned_tree_idx )
// every split in the node's list contributes its quality
3030 for( ; split != 0; split = split->next )
3031 importance[split->var_idx] += split->quality;
// climb for as long as the current node is its parent's right child
3034 for( parent = node->parent; parent && parent->right == node;
3035 node = parent, parent = parent->parent )
3041 node = parent->right;
// in-place normalization with CV_L1 and target norm 1
3044 cvNormalize( var_importance, var_importance, 1., 0, CV_L1 );
3047 return var_importance;
// Writes one split to the file storage as an unnamed flow-style map with the
// keys "var" and "quality", followed by either a category list ("in" /
// "not_in") for a categorical variable or a threshold ("le" / "gt") otherwise.
// NOTE(review): the embedded line numbers jump, so some lines (e.g. the
// declaration of ci and the else branch header) are not visible here.
3051 void CvDTree::write_split( CvFileStorage* fs, CvDTreeSplit* split )
3055 cvStartWriteStruct( fs, 0, CV_NODE_MAP + CV_NODE_FLOW );
3056 cvWriteInt( fs, "var", split->var_idx );
3057 cvWriteReal( fs, "quality", split->quality );
3059 ci = data->get_var_type(split->var_idx);
3060 if( ci >= 0 ) // split on a categorical var
3062 int i, n = data->cat_count->data.i[ci], to_right = 0, default_dir;
// count the categories for which CV_DTREE_CAT_DIR is positive
3063 for( i = 0; i < n; i++ )
3064 to_right += CV_DTREE_CAT_DIR(i,split->subset) > 0;
3066 // ad-hoc rule when to use inverse categorical split notation
3067 // to achieve more compact and clear representation
// default_dir is -1 when only few categories have a positive direction, 1 otherwise
3068 default_dir = to_right <= 1 || to_right <= MIN(3, n/2) || to_right <= n/3 ? -1 : 1;
// the tag name depends on both default_dir and the split's inversed flag
3070 cvStartWriteStruct( fs, default_dir*(split->inversed ? -1 : 1) > 0 ?
3071 "in" : "not_in", CV_NODE_SEQ+CV_NODE_FLOW );
// list only the categories whose direction is opposite to default_dir
3073 for( i = 0; i < n; i++ )
3075 int dir = CV_DTREE_CAT_DIR(i,split->subset);
3076 if( dir*default_dir < 0 )
3077 cvWriteInt( fs, 0, i );
3079 cvEndWriteStruct( fs );
// threshold form: key "le" for a regular split, "gt" for an inversed one
3082 cvWriteReal( fs, !split->inversed ? "le" : "gt", split->ord.c );
3084 cvEndWriteStruct( fs );
// Writes one tree node to the file storage as an unnamed map: depth,
// sample_count, value, (for classifiers) norm_class_idx, the pruning-related
// fields, and a "splits" sequence holding every split in the node's list.
// NOTE(review): the embedded line numbers jump; the condition guarding the
// "splits" section (if any) is not visible in this excerpt.
3088 void CvDTree::write_node( CvFileStorage* fs, CvDTreeNode* node )
3090 CvDTreeSplit* split;
3092 cvStartWriteStruct( fs, 0, CV_NODE_MAP );
3094 cvWriteInt( fs, "depth", node->depth );
3095 cvWriteInt( fs, "sample_count", node->sample_count );
3096 cvWriteReal( fs, "value", node->value );
// class_idx is stored under the key "norm_class_idx", classifiers only
3098 if( data->is_classifier )
3099 cvWriteInt( fs, "norm_class_idx", node->class_idx );
3101 cvWriteInt( fs, "Tn", node->Tn );
3102 cvWriteInt( fs, "complexity", node->complexity );
3103 cvWriteReal( fs, "alpha", node->alpha );
3104 cvWriteReal( fs, "node_risk", node->node_risk );
3105 cvWriteReal( fs, "tree_risk", node->tree_risk );
3106 cvWriteReal( fs, "tree_error", node->tree_error );
3110 cvStartWriteStruct( fs, "splits", CV_NODE_SEQ );
// splits are written in list order, starting from node->split
3112 for( split = node->split; split != 0; split = split->next )
3113 write_split( fs, split );
// closes the "splits" sequence
3115 cvEndWriteStruct( fs );
// closes the node map
3118 cvEndWriteStruct( fs );
// Writes every node of the tree through write_node(), starting at the root.
// NOTE(review): the embedded line numbers jump, so the loop headers and the
// descent/exit conditions are not visible in this excerpt.
3122 void CvDTree::write_tree_nodes( CvFileStorage* fs )
3124 //CV_FUNCNAME( "CvDTree::write_tree_nodes" );
3128 CvDTreeNode* node = root;
3130 // traverse the tree and save all the nodes in depth-first order
3133 CvDTreeNode* parent;
3136 write_node( fs, node );
// climb for as long as the current node is its parent's right child
3142 for( parent = node->parent; parent && parent->right == node;
3143 node = parent, parent = parent->parent )
// continue with the right child of the ancestor where the climb stopped
3149 node = parent->right;
// Serializes the tree under the given name as a map tagged with
// CV_TYPE_NAME_ML_TREE: training parameters first, then the variable
// importance vector when one exists.
// NOTE(review): the embedded line numbers jump between the var_importance
// write and the closing cvEndWriteStruct; whatever is written there
// (presumably the tree itself) is not visible in this excerpt.
3156 void CvDTree::write( CvFileStorage* fs, const char* name )
3158 //CV_FUNCNAME( "CvDTree::write" );
3162 cvStartWriteStruct( fs, name, CV_NODE_MAP, CV_TYPE_NAME_ML_TREE );
// called for its side effect: fills var_importance if it is still null
3164 get_var_importance();
3165 data->write_params( fs );
3166 if( var_importance )
3167 cvWrite( fs, "var_importance", var_importance );
3170 cvEndWriteStruct( fs );
// Writes the tree body into the currently open structure: pruned_tree_idx
// under the key "best_tree_idx", then a "nodes" sequence filled by
// write_tree_nodes().
3176 void CvDTree::write( CvFileStorage* fs )
3178 //CV_FUNCNAME( "CvDTree::write" );
3182 cvWriteInt( fs, "best_tree_idx", pruned_tree_idx );
3184 cvStartWriteStruct( fs, "nodes", CV_NODE_SEQ );
3185 write_tree_nodes( fs );
3186 cvEndWriteStruct( fs );
// Reads one split from a file-storage map node and builds a CvDTreeSplit.
// The "var" index must lie in [0, data->var_count). A categorical variable
// is restored from an "in" or "not_in" entry (a single integer or a sequence
// of integers); other variables are restored from an "le" or "gt" threshold.
// "quality" is read for both kinds.
// Errors are raised through CV_ERROR (CV_StsParseError / CV_StsOutOfRange).
// NOTE(review): the embedded line numbers jump, so several declarations
// (vi, ci, inseq, reader), branch headers and the return are not visible.
3192 CvDTreeSplit* CvDTree::read_split( CvFileStorage* fs, CvFileNode* fnode )
3194 CvDTreeSplit* split = 0;
3196 CV_FUNCNAME( "CvDTree::read_split" );
3202 if( !fnode || CV_NODE_TYPE(fnode->tag) != CV_NODE_MAP )
3203 CV_ERROR( CV_StsParseError, "some of the splits are not stored properly" );
// -1 is the default for a missing "var" key; the unsigned compare below
// rejects it together with any other negative or too-large index
3205 vi = cvReadIntByName( fs, fnode, "var", -1 );
3206 if( (unsigned)vi >= (unsigned)data->var_count )
3207 CV_ERROR( CV_StsOutOfRange, "Split variable index is out of range" );
3209 ci = data->get_var_type(vi);
3210 if( ci >= 0 ) // split on categorical var
3212 int i, n = data->cat_count->data.i[ci], inversed = 0, val;
3215 split = data->new_split_cat( vi, 0 );
3216 inseq = cvGetFileNodeByName( fs, fnode, "in" );
// "not_in" is looked up as well - presumably only when "in" is absent
3219 inseq = cvGetFileNodeByName( fs, fnode, "not_in" );
3223 (CV_NODE_TYPE(inseq->tag) != CV_NODE_SEQ && CV_NODE_TYPE(inseq->tag) != CV_NODE_INT))
3224 CV_ERROR( CV_StsParseError,
3225 "Either 'in' or 'not_in' tags should be inside a categorical split data" );
// a lone integer is accepted in place of a one-element sequence
3227 if( CV_NODE_TYPE(inseq->tag) == CV_NODE_INT )
3229 val = inseq->data.i;
3230 if( (unsigned)val >= (unsigned)n )
3231 CV_ERROR( CV_StsOutOfRange, "some of in/not_in elements are out of range" );
// subset is a bit mask over category indices, 32 categories per int word
// NOTE(review): 1 << 31 shifts into the sign bit of int when (val & 31) == 31
3233 split->subset[val >> 5] |= 1 << (val & 31);
3237 cvStartReadSeq( inseq->data.seq, &reader );
3239 for( i = 0; i < reader.seq->total; i++ )
3241 CvFileNode* inode = (CvFileNode*)reader.ptr;
// NOTE(review): data.i is read before the tag is checked on the next line;
// the value is only used once the check has passed
3242 val = inode->data.i;
3243 if( CV_NODE_TYPE(inode->tag) != CV_NODE_INT || (unsigned)val >= (unsigned)n )
3244 CV_ERROR( CV_StsOutOfRange, "some of in/not_in elements are out of range" );
3246 split->subset[val >> 5] |= 1 << (val & 31);
3247 CV_NEXT_SEQ_ELEM( reader.seq->elem_size, reader );
3251 // for categorical splits we do not use inversed splits,
3252 // instead we inverse the variable set in the split
// flips every bit in the (n + 31) / 32 mask words; the condition guarding
// this loop is not visible (presumably the local inversed flag)
3254 for( i = 0; i < (n + 31) >> 5; i++ )
3255 split->subset[i] ^= -1;
3259 CvFileNode* cmp_node;
3260 split = data->new_split_ord( vi, 0, 0, 0, 0 );
3262 cmp_node = cvGetFileNodeByName( fs, fnode, "le" );
// "gt" marks an inversed threshold split
3265 cmp_node = cvGetFileNodeByName( fs, fnode, "gt" );
3266 split->inversed = 1;
3269 split->ord.c = (float)cvReadReal( cmp_node );
3272 split->quality = (float)cvReadRealByName( fs, fnode, "quality" );
// Reads one tree node from a file-storage map node. A fresh node is obtained
// from data->new_node() with the given parent, the stored "depth" is checked
// against the depth of that new node, the scalar fields are filled in, and the
// "splits" sequence is turned into the node's linked list of splits.
// Errors are raised through CV_ERROR with CV_StsParseError.
// NOTE(review): the embedded line numbers jump, so some declarations (depth,
// splits, reader, i), branch headers and the return are not visible.
3280 CvDTreeNode* CvDTree::read_node( CvFileStorage* fs, CvFileNode* fnode, CvDTreeNode* parent )
3282 CvDTreeNode* node = 0;
3284 CV_FUNCNAME( "CvDTree::read_node" );
3291 if( !fnode || CV_NODE_TYPE(fnode->tag) != CV_NODE_MAP )
3292 CV_ERROR( CV_StsParseError, "some of the tree elements are not stored properly" );
3294 CV_CALL( node = data->new_node( parent, 0, 0, 0 ));
// consistency check: stored depth must match the depth of the freshly created
// node; a missing "depth" key yields -1
3295 depth = cvReadIntByName( fs, fnode, "depth", -1 );
3296 if( depth != node->depth )
3297 CV_ERROR( CV_StsParseError, "incorrect node depth" );
3299 node->sample_count = cvReadIntByName( fs, fnode, "sample_count" );
3300 node->value = cvReadRealByName( fs, fnode, "value" );
// "norm_class_idx" is read for classifiers only
3301 if( data->is_classifier )
3302 node->class_idx = cvReadIntByName( fs, fnode, "norm_class_idx" );
3304 node->Tn = cvReadIntByName( fs, fnode, "Tn" );
3305 node->complexity = cvReadIntByName( fs, fnode, "complexity" );
3306 node->alpha = cvReadRealByName( fs, fnode, "alpha" );
3307 node->node_risk = cvReadRealByName( fs, fnode, "node_risk" );
3308 node->tree_risk = cvReadRealByName( fs, fnode, "tree_risk" );
3309 node->tree_error = cvReadRealByName( fs, fnode, "tree_error" );
3311 splits = cvGetFileNodeByName( fs, fnode, "splits" );
// tail of the split list built so far
3315 CvDTreeSplit* last_split = 0;
3317 if( CV_NODE_TYPE(splits->tag) != CV_NODE_SEQ )
3318 CV_ERROR( CV_StsParseError, "splits tag must stored as a sequence" );
3320 cvStartReadSeq( splits->data.seq, &reader );
3321 for( i = 0; i < reader.seq->total; i++ )
3323 CvDTreeSplit* split;
3324 CV_CALL( split = read_split( fs, (CvFileNode*)reader.ptr ));
// one assignment installs the split as the list head, the other appends it
// after last_split; the selecting condition is not visible here
3326 node->split = last_split = split;
3328 last_split = last_split->next = split;
3330 CV_NEXT_SEQ_ELEM( reader.seq->elem_size, reader );
// Rebuilds the tree structure from a file-storage sequence of nodes. Each
// element is read with read_node() and linked under the current parent as
// its left or right child.
// NOTE(review): the embedded line numbers jump, so the declarations of _root,
// reader, node and i, the left/right selection and the root assignment are
// not visible in this excerpt.
3340 void CvDTree::read_tree_nodes( CvFileStorage* fs, CvFileNode* fnode )
3342 CV_FUNCNAME( "CvDTree::read_tree_nodes" );
// _root is a local placeholder acting as the parent of the first node read
3348 CvDTreeNode* parent = &_root;
3350 parent->left = parent->right = parent->parent = 0;
3352 cvStartReadSeq( fnode->data.seq, &reader );
3354 for( i = 0; i < reader.seq->total; i++ )
// the placeholder is never passed on as a real parent: 0 is given instead
3358 CV_CALL( node = read_node( fs, (CvFileNode*)reader.ptr, parent != &_root ? parent : 0 ));
3360 parent->left = node;
3362 parent->right = node;
// climb past ancestors that already have a right child
3367 while( parent && parent->right )
3368 parent = parent->parent;
3371 CV_NEXT_SEQ_ELEM( reader.seq->elem_size, reader );
// Reads a stand-alone tree: creates a new CvDTreeTrainData, loads its
// parameters from fnode, delegates to the three-argument read() overload,
// and then computes the variable importance.
3380 void CvDTree::read( CvFileStorage* fs, CvFileNode* fnode )
// NOTE(review): raw new with no visible delete - presumably ownership passes
// to the tree through the overload called below; confirm
3382 CvDTreeTrainData* _data = new CvDTreeTrainData();
3383 _data->read_params( fs, fnode );
3385 read( fs, fnode, _data );
// called for its side effect: fills var_importance if it is still null
3386 get_var_importance();
3390 // a special entry point for reading weak decision trees from the tree ensembles
3391 void CvDTree::read( CvFileStorage* fs, CvFileNode* node, CvDTreeTrainData* _data )
3393 CV_FUNCNAME( "CvDTree::read" );
3397 CvFileNode* tree_nodes;
3402 tree_nodes = cvGetFileNodeByName( fs, node, "nodes" );
3403 if( !tree_nodes || CV_NODE_TYPE(tree_nodes->tag) != CV_NODE_SEQ )
3404 CV_ERROR( CV_StsParseError, "nodes tag is missing" );
3406 pruned_tree_idx = cvReadIntByName( fs, node, "best_tree_idx", -1 );
3407 read_tree_nodes( fs, tree_nodes );