ID3ClassifierTree.cpp
/*
 * Copyright (c) The Shogun Machine Learning Toolbox
 * Written (w) 2014 Parijat Mazumdar
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation are those
 * of the authors and should not be interpreted as representing official policies,
 * either expressed or implied, of the Shogun Development Team.
 */

#include <shogun/multiclass/tree/ID3ClassifierTree.h>
#include <shogun/features/DenseFeatures.h>
#include <shogun/labels/MulticlassLabels.h>
#include <shogun/mathematics/Math.h>
#include <shogun/mathematics/Statistics.h>

using namespace shogun;

CID3ClassifierTree::CID3ClassifierTree()
: CTreeMachine<id3TreeNodeData>()
{
}

CID3ClassifierTree::~CID3ClassifierTree()
{
}

CMulticlassLabels* CID3ClassifierTree::apply_multiclass(CFeatures* data)
{
    REQUIRE(data, "Data required for classification in apply_multiclass\n")

    CDenseFeatures<float64_t>* feats = dynamic_cast<CDenseFeatures<float64_t>*>(data);
    int32_t num_vecs = feats->get_num_vectors();
    SGVector<float64_t> labels = SGVector<float64_t>(num_vecs);

    for (int32_t i=0; i<num_vecs; i++)
    {
        // walk the tree from the root: at each internal node follow the child
        // whose transit value matches the sample's value of the split attribute
        SGVector<float64_t> sample = feats->get_feature_vector(i);
        node_t* node = get_root();
        CDynamicObjectArray* children = node->get_children();

        while (children->get_num_elements())
        {
            int32_t flag = 0;
            for (int32_t j=0; j<children->get_num_elements(); j++)
            {
                node_t* child = dynamic_cast<node_t*>(children->get_element(j));
                if (child->data.transit_if_feature_value
                        == sample[node->data.attribute_id])
                {
                    flag = 1;

                    SG_UNREF(node);
                    node = child;

                    SG_UNREF(children);
                    children = node->get_children();

                    break;
                }

                SG_UNREF(child);
            }

            // no child matches the sample's attribute value - stop descending
            if (!flag)
                break;
        }

        labels[i] = node->data.class_label;

        SG_UNREF(node);
        SG_UNREF(children);
    }

    CMulticlassLabels* ret = new CMulticlassLabels(labels);
    return ret;
}

bool CID3ClassifierTree::train_machine(CFeatures* data)
{
    REQUIRE(data, "Data required for training\n")
    REQUIRE(data->get_feature_class()==C_DENSE, "Dense data required for training\n")

    int32_t num_features = (dynamic_cast<CDenseFeatures<float64_t>*>(data))->get_num_features();
    SGVector<int32_t> feature_ids = SGVector<int32_t>(num_features);
    feature_ids.range_fill();

    set_root(id3train(data, dynamic_cast<CMulticlassLabels*>(m_labels), feature_ids, 0));

    return true;
}

CTreeMachineNode<id3TreeNodeData>* CID3ClassifierTree::id3train(CFeatures* data,
    CMulticlassLabels* class_labels, SGVector<int32_t> feature_id_vector, int32_t level)
{
    node_t* node = new node_t();
    CDenseFeatures<float64_t>* feats = dynamic_cast<CDenseFeatures<float64_t>*>(data);
    int32_t num_vecs = feats->get_num_vectors();

    // if all samples belong to the same class, create a pure leaf
    if (class_labels->get_unique_labels().size() == 1)
    {
        node->data.class_label = class_labels->get_unique_labels()[0];
        return node;
    }

    // if no feature is left, create a leaf labeled with the most frequent class
    if (feature_id_vector.vlen == 0)
    {
        SGVector<float64_t> labels = class_labels->get_labels();
        labels.qsort();

        // scan the sorted labels and track the longest run
        int32_t most_label = labels[0];
        int32_t most_num = 1;
        int32_t count = 1;

        for (int32_t i=1; i<labels.vlen; i++)
        {
            // bounds check must come first to avoid reading past the end
            while ((i<labels.vlen) && (labels[i] == labels[i-1]))
            {
                count++;
                i++;
            }

            if (count>most_num)
            {
                most_num = count;
                most_label = labels[i-1];
            }

            count = 1;
        }

        node->data.class_label = most_label;
        return node;
    }

    // else pick the feature with the highest information gain
    float64_t max = 0;
    int32_t best_feature_index = -1;
    for (int32_t i=0; i<feats->get_num_features(); i++)
    {
        float64_t gain = informational_gain_attribute(i,feats,class_labels);

        if (gain >= max)
        {
            max = gain;
            best_feature_index = i;
        }
    }

    // get feature values for the chosen best feature
    SGVector<float64_t> best_feature_values = SGVector<float64_t>(num_vecs);
    for (int32_t i=0; i<num_vecs; i++)
        best_feature_values[i] = (feats->get_feature_vector(i))[best_feature_index];

    CMulticlassLabels* best_feature_labels = new CMulticlassLabels(best_feature_values);
    SGVector<float64_t> best_labels_unique = best_feature_labels->get_unique_labels();

    // grow one child per distinct value of the chosen feature
    for (int32_t i=0; i<best_labels_unique.vlen; i++)
    {
        // count the vectors carrying the active attribute value
        int32_t num_cols = 0;
        float64_t active_feature_value = best_labels_unique[i];

        for (int32_t j=0; j<num_vecs; j++)
        {
            if (active_feature_value == best_feature_values[j])
                num_cols++;
        }

        SGMatrix<float64_t> mat = SGMatrix<float64_t>(feats->get_num_features()-1, num_cols);
        SGVector<float64_t> new_labels_vector = SGVector<float64_t>(num_cols);

        int32_t cnt = 0;
        // copy the samples with the active feature value, dropping the used feature
        for (int32_t j=0; j<num_vecs; j++)
        {
            SGVector<float64_t> sample = feats->get_feature_vector(j);
            if (active_feature_value == sample[best_feature_index])
            {
                int32_t idx = -1;
                for (int32_t k=0; k<sample.size(); k++)
                {
                    if (k != best_feature_index)
                        mat(++idx, cnt) = sample[k];
                }

                new_labels_vector[cnt] = class_labels->get_labels()[j];
                cnt++;
            }
        }

        // remove the best attribute from the remaining attribute-id vector
        SGVector<int32_t> new_feature_id_vector = SGVector<int32_t>(feature_id_vector.vlen-1);
        cnt = -1;
        for (int32_t j=0; j<feature_id_vector.vlen; j++)
        {
            if (j!=best_feature_index)
                new_feature_id_vector[++cnt] = feature_id_vector[j];
        }

        CMulticlassLabels* new_class_labels = new CMulticlassLabels(new_labels_vector);
        CDenseFeatures<float64_t>* new_data = new CDenseFeatures<float64_t>(mat);

        node_t* child = id3train(new_data, new_class_labels, new_feature_id_vector, level+1);
        child->data.transit_if_feature_value = active_feature_value;
        node->data.attribute_id = feature_id_vector[best_feature_index];
        node->add_child(child);

        SG_UNREF(new_class_labels);
        SG_UNREF(new_data);
    }

    SG_UNREF(best_feature_labels);

    return node;
}

float64_t CID3ClassifierTree::informational_gain_attribute(int32_t attr_no, CFeatures* data,
    CMulticlassLabels* class_labels)
{
    REQUIRE(data, "Data required for information gain calculation\n")
    REQUIRE(data->get_feature_class()==C_DENSE,
        "Dense data required for information gain calculation\n")

    float64_t gain = 0;
    CDenseFeatures<float64_t>* feats = dynamic_cast<CDenseFeatures<float64_t>*>(data);
    int32_t num_vecs = feats->get_num_vectors();

    // collect the values this attribute takes over all vectors
    SGVector<float64_t> attribute_values = SGVector<float64_t>(num_vecs);

    for (int32_t i=0; i<num_vecs; i++)
        attribute_values[i] = (feats->get_feature_vector(i))[attr_no];

    CMulticlassLabels* attribute_labels = new CMulticlassLabels(attribute_values);
    SGVector<float64_t> attr_val_unique = attribute_labels->get_unique_labels();

    for (int32_t i=0; i<attr_val_unique.vlen; i++)
    {
        // calculate the class entropy of the subset carrying this attribute value
        int32_t attr_count = 0;

        for (int32_t j=0; j<num_vecs; j++)
        {
            if (attribute_values[j] == attr_val_unique[i])
                attr_count++;
        }

        SGVector<float64_t> sub_class = SGVector<float64_t>(attr_count);
        int32_t count = 0;

        for (int32_t j=0; j<num_vecs; j++)
        {
            if (attribute_values[j] == attr_val_unique[i])
                sub_class[count++] = class_labels->get_label(j);
        }

        CMulticlassLabels* sub_labels = new CMulticlassLabels(sub_class);
        float64_t sub_entropy = entropy(sub_labels);
        // weight the subset entropy by the fraction of vectors in the subset
        gain += sub_entropy*float64_t(attr_count)/float64_t(num_vecs);

        SG_UNREF(sub_labels);
    }

    float64_t data_entropy = entropy(class_labels);
    gain = data_entropy-gain;

    SG_UNREF(attribute_labels);

    return gain;
}
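
// The quantity computed above is the standard ID3 information gain of
// attribute A over sample set S:
//
//     IG(S, A) = H(S) - sum_{v in values(A)} (|S_v| / |S|) * H(S_v)
//
// where S_v is the subset of samples taking value v on attribute A and H is
// the Shannon entropy computed by entropy() below. An attribute that splits
// the samples into purer subsets receives a larger gain.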

float64_t CID3ClassifierTree::entropy(CMulticlassLabels* labels)
{
    SGVector<float64_t> log_ratios = SGVector<float64_t>
            (labels->get_unique_labels().size());

    for (int32_t i=0; i<labels->get_unique_labels().size(); i++)
    {
        // count how often the i-th unique label occurs
        int32_t count = 0;

        for (int32_t j=0; j<labels->get_num_labels(); j++)
        {
            if (labels->get_unique_labels()[i] == labels->get_label(j))
                count++;
        }

        // store the log of the label's relative frequency
        log_ratios[i] = float64_t(count)/float64_t(labels->get_num_labels());

        if (log_ratios[i] != 0)
            log_ratios[i] = CMath::log(log_ratios[i]);
    }

    return CStatistics::entropy(log_ratios.vector, log_ratios.vlen);
}
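
For reference, a minimal usage sketch of this class against the Shogun 3.2 C++
API; this is not part of the original file, and the toy feature matrix, labels,
and main() scaffolding below are illustrative assumptions. Note that ID3 expects
nominal (discrete) feature values.

#include <shogun/base/init.h>
#include <shogun/features/DenseFeatures.h>
#include <shogun/labels/MulticlassLabels.h>
#include <shogun/multiclass/tree/ID3ClassifierTree.h>

using namespace shogun;

int main()
{
    init_shogun_with_defaults();

    // toy data: 2 nominal features x 4 vectors, stored column-wise
    SGMatrix<float64_t> mat(2, 4);
    mat(0,0)=0; mat(1,0)=0;
    mat(0,1)=0; mat(1,1)=1;
    mat(0,2)=1; mat(1,2)=0;
    mat(0,3)=1; mat(1,3)=1;

    // class of each vector (here simply the value of the first feature)
    SGVector<float64_t> lab(4);
    lab[0]=0; lab[1]=0; lab[2]=1; lab[3]=1;

    CDenseFeatures<float64_t>* feats = new CDenseFeatures<float64_t>(mat);
    SG_REF(feats);
    CMulticlassLabels* labels = new CMulticlassLabels(lab);

    CID3ClassifierTree* id3 = new CID3ClassifierTree();
    SG_REF(id3);
    id3->set_labels(labels);
    id3->train(feats);

    // predict back on the training data
    CMulticlassLabels* pred = id3->apply_multiclass(feats);
    for (int32_t i=0; i<pred->get_num_labels(); i++)
        SG_SPRINT("vector %d -> class %.0f\n", i, pred->get_label(i));

    SG_UNREF(pred);
    SG_UNREF(id3);
    SG_UNREF(feats);
    exit_shogun();
    return 0;
}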
