SHOGUN 3.2.1
QuadraticTimeMMD.cpp
/*
 * Copyright (c) The Shogun Machine Learning Toolbox
 * Written (w) 2012-2013 Heiko Strathmann
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation are those
 * of the authors and should not be interpreted as representing official policies,
 * either expressed or implied, of the Shogun Development Team.
 */
#include <shogun/statistics/QuadraticTimeMMD.h>
#include <shogun/features/Features.h>
#include <shogun/mathematics/Statistics.h>
#include <shogun/kernel/Kernel.h>
#include <shogun/kernel/CombinedKernel.h>
#include <shogun/kernel/CustomKernel.h>

using namespace shogun;

CQuadraticTimeMMD::CQuadraticTimeMMD() : CKernelTwoSampleTest()
{
    init();
}

CQuadraticTimeMMD::CQuadraticTimeMMD(CKernel* kernel, CFeatures* p_and_q,
        index_t m) :
    CKernelTwoSampleTest(kernel, p_and_q, m)
{
    init();

    if (p_and_q && m!=p_and_q->get_num_vectors()/2)
    {
        SG_ERROR("Only features with equal number of vectors "
                "are currently possible\n");
    }
}

CQuadraticTimeMMD::CQuadraticTimeMMD(CKernel* kernel, CFeatures* p,
        CFeatures* q) : CKernelTwoSampleTest(kernel, p, q)
{
    init();

    if (p && q && p->get_num_vectors()!=q->get_num_vectors())
    {
        SG_ERROR("Only features with equal number of vectors "
                "are currently possible\n");
    }
}

CQuadraticTimeMMD::CQuadraticTimeMMD(CCustomKernel* custom_kernel, index_t m) :
    CKernelTwoSampleTest(custom_kernel, NULL, m)
{
    init();
}

CQuadraticTimeMMD::~CQuadraticTimeMMD()
{
}
void CQuadraticTimeMMD::init()
{
    SG_ADD(&m_num_samples_spectrum, "num_samples_spectrum", "Number of samples"
            " for spectrum method null-distribution approximation",
            MS_NOT_AVAILABLE);
    SG_ADD(&m_num_eigenvalues_spectrum, "num_eigenvalues_spectrum", "Number of "
            " Eigenvalues for spectrum method null-distribution approximation",
            MS_NOT_AVAILABLE);
    SG_ADD((machine_int_t*)&m_statistic_type, "statistic_type",
            "Biased or unbiased MMD", MS_NOT_AVAILABLE);

    m_num_samples_spectrum=0;
    m_num_eigenvalues_spectrum=0;
    m_statistic_type=UNBIASED;
}
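/* Unbiased estimate below is returned scaled by m: with samples x_1..x_m from
 * p and y_1..y_m from q stored concatenated in m_p_and_q, the three terms are
 *
 *   m*MMD_u^2 = 1/(m-1) * sum_{i!=j} k(x_i,x_j)
 *             + 1/(m-1) * sum_{i!=j} k(y_i,y_j)
 *             -     2/m * sum_{i,j}  k(x_i,y_j)
 *
 * i.e. m times the unbiased MMD^2 estimator from the JMLR paper referenced
 * below. */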
float64_t CQuadraticTimeMMD::compute_unbiased_statistic()
{
    /* split computations into three terms from JMLR paper (see documentation) */
    index_t m=m_m;

    /* init kernel with features */
    m_kernel->init(m_p_and_q, m_p_and_q);

    /* first term */
    float64_t first=0;
    for (index_t i=0; i<m; ++i)
    {
        /* ensure i!=j while adding up */
        for (index_t j=0; j<m; ++j)
            first+=i==j ? 0 : m_kernel->kernel(i,j);
    }
    first/=(m-1);

    /* second term */
    float64_t second=0;
    for (index_t i=m_m; i<m_m+m; ++i)
    {
        /* ensure i!=j while adding up */
        for (index_t j=m_m; j<m_m+m; ++j)
            second+=i==j ? 0 : m_kernel->kernel(i,j);
    }
    second/=(m-1);

    /* third term */
    float64_t third=0;
    for (index_t i=0; i<m; ++i)
    {
        for (index_t j=m_m; j<m_m+m; ++j)
            third+=m_kernel->kernel(i,j);
    }
    third*=2.0/m;

    return first+second-third;
}
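/* Biased estimate, also scaled by m: same three-term split, but diagonal
 * entries are kept, giving
 *
 *   m*MMD_b^2 = 1/m * sum_{i,j} k(x_i,x_j)
 *             + 1/m * sum_{i,j} k(y_i,y_j)
 *             - 2/m * sum_{i,j} k(x_i,y_j)
 */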
float64_t CQuadraticTimeMMD::compute_biased_statistic()
{
    /* split computations into three terms from JMLR paper (see documentation) */
    index_t m=m_m;

    /* init kernel with features */
    m_kernel->init(m_p_and_q, m_p_and_q);

    /* first term */
    float64_t first=0;
    for (index_t i=0; i<m; ++i)
    {
        for (index_t j=0; j<m; ++j)
            first+=m_kernel->kernel(i,j);
    }
    first/=m;

    /* second term */
    float64_t second=0;
    for (index_t i=m_m; i<m_m+m; ++i)
    {
        for (index_t j=m_m; j<m_m+m; ++j)
            second+=m_kernel->kernel(i,j);
    }
    second/=m;

    /* third term */
    float64_t third=0;
    for (index_t i=0; i<m; ++i)
    {
        for (index_t j=m_m; j<m_m+m; ++j)
            third+=m_kernel->kernel(i,j);
    }
    third*=2.0/m;

    return first+second-third;
}
float64_t CQuadraticTimeMMD::compute_statistic()
{
    if (!m_kernel)
        SG_ERROR("No kernel specified!\n")

    float64_t result=0;
    switch (m_statistic_type)
    {
    case UNBIASED:
        result=compute_unbiased_statistic();
        break;
    case BIASED:
        result=compute_biased_statistic();
        break;
    default:
        SG_ERROR("Unknown statistic type!\n");
        break;
    }

    return result;
}
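/* Depending on m_null_approximation_method, compute_p_value uses spectrum
 * sampling (requires LAPACK), a moment-matched gamma fit, or the sampled null
 * distribution handled by the base class. */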
float64_t CQuadraticTimeMMD::compute_p_value(float64_t statistic)
{
    float64_t result=0;

    switch (m_null_approximation_method)
    {
    case MMD2_SPECTRUM:
    {
#ifdef HAVE_LAPACK
        /* get samples from null-distribution and compute p-value of statistic */
        SGVector<float64_t> null_samples=sample_null_spectrum(
                m_num_samples_spectrum, m_num_eigenvalues_spectrum);
        null_samples.qsort();
        index_t pos=null_samples.find_position_to_insert(statistic);
        result=1.0-((float64_t)pos)/null_samples.vlen;
#else // HAVE_LAPACK
        SG_ERROR("Only possible if shogun is compiled with LAPACK enabled\n");
#endif // HAVE_LAPACK
        break;
    }

    case MMD2_GAMMA:
    {
        /* fit gamma and return cdf at statistic */
        SGVector<float64_t> params=fit_null_gamma();
        result=CStatistics::gamma_cdf(statistic, params[0], params[1]);
        break;
    }

    default:
        result=CKernelTwoSampleTest::compute_p_value(statistic);
        break;
    }

    return result;
}
SGVector<float64_t> CQuadraticTimeMMD::compute_statistic(
        bool multiple_kernels)
{
    SGVector<float64_t> mmds;
    if (!multiple_kernels)
    {
        /* just one mmd result */
        mmds=SGVector<float64_t>(1);
        mmds[0]=compute_statistic();
    }
    else
    {
        REQUIRE(m_kernel->get_kernel_type()==K_COMBINED,
                "multiple kernels specified, but underlying kernel is not of type "
                "K_COMBINED\n");

        /* cast and allocate memory for results */
        CCombinedKernel* combined=(CCombinedKernel*)m_kernel;
        SG_REF(combined);
        mmds=SGVector<float64_t>(combined->get_num_subkernels());

        /* iterate through all kernels and compute statistic */
        /* TODO this might be done in parallel */
        for (index_t i=0; i<mmds.vlen; ++i)
        {
            CKernel* current=combined->get_kernel(i);
            /* temporarily replace underlying kernel and compute statistic */
            m_kernel=current;
            mmds[i]=compute_statistic();

            SG_UNREF(current);
        }

        /* restore combined kernel */
        m_kernel=combined;
        SG_UNREF(combined);
    }

    return mmds;
}
float64_t CQuadraticTimeMMD::compute_threshold(float64_t alpha)
{
    float64_t result=0;

    switch (m_null_approximation_method)
    {
    case MMD2_SPECTRUM:
    {
#ifdef HAVE_LAPACK
        /* get samples from null-distribution and compute threshold */
        SGVector<float64_t> null_samples=sample_null_spectrum(
                m_num_samples_spectrum, m_num_eigenvalues_spectrum);
        null_samples.qsort();
        result=null_samples[index_t(CMath::floor(null_samples.vlen*(1-alpha)))];
#else // HAVE_LAPACK
        SG_ERROR("Only possible if shogun is compiled with LAPACK enabled\n");
#endif // HAVE_LAPACK
        break;
    }

    case MMD2_GAMMA:
    {
        /* fit gamma and return inverse cdf at alpha */
        SGVector<float64_t> params=fit_null_gamma();
        result=CStatistics::inverse_gamma_cdf(alpha, params[0], params[1]);
        break;
    }

    default:
        /* sampling null is handled here */
        result=CKernelTwoSampleTest::compute_threshold(alpha);
        break;
    }

    return result;
}
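/* Spectrum approximation of the null distribution of m*MMD_b^2: samples have
 * the form
 *
 *   2 * sum_l lambda_l * z_l^2,   z_l ~ N(0,1) iid,
 *
 * where the lambda_l are the num_eigenvalues largest eigenvalues of the
 * centred joint kernel matrix, scaled by 1/(2*m). */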
#ifdef HAVE_LAPACK
SGVector<float64_t> CQuadraticTimeMMD::sample_null_spectrum(index_t num_samples,
        index_t num_eigenvalues)
{
    REQUIRE(m_kernel, "(%d, %d): No kernel set!\n", num_samples,
            num_eigenvalues);
    REQUIRE(m_kernel->get_kernel_type()==K_CUSTOM || m_p_and_q,
            "(%d, %d): No features set and no custom kernel in use!\n",
            num_samples, num_eigenvalues);

    index_t num_data;
    if (m_kernel->get_kernel_type()==K_CUSTOM)
        num_data=m_kernel->get_num_vec_rhs();
    else
        num_data=m_p_and_q->get_num_vectors();

    if (m_m!=num_data/2)
        SG_ERROR("Currently, only equal sample sizes are supported\n");

    if (num_samples<=2)
    {
        SG_ERROR("Number of samples has to be at least 2, "
                "better in the hundreds");
    }

    if (num_eigenvalues>2*m_m-1)
        SG_ERROR("Number of Eigenvalues too large\n");

    if (num_eigenvalues<1)
        SG_ERROR("Number of Eigenvalues too small\n");

    /* possibly warn user not to use wrong statistic type */
    if (m_statistic_type!=BIASED)
    {
        SG_WARNING("Note: provided statistic has "
                "to be BIASED. Please ensure that! To get rid of warning,"
                "call %s::set_statistic_type(BIASED)\n", get_name());
    }

    /* imaginary matrix K=[K KL; KL' L] (MATLAB notation)
     * K is matrix for XX, L is matrix for YY, KL is XY, LK is YX
     * works since X and Y are concatenated here */
    m_kernel->init(m_p_and_q, m_p_and_q);
    SGMatrix<float64_t> K=m_kernel->get_kernel_matrix();

    /* center matrix K=H*K*H */
    K.center();

    /* compute eigenvalues and select num_eigenvalues largest ones */
    SGVector<float64_t> eigenvalues=
            SGMatrix<float64_t>::compute_eigenvectors(K);
    SGVector<float64_t> largest_ev(num_eigenvalues);

    /* take largest EV, scale by 1/2/m on the fly and take abs value */
    for (index_t i=0; i<num_eigenvalues; ++i)
        largest_ev[i]=CMath::abs(
                1.0/2/m_m*eigenvalues[eigenvalues.vlen-1-i]);

    /* finally, sample from null distribution */
    SGVector<float64_t> null_samples(num_samples);
    for (index_t i=0; i<num_samples; ++i)
    {
        /* 2*sum(kEigs.*(randn(length(kEigs),1)).^2); */
        null_samples[i]=0;
        for (index_t j=0; j<largest_ev.vlen; ++j)
            null_samples[i]+=largest_ev[j]*CMath::pow(CMath::randn_double(), 2);

        null_samples[i]*=2;
    }

    return null_samples;
}
#endif // HAVE_LAPACK
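/* Gamma approximation of the null distribution of m*MMD_b^2 (hence the BIASED
 * warning below): mean and variance of the statistic under H0 are estimated
 * from the kernel matrix and matched to a Gamma(a, b) distribution with
 *
 *   a = mean^2 / var,   b = var * m / mean.
 */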
SGVector<float64_t> CQuadraticTimeMMD::fit_null_gamma()
{
    REQUIRE(m_kernel, "No kernel set!\n");
    REQUIRE(m_kernel->get_kernel_type()==K_CUSTOM || m_p_and_q,
            "No features set and no custom kernel in use!\n");

    index_t num_data;
    if (m_kernel->get_kernel_type()==K_CUSTOM)
        num_data=m_kernel->get_num_vec_rhs();
    else
        num_data=m_p_and_q->get_num_vectors();

    if (m_m!=num_data/2)
        SG_ERROR("Currently, only equal sample sizes are supported\n");

    /* possibly warn user not to use wrong statistic type */
    if (m_statistic_type!=BIASED)
    {
        SG_WARNING("Note: provided statistic has "
                "to be BIASED. Please ensure that! To get rid of warning,"
                "call %s::set_statistic_type(BIASED)\n", get_name());
    }

    /* imaginary matrix K=[K KL; KL' L] (MATLAB notation)
     * K is matrix for XX, L is matrix for YY, KL is XY, LK is YX
     * works since X and Y are concatenated here */
    m_kernel->init(m_p_and_q, m_p_and_q);

    /* compute mean under H0 of MMD, which is
     * meanMMD = 2/m * ( 1 - 1/m*sum(diag(KL)) );
     * in MATLAB.
     * Remove diagonals on the fly */
    float64_t mean_mmd=0;
    for (index_t i=0; i<m_m; ++i)
    {
        /* virtual KL matrix is in upper right corner of SHOGUN K matrix
         * so this sums the diagonal of the matrix between X and Y */
        mean_mmd+=m_kernel->kernel(i, m_m+i);
    }
    mean_mmd=2.0/m_m*(1.0-1.0/m_m*mean_mmd);

    /* compute variance under H0 of MMD, which is
     * varMMD = 2/m/(m-1) * 1/m/(m-1) * sum(sum( (K + L - KL - KL').^2 ));
     * in MATLAB, so sum up all elements */
    float64_t var_mmd=0;
    for (index_t i=0; i<m_m; ++i)
    {
        for (index_t j=0; j<m_m; ++j)
        {
            /* don't add diagonal of all pairs of imaginary kernel matrices */
            if (i==j || m_m+i==j || m_m+j==i)
                continue;

            float64_t to_add=m_kernel->kernel(i, j);
            to_add+=m_kernel->kernel(m_m+i, m_m+j);
            to_add-=m_kernel->kernel(i, m_m+j);
            to_add-=m_kernel->kernel(m_m+i, j);
            var_mmd+=CMath::pow(to_add, 2);
        }
    }
    var_mmd*=2.0/m_m/(m_m-1)*1.0/m_m/(m_m-1);

    /* parameters for gamma distribution */
    float64_t a=CMath::pow(mean_mmd, 2)/var_mmd;
    float64_t b=var_mmd*m_m/mean_mmd;

    SGVector<float64_t> result(2);
    result[0]=a;
    result[1]=b;

    return result;
}
void CQuadraticTimeMMD::set_num_samples_spectrum(index_t num_samples_spectrum)
{
    m_num_samples_spectrum=num_samples_spectrum;
}

void CQuadraticTimeMMD::set_num_eigenvalues_spectrum(
        index_t num_eigenvalues_spectrum)
{
    m_num_eigenvalues_spectrum=num_eigenvalues_spectrum;
}

void CQuadraticTimeMMD::set_statistic_type(EQuadraticMMDType statistic_type)
{
    m_statistic_type=statistic_type;
}
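The listing above is the complete implementation. As a rough usage sketch (not part of this file), the snippet below shows how the class might be driven end to end. It assumes the surrounding Shogun 3.2 API, in particular CDenseFeatures, CGaussianKernel, init_shogun_with_defaults() and the inherited set_null_approximation_method(), none of which are defined in this file.

#include <shogun/base/init.h>
#include <shogun/features/DenseFeatures.h>
#include <shogun/kernel/GaussianKernel.h>
#include <shogun/mathematics/Math.h>
#include <shogun/statistics/QuadraticTimeMMD.h>

using namespace shogun;

int main()
{
    init_shogun_with_defaults();

    /* toy data: m columns drawn around 0 (samples from p), followed by
     * m columns drawn around 0.5 (samples from q) */
    index_t m=100;
    index_t d=2;
    SGMatrix<float64_t> data(d, 2*m);
    for (index_t i=0; i<d*2*m; ++i)
        data.matrix[i]=CMath::randn_double()+(i<d*m ? 0.0 : 0.5);

    CDenseFeatures<float64_t>* p_and_q=new CDenseFeatures<float64_t>(data);
    CKernel* kernel=new CGaussianKernel(10, 2.0);

    CQuadraticTimeMMD* mmd=new CQuadraticTimeMMD(kernel, p_and_q, m);
    mmd->set_statistic_type(BIASED);

    /* assumed inherited setter; the gamma fit requires the BIASED statistic */
    mmd->set_null_approximation_method(MMD2_GAMMA);

    float64_t statistic=mmd->compute_statistic();
    float64_t p_value=mmd->compute_p_value(statistic);
    SG_SPRINT("m*MMD_b^2=%f, p-value=%f\n", statistic, p_value);

    SG_UNREF(mmd);
    exit_shogun();
    return 0;
}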
