SHOGUN
3.2.1
Main Page
Related Pages
Modules
Classes
Files
File List
File Members
All
Classes
Namespaces
Files
Functions
Variables
Typedefs
Enumerations
Enumerator
Friends
Macros
Groups
Pages
src
shogun
statistics
StreamingMMD.cpp
Go to the documentation of this file.
1
/*
2
* Copyright (c) The Shogun Machine Learning Toolbox
3
* Written (w) 2012-2013 Heiko Strathmann
4
* Written (w) 2014 Soumyajit De
5
* All rights reserved.
6
*
7
* Redistribution and use in source and binary forms, with or without
8
* modification, are permitted provided that the following conditions are met:
9
*
10
* 1. Redistributions of source code must retain the above copyright notice, this
11
* list of conditions and the following disclaimer.
12
* 2. Redistributions in binary form must reproduce the above copyright notice,
13
* this list of conditions and the following disclaimer in the documentation
14
* and/or other materials provided with the distribution.
15
*
16
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
*
27
* The views and conclusions contained in the software and documentation are those
28
* of the authors and should not be interpreted as representing official policies,
29
* either expressed or implied, of the Shogun Development Team.
30
*/
31
32
#include <
shogun/statistics/StreamingMMD.h
>
33
#include <
shogun/features/Features.h
>
34
#include <
shogun/features/streaming/StreamingFeatures.h
>
35
#include <
shogun/mathematics/Statistics.h
>
36
#include <
shogun/lib/List.h
>
37
38
using namespace
shogun;
39
40
/** Default constructor. Default-constructs the CKernelTwoSampleTest base and
 * leaves all further setup of this class' members to init(). */
CStreamingMMD::CStreamingMMD()
	: CKernelTwoSampleTest()
{
	init();
}
44
45
/** Constructs the test from a kernel and two feature streams.
 *
 * @param kernel forwarded to the CKernelTwoSampleTest constructor
 * @param p first feature stream; stored and SG_REF'ed
 * @param q second feature stream; stored and SG_REF'ed
 * @param m forwarded to the CKernelTwoSampleTest constructor
 * @param blocksize stored in m_blocksize
 */
CStreamingMMD::CStreamingMMD(CKernel* kernel, CStreamingFeatures* p,
		CStreamingFeatures* q, index_t m, index_t blocksize)
	: CKernelTwoSampleTest(kernel, NULL, m)
{
	/* init() resets every member to its default, so the constructor
	 * arguments can only be stored once it has run */
	init();

	m_blocksize=blocksize;

	/* keep a counted reference on both streams for the lifetime of this
	 * object, they are released again in the destructor */
	m_streaming_p=p;
	SG_REF(m_streaming_p);

	m_streaming_q=q;
	SG_REF(m_streaming_q);
}
59
60
/** Destructor. Drops the references this object holds on its two feature
 * streams. */
CStreamingMMD::~CStreamingMMD()
{
	/* nothing to do for m_kernel here, it is SG_UNREFed in the base
	 * destructor */
	SG_UNREF(m_streaming_p);
	SG_UNREF(m_streaming_q);
}
67
68
void
CStreamingMMD::init()
69
{
70
SG_ADD
((
CSGObject
**)&
m_streaming_p
,
"streaming_p"
,
"Streaming features p"
,
71
MS_NOT_AVAILABLE
);
72
SG_ADD
((
CSGObject
**)&
m_streaming_q
,
"streaming_q"
,
"Streaming features p"
,
73
MS_NOT_AVAILABLE
);
74
SG_ADD
(&
m_blocksize
,
"blocksize"
,
"Number of elements processed at once"
,
75
MS_NOT_AVAILABLE
);
76
SG_ADD
(&
m_simulate_h0
,
"simulate_h0"
,
"Whether p and q are mixed"
,
77
MS_NOT_AVAILABLE
);
78
79
m_streaming_p
=NULL;
80
m_streaming_q
=NULL;
81
m_blocksize
=10000;
82
m_simulate_h0
=
false
;
83
}
84
85
/** Computes the statistic for the single kernel case.
 *
 * @return first entry of the statistic vector filled by
 * compute_statistic_and_variance()
 */
float64_t CStreamingMMD::compute_statistic()
{
	/* delegate to the combined routine with the multiple kernel flag off;
	 * the variance comes along as a by-product and is discarded */
	SGVector<float64_t> mmd_values;
	SGVector<float64_t> unused_variance;
	compute_statistic_and_variance(mmd_values, unused_variance, false);

	return mmd_values[0];
}
94
95
/** Computes the statistic vector, optionally for multiple kernels.
 *
 * @param multiple_kernels passed on to compute_statistic_and_variance();
 * may only be true if the underlying kernel is of type K_COMBINED
 * @return statistic vector filled by compute_statistic_and_variance()
 */
SGVector<float64_t> CStreamingMMD::compute_statistic(bool multiple_kernels)
{
	/* the kernel type only matters (and the kernel is only touched) when
	 * multiple kernels were asked for */
	if (multiple_kernels)
	{
		REQUIRE(m_kernel->get_kernel_type()==K_COMBINED,
				"multiple kernels specified, but underlying kernel is not of type "
				"K_COMBINED\n");
	}

	SGVector<float64_t> mmd_values;
	SGVector<float64_t> unused_variance;
	compute_statistic_and_variance(mmd_values, unused_variance,
			multiple_kernels);

	return mmd_values;
}
108
109
/** Computes the variance estimate for the single kernel case.
 *
 * @return first entry of the variance vector filled by
 * compute_statistic_and_variance()
 */
float64_t CStreamingMMD::compute_variance_estimate()
{
	/* delegate to the combined routine with the multiple kernel flag off;
	 * this time the statistic is the by-product that gets discarded */
	SGVector<float64_t> unused_statistic;
	SGVector<float64_t> variance_values;
	compute_statistic_and_variance(unused_statistic, variance_values, false);

	return variance_values[0];
}
118
119
/** Computes a p-value for the given statistic using the currently selected
 * null approximation method.
 *
 * @param statistic value of the test statistic
 * @return for MMD1_GAUSSIAN one minus the normal CDF of the statistic, with
 * the square root of the variance estimate as standard deviation; for every
 * other method whatever CKernelTwoSampleTest::compute_p_value() returns
 */
float64_t CStreamingMMD::compute_p_value(float64_t statistic)
{
	if (m_null_approximation_method==MMD1_GAUSSIAN)
	{
		/* Gaussian approximation of the null distribution: its spread is
		 * taken from a variance estimate */
		float64_t sigma=CMath::sqrt(compute_variance_estimate());
		return 1.0-CStatistics::normal_cdf(statistic, sigma);
	}

	/* all remaining methods, sampling the null included, are dealt with by
	 * the base class */
	return CKernelTwoSampleTest::compute_p_value(statistic);
}
141
142
/** Computes the threshold of the test statistic for a given test level
 * using the currently selected null approximation method.
 *
 * @param alpha test level
 * @return for MMD1_GAUSSIAN the (1-alpha) quantile of a zero-mean normal
 * distribution whose standard deviation is the square root of the variance
 * estimate; for every other method whatever
 * CKernelTwoSampleTest::compute_threshold() returns
 */
float64_t CStreamingMMD::compute_threshold(float64_t alpha)
{
	float64_t result=0;

	switch (m_null_approximation_method)
	{
		case MMD1_GAUSSIAN:
			{
				/* compute variance and use to estimate Gaussian distribution */
				float64_t std_dev=CMath::sqrt(compute_variance_estimate());

				/* the threshold is the quantile itself. compute_p_value()
				 * returns 1-normal_cdf(statistic, std_dev), so a statistic
				 * equal to this quantile gets a p-value of exactly alpha.
				 * The previous code returned one minus the quantile, which
				 * is not a point on the scale of the statistic. */
				result=CStatistics::inverse_normal_cdf(1-alpha, 0, std_dev);
			}
			break;

		default:
			/* sampling null is handled here */
			result=CKernelTwoSampleTest::compute_threshold(alpha);
			break;
	}

	return result;
}
164
165
/** Performs the complete test with the currently selected null approximation
 * method.
 *
 * @return for MMD1_GAUSSIAN one minus the normal CDF of the statistic, with
 * the square root of the variance as standard deviation (the same expression
 * compute_p_value() evaluates); for every other method whatever
 * CHypothesisTest::perform_test() returns
 */
float64_t CStreamingMMD::perform_test()
{
	if (m_null_approximation_method!=MMD1_GAUSSIAN)
	{
		/* sampling null can be done separately in superclass */
		return CHypothesisTest::perform_test();
	}

	/* statistic and variance are obtained from one single call of the
	 * wrapper method, for a single kernel */
	SGVector<float64_t> mmd_values;
	SGVector<float64_t> variance_values;
	compute_statistic_and_variance(mmd_values, variance_values, false);

	/* estimate Gaussian distribution */
	float64_t sigma=CMath::sqrt(variance_values[0]);
	return 1.0-CStatistics::normal_cdf(mmd_values[0], sigma);
}
193
194
/** Draws m_num_null_samples samples of the statistic while the simulation of
 * the null hypothesis is switched on.
 *
 * @return vector with one value of compute_statistic() per sample
 */
SGVector<float64_t> CStreamingMMD::sample_null()
{
	SGVector<float64_t> null_samples(m_num_null_samples);

	/* no permutation of stored samples takes place, each iteration simply
	 * computes the statistic again. Both streams are remembered and
	 * kept alive with an extra reference for the duration of the sampling. */
	CStreamingFeatures* saved_p=m_streaming_p;
	CStreamingFeatures* saved_q=m_streaming_q;
	SG_REF(saved_p);
	SG_REF(saved_q);

	/* switch on the simulation of H0, remembering the previous setting */
	bool previous_simulate_h0=m_simulate_h0;
	set_simulate_h0(true);

	index_t sample_idx=0;
	while (sample_idx<m_num_null_samples)
	{
		/* compute statistic for this permutation of mixed samples */
		null_samples[sample_idx]=compute_statistic();
		++sample_idx;
	}

	/* put everything back the way it was found */
	set_simulate_h0(previous_simulate_h0);
	m_streaming_p=saved_p;
	m_streaming_q=saved_q;
	SG_UNREF(saved_p);
	SG_UNREF(saved_q);

	return null_samples;
}
219
220
/** Streams blocks of data from both feature streams into a list.
 *
 * First num_blocks blocks are requested from m_streaming_p, then num_blocks
 * blocks from m_streaming_q; each request asks for num_this_run elements.
 * If m_simulate_h0 is set, all blocks are merged, randomly permuted and cut
 * into 2*num_blocks blocks of num_this_run elements again.
 *
 * @param num_blocks number of blocks to stream from each of p and q
 * @param num_this_run number of elements requested per block
 * @return list of 2*num_blocks feature blocks; the list is SG_REF'ed before
 * it is returned
 */
CList* CStreamingMMD::stream_data_blocks(index_t num_blocks,
		index_t num_this_run)
{
	SG_DEBUG("entering!\n");

	/* the list of blocks of data to be returned, turning delete_data flag
	 * on which SG_REFs the elements when appended or returned. */
	CList* data=new CList(true);

	SG_DEBUG("streaming %d blocks from p of blocksize %d!\n", num_blocks,
			num_this_run);

	/* stream data from p num_blocks of time*/
	for (index_t i=0; i<num_blocks; ++i)
	{
		CFeatures* block=m_streaming_p->get_streamed_features(num_this_run);
		data->append_element(block);
	}

	SG_DEBUG("streaming %d blocks from q of blocksize %d!\n", num_blocks,
			num_this_run);

	/* stream data from q num_blocks of time*/
	for (index_t i=0; i<num_blocks; ++i)
	{
		CFeatures* block=m_streaming_q->get_streamed_features(num_this_run);
		data->append_element(block);
	}

	/* check whether h0 should be simulated and permute if so */
	if (m_simulate_h0)
	{
		/* create merged copy of all feature instances to permute */
		SG_DEBUG("merging and premuting features!\n");

		/* use the first element to merge rest of the data into. Since the
		 * list SG_REFs elements it returns, this pointer carries a reference
		 * that has to be released by hand below */
		CFeatures* first=(CFeatures*)data->get_first_element();

		/* take the first block out of the list so that it is not merged
		 * with itself (presumably this only drops the list's own reference;
		 * the one obtained above keeps the block alive) */
		data->delete_element();

		CFeatures* merged=first->create_merged_copy(data);

		/* get rid of unnecessary feature objects. The first block used to be
		 * leaked here because its pointer was overwritten by the merged copy
		 * without ever being SG_UNREF'ed */
		SG_UNREF(first);
		data->delete_all_elements();

		/* permute */
		SGVector<index_t> inds(merged->get_num_vectors());
		inds.range_fill();
		inds.permute();
		merged->add_subset(inds);

		/* copy back: the index window starts at 0..num_this_run-1 and is
		 * shifted by num_this_run after each block */
		SGVector<index_t> copy(num_this_run);
		copy.range_fill();
		for (index_t i=0; i<2*num_blocks; ++i)
		{
			CFeatures* current=merged->copy_subset(copy);
			data->append_element(current);
			/* SG_UNREF'ing since copy_subset does a SG_REF, this is
			 * safe since the object is already SG_REF'ed inside the list */
			SG_UNREF(current);

			/* no shift after the last block */
			if (i<2*num_blocks-1)
				copy.add(num_this_run);
		}

		/* clean up */
		SG_UNREF(merged);
	}

	SG_REF(data);

	SG_DEBUG("leaving!\n");
	return data;
}
293
294
/** Not supported by this class: always raises an error via SG_ERROR, since
 * the data comes from streaming features.
 *
 * @param p_and_q ignored
 */
void CStreamingMMD::set_p_and_q(CFeatures* p_and_q)
{
	SG_ERROR("Method not implemented since linear time mmd is based on "
			"streaming features\n");
}
299
300
/** Not supported by this class: always raises an error via SG_ERROR, since
 * the data comes from streaming features.
 *
 * @return NULL, should the error call return at all
 */
CFeatures* CStreamingMMD::get_p_and_q()
{
	SG_ERROR("Method not implemented since linear time mmd is based on "
			"streaming features\n");

	return NULL;
}
306
307
/** Getter for the first feature stream.
 *
 * @return m_streaming_p, SG_REF'ed before it is handed out
 */
CStreamingFeatures* CStreamingMMD::get_streaming_p()
{
	CStreamingFeatures* stream=m_streaming_p;
	SG_REF(stream);
	return stream;
}
312
313
/** Getter for the second feature stream.
 *
 * @return m_streaming_q, SG_REF'ed before it is handed out
 */
CStreamingFeatures* CStreamingMMD::get_streaming_q()
{
	CStreamingFeatures* stream=m_streaming_q;
	SG_REF(stream);
	return stream;
}
318
SHOGUN
Machine Learning Toolbox - Documentation