SHOGUN
3.2.1
首页
相关页面
模块
类
文件
文件列表
文件成员
全部
类
命名空间
文件
函数
变量
类型定义
枚举
枚举值
友元
宏定义
组
页
src
shogun
statistics
StreamingMMD.cpp
浏览该文件的文档.
1
/*
2
* Copyright (c) The Shogun Machine Learning Toolbox
3
* Written (w) 2012-2013 Heiko Strathmann
4
* Written (w) 2014 Soumyajit De
5
* All rights reserved.
6
*
7
* Redistribution and use in source and binary forms, with or without
8
* modification, are permitted provided that the following conditions are met:
9
*
10
* 1. Redistributions of source code must retain the above copyright notice, this
11
* list of conditions and the following disclaimer.
12
* 2. Redistributions in binary form must reproduce the above copyright notice,
13
* this list of conditions and the following disclaimer in the documentation
14
* and/or other materials provided with the distribution.
15
*
16
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
*
27
* The views and conclusions contained in the software and documentation are those
28
* of the authors and should not be interpreted as representing official policies,
29
* either expressed or implied, of the Shogun Development Team.
30
*/
31
32
#include <
shogun/statistics/StreamingMMD.h
>
33
#include <
shogun/features/Features.h
>
34
#include <
shogun/features/streaming/StreamingFeatures.h
>
35
#include <
shogun/mathematics/Statistics.h
>
36
#include <
shogun/lib/List.h
>
37
38
using namespace
shogun;
39
40
/** Default constructor.
 *
 * Default-constructs the CKernelTwoSampleTest base and then lets init()
 * register this class's parameters and assign their default values.
 */
CStreamingMMD::CStreamingMMD()
	: CKernelTwoSampleTest()
{
	init();
}
44
45
/** Constructor taking a kernel and two streaming feature sources.
 *
 * @param kernel kernel handed to the CKernelTwoSampleTest constructor
 * @param p streaming features stored in m_streaming_p; a reference is taken
 * @param q streaming features stored in m_streaming_q; a reference is taken
 * @param m forwarded unchanged to the CKernelTwoSampleTest constructor
 * (NOTE(review): presumably the number of samples from p - confirm against
 * the base class)
 * @param blocksize value stored in m_blocksize
 */
CStreamingMMD::CStreamingMMD(CKernel* kernel, CStreamingFeatures* p,
		CStreamingFeatures* q, index_t m, index_t blocksize)
	: CKernelTwoSampleTest(kernel, NULL, m)
{
	/* register parameters and install defaults before overriding them */
	init();

	m_blocksize=blocksize;
	m_streaming_p=p;
	m_streaming_q=q;

	/* this object now shares ownership of both feature streams */
	SG_REF(m_streaming_p);
	SG_REF(m_streaming_q);
}
59
60
/** Destructor.
 *
 * Drops the references this object holds on both streaming feature sources.
 */
CStreamingMMD::~CStreamingMMD()
{
	SG_UNREF(m_streaming_p);
	SG_UNREF(m_streaming_q);

	/* nothing to do for m_kernel here: the base destructor SG_UNREFs it */
}
67
68
void
CStreamingMMD::init()
69
{
70
SG_ADD
((
CSGObject
**)&
m_streaming_p
,
"streaming_p"
,
"Streaming features p"
,
71
MS_NOT_AVAILABLE
);
72
SG_ADD
((
CSGObject
**)&
m_streaming_q
,
"streaming_q"
,
"Streaming features p"
,
73
MS_NOT_AVAILABLE
);
74
SG_ADD
(&
m_blocksize
,
"blocksize"
,
"Number of elements processed at once"
,
75
MS_NOT_AVAILABLE
);
76
SG_ADD
(&
m_simulate_h0
,
"simulate_h0"
,
"Whether p and q are mixed"
,
77
MS_NOT_AVAILABLE
);
78
79
m_streaming_p
=NULL;
80
m_streaming_q
=NULL;
81
m_blocksize
=10000;
82
m_simulate_h0
=
false
;
83
}
84
85
/** Computes the test statistic for a single kernel.
 *
 * Delegates to compute_statistic_and_variance() with the multiple-kernels
 * flag set to false; the variance computed alongside is discarded.
 *
 * @return first entry of the statistic vector filled by the wrapper
 */
float64_t CStreamingMMD::compute_statistic()
{
	SGVector<float64_t> stat_vec;
	SGVector<float64_t> var_vec;

	/* single kernel case, hence the false flag */
	compute_statistic_and_variance(stat_vec, var_vec, false);

	return stat_vec[0];
}
94
95
/** Computes the test statistic, optionally for multiple kernels.
 *
 * @param multiple_kernels forwarded to compute_statistic_and_variance();
 * may only be true when m_kernel reports the kernel type K_COMBINED
 * @return statistic vector filled by compute_statistic_and_variance()
 */
SGVector<float64_t> CStreamingMMD::compute_statistic(bool multiple_kernels)
{
	/* the multiple_kernels flag is only legal together with a combined
	 * kernel; the kernel type is queried only when the flag is set */
	REQUIRE(!(multiple_kernels && m_kernel->get_kernel_type()!=K_COMBINED),
			"multiple kernels specified, but underlying kernel is not of type "
			"K_COMBINED\n");

	SGVector<float64_t> stat_vec;
	SGVector<float64_t> var_vec;
	compute_statistic_and_variance(stat_vec, var_vec, multiple_kernels);

	return stat_vec;
}
108
109
/** Computes the variance estimate for a single kernel.
 *
 * Delegates to compute_statistic_and_variance() with the multiple-kernels
 * flag set to false; the statistic computed alongside is discarded.
 *
 * @return first entry of the variance vector filled by the wrapper
 */
float64_t CStreamingMMD::compute_variance_estimate()
{
	SGVector<float64_t> stat_vec;
	SGVector<float64_t> var_vec;

	/* single kernel case, hence the false flag */
	compute_statistic_and_variance(stat_vec, var_vec, false);

	return var_vec[0];
}
118
119
/** Computes a p-value for the given statistic.
 *
 * For MMD1_GAUSSIAN the p-value is one minus the normal CDF evaluated at the
 * statistic, using the square root of compute_variance_estimate() as the
 * standard deviation. Every other null approximation method is delegated to
 * CKernelTwoSampleTest::compute_p_value().
 *
 * @param statistic value of the test statistic
 * @return p-value for the statistic
 */
float64_t CStreamingMMD::compute_p_value(float64_t statistic)
{
	if (m_null_approximation_method==MMD1_GAUSSIAN)
	{
		/* Gaussian approximation built from the variance estimate */
		float64_t sigma=CMath::sqrt(compute_variance_estimate());
		return 1.0-CStatistics::normal_cdf(statistic, sigma);
	}

	/* sampling null is handled by the base class */
	return CKernelTwoSampleTest::compute_p_value(statistic);
}
141
142
/** Computes the rejection threshold of the statistic for a test level.
 *
 * For MMD1_GAUSSIAN the threshold is the (1-alpha) quantile of a zero-mean
 * normal distribution whose standard deviation is the square root of
 * compute_variance_estimate(). This is the counterpart of compute_p_value():
 * with p = 1 - normal_cdf(statistic, std_dev), the condition p < alpha holds
 * exactly when the statistic exceeds that quantile. Every other null
 * approximation method is delegated to
 * CKernelTwoSampleTest::compute_threshold().
 *
 * @param alpha test level
 * @return threshold the statistic has to exceed for rejection at level alpha
 */
float64_t CStreamingMMD::compute_threshold(float64_t alpha)
{
	float64_t result=0;

	switch (m_null_approximation_method)
	{
	case MMD1_GAUSSIAN:
		{
			/* compute variance and use to estimate Gaussian distribution */
			float64_t std_dev=CMath::sqrt(compute_variance_estimate());

			/* the quantile itself is the threshold; the previous
			 * "1.0 - quantile" mirrored the p-value formula by mistake */
			result=CStatistics::inverse_normal_cdf(1-alpha, 0, std_dev);
		}
		break;

	default:
		/* sampling null is handled here */
		result=CKernelTwoSampleTest::compute_threshold(alpha);
		break;
	}

	return result;
}
164
165
/** Performs the test and returns a p-value.
 *
 * For MMD1_GAUSSIAN, statistic and variance are obtained with a single call
 * to compute_statistic_and_variance() (single kernel) and plugged into the
 * normal CDF. Every other null approximation method is delegated to
 * CHypothesisTest::perform_test().
 *
 * @return p-value of the test
 */
float64_t CStreamingMMD::perform_test()
{
	if (m_null_approximation_method!=MMD1_GAUSSIAN)
	{
		/* sampling null can be done separately in superclass */
		return CHypothesisTest::perform_test();
	}

	/* one pass yields both quantities needed for the Gaussian estimate */
	SGVector<float64_t> stat_vec;
	SGVector<float64_t> var_vec;
	compute_statistic_and_variance(stat_vec, var_vec, false);

	float64_t sigma=CMath::sqrt(var_vec[0]);
	return 1.0-CStatistics::normal_cdf(stat_vec[0], sigma);
}
193
194
/** Draws samples of the statistic with simulate_h0 switched on.
 *
 * Rather than permuting a fixed sample, compute_statistic() is called
 * m_num_null_samples times while simulate_h0 is enabled. The previous
 * simulate_h0 setting and both streaming feature pointers are put back
 * afterwards.
 *
 * @return vector of m_num_null_samples statistic values
 */
SGVector<float64_t> CStreamingMMD::sample_null()
{
	SGVector<float64_t> null_samples(m_num_null_samples);

	/* remember both streams and hold a reference on each while sampling */
	CStreamingFeatures* saved_p=m_streaming_p;
	CStreamingFeatures* saved_q=m_streaming_q;
	SG_REF(saved_p);
	SG_REF(saved_q);

	bool previous_h0=m_simulate_h0;
	set_simulate_h0(true);

	/* every call computes the statistic on freshly mixed data */
	for (index_t idx=0; idx<m_num_null_samples; ++idx)
		null_samples[idx]=compute_statistic();

	set_simulate_h0(previous_h0);

	/* restore the streams and release the temporary references */
	m_streaming_p=saved_p;
	m_streaming_q=saved_q;
	SG_UNREF(saved_p);
	SG_UNREF(saved_q);

	return null_samples;
}
219
220
/** Streams blocks of data from p and q and returns them in a list.
 *
 * First num_blocks blocks are requested from m_streaming_p, then num_blocks
 * blocks from m_streaming_q, each via get_streamed_features(num_this_run),
 * and appended to the list in that order. If m_simulate_h0 is set, all
 * blocks are merged into one feature object, a random permutation is added
 * as a subset, and the list is refilled with 2*num_blocks consecutive
 * slices of num_this_run elements taken from the permuted object.
 *
 * @param num_blocks number of blocks to request from each of p and q
 * @param num_this_run number of elements requested per block
 * @return list of feature blocks, SG_REF'ed before being returned
 */
CList* CStreamingMMD::stream_data_blocks(index_t num_blocks,
		index_t num_this_run)
{
	SG_DEBUG("entering!\n");

	/* the list of blocks of data to be returned, turning delete_data flag
	 * on which SG_REFs the elements when appended or returned. */
	CList* data=new CList(true);

	SG_DEBUG("streaming %d blocks from p of blocksize %d!\n", num_blocks,
			num_this_run);

	/* stream data from p num_blocks of time*/
	for (index_t i=0; i<num_blocks; ++i)
	{
		CFeatures* block=m_streaming_p->get_streamed_features(num_this_run);
		data->append_element(block);
	}

	SG_DEBUG("streaming %d blocks from q of blocksize %d!\n", num_blocks,
			num_this_run);

	/* stream data from q num_blocks of time*/
	for (index_t i=0; i<num_blocks; ++i)
	{
		CFeatures* block=m_streaming_q->get_streamed_features(num_this_run);
		data->append_element(block);
	}

	/* check whether h0 should be simulated and permute if so */
	if (m_simulate_h0)
	{
		/* create merged copy of all feature instances to permute */
		SG_DEBUG("merging and premuting features!\n");

		/* use the first element to merge rest of the data into. It is kept
		 * in its own pointer: get_first_element() hands out a reference
		 * (delete_data is on, see above) which has to be released once the
		 * merged copy exists. Previously the pointer was overwritten by the
		 * merged copy, so that reference was never dropped.
		 * NOTE(review): relies on CList::delete_element() removing the
		 * element without destroying an object that is still referenced -
		 * confirm against CList */
		CFeatures* first=(CFeatures*)data->get_first_element();
		data->delete_element();
		CFeatures* merged=first->create_merged_copy(data);

		/* get rid of unnecessary feature objects */
		SG_UNREF(first);
		data->delete_all_elements();

		/* permute */
		SGVector<index_t> inds(merged->get_num_vectors());
		inds.range_fill();
		inds.permute();
		merged->add_subset(inds);

		/* copy back: "copy" holds the indices of the current slice and is
		 * shifted by num_this_run after every block except the last */
		SGVector<index_t> copy(num_this_run);
		copy.range_fill();
		for (index_t i=0; i<2*num_blocks; ++i)
		{
			CFeatures* current=merged->copy_subset(copy);
			data->append_element(current);
			/* SG_UNREF'ing since copy_subset does a SG_REF, this is
			 * safe since the object is already SG_REF'ed inside the list */
			SG_UNREF(current);

			if (i<2*num_blocks-1)
				copy.add(num_this_run);
		}

		/* clean up */
		SG_UNREF(merged);
	}

	SG_REF(data);

	SG_DEBUG("leaving!\n");
	return data;
}
293
294
/** Not supported by this class; always raises an error via SG_ERROR.
 *
 * @param p_and_q ignored
 */
void CStreamingMMD::set_p_and_q(CFeatures* p_and_q)
{
	/* data comes from the streaming feature members instead */
	SG_ERROR(
			"Method not implemented since linear time mmd is based on "
			"streaming features\n");
}
299
300
/** Not supported by this class; always raises an error via SG_ERROR.
 *
 * @return NULL, in case SG_ERROR returns control to this method
 */
CFeatures* CStreamingMMD::get_p_and_q()
{
	/* data comes from the streaming feature members instead */
	SG_ERROR(
			"Method not implemented since linear time mmd is based on "
			"streaming features\n");

	return NULL;
}
306
307
/** Getter for the streaming features p.
 *
 * @return m_streaming_p, SG_REF'ed before it is returned
 */
CStreamingFeatures* CStreamingMMD::get_streaming_p()
{
	CStreamingFeatures* result=m_streaming_p;

	/* the extra reference is handed over to the caller */
	SG_REF(result);
	return result;
}
312
313
/** Getter for the streaming features q.
 *
 * @return m_streaming_q, SG_REF'ed before it is returned
 */
CStreamingFeatures* CStreamingMMD::get_streaming_q()
{
	CStreamingFeatures* result=m_streaming_q;

	/* the extra reference is handed over to the caller */
	SG_REF(result);
	return result;
}
318
SHOGUN
机器学习工具包 - 项目文档