SHOGUN
3.2.1
首页
相关页面
模块
类
文件
文件列表
文件成员
全部
类
命名空间
文件
函数
变量
类型定义
枚举
枚举值
友元
宏定义
组
页
src
shogun
lib
NGramTokenizer.cpp
浏览该文件的文档.
1
/*
2
* This program is free software; you can redistribute it and/or modify
3
* it under the terms of the GNU General Public License as published by
4
* the Free Software Foundation; either version 3 of the License, or
5
* (at your option) any later version.
6
*
7
* Written (W) 2013 Evangelos Anagnostopoulos
8
* Copyright (C) 2013 Evangelos Anagnostopoulos
9
*/
10
11
#include <
shogun/lib/NGramTokenizer.h
>
12
#include <
shogun/base/Parameter.h
>
13
14
namespace
shogun
15
{
16
17
/* Builds a tokenizer for n-grams of size ns.
 *
 * ns is stored in n, the scan position last_idx starts at 0, and both
 * members are then registered through init().
 */
CNGramTokenizer::CNGramTokenizer(int32_t ns) : CTokenizer()
{
	// The two assignments are independent of each other.
	last_idx = 0;
	n = ns;

	init();
}
23
24
/* Copy constructor.
 *
 * Takes over the text, the n-gram size and the current scan position of
 * orig, then registers the members through init().
 */
CNGramTokenizer::CNGramTokenizer(const CNGramTokenizer& orig)
	: CTokenizer(orig)
{
	// Base-qualified call: this bypasses CNGramTokenizer::set_text, so
	// last_idx is NOT reset here and has to be set explicitly below.
	CTokenizer::set_text(orig.text);
	n = orig.n;

	// Without this assignment last_idx is left uninitialized and
	// has_next()/next_token_idx() on the copy read an indeterminate value.
	last_idx = orig.last_idx;

	init();
}
31
32
void
CNGramTokenizer::init()
33
{
34
SG_ADD
(&
n
,
"n"
,
"Size of n-grams"
,
35
MS_NOT_AVAILABLE
);
36
SG_ADD
(&
last_idx
,
"last_idx"
,
"Index of last token"
,
37
MS_NOT_AVAILABLE
);
38
}
39
40
/* Replaces the text to tokenize and rewinds the scan position.
 *
 * txt is forwarded unchanged to CTokenizer::set_text.
 */
void CNGramTokenizer::set_text(SGVector<char> txt)
{
	// Rewind first, then hand the new text to the base class.
	this->last_idx = 0;
	CTokenizer::set_text(txt);
}
45
46
const
char
*
CNGramTokenizer::get_name
()
const
47
{
48
return
"NGramTokenizer"
;
49
}
50
51
bool
CNGramTokenizer::has_next
()
52
{
53
return
last_idx
<=
text
.
size
()-
n
;
54
}
55
56
/* Produces the bounds of the next n-gram and advances by one position.
 *
 * start receives the current value of last_idx, which is then incremented.
 * The return value is start + n.
 */
index_t CNGramTokenizer::next_token_idx(index_t& start)
{
	start = last_idx;
	++last_idx;

	return start + n;
}
61
62
/* Allocates a new CNGramTokenizer with the same n-gram size.
 *
 * Only n is passed to the new instance; the current text is not handed
 * over. The instance is created with new and returned as a raw pointer.
 */
CNGramTokenizer* CNGramTokenizer::get_copy()
{
	return new CNGramTokenizer(n);
}
67
}
SHOGUN
机器学习工具包 - 项目文档