SHOGUN
3.2.1
首页
相关页面
模块
类
文件
文件列表
文件成员
全部
类
命名空间
文件
函数
变量
类型定义
枚举
枚举值
友元
宏定义
组
页
src
shogun
lib
DelimiterTokenizer.cpp
浏览该文件的文档.
1
/*
2
* This program is free software; you can redistribute it and/or modify
3
* it under the terms of the GNU General Public License as published by
4
* the Free Software Foundation; either version 3 of the License, or
5
* (at your option) any later version.
6
*
7
* Written (W) 2013 Evangelos Anagnostopoulos
8
* Copyright (C) 2013 Evangelos Anagnostopoulos
9
*/
10
11
#include <
shogun/base/Parameter.h
>
12
#include <
shogun/lib/DelimiterTokenizer.h
>
13
14
namespace
shogun
15
{
16
17
/**
 * Construct a tokenizer with a 256-entry delimiter table.
 *
 * @param skip_delimiters stored as the skip_consecutive_delimiters flag
 */
CDelimiterTokenizer::CDelimiterTokenizer(bool skip_delimiters)
: delimiters(256)
{
	skip_consecutive_delimiters = skip_delimiters;
	last_idx = 0;

	// registers the parameters and zero-fills the delimiter table
	init();
}
23
24
/**
 * Copy constructor.
 *
 * Copies the text, the delimiter flags, the cursor position and the
 * skip_consecutive_delimiters flag of orig. The delimiter flags are copied
 * entry by entry into a table owned by this object.
 *
 * @param orig tokenizer to copy from
 */
CDelimiterTokenizer::CDelimiterTokenizer(const CDelimiterTokenizer& orig)
: delimiters(256)
{
	// init() zero-fills the delimiter table, so it has to run BEFORE the
	// flags are copied; running it afterwards would wipe them again.
	init();

	CTokenizer::set_text(orig.text);

	// Element-wise copy instead of SGVector assignment, so that the two
	// tokenizers do not end up writing to the same delimiter table.
	for (index_t i=0; i<delimiters.size() && i<orig.delimiters.size(); i++)
		delimiters[i] = orig.delimiters[i];

	// These members were previously left uninitialized by this constructor.
	last_idx = orig.last_idx;
	skip_consecutive_delimiters = orig.skip_consecutive_delimiters;
}
30
31
void
CDelimiterTokenizer::init()
32
{
33
SG_ADD
(&
last_idx
,
"last_idx"
,
"Index of last token"
,
34
MS_NOT_AVAILABLE
);
35
SG_ADD
(&
skip_consecutive_delimiters
,
"skip_consecutive_delimiters"
,
36
"Whether to skip consecutive delimiters or not"
,
MS_NOT_AVAILABLE
);
37
SGVector<bool>::fill_vector
(
delimiters
, 256, 0);
38
}
39
40
/**
 * Set the text to tokenize and rewind the cursor to its beginning.
 *
 * @param txt the new text, forwarded to CTokenizer::set_text
 */
void CDelimiterTokenizer::set_text(SGVector<char> txt)
{
	// a new text invalidates the previous cursor position
	last_idx = 0;

	CTokenizer::set_text(txt);
}
45
46
const
char
*
CDelimiterTokenizer::get_name
()
const
47
{
48
return
"DelimiterTokenizer"
;
49
}
50
51
bool
CDelimiterTokenizer::has_next
()
52
{
53
if
(
skip_consecutive_delimiters
)
54
{
55
for
(
index_t
i=
last_idx
; i<
text
.
size
(); i++)
56
{
57
if
(!
delimiters
[(uint8_t)
text
[i]])
58
return
true
;
59
}
60
return
false
;
61
}
62
else
63
return
last_idx
<
text
.
size
();
64
}
65
66
void
CDelimiterTokenizer::init_for_whitespace
()
67
{
68
clear_delimiters
();
69
delimiters
[
' '
] = 1;
70
delimiters
[
'\t'
] = 1;
71
}
72
73
void
CDelimiterTokenizer::clear_delimiters
()
74
{
75
memset(
delimiters
, 0,
sizeof
(
delimiters
));
76
}
77
78
/**
 * Locate the next token and move the internal cursor past it.
 *
 * When skip_consecutive_delimiters is set, delimiters at the cursor are
 * skipped before the token starts. Otherwise a delimiter at the cursor
 * yields an empty token (returned index == start).
 *
 * If no characters are left, an empty token positioned at the end of the
 * text is reported instead of reading past the buffer.
 *
 * @param start set to the index of the first character of the token
 * @return index one past the last character of the token, i.e. the index of
 *         the delimiter that terminated it, or the text length if the text
 *         ended first
 */
index_t CDelimiterTokenizer::next_token_idx(index_t& start)
{
	const index_t len = text.size();
	start = last_idx;

	if (skip_consecutive_delimiters)
	{
		// bounded scan: trailing delimiters must not run off the buffer
		while (start<len && delimiters[(uint8_t) text[start]])
			start++;
	}

	if (start>=len)
	{
		// nothing left to tokenize: report an empty token at 'start'
		last_idx = start;
	}
	else if (!delimiters[(uint8_t) text[start]])
	{
		// extend the token up to the next delimiter or the end of the text
		for (last_idx=start+1; last_idx<len; last_idx++)
		{
			if (delimiters[(uint8_t) text[last_idx]])
				break;
		}
	}

	// return the token end, then step the cursor over the terminating delimiter
	return last_idx++;
}
99
100
/**
 * Create a new tokenizer with the same delimiter flags and the same
 * skip_consecutive_delimiters setting. The text and the cursor position are
 * not copied.
 *
 * @return newly allocated tokenizer
 */
CDelimiterTokenizer* CDelimiterTokenizer::get_copy()
{
	CDelimiterTokenizer* t = new CDelimiterTokenizer();

	// Copy flag by flag: assigning the SGVector would make the new tokenizer
	// share this object's delimiter table, so a later change to one of them
	// would silently change the other.
	for (index_t i=0; i<delimiters.size() && i<t->delimiters.size(); i++)
		t->delimiters[i] = delimiters[i];

	t->skip_consecutive_delimiters = skip_consecutive_delimiters;
	return t;
}
107
108
void
CDelimiterTokenizer::set_skip_delimiters
(
bool
skip_delimiters)
109
{
110
skip_consecutive_delimiters
= skip_delimiters;
111
}
112
113
bool
CDelimiterTokenizer::get_skip_delimiters
()
const
114
{
115
return
skip_consecutive_delimiters
;
116
}
117
}
SHOGUN
机器学习工具包 - 项目文档