SHOGUN
3.2.1
首页
相关页面
模块
类
文件
文件列表
文件成员
全部
类
命名空间
文件
函数
变量
类型定义
枚举
枚举值
友元
宏定义
组
页
src
shogun
converter
HashedDocConverter.h
浏览该文件的文档.
1
/*
2
* This program is free software; you can redistribute it and/or modify
3
* it under the terms of the GNU General Public License as published by
4
* the Free Software Foundation; either version 3 of the License, or
5
* (at your option) any later version.
6
*
7
* Written (W) 2013 Evangelos Anagnostopoulos
8
* Copyright (C) 2013 Evangelos Anagnostopoulos
9
*/
10
11
#ifndef _HASHEDDOCCONVERTER__H__
12
#define _HASHEDDOCCONVERTER__H__
13
14
#include <
shogun/converter/Converter.h
>
15
#include <
shogun/features/Features.h
>
16
#include <
shogun/lib/Tokenizer.h
>
17
#include <
shogun/features/SparseFeatures.h
>
18
19
namespace
shogun
20
{
21
/**
 * Forward declarations of types named by the class declared further down in
 * this namespace.
 *
 * NOTE(review): the bare integers on their own lines (22, 23, 24) are listing
 * line numbers carried over from the documentation page, not C++ tokens.
 *
 * NOTE(review): this header also includes Converter.h, Features.h, Tokenizer.h
 * and SparseFeatures.h. CConverter is used as a base class below, which needs
 * a complete type, so the Converter.h include (not this forward declaration)
 * is what satisfies that use.
 *
 * NOTE(review): CSparseFeatures is forward-declared as a class template with
 * one type parameter, but it is not referenced anywhere in the declarations
 * visible in this file - confirm whether it is still needed.
 */
class
CFeatures;
22
class
CTokenizer;
23
class
CConverter;
24
template
<
class
T>
class
CSparseFeatures
;
25
37
/**
 * CHashedDocConverter publicly derives from CConverter. It declares two
 * apply() overloads (CFeatures* -> CFeatures*, and SGVector<char> ->
 * SGSparseVector<float64_t>), a static n-gram hashing helper, two setters,
 * and protected helpers and state.
 *
 * NOTE(review): judging by the member names this converts tokenized documents
 * into hashed sparse vectors - confirm against the implementation file.
 *
 * NOTE(review): the bare integers on their own lines (38, 39, 41, ...) are
 * listing line numbers carried over from the documentation page, not C++
 * tokens. They jump (e.g. 42 to 51), so the original per-member doc comments
 * appear to have been dropped from this listing.
 *
 * NOTE(review): SGVector, SGSparseVector, CDynamicArray, index_t and
 * float64_t are used below, but none of the headers included by this file
 * names them directly - presumably they arrive transitively; verify.
 */
class
CHashedDocConverter
:
public
CConverter
38
{
39
public
:
41
/** Default constructor; takes no arguments. */
CHashedDocConverter
();
42
51
/**
 * Constructor without a tokenizer argument.
 *
 * Defaults: normalize = false, n_grams = 1, skips = 0.
 *
 * NOTE(review): hash_bits presumably gives the size of the hash space in
 * bits - confirm.
 * NOTE(review): this constructor is callable with a single int32_t and is not
 * declared explicit, so an int32_t converts implicitly to
 * CHashedDocConverter - confirm that is intended.
 */
CHashedDocConverter
(int32_t hash_bits,
bool
normalize =
false
, int32_t n_grams = 1, int32_t skips = 0);
52
61
/**
 * Constructor taking a tokenizer pointer in addition to the parameters of the
 * constructor above.
 *
 * Defaults: normalize = false, n_grams = 1, skips = 0.
 *
 * NOTE(review): ownership / reference counting of tzer is not visible from
 * this declaration - check the implementation.
 */
CHashedDocConverter
(
CTokenizer
* tzer, int32_t hash_bits,
bool
normalize =
false
, int32_t n_grams = 1,
62
int32_t skips = 0);
63
65
/** Virtual destructor. */
virtual
~CHashedDocConverter
();
66
72
/**
 * Virtual apply() taking and returning a CFeatures pointer.
 *
 * NOTE(review): presumably overrides a virtual in CConverter; the base class
 * is not visible here - confirm. Ownership of the returned pointer is not
 * visible from this declaration.
 */
virtual
CFeatures
*
apply
(
CFeatures
* features);
73
79
/**
 * Non-virtual apply() overload: takes an SGVector<char> by value and returns
 * an SGSparseVector<float64_t> by value.
 */
SGSparseVector<float64_t>
apply
(
SGVector<char>
document);
80
99
/**
 * Static helper; hashes and ngram_hashes are passed by non-const reference.
 *
 * The parameters num_bits, ngrams and tokens_to_skip share their names with
 * data members declared below. Because this function is static it has no
 * object, so inside it those names refer to the parameters only.
 *
 * NOTE(review): the meaning of the index_t return value is not visible here;
 * presumably the number of n-gram hashes produced - confirm.
 */
static
index_t
generate_ngram_hashes
(
SGVector<uint32_t>
& hashes,
index_t
hashes_start,
index_t
len,
100
SGVector<index_t>
& ngram_hashes, int32_t
num_bits
, int32_t
ngrams
, int32_t
tokens_to_skip
);
101
103
/** Virtual const accessor returning a const char pointer. */
virtual
const
char
*
get_name
()
const
;
104
109
/**
 * Setter taking a bool.
 *
 * NOTE(review): presumably stores into should_normalize - confirm in the
 * implementation.
 */
void
set_normalization
(
bool
normalize);
110
118
/**
 * Setter taking two int32_t values, k first and n second.
 *
 * NOTE(review): the constructors take n_grams before skips, whereas this
 * takes k before n. Presumably k maps to the skip count and n to the n-gram
 * size, i.e. the opposite order - confirm, and take care at call sites.
 */
void
set_k_skip_n_grams
(int32_t k, int32_t n);
119
protected
:
120
122
/**
 * Protected helper with the same parameter list as the tokenizer
 * constructor, except that the second parameter is named d rather than
 * hash_bits and there are no default arguments.
 *
 * NOTE(review): presumably the shared initialisation used by the
 * constructors - confirm.
 */
void
init
(
CTokenizer
* tzer, int32_t d,
bool
normalize, int32_t n_grams, int32_t skips);
123
130
/**
 * Takes a CDynamicArray<uint32_t> by non-const reference and returns an
 * int32_t.
 *
 * NOTE(review): a non-const reference permits modifying the argument (for
 * example sorting it) - check the implementation before relying on the array
 * being unchanged afterwards.
 */
int32_t
count_distinct_indices
(
CDynamicArray<uint32_t>
& hashed_indices);
131
138
/**
 * Takes a CDynamicArray<uint32_t> by non-const reference and returns an
 * SGSparseVector<float64_t> by value.
 */
SGSparseVector<float64_t>
create_hashed_representation
(
CDynamicArray<uint32_t>
& hashed_indices);
139
140
/* Second protected: label; access is already protected at this point, so it
 * only separates the helper methods above from the data members below. */
protected
:
141
143
/** int32_t member. NOTE(review): presumably set from hash_bits / d - confirm. */
int32_t
num_bits
;
144
146
/** Pointer to a CTokenizer; ownership is not visible from this header. */
CTokenizer
*
tokenizer
;
147
149
/** bool flag. NOTE(review): presumably controlled by set_normalization - confirm. */
bool
should_normalize
;
150
152
/** int32_t member. NOTE(review): presumably the n-gram size - confirm. */
int32_t
ngrams
;
153
155
/** int32_t member. NOTE(review): presumably the skip count for k-skip n-grams - confirm. */
int32_t
tokens_to_skip
;
156
};
157
}
158
159
#endif
SHOGUN
机器学习工具包 - 项目文档