SHOGUN
3.2.1
Main Page
Related Pages
Modules
Classes
Files
File List
File Members
All
Classes
Namespaces
Files
Functions
Variables
Typedefs
Enumerations
Enumerator
Friends
Macros
Groups
Pages
src
shogun
converter
HashedDocConverter.h
Go to the documentation of this file.
1
/*
2
* This program is free software; you can redistribute it and/or modify
3
* it under the terms of the GNU General Public License as published by
4
* the Free Software Foundation; either version 3 of the License, or
5
* (at your option) any later version.
6
*
7
* Written (W) 2013 Evangelos Anagnostopoulos
8
* Copyright (C) 2013 Evangelos Anagnostopoulos
9
*/
10
11
#ifndef _HASHEDDOCCONVERTER__H__
12
#define _HASHEDDOCCONVERTER__H__
13
14
#include <
shogun/converter/Converter.h
>
15
#include <
shogun/features/Features.h
>
16
#include <
shogun/lib/Tokenizer.h
>
17
#include <
shogun/features/SparseFeatures.h
>
18
19
namespace
shogun
20
{
21
class
CFeatures;
22
class
CTokenizer;
23
class
CConverter;
24
template
<
class
T>
class
CSparseFeatures
;
25
37
class
CHashedDocConverter
:
public
CConverter
38
{
39
public
:
41
CHashedDocConverter
();
42
51
CHashedDocConverter
(int32_t hash_bits,
bool
normalize =
false
, int32_t n_grams = 1, int32_t skips = 0);
52
61
CHashedDocConverter
(
CTokenizer
* tzer, int32_t hash_bits,
bool
normalize =
false
, int32_t n_grams = 1,
62
int32_t skips = 0);
63
65
virtual
~CHashedDocConverter
();
66
72
virtual
CFeatures
*
apply
(
CFeatures
* features);
73
79
SGSparseVector<float64_t>
apply
(
SGVector<char>
document);
80
99
static
index_t
generate_ngram_hashes
(
SGVector<uint32_t>
& hashes,
index_t
hashes_start,
index_t
len,
100
SGVector<index_t>
& ngram_hashes, int32_t
num_bits
, int32_t
ngrams
, int32_t
tokens_to_skip
);
101
103
virtual
const
char
*
get_name
()
const
;
104
109
void
set_normalization
(
bool
normalize);
110
118
void
set_k_skip_n_grams
(int32_t k, int32_t n);
119
protected
:
120
122
void
init
(
CTokenizer
* tzer, int32_t d,
bool
normalize, int32_t n_grams, int32_t skips);
123
130
int32_t
count_distinct_indices
(
CDynamicArray<uint32_t>
& hashed_indices);
131
138
SGSparseVector<float64_t>
create_hashed_representation
(
CDynamicArray<uint32_t>
& hashed_indices);
139
140
protected
:
141
143
int32_t
num_bits
;
144
146
CTokenizer
*
tokenizer
;
147
149
bool
should_normalize
;
150
152
int32_t
ngrams
;
153
155
int32_t
tokens_to_skip
;
156
};
157
}
158
159
#endif
SHOGUN
Machine Learning Toolbox - Documentation