19 using namespace shogun;
25 init(NULL, 16,
false, 1, 0);
31 init(NULL, hash_bits, normalize, n_grams, skips);
35 int32_t hash_bits,
bool normalize, int32_t n_grams, int32_t skips) :
CConverter()
37 init(tzer, hash_bits, normalize, n_grams, skips);
45 void CHashedDocConverter::init(
CTokenizer* tzer, int32_t hash_bits,
bool normalize,
46 int32_t n_grams, int32_t skips)
66 SG_ADD(&
ngrams,
"ngrams",
"Number of consecutive tokens",
78 return "HashedDocConverter";
84 if (strcmp(features->
get_name(),
"StringFeatures")!=0)
85 SG_ERROR(
"CHashedConverter::apply() : CFeatures object passed is not of type CStringFeatures.");
94 matrix[vec_idx] =
apply(doc);
104 const int32_t array_size = 1024*1024;
113 int32_t len = cached_hashes.
vlen - 1;
120 const int32_t seed = 0xdeadbeaf;
123 while (hashes_end<ngrams-1+tokens_to_skip && tokenizer->has_next())
127 end-token_start, seed);
128 cached_hashes[hashes_end++] = token_hash;
136 end-token_start, seed);
137 cached_hashes[hashes_end] = token_hash;
143 hashed_indices.append_element(ngram_indices[i]);
147 if (hashes_end==cached_hashes.
vlen)
149 if (hashes_start==cached_hashes.
vlen)
156 while (hashes_start!=hashes_end)
162 for (
index_t i=0; i<max_idx; i++)
163 hashed_indices.append_element(ngram_indices[i]);
166 if (hashes_start==cached_hashes.
vlen)
181 return sparse_doc_rep;
195 (hashed_indices[i+1]==hashed_indices[i]) )
202 return sparse_doc_rep;
209 ngram_hashes[h_idx++] = hashes[hashes_start] & ((1 <<
num_bits) -1);
218 uint32_t ngram_hash = hashes[hashes_start];
219 for (
index_t i=hashes_start+1+s; i<=hashes_start+n+s; i++)
220 ngram_hash = ngram_hash ^ hashes[i % hashes.
vlen];
221 ngram_hash = ngram_hash & ((1 << num_bits) - 1);
222 ngram_hashes[h_idx++] = ngram_hash;
233 int32_t num_nnz_features = 0;
238 (hashed_indices[i+1]==hashed_indices[i]) )
243 return num_nnz_features;