24 if ( (n_grams==1 && skips!=0) || (skips<0))
27 init(hash_bits, docs, tzer, normalize, n_grams, skips);
43 CTokenizer* tzer,
bool normalize, int32_t n_grams, int32_t skips)
59 SG_ADD(&
ngrams,
"ngrams",
"Number of tokens to combine for quadratic feature support",
93 SGVector<char> sv2 = hddf->doc_collection->get_feature_vector(vec_idx2);
102 hddf->doc_collection->free_feature_vector(sv2, vec_idx2);
124 int32_t len = hashes.
vlen - 1;
134 const int32_t seed = 0xdeadbeaf;
137 while (hashes_end<ngrams-1+tokens_to_skip && local_tzer->has_next())
141 hashes[hashes_end++] = token_hash;
149 hashes[hashes_end] = token_hash;
155 result += vec2[hashed_indices[i]];
159 if (hashes_end==hashes.
vlen)
161 if (hashes_start==hashes.
vlen)
167 while (hashes_start!=hashes_end)
173 for (
index_t i=0; i<max_idx; i++)
174 result += vec2[hashed_indices[i]];
177 if (hashes_start==hashes.
vlen)
187 float64_t* vec2, int32_t vec2_len,
bool abs_val)
211 const int32_t seed = 0xdeadbeaf;
214 while (hashes_end<ngrams-1+tokens_to_skip && local_tzer->has_next())
218 hashes[hashes_end++] = token_hash;
225 hashes[hashes_end] = token_hash;
231 vec2[hashed_indices[i]] += value;
235 if (hashes_end==hashes.vlen)
237 if (hashes_start==hashes.vlen)
243 while (hashes_start!=hashes_end)
249 for (
index_t i=0; i<max_idx; i++)
250 vec2[hashed_indices[i]] += value;
253 if (hashes_start==hashes.vlen)
263 int32_t length, int32_t num_bits, uint32_t seed)
266 return hash & ((1 <<
num_bits) - 1);
278 int32_t num_nnz_features = sv.
size();
280 return num_nnz_features;
302 return "HashedDocDotFeatures";