Language model. More...
#include <stdio.h>
#include <logmath.h>
#include <hash_table.h>
#include <cmd_ln.h>
#include "s3types.h"
#include "lmclass.h"
#include "dict.h"
Go to the source code of this file.
Classes | |
struct | lmlog_t |
Log quantities represented in either floating or integer format. More... | |
struct | sorted_entry_s |
struct | sorted_list_t |
The sorted list used in LM reading. The list is a (64K long) array. The first entry is the root of the tree and is created during initialization. More... | |
struct | ug_t |
A unigram structure Please see. More... | |
struct | bg_t |
A bigram structure. More... | |
struct | bg32_t |
A bigram structure which has 32 bits. More... | |
struct | tg_t |
A trigram structure. More... | |
struct | tg32_t |
A 32 bits version of tg_t. More... | |
struct | membg_t |
Management of in-memory bigrams. Not used if all bigrams in memory. More... | |
struct | membg32_t |
A 32 bits version of membg_t. More... | |
struct | tginfo_s |
struct | tginfo32_s |
struct | lm_tgcache_entry_t |
struct | lm_tgcache_entry32_t |
struct | lm_s |
struct | lmset_s |
struct | wordprob_t |
Generic structure that could be used at any n-gram level. More... | |
Defines | |
#define | LM_DICTWID_BADMAP -16000 |
#define | LM_CLASSID_BASE 0x01000000 |
#define | LM_LEGACY_CONSTANT BAD_S3LMWID |
#define | LM_SPHINX_CONSTANT BAD_S3LMWID32 |
#define | LM_CLASSID_TO_CLASS(m, i) ((m)->lmclass[(i)-LM_CLASSID_BASE]) |
#define | MIN_PROB_F -99.0 |
#define | LM_ALLOC_BLOCK 16 |
#define | LM_SUCCESS 1 |
#define | LM_FAIL 0 |
#define | LM_NOT_FOUND -1 |
#define | LM_OFFSET_TOO_LARGE -2 |
#define | LM_NO_DATA_MARK -3 |
#define | LM_UNKNOWN_NG -4 |
#define | LM_BAD_LM_COUNT -5 |
#define | LM_UNKNOWN_WORDS -6 |
#define | LM_BAD_BIGRAM -7 |
#define | LM_BAD_TRIGRAM -8 |
#define | LM_BAD_QUADGRAM -9 |
#define | LM_BAD_QUINGRAM -10 |
#define | LM_BAD_NGRAM -11 |
#define | LM_TOO_MANY_NGRAM -12 |
#define | LM_NO_MINUS_1GRAM -13 |
#define | LM_FILE_NOT_FOUND -14 |
#define | LM_CANNOT_ALLOCATE -15 |
#define | LMDMP_VERSIONNULL 0 |
#define | LMDMP_VERSION_TG_16BIT -1 |
#define | LMDMP_VERSION_TG_16BIT_V2 -2 |
#define | LMDMP_VERSION_TG_32BIT -3 |
#define | LMTXT_VERSION 1000 |
#define | LMFST_VERSION 1001 |
#define | LMFORCED_TXT32VERSION 1002 |
#define | NO_WORD -1 |
#define | LOG2_BG_SEG_SZ 9 |
#define | BG_SEG_SZ (1 << (LOG2_BG_SEG_SZ)) |
#define | LM_TGCACHE_SIZE 100003 |
#define | lm_lmwid2dictwid(lm, u) ((lm)->ug[u].dictwid) |
#define | lm_n_ug(lm) ((lm)->n_ug) |
#define | lm_n_bg(lm) ((lm)->n_bg) |
#define | lm_n_tg(lm) ((lm)->n_tg) |
#define | lm_wordstr(lm, u) ((lm)->wordstr[u]) |
#define | lm_startwid(lm) ((lm)->startlwid) |
#define | lm_finishwid(lm) ((lm)->finishlwid) |
#define | lm_access_type(lm) ((lm)->access_type) |
#define | LM_TGPROB(lm, tgptr) ((lm)->tgprob[(tgptr)->probid].l) |
#define | LM_BGPROB(lm, bgptr) ((lm)->bgprob[(bgptr)->probid].l) |
#define | LM_UGPROB(lm, ugptr) ((ugptr)->prob.l) |
#define | LM_RAWSCORE(lm, score) ((score - (lm)->wip) / ((lm)->lw)) |
#define | LM_DICTWID(lm, lmwid) ((lm)->ug[(lmwid)].dictwid) |
Typedefs | |
typedef struct sorted_entry_s | sorted_entry_t |
typedef struct tginfo_s | tginfo_t |
typedef struct tginfo32_s | tginfo32_t |
typedef struct lm_s | lm_t |
typedef struct lmset_s | lmset_t |
Functions | |
S3DECODER_EXPORT lmset_t * | lmset_init (const char *lmfile, const char *lmctlfile, const char *ctl_lm, const char *lmname, const char *lmdumpdir, float32 lw, float32 wip, float32 uw, dict_t *dict, logmath_t *logmath) |
lmset_t * | lmset_read_lm (const char *lmfile, dict_t *dict, const char *lmname, float64 lw, float64 wip, float64 uw, const char *lmdumpdir, logmath_t *logmath) |
lmset_t * | lmset_read_ctl (const char *ctlfile, dict_t *dict, float64 lw, float64 wip, float64 uw, const char *lmdumpdir, logmath_t *logmath) |
lm_t * | lmset_get_lm_widx (lmset_t *lms, int32 lmidx) |
lm_t * | lmset_get_lm_wname (lmset_t *lms, const char *lmname) |
void | lmset_set_curlm_widx (lmset_t *lms, int32 lmidx) |
S3DECODER_EXPORT void | lmset_set_curlm_wname (lmset_t *lms, const char *lmname) |
int32 | lmset_name_to_idx (lmset_t *lms, const char *lmname) |
char * | lmset_idx_to_name (lmset_t *lms, int32 lmidx) |
void | lmset_add_lm (lmset_t *lms, lm_t *lm, const char *lmname) |
void | lmset_delete_lm (lmset_t *lms, const char *lmname) |
S3DECODER_EXPORT void | lmset_free (lmset_t *lms) |
int32 | lm_tglist (lm_t *lmp, s3lmwid32_t w1, s3lmwid32_t w2, tg_t **tg, int32 *bowt) |
int32 | lm_tg32list (lm_t *lmp, s3lmwid32_t w1, s3lmwid32_t w2, tg32_t **tg, int32 *bowt) |
int32 | lm_bglist (lm_t *lmp, s3lmwid32_t w, bg_t **bg, int32 *bowt) |
int32 | lm_bg32list (lm_t *lmp, s3lmwid32_t w, bg32_t **bg, int32 *bowt) |
s3lmwid32_t | lm_wid (lm_t *lm, const char *wd) |
void | lm_null_struct (lm_t *lm) |
int32 | lm_ug_wordprob (lm_t *lm, dict_t *dict, int32 th, wordprob_t *wp) |
int32 | lm_uglist (lm_t *lmp, ug_t **ug) |
int32 | lm_ug_score (lm_t *lmp, s3lmwid32_t lwid, s3wid_t wid) |
int32 | lm_ug_exists (lm_t *lm, s3lmwid32_t lwid) |
int32 | lm_bg_score (lm_t *lmp, s3lmwid32_t lw1, s3lmwid32_t lw2, s3wid_t w2) |
int32 | lm_bg_exists (lm_t *lm, s3lmwid32_t lw1, s3lmwid32_t lw2) |
int32 | lm_tg_score (lm_t *lmp, s3lmwid32_t lw1, s3lmwid32_t lw2, s3lmwid32_t lw3, s3wid_t w3) |
int32 | lm_tg_exists (lm_t *lm, s3lmwid32_t lw1, s3lmwid32_t lw2, s3lmwid32_t lw3) |
void | lm_set_param (lm_t *lm, float64 lw, float64 wip) |
S3DECODER_EXPORT int32 | lm_rawscore (lm_t *lm, int32 score) |
S3DECODER_EXPORT void | lm_cache_reset (lm_t *lmp) |
S3DECODER_EXPORT void | lm_cache_stats_dump (lm_t *lmp) |
lm_t * | lm_read (const char *file, const char *lmname, cmd_ln_t *config, logmath_t *logmath) |
lm_t * | lm_read_advance (const char *file, const char *lmname, float64 lw, float64 wip, float64 uw, int32 ndict, const char *fmt, int32 applyweight, logmath_t *logmath) |
S3DECODER_EXPORT lm_t * | lm_read_advance2 (const char *file, const char *lmname, float64 lw, float64 wip, float64 uw, int32 ndict, const char *fmt, int32 applyweight, int lminmemory, logmath_t *logmath) |
S3DECODER_EXPORT int32 | lm_write (lm_t *model, const char *outputfile, const char *filename, const char *fmt) |
int32 | lm_write_advance (lm_t *model, const char *outputfile, const char *filename, const char *fmt, const char *inputenc, char *outputenc) |
S3DECODER_EXPORT void | lm_free (lm_t *lm) |
int32 | lm_add_wordlist (lm_t *lm, dict_t *dict, const char *filename) |
int32 | lm_add_word_to_ug (lm_t *lm, dict_t *dict, const char *newword) |
int32 | lm_get_classid (lm_t *model, const char *name) |
void | lm_convert_structure (lm_t *model, int32 is32bits) |
int32 | lm_is32bits (lm_t *model) |
void | ug_write (FILE *fp, ug_t *ug) |
void | bg_write (FILE *fp, bg_t *bg) |
void | bg32_write (FILE *fp, bg32_t *bg) |
void | tg_write (FILE *fp, tg_t *tg) |
void | tg32_write (FILE *fp, tg32_t *tg) |
void | copy_bg_to_bg32 (lm_t *lm) |
void | copy_bg32_to_bg (lm_t *lm) |
void | copy_tg_to_tg32 (lm_t *lm) |
void | copy_tg32_to_tg (lm_t *lm) |
void | swap_bg (bg_t *bg) |
void | swap_bg32 (bg32_t *bg) |
void | swap_tg (tg_t *tg) |
void | swap_tg32 (tg32_t *tg) |
int32 | find_bg (bg_t *bg, int32 n, s3lmwid32_t w) |
int32 | find_bg32 (bg32_t *bg, int32 n, s3lmwid32_t w) |
int32 | find_tg (tg_t *tg, int32 n, s3lmwid32_t w) |
int32 | find_tg32 (tg32_t *tg, int32 n, s3lmwid32_t w) |
ug_t * | NewUnigramTable (int32 n_ug) |
Language model.
This is the header file for language model support in Sphinx 3. Sphinx 3 supports language model in 4 formats. The four formats are
ARPA format: First appear in Sphinx 2. We port it to Sphinx 3 in 3.X (X=6)
DMP : Sphinx 3 slow and fast used it, so does later in Sphinx 3.X (X>4)
DMP32 : We start to break the limit of number of words of 65535. This is the first LM file format in Sphinx 3.X that could capture 4 billion words in the language model
FST: In AT&T format, we start to support in 3.X (X=6).
As of 20060302 we can only read and use ARPA and DMP-based formats in the decoder. We can write ARPA, DMP, DMP32 and FST file formats.
#define BG_SEG_SZ (1 << (LOG2_BG_SEG_SZ)) |
#define lm_access_type | ( | lm | ) | ((lm)->access_type) |
#define LM_ALLOC_BLOCK 16 |
#define LM_BAD_BIGRAM -7 |
A bad bigram, it could be word ids larger than # of unigram, it could be word id smaller than 0. It could also be bigram out of bound.
#define LM_BAD_LM_COUNT -5 |
When reading LM, if count is bad, return this msg
#define LM_BAD_NGRAM -11 |
(RESERVED BUT NOT USED) A bad n-gram. generalization of message -7 to -10. In our case, we don't make the message as specific as possible.
#define LM_BAD_QUADGRAM -9 |
(RESERVED BUT NOT USED) A bad quadgram (4-gram), it could be word ids larger than # of unigram, it could be word id smaller than 0. It could also be bigram out of bound.
#define LM_BAD_QUINGRAM -10 |
(RESERVED BUT NOT USED) A bad quingram (5-gram), it could be word ids larger than # of unigram, it could be word id smaller than 0. It could also be bigram out of bound. BTW, there is no need to remind me the mixed use of quadgram and quingram is stupid English. I read Manning and Schultze.
#define LM_BAD_TRIGRAM -8 |
A bad trigram, it could be word ids larger than # of unigram, it could be word id smaller than 0. It could also be bigram out of bound.
#define LM_BGPROB | ( | lm, | |||
bgptr | ) | ((lm)->bgprob[(bgptr)->probid].l) |
#define LM_CANNOT_ALLOCATE -15 |
When cannot allocate tables in LM return this message
#define LM_CLASSID_BASE 0x01000000 |
#define LM_CLASSID_TO_CLASS | ( | m, | |||
i | ) | ((m)->lmclass[(i)-LM_CLASSID_BASE]) |
#define LM_DICTWID | ( | lm, | |||
lmwid | ) | ((lm)->ug[(lmwid)].dictwid) |
#define LM_DICTWID_BADMAP -16000 |
#define LM_FAIL 0 |
Constant that define an operation failed.
#define LM_FILE_NOT_FOUND -14 |
When couldn't find the LM file, return this message
#define lm_finishwid | ( | lm | ) | ((lm)->finishlwid) |
#define LM_LEGACY_CONSTANT BAD_S3LMWID |
Upper limit of the words of Sphinx 3.X =65535 (~65k), this is introduced since 1996 when Ravi first wrote Sphinx 3.0. It was with us since.
#define lm_lmwid2dictwid | ( | lm, | |||
u | ) | ((lm)->ug[u].dictwid) |
Access macros; not meant for arbitrary use
#define lm_n_bg | ( | lm | ) | ((lm)->n_bg) |
#define lm_n_tg | ( | lm | ) | ((lm)->n_tg) |
#define lm_n_ug | ( | lm | ) | ((lm)->n_ug) |
#define LM_NO_DATA_MARK -3 |
When reading text-based LM, return this if we see no data mark
#define LM_NO_MINUS_1GRAM -13 |
When reading n-gram, if the corresponding (n-1)-gram doesn't exist, return this message.
#define LM_NOT_FOUND -1 |
Constant which indicate an LM couldn't be found
#define LM_OFFSET_TOO_LARGE -2 |
Constant where the 16 bit LM was used, but the tgcount is larger than LM_LEGACY_CONSTANT (65535). This breaks the addressing scheme in the current LM.
#define LM_RAWSCORE | ( | lm, | |||
score | ) | ((score - (lm)->wip) / ((lm)->lw)) |
#define LM_SPHINX_CONSTANT BAD_S3LMWID32 |
(4 billion), ARCHAN: this was introduced in Sphinx 3.6 during the time of Release Candidate I (2006 March). The caveat of using this constant is that it is much harder to detect byte-swapping problems in general. Also, if the world has more than 10000 cities, each with 1 million road names, we are stuck in this case. I assume this will happen in year 3001.
#define lm_startwid | ( | lm | ) | ((lm)->startlwid) |
#define LM_SUCCESS 1 |
Success and error messages. Constant that indicates an operation succeeded
#define LM_TGCACHE_SIZE 100003 |
#define LM_TGPROB | ( | lm, | |||
tgptr | ) | ((lm)->tgprob[(tgptr)->probid].l) |
#define LM_TOO_MANY_NGRAM -12 |
When reading LM, if the number of n-grams is more than the number specified in the header, return this message
#define LM_UGPROB | ( | lm, | |||
ugptr | ) | ((ugptr)->prob.l) |
Referenced by word_trans().
#define LM_UNKNOWN_NG -4 |
When reading the header of LM, if there is unknown K for K-gram
#define LM_UNKNOWN_WORDS -6 |
When an unknown word is found during LM reading, return this message
#define lm_wordstr | ( | lm, | |||
u | ) | ((lm)->wordstr[u]) |
#define LMDMP_VERSION_TG_16BIT -1 |
VERSION 1 is the simplest DMP file which is trigram or lower which used 16 bits in bigram and trigram.
#define LMDMP_VERSION_TG_16BIT_V2 -2 |
VERSION 2 means legacy VERSION 1 DMP file which has log_bg_seg_sz != 9
#define LMDMP_VERSION_TG_32BIT -3 |
VERSION 3 is the 32 bit extension of VERSION 1 but the bigram and trigram are represented by 32 bits data structure
#define LMDMP_VERSIONNULL 0 |
Versioning of LM VERSION 0 is oldest, in the past, we used to use the version number to store the number of unigram, you will see logic that said vn > LMDMP_VERSIONNULL
#define LMFORCED_TXT32VERSION 1002 |
VERSION 1002 is the internal version of text-based LM. The difference between 1002 and 1000 is that 1002 will assume LM is 32bits. This fact is used in lm_is32bits(lm)
#define LMFST_VERSION 1001 |
VERSION 1001 is the FST-based LM
#define LMTXT_VERSION 1000 |
VERSION 1000 is the text-based LM
#define LOG2_BG_SEG_SZ 9 |
#define MIN_PROB_F -99.0 |
The minimum value of probabilities and backoff weights. When changing, notice that both s2 and s3 may transform this number to very small integer (say -2e-31) This will easily cause integer wrap around. -99 is chosen for that reason.
#define NO_WORD -1 |
typedef struct sorted_entry_s sorted_entry_t |
typedef struct tginfo32_s tginfo32_t |
void bg32_write | ( | FILE * | fp, | |
bg32_t * | bg | |||
) |
Write of BG (32bits) structure
fp | A file pointer | |
bg | A pointer of the bg32_t structure |
void bg_write | ( | FILE * | fp, | |
bg_t * | bg | |||
) |
Write of BG structure
fp | A file pointer | |
bg | A pointer of the bg_t structure |
void copy_bg32_to_bg | ( | lm_t * | lm | ) |
Convert the 32 bit bigram structure to 16 bit
lm | LM |
void copy_bg_to_bg32 | ( | lm_t * | lm | ) |
Convert the 16 bit bigram structure to 32 bit
lm | LM |
void copy_tg32_to_tg | ( | lm_t * | lm | ) |
Convert the 32 bit trigram structure to 16 bit
lm | LM |
void copy_tg_to_tg32 | ( | lm_t * | lm | ) |
Convert the 16 bit trigram structure to 32 bit
lm | LM |
int32 find_bg | ( | bg_t * | bg, | |
int32 | n, | |||
s3lmwid32_t | w | |||
) |
bg | In: The bigram |
int32 find_bg32 | ( | bg32_t * | bg, | |
int32 | n, | |||
s3lmwid32_t | w | |||
) |
bg | In: The bigram |
int32 find_tg | ( | tg_t * | tg, | |
int32 | n, | |||
s3lmwid32_t | w | |||
) |
tg | In: The trigram |
int32 find_tg32 | ( | tg32_t * | tg, | |
int32 | n, | |||
s3lmwid32_t | w | |||
) |
tg | In: The trigram |
Add a word to the LM
look up the dictionary and see whether it exists in the dictionary Looks alike with wid.c's logic at this point.
(Incomplete!) Not fully tested in the situation for on-line recognition.
We also avoid the addition of classes at this point because that could complicate things quite a lot.
lm | In/Out: a modified LM structure | |
dict | In: an initialized dictionary structure Used to update lmwid2dictid mapping. | |
newword | In: a pointer of a new word |
Add word list to the LM. For each word in the file, call lm_add_wordlist. The file is assumed to have a format like this: <word1> <word2> <word3> <word4>
If the lmwid2dictid mapping is not updated, or the dictionary itself is not used in the context. Just specify dict=NULL;
lm | In/Out: a modified LM structure | |
dict | In: an initialized dictionary structure Used to update lmwid2dictid mapping. | |
filename | In: a file that contains a list of word one wants to add |
int32 lm_bg32list | ( | lm_t * | lmp, | |
s3lmwid32_t | w, | |||
bg32_t ** | bg, | |||
int32 * | bowt | |||
) |
lmp | In: LM being queried | |
w | In: LM word id of the 1-word history | |
bg | Out: *bg = array of bigrams for w | |
bowt | Out: *bowt = backoff-weight for w |
int32 lm_bg_exists | ( | lm_t * | lm, | |
s3lmwid32_t | lw1, | |||
s3lmwid32_t | lw2 | |||
) |
Whether a certain bigram exists.
lm | In: LM |
int32 lm_bg_score | ( | lm_t * | lmp, | |
s3lmwid32_t | lw1, | |||
s3lmwid32_t | lw2, | |||
s3wid_t | w2 | |||
) |
lmp | In: LM being queried |
int32 lm_bglist | ( | lm_t * | lmp, | |
s3lmwid32_t | w, | |||
bg_t ** | bg, | |||
int32 * | bowt | |||
) |
Return the bigram followers for the given word w. Return value: #bigrams in returned list.
lmp | In: LM being queried | |
w | In: LM word id of the 1-word history | |
bg | Out: *bg = array of bigrams for w | |
bowt | Out: *bowt = backoff-weight for w |
S3DECODER_EXPORT void lm_cache_reset | ( | lm_t * | lmp | ) |
LM cache related
lmp | In: the LM |
S3DECODER_EXPORT void lm_cache_stats_dump | ( | lm_t * | lmp | ) |
LM cache statistic dumping
lmp | In: the LM |
void lm_convert_structure | ( | lm_t * | model, | |
int32 | is32bits | |||
) |
Explicitly convert structure from 16bit -> 32bit or 32bit to 16bit.
model | In: LM file being used |
S3DECODER_EXPORT void lm_free | ( | lm_t * | lm | ) |
int32 lm_get_classid | ( | lm_t * | model, | |
const char * | name | |||
) |
Get class ID given a LM.
model | In: LM file being queried | |
name | In: The name of the class |
int32 lm_is32bits | ( | lm_t * | model | ) |
Check whether the model is operating at 32 bits
void lm_null_struct | ( | lm_t * | lm | ) |
Set all pointers to NULL in the lm
S3DECODER_EXPORT int32 lm_rawscore | ( | lm_t * | lm, | |
int32 | score | |||
) |
lm | In: the LM |
lm_t* lm_read | ( | const char * | file, | |
const char * | lmname, | |||
cmd_ln_t * | config, | |||
logmath_t * | logmath | |||
) |
A simple version of reading in a LM
lm_read is a simple version of lm_read_advance. It will assume language weight, word insertion penalty and unigram weight to be automatically applied. There is also no class-based LM (so ndict=0). Format is set to NULL, so the program will determine it automatically.
file | In: LM file being read | |
lmname | In: LM name |
lm_t* lm_read_advance | ( | const char * | file, | |
const char * | lmname, | |||
float64 | lw, | |||
float64 | wip, | |||
float64 | uw, | |||
int32 | ndict, | |||
const char * | fmt, | |||
int32 | applyweight, | |||
logmath_t * | logmath | |||
) |
Read an LM file, it will automatically decide whether the file is a DUMP file or a txt file. Then call lm_read_txt and lm_read_dump (non-public functions) correspondingly. Currently the code is not aware about OOV.
lw, wip, uw and ndict are mainly used for recognition purposes. When lm_read is used for other purposes, one could just use dummy settings. The recommended ones are lw=1.0, wip=0.1, uw=1.0 and ndict=0. These are very useful when lm_read is just used for reading the LM.
If applyweight is 0, lw, wip and uw will not be applied to the LM at all. This will allow users to just call the LM routine without initializing other modules (such as logs3_init).
If applyweight is 1, then logs3_init must be called before lm_read. This is usually the case when kb_init is called before the code.
fmt now could be either "TXT", "DMP" and "TXT32" or just NULL. If it is NULL, the LM format will be automatically determined. If it is specified as "TXT" or "DMP", the corresponding lm reader will be called. In such a case, it is important for the users to know what he/she is doing. (Unfortunately, this is mostly not true. ) In the case of "TXT32", a text LM will be forced to 32bit mode.
ndict is the dictionary size of the application. This is needed because class-based LM are addressed in the dictionary wid-space instead of lm wid-space. If class-based LM is not used, just set this to zero.
Note: there are two defense mechanisms of lm_read_advance. First of all, if no fmt is specified, it will start to read the lm in the order of DMP->TXT. Second, if txt format is specified but LM is found to hit the 16bit legacy segments limit, it will automatically switch to read TXT32 LM
file | In: LM file being read | |
lmname | In: LM name | |
lw | In: Language weight | |
wip | In: Word insertion penalty | |
uw | In: Unigram weight (interpolation with uniform distr.) | |
ndict | In: Number of dictionary entry. We need that because class-based LM is addressed in dictionary word ID space. | |
fmt | In: file format of the LM, it is now either "TXT", "DMP" and NULL, if NULL, file format is automatically determined | |
applyweight | In: whether lw,wip, uw should be applied to the lm or not |
S3DECODER_EXPORT lm_t* lm_read_advance2 | ( | const char * | file, | |
const char * | lmname, | |||
float64 | lw, | |||
float64 | wip, | |||
float64 | uw, | |||
int32 | ndict, | |||
const char * | fmt, | |||
int32 | applyweight, | |||
int | lminmemory, | |||
logmath_t * | logmath | |||
) |
file | In: LM file being read | |
lmname | In: LM name | |
lw | In: Language weight | |
wip | In: Word insertion penalty | |
uw | In: Unigram weight (interpolation with uniform distr.) | |
ndict | In: Number of dictionary entry. We need that because class-based LM is addressed in dictionary word ID space. | |
fmt | In: file format of the LM, it is now either "TXT", "DMP" and NULL, if NULL, file format is automatically determined | |
applyweight | In: whether lw,wip, uw should be applied to the lm or not | |
lminmemory | In: Whether LM is read into memory |
Referenced by main().
void lm_set_param | ( | lm_t * | lm, | |
float64 | lw, | |||
float64 | wip | |||
) |
Set the language-weight and insertion penalty parameters for the LM, after revoking any earlier set of such parameters.
WARNING!! This function doesn't prevent underflow of values. Make sure you call safe lm2logs3 before it.
lm | In: the LM | |
lw | In: the language weight | |
wip | In: the word insertion penalty |
int32 lm_tg32list | ( | lm_t * | lmp, | |
s3lmwid32_t | w1, | |||
s3lmwid32_t | w2, | |||
tg32_t ** | tg, | |||
int32 * | bowt | |||
) |
lmp | In: LM being queried | |
w1 | In: LM word id of the first of a 2-word history | |
w2 | In: LM word id of the second of the 2-word history | |
tg | Out: *tg = array of trigrams for <w1,w2> | |
bowt | Out: *bowt = backoff-weight for <w1, w2> |
int32 lm_tg_exists | ( | lm_t * | lm, | |
s3lmwid32_t | lw1, | |||
s3lmwid32_t | lw2, | |||
s3lmwid32_t | lw3 | |||
) |
Whether a certain trigram exists.
lm | In: LM |
int32 lm_tg_score | ( | lm_t * | lmp, | |
s3lmwid32_t | lw1, | |||
s3lmwid32_t | lw2, | |||
s3lmwid32_t | lw3, | |||
s3wid_t | w3 | |||
) |
Return trigram score for the given three word sequence. If w1 is BAD_LMWID(lm), return lm_bg_score (w2, w3). If both lw1 and lw2 are BAD_LMWID(lm), return lm_ug_score (lw3).
20040227: This also account for the in-class probability of w3.
lmp | In: LM being queried |
int32 lm_tglist | ( | lm_t * | lmp, | |
s3lmwid32_t | w1, | |||
s3lmwid32_t | w2, | |||
tg_t ** | tg, | |||
int32 * | bowt | |||
) |
Return trigram followers for given two words. Both w1 and w2 must be valid. Return value: #trigrams in returned list.
lmp | In: LM being queried | |
w1 | In: LM word id of the first of a 2-word history | |
w2 | In: LM word id of the second of the 2-word history | |
tg | Out: *tg = array of trigrams for <w1,w2> | |
bowt | Out: *bowt = backoff-weight for <w1, w2> |
int32 lm_ug_exists | ( | lm_t * | lm, | |
s3lmwid32_t | lwid | |||
) |
lm | LM | |
lwid | LM ID for the word |
int32 lm_ug_score | ( | lm_t * | lmp, | |
s3lmwid32_t | lwid, | |||
s3wid_t | wid | |||
) |
Return unigram score for the given word
lmp | In: LM being queried | |
lwid | LM ID for the word | |
wid | Dict ID for the word |
int32 lm_ug_wordprob | ( | lm_t * | lm, | |
dict_t * | dict, | |||
int32 | th, | |||
wordprob_t * | wp | |||
) |
Like lm_bg_wordprob, but for unigrams. Return value: #entries filled in the wordprob array.
lm | In: LM being queried | |
dict | In : The dictionary | |
wp | In/out: Array to be filled |
Return the unigrams in LM. Return value: #unigrams in returned list.
lmp | In: LM being queried | |
ug | Out: *ug = unigram array |
Referenced by word_trans().
s3lmwid32_t lm_wid | ( | lm_t * | lm, | |
const char * | wd | |||
) |
S3DECODER_EXPORT int32 lm_write | ( | lm_t * | model, | |
const char * | outputfile, | |||
const char * | filename, | |||
const char * | fmt | |||
) |
Simple writing of an LM file, the input and output encoding will be assumed to be iso8859-1. Call lm_write. To convert encoding, please use lm_write_advance.
model | In: the pointer LM we want to output | |
outputfile | In: the output file name | |
filename | In: the LM file name | |
fmt | In: LM file format, it is now either "TXT" or "DMP" |
Referenced by main().
int32 lm_write_advance | ( | lm_t * | model, | |
const char * | outputfile, | |||
const char * | filename, | |||
const char * | fmt, | |||
const char * | inputenc, | |||
char * | outputenc | |||
) |
Writing of an LM file with advanced options such as encoding support. Called by lm_write.
fmt now could be TXT, DMP, FST
inputenc and outputenc could now be iso8859-1, gb2312-hex, gb2312. Not every pair of conversion works.
Current input/output encodings support list. 0: iso8859-1 1: gb2312-hex 2: gb2312
-: do nothing n: doesn't make sense or not compatible x: not supported yet y: supported
i\o 0 1 2
0   - n n
1   n - y
2   n x -
When we have 4 encoding types: This document should be implemented as a data structure.
This conversion table is copied from encoding.c, please take a look the latest support in encoding.c
model | In: the pointer LM we want to output | |
outputfile | In: the output file name | |
filename | In: the LM file name | |
fmt | In: LM file format, it is now either "TXT", "DMP", "FST" | |
inputenc | In: Input encoding type | |
outputenc | Out: Output encoding type |
Add a new lm into the lmset. Notice that lms->n_lm will be added by 1
lms | In/Out : The set of LM | |
lm | In : The input LM | |
lmname | In: The lm name |
void lmset_delete_lm | ( | lmset_t * | lms, | |
const char * | lmname | |||
) |
Delete a LM with lmname. Notice that lms->n_lm will be subtracted by 1
lms | In/Out : The set of LM | |
lmname | The lm name |
S3DECODER_EXPORT void lmset_free | ( | lmset_t * | lms | ) |
Free the lmset data structure
lms | In: The set of LM |
Get an LM by index.
lms | In: The set of LM | |
lmidx | In: LM index |
Get an LM by name
lms | In: The set of LM | |
lmname | In: The LM name |
char* lmset_idx_to_name | ( | lmset_t * | lms, | |
int32 | lmidx | |||
) |
Convert index to name
lms | In: The set of LM | |
lmidx | In: LM index |
S3DECODER_EXPORT lmset_t* lmset_init | ( | const char * | lmfile, | |
const char * | lmctlfile, | |||
const char * | ctl_lm, | |||
const char * | lmname, | |||
const char * | lmdumpdir, | |||
float32 | lw, | |||
float32 | wip, | |||
float32 | uw, | |||
dict_t * | dict, | |||
logmath_t * | logmath | |||
) |
A wrapper function of controlling the behavior of LM initialization
(ARCHAN 20050617) lmset_init controls the behavior how the lmset which is an array of lm was initialized by different command-line arguments. lmfile and lmctlfile are mutually exclusive. Each will invoke one reading functions.
In the case of -lmfile is specified. A lmset with one single lm (or lmset->n_lm=1) will be returned. The single lm's name will be called lmname.
In the case of -lmctlfile is specified. A lmset with multiple lms will be returned. The number of lm will depend on the number of lm specified by -lmctlfile. For the format, please read the current format of -lmctlfile in lm.c
ctl_lm is the equivalent of -ctl for lm. When -ctl_lm is not specified in command-line (ctl_lm is NULL). Then either lm with name lmname will be used as the default lm. If lmname is NULL, then the first lm will be named as the "default"
lmdumpdir is currently not used. It is there for backward compatibility purpose.
lw,wip,uw are language weight, word insertion penalty and unigram weight. Their values are crucial to computation of the language model score. Therefore, the programmer is urged to carefully set these three values and also be careful of the order.
dict is assumed to be a pre-initialized dict_t structure which is used in deriving the mapping between the dictionary word and the lm words
ARCHAN 20050711: -lminmemory is the only global variable that controls the code, and we haven't explicitly specified it. Currently, if the LM is DMP, either -lminmemory=0 or -lminmemory=1 could be used. If the LM is txt-based, only -lminmemory=1 is accepted. (This will be changed in future.)
ARCHAN 20050705: A survival guide for this part of the code. Our language model code is unnecessarily complicated, mainly because the ways we specify class-based LMs and multiple LMs are inter-dependent. For example, one could specify a multiple LMs file (i.e. lmctlfile) and have no classes. However, if one would like to specify class information even with a single LM, one needs to use a multiple LM file format (i.e. lmctlfile).
This difficulty is well-observed in the period of Sphinx 3.4-3.6. That might imply that a new LM format is needed if we want to sustain this part of the development.
lmfile | The lm file name, lmfile and lmctlfile are mutually exclusive | |
lmctlfile | The file that specified multiple LMs and class information, lmfile and lmctlfile are mutually exclusive | |
ctl_lm | The control file that describes which lm to use for a particular utterance | |
lmname | The LM name to use if ctl_lm is not specified | |
lmdumpdir | Currently not used | |
lw | Language model weight | |
wip | Word insertion penalty | |
uw | Unigram weight | |
dict | A pre-initialized dict_t structure |
int32 lmset_name_to_idx | ( | lmset_t * | lms, | |
const char * | lmname | |||
) |
Convert name to index
lms | In: The set of LM | |
lmname | In: The LM name |
lmset_t* lmset_read_ctl | ( | const char * | ctlfile, | |
dict_t * | dict, | |||
float64 | lw, | |||
float64 | wip, | |||
float64 | uw, | |||
const char * | lmdumpdir, | |||
logmath_t * | logmath | |||
) |
Read the LM control file. **Usually**, it is also a class-based LM.
ctlfile | Control file name | |
dict | In: Dictionary | |
lw | In: Language weight | |
wip | In: Word insertion penalty | |
uw | In: Unigram weight | |
lmdumpdir | In: LMdumpdir |
lmset_t* lmset_read_lm | ( | const char * | lmfile, | |
dict_t * | dict, | |||
const char * | lmname, | |||
float64 | lw, | |||
float64 | wip, | |||
float64 | uw, | |||
const char * | lmdumpdir, | |||
logmath_t * | logmath | |||
) |
Read a single LM into the lmset.
lmfile | In: The LM file | |
dict | In: A pre-initialized dictionary file | |
lmname | In: The LM name | |
lw | The language weight | |
wip | The word insertion penalty | |
uw | The unigram weight | |
lmdumpdir | In: LM dump dir |
void lmset_set_curlm_widx | ( | lmset_t * | lms, | |
int32 | lmidx | |||
) |
Set the current LM with index
lms | In: The set of LM | |
lmidx | In: LM index |
S3DECODER_EXPORT void lmset_set_curlm_wname | ( | lmset_t * | lms, | |
const char * | lmname | |||
) |
Set the current LM with name
lms | In: The set of LM | |
lmname | In: The LM name |
ug_t* NewUnigramTable | ( | int32 | n_ug | ) |
Create a new unigram table
n_ug | Number of unigram |
void swap_bg | ( | bg_t * | bg | ) |
Swap 16 bits bigram
void swap_bg32 | ( | bg32_t * | bg | ) |
Swap 32 bits bigram
void swap_tg | ( | tg_t * | tg | ) |
Swap 16 bits trigram
void swap_tg32 | ( | tg32_t * | tg | ) |
Swap 32 bits trigram
void tg32_write | ( | FILE * | fp, | |
tg32_t * | tg | |||
) |
Write of TG (32bits) structure
fp | A file pointer | |
tg | A pointer of the tg32_t structure |
void tg_write | ( | FILE * | fp, | |
tg_t * | tg | |||
) |
Write of TG structure
fp | A file pointer | |
tg | A pointer of the tg_t structure |