lm.h File Reference

Language model. More...

#include <stdio.h>
#include <logmath.h>
#include <hash_table.h>
#include <cmd_ln.h>
#include "s3types.h"
#include "lmclass.h"
#include "dict.h"

Go to the source code of this file.

Classes

struct  lmlog_t
 Log quantities represented in either floating or integer format. More...
struct  sorted_entry_s
struct  sorted_list_t
 The sorted list used in LM reading. list is a (64K long) array. The first entry is the root of the tree and is created during initialization. More...
struct  ug_t
 A unigram structure Please see. More...
struct  bg_t
 A bigram structure. More...
struct  bg32_t
 A bigram structure which has 32 bits. More...
struct  tg_t
 A trigram structure. More...
struct  tg32_t
 A 32 bits version of tg_t. More...
struct  membg_t
 Management of in-memory bigrams. Not used if all bigrams in memory. More...
struct  membg32_t
 A 32 bits version of membg_t. More...
struct  tginfo_s
struct  tginfo32_s
struct  lm_tgcache_entry_t
struct  lm_tgcache_entry32_t
struct  lm_s
struct  lmset_s
struct  wordprob_t
 Generic structure that could be used at any n-gram level. More...

Defines

#define LM_DICTWID_BADMAP   -16000
#define LM_CLASSID_BASE   0x01000000
#define LM_LEGACY_CONSTANT   BAD_S3LMWID
#define LM_SPHINX_CONSTANT   BAD_S3LMWID32
#define LM_CLASSID_TO_CLASS(m, i)   ((m)->lmclass[(i)-LM_CLASSID_BASE])
#define MIN_PROB_F   -99.0
#define LM_ALLOC_BLOCK   16
#define LM_SUCCESS   1
#define LM_FAIL   0
#define LM_NOT_FOUND   -1
#define LM_OFFSET_TOO_LARGE   -2
#define LM_NO_DATA_MARK   -3
#define LM_UNKNOWN_NG   -4
#define LM_BAD_LM_COUNT   -5
#define LM_UNKNOWN_WORDS   -6
#define LM_BAD_BIGRAM   -7
#define LM_BAD_TRIGRAM   -8
#define LM_BAD_QUADGRAM   -9
#define LM_BAD_QUINGRAM   -10
#define LM_BAD_NGRAM   -11
#define LM_TOO_MANY_NGRAM   -12
#define LM_NO_MINUS_1GRAM   -13
#define LM_FILE_NOT_FOUND   -14
#define LM_CANNOT_ALLOCATE   -15
#define LMDMP_VERSIONNULL   0
#define LMDMP_VERSION_TG_16BIT   -1
#define LMDMP_VERSION_TG_16BIT_V2   -2
#define LMDMP_VERSION_TG_32BIT   -3
#define LMTXT_VERSION   1000
#define LMFST_VERSION   1001
#define LMFORCED_TXT32VERSION   1002
#define NO_WORD   -1
#define LOG2_BG_SEG_SZ   9
#define BG_SEG_SZ   (1 << (LOG2_BG_SEG_SZ))
#define LM_TGCACHE_SIZE   100003
#define lm_lmwid2dictwid(lm, u)   ((lm)->ug[u].dictwid)
#define lm_n_ug(lm)   ((lm)->n_ug)
#define lm_n_bg(lm)   ((lm)->n_bg)
#define lm_n_tg(lm)   ((lm)->n_tg)
#define lm_wordstr(lm, u)   ((lm)->wordstr[u])
#define lm_startwid(lm)   ((lm)->startlwid)
#define lm_finishwid(lm)   ((lm)->finishlwid)
#define lm_access_type(lm)   ((lm)->access_type)
#define LM_TGPROB(lm, tgptr)   ((lm)->tgprob[(tgptr)->probid].l)
#define LM_BGPROB(lm, bgptr)   ((lm)->bgprob[(bgptr)->probid].l)
#define LM_UGPROB(lm, ugptr)   ((ugptr)->prob.l)
#define LM_RAWSCORE(lm, score)   ((score - (lm)->wip) / ((lm)->lw))
#define LM_DICTWID(lm, lmwid)   ((lm)->ug[(lmwid)].dictwid)

Typedefs

typedef struct sorted_entry_s sorted_entry_t
typedef struct tginfo_s tginfo_t
typedef struct tginfo32_s tginfo32_t
typedef struct lm_s lm_t
typedef struct lmset_s lmset_t

Functions

S3DECODER_EXPORT lmset_t * lmset_init (const char *lmfile, const char *lmctlfile, const char *ctl_lm, const char *lmname, const char *lmdumpdir, float32 lw, float32 wip, float32 uw, dict_t *dict, logmath_t *logmath)
lmset_t * lmset_read_lm (const char *lmfile, dict_t *dict, const char *lmname, float64 lw, float64 wip, float64 uw, const char *lmdumpdir, logmath_t *logmath)
lmset_t * lmset_read_ctl (const char *ctlfile, dict_t *dict, float64 lw, float64 wip, float64 uw, const char *lmdumpdir, logmath_t *logmath)
lm_t * lmset_get_lm_widx (lmset_t *lms, int32 lmidx)
lm_t * lmset_get_lm_wname (lmset_t *lms, const char *lmname)
void lmset_set_curlm_widx (lmset_t *lms, int32 lmidx)
S3DECODER_EXPORT void lmset_set_curlm_wname (lmset_t *lms, const char *lmname)
int32 lmset_name_to_idx (lmset_t *lms, const char *lmname)
char * lmset_idx_to_name (lmset_t *lms, int32 lmidx)
void lmset_add_lm (lmset_t *lms, lm_t *lm, const char *lmname)
void lmset_delete_lm (lmset_t *lms, const char *lmname)
S3DECODER_EXPORT void lmset_free (lmset_t *lms)
int32 lm_tglist (lm_t *lmp, s3lmwid32_t w1, s3lmwid32_t w2, tg_t **tg, int32 *bowt)
int32 lm_tg32list (lm_t *lmp, s3lmwid32_t w1, s3lmwid32_t w2, tg32_t **tg, int32 *bowt)
int32 lm_bglist (lm_t *lmp, s3lmwid32_t w, bg_t **bg, int32 *bowt)
int32 lm_bg32list (lm_t *lmp, s3lmwid32_t w, bg32_t **bg, int32 *bowt)
s3lmwid32_t lm_wid (lm_t *lm, const char *wd)
void lm_null_struct (lm_t *lm)
int32 lm_ug_wordprob (lm_t *lm, dict_t *dict, int32 th, wordprob_t *wp)
int32 lm_uglist (lm_t *lmp, ug_t **ug)
int32 lm_ug_score (lm_t *lmp, s3lmwid32_t lwid, s3wid_t wid)
int32 lm_ug_exists (lm_t *lm, s3lmwid32_t lwid)
int32 lm_bg_score (lm_t *lmp, s3lmwid32_t lw1, s3lmwid32_t lw2, s3wid_t w2)
int32 lm_bg_exists (lm_t *lm, s3lmwid32_t lw1, s3lmwid32_t lw2)
int32 lm_tg_score (lm_t *lmp, s3lmwid32_t lw1, s3lmwid32_t lw2, s3lmwid32_t lw3, s3wid_t w3)
int32 lm_tg_exists (lm_t *lm, s3lmwid32_t lw1, s3lmwid32_t lw2, s3lmwid32_t lw3)
void lm_set_param (lm_t *lm, float64 lw, float64 wip)
S3DECODER_EXPORT int32 lm_rawscore (lm_t *lm, int32 score)
S3DECODER_EXPORT void lm_cache_reset (lm_t *lmp)
S3DECODER_EXPORT void lm_cache_stats_dump (lm_t *lmp)
lm_t * lm_read (const char *file, const char *lmname, cmd_ln_t *config, logmath_t *logmath)
lm_t * lm_read_advance (const char *file, const char *lmname, float64 lw, float64 wip, float64 uw, int32 ndict, const char *fmt, int32 applyweight, logmath_t *logmath)
S3DECODER_EXPORT lm_t * lm_read_advance2 (const char *file, const char *lmname, float64 lw, float64 wip, float64 uw, int32 ndict, const char *fmt, int32 applyweight, int lminmemory, logmath_t *logmath)
S3DECODER_EXPORT int32 lm_write (lm_t *model, const char *outputfile, const char *filename, const char *fmt)
int32 lm_write_advance (lm_t *model, const char *outputfile, const char *filename, const char *fmt, const char *inputenc, char *outputenc)
S3DECODER_EXPORT void lm_free (lm_t *lm)
int32 lm_add_wordlist (lm_t *lm, dict_t *dict, const char *filename)
int32 lm_add_word_to_ug (lm_t *lm, dict_t *dict, const char *newword)
int32 lm_get_classid (lm_t *model, const char *name)
void lm_convert_structure (lm_t *model, int32 is32bits)
int32 lm_is32bits (lm_t *model)
void ug_write (FILE *fp, ug_t *ug)
void bg_write (FILE *fp, bg_t *bg)
void bg32_write (FILE *fp, bg32_t *bg)
void tg_write (FILE *fp, tg_t *tg)
void tg32_write (FILE *fp, tg32_t *tg)
void copy_bg_to_bg32 (lm_t *lm)
void copy_bg32_to_bg (lm_t *lm)
void copy_tg_to_tg32 (lm_t *lm)
void copy_tg32_to_tg (lm_t *lm)
void swap_bg (bg_t *bg)
void swap_bg32 (bg32_t *bg)
void swap_tg (tg_t *tg)
void swap_tg32 (tg32_t *tg)
int32 find_bg (bg_t *bg, int32 n, s3lmwid32_t w)
int32 find_bg32 (bg32_t *bg, int32 n, s3lmwid32_t w)
int32 find_tg (tg_t *tg, int32 n, s3lmwid32_t w)
int32 find_tg32 (tg32_t *tg, int32 n, s3lmwid32_t w)
ug_t * NewUnigramTable (int32 n_ug)

Detailed Description

Language model.

This is the header file for language model support in Sphinx 3. Sphinx 3 supports language models in 4 formats. The four formats are:

ARPA format: First appeared in Sphinx 2. We ported it to Sphinx 3 in 3.X (X=6)

DMP: Sphinx 3 slow and fast used it, and so do later versions of Sphinx 3.X (X>4)

DMP32: We start to break the limit of 65535 words. This is the first LM file format in Sphinx 3.X that can capture 4 billion words in the language model

FST: AT&T format; we started to support it in 3.X (X=6).

As of 20060302, we can only read and use ARPA and DMP-based formats in the decoder. We can write ARPA, DMP, DMP32 and FST file formats.


Define Documentation

#define BG_SEG_SZ   (1 << (LOG2_BG_SEG_SZ))
#define lm_access_type ( lm   )     ((lm)->access_type)
#define LM_ALLOC_BLOCK   16
#define LM_BAD_BIGRAM   -7

A bad bigram, it could be word ids larger than # of unigram, it could be word id smaller than 0. It could also be bigram out of bound.

#define LM_BAD_LM_COUNT   -5

When reading LM, if count is bad, return this msg

#define LM_BAD_NGRAM   -11

(RESERVED BUT NOT USED) A bad n-gram. generalization of message -7 to -10. In our case, we don't make the message as specific as possible.

#define LM_BAD_QUADGRAM   -9

(RESERVED BUT NOT USED) A bad quadgram (4-gram), it could be word ids larger than # of unigram, it could be word id smaller than 0. It could also be bigram out of bound.

#define LM_BAD_QUINGRAM   -10

(RESERVED BUT NOT USED) A bad quingram (5-gram), it could be word ids larger than # of unigram, it could be word id smaller than 0. It could also be bigram out of bound. BTW, there is no need to remind me that the mixed use of quadgram and quingram is stupid English. I read Manning and Schütze.

#define LM_BAD_TRIGRAM   -8

A bad trigram, it could be word ids larger than # of unigram, it could be word id smaller than 0. It could also be bigram out of bound.

#define LM_BGPROB ( lm,
bgptr   )     ((lm)->bgprob[(bgptr)->probid].l)
#define LM_CANNOT_ALLOCATE   -15

When cannot allocate tables in LM return this message

#define LM_CLASSID_BASE   0x01000000
#define LM_CLASSID_TO_CLASS ( m,
i   )     ((m)->lmclass[(i)-LM_CLASSID_BASE])
#define LM_DICTWID ( lm,
lmwid   )     ((lm)->ug[(lmwid)].dictwid)
#define LM_DICTWID_BADMAP   -16000
#define LM_FAIL   0

Constant that define an operation failed.

#define LM_FILE_NOT_FOUND   -14

When couldn't find the LM file, return this message

#define lm_finishwid ( lm   )     ((lm)->finishlwid)
#define LM_LEGACY_CONSTANT   BAD_S3LMWID

Upper limit of the number of words in Sphinx 3.X = 65535 (~65k). This was introduced in 1996 when Ravi first wrote Sphinx 3.0. It has been with us since.

#define lm_lmwid2dictwid ( lm,
u   )     ((lm)->ug[u].dictwid)

Access macros; not meant for arbitrary use

#define lm_n_bg ( lm   )     ((lm)->n_bg)
#define lm_n_tg ( lm   )     ((lm)->n_tg)
#define lm_n_ug ( lm   )     ((lm)->n_ug)
#define LM_NO_DATA_MARK   -3

When reading text-based LM, return this if we see no data mark

#define LM_NO_MINUS_1GRAM   -13

When reading n-gram, if the corresponding (n-1)-gram doesn't exist, return this message.

#define LM_NOT_FOUND   -1

Constant which indicates an LM couldn't be found

#define LM_OFFSET_TOO_LARGE   -2

Constant where the 16 bit LM was used, but the tgcount is larger than LM_LEGACY_CONSTANT (65535). This breaks the addressing scheme in the current LM.

#define LM_RAWSCORE ( lm,
score   )     ((score - (lm)->wip) / ((lm)->lw))
#define LM_SPHINX_CONSTANT   BAD_S3LMWID32

(4 billion), ARCHAN: this was introduced in Sphinx 3.6 during the time of Release Candidate I (2006 March). The caveat of using this constant is that it is much harder to detect byte-swapping problems in general. Also, if the world has more than 10000 cities, each with 1 million road names, we are stuck in this case. I assume this will happen in year 3001.

#define lm_startwid ( lm   )     ((lm)->startlwid)
#define LM_SUCCESS   1

Success and error messages. Constant that indicates an operation succeeded

#define LM_TGCACHE_SIZE   100003
#define LM_TGPROB ( lm,
tgptr   )     ((lm)->tgprob[(tgptr)->probid].l)
#define LM_TOO_MANY_NGRAM   -12

When reading LM, if the number of n-grams is more than the number specified in the header, return this message

#define LM_UGPROB ( lm,
ugptr   )     ((ugptr)->prob.l)

Referenced by word_trans().

#define LM_UNKNOWN_NG   -4

When reading the header of LM, if there is unknown K for K-gram

#define LM_UNKNOWN_WORDS   -6

When an unknown word is found during LM reading, return this message

#define lm_wordstr ( lm,
u   )     ((lm)->wordstr[u])
#define LMDMP_VERSION_TG_16BIT   -1

VERSION 1 is the simplest DMP file which is trigram or lower which used 16 bits in bigram and trigram.

#define LMDMP_VERSION_TG_16BIT_V2   -2

VERSION 2 means legacy VERSION 1 DMP file which has log_bg_seg_sz != 9

#define LMDMP_VERSION_TG_32BIT   -3

VERSION 3 is the 32 bit extension of VERSION 1 but the bigram and trigram are represented by 32 bits data structure

#define LMDMP_VERSIONNULL   0

Versioning of LM. VERSION 0 is the oldest. In the past, we used the version number to store the number of unigrams; you will see logic that says vn > LMDMP_VERSIONNULL

#define LMFORCED_TXT32VERSION   1002

VERSION 1002 is the internal version of text-based LM. The difference between 1002 and 1000 is that 1002 will assume LM is 32bits. This fact is used in lm_is32bits(lm)

#define LMFST_VERSION   1001

VERSION 1001 is the FST-based LM

#define LMTXT_VERSION   1000

VERSION 1000 is the text-based LM

#define LOG2_BG_SEG_SZ   9
#define MIN_PROB_F   -99.0

The minimum value of probabilities and backoff weights. When changing, notice that both s2 and s3 may transform this number to very small integer (say -2e-31) This will easily cause integer wrap around. -99 is chosen for that reason.

#define NO_WORD   -1

Typedef Documentation

typedef struct lm_s lm_t
typedef struct lmset_s lmset_t
typedef struct tginfo32_s tginfo32_t
typedef struct tginfo_s tginfo_t

Function Documentation

void bg32_write ( FILE *  fp,
bg32_t bg 
)

Write of BG (32bits) structure

Parameters:
fp A file pointer
bg A pointer of the bg32_t structure
void bg_write ( FILE *  fp,
bg_t bg 
)

Write of BG structure

Parameters:
fp A file pointer
bg A pointer of the bg_t structure
void copy_bg32_to_bg ( lm_t lm  ) 

Convert the 32 bit bigram structure to 16 bit

Parameters:
lm LM
void copy_bg_to_bg32 ( lm_t lm  ) 

Convert the 16 bit bigram structure to 32 bit

Parameters:
lm LM
void copy_tg32_to_tg ( lm_t lm  ) 

Convert the 32 bit trigram structure to 16 bit

Parameters:
lm LM
void copy_tg_to_tg32 ( lm_t lm  ) 

Convert the 16 bit trigram structure to 32 bit

Parameters:
lm LM
int32 find_bg ( bg_t bg,
int32  n,
s3lmwid32_t  w 
)
Parameters:
bg In: The bigram
int32 find_bg32 ( bg32_t bg,
int32  n,
s3lmwid32_t  w 
)
Parameters:
bg In: The bigram
int32 find_tg ( tg_t tg,
int32  n,
s3lmwid32_t  w 
)
Parameters:
tg In: The trigram
int32 find_tg32 ( tg32_t tg,
int32  n,
s3lmwid32_t  w 
)
Parameters:
tg In: The trigram
int32 lm_add_word_to_ug ( lm_t lm,
dict_t dict,
const char *  newword 
)

Add a word to the LM

Look up the dictionary and see whether the word exists in it. This is similar to wid.c's logic at this point.

(Incomplete!) Not fully tested in the situation for on-line recognition.

We also avoid the addition of classes at this point because that could complicate things quite a lot.

Parameters:
lm In/Out: a modified LM structure
dict In: an initialized dictionary structure Used to update lmwid2dictid mapping.
newword In: a pointer of a new word
int32 lm_add_wordlist ( lm_t lm,
dict_t dict,
const char *  filename 
)

Add a word list to the LM. For each word in the file, call lm_add_wordlist. The file is assumed to have a format like this: <word1> <word2> <word3> <word4>

If the lmwid2dictid mapping is not updated, or the dictionary itself is not used in the context, just specify dict=NULL.

Parameters:
lm In/Out: a modified LM structure
dict In: an initialized dictionary structure. Used to update lmwid2dictid mapping.
filename In: a file that contains a list of word one wants to add
int32 lm_bg32list ( lm_t lmp,
s3lmwid32_t  w,
bg32_t **  bg,
int32 *  bowt 
)
Parameters:
lmp In: LM being queried
w In: LM word id of the 1-word history
bg Out: *bg = array of bigrams for w
bowt Out: *bowt = backoff-weight for w
int32 lm_bg_exists ( lm_t lm,
s3lmwid32_t  lw1,
s3lmwid32_t  lw2 
)

Whether a certain bigram exists.

Parameters:
lm In: LM
int32 lm_bg_score ( lm_t lmp,
s3lmwid32_t  lw1,
s3lmwid32_t  lw2,
s3wid_t  w2 
)
Parameters:
lmp In: LM being queried
int32 lm_bglist ( lm_t lmp,
s3lmwid32_t  w,
bg_t **  bg,
int32 *  bowt 
)

Return the bigram followers for the given word w. Return value: #bigrams in returned list.

Parameters:
lmp In: LM being queried
w In: LM word id of the 1-word history
bg Out: *bg = array of bigrams for w
bowt Out: *bowt = backoff-weight for w
S3DECODER_EXPORT void lm_cache_reset ( lm_t lmp  ) 

LM cache related

Parameters:
lmp In: the LM
S3DECODER_EXPORT void lm_cache_stats_dump ( lm_t lmp  ) 

LM cache statistic dumping

Parameters:
lmp In: the LM
void lm_convert_structure ( lm_t model,
int32  is32bits 
)

Explicitly convert structure from 16bit -> 32bit or 32bit -> 16bit.

Parameters:
model In: LM file being used
S3DECODER_EXPORT void lm_free ( lm_t lm  ) 

Deallocate the language model.

Parameters:
lm In: a LM structure

Referenced by main().

int32 lm_get_classid ( lm_t model,
const char *  name 
)

Get class ID given a LM.

Parameters:
model In: LM file being queried
name In: The name of the class
int32 lm_is32bits ( lm_t model  ) 

Check whether the model is operating at 32 bits

void lm_null_struct ( lm_t lm  ) 

Set all pointers to NULL in the lm

S3DECODER_EXPORT int32 lm_rawscore ( lm_t lm,
int32  score 
)
Parameters:
lm In: the LM
lm_t* lm_read ( const char *  file,
const char *  lmname,
cmd_ln_t *  config,
logmath_t *  logmath 
)

A simple version of reading in a LM

lm_read is a simple version of lm_read_advance. It will assume language weight, word insertion penalty and unigram weight to be automatically applied. There is also no class-based LM (so ndict=0). Format is set to NULL, so the program will determine it automatically.

Parameters:
file In: LM file being read
lmname In: LM name
lm_t* lm_read_advance ( const char *  file,
const char *  lmname,
float64  lw,
float64  wip,
float64  uw,
int32  ndict,
const char *  fmt,
int32  applyweight,
logmath_t *  logmath 
)

Read an LM file, it will automatically decide whether the file is a DUMP file or a txt file. Then call lm_read_txt and lm_read_dump (non-public functions) correspondingly. Currently the code is not aware about OOV.

lw, wip, uw and ndict are mainly used for recognition purposes. When lm_read is used for other purposes, one could just use dummy settings; the recommended ones are lw=1.0, wip=0.1, uw=1.0 and ndict=0. These are very useful when lm_read is just used for reading the LM.

If applyweight is 0, lw, wip and uw will not be applied to the LM at all. This will allow users to just call the LM routine without initializing other modules (such as logs3_init).

If applyweight is 1, then logs3_init must be called before lm_read. This is usually the case when kb_init is called before the code.

fmt can now be "TXT", "DMP", "TXT32" or just NULL. If it is NULL, the LM format will be automatically determined. If it is specified as "TXT" or "DMP", the corresponding lm reader will be called. In such a case, it is important for users to know what they are doing. (Unfortunately, this is mostly not true.) In the case of "TXT32", a text LM will be forced to 32bit mode.

ndict is the dictionary size of the application. This is needed because class-based LM are addressed in the dictionary wid-space instead of lm wid-space. If class-based LM is not used, just set this to zero.

Note: there are two defense mechanisms of lm_read_advance. First of all, if no fmt is specified, it will start to read the lm in the order of DMP->TXT. Second, if txt format is specified but LM is found to hit the 16bit legacy segments limit, it will automatically switch to read TXT32 LM

Returns:
pointer to LM structure created.
Parameters:
file In: LM file being read
lmname In: LM name
lw In: Language weight
wip In: Word insertion penalty
uw In: Unigram weight (interpolation with uniform distr.)
ndict In: Number of dictionary entry. We need that because class-based LM is addressed in dictionary word ID space.
fmt In: file format of the LM; it is now either "TXT", "DMP" or NULL. If NULL, the file format is automatically determined
applyweight In: whether lw,wip, uw should be applied to the lm or not
S3DECODER_EXPORT lm_t* lm_read_advance2 ( const char *  file,
const char *  lmname,
float64  lw,
float64  wip,
float64  uw,
int32  ndict,
const char *  fmt,
int32  applyweight,
int  lminmemory,
logmath_t *  logmath 
)
Parameters:
file In: LM file being read
lmname In: LM name
lw In: Language weight
wip In: Word insertion penalty
uw In: Unigram weight (interpolation with uniform distr.)
ndict In: Number of dictionary entry. We need that because class-based LM is addressed in dictionary word ID space.
fmt In: file format of the LM; it is now either "TXT", "DMP" or NULL. If NULL, the file format is automatically determined
applyweight In: whether lw,wip, uw should be applied to the lm or not
lminmemory In: Whether LM is read into memory

Referenced by main().

void lm_set_param ( lm_t lm,
float64  lw,
float64  wip 
)

Set the language-weight and insertion penalty parameters for the LM, after revoking any earlier set of such parameters.

WARNING!! This function doesn't prevent underflow of values. Make sure you call safe lm2logs3 before it.

Parameters:
lm In: the LM
lw In: the language weight
wip In: the word insertion penalty
int32 lm_tg32list ( lm_t lmp,
s3lmwid32_t  w1,
s3lmwid32_t  w2,
tg32_t **  tg,
int32 *  bowt 
)
Parameters:
lmp In: LM being queried
w1 In: LM word id of the first of a 2-word history
w2 In: LM word id of the second of the 2-word history
tg Out: *tg = array of trigrams for <w1,w2>
bowt Out: *bowt = backoff-weight for <w1, w2>
int32 lm_tg_exists ( lm_t lm,
s3lmwid32_t  lw1,
s3lmwid32_t  lw2,
s3lmwid32_t  lw3 
)

Whether a certain trigram exists.

Parameters:
lm In: LM
int32 lm_tg_score ( lm_t lmp,
s3lmwid32_t  lw1,
s3lmwid32_t  lw2,
s3lmwid32_t  lw3,
s3wid_t  w3 
)

Return trigram score for the given three word sequence. If lw1 is BAD_LMWID(lm), return lm_bg_score (lw2, lw3). If both lw1 and lw2 are BAD_LMWID(lm), return lm_ug_score (lw3).

20040227: This also accounts for the in-class probability of w3.

Parameters:
lmp In: LM being queried
int32 lm_tglist ( lm_t lmp,
s3lmwid32_t  w1,
s3lmwid32_t  w2,
tg_t **  tg,
int32 *  bowt 
)

Return trigram followers for given two words. Both w1 and w2 must be valid. Return value: #trigrams in returned list.

Parameters:
lmp In: LM being queried
w1 In: LM word id of the first of a 2-word history
w2 In: LM word id of the second of the 2-word history
tg Out: *tg = array of trigrams for <w1,w2>
bowt Out: *bowt = backoff-weight for <w1, w2>
int32 lm_ug_exists ( lm_t lm,
s3lmwid32_t  lwid 
)
Parameters:
lm LM
lwid LM ID for the word
int32 lm_ug_score ( lm_t lmp,
s3lmwid32_t  lwid,
s3wid_t  wid 
)

Return unigram score for the given word

Parameters:
lmp In: LM being queried
lwid LM ID for the word
wid Dict ID for the word
int32 lm_ug_wordprob ( lm_t lm,
dict_t dict,
int32  th,
wordprob_t wp 
)

Like lm_bg_wordprob, but for unigrams. Return value: #entries filled in the wordprob array.

Parameters:
lm In: LM being queried
dict In : The dictionary
wp In/out: Array to be filled
int32 lm_uglist ( lm_t lmp,
ug_t **  ug 
)

Return the unigrams in LM. Return value: #unigrams in returned list.

Parameters:
lmp In: LM being queried
ug Out: *ug = unigram array

Referenced by word_trans().

s3lmwid32_t lm_wid ( lm_t lm,
const char *  wd 
)
S3DECODER_EXPORT int32 lm_write ( lm_t model,
const char *  outputfile,
const char *  filename,
const char *  fmt 
)

Simple writing of an LM file; the input and output encodings are assumed to be iso8859-1. Calls lm_write_advance. To convert encodings, please use lm_write_advance directly.

Parameters:
model In: the pointer LM we want to output
outputfile In: the output file name
filename In: the LM file name
fmt In: LM file format, it is now either "TXT" or "DMP"

Referenced by main().

int32 lm_write_advance ( lm_t model,
const char *  outputfile,
const char *  filename,
const char *  fmt,
const char *  inputenc,
char *  outputenc 
)

Writing of an LM file with advanced options such as encoding support. Called by lm_write.

fmt now could be TXT, DMP, FST

inputenc and outputenc could now be iso8859-1, gb2312-hex, gb2312. Not every pair of conversion works.

Current input/output encodings support list.
0: iso8859-1
1: gb2312-hex
2: gb2312

-: do nothing
n: doesn't make sense or not compatible
x: not supported yet
y: supported

i\o 0 1 2
0   - n n
1   n - y
2   n x -

When we have 4 encoding types: This document should be implemented as a data structure.

This conversion table is copied from encoding.c; please take a look at the latest support in encoding.c

Parameters:
model In: the pointer LM we want to output
outputfile In: the output file name
filename In: the LM file name
fmt In: LM file format, it is now either "TXT", "DMP", "FST"
inputenc In: Input encoding type
outputenc Out: Output encoding type
void lmset_add_lm ( lmset_t lms,
lm_t lm,
const char *  lmname 
)

Add a new lm into the lmset. Notice that lms->n_lm will be incremented by 1

Parameters:
lms In/Out : The set of LM
lm In : The input LM
lmname In: The lm name
void lmset_delete_lm ( lmset_t lms,
const char *  lmname 
)

Delete an LM with lmname. Notice that lms->n_lm will be decremented by 1

Parameters:
lms In/Out : The set of LM
lmname The lm name
S3DECODER_EXPORT void lmset_free ( lmset_t lms  ) 

Free the lmset data structure

Parameters:
lms In: The set of LM
lm_t* lmset_get_lm_widx ( lmset_t lms,
int32  lmidx 
)

Get an LM by index.

Parameters:
lms In: The set of LM
lmidx In: LM index
lm_t* lmset_get_lm_wname ( lmset_t lms,
const char *  lmname 
)

Get an LM by name

Returns:
a pointer of the LM with name lmname
Parameters:
lms In: The set of LM
lmname In: The LM name
char* lmset_idx_to_name ( lmset_t lms,
int32  lmidx 
)

Convert index to name

Returns:
a pointer of the name string. No memory is allocated.
Parameters:
lms In: The set of LM
lmidx In: LM index
S3DECODER_EXPORT lmset_t* lmset_init ( const char *  lmfile,
const char *  lmctlfile,
const char *  ctl_lm,
const char *  lmname,
const char *  lmdumpdir,
float32  lw,
float32  wip,
float32  uw,
dict_t dict,
logmath_t *  logmath 
)

A wrapper function of controlling the behavior of LM initialization

(ARCHAN 20050617) lmset_init controls how the lmset, which is an array of lm, is initialized by different command-line arguments. lmfile and lmctlfile are mutually exclusive. Each will invoke one reading function.

In the case that -lmfile is specified, an lmset with one single lm (or lmset->n_lm=1) will be returned. The single lm's name will be lmname.

In the case that -lmctlfile is specified, an lmset with multiple lms will be returned. The number of lms will depend on the number of lms specified by -lmctlfile. For the format, please read the current format of -lmctlfile in lm.c

ctl_lm is the equivalent of -ctl for lm. When -ctl_lm is not specified on the command line (ctl_lm is NULL), the lm with name lmname will be used as the default lm. If lmname is NULL, then the first lm will be named as the "default"

lmdumpdir is currently not used. It is there for backward compatibility purpose.

lw, wip, uw are language weight, word insertion penalty and unigram weight. Their values are crucial to computation of the language model score. Therefore, the programmer is urged to carefully set these three values and also be careful of the order.

dict is assumed to be a pre-initialized dict_t structure which is used in deriving the mapping between the dictionary word and the lm words

ARCHAN 20050711: -lminmemory is the only global variable that controls the code and we haven't explicitly specified it. Currently, if the LM is DMP, either -lminmemory=0 or -lminmemory=1 could be used. If the LM is txt-based, only -lminmemory=1 is accepted. (This will be changed in future.)

ARCHAN 20050705: A survival guide for this part of the code. Our language model code is unnecessarily complicated, mainly because the ways we specify class-based LMs and multiple LMs are inter-dependent. For example, one could specify a multiple LMs file (i.e. lmctlfile) and have no classes. However, if one would like to specify class information even with a single LM, one needs to use a multiple LM file format (i.e. lmctlfile).

This difficulty is well-observed in the period of Sphinx 3.4-3.6. That might imply that a new LM format is needed if we want to sustain this part of the development.

Parameters:
lmfile The lm file name, lmfile and lmctlfile are mutually exclusive
lmctlfile The file that specified multiple LMs and class information, lmfile and lmctlfile are mutually exclusive
ctl_lm The control file that describes which lm to use for a particular utterance
lmname The LM name to use if ctl_lm is not specified
lmdumpdir Currently not used
lw Language model weight
wip Word insertion penalty
uw Unigram weight
dict A pre-initialized dict_t structure
int32 lmset_name_to_idx ( lmset_t lms,
const char *  lmname 
)

Convert name to index

Parameters:
lms In: The set of LM
lmname In: The LM name
lmset_t* lmset_read_ctl ( const char *  ctlfile,
dict_t dict,
float64  lw,
float64  wip,
float64  uw,
const char *  lmdumpdir,
logmath_t *  logmath 
)

Read the LM control file. **Usually**, it is also a class-based LM.

Parameters:
ctlfile Control file name
dict In: Dictionary
lw In: Language weight
wip In: Word insertion penalty
uw In: Unigram weight
lmdumpdir In: LMdumpdir
lmset_t* lmset_read_lm ( const char *  lmfile,
dict_t dict,
const char *  lmname,
float64  lw,
float64  wip,
float64  uw,
const char *  lmdumpdir,
logmath_t *  logmath 
)

Read a single LM into the lmset.

Parameters:
lmfile In: The LM file
dict In: A pre-initialized dictionary file
lmname In: The LM name
lw The language weight
wip The word insertion penalty
uw The unigram weight
lmdumpdir In: LM dump dir
void lmset_set_curlm_widx ( lmset_t lms,
int32  lmidx 
)

Set the current LM with index

Parameters:
lms In: The set of LM
lmidx In: LM index
S3DECODER_EXPORT void lmset_set_curlm_wname ( lmset_t lms,
const char *  lmname 
)

Set the current LM with name

Parameters:
lms In: The set of LM
lmname In: The LM name
ug_t* NewUnigramTable ( int32  n_ug  ) 

Create a new unigram table

Parameters:
n_ug Number of unigram
void swap_bg ( bg_t bg  ) 

Swap 16 bits bigram

void swap_bg32 ( bg32_t bg  ) 

Swap 32 bits bigram

void swap_tg ( tg_t tg  ) 

Swap 16 bits trigram

void swap_tg32 ( tg32_t tg  ) 

Swap 32 bits trigram

void tg32_write ( FILE *  fp,
tg32_t tg 
)

Write of TG (32bits) structure

Parameters:
fp A file pointer
tg A pointer of the tg32_t structure
void tg_write ( FILE *  fp,
tg_t tg 
)

Write of TG structure

Parameters:
fp A file pointer
tg A pointer of the tg_t structure
void ug_write ( FILE *  fp,
ug_t ug 
)

Write of UG structure

Parameters:
fp A file pointer
ug A pointer of the ug_t structure

Generated on 7 Mar 2010 by  doxygen 1.6.1