lm.h

Go to the documentation of this file.
00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
00002 /* ====================================================================
00003  * Copyright (c) 1999-2004 Carnegie Mellon University.  All rights
00004  * reserved.
00005  *
00006  * Redistribution and use in source and binary forms, with or without
00007  * modification, are permitted provided that the following conditions
00008  * are met:
00009  *
00010  * 1. Redistributions of source code must retain the above copyright
00011  *    notice, this list of conditions and the following disclaimer. 
00012  *
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in
00015  *    the documentation and/or other materials provided with the
00016  *    distribution.
00017  *
00018  * This work was supported in part by funding from the Defense Advanced 
00019  * Research Projects Agency and the National Science Foundation of the 
00020  * United States of America, and the CMU Sphinx Speech Consortium.
00021  *
00022  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 
00023  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
00024  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00025  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
00026  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00027  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
00028  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
00029  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
00030  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
00031  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
00032  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00033  *
00034  * ====================================================================
00035  *
00036  */
00037 /*
00038  * lm.h - Disk/memory based word-trigram backoff LM
00039  *
00040  * **********************************************
00041  * CMU ARPA Speech Project
00042  *
00043  * Copyright (c) 1997 Carnegie Mellon University.
00044  * ALL RIGHTS RESERVED.
00045  * **********************************************
00046  *
00047  * HISTORY
00048  * $Log: lm.h,v $
00049  * Revision 1.16  2006/03/02 22:10:36  arthchan2003
00050  * Add *g_write into the code.
00051  *
00052  * Revision 1.15  2006/02/28 22:26:51  egouvea
00053  * Moved definition of lm_wid() outside of the #if 0/#endif block, so
00054  * it's declared.
00055  *
00056  * Revision 1.14  2006/02/24 13:38:08  arthchan2003
00057  * Added lm_read, it is a simple version of lm_read_advance.
00058  *
00059  * Revision 1.13  2006/02/23 04:16:29  arthchan2003
00060  * Merged from SPHINX3_5_2_RCI_IRII_BRANCH:
00061  * Splited the original lm.c into five parts,
00062  * a, lm.c - a controller of other subroutines.
00063  * b, lm_3g.c - implement TXT-based lm operations
00064  * c, lm_3g_dmp.c - implement DMP-based lm operations
00065  * d, lm_attfsm.c - implement FSM-based lm operations
00066  * e, lmset.c - implement sets of lm.
00067  *
00068  * Revision 1.12.4.3  2006/01/16 19:56:37  arthchan2003
00069  * 1, lm_rawscore doesn't need a language weight, 2, Support dumping the LM in FST format.  This code used Yannick Esteve's and LIUM code.
00070  *
00071  * Revision 1.12.4.2  2005/11/17 06:15:22  arthchan2003
00072  * Added input-encoding and output-encoding into the lm structure.
00073  *
00074  * Revision 1.12.4.1  2005/07/13 01:46:22  arthchan2003
00075  * 1, Fixed dox-doc, 2, Added more documentation for major functions such as lm_read and lm_write.
00076  *
00077  * Revision 1.12  2005/06/21 22:24:02  arthchan2003
00078  * Log. In this change, I introduced a new interface for lm ,which is
00079  * call lmset_t. lmset_t wraps up multiple lm, n_lm, n_alloclm into the
00080  * same structure and handle LM initialization (lm_init) switching,
00081  * (lmset_curlm_widx), delete LM (lmset_delete_lm).  The internal
00082  * structure is called lmarray and is an array of pointers of lm.  The
00083  * current lm is always maintained and pointed by a pointer called cur_lm
00084  * . This substantially clarify the structure of the code.  At this
00085  * check-in, not every core function of lmset is completed.
00086  * e.g. lmset_add_lm because that required testing of several LM reading
00087  * routines and could be quite time-consuming.
00088  *
00089  * Log. Another notable change is the fact dict2lmwid map is started to
00090  * be part of the LM. The reason of this is clearly described inside the
00091  * code. Don't want to repeat here.
00092  *
00093  * Log. The new interface has been already used broadly in both Sphinx
00094  * 3.0 and sphinx 3.x family of tools.
00095  *
00096  * Revision 1.5  2005/06/18 03:22:28  archan
00097  * Add lmset_init. A wrapper function of various LM initialization and initialize an lmset It is now used in decode, livepretend, dag and astar.
00098  *
00099  * Revision 1.4  2005/06/17 23:44:40  archan
00100  * Sphinx3 to s3.generic, 1, Support -lmname in decode and livepretend.  2, Wrap up the initialization of dict2lmwid to lm initialization. 3, add Dave's trick in LM switching in mode 4 of the search.
00101  *
00102  * Revision 1.3  2005/06/13 04:02:59  archan
00103  * Fixed most doxygen-style documentation under libs3decoder.
00104  *
00105  * Revision 1.2  2005/05/10 21:21:54  archan
00106  * Three functionalities added but not tested. Code on 1) addition/deletion of LM in mode 4. 2) reading text-based LM 3) Converting txt-based LM to dmp-based LM.
00107  *
00108  * Revision 1.1  2005/05/04 06:08:07  archan
00109  * Refactor all lm routines except fillpen.c into ./libs3decoder/liblm/ . This will be equivalent to ./lib/liblm in future.
00110  *
00111  * Revision 1.6  2005/05/04 04:02:24  archan
00112  * Implementation of lm addition, deletion in (mode 4) time-switching tree implementation of search.  Not yet tested. Just want to keep up my own momentum.
00113  *
00114  * Revision 1.5  2005/04/21 23:50:26  archan
00115  * Some more refactoring on the how reporting of structures inside kbcore_t is done, it is now 50% nice. Also added class-based LM test case into test-decode.sh.in.  At this moment, everything in search mode 5 is already done.  It is time to test the idea whether the search can really be used.
00116  *
00117  * Revision 1.4  2005/04/20 03:37:59  archan
00118  * LM code changes: functions are added to set, add and delete LM from the lmset, change the legacy lmset data structure to contain n_lm and n_alloc_lm.
00119  *
00120  * Revision 1.3  2005/03/30 01:22:47  archan
00121  * Fixed mistakes in last updates. Add
00122  *
00123  * 
00124  * 20.Apr.2001  RAH (rhoughton@mediasite.com, ricky.houghton@cs.cmu.edu)
00125  *              Adding lm_free() to free allocated memory
00126  * 
00127  * 24-Jun-97    M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
00128  *              Added lm_t.access_type; made lm_wid externally visible.
00129  * 
00130  * 24-Jun-97    M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
00131  *              Added lm_t.log_bg_seg_sz and lm_t.bg_seg_sz.
00132  * 
00133  * 13-Feb-97    M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
00134  *              Created from original S3 version.
00135  */
00136 
00137 
00138 #ifndef _S3_LM_H_
00139 #define _S3_LM_H_
00140 
00141 #include <stdio.h>
00142 
00143 #include <logmath.h>
00144 #include <hash_table.h>
00145 #include <cmd_ln.h>
00146 
00147 #ifdef __cplusplus
00148 extern "C" {
00149 #endif
00150 #if 0
00151 } /* Fool Emacs into not indenting things. */
00152 #endif
00153 
00154 #define LM_DICTWID_BADMAP       -16000          /* NOTE(review): presumably marks a dictionary word that has no LM word -- confirm against users */
00155 #define LM_CLASSID_BASE         0x01000000      /* Offset that LM_CLASSID_TO_CLASS subtracts from a class id to index lmclass[] */
00158 #define LM_LEGACY_CONSTANT      BAD_S3LMWID          /* Alias of BAD_S3LMWID, which is not defined in this header */
00163 #define LM_SPHINX_CONSTANT      BAD_S3LMWID32      /* Alias of BAD_S3LMWID32, which is not defined in this header */
00172 #define LM_CLASSID_TO_CLASS(m,i)        ((m)->lmclass[(i)-LM_CLASSID_BASE]) /* lmclass[] entry of LM m for class id i */
00173 
00174 #define MIN_PROB_F       -99.0  /* NOTE(review): presumably a floor for floating-point log probabilities -- confirm */
00182 #define LM_ALLOC_BLOCK      16  /* NOTE(review): presumably the growth increment when LM arrays are extended -- confirm */
00188 #define LM_SUCCESS           1  /* Status codes start here: success is the only positive value */
00190 #define LM_FAIL              0  /* Generic failure; every specific error code below is negative */
00191 #define LM_NOT_FOUND        -1  /* NOTE(review): the meaning of each negative code is only suggested by its name here -- confirm against the LM readers */
00193 #define LM_OFFSET_TOO_LARGE -2  
00199 #define LM_NO_DATA_MARK     -3  
00202 #define LM_UNKNOWN_NG       -4  
00204 #define LM_BAD_LM_COUNT     -5  
00206 #define LM_UNKNOWN_WORDS    -6  
00209 #define LM_BAD_BIGRAM       -7  
00215 #define LM_BAD_TRIGRAM      -8  
00221 #define LM_BAD_QUADGRAM     -9  
00228 #define LM_BAD_QUINGRAM     -10  
00239 #define LM_BAD_NGRAM       -11  
00245 #define LM_TOO_MANY_NGRAM  -12  
00249 #define LM_NO_MINUS_1GRAM  -13  
00252 #define LM_FILE_NOT_FOUND  -14  
00254 #define LM_CANNOT_ALLOCATE -15  
00258 #define LMDMP_VERSIONNULL 0   /* Format/version tags start here; NOTE(review): presumably the values stored in lm_t.version -- confirm */
00264 #define LMDMP_VERSION_TG_16BIT -1 
00268 #define LMDMP_VERSION_TG_16BIT_V2 -2 
00271 #define LMDMP_VERSION_TG_32BIT -3 
00277 #define LMTXT_VERSION         1000 
00278 #define LMFST_VERSION         1001 
00279 #define LMFORCED_TXT32VERSION 1002 
00287 #define NO_WORD -1 /* NOTE(review): presumably a "no word" sentinel for word ids -- confirm */
00288 
00289 #include "s3types.h"
00290 #include "lmclass.h"
00291 #include "dict.h"
00292 
00293 /*
00294  * ARCHAN 20050503: comment copied from Sphinx 2
00295  * Bigram probs and bo-wts, and trigram probs are kept in separate tables
00296  * rather than within the bigram_t and trigram_t structures.  These tables
00297  * hold unique prob and bo-wt values, and can be < 64K long (see lm_3g.h).
00298  * The following tree structure is used to construct these tables of unique
00299  * values.  Whenever a new value is read from the LM file, the sorted tree
00300  * structure is searched to see if the value already exists, and inserted
00301  * if not found.
00302  */
00303 
00330 typedef union { /* One value cell that can be accessed either as a float or as an integer */
00331     float32 f; /* floating-point view of the cell */
00332     int32 l;   /* integer view; this is the member LM_TGPROB/LM_BGPROB/LM_UGPROB read */
00333 } lmlog_t;
00334 
00335 
00336 
00341 typedef struct sorted_entry_s { /* Node of the sorted tree used to build the tables of unique prob/bo-wt values */
00342     lmlog_t val;                /* value held by this node */
00343     uint32 lower;       /* NOTE(review): presumably index of the child holding lower values -- confirm */
00346     uint32 higher;      /* NOTE(review): presumably index of the child holding higher values -- confirm */
00349 } sorted_entry_t;
00350 
00355 typedef struct { /* Storage for a tree of sorted_entry_t nodes */
00356     sorted_entry_t *list; /* the nodes */
00357     int32 free;         /* NOTE(review): presumably index of the first unused node in list -- confirm */
00358 } sorted_list_t;
00359 
00364 typedef struct { /* Unigram entry; lm_t.ug is indexed by LM word id */
00365     s3wid_t dictwid;    /* dictionary word id of this LM word (what lm_lmwid2dictwid / LM_DICTWID return) */
00369     lmlog_t prob;       /* unigram probability cell; LM_UGPROB reads prob.l */
00370     lmlog_t bowt; /* NOTE(review): presumably the unigram back-off weight -- confirm */
00371     int32 firstbg;      /* NOTE(review): presumably index of this word's first bigram entry -- confirm */
00372 } ug_t;
00373 
00378 typedef struct { /* Bigram entry, 16-bit layout */
00379     s3lmwid_t wid;      /* NOTE(review): presumably LM word id of the second word of the bigram -- confirm */
00380     uint16 probid;      /* index into lm_t.bgprob (see LM_BGPROB) */
00381     uint16 bowtid;      /* index into a table of unique back-off weights; NOTE(review): presumably lm_t.tgbowt -- confirm */
00382     uint16 firsttg;     /* index of the first trigram, relative to this bigram's segment; lm_t.tg_segbase holds the segment's absolute base */
00383 } bg_t;
00384 
00385 
00389 typedef struct { /* Bigram entry, 32-bit layout: same field names as bg_t with wider types */
00390     s3lmwid32_t wid;    /* 32-bit counterpart of bg_t.wid */
00391     uint32 probid;      /* 32-bit counterpart of bg_t.probid */
00392     uint32 bowtid;      /* 32-bit counterpart of bg_t.bowtid */
00393     uint32 firsttg;     /* NOTE(review): not visible here whether this is still segment-relative in the 32-bit layout -- confirm */
00394 } bg32_t;
00395 
00396 
00401 typedef struct { /* Trigram entry, 16-bit layout */
00402     s3lmwid_t wid;      /* NOTE(review): presumably LM word id of the third word of the trigram -- confirm */
00403     uint16 probid;      /* index into lm_t.tgprob (see LM_TGPROB) */
00404 } tg_t;
00405 
00406 
00411 typedef struct { /* Trigram entry, 32-bit layout: same field names as tg_t with wider types */
00412     s3lmwid32_t wid;    /* 32-bit counterpart of tg_t.wid */
00413     uint32 probid;      /* 32-bit counterpart of tg_t.probid */
00414 } tg32_t;
00415 
00416 
00420 typedef struct { /* Holder for a bigram list in the 16-bit layout; referenced from lm_t.membg */
00421     bg_t *bg;           /* bigram entries; NOTE(review): presumably NULL until the list is loaded -- confirm */
00422     int32 used;         /* NOTE(review): presumably a "recently used" flag for cache management -- confirm */
00424 } membg_t;
00425 
00430 typedef struct { /* Holder for a bigram list in the 32-bit layout; referenced from lm_t.membg32 */
00431     bg32_t *bg32;               /* bigram entries; NOTE(review): presumably NULL until the list is loaded -- confirm */
00432     int32 used;         /* NOTE(review): presumably a "recently used" flag for cache management -- confirm */
00434 } membg32_t;
00435 
00436 
00448 typedef struct tginfo_s { /* Linked-list node describing one trigram list (16-bit layout); lm_t.tginfo holds pointers to these */
00449     s3lmwid_t w1;               /* NOTE(review): presumably the first word of the trigrams listed in tg -- confirm */
00451     int32 n_tg;                 /* NOTE(review): presumably the number of entries in tg -- confirm */
00452     tg_t *tg;                   /* trigram entries */
00453     int32 bowt;                 /* NOTE(review): presumably the back-off weight that goes with this list -- confirm */
00454     int32 used;                 /* NOTE(review): presumably a "recently used" flag for cache management -- confirm */
00455     struct tginfo_s *next;      /* next node in the list */
00456 } tginfo_t;
00457 
00463 typedef struct tginfo32_s { /* Linked-list node describing one trigram list (32-bit layout); lm_t.tginfo32 holds pointers to these */
00464     s3lmwid32_t w1;             /* NOTE(review): presumably the first word of the trigrams listed in tg32 -- confirm */
00466     int32 n_tg;                 /* NOTE(review): presumably the number of entries in tg32 -- confirm */
00467     tg32_t *tg32;                       /* trigram entries */
00468     int32 bowt;                 /* NOTE(review): presumably the back-off weight that goes with this list -- confirm */
00469     int32 used;                 /* NOTE(review): presumably a "recently used" flag for cache management -- confirm */
00470     struct tginfo32_s *next;    /* next node in the list */
00471 } tginfo32_t;
00472 
00473 
00474 /*
00475  * \struct lm_tgcache_entry_t
00476  * One entry of the quick-and-dirty trigram lookup cache (16-bit word ids).  See lm_t.tgcache.
00477  */
00478 typedef struct {
00479     s3lmwid_t lwid[3];          /* the three LM word ids of the cached trigram */
00480     int32 lscr;                 /* NOTE(review): presumably the cached LM score for lwid[] -- confirm */
00481 } lm_tgcache_entry_t;
00482 
00483 
00484 /*
00485  * \struct lm_tgcache_entry32_t
00486  * \brief 32-bit version of lm_tgcache_entry_t.  See lm_t.tgcache32.
00487  */
00488 typedef struct {
00489     s3lmwid32_t lwid[3];                /* the three LM word ids of the cached trigram */
00490     int32 lscr;                 /* NOTE(review): presumably the cached LM score for lwid[] -- confirm */
00491 } lm_tgcache_entry32_t;
00492 
00493 
00494 
00495 /* 
00496  * A note on lm/dict/dict2lm.   -ARCHAN 20050616
00497  * 
00498  * In older versions of sphinx3 (<s3.4), dict2lm is a separate object
00499  * from lm and dict.  A kb actually owns a dict2lm so the programmer
00500  * will read the lm.  This separates the initialization of lm and
00501  * dict2lm, and it makes a lot of sense if there is **only one** lm and
00502  * **only one** dict2lm. 
00503  * 
00504  * However, when multiple LMs and switching between them are required,
00505  * the problem of the above architecture starts to show up.  For
00506  * example, 
00507  *  lmset=lm_read_ctl ();
00508  *  for(i=0;i<kb->n_lm;i++){
00509  *   dict2lmwid[i]=wid_dict_lm_map
00510  *  }
00511  * At the same time, one will also have an array of lms (lmset[i]) for 
00512  * the corresponding dict2lm[i]!
00513  *
00514  * Of course, having multiple arrays of things will someday cause
00515  * problems.
00516  *
00517  * The resolution is that we observed that the dict2lm map mostly
00518  * changes when the lm needs to change. Also, the dictionary
00519  * pronunciation itself seldom changes. That is partially caused by
00520  * the fact that we don't have too much research on that.  So in the
00521  * end, that is why it makes sense to let the lm own a dict2lm.
00522  * 
00523  * What if we also allow the dictionary to change? That is a tough
00524  * question.  In that case perhaps, we should still keep an inventory of
00525  * sets of lm and dict2lm and allow lm to store a pointer to dict2lm.
00526  * Once there are changes in dict, the programmer will be responsible
00527  * for updating dict2lm. (Storing pointers will allow programmers not to
00528  * update everything but just the lms corresponding to a particular
00529  * dict.)  I guess in that case it will be a sign that we need a wrapper
00530  * that controls both lm and dict together.
00531  */
00532 
00533 /*
00534  * Comments by RKM
00535  * To conserve space, bg/tg probs/ptrs kept in many tables.  Since the number of
00536  * distinct prob values << #bg/#tg, these table indices can be easily fit into
00537  * 16 bits.  bgprob and bgbowt are such indices.  The firsttg entry for a bigram
00538  * is harder.  It is supposed to be the index of the first trigram entry for each
00539  * bigram.  But #tg can be >> 2^16.  Hence the following segmentation scheme:
00540  * Partition bigrams into segments of lm_t.bg_seg_sz consecutive entries, such that
00541  * #trigrams in each segment <= 2**16 (the corresponding trigram segment).  The
00542  * bigram_t.firsttg value is then a 16-bit relative index within the trigram
00543  * segment.  A separate table--lm_t.tg_segbase--has the absolute index of the
00544  * 1st trigram for each segment.
00545  */
00546 
00547 /* Default values for lm_t.log_bg_seg_sz and lm_t.bg_seg_sz */
00548 #define LOG2_BG_SEG_SZ  9       /* log2 of the default bigram segment size */
00549 #define BG_SEG_SZ       (1 << (LOG2_BG_SEG_SZ)) /* default bigram segment size: 512 consecutive bigram entries */
00550 #define LM_TGCACHE_SIZE         100003  /* A prime no. (hopefully it IS one!); NOTE(review): presumably the number of trigram cache entries -- confirm */
00551 
00552 /* 20040211 ARCHAN: Yes! Indeed it is a prime */
00553 
00559 typedef struct lm_s { /* One language model: unigram/bigram/trigram tables in 16- and 32-bit layouts, plus lookup caches and statistics */
00560     char *name ;        /* NOTE(review): presumably the name matched by the lmset_*_wname lookups -- confirm */
00561     int32 n_ug;         /* number of unigrams (lm_n_ug) */
00562     int32 n_bg;         /* number of bigrams (lm_n_bg) */
00563     int32 n_tg;         /* number of trigrams (lm_n_tg) */
00564     int32 max_ug;       /* NOTE(review): presumably the allocated capacity of the unigram table -- confirm */
00566     int32 n_ng;           /* NOTE(review): presumably the order N of the model -- confirm */
00568     char **wordstr;     /* word strings indexed by LM word id (lm_wordstr) */
00571     uint32 log_bg_seg_sz; /* log2 of bg_seg_sz; default LOG2_BG_SEG_SZ */
00572     uint32 bg_seg_sz; /* number of consecutive bigram entries per segment; default BG_SEG_SZ */
00573 
00574     ug_t *ug;           /* unigram entries indexed by LM word id */
00576     /* 20040225 ARCHAN : Data structure to maintain dictionary information */
00577     /* Data structure for dictionary to LM words look up mapping */
00578     /* 20060306 ARCHAN: Change this to a 32 bits data structure */
00579     s3lmwid32_t *dict2lmwid; /* dictionary word id -> LM word id map, owned by the LM */
00580     s3lmwid32_t startlwid;      /* value returned by lm_startwid; NOTE(review): presumably the sentence-start word -- confirm */
00581     s3lmwid32_t finishlwid;     /* value returned by lm_finishwid; NOTE(review): presumably the sentence-end word -- confirm */
00583     bg_t *bg;           /* bigram entries, 16-bit layout */
00584     tg_t *tg;           /* trigram entries, 16-bit layout */
00585     membg_t *membg;     /* bigram list holders, 16-bit layout */
00586     tginfo_t **tginfo;  /* trigram list nodes, 16-bit layout */
00589     lm_tgcache_entry_t *tgcache; /* trigram lookup cache, 16-bit layout */
00597     /**************************/
00598 
00599 
00600     bg32_t *bg32;               /* bigram entries, 32-bit layout */
00601     tg32_t *tg32;               /* trigram entries, 32-bit layout */
00602     membg32_t *membg32; /* bigram list holders, 32-bit layout */
00603     tginfo32_t **tginfo32;      /* trigram list nodes, 32-bit layout */
00605     lm_tgcache_entry32_t *tgcache32; /* trigram lookup cache, 32-bit layout */
00607     /**************************/
00608     
00609     lmlog_t *bgprob;    /* table of bigram probabilities, indexed by a bigram's probid (LM_BGPROB) */
00610     lmlog_t *tgprob;    /* table of trigram probabilities, indexed by a trigram's probid (LM_TGPROB) */
00611     lmlog_t *tgbowt;    /* NOTE(review): presumably the back-off weight table indexed by a bigram's bowtid -- confirm */
00612     int32 *tg_segbase;  /* absolute index of the first trigram of each bigram segment */
00614     int32 n_bgprob; /* NOTE(review): presumably the number of entries in bgprob -- confirm */
00615     int32 n_tgprob; /* NOTE(review): presumably the number of entries in tgprob -- confirm */
00616     int32 n_tgbowt; /* NOTE(review): presumably the number of entries in tgbowt -- confirm */
00617 
00618     FILE *fp; /* NOTE(review): presumably the open LM file for disk-based access -- confirm */
00619     int32 byteswap;     /* NOTE(review): presumably non-zero when file data needs byte swapping -- confirm */
00620     int32 bgoff;        /* NOTE(review): presumably the file offset of the bigram data -- confirm */
00621     int32 tgoff;        /* NOTE(review): presumably the file offset of the trigram data -- confirm */
00623     float32 lw;         /* divisor used by LM_RAWSCORE; NOTE(review): presumably the language weight -- confirm */
00624     int32 wip;          /* offset subtracted by LM_RAWSCORE; NOTE(review): presumably the word insertion penalty -- confirm */
00627     /* Statistics */
00628     int32 n_bg_fill;    
00629     int32 n_bg_inmem;   
00630     int32 n_bg_score;   
00631     int32 n_bg_bo;      
00632     int32 n_tg_fill;    
00633     int32 n_tg_inmem;   
00634     int32 n_tg_score;   
00635     int32 n_tg_bo;      
00636     int32 n_tgcache_hit;  
00638     int32 access_type;  /* value returned by lm_access_type */
00642     int32 isLM_IN_MEMORY;  /* NOTE(review): presumably non-zero when the whole LM is held in memory -- confirm */
00645     int32 dict_size;  /* NOTE(review): presumably the number of entries in dict2lmwid -- confirm */
00647     hash_table_t *HT;           /* NOTE(review): presumably maps word strings to LM word ids for lm_wid -- confirm */
00650     /* Data structure that maintains the class information */
00651     lmclass_t **lmclass;   /* class objects, indexed by (class id - LM_CLASSID_BASE) */
00652     int32 n_lmclass;      /* NOTE(review): presumably the number of entries in lmclass -- confirm */
00653     int32 *inclass_ugscore; /* NOTE(review): presumably the in-class unigram scores -- confirm */
00656     int32 inputenc ; /* input encoding */
00657     int32 outputenc ; /* output encoding */
00658     int32 version;  /* NOTE(review): presumably one of the LMDMP_* / LM*_VERSION tags -- confirm */
00661     int32 is32bits; /* NOTE(review): presumably non-zero when the 32-bit layout is in use; see lm_is32bits -- confirm */
00663     /* Arrays of unique bigram probs and bo-wts, and trigram probs */
00664     sorted_list_t sorted_prob2; /* unique bigram probs */
00665     sorted_list_t sorted_bowt2; /* unique bigram bo-wts */
00666     sorted_list_t sorted_prob3; /* unique trigram probs */
00667     int32 max_sorted_entries; /* NOTE(review): presumably the capacity of each sorted list -- confirm */
00669     logmath_t *logmath; /* log-math object passed in by the read/init functions */
00670 } lm_t;
00671 
00672 
00673 
00678 typedef struct lmset_s { /* A set of LMs with one of them selected as the current LM */
00679     lm_t **lmarray;  /* array of pointers to the LMs in the set */
00680     lm_t *cur_lm; /* the current LM */
00682     int32 cur_lm_idx; /* NOTE(review): presumably the index of cur_lm within lmarray -- confirm */
00683     int32 n_lm;       /* NOTE(review): presumably the number of LMs in lmarray -- confirm */
00684     int32 n_alloc_lm; /* NOTE(review): presumably the number of slots allocated in lmarray -- confirm */
00685 } lmset_t;
00686 
00688 #define lm_lmwid2dictwid(lm,u)  ((lm)->ug[u].dictwid) /* dictionary word id of LM word u (same expansion as LM_DICTWID) */
00689 #define lm_n_ug(lm)             ((lm)->n_ug) /* unigram count */
00690 #define lm_n_bg(lm)             ((lm)->n_bg) /* bigram count */
00691 #define lm_n_tg(lm)             ((lm)->n_tg) /* trigram count */
00692 #define lm_wordstr(lm,u)        ((lm)->wordstr[u]) /* word string of LM word u */
00693 #define lm_startwid(lm)         ((lm)->startlwid) /* the LM's startlwid field */
00694 #define lm_finishwid(lm)        ((lm)->finishlwid) /* the LM's finishlwid field */
00695 #define lm_access_type(lm)      ((lm)->access_type) /* the LM's access_type field */
00696 
00697 
00701 typedef struct { /* A (word, probability) pair, filled in by lm_ug_wordprob */
00702     s3wid_t wid;        /* NOTE(review): presumably a dictionary word id -- confirm */
00703     int32 prob;         /* NOTE(review): presumably the LM score for wid -- confirm */
00704 } wordprob_t;
00705   
00706 
00761 S3DECODER_EXPORT /* lmset_init: wrapper around the various LM initialization routines; builds and returns an lmset_t */
00762 lmset_t* lmset_init(const char* lmfile,  /* NOTE(review): presumably a single LM file, as an alternative to lmctlfile -- confirm */
00763                     const char* lmctlfile, /* NOTE(review): presumably a control file listing several LMs -- confirm */
00764                     const char* ctl_lm,    
00765                     const char* lmname,    /* NOTE(review): presumably the name of the LM to make current -- confirm */
00766                     const char* lmdumpdir, 
00767                     float32 lw,      /* lw/wip/uw are float32 here but float64 in lmset_read_lm and lmset_read_ctl */
00768                     float32 wip,     
00769                     float32 uw,      
00770                     dict_t *dict,     /* dictionary the LM words are mapped against */
00771                     logmath_t *logmath
00772     );
00773 
00774 
00775 /* It is still a sore point: we have two interfaces for two different
00776    types of input.  Some of the code is still duplicated.  Changing
00777    one doesn't mean the other one will be changed.
00778 */
00779 
00783 lmset_t* lmset_read_lm(const char *lmfile, /* lmset_read_lm: NOTE(review): presumably builds an lmset from the single LM file lmfile -- confirm */
00784                        dict_t *dict,       
00785                        const char *lmname, 
00786                        float64 lw,         /* lw/wip/uw are float64 here but float32 in lmset_init */
00787                        float64 wip,        
00788                        float64 uw,          
00789                        const char *lmdumpdir, 
00790                        logmath_t *logmath
00791     );
00792 
00797 lmset_t* lmset_read_ctl(const char * ctlfile, /* lmset_read_ctl: NOTE(review): presumably builds an lmset from the LMs listed in control file ctlfile -- confirm */
00798                         dict_t* dict,  
00799                         float64 lw,     
00800                         float64 wip,    
00801                         float64 uw,    
00802                         const char* lmdumpdir, 
00803                         logmath_t *logmath
00804     );  
00805 
00809 lm_t* lmset_get_lm_widx(lmset_t *lms,  /* lmset_get_lm_widx: NOTE(review): presumably returns the LM at index lmidx of the set -- confirm */
00810                         int32 lmidx    
00811     );
00812 
00817 lm_t* lmset_get_lm_wname(lmset_t *lms,  /* lmset_get_lm_wname: NOTE(review): presumably returns the LM named lmname -- confirm */
00818                          const char *lmname   
00819     );
00820 
00824 void lmset_set_curlm_widx(lmset_t *lms, /* lmset_set_curlm_widx: NOTE(review): presumably makes the LM at index lmidx the current LM -- confirm */
00825                           int32 lmidx   
00826     );
00827 
00831 S3DECODER_EXPORT /* lmset_set_curlm_wname: NOTE(review): presumably makes the LM named lmname the current LM -- confirm */
00832 void lmset_set_curlm_wname(lmset_t *lms, 
00833                            const char *lmname   
00834     );
00835   
00839 int32 lmset_name_to_idx(lmset_t *lms, /* lmset_name_to_idx: NOTE(review): presumably maps an LM name to its index in the set -- confirm */
00840                         const char *lmname 
00841     );
00842 
00848 char* lmset_idx_to_name(lmset_t *lms, /* lmset_idx_to_name: NOTE(review): presumably maps an index to the LM's name; ownership of the returned string is not visible here -- confirm */
00849                         int32 lmidx 
00850     );
00851 
00852 
00857 void lmset_add_lm(lmset_t *lms,  /* lmset_add_lm: NOTE(review): presumably adds lm to the set under lmname -- confirm */
00858                   lm_t *lm,      
00859                   const char* lmname 
00860     );
00861 
00866 void lmset_delete_lm(lmset_t *lms, /* lmset_delete_lm: deletes an LM from the set; NOTE(review): presumably the one named lmname -- confirm */
00867                      const char *lmname 
00868     );
00869 
00873 S3DECODER_EXPORT /* lmset_free: NOTE(review): presumably releases the set; whether the contained LMs are freed too is not visible here -- confirm */
00874 void lmset_free(lmset_t *lms 
00875     );
00876 
00881 int32 lm_tglist (lm_t *lmp,     /* lm_tglist: NOTE(review): presumably looks up the trigram list that follows the word pair (w1, w2), 16-bit layout -- confirm */
00882                  s3lmwid32_t w1,        
00883                  s3lmwid32_t w2,        
00884                  tg_t **tg,     /* out: receives the trigram list */
00885                  int32 *bowt    /* out: NOTE(review): presumably the back-off weight for (w1, w2) -- confirm */
00886     );
00887 
00888 int32 lm_tg32list (lm_t *lmp,   /* lm_tg32list: 32-bit layout counterpart of lm_tglist */
00889                    s3lmwid32_t w1,      
00890                    s3lmwid32_t w2,      
00891                    tg32_t **tg, 
00892                    int32 *bowt  
00893     );
00894 
00899 int32 lm_bglist (lm_t *lmp,     /* lm_bglist: returns the bigram list for word w, 16-bit layout */
00900                  s3lmwid32_t w, 
00901                  bg_t **bg,     /* out: receives the bigram list */
00902                  int32 *bowt    /* out: NOTE(review): presumably the back-off weight for w -- confirm */
00903     );
00904 
00905 int32 lm_bg32list (lm_t *lmp,   /* lm_bg32list: 32-bit layout counterpart of lm_bglist */
00906                    s3lmwid32_t w,       
00907                    bg32_t **bg, 
00908                    int32 *bowt  
00909     );
00910 
00911 
00912 #if 0 /* Obsolete, and it would conflict with other code, so it is disabled for now */
00913 /*
00914  * Somewhat like lm_bglist, but fills up a wordprob_t array from the bigram list found, instead
00915  * of simply returning the bglist.  The wordprob array contains dictionary word IDs.  But note
00916  * that only the base IDs are entered; the caller is responsible for filling out the alternative
00917  * pronunciations.
00918  * Return value:  \#entries filled in the wordprob array.
00919  */
00920 int32 lm_bg_wordprob(lm_t *lm,          
00921                      s3lmwid32_t w,     
00922                      int32 th,          
00923                      wordprob_t *wp,    
00925                      int32 *bowt        
00926     );
00927 
00928 #endif
00929 
00930 /* Return LM word ID for the given string, or BAD_LMWID(lm) if not available */
00931 s3lmwid32_t lm_wid (lm_t *lm, const char *wd);
00932 
00936 void lm_null_struct(lm_t* lm /* lm_null_struct: NOTE(review): presumably resets the fields of *lm to empty/NULL values -- confirm */
00937     );
00938 
00943 int32 lm_ug_wordprob(lm_t *lm, /* lm_ug_wordprob: NOTE(review): presumably fills wp[] from the unigrams and returns the number of entries written -- confirm */
00944                      dict_t *dict, 
00945                      int32 th, /* NOTE(review): presumably a score threshold below which unigrams are skipped -- confirm */
00946                      wordprob_t *wp /* out: caller-provided array; required capacity is not visible here */
00947     );
00948 
00950 int32 lm_uglist (lm_t *lmp,     /* lm_uglist: NOTE(review): presumably returns the unigram list through *ug -- confirm */
00951                  ug_t **ug      
00952     );
00953   
00954 
00955 
00956 /* 20040227: This also accounts for the in-class probability of wid */
00958 int32 lm_ug_score (lm_t *lmp,  /* lm_ug_score: unigram score of LM word lwid */
00959                    s3lmwid32_t lwid, 
00960                    s3wid_t wid     /* dictionary word id, used for the in-class probability */
00961     );
00962 
00963   
00964 int32 lm_ug_exists(lm_t* lm ,  /* lm_ug_exists: NOTE(review): presumably non-zero when lwid is a unigram of lm -- confirm */
00965                    s3lmwid32_t lwid 
00966     );
00967   
00968 /*
00969  * Return bigram score for the given two word sequence.  If w1 is BAD_LMWID(lm), return
00970  * lm_ug_score (w2).
00971  * 20040227: This also accounts for the in-class probability of w2. 
00972  */
00973 int32 lm_bg_score (lm_t *lmp, 
00974                    s3lmwid32_t lw1, 
00975                    s3lmwid32_t lw2,
00976                    s3wid_t w2);
00977 
00978 
00982 int32 lm_bg_exists (lm_t *lm,  /* lm_bg_exists: NOTE(review): presumably non-zero when the bigram (lw1, lw2) is in lm -- confirm */
00983                     s3lmwid32_t lw1,  
00984                     s3lmwid32_t lw2   
00985     );
00986 
00993 int32 lm_tg_score (lm_t *lmp,  /* lm_tg_score: NOTE(review): presumably the trigram score of (lw1, lw2, lw3), backing off as lm_bg_score does -- confirm */
00994                    s3lmwid32_t lw1, 
00995                    s3lmwid32_t lw2, 
00996                    s3lmwid32_t lw3, 
00997                    s3wid_t w3);
00998 
00999 
01003 int32 lm_tg_exists (lm_t *lm,  /* lm_tg_exists: NOTE(review): presumably non-zero when the trigram (lw1, lw2, lw3) is in lm -- confirm */
01004                     s3lmwid32_t lw1,  
01005                     s3lmwid32_t lw2,
01006                     s3lmwid32_t lw3
01007     );
01008 
01016 void lm_set_param (lm_t *lm,  /* lm_set_param: NOTE(review): presumably applies a new lw and wip to lm -- confirm */
01017                    float64 lw,  
01018                    float64 wip  
01019     );
01020 
01021 
01022 S3DECODER_EXPORT /* lm_rawscore: takes no language-weight argument; NOTE(review): presumably the function form of LM_RAWSCORE -- confirm */
01023 int32 lm_rawscore (lm_t *lm,  
01024                    int32 score
01025     );
01026 
01027 
01028 
01030 S3DECODER_EXPORT /* lm_cache_reset: NOTE(review): presumably clears the bigram/trigram caches of lmp -- confirm */
01031 void lm_cache_reset (lm_t *lmp 
01032     );
01033 
01035 S3DECODER_EXPORT /* lm_cache_stats_dump: NOTE(review): presumably reports the cache statistics counters of lmp -- confirm */
01036 void lm_cache_stats_dump (lm_t *lmp 
01037     );
01038 
01048 lm_t * lm_read ( /* lm_read: a simple version of lm_read_advance */
01049     const char *file,   
01050     const char *lmname,  
01051     cmd_ln_t *config, /* NOTE(review): presumably supplies the values lm_read_advance takes as explicit arguments -- confirm */
01052     logmath_t *logmath);
01053 
01094 lm_t *lm_read_advance (const char *file,        /* lm_read_advance: reads an LM from file with explicitly supplied parameters */
01095                        const char *lmname,   
01096                        float64 lw,      
01097                        float64 wip,     
01098                        float64 uw,      
01099                        int32 ndict,    /* NOTE(review): presumably the number of dictionary words -- confirm */
01102                        const char* fmt,       /* NOTE(review): presumably selects the input file format -- confirm */
01106                        int32 applyweight,      /* NOTE(review): presumably whether lw/wip/uw are applied while loading -- confirm */
01108                        logmath_t *logmath
01109     );
01110 
01111 S3DECODER_EXPORT /* lm_read_advance2: same parameters as lm_read_advance plus lminmemory */
01112 lm_t *lm_read_advance2(const char *file,        
01113                        const char *lmname,   
01114                        float64 lw,      
01115                        float64 wip,     
01116                        float64 uw,      
01117                        int32 ndict,    
01120                        const char* fmt,       
01124                        int32 applyweight,      
01126                        int lminmemory, /* NOTE(review): presumably non-zero to hold the whole LM in memory -- confirm */
01127                        logmath_t *logmath
01128     );
01134 S3DECODER_EXPORT /* lm_write: NOTE(review): presumably writes model to outputfile in the format named by fmt -- confirm */
01135 int32 lm_write(lm_t *model, 
01136                const char *outputfile, 
01137                const char *filename, 
01138                const char *fmt   
01139     );
01140   
01172 int32 lm_write_advance(lm_t *model, /* lm_write_advance: lm_write's parameters plus input and output encodings */
01173                        const char *outputfile, 
01174                        const char *filename, 
01175                        const char *fmt,   
01176                        const char* inputenc, 
01177                        char* outputenc /* NOTE(review): non-const, unlike inputenc; not visible here whether it is written to */
01178     );
01179 
01180 /* RAH, added code for freeing allocated memory 
01181  */
01185 S3DECODER_EXPORT /* lm_free: frees the memory allocated for lm */
01186 void lm_free (lm_t *lm 
01187     );
01188 
01202 int32 lm_add_wordlist(lm_t *lm, /* lm_add_wordlist: NOTE(review): presumably adds every word listed in filename to lm -- confirm */
01203                       dict_t *dict, 
01206                       const char* filename 
01209     );
01210 
01223 int32 lm_add_word_to_ug(lm_t *lm, /* lm_add_word_to_ug: NOTE(review): presumably adds newword as a new unigram of lm -- confirm */
01224                         dict_t *dict, 
01227                         const char* newword 
01228     );
01232 int32 lm_get_classid (lm_t *model, /* lm_get_classid: NOTE(review): presumably returns the class id of the class called name -- confirm */
01233                       const char *name   
01234     );
01235 
01239 void lm_convert_structure(lm_t *model, /* lm_convert_structure: NOTE(review): presumably converts model between the 16-bit and 32-bit layouts -- confirm */
01240                           int32 is32bits /* NOTE(review): presumably non-zero to convert to the 32-bit layout -- confirm */
01241     );  
01242 
01246 int32 lm_is32bits(lm_t* model); /* NOTE(review): presumably reports whether model uses the 32-bit layout -- confirm */
01247 
01251 void ug_write(FILE* fp,  /* ug_write: writes the unigram entry *ug to fp; output format is not visible here */
01252               ug_t* ug   
01253     );
01257 void bg_write(FILE* fp, /* bg_write: writes the 16-bit bigram entry *bg to fp */
01258               bg_t* bg  
01259     );
01260 
01264 void bg32_write(FILE* fp, /* bg32_write: writes the 32-bit bigram entry *bg to fp */
01265                 bg32_t* bg  
01266     );
01267 
01272 void tg_write(FILE* fp, /* tg_write: writes the 16-bit trigram entry *tg to fp */
01273               tg_t* tg  
01274     );
01275 
01280 void tg32_write(FILE* fp, /* tg32_write: writes the 32-bit trigram entry *tg to fp */
01281                 tg32_t* tg  
01282     );
01283 
01284 
01288 void copy_bg_to_bg32(lm_t *lm /* copy_bg_to_bg32: NOTE(review): presumably fills lm->bg32 from lm->bg -- confirm */
01289     );
01290 
01295 void copy_bg32_to_bg(lm_t *lm /* copy_bg32_to_bg: NOTE(review): presumably fills lm->bg from lm->bg32 -- confirm */
01296     );
01297 
01301 void copy_tg_to_tg32(lm_t *lm /* copy_tg_to_tg32: NOTE(review): presumably fills lm->tg32 from lm->tg -- confirm */
01302     );
01303 
01308 void copy_tg32_to_tg(lm_t *lm /* copy_tg32_to_tg: NOTE(review): presumably fills lm->tg from lm->tg32 -- confirm */
01309     );
01310 
01314 void swap_bg(bg_t* bg); /* NOTE(review): presumably byte-swaps the fields of *bg in place -- confirm */
01315   
01316 
01320 void swap_bg32(bg32_t* bg); /* NOTE(review): presumably byte-swaps the fields of *bg in place -- confirm */
01321 
01325 void swap_tg(tg_t* tg); /* NOTE(review): presumably byte-swaps the fields of *tg in place -- confirm */
01326   
01327 
01331 void swap_tg32(tg32_t* tg); /* NOTE(review): presumably byte-swaps the fields of *tg in place -- confirm */
01332 
01333 int32 find_bg (bg_t *bg,  /* find_bg: NOTE(review): presumably searches the n entries of bg for word w and returns its index -- confirm */
01334                int32 n, 
01335                s3lmwid32_t w
01336     );
01337 
01338 int32 find_bg32 (bg32_t *bg,  /* find_bg32: 32-bit layout counterpart of find_bg */
01339                  int32 n, 
01340                  s3lmwid32_t w
01341     );
01342 
01343 
01344 int32 find_tg (tg_t *tg, /* find_tg: NOTE(review): presumably searches the n entries of tg for word w and returns its index -- confirm */
01345                int32 n, s3lmwid32_t w);
01346 
01347 int32 find_tg32 (tg32_t *tg, /* find_tg32: 32-bit layout counterpart of find_tg */
01348                  int32 n, s3lmwid32_t w);
01349   
01350 /* Macro versions of access functions */
01351 #define LM_TGPROB(lm,tgptr)     ((lm)->tgprob[(tgptr)->probid].l) /* integer probability of the trigram entry tgptr, looked up in lm->tgprob */
01352 #define LM_BGPROB(lm,bgptr)     ((lm)->bgprob[(bgptr)->probid].l) /* integer probability of the bigram entry bgptr, looked up in lm->bgprob */
01353 #define LM_UGPROB(lm,ugptr)     ((ugptr)->prob.l) /* integer probability stored in the unigram entry ugptr; lm is unused */
01354 #define LM_RAWSCORE(lm,score)   (((score) - (lm)->wip) / ((lm)->lw)) /* subtracts lm->wip from score, then divides by lm->lw; score is parenthesized so that expression arguments (e.g. c ? a : b) expand with the intended precedence */
01355 #define LM_DICTWID(lm,lmwid)     ((lm)->ug[(lmwid)].dictwid) /* dictionary word id of LM word lmwid (same expansion as lm_lmwid2dictwid) */
01356 
01360 ug_t *NewUnigramTable (int32 n_ug /* NewUnigramTable: NOTE(review): presumably allocates and returns a table of n_ug unigram entries; ownership and initial values are not visible here -- confirm */
01361     );
01362 
01363 
01364 #if 0
01365 { /* Stop indent from complaining */
01366 #endif
01367 #ifdef __cplusplus
01368 }
01369 #endif
01370 
01371 #endif

Generated on 7 Mar 2010 by  doxygen 1.6.1