00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080
00081
00082
00083
00084
00085
00086
00087
00088
00089
00090
00091
00092
00093
00094
00095
00096
00097
00098
00099
00100
00101
00102
00103
00104
00105
00106
00107
00108
00109
00110
00111
00112
00113
00114
00115
00116
00117
00118
00119
00120
00121
00122
00123
00124
00125
00126
00127
00128
00129
00130
00131
00132
00133
00134
00135
00136
00137
00138 #ifndef _S3_LM_H_
00139 #define _S3_LM_H_
00140
00141 #include <stdio.h>
00142
00143 #include <logmath.h>
00144 #include <hash_table.h>
00145 #include <cmd_ln.h>
00146
00147 #ifdef __cplusplus
00148 extern "C" {
00149 #endif
00150 #if 0
00151 }
00152 #endif
00153
/*
 * Word-id / class-id constants.
 * BAD_S3LMWID and BAD_S3LMWID32 are not defined in this header.
 * NOTE(review): presumably they come from "s3types.h", which is included
 * further down; as macro bodies they only need to be visible where
 * LM_LEGACY_CONSTANT / LM_SPHINX_CONSTANT are expanded -- confirm.
 */
/* NOTE(review): presumably marks a dictionary word id with no LM mapping -- confirm. */
00154 #define LM_DICTWID_BADMAP -16000
/* Offset that LM_CLASSID_TO_CLASS subtracts from a class id to get an array index. */
00155 #define LM_CLASSID_BASE 0x01000000
00158 #define LM_LEGACY_CONSTANT BAD_S3LMWID
00163 #define LM_SPHINX_CONSTANT BAD_S3LMWID32
/* Select the entry of m->lmclass for class id i, i.e. m->lmclass[i - LM_CLASSID_BASE]. */
00172 #define LM_CLASSID_TO_CLASS(m,i) ((m)->lmclass[(i)-LM_CLASSID_BASE])
00173
/* NOTE(review): presumably the floor used for (log) probabilities read from an LM file -- confirm. */
00174 #define MIN_PROB_F -99.0
/* NOTE(review): presumably the growth increment for an allocation (e.g. lmset_t::n_alloc_lm) -- confirm. */
00182 #define LM_ALLOC_BLOCK 16
/*
 * Status codes.
 * LM_SUCCESS is 1 and LM_FAIL is 0; every other code is a distinct negative
 * value running from -1 (LM_NOT_FOUND) down to -15 (LM_CANNOT_ALLOCATE).
 * NOTE(review): which functions return which code is not visible in this
 * header -- confirm against the implementations before relying on a code.
 */
00188 #define LM_SUCCESS 1
00190 #define LM_FAIL 0
00191 #define LM_NOT_FOUND -1
00193 #define LM_OFFSET_TOO_LARGE -2
00199 #define LM_NO_DATA_MARK -3
00202 #define LM_UNKNOWN_NG -4
00204 #define LM_BAD_LM_COUNT -5
00206 #define LM_UNKNOWN_WORDS -6
00209 #define LM_BAD_BIGRAM -7
00215 #define LM_BAD_TRIGRAM -8
00221 #define LM_BAD_QUADGRAM -9
00228 #define LM_BAD_QUINGRAM -10
00239 #define LM_BAD_NGRAM -11
00245 #define LM_TOO_MANY_NGRAM -12
00249 #define LM_NO_MINUS_1GRAM -13
00252 #define LM_FILE_NOT_FOUND -14
00254 #define LM_CANNOT_ALLOCATE -15
/*
 * Format/version tags.
 * The LMDMP_* tags are 0 or negative (0, -1, -2, -3); the LMTXT/LMFST/
 * LMFORCED tags are 1000, 1001 and 1002.
 * NOTE(review): presumably these are the values stored in lm_t::version,
 * with the LMDMP_* tags describing 16-bit vs 32-bit dump layouts -- confirm.
 */
00258 #define LMDMP_VERSIONNULL 0
00264 #define LMDMP_VERSION_TG_16BIT -1
00268 #define LMDMP_VERSION_TG_16BIT_V2 -2
00271 #define LMDMP_VERSION_TG_32BIT -3
00277 #define LMTXT_VERSION 1000
00278 #define LMFST_VERSION 1001
00279 #define LMFORCED_TXT32VERSION 1002
/* NOTE(review): presumably a sentinel meaning "no word id" -- confirm where it is compared. */
00287 #define NO_WORD -1
00288
00289 #include "s3types.h"
00290 #include "lmclass.h"
00291 #include "dict.h"
00292
00293
00294
00295
00296
00297
00298
00299
00300
00301
00302
00303
/*
 * Union giving two views of one stored value: `f` as float32, `l` as int32.
 * The score macros in this header (LM_UGPROB, LM_BGPROB, LM_TGPROB) read
 * the `l` member.
 * NOTE(review): which member is valid at which stage of loading is not
 * visible in this header -- confirm before reading `f`.
 */
00330 typedef union {
00331 float32 f;
00332 int32 l;
00333 } lmlog_t;
00334
00335
00336
/*
 * One entry of a sorted_list_t: a value plus two uint32 links.
 * NOTE(review): looks like a binary-search-tree node where `lower` and
 * `higher` are indices of child entries in sorted_list_t::list -- confirm
 * against the code that builds the list.
 */
00341 typedef struct sorted_entry_s {
00342 lmlog_t val;
00343 uint32 lower;
00346 uint32 higher;
00349 } sorted_entry_t;
00350
/*
 * Container for sorted_entry_t records; lm_t embeds three of these
 * (sorted_prob2, sorted_bowt2, sorted_prob3).
 * NOTE(review): `free` is presumably the index of the first unused slot in
 * `list` -- confirm.
 */
00355 typedef struct {
00356 sorted_entry_t *list;
00357 int32 free;
00358 } sorted_list_t;
00359
/*
 * Unigram record; lm_t::ug is an array of these.
 * NOTE(review): `bowt` and `firstbg` meanings below are inferred from the
 * field names -- confirm.
 */
00364 typedef struct {
00365 s3wid_t dictwid; /* read by lm_lmwid2dictwid() / LM_DICTWID() */
00369 lmlog_t prob; /* LM_UGPROB() reads prob.l */
00370 lmlog_t bowt; /* presumably the back-off weight */
00371 int32 firstbg; /* presumably index of this word's first bigram entry */
00372 } ug_t;
00373
/*
 * Bigram record in the narrow layout: every field other than `wid` is uint16.
 * bg32_t is the same record with 32-bit fields.
 * NOTE(review): `bowtid` / `firsttg` meanings are inferred from names.
 */
00378 typedef struct {
00379 s3lmwid_t wid; /* LM word id stored in this record */
00380 uint16 probid; /* index into lm->bgprob[] (see LM_BGPROB) */
00381 uint16 bowtid; /* presumably index into lm->tgbowt[] -- confirm */
00382 uint16 firsttg; /* presumably first-trigram index, possibly relative to lm->tg_segbase -- confirm */
00383 } bg_t;
00384
00385
/*
 * Bigram record in the wide layout: same four fields as bg_t, but with a
 * s3lmwid32_t word id and uint32 indices.
 * NOTE(review): `bowtid` / `firsttg` meanings are inferred from names.
 */
00389 typedef struct {
00390 s3lmwid32_t wid; /* LM word id stored in this record */
00391 uint32 probid; /* index into lm->bgprob[] (LM_BGPROB accepts any pointer with a probid member) */
00392 uint32 bowtid; /* presumably index into lm->tgbowt[] -- confirm */
00393 uint32 firsttg; /* presumably first-trigram index -- confirm */
00394 } bg32_t;
00395
00396
/*
 * Trigram record in the narrow layout (uint16 probability index).
 * tg32_t is the same record with 32-bit fields.
 */
00401 typedef struct {
00402 s3lmwid_t wid; /* LM word id stored in this record */
00403 uint16 probid; /* index into lm->tgprob[] (see LM_TGPROB) */
00404 } tg_t;
00405
00406
/*
 * Trigram record in the wide layout: same two fields as tg_t, but with a
 * s3lmwid32_t word id and a uint32 probability index.
 */
00411 typedef struct {
00412 s3lmwid32_t wid; /* LM word id stored in this record */
00413 uint32 probid; /* index into lm->tgprob[] (LM_TGPROB accepts any pointer with a probid member) */
00414 } tg32_t;
00415
00416
/*
 * Holder for a bg_t array plus a `used` marker; lm_t::membg points at an
 * array of these.
 * NOTE(review): presumably one entry per unigram, caching that word's
 * bigrams once loaded, with `used` recording recent access -- confirm.
 */
00420 typedef struct {
00421 bg_t *bg;
00422 int32 used;
00424 } membg_t;
00425
/*
 * Wide-layout counterpart of membg_t: holds a bg32_t array plus a `used`
 * marker; lm_t::membg32 points at an array of these.
 * NOTE(review): same presumed role as membg_t (per-unigram in-memory
 * bigram list) -- confirm.
 */
00430 typedef struct {
00431 bg32_t *bg32;
00432 int32 used;
00434 } membg32_t;
00435
00436
/*
 * Node describing one block of trigrams in the narrow layout. Nodes chain
 * through `next`, and lm_t::tginfo is an array of pointers to such nodes.
 * NOTE(review): presumably the array is indexed by the second history word
 * and each node holds the trigrams for one (w1, w2) pair -- confirm.
 */
00448 typedef struct tginfo_s {
00449 s3lmwid_t w1; /* presumably the first history word of the pair */
00451 int32 n_tg; /* presumably number of entries in tg[] */
00452 tg_t *tg; /* trigram records for this node */
00453 int32 bowt; /* presumably the back-off weight for the pair */
00454 int32 used; /* presumably a recent-access flag -- confirm */
00455 struct tginfo_s *next; /* next node in the chain */
00456 } tginfo_t;
00457
/*
 * Wide-layout counterpart of tginfo_t: same fields, with a s3lmwid32_t
 * `w1` and a tg32_t array. Nodes chain through `next`, and lm_t::tginfo32
 * is an array of pointers to such nodes.
 * NOTE(review): field roles are inferred from names, as for tginfo_t.
 */
00463 typedef struct tginfo32_s {
00464 s3lmwid32_t w1; /* presumably the first history word of the pair */
00466 int32 n_tg; /* presumably number of entries in tg32[] */
00467 tg32_t *tg32; /* trigram records for this node */
00468 int32 bowt; /* presumably the back-off weight for the pair */
00469 int32 used; /* presumably a recent-access flag -- confirm */
00470 struct tginfo32_s *next; /* next node in the chain */
00471 } tginfo32_t;
00472
00473
00474
00475
00476
00477
/*
 * One slot of lm_t::tgcache: three narrow LM word ids plus an int32 score.
 * NOTE(review): presumably caches the trigram score for the word triple
 * lwid[0..2] -- confirm.
 */
00478 typedef struct {
00479 s3lmwid_t lwid[3];
00480 int32 lscr;
00481 } lm_tgcache_entry_t;
00482
00483
00484
00485
00486
00487
/*
 * One slot of lm_t::tgcache32: three wide (s3lmwid32_t) LM word ids plus an
 * int32 score.
 * NOTE(review): presumably caches the trigram score for the word triple
 * lwid[0..2] -- confirm.
 */
00488 typedef struct {
00489 s3lmwid32_t lwid[3];
00490 int32 lscr;
00491 } lm_tgcache_entry32_t;
00492
00493
00494
00495
00496
00497
00498
00499
00500
00501
00502
00503
00504
00505
00506
00507
00508
00509
00510
00511
00512
00513
00514
00515
00516
00517
00518
00519
00520
00521
00522
00523
00524
00525
00526
00527
00528
00529
00530
00531
00532
00533
00534
00535
00536
00537
00538
00539
00540
00541
00542
00543
00544
00545
00546
00547
/*
 * BG_SEG_SZ is defined as 1 << LOG2_BG_SEG_SZ, i.e. 512 with the value 9.
 * NOTE(review): presumably the number of bigram entries per segment that
 * lm_t::tg_segbase is indexed by, and the defaults for lm_t::log_bg_seg_sz
 * / lm_t::bg_seg_sz -- confirm.
 */
00548 #define LOG2_BG_SEG_SZ 9
00549 #define BG_SEG_SZ (1 << (LOG2_BG_SEG_SZ))
/* NOTE(review): presumably the slot count of lm_t::tgcache / tgcache32 -- confirm. */
00550 #define LM_TGCACHE_SIZE 100003
00551
00552
00553
/*
 * The language-model object. It carries both a narrow set of tables
 * (bg/tg/membg/tginfo/tgcache) and a wide set (bg32/tg32/...); `is32bits`
 * sits alongside them.
 *
 * NOTE(review): the original field documentation is not present. Unless a
 * comment below cites a macro from this header, the meaning given is
 * inferred from the field name only ("presumably") and must be confirmed
 * against the implementation.
 */
00559 typedef struct lm_s {
00560 char *name ; /* presumably the LM's name as used by the lmset_*_wname() lookups */
00561 int32 n_ug; /* returned by lm_n_ug() */
00562 int32 n_bg; /* returned by lm_n_bg() */
00563 int32 n_tg; /* returned by lm_n_tg() */
00564 int32 max_ug; /* presumably the allocated capacity of ug[] */
00566 int32 n_ng; /* presumably the highest N-gram order in the model */
00568 char **wordstr; /* indexed by LM word id via lm_wordstr() */
00571 uint32 log_bg_seg_sz; /* presumably log2 of bg_seg_sz (cf. LOG2_BG_SEG_SZ) */
00572 uint32 bg_seg_sz; /* presumably bigram segment size (cf. BG_SEG_SZ) */
00573
00574 ug_t *ug; /* unigram array; indexed by LM word id via lm_lmwid2dictwid() / LM_DICTWID() */
00576
00577
00578
00579 s3lmwid32_t *dict2lmwid; /* presumably maps dictionary word id -> LM word id */
00580 s3lmwid32_t startlwid; /* returned by lm_startwid() */
00581 s3lmwid32_t finishlwid; /* returned by lm_finishwid() */
00583 bg_t *bg; /* narrow-layout bigram records */
00584 tg_t *tg; /* narrow-layout trigram records */
00585 membg_t *membg; /* narrow-layout membg_t array */
00586 tginfo_t **tginfo; /* narrow-layout array of tginfo_t chain heads */
00589 lm_tgcache_entry_t *tgcache; /* narrow-layout cache slots (cf. LM_TGCACHE_SIZE) */
00597
00598
00599
00600 bg32_t *bg32; /* wide-layout bigram records */
00601 tg32_t *tg32; /* wide-layout trigram records */
00602 membg32_t *membg32; /* wide-layout membg32_t array */
00603 tginfo32_t **tginfo32; /* wide-layout array of tginfo32_t chain heads */
00605 lm_tgcache_entry32_t *tgcache32; /* wide-layout cache slots */
00607
00608
00609 lmlog_t *bgprob; /* indexed by a bigram's probid (see LM_BGPROB) */
00610 lmlog_t *tgprob; /* indexed by a trigram's probid (see LM_TGPROB) */
00611 lmlog_t *tgbowt; /* presumably indexed by a bigram's bowtid */
00612 int32 *tg_segbase; /* presumably per-segment base offset added to a bigram's firsttg */
00614 int32 n_bgprob; /* presumably number of entries in bgprob[] */
00615 int32 n_tgprob; /* presumably number of entries in tgprob[] */
00616 int32 n_tgbowt; /* presumably number of entries in tgbowt[] */
00617
00618 FILE *fp; /* presumably the LM file kept open for on-demand reads */
00619 int32 byteswap; /* presumably non-zero when file data needs byte swapping */
00620 int32 bgoff; /* presumably file offset of the bigram section */
00621 int32 tgoff; /* presumably file offset of the trigram section */
00623 float32 lw; /* divisor in LM_RAWSCORE(); presumably the language weight */
00624 int32 wip; /* subtracted in LM_RAWSCORE(); presumably the word insertion penalty */
00627
00628 int32 n_bg_fill; /* n_bg_* / n_tg_* / n_tgcache_hit: presumably usage statistics counters */
00629 int32 n_bg_inmem;
00630 int32 n_bg_score;
00631 int32 n_bg_bo;
00632 int32 n_tg_fill;
00633 int32 n_tg_inmem;
00634 int32 n_tg_score;
00635 int32 n_tg_bo;
00636 int32 n_tgcache_hit;
00638 int32 access_type; /* returned by lm_access_type() */
00642 int32 isLM_IN_MEMORY; /* presumably non-zero when the whole LM is held in memory */
00645 int32 dict_size; /* presumably the dictionary size that dict2lmwid[] was sized for */
00647 hash_table_t *HT; /* presumably maps word string -> LM word id */
00650
00651 lmclass_t **lmclass; /* indexed by (class id - LM_CLASSID_BASE) via LM_CLASSID_TO_CLASS() */
00652 int32 n_lmclass; /* presumably number of entries in lmclass[] */
00653 int32 *inclass_ugscore; /* presumably per-word in-class unigram scores */
00656 int32 inputenc ; /* presumably input text-encoding id */
00657 int32 outputenc ; /* presumably output text-encoding id */
00658 int32 version; /* presumably one of the LMDMP_VERSION* / LMTXT_VERSION tags */
00661 int32 is32bits; /* presumably non-zero when the wide (bg32/tg32) tables are in use */
00663
00664 sorted_list_t sorted_prob2; /* presumably sorted bigram probabilities */
00665 sorted_list_t sorted_bowt2; /* presumably sorted bigram back-off weights */
00666 sorted_list_t sorted_prob3; /* presumably sorted trigram probabilities */
00667 int32 max_sorted_entries; /* presumably the capacity of each sorted list */
00669 logmath_t *logmath; /* logmath_t handle (type from <logmath.h>) */
00670 } lm_t;
00671
00672
00673
/*
 * A set of language models with one of them selected as current.
 * NOTE(review): the roles of the counters are inferred from their names.
 */
00678 typedef struct lmset_s {
00679 lm_t **lmarray; /* array of lm_t pointers */
00680 lm_t *cur_lm; /* the currently selected LM */
00682 int32 cur_lm_idx; /* presumably the index of cur_lm within lmarray */
00683 int32 n_lm; /* presumably number of LMs stored in lmarray */
00684 int32 n_alloc_lm; /* presumably allocated capacity of lmarray (cf. LM_ALLOC_BLOCK) */
00685 } lmset_t;
00686
/*
 * Field accessors for lm_t. Each expands to a plain member access on `lm`
 * and can therefore also be used as an assignment target.
 */
/* dictwid of unigram u; same expansion as LM_DICTWID() defined later in this header. */
00688 #define lm_lmwid2dictwid(lm,u) ((lm)->ug[u].dictwid)
00689 #define lm_n_ug(lm) ((lm)->n_ug)
00690 #define lm_n_bg(lm) ((lm)->n_bg)
00691 #define lm_n_tg(lm) ((lm)->n_tg)
/* String stored at index u of lm->wordstr. */
00692 #define lm_wordstr(lm,u) ((lm)->wordstr[u])
00693 #define lm_startwid(lm) ((lm)->startlwid)
00694 #define lm_finishwid(lm) ((lm)->finishlwid)
00695 #define lm_access_type(lm) ((lm)->access_type)
00696
00697
/*
 * (word id, probability) pair. lm_ug_wordprob() takes a pointer to these as
 * its `wp` argument.
 * NOTE(review): `wid` is a s3wid_t, presumably a dictionary word id rather
 * than an LM word id -- confirm.
 */
00701 typedef struct {
00702 s3wid_t wid;
00703 int32 prob;
00704 } wordprob_t;
00705
00706
/*
 * Build an lmset_t. Declaration only; the definition is not in this header.
 *
 * lw, wip and uw are float32 here, whereas lmset_read_lm() and
 * lmset_read_ctl() take the same-named arguments as float64.
 *
 * NOTE(review): the parameter meanings below are inferred from the names --
 * confirm against the definition.
 *   lmfile     presumably path of a single LM file
 *   lmctlfile  presumably path of a control file listing several LMs
 *   ctl_lm     presumably a file selecting an LM per utterance
 *   lmname     presumably the name of the LM to make current
 *   lmdumpdir  presumably a directory holding LM dump files
 *   lw/wip/uw  presumably language weight, word insertion penalty, unigram weight
 *   dict       dictionary the LM words are mapped against
 *   logmath    logmath_t handle
 * Returns a pointer to the lmset_t; the failure convention is not visible here.
 * S3DECODER_EXPORT is defined elsewhere (presumably a symbol-export marker).
 */
00761 S3DECODER_EXPORT
00762 lmset_t* lmset_init(const char* lmfile,
00763 const char* lmctlfile,
00764 const char* ctl_lm,
00765 const char* lmname,
00766 const char* lmdumpdir,
00767 float32 lw,
00768 float32 wip,
00769 float32 uw,
00770 dict_t *dict,
00771 logmath_t *logmath
00772 );
00773
00774
00775
00776
00777
00778
00779
/*
 * Declaration only; the definition is not in this header.
 * NOTE(review): presumably reads the single LM in `lmfile`, registers it
 * under `lmname`, and returns it wrapped in a new lmset_t -- confirm.
 * lw / wip / uw are float64 here (lmset_init() takes them as float32).
 */
00783 lmset_t* lmset_read_lm(const char *lmfile,
00784 dict_t *dict,
00785 const char *lmname,
00786 float64 lw,
00787 float64 wip,
00788 float64 uw,
00789 const char *lmdumpdir,
00790 logmath_t *logmath
00791 );
00792
/*
 * Declaration only; the definition is not in this header.
 * NOTE(review): presumably reads every LM listed in the control file
 * `ctlfile` and returns them as one lmset_t -- confirm. Unlike
 * lmset_read_lm() there is no `lmname` argument.
 */
00797 lmset_t* lmset_read_ctl(const char * ctlfile,
00798 dict_t* dict,
00799 float64 lw,
00800 float64 wip,
00801 float64 uw,
00802 const char* lmdumpdir,
00803 logmath_t *logmath
00804 );
00805
/*
 * Look up an LM in `lms` by index. Declaration only.
 * NOTE(review): presumably returns lms->lmarray[lmidx]; behaviour for an
 * out-of-range index is not visible here -- confirm.
 */
00809 lm_t* lmset_get_lm_widx(lmset_t *lms,
00810 int32 lmidx
00811 );
00812
/*
 * Look up an LM in `lms` by name. Declaration only.
 * NOTE(review): the return value for an unknown name is not visible here
 * (presumably NULL) -- confirm.
 */
00817 lm_t* lmset_get_lm_wname(lmset_t *lms,
00818 const char *lmname
00819 );
00820
/*
 * Select the current LM of `lms` by index. Declaration only; returns nothing.
 * NOTE(review): presumably updates lms->cur_lm and lms->cur_lm_idx; how a
 * bad index is reported is not visible here -- confirm.
 */
00824 void lmset_set_curlm_widx(lmset_t *lms,
00825 int32 lmidx
00826 );
00827
/*
 * Select the current LM of `lms` by name. Declaration only; returns nothing.
 * NOTE(review): how an unknown name is reported is not visible here -- confirm.
 */
00831 S3DECODER_EXPORT
00832 void lmset_set_curlm_wname(lmset_t *lms,
00833 const char *lmname
00834 );
00835
/*
 * Translate an LM name to an int32 index. Declaration only.
 * NOTE(review): the value returned for an unknown name is not visible here
 * (presumably LM_NOT_FOUND) -- confirm.
 */
00839 int32 lmset_name_to_idx(lmset_t *lms,
00840 const char *lmname
00841 );
00842
/*
 * Translate an index to an LM name. Declaration only.
 * NOTE(review): the result is a non-const char*; whether the caller owns
 * and must free it is not visible here -- confirm before freeing or
 * modifying it.
 */
00848 char* lmset_idx_to_name(lmset_t *lms,
00849 int32 lmidx
00850 );
00851
00852
/*
 * Add `lm` to `lms` under `lmname`. Declaration only; returns nothing.
 * NOTE(review): presumably the set takes ownership of `lm` and grows
 * lmarray as needed; handling of a duplicate name is not visible here --
 * confirm.
 */
00857 void lmset_add_lm(lmset_t *lms,
00858 lm_t *lm,
00859 const char* lmname
00860 );
00861
/*
 * Remove the LM called `lmname` from `lms`. Declaration only; returns nothing.
 * NOTE(review): whether the removed lm_t is also freed, and what happens if
 * it was the current LM, are not visible here -- confirm.
 */
00866 void lmset_delete_lm(lmset_t *lms,
00867 const char *lmname
00868 );
00869
/*
 * Release an lmset_t. Declaration only; returns nothing.
 * NOTE(review): presumably also frees every lm_t held in the set -- confirm.
 */
00873 S3DECODER_EXPORT
00874 void lmset_free(lmset_t *lms
00875 );
00876
/*
 * Trigram-list lookup, narrow layout: the list comes back through `tg` as
 * tg_t records. Declaration only. The word ids w1/w2 are s3lmwid32_t even
 * in this narrow variant.
 * NOTE(review): presumably returns the number of trigrams following
 * (w1, w2), stores the list in *tg and the back-off weight in *bowt --
 * confirm.
 */
00881 int32 lm_tglist (lm_t *lmp,
00882 s3lmwid32_t w1,
00883 s3lmwid32_t w2,
00884 tg_t **tg,
00885 int32 *bowt
00886 );
00887
/*
 * Trigram-list lookup, wide layout: same signature as lm_tglist() except
 * that the list comes back as tg32_t records. Declaration only.
 */
00888 int32 lm_tg32list (lm_t *lmp,
00889 s3lmwid32_t w1,
00890 s3lmwid32_t w2,
00891 tg32_t **tg,
00892 int32 *bowt
00893 );
00894
/*
 * Bigram-list lookup, narrow layout: the list comes back through `bg` as
 * bg_t records. Declaration only.
 * NOTE(review): presumably returns the number of bigrams following word
 * `w`, stores the list in *bg and the back-off weight in *bowt -- confirm.
 */
00899 int32 lm_bglist (lm_t *lmp,
00900 s3lmwid32_t w,
00901 bg_t **bg,
00902 int32 *bowt
00903 );
00904
/*
 * Bigram-list lookup, wide layout: same signature as lm_bglist() except
 * that the list comes back as bg32_t records. Declaration only.
 */
00905 int32 lm_bg32list (lm_t *lmp,
00906 s3lmwid32_t w,
00907 bg32_t **bg,
00908 int32 *bowt
00909 );
00910
00911
/*
 * Disabled declaration: everything between this #if 0 and its #endif is
 * dropped by the preprocessor, so lm_bg_wordprob() is NOT declared to code
 * that includes this header.
 */
00912 #if 0
00913
00914
00915
00916
00917
00918
00919
00920 int32 lm_bg_wordprob(lm_t *lm,
00921 s3lmwid32_t w,
00922 int32 th,
00923 wordprob_t *wp,
00925 int32 *bowt
00926 );
00927
00928 #endif
00929
00930
/*
 * Map the word string `wd` to an LM word id. Declaration only.
 * NOTE(review): the value returned for an unknown word is not visible here
 * (presumably BAD_S3LMWID32 / LM_SPHINX_CONSTANT) -- confirm.
 */
00931 s3lmwid32_t lm_wid (lm_t *lm, const char *wd);
00932
/*
 * Declaration only; returns nothing.
 * NOTE(review): presumably resets the pointer members of `lm` to NULL so a
 * partially built model can be freed safely -- confirm.
 */
00936 void lm_null_struct(lm_t* lm
00937 );
00938
/*
 * Declaration only.
 * NOTE(review): presumably fills `wp` with (word id, unigram probability)
 * pairs whose probability is at least the threshold `th` and returns how
 * many were written; the required capacity of `wp` is not visible here --
 * confirm.
 */
00943 int32 lm_ug_wordprob(lm_t *lm,
00944 dict_t *dict,
00945 int32 th,
00946 wordprob_t *wp
00947 );
00948
/*
 * Declaration only.
 * NOTE(review): presumably stores the model's unigram array in *ug and
 * returns the unigram count -- confirm.
 */
00950 int32 lm_uglist (lm_t *lmp,
00951 ug_t **ug
00952 );
00953
00954
00955
00956
/*
 * Unigram score. Declaration only. Takes both an LM word id (`lwid`) and a
 * s3wid_t (`wid`).
 * NOTE(review): presumably `wid` is the dictionary word id, needed for
 * class-based LMs -- confirm.
 */
00958 int32 lm_ug_score (lm_t *lmp,
00959 s3lmwid32_t lwid,
00960 s3wid_t wid
00961 );
00962
00963
/*
 * Declaration only.
 * NOTE(review): presumably returns non-zero when `lwid` is a valid unigram
 * of `lm` -- confirm the exact return values.
 */
00964 int32 lm_ug_exists(lm_t* lm ,
00965 s3lmwid32_t lwid
00966 );
00967
00968
00969
00970
00971
00972
/*
 * Bigram score for the LM word ids (lw1, lw2). Declaration only. `w2` is an
 * additional s3wid_t for the second word.
 * NOTE(review): presumably backs off to the unigram score when the bigram
 * is absent, and `w2` is the dictionary word id of lw2 -- confirm.
 */
00973 int32 lm_bg_score (lm_t *lmp,
00974 s3lmwid32_t lw1,
00975 s3lmwid32_t lw2,
00976 s3wid_t w2);
00977
00978
/*
 * Declaration only.
 * NOTE(review): presumably reports whether the bigram (lw1, lw2) is present
 * in `lm` -- confirm the exact return values.
 */
00982 int32 lm_bg_exists (lm_t *lm,
00983 s3lmwid32_t lw1,
00984 s3lmwid32_t lw2
00985 );
00986
/*
 * Trigram score for the LM word ids (lw1, lw2, lw3). Declaration only. `w3`
 * is an additional s3wid_t for the third word.
 * NOTE(review): presumably backs off to bigram/unigram scores when the
 * trigram is absent, and `w3` is the dictionary word id of lw3 -- confirm.
 */
00993 int32 lm_tg_score (lm_t *lmp,
00994 s3lmwid32_t lw1,
00995 s3lmwid32_t lw2,
00996 s3lmwid32_t lw3,
00997 s3wid_t w3);
00998
00999
/*
 * Declaration only.
 * NOTE(review): presumably reports whether the trigram (lw1, lw2, lw3) is
 * present in `lm` -- confirm the exact return values.
 */
01003 int32 lm_tg_exists (lm_t *lm,
01004 s3lmwid32_t lw1,
01005 s3lmwid32_t lw2,
01006 s3lmwid32_t lw3
01007 );
01008
/*
 * Set the lw / wip parameters of `lm`. Declaration only; returns nothing.
 * Both arguments are float64, while lm_t stores lw as float32 and wip as
 * int32.
 * NOTE(review): presumably converts wip to the log domain and rescales the
 * stored scores -- confirm.
 */
01016 void lm_set_param (lm_t *lm,
01017 float64 lw,
01018 float64 wip
01019 );
01020
01021
/*
 * Declaration only.
 * NOTE(review): presumably the function form of the LM_RAWSCORE() macro
 * defined later in this header (score with wip removed and lw divided
 * out), returned as int32 -- confirm rounding behaviour.
 */
01022 S3DECODER_EXPORT
01023 int32 lm_rawscore (lm_t *lm,
01024 int32 score
01025 );
01026
01027
01028
/*
 * Declaration only; returns nothing.
 * NOTE(review): presumably clears the in-memory bigram/trigram caches of
 * `lmp` (membg, tginfo, tgcache and their wide counterparts) -- confirm.
 */
01030 S3DECODER_EXPORT
01031 void lm_cache_reset (lm_t *lmp
01032 );
01033
/*
 * Declaration only; returns nothing.
 * NOTE(review): presumably reports the n_bg_* / n_tg_* / n_tgcache_hit
 * counters of `lmp` and resets them -- confirm.
 */
01035 S3DECODER_EXPORT
01036 void lm_cache_stats_dump (lm_t *lmp
01037 );
01038
/*
 * Read an LM from `file`, taking its settings from a cmd_ln_t
 * configuration. Declaration only.
 * NOTE(review): which configuration keys are consulted, and the return
 * value on failure (presumably NULL), are not visible here -- confirm.
 */
01048 lm_t * lm_read (
01049 const char *file,
01050 const char *lmname,
01051 cmd_ln_t *config,
01052 logmath_t *logmath);
01053
/*
 * Read an LM from `file` with explicit parameters instead of a cmd_ln_t.
 * Declaration only.
 * NOTE(review): parameter meanings inferred from names -- confirm:
 *   lw/wip/uw    presumably language weight, insertion penalty, unigram weight
 *   ndict        presumably the dictionary size (cf. lm_t::dict_size)
 *   fmt          presumably a format name selecting text vs dump reading
 *   applyweight  presumably non-zero to apply lw/wip/uw while loading
 */
01094 lm_t *lm_read_advance (const char *file,
01095 const char *lmname,
01096 float64 lw,
01097 float64 wip,
01098 float64 uw,
01099 int32 ndict,
01102 const char* fmt,
01106 int32 applyweight,
01108 logmath_t *logmath
01109 );
01110
/*
 * Same parameter list as lm_read_advance() plus one extra `int lminmemory`
 * argument before `logmath`. Declaration only.
 * NOTE(review): `lminmemory` presumably forces the whole LM into memory
 * (cf. lm_t::isLM_IN_MEMORY) -- confirm.
 */
01111 S3DECODER_EXPORT
01112 lm_t *lm_read_advance2(const char *file,
01113 const char *lmname,
01114 float64 lw,
01115 float64 wip,
01116 float64 uw,
01117 int32 ndict,
01120 const char* fmt,
01124 int32 applyweight,
01126 int lminmemory,
01127 logmath_t *logmath
01128 );
/*
 * Write `model` out. Declaration only.
 * NOTE(review): presumably `outputfile` is the destination path, `filename`
 * the original LM file name recorded in the output, and `fmt` the output
 * format name; the int32 result is presumably LM_SUCCESS / LM_FAIL --
 * confirm.
 */
01134 S3DECODER_EXPORT
01135 int32 lm_write(lm_t *model,
01136 const char *outputfile,
01137 const char *filename,
01138 const char *fmt
01139 );
01140
/*
 * Variant of lm_write() that also takes input and output encoding names.
 * Declaration only. Note that `outputenc` is a non-const char* while
 * `inputenc` is const.
 * NOTE(review): whether `outputenc` is actually modified is not visible
 * here -- confirm before passing a string literal.
 */
01172 int32 lm_write_advance(lm_t *model,
01173 const char *outputfile,
01174 const char *filename,
01175 const char *fmt,
01176 const char* inputenc,
01177 char* outputenc
01178 );
01179
01180
01181
/*
 * Release an lm_t. Declaration only; returns nothing.
 * NOTE(review): presumably frees every table owned by `lm` and closes
 * lm->fp; whether NULL is accepted is not visible here -- confirm.
 */
01185 S3DECODER_EXPORT
01186 void lm_free (lm_t *lm
01187 );
01188
/*
 * Declaration only.
 * NOTE(review): presumably reads words from the file `filename` and adds
 * each one to the unigrams of `lm` (see lm_add_word_to_ug()); the int32
 * result is presumably a status code -- confirm.
 */
01202 int32 lm_add_wordlist(lm_t *lm,
01203 dict_t *dict,
01206 const char* filename
01209 );
01210
/*
 * Declaration only.
 * NOTE(review): presumably appends `newword` as a new unigram of `lm`,
 * using `dict` to obtain its dictionary word id; the int32 result is
 * presumably a status code -- confirm.
 */
01223 int32 lm_add_word_to_ug(lm_t *lm,
01224 dict_t *dict,
01227 const char* newword
01228 );
/*
 * Map a class name to an int32 class id. Declaration only.
 * NOTE(review): presumably the id is LM_CLASSID_BASE plus the index in
 * model->lmclass (the inverse of LM_CLASSID_TO_CLASS); the value for an
 * unknown name is not visible here -- confirm.
 */
01232 int32 lm_get_classid (lm_t *model,
01233 const char *name
01234 );
01235
/*
 * Declaration only; returns nothing.
 * NOTE(review): presumably converts `model` between the narrow (bg/tg) and
 * wide (bg32/tg32) table layouts according to `is32bits` -- confirm.
 */
01239 void lm_convert_structure(lm_t *model,
01240 int32 is32bits
01241 );
01242
/*
 * Declaration only.
 * NOTE(review): presumably returns model->is32bits -- confirm.
 */
01246 int32 lm_is32bits(lm_t* model);
01247
/*
 * Record writers: each takes a FILE* and a pointer to one record
 * (ug_t, bg_t, bg32_t, tg_t or tg32_t) and returns nothing. Declarations
 * only.
 * NOTE(review): the output format (text vs binary) and any error handling
 * are not visible here; since the return type is void, write failures
 * cannot be reported through it -- confirm.
 */
01251 void ug_write(FILE* fp,
01252 ug_t* ug
01253 );
01257 void bg_write(FILE* fp,
01258 bg_t* bg
01259 );
01260
01264 void bg32_write(FILE* fp,
01265 bg32_t* bg
01266 );
01267
01272 void tg_write(FILE* fp,
01273 tg_t* tg
01274 );
01275
01280 void tg32_write(FILE* fp,
01281 tg32_t* tg
01282 );
01283
01284
/*
 * Layout-conversion helpers operating on one lm_t. Declarations only; all
 * return nothing.
 * NOTE(review): presumably each copies lm's table named first into the
 * table named second (e.g. lm->bg into lm->bg32). Whether the destination
 * is allocated here, and how values too large for the 16-bit fields are
 * handled in the 32 -> 16 direction, are not visible -- confirm.
 */
01288 void copy_bg_to_bg32(lm_t *lm
01289 );
01290
01295 void copy_bg32_to_bg(lm_t *lm
01296 );
01297
01301 void copy_tg_to_tg32(lm_t *lm
01302 );
01303
01308 void copy_tg32_to_tg(lm_t *lm
01309 );
01310
/*
 * Per-record helpers taking a pointer to one bg_t / bg32_t / tg_t / tg32_t
 * and returning nothing. Declarations only.
 * NOTE(review): presumably each byte-swaps the fields of the record in
 * place (cf. lm_t::byteswap) -- confirm.
 */
01314 void swap_bg(bg_t* bg);
01315
01316
01320 void swap_bg32(bg32_t* bg);
01321
01325 void swap_tg(tg_t* tg);
01326
01327
01331 void swap_tg32(tg32_t* tg);
01332
/*
 * Search helpers over an array of `n` records for the word id `w`.
 * Declarations only. `w` is s3lmwid32_t in all four, including the narrow
 * bg_t / tg_t variants.
 * NOTE(review): presumably each returns the index of the record whose
 * `wid` equals `w`, or a negative value when absent, and may assume the
 * array is sorted by `wid` -- confirm.
 */
01333 int32 find_bg (bg_t *bg,
01334 int32 n,
01335 s3lmwid32_t w
01336 );
01337
01338 int32 find_bg32 (bg32_t *bg,
01339 int32 n,
01340 s3lmwid32_t w
01341 );
01342
01343
01344 int32 find_tg (tg_t *tg,
01345 int32 n, s3lmwid32_t w);
01346
01347 int32 find_tg32 (tg32_t *tg,
01348 int32 n, s3lmwid32_t w);
01349
01350
/*
 * Score lookups; each yields the int32 (`.l`) view of an lmlog_t.
 * LM_TGPROB / LM_BGPROB use the record's probid as an index into
 * lm->tgprob / lm->bgprob. Because they only need a `probid` member, they
 * work with both the narrow and the wide record types.
 * LM_UGPROB reads the probability stored in the unigram record itself; its
 * `lm` argument is unused.
 */
01351 #define LM_TGPROB(lm,tgptr) ((lm)->tgprob[(tgptr)->probid].l)
01352 #define LM_BGPROB(lm,bgptr) ((lm)->bgprob[(bgptr)->probid].l)
01353 #define LM_UGPROB(lm,ugptr) ((ugptr)->prob.l)
/*
 * Expands to (score - lm->wip) / lm->lw, i.e. removes the `wip` offset and
 * the `lw` scale from a score.
 *
 * Both arguments are parenthesized in the expansion so that a compound
 * expression passed as `score` (for example `a << b` or `c ? x : y`) is
 * substituted as a unit instead of re-associating with the subtraction.
 * `lm` is evaluated twice, so do not pass an expression with side effects.
 * The result has the type of the division (floating point when lm->lw is).
 */
#define LM_RAWSCORE(lm,score) (((score) - (lm)->wip) / ((lm)->lw))
/* dictwid of unigram `lmwid`; same expansion as lm_lmwid2dictwid() defined earlier in this header. */
01355 #define LM_DICTWID(lm,lmwid) ((lm)->ug[(lmwid)].dictwid)
01356
/*
 * Declaration only.
 * NOTE(review): presumably allocates and initializes an array of `n_ug`
 * ug_t records and returns it; ownership (who frees it) and the initial
 * field values are not visible here -- confirm.
 */
01360 ug_t *NewUnigramTable (int32 n_ug
01361 );
01362
01363
01364 #if 0
01365 {
01366 #endif
01367 #ifdef __cplusplus
01368 }
01369 #endif
01370
01371 #endif