00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080
00081
00082
00083
00084
00085
00086
00087
00088
00089
00090
00091
00092
00093
00094
00095
00096
00097
00098
00099
00100
00101
00102
00103
00104
00105
00106
00107
00108 #ifndef _CMDLN_MACRO_H_
00109 #define _CMDLN_MACRO_H_
00110
00111
00112
00113
00114
00115
00116
00117
00118
00119
00120
00121
00122
00123
00124
00125
00126
00127
00128
/*
 * Option entries for vector clustering: -stdev, -eps, -iter.
 *
 * Expands to a comma-separated run of four-field brace initializers
 * (option name, ARG_* tag, default-looking string, help text) with no
 * trailing comma, so it can be spliced into an array initializer.  The
 * element type and the ARG_* tags are declared outside this header.
 */
#define vq_cluster_command_line_macro() \
    { "-stdev", ARG_BOOLEAN, "no", \
      "Use std.dev. (rather than var) in computing vector distances during clustering" }, \
    { "-eps", ARG_FLOAT64, "0.0001", \
      "Stopping criterion: stop iterations if relative decrease in sq(error) < eps" }, \
    { "-iter", ARG_INT32, "100", \
      "Max no. of k-means iterations for clustering" }
00142
/*
 * Option entries naming the mixture-Gaussian parameter files and their
 * floors: -mean, -var, -varfloor, -mixw, -mixwfloor.
 *
 * Expands to five brace initializers (name, ARG_* tag, default-looking
 * string or NULL, help text), comma separated, without a trailing comma.
 */
#define gmm_command_line_macro() \
    { "-mean", ARG_STRING, NULL, \
      "Mixture gaussian means input file" }, \
    { "-var", ARG_STRING, NULL, \
      "Mixture gaussian variances input file" }, \
    { "-varfloor", ARG_FLOAT32, "0.0001", \
      "Mixture gaussian variance floor (applied to data from -var file)" }, \
    { "-mixw", ARG_STRING, NULL, \
      "Senone mixture weights input file" }, \
    { "-mixwfloor", ARG_FLOAT32, "0.0000001", \
      "Senone mixture weights floor (applied to data from -mixw file)" }
00164
/*
 * Acoustic-model option entries.
 *
 * Starts with everything gmm_command_line_macro() expands to, then adds
 * -hmm, -featparams, -mdef, -tmat, -tmatfloor, -senmgau and -topn.
 * The result is a comma-separated run of four-field brace initializers
 * (name, ARG_* tag, default-looking string or NULL, help text) with no
 * trailing comma.
 */
#define acoustic_model_command_line_macro() \
    gmm_command_line_macro(), \
    { "-hmm", ARG_STRING, NULL, \
      "Directory for specifying Sphinx 3's hmm, the following files are assummed to be present, mdef, mean, var, mixw, tmat. If -mdef, -mean, -var, -mixw or -tmat are specified, they will override this command. "}, \
    { "-featparams", ARG_STRING, NULL, \
      "File containing feature extraction parameters."}, \
    { "-mdef", ARG_STRING, NULL, \
      "Model definition input file" }, \
    { "-tmat", ARG_STRING, NULL, \
      "HMM state transition matrix input file" }, \
    { "-tmatfloor", ARG_FLOAT32, "0.0001", \
      "HMM state transition probability floor (applied to -tmat file)" }, \
    { "-senmgau", ARG_STRING, ".cont.", \
      "Senone to mixture-gaussian mapping file (or .semi. or .cont.)" }, \
    { "-topn", ARG_INT32, "4", \
      "(S3.0 GMM Computation only) No. of top scoring densities computed in each mixture gaussian codebook (semi-continuous models only)" }
00195
/*
 * Language-model option entries: -lm, -lmctlfn, -lmdumpdir, -lmname.
 *
 * All four are ARG_STRING entries whose third field is NULL.  Expands to
 * a comma-separated run of brace initializers with no trailing comma.
 */
#define language_model_command_line_macro() \
    { "-lm", ARG_STRING, NULL, \
      "Word trigram language model input file" }, \
    { "-lmctlfn", ARG_STRING, NULL, \
      "Specify a set of language model\n"}, \
    { "-lmdumpdir", ARG_STRING, NULL, \
      "The directory for dumping the DMP file. "}, \
    { "-lmname", ARG_STRING, NULL, \
      "Name of language model in -lmctlfn to use for all utterances" }
00213
00214 #if 0
00215
00216 { "-fsgctlfn",
00217 ARG_STRING,
00218 NULL,
00219 "A finite state grammar control file" },
00220 #endif
00221
/*
 * Finite-state-grammar option entries: -fsg, -fsgusealtpron,
 * -fsgusefiller.
 *
 * Expands to three brace initializers (name, ARG_* tag, default-looking
 * string or NULL, help text), comma separated, no trailing comma.
 */
#define finite_state_grammar_command_line_macro() \
    { "-fsg", ARG_STRING, NULL, \
      "(FSG Mode (Mode 2) only) Finite state grammar"}, \
    { "-fsgusealtpron", ARG_BOOLEAN, "yes", \
      "(FSG Mode (Mode 2) only) Use alternative pronunciations for FSG"}, \
    { "-fsgusefiller", ARG_BOOLEAN, "yes", \
      "(FSG Mode (Mode 2) only) Insert filler words at each state."}
00235
00236
/*
 * Log-arithmetic option entries: -logbase and -log3table.
 *
 * Expands to two brace initializers (name, ARG_* tag, default-looking
 * string, help text), comma separated, no trailing comma.
 */
#define log_table_command_line_macro() \
    { "-logbase", ARG_FLOAT32, "1.0003", \
      "Base in which all log-likelihoods calculated" }, \
    { "-log3table", ARG_BOOLEAN, "yes", \
      "Determines whether to use the logs3 table or to compute the values at run time."}
00246
/*
 * Phoneme look-ahead option entries: -pheurtype, -pl_window, -pl_beam.
 *
 * Expands to three brace initializers (name, ARG_* tag, default-looking
 * string, help text), comma separated, no trailing comma.
 */
#define phoneme_lookahead_command_line_macro() \
    { "-pheurtype", ARG_INT32, "0", \
      "0 = bypass, 1= sum of max, 2 = sum of avg, 3 = sum of 1st senones only" }, \
    { "-pl_window", ARG_INT32, "1", \
      "Window size (actually window size-1) of phoneme look-ahead." }, \
    { "-pl_beam", ARG_FLOAT64, "1.0e-80", \
      "Beam for phoneme look-ahead. [1 (narrowest)..10000000(very wide)]" }
00260
/*
 * Histogram-pruning option entries: -maxwpf, -maxhistpf,
 * -hmmhistbinsize, -maxhmmpf.  All four are ARG_INT32.
 *
 * Expands to a comma-separated run of brace initializers (name, ARG_*
 * tag, default-looking string, help text) with no trailing comma.
 */
#define histogram_pruning_command_line_macro() \
    { "-maxwpf", ARG_INT32, "20", \
      "(Only used in Mode 4 and 5) Max no. of distinct word exits to maintain at each frame" }, \
    { "-maxhistpf", ARG_INT32, "100", \
      "(Only used in Mode 4 and 5) Max no. of histories to maintain at each frame" }, \
    { "-hmmhistbinsize", ARG_INT32, "5000", \
      "(Only used in Mode 4 and 5) Performance histogram: #frames vs #HMMs active; #HMMs/bin in this histogram" }, \
    { "-maxhmmpf", ARG_INT32, "20000", \
      "(Only used in Mode 4 and 5) Max no. of active HMMs to maintain at each frame; approx." }
00278
/*
 * Pronunciation-dictionary option entries: -dict, -fdict, -lts_mismatch.
 *
 * Expands to three brace initializers (name, ARG_* tag, default-looking
 * string or NULL, help text), comma separated, no trailing comma.
 */
#define dictionary_command_line_macro() \
    { "-dict", ARG_STRING, NULL, \
      "Main pronunciation dictionary (lexicon) input file" }, \
    { "-fdict", ARG_STRING, NULL, \
      "Silence and filler (noise) word pronunciation dictionary input file" }, \
    { "-lts_mismatch", ARG_BOOLEAN, "no", \
      "Use CMUDict letter-to-sound rules to generate pronunciations for LM words doesn't appear in the dictionary . Use it with care. It assumes that the phone set in the mdef and dict are the same as the LTS rule. "}
00292
/*
 * Single option entry for the Gaussian-selection map: -gs.
 *
 * Expands to one brace initializer (name, ARG_STRING, NULL, help text)
 * with no trailing comma.
 */
#define gaussian_selection_command_line_macro() \
    { "-gs", ARG_STRING, NULL, "Gaussian Selection Mapping." }
00298
/*
 * Option entries for the fast GMM computation layers.
 *
 * In order: -subvq, -subvqbeam, whatever
 * gaussian_selection_command_line_macro() expands to, then -ds,
 * -cond_ds, -dist_ds, -gs4gs, -svq4svq, -ci_pbeam, -tighten_factor,
 * -maxcdsenpf, -vqeval, -kdtree, -kdmaxdepth and -kdmaxbbi.
 *
 * Expands to a comma-separated run of four-field brace initializers
 * (name, ARG_* tag, default-looking string or NULL, help text) with no
 * trailing comma.
 *
 * Every line of the definition except the last must end in a backslash.
 * A line without one silently terminates the macro there and leaves the
 * remaining entries as stray file-scope tokens, so keep the continuation
 * on the same line as the text it follows.
 */
#define fast_GMM_computation_command_line_macro() \
    { "-subvq", ARG_STRING, NULL, \
      "Sub-vector quantized form of acoustic model" }, \
    { "-subvqbeam", ARG_FLOAT64, "3.0e-3", \
      "Beam selecting best components within each mixture Gaussian [0(widest)..1(narrowest)]" }, \
    gaussian_selection_command_line_macro(), \
    { "-ds", ARG_INT32, "1", \
      "Ratio of Down-sampling the frame computation." }, \
    { "-cond_ds", ARG_BOOLEAN, "no", \
      "Conditional Down-sampling, override normal down sampling. require specify a gaussian selection map" }, \
    { "-dist_ds", ARG_BOOLEAN, "no", \
      "Distance-based Down-sampling, override normal down sampling." }, \
    { "-gs4gs", ARG_BOOLEAN, "yes", \
      "A flag that specified whether the input GS map will be used for Gaussian Selection. If it is disabled, the map will only provide information to other modules." }, \
    { "-svq4svq", ARG_BOOLEAN, "no", \
      "A flag that specified whether the input SVQ will be used as approximate scores of the Gaussians" }, \
    { "-ci_pbeam", ARG_FLOAT64, "1e-80", \
      "CI phone beam for CI-based GMM Selection. [0(widest) .. 1(narrowest)]"}, \
    { "-tighten_factor", ARG_FLOAT64, "0.5", \
      "From 0 to 1, it tightens the beam width when the frame is dropped"}, \
    { "-maxcdsenpf", ARG_INT32, "100000", \
      "Max no. of distinct CD senone will be computed. " }, \
    { "-vqeval", ARG_INT32, "3", \
      "Number of subvectors to use for SubVQ-based frame evaluation (3 for all)"}, \
    { "-kdtree", ARG_STRING, NULL, \
      "kd-Tree file for Gaussian selection (for .s2semi models only)" }, \
    { "-kdmaxdepth", ARG_INT32, "0", \
      "Maximum depth of kd-Trees to use" }, \
    { "-kdmaxbbi", ARG_INT32, "-1", \
      "Maximum number of Gaussians per leaf node in kd-Trees" }
00358
00359 #if 0
00360 { "-feat",
00361 ARG_STRING,
00362 "s2_4x",
00363 "Feature stream:\n\t\t\t\ts2_4x: Sphinx-II type 4 streams, 12cep, 24dcep, 3pow, 12ddcep\n\t\t\t\ts3_1x39: Single stream, 12cep+12dcep+3pow+12ddcep\n\t\t\t\t1s_12c_12d_3p_12dd: Single stream, 12cep+12dcep+3pow+12ddcep\n\t\t\t\t1s_c: Single stream, given input vector only\n\t\t\t\t1s_c_d: Feature + Deltas only\n\t\t\t\t1s_c_dd: Feature + Double deltas only\n\t\t\t\t1s_c_d_dd: Feature + Deltas + Double deltas\n\t\t\t\t1s_c_wd_dd: Feature cep+windowed delcep+deldel \n\t\t\t1s_c_d_ld_dd: Feature + delta + longter delta + doubledelta" },
00364
00365 { "-feat",
00366 ARG_STRING,
00367 "1s_c_d_dd",
00368 "Feature stream: s2_4x / s3_1x39 / cep_dcep[,%d] / cep[,%d] / %d,%d,...,%d" },
00369 #endif
00370
00371
/*
 * Speaker-adaptation option entries: -mllr and -cb2mllr.
 *
 * Expands to two brace initializers (name, ARG_* tag, default-looking
 * string or NULL, help text), comma separated, no trailing comma.
 */
#define speaker_adaptation_command_line_macro() \
    { "-mllr", ARG_STRING, NULL, \
      "MLLR transfomation matrix to be applied to mixture gaussian means"}, \
    { "-cb2mllr", ARG_STRING, ".1cls.", \
      "Senone to MLLR transformation matrix mapping file (or .1cls.)" }
00381
00382
/*
 * Filler-word and language-weight option entries: -fillpen, -silprob,
 * -fillprob, -lw, -wip, -uw.
 *
 * Expands to six brace initializers (name, ARG_* tag, default-looking
 * string or NULL, help text), comma separated, no trailing comma.
 */
#define common_filler_properties_command_line_macro() \
    { "-fillpen", ARG_STRING, NULL, \
      "Filler word probabilities input file (used in place of -silpen and -noisepen)" }, \
    { "-silprob", ARG_FLOAT32, "0.1", \
      "Default silence word probability" }, \
    { "-fillprob", ARG_FLOAT32, "0.1", \
      "Default non-silence filler word probability" }, \
    { "-lw", ARG_FLOAT32, "9.5", \
      "Language weight" }, \
    { "-wip", ARG_FLOAT32, "0.7", \
      "Word insertion penalty" }, \
    { "-uw", ARG_FLOAT32, "0.7", \
      "Unigram weight" }
00408
00409
00410
/*
 * Single option entry for the phone insertion penalty: -phonepen.
 *
 * Expands to one brace initializer (name, ARG_FLOAT32, "1.0", help text)
 * with no trailing comma.
 *
 * The help text describes a phone insertion penalty, matching the option
 * and macro names; "Word insertion penalty" is the help text of the
 * separate -wip option.
 */
#define phone_insertion_penalty_command_line_macro() \
    { "-phonepen", ARG_FLOAT32, "1.0", \
      "(Mode 2 and 3 only) Phone insertion penalty" }
00416
00417
/*
 * Beam-pruning option entries: -beam, -pbeam, -wbeam, -wend_beam,
 * -ptranskip.
 *
 * Expands to five brace initializers (name, ARG_* tag, default-looking
 * string, help text), comma separated, no trailing comma.
 *
 * NOTE(review): the -wend_beam help text repeats the -wbeam wording
 * almost verbatim; confirm whether it should describe something else.
 */
#define common_s3x_beam_properties_command_line_macro() \
    { "-beam", ARG_FLOAT64, "1.0e-55", \
      "Beam selecting active HMMs (relative to best) in each frame [0(widest)..1(narrowest)]" }, \
    { "-pbeam", ARG_FLOAT64, "1.0e-50", \
      "Beam selecting HMMs transitioning to successors in each frame [0(widest)..1(narrowest)]" }, \
    { "-wbeam", ARG_FLOAT64, "1.0e-35", \
      "Beam selecting word-final HMMs exiting in each frame [0(widest)..1(narrowest)]" }, \
    { "-wend_beam", ARG_FLOAT64, "1.0e-80", \
      "Beam selecting word-final HMMs exiting in each frame [0(widest) .. 1(narrowest)]" }, \
    { "-ptranskip", ARG_INT32, "0", \
      "(Not used in Mode 3) Use wbeam for phone transitions every so many frames (if >= 1)" }
00439
/*
 * Single option entry for the log file: -logfn.
 *
 * Expands to one brace initializer (name, ARG_STRING, NULL, help text)
 * with no trailing comma.
 */
#define common_application_properties_command_line_macro() \
    { "-logfn", ARG_STRING, NULL, "Log file (default stdout/stderr)" }
00445
/*
 * Control-file option entries: -ctl, -ctloffset, -ctlcount.
 *
 * Expands to three brace initializers (name, ARG_* tag, default-looking
 * string or NULL, help text), comma separated, no trailing comma.
 */
#define control_file_handling_command_line_macro() \
    { "-ctl", ARG_STRING, NULL, \
      "Control file listing utterances to be processed" }, \
    { "-ctloffset", ARG_INT32, "0", \
      "No. of utterances at the beginning of -ctl file to be skipped" }, \
    { "-ctlcount", ARG_INT32, "1000000000", \
      "No. of utterances to be processed (after skipping -ctloffset entries)" }
00459
/*
 * Recognition-result file option entries: -hyp and -hypseg.
 *
 * Both are ARG_STRING entries whose third field is NULL.  Expands to two
 * comma-separated brace initializers with no trailing comma.
 */
#define hypothesis_file_handling_command_line_macro() \
    { "-hyp", ARG_STRING, NULL, \
      "Recognition result file, with only words" }, \
    { "-hypseg", ARG_STRING, NULL, \
      "Recognition result file, with word segmentations and scores" }
00469
/*
 * Single option entry for score display: -hypsegscore_unscale.
 *
 * Expands to one brace initializer (name, ARG_BOOLEAN, "yes", help text)
 * with no trailing comma.
 */
#define score_handling_command_line_macro() \
    { "-hypsegscore_unscale", ARG_BOOLEAN, "yes", \
      "When displaying the results, whether to unscale back the acoustic score with the best score in a frame"}
00475
/*
 * Feature-input option entries: -cepdir, -cepext, -adcin, -adchdr.
 *
 * Expands to four brace initializers (name, ARG_* tag, default-looking
 * string or NULL, help text), comma separated, no trailing comma.
 *
 * The -cepext help text says "suffixed": a file extension goes after the
 * filespec, whereas the directory given by -cepdir goes in front of it.
 */
#define cepstral_input_handling_command_line_macro() \
    { "-cepdir", ARG_STRING, NULL, \
      "Input cepstrum files directory (prefixed to filespecs in control file)" }, \
    { "-cepext", ARG_STRING, ".mfc", \
      "Input cepstrum files extension (suffixed to filespecs in control file)" }, \
    { "-adcin", ARG_BOOLEAN, "no", \
      "Input is waveform data rather than cepstra (-cepdir and -cepext are still used)" }, \
    { "-adchdr", ARG_INT32, "0", \
      "Number of bytes to skip at the beginning of a waveform file (44 for WAV, 1024 for Sphere)" }
00493
/*
 * Lattice-output option entries: -outlatdir, -outlatfmt, -latext.
 *
 * All three are ARG_STRING.  Expands to a comma-separated run of brace
 * initializers (name, ARG_* tag, default-looking string or NULL, help
 * text) with no trailing comma.
 */
#define output_lattice_handling_command_line_macro() \
    { "-outlatdir", ARG_STRING, NULL, \
      "Directory in which to dump word lattices" }, \
    { "-outlatfmt", ARG_STRING, "s3", \
      "Format in which to dump word lattices (either 's3' or 'htk')" }, \
    { "-latext", ARG_STRING, "lat.gz", \
      "Filename extension for lattice files (gzip compressed, by default - remove .gz for uncompressed)" }
00507
00508
00509
/*
 * Back-pointer-table option entries: -bptbldir and -bptblsize.
 *
 * Expands to two brace initializers (name, ARG_* tag, default-looking
 * string or NULL, help text), comma separated, no trailing comma.
 */
#define history_table_command_line_macro() \
    { "-bptbldir", ARG_STRING, NULL, \
      "Directory in which to dump word Viterbi back pointer table (for debugging)" }, \
    { "-bptblsize", ARG_INT32, "32768", \
      "Number of BPtable entries to allocate initially (grown as necessary)" }
00519
00520
00521
00522
/*
 * Decoder-specific option entries: -mode, -op_mode, -hmmdump,
 * -lextreedump, -bghist, -treeugprob.
 *
 * Expands to six brace initializers (name, ARG_* tag, default-looking
 * string, help text), comma separated, no trailing comma.
 */
#define decode_specific_command_line_macro() \
    { "-mode", ARG_STRING, "fwdtree", \
      "Decoding mode, one of allphone, fsg, fwdflat, fwdtree."}, \
    { "-op_mode", ARG_INT32, "-1", \
      "Operation mode, for internal use only."}, \
    { "-hmmdump", ARG_BOOLEAN, "no", \
      "Whether to dump active HMM details to stderr (for debugging)" }, \
    { "-lextreedump", ARG_INT32, "0", \
      "Whether to dump the lextree structure to stderr (for debugging), 1 for Ravi's format, 2 for Dot format, Larger than 2 will be treated as Ravi's format" }, \
    { "-bghist", ARG_BOOLEAN, "no", \
      "Bigram-mode: If TRUE only one BP entry/frame; else one per LM state" }, \
    { "-treeugprob", ARG_BOOLEAN, "yes", \
      "If true, Use unigram probs in lextree" }
00548
/*
 * DAG-construction option entries: -min_endfr, -dagfudge, -maxedge,
 * -maxlmop, -maxlpf, -latcompress.
 *
 * Expands to six brace initializers (name, ARG_* tag, default-looking
 * string, help text), comma separated, no trailing comma.
 */
#define dag_handling_command_line_macro() \
    { "-min_endfr", ARG_INT32, "3", \
      "Nodes ignored during search if they persist for fewer than so many end frames" }, \
    { "-dagfudge", ARG_INT32, "2", \
      "(0..2); 1 or 2: add edge if endframe == startframe; 2: if start == end-1" }, \
    { "-maxedge", ARG_INT32, "2000000", \
      "Max DAG edges allowed in utterance; aborted if exceeded; controls memory usage" }, \
    { "-maxlmop", ARG_INT32, "100000000", \
      "Max LMops in utterance after which it is aborted; controls CPU use (see maxlpf)" }, \
    { "-maxlpf", ARG_INT32, "40000", \
      "Max LMops/frame after which utterance aborted; controls CPU use (see maxlmop)" }, \
    { "-latcompress", ARG_BOOLEAN, "yes", \
      "Whether lattice is compressed."}
00574
00575
/*
 * Second-stage (bestpath / N-best) option entries: -bestpath,
 * -bestpathlw, -nbestdir, -nbestext, -nbest, -maxppath, -ppathdebug.
 *
 * Expands to seven brace initializers (name, ARG_* tag, default-looking
 * string or NULL, help text), comma separated, no trailing comma.
 * -bestpathlw is ARG_FLOAT32 with a NULL third field; its help text says
 * the default is the same as -lw.
 *
 * NOTE(review): the -nbestdir help text is word-for-word the text used
 * for the -inlatdir option in this header; confirm it is intended.
 */
#define second_stage_dag_handling_command_line_macro() \
    { "-bestpath", ARG_BOOLEAN, "no", \
      "Whether to run bestpath DAG search after forward Viterbi pass" }, \
    { "-bestpathlw", ARG_FLOAT32, NULL, \
      "Language weight for bestpath DAG search (default: same as -lw)" }, \
    { "-nbestdir", ARG_STRING, NULL, \
      "Input word-lattice directory with per-utt files for restricting words searched"}, \
    { "-nbestext", ARG_STRING, "nbest.gz", \
      "N-best filename extension (.gz or .Z extension for compression)"}, \
    { "-nbest", ARG_INT32, "200", \
      "Max. n-best hypotheses to generate per utterance"}, \
    { "-maxppath", ARG_INT32, "1000000", \
      "Max partial paths created after which utterance aborted; controls CPU/memory use"}, \
    { "-ppathdebug", ARG_BOOLEAN, "no", \
      "Generate debugging information for N-best search. "}
00605
/*
 * Input-lattice option entries: -inlatdir and -inlatwin.
 *
 * Expands to two brace initializers (name, ARG_* tag, default-looking
 * string or NULL, help text), comma separated, no trailing comma.
 */
#define input_lattice_handling_command_line_macro() \
    { "-inlatdir", ARG_STRING, NULL, \
      "Input word-lattice directory with per-utt files for restricting words searched" }, \
    { "-inlatwin", ARG_INT32, "50", \
      "Input word-lattice words starting within +/- <this argument> of current frame considered during search" }
00615
/*
 * Flat-forward debugging option entries: -tracewhmm, -hmmdumpef,
 * -hmmdumpsf, -worddumpef, -worddumpsf.
 *
 * Expands to five brace initializers (name, ARG_* tag, default-looking
 * string or NULL, help text), comma separated, no trailing comma.
 * The four dump-frame entries all carry the same "200000000" string.
 */
#define flat_fwd_debugging_command_line_macro() \
    { "-tracewhmm", ARG_STRING, NULL, \
      "(Mode 3 only) Word whose active HMMs are to be traced (for debugging/diagnosis/analysis)" }, \
    { "-hmmdumpef", ARG_INT32, "200000000", \
      "(Mode 3 only) Ending frame for dumping all active HMMs (for debugging/diagnosis/analysis)" }, \
    { "-hmmdumpsf", ARG_INT32, "200000000", \
      "(Mode 3 only) Starting frame for dumping all active HMMs (for debugging/diagnosis/analysis)" }, \
    { "-worddumpef", ARG_INT32, "200000000", \
      "(Mode 3 only) Ending frame for dumping all active words (for debugging/diagnosis/analysis)" }, \
    { "-worddumpsf", ARG_INT32, "200000000", \
      "(Mode 3 only) Starting frame for dumping all active words (for debugging/diagnosis/analysis)" }
00637
00638
/*
 * Search-level option entries: -backtrace and -bestsenscrdir.
 *
 * Expands to two brace initializers (name, ARG_* tag, default-looking
 * string or NULL, help text), comma separated, no trailing comma.
 */
#define search_specific_command_line_macro() \
    { "-backtrace", ARG_BOOLEAN, "yes", \
      "Whether detailed backtrace information (word segmentation/scores) shown in log" }, \
    { "-bestsenscrdir", ARG_STRING, NULL, \
      "When Best senone score directory." }
00648
00649
00650
/*
 * Lexical-tree option entries labelled "Mode 4 only" in their help
 * text: -Nlextree and -epl.
 *
 * Expands to two brace initializers (name, ARG_INT32, default-looking
 * string, help text), comma separated, no trailing comma.
 */
#define search_modeTST_specific_command_line_macro() \
    { "-Nlextree", ARG_INT32, "3", \
      "(Mode 4 only) No. of lextrees to be instantiated; entries into them staggered in time" }, \
    { "-epl", ARG_INT32, "3", \
      "(Mode 4 only) Entries Per Lextree; #successive entries into one lextree before lextree-entries shifted to the next" }
00660
00661
/*
 * Single lexical-tree option entry labelled "Mode 5 only" in its help
 * text: -Nstalextree.
 *
 * Expands to one brace initializer (name, ARG_INT32, "25", help text)
 * with no trailing comma.
 */
#define search_modeWST_specific_command_line_macro() \
    { "-Nstalextree", ARG_INT32, "25", \
      "(Mode 5 only) No. of lextrees to be instantiated statically; " }
00667
/*
 * Partial-hypothesis option entries: -maxhyplen and -phypdump.
 *
 * Expands to two brace initializers (name, ARG_* tag, default-looking
 * string, help text), comma separated, no trailing comma.
 */
#define partial_hypothesis_command_line_macro() \
    { "-maxhyplen", ARG_INT32, "1000", \
      "(Live-decoder only) Maximum number of words in a partial hypothesis (for block decoding)" }, \
    { "-phypdump", ARG_BOOLEAN, "yes", \
      "(Live-decoder only) dump parital hypothesis on the screen"}
00677
/*
 * Single option entry for the per-utterance LM control file: -ctl_lm.
 *
 * Expands to one brace initializer (name, ARG_STRING, NULL, help text)
 * with no trailing comma.
 */
#define control_lm_file_command_line_macro() \
    { "-ctl_lm", ARG_STRING, NULL, \
      "(Not used in mode 2 and 3) Control file that list the corresponding LMs" }
00683
/*
 * Single option entry for the per-utterance MLLR control file:
 * -ctl_mllr.
 *
 * Expands to one brace initializer (name, ARG_STRING, NULL, help text)
 * with no trailing comma.
 */
#define control_mllr_file_command_line_macro() \
    { "-ctl_mllr", ARG_STRING, NULL, \
      "Control file that list the corresponding MLLR matrix for an utterance"}
00689
/*
 * Convenience bundle: the expansion of
 * control_lm_file_command_line_macro() followed by the expansion of
 * control_mllr_file_command_line_macro(), comma separated and with no
 * trailing comma.
 */
#define control_lm_mllr_file_command_line_macro() \
    control_lm_file_command_line_macro(), control_mllr_file_command_line_macro()
00693
00694 #endif