Operations on corpus. More...
#include <stdio.h>
#include <hash_table.h>
#include <profile.h>
#include <s3types.h>
Go to the source code of this file.
Classes | |
struct | utt_res_t |
A structure to store utterance-based resources. Assumes that most resources are string pointers; the strings themselves are pre-allocated elsewhere. More... | |
struct | corpus_t |
Structure for a corpus: essentially a set of strings, each associated with a unique ID (such as a reference sentence file, hypothesis file, and various control files). NOTE: IDs are CASE-SENSITIVE. More... | |
Defines | |
#define | utt_res_set_uttfile(ur, name) ur->uttfile=name |
#define | utt_res_set_lmname(ur, name) ur->lmname=name |
#define | utt_res_set_fsgname(ur, name) ur->fsgname=name |
#define | utt_res_set_regmatname(ur, name) ur->regmatname=name |
#define | utt_res_set_cb2mllrname(ur, name) ur->cb2mllrname=name |
Functions | |
utt_res_t * | new_utt_res (void) |
void | free_utt_res (utt_res_t *ur) |
void | report_utt_res (utt_res_t *ur) |
corpus_t * | corpus_load_headid (const char *file, int32(*validate)(char *str), int32(*dup_resolve)(char *s1, char *s2)) |
corpus_t * | corpus_load_tailid (const char *file, int32(*validate)(char *str), int32(*dup_resolve)(char *s1, char *s2)) |
char * | corpus_lookup (corpus_t *corp, const char *id) |
int32 | ctl_read_entry (FILE *fp, char *uttfile, int32 *sf, int32 *ef, char *uttid) |
S3DECODER_EXPORT ptmr_t | ctl_process (const char *ctlfile, const char *ctllmfile, const char *ctlmllrfile, int32 nskip, int32 count, void(*func)(void *kb, utt_res_t *ur, int32 sf, int32 ef, char *uttid), void *kb) |
S3DECODER_EXPORT ptmr_t | ctl_process_utt (const char *uttfile, int32 count, void(*func)(void *kb, utt_res_t *ur, int32 sf, int32 ef, char *uttid), void *kb) |
void | ctl_infile (char *file, const char *dir, const char *ext, const char *utt) |
void | ctl_outfile (char *file, const char *dir, const char *ext, const char *utt, const char *uttid) |
Operations on corpus.
#define utt_res_set_cb2mllrname | ( | ur, | |||
name | ) | ur->cb2mllrname=name |
#define utt_res_set_fsgname | ( | ur, | |||
name | ) | ur->fsgname=name |
#define utt_res_set_lmname | ( | ur, | |||
name | ) | ur->lmname=name |
#define utt_res_set_regmatname | ( | ur, | |||
name | ) | ur->regmatname=name |
#define utt_res_set_uttfile | ( | ur, | |||
name | ) | ur->uttfile=name |
corpus_t* corpus_load_headid | ( | const char * | file, | |
int32(*)(char *str) | validate, | |||
int32(*)(char *s1, char *s2) | dup_resolve | |||
) |
Load a corpus from the given file and return it. Each line is a separate entry in the corpus. Blank lines are skipped. The ID is the FIRST word in a line.
Validation:
validate is an optional, application-supplied function to determine if each input corpus data entry is eligible (valid) for inclusion in the final corpus. It should return an integer value signifying the following actions: 0: Not valid, skip the entry; !0: Valid, include the entry. If validate is NULL, every input entry is included in the corpus.
Duplicate resolution:
dup_resolve is an optional, application-supplied function to resolve duplicate keys (IDs). It may be NULL if none is available. If present, and a duplicate key is encountered, the function is invoked with the original and the duplicate corpus strings as arguments (s1 and s2, respectively). It should return an integer value signifying the following actions: 0: Retain the original string, discard the new one; >0: Replace the original string with the new one; <0: Error (causes a FATAL_ERROR). If dup_resolve is NULL, any duplicate ID causes a FATAL_ERROR.
Return value: Ptr to corpus if successful.
file | Input file name, the file must be seekable and rewindable |
corpus_t* corpus_load_tailid | ( | const char * | file, | |
int32(*)(char *str) | validate, | |||
int32(*)(char *s1, char *s2) | dup_resolve | |||
) |
Similar to corpus_load_headid, but the ID is at the END of each line, in parentheses.
file | Input file name, the file must be seekable and rewindable |
char* corpus_lookup | ( | corpus_t * | corp, | |
const char * | id | |||
) |
Lookup the given corpus for the given ID and return the associated string. Return NULL if ID not found.
void ctl_infile | ( | char * | file, | |
const char * | dir, | |||
const char * | ext, | |||
const char * | utt | |||
) |
Build a complete input filename from the given uttname, directory and file-extension: If utt begins with a / ignore dir, otherwise prefix dir/ to utt; If a non-empty file extension is provided, and utt doesn't already have that extension, append .ext to filename.
file | Out: Generated filename (allocated by caller) | |
dir | In: Optional directory spec if relative utt specified | |
ext | In: File extension to be appended to utt to generate complete filename | |
utt | In: Utterance file pathname, absolute or relative, with or without file extension. This is usually the first field in a control file |
void ctl_outfile | ( | char * | file, | |
const char * | dir, | |||
const char * | ext, | |||
const char * | utt, | |||
const char * | uttid | |||
) |
Build a complete output filename from the given components as follows: if dir ends with ,CTL and utt does not begin with /, the filename is dir/utt; if dir ends with ,CTL and utt DOES begin with /, the filename is utt; if dir does not end with ,CTL, the filename is dir/uttid. If a non-empty ext is specified, append .ext to the generated filename.
file | Out: Generated filename (allocated by caller) | |
dir | In: Directory for the generated filename; see comment for special handling of ,CTL suffix | |
ext | In: File-extension applied to the generated filename | |
utt | In: Utterance file pathname, absolute or relative, with or without extension. This is usually the first field in a control file. | |
uttid | In: Utterance ID (derived from the control file) |
S3DECODER_EXPORT ptmr_t ctl_process | ( | const char * | ctlfile, | |
const char * | ctllmfile, | |||
const char * | ctlmllrfile, | |||
int32 | nskip, | |||
int32 | count, | |||
void(*)(void *kb, utt_res_t *ur, int32 sf, int32 ef, char *uttid) | func, | |||
void * | kb | |||
) |
Process the given control file (or stdin if NULL): Skip the first nskip entries, and process the next count entries by calling the given function (*func) for each entry. Any error in reading the control file is FATAL. ctllmfile and ctlmllrfile can be specified optionally. If they are not specified, then NULL could be used.
Return value: ptmr_t structure containing cpu/elapsed time stats for the run.
ctlfile | In: Control file to read; use stdin if NULL | |
ctllmfile | In: Control file that specifies the lm used for the corresponding utterance | |
ctlmllrfile | In: Control file that specifies the mllr used for the corresponding utterance | |
nskip | In: No. of entries to skip at the head | |
count | In: No. of entries to process after nskip | |
func | In: Function to be invoked for each of the count entries processed. | |
kb | In: A catch-all data pointer to be passed as the first argument to func above |
Referenced by main().
S3DECODER_EXPORT ptmr_t ctl_process_utt | ( | const char * | uttfile, | |
int32 | count, | |||
void(*)(void *kb, utt_res_t *ur, int32 sf, int32 ef, char *uttid) | func, | |||
void * | kb | |||
) |
Like ctl_process, but process the single filename given (uttfile), count times. After each processing, wait for the time of modification on the given file to change. In this mode, the decoder can be used to process a dynamically generated sequence of utterances. To avoid race conditions, each new instance of the file should be created "in an instant": by creating it under a temporary name and finally renaming it to the given filename atomically.
uttfile | In: Filename to be processed (in its entirety) | |
count | In: No. of iterations to process uttfile | |
func | A function pointer that does the actual processing |
Referenced by main().
int32 ctl_read_entry | ( | FILE * | fp, | |
char * | uttfile, | |||
int32 * | sf, | |||
int32 * | ef, | |||
char * | uttid | |||
) |
Read another entry from an S3 format "control file" and parse its various fields. Blank lines and lines beginning with a hash-character (#) are omitted. Control file entry format: uttfile(usually cepstrum file) [startframe endframe [uttid]]. Any error in control file entry format is FATAL. Return value: 0 if successful, -1 if no more entries left.
fp | In: an input file pointer | |
uttfile | Out: (Cep)file containing utterance data | |
sf | Out: Start frame in uttfile; 0 if omitted | |
ef | Out: End frame in uttfile; -1 (signifying until EOF) if omitted | |
uttid | Out: Utterance ID (generated from uttfile/sf/ef if omitted) |
utt_res_t* new_utt_res | ( | void | ) |
This just returns a new utt_res_t