mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2025-01-04 09:56:33 +03:00
Merge branch 'master' of github.com:moses-smt/mosesdecoder
This commit is contained in:
commit
90b096b382
@ -196,7 +196,7 @@ class Moses():
|
||||
|
||||
|
||||
|
||||
def traverse_incrementally(self,table,models,load_lines,store_flag,inverted=False):
|
||||
def traverse_incrementally(self,table,models,load_lines,store_flag,inverted=False,lowmem=False):
|
||||
"""hack-ish way to find common phrase pairs in multiple models in one traversal without storing it all in memory
|
||||
relies on alphabetical sorting of phrase table.
|
||||
"""
|
||||
@ -209,7 +209,9 @@ class Moses():
|
||||
self.phrase_pairs = defaultdict(lambda: defaultdict(lambda: [[[0]*len(self.models) for i in range(self.number_of_features)],[]]))
|
||||
self.reordering_pairs = defaultdict(lambda: defaultdict(lambda: [[0]*len(self.models) for i in range(self.number_of_features)]))
|
||||
self.phrase_source = defaultdict(lambda: [0]*len(self.models))
|
||||
self.phrase_target = defaultdict(lambda: [0]*len(self.models))
|
||||
|
||||
if lowmem:
|
||||
self.phrase_target = defaultdict(lambda: [0]*len(self.models))
|
||||
|
||||
for model,priority,i in models:
|
||||
|
||||
@ -451,10 +453,10 @@ class Moses():
|
||||
return line
|
||||
|
||||
|
||||
def create_inverse(self,fobj):
|
||||
def create_inverse(self,fobj,tempdir=None):
|
||||
"""swap source and target phrase in the phrase table, and then sort (by target phrase)"""
|
||||
|
||||
inverse = NamedTemporaryFile(prefix='inv_unsorted',delete=False)
|
||||
inverse = NamedTemporaryFile(prefix='inv_unsorted',delete=False,dir=tempdir)
|
||||
swap = re.compile(b'(.+?) \|\|\| (.+?) \|\|\|')
|
||||
|
||||
# just swap source and target phrase, and leave order of scores etc. intact.
|
||||
@ -463,7 +465,7 @@ class Moses():
|
||||
inverse.write(swap.sub(b'\\2 ||| \\1 |||',line,1))
|
||||
inverse.close()
|
||||
|
||||
inverse_sorted = sort_file(inverse.name)
|
||||
inverse_sorted = sort_file(inverse.name,tempdir=tempdir)
|
||||
os.remove(inverse.name)
|
||||
|
||||
return inverse_sorted
|
||||
@ -1254,14 +1256,16 @@ def handle_file(filename,action,fileobj=None,mode='r'):
|
||||
fileobj.close()
|
||||
|
||||
|
||||
def sort_file(filename):
|
||||
def sort_file(filename,tempdir=None):
|
||||
"""Sort a file and return temporary file"""
|
||||
|
||||
cmd = ['sort', filename]
|
||||
env = {}
|
||||
env['LC_ALL'] = 'C'
|
||||
if tempdir:
|
||||
cmd.extend(['-T',tempdir])
|
||||
|
||||
outfile = NamedTemporaryFile(delete=False)
|
||||
outfile = NamedTemporaryFile(delete=False,dir=tempdir)
|
||||
sys.stderr.write('LC_ALL=C ' + ' '.join(cmd) + ' > ' + outfile.name + '\n')
|
||||
p = Popen(cmd,env=env,stdout=outfile.file)
|
||||
p.wait()
|
||||
@ -1344,6 +1348,8 @@ class Combine_TMs():
|
||||
|
||||
lowmem: low memory mode: instead of loading target phrase counts / probability (when required), process the original table and its inversion (source and target swapped) incrementally, then merge the two halves.
|
||||
|
||||
tempdir: temporary directory (for low memory mode).
|
||||
|
||||
there are a number of further configuration options that you can define, which modify the algorithm for linear interpolation. They have no effect in mode 'counts'
|
||||
|
||||
recompute_lexweights: don't directly interpolate lexical weights, but interpolate word translation probabilities instead and recompute the lexical weights.
|
||||
@ -1507,25 +1513,25 @@ class Combine_TMs():
|
||||
self.loaded['pt-target'] = 1
|
||||
|
||||
|
||||
def _inverse_wrapper(self,weights):
|
||||
def _inverse_wrapper(self,weights,tempdir=None):
|
||||
"""if we want to invert the phrase table to better calcualte p(s|t) and lex(s|t), manage creation, sorting and merging of inverted phrase tables"""
|
||||
|
||||
sys.stderr.write('Processing first table half\n')
|
||||
models = [(self.model_interface.open_table(model,'phrase-table'),priority,i) for (model,priority,i) in priority_sort_models(self.model_interface.models)]
|
||||
pt_half1 = NamedTemporaryFile(prefix='half1',delete=False)
|
||||
pt_half1 = NamedTemporaryFile(prefix='half1',delete=False,dir=tempdir)
|
||||
self._write_phrasetable(models,pt_half1,weights)
|
||||
pt_half1.seek(0)
|
||||
|
||||
sys.stderr.write('Inverting tables\n')
|
||||
models = [(self.model_interface.create_inverse(self.model_interface.open_table(model,'phrase-table')),priority,i) for (model,priority,i) in priority_sort_models(self.model_interface.models)]
|
||||
models = [(self.model_interface.create_inverse(self.model_interface.open_table(model,'phrase-table'),tempdir=tempdir),priority,i) for (model,priority,i) in priority_sort_models(self.model_interface.models)]
|
||||
sys.stderr.write('Processing second table half\n')
|
||||
pt_half2_inverted = NamedTemporaryFile(prefix='half2',delete=False)
|
||||
pt_half2_inverted = NamedTemporaryFile(prefix='half2',delete=False,dir=tempdir)
|
||||
self._write_phrasetable(models,pt_half2_inverted,weights,inverted=True)
|
||||
pt_half2_inverted.close()
|
||||
for model,priority,i in models:
|
||||
model.close()
|
||||
os.remove(model.name)
|
||||
pt_half2 = sort_file(pt_half2_inverted.name)
|
||||
pt_half2 = sort_file(pt_half2_inverted.name,tempdir=tempdir)
|
||||
os.remove(pt_half2_inverted.name)
|
||||
|
||||
sys.stderr.write('Merging tables: first half: {0} ; second half: {1} ; final table: {2}\n'.format(pt_half1.name,pt_half2.name,self.output_file))
|
||||
@ -1549,7 +1555,7 @@ class Combine_TMs():
|
||||
|
||||
i = 0
|
||||
sys.stderr.write('Incrementally loading and processing phrase tables...')
|
||||
for block in self.model_interface.traverse_incrementally('phrase-table',models,self.load_lines,store_flag,inverted=inverted):
|
||||
for block in self.model_interface.traverse_incrementally('phrase-table',models,self.load_lines,store_flag,inverted=inverted,lowmem=self.flags['lowmem']):
|
||||
|
||||
for src in sorted(self.model_interface.phrase_pairs, key = lambda x: x + b' |'):
|
||||
for target in sorted(self.model_interface.phrase_pairs[src], key = lambda x: x + b' |'):
|
||||
@ -1586,7 +1592,7 @@ class Combine_TMs():
|
||||
self._ensure_loaded(data)
|
||||
|
||||
if self.flags['lowmem'] and (self.mode == 'counts' or self.flags['normalized'] and self.flags['normalize_s_given_t'] == 't'):
|
||||
self._inverse_wrapper(weights)
|
||||
self._inverse_wrapper(weights,tempdir=self.flags['tempdir'])
|
||||
else:
|
||||
models = [(self.model_interface.open_table(model,'phrase-table'),priority,i) for (model,priority,i) in priority_sort_models(self.model_interface.models)]
|
||||
output_object = handle_file(self.output_file,'open',mode='w')
|
||||
@ -1631,12 +1637,12 @@ class Combine_TMs():
|
||||
sys.stderr.write('Error: only linear interpolation is supported for reordering model combination')
|
||||
|
||||
output_object = handle_file(self.output_file,'open',mode='w')
|
||||
models = [(self.open_table(model,table),priority,i) for (model,priority,i) in priority_sort_models(self.models)]
|
||||
models = [(self.model_interface.open_table(model,'reordering-table'),priority,i) for (model,priority,i) in priority_sort_models(self.models)]
|
||||
|
||||
i = 0
|
||||
|
||||
sys.stderr.write('Incrementally loading and processing phrase tables...')
|
||||
for block in self.model_interface.traverse_incrementally('reordering-table',models,self.model_interface.load_reordering_probabilities,'pairs'):
|
||||
for block in self.model_interface.traverse_incrementally('reordering-table',models,self.model_interface.load_reordering_probabilities,'pairs',lowmem=self.flags['lowmem']):
|
||||
|
||||
for src in sorted(self.model_interface.reordering_pairs):
|
||||
for target in sorted(self.model_interface.reordering_pairs[src]):
|
||||
@ -1829,6 +1835,11 @@ def parse_command_line():
|
||||
parser.add_argument('--recompute_lexweights', action="store_true",
|
||||
help=('don\'t directly interpolate lexical weights, but interpolate word translation probabilities instead and recompute the lexical weights. Only relevant in mode "interpolate".'))
|
||||
|
||||
parser.add_argument('--tempdir', type=str,
|
||||
default=None,
|
||||
help=('Temporary directory in --lowmem mode.'))
|
||||
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
if __name__ == "__main__":
|
||||
@ -1842,7 +1853,7 @@ if __name__ == "__main__":
|
||||
else:
|
||||
args = parse_command_line()
|
||||
#initialize
|
||||
combiner = Combine_TMs([(m,'primary') for m in args.model],weights=args.weights,mode=args.mode,output_file=args.output,reference_file=args.reference,output_lexical=args.output_lexical,lowmem=args.lowmem,normalized=args.normalized,recompute_lexweights=args.recompute_lexweights)
|
||||
combiner = Combine_TMs([(m,'primary') for m in args.model],weights=args.weights,mode=args.mode,output_file=args.output,reference_file=args.reference,output_lexical=args.output_lexical,lowmem=args.lowmem,normalized=args.normalized,recompute_lexweights=args.recompute_lexweights,tempdir=args.tempdir)
|
||||
# execute right method
|
||||
f_string = "combiner."+args.action+'()'
|
||||
exec(f_string)
|
||||
|
@ -87,7 +87,7 @@ uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_
|
||||
strncpy(reinterpret_cast<char*>(backing.vocab.get()), kMagicIncomplete, TotalHeaderSize(order));
|
||||
return reinterpret_cast<uint8_t*>(backing.vocab.get()) + TotalHeaderSize(order);
|
||||
} else {
|
||||
backing.vocab.reset(util::MapAnonymous(memory_size), memory_size, util::scoped_memory::MMAP_ALLOCATED);
|
||||
util::MapAnonymous(memory_size, backing.vocab);
|
||||
return reinterpret_cast<uint8_t*>(backing.vocab.get());
|
||||
}
|
||||
}
|
||||
@ -103,32 +103,44 @@ uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t
|
||||
throw e;
|
||||
}
|
||||
|
||||
if (config.write_method == Config::WRITE_AFTER) {
|
||||
util::MapAnonymous(memory_size, backing.search);
|
||||
return reinterpret_cast<uint8_t*>(backing.search.get());
|
||||
}
|
||||
// mmap it now.
|
||||
// We're skipping over the header and vocab for the search space mmap. mmap likes page aligned offsets, so some arithmetic to round the offset down.
|
||||
std::size_t page_size = util::SizePage();
|
||||
std::size_t alignment_cruft = adjusted_vocab % page_size;
|
||||
backing.search.reset(util::MapOrThrow(alignment_cruft + memory_size, true, util::kFileFlags, false, backing.file.get(), adjusted_vocab - alignment_cruft), alignment_cruft + memory_size, util::scoped_memory::MMAP_ALLOCATED);
|
||||
|
||||
return reinterpret_cast<uint8_t*>(backing.search.get()) + alignment_cruft;
|
||||
} else {
|
||||
backing.search.reset(util::MapAnonymous(memory_size), memory_size, util::scoped_memory::MMAP_ALLOCATED);
|
||||
util::MapAnonymous(memory_size, backing.search);
|
||||
return reinterpret_cast<uint8_t*>(backing.search.get());
|
||||
}
|
||||
}
|
||||
|
||||
void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, Backing &backing) {
|
||||
if (config.write_mmap) {
|
||||
util::SyncOrThrow(backing.search.get(), backing.search.size());
|
||||
util::SyncOrThrow(backing.vocab.get(), backing.vocab.size());
|
||||
// header and vocab share the same mmap. The header is written here because we know the counts.
|
||||
Parameters params = Parameters();
|
||||
params.counts = counts;
|
||||
params.fixed.order = counts.size();
|
||||
params.fixed.probing_multiplier = config.probing_multiplier;
|
||||
params.fixed.model_type = model_type;
|
||||
params.fixed.has_vocabulary = config.include_vocab;
|
||||
params.fixed.search_version = search_version;
|
||||
WriteHeader(backing.vocab.get(), params);
|
||||
void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, std::size_t vocab_pad, Backing &backing) {
|
||||
if (!config.write_mmap) return;
|
||||
util::SyncOrThrow(backing.vocab.get(), backing.vocab.size());
|
||||
switch (config.write_method) {
|
||||
case Config::WRITE_MMAP:
|
||||
util::SyncOrThrow(backing.search.get(), backing.search.size());
|
||||
break;
|
||||
case Config::WRITE_AFTER:
|
||||
util::SeekOrThrow(backing.file.get(), backing.vocab.size() + vocab_pad);
|
||||
util::WriteOrThrow(backing.file.get(), backing.search.get(), backing.search.size());
|
||||
util::FSyncOrThrow(backing.file.get());
|
||||
break;
|
||||
}
|
||||
// header and vocab share the same mmap. The header is written here because we know the counts.
|
||||
Parameters params = Parameters();
|
||||
params.counts = counts;
|
||||
params.fixed.order = counts.size();
|
||||
params.fixed.probing_multiplier = config.probing_multiplier;
|
||||
params.fixed.model_type = model_type;
|
||||
params.fixed.has_vocabulary = config.include_vocab;
|
||||
params.fixed.search_version = search_version;
|
||||
WriteHeader(backing.vocab.get(), params);
|
||||
}
|
||||
|
||||
namespace detail {
|
||||
@ -172,7 +184,7 @@ void ReadHeader(int fd, Parameters &out) {
|
||||
UTIL_THROW(FormatLoadException, "Binary format claims to have a probing multiplier of " << out.fixed.probing_multiplier << " which is < 1.0.");
|
||||
|
||||
out.counts.resize(static_cast<std::size_t>(out.fixed.order));
|
||||
util::ReadOrThrow(fd, &*out.counts.begin(), sizeof(uint64_t) * out.fixed.order);
|
||||
if (out.fixed.order) util::ReadOrThrow(fd, &*out.counts.begin(), sizeof(uint64_t) * out.fixed.order);
|
||||
}
|
||||
|
||||
void MatchCheck(ModelType model_type, unsigned int search_version, const Parameters ¶ms) {
|
||||
|
@ -58,7 +58,7 @@ uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t
|
||||
|
||||
// Write header to binary file. This is done last to prevent incomplete files
|
||||
// from loading.
|
||||
void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, Backing &backing);
|
||||
void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, std::size_t vocab_pad, Backing &backing);
|
||||
|
||||
namespace detail {
|
||||
|
||||
|
@ -18,11 +18,14 @@ namespace ngram {
|
||||
namespace {
|
||||
|
||||
void Usage(const char *name) {
|
||||
std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [-q bits] [-b bits] [-a bits] [type] input.arpa [output.mmap]\n\n"
|
||||
std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-w mmap|after] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [-q bits] [-b bits] [-a bits] [type] input.arpa [output.mmap]\n\n"
|
||||
"-u sets the log10 probability for <unk> if the ARPA file does not have one.\n"
|
||||
" Default is -100. The ARPA file will always take precedence.\n"
|
||||
"-s allows models to be built even if they do not have <s> and </s>.\n"
|
||||
"-i allows buggy models from IRSTLM by mapping positive log probability to 0.\n\n"
|
||||
"-i allows buggy models from IRSTLM by mapping positive log probability to 0.\n"
|
||||
"-w mmap|after determines how writing is done.\n"
|
||||
" mmap maps the binary file and writes to it. Default for trie.\n"
|
||||
" after allocates anonymous memory, builds, and writes. Default for probing.\n\n"
|
||||
"type is either probing or trie. Default is probing.\n\n"
|
||||
"probing uses a probing hash table. It is the fastest but uses the most memory.\n"
|
||||
"-p sets the space multiplier and must be >1.0. The default is 1.5.\n\n"
|
||||
@ -58,7 +61,7 @@ uint8_t ParseBitCount(const char *from) {
|
||||
unsigned long val = ParseUInt(from);
|
||||
if (val > 25) {
|
||||
util::ParseNumberException e(from);
|
||||
e << " bit counts are limited to 256.";
|
||||
e << " bit counts are limited to 25.";
|
||||
}
|
||||
return val;
|
||||
}
|
||||
@ -115,10 +118,10 @@ int main(int argc, char *argv[]) {
|
||||
using namespace lm::ngram;
|
||||
|
||||
try {
|
||||
bool quantize = false, set_backoff_bits = false, bhiksha = false;
|
||||
bool quantize = false, set_backoff_bits = false, bhiksha = false, set_write_method = false;
|
||||
lm::ngram::Config config;
|
||||
int opt;
|
||||
while ((opt = getopt(argc, argv, "siu:p:t:m:q:b:a:")) != -1) {
|
||||
while ((opt = getopt(argc, argv, "q:b:a:u:p:t:m:w:si")) != -1) {
|
||||
switch(opt) {
|
||||
case 'q':
|
||||
config.prob_bits = ParseBitCount(optarg);
|
||||
@ -132,6 +135,7 @@ int main(int argc, char *argv[]) {
|
||||
case 'a':
|
||||
config.pointer_bhiksha_bits = ParseBitCount(optarg);
|
||||
bhiksha = true;
|
||||
break;
|
||||
case 'u':
|
||||
config.unknown_missing_logprob = ParseFloat(optarg);
|
||||
break;
|
||||
@ -144,6 +148,16 @@ int main(int argc, char *argv[]) {
|
||||
case 'm':
|
||||
config.building_memory = ParseUInt(optarg) * 1048576;
|
||||
break;
|
||||
case 'w':
|
||||
set_write_method = true;
|
||||
if (!strcmp(optarg, "mmap")) {
|
||||
config.write_method = Config::WRITE_MMAP;
|
||||
} else if (!strcmp(optarg, "after")) {
|
||||
config.write_method = Config::WRITE_AFTER;
|
||||
} else {
|
||||
Usage(argv[0]);
|
||||
}
|
||||
break;
|
||||
case 's':
|
||||
config.sentence_marker_missing = lm::SILENT;
|
||||
break;
|
||||
@ -160,45 +174,45 @@ int main(int argc, char *argv[]) {
|
||||
}
|
||||
if (optind + 1 == argc) {
|
||||
ShowSizes(argv[optind], config);
|
||||
return 0;
|
||||
}
|
||||
const char *model_type, *from_file;
|
||||
if (optind + 2 == argc) {
|
||||
model_type = "probing";
|
||||
from_file = argv[optind];
|
||||
} else if (optind + 2 == argc) {
|
||||
config.write_mmap = argv[optind + 1];
|
||||
} else if (optind + 3 == argc) {
|
||||
model_type = argv[optind];
|
||||
from_file = argv[optind + 1];
|
||||
config.write_mmap = argv[optind + 2];
|
||||
} else {
|
||||
Usage(argv[0]);
|
||||
}
|
||||
if (!strcmp(model_type, "probing")) {
|
||||
if (quantize || set_backoff_bits) ProbingQuantizationUnsupported();
|
||||
ProbingModel(from_file, config);
|
||||
} else if (!strcmp(model_type, "trie")) {
|
||||
if (quantize) {
|
||||
if (bhiksha) {
|
||||
QuantArrayTrieModel(from_file, config);
|
||||
ProbingModel(argv[optind], config);
|
||||
} else if (optind + 3 == argc) {
|
||||
const char *model_type = argv[optind];
|
||||
const char *from_file = argv[optind + 1];
|
||||
config.write_mmap = argv[optind + 2];
|
||||
if (!strcmp(model_type, "probing")) {
|
||||
if (!set_write_method) config.write_method = Config::WRITE_AFTER;
|
||||
if (quantize || set_backoff_bits) ProbingQuantizationUnsupported();
|
||||
ProbingModel(from_file, config);
|
||||
} else if (!strcmp(model_type, "trie")) {
|
||||
if (!set_write_method) config.write_method = Config::WRITE_MMAP;
|
||||
if (quantize) {
|
||||
if (bhiksha) {
|
||||
QuantArrayTrieModel(from_file, config);
|
||||
} else {
|
||||
QuantTrieModel(from_file, config);
|
||||
}
|
||||
} else {
|
||||
QuantTrieModel(from_file, config);
|
||||
if (bhiksha) {
|
||||
ArrayTrieModel(from_file, config);
|
||||
} else {
|
||||
TrieModel(from_file, config);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (bhiksha) {
|
||||
ArrayTrieModel(from_file, config);
|
||||
} else {
|
||||
TrieModel(from_file, config);
|
||||
}
|
||||
Usage(argv[0]);
|
||||
}
|
||||
} else {
|
||||
Usage(argv[0]);
|
||||
}
|
||||
std::cerr << "Built " << config.write_mmap << " successfully." << std::endl;
|
||||
} catch (const std::exception &e) {
|
||||
}
|
||||
catch (const std::exception &e) {
|
||||
std::cerr << e.what() << std::endl;
|
||||
std::cerr << "ERROR" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::cerr << "SUCCESS" << std::endl;
|
||||
return 0;
|
||||
}
|
||||
|
@ -17,6 +17,7 @@ Config::Config() :
|
||||
temporary_directory_prefix(NULL),
|
||||
arpa_complain(ALL),
|
||||
write_mmap(NULL),
|
||||
write_method(WRITE_AFTER),
|
||||
include_vocab(true),
|
||||
prob_bits(8),
|
||||
backoff_bits(8),
|
||||
|
@ -70,9 +70,17 @@ struct Config {
|
||||
// to NULL to disable.
|
||||
const char *write_mmap;
|
||||
|
||||
typedef enum {
|
||||
WRITE_MMAP, // Map the file directly.
|
||||
WRITE_AFTER // Write after we're done.
|
||||
} WriteMethod;
|
||||
WriteMethod write_method;
|
||||
|
||||
// Include the vocab in the binary file? Only effective if write_mmap != NULL.
|
||||
bool include_vocab;
|
||||
|
||||
|
||||
|
||||
// Quantization options. Only effective for QuantTrieModel. One value is
|
||||
// reserved for each of prob and backoff, so 2^bits - 1 buckets will be used
|
||||
// to quantize (and one of the remaining backoffs will be 0).
|
||||
|
12
lm/model.cc
12
lm/model.cc
@ -46,7 +46,7 @@ template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::Ge
|
||||
|
||||
template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::InitializeFromBinary(void *start, const Parameters ¶ms, const Config &config, int fd) {
|
||||
SetupMemory(start, params.counts, config);
|
||||
vocab_.LoadedBinary(fd, config.enumerate_vocab);
|
||||
vocab_.LoadedBinary(params.fixed.has_vocabulary, fd, config.enumerate_vocab);
|
||||
search_.LoadedBinary();
|
||||
}
|
||||
|
||||
@ -82,7 +82,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
|
||||
search_.unigram.Unknown().backoff = 0.0;
|
||||
search_.unigram.Unknown().prob = config.unknown_missing_logprob;
|
||||
}
|
||||
FinishFile(config, kModelType, kVersion, counts, backing_);
|
||||
FinishFile(config, kModelType, kVersion, counts, vocab_.UnkCountChangePadding(), backing_);
|
||||
} catch (util::Exception &e) {
|
||||
e << " Byte: " << f.Offset();
|
||||
throw;
|
||||
@ -119,7 +119,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
|
||||
}
|
||||
float backoff;
|
||||
// i is the order of the backoff we're looking for.
|
||||
const Middle *mid_iter = search_.MiddleBegin() + start - 2;
|
||||
typename Search::MiddleIter mid_iter = search_.MiddleBegin() + start - 2;
|
||||
for (const WordIndex *i = context_rbegin + start - 1; i < context_rend; ++i, ++mid_iter) {
|
||||
if (!search_.LookupMiddleNoProb(*mid_iter, *i, backoff, node)) break;
|
||||
ret.prob += backoff;
|
||||
@ -139,7 +139,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
|
||||
search_.LookupUnigram(*context_rbegin, out_state.backoff[0], node, ignored);
|
||||
out_state.length = HasExtension(out_state.backoff[0]) ? 1 : 0;
|
||||
float *backoff_out = out_state.backoff + 1;
|
||||
const typename Search::Middle *mid = search_.MiddleBegin();
|
||||
typename Search::MiddleIter mid(search_.MiddleBegin());
|
||||
for (const WordIndex *i = context_rbegin + 1; i < context_rend; ++i, ++backoff_out, ++mid) {
|
||||
if (!search_.LookupMiddleNoProb(*mid, *i, *backoff_out, node)) {
|
||||
std::copy(context_rbegin, context_rbegin + out_state.length, out_state.words);
|
||||
@ -166,7 +166,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
|
||||
// If this function is called, then it does depend on left words.
|
||||
ret.independent_left = false;
|
||||
ret.extend_left = extend_pointer;
|
||||
const typename Search::Middle *mid_iter = search_.MiddleBegin() + extend_length - 1;
|
||||
typename Search::MiddleIter mid_iter(search_.MiddleBegin() + extend_length - 1);
|
||||
const WordIndex *i = add_rbegin;
|
||||
for (; ; ++i, ++backoff_out, ++mid_iter) {
|
||||
if (i == add_rend) {
|
||||
@ -235,7 +235,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
|
||||
|
||||
// Ok start by looking up the bigram.
|
||||
const WordIndex *hist_iter = context_rbegin;
|
||||
const typename Search::Middle *mid_iter = search_.MiddleBegin();
|
||||
typename Search::MiddleIter mid_iter(search_.MiddleBegin());
|
||||
for (; ; ++mid_iter, ++hist_iter, ++backoff_out) {
|
||||
if (hist_iter == context_rend) {
|
||||
// Ran out of history. Typically no backoff, but this could be a blank.
|
||||
|
@ -20,11 +20,11 @@ namespace ngram {
|
||||
|
||||
namespace {
|
||||
|
||||
void MakeBins(float *values, float *values_end, float *centers, uint32_t bins) {
|
||||
std::sort(values, values_end);
|
||||
const float *start = values, *finish;
|
||||
void MakeBins(std::vector<float> &values, float *centers, uint32_t bins) {
|
||||
std::sort(values.begin(), values.end());
|
||||
std::vector<float>::const_iterator start = values.begin(), finish;
|
||||
for (uint32_t i = 0; i < bins; ++i, ++centers, start = finish) {
|
||||
finish = values + (((values_end - values) * static_cast<uint64_t>(i + 1)) / bins);
|
||||
finish = values.begin() + ((values.size() * static_cast<uint64_t>(i + 1)) / bins);
|
||||
if (finish == start) {
|
||||
// zero length bucket.
|
||||
*centers = i ? *(centers - 1) : -std::numeric_limits<float>::infinity();
|
||||
@ -66,12 +66,12 @@ void SeparatelyQuantize::Train(uint8_t order, std::vector<float> &prob, std::vec
|
||||
float *centers = start_ + TableStart(order) + ProbTableLength();
|
||||
*(centers++) = kNoExtensionBackoff;
|
||||
*(centers++) = kExtensionBackoff;
|
||||
MakeBins(&*backoff.begin(), &*backoff.end(), centers, (1ULL << backoff_bits_) - 2);
|
||||
MakeBins(backoff, centers, (1ULL << backoff_bits_) - 2);
|
||||
}
|
||||
|
||||
void SeparatelyQuantize::TrainProb(uint8_t order, std::vector<float> &prob) {
|
||||
float *centers = start_ + TableStart(order);
|
||||
MakeBins(&*prob.begin(), &*prob.end(), centers, (1ULL << prob_bits_));
|
||||
MakeBins(prob, centers, (1ULL << prob_bits_));
|
||||
}
|
||||
|
||||
void SeparatelyQuantize::FinishedLoading(const Config &config) {
|
||||
|
@ -84,9 +84,11 @@ template <class Middle> void FixSRI(int lower, float negative_lower_prob, unsign
|
||||
}
|
||||
|
||||
template <class Voc, class Store, class Middle, class Activate> void ReadNGrams(util::FilePiece &f, const unsigned int n, const size_t count, const Voc &vocab, ProbBackoff *unigrams, std::vector<Middle> &middle, Activate activate, Store &store, PositiveProbWarn &warn) {
|
||||
assert(n >= 2);
|
||||
ReadNGramHeader(f, n);
|
||||
|
||||
// vocab ids of words in reverse order
|
||||
// Both vocab_ids and keys are non-empty because n >= 2.
|
||||
// vocab ids of words in reverse order.
|
||||
std::vector<WordIndex> vocab_ids(n);
|
||||
std::vector<uint64_t> keys(n-1);
|
||||
typename Store::Entry::Value value;
|
||||
@ -147,7 +149,7 @@ template <class MiddleT, class LongestT> uint8_t *TemplateHashedSearch<MiddleT,
|
||||
|
||||
template <class MiddleT, class LongestT> template <class Voc> void TemplateHashedSearch<MiddleT, LongestT>::InitializeFromARPA(const char * /*file*/, util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, Voc &vocab, Backing &backing) {
|
||||
// TODO: fix sorted.
|
||||
SetupMemory(GrowForSearch(config, 0, Size(counts, config), backing), counts, config);
|
||||
SetupMemory(GrowForSearch(config, vocab.UnkCountChangePadding(), Size(counts, config), backing), counts, config);
|
||||
|
||||
PositiveProbWarn warn(config.positive_log_probability);
|
||||
|
||||
|
@ -91,8 +91,10 @@ template <class MiddleT, class LongestT> class TemplateHashedSearch : public Has
|
||||
|
||||
template <class Voc> void InitializeFromARPA(const char *file, util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, Voc &vocab, Backing &backing);
|
||||
|
||||
const Middle *MiddleBegin() const { return &*middle_.begin(); }
|
||||
const Middle *MiddleEnd() const { return &*middle_.end(); }
|
||||
typedef typename std::vector<Middle>::const_iterator MiddleIter;
|
||||
|
||||
MiddleIter MiddleBegin() const { return middle_.begin(); }
|
||||
MiddleIter MiddleEnd() const { return middle_.end(); }
|
||||
|
||||
Node Unpack(uint64_t extend_pointer, unsigned char extend_length, float &prob) const {
|
||||
util::FloatEnc val;
|
||||
|
@ -197,7 +197,7 @@ class SRISucks {
|
||||
|
||||
void ObtainBackoffs(unsigned char total_order, FILE *unigram_file, RecordReader *reader) {
|
||||
for (unsigned char i = 0; i < kMaxOrder - 1; ++i) {
|
||||
it_[i] = &*values_[i].begin();
|
||||
it_[i] = values_[i].empty() ? NULL : &*values_[i].begin();
|
||||
}
|
||||
messages_[0].Apply(it_, unigram_file);
|
||||
BackoffMessages *messages = messages_ + 1;
|
||||
@ -229,8 +229,8 @@ class SRISucks {
|
||||
|
||||
class FindBlanks {
|
||||
public:
|
||||
FindBlanks(uint64_t *counts, unsigned char order, const ProbBackoff *unigrams, SRISucks &messages)
|
||||
: counts_(counts), longest_counts_(counts + order - 1), unigrams_(unigrams), sri_(messages) {}
|
||||
FindBlanks(unsigned char order, const ProbBackoff *unigrams, SRISucks &messages)
|
||||
: counts_(order), unigrams_(unigrams), sri_(messages) {}
|
||||
|
||||
float UnigramProb(WordIndex index) const {
|
||||
return unigrams_[index].prob;
|
||||
@ -250,7 +250,7 @@ class FindBlanks {
|
||||
}
|
||||
|
||||
void Longest(const void * /*data*/) {
|
||||
++*longest_counts_;
|
||||
++counts_.back();
|
||||
}
|
||||
|
||||
// Unigrams wrote one past.
|
||||
@ -258,8 +258,12 @@ class FindBlanks {
|
||||
--counts_[0];
|
||||
}
|
||||
|
||||
const std::vector<uint64_t> &Counts() const {
|
||||
return counts_;
|
||||
}
|
||||
|
||||
private:
|
||||
uint64_t *const counts_, *const longest_counts_;
|
||||
std::vector<uint64_t> counts_;
|
||||
|
||||
const ProbBackoff *unigrams_;
|
||||
|
||||
@ -473,14 +477,15 @@ template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::ve
|
||||
}
|
||||
|
||||
SRISucks sri;
|
||||
std::vector<uint64_t> fixed_counts(counts.size());
|
||||
std::vector<uint64_t> fixed_counts;
|
||||
util::scoped_FILE unigram_file;
|
||||
util::scoped_fd unigram_fd(files.StealUnigram());
|
||||
{
|
||||
util::scoped_memory unigrams;
|
||||
MapRead(util::POPULATE_OR_READ, unigram_fd.get(), 0, counts[0] * sizeof(ProbBackoff), unigrams);
|
||||
FindBlanks finder(&*fixed_counts.begin(), counts.size(), reinterpret_cast<const ProbBackoff*>(unigrams.get()), sri);
|
||||
FindBlanks finder(counts.size(), reinterpret_cast<const ProbBackoff*>(unigrams.get()), sri);
|
||||
RecursiveInsert(counts.size(), counts[0], inputs, config.messages, "Identifying n-grams omitted by SRI", finder);
|
||||
fixed_counts = finder.Counts();
|
||||
}
|
||||
unigram_file.reset(util::FDOpenOrThrow(unigram_fd));
|
||||
for (const RecordReader *i = inputs; i != inputs + counts.size() - 2; ++i) {
|
||||
|
@ -62,6 +62,8 @@ template <class Quant, class Bhiksha> class TrieSearch {
|
||||
|
||||
void LoadedBinary();
|
||||
|
||||
typedef const Middle *MiddleIter;
|
||||
|
||||
const Middle *MiddleBegin() const { return middle_begin_; }
|
||||
const Middle *MiddleEnd() const { return middle_end_; }
|
||||
|
||||
|
@ -83,7 +83,12 @@ FILE *WriteContextFile(uint8_t *begin, uint8_t *end, const util::TempMaker &make
|
||||
PartialIter context_begin(PartialViewProxy(begin + sizeof(WordIndex), entry_size, context_size));
|
||||
PartialIter context_end(PartialViewProxy(end + sizeof(WordIndex), entry_size, context_size));
|
||||
|
||||
std::sort(context_begin, context_end, util::SizedCompare<EntryCompare, PartialViewProxy>(EntryCompare(order - 1)));
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
std::stable_sort
|
||||
#else
|
||||
std::sort
|
||||
#endif
|
||||
(context_begin, context_end, util::SizedCompare<EntryCompare, PartialViewProxy>(EntryCompare(order - 1)));
|
||||
|
||||
util::scoped_FILE out(maker.MakeFile());
|
||||
|
||||
@ -157,7 +162,10 @@ void RecordReader::Overwrite(const void *start, std::size_t amount) {
|
||||
UTIL_THROW_IF(fseek(file_, internal - entry_size_, SEEK_CUR), util::ErrnoException, "Couldn't seek backwards for revision");
|
||||
WriteOrThrow(file_, start, amount);
|
||||
long forward = entry_size_ - internal - amount;
|
||||
if (forward) UTIL_THROW_IF(fseek(file_, forward, SEEK_CUR), util::ErrnoException, "Couldn't seek forwards past revision");
|
||||
#if !defined(_WIN32) && !defined(_WIN64)
|
||||
if (forward)
|
||||
#endif
|
||||
UTIL_THROW_IF(fseek(file_, forward, SEEK_CUR), util::ErrnoException, "Couldn't seek forwards past revision");
|
||||
}
|
||||
|
||||
void RecordReader::Rewind() {
|
||||
@ -244,8 +252,13 @@ void SortedFiles::ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vo
|
||||
}
|
||||
// Sort full records by full n-gram.
|
||||
util::SizedProxy proxy_begin(begin, entry_size), proxy_end(out_end, entry_size);
|
||||
// parallel_sort uses too much RAM
|
||||
std::sort(NGramIter(proxy_begin), NGramIter(proxy_end), util::SizedCompare<EntryCompare>(EntryCompare(order)));
|
||||
// parallel_sort uses too much RAM. TODO: figure out why windows sort doesn't like my proxies.
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
std::stable_sort
|
||||
#else
|
||||
std::sort
|
||||
#endif
|
||||
(NGramIter(proxy_begin), NGramIter(proxy_end), util::SizedCompare<EntryCompare>(EntryCompare(order)));
|
||||
files.push_back(DiskFlush(begin, out_end, maker));
|
||||
contexts.push_back(WriteContextFile(begin, out_end, maker, entry_size, order));
|
||||
|
||||
|
16
lm/vocab.cc
16
lm/vocab.cc
@ -125,8 +125,10 @@ WordIndex SortedVocabulary::Insert(const StringPiece &str) {
|
||||
|
||||
void SortedVocabulary::FinishedLoading(ProbBackoff *reorder_vocab) {
|
||||
if (enumerate_) {
|
||||
util::PairedIterator<ProbBackoff*, std::string*> values(reorder_vocab + 1, &*strings_to_enumerate_.begin());
|
||||
util::JointSort(begin_, end_, values);
|
||||
if (!strings_to_enumerate_.empty()) {
|
||||
util::PairedIterator<ProbBackoff*, std::string*> values(reorder_vocab + 1, &*strings_to_enumerate_.begin());
|
||||
util::JointSort(begin_, end_, values);
|
||||
}
|
||||
for (WordIndex i = 0; i < static_cast<WordIndex>(end_ - begin_); ++i) {
|
||||
// <unk> strikes again: +1 here.
|
||||
enumerate_->Add(i + 1, strings_to_enumerate_[i]);
|
||||
@ -142,11 +144,11 @@ void SortedVocabulary::FinishedLoading(ProbBackoff *reorder_vocab) {
|
||||
bound_ = end_ - begin_ + 1;
|
||||
}
|
||||
|
||||
void SortedVocabulary::LoadedBinary(int fd, EnumerateVocab *to) {
|
||||
void SortedVocabulary::LoadedBinary(bool have_words, int fd, EnumerateVocab *to) {
|
||||
end_ = begin_ + *(reinterpret_cast<const uint64_t*>(begin_) - 1);
|
||||
SetSpecial(Index("<s>"), Index("</s>"), 0);
|
||||
bound_ = end_ - begin_ + 1;
|
||||
ReadWords(fd, to, bound_);
|
||||
if (have_words) ReadWords(fd, to, bound_);
|
||||
}
|
||||
|
||||
namespace {
|
||||
@ -201,12 +203,12 @@ void ProbingVocabulary::FinishedLoading(ProbBackoff * /*reorder_vocab*/) {
|
||||
SetSpecial(Index("<s>"), Index("</s>"), 0);
|
||||
}
|
||||
|
||||
void ProbingVocabulary::LoadedBinary(int fd, EnumerateVocab *to) {
|
||||
void ProbingVocabulary::LoadedBinary(bool have_words, int fd, EnumerateVocab *to) {
|
||||
UTIL_THROW_IF(header_->version != kProbingVocabularyVersion, FormatLoadException, "The binary file has probing version " << header_->version << " but the code expects version " << kProbingVocabularyVersion << ". Please rerun build_binary using the same version of the code.");
|
||||
lookup_.LoadedBinary();
|
||||
bound_ = header_->bound;
|
||||
SetSpecial(Index("<s>"), Index("</s>"), 0);
|
||||
ReadWords(fd, to, bound_);
|
||||
if (have_words) ReadWords(fd, to, bound_);
|
||||
}
|
||||
|
||||
void MissingUnknown(const Config &config) throw(SpecialWordMissingException) {
|
||||
@ -229,7 +231,7 @@ void MissingSentenceMarker(const Config &config, const char *str) throw(SpecialW
|
||||
if (config.messages) *config.messages << "Missing special word " << str << "; will treat it as <unk>.";
|
||||
break;
|
||||
case THROW_UP:
|
||||
UTIL_THROW(SpecialWordMissingException, "The ARPA file is missing " << str << " and the model is configured to reject these models. If you built your APRA with IRSTLM and forgot to run add-start-end.sh, complain to <bertoldi at fbk.eu> stating that you think build-lm.sh should do this by default, then go back and retrain your model from the start. To bypass this check and treat " << str << " as an OOV, pass -s. The resulting model will not work with e.g. Moses.");
|
||||
UTIL_THROW(SpecialWordMissingException, "The ARPA file is missing " << str << " and the model is configured to reject these models. Run build_binary -s to disable this check.");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -82,7 +82,7 @@ class SortedVocabulary : public base::Vocabulary {
|
||||
|
||||
bool SawUnk() const { return saw_unk_; }
|
||||
|
||||
void LoadedBinary(int fd, EnumerateVocab *to);
|
||||
void LoadedBinary(bool have_words, int fd, EnumerateVocab *to);
|
||||
|
||||
private:
|
||||
uint64_t *begin_, *end_;
|
||||
@ -143,9 +143,11 @@ class ProbingVocabulary : public base::Vocabulary {
|
||||
|
||||
void FinishedLoading(ProbBackoff *reorder_vocab);
|
||||
|
||||
std::size_t UnkCountChangePadding() const { return 0; }
|
||||
|
||||
bool SawUnk() const { return saw_unk_; }
|
||||
|
||||
void LoadedBinary(int fd, EnumerateVocab *to);
|
||||
void LoadedBinary(bool have_words, int fd, EnumerateVocab *to);
|
||||
|
||||
private:
|
||||
typedef util::ProbingHashTable<ProbingVocabuaryEntry, util::IdentityHash> Lookup;
|
||||
|
@ -4,21 +4,90 @@
|
||||
#include <cmath>
|
||||
#include <climits>
|
||||
#include <fstream>
|
||||
#include <iterator>
|
||||
#include <iostream>
|
||||
#include <stdexcept>
|
||||
#include "Util.h"
|
||||
|
||||
namespace {
|
||||
|
||||
// configure regularisation
|
||||
const char KEY_REFLEN[] = "reflen";
|
||||
const char REFLEN_AVERAGE[] = "average";
|
||||
const char REFLEN_SHORTEST[] = "shortest";
|
||||
const char REFLEN_CLOSEST[] = "closest";
|
||||
|
||||
} // namespace
|
||||
|
||||
// A simple STL-map based n-gram counts.
|
||||
// Basically, we provide typical accessors and mutators, but
|
||||
// we intentionally do not allow erasing elements.
|
||||
class BleuScorer::NgramCounts {
|
||||
public:
|
||||
// Used to construct the ngram map
|
||||
struct NgramComparator {
|
||||
bool operator()(const vector<int>& a, const vector<int>& b) const {
|
||||
size_t i;
|
||||
const size_t as = a.size();
|
||||
const size_t bs = b.size();
|
||||
for (i = 0; i < as && i < bs; ++i) {
|
||||
if (a[i] < b[i]) {
|
||||
return true;
|
||||
}
|
||||
if (a[i] > b[i]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
// entries are equal, shortest wins
|
||||
return as < bs;
|
||||
}
|
||||
};
|
||||
|
||||
typedef vector<int> Key;
|
||||
typedef int Value;
|
||||
typedef map<Key, Value, NgramComparator>::iterator iterator;
|
||||
typedef map<Key, Value, NgramComparator>::const_iterator const_iterator;
|
||||
|
||||
NgramCounts() : kDefaultCount(1) { }
|
||||
virtual ~NgramCounts() { }
|
||||
|
||||
// If the specified "ngram" is found, we add counts.
|
||||
// If not, we insert the default count in the container.
|
||||
void add(const Key& ngram) {
|
||||
const_iterator it = find(ngram);
|
||||
if (it != end()) {
|
||||
m_counts[ngram] = it->second + 1;
|
||||
} else {
|
||||
m_counts[ngram] = kDefaultCount;
|
||||
}
|
||||
}
|
||||
|
||||
void clear() { m_counts.clear(); }
|
||||
|
||||
bool empty() const { return m_counts.empty(); }
|
||||
|
||||
size_t size() const { return m_counts.size(); }
|
||||
size_t max_size() const { return m_counts.max_size(); }
|
||||
|
||||
iterator find(const Key& ngram) { return m_counts.find(ngram); }
|
||||
const_iterator find(const Key& ngram) const { return m_counts.find(ngram); }
|
||||
|
||||
Value& operator[](const Key& ngram) { return m_counts[ngram]; }
|
||||
|
||||
iterator begin() { return m_counts.begin(); }
|
||||
const_iterator begin() const { return m_counts.begin(); }
|
||||
iterator end() { return m_counts.end(); }
|
||||
const_iterator end() const { return m_counts.end(); }
|
||||
|
||||
private:
|
||||
const int kDefaultCount;
|
||||
map<Key, Value, NgramComparator> m_counts;
|
||||
};
|
||||
|
||||
BleuScorer::BleuScorer(const string& config)
|
||||
: StatisticsBasedScorer("BLEU",config),
|
||||
: StatisticsBasedScorer("BLEU", config),
|
||||
kLENGTH(4),
|
||||
m_ref_length_type(CLOSEST) {
|
||||
//configure regularisation
|
||||
static string KEY_REFLEN = "reflen";
|
||||
static string REFLEN_AVERAGE = "average";
|
||||
static string REFLEN_SHORTEST = "shortest";
|
||||
static string REFLEN_CLOSEST = "closest";
|
||||
|
||||
string reflen = getConfig(KEY_REFLEN,REFLEN_CLOSEST);
|
||||
const string reflen = getConfig(KEY_REFLEN, REFLEN_CLOSEST);
|
||||
if (reflen == REFLEN_AVERAGE) {
|
||||
m_ref_length_type = AVERAGE;
|
||||
} else if (reflen == REFLEN_SHORTEST) {
|
||||
@ -28,18 +97,15 @@ BleuScorer::BleuScorer(const string& config)
|
||||
} else {
|
||||
throw runtime_error("Unknown reference length strategy: " + reflen);
|
||||
}
|
||||
// cerr << "Using reference length strategy: " << reflen << endl;
|
||||
}
|
||||
|
||||
BleuScorer::~BleuScorer() {}
|
||||
|
||||
size_t BleuScorer::countNgrams(const string& line, counts_t& counts, unsigned int n)
|
||||
size_t BleuScorer::countNgrams(const string& line, NgramCounts& counts,
|
||||
unsigned int n)
|
||||
{
|
||||
vector<int> encoded_tokens;
|
||||
//cerr << line << endl;
|
||||
TokenizeAndEncode(line, encoded_tokens);
|
||||
//copy(encoded_tokens.begin(), encoded_tokens.end(), ostream_iterator<int>(cerr," "));
|
||||
//cerr << endl;
|
||||
for (size_t k = 1; k <= n; ++k) {
|
||||
//ngram order longer than sentence - no point
|
||||
if (k > encoded_tokens.size()) {
|
||||
@ -50,18 +116,9 @@ size_t BleuScorer::countNgrams(const string& line, counts_t& counts, unsigned in
|
||||
for (size_t j = i; j < i+k && j < encoded_tokens.size(); ++j) {
|
||||
ngram.push_back(encoded_tokens[j]);
|
||||
}
|
||||
int count = 1;
|
||||
counts_iterator oldcount = counts.find(ngram);
|
||||
if (oldcount != counts.end()) {
|
||||
count = (oldcount->second) + 1;
|
||||
}
|
||||
//cerr << count << endl;
|
||||
counts[ngram] = count;
|
||||
//cerr << endl;
|
||||
counts.add(ngram);
|
||||
}
|
||||
}
|
||||
//cerr << "counted ngrams" << endl;
|
||||
//dump_counts(counts);
|
||||
return encoded_tokens.size();
|
||||
}
|
||||
|
||||
@ -82,9 +139,9 @@ void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
|
||||
string line;
|
||||
size_t sid = 0; //sentence counter
|
||||
while (getline(refin,line)) {
|
||||
//cerr << line << endl;
|
||||
line = this->applyFactors(line);
|
||||
if (i == 0) {
|
||||
counts_t *counts = new counts_t; //these get leaked
|
||||
NgramCounts *counts = new NgramCounts; //these get leaked
|
||||
m_ref_counts.push_back(counts);
|
||||
vector<size_t> lengths;
|
||||
m_ref_lengths.push_back(lengths);
|
||||
@ -92,11 +149,12 @@ void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
|
||||
if (m_ref_counts.size() <= sid) {
|
||||
throw runtime_error("File " + referenceFiles[i] + " has too many sentences");
|
||||
}
|
||||
counts_t counts;
|
||||
size_t length = countNgrams(line,counts,kLENGTH);
|
||||
NgramCounts counts;
|
||||
size_t length = countNgrams(line, counts, kLENGTH);
|
||||
|
||||
//for any counts larger than those already there, merge them in
|
||||
for (counts_iterator ci = counts.begin(); ci != counts.end(); ++ci) {
|
||||
counts_iterator oldcount_it = m_ref_counts[sid]->find(ci->first);
|
||||
for (NgramCounts::const_iterator ci = counts.begin(); ci != counts.end(); ++ci) {
|
||||
NgramCounts::const_iterator oldcount_it = m_ref_counts[sid]->find(ci->first);
|
||||
int oldcount = 0;
|
||||
if (oldcount_it != m_ref_counts[sid]->end()) {
|
||||
oldcount = oldcount_it->second;
|
||||
@ -113,83 +171,56 @@ void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
|
||||
}
|
||||
++sid;
|
||||
}
|
||||
TRACE_ERR(endl);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
|
||||
{
|
||||
// cerr << text << endl;
|
||||
// cerr << sid << endl;
|
||||
//dump_counts(*m_ref_counts[sid]);
|
||||
if (sid >= m_ref_counts.size()) {
|
||||
stringstream msg;
|
||||
msg << "Sentence id (" << sid << ") not found in reference set";
|
||||
throw runtime_error(msg.str());
|
||||
}
|
||||
counts_t testcounts;
|
||||
//stats for this line
|
||||
vector<float> stats(kLENGTH*2);;
|
||||
size_t length = countNgrams(text,testcounts,kLENGTH);
|
||||
//dump_counts(testcounts);
|
||||
if (m_ref_length_type == SHORTEST) {
|
||||
//cerr << reflengths.size() << " " << sid << endl;
|
||||
int shortest = *min_element(m_ref_lengths[sid].begin(), m_ref_lengths[sid].end());
|
||||
stats.push_back(shortest);
|
||||
} else if (m_ref_length_type == AVERAGE) {
|
||||
int total = 0;
|
||||
for (size_t i = 0; i < m_ref_lengths[sid].size(); ++i) {
|
||||
total += m_ref_lengths[sid][i];
|
||||
}
|
||||
const float mean = static_cast<float>(total) / m_ref_lengths[sid].size();
|
||||
stats.push_back(mean);
|
||||
} else if (m_ref_length_type == CLOSEST) {
|
||||
int min_diff = INT_MAX;
|
||||
int min_idx = 0;
|
||||
for (size_t i = 0; i < m_ref_lengths[sid].size(); ++i) {
|
||||
const int reflength = m_ref_lengths[sid][i];
|
||||
const int diff = reflength - static_cast<int>(length);
|
||||
const int absolute_diff = abs(diff) - abs(min_diff);
|
||||
NgramCounts testcounts;
|
||||
// stats for this line
|
||||
vector<ScoreStatsType> stats(kLENGTH * 2);
|
||||
string sentence = this->applyFactors(text);
|
||||
const size_t length = countNgrams(sentence, testcounts, kLENGTH);
|
||||
|
||||
if (absolute_diff < 0) { //look for the closest reference
|
||||
min_diff = diff;
|
||||
min_idx = i;
|
||||
} else if (absolute_diff == 0) { // if two references has the same closest length, take the shortest
|
||||
if (reflength < static_cast<int>(m_ref_lengths[sid][min_idx])) {
|
||||
min_idx = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
stats.push_back(m_ref_lengths[sid][min_idx]);
|
||||
} else {
|
||||
throw runtime_error("Unsupported reflength strategy");
|
||||
// Calculate effective reference length.
|
||||
switch (m_ref_length_type) {
|
||||
case SHORTEST:
|
||||
CalcShortest(sid, stats);
|
||||
break;
|
||||
case AVERAGE:
|
||||
CalcAverage(sid, stats);
|
||||
break;
|
||||
case CLOSEST:
|
||||
CalcClosest(sid, length, stats);
|
||||
break;
|
||||
default:
|
||||
throw runtime_error("Unsupported reflength strategy");
|
||||
}
|
||||
//cerr << "computed length" << endl;
|
||||
|
||||
//precision on each ngram type
|
||||
for (counts_iterator testcounts_it = testcounts.begin();
|
||||
for (NgramCounts::const_iterator testcounts_it = testcounts.begin();
|
||||
testcounts_it != testcounts.end(); ++testcounts_it) {
|
||||
counts_iterator refcounts_it = m_ref_counts[sid]->find(testcounts_it->first);
|
||||
NgramCounts::const_iterator refcounts_it = m_ref_counts[sid]->find(testcounts_it->first);
|
||||
int correct = 0;
|
||||
int guess = testcounts_it->second;
|
||||
const int guess = testcounts_it->second;
|
||||
if (refcounts_it != m_ref_counts[sid]->end()) {
|
||||
correct = min(refcounts_it->second,guess);
|
||||
}
|
||||
size_t len = testcounts_it->first.size();
|
||||
const size_t len = testcounts_it->first.size();
|
||||
stats[len*2-2] += correct;
|
||||
stats[len*2-1] += guess;
|
||||
}
|
||||
stringstream sout;
|
||||
copy(stats.begin(),stats.end(),ostream_iterator<float>(sout," "));
|
||||
//TRACE_ERR(sout.str() << endl);
|
||||
string stats_str = sout.str();
|
||||
entry.set(stats_str);
|
||||
entry.set(stats);
|
||||
}
|
||||
|
||||
float BleuScorer::calculateScore(const vector<int>& comps) const
|
||||
{
|
||||
//cerr << "BLEU: ";
|
||||
//copy(comps.begin(),comps.end(), ostream_iterator<int>(cerr," "));
|
||||
float logbleu = 0.0;
|
||||
for (int i = 0; i < kLENGTH; ++i) {
|
||||
if (comps[2*i] == 0) {
|
||||
@ -203,15 +234,64 @@ float BleuScorer::calculateScore(const vector<int>& comps) const
|
||||
if (brevity < 0.0) {
|
||||
logbleu += brevity;
|
||||
}
|
||||
//cerr << " " << exp(logbleu) << endl;
|
||||
return exp(logbleu);
|
||||
}
|
||||
|
||||
void BleuScorer::dump_counts(counts_t& counts) const {
|
||||
for (counts_const_iterator i = counts.begin(); i != counts.end(); ++i) {
|
||||
cerr << "(";
|
||||
copy(i->first.begin(), i->first.end(), ostream_iterator<int>(cerr," "));
|
||||
cerr << ") " << i->second << ", ";
|
||||
void BleuScorer::dump_counts(ostream* os,
|
||||
const NgramCounts& counts) const {
|
||||
for (NgramCounts::const_iterator it = counts.begin();
|
||||
it != counts.end(); ++it) {
|
||||
*os << "(";
|
||||
const NgramCounts::Key& keys = it->first;
|
||||
for (size_t i = 0; i < keys.size(); ++i) {
|
||||
if (i != 0) {
|
||||
*os << " ";
|
||||
}
|
||||
*os << keys[i];
|
||||
}
|
||||
*os << ") : " << it->second << ", ";
|
||||
}
|
||||
cerr << endl;
|
||||
*os << endl;
|
||||
}
|
||||
|
||||
void BleuScorer::CalcAverage(size_t sentence_id,
|
||||
vector<ScoreStatsType>& stats) const {
|
||||
int total = 0;
|
||||
for (size_t i = 0;
|
||||
i < m_ref_lengths[sentence_id].size(); ++i) {
|
||||
total += m_ref_lengths[sentence_id][i];
|
||||
}
|
||||
const float mean = static_cast<float>(total) /
|
||||
m_ref_lengths[sentence_id].size();
|
||||
stats.push_back(static_cast<ScoreStatsType>(mean));
|
||||
}
|
||||
|
||||
void BleuScorer::CalcClosest(size_t sentence_id,
|
||||
size_t length,
|
||||
vector<ScoreStatsType>& stats) const {
|
||||
int min_diff = INT_MAX;
|
||||
int min_idx = 0;
|
||||
for (size_t i = 0; i < m_ref_lengths[sentence_id].size(); ++i) {
|
||||
const int reflength = m_ref_lengths[sentence_id][i];
|
||||
const int length_diff = abs(reflength - static_cast<int>(length));
|
||||
|
||||
// Look for the closest reference
|
||||
if (length_diff < abs(min_diff)) {
|
||||
min_diff = reflength - length;
|
||||
min_idx = i;
|
||||
// if two references have the same closest length, take the shortest
|
||||
} else if (length_diff == abs(min_diff)) {
|
||||
if (reflength < static_cast<int>(m_ref_lengths[sentence_id][min_idx])) {
|
||||
min_idx = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
stats.push_back(m_ref_lengths[sentence_id][min_idx]);
|
||||
}
|
||||
|
||||
void BleuScorer::CalcShortest(size_t sentence_id,
|
||||
vector<ScoreStatsType>& stats) const {
|
||||
const int shortest = *min_element(m_ref_lengths[sentence_id].begin(),
|
||||
m_ref_lengths[sentence_id].end());
|
||||
stats.push_back(shortest);
|
||||
}
|
||||
|
@ -1,7 +1,7 @@
|
||||
#ifndef MERT_BLEU_SCORER_H_
|
||||
#define MERT_BLEU_SCORER_H_
|
||||
|
||||
#include <iostream>
|
||||
#include <ostream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
@ -24,55 +24,42 @@ public:
|
||||
virtual void setReferenceFiles(const vector<string>& referenceFiles);
|
||||
virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry);
|
||||
virtual float calculateScore(const vector<int>& comps) const;
|
||||
|
||||
virtual size_t NumberOfScores() const {
|
||||
return 2 * kLENGTH + 1;
|
||||
}
|
||||
virtual size_t NumberOfScores() const { return 2 * kLENGTH + 1; }
|
||||
|
||||
private:
|
||||
enum ReferenceLengthType {
|
||||
AVERAGE,
|
||||
SHORTEST,
|
||||
CLOSEST,
|
||||
CLOSEST
|
||||
};
|
||||
|
||||
//Used to construct the ngram map
|
||||
struct CompareNgrams {
|
||||
bool operator()(const vector<int>& a, const vector<int>& b) const {
|
||||
size_t i;
|
||||
const size_t as = a.size();
|
||||
const size_t bs = b.size();
|
||||
for (i = 0; i < as && i < bs; ++i) {
|
||||
if (a[i] < b[i]) {
|
||||
//cerr << "true" << endl;
|
||||
return true;
|
||||
}
|
||||
if (a[i] > b[i]) {
|
||||
//cerr << "false" << endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
//entries are equal, shortest wins
|
||||
return as < bs;;
|
||||
}
|
||||
};
|
||||
|
||||
typedef map<vector<int>,int,CompareNgrams> counts_t;
|
||||
typedef map<vector<int>,int,CompareNgrams>::iterator counts_iterator;
|
||||
typedef map<vector<int>,int,CompareNgrams>::const_iterator counts_const_iterator;
|
||||
/**
|
||||
* A NgramCounts is a key-value store.
|
||||
* Clients don't have to worry about the actual implementation
|
||||
* since this type is used internally only.
|
||||
*/
|
||||
class NgramCounts;
|
||||
|
||||
/**
|
||||
* Count the ngrams of each type, up to the given length in the input line.
|
||||
*/
|
||||
size_t countNgrams(const string& line, counts_t& counts, unsigned int n);
|
||||
size_t countNgrams(const string& line, NgramCounts& counts, unsigned int n);
|
||||
|
||||
void dump_counts(counts_t& counts) const;
|
||||
void dump_counts(std::ostream* os, const NgramCounts& counts) const;
|
||||
|
||||
// For calculating effective reference length.
|
||||
void CalcAverage(size_t sentence_id,
|
||||
vector<ScoreStatsType>& stats) const;
|
||||
void CalcClosest(size_t sentence_id, size_t length,
|
||||
vector<ScoreStatsType>& stats) const;
|
||||
void CalcShortest(size_t sentence_id,
|
||||
vector<ScoreStatsType>& stats) const;
|
||||
|
||||
const int kLENGTH;
|
||||
ReferenceLengthType m_ref_length_type;
|
||||
|
||||
// data extracted from reference files
|
||||
ScopedVector<counts_t> m_ref_counts;
|
||||
ScopedVector<NgramCounts> m_ref_counts;
|
||||
vector<vector<size_t> > m_ref_lengths;
|
||||
|
||||
// no copying allowed
|
||||
|
@ -1,6 +1,6 @@
|
||||
#include "CderScorer.h"
|
||||
|
||||
#include <iterator>
|
||||
#include <algorithm>
|
||||
#include <fstream>
|
||||
#include <stdexcept>
|
||||
|
||||
@ -31,6 +31,7 @@ void CderScorer::setReferenceFiles(const vector<string>& referenceFiles)
|
||||
m_ref_sentences.push_back(vector<sent_t>());
|
||||
string line;
|
||||
while (getline(refin,line)) {
|
||||
line = this->applyFactors(line);
|
||||
sent_t encoded;
|
||||
TokenizeAndEncode(line, encoded);
|
||||
m_ref_sentences[rid].push_back(encoded);
|
||||
@ -40,13 +41,11 @@ void CderScorer::setReferenceFiles(const vector<string>& referenceFiles)
|
||||
|
||||
void CderScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
|
||||
{
|
||||
vector<int> stats;
|
||||
prepareStatsVector(sid, text, stats);
|
||||
string sentence = this->applyFactors(text);
|
||||
|
||||
stringstream sout;
|
||||
copy(stats.begin(), stats.end(), ostream_iterator<float>(sout," "));
|
||||
string stats_str = sout.str();
|
||||
entry.set(stats_str);
|
||||
vector<int> stats;
|
||||
prepareStatsVector(sid, sentence, stats);
|
||||
entry.set(stats);
|
||||
}
|
||||
|
||||
void CderScorer::prepareStatsVector(size_t sid, const string& text, vector<int>& stats)
|
||||
@ -55,9 +54,11 @@ void CderScorer::prepareStatsVector(size_t sid, const string& text, vector<int>&
|
||||
TokenizeAndEncode(text, cand);
|
||||
|
||||
float max = -2;
|
||||
vector<int> tmp;
|
||||
for (size_t rid = 0; rid < m_ref_sentences.size(); ++rid) {
|
||||
sent_t& ref = m_ref_sentences[rid][sid];
|
||||
vector<int> tmp = computeCD(cand, ref);
|
||||
const sent_t& ref = m_ref_sentences[rid][sid];
|
||||
tmp.clear();
|
||||
computeCD(cand, ref, tmp);
|
||||
if (calculateScore(tmp) > max) {
|
||||
stats = tmp;
|
||||
}
|
||||
@ -66,16 +67,14 @@ void CderScorer::prepareStatsVector(size_t sid, const string& text, vector<int>&
|
||||
|
||||
float CderScorer::calculateScore(const vector<int>& comps) const
|
||||
{
|
||||
if (comps.size() != 2)
|
||||
{
|
||||
if (comps.size() != 2) {
|
||||
throw runtime_error("Size of stat vector for CDER is not 2");
|
||||
}
|
||||
|
||||
return 1 - (comps[0] / static_cast<float>(comps[1]));
|
||||
return 1.0f - (comps[0] / static_cast<float>(comps[1]));
|
||||
}
|
||||
|
||||
vector<int> CderScorer::computeCD(const sent_t& cand, const sent_t& ref) const
|
||||
{
|
||||
void CderScorer::computeCD(const sent_t& cand, const sent_t& ref,
|
||||
vector<int>& stats) const {
|
||||
int I = cand.size() + 1; // Number of inter-words positions in candidate sentence
|
||||
int L = ref.size() + 1; // Number of inter-words positions in reference sentence
|
||||
|
||||
@ -113,10 +112,9 @@ vector<int> CderScorer::computeCD(const sent_t& cand, const sent_t& ref) const
|
||||
row = nextRow;
|
||||
}
|
||||
|
||||
vector<int> stats(2);
|
||||
stats.resize(2);
|
||||
stats[0] = *(row->rbegin()); // CD distance is the cost of path from (0,0) to (I,L)
|
||||
stats[1] = ref.size();
|
||||
|
||||
delete row;
|
||||
return stats;
|
||||
}
|
||||
|
@ -1,8 +1,6 @@
|
||||
#ifndef MERT_CDER_SCORER_H_
|
||||
#define MERT_CDER_SCORER_H_
|
||||
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "Types.h"
|
||||
@ -10,9 +8,8 @@
|
||||
|
||||
using namespace std;
|
||||
|
||||
class CderScorer: public StatisticsBasedScorer
|
||||
{
|
||||
public:
|
||||
class CderScorer: public StatisticsBasedScorer {
|
||||
public:
|
||||
explicit CderScorer(const string& config);
|
||||
~CderScorer();
|
||||
|
||||
@ -22,17 +19,16 @@ public:
|
||||
|
||||
virtual void prepareStatsVector(size_t sid, const string& text, vector<int>& stats);
|
||||
|
||||
virtual size_t NumberOfScores() const {
|
||||
return 2;
|
||||
}
|
||||
virtual size_t NumberOfScores() const { return 2; }
|
||||
|
||||
virtual float calculateScore(const vector<int>& comps) const;
|
||||
|
||||
private:
|
||||
private:
|
||||
typedef vector<int> sent_t;
|
||||
vector<vector<sent_t> > m_ref_sentences;
|
||||
|
||||
vector<int> computeCD(const sent_t& cand, const sent_t& ref) const;
|
||||
void computeCD(const sent_t& cand, const sent_t& ref,
|
||||
vector<int>& stats) const;
|
||||
|
||||
// no copying allowed
|
||||
CderScorer(const CderScorer&);
|
||||
|
182
mert/InterpolatedScorer.cpp
Normal file
182
mert/InterpolatedScorer.cpp
Normal file
@ -0,0 +1,182 @@
|
||||
#include "ScorerFactory.h"
|
||||
#include "InterpolatedScorer.h"
|
||||
#include "Util.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
|
||||
InterpolatedScorer::InterpolatedScorer (const string& name, const string& config): Scorer(name,config)
|
||||
{
|
||||
|
||||
// name would be: HAMMING,BLEU or similar
|
||||
string scorers = name;
|
||||
while (scorers.length() > 0) {
|
||||
string scorertype = "";
|
||||
getNextPound(scorers,scorertype,",");
|
||||
Scorer *theScorer=ScorerFactory::getScorer(scorertype,config);
|
||||
_scorers.push_back(theScorer);
|
||||
}
|
||||
if (_scorers.size() == 0) {
|
||||
throw runtime_error("There are no scorers");
|
||||
}
|
||||
cerr << "Number of scorers: " << _scorers.size() << endl;
|
||||
|
||||
//TODO debug this
|
||||
string wtype = getConfig("weights","");
|
||||
//Default weights are uniform, i.e. with two scorers each weight is 0.5
|
||||
//weights should add to 1
|
||||
if (wtype.length() == 0) {
|
||||
float weight = 1.0/_scorers.size() ;
|
||||
//cout << " Default weights:" << weight << endl;
|
||||
for (size_t i = 0; i < _scorers.size(); i ++) {
|
||||
_scorerWeights.push_back(weight);
|
||||
}
|
||||
} else {
|
||||
float tot=0;
|
||||
//cout << "Defined weights:" << endl;
|
||||
while (wtype.length() > 0) {
|
||||
string scoreweight = "";
|
||||
getNextPound(wtype,scoreweight,"+");
|
||||
float weight = atof(scoreweight.c_str());
|
||||
_scorerWeights.push_back(weight);
|
||||
tot += weight;
|
||||
//cout << " :" << weight ;
|
||||
}
|
||||
//cout << endl;
|
||||
if (tot != float(1)) {
|
||||
for (vector<float>::iterator it = _scorerWeights.begin(); it != _scorerWeights.end(); ++it)
|
||||
{
|
||||
*it /= tot;
|
||||
}
|
||||
}
|
||||
|
||||
if (_scorers.size() != _scorerWeights.size()) {
|
||||
throw runtime_error("The number of weights does not equal the number of scorers!");
|
||||
}
|
||||
}
|
||||
cerr << "The weights for the interpolated scorers are: " << endl;
|
||||
for (vector<float>::iterator it = _scorerWeights.begin(); it < _scorerWeights.end(); it++) {
|
||||
cerr << *it << " " ;
|
||||
}
|
||||
cerr <<endl;
|
||||
}
|
||||
|
||||
void InterpolatedScorer::setScoreData(ScoreData* data)
|
||||
{
|
||||
size_t last = 0;
|
||||
m_score_data = data;
|
||||
for (ScopedVector<Scorer>::iterator itsc = _scorers.begin(); itsc!=_scorers.end(); itsc++) {
|
||||
int numScoresScorer = (*itsc)->NumberOfScores();
|
||||
ScoreData* newData =new ScoreData(**itsc);
|
||||
for (size_t i = 0; i < data->size(); i++) {
|
||||
ScoreArray scoreArray = data->get(i);
|
||||
ScoreArray newScoreArray;
|
||||
std::string istr;
|
||||
std::stringstream out;
|
||||
out << i;
|
||||
istr = out.str();
|
||||
size_t numNBest = scoreArray.size();
|
||||
//cout << " Datasize " << data->size() << " NumNBest " << numNBest << endl ;
|
||||
for (size_t j = 0; j < numNBest ; j++) {
|
||||
ScoreStats scoreStats = data->get(i, j);
|
||||
//cout << "Scorestats " << scoreStats << " i " << i << " j " << j << endl;
|
||||
ScoreStats newScoreStats;
|
||||
for (size_t k = last; k < size_t(numScoresScorer + last); k++) {
|
||||
ScoreStatsType score = scoreStats.get(k);
|
||||
newScoreStats.add(score);
|
||||
}
|
||||
//cout << " last " << last << " NumScores " << numScoresScorer << "newScorestats " << newScoreStats << endl;
|
||||
newScoreArray.add(newScoreStats);
|
||||
}
|
||||
newScoreArray.setIndex(istr);
|
||||
newData->add(newScoreArray);
|
||||
}
|
||||
//newData->dump();
|
||||
|
||||
// NOTE: This class takes the ownership of the heap allocated
|
||||
// ScoreData objects to avoid the memory leak issues.
|
||||
m_scorers_score_data.push_back(newData);
|
||||
|
||||
(*itsc)->setScoreData(newData);
|
||||
last += numScoresScorer;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/** The interpolated scorer calls a vector of scorers and combines them with
|
||||
weights **/
|
||||
void InterpolatedScorer::score(const candidates_t& candidates, const diffs_t& diffs,
|
||||
statscores_t& scores) const
|
||||
{
|
||||
//cout << "*******InterpolatedScorer::score" << endl;
|
||||
size_t scorerNum = 0;
|
||||
for (ScopedVector<Scorer>::const_iterator itsc = _scorers.begin(); itsc!=_scorers.end(); itsc++) {
|
||||
//int numScores = (*itsc)->NumberOfScores();
|
||||
statscores_t tscores;
|
||||
(*itsc)->score(candidates,diffs,tscores);
|
||||
size_t inc = 0;
|
||||
for (statscores_t::iterator itstatsc = tscores.begin(); itstatsc!=tscores.end(); itstatsc++) {
|
||||
//cout << "Scores " << (*itstatsc) << endl;
|
||||
float weight = _scorerWeights[scorerNum];
|
||||
if (weight == 0) {
|
||||
stringstream msg;
|
||||
msg << "No weights for scorer" << scorerNum ;
|
||||
throw runtime_error(msg.str());
|
||||
}
|
||||
if (scorerNum == 0) {
|
||||
scores.push_back(weight * (*itstatsc));
|
||||
} else {
|
||||
scores[inc] += weight * (*itstatsc);
|
||||
}
|
||||
//cout << "Scorer:" << scorerNum << " scoreNum:" << inc << " score: " << (*itstatsc) << " weight:" << weight << endl;
|
||||
inc++;
|
||||
|
||||
}
|
||||
scorerNum++;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void InterpolatedScorer::setReferenceFiles(const vector<string>& referenceFiles)
|
||||
{
|
||||
for (ScopedVector<Scorer>::iterator itsc = _scorers.begin(); itsc!=_scorers.end(); itsc++) {
|
||||
(*itsc)->setReferenceFiles(referenceFiles);
|
||||
}
|
||||
}
|
||||
|
||||
void InterpolatedScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
|
||||
{
|
||||
stringstream buff;
|
||||
int i=0;
|
||||
for (ScopedVector<Scorer>::iterator itsc = _scorers.begin(); itsc!=_scorers.end(); itsc++) {
|
||||
ScoreStats tempEntry;
|
||||
(*itsc)->prepareStats(sid, text, tempEntry);
|
||||
if (i > 0) buff << " ";
|
||||
buff << tempEntry;
|
||||
i++;
|
||||
}
|
||||
//cout << " Scores for interpolated: " << buff << endl;
|
||||
string str = buff.str();
|
||||
entry.set(str);
|
||||
}
|
||||
|
||||
void InterpolatedScorer::setFactors(const string& factors)
|
||||
{
|
||||
if (factors.empty()) return;
|
||||
|
||||
vector<string> fsplit;
|
||||
split(factors, ',', fsplit);
|
||||
|
||||
if (fsplit.size() != _scorers.size()) throw runtime_error("Number of factor specifications does not equal number of interpolated scorers.");
|
||||
|
||||
for (size_t i = 0; i < _scorers.size(); ++i)
|
||||
{
|
||||
_scorers[i]->setFactors(fsplit[i]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
60
mert/InterpolatedScorer.h
Normal file
60
mert/InterpolatedScorer.h
Normal file
@ -0,0 +1,60 @@
|
||||
#ifndef __INTERPOLATED_SCORER_H__
|
||||
#define __INTERPOLATED_SCORER_H__
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <limits>
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "Types.h"
|
||||
#include "ScoreData.h"
|
||||
#include "Scorer.h"
|
||||
#include "ScopedVector.h"
|
||||
|
||||
/**
|
||||
* Class that includes other scorers eg.
|
||||
* Interpolated HAMMING and BLEU scorer **/
|
||||
class InterpolatedScorer : public Scorer
|
||||
{
|
||||
public:
|
||||
// name would be: "HAMMING,BLEU" or similar
|
||||
InterpolatedScorer(const string& name, const string& config);
|
||||
virtual ~InterpolatedScorer() {}
|
||||
|
||||
virtual void score(const candidates_t& candidates, const diffs_t& diffs,
|
||||
statscores_t& scores) const;
|
||||
|
||||
virtual void setReferenceFiles(const vector<string>& referenceFiles);
|
||||
virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry);
|
||||
|
||||
virtual size_t NumberOfScores() const {
|
||||
size_t sz=0;
|
||||
for (ScopedVector<Scorer>::const_iterator itsc = _scorers.begin(); itsc != _scorers.end(); itsc++) {
|
||||
sz += (*itsc)->NumberOfScores();
|
||||
}
|
||||
return sz;
|
||||
};
|
||||
|
||||
virtual void setScoreData(ScoreData* data);
|
||||
|
||||
/**
|
||||
* Set the factors, which should be used for this metric
|
||||
*/
|
||||
virtual void setFactors(const string& factors);
|
||||
|
||||
protected:
|
||||
ScopedVector<Scorer> _scorers;
|
||||
|
||||
// Take the ownership of the heap-allocated the objects
|
||||
// by Scorer objects.
|
||||
ScopedVector<ScoreData> m_scorers_score_data;
|
||||
|
||||
vector<float> _scorerWeights;
|
||||
};
|
||||
|
||||
#endif // __INTERPOLATED_SCORER_H__
|
@ -12,6 +12,7 @@ FeatureStats.cpp FeatureArray.cpp FeatureData.cpp
|
||||
FeatureDataIterator.cpp
|
||||
Data.cpp
|
||||
BleuScorer.cpp
|
||||
InterpolatedScorer.cpp
|
||||
Point.cpp
|
||||
PerScorer.cpp
|
||||
Scorer.cpp
|
||||
@ -44,6 +45,7 @@ exe pro : pro.cpp mert_lib ..//boost_program_options ;
|
||||
alias programs : mert extractor evaluator pro ;
|
||||
|
||||
unit-test data_test : DataTest.cpp mert_lib ..//boost_unit_test_framework ;
|
||||
unit-test timer_test : TimerTest.cpp mert_lib ..//boost_unit_test_framework ;
|
||||
unit-test util_test : UtilTest.cpp mert_lib ..//boost_unit_test_framework ;
|
||||
|
||||
install legacy : programs : <location>. ;
|
||||
|
@ -24,6 +24,11 @@ public:
|
||||
virtual void setReferenceFiles(const vector<string>& referenceFiles);
|
||||
virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry);
|
||||
|
||||
virtual size_t NumberOfScores() const
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
void whoami() const {
|
||||
cerr << "I AM MergeScorer" << endl;
|
||||
}
|
||||
|
@ -72,7 +72,6 @@ statscore_t Optimizer::GetStatScore(const Point& param) const
|
||||
{
|
||||
vector<unsigned> bests;
|
||||
Get1bests(param, bests);
|
||||
//copy(bests.begin(),bests.end(),ostream_iterator<unsigned>(cerr," "));
|
||||
statscore_t score = GetStatScore(bests);
|
||||
return score;
|
||||
}
|
||||
|
@ -29,6 +29,7 @@ void PerScorer::setReferenceFiles(const vector<string>& referenceFiles)
|
||||
string line;
|
||||
int sid = 0;
|
||||
while (getline(in,line)) {
|
||||
line = this->applyFactors(line);
|
||||
vector<int> tokens;
|
||||
TokenizeAndEncode(line, tokens);
|
||||
m_ref_tokens.push_back(multiset<int>());
|
||||
@ -52,10 +53,13 @@ void PerScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
|
||||
msg << "Sentence id (" << sid << ") not found in reference set";
|
||||
throw runtime_error(msg.str());
|
||||
}
|
||||
|
||||
string sentence = this->applyFactors(text);
|
||||
|
||||
// Calculate correct, output_length and ref_length for
|
||||
// the line and store it in entry
|
||||
vector<int> testtokens;
|
||||
TokenizeAndEncode(text, testtokens);
|
||||
TokenizeAndEncode(sentence, testtokens);
|
||||
multiset<int> testtokens_all(testtokens.begin(),testtokens.end());
|
||||
set<int> testtokens_unique(testtokens.begin(),testtokens.end());
|
||||
int correct = 0;
|
||||
|
@ -38,6 +38,7 @@ Point::Point(const vector<parameter_t>& init,
|
||||
}
|
||||
} else {
|
||||
CHECK(init.size()==pdim);
|
||||
CHECK(optindices.size() == Point::dim);
|
||||
for (unsigned int i=0; i<Point::dim; i++) {
|
||||
operator[](i)=init[optindices[i]];
|
||||
m_min[i] = min[optindices[i]];
|
||||
|
@ -60,6 +60,15 @@ public:
|
||||
static void setdim(size_t d) {
|
||||
dim = d;
|
||||
}
|
||||
|
||||
static void set_optindices(const vector<unsigned int>& indices) {
|
||||
optindices = indices;
|
||||
}
|
||||
|
||||
static const vector<unsigned int>& get_optindices() {
|
||||
return optindices;
|
||||
}
|
||||
|
||||
static bool OptimizeAll() {
|
||||
return fixedweights.empty();
|
||||
}
|
||||
|
@ -24,12 +24,6 @@ ScoreStats::ScoreStats(const size_t size)
|
||||
memset(array_, 0, GetArraySizeWithBytes());
|
||||
}
|
||||
|
||||
ScoreStats::ScoreStats(std::string &theString)
|
||||
: available_(0), entries_(0), array_(NULL)
|
||||
{
|
||||
set(theString);
|
||||
}
|
||||
|
||||
ScoreStats::~ScoreStats()
|
||||
{
|
||||
if (array_) {
|
||||
@ -73,14 +67,14 @@ void ScoreStats::add(ScoreStatsType v)
|
||||
array_[entries_++]=v;
|
||||
}
|
||||
|
||||
void ScoreStats::set(std::string &theString)
|
||||
void ScoreStats::set(const std::string& str)
|
||||
{
|
||||
std::string substring, stringBuf;
|
||||
reset();
|
||||
|
||||
while (!theString.empty()) {
|
||||
getNextPound(theString, substring);
|
||||
add(ConvertStringToScoreStatsType(substring));
|
||||
vector<string> out;
|
||||
Tokenize(str.c_str(), ' ', &out);
|
||||
for (vector<string>::const_iterator it = out.begin();
|
||||
it != out.end(); ++it) {
|
||||
add(ConvertStringToScoreStatsType(*it));
|
||||
}
|
||||
}
|
||||
|
||||
@ -144,7 +138,7 @@ bool operator==(const ScoreStats& s1, const ScoreStats& s2) {
|
||||
if (s1.get(k) != s2.get(k))
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
return true;
|
||||
}
|
||||
//END_ADDED
|
||||
|
@ -31,7 +31,7 @@ private:
|
||||
public:
|
||||
ScoreStats();
|
||||
explicit ScoreStats(const size_t size);
|
||||
explicit ScoreStats(std::string &theString);
|
||||
|
||||
~ScoreStats();
|
||||
|
||||
// We intentionally allow copying.
|
||||
@ -66,7 +66,15 @@ public:
|
||||
return array_;
|
||||
}
|
||||
|
||||
void set(std::string &theString);
|
||||
void set(const std::string& str);
|
||||
|
||||
// Much more efficient than the above.
|
||||
void set(const std::vector<ScoreStatsType>& stats) {
|
||||
reset();
|
||||
for (size_t i = 0; i < stats.size(); ++i) {
|
||||
add(stats[i]);
|
||||
}
|
||||
}
|
||||
|
||||
inline size_t bytes() const {
|
||||
return GetArraySizeWithBytes();
|
||||
|
@ -1,5 +1,6 @@
|
||||
#include "Scorer.h"
|
||||
#include <limits>
|
||||
#include "Util.h"
|
||||
|
||||
namespace {
|
||||
|
||||
@ -95,6 +96,55 @@ void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the factors, which should be used for this metric
|
||||
*/
|
||||
void Scorer::setFactors(const string& factors)
|
||||
{
|
||||
if (factors.empty()) return;
|
||||
vector<string> factors_vec;
|
||||
split(factors, '|', factors_vec);
|
||||
for(vector<string>::iterator it = factors_vec.begin(); it != factors_vec.end(); ++it)
|
||||
{
|
||||
int factor = atoi(it->c_str());
|
||||
m_factors.push_back(factor);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Take the factored sentence and return the desired factors
|
||||
*/
|
||||
string Scorer::applyFactors(const string& sentence)
|
||||
{
|
||||
if (m_factors.size() == 0) return sentence;
|
||||
|
||||
vector<string> tokens;
|
||||
split(sentence, ' ', tokens);
|
||||
|
||||
stringstream sstream;
|
||||
for (size_t i = 0; i < tokens.size(); ++i)
|
||||
{
|
||||
if (tokens[i] == "") continue;
|
||||
|
||||
vector<string> factors;
|
||||
split(tokens[i], '|', factors);
|
||||
|
||||
int fsize = factors.size();
|
||||
|
||||
if (i>0) sstream << " ";
|
||||
|
||||
for (size_t j = 0; j < m_factors.size(); ++j)
|
||||
{
|
||||
int findex = m_factors[j];
|
||||
if (findex < 0 || findex >= fsize) throw runtime_error("Factor index is out of range.");
|
||||
|
||||
if (j>0) sstream << "|";
|
||||
sstream << factors[findex];
|
||||
}
|
||||
}
|
||||
return sstream.str();
|
||||
}
|
||||
|
||||
StatisticsBasedScorer::StatisticsBasedScorer(const string& name, const string& config)
|
||||
: Scorer(name,config) {
|
||||
//configure regularisation
|
||||
|
@ -28,10 +28,7 @@ class Scorer
|
||||
/**
|
||||
* Return the number of statistics needed for the computation of the score.
|
||||
*/
|
||||
virtual size_t NumberOfScores() const {
|
||||
cerr << "Scorer: 0" << endl;
|
||||
return 0;
|
||||
}
|
||||
virtual size_t NumberOfScores() const = 0;
|
||||
|
||||
/**
|
||||
* Set the reference files. This must be called before prepareStats().
|
||||
@ -57,7 +54,9 @@ class Scorer
|
||||
* applying each in turn, and calculating a new score each time.
|
||||
*/
|
||||
virtual void score(const candidates_t& candidates, const diffs_t& diffs,
|
||||
statscores_t& scores) const {
|
||||
statscores_t& scores) const = 0;
|
||||
/*
|
||||
{
|
||||
//dummy impl
|
||||
if (!m_score_data) {
|
||||
throw runtime_error("score data not loaded");
|
||||
@ -67,6 +66,7 @@ class Scorer
|
||||
scores.push_back(0);
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
/**
|
||||
* Calculate the score of the sentences corresponding to the list of candidate
|
||||
@ -93,10 +93,20 @@ class Scorer
|
||||
/**
|
||||
* Set the score data, prior to scoring.
|
||||
*/
|
||||
void setScoreData(ScoreData* data) {
|
||||
virtual void setScoreData(ScoreData* data) {
|
||||
m_score_data = data;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the factors, which should be used for this metric
|
||||
*/
|
||||
virtual void setFactors(const string& factors);
|
||||
|
||||
/**
|
||||
* Take the factored sentence and return the desired factors
|
||||
*/
|
||||
virtual string applyFactors(const string& sentece);
|
||||
|
||||
private:
|
||||
class Encoder {
|
||||
public:
|
||||
@ -114,6 +124,7 @@ class Scorer
|
||||
string m_name;
|
||||
Encoder* m_encoder;
|
||||
map<string, string> m_config;
|
||||
vector<int> m_factors;
|
||||
|
||||
protected:
|
||||
ScoreData* m_score_data;
|
||||
|
@ -7,6 +7,7 @@
|
||||
#include "TerScorer.h"
|
||||
#include "CderScorer.h"
|
||||
#include "MergeScorer.h"
|
||||
#include "InterpolatedScorer.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -32,6 +33,11 @@ Scorer* ScorerFactory::getScorer(const string& type, const string& config) {
|
||||
} else if (type == "MERGE") {
|
||||
return (MergeScorer*) new MergeScorer(config);
|
||||
} else {
|
||||
throw runtime_error("Unknown scorer type: " + type);
|
||||
if (type.find(',') != string::npos) {
|
||||
return new InterpolatedScorer(type, config);
|
||||
}
|
||||
else {
|
||||
throw runtime_error("Unknown scorer type: " + type);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -33,6 +33,7 @@ void TerScorer::setReferenceFiles ( const vector<string>& referenceFiles )
|
||||
string line;
|
||||
int sid = 0;
|
||||
while ( getline ( in, line ) ) {
|
||||
line = this->applyFactors(line);
|
||||
vector<int> tokens;
|
||||
TokenizeAndEncode(line, tokens);
|
||||
m_references.push_back ( tokens );
|
||||
@ -48,6 +49,7 @@ void TerScorer::setReferenceFiles ( const vector<string>& referenceFiles )
|
||||
|
||||
void TerScorer::prepareStats ( size_t sid, const string& text, ScoreStats& entry )
|
||||
{
|
||||
string sentence = this->applyFactors(text);
|
||||
|
||||
terAlignment result;
|
||||
result.numEdits = 0.0 ;
|
||||
@ -74,7 +76,7 @@ void TerScorer::prepareStats ( size_t sid, const string& text, ScoreStats& entry
|
||||
averageLength+=(double)m_multi_references.at ( incRefsBis ).at ( sid ).size();
|
||||
}
|
||||
averageLength=averageLength/( double ) m_multi_references.size();
|
||||
TokenizeAndEncode(text, testtokens);
|
||||
TokenizeAndEncode(sentence, testtokens);
|
||||
terCalc * evaluation=new terCalc();
|
||||
evaluation->setDebugMode ( false );
|
||||
terAlignment tmp_result = evaluation->TER ( reftokens, testtokens );
|
||||
|
135
mert/Timer.cpp
135
mert/Timer.cpp
@ -1,73 +1,106 @@
|
||||
#include "Timer.h"
|
||||
#include "Util.h"
|
||||
#include <cstdio>
|
||||
|
||||
double Timer::elapsed_time()
|
||||
{
|
||||
time_t now;
|
||||
time(&now);
|
||||
return difftime(now, start_time);
|
||||
#if !defined(_WIN32) && !defined(_WIN64)
|
||||
#include <sys/resource.h>
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
|
||||
#if !defined(_WIN32) && !defined(_WIN64)
|
||||
// Convert a (seconds, microseconds) timeval into a single microsecond count.
uint64_t GetMicroSeconds(const struct timeval& tv) {
  const uint64_t whole_seconds_in_us = static_cast<uint64_t>(tv.tv_sec) * 1000000;
  return whole_seconds_in_us + tv.tv_usec;
}
|
||||
|
||||
double Timer::get_elapsed_time()
|
||||
{
|
||||
return elapsed_time();
|
||||
uint64_t GetTimeOfDayMicroSeconds() {
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv, NULL);
|
||||
return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace
|
||||
|
||||
// Return the user and system CPU time consumed so far by this process, in
// microseconds. On Windows the result is a default-constructed CPUTime.
// If getrusage() fails, an error is traced and the process exits with
// status 1.
Timer::CPUTime Timer::GetCPUTimeMicroSeconds() const {
  CPUTime cpu;
#if !defined(_WIN32) && !defined(_WIN64)
  struct rusage usage;
  if (getrusage(RUSAGE_SELF, &usage) != 0) {
    TRACE_ERR("Error occurred: getrusage().\n");
    exit(1);
  }
  cpu.user_time = GetMicroSeconds(usage.ru_utime);
  cpu.sys_time = GetMicroSeconds(usage.ru_stime);
#else  // Windows
  // Not implemented yet.
  // TODO: implement the Windows version using native APIs.
#endif
  return cpu;
}
|
||||
|
||||
double Timer::get_elapsed_cpu_time() const {
|
||||
return static_cast<double>(get_elapsed_cpu_time_microseconds()) * 1e-6;
|
||||
}
|
||||
|
||||
uint64_t Timer::get_elapsed_cpu_time_microseconds() const {
|
||||
const CPUTime e = GetCPUTimeMicroSeconds();
|
||||
return (e.user_time - m_start_time.user_time) +
|
||||
(e.sys_time - m_start_time.sys_time);
|
||||
}
|
||||
|
||||
double Timer::get_elapsed_wall_time() const {
|
||||
return static_cast<double>(get_elapsed_wall_time_microseconds()) * 1e-6;
|
||||
}
|
||||
|
||||
uint64_t Timer::get_elapsed_wall_time_microseconds() const {
|
||||
return GetTimeOfDayMicroSeconds() - m_wall;
|
||||
}
|
||||
|
||||
void Timer::start(const char* msg)
|
||||
{
|
||||
// Print an optional message, something like "Starting timer t";
|
||||
if (msg) TRACE_ERR( msg << std::endl);
|
||||
|
||||
// Return immediately if the timer is already running
|
||||
if (running) return;
|
||||
|
||||
// Change timer status to running
|
||||
running = true;
|
||||
|
||||
// Set the start time;
|
||||
time(&start_time);
|
||||
if (m_is_running) return;
|
||||
m_is_running = true;
|
||||
m_wall = GetTimeOfDayMicroSeconds();
|
||||
m_start_time = GetCPUTimeMicroSeconds();
|
||||
}
|
||||
|
||||
/***
|
||||
* Turn the timer off and start it again from 0. Print an optional message.
|
||||
*/
|
||||
/*
|
||||
inline void Timer::restart(const char* msg)
|
||||
void Timer::restart(const char* msg)
|
||||
{
|
||||
// Print an optional message, something like "Restarting timer t";
|
||||
if (msg) TRACE_ERR( msg << std::endl;
|
||||
|
||||
// Set the timer status to running
|
||||
running = true;
|
||||
|
||||
// Set the accumulated time to 0 and the start time to now
|
||||
acc_time = 0;
|
||||
start_clock = clock();
|
||||
start_time = time(0);
|
||||
if (msg) {
|
||||
TRACE_ERR(msg << std::endl);
|
||||
}
|
||||
m_wall = GetTimeOfDayMicroSeconds();
|
||||
m_start_time = GetCPUTimeMicroSeconds();
|
||||
}
|
||||
*/
|
||||
|
||||
/***
|
||||
* Stop the timer and print an optional message.
|
||||
*/
|
||||
/*
|
||||
inline void Timer::stop(const char* msg)
|
||||
{
|
||||
// Print an optional message, something like "Stopping timer t";
|
||||
check(msg);
|
||||
|
||||
// Recalculate and store the total accumulated time up until now
|
||||
if (running) acc_time += elapsed_time();
|
||||
|
||||
running = false;
|
||||
}
|
||||
*/
|
||||
|
||||
void Timer::check(const char* msg)
|
||||
{
|
||||
// Print an optional message, something like "Checking timer t";
|
||||
if (msg) TRACE_ERR( msg << " : ");
|
||||
|
||||
// TRACE_ERR( "[" << std::setiosflags(std::ios::fixed) << std::setprecision(2) << (running ? elapsed_time() : 0) << "] seconds\n");
|
||||
TRACE_ERR( "[" << (running ? elapsed_time() : 0) << "] seconds\n");
|
||||
if (m_is_running) {
|
||||
TRACE_ERR("[Wall " << get_elapsed_wall_time()
|
||||
<< " CPU " << get_elapsed_cpu_time() << "] seconds.\n");
|
||||
} else {
|
||||
TRACE_ERR("WARNING: the timer is not running.\n");
|
||||
}
|
||||
}
|
||||
|
||||
std::string Timer::ToString() const {
|
||||
std::string res;
|
||||
char tmp[64];
|
||||
const double wall = get_elapsed_wall_time();
|
||||
const CPUTime e = GetCPUTimeMicroSeconds();
|
||||
const double utime = (e.user_time - m_start_time.user_time) * 1e-6;
|
||||
const double stime = (e.sys_time - m_start_time.sys_time) * 1e-6;
|
||||
std::snprintf(tmp, sizeof(tmp), "wall %f user %f sec. sys %f sec. total %f sec.",
|
||||
wall, utime, stime, utime + stime);
|
||||
res.append(tmp);
|
||||
return res;
|
||||
}
|
||||
|
106
mert/Timer.h
106
mert/Timer.h
@ -1,46 +1,50 @@
|
||||
#ifndef MERT_TIMER_H_
|
||||
#define MERT_TIMER_H_
|
||||
|
||||
#include <ctime>
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
#include <ostream>
|
||||
#include <string>
|
||||
#include <stdint.h>
|
||||
|
||||
class Timer
|
||||
{
|
||||
/**
|
||||
* Allow timers to be printed to ostreams using the syntax 'os << t'
|
||||
* for an ostream 'os' and a timer 't'. For example, "cout << t" will
|
||||
* print out the total amount of time 't' has been "running".
|
||||
*/
|
||||
friend std::ostream& operator<<(std::ostream& os, Timer& t);
|
||||
private:
|
||||
// Time values are stored in microseconds.
|
||||
struct CPUTime {
|
||||
uint64_t user_time; // user CPU time
|
||||
uint64_t sys_time; // system CPU time
|
||||
|
||||
private:
|
||||
bool running;
|
||||
time_t start_time;
|
||||
CPUTime() : user_time(0), sys_time(0) { }
|
||||
};
|
||||
|
||||
/**
|
||||
* Return the total time that the timer has been in the "running"
|
||||
* state since it was first "started" or last "restarted". For
|
||||
* "short" time periods (less than an hour), the actual cpu time
|
||||
* used is reported instead of the elapsed time.
|
||||
* TODO in seconds?
|
||||
*/
|
||||
double elapsed_time();
|
||||
CPUTime GetCPUTimeMicroSeconds() const;
|
||||
|
||||
public:
|
||||
bool m_is_running;
|
||||
uint64_t m_wall; // wall-clock time in microseconds
|
||||
CPUTime m_start_time;
|
||||
|
||||
public:
|
||||
/**
|
||||
* 'running' is initially false. A timer needs to be explicitly started
|
||||
* using 'start' or 'restart'.
|
||||
* 'm_is_running' is initially false. A timer needs to be explicitly started
|
||||
* using 'start'.
|
||||
*/
|
||||
Timer() : running(false), start_time(0) { }
|
||||
Timer()
|
||||
: m_is_running(false),
|
||||
m_wall(0),
|
||||
m_start_time() {}
|
||||
|
||||
~Timer() {}
|
||||
|
||||
/**
|
||||
* Start a timer. If it is already running, let it continue running.
|
||||
* Print an optional message.
|
||||
*/
|
||||
void start(const char* msg = 0);
|
||||
// void restart(const char* msg = 0);
|
||||
// void stop(const char* msg = 0);
|
||||
|
||||
/**
|
||||
* Restart the timer iff the timer is already running.
|
||||
* if the timer is not running, just start the timer.
|
||||
*/
|
||||
void restart(const char* msg = 0);
|
||||
|
||||
/**
|
||||
* Print out an optional message followed by the current timer timing.
|
||||
@ -48,19 +52,49 @@ public:
|
||||
void check(const char* msg = 0);
|
||||
|
||||
/**
|
||||
* Return the total time that the timer has been in the "running"
|
||||
* state since it was first "started" or last "restarted". For
|
||||
* "short" time periods (less than an hour), the actual cpu time
|
||||
* used is reported instead of the elapsed time.
|
||||
* This function is the public version of elapsed_time()
|
||||
*/
|
||||
double get_elapsed_time();
|
||||
bool is_running() const { return m_is_running; }
|
||||
|
||||
/**
|
||||
* Return the total time in seconds that the timer has been in the
|
||||
* "running" state since it was first "started" or last "restarted".
|
||||
* For "short" time periods (less than an hour), the actual cpu time
|
||||
* used is reported instead of the elapsed time.
|
||||
*/
|
||||
double get_elapsed_cpu_time() const;
|
||||
|
||||
/**
|
||||
* Return the total time in microseconds.
|
||||
*/
|
||||
uint64_t get_elapsed_cpu_time_microseconds() const;
|
||||
|
||||
/**
|
||||
* Get elapsed wall-clock time in seconds.
|
||||
*/
|
||||
double get_elapsed_wall_time() const;
|
||||
|
||||
/**
|
||||
* Get elapsed wall-clock time in microseconds.
|
||||
*/
|
||||
uint64_t get_elapsed_wall_time_microseconds() const;
|
||||
|
||||
/**
|
||||
* Return a string that has the user CPU time, system time, and total time.
|
||||
*/
|
||||
std::string ToString() const;
|
||||
};
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& os, Timer& t)
|
||||
{
|
||||
//os << std::setprecision(2) << std::setiosflags(std::ios::fixed) << (t.running ? t.elapsed_time() : 0);
|
||||
os << (t.running ? t.elapsed_time() : 0);
|
||||
/**
|
||||
* Allow timers to be printed to ostreams using the syntax 'os << t'
|
||||
* for an ostream 'os' and a timer 't'. For example, "cout << t" will
|
||||
* print out the total amount of time 't' has been "running".
|
||||
*/
|
||||
inline std::ostream& operator<<(std::ostream& os, const Timer& t) {
|
||||
if (t.is_running()) {
|
||||
os << t.ToString();
|
||||
} else {
|
||||
os << "timer is not running.";
|
||||
}
|
||||
return os;
|
||||
}
|
||||
|
||||
|
32
mert/TimerTest.cpp
Normal file
32
mert/TimerTest.cpp
Normal file
@ -0,0 +1,32 @@
|
||||
#include "Timer.h"
|
||||
|
||||
#define BOOST_TEST_MODULE TimerTest
|
||||
#include <boost/test/unit_test.hpp>
|
||||
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
#include <unistd.h>
|
||||
|
||||
// Smoke test: after start() and after restart() the timer must report that it
// is running and that a positive amount of wall-clock time has elapsed, and
// ToString() must produce a non-empty report.
BOOST_AUTO_TEST_CASE(timer_basic_test) {
  // ad-hoc microseconds to pass unit tests.
  const int pause_us = 40;
  Timer stopwatch;

  // Phase 1: freshly started timer.
  stopwatch.start();
  BOOST_REQUIRE(stopwatch.is_running());
  BOOST_REQUIRE(usleep(pause_us) == 0);
  // CPU-time checks are left disabled, as in the original test:
  // BOOST_CHECK(timer.get_elapsed_cpu_time() > 0.0);
  // BOOST_CHECK(timer.get_elapsed_cpu_time_microseconds() > 0);
  BOOST_CHECK(stopwatch.get_elapsed_wall_time() > 0.0);
  BOOST_CHECK(stopwatch.get_elapsed_wall_time_microseconds() > 0);

  // Phase 2: same expectations after a restart.
  stopwatch.restart();
  BOOST_REQUIRE(stopwatch.is_running());
  BOOST_REQUIRE(usleep(pause_us) == 0);
  // BOOST_CHECK(timer.get_elapsed_cpu_time() > 0.0);
  // BOOST_CHECK(timer.get_elapsed_cpu_time_microseconds() > 0);
  BOOST_CHECK(stopwatch.get_elapsed_wall_time() > 0.0);
  BOOST_CHECK(stopwatch.get_elapsed_wall_time_microseconds() > 0);

  const std::string report = stopwatch.ToString();
  BOOST_CHECK(!report.empty());
}
|
@ -84,5 +84,5 @@ void PrintUserTime(const std::string &message)
|
||||
|
||||
double GetUserTime()
|
||||
{
|
||||
return g_timer.get_elapsed_time();
|
||||
return g_timer.get_elapsed_cpu_time();
|
||||
}
|
||||
|
@ -131,6 +131,7 @@ void usage()
|
||||
cerr << "[--sctype|-s] the scorer type (default BLEU)" << endl;
|
||||
cerr << "[--scconfig|-c] configuration string passed to scorer" << endl;
|
||||
cerr << "\tThis is of the form NAME1:VAL1,NAME2:VAL2 etc " << endl;
|
||||
cerr << "[--factors|-f] list of factors passed to the scorer (e.g. 0|2)" << endl;
|
||||
cerr << "[--reference|-R] comma separated list of reference files" << endl;
|
||||
cerr << "[--candidate|-C] comma separated list of candidate files" << endl;
|
||||
cerr << "[--bootstrap|-b] number of booststraped samples (default 0 - no bootstraping)" << endl;
|
||||
@ -138,10 +139,19 @@ void usage()
|
||||
cerr << "[--help|-h] print this message and exit" << endl;
|
||||
cerr << endl;
|
||||
cerr << "Evaluator is able to compute more metrics at once. To do this," << endl;
|
||||
cerr << "separate scorers with semicolon (note that comma is used to separate" << endl;
|
||||
cerr << "scorers in the interpolated scorer)." << endl;
|
||||
cerr << "specify more --sctype arguments. You can also specify more --scconfig strings." << endl;
|
||||
cerr << endl;
|
||||
cerr << "If you specify only one metric and one candidate file, only the final score" << endl;
|
||||
cerr << "The example below prints BLEU score, PER score and interpolated" << endl;
|
||||
cerr << "score of CDER and PER with the given weights." << endl;
|
||||
cerr << endl;
|
||||
cerr << "./evaluator \\" << endl;
|
||||
cerr << "\t--sctype BLEU --scconfig reflen:closest \\" << endl;
|
||||
cerr << "\t--sctype PER \\" << endl;
|
||||
cerr << "\t--sctype CDER,PER --scconfig weights:0.25+0.75 \\" << endl;
|
||||
cerr << "\t--candidate CANDIDATE \\" << endl;
|
||||
cerr << "\t--reference REFERENCE" << endl;
|
||||
cerr << endl;
|
||||
cerr << "If you specify only one scorer and one candidate file, only the final score" << endl;
|
||||
cerr << "will be printed to stdout. Otherwise each line will contain metric name" << endl;
|
||||
cerr << "and/or filename and the final score. Since most of the metrics prints some" << endl;
|
||||
cerr << "debuging info, consider redirecting stderr to /dev/null." << endl;
|
||||
@ -155,24 +165,24 @@ static struct option long_options[] = {
|
||||
{"candidate", required_argument, 0, 'C'},
|
||||
{"bootstrap", required_argument, 0, 'b'},
|
||||
{"rseed", required_argument, 0, 'r'},
|
||||
{"factors", required_argument, 0, 'f'},
|
||||
{"help", no_argument, 0, 'h'},
|
||||
{0, 0, 0, 0}
|
||||
};
|
||||
|
||||
// Options used in evaluator.
|
||||
struct ProgramOption {
|
||||
string scorer_type;
|
||||
string scorer_config;
|
||||
vector<string> scorer_types;
|
||||
vector<string> scorer_configs;
|
||||
string reference;
|
||||
string candidate;
|
||||
vector<string> scorer_factors;
|
||||
int bootstrap;
|
||||
int seed;
|
||||
bool has_seed;
|
||||
|
||||
ProgramOption()
|
||||
: scorer_type("BLEU"),
|
||||
scorer_config(""),
|
||||
reference(""),
|
||||
: reference(""),
|
||||
candidate(""),
|
||||
bootstrap(0),
|
||||
seed(0),
|
||||
@ -182,13 +192,17 @@ struct ProgramOption {
|
||||
void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
|
||||
int c;
|
||||
int option_index;
|
||||
while ((c = getopt_long(argc, argv, "s:c:R:C:b:r:h", long_options, &option_index)) != -1) {
|
||||
int last_scorer_index = -1;
|
||||
while ((c = getopt_long(argc, argv, "s:c:R:C:b:r:f:h", long_options, &option_index)) != -1) {
|
||||
switch(c) {
|
||||
case 's':
|
||||
opt->scorer_type = string(optarg);
|
||||
opt->scorer_types.push_back(string(optarg));
|
||||
opt->scorer_configs.push_back(string(""));
|
||||
opt->scorer_factors.push_back(string(""));
|
||||
last_scorer_index++;
|
||||
break;
|
||||
case 'c':
|
||||
opt->scorer_config = string(optarg);
|
||||
opt->scorer_configs[last_scorer_index] = string(optarg);
|
||||
break;
|
||||
case 'R':
|
||||
opt->reference = string(optarg);
|
||||
@ -203,10 +217,21 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
|
||||
opt->seed = strtol(optarg, NULL, 10);
|
||||
opt->has_seed = true;
|
||||
break;
|
||||
case 'f':
|
||||
opt->scorer_factors[last_scorer_index] = string(optarg);
|
||||
break;
|
||||
default:
|
||||
usage();
|
||||
}
|
||||
}
|
||||
|
||||
// Add default scorer if no scorer provided
|
||||
if (opt->scorer_types.size() == 0)
|
||||
{
|
||||
opt->scorer_types.push_back(string("BLEU"));
|
||||
opt->scorer_configs.push_back(string(""));
|
||||
opt->scorer_factors.push_back(string(""));
|
||||
}
|
||||
}
|
||||
|
||||
void InitSeed(const ProgramOption *opt) {
|
||||
@ -236,7 +261,6 @@ int main(int argc, char** argv)
|
||||
try {
|
||||
vector<string> refFiles;
|
||||
vector<string> candFiles;
|
||||
vector<string> scorerTypes;
|
||||
|
||||
if (option.reference.length() == 0) throw runtime_error("You have to specify at least one reference file.");
|
||||
split(option.reference, ',', refFiles);
|
||||
@ -244,17 +268,15 @@ int main(int argc, char** argv)
|
||||
if (option.candidate.length() == 0) throw runtime_error("You have to specify at least one candidate file.");
|
||||
split(option.candidate, ',', candFiles);
|
||||
|
||||
if (option.scorer_type.length() == 0) throw runtime_error("You have to specify at least one scorer.");
|
||||
split(option.scorer_type, ';', scorerTypes);
|
||||
|
||||
if (candFiles.size() > 1) g_has_more_files = true;
|
||||
if (scorerTypes.size() > 1) g_has_more_scorers = true;
|
||||
if (option.scorer_types.size() > 1) g_has_more_scorers = true;
|
||||
|
||||
for (vector<string>::const_iterator fileIt = candFiles.begin(); fileIt != candFiles.end(); ++fileIt)
|
||||
{
|
||||
for (vector<string>::const_iterator scorerIt = scorerTypes.begin(); scorerIt != scorerTypes.end(); ++scorerIt)
|
||||
for (size_t i = 0; i < option.scorer_types.size(); i++)
|
||||
{
|
||||
g_scorer = ScorerFactory::getScorer(*scorerIt, option.scorer_config);
|
||||
g_scorer = ScorerFactory::getScorer(option.scorer_types[i], option.scorer_configs[i]);
|
||||
g_scorer->setFactors(option.scorer_factors[i]);
|
||||
g_scorer->setReferenceFiles(refFiles);
|
||||
EvaluatorUtil::evaluate(*fileIt, option.bootstrap);
|
||||
delete g_scorer;
|
||||
|
@ -26,6 +26,7 @@ void usage()
|
||||
cerr << "[--sctype|-s] the scorer type (default BLEU)" << endl;
|
||||
cerr << "[--scconfig|-c] configuration string passed to scorer" << endl;
|
||||
cerr << "\tThis is of the form NAME1:VAL1,NAME2:VAL2 etc " << endl;
|
||||
cerr << "[--factors|-f] list of factors passed to the scorer (e.g. 0|2)" << endl;
|
||||
cerr << "[--reference|-r] comma separated list of reference files" << endl;
|
||||
cerr << "[--binary|-b] use binary output format (default to text )" << endl;
|
||||
cerr << "[--nbest|-n] the nbest file" << endl;
|
||||
@ -41,6 +42,7 @@ void usage()
|
||||
static struct option long_options[] = {
|
||||
{"sctype", required_argument, 0, 's'},
|
||||
{"scconfig", required_argument,0, 'c'},
|
||||
{"factors", required_argument,0, 'f'},
|
||||
{"reference", required_argument, 0, 'r'},
|
||||
{"binary", no_argument, 0, 'b'},
|
||||
{"nbest", required_argument, 0, 'n'},
|
||||
@ -57,6 +59,7 @@ static struct option long_options[] = {
|
||||
struct ProgramOption {
|
||||
string scorerType;
|
||||
string scorerConfig;
|
||||
string scorerFactors;
|
||||
string referenceFile;
|
||||
string nbestFile;
|
||||
string scoreDataFile;
|
||||
@ -69,6 +72,7 @@ struct ProgramOption {
|
||||
ProgramOption()
|
||||
: scorerType("BLEU"),
|
||||
scorerConfig(""),
|
||||
scorerFactors(""),
|
||||
referenceFile(""),
|
||||
nbestFile(""),
|
||||
scoreDataFile("statscore.data"),
|
||||
@ -83,7 +87,7 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
|
||||
int c;
|
||||
int option_index;
|
||||
|
||||
while ((c = getopt_long(argc, argv, "s:r:n:S:F:R:E:v:hb", long_options, &option_index)) != -1) {
|
||||
while ((c = getopt_long(argc, argv, "s:r:f:n:S:F:R:E:v:hb", long_options, &option_index)) != -1) {
|
||||
switch (c) {
|
||||
case 's':
|
||||
opt->scorerType = string(optarg);
|
||||
@ -91,6 +95,9 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
|
||||
case 'c':
|
||||
opt->scorerConfig = string(optarg);
|
||||
break;
|
||||
case 'f':
|
||||
opt->scorerFactors = string(optarg);
|
||||
break;
|
||||
case 'r':
|
||||
opt->referenceFile = string(optarg);
|
||||
break;
|
||||
@ -180,6 +187,8 @@ int main(int argc, char** argv)
|
||||
|
||||
Scorer* scorer = ScorerFactory::getScorer(option.scorerType, option.scorerConfig);
|
||||
|
||||
scorer->setFactors(option.scorerFactors);
|
||||
|
||||
// load references
|
||||
if (referenceFiles.size() > 0)
|
||||
scorer->setReferenceFiles(referenceFiles);
|
||||
@ -206,16 +215,9 @@ int main(int argc, char** argv)
|
||||
data.remove_duplicates();
|
||||
//END_ADDED
|
||||
|
||||
if (option.binmode)
|
||||
cerr << "Binary write mode is selected" << endl;
|
||||
else
|
||||
cerr << "Binary write mode is NOT selected" << endl;
|
||||
|
||||
data.save(option.featureDataFile, option.scoreDataFile, option.binmode);
|
||||
PrintUserTime("Stopping...");
|
||||
|
||||
// timer.stop("Stopping...");
|
||||
|
||||
delete scorer;
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
|
@ -343,6 +343,8 @@ int main(int argc, char **argv)
|
||||
data.load(FeatureDataFiles.at(i), ScoreDataFiles.at(i));
|
||||
}
|
||||
|
||||
TheScorer->setScoreData(data.getScoreData().get());
|
||||
|
||||
//ADDED_BY_TS
|
||||
data.remove_duplicates();
|
||||
//END_ADDED
|
||||
@ -362,13 +364,6 @@ int main(int argc, char **argv)
|
||||
vector<string> features;
|
||||
Tokenize(option.to_optimize_str.c_str(), ',', &features);
|
||||
|
||||
if (option.pdim != static_cast<int>(features.size())) {
|
||||
cerr << "Error: pdim and the specified number of features are not equal: "
|
||||
<< "pdim = " << option.pdim
|
||||
<< ", the number of features = " << features.size() << endl;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
for (vector<string>::const_iterator it = features.begin();
|
||||
it != features.end(); ++it) {
|
||||
const int feature_index = data.getFeatureIndex(*it);
|
||||
@ -405,6 +400,7 @@ int main(int argc, char **argv)
|
||||
|
||||
Point::setpdim(option.pdim);
|
||||
Point::setdim(to_optimize.size());
|
||||
Point::set_optindices(to_optimize);
|
||||
|
||||
//starting points consist of specified points and random restarts
|
||||
vector<Point> startingPoints;
|
||||
|
@ -2575,7 +2575,7 @@ sub create_step {
|
||||
$subdir = "lm" if $subdir eq "interpolated-lm";
|
||||
open(STEP,">$file");
|
||||
print STEP "#!/bin/bash\n\n";
|
||||
print STEP "PATH=".$ENV{"PATH"}."\n";
|
||||
print STEP "PATH=\"".$ENV{"PATH"}."\"\n";
|
||||
print STEP "cd $dir\n";
|
||||
print STEP "echo 'starting at '`date`' on '`hostname`\n";
|
||||
print STEP "mkdir -p $dir/$subdir\n\n";
|
||||
|
@ -181,7 +181,7 @@ if ($opt_hierarchical)
|
||||
my %PHRASE_USED;
|
||||
if (!$opt_hierarchical) {
|
||||
# get the phrase pairs appearing in the input text, up to the $MAX_LENGTH
|
||||
open(INPUT,$input) or die "Can't read $input";
|
||||
open(INPUT,mk_open_string($input)) or die "Can't read $input";
|
||||
while(my $line = <INPUT>) {
|
||||
chomp($line);
|
||||
my @WORD = split(/ +/,$line);
|
||||
@ -207,6 +207,22 @@ if (!$opt_hierarchical) {
|
||||
close(INPUT);
|
||||
}
|
||||
|
||||
sub mk_open_string {
|
||||
my $file = shift;
|
||||
my $openstring;
|
||||
if ($file !~ /\.gz$/ && -e "$file.gz") {
|
||||
$openstring = "$ZCAT $file.gz |";
|
||||
} elsif ($file =~ /\.gz$/) {
|
||||
$openstring = "$ZCAT $file |";
|
||||
} elsif ($opt_hierarchical) {
|
||||
$openstring = "cat $file |";
|
||||
} else {
|
||||
$openstring = "< $file";
|
||||
}
|
||||
return $openstring;
|
||||
}
|
||||
|
||||
|
||||
# filter files
|
||||
for(my $i=0;$i<=$#TABLE;$i++) {
|
||||
my ($used,$total) = (0,0);
|
||||
@ -215,16 +231,7 @@ for(my $i=0;$i<=$#TABLE;$i++) {
|
||||
my $new_file = $TABLE_NEW_NAME[$i];
|
||||
print STDERR "filtering $file -> $new_file...\n";
|
||||
|
||||
my $openstring;
|
||||
if ($file !~ /\.gz$/ && -e "$file.gz") {
|
||||
$openstring = "$ZCAT $file.gz |";
|
||||
} elsif ($file =~ /\.gz$/) {
|
||||
$openstring = "$ZCAT $file |";
|
||||
} elsif ($opt_hierarchical) {
|
||||
$openstring = "cat $file |";
|
||||
} else {
|
||||
$openstring = "< $file";
|
||||
}
|
||||
my $openstring = mk_open_string($file);
|
||||
|
||||
my $new_openstring;
|
||||
if ($new_file =~ /\.gz$/) {
|
||||
@ -303,7 +310,7 @@ close(INFO);
|
||||
|
||||
|
||||
print "To run the decoder, please call:
|
||||
moses -f $dir/moses.ini < $input\n";
|
||||
moses -f $dir/moses.ini -i $input\n";
|
||||
|
||||
sub safesystem {
|
||||
print STDERR "Executing: @_\n";
|
||||
|
@ -376,7 +376,7 @@ void Model::zipFile()
|
||||
{
|
||||
fclose(file);
|
||||
file = fopen(filename.c_str(), "rb");
|
||||
FILE* gzfile = (FILE*) gzopen((filename+".gz").c_str(),"wb");
|
||||
gzFile gzfile = gzopen((filename+".gz").c_str(),"wb");
|
||||
char inbuffer[128];
|
||||
int num_read;
|
||||
while ((num_read = fread(inbuffer, 1, sizeof(inbuffer), file)) > 0) {
|
||||
|
@ -137,6 +137,7 @@ my $___INPUTTYPE = 0;
|
||||
my $mertdir = undef; # path to new mert directory
|
||||
my $mertargs = undef; # args to pass through to mert & extractor
|
||||
my $mertmertargs = undef; # args to pass through to mert only
|
||||
my $extractorargs = undef; # args to pass through to extractor only
|
||||
my $filtercmd = undef; # path to filter-model-given-input.pl
|
||||
my $filterfile = undef;
|
||||
my $qsubwrapper = undef;
|
||||
@ -178,6 +179,7 @@ GetOptions(
|
||||
"verbose" => \$verbose,
|
||||
"mertdir=s" => \$mertdir,
|
||||
"mertargs=s" => \$mertargs,
|
||||
"extractorargs=s" => \$extractorargs,
|
||||
"mertmertargs=s" => \$mertmertargs,
|
||||
"rootdir=s" => \$SCRIPTS_ROOTDIR,
|
||||
"filtercmd=s" => \$filtercmd, # allow to override the default location
|
||||
@ -241,8 +243,9 @@ Options:
|
||||
model. useful for lattice decoding
|
||||
--rootdir=STRING ... where do helpers reside (if not given explicitly)
|
||||
--mertdir=STRING ... path to new mert implementation
|
||||
--mertargs=STRING ... extra args for mert, eg. to specify scorer
|
||||
--mertmertargs=STRING ... extra args for mert only,
|
||||
--mertargs=STRING ... extra args for both extractor and mert
|
||||
--extractorargs=STRING ... extra args for extractor only
|
||||
--mertmertargs=STRING ... extra args for mert only
|
||||
--scorenbestcmd=STRING ... path to score-nbest.py
|
||||
--old-sge ... passed to parallelizers, assume Grid Engine < 6.0
|
||||
--inputtype=[0|1|2] ... Handle different input types: (0 for text,
|
||||
@ -364,6 +367,7 @@ $scconfig = "--scconfig $scconfig" if ($scconfig);
|
||||
|
||||
my $mert_extract_args=$mertargs;
|
||||
$mert_extract_args .=" $scconfig";
|
||||
$mert_extract_args .=" $extractorargs";
|
||||
|
||||
$mertmertargs = "" if !defined $mertmertargs;
|
||||
|
||||
|
@ -47,7 +47,14 @@ inline uint8_t BitPackShift(uint8_t bit, uint8_t length) {
|
||||
#endif
|
||||
|
||||
inline uint64_t ReadOff(const void *base, uint64_t bit_off) {
|
||||
#if defined(__arm) || defined(__arm__)
|
||||
const uint8_t *base_off = reinterpret_cast<const uint8_t*>(base) + (bit_off >> 3);
|
||||
uint64_t value64;
|
||||
memcpy(&value64, base_off, sizeof(value64));
|
||||
return value64;
|
||||
#else
|
||||
return *reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(base) + (bit_off >> 3));
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Pack integers up to 57 bits using their least significant digits.
|
||||
@ -75,7 +82,14 @@ inline void WriteInt57(void *base, uint64_t bit_off, uint8_t length, uint64_t va
|
||||
|
||||
/* Same caveats as above, but for a 25 bit limit. */
|
||||
inline uint32_t ReadInt25(const void *base, uint64_t bit_off, uint8_t length, uint32_t mask) {
|
||||
#if defined(__arm) || defined(__arm__)
|
||||
const uint8_t *base_off = reinterpret_cast<const uint8_t*>(base) + (bit_off >> 3);
|
||||
uint32_t value32;
|
||||
memcpy(&value32, base_off, sizeof(value32));
|
||||
return (value32 >> BitPackShift(bit_off & 7, length)) & mask;
|
||||
#else
|
||||
return (*reinterpret_cast<const uint32_t*>(reinterpret_cast<const uint8_t*>(base) + (bit_off >> 3)) >> BitPackShift(bit_off & 7, length)) & mask;
|
||||
#endif
|
||||
}
|
||||
|
||||
inline void WriteInt25(void *base, uint64_t bit_off, uint8_t length, uint32_t value) {
|
||||
|
@ -99,6 +99,13 @@ void WriteOrThrow(int fd, const void *data_void, std::size_t size) {
|
||||
}
|
||||
}
|
||||
|
||||
void FSyncOrThrow(int fd) {
|
||||
// Apparently windows doesn't have fsync?
|
||||
#if !defined(_WIN32) && !defined(_WIN64)
|
||||
UTIL_THROW_IF(-1 == fsync(fd), ErrnoException, "Sync of " << fd << " failed.");
|
||||
#endif
|
||||
}
|
||||
|
||||
namespace {
|
||||
void InternalSeek(int fd, off_t off, int whence) {
|
||||
UTIL_THROW_IF((off_t)-1 == lseek(fd, off, whence), ErrnoException, "Seek failed");
|
||||
|
@ -78,6 +78,8 @@ std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount);
|
||||
|
||||
void WriteOrThrow(int fd, const void *data_void, std::size_t size);
|
||||
|
||||
void FSyncOrThrow(int fd);
|
||||
|
||||
// Seeking
|
||||
void SeekOrThrow(int fd, uint64_t off);
|
||||
void AdvanceOrThrow(int fd, int64_t off);
|
||||
|
@ -24,12 +24,12 @@ ParseNumberException::ParseNumberException(StringPiece value) throw() {
|
||||
*this << "Could not parse \"" << value << "\" into a number";
|
||||
}
|
||||
|
||||
GZException::GZException(gzFile file) {
|
||||
#ifdef HAVE_ZLIB
|
||||
GZException::GZException(gzFile file) {
|
||||
int num;
|
||||
*this << gzerror( file, &num) << " from zlib";
|
||||
#endif // HAVE_ZLIB
|
||||
}
|
||||
#endif // HAVE_ZLIB
|
||||
|
||||
// Sigh this is the only way I could come up with to do a _const_ bool. It has ' ', '\f', '\n', '\r', '\t', and '\v' (same as isspace on C locale).
|
||||
const bool kSpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
|
||||
|
@ -27,7 +27,9 @@ class ParseNumberException : public Exception {
|
||||
|
||||
class GZException : public Exception {
|
||||
public:
|
||||
#ifdef HAVE_ZLIB
|
||||
explicit GZException(gzFile file);
|
||||
#endif
|
||||
GZException() throw() {}
|
||||
~GZException() throw() {}
|
||||
};
|
||||
|
@ -1,126 +0,0 @@
|
||||
#ifndef UTIL_KEY_VALUE_PACKING__
|
||||
#define UTIL_KEY_VALUE_PACKING__
|
||||
|
||||
/* Why such a general interface? I'm planning on doing bit-level packing. */
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstddef>
|
||||
#include <cstring>
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
namespace util {
|
||||
|
||||
template <class Key, class Value> struct Entry {
|
||||
Key key;
|
||||
Value value;
|
||||
|
||||
const Key &GetKey() const { return key; }
|
||||
const Value &GetValue() const { return value; }
|
||||
|
||||
Value &MutableValue() { return value; }
|
||||
|
||||
void Set(const Key &key_in, const Value &value_in) {
|
||||
SetKey(key_in);
|
||||
SetValue(value_in);
|
||||
}
|
||||
void SetKey(const Key &key_in) { key = key_in; }
|
||||
void SetValue(const Value &value_in) { value = value_in; }
|
||||
|
||||
bool operator<(const Entry<Key, Value> &other) const { return GetKey() < other.GetKey(); }
|
||||
};
|
||||
|
||||
// And now for a brief interlude to specialize std::swap.
|
||||
} // namespace util
|
||||
namespace std {
|
||||
template <class Key, class Value> void swap(util::Entry<Key, Value> &first, util::Entry<Key, Value> &second) {
|
||||
swap(first.key, second.key);
|
||||
swap(first.value, second.value);
|
||||
}
|
||||
}// namespace std
|
||||
namespace util {
|
||||
|
||||
template <class KeyT, class ValueT> class AlignedPacking {
|
||||
public:
|
||||
typedef KeyT Key;
|
||||
typedef ValueT Value;
|
||||
|
||||
public:
|
||||
static const std::size_t kBytes = sizeof(Entry<Key, Value>);
|
||||
static const std::size_t kBits = kBytes * 8;
|
||||
|
||||
typedef Entry<Key, Value> * MutableIterator;
|
||||
typedef const Entry<Key, Value> * ConstIterator;
|
||||
typedef const Entry<Key, Value> & ConstReference;
|
||||
|
||||
static MutableIterator FromVoid(void *start) {
|
||||
return reinterpret_cast<MutableIterator>(start);
|
||||
}
|
||||
|
||||
static Entry<Key, Value> Make(const Key &key, const Value &value) {
|
||||
Entry<Key, Value> ret;
|
||||
ret.Set(key, value);
|
||||
return ret;
|
||||
}
|
||||
};
|
||||
|
||||
template <class KeyT, class ValueT> class ByteAlignedPacking {
|
||||
public:
|
||||
typedef KeyT Key;
|
||||
typedef ValueT Value;
|
||||
|
||||
private:
|
||||
#pragma pack(push)
|
||||
#pragma pack(1)
|
||||
struct RawEntry {
|
||||
Key key;
|
||||
Value value;
|
||||
|
||||
const Key &GetKey() const { return key; }
|
||||
const Value &GetValue() const { return value; }
|
||||
|
||||
Value &MutableValue() { return value; }
|
||||
|
||||
void Set(const Key &key_in, const Value &value_in) {
|
||||
SetKey(key_in);
|
||||
SetValue(value_in);
|
||||
}
|
||||
void SetKey(const Key &key_in) { key = key_in; }
|
||||
void SetValue(const Value &value_in) { value = value_in; }
|
||||
|
||||
bool operator<(const RawEntry &other) const { return GetKey() < other.GetKey(); }
|
||||
};
|
||||
#pragma pack(pop)
|
||||
|
||||
friend void std::swap<>(RawEntry&, RawEntry&);
|
||||
|
||||
public:
|
||||
typedef RawEntry *MutableIterator;
|
||||
typedef const RawEntry *ConstIterator;
|
||||
typedef RawEntry &ConstReference;
|
||||
|
||||
static const std::size_t kBytes = sizeof(RawEntry);
|
||||
static const std::size_t kBits = kBytes * 8;
|
||||
|
||||
static MutableIterator FromVoid(void *start) {
|
||||
return MutableIterator(reinterpret_cast<RawEntry*>(start));
|
||||
}
|
||||
|
||||
static RawEntry Make(const Key &key, const Value &value) {
|
||||
RawEntry ret;
|
||||
ret.Set(key, value);
|
||||
return ret;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace util
|
||||
namespace std {
|
||||
template <class Key, class Value> void swap(
|
||||
typename util::ByteAlignedPacking<Key, Value>::RawEntry &first,
|
||||
typename util::ByteAlignedPacking<Key, Value>::RawEntry &second) {
|
||||
swap(first.key, second.key);
|
||||
swap(first.value, second.value);
|
||||
}
|
||||
}// namespace std
|
||||
|
||||
#endif // UTIL_KEY_VALUE_PACKING__
|
@ -1,75 +0,0 @@
|
||||
#include "util/key_value_packing.hh"
|
||||
|
||||
#include <boost/random/mersenne_twister.hpp>
|
||||
#include <boost/random/uniform_int.hpp>
|
||||
#include <boost/random/variate_generator.hpp>
|
||||
#include <boost/scoped_array.hpp>
|
||||
#define BOOST_TEST_MODULE KeyValueStoreTest
|
||||
#include <boost/test/unit_test.hpp>
|
||||
|
||||
#include <limits>
|
||||
#include <stdlib.h>
|
||||
|
||||
namespace util {
|
||||
namespace {
|
||||
|
||||
BOOST_AUTO_TEST_CASE(basic_in_out) {
|
||||
typedef ByteAlignedPacking<uint64_t, unsigned char> Packing;
|
||||
void *backing = malloc(Packing::kBytes * 2);
|
||||
Packing::MutableIterator i(Packing::FromVoid(backing));
|
||||
i->SetKey(10);
|
||||
BOOST_CHECK_EQUAL(10, i->GetKey());
|
||||
i->SetValue(3);
|
||||
BOOST_CHECK_EQUAL(3, i->GetValue());
|
||||
++i;
|
||||
i->SetKey(5);
|
||||
BOOST_CHECK_EQUAL(5, i->GetKey());
|
||||
i->SetValue(42);
|
||||
BOOST_CHECK_EQUAL(42, i->GetValue());
|
||||
|
||||
Packing::ConstIterator c(i);
|
||||
BOOST_CHECK_EQUAL(5, c->GetKey());
|
||||
--c;
|
||||
BOOST_CHECK_EQUAL(10, c->GetKey());
|
||||
BOOST_CHECK_EQUAL(42, i->GetValue());
|
||||
|
||||
BOOST_CHECK_EQUAL(5, i->GetKey());
|
||||
free(backing);
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(simple_sort) {
|
||||
typedef ByteAlignedPacking<uint64_t, unsigned char> Packing;
|
||||
char foo[Packing::kBytes * 4];
|
||||
Packing::MutableIterator begin(Packing::FromVoid(foo));
|
||||
Packing::MutableIterator i = begin;
|
||||
i->SetKey(0); ++i;
|
||||
i->SetKey(2); ++i;
|
||||
i->SetKey(3); ++i;
|
||||
i->SetKey(1); ++i;
|
||||
std::sort(begin, i);
|
||||
BOOST_CHECK_EQUAL(0, begin[0].GetKey());
|
||||
BOOST_CHECK_EQUAL(1, begin[1].GetKey());
|
||||
BOOST_CHECK_EQUAL(2, begin[2].GetKey());
|
||||
BOOST_CHECK_EQUAL(3, begin[3].GetKey());
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(big_sort) {
|
||||
typedef ByteAlignedPacking<uint64_t, unsigned char> Packing;
|
||||
boost::scoped_array<char> memory(new char[Packing::kBytes * 1000]);
|
||||
Packing::MutableIterator begin(Packing::FromVoid(memory.get()));
|
||||
|
||||
boost::mt19937 rng;
|
||||
boost::uniform_int<uint64_t> range(0, std::numeric_limits<uint64_t>::max());
|
||||
boost::variate_generator<boost::mt19937&, boost::uniform_int<uint64_t> > gen(rng, range);
|
||||
|
||||
for (size_t i = 0; i < 1000; ++i) {
|
||||
(begin + i)->SetKey(gen());
|
||||
}
|
||||
std::sort(begin, begin + 1000);
|
||||
for (size_t i = 0; i < 999; ++i) {
|
||||
BOOST_CHECK(begin[i] < begin[i+1]);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace util
|
21
util/mmap.cc
21
util/mmap.cc
@ -101,9 +101,10 @@ void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
int protectC = for_write ? PAGE_READWRITE : PAGE_READONLY;
|
||||
int protectM = for_write ? FILE_MAP_WRITE : FILE_MAP_READ;
|
||||
HANDLE hMapping = CreateFileMapping((HANDLE)_get_osfhandle(fd), NULL, protectC, 0, size + offset, NULL);
|
||||
uint64_t total_size = size + offset;
|
||||
HANDLE hMapping = CreateFileMapping((HANDLE)_get_osfhandle(fd), NULL, protectC, total_size >> 32, static_cast<DWORD>(total_size), NULL);
|
||||
UTIL_THROW_IF(!hMapping, ErrnoException, "CreateFileMapping failed");
|
||||
LPVOID ret = MapViewOfFile(hMapping, protectM, 0, offset, size);
|
||||
LPVOID ret = MapViewOfFile(hMapping, protectM, offset >> 32, offset, size);
|
||||
CloseHandle(hMapping);
|
||||
UTIL_THROW_IF(!ret, ErrnoException, "MapViewOfFile failed");
|
||||
#else
|
||||
@ -147,16 +148,20 @@ void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scope
|
||||
}
|
||||
}
|
||||
|
||||
void *MapAnonymous(std::size_t size) {
|
||||
return MapOrThrow(size, true,
|
||||
// Allocates zeroed memory in to.
|
||||
void MapAnonymous(std::size_t size, util::scoped_memory &to) {
|
||||
to.reset();
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
0 // MapOrThrow ignores the flags anyway.
|
||||
#elif defined(MAP_ANONYMOUS)
|
||||
MAP_ANONYMOUS | MAP_PRIVATE // Linux
|
||||
to.reset(calloc(1, size), size, scoped_memory::MALLOC_ALLOCATED);
|
||||
#else
|
||||
to.reset(MapOrThrow(size, true,
|
||||
# if defined(MAP_ANONYMOUS)
|
||||
MAP_ANONYMOUS | MAP_PRIVATE // Linux
|
||||
# else
|
||||
MAP_ANON | MAP_PRIVATE // BSD
|
||||
# endif
|
||||
, false, -1, 0), size, scoped_memory::MMAP_ALLOCATED);
|
||||
#endif
|
||||
, false, -1, 0);
|
||||
}
|
||||
|
||||
void *MapZeroedWrite(int fd, std::size_t size) {
|
||||
|
@ -100,7 +100,7 @@ void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int
|
||||
|
||||
void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scoped_memory &out);
|
||||
|
||||
void *MapAnonymous(std::size_t size);
|
||||
void MapAnonymous(std::size_t size, scoped_memory &to);
|
||||
|
||||
// Open file name with mmap of size bytes, all of which are initially zero.
|
||||
void *MapZeroedWrite(int fd, std::size_t size);
|
||||
|
@ -7,9 +7,11 @@
|
||||
* placed in namespace util
|
||||
* add MurmurHashNative
|
||||
* default option = 0 for seed
|
||||
* ARM port from NICT
|
||||
*/
|
||||
|
||||
#include "util/murmur_hash.hh"
|
||||
#include <string.h>
|
||||
|
||||
namespace util {
|
||||
|
||||
@ -28,12 +30,24 @@ uint64_t MurmurHash64A ( const void * key, std::size_t len, unsigned int seed )
|
||||
|
||||
uint64_t h = seed ^ (len * m);
|
||||
|
||||
#if defined(__arm) || defined(__arm__)
|
||||
const size_t ksize = sizeof(uint64_t);
|
||||
const unsigned char * data = (const unsigned char *)key;
|
||||
const unsigned char * end = data + (std::size_t)(len/8) * ksize;
|
||||
#else
|
||||
const uint64_t * data = (const uint64_t *)key;
|
||||
const uint64_t * end = data + (len/8);
|
||||
#endif
|
||||
|
||||
while(data != end)
|
||||
{
|
||||
#if defined(__arm) || defined(__arm__)
|
||||
uint64_t k;
|
||||
memcpy(&k, data, ksize);
|
||||
data += ksize;
|
||||
#else
|
||||
uint64_t k = *data++;
|
||||
#endif
|
||||
|
||||
k *= m;
|
||||
k ^= k >> r;
|
||||
@ -75,16 +89,30 @@ uint64_t MurmurHash64B ( const void * key, std::size_t len, unsigned int seed )
|
||||
unsigned int h1 = seed ^ len;
|
||||
unsigned int h2 = 0;
|
||||
|
||||
#if defined(__arm) || defined(__arm__)
|
||||
size_t ksize = sizeof(unsigned int);
|
||||
const unsigned char * data = (const unsigned char *)key;
|
||||
#else
|
||||
const unsigned int * data = (const unsigned int *)key;
|
||||
#endif
|
||||
|
||||
unsigned int k1, k2;
|
||||
while(len >= 8)
|
||||
{
|
||||
unsigned int k1 = *data++;
|
||||
#if defined(__arm) || defined(__arm__)
|
||||
memcpy(&k1, data, ksize);
|
||||
data += ksize;
|
||||
memcpy(&k2, data, ksize);
|
||||
data += ksize;
|
||||
#else
|
||||
k1 = *data++;
|
||||
k2 = *data++;
|
||||
#endif
|
||||
|
||||
k1 *= m; k1 ^= k1 >> r; k1 *= m;
|
||||
h1 *= m; h1 ^= k1;
|
||||
len -= 4;
|
||||
|
||||
unsigned int k2 = *data++;
|
||||
k2 *= m; k2 ^= k2 >> r; k2 *= m;
|
||||
h2 *= m; h2 ^= k2;
|
||||
len -= 4;
|
||||
@ -92,7 +120,12 @@ uint64_t MurmurHash64B ( const void * key, std::size_t len, unsigned int seed )
|
||||
|
||||
if(len >= 4)
|
||||
{
|
||||
unsigned int k1 = *data++;
|
||||
#if defined(__arm) || defined(__arm__)
|
||||
memcpy(&k1, data, ksize);
|
||||
data += ksize;
|
||||
#else
|
||||
k1 = *data++;
|
||||
#endif
|
||||
k1 *= m; k1 ^= k1 >> r; k1 *= m;
|
||||
h1 *= m; h1 ^= k1;
|
||||
len -= 4;
|
||||
|
@ -5,6 +5,7 @@
|
||||
#include <boost/random/variate_generator.hpp>
|
||||
#include <boost/scoped_array.hpp>
|
||||
#include <boost/unordered_map.hpp>
|
||||
|
||||
#define BOOST_TEST_MODULE SortedUniformTest
|
||||
#include <boost/test/unit_test.hpp>
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user