Merge branch 'master' of github.com:moses-smt/mosesdecoder

This commit is contained in:
Christian Buck 2012-03-03 00:46:45 +00:00
commit 90b096b382
55 changed files with 1090 additions and 629 deletions

View File

@ -196,7 +196,7 @@ class Moses():
def traverse_incrementally(self,table,models,load_lines,store_flag,inverted=False):
def traverse_incrementally(self,table,models,load_lines,store_flag,inverted=False,lowmem=False):
"""hack-ish way to find common phrase pairs in multiple models in one traversal without storing it all in memory
relies on alphabetical sorting of phrase table.
"""
@ -209,7 +209,9 @@ class Moses():
self.phrase_pairs = defaultdict(lambda: defaultdict(lambda: [[[0]*len(self.models) for i in range(self.number_of_features)],[]]))
self.reordering_pairs = defaultdict(lambda: defaultdict(lambda: [[0]*len(self.models) for i in range(self.number_of_features)]))
self.phrase_source = defaultdict(lambda: [0]*len(self.models))
self.phrase_target = defaultdict(lambda: [0]*len(self.models))
if lowmem:
self.phrase_target = defaultdict(lambda: [0]*len(self.models))
for model,priority,i in models:
@ -451,10 +453,10 @@ class Moses():
return line
def create_inverse(self,fobj):
def create_inverse(self,fobj,tempdir=None):
"""swap source and target phrase in the phrase table, and then sort (by target phrase)"""
inverse = NamedTemporaryFile(prefix='inv_unsorted',delete=False)
inverse = NamedTemporaryFile(prefix='inv_unsorted',delete=False,dir=tempdir)
swap = re.compile(b'(.+?) \|\|\| (.+?) \|\|\|')
# just swap source and target phrase, and leave order of scores etc. intact.
@ -463,7 +465,7 @@ class Moses():
inverse.write(swap.sub(b'\\2 ||| \\1 |||',line,1))
inverse.close()
inverse_sorted = sort_file(inverse.name)
inverse_sorted = sort_file(inverse.name,tempdir=tempdir)
os.remove(inverse.name)
return inverse_sorted
@ -1254,14 +1256,16 @@ def handle_file(filename,action,fileobj=None,mode='r'):
fileobj.close()
def sort_file(filename):
def sort_file(filename,tempdir=None):
"""Sort a file and return temporary file"""
cmd = ['sort', filename]
env = {}
env['LC_ALL'] = 'C'
if tempdir:
cmd.extend(['-T',tempdir])
outfile = NamedTemporaryFile(delete=False)
outfile = NamedTemporaryFile(delete=False,dir=tempdir)
sys.stderr.write('LC_ALL=C ' + ' '.join(cmd) + ' > ' + outfile.name + '\n')
p = Popen(cmd,env=env,stdout=outfile.file)
p.wait()
@ -1344,6 +1348,8 @@ class Combine_TMs():
lowmem: low memory mode: instead of loading target phrase counts / probability (when required), process the original table and its inversion (source and target swapped) incrementally, then merge the two halves.
tempdir: temporary directory (for low memory mode).
there are a number of further configuration options that you can define, which modify the algorithm for linear interpolation. They have no effect in mode 'counts'
recompute_lexweights: don't directly interpolate lexical weights, but interpolate word translation probabilities instead and recompute the lexical weights.
@ -1507,25 +1513,25 @@ class Combine_TMs():
self.loaded['pt-target'] = 1
def _inverse_wrapper(self,weights):
def _inverse_wrapper(self,weights,tempdir=None):
"""if we want to invert the phrase table to better calcualte p(s|t) and lex(s|t), manage creation, sorting and merging of inverted phrase tables"""
sys.stderr.write('Processing first table half\n')
models = [(self.model_interface.open_table(model,'phrase-table'),priority,i) for (model,priority,i) in priority_sort_models(self.model_interface.models)]
pt_half1 = NamedTemporaryFile(prefix='half1',delete=False)
pt_half1 = NamedTemporaryFile(prefix='half1',delete=False,dir=tempdir)
self._write_phrasetable(models,pt_half1,weights)
pt_half1.seek(0)
sys.stderr.write('Inverting tables\n')
models = [(self.model_interface.create_inverse(self.model_interface.open_table(model,'phrase-table')),priority,i) for (model,priority,i) in priority_sort_models(self.model_interface.models)]
models = [(self.model_interface.create_inverse(self.model_interface.open_table(model,'phrase-table'),tempdir=tempdir),priority,i) for (model,priority,i) in priority_sort_models(self.model_interface.models)]
sys.stderr.write('Processing second table half\n')
pt_half2_inverted = NamedTemporaryFile(prefix='half2',delete=False)
pt_half2_inverted = NamedTemporaryFile(prefix='half2',delete=False,dir=tempdir)
self._write_phrasetable(models,pt_half2_inverted,weights,inverted=True)
pt_half2_inverted.close()
for model,priority,i in models:
model.close()
os.remove(model.name)
pt_half2 = sort_file(pt_half2_inverted.name)
pt_half2 = sort_file(pt_half2_inverted.name,tempdir=tempdir)
os.remove(pt_half2_inverted.name)
sys.stderr.write('Merging tables: first half: {0} ; second half: {1} ; final table: {2}\n'.format(pt_half1.name,pt_half2.name,self.output_file))
@ -1549,7 +1555,7 @@ class Combine_TMs():
i = 0
sys.stderr.write('Incrementally loading and processing phrase tables...')
for block in self.model_interface.traverse_incrementally('phrase-table',models,self.load_lines,store_flag,inverted=inverted):
for block in self.model_interface.traverse_incrementally('phrase-table',models,self.load_lines,store_flag,inverted=inverted,lowmem=self.flags['lowmem']):
for src in sorted(self.model_interface.phrase_pairs, key = lambda x: x + b' |'):
for target in sorted(self.model_interface.phrase_pairs[src], key = lambda x: x + b' |'):
@ -1586,7 +1592,7 @@ class Combine_TMs():
self._ensure_loaded(data)
if self.flags['lowmem'] and (self.mode == 'counts' or self.flags['normalized'] and self.flags['normalize_s_given_t'] == 't'):
self._inverse_wrapper(weights)
self._inverse_wrapper(weights,tempdir=self.flags['tempdir'])
else:
models = [(self.model_interface.open_table(model,'phrase-table'),priority,i) for (model,priority,i) in priority_sort_models(self.model_interface.models)]
output_object = handle_file(self.output_file,'open',mode='w')
@ -1631,12 +1637,12 @@ class Combine_TMs():
sys.stderr.write('Error: only linear interpolation is supported for reordering model combination')
output_object = handle_file(self.output_file,'open',mode='w')
models = [(self.open_table(model,table),priority,i) for (model,priority,i) in priority_sort_models(self.models)]
models = [(self.model_interface.open_table(model,'reordering-table'),priority,i) for (model,priority,i) in priority_sort_models(self.models)]
i = 0
sys.stderr.write('Incrementally loading and processing phrase tables...')
for block in self.model_interface.traverse_incrementally('reordering-table',models,self.model_interface.load_reordering_probabilities,'pairs'):
for block in self.model_interface.traverse_incrementally('reordering-table',models,self.model_interface.load_reordering_probabilities,'pairs',lowmem=self.flags['lowmem']):
for src in sorted(self.model_interface.reordering_pairs):
for target in sorted(self.model_interface.reordering_pairs[src]):
@ -1829,6 +1835,11 @@ def parse_command_line():
parser.add_argument('--recompute_lexweights', action="store_true",
help=('don\'t directly interpolate lexical weights, but interpolate word translation probabilities instead and recompute the lexical weights. Only relevant in mode "interpolate".'))
parser.add_argument('--tempdir', type=str,
default=None,
help=('Temporary directory in --lowmem mode.'))
return parser.parse_args()
if __name__ == "__main__":
@ -1842,7 +1853,7 @@ if __name__ == "__main__":
else:
args = parse_command_line()
#initialize
combiner = Combine_TMs([(m,'primary') for m in args.model],weights=args.weights,mode=args.mode,output_file=args.output,reference_file=args.reference,output_lexical=args.output_lexical,lowmem=args.lowmem,normalized=args.normalized,recompute_lexweights=args.recompute_lexweights)
combiner = Combine_TMs([(m,'primary') for m in args.model],weights=args.weights,mode=args.mode,output_file=args.output,reference_file=args.reference,output_lexical=args.output_lexical,lowmem=args.lowmem,normalized=args.normalized,recompute_lexweights=args.recompute_lexweights,tempdir=args.tempdir)
# execute right method
f_string = "combiner."+args.action+'()'
exec(f_string)

View File

@ -87,7 +87,7 @@ uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_
strncpy(reinterpret_cast<char*>(backing.vocab.get()), kMagicIncomplete, TotalHeaderSize(order));
return reinterpret_cast<uint8_t*>(backing.vocab.get()) + TotalHeaderSize(order);
} else {
backing.vocab.reset(util::MapAnonymous(memory_size), memory_size, util::scoped_memory::MMAP_ALLOCATED);
util::MapAnonymous(memory_size, backing.vocab);
return reinterpret_cast<uint8_t*>(backing.vocab.get());
}
}
@ -103,32 +103,44 @@ uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t
throw e;
}
if (config.write_method == Config::WRITE_AFTER) {
util::MapAnonymous(memory_size, backing.search);
return reinterpret_cast<uint8_t*>(backing.search.get());
}
// mmap it now.
// We're skipping over the header and vocab for the search space mmap. mmap likes page aligned offsets, so some arithmetic to round the offset down.
std::size_t page_size = util::SizePage();
std::size_t alignment_cruft = adjusted_vocab % page_size;
backing.search.reset(util::MapOrThrow(alignment_cruft + memory_size, true, util::kFileFlags, false, backing.file.get(), adjusted_vocab - alignment_cruft), alignment_cruft + memory_size, util::scoped_memory::MMAP_ALLOCATED);
return reinterpret_cast<uint8_t*>(backing.search.get()) + alignment_cruft;
} else {
backing.search.reset(util::MapAnonymous(memory_size), memory_size, util::scoped_memory::MMAP_ALLOCATED);
util::MapAnonymous(memory_size, backing.search);
return reinterpret_cast<uint8_t*>(backing.search.get());
}
}
void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, Backing &backing) {
if (config.write_mmap) {
util::SyncOrThrow(backing.search.get(), backing.search.size());
util::SyncOrThrow(backing.vocab.get(), backing.vocab.size());
// header and vocab share the same mmap. The header is written here because we know the counts.
Parameters params = Parameters();
params.counts = counts;
params.fixed.order = counts.size();
params.fixed.probing_multiplier = config.probing_multiplier;
params.fixed.model_type = model_type;
params.fixed.has_vocabulary = config.include_vocab;
params.fixed.search_version = search_version;
WriteHeader(backing.vocab.get(), params);
// Finalizes the binary file after the model has been built.
// Does nothing unless config.write_mmap is set. Otherwise it flushes the vocab
// mapping, persists the search structure according to config.write_method, and
// last of all writes the header through the vocab mapping.
//   counts:     n-gram counts stored in the header; its size is recorded as the order.
//   vocab_pad:  added to backing.vocab.size() to get the file offset at which the
//               search structure is written in WRITE_AFTER mode.
//   backing:    supplies the vocab mapping, the search memory and the output file.
void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, std::size_t vocab_pad, Backing &backing) {
// No output file requested: nothing to finish.
if (!config.write_mmap) return;
util::SyncOrThrow(backing.vocab.get(), backing.vocab.size());
switch (config.write_method) {
case Config::WRITE_MMAP:
// Flush the search mapping.
util::SyncOrThrow(backing.search.get(), backing.search.size());
break;
case Config::WRITE_AFTER:
// Write the search memory into the file explicitly: seek past the vocab
// section plus its padding, write the whole buffer, then fsync the file.
util::SeekOrThrow(backing.file.get(), backing.vocab.size() + vocab_pad);
util::WriteOrThrow(backing.file.get(), backing.search.get(), backing.search.size());
util::FSyncOrThrow(backing.file.get());
break;
}
// header and vocab share the same mmap. The header is written here because we know the counts.
Parameters params = Parameters();
params.counts = counts;
params.fixed.order = counts.size();
params.fixed.probing_multiplier = config.probing_multiplier;
params.fixed.model_type = model_type;
params.fixed.has_vocabulary = config.include_vocab;
params.fixed.search_version = search_version;
WriteHeader(backing.vocab.get(), params);
}
namespace detail {
@ -172,7 +184,7 @@ void ReadHeader(int fd, Parameters &out) {
UTIL_THROW(FormatLoadException, "Binary format claims to have a probing multiplier of " << out.fixed.probing_multiplier << " which is < 1.0.");
out.counts.resize(static_cast<std::size_t>(out.fixed.order));
util::ReadOrThrow(fd, &*out.counts.begin(), sizeof(uint64_t) * out.fixed.order);
if (out.fixed.order) util::ReadOrThrow(fd, &*out.counts.begin(), sizeof(uint64_t) * out.fixed.order);
}
void MatchCheck(ModelType model_type, unsigned int search_version, const Parameters &params) {

View File

@ -58,7 +58,7 @@ uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t
// Write header to binary file. This is done last to prevent incomplete files
// from loading.
void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, Backing &backing);
void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, std::size_t vocab_pad, Backing &backing);
namespace detail {

View File

@ -18,11 +18,14 @@ namespace ngram {
namespace {
void Usage(const char *name) {
std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [-q bits] [-b bits] [-a bits] [type] input.arpa [output.mmap]\n\n"
std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-w mmap|after] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [-q bits] [-b bits] [-a bits] [type] input.arpa [output.mmap]\n\n"
"-u sets the log10 probability for <unk> if the ARPA file does not have one.\n"
" Default is -100. The ARPA file will always take precedence.\n"
"-s allows models to be built even if they do not have <s> and </s>.\n"
"-i allows buggy models from IRSTLM by mapping positive log probability to 0.\n\n"
"-i allows buggy models from IRSTLM by mapping positive log probability to 0.\n"
"-w mmap|after determines how writing is done.\n"
" mmap maps the binary file and writes to it. Default for trie.\n"
" after allocates anonymous memory, builds, and writes. Default for probing.\n\n"
"type is either probing or trie. Default is probing.\n\n"
"probing uses a probing hash table. It is the fastest but uses the most memory.\n"
"-p sets the space multiplier and must be >1.0. The default is 1.5.\n\n"
@ -58,7 +61,7 @@ uint8_t ParseBitCount(const char *from) {
unsigned long val = ParseUInt(from);
if (val > 25) {
util::ParseNumberException e(from);
e << " bit counts are limited to 256.";
e << " bit counts are limited to 25.";
}
return val;
}
@ -115,10 +118,10 @@ int main(int argc, char *argv[]) {
using namespace lm::ngram;
try {
bool quantize = false, set_backoff_bits = false, bhiksha = false;
bool quantize = false, set_backoff_bits = false, bhiksha = false, set_write_method = false;
lm::ngram::Config config;
int opt;
while ((opt = getopt(argc, argv, "siu:p:t:m:q:b:a:")) != -1) {
while ((opt = getopt(argc, argv, "q:b:a:u:p:t:m:w:si")) != -1) {
switch(opt) {
case 'q':
config.prob_bits = ParseBitCount(optarg);
@ -132,6 +135,7 @@ int main(int argc, char *argv[]) {
case 'a':
config.pointer_bhiksha_bits = ParseBitCount(optarg);
bhiksha = true;
break;
case 'u':
config.unknown_missing_logprob = ParseFloat(optarg);
break;
@ -144,6 +148,16 @@ int main(int argc, char *argv[]) {
case 'm':
config.building_memory = ParseUInt(optarg) * 1048576;
break;
case 'w':
set_write_method = true;
if (!strcmp(optarg, "mmap")) {
config.write_method = Config::WRITE_MMAP;
} else if (!strcmp(optarg, "after")) {
config.write_method = Config::WRITE_AFTER;
} else {
Usage(argv[0]);
}
break;
case 's':
config.sentence_marker_missing = lm::SILENT;
break;
@ -160,45 +174,45 @@ int main(int argc, char *argv[]) {
}
if (optind + 1 == argc) {
ShowSizes(argv[optind], config);
return 0;
}
const char *model_type, *from_file;
if (optind + 2 == argc) {
model_type = "probing";
from_file = argv[optind];
} else if (optind + 2 == argc) {
config.write_mmap = argv[optind + 1];
} else if (optind + 3 == argc) {
model_type = argv[optind];
from_file = argv[optind + 1];
config.write_mmap = argv[optind + 2];
} else {
Usage(argv[0]);
}
if (!strcmp(model_type, "probing")) {
if (quantize || set_backoff_bits) ProbingQuantizationUnsupported();
ProbingModel(from_file, config);
} else if (!strcmp(model_type, "trie")) {
if (quantize) {
if (bhiksha) {
QuantArrayTrieModel(from_file, config);
ProbingModel(argv[optind], config);
} else if (optind + 3 == argc) {
const char *model_type = argv[optind];
const char *from_file = argv[optind + 1];
config.write_mmap = argv[optind + 2];
if (!strcmp(model_type, "probing")) {
if (!set_write_method) config.write_method = Config::WRITE_AFTER;
if (quantize || set_backoff_bits) ProbingQuantizationUnsupported();
ProbingModel(from_file, config);
} else if (!strcmp(model_type, "trie")) {
if (!set_write_method) config.write_method = Config::WRITE_MMAP;
if (quantize) {
if (bhiksha) {
QuantArrayTrieModel(from_file, config);
} else {
QuantTrieModel(from_file, config);
}
} else {
QuantTrieModel(from_file, config);
if (bhiksha) {
ArrayTrieModel(from_file, config);
} else {
TrieModel(from_file, config);
}
}
} else {
if (bhiksha) {
ArrayTrieModel(from_file, config);
} else {
TrieModel(from_file, config);
}
Usage(argv[0]);
}
} else {
Usage(argv[0]);
}
std::cerr << "Built " << config.write_mmap << " successfully." << std::endl;
} catch (const std::exception &e) {
}
catch (const std::exception &e) {
std::cerr << e.what() << std::endl;
std::cerr << "ERROR" << std::endl;
return 1;
}
std::cerr << "SUCCESS" << std::endl;
return 0;
}

View File

@ -17,6 +17,7 @@ Config::Config() :
temporary_directory_prefix(NULL),
arpa_complain(ALL),
write_mmap(NULL),
write_method(WRITE_AFTER),
include_vocab(true),
prob_bits(8),
backoff_bits(8),

View File

@ -70,9 +70,17 @@ struct Config {
// to NULL to disable.
const char *write_mmap;
// Selects how the binary file gets written: either through a mapping of the
// file itself, or from memory once building has finished.
typedef enum {
WRITE_MMAP, // Map the file directly.
WRITE_AFTER // Write after we're done.
} WriteMethod;
// The write strategy to use.
// NOTE(review): presumably only meaningful when write_mmap != NULL - confirm.
WriteMethod write_method;
// Include the vocab in the binary file? Only effective if write_mmap != NULL.
bool include_vocab;
// Quantization options. Only effective for QuantTrieModel. One value is
// reserved for each of prob and backoff, so 2^bits - 1 buckets will be used
// to quantize (and one of the remaining backoffs will be 0).

View File

@ -46,7 +46,7 @@ template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::Ge
template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::InitializeFromBinary(void *start, const Parameters &params, const Config &config, int fd) {
SetupMemory(start, params.counts, config);
vocab_.LoadedBinary(fd, config.enumerate_vocab);
vocab_.LoadedBinary(params.fixed.has_vocabulary, fd, config.enumerate_vocab);
search_.LoadedBinary();
}
@ -82,7 +82,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
search_.unigram.Unknown().backoff = 0.0;
search_.unigram.Unknown().prob = config.unknown_missing_logprob;
}
FinishFile(config, kModelType, kVersion, counts, backing_);
FinishFile(config, kModelType, kVersion, counts, vocab_.UnkCountChangePadding(), backing_);
} catch (util::Exception &e) {
e << " Byte: " << f.Offset();
throw;
@ -119,7 +119,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
}
float backoff;
// i is the order of the backoff we're looking for.
const Middle *mid_iter = search_.MiddleBegin() + start - 2;
typename Search::MiddleIter mid_iter = search_.MiddleBegin() + start - 2;
for (const WordIndex *i = context_rbegin + start - 1; i < context_rend; ++i, ++mid_iter) {
if (!search_.LookupMiddleNoProb(*mid_iter, *i, backoff, node)) break;
ret.prob += backoff;
@ -139,7 +139,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
search_.LookupUnigram(*context_rbegin, out_state.backoff[0], node, ignored);
out_state.length = HasExtension(out_state.backoff[0]) ? 1 : 0;
float *backoff_out = out_state.backoff + 1;
const typename Search::Middle *mid = search_.MiddleBegin();
typename Search::MiddleIter mid(search_.MiddleBegin());
for (const WordIndex *i = context_rbegin + 1; i < context_rend; ++i, ++backoff_out, ++mid) {
if (!search_.LookupMiddleNoProb(*mid, *i, *backoff_out, node)) {
std::copy(context_rbegin, context_rbegin + out_state.length, out_state.words);
@ -166,7 +166,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
// If this function is called, then it does depend on left words.
ret.independent_left = false;
ret.extend_left = extend_pointer;
const typename Search::Middle *mid_iter = search_.MiddleBegin() + extend_length - 1;
typename Search::MiddleIter mid_iter(search_.MiddleBegin() + extend_length - 1);
const WordIndex *i = add_rbegin;
for (; ; ++i, ++backoff_out, ++mid_iter) {
if (i == add_rend) {
@ -235,7 +235,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
// Ok start by looking up the bigram.
const WordIndex *hist_iter = context_rbegin;
const typename Search::Middle *mid_iter = search_.MiddleBegin();
typename Search::MiddleIter mid_iter(search_.MiddleBegin());
for (; ; ++mid_iter, ++hist_iter, ++backoff_out) {
if (hist_iter == context_rend) {
// Ran out of history. Typically no backoff, but this could be a blank.

View File

@ -20,11 +20,11 @@ namespace ngram {
namespace {
void MakeBins(float *values, float *values_end, float *centers, uint32_t bins) {
std::sort(values, values_end);
const float *start = values, *finish;
void MakeBins(std::vector<float> &values, float *centers, uint32_t bins) {
std::sort(values.begin(), values.end());
std::vector<float>::const_iterator start = values.begin(), finish;
for (uint32_t i = 0; i < bins; ++i, ++centers, start = finish) {
finish = values + (((values_end - values) * static_cast<uint64_t>(i + 1)) / bins);
finish = values.begin() + ((values.size() * static_cast<uint64_t>(i + 1)) / bins);
if (finish == start) {
// zero length bucket.
*centers = i ? *(centers - 1) : -std::numeric_limits<float>::infinity();
@ -66,12 +66,12 @@ void SeparatelyQuantize::Train(uint8_t order, std::vector<float> &prob, std::vec
float *centers = start_ + TableStart(order) + ProbTableLength();
*(centers++) = kNoExtensionBackoff;
*(centers++) = kExtensionBackoff;
MakeBins(&*backoff.begin(), &*backoff.end(), centers, (1ULL << backoff_bits_) - 2);
MakeBins(backoff, centers, (1ULL << backoff_bits_) - 2);
}
void SeparatelyQuantize::TrainProb(uint8_t order, std::vector<float> &prob) {
float *centers = start_ + TableStart(order);
MakeBins(&*prob.begin(), &*prob.end(), centers, (1ULL << prob_bits_));
MakeBins(prob, centers, (1ULL << prob_bits_));
}
void SeparatelyQuantize::FinishedLoading(const Config &config) {

View File

@ -84,9 +84,11 @@ template <class Middle> void FixSRI(int lower, float negative_lower_prob, unsign
}
template <class Voc, class Store, class Middle, class Activate> void ReadNGrams(util::FilePiece &f, const unsigned int n, const size_t count, const Voc &vocab, ProbBackoff *unigrams, std::vector<Middle> &middle, Activate activate, Store &store, PositiveProbWarn &warn) {
assert(n >= 2);
ReadNGramHeader(f, n);
// vocab ids of words in reverse order
// Both vocab_ids and keys are non-empty because n >= 2.
// vocab ids of words in reverse order.
std::vector<WordIndex> vocab_ids(n);
std::vector<uint64_t> keys(n-1);
typename Store::Entry::Value value;
@ -147,7 +149,7 @@ template <class MiddleT, class LongestT> uint8_t *TemplateHashedSearch<MiddleT,
template <class MiddleT, class LongestT> template <class Voc> void TemplateHashedSearch<MiddleT, LongestT>::InitializeFromARPA(const char * /*file*/, util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, Voc &vocab, Backing &backing) {
// TODO: fix sorted.
SetupMemory(GrowForSearch(config, 0, Size(counts, config), backing), counts, config);
SetupMemory(GrowForSearch(config, vocab.UnkCountChangePadding(), Size(counts, config), backing), counts, config);
PositiveProbWarn warn(config.positive_log_probability);

View File

@ -91,8 +91,10 @@ template <class MiddleT, class LongestT> class TemplateHashedSearch : public Has
template <class Voc> void InitializeFromARPA(const char *file, util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, Voc &vocab, Backing &backing);
const Middle *MiddleBegin() const { return &*middle_.begin(); }
const Middle *MiddleEnd() const { return &*middle_.end(); }
typedef typename std::vector<Middle>::const_iterator MiddleIter;
MiddleIter MiddleBegin() const { return middle_.begin(); }
MiddleIter MiddleEnd() const { return middle_.end(); }
Node Unpack(uint64_t extend_pointer, unsigned char extend_length, float &prob) const {
util::FloatEnc val;

View File

@ -197,7 +197,7 @@ class SRISucks {
void ObtainBackoffs(unsigned char total_order, FILE *unigram_file, RecordReader *reader) {
for (unsigned char i = 0; i < kMaxOrder - 1; ++i) {
it_[i] = &*values_[i].begin();
it_[i] = values_[i].empty() ? NULL : &*values_[i].begin();
}
messages_[0].Apply(it_, unigram_file);
BackoffMessages *messages = messages_ + 1;
@ -229,8 +229,8 @@ class SRISucks {
class FindBlanks {
public:
FindBlanks(uint64_t *counts, unsigned char order, const ProbBackoff *unigrams, SRISucks &messages)
: counts_(counts), longest_counts_(counts + order - 1), unigrams_(unigrams), sri_(messages) {}
FindBlanks(unsigned char order, const ProbBackoff *unigrams, SRISucks &messages)
: counts_(order), unigrams_(unigrams), sri_(messages) {}
float UnigramProb(WordIndex index) const {
return unigrams_[index].prob;
@ -250,7 +250,7 @@ class FindBlanks {
}
void Longest(const void * /*data*/) {
++*longest_counts_;
++counts_.back();
}
// Unigrams wrote one past.
@ -258,8 +258,12 @@ class FindBlanks {
--counts_[0];
}
const std::vector<uint64_t> &Counts() const {
return counts_;
}
private:
uint64_t *const counts_, *const longest_counts_;
std::vector<uint64_t> counts_;
const ProbBackoff *unigrams_;
@ -473,14 +477,15 @@ template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::ve
}
SRISucks sri;
std::vector<uint64_t> fixed_counts(counts.size());
std::vector<uint64_t> fixed_counts;
util::scoped_FILE unigram_file;
util::scoped_fd unigram_fd(files.StealUnigram());
{
util::scoped_memory unigrams;
MapRead(util::POPULATE_OR_READ, unigram_fd.get(), 0, counts[0] * sizeof(ProbBackoff), unigrams);
FindBlanks finder(&*fixed_counts.begin(), counts.size(), reinterpret_cast<const ProbBackoff*>(unigrams.get()), sri);
FindBlanks finder(counts.size(), reinterpret_cast<const ProbBackoff*>(unigrams.get()), sri);
RecursiveInsert(counts.size(), counts[0], inputs, config.messages, "Identifying n-grams omitted by SRI", finder);
fixed_counts = finder.Counts();
}
unigram_file.reset(util::FDOpenOrThrow(unigram_fd));
for (const RecordReader *i = inputs; i != inputs + counts.size() - 2; ++i) {

View File

@ -62,6 +62,8 @@ template <class Quant, class Bhiksha> class TrieSearch {
void LoadedBinary();
typedef const Middle *MiddleIter;
const Middle *MiddleBegin() const { return middle_begin_; }
const Middle *MiddleEnd() const { return middle_end_; }

View File

@ -83,7 +83,12 @@ FILE *WriteContextFile(uint8_t *begin, uint8_t *end, const util::TempMaker &make
PartialIter context_begin(PartialViewProxy(begin + sizeof(WordIndex), entry_size, context_size));
PartialIter context_end(PartialViewProxy(end + sizeof(WordIndex), entry_size, context_size));
std::sort(context_begin, context_end, util::SizedCompare<EntryCompare, PartialViewProxy>(EntryCompare(order - 1)));
#if defined(_WIN32) || defined(_WIN64)
std::stable_sort
#else
std::sort
#endif
(context_begin, context_end, util::SizedCompare<EntryCompare, PartialViewProxy>(EntryCompare(order - 1)));
util::scoped_FILE out(maker.MakeFile());
@ -157,7 +162,10 @@ void RecordReader::Overwrite(const void *start, std::size_t amount) {
UTIL_THROW_IF(fseek(file_, internal - entry_size_, SEEK_CUR), util::ErrnoException, "Couldn't seek backwards for revision");
WriteOrThrow(file_, start, amount);
long forward = entry_size_ - internal - amount;
if (forward) UTIL_THROW_IF(fseek(file_, forward, SEEK_CUR), util::ErrnoException, "Couldn't seek forwards past revision");
#if !defined(_WIN32) && !defined(_WIN64)
if (forward)
#endif
UTIL_THROW_IF(fseek(file_, forward, SEEK_CUR), util::ErrnoException, "Couldn't seek forwards past revision");
}
void RecordReader::Rewind() {
@ -244,8 +252,13 @@ void SortedFiles::ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vo
}
// Sort full records by full n-gram.
util::SizedProxy proxy_begin(begin, entry_size), proxy_end(out_end, entry_size);
// parallel_sort uses too much RAM
std::sort(NGramIter(proxy_begin), NGramIter(proxy_end), util::SizedCompare<EntryCompare>(EntryCompare(order)));
// parallel_sort uses too much RAM. TODO: figure out why windows sort doesn't like my proxies.
#if defined(_WIN32) || defined(_WIN64)
std::stable_sort
#else
std::sort
#endif
(NGramIter(proxy_begin), NGramIter(proxy_end), util::SizedCompare<EntryCompare>(EntryCompare(order)));
files.push_back(DiskFlush(begin, out_end, maker));
contexts.push_back(WriteContextFile(begin, out_end, maker, entry_size, order));

View File

@ -125,8 +125,10 @@ WordIndex SortedVocabulary::Insert(const StringPiece &str) {
void SortedVocabulary::FinishedLoading(ProbBackoff *reorder_vocab) {
if (enumerate_) {
util::PairedIterator<ProbBackoff*, std::string*> values(reorder_vocab + 1, &*strings_to_enumerate_.begin());
util::JointSort(begin_, end_, values);
if (!strings_to_enumerate_.empty()) {
util::PairedIterator<ProbBackoff*, std::string*> values(reorder_vocab + 1, &*strings_to_enumerate_.begin());
util::JointSort(begin_, end_, values);
}
for (WordIndex i = 0; i < static_cast<WordIndex>(end_ - begin_); ++i) {
// <unk> strikes again: +1 here.
enumerate_->Add(i + 1, strings_to_enumerate_[i]);
@ -142,11 +144,11 @@ void SortedVocabulary::FinishedLoading(ProbBackoff *reorder_vocab) {
bound_ = end_ - begin_ + 1;
}
void SortedVocabulary::LoadedBinary(int fd, EnumerateVocab *to) {
void SortedVocabulary::LoadedBinary(bool have_words, int fd, EnumerateVocab *to) {
end_ = begin_ + *(reinterpret_cast<const uint64_t*>(begin_) - 1);
SetSpecial(Index("<s>"), Index("</s>"), 0);
bound_ = end_ - begin_ + 1;
ReadWords(fd, to, bound_);
if (have_words) ReadWords(fd, to, bound_);
}
namespace {
@ -201,12 +203,12 @@ void ProbingVocabulary::FinishedLoading(ProbBackoff * /*reorder_vocab*/) {
SetSpecial(Index("<s>"), Index("</s>"), 0);
}
void ProbingVocabulary::LoadedBinary(int fd, EnumerateVocab *to) {
void ProbingVocabulary::LoadedBinary(bool have_words, int fd, EnumerateVocab *to) {
UTIL_THROW_IF(header_->version != kProbingVocabularyVersion, FormatLoadException, "The binary file has probing version " << header_->version << " but the code expects version " << kProbingVocabularyVersion << ". Please rerun build_binary using the same version of the code.");
lookup_.LoadedBinary();
bound_ = header_->bound;
SetSpecial(Index("<s>"), Index("</s>"), 0);
ReadWords(fd, to, bound_);
if (have_words) ReadWords(fd, to, bound_);
}
void MissingUnknown(const Config &config) throw(SpecialWordMissingException) {
@ -229,7 +231,7 @@ void MissingSentenceMarker(const Config &config, const char *str) throw(SpecialW
if (config.messages) *config.messages << "Missing special word " << str << "; will treat it as <unk>.";
break;
case THROW_UP:
UTIL_THROW(SpecialWordMissingException, "The ARPA file is missing " << str << " and the model is configured to reject these models. If you built your APRA with IRSTLM and forgot to run add-start-end.sh, complain to <bertoldi at fbk.eu> stating that you think build-lm.sh should do this by default, then go back and retrain your model from the start. To bypass this check and treat " << str << " as an OOV, pass -s. The resulting model will not work with e.g. Moses.");
UTIL_THROW(SpecialWordMissingException, "The ARPA file is missing " << str << " and the model is configured to reject these models. Run build_binary -s to disable this check.");
}
}

View File

@ -82,7 +82,7 @@ class SortedVocabulary : public base::Vocabulary {
bool SawUnk() const { return saw_unk_; }
void LoadedBinary(int fd, EnumerateVocab *to);
void LoadedBinary(bool have_words, int fd, EnumerateVocab *to);
private:
uint64_t *begin_, *end_;
@ -143,9 +143,11 @@ class ProbingVocabulary : public base::Vocabulary {
void FinishedLoading(ProbBackoff *reorder_vocab);
std::size_t UnkCountChangePadding() const { return 0; }
bool SawUnk() const { return saw_unk_; }
void LoadedBinary(int fd, EnumerateVocab *to);
void LoadedBinary(bool have_words, int fd, EnumerateVocab *to);
private:
typedef util::ProbingHashTable<ProbingVocabuaryEntry, util::IdentityHash> Lookup;

View File

@ -4,21 +4,90 @@
#include <cmath>
#include <climits>
#include <fstream>
#include <iterator>
#include <iostream>
#include <stdexcept>
#include "Util.h"
namespace {
// configure regularisation
const char KEY_REFLEN[] = "reflen";
const char REFLEN_AVERAGE[] = "average";
const char REFLEN_SHORTEST[] = "shortest";
const char REFLEN_CLOSEST[] = "closest";
} // namespace
// A simple STL-map based store of n-gram counts.
// It provides the typical accessors and mutators, but
// intentionally does not allow erasing individual elements.
/**
 * Map-backed store of n-gram occurrence counts.
 *
 * Keys are n-grams encoded as vector<int>; values are how many times the
 * n-gram has been passed to add(). Individual entries cannot be erased;
 * only clear() removes data.
 */
class BleuScorer::NgramCounts {
 public:
  // Strict weak ordering used to build the n-gram map: element-wise
  // comparison, with the shorter n-gram ordered first when one is a
  // prefix of the other.
  struct NgramComparator {
    bool operator()(const vector<int>& a, const vector<int>& b) const {
      const size_t as = a.size();
      const size_t bs = b.size();
      for (size_t i = 0; i < as && i < bs; ++i) {
        if (a[i] < b[i]) {
          return true;
        }
        if (a[i] > b[i]) {
          return false;
        }
      }
      // The common prefix is equal, so the shorter n-gram wins.
      return as < bs;
    }
  };

  typedef vector<int> Key;
  typedef int Value;
  typedef map<Key, Value, NgramComparator>::iterator iterator;
  typedef map<Key, Value, NgramComparator>::const_iterator const_iterator;

  NgramCounts() : kDefaultCount(1) { }
  virtual ~NgramCounts() { }

  // Records one occurrence of "ngram": an unseen n-gram is stored with the
  // default count, a known one has its count incremented.
  void add(const Key& ngram) {
    // insert() leaves an existing entry untouched and reports whether the
    // key was new, so a single tree lookup covers both cases.
    std::pair<iterator, bool> result =
        m_counts.insert(std::make_pair(ngram, kDefaultCount));
    if (!result.second) {
      ++result.first->second;
    }
  }

  void clear() { m_counts.clear(); }

  bool empty() const { return m_counts.empty(); }

  size_t size() const { return m_counts.size(); }
  size_t max_size() const { return m_counts.max_size(); }

  iterator find(const Key& ngram) { return m_counts.find(ngram); }
  const_iterator find(const Key& ngram) const { return m_counts.find(ngram); }

  // Note: like map::operator[], this inserts a zero count for a missing key.
  Value& operator[](const Key& ngram) { return m_counts[ngram]; }

  iterator begin() { return m_counts.begin(); }
  const_iterator begin() const { return m_counts.begin(); }
  iterator end() { return m_counts.end(); }
  const_iterator end() const { return m_counts.end(); }

 private:
  // Count assigned to an n-gram the first time it is added.
  const int kDefaultCount;
  map<Key, Value, NgramComparator> m_counts;
};
BleuScorer::BleuScorer(const string& config)
: StatisticsBasedScorer("BLEU",config),
: StatisticsBasedScorer("BLEU", config),
kLENGTH(4),
m_ref_length_type(CLOSEST) {
//configure regularisation
static string KEY_REFLEN = "reflen";
static string REFLEN_AVERAGE = "average";
static string REFLEN_SHORTEST = "shortest";
static string REFLEN_CLOSEST = "closest";
string reflen = getConfig(KEY_REFLEN,REFLEN_CLOSEST);
const string reflen = getConfig(KEY_REFLEN, REFLEN_CLOSEST);
if (reflen == REFLEN_AVERAGE) {
m_ref_length_type = AVERAGE;
} else if (reflen == REFLEN_SHORTEST) {
@ -28,18 +97,15 @@ BleuScorer::BleuScorer(const string& config)
} else {
throw runtime_error("Unknown reference length strategy: " + reflen);
}
// cerr << "Using reference length strategy: " << reflen << endl;
}
BleuScorer::~BleuScorer() {}
size_t BleuScorer::countNgrams(const string& line, counts_t& counts, unsigned int n)
size_t BleuScorer::countNgrams(const string& line, NgramCounts& counts,
unsigned int n)
{
vector<int> encoded_tokens;
//cerr << line << endl;
TokenizeAndEncode(line, encoded_tokens);
//copy(encoded_tokens.begin(), encoded_tokens.end(), ostream_iterator<int>(cerr," "));
//cerr << endl;
for (size_t k = 1; k <= n; ++k) {
//ngram order longer than sentence - no point
if (k > encoded_tokens.size()) {
@ -50,18 +116,9 @@ size_t BleuScorer::countNgrams(const string& line, counts_t& counts, unsigned in
for (size_t j = i; j < i+k && j < encoded_tokens.size(); ++j) {
ngram.push_back(encoded_tokens[j]);
}
int count = 1;
counts_iterator oldcount = counts.find(ngram);
if (oldcount != counts.end()) {
count = (oldcount->second) + 1;
}
//cerr << count << endl;
counts[ngram] = count;
//cerr << endl;
counts.add(ngram);
}
}
//cerr << "counted ngrams" << endl;
//dump_counts(counts);
return encoded_tokens.size();
}
@ -82,9 +139,9 @@ void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
string line;
size_t sid = 0; //sentence counter
while (getline(refin,line)) {
//cerr << line << endl;
line = this->applyFactors(line);
if (i == 0) {
counts_t *counts = new counts_t; //these get leaked
NgramCounts *counts = new NgramCounts; //these get leaked
m_ref_counts.push_back(counts);
vector<size_t> lengths;
m_ref_lengths.push_back(lengths);
@ -92,11 +149,12 @@ void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
if (m_ref_counts.size() <= sid) {
throw runtime_error("File " + referenceFiles[i] + " has too many sentences");
}
counts_t counts;
size_t length = countNgrams(line,counts,kLENGTH);
NgramCounts counts;
size_t length = countNgrams(line, counts, kLENGTH);
//for any counts larger than those already there, merge them in
for (counts_iterator ci = counts.begin(); ci != counts.end(); ++ci) {
counts_iterator oldcount_it = m_ref_counts[sid]->find(ci->first);
for (NgramCounts::const_iterator ci = counts.begin(); ci != counts.end(); ++ci) {
NgramCounts::const_iterator oldcount_it = m_ref_counts[sid]->find(ci->first);
int oldcount = 0;
if (oldcount_it != m_ref_counts[sid]->end()) {
oldcount = oldcount_it->second;
@ -113,83 +171,56 @@ void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
}
++sid;
}
TRACE_ERR(endl);
}
}
void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
// cerr << text << endl;
// cerr << sid << endl;
//dump_counts(*m_ref_counts[sid]);
if (sid >= m_ref_counts.size()) {
stringstream msg;
msg << "Sentence id (" << sid << ") not found in reference set";
throw runtime_error(msg.str());
}
counts_t testcounts;
//stats for this line
vector<float> stats(kLENGTH*2);;
size_t length = countNgrams(text,testcounts,kLENGTH);
//dump_counts(testcounts);
if (m_ref_length_type == SHORTEST) {
//cerr << reflengths.size() << " " << sid << endl;
int shortest = *min_element(m_ref_lengths[sid].begin(), m_ref_lengths[sid].end());
stats.push_back(shortest);
} else if (m_ref_length_type == AVERAGE) {
int total = 0;
for (size_t i = 0; i < m_ref_lengths[sid].size(); ++i) {
total += m_ref_lengths[sid][i];
}
const float mean = static_cast<float>(total) / m_ref_lengths[sid].size();
stats.push_back(mean);
} else if (m_ref_length_type == CLOSEST) {
int min_diff = INT_MAX;
int min_idx = 0;
for (size_t i = 0; i < m_ref_lengths[sid].size(); ++i) {
const int reflength = m_ref_lengths[sid][i];
const int diff = reflength - static_cast<int>(length);
const int absolute_diff = abs(diff) - abs(min_diff);
NgramCounts testcounts;
// stats for this line
vector<ScoreStatsType> stats(kLENGTH * 2);
string sentence = this->applyFactors(text);
const size_t length = countNgrams(sentence, testcounts, kLENGTH);
if (absolute_diff < 0) { //look for the closest reference
min_diff = diff;
min_idx = i;
} else if (absolute_diff == 0) { // if two references has the same closest length, take the shortest
if (reflength < static_cast<int>(m_ref_lengths[sid][min_idx])) {
min_idx = i;
}
}
}
stats.push_back(m_ref_lengths[sid][min_idx]);
} else {
throw runtime_error("Unsupported reflength strategy");
// Calculate effective reference length.
switch (m_ref_length_type) {
case SHORTEST:
CalcShortest(sid, stats);
break;
case AVERAGE:
CalcAverage(sid, stats);
break;
case CLOSEST:
CalcClosest(sid, length, stats);
break;
default:
throw runtime_error("Unsupported reflength strategy");
}
//cerr << "computed length" << endl;
//precision on each ngram type
for (counts_iterator testcounts_it = testcounts.begin();
for (NgramCounts::const_iterator testcounts_it = testcounts.begin();
testcounts_it != testcounts.end(); ++testcounts_it) {
counts_iterator refcounts_it = m_ref_counts[sid]->find(testcounts_it->first);
NgramCounts::const_iterator refcounts_it = m_ref_counts[sid]->find(testcounts_it->first);
int correct = 0;
int guess = testcounts_it->second;
const int guess = testcounts_it->second;
if (refcounts_it != m_ref_counts[sid]->end()) {
correct = min(refcounts_it->second,guess);
}
size_t len = testcounts_it->first.size();
const size_t len = testcounts_it->first.size();
stats[len*2-2] += correct;
stats[len*2-1] += guess;
}
stringstream sout;
copy(stats.begin(),stats.end(),ostream_iterator<float>(sout," "));
//TRACE_ERR(sout.str() << endl);
string stats_str = sout.str();
entry.set(stats_str);
entry.set(stats);
}
float BleuScorer::calculateScore(const vector<int>& comps) const
{
//cerr << "BLEU: ";
//copy(comps.begin(),comps.end(), ostream_iterator<int>(cerr," "));
float logbleu = 0.0;
for (int i = 0; i < kLENGTH; ++i) {
if (comps[2*i] == 0) {
@ -203,15 +234,64 @@ float BleuScorer::calculateScore(const vector<int>& comps) const
if (brevity < 0.0) {
logbleu += brevity;
}
//cerr << " " << exp(logbleu) << endl;
return exp(logbleu);
}
void BleuScorer::dump_counts(counts_t& counts) const {
for (counts_const_iterator i = counts.begin(); i != counts.end(); ++i) {
cerr << "(";
copy(i->first.begin(), i->first.end(), ostream_iterator<int>(cerr," "));
cerr << ") " << i->second << ", ";
void BleuScorer::dump_counts(ostream* os,
const NgramCounts& counts) const {
for (NgramCounts::const_iterator it = counts.begin();
it != counts.end(); ++it) {
*os << "(";
const NgramCounts::Key& keys = it->first;
for (size_t i = 0; i < keys.size(); ++i) {
if (i != 0) {
*os << " ";
}
*os << keys[i];
}
*os << ") : " << it->second << ", ";
}
cerr << endl;
*os << endl;
}
void BleuScorer::CalcAverage(size_t sentence_id,
vector<ScoreStatsType>& stats) const {
int total = 0;
for (size_t i = 0;
i < m_ref_lengths[sentence_id].size(); ++i) {
total += m_ref_lengths[sentence_id][i];
}
const float mean = static_cast<float>(total) /
m_ref_lengths[sentence_id].size();
stats.push_back(static_cast<ScoreStatsType>(mean));
}
void BleuScorer::CalcClosest(size_t sentence_id,
size_t length,
vector<ScoreStatsType>& stats) const {
int min_diff = INT_MAX;
int min_idx = 0;
for (size_t i = 0; i < m_ref_lengths[sentence_id].size(); ++i) {
const int reflength = m_ref_lengths[sentence_id][i];
const int length_diff = abs(reflength - static_cast<int>(length));
// Look for the closest reference
if (length_diff < abs(min_diff)) {
min_diff = reflength - length;
min_idx = i;
// if two references has the same closest length, take the shortest
} else if (length_diff == abs(min_diff)) {
if (reflength < static_cast<int>(m_ref_lengths[sentence_id][min_idx])) {
min_idx = i;
}
}
}
stats.push_back(m_ref_lengths[sentence_id][min_idx]);
}
void BleuScorer::CalcShortest(size_t sentence_id,
vector<ScoreStatsType>& stats) const {
const int shortest = *min_element(m_ref_lengths[sentence_id].begin(),
m_ref_lengths[sentence_id].end());
stats.push_back(shortest);
}

View File

@ -1,7 +1,7 @@
#ifndef MERT_BLEU_SCORER_H_
#define MERT_BLEU_SCORER_H_
#include <iostream>
#include <ostream>
#include <string>
#include <vector>
@ -24,55 +24,42 @@ public:
virtual void setReferenceFiles(const vector<string>& referenceFiles);
virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry);
virtual float calculateScore(const vector<int>& comps) const;
virtual size_t NumberOfScores() const {
return 2 * kLENGTH + 1;
}
virtual size_t NumberOfScores() const { return 2 * kLENGTH + 1; }
private:
enum ReferenceLengthType {
AVERAGE,
SHORTEST,
CLOSEST,
CLOSEST
};
//Used to construct the ngram map
struct CompareNgrams {
bool operator()(const vector<int>& a, const vector<int>& b) const {
size_t i;
const size_t as = a.size();
const size_t bs = b.size();
for (i = 0; i < as && i < bs; ++i) {
if (a[i] < b[i]) {
//cerr << "true" << endl;
return true;
}
if (a[i] > b[i]) {
//cerr << "false" << endl;
return false;
}
}
//entries are equal, shortest wins
return as < bs;;
}
};
typedef map<vector<int>,int,CompareNgrams> counts_t;
typedef map<vector<int>,int,CompareNgrams>::iterator counts_iterator;
typedef map<vector<int>,int,CompareNgrams>::const_iterator counts_const_iterator;
/**
* A NgramCounts is a key-value store.
* Clients don't have to worry about the actual implementation
* since this type is used in internal only.
*/
class NgramCounts;
/**
* Count the ngrams of each type, up to the given length in the input line.
*/
size_t countNgrams(const string& line, counts_t& counts, unsigned int n);
size_t countNgrams(const string& line, NgramCounts& counts, unsigned int n);
void dump_counts(counts_t& counts) const;
void dump_counts(std::ostream* os, const NgramCounts& counts) const;
// For calculating effective reference length.
void CalcAverage(size_t sentence_id,
vector<ScoreStatsType>& stats) const;
void CalcClosest(size_t sentence_id, size_t length,
vector<ScoreStatsType>& stats) const;
void CalcShortest(size_t sentence_id,
vector<ScoreStatsType>& stats) const;
const int kLENGTH;
ReferenceLengthType m_ref_length_type;
// data extracted from reference files
ScopedVector<counts_t> m_ref_counts;
ScopedVector<NgramCounts> m_ref_counts;
vector<vector<size_t> > m_ref_lengths;
// no copying allowed

View File

@ -1,6 +1,6 @@
#include "CderScorer.h"
#include <iterator>
#include <algorithm>
#include <fstream>
#include <stdexcept>
@ -31,6 +31,7 @@ void CderScorer::setReferenceFiles(const vector<string>& referenceFiles)
m_ref_sentences.push_back(vector<sent_t>());
string line;
while (getline(refin,line)) {
line = this->applyFactors(line);
sent_t encoded;
TokenizeAndEncode(line, encoded);
m_ref_sentences[rid].push_back(encoded);
@ -40,13 +41,11 @@ void CderScorer::setReferenceFiles(const vector<string>& referenceFiles)
void CderScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
vector<int> stats;
prepareStatsVector(sid, text, stats);
string sentence = this->applyFactors(text);
stringstream sout;
copy(stats.begin(), stats.end(), ostream_iterator<float>(sout," "));
string stats_str = sout.str();
entry.set(stats_str);
vector<int> stats;
prepareStatsVector(sid, sentence, stats);
entry.set(stats);
}
void CderScorer::prepareStatsVector(size_t sid, const string& text, vector<int>& stats)
@ -55,9 +54,11 @@ void CderScorer::prepareStatsVector(size_t sid, const string& text, vector<int>&
TokenizeAndEncode(text, cand);
float max = -2;
vector<int> tmp;
for (size_t rid = 0; rid < m_ref_sentences.size(); ++rid) {
sent_t& ref = m_ref_sentences[rid][sid];
vector<int> tmp = computeCD(cand, ref);
const sent_t& ref = m_ref_sentences[rid][sid];
tmp.clear();
computeCD(cand, ref, tmp);
if (calculateScore(tmp) > max) {
stats = tmp;
}
@ -66,16 +67,14 @@ void CderScorer::prepareStatsVector(size_t sid, const string& text, vector<int>&
float CderScorer::calculateScore(const vector<int>& comps) const
{
if (comps.size() != 2)
{
if (comps.size() != 2) {
throw runtime_error("Size of stat vector for CDER is not 2");
}
return 1 - (comps[0] / static_cast<float>(comps[1]));
return 1.0f - (comps[0] / static_cast<float>(comps[1]));
}
vector<int> CderScorer::computeCD(const sent_t& cand, const sent_t& ref) const
{
void CderScorer::computeCD(const sent_t& cand, const sent_t& ref,
vector<int>& stats) const {
int I = cand.size() + 1; // Number of inter-words positions in candidate sentence
int L = ref.size() + 1; // Number of inter-words positions in reference sentence
@ -113,10 +112,9 @@ vector<int> CderScorer::computeCD(const sent_t& cand, const sent_t& ref) const
row = nextRow;
}
vector<int> stats(2);
stats.resize(2);
stats[0] = *(row->rbegin()); // CD distance is the cost of path from (0,0) to (I,L)
stats[1] = ref.size();
delete row;
return stats;
}

View File

@ -1,8 +1,6 @@
#ifndef MERT_CDER_SCORER_H_
#define MERT_CDER_SCORER_H_
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>
#include "Types.h"
@ -10,9 +8,8 @@
using namespace std;
class CderScorer: public StatisticsBasedScorer
{
public:
class CderScorer: public StatisticsBasedScorer {
public:
explicit CderScorer(const string& config);
~CderScorer();
@ -22,17 +19,16 @@ public:
virtual void prepareStatsVector(size_t sid, const string& text, vector<int>& stats);
virtual size_t NumberOfScores() const {
return 2;
}
virtual size_t NumberOfScores() const { return 2; }
virtual float calculateScore(const vector<int>& comps) const;
private:
private:
typedef vector<int> sent_t;
vector<vector<sent_t> > m_ref_sentences;
vector<int> computeCD(const sent_t& cand, const sent_t& ref) const;
void computeCD(const sent_t& cand, const sent_t& ref,
vector<int>& stats) const;
// no copying allowed
CderScorer(const CderScorer&);

182
mert/InterpolatedScorer.cpp Normal file
View File

@ -0,0 +1,182 @@
#include "ScorerFactory.h"
#include "InterpolatedScorer.h"
#include "Util.h"
using namespace std;
/**
 * Builds one sub-scorer per comma-separated entry of "name"
 * (e.g. "HAMMING,BLEU") and reads the interpolation weights from the
 * "weights" config entry, where they are separated by '+'.
 *
 * Without a "weights" entry every scorer gets the same weight. Explicit
 * weights are rescaled to sum to one.
 *
 * Throws runtime_error if no scorer is named, if the number of weights
 * differs from the number of scorers, or if the weights sum to zero.
 */
InterpolatedScorer::InterpolatedScorer (const string& name, const string& config): Scorer(name,config)
{
  // name would be: HAMMING,BLEU or similar
  string scorers = name;
  while (scorers.length() > 0) {
    string scorertype = "";
    getNextPound(scorers,scorertype,",");
    Scorer *theScorer=ScorerFactory::getScorer(scorertype,config);
    _scorers.push_back(theScorer);
  }
  if (_scorers.size() == 0) {
    throw runtime_error("There are no scorers");
  }
  cerr << "Number of scorers: " << _scorers.size() << endl;

  //TODO debug this
  string wtype = getConfig("weights","");
  if (wtype.length() == 0) {
    // Default: uniform weights that add up to 1 (two scorers -> 0.5 each).
    const float weight = 1.0/_scorers.size();
    for (size_t i = 0; i < _scorers.size(); ++i) {
      _scorerWeights.push_back(weight);
    }
  } else {
    float tot = 0;
    while (wtype.length() > 0) {
      string scoreweight = "";
      getNextPound(wtype,scoreweight,"+");
      const float weight = atof(scoreweight.c_str());
      _scorerWeights.push_back(weight);
      tot += weight;
    }
    // Validate the configuration before touching the values.
    if (_scorers.size() != _scorerWeights.size()) {
      throw runtime_error("The number of weights does not equal the number of scorers!");
    }
    // Normalising by a zero total would silently turn every weight into
    // NaN or infinity, so reject it explicitly.
    if (tot == 0) {
      throw runtime_error("The weights of the interpolated scorers sum to zero");
    }
    if (tot != float(1)) {
      for (vector<float>::iterator it = _scorerWeights.begin(); it != _scorerWeights.end(); ++it) {
        *it /= tot;
      }
    }
  }

  cerr << "The weights for the interpolated scorers are: " << endl;
  for (vector<float>::iterator it = _scorerWeights.begin(); it != _scorerWeights.end(); ++it) {
    cerr << *it << " " ;
  }
  cerr <<endl;
}
/**
 * Stores "data" and hands every sub-scorer its own ScoreData holding only
 * that scorer's slice of each statistics entry. The slices are laid out in
 * scorer order; each is NumberOfScores() wide for its scorer.
 */
void InterpolatedScorer::setScoreData(ScoreData* data)
{
  m_score_data = data;

  // Index of the first statistic that belongs to the current scorer.
  size_t last = 0;
  for (ScopedVector<Scorer>::iterator itsc = _scorers.begin(); itsc != _scorers.end(); ++itsc) {
    Scorer* scorer = *itsc;
    int numScoresScorer = scorer->NumberOfScores();
    ScoreData* slicedData = new ScoreData(*scorer);

    for (size_t sent = 0; sent < data->size(); ++sent) {
      ScoreArray fullArray = data->get(sent);
      ScoreArray slicedArray;

      // The array index is the sentence number rendered as text.
      std::stringstream indexStream;
      indexStream << sent;
      std::string indexText;
      indexText = indexStream.str();

      const size_t numNBest = fullArray.size();
      for (size_t cand = 0; cand < numNBest; ++cand) {
        ScoreStats fullStats = data->get(sent, cand);
        ScoreStats slicedStats;
        for (size_t k = last; k < size_t(numScoresScorer + last); ++k) {
          ScoreStatsType value = fullStats.get(k);
          slicedStats.add(value);
        }
        slicedArray.add(slicedStats);
      }
      slicedArray.setIndex(indexText);
      slicedData->add(slicedArray);
    }

    // NOTE: This class takes ownership of the heap-allocated ScoreData
    // objects so that they are not leaked.
    m_scorers_score_data.push_back(slicedData);
    scorer->setScoreData(slicedData);
    last += numScoresScorer;
  }
}
/** The interpolated scorer calls a vector of scorers and combines them with
weights **/
void InterpolatedScorer::score(const candidates_t& candidates, const diffs_t& diffs,
statscores_t& scores) const
{
//cout << "*******InterpolatedScorer::score" << endl;
size_t scorerNum = 0;
for (ScopedVector<Scorer>::const_iterator itsc = _scorers.begin(); itsc!=_scorers.end(); itsc++) {
//int numScores = (*itsc)->NumberOfScores();
statscores_t tscores;
(*itsc)->score(candidates,diffs,tscores);
size_t inc = 0;
for (statscores_t::iterator itstatsc = tscores.begin(); itstatsc!=tscores.end(); itstatsc++) {
//cout << "Scores " << (*itstatsc) << endl;
float weight = _scorerWeights[scorerNum];
if (weight == 0) {
stringstream msg;
msg << "No weights for scorer" << scorerNum ;
throw runtime_error(msg.str());
}
if (scorerNum == 0) {
scores.push_back(weight * (*itstatsc));
} else {
scores[inc] += weight * (*itstatsc);
}
//cout << "Scorer:" << scorerNum << " scoreNum:" << inc << " score: " << (*itstatsc) << " weight:" << weight << endl;
inc++;
}
scorerNum++;
}
}
// Passes the same list of reference files on to every sub-scorer.
void InterpolatedScorer::setReferenceFiles(const vector<string>& referenceFiles)
{
  for (size_t i = 0; i < _scorers.size(); ++i) {
    _scorers[i]->setReferenceFiles(referenceFiles);
  }
}
void InterpolatedScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
stringstream buff;
int i=0;
for (ScopedVector<Scorer>::iterator itsc = _scorers.begin(); itsc!=_scorers.end(); itsc++) {
ScoreStats tempEntry;
(*itsc)->prepareStats(sid, text, tempEntry);
if (i > 0) buff << " ";
buff << tempEntry;
i++;
}
//cout << " Scores for interpolated: " << buff << endl;
string str = buff.str();
entry.set(str);
}
void InterpolatedScorer::setFactors(const string& factors)
{
if (factors.empty()) return;
vector<string> fsplit;
split(factors, ',', fsplit);
if (fsplit.size() != _scorers.size()) throw runtime_error("Number of factor specifications does not equal number of interpolated scorers.");
for (size_t i = 0; i < _scorers.size(); ++i)
{
_scorers[i]->setFactors(fsplit[i]);
}
}

60
mert/InterpolatedScorer.h Normal file
View File

@ -0,0 +1,60 @@
#ifndef __INTERPOLATED_SCORER_H__
#define __INTERPOLATED_SCORER_H__
#include <algorithm>
#include <cmath>
#include <iostream>
#include <iterator>
#include <limits>
#include <set>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
#include "Types.h"
#include "ScoreData.h"
#include "Scorer.h"
#include "ScopedVector.h"
/**
* Class that includes other scorers eg.
* Interpolated HAMMING and BLEU scorer **/
class InterpolatedScorer : public Scorer
{
public:
// name would be: "HAMMING,BLEU" or similar
InterpolatedScorer(const string& name, const string& config);
virtual ~InterpolatedScorer() {}
virtual void score(const candidates_t& candidates, const diffs_t& diffs,
statscores_t& scores) const;
virtual void setReferenceFiles(const vector<string>& referenceFiles);
virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry);
virtual size_t NumberOfScores() const {
size_t sz=0;
for (ScopedVector<Scorer>::const_iterator itsc = _scorers.begin(); itsc != _scorers.end(); itsc++) {
sz += (*itsc)->NumberOfScores();
}
return sz;
};
virtual void setScoreData(ScoreData* data);
/**
* Set the factors, which should be used for this metric
*/
virtual void setFactors(const string& factors);
protected:
ScopedVector<Scorer> _scorers;
// Take the ownership of the heap-allocated the objects
// by Scorer objects.
ScopedVector<ScoreData> m_scorers_score_data;
vector<float> _scorerWeights;
};
#endif  // __INTERPOLATED_SCORER_H__

View File

@ -12,6 +12,7 @@ FeatureStats.cpp FeatureArray.cpp FeatureData.cpp
FeatureDataIterator.cpp
Data.cpp
BleuScorer.cpp
InterpolatedScorer.cpp
Point.cpp
PerScorer.cpp
Scorer.cpp
@ -44,6 +45,7 @@ exe pro : pro.cpp mert_lib ..//boost_program_options ;
alias programs : mert extractor evaluator pro ;
unit-test data_test : DataTest.cpp mert_lib ..//boost_unit_test_framework ;
unit-test timer_test : TimerTest.cpp mert_lib ..//boost_unit_test_framework ;
unit-test util_test : UtilTest.cpp mert_lib ..//boost_unit_test_framework ;
install legacy : programs : <location>. ;

View File

@ -24,6 +24,11 @@ public:
virtual void setReferenceFiles(const vector<string>& referenceFiles);
virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry);
// Number of statistics used by this scorer; always reports 0.
virtual size_t NumberOfScores() const
{
return 0;
}
// Debugging aid: prints the name of this scorer class to stderr.
void whoami() const {
cerr << "I AM MergeScorer" << endl;
}

View File

@ -72,7 +72,6 @@ statscore_t Optimizer::GetStatScore(const Point& param) const
{
vector<unsigned> bests;
Get1bests(param, bests);
//copy(bests.begin(),bests.end(),ostream_iterator<unsigned>(cerr," "));
statscore_t score = GetStatScore(bests);
return score;
}

View File

@ -29,6 +29,7 @@ void PerScorer::setReferenceFiles(const vector<string>& referenceFiles)
string line;
int sid = 0;
while (getline(in,line)) {
line = this->applyFactors(line);
vector<int> tokens;
TokenizeAndEncode(line, tokens);
m_ref_tokens.push_back(multiset<int>());
@ -52,10 +53,13 @@ void PerScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
msg << "Sentence id (" << sid << ") not found in reference set";
throw runtime_error(msg.str());
}
string sentence = this->applyFactors(text);
// Calculate correct, output_length and ref_length for
// the line and store it in entry
vector<int> testtokens;
TokenizeAndEncode(text, testtokens);
TokenizeAndEncode(sentence, testtokens);
multiset<int> testtokens_all(testtokens.begin(),testtokens.end());
set<int> testtokens_unique(testtokens.begin(),testtokens.end());
int correct = 0;

View File

@ -38,6 +38,7 @@ Point::Point(const vector<parameter_t>& init,
}
} else {
CHECK(init.size()==pdim);
CHECK(optindices.size() == Point::dim);
for (unsigned int i=0; i<Point::dim; i++) {
operator[](i)=init[optindices[i]];
m_min[i] = min[optindices[i]];

View File

@ -60,6 +60,15 @@ public:
static void setdim(size_t d) {
dim = d;
}
// Replaces the class-wide optindices with a copy of "indices".
static void set_optindices(const vector<unsigned int>& indices) {
optindices = indices;
}
// Returns a read-only reference to the class-wide optindices.
static const vector<unsigned int>& get_optindices() {
return optindices;
}
// Returns true when fixedweights is empty -- presumably meaning that no
// weight is held fixed, so all of them are optimized (confirm with callers).
static bool OptimizeAll() {
return fixedweights.empty();
}

View File

@ -24,12 +24,6 @@ ScoreStats::ScoreStats(const size_t size)
memset(array_, 0, GetArraySizeWithBytes());
}
ScoreStats::ScoreStats(std::string &theString)
: available_(0), entries_(0), array_(NULL)
{
set(theString);
}
ScoreStats::~ScoreStats()
{
if (array_) {
@ -73,14 +67,14 @@ void ScoreStats::add(ScoreStatsType v)
array_[entries_++]=v;
}
void ScoreStats::set(std::string &theString)
void ScoreStats::set(const std::string& str)
{
std::string substring, stringBuf;
reset();
while (!theString.empty()) {
getNextPound(theString, substring);
add(ConvertStringToScoreStatsType(substring));
vector<string> out;
Tokenize(str.c_str(), ' ', &out);
for (vector<string>::const_iterator it = out.begin();
it != out.end(); ++it) {
add(ConvertStringToScoreStatsType(*it));
}
}
@ -144,7 +138,7 @@ bool operator==(const ScoreStats& s1, const ScoreStats& s2) {
if (s1.get(k) != s2.get(k))
return false;
}
return true;
}
//END_ADDED

View File

@ -31,7 +31,7 @@ private:
public:
ScoreStats();
explicit ScoreStats(const size_t size);
explicit ScoreStats(std::string &theString);
~ScoreStats();
// We intentionally allow copying.
@ -66,7 +66,15 @@ public:
return array_;
}
void set(std::string &theString);
void set(const std::string& str);
// Much more efficient than the above.
void set(const std::vector<ScoreStatsType>& stats) {
reset();
for (size_t i = 0; i < stats.size(); ++i) {
add(stats[i]);
}
}
inline size_t bytes() const {
return GetArraySizeWithBytes();

View File

@ -1,5 +1,6 @@
#include "Scorer.h"
#include <limits>
#include "Util.h"
namespace {
@ -95,6 +96,55 @@ void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded) {
}
}
/**
* Set the factors, which should be used for this metric
*/
void Scorer::setFactors(const string& factors)
{
if (factors.empty()) return;
vector<string> factors_vec;
split(factors, '|', factors_vec);
for(vector<string>::iterator it = factors_vec.begin(); it != factors_vec.end(); ++it)
{
int factor = atoi(it->c_str());
m_factors.push_back(factor);
}
}
/**
* Take the factored sentence and return the desired factors
*/
string Scorer::applyFactors(const string& sentence)
{
if (m_factors.size() == 0) return sentence;
vector<string> tokens;
split(sentence, ' ', tokens);
stringstream sstream;
for (size_t i = 0; i < tokens.size(); ++i)
{
if (tokens[i] == "") continue;
vector<string> factors;
split(tokens[i], '|', factors);
int fsize = factors.size();
if (i>0) sstream << " ";
for (size_t j = 0; j < m_factors.size(); ++j)
{
int findex = m_factors[j];
if (findex < 0 || findex >= fsize) throw runtime_error("Factor index is out of range.");
if (j>0) sstream << "|";
sstream << factors[findex];
}
}
return sstream.str();
}
StatisticsBasedScorer::StatisticsBasedScorer(const string& name, const string& config)
: Scorer(name,config) {
//configure regularisation

View File

@ -28,10 +28,7 @@ class Scorer
/**
* Return the number of statistics needed for the computation of the score.
*/
virtual size_t NumberOfScores() const {
cerr << "Scorer: 0" << endl;
return 0;
}
virtual size_t NumberOfScores() const = 0;
/**
* Set the reference files. This must be called before prepareStats().
@ -57,7 +54,9 @@ class Scorer
* applying each in turn, and calculating a new score each time.
*/
virtual void score(const candidates_t& candidates, const diffs_t& diffs,
statscores_t& scores) const {
statscores_t& scores) const = 0;
/*
{
//dummy impl
if (!m_score_data) {
throw runtime_error("score data not loaded");
@ -67,6 +66,7 @@ class Scorer
scores.push_back(0);
}
}
*/
/**
* Calculate the score of the sentences corresponding to the list of candidate
@ -93,10 +93,20 @@ class Scorer
/**
* Set the score data, prior to scoring.
*/
void setScoreData(ScoreData* data) {
virtual void setScoreData(ScoreData* data) {
m_score_data = data;
}
/**
* Set the factors, which should be used for this metric
*/
virtual void setFactors(const string& factors);
/**
* Take the factored sentence and return the desired factors
*/
virtual string applyFactors(const string& sentece);
private:
class Encoder {
public:
@ -114,6 +124,7 @@ class Scorer
string m_name;
Encoder* m_encoder;
map<string, string> m_config;
vector<int> m_factors;
protected:
ScoreData* m_score_data;

View File

@ -7,6 +7,7 @@
#include "TerScorer.h"
#include "CderScorer.h"
#include "MergeScorer.h"
#include "InterpolatedScorer.h"
using namespace std;
@ -32,6 +33,11 @@ Scorer* ScorerFactory::getScorer(const string& type, const string& config) {
} else if (type == "MERGE") {
return (MergeScorer*) new MergeScorer(config);
} else {
throw runtime_error("Unknown scorer type: " + type);
if (type.find(',') != string::npos) {
return new InterpolatedScorer(type, config);
}
else {
throw runtime_error("Unknown scorer type: " + type);
}
}
}

View File

@ -33,6 +33,7 @@ void TerScorer::setReferenceFiles ( const vector<string>& referenceFiles )
string line;
int sid = 0;
while ( getline ( in, line ) ) {
line = this->applyFactors(line);
vector<int> tokens;
TokenizeAndEncode(line, tokens);
m_references.push_back ( tokens );
@ -48,6 +49,7 @@ void TerScorer::setReferenceFiles ( const vector<string>& referenceFiles )
void TerScorer::prepareStats ( size_t sid, const string& text, ScoreStats& entry )
{
string sentence = this->applyFactors(text);
terAlignment result;
result.numEdits = 0.0 ;
@ -74,7 +76,7 @@ void TerScorer::prepareStats ( size_t sid, const string& text, ScoreStats& entry
averageLength+=(double)m_multi_references.at ( incRefsBis ).at ( sid ).size();
}
averageLength=averageLength/( double ) m_multi_references.size();
TokenizeAndEncode(text, testtokens);
TokenizeAndEncode(sentence, testtokens);
terCalc * evaluation=new terCalc();
evaluation->setDebugMode ( false );
terAlignment tmp_result = evaluation->TER ( reftokens, testtokens );

View File

@ -1,73 +1,106 @@
#include "Timer.h"
#include "Util.h"
#include <cstdio>
double Timer::elapsed_time()
{
time_t now;
time(&now);
return difftime(now, start_time);
#if !defined(_WIN32) && !defined(_WIN64)
#include <sys/resource.h>
#include <sys/time.h>
#endif
namespace {
#if !defined(_WIN32) && !defined(_WIN64)
// Converts a timeval (seconds plus microseconds) into a single
// microsecond count.
uint64_t GetMicroSeconds(const struct timeval& tv) {
  const uint64_t seconds = static_cast<uint64_t>(tv.tv_sec);
  return seconds * 1000000 + tv.tv_usec;
}
double Timer::get_elapsed_time()
{
return elapsed_time();
// Returns the current gettimeofday() reading as a microsecond count.
uint64_t GetTimeOfDayMicroSeconds() {
  struct timeval now;
  gettimeofday(&now, NULL);
  const uint64_t seconds = static_cast<uint64_t>(now.tv_sec);
  return seconds * 1000000 + now.tv_usec;
}
#endif
} // namespace
// Sample the CPU time consumed so far by this process, in microseconds.
// On non-Windows platforms the user and system times come from getrusage();
// if that call fails an error is traced and the process exits with status 1.
// On Windows nothing is measured yet and both fields keep the zero values
// assigned by the CPUTime default constructor.
Timer::CPUTime Timer::GetCPUTimeMicroSeconds() const {
  CPUTime cpu;
#if !defined(_WIN32) && !defined(_WIN64)
  struct rusage ru;
  if (getrusage(RUSAGE_SELF, &ru) != 0) {
    TRACE_ERR("Error occurred: getrusage().\n");
    exit(1);
  }
  cpu.user_time = GetMicroSeconds(ru.ru_utime);
  cpu.sys_time = GetMicroSeconds(ru.ru_stime);
#else  // Windows
  // Not implemented yet.
  // TODO: implement the Windows version using native APIs.
#endif
  return cpu;
}
// Elapsed CPU time (user + system) since the stored start sample, in seconds.
double Timer::get_elapsed_cpu_time() const {
  const uint64_t micros = get_elapsed_cpu_time_microseconds();
  return static_cast<double>(micros) * 1e-6;
}
uint64_t Timer::get_elapsed_cpu_time_microseconds() const {
const CPUTime e = GetCPUTimeMicroSeconds();
return (e.user_time - m_start_time.user_time) +
(e.sys_time - m_start_time.sys_time);
}
// Elapsed wall-clock time since m_wall was recorded, in seconds.
double Timer::get_elapsed_wall_time() const {
  const uint64_t micros = get_elapsed_wall_time_microseconds();
  return static_cast<double>(micros) * 1e-6;
}
// Elapsed wall-clock time in microseconds: current time of day minus m_wall.
// NOTE(review): GetTimeOfDayMicroSeconds() appears to be defined only for
// non-Windows builds; confirm this compiles on Windows.
uint64_t Timer::get_elapsed_wall_time_microseconds() const {
  const uint64_t now = GetTimeOfDayMicroSeconds();
  return now - m_wall;
}
void Timer::start(const char* msg)
{
// Print an optional message, something like "Starting timer t";
if (msg) TRACE_ERR( msg << std::endl);
// Return immediately if the timer is already running
if (running) return;
// Change timer status to running
running = true;
// Set the start time;
time(&start_time);
if (m_is_running) return;
m_is_running = true;
m_wall = GetTimeOfDayMicroSeconds();
m_start_time = GetCPUTimeMicroSeconds();
}
/***
* Turn the timer off and start it again from 0. Print an optional message.
*/
/*
inline void Timer::restart(const char* msg)
void Timer::restart(const char* msg)
{
// Print an optional message, something like "Restarting timer t";
if (msg) TRACE_ERR( msg << std::endl;
// Set the timer status to running
running = true;
// Set the accumulated time to 0 and the start time to now
acc_time = 0;
start_clock = clock();
start_time = time(0);
if (msg) {
TRACE_ERR(msg << std::endl);
}
m_wall = GetTimeOfDayMicroSeconds();
m_start_time = GetCPUTimeMicroSeconds();
}
*/
/***
* Stop the timer and print an optional message.
*/
/*
inline void Timer::stop(const char* msg)
{
// Print an optional message, something like "Stopping timer t";
check(msg);
// Recalculate and store the total accumulated time up until now
if (running) acc_time += elapsed_time();
running = false;
}
*/
void Timer::check(const char* msg)
{
// Print an optional message, something like "Checking timer t";
if (msg) TRACE_ERR( msg << " : ");
// TRACE_ERR( "[" << std::setiosflags(std::ios::fixed) << std::setprecision(2) << (running ? elapsed_time() : 0) << "] seconds\n");
TRACE_ERR( "[" << (running ? elapsed_time() : 0) << "] seconds\n");
if (m_is_running) {
TRACE_ERR("[Wall " << get_elapsed_wall_time()
<< " CPU " << get_elapsed_cpu_time() << "] seconds.\n");
} else {
TRACE_ERR("WARNING: the timer is not running.\n");
}
}
// Format the elapsed wall-clock, user CPU, system CPU and total CPU time
// (all in seconds) as a single human-readable line.
std::string Timer::ToString() const {
  // The fixed text of the format is 38 characters and every "%f" field needs
  // at least 8 ("0.000000"), so even the shortest output is 70 characters.
  // A 64-byte buffer therefore always truncated the result; 256 bytes leaves
  // room for the largest values a uint64_t microsecond counter can produce.
  char buf[256];

  const double wall = get_elapsed_wall_time();
  const CPUTime e = GetCPUTimeMicroSeconds();
  const double utime = (e.user_time - m_start_time.user_time) * 1e-6;
  const double stime = (e.sys_time - m_start_time.sys_time) * 1e-6;

  std::snprintf(buf, sizeof(buf),
                "wall %f user %f sec. sys %f sec. total %f sec.",
                wall, utime, stime, utime + stime);
  return std::string(buf);
}

View File

@ -1,46 +1,50 @@
#ifndef MERT_TIMER_H_
#define MERT_TIMER_H_
#include <ctime>
#include <iostream>
#include <iomanip>
#include <ostream>
#include <string>
#include <stdint.h>
class Timer
{
/**
* Allow timers to be printed to ostreams using the syntax 'os << t'
* for an ostream 'os' and a timer 't'. For example, "cout << t" will
* print out the total amount of time 't' has been "running".
*/
friend std::ostream& operator<<(std::ostream& os, Timer& t);
private:
// Time values are stored in microseconds.
struct CPUTime {
uint64_t user_time; // user CPU time
uint64_t sys_time; // system CPU time
private:
bool running;
time_t start_time;
CPUTime() : user_time(0), sys_time(0) { }
};
/**
* Return the total time that the timer has been in the "running"
* state since it was first "started" or last "restarted". For
* "short" time periods (less than an hour), the actual cpu time
* used is reported instead of the elapsed time.
* TODO in seconds?
*/
double elapsed_time();
CPUTime GetCPUTimeMicroSeconds() const;
public:
bool m_is_running;
uint64_t m_wall; // wall-clock time in microseconds
CPUTime m_start_time;
public:
/**
* 'running' is initially false. A timer needs to be explicitly started
* using 'start' or 'restart'.
* 'm_is_running' is initially false. A timer needs to be explicitly started
* using 'start'.
*/
Timer() : running(false), start_time(0) { }
Timer()
: m_is_running(false),
m_wall(0),
m_start_time() {}
~Timer() {}
/**
* Start a timer. If it is already running, let it continue running.
* Print an optional message.
*/
void start(const char* msg = 0);
// void restart(const char* msg = 0);
// void stop(const char* msg = 0);
/**
* Restart the timer iff the timer is already running.
* if the timer is not running, just start the timer.
*/
void restart(const char* msg = 0);
/**
* Print out an optional message followed by the current timer timing.
@ -48,19 +52,49 @@ public:
void check(const char* msg = 0);
/**
* Return the total time that the timer has been in the "running"
* state since it was first "started" or last "restarted". For
* "short" time periods (less than an hour), the actual cpu time
* used is reported instead of the elapsed time.
* This function is the public version of elapsed_time()
*/
double get_elapsed_time();
bool is_running() const { return m_is_running; }
/**
* Return the total time in seconds that the timer has been in the
* "running" state since it was first "started" or last "restarted".
* For "short" time periods (less than an hour), the actual cpu time
* used is reported instead of the elapsed time.
*/
double get_elapsed_cpu_time() const;
/**
* Return the total time in microseconds.
*/
uint64_t get_elapsed_cpu_time_microseconds() const;
/**
* Get elapsed wall-clock time in seconds.
*/
double get_elapsed_wall_time() const;
/**
* Get elapsed wall-clock time in microseconds.
*/
uint64_t get_elapsed_wall_time_microseconds() const;
/**
* Return a string that has the user CPU time, system time, and total time.
*/
std::string ToString() const;
};
inline std::ostream& operator<<(std::ostream& os, Timer& t)
{
//os << std::setprecision(2) << std::setiosflags(std::ios::fixed) << (t.running ? t.elapsed_time() : 0);
os << (t.running ? t.elapsed_time() : 0);
/**
* Allow timers to be printed to ostreams using the syntax 'os << t'
* for an ostream 'os' and a timer 't'. For example, "cout << t" will
* print out the total amount of time 't' has been "running".
*/
inline std::ostream& operator<<(std::ostream& os, const Timer& t) {
  // A stopped timer has no meaningful reading; say so instead of printing one.
  if (!t.is_running()) {
    return os << "timer is not running.";
  }
  return os << t.ToString();
}

32
mert/TimerTest.cpp Normal file
View File

@ -0,0 +1,32 @@
#include "Timer.h"
#define BOOST_TEST_MODULE TimerTest
#include <boost/test/unit_test.hpp>
#include <string>
#include <iostream>
#include <unistd.h>
// Smoke test for Timer: after start() and again after restart() the timer
// must report that it is running, the wall-clock readings must be positive
// following a short sleep, and ToString() must produce a non-empty string.
BOOST_AUTO_TEST_CASE(timer_basic_test) {
Timer timer;
const int sleep_time_microsec = 40; // ad-hoc microseconds to pass unit tests.
timer.start();
BOOST_REQUIRE(timer.is_running());
BOOST_REQUIRE(usleep(sleep_time_microsec) == 0);
// CPU-time checks are disabled; presumably a sleeping process may accrue no
// measurable CPU time, which would make them flaky (TODO confirm).
// BOOST_CHECK(timer.get_elapsed_cpu_time() > 0.0);
// BOOST_CHECK(timer.get_elapsed_cpu_time_microseconds() > 0);
BOOST_CHECK(timer.get_elapsed_wall_time() > 0.0);
BOOST_CHECK(timer.get_elapsed_wall_time_microseconds() > 0);
// Same checks again after resetting the reference point with restart().
timer.restart();
BOOST_REQUIRE(timer.is_running());
BOOST_REQUIRE(usleep(sleep_time_microsec) == 0);
// BOOST_CHECK(timer.get_elapsed_cpu_time() > 0.0);
// BOOST_CHECK(timer.get_elapsed_cpu_time_microseconds() > 0);
BOOST_CHECK(timer.get_elapsed_wall_time() > 0.0);
BOOST_CHECK(timer.get_elapsed_wall_time_microseconds() > 0);
const std::string s = timer.ToString();
BOOST_CHECK(!s.empty());
}

View File

@ -84,5 +84,5 @@ void PrintUserTime(const std::string &message)
double GetUserTime()
{
return g_timer.get_elapsed_time();
return g_timer.get_elapsed_cpu_time();
}

View File

@ -131,6 +131,7 @@ void usage()
cerr << "[--sctype|-s] the scorer type (default BLEU)" << endl;
cerr << "[--scconfig|-c] configuration string passed to scorer" << endl;
cerr << "\tThis is of the form NAME1:VAL1,NAME2:VAL2 etc " << endl;
cerr << "[--factors|-f] list of factors passed to the scorer (e.g. 0|2)" << endl;
cerr << "[--reference|-R] comma separated list of reference files" << endl;
cerr << "[--candidate|-C] comma separated list of candidate files" << endl;
cerr << "[--bootstrap|-b] number of booststraped samples (default 0 - no bootstraping)" << endl;
@ -138,10 +139,19 @@ void usage()
cerr << "[--help|-h] print this message and exit" << endl;
cerr << endl;
cerr << "Evaluator is able to compute more metrics at once. To do this," << endl;
cerr << "separate scorers with semicolon (note that comma is used to separate" << endl;
cerr << "scorers in the interpolated scorer)." << endl;
cerr << "specify more --sctype arguments. You can also specify more --scconfig strings." << endl;
cerr << endl;
cerr << "If you specify only one metric and one candidate file, only the final score" << endl;
cerr << "The example below prints BLEU score, PER score and interpolated" << endl;
cerr << "score of CDER and PER with the given weights." << endl;
cerr << endl;
cerr << "./evaluator \\" << endl;
cerr << "\t--sctype BLEU --scconfig reflen:closest \\" << endl;
cerr << "\t--sctype PER \\" << endl;
cerr << "\t--sctype CDER,PER --scconfig weights:0.25+0.75 \\" << endl;
cerr << "\t--candidate CANDIDATE \\" << endl;
cerr << "\t--reference REFERENCE" << endl;
cerr << endl;
cerr << "If you specify only one scorer and one candidate file, only the final score" << endl;
cerr << "will be printed to stdout. Otherwise each line will contain metric name" << endl;
cerr << "and/or filename and the final score. Since most of the metrics prints some" << endl;
cerr << "debuging info, consider redirecting stderr to /dev/null." << endl;
@ -155,24 +165,24 @@ static struct option long_options[] = {
{"candidate", required_argument, 0, 'C'},
{"bootstrap", required_argument, 0, 'b'},
{"rseed", required_argument, 0, 'r'},
{"factors", required_argument, 0, 'f'},
{"help", no_argument, 0, 'h'},
{0, 0, 0, 0}
};
// Options used in evaluator.
struct ProgramOption {
string scorer_type;
string scorer_config;
vector<string> scorer_types;
vector<string> scorer_configs;
string reference;
string candidate;
vector<string> scorer_factors;
int bootstrap;
int seed;
bool has_seed;
ProgramOption()
: scorer_type("BLEU"),
scorer_config(""),
reference(""),
: reference(""),
candidate(""),
bootstrap(0),
seed(0),
@ -182,13 +192,17 @@ struct ProgramOption {
void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
int c;
int option_index;
while ((c = getopt_long(argc, argv, "s:c:R:C:b:r:h", long_options, &option_index)) != -1) {
int last_scorer_index = -1;
while ((c = getopt_long(argc, argv, "s:c:R:C:b:r:f:h", long_options, &option_index)) != -1) {
switch(c) {
case 's':
opt->scorer_type = string(optarg);
opt->scorer_types.push_back(string(optarg));
opt->scorer_configs.push_back(string(""));
opt->scorer_factors.push_back(string(""));
last_scorer_index++;
break;
case 'c':
opt->scorer_config = string(optarg);
opt->scorer_configs[last_scorer_index] = string(optarg);
break;
case 'R':
opt->reference = string(optarg);
@ -203,10 +217,21 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
opt->seed = strtol(optarg, NULL, 10);
opt->has_seed = true;
break;
case 'f':
opt->scorer_factors[last_scorer_index] = string(optarg);
break;
default:
usage();
}
}
// Add default scorer if no scorer provided
if (opt->scorer_types.size() == 0)
{
opt->scorer_types.push_back(string("BLEU"));
opt->scorer_configs.push_back(string(""));
opt->scorer_factors.push_back(string(""));
}
}
void InitSeed(const ProgramOption *opt) {
@ -236,7 +261,6 @@ int main(int argc, char** argv)
try {
vector<string> refFiles;
vector<string> candFiles;
vector<string> scorerTypes;
if (option.reference.length() == 0) throw runtime_error("You have to specify at least one reference file.");
split(option.reference, ',', refFiles);
@ -244,17 +268,15 @@ int main(int argc, char** argv)
if (option.candidate.length() == 0) throw runtime_error("You have to specify at least one candidate file.");
split(option.candidate, ',', candFiles);
if (option.scorer_type.length() == 0) throw runtime_error("You have to specify at least one scorer.");
split(option.scorer_type, ';', scorerTypes);
if (candFiles.size() > 1) g_has_more_files = true;
if (scorerTypes.size() > 1) g_has_more_scorers = true;
if (option.scorer_types.size() > 1) g_has_more_scorers = true;
for (vector<string>::const_iterator fileIt = candFiles.begin(); fileIt != candFiles.end(); ++fileIt)
{
for (vector<string>::const_iterator scorerIt = scorerTypes.begin(); scorerIt != scorerTypes.end(); ++scorerIt)
for (size_t i = 0; i < option.scorer_types.size(); i++)
{
g_scorer = ScorerFactory::getScorer(*scorerIt, option.scorer_config);
g_scorer = ScorerFactory::getScorer(option.scorer_types[i], option.scorer_configs[i]);
g_scorer->setFactors(option.scorer_factors[i]);
g_scorer->setReferenceFiles(refFiles);
EvaluatorUtil::evaluate(*fileIt, option.bootstrap);
delete g_scorer;

View File

@ -26,6 +26,7 @@ void usage()
cerr << "[--sctype|-s] the scorer type (default BLEU)" << endl;
cerr << "[--scconfig|-c] configuration string passed to scorer" << endl;
cerr << "\tThis is of the form NAME1:VAL1,NAME2:VAL2 etc " << endl;
cerr << "[--factors|-f] list of factors passed to the scorer (e.g. 0|2)" << endl;
cerr << "[--reference|-r] comma separated list of reference files" << endl;
cerr << "[--binary|-b] use binary output format (default to text )" << endl;
cerr << "[--nbest|-n] the nbest file" << endl;
@ -41,6 +42,7 @@ void usage()
static struct option long_options[] = {
{"sctype", required_argument, 0, 's'},
{"scconfig", required_argument,0, 'c'},
{"factors", required_argument,0, 'f'},
{"reference", required_argument, 0, 'r'},
{"binary", no_argument, 0, 'b'},
{"nbest", required_argument, 0, 'n'},
@ -57,6 +59,7 @@ static struct option long_options[] = {
struct ProgramOption {
string scorerType;
string scorerConfig;
string scorerFactors;
string referenceFile;
string nbestFile;
string scoreDataFile;
@ -69,6 +72,7 @@ struct ProgramOption {
ProgramOption()
: scorerType("BLEU"),
scorerConfig(""),
scorerFactors(""),
referenceFile(""),
nbestFile(""),
scoreDataFile("statscore.data"),
@ -83,7 +87,7 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
int c;
int option_index;
while ((c = getopt_long(argc, argv, "s:r:n:S:F:R:E:v:hb", long_options, &option_index)) != -1) {
while ((c = getopt_long(argc, argv, "s:r:f:n:S:F:R:E:v:hb", long_options, &option_index)) != -1) {
switch (c) {
case 's':
opt->scorerType = string(optarg);
@ -91,6 +95,9 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
case 'c':
opt->scorerConfig = string(optarg);
break;
case 'f':
opt->scorerFactors = string(optarg);
break;
case 'r':
opt->referenceFile = string(optarg);
break;
@ -180,6 +187,8 @@ int main(int argc, char** argv)
Scorer* scorer = ScorerFactory::getScorer(option.scorerType, option.scorerConfig);
scorer->setFactors(option.scorerFactors);
// load references
if (referenceFiles.size() > 0)
scorer->setReferenceFiles(referenceFiles);
@ -206,16 +215,9 @@ int main(int argc, char** argv)
data.remove_duplicates();
//END_ADDED
if (option.binmode)
cerr << "Binary write mode is selected" << endl;
else
cerr << "Binary write mode is NOT selected" << endl;
data.save(option.featureDataFile, option.scoreDataFile, option.binmode);
PrintUserTime("Stopping...");
// timer.stop("Stopping...");
delete scorer;
return EXIT_SUCCESS;

View File

@ -343,6 +343,8 @@ int main(int argc, char **argv)
data.load(FeatureDataFiles.at(i), ScoreDataFiles.at(i));
}
TheScorer->setScoreData(data.getScoreData().get());
//ADDED_BY_TS
data.remove_duplicates();
//END_ADDED
@ -362,13 +364,6 @@ int main(int argc, char **argv)
vector<string> features;
Tokenize(option.to_optimize_str.c_str(), ',', &features);
if (option.pdim != static_cast<int>(features.size())) {
cerr << "Error: pdim and the specified number of features are not equal: "
<< "pdim = " << option.pdim
<< ", the number of features = " << features.size() << endl;
exit(1);
}
for (vector<string>::const_iterator it = features.begin();
it != features.end(); ++it) {
const int feature_index = data.getFeatureIndex(*it);
@ -405,6 +400,7 @@ int main(int argc, char **argv)
Point::setpdim(option.pdim);
Point::setdim(to_optimize.size());
Point::set_optindices(to_optimize);
//starting points consist of specified points and random restarts
vector<Point> startingPoints;

View File

@ -2575,7 +2575,7 @@ sub create_step {
$subdir = "lm" if $subdir eq "interpolated-lm";
open(STEP,">$file");
print STEP "#!/bin/bash\n\n";
print STEP "PATH=".$ENV{"PATH"}."\n";
print STEP "PATH=\"".$ENV{"PATH"}."\"\n";
print STEP "cd $dir\n";
print STEP "echo 'starting at '`date`' on '`hostname`\n";
print STEP "mkdir -p $dir/$subdir\n\n";

View File

@ -181,7 +181,7 @@ if ($opt_hierarchical)
my %PHRASE_USED;
if (!$opt_hierarchical) {
# get the phrase pairs appearing in the input text, up to the $MAX_LENGTH
open(INPUT,$input) or die "Can't read $input";
open(INPUT,mk_open_string($input)) or die "Can't read $input";
while(my $line = <INPUT>) {
chomp($line);
my @WORD = split(/ +/,$line);
@ -207,6 +207,22 @@ if (!$opt_hierarchical) {
close(INPUT);
}
# Build the argument for a two-argument open() that reads the given file.
# Order of preference:
#   1. "<file>.gz" exists next to a non-.gz name -> pipe from $ZCAT on it
#   2. the name itself ends in .gz                -> pipe from $ZCAT
#   3. hierarchical mode                          -> pipe from cat
#   4. otherwise                                  -> plain "< file" read
sub mk_open_string {
  my ($path) = @_;

  return "$ZCAT $path.gz |" if $path !~ /\.gz$/ && -e "$path.gz";
  return "$ZCAT $path |"    if $path =~ /\.gz$/;
  return "cat $path |"      if $opt_hierarchical;
  return "< $path";
}
# filter files
for(my $i=0;$i<=$#TABLE;$i++) {
my ($used,$total) = (0,0);
@ -215,16 +231,7 @@ for(my $i=0;$i<=$#TABLE;$i++) {
my $new_file = $TABLE_NEW_NAME[$i];
print STDERR "filtering $file -> $new_file...\n";
my $openstring;
if ($file !~ /\.gz$/ && -e "$file.gz") {
$openstring = "$ZCAT $file.gz |";
} elsif ($file =~ /\.gz$/) {
$openstring = "$ZCAT $file |";
} elsif ($opt_hierarchical) {
$openstring = "cat $file |";
} else {
$openstring = "< $file";
}
my $openstring = mk_open_string($file);
my $new_openstring;
if ($new_file =~ /\.gz$/) {
@ -303,7 +310,7 @@ close(INFO);
print "To run the decoder, please call:
moses -f $dir/moses.ini < $input\n";
moses -f $dir/moses.ini -i $input\n";
sub safesystem {
print STDERR "Executing: @_\n";

View File

@ -376,7 +376,7 @@ void Model::zipFile()
{
fclose(file);
file = fopen(filename.c_str(), "rb");
FILE* gzfile = (FILE*) gzopen((filename+".gz").c_str(),"wb");
gzFile gzfile = gzopen((filename+".gz").c_str(),"wb");
char inbuffer[128];
int num_read;
while ((num_read = fread(inbuffer, 1, sizeof(inbuffer), file)) > 0) {

View File

@ -137,6 +137,7 @@ my $___INPUTTYPE = 0;
my $mertdir = undef; # path to new mert directory
my $mertargs = undef; # args to pass through to mert & extractor
my $mertmertargs = undef; # args to pass through to mert only
my $extractorargs = undef; # args to pass through to extractor only
my $filtercmd = undef; # path to filter-model-given-input.pl
my $filterfile = undef;
my $qsubwrapper = undef;
@ -178,6 +179,7 @@ GetOptions(
"verbose" => \$verbose,
"mertdir=s" => \$mertdir,
"mertargs=s" => \$mertargs,
"extractorargs=s" => \$extractorargs,
"mertmertargs=s" => \$mertmertargs,
"rootdir=s" => \$SCRIPTS_ROOTDIR,
"filtercmd=s" => \$filtercmd, # allow to override the default location
@ -241,8 +243,9 @@ Options:
model. useful for lattice decoding
--rootdir=STRING ... where do helpers reside (if not given explicitly)
--mertdir=STRING ... path to new mert implementation
--mertargs=STRING ... extra args for mert, eg. to specify scorer
--mertmertargs=STRING ... extra args for mert only,
--mertargs=STRING ... extra args for both extractor and mert
--extractorargs=STRING ... extra args for extractor only
--mertmertargs=STRING ... extra args for mert only
--scorenbestcmd=STRING ... path to score-nbest.py
--old-sge ... passed to parallelizers, assume Grid Engine < 6.0
--inputtype=[0|1|2] ... Handle different input types: (0 for text,
@ -364,6 +367,7 @@ $scconfig = "--scconfig $scconfig" if ($scconfig);
my $mert_extract_args=$mertargs;
$mert_extract_args .=" $scconfig";
$mert_extract_args .=" $extractorargs";
$mertmertargs = "" if !defined $mertmertargs;

View File

@ -47,7 +47,14 @@ inline uint8_t BitPackShift(uint8_t bit, uint8_t length) {
#endif
/* Load a raw 64-bit word starting at the byte that contains bit offset
 * bit_off, i.e. at byte index (bit_off >> 3) from base.  The low three bits
 * of bit_off are not applied here; the value is returned unshifted.
 * Always reads sizeof(uint64_t) bytes, so that many bytes must be readable
 * from the computed address.
 */
inline uint64_t ReadOff(const void *base, uint64_t bit_off) {
#if defined(__arm) || defined(__arm__)
// ARM build: copy the bytes with memcpy instead of dereferencing a cast
// pointer (presumably because the address may not be 8-byte aligned).
const uint8_t *base_off = reinterpret_cast<const uint8_t*>(base) + (bit_off >> 3);
uint64_t value64;
memcpy(&value64, base_off, sizeof(value64));
return value64;
#else
// Other platforms: direct load through a reinterpreted pointer.
return *reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(base) + (bit_off >> 3));
#endif
}
/* Pack integers up to 57 bits using their least significant digits.
@ -75,7 +82,14 @@ inline void WriteInt57(void *base, uint64_t bit_off, uint8_t length, uint64_t va
/* Same caveats as above, but for a 25 bit limit. */
/* Read a packed integer: load 32 bits starting at byte index (bit_off >> 3)
 * from base, shift right by BitPackShift(bit_off & 7, length), and keep only
 * the bits selected by mask.  Always reads sizeof(uint32_t) bytes from the
 * computed address.
 */
inline uint32_t ReadInt25(const void *base, uint64_t bit_off, uint8_t length, uint32_t mask) {
#if defined(__arm) || defined(__arm__)
// ARM build: copy the bytes with memcpy instead of dereferencing a cast
// pointer (presumably because the address may not be 4-byte aligned).
const uint8_t *base_off = reinterpret_cast<const uint8_t*>(base) + (bit_off >> 3);
uint32_t value32;
memcpy(&value32, base_off, sizeof(value32));
return (value32 >> BitPackShift(bit_off & 7, length)) & mask;
#else
// Other platforms: direct load through a reinterpreted pointer.
return (*reinterpret_cast<const uint32_t*>(reinterpret_cast<const uint8_t*>(base) + (bit_off >> 3)) >> BitPackShift(bit_off & 7, length)) & mask;
#endif
}
inline void WriteInt25(void *base, uint64_t bit_off, uint8_t length, uint32_t value) {

View File

@ -99,6 +99,13 @@ void WriteOrThrow(int fd, const void *data_void, std::size_t size) {
}
}
// Call fsync() on fd and raise ErrnoException (via UTIL_THROW_IF) when it
// returns -1.  On Windows builds the body is compiled out, so the function
// does nothing there.
void FSyncOrThrow(int fd) {
// Apparently windows doesn't have fsync?
#if !defined(_WIN32) && !defined(_WIN64)
UTIL_THROW_IF(-1 == fsync(fd), ErrnoException, "Sync of " << fd << " failed.");
#endif
}
namespace {
void InternalSeek(int fd, off_t off, int whence) {
UTIL_THROW_IF((off_t)-1 == lseek(fd, off, whence), ErrnoException, "Seek failed");

View File

@ -78,6 +78,8 @@ std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount);
void WriteOrThrow(int fd, const void *data_void, std::size_t size);
void FSyncOrThrow(int fd);
// Seeking
void SeekOrThrow(int fd, uint64_t off);
void AdvanceOrThrow(int fd, int64_t off);

View File

@ -24,12 +24,12 @@ ParseNumberException::ParseNumberException(StringPiece value) throw() {
*this << "Could not parse \"" << value << "\" into a number";
}
GZException::GZException(gzFile file) {
#ifdef HAVE_ZLIB
GZException::GZException(gzFile file) {
int num;
*this << gzerror( file, &num) << " from zlib";
#endif // HAVE_ZLIB
}
#endif // HAVE_ZLIB
// Sigh this is the only way I could come up with to do a _const_ bool. It has ' ', '\f', '\n', '\r', '\t', and '\v' (same as isspace on C locale).
const bool kSpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};

View File

@ -27,7 +27,9 @@ class ParseNumberException : public Exception {
class GZException : public Exception {
public:
#ifdef HAVE_ZLIB
explicit GZException(gzFile file);
#endif
GZException() throw() {}
~GZException() throw() {}
};

View File

@ -1,126 +0,0 @@
#ifndef UTIL_KEY_VALUE_PACKING__
#define UTIL_KEY_VALUE_PACKING__
/* Why such a general interface? I'm planning on doing bit-level packing. */
#include <algorithm>
#include <cstddef>
#include <cstring>
#include <stdint.h>
namespace util {
// Plain key/value pair with accessor methods; entries compare by key only.
template <class Key, class Value> struct Entry {
  Key key;
  Value value;

  // Read access.
  const Key &GetKey() const { return key; }
  const Value &GetValue() const { return value; }

  // Write access.
  Value &MutableValue() { return value; }
  void SetKey(const Key &key_in) { key = key_in; }
  void SetValue(const Value &value_in) { value = value_in; }

  // Assign both fields, key first.
  void Set(const Key &key_in, const Value &value_in) {
    key = key_in;
    value = value_in;
  }

  // Ordering ignores the value, so sorting arranges entries by key.
  bool operator<(const Entry<Key, Value> &other) const { return key < other.key; }
};
// And now for a brief interlude to specialize std::swap.
} // namespace util
namespace std {
// Member-wise swap of two util::Entry objects: swaps the keys, then the values.
// NOTE(review): this is a new function-template overload placed in namespace
// std, not a specialization of std::swap.  The C++ standard does not allow
// adding overloads to namespace std; a swap in namespace util found through
// argument-dependent lookup would be the conforming alternative.
template <class Key, class Value> void swap(util::Entry<Key, Value> &first, util::Entry<Key, Value> &second) {
swap(first.key, second.key);
swap(first.value, second.value);
}
}// namespace std
namespace util {
// Packing policy that stores entries as ordinary Entry<Key, Value> structs
// with the compiler's default layout, and iterates over them with raw
// pointers.
template <class KeyT, class ValueT> class AlignedPacking {
  public:
    typedef KeyT Key;
    typedef ValueT Value;

    typedef Entry<Key, Value> *MutableIterator;
    typedef const Entry<Key, Value> *ConstIterator;
    typedef const Entry<Key, Value> &ConstReference;

    // Storage consumed by one entry.
    static const std::size_t kBytes = sizeof(Entry<Key, Value>);
    static const std::size_t kBits = kBytes * 8;

    // Treat untyped memory as the start of an array of entries.
    static MutableIterator FromVoid(void *start) {
      return static_cast<MutableIterator>(start);
    }

    // Build an entry by value from a key and a value.
    static Entry<Key, Value> Make(const Key &key, const Value &value) {
      Entry<Key, Value> made;
      made.Set(key, value);
      return made;
    }
};
// Packing policy that stores entries in a private RawEntry struct declared
// under #pragma pack(1), and iterates over them with raw pointers.
// RawEntry offers the same accessors as util::Entry and compares by key only.
template <class KeyT, class ValueT> class ByteAlignedPacking {
public:
typedef KeyT Key;
typedef ValueT Value;
private:
// pack(1) applies to RawEntry only; the previous packing is restored by the
// matching pop below.
#pragma pack(push)
#pragma pack(1)
struct RawEntry {
Key key;
Value value;
const Key &GetKey() const { return key; }
const Value &GetValue() const { return value; }
Value &MutableValue() { return value; }
void Set(const Key &key_in, const Value &value_in) {
SetKey(key_in);
SetValue(value_in);
}
void SetKey(const Key &key_in) { key = key_in; }
void SetValue(const Value &value_in) { value = value_in; }
bool operator<(const RawEntry &other) const { return GetKey() < other.GetKey(); }
};
#pragma pack(pop)
// Grants the std::swap template access to the private RawEntry type.
friend void std::swap<>(RawEntry&, RawEntry&);
public:
typedef RawEntry *MutableIterator;
typedef const RawEntry *ConstIterator;
// NOTE(review): unlike AlignedPacking::ConstReference this is a non-const
// reference despite its name; confirm whether that is intentional.
typedef RawEntry &ConstReference;
// Storage consumed by one packed entry.
static const std::size_t kBytes = sizeof(RawEntry);
static const std::size_t kBits = kBytes * 8;
// Treat untyped memory as the start of an array of packed entries.
static MutableIterator FromVoid(void *start) {
return MutableIterator(reinterpret_cast<RawEntry*>(start));
}
// Build a packed entry by value from a key and a value.
static RawEntry Make(const Key &key, const Value &value) {
RawEntry ret;
ret.Set(key, value);
return ret;
}
};
} // namespace util
namespace std {
// Member-wise swap of two ByteAlignedPacking<Key, Value>::RawEntry objects:
// swaps the keys, then the values.
// NOTE(review): Key and Value appear only inside a nested-name-specifier
// (typename ...::RawEntry), which is a non-deduced context, so a plain call
// such as swap(a, b) cannot deduce them and will not select this overload.
// It is also an overload added to namespace std rather than a specialization
// of std::swap.  Verify whether this function is ever actually used.
template <class Key, class Value> void swap(
typename util::ByteAlignedPacking<Key, Value>::RawEntry &first,
typename util::ByteAlignedPacking<Key, Value>::RawEntry &second) {
swap(first.key, second.key);
swap(first.value, second.value);
}
}// namespace std
#endif // UTIL_KEY_VALUE_PACKING__

View File

@ -1,75 +0,0 @@
#include "util/key_value_packing.hh"
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/uniform_int.hpp>
#include <boost/random/variate_generator.hpp>
#include <boost/scoped_array.hpp>
#define BOOST_TEST_MODULE KeyValueStoreTest
#include <boost/test/unit_test.hpp>
#include <limits>
#include <stdlib.h>
namespace util {
namespace {
// Write keys and values into a malloc'd two-entry buffer through a
// MutableIterator and read them back, including through a ConstIterator that
// steps backwards to the first entry.
BOOST_AUTO_TEST_CASE(basic_in_out) {
typedef ByteAlignedPacking<uint64_t, unsigned char> Packing;
void *backing = malloc(Packing::kBytes * 2);
Packing::MutableIterator i(Packing::FromVoid(backing));
// First entry.
i->SetKey(10);
BOOST_CHECK_EQUAL(10, i->GetKey());
i->SetValue(3);
BOOST_CHECK_EQUAL(3, i->GetValue());
// Second entry.
++i;
i->SetKey(5);
BOOST_CHECK_EQUAL(5, i->GetKey());
i->SetValue(42);
BOOST_CHECK_EQUAL(42, i->GetValue());
// A const iterator copied from i sees the second entry, then the first
// after decrementing; i itself must still point at the second entry.
Packing::ConstIterator c(i);
BOOST_CHECK_EQUAL(5, c->GetKey());
--c;
BOOST_CHECK_EQUAL(10, c->GetKey());
BOOST_CHECK_EQUAL(42, i->GetValue());
BOOST_CHECK_EQUAL(5, i->GetKey());
free(backing);
}
// std::sort over four packed entries held in a stack buffer must order them
// by key.  Only the keys are assigned; the values are never written.
BOOST_AUTO_TEST_CASE(simple_sort) {
typedef ByteAlignedPacking<uint64_t, unsigned char> Packing;
char foo[Packing::kBytes * 4];
Packing::MutableIterator begin(Packing::FromVoid(foo));
Packing::MutableIterator i = begin;
// Keys in unsorted order 0, 2, 3, 1; i ends one past the last entry.
i->SetKey(0); ++i;
i->SetKey(2); ++i;
i->SetKey(3); ++i;
i->SetKey(1); ++i;
std::sort(begin, i);
BOOST_CHECK_EQUAL(0, begin[0].GetKey());
BOOST_CHECK_EQUAL(1, begin[1].GetKey());
BOOST_CHECK_EQUAL(2, begin[2].GetKey());
BOOST_CHECK_EQUAL(3, begin[3].GetKey());
}
// Sort 1000 packed entries with pseudo-random 64-bit keys and check that each
// entry compares less than its successor.
BOOST_AUTO_TEST_CASE(big_sort) {
typedef ByteAlignedPacking<uint64_t, unsigned char> Packing;
boost::scoped_array<char> memory(new char[Packing::kBytes * 1000]);
Packing::MutableIterator begin(Packing::FromVoid(memory.get()));
// Default-constructed generator, so no explicit seed is supplied here.
boost::mt19937 rng;
boost::uniform_int<uint64_t> range(0, std::numeric_limits<uint64_t>::max());
boost::variate_generator<boost::mt19937&, boost::uniform_int<uint64_t> > gen(rng, range);
for (size_t i = 0; i < 1000; ++i) {
(begin + i)->SetKey(gen());
}
std::sort(begin, begin + 1000);
// Strict '<' means this check also requires all generated keys to be distinct.
for (size_t i = 0; i < 999; ++i) {
BOOST_CHECK(begin[i] < begin[i+1]);
}
}
} // namespace
} // namespace util

View File

@ -101,9 +101,10 @@ void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int
#if defined(_WIN32) || defined(_WIN64)
int protectC = for_write ? PAGE_READWRITE : PAGE_READONLY;
int protectM = for_write ? FILE_MAP_WRITE : FILE_MAP_READ;
HANDLE hMapping = CreateFileMapping((HANDLE)_get_osfhandle(fd), NULL, protectC, 0, size + offset, NULL);
uint64_t total_size = size + offset;
HANDLE hMapping = CreateFileMapping((HANDLE)_get_osfhandle(fd), NULL, protectC, total_size >> 32, static_cast<DWORD>(total_size), NULL);
UTIL_THROW_IF(!hMapping, ErrnoException, "CreateFileMapping failed");
LPVOID ret = MapViewOfFile(hMapping, protectM, 0, offset, size);
LPVOID ret = MapViewOfFile(hMapping, protectM, offset >> 32, offset, size);
CloseHandle(hMapping);
UTIL_THROW_IF(!ret, ErrnoException, "MapViewOfFile failed");
#else
@ -147,16 +148,20 @@ void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scope
}
}
void *MapAnonymous(std::size_t size) {
return MapOrThrow(size, true,
// Allocates zeroed memory in to.
void MapAnonymous(std::size_t size, util::scoped_memory &to) {
to.reset();
#if defined(_WIN32) || defined(_WIN64)
0 // MapOrThrow ignores the flags anyway.
#elif defined(MAP_ANONYMOUS)
MAP_ANONYMOUS | MAP_PRIVATE // Linux
to.reset(calloc(1, size), size, scoped_memory::MALLOC_ALLOCATED);
#else
to.reset(MapOrThrow(size, true,
# if defined(MAP_ANONYMOUS)
MAP_ANONYMOUS | MAP_PRIVATE // Linux
# else
MAP_ANON | MAP_PRIVATE // BSD
# endif
, false, -1, 0), size, scoped_memory::MMAP_ALLOCATED);
#endif
, false, -1, 0);
}
void *MapZeroedWrite(int fd, std::size_t size) {

View File

@ -100,7 +100,7 @@ void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int
void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scoped_memory &out);
void *MapAnonymous(std::size_t size);
void MapAnonymous(std::size_t size, scoped_memory &to);
// Open file name with mmap of size bytes, all of which are initially zero.
void *MapZeroedWrite(int fd, std::size_t size);

View File

@ -7,9 +7,11 @@
* placed in namespace util
* add MurmurHashNative
* default option = 0 for seed
* ARM port from NICT
*/
#include "util/murmur_hash.hh"
#include <string.h>
namespace util {
@ -28,12 +30,24 @@ uint64_t MurmurHash64A ( const void * key, std::size_t len, unsigned int seed )
uint64_t h = seed ^ (len * m);
#if defined(__arm) || defined(__arm__)
const size_t ksize = sizeof(uint64_t);
const unsigned char * data = (const unsigned char *)key;
const unsigned char * end = data + (std::size_t)(len/8) * ksize;
#else
const uint64_t * data = (const uint64_t *)key;
const uint64_t * end = data + (len/8);
#endif
while(data != end)
{
#if defined(__arm) || defined(__arm__)
uint64_t k;
memcpy(&k, data, ksize);
data += ksize;
#else
uint64_t k = *data++;
#endif
k *= m;
k ^= k >> r;
@ -75,16 +89,30 @@ uint64_t MurmurHash64B ( const void * key, std::size_t len, unsigned int seed )
unsigned int h1 = seed ^ len;
unsigned int h2 = 0;
#if defined(__arm) || defined(__arm__)
size_t ksize = sizeof(unsigned int);
const unsigned char * data = (const unsigned char *)key;
#else
const unsigned int * data = (const unsigned int *)key;
#endif
unsigned int k1, k2;
while(len >= 8)
{
unsigned int k1 = *data++;
#if defined(__arm) || defined(__arm__)
memcpy(&k1, data, ksize);
data += ksize;
memcpy(&k2, data, ksize);
data += ksize;
#else
k1 = *data++;
k2 = *data++;
#endif
k1 *= m; k1 ^= k1 >> r; k1 *= m;
h1 *= m; h1 ^= k1;
len -= 4;
unsigned int k2 = *data++;
k2 *= m; k2 ^= k2 >> r; k2 *= m;
h2 *= m; h2 ^= k2;
len -= 4;
@ -92,7 +120,12 @@ uint64_t MurmurHash64B ( const void * key, std::size_t len, unsigned int seed )
if(len >= 4)
{
unsigned int k1 = *data++;
#if defined(__arm) || defined(__arm__)
memcpy(&k1, data, ksize);
data += ksize;
#else
k1 = *data++;
#endif
k1 *= m; k1 ^= k1 >> r; k1 *= m;
h1 *= m; h1 ^= k1;
len -= 4;

View File

@ -5,6 +5,7 @@
#include <boost/random/variate_generator.hpp>
#include <boost/scoped_array.hpp>
#include <boost/unordered_map.hpp>
#define BOOST_TEST_MODULE SortedUniformTest
#include <boost/test/unit_test.hpp>