Merge branch 'master' into miramerge

Conflicts:
	moses/src/LanguageModel.cpp
	moses/src/TargetPhrase.h
	moses/src/TrellisPath.h
	moses/src/Util.h
	scripts/training/train-model.perl
Barry Haddow 2011-10-12 17:14:23 +01:00
commit c83166087e
154 changed files with 47827 additions and 2116 deletions

View File

@@ -194,6 +194,7 @@
GCC_ENABLE_FIX_AND_CONTINUE = YES;
GCC_MODEL_TUNING = G5;
GCC_OPTIMIZATION_LEVEL = 0;
+HEADER_SEARCH_PATHS = ../kenlm;
INSTALL_PATH = /usr/local/lib;
PRODUCT_NAME = OnDiskPt;
};
@@ -205,6 +206,7 @@
ALWAYS_SEARCH_USER_PATHS = NO;
DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
GCC_MODEL_TUNING = G5;
+HEADER_SEARCH_PATHS = ../kenlm;
INSTALL_PATH = /usr/local/lib;
PRODUCT_NAME = OnDiskPt;
};
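
Both hunks above add ../kenlm to the OnDiskPt target's header search path, so kenlm headers resolve by their project-relative names. A tiny illustrative translation unit (a hypothetical file, not part of this commit) showing the kind of include the new search path enables:

// With ../kenlm on the header search path, kenlm headers resolve like this:
#include "util/string_piece.hh"  // kenlm's StringPiece utility

#include <iostream>

int main() {
  StringPiece piece("kenlm");  // type comes from the kenlm header
  std::cout.write(piece.data(), static_cast<std::streamsize>(piece.size()));
  std::cout << '\n';
  return 0;
}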

View File

@@ -81,6 +81,9 @@
/* Define to 1 if you have the <unistd.h> header file. */
#undef HAVE_UNISTD_H
+/* flag for zlib */
+#undef HAVE_ZLIB
/* Define to the sub-directory in which libtool stores uninstalled libraries.
*/
#undef LT_OBJDIR

View File

@@ -258,7 +258,7 @@ then
AC_MSG_NOTICE([])
AC_MSG_NOTICE([!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!])
-AC_MSG_NOTICE([!!! You are linking the IRSTLM library; be sure the release is >= 5.50.01 !!!])
+AC_MSG_NOTICE([!!! You are linking the IRSTLM library; be sure the release is >= 5.70.02 !!!])
AC_MSG_NOTICE([!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!])
AC_MSG_NOTICE([])
@@ -387,6 +387,11 @@ then
LDFLAGS="$LDFLAGS -L${with_zlib}/lib"
fi
+# zlib is always required (see ./moses/src/gzfilebuf.h)
+# TODO: This shouldn't be presented to the user as a config option if it isn't actually an option
+AC_CHECK_HEADER(zlib.h,
+[AC_DEFINE([HAVE_ZLIB], [], [flag for zlib])],
+[AC_MSG_ERROR([Cannot find zlib.h. Please install it. For Debian, try 'sudo aptitude install zlib1g-dev'])])
LIBS="$LIBS -lz"
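
The check above makes zlib mandatory: configure aborts if zlib.h is missing and defines HAVE_ZLIB otherwise, so every successful build has gzip support available. A minimal sketch of the guard a consumer such as moses/src/gzfilebuf.h can rely on (hypothetical helper, not code from this commit):

#include <cstdio>
#ifdef HAVE_ZLIB
#include <zlib.h>
#endif

// Hypothetical helper: read up to max bytes from a possibly-gzipped file.
// zlib's gzopen/gzread also handle uncompressed input transparently.
std::size_t ReadSome(const char *path, char *buf, unsigned max) {
#ifdef HAVE_ZLIB
  gzFile f = gzopen(path, "rb");
  if (!f) return 0;
  int got = gzread(f, buf, max);
  gzclose(f);
  return got < 0 ? 0 : static_cast<std::size_t>(got);
#else
  std::FILE *f = std::fopen(path, "rb");
  if (!f) return 0;
  std::size_t got = std::fread(buf, 1, max, f);
  std::fclose(f);
  return got;
#endif
}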

View File

@@ -13,9 +13,10 @@ libkenlm_la_SOURCES = \
lm/quantize.cc \
lm/read_arpa.cc \
lm/trie.cc \
+lm/trie_sort.cc \
lm/virtual_interface.cc \
lm/vocab.cc \
-util/scoped.cc \
+util/file.cc \
util/murmur_hash.cc \
util/mmap.cc \
util/file_piece.cc \

View File

@@ -7,7 +7,7 @@
set -e
-for i in util/{bit_packing,ersatz_progress,exception,file_piece,murmur_hash,scoped,mmap} lm/{bhiksha,binary_format,config,lm_exception,model,quantize,read_arpa,search_hashed,search_trie,trie,virtual_interface,vocab}; do
+for i in util/{bit_packing,ersatz_progress,exception,file_piece,murmur_hash,file,mmap} lm/{bhiksha,binary_format,config,lm_exception,model,quantize,read_arpa,search_hashed,search_trie,trie,trie_sort,virtual_interface,vocab}; do
g++ -I. -O3 -DNDEBUG $CXXFLAGS -c $i.cc -o $i.o
done
g++ -I. -O3 -DNDEBUG $CXXFLAGS lm/build_binary.cc {lm,util}/*.o -lz -o build_binary
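
The updated loop compiles the query-only subset (now with util/file and lm/trie_sort, and without util/scoped) and then links build_binary. A model written by build_binary can be queried through kenlm's C++ API; a short sketch, assuming the kenlm root is on the include path, the query objects are linked, and the file names are placeholders:

#include "lm/model.hh"

#include <iostream>

int main() {
  // Placeholder path: a model produced by `./build_binary file.arpa file.binary`.
  lm::ngram::Model model("file.binary");
  const lm::ngram::Model::Vocabulary &vocab = model.GetVocabulary();
  lm::ngram::State state(model.BeginSentenceState()), out;
  // Score() returns the log10 probability of the word in this context.
  float logprob = model.Score(state, vocab.Index("language"), out);
  std::cout << logprob << std::endl;
  return 0;
}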

View File

@@ -7,128 +7,197 @@
objects = {
/* Begin PBXBuildFile section */
1E2B85C412555DB1000770D6 /* lm_exception.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E2B85C112555DB1000770D6 /* lm_exception.cc */; };
1E2B85C512555DB1000770D6 /* lm_exception.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E2B85C212555DB1000770D6 /* lm_exception.hh */; };
1E37EBC712496AB400C1C73A /* virtual_interface.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E37EBC512496AB400C1C73A /* virtual_interface.cc */; };
1E37EBC812496AB400C1C73A /* virtual_interface.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E37EBC612496AB400C1C73A /* virtual_interface.hh */; };
1E46B59E13BA5BE10084F898 /* blank.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E46B59D13BA5BE10084F898 /* blank.hh */; };
1E46B5A213BA5C050084F898 /* quantize.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E46B5A013BA5C050084F898 /* quantize.cc */; };
1E46B5A313BA5C050084F898 /* quantize.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E46B5A113BA5C050084F898 /* quantize.hh */; };
1E8A94FE1288BD570022C4EB /* build_binary.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E8A94F41288BD570022C4EB /* build_binary.cc */; };
1E8A94FF1288BD570022C4EB /* config.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E8A94F51288BD570022C4EB /* config.cc */; };
1E8A95001288BD570022C4EB /* config.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E8A94F61288BD570022C4EB /* config.hh */; };
1E8A95011288BD570022C4EB /* model_test.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E8A94F71288BD570022C4EB /* model_test.cc */; };
1E8A95021288BD570022C4EB /* model.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E8A94F81288BD570022C4EB /* model.cc */; };
1E8A95031288BD570022C4EB /* model.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E8A94F91288BD570022C4EB /* model.hh */; };
1E8A95041288BD570022C4EB /* search_hashed.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E8A94FA1288BD570022C4EB /* search_hashed.cc */; };
1E8A95051288BD570022C4EB /* search_hashed.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E8A94FB1288BD570022C4EB /* search_hashed.hh */; };
1E8A95061288BD570022C4EB /* search_trie.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E8A94FC1288BD570022C4EB /* search_trie.cc */; };
1E8A95071288BD570022C4EB /* search_trie.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E8A94FD1288BD570022C4EB /* search_trie.hh */; };
1E8BF78A1278A434009F10C1 /* binary_format.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E8BF7871278A434009F10C1 /* binary_format.cc */; };
1E8BF78B1278A434009F10C1 /* binary_format.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E8BF7881278A434009F10C1 /* binary_format.hh */; };
1E8BF78C1278A434009F10C1 /* enumerate_vocab.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E8BF7891278A434009F10C1 /* enumerate_vocab.hh */; };
1E8BF79D1278A443009F10C1 /* trie.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E8BF7951278A443009F10C1 /* trie.cc */; };
1E8BF79E1278A443009F10C1 /* trie.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E8BF7961278A443009F10C1 /* trie.hh */; };
1E8BF7D51278A600009F10C1 /* bit_packing.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E8BF7D41278A600009F10C1 /* bit_packing.cc */; };
1E91441413D065490005055B /* bhiksha.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E91441213D065490005055B /* bhiksha.cc */; };
1E91441513D065490005055B /* bhiksha.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E91441313D065490005055B /* bhiksha.hh */; };
1EBB16D7126C158600AE6102 /* ersatz_progress.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1EBB16BF126C158600AE6102 /* ersatz_progress.cc */; };
1EBB16D8126C158600AE6102 /* ersatz_progress.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1EBB16C0126C158600AE6102 /* ersatz_progress.hh */; };
1EBB16D9126C158600AE6102 /* exception.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1EBB16C1126C158600AE6102 /* exception.cc */; };
1EBB16DA126C158600AE6102 /* exception.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1EBB16C2126C158600AE6102 /* exception.hh */; };
1EBB16DC126C158600AE6102 /* file_piece.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1EBB16C4126C158600AE6102 /* file_piece.cc */; };
1EBB16DD126C158600AE6102 /* file_piece.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1EBB16C5126C158600AE6102 /* file_piece.hh */; };
1EBB16DE126C158600AE6102 /* joint_sort_test.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1EBB16C6126C158600AE6102 /* joint_sort_test.cc */; };
1EBB16DF126C158600AE6102 /* joint_sort.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1EBB16C7126C158600AE6102 /* joint_sort.hh */; };
1EBB16E0126C158600AE6102 /* key_value_packing_test.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1EBB16C8126C158600AE6102 /* key_value_packing_test.cc */; };
1EBB16E1126C158600AE6102 /* key_value_packing.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1EBB16C9126C158600AE6102 /* key_value_packing.hh */; };
1EBB16E2126C158600AE6102 /* mmap.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1EBB16CA126C158600AE6102 /* mmap.cc */; };
1EBB16E3126C158600AE6102 /* mmap.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1EBB16CB126C158600AE6102 /* mmap.hh */; };
1EBB16E4126C158600AE6102 /* murmur_hash.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1EBB16CC126C158600AE6102 /* murmur_hash.cc */; };
1EBB16E5126C158600AE6102 /* murmur_hash.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1EBB16CD126C158600AE6102 /* murmur_hash.hh */; };
1EBB16E6126C158600AE6102 /* probing_hash_table_test.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1EBB16CE126C158600AE6102 /* probing_hash_table_test.cc */; };
1EBB16E7126C158600AE6102 /* probing_hash_table.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1EBB16CF126C158600AE6102 /* probing_hash_table.hh */; };
1EBB16E8126C158600AE6102 /* proxy_iterator.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1EBB16D0126C158600AE6102 /* proxy_iterator.hh */; };
1EBB16E9126C158600AE6102 /* scoped.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1EBB16D1126C158600AE6102 /* scoped.cc */; };
1EBB16EA126C158600AE6102 /* scoped.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1EBB16D2126C158600AE6102 /* scoped.hh */; };
1EBB16EB126C158600AE6102 /* sorted_uniform_test.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1EBB16D3126C158600AE6102 /* sorted_uniform_test.cc */; };
1EBB16EC126C158600AE6102 /* sorted_uniform.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1EBB16D4126C158600AE6102 /* sorted_uniform.hh */; };
1EBB16EE126C158600AE6102 /* string_piece.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1EBB16D6126C158600AE6102 /* string_piece.hh */; };
1EBB1717126C15C500AE6102 /* facade.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1EBB1708126C15C500AE6102 /* facade.hh */; };
1EBB171A126C15C500AE6102 /* ngram_query.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1EBB170B126C15C500AE6102 /* ngram_query.cc */; };
1EBB171C126C15C500AE6102 /* read_arpa.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1EBB170D126C15C500AE6102 /* read_arpa.cc */; };
1EBB171D126C15C500AE6102 /* read_arpa.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1EBB170E126C15C500AE6102 /* read_arpa.hh */; };
1EBB171E126C15C500AE6102 /* sri_test.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1EBB170F126C15C500AE6102 /* sri_test.cc */; };
1EBB171F126C15C500AE6102 /* sri.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1EBB1710126C15C500AE6102 /* sri.cc */; };
1EBB1720126C15C500AE6102 /* sri.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1EBB1711126C15C500AE6102 /* sri.hh */; };
1EBB1721126C15C500AE6102 /* vocab.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1EBB1713126C15C500AE6102 /* vocab.cc */; };
1EBB1722126C15C500AE6102 /* vocab.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1EBB1714126C15C500AE6102 /* vocab.hh */; };
1EBB1723126C15C500AE6102 /* weights.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1EBB1715126C15C500AE6102 /* weights.hh */; };
1EBB1724126C15C500AE6102 /* word_index.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1EBB1716126C15C500AE6102 /* word_index.hh */; };
1ED9988712783457006BBB6C /* file_piece_test.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1ED9988612783457006BBB6C /* file_piece_test.cc */; };
1E69E6C7142EED56004E4D93 /* bit_packing.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E69E69B142EED56004E4D93 /* bit_packing.cc */; };
1E69E6C8142EED56004E4D93 /* bit_packing.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E69C142EED56004E4D93 /* bit_packing.hh */; };
1E69E6C9142EED56004E4D93 /* bit_packing.o in Frameworks */ = {isa = PBXBuildFile; fileRef = 1E69E69D142EED56004E4D93 /* bit_packing.o */; };
1E69E6CA142EED56004E4D93 /* bit_packing_test.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E69E69F142EED56004E4D93 /* bit_packing_test.cc */; };
1E69E6CB142EED56004E4D93 /* ersatz_progress.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E69E6A0142EED56004E4D93 /* ersatz_progress.cc */; };
1E69E6CC142EED56004E4D93 /* ersatz_progress.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E6A1142EED56004E4D93 /* ersatz_progress.hh */; };
1E69E6CD142EED56004E4D93 /* ersatz_progress.o in Frameworks */ = {isa = PBXBuildFile; fileRef = 1E69E6A2142EED56004E4D93 /* ersatz_progress.o */; };
1E69E6CE142EED56004E4D93 /* exception.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E69E6A3142EED56004E4D93 /* exception.cc */; };
1E69E6CF142EED56004E4D93 /* exception.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E6A4142EED56004E4D93 /* exception.hh */; };
1E69E6D0142EED56004E4D93 /* exception.o in Frameworks */ = {isa = PBXBuildFile; fileRef = 1E69E6A5142EED56004E4D93 /* exception.o */; };
1E69E6D1142EED56004E4D93 /* file.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E69E6A6142EED56004E4D93 /* file.cc */; };
1E69E6D2142EED56004E4D93 /* file.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E6A7142EED56004E4D93 /* file.hh */; };
1E69E6D3142EED56004E4D93 /* file_piece.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E69E6A8142EED56004E4D93 /* file_piece.cc */; };
1E69E6D4142EED56004E4D93 /* file_piece.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E6AA142EED56004E4D93 /* file_piece.hh */; };
1E69E6D5142EED56004E4D93 /* file_piece.o in Frameworks */ = {isa = PBXBuildFile; fileRef = 1E69E6AB142EED56004E4D93 /* file_piece.o */; };
1E69E6D6142EED56004E4D93 /* file_piece_test.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E69E6AD142EED56004E4D93 /* file_piece_test.cc */; };
1E69E6D7142EED56004E4D93 /* have.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E6AE142EED56004E4D93 /* have.hh */; };
1E69E6D8142EED56004E4D93 /* joint_sort.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E6AF142EED56004E4D93 /* joint_sort.hh */; };
1E69E6D9142EED56004E4D93 /* joint_sort_test.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E69E6B1142EED56004E4D93 /* joint_sort_test.cc */; };
1E69E6DA142EED56004E4D93 /* key_value_packing.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E6B2142EED56004E4D93 /* key_value_packing.hh */; };
1E69E6DB142EED56004E4D93 /* key_value_packing_test.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E69E6B4142EED56004E4D93 /* key_value_packing_test.cc */; };
1E69E6DC142EED56004E4D93 /* mmap.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E69E6B5142EED56004E4D93 /* mmap.cc */; };
1E69E6DD142EED56004E4D93 /* mmap.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E6B6142EED56004E4D93 /* mmap.hh */; };
1E69E6DE142EED56004E4D93 /* mmap.o in Frameworks */ = {isa = PBXBuildFile; fileRef = 1E69E6B7142EED56004E4D93 /* mmap.o */; };
1E69E6DF142EED56004E4D93 /* murmur_hash.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E69E6B8142EED56004E4D93 /* murmur_hash.cc */; };
1E69E6E0142EED56004E4D93 /* murmur_hash.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E6B9142EED56004E4D93 /* murmur_hash.hh */; };
1E69E6E1142EED56004E4D93 /* murmur_hash.o in Frameworks */ = {isa = PBXBuildFile; fileRef = 1E69E6BA142EED56004E4D93 /* murmur_hash.o */; };
1E69E6E2142EED56004E4D93 /* probing_hash_table.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E6BB142EED56004E4D93 /* probing_hash_table.hh */; };
1E69E6E3142EED57004E4D93 /* probing_hash_table_test.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E69E6BD142EED56004E4D93 /* probing_hash_table_test.cc */; };
1E69E6E4142EED57004E4D93 /* proxy_iterator.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E6BE142EED56004E4D93 /* proxy_iterator.hh */; };
1E69E6E5142EED57004E4D93 /* scoped.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E6BF142EED56004E4D93 /* scoped.hh */; };
1E69E6E6142EED57004E4D93 /* scoped.o in Frameworks */ = {isa = PBXBuildFile; fileRef = 1E69E6C0142EED56004E4D93 /* scoped.o */; };
1E69E6E7142EED57004E4D93 /* sized_iterator.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E6C1142EED56004E4D93 /* sized_iterator.hh */; };
1E69E6E8142EED57004E4D93 /* sorted_uniform.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E6C2142EED56004E4D93 /* sorted_uniform.hh */; };
1E69E6E9142EED57004E4D93 /* sorted_uniform_test.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E69E6C4142EED56004E4D93 /* sorted_uniform_test.cc */; };
1E69E6EA142EED57004E4D93 /* string_piece.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E6C5142EED56004E4D93 /* string_piece.hh */; };
1E69E6EB142EED57004E4D93 /* tokenize_piece.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E6C6142EED56004E4D93 /* tokenize_piece.hh */; };
1E69E725142EEDA8004E4D93 /* bhiksha.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E69E6EE142EEDA8004E4D93 /* bhiksha.cc */; };
1E69E726142EEDA8004E4D93 /* bhiksha.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E6EF142EEDA8004E4D93 /* bhiksha.hh */; };
1E69E727142EEDA8004E4D93 /* binary_format.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E69E6F0142EEDA8004E4D93 /* binary_format.cc */; };
1E69E728142EEDA8004E4D93 /* binary_format.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E6F1142EEDA8004E4D93 /* binary_format.hh */; };
1E69E729142EEDA8004E4D93 /* binary_format.o in Frameworks */ = {isa = PBXBuildFile; fileRef = 1E69E6F2142EEDA8004E4D93 /* binary_format.o */; };
1E69E72A142EEDA8004E4D93 /* blank.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E6F3142EEDA8004E4D93 /* blank.hh */; };
1E69E72B142EEDA8004E4D93 /* build_binary.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E69E6F4142EEDA8004E4D93 /* build_binary.cc */; };
1E69E72C142EEDA8004E4D93 /* config.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E69E6F5142EEDA8004E4D93 /* config.cc */; };
1E69E72D142EEDA8004E4D93 /* config.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E6F6142EEDA8004E4D93 /* config.hh */; };
1E69E72E142EEDA8004E4D93 /* config.o in Frameworks */ = {isa = PBXBuildFile; fileRef = 1E69E6F7142EEDA8004E4D93 /* config.o */; };
1E69E72F142EEDA8004E4D93 /* enumerate_vocab.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E6F8142EEDA8004E4D93 /* enumerate_vocab.hh */; };
1E69E730142EEDA8004E4D93 /* facade.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E6F9142EEDA8004E4D93 /* facade.hh */; };
1E69E731142EEDA8004E4D93 /* left.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E6FA142EEDA8004E4D93 /* left.hh */; };
1E69E732142EEDA8004E4D93 /* left_test.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E69E6FB142EEDA8004E4D93 /* left_test.cc */; };
1E69E733142EEDA8004E4D93 /* lm_exception.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E69E6FC142EEDA8004E4D93 /* lm_exception.cc */; };
1E69E734142EEDA8004E4D93 /* lm_exception.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E6FD142EEDA8004E4D93 /* lm_exception.hh */; };
1E69E735142EEDA8004E4D93 /* lm_exception.o in Frameworks */ = {isa = PBXBuildFile; fileRef = 1E69E6FE142EEDA8004E4D93 /* lm_exception.o */; };
1E69E736142EEDA8004E4D93 /* max_order.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E6FF142EEDA8004E4D93 /* max_order.hh */; };
1E69E737142EEDA8004E4D93 /* model.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E69E700142EEDA8004E4D93 /* model.cc */; };
1E69E738142EEDA8004E4D93 /* model.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E701142EEDA8004E4D93 /* model.hh */; };
1E69E739142EEDA8004E4D93 /* model.o in Frameworks */ = {isa = PBXBuildFile; fileRef = 1E69E702142EEDA8004E4D93 /* model.o */; };
1E69E73A142EEDA8004E4D93 /* model_test.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E69E704142EEDA8004E4D93 /* model_test.cc */; };
1E69E73B142EEDA8004E4D93 /* model_type.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E705142EEDA8004E4D93 /* model_type.hh */; };
1E69E73C142EEDA8004E4D93 /* ngram_query.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E69E706142EEDA8004E4D93 /* ngram_query.cc */; };
1E69E73D142EEDA8004E4D93 /* quantize.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E69E707142EEDA8004E4D93 /* quantize.cc */; };
1E69E73E142EEDA8004E4D93 /* quantize.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E708142EEDA8004E4D93 /* quantize.hh */; };
1E69E73F142EEDA8004E4D93 /* read_arpa.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E69E709142EEDA8004E4D93 /* read_arpa.cc */; };
1E69E740142EEDA8004E4D93 /* read_arpa.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E70A142EEDA8004E4D93 /* read_arpa.hh */; };
1E69E741142EEDA8004E4D93 /* read_arpa.o in Frameworks */ = {isa = PBXBuildFile; fileRef = 1E69E70B142EEDA8004E4D93 /* read_arpa.o */; };
1E69E742142EEDA8004E4D93 /* return.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E70C142EEDA8004E4D93 /* return.hh */; };
1E69E743142EEDA8004E4D93 /* search_hashed.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E69E70D142EEDA8004E4D93 /* search_hashed.cc */; };
1E69E744142EEDA8004E4D93 /* search_hashed.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E70E142EEDA8004E4D93 /* search_hashed.hh */; };
1E69E745142EEDA8004E4D93 /* search_hashed.o in Frameworks */ = {isa = PBXBuildFile; fileRef = 1E69E70F142EEDA8004E4D93 /* search_hashed.o */; };
1E69E746142EEDA8004E4D93 /* search_trie.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E69E710142EEDA8004E4D93 /* search_trie.cc */; };
1E69E747142EEDA8004E4D93 /* search_trie.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E711142EEDA8004E4D93 /* search_trie.hh */; };
1E69E748142EEDA8004E4D93 /* search_trie.o in Frameworks */ = {isa = PBXBuildFile; fileRef = 1E69E712142EEDA8004E4D93 /* search_trie.o */; };
1E69E749142EEDA8004E4D93 /* sri.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E69E713142EEDA8004E4D93 /* sri.cc */; };
1E69E74A142EEDA8004E4D93 /* sri.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E714142EEDA8004E4D93 /* sri.hh */; };
1E69E74B142EEDA8004E4D93 /* sri_test.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E69E715142EEDA8004E4D93 /* sri_test.cc */; };
1E69E74C142EEDA8004E4D93 /* trie.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E69E718142EEDA8004E4D93 /* trie.cc */; };
1E69E74D142EEDA8004E4D93 /* trie.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E719142EEDA8004E4D93 /* trie.hh */; };
1E69E74E142EEDA8004E4D93 /* trie.o in Frameworks */ = {isa = PBXBuildFile; fileRef = 1E69E71A142EEDA8004E4D93 /* trie.o */; };
1E69E74F142EEDA8004E4D93 /* trie_sort.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E69E71B142EEDA8004E4D93 /* trie_sort.cc */; };
1E69E750142EEDA8004E4D93 /* trie_sort.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E71C142EEDA8004E4D93 /* trie_sort.hh */; };
1E69E751142EEDA8004E4D93 /* virtual_interface.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E69E71D142EEDA8004E4D93 /* virtual_interface.cc */; };
1E69E752142EEDA8004E4D93 /* virtual_interface.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E71E142EEDA8004E4D93 /* virtual_interface.hh */; };
1E69E753142EEDA8004E4D93 /* virtual_interface.o in Frameworks */ = {isa = PBXBuildFile; fileRef = 1E69E71F142EEDA8004E4D93 /* virtual_interface.o */; };
1E69E754142EEDA8004E4D93 /* vocab.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1E69E720142EEDA8004E4D93 /* vocab.cc */; };
1E69E755142EEDA8004E4D93 /* vocab.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E721142EEDA8004E4D93 /* vocab.hh */; };
1E69E756142EEDA8004E4D93 /* vocab.o in Frameworks */ = {isa = PBXBuildFile; fileRef = 1E69E722142EEDA8004E4D93 /* vocab.o */; };
1E69E757142EEDA8004E4D93 /* weights.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E723142EEDA8004E4D93 /* weights.hh */; };
1E69E758142EEDA8004E4D93 /* word_index.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1E69E724142EEDA8004E4D93 /* word_index.hh */; };
/* End PBXBuildFile section */
/* Begin PBXFileReference section */
1E2B85C112555DB1000770D6 /* lm_exception.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = lm_exception.cc; path = lm/lm_exception.cc; sourceTree = "<group>"; };
1E2B85C212555DB1000770D6 /* lm_exception.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = lm_exception.hh; path = lm/lm_exception.hh; sourceTree = "<group>"; };
1E37EBC512496AB400C1C73A /* virtual_interface.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = virtual_interface.cc; path = lm/virtual_interface.cc; sourceTree = "<group>"; };
1E37EBC612496AB400C1C73A /* virtual_interface.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = virtual_interface.hh; path = lm/virtual_interface.hh; sourceTree = "<group>"; };
1E46B59D13BA5BE10084F898 /* blank.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = blank.hh; path = lm/blank.hh; sourceTree = "<group>"; };
1E46B5A013BA5C050084F898 /* quantize.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = quantize.cc; path = lm/quantize.cc; sourceTree = "<group>"; };
1E46B5A113BA5C050084F898 /* quantize.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = quantize.hh; path = lm/quantize.hh; sourceTree = "<group>"; };
1E8A94F41288BD570022C4EB /* build_binary.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = build_binary.cc; path = lm/build_binary.cc; sourceTree = "<group>"; };
1E8A94F51288BD570022C4EB /* config.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = config.cc; path = lm/config.cc; sourceTree = "<group>"; };
1E8A94F61288BD570022C4EB /* config.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = config.hh; path = lm/config.hh; sourceTree = "<group>"; };
1E8A94F71288BD570022C4EB /* model_test.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = model_test.cc; path = lm/model_test.cc; sourceTree = "<group>"; };
1E8A94F81288BD570022C4EB /* model.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = model.cc; path = lm/model.cc; sourceTree = "<group>"; };
1E8A94F91288BD570022C4EB /* model.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = model.hh; path = lm/model.hh; sourceTree = "<group>"; };
1E8A94FA1288BD570022C4EB /* search_hashed.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = search_hashed.cc; path = lm/search_hashed.cc; sourceTree = "<group>"; };
1E8A94FB1288BD570022C4EB /* search_hashed.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = search_hashed.hh; path = lm/search_hashed.hh; sourceTree = "<group>"; };
1E8A94FC1288BD570022C4EB /* search_trie.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = search_trie.cc; path = lm/search_trie.cc; sourceTree = "<group>"; };
1E8A94FD1288BD570022C4EB /* search_trie.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = search_trie.hh; path = lm/search_trie.hh; sourceTree = "<group>"; };
1E8BF7871278A434009F10C1 /* binary_format.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = binary_format.cc; path = lm/binary_format.cc; sourceTree = "<group>"; };
1E8BF7881278A434009F10C1 /* binary_format.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = binary_format.hh; path = lm/binary_format.hh; sourceTree = "<group>"; };
1E8BF7891278A434009F10C1 /* enumerate_vocab.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = enumerate_vocab.hh; path = lm/enumerate_vocab.hh; sourceTree = "<group>"; };
1E8BF7951278A443009F10C1 /* trie.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = trie.cc; path = lm/trie.cc; sourceTree = "<group>"; };
1E8BF7961278A443009F10C1 /* trie.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = trie.hh; path = lm/trie.hh; sourceTree = "<group>"; };
1E8BF7D41278A600009F10C1 /* bit_packing.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = bit_packing.cc; path = util/bit_packing.cc; sourceTree = "<group>"; };
1E91441213D065490005055B /* bhiksha.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = bhiksha.cc; path = lm/bhiksha.cc; sourceTree = "<group>"; };
1E91441313D065490005055B /* bhiksha.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = bhiksha.hh; path = lm/bhiksha.hh; sourceTree = "<group>"; };
1EBB16BF126C158600AE6102 /* ersatz_progress.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = ersatz_progress.cc; path = util/ersatz_progress.cc; sourceTree = "<group>"; };
1EBB16C0126C158600AE6102 /* ersatz_progress.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = ersatz_progress.hh; path = util/ersatz_progress.hh; sourceTree = "<group>"; };
1EBB16C1126C158600AE6102 /* exception.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = exception.cc; path = util/exception.cc; sourceTree = "<group>"; };
1EBB16C2126C158600AE6102 /* exception.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = exception.hh; path = util/exception.hh; sourceTree = "<group>"; };
1EBB16C4126C158600AE6102 /* file_piece.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = file_piece.cc; path = util/file_piece.cc; sourceTree = "<group>"; };
1EBB16C5126C158600AE6102 /* file_piece.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = file_piece.hh; path = util/file_piece.hh; sourceTree = "<group>"; };
1EBB16C6126C158600AE6102 /* joint_sort_test.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = joint_sort_test.cc; path = util/joint_sort_test.cc; sourceTree = "<group>"; };
1EBB16C7126C158600AE6102 /* joint_sort.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = joint_sort.hh; path = util/joint_sort.hh; sourceTree = "<group>"; };
1EBB16C8126C158600AE6102 /* key_value_packing_test.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = key_value_packing_test.cc; path = util/key_value_packing_test.cc; sourceTree = "<group>"; };
1EBB16C9126C158600AE6102 /* key_value_packing.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = key_value_packing.hh; path = util/key_value_packing.hh; sourceTree = "<group>"; };
1EBB16CA126C158600AE6102 /* mmap.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = mmap.cc; path = util/mmap.cc; sourceTree = "<group>"; };
1EBB16CB126C158600AE6102 /* mmap.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = mmap.hh; path = util/mmap.hh; sourceTree = "<group>"; };
1EBB16CC126C158600AE6102 /* murmur_hash.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = murmur_hash.cc; path = util/murmur_hash.cc; sourceTree = "<group>"; };
1EBB16CD126C158600AE6102 /* murmur_hash.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = murmur_hash.hh; path = util/murmur_hash.hh; sourceTree = "<group>"; };
1EBB16CE126C158600AE6102 /* probing_hash_table_test.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = probing_hash_table_test.cc; path = util/probing_hash_table_test.cc; sourceTree = "<group>"; };
1EBB16CF126C158600AE6102 /* probing_hash_table.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = probing_hash_table.hh; path = util/probing_hash_table.hh; sourceTree = "<group>"; };
1EBB16D0126C158600AE6102 /* proxy_iterator.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = proxy_iterator.hh; path = util/proxy_iterator.hh; sourceTree = "<group>"; };
1EBB16D1126C158600AE6102 /* scoped.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = scoped.cc; path = util/scoped.cc; sourceTree = "<group>"; };
1EBB16D2126C158600AE6102 /* scoped.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = scoped.hh; path = util/scoped.hh; sourceTree = "<group>"; };
1EBB16D3126C158600AE6102 /* sorted_uniform_test.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = sorted_uniform_test.cc; path = util/sorted_uniform_test.cc; sourceTree = "<group>"; };
1EBB16D4126C158600AE6102 /* sorted_uniform.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = sorted_uniform.hh; path = util/sorted_uniform.hh; sourceTree = "<group>"; };
1EBB16D6126C158600AE6102 /* string_piece.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = string_piece.hh; path = util/string_piece.hh; sourceTree = "<group>"; };
1EBB1708126C15C500AE6102 /* facade.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = facade.hh; path = lm/facade.hh; sourceTree = "<group>"; };
1EBB170B126C15C500AE6102 /* ngram_query.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = ngram_query.cc; path = lm/ngram_query.cc; sourceTree = "<group>"; };
1EBB170D126C15C500AE6102 /* read_arpa.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = read_arpa.cc; path = lm/read_arpa.cc; sourceTree = "<group>"; };
1EBB170E126C15C500AE6102 /* read_arpa.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = read_arpa.hh; path = lm/read_arpa.hh; sourceTree = "<group>"; };
1EBB170F126C15C500AE6102 /* sri_test.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = sri_test.cc; path = lm/sri_test.cc; sourceTree = "<group>"; };
1EBB1710126C15C500AE6102 /* sri.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = sri.cc; path = lm/sri.cc; sourceTree = "<group>"; };
1EBB1711126C15C500AE6102 /* sri.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = sri.hh; path = lm/sri.hh; sourceTree = "<group>"; };
1EBB1712126C15C500AE6102 /* test.arpa */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; name = test.arpa; path = lm/test.arpa; sourceTree = "<group>"; };
1EBB1713126C15C500AE6102 /* vocab.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = vocab.cc; path = lm/vocab.cc; sourceTree = "<group>"; };
1EBB1714126C15C500AE6102 /* vocab.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = vocab.hh; path = lm/vocab.hh; sourceTree = "<group>"; };
1EBB1715126C15C500AE6102 /* weights.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = weights.hh; path = lm/weights.hh; sourceTree = "<group>"; };
1EBB1716126C15C500AE6102 /* word_index.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = word_index.hh; path = lm/word_index.hh; sourceTree = "<group>"; };
1ED9988612783457006BBB6C /* file_piece_test.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = file_piece_test.cc; path = util/file_piece_test.cc; sourceTree = "<group>"; };
1E69E69B142EED56004E4D93 /* bit_packing.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bit_packing.cc; sourceTree = "<group>"; };
1E69E69C142EED56004E4D93 /* bit_packing.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = bit_packing.hh; sourceTree = "<group>"; };
1E69E69D142EED56004E4D93 /* bit_packing.o */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.objfile"; path = bit_packing.o; sourceTree = "<group>"; };
1E69E69E142EED56004E4D93 /* bit_packing_test */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.executable"; path = bit_packing_test; sourceTree = "<group>"; };
1E69E69F142EED56004E4D93 /* bit_packing_test.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bit_packing_test.cc; sourceTree = "<group>"; };
1E69E6A0142EED56004E4D93 /* ersatz_progress.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ersatz_progress.cc; sourceTree = "<group>"; };
1E69E6A1142EED56004E4D93 /* ersatz_progress.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ersatz_progress.hh; sourceTree = "<group>"; };
1E69E6A2142EED56004E4D93 /* ersatz_progress.o */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.objfile"; path = ersatz_progress.o; sourceTree = "<group>"; };
1E69E6A3142EED56004E4D93 /* exception.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = exception.cc; sourceTree = "<group>"; };
1E69E6A4142EED56004E4D93 /* exception.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = exception.hh; sourceTree = "<group>"; };
1E69E6A5142EED56004E4D93 /* exception.o */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.objfile"; path = exception.o; sourceTree = "<group>"; };
1E69E6A6142EED56004E4D93 /* file.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = file.cc; sourceTree = "<group>"; };
1E69E6A7142EED56004E4D93 /* file.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = file.hh; sourceTree = "<group>"; };
1E69E6A8142EED56004E4D93 /* file_piece.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = file_piece.cc; sourceTree = "<group>"; };
1E69E6A9142EED56004E4D93 /* file_piece.cc.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; path = file_piece.cc.gz; sourceTree = "<group>"; };
1E69E6AA142EED56004E4D93 /* file_piece.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = file_piece.hh; sourceTree = "<group>"; };
1E69E6AB142EED56004E4D93 /* file_piece.o */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.objfile"; path = file_piece.o; sourceTree = "<group>"; };
1E69E6AC142EED56004E4D93 /* file_piece_test */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.executable"; path = file_piece_test; sourceTree = "<group>"; };
1E69E6AD142EED56004E4D93 /* file_piece_test.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = file_piece_test.cc; sourceTree = "<group>"; };
1E69E6AE142EED56004E4D93 /* have.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = have.hh; sourceTree = "<group>"; };
1E69E6AF142EED56004E4D93 /* joint_sort.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = joint_sort.hh; sourceTree = "<group>"; };
1E69E6B0142EED56004E4D93 /* joint_sort_test */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.executable"; path = joint_sort_test; sourceTree = "<group>"; };
1E69E6B1142EED56004E4D93 /* joint_sort_test.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = joint_sort_test.cc; sourceTree = "<group>"; };
1E69E6B2142EED56004E4D93 /* key_value_packing.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = key_value_packing.hh; sourceTree = "<group>"; };
1E69E6B3142EED56004E4D93 /* key_value_packing_test */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.executable"; path = key_value_packing_test; sourceTree = "<group>"; };
1E69E6B4142EED56004E4D93 /* key_value_packing_test.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = key_value_packing_test.cc; sourceTree = "<group>"; };
1E69E6B5142EED56004E4D93 /* mmap.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mmap.cc; sourceTree = "<group>"; };
1E69E6B6142EED56004E4D93 /* mmap.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = mmap.hh; sourceTree = "<group>"; };
1E69E6B7142EED56004E4D93 /* mmap.o */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.objfile"; path = mmap.o; sourceTree = "<group>"; };
1E69E6B8142EED56004E4D93 /* murmur_hash.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = murmur_hash.cc; sourceTree = "<group>"; };
1E69E6B9142EED56004E4D93 /* murmur_hash.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = murmur_hash.hh; sourceTree = "<group>"; };
1E69E6BA142EED56004E4D93 /* murmur_hash.o */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.objfile"; path = murmur_hash.o; sourceTree = "<group>"; };
1E69E6BB142EED56004E4D93 /* probing_hash_table.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = probing_hash_table.hh; sourceTree = "<group>"; };
1E69E6BC142EED56004E4D93 /* probing_hash_table_test */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.executable"; path = probing_hash_table_test; sourceTree = "<group>"; };
1E69E6BD142EED56004E4D93 /* probing_hash_table_test.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = probing_hash_table_test.cc; sourceTree = "<group>"; };
1E69E6BE142EED56004E4D93 /* proxy_iterator.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = proxy_iterator.hh; sourceTree = "<group>"; };
1E69E6BF142EED56004E4D93 /* scoped.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = scoped.hh; sourceTree = "<group>"; };
1E69E6C0142EED56004E4D93 /* scoped.o */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.objfile"; path = scoped.o; sourceTree = "<group>"; };
1E69E6C1142EED56004E4D93 /* sized_iterator.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = sized_iterator.hh; sourceTree = "<group>"; };
1E69E6C2142EED56004E4D93 /* sorted_uniform.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = sorted_uniform.hh; sourceTree = "<group>"; };
1E69E6C3142EED56004E4D93 /* sorted_uniform_test */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.executable"; path = sorted_uniform_test; sourceTree = "<group>"; };
1E69E6C4142EED56004E4D93 /* sorted_uniform_test.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sorted_uniform_test.cc; sourceTree = "<group>"; };
1E69E6C5142EED56004E4D93 /* string_piece.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = string_piece.hh; sourceTree = "<group>"; };
1E69E6C6142EED56004E4D93 /* tokenize_piece.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = tokenize_piece.hh; sourceTree = "<group>"; };
1E69E6EE142EEDA8004E4D93 /* bhiksha.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bhiksha.cc; sourceTree = "<group>"; };
1E69E6EF142EEDA8004E4D93 /* bhiksha.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = bhiksha.hh; sourceTree = "<group>"; };
1E69E6F0142EEDA8004E4D93 /* binary_format.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = binary_format.cc; sourceTree = "<group>"; };
1E69E6F1142EEDA8004E4D93 /* binary_format.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = binary_format.hh; sourceTree = "<group>"; };
1E69E6F2142EEDA8004E4D93 /* binary_format.o */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.objfile"; path = binary_format.o; sourceTree = "<group>"; };
1E69E6F3142EEDA8004E4D93 /* blank.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = blank.hh; sourceTree = "<group>"; };
1E69E6F4142EEDA8004E4D93 /* build_binary.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = build_binary.cc; sourceTree = "<group>"; };
1E69E6F5142EEDA8004E4D93 /* config.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = config.cc; sourceTree = "<group>"; };
1E69E6F6142EEDA8004E4D93 /* config.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = config.hh; sourceTree = "<group>"; };
1E69E6F7142EEDA8004E4D93 /* config.o */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.objfile"; path = config.o; sourceTree = "<group>"; };
1E69E6F8142EEDA8004E4D93 /* enumerate_vocab.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = enumerate_vocab.hh; sourceTree = "<group>"; };
1E69E6F9142EEDA8004E4D93 /* facade.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = facade.hh; sourceTree = "<group>"; };
1E69E6FA142EEDA8004E4D93 /* left.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = left.hh; sourceTree = "<group>"; };
1E69E6FB142EEDA8004E4D93 /* left_test.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = left_test.cc; sourceTree = "<group>"; };
1E69E6FC142EEDA8004E4D93 /* lm_exception.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = lm_exception.cc; sourceTree = "<group>"; };
1E69E6FD142EEDA8004E4D93 /* lm_exception.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = lm_exception.hh; sourceTree = "<group>"; };
1E69E6FE142EEDA8004E4D93 /* lm_exception.o */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.objfile"; path = lm_exception.o; sourceTree = "<group>"; };
1E69E6FF142EEDA8004E4D93 /* max_order.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = max_order.hh; sourceTree = "<group>"; };
1E69E700142EEDA8004E4D93 /* model.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = model.cc; sourceTree = "<group>"; };
1E69E701142EEDA8004E4D93 /* model.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = model.hh; sourceTree = "<group>"; };
1E69E702142EEDA8004E4D93 /* model.o */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.objfile"; path = model.o; sourceTree = "<group>"; };
1E69E703142EEDA8004E4D93 /* model_test */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.executable"; path = model_test; sourceTree = "<group>"; };
1E69E704142EEDA8004E4D93 /* model_test.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = model_test.cc; sourceTree = "<group>"; };
1E69E705142EEDA8004E4D93 /* model_type.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = model_type.hh; sourceTree = "<group>"; };
1E69E706142EEDA8004E4D93 /* ngram_query.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ngram_query.cc; sourceTree = "<group>"; };
1E69E707142EEDA8004E4D93 /* quantize.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = quantize.cc; sourceTree = "<group>"; };
1E69E708142EEDA8004E4D93 /* quantize.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = quantize.hh; sourceTree = "<group>"; };
1E69E709142EEDA8004E4D93 /* read_arpa.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = read_arpa.cc; sourceTree = "<group>"; };
1E69E70A142EEDA8004E4D93 /* read_arpa.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = read_arpa.hh; sourceTree = "<group>"; };
1E69E70B142EEDA8004E4D93 /* read_arpa.o */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.objfile"; path = read_arpa.o; sourceTree = "<group>"; };
1E69E70C142EEDA8004E4D93 /* return.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = return.hh; sourceTree = "<group>"; };
1E69E70D142EEDA8004E4D93 /* search_hashed.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = search_hashed.cc; sourceTree = "<group>"; };
1E69E70E142EEDA8004E4D93 /* search_hashed.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = search_hashed.hh; sourceTree = "<group>"; };
1E69E70F142EEDA8004E4D93 /* search_hashed.o */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.objfile"; path = search_hashed.o; sourceTree = "<group>"; };
1E69E710142EEDA8004E4D93 /* search_trie.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = search_trie.cc; sourceTree = "<group>"; };
1E69E711142EEDA8004E4D93 /* search_trie.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = search_trie.hh; sourceTree = "<group>"; };
1E69E712142EEDA8004E4D93 /* search_trie.o */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.objfile"; path = search_trie.o; sourceTree = "<group>"; };
1E69E713142EEDA8004E4D93 /* sri.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sri.cc; sourceTree = "<group>"; };
1E69E714142EEDA8004E4D93 /* sri.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = sri.hh; sourceTree = "<group>"; };
1E69E715142EEDA8004E4D93 /* sri_test.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sri_test.cc; sourceTree = "<group>"; };
1E69E716142EEDA8004E4D93 /* test.arpa */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = test.arpa; sourceTree = "<group>"; };
1E69E717142EEDA8004E4D93 /* test_nounk.arpa */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = test_nounk.arpa; sourceTree = "<group>"; };
1E69E718142EEDA8004E4D93 /* trie.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = trie.cc; sourceTree = "<group>"; };
1E69E719142EEDA8004E4D93 /* trie.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = trie.hh; sourceTree = "<group>"; };
1E69E71A142EEDA8004E4D93 /* trie.o */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.objfile"; path = trie.o; sourceTree = "<group>"; };
1E69E71B142EEDA8004E4D93 /* trie_sort.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = trie_sort.cc; sourceTree = "<group>"; };
1E69E71C142EEDA8004E4D93 /* trie_sort.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = trie_sort.hh; sourceTree = "<group>"; };
1E69E71D142EEDA8004E4D93 /* virtual_interface.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = virtual_interface.cc; sourceTree = "<group>"; };
1E69E71E142EEDA8004E4D93 /* virtual_interface.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = virtual_interface.hh; sourceTree = "<group>"; };
1E69E71F142EEDA8004E4D93 /* virtual_interface.o */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.objfile"; path = virtual_interface.o; sourceTree = "<group>"; };
1E69E720142EEDA8004E4D93 /* vocab.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = vocab.cc; sourceTree = "<group>"; };
1E69E721142EEDA8004E4D93 /* vocab.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = vocab.hh; sourceTree = "<group>"; };
1E69E722142EEDA8004E4D93 /* vocab.o */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.objfile"; path = vocab.o; sourceTree = "<group>"; };
1E69E723142EEDA8004E4D93 /* weights.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = weights.hh; sourceTree = "<group>"; };
1E69E724142EEDA8004E4D93 /* word_index.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = word_index.hh; sourceTree = "<group>"; };
D2AAC046055464E500DB518D /* libkenlm.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libkenlm.a; sourceTree = BUILT_PRODUCTS_DIR; };
/* End PBXFileReference section */
@@ -137,6 +206,23 @@
isa = PBXFrameworksBuildPhase;
buildActionMask = 2147483647;
files = (
1E69E6C9142EED56004E4D93 /* bit_packing.o in Frameworks */,
1E69E6CD142EED56004E4D93 /* ersatz_progress.o in Frameworks */,
1E69E6D0142EED56004E4D93 /* exception.o in Frameworks */,
1E69E6D5142EED56004E4D93 /* file_piece.o in Frameworks */,
1E69E6DE142EED56004E4D93 /* mmap.o in Frameworks */,
1E69E6E1142EED56004E4D93 /* murmur_hash.o in Frameworks */,
1E69E6E6142EED57004E4D93 /* scoped.o in Frameworks */,
1E69E729142EEDA8004E4D93 /* binary_format.o in Frameworks */,
1E69E72E142EEDA8004E4D93 /* config.o in Frameworks */,
1E69E735142EEDA8004E4D93 /* lm_exception.o in Frameworks */,
1E69E739142EEDA8004E4D93 /* model.o in Frameworks */,
1E69E741142EEDA8004E4D93 /* read_arpa.o in Frameworks */,
1E69E745142EEDA8004E4D93 /* search_hashed.o in Frameworks */,
1E69E748142EEDA8004E4D93 /* search_trie.o in Frameworks */,
1E69E74E142EEDA8004E4D93 /* trie.o in Frameworks */,
1E69E753142EEDA8004E4D93 /* virtual_interface.o in Frameworks */,
1E69E756142EEDA8004E4D93 /* vocab.o in Frameworks */,
);
runOnlyForDeploymentPostprocessing = 0;
};
@@ -156,66 +242,8 @@
08FB7795FE84155DC02AAC07 /* Source */ = {
isa = PBXGroup;
children = (
1E91441213D065490005055B /* bhiksha.cc */,
1E91441313D065490005055B /* bhiksha.hh */,
1E46B5A013BA5C050084F898 /* quantize.cc */,
1E46B5A113BA5C050084F898 /* quantize.hh */,
1E46B59D13BA5BE10084F898 /* blank.hh */,
1E8A94F41288BD570022C4EB /* build_binary.cc */,
1E8A94F51288BD570022C4EB /* config.cc */,
1E8A94F61288BD570022C4EB /* config.hh */,
1E8A94F71288BD570022C4EB /* model_test.cc */,
1E8A94F81288BD570022C4EB /* model.cc */,
1E8A94F91288BD570022C4EB /* model.hh */,
1E8A94FA1288BD570022C4EB /* search_hashed.cc */,
1E8A94FB1288BD570022C4EB /* search_hashed.hh */,
1E8A94FC1288BD570022C4EB /* search_trie.cc */,
1E8A94FD1288BD570022C4EB /* search_trie.hh */,
1E8BF7D41278A600009F10C1 /* bit_packing.cc */,
1E8BF7951278A443009F10C1 /* trie.cc */,
1E8BF7961278A443009F10C1 /* trie.hh */,
1E8BF7871278A434009F10C1 /* binary_format.cc */,
1E8BF7881278A434009F10C1 /* binary_format.hh */,
1E8BF7891278A434009F10C1 /* enumerate_vocab.hh */,
1ED9988612783457006BBB6C /* file_piece_test.cc */,
1EBB1708126C15C500AE6102 /* facade.hh */,
1EBB170B126C15C500AE6102 /* ngram_query.cc */,
1EBB170D126C15C500AE6102 /* read_arpa.cc */,
1EBB170E126C15C500AE6102 /* read_arpa.hh */,
1EBB170F126C15C500AE6102 /* sri_test.cc */,
1EBB1710126C15C500AE6102 /* sri.cc */,
1EBB1711126C15C500AE6102 /* sri.hh */,
1EBB1712126C15C500AE6102 /* test.arpa */,
1EBB1713126C15C500AE6102 /* vocab.cc */,
1EBB1714126C15C500AE6102 /* vocab.hh */,
1EBB1715126C15C500AE6102 /* weights.hh */,
1EBB1716126C15C500AE6102 /* word_index.hh */,
1EBB16BF126C158600AE6102 /* ersatz_progress.cc */,
1EBB16C0126C158600AE6102 /* ersatz_progress.hh */,
1EBB16C1126C158600AE6102 /* exception.cc */,
1EBB16C2126C158600AE6102 /* exception.hh */,
1EBB16C4126C158600AE6102 /* file_piece.cc */,
1EBB16C5126C158600AE6102 /* file_piece.hh */,
1EBB16C6126C158600AE6102 /* joint_sort_test.cc */,
1EBB16C7126C158600AE6102 /* joint_sort.hh */,
1EBB16C8126C158600AE6102 /* key_value_packing_test.cc */,
1EBB16C9126C158600AE6102 /* key_value_packing.hh */,
1EBB16CA126C158600AE6102 /* mmap.cc */,
1EBB16CB126C158600AE6102 /* mmap.hh */,
1EBB16CC126C158600AE6102 /* murmur_hash.cc */,
1EBB16CD126C158600AE6102 /* murmur_hash.hh */,
1EBB16CE126C158600AE6102 /* probing_hash_table_test.cc */,
1EBB16CF126C158600AE6102 /* probing_hash_table.hh */,
1EBB16D0126C158600AE6102 /* proxy_iterator.hh */,
1EBB16D1126C158600AE6102 /* scoped.cc */,
1EBB16D2126C158600AE6102 /* scoped.hh */,
1EBB16D3126C158600AE6102 /* sorted_uniform_test.cc */,
1EBB16D4126C158600AE6102 /* sorted_uniform.hh */,
1EBB16D6126C158600AE6102 /* string_piece.hh */,
1E2B85C112555DB1000770D6 /* lm_exception.cc */,
1E2B85C212555DB1000770D6 /* lm_exception.hh */,
1E37EBC512496AB400C1C73A /* virtual_interface.cc */,
1E37EBC612496AB400C1C73A /* virtual_interface.hh */,
1E69E6ED142EEDA8004E4D93 /* lm */,
1E69E69A142EED56004E4D93 /* util */,
);
name = Source;
sourceTree = "<group>";
@ -228,6 +256,119 @@
name = Products;
sourceTree = "<group>";
};
1E69E69A142EED56004E4D93 /* util */ = {
isa = PBXGroup;
children = (
1E69E69B142EED56004E4D93 /* bit_packing.cc */,
1E69E69C142EED56004E4D93 /* bit_packing.hh */,
1E69E69D142EED56004E4D93 /* bit_packing.o */,
1E69E69E142EED56004E4D93 /* bit_packing_test */,
1E69E69F142EED56004E4D93 /* bit_packing_test.cc */,
1E69E6A0142EED56004E4D93 /* ersatz_progress.cc */,
1E69E6A1142EED56004E4D93 /* ersatz_progress.hh */,
1E69E6A2142EED56004E4D93 /* ersatz_progress.o */,
1E69E6A3142EED56004E4D93 /* exception.cc */,
1E69E6A4142EED56004E4D93 /* exception.hh */,
1E69E6A5142EED56004E4D93 /* exception.o */,
1E69E6A6142EED56004E4D93 /* file.cc */,
1E69E6A7142EED56004E4D93 /* file.hh */,
1E69E6A8142EED56004E4D93 /* file_piece.cc */,
1E69E6A9142EED56004E4D93 /* file_piece.cc.gz */,
1E69E6AA142EED56004E4D93 /* file_piece.hh */,
1E69E6AB142EED56004E4D93 /* file_piece.o */,
1E69E6AC142EED56004E4D93 /* file_piece_test */,
1E69E6AD142EED56004E4D93 /* file_piece_test.cc */,
1E69E6AE142EED56004E4D93 /* have.hh */,
1E69E6AF142EED56004E4D93 /* joint_sort.hh */,
1E69E6B0142EED56004E4D93 /* joint_sort_test */,
1E69E6B1142EED56004E4D93 /* joint_sort_test.cc */,
1E69E6B2142EED56004E4D93 /* key_value_packing.hh */,
1E69E6B3142EED56004E4D93 /* key_value_packing_test */,
1E69E6B4142EED56004E4D93 /* key_value_packing_test.cc */,
1E69E6B5142EED56004E4D93 /* mmap.cc */,
1E69E6B6142EED56004E4D93 /* mmap.hh */,
1E69E6B7142EED56004E4D93 /* mmap.o */,
1E69E6B8142EED56004E4D93 /* murmur_hash.cc */,
1E69E6B9142EED56004E4D93 /* murmur_hash.hh */,
1E69E6BA142EED56004E4D93 /* murmur_hash.o */,
1E69E6BB142EED56004E4D93 /* probing_hash_table.hh */,
1E69E6BC142EED56004E4D93 /* probing_hash_table_test */,
1E69E6BD142EED56004E4D93 /* probing_hash_table_test.cc */,
1E69E6BE142EED56004E4D93 /* proxy_iterator.hh */,
1E69E6BF142EED56004E4D93 /* scoped.hh */,
1E69E6C0142EED56004E4D93 /* scoped.o */,
1E69E6C1142EED56004E4D93 /* sized_iterator.hh */,
1E69E6C2142EED56004E4D93 /* sorted_uniform.hh */,
1E69E6C3142EED56004E4D93 /* sorted_uniform_test */,
1E69E6C4142EED56004E4D93 /* sorted_uniform_test.cc */,
1E69E6C5142EED56004E4D93 /* string_piece.hh */,
1E69E6C6142EED56004E4D93 /* tokenize_piece.hh */,
);
path = util;
sourceTree = "<group>";
};
1E69E6ED142EEDA8004E4D93 /* lm */ = {
isa = PBXGroup;
children = (
1E69E6EE142EEDA8004E4D93 /* bhiksha.cc */,
1E69E6EF142EEDA8004E4D93 /* bhiksha.hh */,
1E69E6F0142EEDA8004E4D93 /* binary_format.cc */,
1E69E6F1142EEDA8004E4D93 /* binary_format.hh */,
1E69E6F2142EEDA8004E4D93 /* binary_format.o */,
1E69E6F3142EEDA8004E4D93 /* blank.hh */,
1E69E6F4142EEDA8004E4D93 /* build_binary.cc */,
1E69E6F5142EEDA8004E4D93 /* config.cc */,
1E69E6F6142EEDA8004E4D93 /* config.hh */,
1E69E6F7142EEDA8004E4D93 /* config.o */,
1E69E6F8142EEDA8004E4D93 /* enumerate_vocab.hh */,
1E69E6F9142EEDA8004E4D93 /* facade.hh */,
1E69E6FA142EEDA8004E4D93 /* left.hh */,
1E69E6FB142EEDA8004E4D93 /* left_test.cc */,
1E69E6FC142EEDA8004E4D93 /* lm_exception.cc */,
1E69E6FD142EEDA8004E4D93 /* lm_exception.hh */,
1E69E6FE142EEDA8004E4D93 /* lm_exception.o */,
1E69E6FF142EEDA8004E4D93 /* max_order.hh */,
1E69E700142EEDA8004E4D93 /* model.cc */,
1E69E701142EEDA8004E4D93 /* model.hh */,
1E69E702142EEDA8004E4D93 /* model.o */,
1E69E703142EEDA8004E4D93 /* model_test */,
1E69E704142EEDA8004E4D93 /* model_test.cc */,
1E69E705142EEDA8004E4D93 /* model_type.hh */,
1E69E706142EEDA8004E4D93 /* ngram_query.cc */,
1E69E707142EEDA8004E4D93 /* quantize.cc */,
1E69E708142EEDA8004E4D93 /* quantize.hh */,
1E69E709142EEDA8004E4D93 /* read_arpa.cc */,
1E69E70A142EEDA8004E4D93 /* read_arpa.hh */,
1E69E70B142EEDA8004E4D93 /* read_arpa.o */,
1E69E70C142EEDA8004E4D93 /* return.hh */,
1E69E70D142EEDA8004E4D93 /* search_hashed.cc */,
1E69E70E142EEDA8004E4D93 /* search_hashed.hh */,
1E69E70F142EEDA8004E4D93 /* search_hashed.o */,
1E69E710142EEDA8004E4D93 /* search_trie.cc */,
1E69E711142EEDA8004E4D93 /* search_trie.hh */,
1E69E712142EEDA8004E4D93 /* search_trie.o */,
1E69E713142EEDA8004E4D93 /* sri.cc */,
1E69E714142EEDA8004E4D93 /* sri.hh */,
1E69E715142EEDA8004E4D93 /* sri_test.cc */,
1E69E716142EEDA8004E4D93 /* test.arpa */,
1E69E717142EEDA8004E4D93 /* test_nounk.arpa */,
1E69E718142EEDA8004E4D93 /* trie.cc */,
1E69E719142EEDA8004E4D93 /* trie.hh */,
1E69E71A142EEDA8004E4D93 /* trie.o */,
1E69E71B142EEDA8004E4D93 /* trie_sort.cc */,
1E69E71C142EEDA8004E4D93 /* trie_sort.hh */,
1E69E71D142EEDA8004E4D93 /* virtual_interface.cc */,
1E69E71E142EEDA8004E4D93 /* virtual_interface.hh */,
1E69E71F142EEDA8004E4D93 /* virtual_interface.o */,
1E69E720142EEDA8004E4D93 /* vocab.cc */,
1E69E721142EEDA8004E4D93 /* vocab.hh */,
1E69E722142EEDA8004E4D93 /* vocab.o */,
1E69E723142EEDA8004E4D93 /* weights.hh */,
1E69E724142EEDA8004E4D93 /* word_index.hh */,
);
path = lm;
sourceTree = "<group>";
};
C6A0FF2B0290797F04C91782 /* Documentation */ = {
isa = PBXGroup;
children = (
@ -242,36 +383,46 @@
isa = PBXHeadersBuildPhase;
buildActionMask = 2147483647;
files = (
1E37EBC812496AB400C1C73A /* virtual_interface.hh in Headers */,
1E2B85C512555DB1000770D6 /* lm_exception.hh in Headers */,
1EBB16D8126C158600AE6102 /* ersatz_progress.hh in Headers */,
1EBB16DA126C158600AE6102 /* exception.hh in Headers */,
1EBB16DD126C158600AE6102 /* file_piece.hh in Headers */,
1EBB16DF126C158600AE6102 /* joint_sort.hh in Headers */,
1EBB16E1126C158600AE6102 /* key_value_packing.hh in Headers */,
1EBB16E3126C158600AE6102 /* mmap.hh in Headers */,
1EBB16E5126C158600AE6102 /* murmur_hash.hh in Headers */,
1EBB16E7126C158600AE6102 /* probing_hash_table.hh in Headers */,
1EBB16E8126C158600AE6102 /* proxy_iterator.hh in Headers */,
1EBB16EA126C158600AE6102 /* scoped.hh in Headers */,
1EBB16EC126C158600AE6102 /* sorted_uniform.hh in Headers */,
1EBB16EE126C158600AE6102 /* string_piece.hh in Headers */,
1EBB1717126C15C500AE6102 /* facade.hh in Headers */,
1EBB171D126C15C500AE6102 /* read_arpa.hh in Headers */,
1EBB1720126C15C500AE6102 /* sri.hh in Headers */,
1EBB1722126C15C500AE6102 /* vocab.hh in Headers */,
1EBB1723126C15C500AE6102 /* weights.hh in Headers */,
1EBB1724126C15C500AE6102 /* word_index.hh in Headers */,
1E8BF78B1278A434009F10C1 /* binary_format.hh in Headers */,
1E8BF78C1278A434009F10C1 /* enumerate_vocab.hh in Headers */,
1E8BF79E1278A443009F10C1 /* trie.hh in Headers */,
1E8A95001288BD570022C4EB /* config.hh in Headers */,
1E8A95031288BD570022C4EB /* model.hh in Headers */,
1E8A95051288BD570022C4EB /* search_hashed.hh in Headers */,
1E8A95071288BD570022C4EB /* search_trie.hh in Headers */,
1E46B59E13BA5BE10084F898 /* blank.hh in Headers */,
1E46B5A313BA5C050084F898 /* quantize.hh in Headers */,
1E91441513D065490005055B /* bhiksha.hh in Headers */,
1E69E6C8142EED56004E4D93 /* bit_packing.hh in Headers */,
1E69E6CC142EED56004E4D93 /* ersatz_progress.hh in Headers */,
1E69E6CF142EED56004E4D93 /* exception.hh in Headers */,
1E69E6D2142EED56004E4D93 /* file.hh in Headers */,
1E69E6D4142EED56004E4D93 /* file_piece.hh in Headers */,
1E69E6D7142EED56004E4D93 /* have.hh in Headers */,
1E69E6D8142EED56004E4D93 /* joint_sort.hh in Headers */,
1E69E6DA142EED56004E4D93 /* key_value_packing.hh in Headers */,
1E69E6DD142EED56004E4D93 /* mmap.hh in Headers */,
1E69E6E0142EED56004E4D93 /* murmur_hash.hh in Headers */,
1E69E6E2142EED56004E4D93 /* probing_hash_table.hh in Headers */,
1E69E6E4142EED57004E4D93 /* proxy_iterator.hh in Headers */,
1E69E6E5142EED57004E4D93 /* scoped.hh in Headers */,
1E69E6E7142EED57004E4D93 /* sized_iterator.hh in Headers */,
1E69E6E8142EED57004E4D93 /* sorted_uniform.hh in Headers */,
1E69E6EA142EED57004E4D93 /* string_piece.hh in Headers */,
1E69E6EB142EED57004E4D93 /* tokenize_piece.hh in Headers */,
1E69E726142EEDA8004E4D93 /* bhiksha.hh in Headers */,
1E69E728142EEDA8004E4D93 /* binary_format.hh in Headers */,
1E69E72A142EEDA8004E4D93 /* blank.hh in Headers */,
1E69E72D142EEDA8004E4D93 /* config.hh in Headers */,
1E69E72F142EEDA8004E4D93 /* enumerate_vocab.hh in Headers */,
1E69E730142EEDA8004E4D93 /* facade.hh in Headers */,
1E69E731142EEDA8004E4D93 /* left.hh in Headers */,
1E69E734142EEDA8004E4D93 /* lm_exception.hh in Headers */,
1E69E736142EEDA8004E4D93 /* max_order.hh in Headers */,
1E69E738142EEDA8004E4D93 /* model.hh in Headers */,
1E69E73B142EEDA8004E4D93 /* model_type.hh in Headers */,
1E69E73E142EEDA8004E4D93 /* quantize.hh in Headers */,
1E69E740142EEDA8004E4D93 /* read_arpa.hh in Headers */,
1E69E742142EEDA8004E4D93 /* return.hh in Headers */,
1E69E744142EEDA8004E4D93 /* search_hashed.hh in Headers */,
1E69E747142EEDA8004E4D93 /* search_trie.hh in Headers */,
1E69E74A142EEDA8004E4D93 /* sri.hh in Headers */,
1E69E74D142EEDA8004E4D93 /* trie.hh in Headers */,
1E69E750142EEDA8004E4D93 /* trie_sort.hh in Headers */,
1E69E752142EEDA8004E4D93 /* virtual_interface.hh in Headers */,
1E69E755142EEDA8004E4D93 /* vocab.hh in Headers */,
1E69E757142EEDA8004E4D93 /* weights.hh in Headers */,
1E69E758142EEDA8004E4D93 /* word_index.hh in Headers */,
);
runOnlyForDeploymentPostprocessing = 0;
};
@ -324,35 +475,38 @@
isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647;
files = (
1E37EBC712496AB400C1C73A /* virtual_interface.cc in Sources */,
1E2B85C412555DB1000770D6 /* lm_exception.cc in Sources */,
1EBB16D7126C158600AE6102 /* ersatz_progress.cc in Sources */,
1EBB16D9126C158600AE6102 /* exception.cc in Sources */,
1EBB16DC126C158600AE6102 /* file_piece.cc in Sources */,
1EBB16DE126C158600AE6102 /* joint_sort_test.cc in Sources */,
1EBB16E0126C158600AE6102 /* key_value_packing_test.cc in Sources */,
1EBB16E2126C158600AE6102 /* mmap.cc in Sources */,
1EBB16E4126C158600AE6102 /* murmur_hash.cc in Sources */,
1EBB16E6126C158600AE6102 /* probing_hash_table_test.cc in Sources */,
1EBB16E9126C158600AE6102 /* scoped.cc in Sources */,
1EBB16EB126C158600AE6102 /* sorted_uniform_test.cc in Sources */,
1EBB171A126C15C500AE6102 /* ngram_query.cc in Sources */,
1EBB171C126C15C500AE6102 /* read_arpa.cc in Sources */,
1EBB171E126C15C500AE6102 /* sri_test.cc in Sources */,
1EBB171F126C15C500AE6102 /* sri.cc in Sources */,
1EBB1721126C15C500AE6102 /* vocab.cc in Sources */,
1ED9988712783457006BBB6C /* file_piece_test.cc in Sources */,
1E8BF78A1278A434009F10C1 /* binary_format.cc in Sources */,
1E8BF79D1278A443009F10C1 /* trie.cc in Sources */,
1E8BF7D51278A600009F10C1 /* bit_packing.cc in Sources */,
1E8A94FE1288BD570022C4EB /* build_binary.cc in Sources */,
1E8A94FF1288BD570022C4EB /* config.cc in Sources */,
1E8A95011288BD570022C4EB /* model_test.cc in Sources */,
1E8A95021288BD570022C4EB /* model.cc in Sources */,
1E8A95041288BD570022C4EB /* search_hashed.cc in Sources */,
1E8A95061288BD570022C4EB /* search_trie.cc in Sources */,
1E46B5A213BA5C050084F898 /* quantize.cc in Sources */,
1E91441413D065490005055B /* bhiksha.cc in Sources */,
1E69E6C7142EED56004E4D93 /* bit_packing.cc in Sources */,
1E69E6CA142EED56004E4D93 /* bit_packing_test.cc in Sources */,
1E69E6CB142EED56004E4D93 /* ersatz_progress.cc in Sources */,
1E69E6CE142EED56004E4D93 /* exception.cc in Sources */,
1E69E6D1142EED56004E4D93 /* file.cc in Sources */,
1E69E6D3142EED56004E4D93 /* file_piece.cc in Sources */,
1E69E6D6142EED56004E4D93 /* file_piece_test.cc in Sources */,
1E69E6D9142EED56004E4D93 /* joint_sort_test.cc in Sources */,
1E69E6DB142EED56004E4D93 /* key_value_packing_test.cc in Sources */,
1E69E6DC142EED56004E4D93 /* mmap.cc in Sources */,
1E69E6DF142EED56004E4D93 /* murmur_hash.cc in Sources */,
1E69E6E3142EED57004E4D93 /* probing_hash_table_test.cc in Sources */,
1E69E6E9142EED57004E4D93 /* sorted_uniform_test.cc in Sources */,
1E69E725142EEDA8004E4D93 /* bhiksha.cc in Sources */,
1E69E727142EEDA8004E4D93 /* binary_format.cc in Sources */,
1E69E72B142EEDA8004E4D93 /* build_binary.cc in Sources */,
1E69E72C142EEDA8004E4D93 /* config.cc in Sources */,
1E69E732142EEDA8004E4D93 /* left_test.cc in Sources */,
1E69E733142EEDA8004E4D93 /* lm_exception.cc in Sources */,
1E69E737142EEDA8004E4D93 /* model.cc in Sources */,
1E69E73A142EEDA8004E4D93 /* model_test.cc in Sources */,
1E69E73C142EEDA8004E4D93 /* ngram_query.cc in Sources */,
1E69E73D142EEDA8004E4D93 /* quantize.cc in Sources */,
1E69E73F142EEDA8004E4D93 /* read_arpa.cc in Sources */,
1E69E743142EEDA8004E4D93 /* search_hashed.cc in Sources */,
1E69E746142EEDA8004E4D93 /* search_trie.cc in Sources */,
1E69E749142EEDA8004E4D93 /* sri.cc in Sources */,
1E69E74B142EEDA8004E4D93 /* sri_test.cc in Sources */,
1E69E74C142EEDA8004E4D93 /* trie.cc in Sources */,
1E69E74F142EEDA8004E4D93 /* trie_sort.cc in Sources */,
1E69E751142EEDA8004E4D93 /* virtual_interface.cc in Sources */,
1E69E754142EEDA8004E4D93 /* vocab.cc in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
@ -372,6 +526,12 @@
_LARGE_FILES,
"_FILE_OFFSET_BITS=64",
);
HEADER_SEARCH_PATHS = (
/opt/local/include/,
/Users/hieuhoang/workspace/sourceforge/trunk/kenlm,
/usr/local/include,
../srilm/include,
);
INSTALL_PATH = /usr/local/lib;
PRODUCT_NAME = kenlm;
};
@ -387,6 +547,12 @@
_LARGE_FILES,
"_FILE_OFFSET_BITS=64",
);
HEADER_SEARCH_PATHS = (
/opt/local/include/,
/Users/hieuhoang/workspace/sourceforge/trunk/kenlm,
/usr/local/include,
../srilm/include,
);
INSTALL_PATH = /usr/local/lib;
PRODUCT_NAME = kenlm;
};

View File

@ -11,8 +11,9 @@
*/
#include <inttypes.h>
#include <assert.h>
#include "lm/binary_format.hh"
#include "lm/model_type.hh"
#include "lm/trie.hh"
#include "util/bit_packing.hh"
#include "util/sorted_uniform.hh"
@ -78,6 +79,7 @@ class ArrayBhiksha {
util::ReadInt57(base, bit_offset, next_inline_.bits, next_inline_.mask);
out.end = ((end_it - offset_begin_) << next_inline_.bits) |
util::ReadInt57(base, bit_offset + total_bits, next_inline_.bits, next_inline_.mask);
//assert(out.end >= out.begin);
}
void WriteNext(void *base, uint64_t bit_offset, uint64_t index, uint64_t value) {

View File

@ -19,10 +19,10 @@ namespace lm {
namespace ngram {
namespace {
const char kMagicBeforeVersion[] = "mmap lm http://kheafield.com/code format version";
const char kMagicBytes[] = "mmap lm http://kheafield.com/code format version 4\n\0";
const char kMagicBytes[] = "mmap lm http://kheafield.com/code format version 5\n\0";
// This must be shorter than kMagicBytes and indicates an incomplete binary file (i.e. build failed).
const char kMagicIncomplete[] = "mmap lm http://kheafield.com/code incomplete\n";
const long int kMagicVersion = 4;
const long int kMagicVersion = 5;
// Test values.
struct Sanity {
@ -42,12 +42,6 @@ struct Sanity {
const char *kModelNames[6] = {"hashed n-grams with probing", "hashed n-grams with sorted uniform find", "trie", "trie with quantization", "trie with array-compressed pointers", "trie with quantization and array-compressed pointers"};
std::size_t Align8(std::size_t in) {
std::size_t off = in % 8;
if (!off) return in;
return in + 8 - off;
}
std::size_t TotalHeaderSize(unsigned char order) {
return Align8(sizeof(Sanity) + sizeof(FixedWidthParameters) + sizeof(uint64_t) * order);
}
@ -119,7 +113,7 @@ uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t
}
}
void FinishFile(const Config &config, ModelType model_type, const std::vector<uint64_t> &counts, Backing &backing) {
void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, Backing &backing) {
if (config.write_mmap) {
if (msync(backing.search.get(), backing.search.size(), MS_SYNC) || msync(backing.vocab.get(), backing.vocab.size(), MS_SYNC))
UTIL_THROW(util::ErrnoException, "msync failed for " << config.write_mmap);
@ -130,6 +124,7 @@ void FinishFile(const Config &config, ModelType model_type, const std::vector<ui
params.fixed.probing_multiplier = config.probing_multiplier;
params.fixed.model_type = model_type;
params.fixed.has_vocabulary = config.include_vocab;
params.fixed.search_version = search_version;
WriteHeader(backing.vocab.get(), params);
}
}
@ -174,12 +169,13 @@ void ReadHeader(int fd, Parameters &out) {
ReadLoop(fd, &*out.counts.begin(), sizeof(uint64_t) * out.fixed.order);
}
void MatchCheck(ModelType model_type, const Parameters &params) {
void MatchCheck(ModelType model_type, unsigned int search_version, const Parameters &params) {
if (params.fixed.model_type != model_type) {
if (static_cast<unsigned int>(params.fixed.model_type) >= (sizeof(kModelNames) / sizeof(const char *)))
UTIL_THROW(FormatLoadException, "The binary file claims to be model type " << static_cast<unsigned int>(params.fixed.model_type) << " but this is not implemented for in this inference code.");
UTIL_THROW(FormatLoadException, "The binary file was built for " << kModelNames[params.fixed.model_type] << " but the inference code is trying to load " << kModelNames[model_type]);
}
UTIL_THROW_IF(search_version != params.fixed.search_version, FormatLoadException, "The binary file has " << kModelNames[params.fixed.model_type] << " version " << params.fixed.search_version << " but this code expects " << kModelNames[params.fixed.model_type] << " version " << search_version);
}
void SeekPastHeader(int fd, const Parameters &params) {

View File

@ -2,6 +2,7 @@
#define LM_BINARY_FORMAT__
#include "lm/config.hh"
#include "lm/model_type.hh"
#include "lm/read_arpa.hh"
#include "util/file_piece.hh"
@ -16,13 +17,6 @@
namespace lm {
namespace ngram {
/* Not the best numbering system, but it grew this way for historical reasons
* and I want to preserve existing binary files. */
typedef enum {HASH_PROBING=0, HASH_SORTED=1, TRIE_SORTED=2, QUANT_TRIE_SORTED=3, ARRAY_TRIE_SORTED=4, QUANT_ARRAY_TRIE_SORTED=5} ModelType;
const static ModelType kQuantAdd = static_cast<ModelType>(QUANT_TRIE_SORTED - TRIE_SORTED);
const static ModelType kArrayAdd = static_cast<ModelType>(ARRAY_TRIE_SORTED - TRIE_SORTED);
/* Inspect a file to determine if it is a binary lm. If not, return false.
* If so, return true and set recognized to the type. This is the only API in
* this header designed for use by decoder authors.
@ -36,8 +30,14 @@ struct FixedWidthParameters {
ModelType model_type;
// Does the end of the file have the actual strings in the vocabulary?
bool has_vocabulary;
unsigned int search_version;
};
inline std::size_t Align8(std::size_t in) {
std::size_t off = in % 8;
return off ? (in + 8 - off) : in;
}
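// Worked values of the rounding, for clarity: Align8(0) == 0,
// Align8(13) == 16, Align8(16) == 16; sizes are padded up to the next
// multiple of 8.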
// Parameters stored in the header of a binary file.
struct Parameters {
FixedWidthParameters fixed;
@ -64,7 +64,7 @@ uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t
// Write header to binary file. This is done last to prevent incomplete files
// from loading.
void FinishFile(const Config &config, ModelType model_type, const std::vector<uint64_t> &counts, Backing &backing);
void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, Backing &backing);
namespace detail {
@ -72,7 +72,7 @@ bool IsBinaryFormat(int fd);
void ReadHeader(int fd, Parameters &params);
void MatchCheck(ModelType model_type, const Parameters &params);
void MatchCheck(ModelType model_type, unsigned int search_version, const Parameters &params);
void SeekPastHeader(int fd, const Parameters &params);
@ -90,7 +90,7 @@ template <class To> void LoadLM(const char *file, const Config &config, To &to)
if (detail::IsBinaryFormat(backing.file.get())) {
Parameters params;
detail::ReadHeader(backing.file.get(), params);
detail::MatchCheck(To::kModelType, params);
detail::MatchCheck(To::kModelType, To::kVersion, params);
// Replace the run-time configured probing_multiplier with the one in the file.
Config new_config(config);
new_config.probing_multiplier = params.fixed.probing_multiplier;

View File

@ -38,20 +38,6 @@ inline bool HasExtension(const float &backoff) {
return compare.i != interpret.i;
}
/* Suppose "foo bar baz quux" appears in the ARPA but not "bar baz quux" or
* "baz quux" (because they were pruned). 1.2% of n-grams generated by SRI
* with default settings on the benchmark data set are like this. Since search
* proceeds by finding "quux", "baz quux", "bar baz quux", and finally
* "foo bar baz quux" and the trie needs pointer nodes anyway, blanks are
* inserted. The blanks have probability kBlankProb and backoff kBlankBackoff.
* A blank is recognized by kBlankProb in the probability field; kBlankBackoff
* must be 0 so that inference assesses zero backoff from these blanks.
*/
const float kBlankProb = -std::numeric_limits<float>::infinity();
const float kBlankBackoff = kNoExtensionBackoff;
const uint32_t kBlankProbQuant = 0;
const uint32_t kBlankBackoffQuant = 0;
} // namespace ngram
} // namespace lm
#endif // LM_BLANK__

247
kenlm/lm/left.hh Normal file
View File

@ -0,0 +1,247 @@
/* Efficient left and right language model state for sentence fragments.
* Intended usage:
* Store ChartState with every chart entry.
* To do a rule application:
* 1. Make a ChartState object for your new entry.
* 2. Construct RuleScore.
* 3. Going from left to right, call Terminal or NonTerminal.
* For terminals, just pass the vocab id.
* For non-terminals, pass that non-terminal's ChartState.
* If your decoder expects scores inclusive of subtree scores (i.e. you
* label entries with the highest-scoring path), pass the non-terminal's
* score as prob.
* If your decoder expects relative scores and will walk the chart later,
* pass prob = 0.0.
* In other words, the only effect of prob is that it gets added to the
* returned log probability.
* 4. Call Finish. It returns the log probability.
*
* There are a couple more details:
* Do not pass <s> to Terminal as it is formally not a word in the sentence,
* only context. Instead, call BeginSentence. If called, it should be the
* first call after RuleScore is constructed (since <s> is always the
* leftmost).
*
* If the leftmost RHS is a non-terminal, it's faster to call BeginNonTerminal.
*
* Hashing and sorting comparison operators are provided. All state objects
* are POD. If you intend to use memcmp on raw state objects, you must call
* ZeroRemaining first, as the value of array entries beyond length is
* otherwise undefined.
*
* Usage is of course not limited to chart decoding. Anything that generates
* sentence fragments missing left context could benefit. For example, a
* phrase-based decoder could pre-score phrases, storing ChartState with each
* phrase, even if hypotheses are generated left-to-right.
*/
#ifndef LM_LEFT__
#define LM_LEFT__
#include "lm/max_order.hh"
#include "lm/model.hh"
#include "lm/return.hh"
#include "util/murmur_hash.hh"
#include <algorithm>
namespace lm {
namespace ngram {
struct Left {
bool operator==(const Left &other) const {
return
(length == other.length) &&
pointers[length - 1] == other.pointers[length - 1];
}
int Compare(const Left &other) const {
if (length != other.length) return length < other.length ? -1 : 1;
if (pointers[length - 1] > other.pointers[length - 1]) return 1;
if (pointers[length - 1] < other.pointers[length - 1]) return -1;
return 0;
}
bool operator<(const Left &other) const {
if (length != other.length) return length < other.length;
return pointers[length - 1] < other.pointers[length - 1];
}
void ZeroRemaining() {
for (uint64_t * i = pointers + length; i < pointers + kMaxOrder - 1; ++i)
*i = 0;
}
unsigned char length;
uint64_t pointers[kMaxOrder - 1];
};
inline size_t hash_value(const Left &left) {
return util::MurmurHashNative(&left.length, 1, left.pointers[left.length - 1]);
}
struct ChartState {
bool operator==(const ChartState &other) const {
return (left == other.left) && (right == other.right) && (full == other.full);
}
int Compare(const ChartState &other) const {
int lres = left.Compare(other.left);
if (lres) return lres;
int rres = right.Compare(other.right);
if (rres) return rres;
return (int)full - (int)other.full;
}
bool operator<(const ChartState &other) const {
return Compare(other) == -1;
}
void ZeroRemaining() {
left.ZeroRemaining();
right.ZeroRemaining();
}
Left left;
bool full;
State right;
};
inline size_t hash_value(const ChartState &state) {
size_t hashes[2];
hashes[0] = hash_value(state.left);
hashes[1] = hash_value(state.right);
return util::MurmurHashNative(hashes, sizeof(size_t), state.full);
}
template <class M> class RuleScore {
public:
explicit RuleScore(const M &model, ChartState &out) : model_(model), out_(out), left_done_(false), left_write_(out.left.pointers), prob_(0.0) {
out.left.length = 0;
out.right.length = 0;
}
void BeginSentence() {
out_.right = model_.BeginSentenceState();
// out_.left is empty.
left_done_ = true;
}
void Terminal(WordIndex word) {
State copy(out_.right);
ProcessRet(model_.FullScore(copy, word, out_.right));
if (out_.right.length != copy.length + 1) left_done_ = true;
}
// Faster version of NonTerminal for the case where the rule begins with a non-terminal.
void BeginNonTerminal(const ChartState &in, float prob) {
prob_ = prob;
out_ = in;
left_write_ = out_.left.pointers + out_.left.length;
left_done_ = in.full;
}
void NonTerminal(const ChartState &in, float prob) {
prob_ += prob;
if (!in.left.length) {
if (in.full) {
for (const float *i = out_.right.backoff; i < out_.right.backoff + out_.right.length; ++i) prob_ += *i;
left_done_ = true;
out_.right = in.right;
}
return;
}
if (!out_.right.length) {
out_.right = in.right;
if (left_done_) return;
if (left_write_ != out_.left.pointers) {
left_done_ = true;
} else {
out_.left = in.left;
left_write_ = out_.left.pointers + in.left.length;
left_done_ = in.full;
}
return;
}
float backoffs[kMaxOrder - 1], backoffs2[kMaxOrder - 1];
float *back = backoffs, *back2 = backoffs2;
unsigned char next_use;
FullScoreReturn ret;
ProcessRet(ret = model_.ExtendLeft(out_.right.words, out_.right.words + out_.right.length, out_.right.backoff, in.left.pointers[0], 1, back, next_use));
if (!next_use) {
left_done_ = true;
out_.right = in.right;
return;
}
unsigned char extend_length = 2;
for (const uint64_t *i = in.left.pointers + 1; i < in.left.pointers + in.left.length; ++i, ++extend_length) {
ProcessRet(ret = model_.ExtendLeft(out_.right.words, out_.right.words + next_use, back, *i, extend_length, back2, next_use));
if (!next_use) {
left_done_ = true;
out_.right = in.right;
return;
}
std::swap(back, back2);
}
if (in.full) {
for (const float *i = back; i != back + next_use; ++i) prob_ += *i;
left_done_ = true;
out_.right = in.right;
return;
}
// Right state was minimized, so it's already independent of the new words to the left.
if (in.right.length < in.left.length) {
out_.right = in.right;
return;
}
// Shift existing words down.
for (WordIndex *i = out_.right.words + next_use - 1; i >= out_.right.words; --i) {
*(i + in.right.length) = *i;
}
// Add words from in.right.
std::copy(in.right.words, in.right.words + in.right.length, out_.right.words);
// Assemble backoff composed of the existing state's backoff followed by the new state's backoff.
std::copy(in.right.backoff, in.right.backoff + in.right.length, out_.right.backoff);
std::copy(back, back + next_use, out_.right.backoff + in.right.length);
out_.right.length = in.right.length + next_use;
}
float Finish() {
out_.left.length = left_write_ - out_.left.pointers;
out_.full = left_done_;
return prob_;
}
private:
void ProcessRet(const FullScoreReturn &ret) {
prob_ += ret.prob;
if (left_done_) return;
if (ret.independent_left) {
left_done_ = true;
return;
}
*(left_write_++) = ret.extend_left;
}
const M &model_;
ChartState &out_;
bool left_done_;
uint64_t *left_write_;
float prob_;
};
} // namespace ngram
} // namespace lm
#endif // LM_LEFT__

360
kenlm/lm/left_test.cc Normal file
View File

@ -0,0 +1,360 @@
#include "lm/left.hh"
#include "lm/model.hh"
#include "util/tokenize_piece.hh"
#include <vector>
#define BOOST_TEST_MODULE LeftTest
#include <boost/test/unit_test.hpp>
#include <boost/test/floating_point_comparison.hpp>
namespace lm {
namespace ngram {
namespace {
#define Term(word) score.Terminal(m.GetVocabulary().Index(word));
#define VCheck(word, value) BOOST_CHECK_EQUAL(m.GetVocabulary().Index(word), value);
template <class M> void Short(const M &m) {
ChartState base;
{
RuleScore<M> score(m, base);
Term("more");
Term("loin");
BOOST_CHECK_CLOSE(-1.206319 - 0.3561665, score.Finish(), 0.001);
}
BOOST_CHECK(base.full);
BOOST_CHECK_EQUAL(2, base.left.length);
BOOST_CHECK_EQUAL(1, base.right.length);
VCheck("loin", base.right.words[0]);
ChartState more_left;
{
RuleScore<M> score(m, more_left);
Term("little");
score.NonTerminal(base, -1.206319 - 0.3561665);
// p(little more loin | null context)
BOOST_CHECK_CLOSE(-1.56538, score.Finish(), 0.001);
}
BOOST_CHECK_EQUAL(3, more_left.left.length);
BOOST_CHECK_EQUAL(1, more_left.right.length);
VCheck("loin", more_left.right.words[0]);
BOOST_CHECK(more_left.full);
ChartState shorter;
{
RuleScore<M> score(m, shorter);
Term("to");
score.NonTerminal(base, -1.206319 - 0.3561665);
BOOST_CHECK_CLOSE(-0.30103 - 1.687872 - 1.206319 - 0.3561665, score.Finish(), 0.01);
}
BOOST_CHECK_EQUAL(1, shorter.left.length);
BOOST_CHECK_EQUAL(1, shorter.right.length);
VCheck("loin", shorter.right.words[0]);
BOOST_CHECK(shorter.full);
}
template <class M> void Charge(const M &m) {
ChartState base;
{
RuleScore<M> score(m, base);
Term("on");
Term("more");
BOOST_CHECK_CLOSE(-1.509559 -0.4771212 -1.206319, score.Finish(), 0.001);
}
BOOST_CHECK_EQUAL(1, base.left.length);
BOOST_CHECK_EQUAL(1, base.right.length);
VCheck("more", base.right.words[0]);
BOOST_CHECK(base.full);
ChartState extend;
{
RuleScore<M> score(m, extend);
Term("looking");
score.NonTerminal(base, -1.509559 -0.4771212 -1.206319);
BOOST_CHECK_CLOSE(-3.91039, score.Finish(), 0.001);
}
BOOST_CHECK_EQUAL(2, extend.left.length);
BOOST_CHECK_EQUAL(1, extend.right.length);
VCheck("more", extend.right.words[0]);
BOOST_CHECK(extend.full);
ChartState tobos;
{
RuleScore<M> score(m, tobos);
score.BeginSentence();
score.NonTerminal(extend, -3.91039);
BOOST_CHECK_CLOSE(-3.471169, score.Finish(), 0.001);
}
BOOST_CHECK_EQUAL(0, tobos.left.length);
BOOST_CHECK_EQUAL(1, tobos.right.length);
}
template <class M> float LeftToRight(const M &m, const std::vector<WordIndex> &words) {
float ret = 0.0;
State right = m.NullContextState();
for (std::vector<WordIndex>::const_iterator i = words.begin(); i != words.end(); ++i) {
State copy(right);
ret += m.Score(copy, *i, right);
}
return ret;
}
template <class M> float RightToLeft(const M &m, const std::vector<WordIndex> &words) {
float ret = 0.0;
ChartState state;
state.left.length = 0;
state.right.length = 0;
state.full = false;
for (std::vector<WordIndex>::const_reverse_iterator i = words.rbegin(); i != words.rend(); ++i) {
ChartState copy(state);
RuleScore<M> score(m, state);
score.Terminal(*i);
score.NonTerminal(copy, ret);
ret = score.Finish();
}
return ret;
}
template <class M> float TreeMiddle(const M &m, const std::vector<WordIndex> &words) {
std::vector<std::pair<ChartState, float> > states(words.size());
for (unsigned int i = 0; i < words.size(); ++i) {
RuleScore<M> score(m, states[i].first);
score.Terminal(words[i]);
states[i].second = score.Finish();
}
while (states.size() > 1) {
std::vector<std::pair<ChartState, float> > upper((states.size() + 1) / 2);
for (unsigned int i = 0; i < states.size() / 2; ++i) {
RuleScore<M> score(m, upper[i].first);
score.NonTerminal(states[i*2].first, states[i*2].second);
score.NonTerminal(states[i*2+1].first, states[i*2+1].second);
upper[i].second = score.Finish();
}
if (states.size() % 2) {
upper.back() = states.back();
}
std::swap(states, upper);
}
return states.empty() ? 0 : states.back().second;
}
template <class M> void LookupVocab(const M &m, const StringPiece &str, std::vector<WordIndex> &out) {
out.clear();
for (util::PieceIterator<' '> i(str); i; ++i) {
out.push_back(m.GetVocabulary().Index(*i));
}
}
#define TEXT_TEST(str) \
{ \
std::vector<WordIndex> words; \
LookupVocab(m, str, words); \
float expect = LeftToRight(m, words); \
BOOST_CHECK_CLOSE(expect, RightToLeft(m, words), 0.001); \
BOOST_CHECK_CLOSE(expect, TreeMiddle(m, words), 0.001); \
}
// Build sentences, or parts thereof, from right to left.
template <class M> void GrowBig(const M &m) {
TEXT_TEST("in biarritz watching considering looking . on a little more loin also would consider higher to look good unknown the screening foo bar , unknown however unknown </s>");
TEXT_TEST("on a little more loin also would consider higher to look good unknown the screening foo bar , unknown however unknown </s>");
TEXT_TEST("on a little more loin also would consider higher to look good");
TEXT_TEST("more loin also would consider higher to look good");
TEXT_TEST("more loin also would consider higher to look");
TEXT_TEST("also would consider higher to look");
TEXT_TEST("also would consider higher");
TEXT_TEST("would consider higher to look");
TEXT_TEST("consider higher to look");
TEXT_TEST("consider higher to");
TEXT_TEST("consider higher");
}
template <class M> void AlsoWouldConsiderHigher(const M &m) {
ChartState also;
{
RuleScore<M> score(m, also);
score.Terminal(m.GetVocabulary().Index("also"));
BOOST_CHECK_CLOSE(-1.687872, score.Finish(), 0.001);
}
ChartState would;
{
RuleScore<M> score(m, would);
score.Terminal(m.GetVocabulary().Index("would"));
BOOST_CHECK_CLOSE(-1.687872, score.Finish(), 0.001);
}
ChartState combine_also_would;
{
RuleScore<M> score(m, combine_also_would);
score.NonTerminal(also, -1.687872);
score.NonTerminal(would, -1.687872);
BOOST_CHECK_CLOSE(-1.687872 - 2.0, score.Finish(), 0.001);
}
BOOST_CHECK_EQUAL(2, combine_also_would.right.length);
ChartState also_would;
{
RuleScore<M> score(m, also_would);
score.Terminal(m.GetVocabulary().Index("also"));
score.Terminal(m.GetVocabulary().Index("would"));
BOOST_CHECK_CLOSE(-1.687872 - 2.0, score.Finish(), 0.001);
}
BOOST_CHECK_EQUAL(2, also_would.right.length);
ChartState consider;
{
RuleScore<M> score(m, consider);
score.Terminal(m.GetVocabulary().Index("consider"));
BOOST_CHECK_CLOSE(-1.687872, score.Finish(), 0.001);
}
BOOST_CHECK_EQUAL(1, consider.left.length);
BOOST_CHECK_EQUAL(1, consider.right.length);
BOOST_CHECK(!consider.full);
ChartState higher;
float higher_score;
{
RuleScore<M> score(m, higher);
score.Terminal(m.GetVocabulary().Index("higher"));
higher_score = score.Finish();
}
BOOST_CHECK_CLOSE(-1.509559, higher_score, 0.001);
BOOST_CHECK_EQUAL(1, higher.left.length);
BOOST_CHECK_EQUAL(1, higher.right.length);
BOOST_CHECK(!higher.full);
VCheck("higher", higher.right.words[0]);
BOOST_CHECK_CLOSE(-0.30103, higher.right.backoff[0], 0.001);
ChartState consider_higher;
{
RuleScore<M> score(m, consider_higher);
score.NonTerminal(consider, -1.687872);
score.NonTerminal(higher, higher_score);
BOOST_CHECK_CLOSE(-1.509559 - 1.687872 - 0.30103, score.Finish(), 0.001);
}
BOOST_CHECK_EQUAL(2, consider_higher.left.length);
BOOST_CHECK(!consider_higher.full);
ChartState full;
{
RuleScore<M> score(m, full);
score.NonTerminal(combine_also_would, -1.687872 - 2.0);
score.NonTerminal(consider_higher, -1.509559 - 1.687872 - 0.30103);
BOOST_CHECK_CLOSE(-10.6879, score.Finish(), 0.001);
}
BOOST_CHECK_EQUAL(4, full.right.length);
}
template <class M> void GrowSmall(const M &m) {
TEXT_TEST("in biarritz watching considering looking . </s>");
TEXT_TEST("in biarritz watching considering looking .");
TEXT_TEST("in biarritz");
}
#define CHECK_SCORE(str, val) \
{ \
float got = val; \
std::vector<WordIndex> indices; \
LookupVocab(m, str, indices); \
BOOST_CHECK_CLOSE(LeftToRight(m, indices), got, 0.001); \
}
template <class M> void FullGrow(const M &m) {
std::vector<WordIndex> words;
LookupVocab(m, "in biarritz watching considering looking . </s>", words);
ChartState lexical[7];
float lexical_scores[7];
for (unsigned int i = 0; i < 7; ++i) {
RuleScore<M> score(m, lexical[i]);
score.Terminal(words[i]);
lexical_scores[i] = score.Finish();
}
CHECK_SCORE("in", lexical_scores[0]);
CHECK_SCORE("biarritz", lexical_scores[1]);
CHECK_SCORE("watching", lexical_scores[2]);
CHECK_SCORE("</s>", lexical_scores[6]);
ChartState l1[4];
float l1_scores[4];
{
RuleScore<M> score(m, l1[0]);
score.NonTerminal(lexical[0], lexical_scores[0]);
score.NonTerminal(lexical[1], lexical_scores[1]);
CHECK_SCORE("in biarritz", l1_scores[0] = score.Finish());
}
{
RuleScore<M> score(m, l1[1]);
score.NonTerminal(lexical[2], lexical_scores[2]);
score.NonTerminal(lexical[3], lexical_scores[3]);
CHECK_SCORE("watching considering", l1_scores[1] = score.Finish());
}
{
RuleScore<M> score(m, l1[2]);
score.NonTerminal(lexical[4], lexical_scores[4]);
score.NonTerminal(lexical[5], lexical_scores[5]);
CHECK_SCORE("looking .", l1_scores[2] = score.Finish());
}
BOOST_CHECK_EQUAL(l1[2].left.length, 1);
l1[3] = lexical[6];
l1_scores[3] = lexical_scores[6];
ChartState l2[2];
float l2_scores[2];
{
RuleScore<M> score(m, l2[0]);
score.NonTerminal(l1[0], l1_scores[0]);
score.NonTerminal(l1[1], l1_scores[1]);
CHECK_SCORE("in biarritz watching considering", l2_scores[0] = score.Finish());
}
{
RuleScore<M> score(m, l2[1]);
score.NonTerminal(l1[2], l1_scores[2]);
score.NonTerminal(l1[3], l1_scores[3]);
CHECK_SCORE("looking . </s>", l2_scores[1] = score.Finish());
}
BOOST_CHECK_EQUAL(l2[1].left.length, 1);
BOOST_CHECK(l2[1].full);
ChartState top;
{
RuleScore<M> score(m, top);
score.NonTerminal(l2[0], l2_scores[0]);
score.NonTerminal(l2[1], l2_scores[1]);
CHECK_SCORE("in biarritz watching considering looking . </s>", score.Finish());
}
}
template <class M> void Everything() {
Config config;
config.messages = NULL;
M m("test.arpa", config);
Short(m);
Charge(m);
GrowBig(m);
AlsoWouldConsiderHigher(m);
GrowSmall(m);
FullGrow(m);
}
BOOST_AUTO_TEST_CASE(ProbingAll) {
Everything<Model>();
}
BOOST_AUTO_TEST_CASE(TrieAll) {
Everything<TrieModel>();
}
BOOST_AUTO_TEST_CASE(QuantTrieAll) {
Everything<QuantTrieModel>();
}
BOOST_AUTO_TEST_CASE(ArrayQuantTrieAll) {
Everything<QuantArrayTrieModel>();
}
BOOST_AUTO_TEST_CASE(ArrayTrieAll) {
Everything<ArrayTrieModel>();
}
} // namespace
} // namespace ngram
} // namespace lm

View File

@ -14,11 +14,6 @@
namespace lm {
namespace ngram {
size_t hash_value(const State &state) {
return util::MurmurHashNative(state.history_, sizeof(WordIndex) * state.valid_length_);
}
namespace detail {
template <class Search, class VocabularyT> const ModelType GenericModel<Search, VocabularyT>::kModelType = Search::kModelType;
@ -41,11 +36,11 @@ template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::Ge
// g++ prints warnings unless these are fully initialized.
State begin_sentence = State();
begin_sentence.valid_length_ = 1;
begin_sentence.history_[0] = vocab_.BeginSentence();
begin_sentence.backoff_[0] = search_.unigram.Lookup(begin_sentence.history_[0]).backoff;
begin_sentence.length = 1;
begin_sentence.words[0] = vocab_.BeginSentence();
begin_sentence.backoff[0] = search_.unigram.Lookup(begin_sentence.words[0]).backoff;
State null_context = State();
null_context.valid_length_ = 0;
null_context.length = 0;
P::Init(begin_sentence, null_context, vocab_, search_.MiddleEnd() - search_.MiddleBegin() + 2);
}
@ -87,7 +82,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
search_.unigram.Unknown().backoff = 0.0;
search_.unigram.Unknown().prob = config.unknown_missing_logprob;
}
FinishFile(config, kModelType, counts, backing_);
FinishFile(config, kModelType, kVersion, counts, backing_);
} catch (util::Exception &e) {
e << " Byte: " << f.Offset();
throw;
@ -95,9 +90,9 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
}
template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, VocabularyT>::FullScore(const State &in_state, const WordIndex new_word, State &out_state) const {
FullScoreReturn ret = ScoreExceptBackoff(in_state.history_, in_state.history_ + in_state.valid_length_, new_word, out_state);
if (ret.ngram_length - 1 < in_state.valid_length_) {
ret.prob = std::accumulate(in_state.backoff_ + ret.ngram_length - 1, in_state.backoff_ + in_state.valid_length_, ret.prob);
FullScoreReturn ret = ScoreExceptBackoff(in_state.words, in_state.words + in_state.length, new_word, out_state);
if (ret.ngram_length - 1 < in_state.length) {
ret.prob = std::accumulate(in_state.backoff + ret.ngram_length - 1, in_state.backoff + in_state.length, ret.prob);
}
return ret;
}
@ -131,32 +126,80 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
// Generate a state from context.
context_rend = std::min(context_rend, context_rbegin + P::Order() - 1);
if (context_rend == context_rbegin) {
out_state.valid_length_ = 0;
out_state.length = 0;
return;
}
float ignored_prob;
FullScoreReturn ignored;
typename Search::Node node;
search_.LookupUnigram(*context_rbegin, ignored_prob, out_state.backoff_[0], node);
out_state.valid_length_ = HasExtension(out_state.backoff_[0]) ? 1 : 0;
float *backoff_out = out_state.backoff_ + 1;
search_.LookupUnigram(*context_rbegin, out_state.backoff[0], node, ignored);
out_state.length = HasExtension(out_state.backoff[0]) ? 1 : 0;
float *backoff_out = out_state.backoff + 1;
const typename Search::Middle *mid = search_.MiddleBegin();
for (const WordIndex *i = context_rbegin + 1; i < context_rend; ++i, ++backoff_out, ++mid) {
if (!search_.LookupMiddleNoProb(*mid, *i, *backoff_out, node)) {
std::copy(context_rbegin, context_rbegin + out_state.valid_length_, out_state.history_);
std::copy(context_rbegin, context_rbegin + out_state.length, out_state.words);
return;
}
if (HasExtension(*backoff_out)) out_state.valid_length_ = i - context_rbegin + 1;
if (HasExtension(*backoff_out)) out_state.length = i - context_rbegin + 1;
}
std::copy(context_rbegin, context_rbegin + out_state.valid_length_, out_state.history_);
std::copy(context_rbegin, context_rbegin + out_state.length, out_state.words);
}
template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, VocabularyT>::ExtendLeft(
const WordIndex *add_rbegin, const WordIndex *add_rend,
const float *backoff_in,
uint64_t extend_pointer,
unsigned char extend_length,
float *backoff_out,
unsigned char &next_use) const {
FullScoreReturn ret;
float subtract_me;
typename Search::Node node(search_.Unpack(extend_pointer, extend_length, subtract_me));
ret.prob = subtract_me;
ret.ngram_length = extend_length;
next_use = 0;
// If this function is called, then it does depend on left words.
ret.independent_left = false;
ret.extend_left = extend_pointer;
const typename Search::Middle *mid_iter = search_.MiddleBegin() + extend_length - 1;
const WordIndex *i = add_rbegin;
for (; ; ++i, ++backoff_out, ++mid_iter) {
if (i == add_rend) {
// Ran out of words.
for (const float *b = backoff_in + ret.ngram_length - extend_length; b < backoff_in + (add_rend - add_rbegin); ++b) ret.prob += *b;
ret.prob -= subtract_me;
return ret;
}
if (mid_iter == search_.MiddleEnd()) break;
if (ret.independent_left || !search_.LookupMiddle(*mid_iter, *i, *backoff_out, node, ret)) {
// Didn't match a word.
ret.independent_left = true;
for (const float *b = backoff_in + ret.ngram_length - extend_length; b < backoff_in + (add_rend - add_rbegin); ++b) ret.prob += *b;
ret.prob -= subtract_me;
return ret;
}
ret.ngram_length = mid_iter - search_.MiddleBegin() + 2;
if (HasExtension(*backoff_out)) next_use = i - add_rbegin + 1;
}
if (ret.independent_left || !search_.LookupLongest(*i, ret.prob, node)) {
// The last backoff weight, for Order() - 1.
ret.prob += backoff_in[i - add_rbegin];
} else {
ret.ngram_length = P::Order();
}
ret.independent_left = true;
ret.prob -= subtract_me;
return ret;
}
namespace {
// Do a paranoid copy of history, assuming new_word has already been copied
// (hence the -1). out_state.valid_length_ could be zero so I avoided using
// (hence the -1). out_state.length could be zero so I avoided using
// std::copy.
void CopyRemainingHistory(const WordIndex *from, State &out_state) {
WordIndex *out = out_state.history_ + 1;
const WordIndex *in_end = from + static_cast<ptrdiff_t>(out_state.valid_length_) - 1;
WordIndex *out = out_state.words + 1;
const WordIndex *in_end = from + static_cast<ptrdiff_t>(out_state.length) - 1;
for (const WordIndex *in = from; in < in_end; ++in, ++out) *out = *in;
}
} // namespace
@ -175,17 +218,17 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
// ret.ngram_length contains the last known non-blank ngram length.
ret.ngram_length = 1;
float *backoff_out(out_state.backoff);
typename Search::Node node;
float *backoff_out(out_state.backoff_);
search_.LookupUnigram(new_word, ret.prob, *backoff_out, node);
// This is the length of the context that should be used for continuation.
out_state.valid_length_ = HasExtension(*backoff_out) ? 1 : 0;
search_.LookupUnigram(new_word, *backoff_out, node, ret);
// This is the length of the context that should be used for continuation to the right.
out_state.length = HasExtension(*backoff_out) ? 1 : 0;
// We'll write the word anyway since it will probably be used and does no harm being there.
out_state.history_[0] = new_word;
out_state.words[0] = new_word;
if (context_rbegin == context_rend) return ret;
++backoff_out;
// Ok now we know that the bigram contains known words. Start by looking it up.
// Ok start by looking up the bigram.
const WordIndex *hist_iter = context_rbegin;
const typename Search::Middle *mid_iter = search_.MiddleBegin();
for (; ; ++mid_iter, ++hist_iter, ++backoff_out) {
@ -198,36 +241,28 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
if (mid_iter == search_.MiddleEnd()) break;
float revert = ret.prob;
if (!search_.LookupMiddle(*mid_iter, *hist_iter, ret.prob, *backoff_out, node)) {
if (ret.independent_left || !search_.LookupMiddle(*mid_iter, *hist_iter, *backoff_out, node, ret)) {
// Didn't find an ngram using hist_iter.
CopyRemainingHistory(context_rbegin, out_state);
// ret.prob was already set.
// ret.prob was already set.
ret.independent_left = true;
return ret;
}
if (ret.prob == kBlankProb) {
// It's a blank. Go back to the old probability.
ret.prob = revert;
} else {
ret.ngram_length = hist_iter - context_rbegin + 2;
if (HasExtension(*backoff_out)) {
out_state.valid_length_ = ret.ngram_length;
}
ret.ngram_length = hist_iter - context_rbegin + 2;
if (HasExtension(*backoff_out)) {
out_state.length = ret.ngram_length;
}
}
// It passed every lookup in search_.middle. All that's left is to check search_.longest.
if (!search_.LookupLongest(*hist_iter, ret.prob, node)) {
// Failed to find a longest n-gram. Fall back to the most recent non-blank.
CopyRemainingHistory(context_rbegin, out_state);
// ret.prob was already set.
return ret;
if (!ret.independent_left && search_.LookupLongest(*hist_iter, ret.prob, node)) {
// It's an P::Order()-gram.
// There is no blank in longest_.
ret.ngram_length = P::Order();
}
// It's an P::Order()-gram.
// This handles (N-1)-grams and N-grams.
CopyRemainingHistory(context_rbegin, out_state);
// There is no blank in longest_.
ret.ngram_length = P::Order();
ret.independent_left = true;
return ret;
}

View File

@ -12,6 +12,8 @@
#include "lm/vocab.hh"
#include "lm/weights.hh"
#include "util/murmur_hash.hh"
#include <algorithm>
#include <vector>
@ -27,42 +29,41 @@ namespace ngram {
class State {
public:
bool operator==(const State &other) const {
if (valid_length_ != other.valid_length_) return false;
const WordIndex *end = history_ + valid_length_;
for (const WordIndex *first = history_, *second = other.history_;
first != end; ++first, ++second) {
if (*first != *second) return false;
}
// If the histories are equal, so are the backoffs.
return true;
if (length != other.length) return false;
return !memcmp(words, other.words, length * sizeof(WordIndex));
}
// Three way comparison function.
int Compare(const State &other) const {
if (valid_length_ == other.valid_length_) {
return memcmp(history_, other.history_, valid_length_ * sizeof(WordIndex));
}
return (valid_length_ < other.valid_length_) ? -1 : 1;
if (length != other.length) return length < other.length ? -1 : 1;
return memcmp(words, other.words, length * sizeof(WordIndex));
}
bool operator<(const State &other) const {
if (length != other.length) return length < other.length;
return memcmp(words, other.words, length * sizeof(WordIndex)) < 0;
}
// Call this before using raw memcmp.
void ZeroRemaining() {
for (unsigned char i = valid_length_; i < kMaxOrder - 1; ++i) {
history_[i] = 0;
backoff_[i] = 0.0;
for (unsigned char i = length; i < kMaxOrder - 1; ++i) {
words[i] = 0;
backoff[i] = 0.0;
}
}
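// Sketch of the intended raw-memcmp usage (a and b are States; both must be
// zeroed first so the unused tail is well defined):
//   a.ZeroRemaining(); b.ZeroRemaining();
//   bool same = !memcmp(&a, &b, sizeof(State));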
unsigned char ValidLength() const { return valid_length_; }
unsigned char Length() const { return length; }
// You shouldn't need to touch anything below this line, but the members are public so FullState will qualify as a POD.
// This order minimizes total size of the struct if WordIndex is 64 bit, float is 32 bit, and alignment of 64 bit integers is 64 bit.
WordIndex history_[kMaxOrder - 1];
float backoff_[kMaxOrder - 1];
unsigned char valid_length_;
WordIndex words[kMaxOrder - 1];
float backoff[kMaxOrder - 1];
unsigned char length;
};
size_t hash_value(const State &state);
inline size_t hash_value(const State &state) {
return util::MurmurHashNative(state.words, sizeof(WordIndex) * state.length);
}
namespace detail {
@ -75,6 +76,8 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
// This is the model type returned by RecognizeBinary.
static const ModelType kModelType;
static const unsigned int kVersion = Search::kVersion;
/* Get the size of memory that will be mapped given ngram counts. This
* does not include small non-mapped control structures, such as this class
* itself.
@ -114,6 +117,25 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
*/
void GetState(const WordIndex *context_rbegin, const WordIndex *context_rend, State &out_state) const;
/* More efficient version of FullScore where a partial n-gram has already
* been scored.
* NOTE: THE RETURNED .prob IS RELATIVE, NOT ABSOLUTE. So for example, if
* the n-gram does not end up extending further left, then 0 is returned.
*/
FullScoreReturn ExtendLeft(
// Additional context in reverse order. This will update add_rend to
const WordIndex *add_rbegin, const WordIndex *add_rend,
// Backoff weights to use.
const float *backoff_in,
// extend_left returned by a previous query.
uint64_t extend_pointer,
// Length of n-gram that the pointer corresponds to.
unsigned char extend_length,
// Where to write additional backoffs for [extend_length + 1, min(Order() - 1, return.ngram_length)]
float *backoff_out,
// Amount of additional content that should be considered by the next call.
unsigned char &next_use) const;
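/* Illustrative call sequence, sketched after ExtendLeftTest in
 * lm/model_test.cc; "model" is a loaded instance and "little"/"a" are
 * assumed vocabulary entries:
 *   State right;
 *   FullScoreReturn little(model.FullScore(model.NullContextState(),
 *       model.GetVocabulary().Index("little"), right));
 *   const WordIndex a = model.GetVocabulary().Index("a");
 *   float backoff_in = 0.0, backoff_out[4];
 *   unsigned char next_use;
 *   FullScoreReturn extended(model.ExtendLeft(&a, &a + 1, &backoff_in,
 *       little.extend_left, 1, backoff_out, next_use));
 *   // extended.prob == log p(little | a) - log p(little): relative, not absolute.
 */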
private:
friend void LoadLM<>(const char *file, const Config &config, GenericModel<Search, VocabularyT> &to);

View File

@ -10,8 +10,8 @@ namespace lm {
namespace ngram {
std::ostream &operator<<(std::ostream &o, const State &state) {
o << "State length " << static_cast<unsigned int>(state.valid_length_) << ':';
for (const WordIndex *i = state.history_; i < state.history_ + state.valid_length_; ++i) {
o << "State length " << static_cast<unsigned int>(state.length) << ':';
for (const WordIndex *i = state.words; i < state.words + state.length; ++i) {
o << ' ' << *i;
}
return o;
@ -19,25 +19,26 @@ std::ostream &operator<<(std::ostream &o, const State &state) {
namespace {
#define StartTest(word, ngram, score) \
#define StartTest(word, ngram, score, indep_left) \
ret = model.FullScore( \
state, \
model.GetVocabulary().Index(word), \
out);\
BOOST_CHECK_CLOSE(score, ret.prob, 0.001); \
BOOST_CHECK_EQUAL(static_cast<unsigned int>(ngram), ret.ngram_length); \
BOOST_CHECK_GE(std::min<unsigned char>(ngram, 5 - 1), out.valid_length_); \
BOOST_CHECK_GE(std::min<unsigned char>(ngram, 5 - 1), out.length); \
BOOST_CHECK_EQUAL(indep_left, ret.independent_left); \
{\
WordIndex context[state.valid_length_ + 1]; \
WordIndex context[state.length + 1]; \
context[0] = model.GetVocabulary().Index(word); \
std::copy(state.history_, state.history_ + state.valid_length_, context + 1); \
std::copy(state.words, state.words + state.length, context + 1); \
State get_state; \
model.GetState(context, context + state.valid_length_ + 1, get_state); \
model.GetState(context, context + state.length + 1, get_state); \
BOOST_CHECK_EQUAL(out, get_state); \
}
#define AppendTest(word, ngram, score) \
StartTest(word, ngram, score) \
#define AppendTest(word, ngram, score, indep_left) \
StartTest(word, ngram, score, indep_left) \
state = out;
template <class M> void Starters(const M &model) {
@ -45,12 +46,12 @@ template <class M> void Starters(const M &model) {
Model::State state(model.BeginSentenceState());
Model::State out;
StartTest("looking", 2, -0.4846522);
StartTest("looking", 2, -0.4846522, true);
// , probability plus <s> backoff
StartTest(",", 1, -1.383514 + -0.4149733);
StartTest(",", 1, -1.383514 + -0.4149733, true);
// <unk> probability plus <s> backoff
StartTest("this_is_not_found", 1, -1.995635 + -0.4149733);
StartTest("this_is_not_found", 1, -1.995635 + -0.4149733, true);
}
template <class M> void Continuation(const M &model) {
@ -58,46 +59,64 @@ template <class M> void Continuation(const M &model) {
Model::State state(model.BeginSentenceState());
Model::State out;
AppendTest("looking", 2, -0.484652);
AppendTest("on", 3, -0.348837);
AppendTest("a", 4, -0.0155266);
AppendTest("little", 5, -0.00306122);
AppendTest("looking", 2, -0.484652, true);
AppendTest("on", 3, -0.348837, true);
AppendTest("a", 4, -0.0155266, true);
AppendTest("little", 5, -0.00306122, true);
State preserve = state;
AppendTest("the", 1, -4.04005);
AppendTest("biarritz", 1, -1.9889);
AppendTest("not_found", 1, -2.29666);
AppendTest("more", 1, -1.20632 - 20.0);
AppendTest(".", 2, -0.51363);
AppendTest("</s>", 3, -0.0191651);
BOOST_CHECK_EQUAL(0, state.valid_length_);
AppendTest("the", 1, -4.04005, true);
AppendTest("biarritz", 1, -1.9889, true);
AppendTest("not_found", 1, -2.29666, true);
AppendTest("more", 1, -1.20632 - 20.0, true);
AppendTest(".", 2, -0.51363, true);
AppendTest("</s>", 3, -0.0191651, true);
BOOST_CHECK_EQUAL(0, state.length);
state = preserve;
AppendTest("more", 5, -0.00181395);
BOOST_CHECK_EQUAL(4, state.valid_length_);
AppendTest("loin", 5, -0.0432557);
BOOST_CHECK_EQUAL(1, state.valid_length_);
AppendTest("more", 5, -0.00181395, true);
BOOST_CHECK_EQUAL(4, state.length);
AppendTest("loin", 5, -0.0432557, true);
BOOST_CHECK_EQUAL(1, state.length);
}
template <class M> void Blanks(const M &model) {
FullScoreReturn ret;
State state(model.NullContextState());
State out;
AppendTest("also", 1, -1.687872);
AppendTest("would", 2, -2);
AppendTest("consider", 3, -3);
AppendTest("also", 1, -1.687872, false);
AppendTest("would", 2, -2, true);
AppendTest("consider", 3, -3, true);
State preserve = state;
AppendTest("higher", 4, -4);
AppendTest("looking", 5, -5);
BOOST_CHECK_EQUAL(1, state.valid_length_);
AppendTest("higher", 4, -4, true);
AppendTest("looking", 5, -5, true);
BOOST_CHECK_EQUAL(1, state.length);
state = preserve;
AppendTest("not_found", 1, -1.995635 - 7.0 - 0.30103);
// also would consider not_found
AppendTest("not_found", 1, -1.995635 - 7.0 - 0.30103, true);
state = model.NullContextState();
// higher looking is a blank.
AppendTest("higher", 1, -1.509559);
AppendTest("looking", 1, -1.285941 - 0.30103);
AppendTest("not_found", 1, -1.995635 - 0.4771212);
AppendTest("higher", 1, -1.509559, false);
AppendTest("looking", 2, -1.285941 - 0.30103, false);
State higher_looking = state;
BOOST_CHECK_EQUAL(1, state.length);
AppendTest("not_found", 1, -1.995635 - 0.4771212, true);
state = higher_looking;
// higher looking consider
AppendTest("consider", 1, -1.687872 - 0.4771212, true);
state = model.NullContextState();
AppendTest("would", 1, -1.687872, false);
BOOST_CHECK_EQUAL(1, state.length);
AppendTest("consider", 2, -1.687872 -0.30103, false);
BOOST_CHECK_EQUAL(2, state.length);
AppendTest("higher", 3, -1.509559 - 0.30103, false);
BOOST_CHECK_EQUAL(3, state.length);
AppendTest("looking", 4, -1.285941 - 0.30103, false);
}
template <class M> void Unknowns(const M &model) {
@ -105,14 +124,14 @@ template <class M> void Unknowns(const M &model) {
State state(model.NullContextState());
State out;
AppendTest("not_found", 1, -1.995635);
AppendTest("not_found", 1, -1.995635, false);
State preserve = state;
AppendTest("not_found2", 2, -15.0);
AppendTest("not_found3", 2, -15.0 - 2.0);
AppendTest("not_found2", 2, -15.0, true);
AppendTest("not_found3", 2, -15.0 - 2.0, true);
state = preserve;
AppendTest("however", 2, -4);
AppendTest("not_found3", 3, -6);
AppendTest("however", 2, -4, true);
AppendTest("not_found3", 3, -6, true);
}
template <class M> void MinimalState(const M &model) {
@ -120,22 +139,66 @@ template <class M> void MinimalState(const M &model) {
State state(model.NullContextState());
State out;
AppendTest("baz", 1, -6.535897);
BOOST_CHECK_EQUAL(0, state.valid_length_);
AppendTest("baz", 1, -6.535897, true);
BOOST_CHECK_EQUAL(0, state.length);
state = model.NullContextState();
AppendTest("foo", 1, -3.141592);
BOOST_CHECK_EQUAL(1, state.valid_length_);
AppendTest("bar", 2, -6.0);
AppendTest("foo", 1, -3.141592, true);
BOOST_CHECK_EQUAL(1, state.length);
AppendTest("bar", 2, -6.0, true);
// Has to include the backoff weight.
BOOST_CHECK_EQUAL(1, state.valid_length_);
AppendTest("bar", 1, -2.718281 + 3.0);
BOOST_CHECK_EQUAL(1, state.valid_length_);
BOOST_CHECK_EQUAL(1, state.length);
AppendTest("bar", 1, -2.718281 + 3.0, true);
BOOST_CHECK_EQUAL(1, state.length);
state = model.NullContextState();
AppendTest("to", 1, -1.687872);
AppendTest("look", 2, -0.2922095);
BOOST_CHECK_EQUAL(2, state.valid_length_);
AppendTest("good", 3, -7);
AppendTest("to", 1, -1.687872, false);
AppendTest("look", 2, -0.2922095, true);
BOOST_CHECK_EQUAL(2, state.length);
AppendTest("good", 3, -7, true);
}
template <class M> void ExtendLeftTest(const M &model) {
State right;
FullScoreReturn little(model.FullScore(model.NullContextState(), model.GetVocabulary().Index("little"), right));
const float kLittleProb = -1.285941;
BOOST_CHECK_CLOSE(kLittleProb, little.prob, 0.001);
unsigned char next_use;
float backoff_out[4];
FullScoreReturn extend_none(model.ExtendLeft(NULL, NULL, NULL, little.extend_left, 1, NULL, next_use));
BOOST_CHECK_EQUAL(0, next_use);
BOOST_CHECK_EQUAL(little.extend_left, extend_none.extend_left);
BOOST_CHECK_CLOSE(0.0, extend_none.prob, 0.001);
BOOST_CHECK_EQUAL(1, extend_none.ngram_length);
const WordIndex a = model.GetVocabulary().Index("a");
float backoff_in = 3.14;
// a little
FullScoreReturn extend_a(model.ExtendLeft(&a, &a + 1, &backoff_in, little.extend_left, 1, backoff_out, next_use));
BOOST_CHECK_EQUAL(1, next_use);
BOOST_CHECK_CLOSE(-0.69897, backoff_out[0], 0.001);
BOOST_CHECK_CLOSE(-0.09132547 - kLittleProb, extend_a.prob, 0.001);
BOOST_CHECK_EQUAL(2, extend_a.ngram_length);
BOOST_CHECK(!extend_a.independent_left);
const WordIndex on = model.GetVocabulary().Index("on");
FullScoreReturn extend_on(model.ExtendLeft(&on, &on + 1, &backoff_in, extend_a.extend_left, 2, backoff_out, next_use));
BOOST_CHECK_EQUAL(1, next_use);
BOOST_CHECK_CLOSE(-0.4771212, backoff_out[0], 0.001);
BOOST_CHECK_CLOSE(-0.0283603 - -0.09132547, extend_on.prob, 0.001);
BOOST_CHECK_EQUAL(3, extend_on.ngram_length);
BOOST_CHECK(!extend_on.independent_left);
const WordIndex both[2] = {a, on};
float backoff_in_arr[4];
FullScoreReturn extend_both(model.ExtendLeft(both, both + 2, backoff_in_arr, little.extend_left, 1, backoff_out, next_use));
BOOST_CHECK_EQUAL(2, next_use);
BOOST_CHECK_CLOSE(-0.69897, backoff_out[0], 0.001);
BOOST_CHECK_CLOSE(-0.4771212, backoff_out[1], 0.001);
BOOST_CHECK_CLOSE(-0.0283603 - kLittleProb, extend_both.prob, 0.001);
BOOST_CHECK_EQUAL(3, extend_both.ngram_length);
BOOST_CHECK(!extend_both.independent_left);
BOOST_CHECK_EQUAL(extend_on.extend_left, extend_both.extend_left);
}
#define StatelessTest(word, provide, ngram, score) \
@ -166,17 +229,17 @@ template <class M> void Stateless(const M &model) {
// looking
StatelessTest(1, 2, 2, -0.484652);
// on
AppendTest("on", 3, -0.348837);
AppendTest("on", 3, -0.348837, true);
StatelessTest(2, 3, 3, -0.348837);
StatelessTest(2, 2, 3, -0.348837);
StatelessTest(2, 1, 2, -0.4638903);
// a
StatelessTest(3, 4, 4, -0.0155266);
// little
AppendTest("little", 5, -0.00306122);
AppendTest("little", 5, -0.00306122, true);
StatelessTest(4, 5, 5, -0.00306122);
// the
AppendTest("the", 1, -4.04005);
AppendTest("the", 1, -4.04005, true);
StatelessTest(5, 5, 1, -4.04005);
// No context of the.
StatelessTest(5, 0, 1, -1.687872);
@ -189,8 +252,8 @@ template <class M> void Stateless(const M &model) {
WordIndex unk[1];
unk[0] = 0;
model.GetState(unk, unk + 1, state);
BOOST_CHECK_EQUAL(1, state.valid_length_);
BOOST_CHECK_EQUAL(static_cast<WordIndex>(0), state.history_[0]);
BOOST_CHECK_EQUAL(1, state.length);
BOOST_CHECK_EQUAL(static_cast<WordIndex>(0), state.words[0]);
}
template <class M> void NoUnkCheck(const M &model) {
@ -207,6 +270,7 @@ template <class M> void Everything(const M &m) {
Blanks(m);
Unknowns(m);
MinimalState(m);
ExtendLeftTest(m);
Stateless(m);
}
@ -245,6 +309,7 @@ template <class ModelT> void LoadingTest() {
config.enumerate_vocab = &enumerate;
ModelT m("test.arpa", config);
enumerate.Check(m.GetVocabulary());
BOOST_CHECK_EQUAL((WordIndex)37, m.GetVocabulary().Bound());
Everything(m);
}
{
@ -252,6 +317,7 @@ template <class ModelT> void LoadingTest() {
config.enumerate_vocab = &enumerate;
ModelT m("test_nounk.arpa", config);
enumerate.Check(m.GetVocabulary());
BOOST_CHECK_EQUAL((WordIndex)37, m.GetVocabulary().Bound());
NoUnkCheck(m);
}
}
@ -315,7 +381,7 @@ template <class ModelT> void BinaryTest() {
enumerate.Check(binary.GetVocabulary());
NoUnkCheck(binary);
}
unlink("test.binary");
unlink("test_nounk.binary");
}
BOOST_AUTO_TEST_CASE(write_and_read_probing) {

kenlm/lm/model_type.hh (new file, 16 lines)

@ -0,0 +1,16 @@
#ifndef LM_MODEL_TYPE__
#define LM_MODEL_TYPE__
namespace lm {
namespace ngram {
/* Not the best numbering system, but it grew this way for historical reasons
* and I want to preserve existing binary files. */
typedef enum {HASH_PROBING=0, HASH_SORTED=1, TRIE_SORTED=2, QUANT_TRIE_SORTED=3, ARRAY_TRIE_SORTED=4, QUANT_ARRAY_TRIE_SORTED=5} ModelType;
const static ModelType kQuantAdd = static_cast<ModelType>(QUANT_TRIE_SORTED - TRIE_SORTED);
const static ModelType kArrayAdd = static_cast<ModelType>(ARRAY_TRIE_SORTED - TRIE_SORTED);
} // namespace ngram
} // namespace lm
#endif // LM_MODEL_TYPE__
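The two offset constants make the four trie variants derivable by arithmetic rather than by special cases; search_trie.hh below computes TRIE_SORTED + Quant::kModelTypeAdd + Bhiksha::kModelTypeAdd the same way. A minimal sketch of the composition, using only the enum above:

#include "lm/model_type.hh"
#include <cassert>

int main() {
  using namespace lm::ngram;
  // Quantization adds 1, the Bhiksha array trick adds 2:
  assert(static_cast<ModelType>(TRIE_SORTED + kQuantAdd) == QUANT_TRIE_SORTED);
  assert(static_cast<ModelType>(TRIE_SORTED + kArrayAdd) == ARRAY_TRIE_SORTED);
  assert(static_cast<ModelType>(TRIE_SORTED + kQuantAdd + kArrayAdd) == QUANT_ARRAY_TRIE_SORTED);
  return 0;
}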

View File

@ -1,5 +1,6 @@
#include "lm/quantize.hh"
#include "lm/binary_format.hh"
#include "lm/lm_exception.hh"
#include <algorithm>
@ -70,8 +71,7 @@ void SeparatelyQuantize::Train(uint8_t order, std::vector<float> &prob, std::vec
void SeparatelyQuantize::TrainProb(uint8_t order, std::vector<float> &prob) {
float *centers = start_ + TableStart(order);
*(centers++) = kBlankProb;
MakeBins(&*prob.begin(), &*prob.end(), centers, (1ULL << prob_bits_) - 1);
MakeBins(&*prob.begin(), &*prob.end(), centers, (1ULL << prob_bits_));
}
void SeparatelyQuantize::FinishedLoading(const Config &config) {

View File

@ -1,9 +1,9 @@
#ifndef LM_QUANTIZE_H__
#define LM_QUANTIZE_H__
#include "lm/binary_format.hh" // for ModelType
#include "lm/blank.hh"
#include "lm/config.hh"
#include "lm/model_type.hh"
#include "util/bit_packing.hh"
#include <algorithm>
@ -36,6 +36,9 @@ class DontQuantize {
prob = util::ReadNonPositiveFloat31(base, bit_offset);
backoff = util::ReadFloat32(base, bit_offset + 31);
}
void ReadProb(const void *base, uint64_t bit_offset, float &prob) const {
prob = util::ReadNonPositiveFloat31(base, bit_offset);
}
void ReadBackoff(const void *base, uint64_t bit_offset, float &backoff) const {
backoff = util::ReadFloat32(base, bit_offset + 31);
}
@ -77,7 +80,7 @@ class SeparatelyQuantize {
Bins(uint8_t bits, const float *const begin) : begin_(begin), end_(begin_ + (1ULL << bits)), bits_(bits), mask_((1ULL << bits) - 1) {}
uint64_t EncodeProb(float value) const {
return(value == kBlankProb ? kBlankProbQuant : Encode(value, 1));
return Encode(value, 0);
}
uint64_t EncodeBackoff(float value) const {
@ -132,6 +135,10 @@ class SeparatelyQuantize {
(prob_.EncodeProb(prob) << backoff_.Bits()) | backoff_.EncodeBackoff(backoff));
}
void ReadProb(const void *base, uint64_t bit_offset, float &prob) const {
prob = prob_.Decode(util::ReadInt25(base, bit_offset + backoff_.Bits(), prob_.Bits(), prob_.Mask()));
}
void Read(const void *base, uint64_t bit_offset, float &prob, float &backoff) const {
uint64_t both = util::ReadInt57(base, bit_offset, total_bits_, total_mask_);
prob = prob_.Decode(both >> backoff_.Bits());
@ -179,7 +186,7 @@ class SeparatelyQuantize {
void SetupMemory(void *start, const Config &config);
static const bool kTrain = true;
// Assumes kBlankProb is removed from prob and 0.0 is removed from backoff.
// Assumes 0.0 is removed from backoff.
void Train(uint8_t order, std::vector<float> &prob, std::vector<float> &backoff);
// Train just probabilities (for longest order).
void TrainProb(uint8_t order, std::vector<float> &prob);

kenlm/lm/return.hh (new file, 39 lines)

@ -0,0 +1,39 @@
#ifndef LM_RETURN__
#define LM_RETURN__
#include <inttypes.h>
namespace lm {
/* Structure returned by scoring routines. */
struct FullScoreReturn {
// log10 probability
float prob;
/* The length of n-gram matched. Do not use this for recombination.
* Consider a model containing only the following n-grams:
* -1 foo
* -3.14 bar
* -2.718 baz -5
* -6 foo bar
*
* If you score ``bar'' then ngram_length is 1 and recombination state is the
* empty string because bar has zero backoff and does not extend to the
* right.
* If you score ``foo'' then ngram_length is 1 and recombination state is
* ``foo''.
*
* Ideally, keep output states around and compare them. Failing that,
* get out_state.ValidLength() and use that length for recombination.
*/
unsigned char ngram_length;
/* Left extension information. If independent_left is set, then prob is
* independent of words to the left (up to additional backoff). Otherwise,
* extend_left indicates how to efficiently extend further to the left.
*/
bool independent_left;
uint64_t extend_left; // Defined only if !independent_left.
};
} // namespace lm
#endif // LM_RETURN__
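A hedged sketch of the recombination caveat above, assuming the four toy n-grams from the comment are saved to a file toy.arpa (hypothetical; any ngram::Model behaves the same way):

#include "lm/model.hh"
#include <iostream>

int main() {
  using namespace lm::ngram;
  Model model("toy.arpa");  // assumed to hold the n-grams listed in the comment
  State null_ctx(model.NullContextState()), out;
  // "bar" matches a unigram with zero backoff and no right extension, so its
  // recombination state is empty even though ngram_length is 1.
  FullScoreReturn bar = model.FullScore(null_ctx, model.GetVocabulary().Index("bar"), out);
  std::cout << "bar: ngram_length " << (unsigned)bar.ngram_length
            << ", state length " << (unsigned)out.length << '\n';
  // "foo" extends right (to "foo bar"), so it stays in the output state.
  FullScoreReturn foo = model.FullScore(null_ctx, model.GetVocabulary().Index("foo"), out);
  std::cout << "foo: ngram_length " << (unsigned)foo.ngram_length
            << ", state length " << (unsigned)out.length << '\n';
  return 0;
}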

View File

@ -1,10 +1,12 @@
#include "lm/search_hashed.hh"
#include "lm/binary_format.hh"
#include "lm/blank.hh"
#include "lm/lm_exception.hh"
#include "lm/read_arpa.hh"
#include "lm/vocab.hh"
#include "util/bit_packing.hh"
#include "util/file_piece.hh"
#include <string>
@ -48,30 +50,77 @@ class ActivateUnigram {
ProbBackoff *modify_;
};
template <class Voc, class Store, class Middle, class Activate> void ReadNGrams(util::FilePiece &f, const unsigned int n, const size_t count, const Voc &vocab, std::vector<Middle> &middle, Activate activate, Store &store, PositiveProbWarn &warn) {
ReadNGramHeader(f, n);
template <class Middle> void FixSRI(int lower, float negative_lower_prob, unsigned int n, const uint64_t *keys, const WordIndex *vocab_ids, ProbBackoff *unigrams, std::vector<Middle> &middle) {
ProbBackoff blank;
blank.prob = kBlankProb;
blank.backoff = kBlankBackoff;
blank.backoff = kNoExtensionBackoff;
// Fix SRI's stupidity.
// Note that negative_lower_prob is the negative of the probability (so it's currently >= 0). We still want the sign bit off to indicate left extension, so I just do -= on the backoffs.
blank.prob = negative_lower_prob;
// An entry was found at lower (order lower + 2).
// We need to insert blanks starting at lower + 1 (order lower + 3).
unsigned int fix = static_cast<unsigned int>(lower + 1);
uint64_t backoff_hash = detail::CombineWordHash(static_cast<uint64_t>(vocab_ids[1]), vocab_ids[2]);
if (fix == 0) {
// Insert a missing bigram.
blank.prob -= unigrams[vocab_ids[1]].backoff;
SetExtension(unigrams[vocab_ids[1]].backoff);
// Bigram including a unigram's backoff
middle[0].Insert(Middle::Packing::Make(keys[0], blank));
fix = 1;
} else {
for (unsigned int i = 3; i < fix + 2; ++i) backoff_hash = detail::CombineWordHash(backoff_hash, vocab_ids[i]);
}
// fix >= 1. Insert trigrams and above.
for (; fix <= n - 3; ++fix) {
typename Middle::MutableIterator gotit;
if (middle[fix - 1].UnsafeMutableFind(backoff_hash, gotit)) {
float &backoff = gotit->MutableValue().backoff;
SetExtension(backoff);
blank.prob -= backoff;
}
middle[fix].Insert(Middle::Packing::Make(keys[fix], blank));
backoff_hash = detail::CombineWordHash(backoff_hash, vocab_ids[fix + 2]);
}
}
template <class Voc, class Store, class Middle, class Activate> void ReadNGrams(util::FilePiece &f, const unsigned int n, const size_t count, const Voc &vocab, ProbBackoff *unigrams, std::vector<Middle> &middle, Activate activate, Store &store, PositiveProbWarn &warn) {
ReadNGramHeader(f, n);
// vocab ids of words in reverse order
WordIndex vocab_ids[n];
uint64_t keys[n - 1];
typename Store::Packing::Value value;
typename Middle::ConstIterator found;
typename Middle::MutableIterator found;
for (size_t i = 0; i < count; ++i) {
ReadNGram(f, n, vocab, vocab_ids, value, warn);
keys[0] = detail::CombineWordHash(static_cast<uint64_t>(*vocab_ids), vocab_ids[1]);
for (unsigned int h = 1; h < n - 1; ++h) {
keys[h] = detail::CombineWordHash(keys[h-1], vocab_ids[h+1]);
}
// Initially the sign bit is on, indicating it does not extend left. Most already have this, but there might be +0.0 entries.
util::SetSign(value.prob);
store.Insert(Store::Packing::Make(keys[n-2], value));
// Go back and insert blanks.
for (int lower = n - 3; lower >= 0; --lower) {
if (middle[lower].Find(keys[lower], found)) break;
middle[lower].Insert(Middle::Packing::Make(keys[lower], blank));
// Go back and find the longest right-aligned entry, informing it that it extends left. Normally this will match immediately, but sometimes SRI is dumb.
int lower;
util::FloatEnc fix_prob;
for (lower = n - 3; ; --lower) {
if (lower == -1) {
fix_prob.f = unigrams[vocab_ids[0]].prob;
fix_prob.i &= ~util::kSignBit;
unigrams[vocab_ids[0]].prob = fix_prob.f;
break;
}
if (middle[lower].UnsafeMutableFind(keys[lower], found)) {
// Turn off sign bit to indicate that it extends left.
fix_prob.f = found->MutableValue().prob;
fix_prob.i &= ~util::kSignBit;
found->MutableValue().prob = fix_prob.f;
// We don't need to recurse further down because this entry already set the bits for lower entries.
break;
}
}
if (lower != static_cast<int>(n) - 3) FixSRI(lower, fix_prob.f, n, keys, vocab_ids, unigrams, middle);
activate(vocab_ids, n);
}
@ -107,15 +156,15 @@ template <class MiddleT, class LongestT> template <class Voc> void TemplateHashe
try {
if (counts.size() > 2) {
ReadNGrams(f, 2, counts[1], vocab, middle_, ActivateUnigram(unigram.Raw()), middle_[0], warn);
ReadNGrams(f, 2, counts[1], vocab, unigram.Raw(), middle_, ActivateUnigram(unigram.Raw()), middle_[0], warn);
}
for (unsigned int n = 3; n < counts.size(); ++n) {
ReadNGrams(f, n, counts[n-1], vocab, middle_, ActivateLowerMiddle<Middle>(middle_[n-3]), middle_[n-2], warn);
ReadNGrams(f, n, counts[n-1], vocab, unigram.Raw(), middle_, ActivateLowerMiddle<Middle>(middle_[n-3]), middle_[n-2], warn);
}
if (counts.size() > 2) {
ReadNGrams(f, counts.size(), counts[counts.size() - 1], vocab, middle_, ActivateLowerMiddle<Middle>(middle_.back()), longest, warn);
ReadNGrams(f, counts.size(), counts[counts.size() - 1], vocab, unigram.Raw(), middle_, ActivateLowerMiddle<Middle>(middle_.back()), longest, warn);
} else {
ReadNGrams(f, counts.size(), counts[counts.size() - 1], vocab, middle_, ActivateUnigram(unigram.Raw()), longest, warn);
ReadNGrams(f, counts.size(), counts[counts.size() - 1], vocab, unigram.Raw(), middle_, ActivateUnigram(unigram.Raw()), longest, warn);
}
} catch (util::ProbingSizeException &e) {
UTIL_THROW(util::ProbingSizeException, "Avoid pruning n-grams like \"bar baz quux\" when \"foo bar baz quux\" is still in the model. KenLM will work when this pruning happens, but the probing model assumes these events are rare enough that using blank space in the probing hash table will cover all of them. Increase probing_multiplier (-p to build_binary) to add more blank spaces.\n");
@ -133,7 +182,7 @@ template <class MiddleT, class LongestT> void TemplateHashedSearch<MiddleT, Long
template class TemplateHashedSearch<ProbingHashedSearch::Middle, ProbingHashedSearch::Longest>;
template void TemplateHashedSearch<ProbingHashedSearch::Middle, ProbingHashedSearch::Longest>::InitializeFromARPA(const char *, util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &, ProbingVocabulary &vocab, Backing &backing);
template void TemplateHashedSearch<ProbingHashedSearch::Middle, ProbingHashedSearch::Longest>::InitializeFromARPA(const char *, util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &, ProbingVocabulary &vocab, Backing &backing);
} // namespace detail
} // namespace ngram
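The hashed search above repurposes the sign bit of each stored log10 probability (always non-positive, so the bit is redundant) as a left-extension flag: bit off means the entry extends left, and readers force the bit back on before using the value, exactly as LookupUnigram does with val.i |= util::kSignBit. A small standalone sketch of the trick, relying only on the util/bit_packing.hh helpers added later in this diff:

#include "util/bit_packing.hh"
#include <cassert>

int main() {
  float stored = -1.995635f;   // a log10 probability; sign bit is naturally set
  util::UnsetSign(stored);     // clear it to flag "extends left"
  util::FloatEnc val;
  val.f = stored;
  bool independent_left = (val.i & util::kSignBit);
  assert(!independent_left);   // flag reads back as "extends left"
  val.i |= util::kSignBit;     // restore the real negative probability
  assert(val.f == -1.995635f);
  return 0;
}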

View File

@ -1,15 +1,18 @@
#ifndef LM_SEARCH_HASHED__
#define LM_SEARCH_HASHED__
#include "lm/binary_format.hh"
#include "lm/model_type.hh"
#include "lm/config.hh"
#include "lm/read_arpa.hh"
#include "lm/return.hh"
#include "lm/weights.hh"
#include "util/bit_packing.hh"
#include "util/key_value_packing.hh"
#include "util/probing_hash_table.hh"
#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <vector>
namespace util { class FilePiece; }
@ -52,9 +55,14 @@ struct HashedSearch {
Unigram unigram;
void LookupUnigram(WordIndex word, float &prob, float &backoff, Node &next) const {
void LookupUnigram(WordIndex word, float &backoff, Node &next, FullScoreReturn &ret) const {
const ProbBackoff &entry = unigram.Lookup(word);
prob = entry.prob;
util::FloatEnc val;
val.f = entry.prob;
ret.independent_left = (val.i & util::kSignBit);
ret.extend_left = static_cast<uint64_t>(word);
val.i |= util::kSignBit;
ret.prob = val.f;
backoff = entry.backoff;
next = static_cast<Node>(word);
}
@ -67,6 +75,8 @@ template <class MiddleT, class LongestT> class TemplateHashedSearch : public Has
typedef LongestT Longest;
Longest longest;
static const unsigned int kVersion = 0;
// TODO: move probing_multiplier here with next binary file format update.
static void UpdateConfigFromBinary(int, const std::vector<uint64_t> &, Config &) {}
@ -85,11 +95,33 @@ template <class MiddleT, class LongestT> class TemplateHashedSearch : public Has
const Middle *MiddleBegin() const { return &*middle_.begin(); }
const Middle *MiddleEnd() const { return &*middle_.end(); }
bool LookupMiddle(const Middle &middle, WordIndex word, float &prob, float &backoff, Node &node) const {
Node Unpack(uint64_t extend_pointer, unsigned char extend_length, float &prob) const {
util::FloatEnc val;
if (extend_length == 1) {
val.f = unigram.Lookup(static_cast<uint64_t>(extend_pointer)).prob;
} else {
typename Middle::ConstIterator found;
if (!middle_[extend_length - 2].Find(extend_pointer, found)) {
std::cerr << "Extend pointer " << extend_pointer << " should have been found for length " << (unsigned) extend_length << std::endl;
abort();
}
val.f = found->GetValue().prob;
}
val.i |= util::kSignBit;
prob = val.f;
return extend_pointer;
}
bool LookupMiddle(const Middle &middle, WordIndex word, float &backoff, Node &node, FullScoreReturn &ret) const {
node = CombineWordHash(node, word);
typename Middle::ConstIterator found;
if (!middle.Find(node, found)) return false;
prob = found->GetValue().prob;
util::FloatEnc enc;
enc.f = found->GetValue().prob;
ret.independent_left = (enc.i & util::kSignBit);
ret.extend_left = node;
enc.i |= util::kSignBit;
ret.prob = enc.f;
backoff = found->GetValue().backoff;
return true;
}
@ -105,6 +137,7 @@ template <class MiddleT, class LongestT> class TemplateHashedSearch : public Has
}
bool LookupLongest(WordIndex word, float &prob, Node &node) const {
// Sign bit is always on because longest n-grams do not extend left.
node = CombineWordHash(node, word);
typename Longest::ConstIterator found;
if (!longest.Find(node, found)) return false;

File diff suppressed because it is too large

View File

@ -1,10 +1,16 @@
#ifndef LM_SEARCH_TRIE__
#define LM_SEARCH_TRIE__
#include "lm/binary_format.hh"
#include "lm/config.hh"
#include "lm/model_type.hh"
#include "lm/return.hh"
#include "lm/trie.hh"
#include "lm/weights.hh"
#include "util/file_piece.hh"
#include <vector>
#include <assert.h>
namespace lm {
@ -30,6 +36,8 @@ template <class Quant, class Bhiksha> class TrieSearch {
static const ModelType kModelType = static_cast<ModelType>(TRIE_SORTED + Quant::kModelTypeAdd + Bhiksha::kModelTypeAdd);
static const unsigned int kVersion = 0;
static void UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &counts, Config &config) {
Quant::UpdateConfigFromBinary(fd, counts, config);
AdvanceOrThrow(fd, Quant::Size(counts.size(), config) + Unigram::Size(counts[0]));
@ -57,12 +65,16 @@ template <class Quant, class Bhiksha> class TrieSearch {
void InitializeFromARPA(const char *file, util::FilePiece &f, std::vector<uint64_t> &counts, const Config &config, SortedVocabulary &vocab, Backing &backing);
void LookupUnigram(WordIndex word, float &prob, float &backoff, Node &node) const {
unigram.Find(word, prob, backoff, node);
void LookupUnigram(WordIndex word, float &backoff, Node &node, FullScoreReturn &ret) const {
unigram.Find(word, ret.prob, backoff, node);
ret.independent_left = (node.begin == node.end);
ret.extend_left = static_cast<uint64_t>(word);
}
bool LookupMiddle(const Middle &mid, WordIndex word, float &prob, float &backoff, Node &node) const {
return mid.Find(word, prob, backoff, node);
bool LookupMiddle(const Middle &mid, WordIndex word, float &backoff, Node &node, FullScoreReturn &ret) const {
if (!mid.Find(word, ret.prob, backoff, node, ret.extend_left)) return false;
ret.independent_left = (node.begin == node.end);
return true;
}
bool LookupMiddleNoProb(const Middle &mid, WordIndex word, float &backoff, Node &node) const {
@ -76,14 +88,25 @@ template <class Quant, class Bhiksha> class TrieSearch {
bool FastMakeNode(const WordIndex *begin, const WordIndex *end, Node &node) const {
// TODO: don't decode backoff.
assert(begin != end);
float ignored_prob, ignored_backoff;
LookupUnigram(*begin, ignored_prob, ignored_backoff, node);
FullScoreReturn ignored;
float ignored_backoff;
LookupUnigram(*begin, ignored_backoff, node, ignored);
for (const WordIndex *i = begin + 1; i < end; ++i) {
if (!LookupMiddleNoProb(middle_begin_[i - begin - 1], *i, ignored_backoff, node)) return false;
}
return true;
}
Node Unpack(uint64_t extend_pointer, unsigned char extend_length, float &prob) const {
if (extend_length == 1) {
float ignored;
Node ret;
unigram.Find(static_cast<WordIndex>(extend_pointer), prob, ignored, ret);
return ret;
}
return middle_begin_[extend_length - 2].ReadEntry(extend_pointer, prob);
}
private:
friend void BuildTrie<Quant, Bhiksha>(const std::string &file_prefix, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing);

View File

@ -86,18 +86,19 @@ template <class Quant, class Bhiksha> void BitPackedMiddle<Quant, Bhiksha>::Inse
++insert_index_;
}
template <class Quant, class Bhiksha> bool BitPackedMiddle<Quant, Bhiksha>::Find(WordIndex word, float &prob, float &backoff, NodeRange &range) const {
template <class Quant, class Bhiksha> bool BitPackedMiddle<Quant, Bhiksha>::Find(WordIndex word, float &prob, float &backoff, NodeRange &range, uint64_t &pointer) const {
uint64_t at_pointer;
if (!FindBitPacked(base_, word_mask_, word_bits_, total_bits_, range.begin, range.end, max_vocab_, word, at_pointer)) {
return false;
}
uint64_t index = at_pointer;
pointer = at_pointer;
at_pointer *= total_bits_;
at_pointer += word_bits_;
quant_.Read(base_, at_pointer, prob, backoff);
at_pointer += quant_.TotalBits();
bhiksha_.ReadNext(base_, at_pointer, index, total_bits_, range);
bhiksha_.ReadNext(base_, at_pointer, pointer, total_bits_, range);
return true;
}

View File

@ -94,10 +94,19 @@ template <class Quant, class Bhiksha> class BitPackedMiddle : public BitPacked {
void LoadedBinary() { bhiksha_.LoadedBinary(); }
bool Find(WordIndex word, float &prob, float &backoff, NodeRange &range) const;
bool Find(WordIndex word, float &prob, float &backoff, NodeRange &range, uint64_t &pointer) const;
bool FindNoProb(WordIndex word, float &backoff, NodeRange &range) const;
NodeRange ReadEntry(uint64_t pointer, float &prob) {
uint64_t addr = pointer * total_bits_;
addr += word_bits_;
quant_.ReadProb(base_, addr, prob);
NodeRange ret;
bhiksha_.ReadNext(base_, addr + quant_.TotalBits(), pointer, total_bits_, ret);
return ret;
}
private:
Quant quant_;
Bhiksha bhiksha_;

kenlm/lm/trie_sort.cc (new file, 261 lines)

@ -0,0 +1,261 @@
#include "lm/trie_sort.hh"
#include "lm/config.hh"
#include "lm/lm_exception.hh"
#include "lm/read_arpa.hh"
#include "lm/vocab.hh"
#include "lm/weights.hh"
#include "lm/word_index.hh"
#include "util/file_piece.hh"
#include "util/mmap.hh"
#include "util/proxy_iterator.hh"
#include "util/sized_iterator.hh"
#include <algorithm>
#include <cassert>
#include <cstring>
#include <cstdio>
#include <deque>
#include <limits>
#include <sstream>
#include <vector>
namespace lm {
namespace ngram {
namespace trie {
const char *kContextSuffix = "_contexts";
FILE *OpenOrThrow(const char *name, const char *mode) {
FILE *ret = fopen(name, mode);
if (!ret) UTIL_THROW(util::ErrnoException, "Could not open " << name << " for " << mode);
return ret;
}
void WriteOrThrow(FILE *to, const void *data, size_t size) {
assert(size);
if (1 != std::fwrite(data, size, 1, to)) UTIL_THROW(util::ErrnoException, "Short write; requested size " << size);
}
namespace {
typedef util::SizedIterator NGramIter;
// Proxy for an entry except there is some extra cruft between the entries. This is used to sort (n-1)-grams using the same memory as the sorted n-grams.
class PartialViewProxy {
public:
PartialViewProxy() : attention_size_(0), inner_() {}
PartialViewProxy(void *ptr, std::size_t block_size, std::size_t attention_size) : attention_size_(attention_size), inner_(ptr, block_size) {}
operator std::string() const {
return std::string(reinterpret_cast<const char*>(inner_.Data()), attention_size_);
}
PartialViewProxy &operator=(const PartialViewProxy &from) {
memcpy(inner_.Data(), from.inner_.Data(), attention_size_);
return *this;
}
PartialViewProxy &operator=(const std::string &from) {
memcpy(inner_.Data(), from.data(), attention_size_);
return *this;
}
const void *Data() const { return inner_.Data(); }
void *Data() { return inner_.Data(); }
private:
friend class util::ProxyIterator<PartialViewProxy>;
typedef std::string value_type;
const std::size_t attention_size_;
typedef util::SizedInnerIterator InnerIterator;
InnerIterator &Inner() { return inner_; }
const InnerIterator &Inner() const { return inner_; }
InnerIterator inner_;
};
typedef util::ProxyIterator<PartialViewProxy> PartialIter;
std::string DiskFlush(const void *mem_begin, const void *mem_end, const std::string &file_prefix, std::size_t batch, unsigned char order) {
std::stringstream assembled;
assembled << file_prefix << static_cast<unsigned int>(order) << '_' << batch;
std::string ret(assembled.str());
util::scoped_fd out(util::CreateOrThrow(ret.c_str()));
util::WriteOrThrow(out.get(), mem_begin, (uint8_t*)mem_end - (uint8_t*)mem_begin);
return ret;
}
void WriteContextFile(uint8_t *begin, uint8_t *end, const std::string &ngram_file_name, std::size_t entry_size, unsigned char order) {
const size_t context_size = sizeof(WordIndex) * (order - 1);
// Sort just the contexts using the same memory.
PartialIter context_begin(PartialViewProxy(begin + sizeof(WordIndex), entry_size, context_size));
PartialIter context_end(PartialViewProxy(end + sizeof(WordIndex), entry_size, context_size));
std::sort(context_begin, context_end, util::SizedCompare<EntryCompare, PartialViewProxy>(EntryCompare(order - 1)));
std::string name(ngram_file_name + kContextSuffix);
util::scoped_FILE out(OpenOrThrow(name.c_str(), "w"));
// Write out to file and uniqueify at the same time. Could have used unique_copy if there was an appropriate OutputIterator.
if (context_begin == context_end) return;
PartialIter i(context_begin);
WriteOrThrow(out.get(), i->Data(), context_size);
const void *previous = i->Data();
++i;
for (; i != context_end; ++i) {
if (memcmp(previous, i->Data(), context_size)) {
WriteOrThrow(out.get(), i->Data(), context_size);
previous = i->Data();
}
}
}
struct ThrowCombine {
void operator()(std::size_t /*entry_size*/, const void * /*first*/, const void * /*second*/, FILE * /*out*/) const {
UTIL_THROW(FormatLoadException, "Duplicate n-gram detected.");
}
};
// Useful for context files that just contain records with no value.
struct FirstCombine {
void operator()(std::size_t entry_size, const void *first, const void * /*second*/, FILE *out) const {
WriteOrThrow(out, first, entry_size);
}
};
template <class Combine> void MergeSortedFiles(const std::string &first_name, const std::string &second_name, const std::string &out, std::size_t weights_size, unsigned char order, const Combine &combine = ThrowCombine()) {
std::size_t entry_size = sizeof(WordIndex) * order + weights_size;
RecordReader first, second;
first.Init(first_name.c_str(), entry_size);
util::RemoveOrThrow(first_name.c_str());
second.Init(second_name.c_str(), entry_size);
util::RemoveOrThrow(second_name.c_str());
util::scoped_FILE out_file(OpenOrThrow(out.c_str(), "w"));
EntryCompare less(order);
while (first && second) {
if (less(first.Data(), second.Data())) {
WriteOrThrow(out_file.get(), first.Data(), entry_size);
++first;
} else if (less(second.Data(), first.Data())) {
WriteOrThrow(out_file.get(), second.Data(), entry_size);
++second;
} else {
combine(entry_size, first.Data(), second.Data(), out_file.get());
++first; ++second;
}
}
for (RecordReader &remains = (first ? second : first); remains; ++remains) {
WriteOrThrow(out_file.get(), remains.Data(), entry_size);
}
}
void ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector<uint64_t> &counts, util::scoped_memory &mem, const std::string &file_prefix, unsigned char order, PositiveProbWarn &warn) {
ReadNGramHeader(f, order);
const size_t count = counts[order - 1];
// Size of weights. Does it include backoff?
const size_t words_size = sizeof(WordIndex) * order;
const size_t weights_size = sizeof(float) + ((order == counts.size()) ? 0 : sizeof(float));
const size_t entry_size = words_size + weights_size;
const size_t batch_size = std::min(count, mem.size() / entry_size);
uint8_t *const begin = reinterpret_cast<uint8_t*>(mem.get());
std::deque<std::string> files;
for (std::size_t batch = 0, done = 0; done < count; ++batch) {
uint8_t *out = begin;
uint8_t *out_end = out + std::min(count - done, batch_size) * entry_size;
if (order == counts.size()) {
for (; out != out_end; out += entry_size) {
ReadNGram(f, order, vocab, reinterpret_cast<WordIndex*>(out), *reinterpret_cast<Prob*>(out + words_size), warn);
}
} else {
for (; out != out_end; out += entry_size) {
ReadNGram(f, order, vocab, reinterpret_cast<WordIndex*>(out), *reinterpret_cast<ProbBackoff*>(out + words_size), warn);
}
}
// Sort full records by full n-gram.
util::SizedProxy proxy_begin(begin, entry_size), proxy_end(out_end, entry_size);
// parallel_sort uses too much RAM
std::sort(NGramIter(proxy_begin), NGramIter(proxy_end), util::SizedCompare<EntryCompare>(EntryCompare(order)));
files.push_back(DiskFlush(begin, out_end, file_prefix, batch, order));
WriteContextFile(begin, out_end, files.back(), entry_size, order);
done += (out_end - begin) / entry_size;
}
// All individual files created. Merge them.
std::size_t merge_count = 0;
while (files.size() > 1) {
std::stringstream assembled;
assembled << file_prefix << static_cast<unsigned int>(order) << "_merge_" << (merge_count++);
files.push_back(assembled.str());
MergeSortedFiles(files[0], files[1], files.back(), weights_size, order, ThrowCombine());
MergeSortedFiles(files[0] + kContextSuffix, files[1] + kContextSuffix, files.back() + kContextSuffix, 0, order, FirstCombine());
files.pop_front();
files.pop_front();
}
if (!files.empty()) {
std::stringstream assembled;
assembled << file_prefix << static_cast<unsigned int>(order) << "_merged";
std::string merged_name(assembled.str());
if (std::rename(files[0].c_str(), merged_name.c_str())) UTIL_THROW(util::ErrnoException, "Could not rename " << files[0].c_str() << " to " << merged_name.c_str());
std::string context_name = files[0] + kContextSuffix;
merged_name += kContextSuffix;
if (std::rename(context_name.c_str(), merged_name.c_str())) UTIL_THROW(util::ErrnoException, "Could not rename " << context_name << " to " << merged_name.c_str());
}
}
} // namespace
void RecordReader::Init(const std::string &name, std::size_t entry_size) {
file_.reset(OpenOrThrow(name.c_str(), "r+"));
data_.reset(malloc(entry_size));
UTIL_THROW_IF(!data_.get(), util::ErrnoException, "Failed to malloc read buffer");
remains_ = true;
entry_size_ = entry_size;
++*this;
}
void RecordReader::Overwrite(const void *start, std::size_t amount) {
long internal = (uint8_t*)start - (uint8_t*)data_.get();
UTIL_THROW_IF(fseek(file_.get(), internal - entry_size_, SEEK_CUR), util::ErrnoException, "Couldn't seek backwards for revision");
WriteOrThrow(file_.get(), start, amount);
long forward = entry_size_ - internal - amount;
if (forward) UTIL_THROW_IF(fseek(file_.get(), forward, SEEK_CUR), util::ErrnoException, "Couldn't seek forwards past revision");
}
void ARPAToSortedFiles(const Config &config, util::FilePiece &f, std::vector<uint64_t> &counts, size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab) {
PositiveProbWarn warn(config.positive_log_probability);
{
std::string unigram_name = file_prefix + "unigrams";
util::scoped_fd unigram_file;
// In case <unk> appears.
size_t file_out = (counts[0] + 1) * sizeof(ProbBackoff);
util::scoped_mmap unigram_mmap(util::MapZeroedWrite(unigram_name.c_str(), file_out, unigram_file), file_out);
Read1Grams(f, counts[0], vocab, reinterpret_cast<ProbBackoff*>(unigram_mmap.get()), warn);
CheckSpecials(config, vocab);
if (!vocab.SawUnk()) ++counts[0];
}
// Only use as much buffer as we need.
size_t buffer_use = 0;
for (unsigned int order = 2; order < counts.size(); ++order) {
buffer_use = std::max<size_t>(buffer_use, static_cast<size_t>((sizeof(WordIndex) * order + 2 * sizeof(float)) * counts[order - 1]));
}
buffer_use = std::max<size_t>(buffer_use, static_cast<size_t>((sizeof(WordIndex) * counts.size() + sizeof(float)) * counts.back()));
buffer = std::min<size_t>(buffer, buffer_use);
util::scoped_memory mem;
mem.reset(malloc(buffer), buffer, util::scoped_memory::MALLOC_ALLOCATED);
if (!mem.get()) UTIL_THROW(util::ErrnoException, "malloc failed for sort buffer size " << buffer);
for (unsigned char order = 2; order <= counts.size(); ++order) {
ConvertToSorted(f, vocab, counts, mem, file_prefix, order, warn);
}
ReadEnd(f);
}
} // namespace trie
} // namespace ngram
} // namespace lm

kenlm/lm/trie_sort.hh (new file, 94 lines)

@ -0,0 +1,94 @@
#ifndef LM_TRIE_SORT__
#define LM_TRIE_SORT__
#include "lm/word_index.hh"
#include "util/file.hh"
#include "util/scoped.hh"
#include <cstddef>
#include <functional>
#include <string>
#include <vector>
#include <inttypes.h>
namespace util { class FilePiece; }
// Step of trie builder: create sorted files.
namespace lm {
namespace ngram {
class SortedVocabulary;
class Config;
namespace trie {
extern const char *kContextSuffix;
FILE *OpenOrThrow(const char *name, const char *mode);
void WriteOrThrow(FILE *to, const void *data, size_t size);
class EntryCompare : public std::binary_function<const void*, const void*, bool> {
public:
explicit EntryCompare(unsigned char order) : order_(order) {}
bool operator()(const void *first_void, const void *second_void) const {
const WordIndex *first = static_cast<const WordIndex*>(first_void);
const WordIndex *second = static_cast<const WordIndex*>(second_void);
const WordIndex *end = first + order_;
for (; first != end; ++first, ++second) {
if (*first < *second) return true;
if (*first > *second) return false;
}
return false;
}
private:
unsigned char order_;
};
class RecordReader {
public:
RecordReader() : remains_(true) {}
void Init(const std::string &name, std::size_t entry_size);
void *Data() { return data_.get(); }
const void *Data() const { return data_.get(); }
RecordReader &operator++() {
std::size_t ret = fread(data_.get(), entry_size_, 1, file_.get());
if (!ret) {
UTIL_THROW_IF(!feof(file_.get()), util::ErrnoException, "Error reading temporary file");
remains_ = false;
}
return *this;
}
operator bool() const { return remains_; }
void Rewind() {
rewind(file_.get());
remains_ = true;
++*this;
}
std::size_t EntrySize() const { return entry_size_; }
void Overwrite(const void *start, std::size_t amount);
private:
util::scoped_malloc data_;
bool remains_;
std::size_t entry_size_;
util::scoped_FILE file_;
};
void ARPAToSortedFiles(const Config &config, util::FilePiece &f, std::vector<uint64_t> &counts, size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab);
} // namespace trie
} // namespace ngram
} // namespace lm
#endif // LM_TRIE_SORT__
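A hedged usage sketch for RecordReader; the file name "2_0" and the 16-byte entry size are illustrative, standing in for the sorted batch files that ARPAToSortedFiles produces:

#include "lm/trie_sort.hh"

int main() {
  lm::ngram::trie::RecordReader reader;
  reader.Init("2_0", 16);               // hypothetical file of 16-byte records
  for (; reader; ++reader) {
    const void *entry = reader.Data();  // current fixed-size record
    (void)entry;                        // ... compare, copy, or Overwrite() in place ...
  }
  reader.Rewind();                      // second pass over the same file
  return 0;
}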

View File

@ -1,37 +1,13 @@
#ifndef LM_VIRTUAL_INTERFACE__
#define LM_VIRTUAL_INTERFACE__
#include "lm/return.hh"
#include "lm/word_index.hh"
#include "util/string_piece.hh"
#include <string>
namespace lm {
/* Structure returned by scoring routines. */
struct FullScoreReturn {
// log10 probability
float prob;
/* The length of n-gram matched. Do not use this for recombination.
* Consider a model containing only the following n-grams:
* -1 foo
* -3.14 bar
* -2.718 baz -5
* -6 foo bar
*
* If you score ``bar'' then ngram_length is 1 and recombination state is the
* empty string because bar has zero backoff and does not extend to the
* right.
* If you score ``foo'' then ngram_length is 1 and recombination state is
* ``foo''.
*
* Ideally, keep output states around and compare them. Failing that,
* get out_state.ValidLength() and use that length for recombination.
*/
unsigned char ngram_length;
};
namespace base {
template <class T, class U, class V> class ModelFacade;

View File

@ -1,5 +1,6 @@
#include "lm/vocab.hh"
#include "lm/binary_format.hh"
#include "lm/enumerate_vocab.hh"
#include "lm/lm_exception.hh"
#include "lm/config.hh"
@ -56,16 +57,6 @@ WordIndex ReadWords(int fd, EnumerateVocab *enumerate) {
}
}
void WriteOrThrow(int fd, const void *data_void, std::size_t size) {
const uint8_t *data = static_cast<const uint8_t*>(data_void);
while (size) {
ssize_t ret = write(fd, data, size);
if (ret < 1) UTIL_THROW(util::ErrnoException, "Write failed");
data += ret;
size -= ret;
}
}
} // namespace
WriteWordsWrapper::WriteWordsWrapper(EnumerateVocab *inner) : inner_(inner) {}
@ -80,7 +71,7 @@ void WriteWordsWrapper::Add(WordIndex index, const StringPiece &str) {
void WriteWordsWrapper::Write(int fd) {
if ((off_t)-1 == lseek(fd, 0, SEEK_END))
UTIL_THROW(util::ErrnoException, "Failed to seek in binary to vocab words");
WriteOrThrow(fd, buffer_.data(), buffer_.size());
util::WriteOrThrow(fd, buffer_.data(), buffer_.size());
}
SortedVocabulary::SortedVocabulary() : begin_(NULL), end_(NULL), enumerate_(NULL) {}
@ -146,15 +137,28 @@ void SortedVocabulary::LoadedBinary(int fd, EnumerateVocab *to) {
SetSpecial(Index("<s>"), Index("</s>"), 0);
}
namespace {
const unsigned int kProbingVocabularyVersion = 0;
} // namespace
namespace detail {
struct ProbingVocabularyHeader {
unsigned int version;
// Lowest unused vocab id. This is also the number of words, including <unk>.
WordIndex bound;
};
} // namespace detail
ProbingVocabulary::ProbingVocabulary() : enumerate_(NULL) {}
std::size_t ProbingVocabulary::Size(std::size_t entries, const Config &config) {
return Lookup::Size(entries, config.probing_multiplier);
return Align8(sizeof(detail::ProbingVocabularyHeader)) + Lookup::Size(entries, config.probing_multiplier);
}
void ProbingVocabulary::SetupMemory(void *start, std::size_t allocated, std::size_t /*entries*/, const Config &/*config*/) {
lookup_ = Lookup(start, allocated);
available_ = 1;
header_ = static_cast<detail::ProbingVocabularyHeader*>(start);
lookup_ = Lookup(static_cast<uint8_t*>(start) + Align8(sizeof(detail::ProbingVocabularyHeader)), allocated);
bound_ = 1;
saw_unk_ = false;
}
@ -172,20 +176,24 @@ WordIndex ProbingVocabulary::Insert(const StringPiece &str) {
saw_unk_ = true;
return 0;
} else {
if (enumerate_) enumerate_->Add(available_, str);
lookup_.Insert(Lookup::Packing::Make(hashed, available_));
return available_++;
if (enumerate_) enumerate_->Add(bound_, str);
lookup_.Insert(Lookup::Packing::Make(hashed, bound_));
return bound_++;
}
}
void ProbingVocabulary::FinishedLoading(ProbBackoff * /*reorder_vocab*/) {
lookup_.FinishedInserting();
header_->bound = bound_;
header_->version = kProbingVocabularyVersion;
SetSpecial(Index("<s>"), Index("</s>"), 0);
}
void ProbingVocabulary::LoadedBinary(int fd, EnumerateVocab *to) {
UTIL_THROW_IF(header_->version != kProbingVocabularyVersion, FormatLoadException, "The binary file has probing version " << header_->version << " but the code expects version " << kProbingVocabularyVersion << ". Please rerun build_binary using the same version of the code.");
lookup_.LoadedBinary();
available_ = ReadWords(fd, to);
ReadWords(fd, to);
bound_ = header_->bound;
SetSpecial(Index("<s>"), Index("</s>"), 0);
}

View File

@ -25,6 +25,7 @@ uint64_t HashForVocab(const char *str, std::size_t len);
inline uint64_t HashForVocab(const StringPiece &str) {
return HashForVocab(str.data(), str.length());
}
class ProbingVocabularyHeader;
} // namespace detail
class WriteWordsWrapper : public EnumerateVocab {
@ -113,10 +114,7 @@ class ProbingVocabulary : public base::Vocabulary {
static size_t Size(std::size_t entries, const Config &config);
// Vocab words are [0, Bound()).
// WARNING WARNING: returns UINT_MAX when loading binary and not enumerating vocabulary.
// Fixing this bug requires a binary file format change and will be fixed with the next binary file format update.
// Specifically, the binary file format does not currently indicate whether <unk> is in count or not.
WordIndex Bound() const { return available_; }
WordIndex Bound() const { return bound_; }
// Everything else is for populating. I'm too lazy to hide and friend these, but you'll only get a const reference anyway.
void SetupMemory(void *start, std::size_t allocated, std::size_t entries, const Config &config);
@ -141,11 +139,13 @@ class ProbingVocabulary : public base::Vocabulary {
Lookup lookup_;
WordIndex available_;
WordIndex bound_;
bool saw_unk_;
EnumerateVocab *enumerate_;
detail::ProbingVocabularyHeader *header_;
};
void MissingUnknown(const Config &config) throw(SpecialWordMissingException);

View File

@ -2,7 +2,7 @@
#Run tests. Requires Boost.
set -e
./compile.sh
for i in util/{bit_packing,file_piece,joint_sort,key_value_packing,probing_hash_table,sorted_uniform}_test lm/model_test; do
for i in util/{bit_packing,file_piece,joint_sort,key_value_packing,probing_hash_table,sorted_uniform}_test lm/{model,left}_test; do
g++ -I. -O3 $CXXFLAGS $i.cc {lm,util}/*.o -lboost_test_exec_monitor -lz -o $i
pushd $(dirname $i) >/dev/null && ./$(basename $i) || echo "$i failed"; popd >/dev/null
done

View File

@ -86,6 +86,20 @@ inline void WriteFloat32(void *base, uint64_t bit_off, float value) {
const uint32_t kSignBit = 0x80000000;
inline void SetSign(float &to) {
FloatEnc enc;
enc.f = to;
enc.i |= kSignBit;
to = enc.f;
}
inline void UnsetSign(float &to) {
FloatEnc enc;
enc.f = to;
enc.i &= ~kSignBit;
to = enc.f;
}
inline float ReadNonPositiveFloat31(const void *base, uint64_t bit_off) {
FloatEnc encoded;
encoded.i = ReadOff(base, bit_off) >> BitPackShift(bit_off & 7, 31);

View File

@ -79,4 +79,9 @@ ErrnoException::ErrnoException() throw() : errno_(errno) {
ErrnoException::~ErrnoException() throw() {}
EndOfFileException::EndOfFileException() throw() {
*this << "End of file";
}
EndOfFileException::~EndOfFileException() throw() {}
} // namespace util

View File

@ -105,6 +105,12 @@ class ErrnoException : public Exception {
int errno_;
};
class EndOfFileException : public Exception {
public:
EndOfFileException() throw();
~EndOfFileException() throw();
};
} // namespace util
#endif // UTIL_EXCEPTION__

kenlm/util/file.cc (new file, 74 lines)

@ -0,0 +1,74 @@
#include "util/file.hh"
#include "util/exception.hh"
#include <cstdlib>
#include <cstdio>
#include <iostream>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <inttypes.h>
namespace util {
scoped_fd::~scoped_fd() {
if (fd_ != -1 && close(fd_)) {
std::cerr << "Could not close file " << fd_ << std::endl;
std::abort();
}
}
scoped_FILE::~scoped_FILE() {
if (file_ && std::fclose(file_)) {
std::cerr << "Could not close file " << std::endl;
std::abort();
}
}
int OpenReadOrThrow(const char *name) {
int ret;
UTIL_THROW_IF(-1 == (ret = open(name, O_RDONLY)), ErrnoException, "while opening " << name);
return ret;
}
int CreateOrThrow(const char *name) {
int ret;
UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR)), ErrnoException, "while creating " << name);
return ret;
}
off_t SizeFile(int fd) {
struct stat sb;
if (fstat(fd, &sb) == -1 || (!sb.st_size && !S_ISREG(sb.st_mode))) return kBadSize;
return sb.st_size;
}
void ReadOrThrow(int fd, void *to_void, std::size_t amount) {
uint8_t *to = static_cast<uint8_t*>(to_void);
while (amount) {
ssize_t ret = read(fd, to, amount);
if (ret == -1) UTIL_THROW(ErrnoException, "Reading " << amount << " from fd " << fd << " failed.");
if (ret == 0) UTIL_THROW(Exception, "Hit EOF in fd " << fd << " but there should be " << amount << " more bytes to read.");
amount -= ret;
to += ret;
}
}
void WriteOrThrow(int fd, const void *data_void, std::size_t size) {
const uint8_t *data = static_cast<const uint8_t*>(data_void);
while (size) {
ssize_t ret = write(fd, data, size);
if (ret < 1) UTIL_THROW(util::ErrnoException, "Write failed");
data += ret;
size -= ret;
}
}
void RemoveOrThrow(const char *name) {
UTIL_THROW_IF(std::remove(name), util::ErrnoException, "Could not remove " << name);
}
} // namespace util

kenlm/util/file.hh (new file, 74 lines)

@ -0,0 +1,74 @@
#ifndef UTIL_FILE__
#define UTIL_FILE__
#include <cstdio>
#include <unistd.h>
namespace util {
class scoped_fd {
public:
scoped_fd() : fd_(-1) {}
explicit scoped_fd(int fd) : fd_(fd) {}
~scoped_fd();
void reset(int to) {
scoped_fd other(fd_);
fd_ = to;
}
int get() const { return fd_; }
int operator*() const { return fd_; }
int release() {
int ret = fd_;
fd_ = -1;
return ret;
}
operator bool() { return fd_ != -1; }
private:
int fd_;
scoped_fd(const scoped_fd &);
scoped_fd &operator=(const scoped_fd &);
};
class scoped_FILE {
public:
explicit scoped_FILE(std::FILE *file = NULL) : file_(file) {}
~scoped_FILE();
std::FILE *get() { return file_; }
const std::FILE *get() const { return file_; }
void reset(std::FILE *to = NULL) {
scoped_FILE other(file_);
file_ = to;
}
private:
std::FILE *file_;
};
int OpenReadOrThrow(const char *name);
int CreateOrThrow(const char *name);
// Return value for SizeFile when it can't size properly.
const off_t kBadSize = -1;
off_t SizeFile(int fd);
void ReadOrThrow(int fd, void *to, std::size_t size);
void WriteOrThrow(int fd, const void *data_void, std::size_t size);
void RemoveOrThrow(const char *name);
} // namespace util
#endif // UTIL_FILE__
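A short usage sketch for the RAII wrappers above ("example.bin" is illustrative); the descriptor is closed in the destructor even if WriteOrThrow throws:

#include "util/file.hh"

int main() {
  util::scoped_fd fd(util::CreateOrThrow("example.bin"));
  const char data[] = "hello";
  util::WriteOrThrow(fd.get(), data, sizeof(data));
  // fd closes automatically here; release() would instead hand the raw
  // descriptor back to the caller without closing it.
  return 0;
}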

View File

@ -1,6 +1,7 @@
#include "util/file_piece.hh"
#include "util/exception.hh"
#include "util/file.hh"
#include <iostream>
#include <string>
@ -21,11 +22,6 @@
namespace util {
EndOfFileException::EndOfFileException() throw() {
*this << "End of file";
}
EndOfFileException::~EndOfFileException() throw() {}
ParseNumberException::ParseNumberException(StringPiece value) throw() {
*this << "Could not parse \"" << value << "\" into a number";
}
@ -40,18 +36,6 @@ GZException::GZException(void *file) {
// Sigh this is the only way I could come up with to do a _const_ bool. It has ' ', '\f', '\n', '\r', '\t', and '\v' (same as isspace on C locale).
const bool kSpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
int OpenReadOrThrow(const char *name) {
int ret;
UTIL_THROW_IF(-1 == (ret = open(name, O_RDONLY)), ErrnoException, "while opening " << name);
return ret;
}
off_t SizeFile(int fd) {
struct stat sb;
if (fstat(fd, &sb) == -1 || (!sb.st_size && !S_ISREG(sb.st_mode))) return kBadSize;
return sb.st_size;
}
FilePiece::FilePiece(const char *name, std::ostream *show_progress, off_t min_buffer) :
file_(OpenReadOrThrow(name)), total_size_(SizeFile(file_.get())), page_(sysconf(_SC_PAGE_SIZE)),
progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) {

View File

@ -3,9 +3,9 @@
#include "util/ersatz_progress.hh"
#include "util/exception.hh"
#include "util/file.hh"
#include "util/have.hh"
#include "util/mmap.hh"
#include "util/scoped.hh"
#include "util/string_piece.hh"
#include <string>
@ -14,12 +14,6 @@
namespace util {
class EndOfFileException : public Exception {
public:
EndOfFileException() throw();
~EndOfFileException() throw();
};
class ParseNumberException : public Exception {
public:
explicit ParseNumberException(StringPiece value) throw();
@ -33,14 +27,8 @@ class GZException : public Exception {
~GZException() throw() {}
};
int OpenReadOrThrow(const char *name);
extern const bool kSpaces[256];
// Return value for SizeFile when it can't size properly.
const off_t kBadSize = -1;
off_t SizeFile(int fd);
// Memory backing the returned StringPiece may vanish on the next call.
class FilePiece {
public:

View File

@ -1,6 +1,6 @@
#include "util/exception.hh"
#include "util/file.hh"
#include "util/mmap.hh"
#include "util/scoped.hh"
#include <iostream>
@ -66,20 +66,6 @@ void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int
return ret;
}
namespace {
void ReadAll(int fd, void *to_void, std::size_t amount) {
uint8_t *to = static_cast<uint8_t*>(to_void);
while (amount) {
ssize_t ret = read(fd, to, amount);
if (ret == -1) UTIL_THROW(ErrnoException, "Reading " << amount << " from fd " << fd << " failed.");
if (ret == 0) UTIL_THROW(Exception, "Hit EOF in fd " << fd << " but there should be " << amount << " more bytes to read.");
amount -= ret;
to += ret;
}
}
} // namespace
const int kFileFlags =
#ifdef MAP_FILE
MAP_FILE | MAP_SHARED
@ -106,7 +92,7 @@ void MapRead(LoadMethod method, int fd, off_t offset, std::size_t size, scoped_m
out.reset(malloc(size), size, scoped_memory::MALLOC_ALLOCATED);
if (!out.get()) UTIL_THROW(util::ErrnoException, "Allocating " << size << " bytes with malloc");
if (-1 == lseek(fd, offset, SEEK_SET)) UTIL_THROW(ErrnoException, "lseek to " << offset << " in fd " << fd << " failed.");
ReadAll(fd, out.get(), size);
ReadOrThrow(fd, out.get(), size);
break;
}
}

View File

@ -2,8 +2,6 @@
#define UTIL_MMAP__
// Utilities for mmaped files.
#include "util/scoped.hh"
#include <cstddef>
#include <inttypes.h>
@ -11,6 +9,8 @@
namespace util {
class scoped_fd;
// (void*)-1 is MAP_FAILED; this is done to avoid including the mmap header here.
class scoped_mmap {
public:

View File

@ -1,24 +0,0 @@
#include "util/scoped.hh"
#include <iostream>
#include <stdlib.h>
#include <unistd.h>
namespace util {
scoped_fd::~scoped_fd() {
if (fd_ != -1 && close(fd_)) {
std::cerr << "Could not close file " << fd_ << std::endl;
abort();
}
}
scoped_FILE::~scoped_FILE() {
if (file_ && fclose(file_)) {
std::cerr << "Could not close file " << std::endl;
abort();
}
}
} // namespace util

View File

@ -1,10 +1,11 @@
#ifndef UTIL_SCOPED__
#define UTIL_SCOPED__
/* Other scoped objects in the style of scoped_ptr. */
#include "util/exception.hh"
/* Other scoped objects in the style of scoped_ptr. */
#include <cstddef>
#include <cstdio>
#include <cstdlib>
namespace util {
@ -34,52 +35,33 @@ template <class T, class R, R (*Free)(T*)> class scoped_thing {
scoped_thing &operator=(const scoped_thing &);
};
class scoped_fd {
class scoped_malloc {
public:
scoped_fd() : fd_(-1) {}
scoped_malloc() : p_(NULL) {}
explicit scoped_fd(int fd) : fd_(fd) {}
scoped_malloc(void *p) : p_(p) {}
~scoped_fd();
~scoped_malloc() { std::free(p_); }
void reset(int to) {
scoped_fd other(fd_);
fd_ = to;
void reset(void *p = NULL) {
scoped_malloc other(p_);
p_ = p;
}
int get() const { return fd_; }
int operator*() const { return fd_; }
int release() {
int ret = fd_;
fd_ = -1;
return ret;
void call_realloc(std::size_t to) {
void *ret;
UTIL_THROW_IF(!(ret = std::realloc(p_, to)), util::ErrnoException, "realloc to " << to << " bytes failed.");
p_ = ret;
}
void *get() { return p_; }
const void *get() const { return p_; }
private:
int fd_;
void *p_;
scoped_fd(const scoped_fd &);
scoped_fd &operator=(const scoped_fd &);
};
class scoped_FILE {
public:
explicit scoped_FILE(std::FILE *file = NULL) : file_(file) {}
~scoped_FILE();
std::FILE *get() { return file_; }
const std::FILE *get() const { return file_; }
void reset(std::FILE *to = NULL) {
scoped_FILE other(file_);
file_ = to;
}
private:
std::FILE *file_;
scoped_malloc(const scoped_malloc &);
scoped_malloc &operator=(const scoped_malloc &);
};
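A hedged sketch of scoped_malloc in use, mirroring the sort-buffer allocation in trie_sort.cc (the sizes are illustrative):

#include "util/scoped.hh"
#include <cstdlib>
#include <cstring>

int main() {
  util::scoped_malloc buf(std::malloc(1024));
  if (!buf.get()) return 1;     // same NULL check ARPAToSortedFiles performs
  std::memset(buf.get(), 0, 1024);
  buf.call_realloc(4096);       // grows the buffer or throws util::ErrnoException
  return 0;                     // freed automatically by the destructor
}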
// Hat tip to boost.

kenlm/util/sized_iterator.hh (new file, 107 lines)

@ -0,0 +1,107 @@
#ifndef UTIL_SIZED_ITERATOR__
#define UTIL_SIZED_ITERATOR__
#include "util/proxy_iterator.hh"
#include <functional>
#include <string>
#include <inttypes.h>
#include <string.h>
namespace util {
class SizedInnerIterator {
public:
SizedInnerIterator() {}
SizedInnerIterator(void *ptr, std::size_t size) : ptr_(static_cast<uint8_t*>(ptr)), size_(size) {}
bool operator==(const SizedInnerIterator &other) const {
return ptr_ == other.ptr_;
}
bool operator<(const SizedInnerIterator &other) const {
return ptr_ < other.ptr_;
}
SizedInnerIterator &operator+=(std::ptrdiff_t amount) {
ptr_ += amount * size_;
return *this;
}
std::ptrdiff_t operator-(const SizedInnerIterator &other) const {
return (ptr_ - other.ptr_) / size_;
}
const void *Data() const { return ptr_; }
void *Data() { return ptr_; }
std::size_t EntrySize() const { return size_; }
private:
uint8_t *ptr_;
std::size_t size_;
};
class SizedProxy {
public:
SizedProxy() {}
SizedProxy(void *ptr, std::size_t size) : inner_(ptr, size) {}
operator std::string() const {
return std::string(reinterpret_cast<const char*>(inner_.Data()), inner_.EntrySize());
}
SizedProxy &operator=(const SizedProxy &from) {
memcpy(inner_.Data(), from.inner_.Data(), inner_.EntrySize());
return *this;
}
SizedProxy &operator=(const std::string &from) {
memcpy(inner_.Data(), from.data(), inner_.EntrySize());
return *this;
}
const void *Data() const { return inner_.Data(); }
void *Data() { return inner_.Data(); }
private:
friend class util::ProxyIterator<SizedProxy>;
typedef std::string value_type;
typedef SizedInnerIterator InnerIterator;
InnerIterator &Inner() { return inner_; }
const InnerIterator &Inner() const { return inner_; }
InnerIterator inner_;
};
typedef ProxyIterator<SizedProxy> SizedIterator;
inline SizedIterator SizedIt(void *ptr, std::size_t size) { return SizedIterator(SizedProxy(ptr, size)); }
// Useful wrapper for a comparison function i.e. sort.
template <class Delegate, class Proxy = SizedProxy> class SizedCompare : public std::binary_function<const Proxy &, const Proxy &, bool> {
public:
explicit SizedCompare(const Delegate &delegate = Delegate()) : delegate_(delegate) {}
bool operator()(const Proxy &first, const Proxy &second) const {
return delegate_(first.Data(), second.Data());
}
bool operator()(const Proxy &first, const std::string &second) const {
return delegate_(first.Data(), second.data());
}
bool operator()(const std::string &first, const Proxy &second) const {
return delegate_(first.data(), second.Data());
}
bool operator()(const std::string &first, const std::string &second) const {
return delegate_(first.data(), second.data());
}
const Delegate &GetDelegate() const { return delegate_; }
private:
const Delegate delegate_;
};
} // namespace util
#endif // UTIL_SIZED_ITERATOR__
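SizedProxy exists so std::sort can permute records whose size is only known at runtime, as ConvertToSorted does above with EntryCompare. A minimal sketch under an assumed layout of 8-byte records keyed by their first four bytes; FirstWordLess is a made-up delegate in the style of EntryCompare:

#include "util/sized_iterator.hh"
#include <algorithm>
#include <cstring>
#include <vector>
#include <inttypes.h>

// Delegate comparing raw entry pointers by the leading 4-byte key.
struct FirstWordLess {
  bool operator()(const void *a, const void *b) const {
    uint32_t left, right;
    std::memcpy(&left, a, sizeof(left));
    std::memcpy(&right, b, sizeof(right));
    return left < right;
  }
};

int main() {
  const std::size_t entry_size = 8;             // assumed record size
  std::vector<uint8_t> mem(4 * entry_size, 0);  // four records, zero-filled
  std::sort(util::SizedIt(&mem[0], entry_size),
            util::SizedIt(&mem[0] + mem.size(), entry_size),
            util::SizedCompare<FirstWordLess>(FirstWordLess()));
  return 0;
}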

kenlm/util/tokenize_piece.hh (new file, 69 lines)

@ -0,0 +1,69 @@
#ifndef UTIL_TOKENIZE_PIECE__
#define UTIL_TOKENIZE_PIECE__
#include "util/string_piece.hh"
#include <boost/iterator/iterator_facade.hpp>
/* Usage:
*
* for (PieceIterator<' '> i(" foo \r\n bar "); i; ++i) {
* std::cout << *i << "\n";
* }
*
*/
namespace util {
// Tokenize a StringPiece using an iterator interface. boost::tokenizer doesn't work with StringPiece.
template <char d> class PieceIterator : public boost::iterator_facade<PieceIterator<d>, const StringPiece, boost::forward_traversal_tag> {
public:
// Default construct is end, which is also returned by end().
PieceIterator() {}
explicit PieceIterator(const StringPiece &str)
: after_(str) {
increment();
}
bool operator!() const {
return after_.data() == 0;
}
operator bool() const {
return after_.data() != 0;
}
static PieceIterator<d> end() {
return PieceIterator<d>();
}
private:
friend class boost::iterator_core_access;
void increment() {
const char *start = after_.data();
for (; (start != after_.data() + after_.size()) && (d == *start); ++start) {}
if (start == after_.data() + after_.size()) {
// End condition.
after_.clear();
return;
}
const char *finish = start;
for (; (finish != after_.data() + after_.size()) && (d != *finish); ++finish) {}
current_ = StringPiece(start, finish - start);
after_ = StringPiece(finish, after_.data() + after_.size() - finish);
}
bool equal(const PieceIterator &other) const {
return after_.data() == other.after_.data();
}
const StringPiece &dereference() const { return current_; }
StringPiece current_;
StringPiece after_;
};
} // namespace util
#endif // UTIL_TOKENIZE_PIECE__
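A self-contained sketch of the iterator in action, mirroring the usage comment above (illustrative):

#include <iostream>

#include "util/string_piece.hh"
#include "util/tokenize_piece.hh"

int main() {
  // Leading, trailing, and repeated delimiters are skipped by increment().
  for (util::PieceIterator<' '> i(StringPiece("  foo   bar ")); i; ++i) {
    std::cout << *i << "\n";   // prints "foo" then "bar"
  }
  return 0;
}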

View File

@ -302,7 +302,10 @@
_LARGE_FILES,
"_FILE_OFFSET_BITS=64",
);
HEADER_SEARCH_PATHS = ../moses/src;
HEADER_SEARCH_PATHS = (
../moses/src,
../kenlm,
);
INSTALL_PATH = /usr/local/bin;
LIBRARY_SEARCH_PATHS = (
../irstlm/lib,
@ -335,7 +338,10 @@
_LARGE_FILES,
"_FILE_OFFSET_BITS=64",
);
HEADER_SEARCH_PATHS = ../moses/src;
HEADER_SEARCH_PATHS = (
../moses/src,
../kenlm,
);
INSTALL_PATH = /usr/local/bin;
LIBRARY_SEARCH_PATHS = (
../irstlm/lib,

View File

@ -270,8 +270,7 @@ void IOWrapper::OutputBestHypo(const ChartHypothesis *hypo, long translationId,
VERBOSE(3,"0" << std::endl);
if (StaticData::Instance().GetOutputHypoScore()) {
out << hypo->GetTotalScore() << " "
<< ChartHypothesis::GetHypoCount() << " ";
out << hypo->GetTotalScore() << " ";
}
if (!m_surpressSingleBestOutput) {
@ -318,8 +317,7 @@ void IOWrapper::OutputNBestList(const ChartTrellisPathList &nBestList, const Cha
// The output from -output-hypo-score is always written to std::cout.
if (StaticData::Instance().GetOutputHypoScore()) {
if (bestHypo != NULL) {
out << bestHypo->GetTotalScore() << " "
<< ChartHypothesis::GetHypoCount() << " ";
out << bestHypo->GetTotalScore() << " ";
} else {
out << "0 ";
}
@ -344,7 +342,7 @@ void IOWrapper::OutputNBestList(const ChartTrellisPathList &nBestList, const Cha
// print the surface factor of the translation
out << translationId << " ||| ";
OutputSurface(out, outputPhrase, m_outputFactorOrder, false);
out << " ||| ";
out << " |||";
// print the scores in a hardwired order
// before each model type, the corresponding command-line-like name must be emitted
@ -354,10 +352,10 @@ void IOWrapper::OutputNBestList(const ChartTrellisPathList &nBestList, const Cha
const LMList& lml = system->GetLanguageModels();
if (lml.size() > 0) {
if (labeledOutput)
out << "lm: ";
out << "lm:";
LMList::const_iterator lmi = lml.begin();
for (; lmi != lml.end(); ++lmi) {
out << path.GetScoreBreakdown().GetScoreForProducer(*lmi) << " ";
out << " " << path.GetScoreBreakdown().GetScoreForProducer(*lmi);
}
}
@ -386,8 +384,8 @@ void IOWrapper::OutputNBestList(const ChartTrellisPathList &nBestList, const Cha
// word penalty
if (labeledOutput)
out << " w: ";
out << path.GetScoreBreakdown().GetScoreForProducer(system->GetWordPenaltyProducer()) << " ";
out << " w:";
out << " " << path.GetScoreBreakdown().GetScoreForProducer(system->GetWordPenaltyProducer());
// generation
const vector<GenerationDictionary*>& gds = system->GetGenerationDictionaries();
@ -411,7 +409,7 @@ void IOWrapper::OutputNBestList(const ChartTrellisPathList &nBestList, const Cha
// total
out << "||| " << path.GetTotalScore();
out << " |||" << path.GetTotalScore();
/*
if (includeAlignment) {

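For orientation, the separator changes in this hunk only move whitespace from after a field to before it; the overall n-best line layout is unchanged (shown schematically, with the lm:/w: labels appearing only under labeled output):

<sentence-id> ||| <translation> ||| <feature scores, e.g. lm: ... w: ...> ||| <total score>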
View File

@ -221,24 +221,6 @@ int main(int argc, char* argv[])
return EXIT_FAILURE;
}
// create threadpool, if necessary
int threadcount = (parameter.GetParam("threads").size() > 0) ?
Scan<size_t>(parameter.GetParam("threads")[0]) : 1;
#ifdef WITH_THREADS
if (threadcount < 1) {
cerr << "Error: Need to specify a positive number of threads" << endl;
exit(1);
}
ThreadPool pool(threadcount);
#else
if (threadcount > 1) {
cerr << "Error: Thread count of " << threadcount
<< " but moses not built with thread support" << endl;
exit(1);
}
#endif
const StaticData &staticData = StaticData::Instance();
if (!StaticData::LoadDataStatic(&parameter))
return EXIT_FAILURE;
@ -264,6 +246,10 @@ int main(int argc, char* argv[])
if (ioWrapper == NULL)
return EXIT_FAILURE;
#ifdef WITH_THREADS
ThreadPool pool(staticData.ThreadCount());
#endif
// read each sentence & decode
InputType *source=0;
while(ReadInput(*ioWrapper,staticData.GetInputType(),source)) {

View File

@ -410,6 +410,7 @@
isa = XCBuildConfiguration;
buildSettings = {
ARCHS = "$(ONLY_ACTIVE_ARCH_PRE_XCODE_3_1)";
HEADER_SEARCH_PATHS = ../kenlm;
ONLY_ACTIVE_ARCH_PRE_XCODE_3_1 = "$(NATIVE_ARCH_ACTUAL)";
SDKROOT = "$(DEVELOPER_SDK_DIR)/MacOSX10.6.sdk";
VALID_ARCHS = "i386 ppc ppc64 ppc7400 ppc970 x86_64";
@ -420,6 +421,7 @@
isa = XCBuildConfiguration;
buildSettings = {
ARCHS = "$(ONLY_ACTIVE_ARCH_PRE_XCODE_3_1)";
HEADER_SEARCH_PATHS = ../kenlm;
ONLY_ACTIVE_ARCH_PRE_XCODE_3_1 = "$(NATIVE_ARCH_ACTUAL)";
SDKROOT = "$(DEVELOPER_SDK_DIR)/MacOSX10.6.sdk";
VALID_ARCHS = "i386 ppc ppc64 ppc7400 ppc970 x86_64";
@ -430,6 +432,7 @@
isa = XCBuildConfiguration;
buildSettings = {
ARCHS = "$(ONLY_ACTIVE_ARCH_PRE_XCODE_3_1)";
HEADER_SEARCH_PATHS = ../kenlm;
ONLY_ACTIVE_ARCH_PRE_XCODE_3_1 = "$(NATIVE_ARCH_ACTUAL)";
SDKROOT = "$(DEVELOPER_SDK_DIR)/MacOSX10.6.sdk";
VALID_ARCHS = "i386 ppc ppc64 ppc7400 ppc970 x86_64";

View File

@ -312,8 +312,8 @@ void OutputBestHypo(const std::vector<Word>& mbrBestHypo, long /*translationId*
for (size_t i = 0 ; i < mbrBestHypo.size() ; i++) {
const Factor *factor = mbrBestHypo[i].GetFactor(StaticData::Instance().GetOutputFactorOrder()[0]);
if (i>0) out << " ";
out << *factor;
if (i>0) out << " " << *factor;
else out << *factor;
}
out << endl;
}
@ -529,22 +529,21 @@ void OutputLatticeMBRNBest(std::ostream& out, const vector<LatticeMBRSolution>&
{
for (vector<LatticeMBRSolution>::const_iterator si = solutions.begin(); si != solutions.end(); ++si) {
out << translationId;
out << " ||| ";
out << " |||";
const vector<Word> mbrHypo = si->GetWords();
for (size_t i = 0 ; i < mbrHypo.size() ; i++) {
const Factor *factor = mbrHypo[i].GetFactor(StaticData::Instance().GetOutputFactorOrder()[0]);
if (i>0) out << " ";
out << *factor;
if (i>0) out << " " << *factor;
else out << *factor;
}
out << " ||| ";
out << "map: " << si->GetMapScore();
out << " |||";
out << " map: " << si->GetMapScore();
out << " w: " << mbrHypo.size();
const vector<float>& ngramScores = si->GetNgramScores();
for (size_t i = 0; i < ngramScores.size(); ++i) {
out << " " << ngramScores[i];
}
out << " ||| ";
out << si->GetScore();
out << " ||| " << si->GetScore();
out << endl;
}

View File

@ -237,6 +237,7 @@ void pruneLatticeFB(Lattice & connectedHyp, map < const Hypothesis*, set <const
const ArcList *arcList = succHyp->GetArcList();
if (arcList != NULL) {
ArcList::const_iterator iterArcList;
//QUESTION: What happens if there's more than one loserPrevHypo?
for (iterArcList = arcList->begin() ; iterArcList != arcList->end() ; ++iterArcList) {
const Hypothesis *loserHypo = *iterArcList;
const Hypothesis* loserPrevHypo = loserHypo->GetPrevHypo();

View File

@ -19,17 +19,7 @@
using namespace Moses;
template<class T>
T log_sum (T log_a, T log_b)
{
T v;
if (log_a < log_b) {
v = log_b+log ( 1 + exp ( log_a-log_b ));
} else {
v = log_a+log ( 1 + exp ( log_b-log_a ));
}
return ( v );
}
class Edge;

View File

@ -72,11 +72,13 @@ public:
TranslationTask(size_t lineNumber,
InputType* source, OutputCollector* outputCollector, OutputCollector* nbestCollector,
OutputCollector* latticeSamplesCollector,
OutputCollector* wordGraphCollector, OutputCollector* searchGraphCollector,
OutputCollector* detailedTranslationCollector,
OutputCollector* alignmentInfoCollector ) :
m_source(source), m_lineNumber(lineNumber),
m_outputCollector(outputCollector), m_nbestCollector(nbestCollector),
m_latticeSamplesCollector(latticeSamplesCollector),
m_wordGraphCollector(wordGraphCollector), m_searchGraphCollector(searchGraphCollector),
m_detailedTranslationCollector(detailedTranslationCollector),
m_alignmentInfoCollector(alignmentInfoCollector) {}
@ -240,6 +242,15 @@ public:
m_nbestCollector->Write(m_lineNumber, out.str());
}
//lattice samples
if (m_latticeSamplesCollector) {
TrellisPathList latticeSamples;
ostringstream out;
manager.CalcLatticeSamples(staticData.GetLatticeSamplesSize(), latticeSamples);
OutputNBest(out,latticeSamples, staticData.GetOutputFactorOrder(), manager.GetTranslationSystem(), m_lineNumber);
m_latticeSamplesCollector->Write(m_lineNumber, out.str());
}
// detailed translation reporting
if (m_detailedTranslationCollector) {
ostringstream out;
@ -264,6 +275,7 @@ private:
size_t m_lineNumber;
OutputCollector* m_outputCollector;
OutputCollector* m_nbestCollector;
OutputCollector* m_latticeSamplesCollector;
OutputCollector* m_wordGraphCollector;
OutputCollector* m_searchGraphCollector;
OutputCollector* m_detailedTranslationCollector;
@ -342,25 +354,6 @@ int main(int argc, char** argv)
}
// create threadpool, if using multi-threaded decoding
// note: multi-threading is done on sentence-level,
// each thread translates one sentence
int threadcount = (params->GetParam("threads").size() > 0) ?
Scan<size_t>(params->GetParam("threads")[0]) : 1;
#ifdef WITH_THREADS
if (threadcount < 1) {
cerr << "Error: Need to specify a positive number of threads" << endl;
exit(1);
}
ThreadPool pool(threadcount);
#else
if (threadcount > 1) {
cerr << "Error: Thread count of " << threadcount << " but moses not built with thread support" << endl;
exit(1);
}
#endif
// initialize all "global" variables, which are stored in StaticData
// note: this also loads models such as the language model, etc.
if (!StaticData::LoadDataStatic(params)) {
@ -376,6 +369,10 @@ int main(int argc, char** argv)
// shorthand for accessing information in StaticData
const StaticData& staticData = StaticData::Instance();
//initialise random numbers
srand(time(NULL));
// set up read/writing class
IOWrapper* ioWrapper = GetIODevice(staticData);
if (!ioWrapper) {
@ -396,21 +393,43 @@ int main(int argc, char** argv)
// because multithreading may return sentences in shuffled order
auto_ptr<OutputCollector> outputCollector; // for translations
auto_ptr<OutputCollector> nbestCollector; // for n-best lists
auto_ptr<OutputCollector> latticeSamplesCollector; //for lattice samples
auto_ptr<ofstream> nbestOut;
auto_ptr<ofstream> latticeSamplesOut;
size_t nbestSize = staticData.GetNBestSize();
string nbestFile = staticData.GetNBestFilePath();
bool output1best = true;
if (nbestSize) {
if (nbestFile == "-" || nbestFile == "/dev/stdout") {
// nbest to stdout, no 1-best
nbestCollector.reset(new OutputCollector());
output1best = false;
} else {
// nbest to file, 1-best to stdout
nbestOut.reset(new ofstream(nbestFile.c_str()));
assert(nbestOut->good());
if (!nbestOut->good()) {
TRACE_ERR("ERROR: Failed to open " << nbestFile << " for nbest lists" << endl);
exit(1);
}
nbestCollector.reset(new OutputCollector(nbestOut.get()));
outputCollector.reset(new OutputCollector());
}
} else {
}
size_t latticeSamplesSize = staticData.GetLatticeSamplesSize();
string latticeSamplesFile = staticData.GetLatticeSamplesFilePath();
if (latticeSamplesSize) {
if (latticeSamplesFile == "-" || latticeSamplesFile == "/dev/stdout") {
latticeSamplesCollector.reset(new OutputCollector());
output1best = false;
} else {
latticeSamplesOut.reset(new ofstream(latticeSamplesFile.c_str()));
if (!latticeSamplesOut->good()) {
TRACE_ERR("ERROR: Failed to open " << latticeSamplesFile << " for lattice samples" << endl);
exit(1);
}
latticeSamplesCollector.reset(new OutputCollector(latticeSamplesOut.get()));
}
}
if (output1best) {
outputCollector.reset(new OutputCollector());
}
@ -439,6 +458,10 @@ int main(int argc, char** argv)
alignmentInfoCollector.reset(new OutputCollector(ioWrapper->GetAlignmentOutputStream()));
}
#ifdef WITH_THREADS
ThreadPool pool(staticData.ThreadCount());
#endif
// main loop over set of input sentences
InputType* source = NULL;
size_t lineCount = 0;
@ -449,13 +472,15 @@ int main(int argc, char** argv)
// set up task of translating one sentence
TranslationTask* task =
new TranslationTask(lineCount,source, outputCollector.get(),
nbestCollector.get(), wordGraphCollector.get(),
nbestCollector.get(),
latticeSamplesCollector.get(),
wordGraphCollector.get(),
searchGraphCollector.get(),
detailedTranslationCollector.get(),
alignmentInfoCollector.get() );
// execute task
#ifdef WITH_THREADS
pool.Submit(task);
pool.Submit(task);
#else
task->Run();
#endif

View File

@ -183,6 +183,7 @@
<ClCompile Include="src\InputFileStream.cpp" />
<ClCompile Include="src\InputType.cpp" />
<ClCompile Include="src\LanguageModel.cpp" />
<ClCompile Include="src\LanguageModelChartState.cpp" />
<ClCompile Include="src\LanguageModelFactory.cpp" />
<ClCompile Include="src\LanguageModelImplementation.cpp" />
<ClCompile Include="src\LanguageModelInternal.cpp">
@ -238,6 +239,9 @@
<ClCompile Include="src\RuleCube.cpp" />
<ClCompile Include="src\RuleCubeItem.cpp" />
<ClCompile Include="src\RuleCubeQueue.cpp" />
<ClCompile Include="src\RuleTableLoaderCompact.cpp" />
<ClCompile Include="src\RuleTableLoaderFactory.cpp" />
<ClCompile Include="src\RuleTableLoaderStandard.cpp" />
<ClCompile Include="src\ScoreComponentCollection.cpp" />
<ClCompile Include="src\ScoreIndexManager.cpp" />
<ClCompile Include="src\ScoreProducer.cpp" />
@ -321,6 +325,7 @@
<ClInclude Include="src\InputFileStream.h" />
<ClInclude Include="src\InputType.h" />
<ClInclude Include="src\LanguageModel.h" />
<ClInclude Include="src\LanguageModelChartState.h" />
<ClInclude Include="src\LanguageModelDelegate.h" />
<ClInclude Include="src\LanguageModelFactory.h" />
<ClInclude Include="src\LanguageModelImplementation.h" />
@ -367,6 +372,10 @@
<ClInclude Include="src\RuleCube.h" />
<ClInclude Include="src\RuleCubeItem.h" />
<ClInclude Include="src\RuleCubeQueue.h" />
<ClInclude Include="src\RuleTableLoader.h" />
<ClInclude Include="src\RuleTableLoaderCompact.h" />
<ClInclude Include="src\RuleTableLoaderFactory.h" />
<ClInclude Include="src\RuleTableLoaderStandard.h" />
<ClInclude Include="src\ScoreComponentCollection.h" />
<ClInclude Include="src\ScoreIndexManager.h" />
<ClInclude Include="src\ScoreProducer.h" />

View File

@ -49,6 +49,8 @@
1E528B9E13A12B2D00E9A67E /* SyntacticLanguageModel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1E528B9C13A12B2D00E9A67E /* SyntacticLanguageModel.cpp */; };
1E5AAF1512B25C9E0071864D /* LanguageModelImplementation.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1E5AAF1312B25C9E0071864D /* LanguageModelImplementation.cpp */; };
1E5AAF1612B25C9E0071864D /* LanguageModelImplementation.h in Headers */ = {isa = PBXBuildFile; fileRef = 1E5AAF1412B25C9E0071864D /* LanguageModelImplementation.h */; };
1E9CB84514166D4A00EDB7FC /* LanguageModelChartState.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1E9CB84314166D4A00EDB7FC /* LanguageModelChartState.cpp */; };
1E9CB84614166D4A00EDB7FC /* LanguageModelChartState.h in Headers */ = {isa = PBXBuildFile; fileRef = 1E9CB84414166D4A00EDB7FC /* LanguageModelChartState.h */; };
1EA6AB4A13BCC838004465AF /* ChartRuleLookupManager.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EA6AB4813BCC838004465AF /* ChartRuleLookupManager.cpp */; };
1EA6AB4B13BCC838004465AF /* ChartRuleLookupManager.h in Headers */ = {isa = PBXBuildFile; fileRef = 1EA6AB4913BCC838004465AF /* ChartRuleLookupManager.h */; };
1EBB262913A12DB500B51840 /* hash.h in Headers */ = {isa = PBXBuildFile; fileRef = 1EBB262213A12DB500B51840 /* hash.h */; };
@ -60,6 +62,12 @@
1EBB262F13A12DB500B51840 /* RandLMFilter.h in Headers */ = {isa = PBXBuildFile; fileRef = 1EBB262813A12DB500B51840 /* RandLMFilter.h */; };
1ED00036124BC2690029177F /* ChartTranslationOption.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1ED00034124BC2690029177F /* ChartTranslationOption.cpp */; };
1ED00037124BC2690029177F /* ChartTranslationOption.h in Headers */ = {isa = PBXBuildFile; fileRef = 1ED00035124BC2690029177F /* ChartTranslationOption.h */; };
1ED0DE291432A0D200C20FBE /* RuleTableLoaderCompact.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1ED0DE1D1432A0D100C20FBE /* RuleTableLoaderCompact.cpp */; };
1ED0DE2A1432A0D200C20FBE /* RuleTableLoaderCompact.h in Headers */ = {isa = PBXBuildFile; fileRef = 1ED0DE1E1432A0D100C20FBE /* RuleTableLoaderCompact.h */; };
1ED0DE2C1432A0D200C20FBE /* RuleTableLoaderFactory.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1ED0DE211432A0D100C20FBE /* RuleTableLoaderFactory.cpp */; };
1ED0DE2D1432A0D200C20FBE /* RuleTableLoaderFactory.h in Headers */ = {isa = PBXBuildFile; fileRef = 1ED0DE221432A0D100C20FBE /* RuleTableLoaderFactory.h */; };
1ED0DE2F1432A0D200C20FBE /* RuleTableLoaderStandard.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1ED0DE251432A0D100C20FBE /* RuleTableLoaderStandard.cpp */; };
1ED0DE301432A0D200C20FBE /* RuleTableLoaderStandard.h in Headers */ = {isa = PBXBuildFile; fileRef = 1ED0DE261432A0D100C20FBE /* RuleTableLoaderStandard.h */; };
1ED0FE2A124BB9380029177F /* AlignmentInfo.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1ED0FD4C124BB9380029177F /* AlignmentInfo.cpp */; };
1ED0FE2B124BB9380029177F /* AlignmentInfo.h in Headers */ = {isa = PBXBuildFile; fileRef = 1ED0FD4D124BB9380029177F /* AlignmentInfo.h */; };
1ED0FE2C124BB9380029177F /* BilingualDynSuffixArray.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1ED0FD4E124BB9380029177F /* BilingualDynSuffixArray.cpp */; };
@ -317,6 +325,8 @@
1E528B9C13A12B2D00E9A67E /* SyntacticLanguageModel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = SyntacticLanguageModel.cpp; path = src/SyntacticLanguageModel.cpp; sourceTree = "<group>"; };
1E5AAF1312B25C9E0071864D /* LanguageModelImplementation.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = LanguageModelImplementation.cpp; path = src/LanguageModelImplementation.cpp; sourceTree = "<group>"; };
1E5AAF1412B25C9E0071864D /* LanguageModelImplementation.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = LanguageModelImplementation.h; path = src/LanguageModelImplementation.h; sourceTree = "<group>"; };
1E9CB84314166D4A00EDB7FC /* LanguageModelChartState.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = LanguageModelChartState.cpp; path = src/LanguageModelChartState.cpp; sourceTree = "<group>"; };
1E9CB84414166D4A00EDB7FC /* LanguageModelChartState.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = LanguageModelChartState.h; path = src/LanguageModelChartState.h; sourceTree = "<group>"; };
1EA6AB4813BCC838004465AF /* ChartRuleLookupManager.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = ChartRuleLookupManager.cpp; path = src/ChartRuleLookupManager.cpp; sourceTree = "<group>"; };
1EA6AB4913BCC838004465AF /* ChartRuleLookupManager.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ChartRuleLookupManager.h; path = src/ChartRuleLookupManager.h; sourceTree = "<group>"; };
1EBB262213A12DB500B51840 /* hash.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = hash.h; path = src/DynSAInclude/hash.h; sourceTree = "<group>"; };
@ -328,6 +338,12 @@
1EBB262813A12DB500B51840 /* RandLMFilter.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = RandLMFilter.h; path = src/DynSAInclude/RandLMFilter.h; sourceTree = "<group>"; };
1ED00034124BC2690029177F /* ChartTranslationOption.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = ChartTranslationOption.cpp; path = src/ChartTranslationOption.cpp; sourceTree = "<group>"; };
1ED00035124BC2690029177F /* ChartTranslationOption.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ChartTranslationOption.h; path = src/ChartTranslationOption.h; sourceTree = "<group>"; };
1ED0DE1D1432A0D100C20FBE /* RuleTableLoaderCompact.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = RuleTableLoaderCompact.cpp; path = src/RuleTableLoaderCompact.cpp; sourceTree = "<group>"; };
1ED0DE1E1432A0D100C20FBE /* RuleTableLoaderCompact.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = RuleTableLoaderCompact.h; path = src/RuleTableLoaderCompact.h; sourceTree = "<group>"; };
1ED0DE211432A0D100C20FBE /* RuleTableLoaderFactory.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = RuleTableLoaderFactory.cpp; path = src/RuleTableLoaderFactory.cpp; sourceTree = "<group>"; };
1ED0DE221432A0D100C20FBE /* RuleTableLoaderFactory.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = RuleTableLoaderFactory.h; path = src/RuleTableLoaderFactory.h; sourceTree = "<group>"; };
1ED0DE251432A0D100C20FBE /* RuleTableLoaderStandard.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = RuleTableLoaderStandard.cpp; path = src/RuleTableLoaderStandard.cpp; sourceTree = "<group>"; };
1ED0DE261432A0D100C20FBE /* RuleTableLoaderStandard.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = RuleTableLoaderStandard.h; path = src/RuleTableLoaderStandard.h; sourceTree = "<group>"; };
1ED0FD4C124BB9380029177F /* AlignmentInfo.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = AlignmentInfo.cpp; path = src/AlignmentInfo.cpp; sourceTree = "<group>"; };
1ED0FD4D124BB9380029177F /* AlignmentInfo.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = AlignmentInfo.h; path = src/AlignmentInfo.h; sourceTree = "<group>"; };
1ED0FD4E124BB9380029177F /* BilingualDynSuffixArray.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = BilingualDynSuffixArray.cpp; path = src/BilingualDynSuffixArray.cpp; sourceTree = "<group>"; };
@ -670,6 +686,8 @@
1ED0FDBC124BB9380029177F /* LVoc.h */,
1ED0FD98124BB9380029177F /* LanguageModel.cpp */,
1ED0FD99124BB9380029177F /* LanguageModel.h */,
1E9CB84314166D4A00EDB7FC /* LanguageModelChartState.cpp */,
1E9CB84414166D4A00EDB7FC /* LanguageModelChartState.h */,
1ED0FD9B124BB9380029177F /* LanguageModelFactory.cpp */,
1ED0FD9C124BB9380029177F /* LanguageModelFactory.h */,
1ED0FD9F124BB9380029177F /* LanguageModelIRST.cpp */,
@ -755,6 +773,12 @@
1E46B5A513BA5C7F0084F898 /* RuleCubeItem.h */,
1E2E1635132A892800ED4085 /* RuleCubeQueue.cpp */,
1E2E1636132A892800ED4085 /* RuleCubeQueue.h */,
1ED0DE1D1432A0D100C20FBE /* RuleTableLoaderCompact.cpp */,
1ED0DE1E1432A0D100C20FBE /* RuleTableLoaderCompact.h */,
1ED0DE211432A0D100C20FBE /* RuleTableLoaderFactory.cpp */,
1ED0DE221432A0D100C20FBE /* RuleTableLoaderFactory.h */,
1ED0DE251432A0D100C20FBE /* RuleTableLoaderStandard.cpp */,
1ED0DE261432A0D100C20FBE /* RuleTableLoaderStandard.h */,
1ED0FDEB124BB9380029177F /* ScoreComponentCollection.cpp */,
1ED0FDEC124BB9380029177F /* ScoreComponentCollection.h */,
1ED0FDED124BB9380029177F /* ScoreIndexManager.cpp */,
@ -1012,6 +1036,10 @@
1E46B5A713BA5C7F0084F898 /* RuleCubeItem.h in Headers */,
1EA6AB4B13BCC838004465AF /* ChartRuleLookupManager.h in Headers */,
1ED22EE613DD96B0000DE8C9 /* DotChartInMemory.h in Headers */,
1E9CB84614166D4A00EDB7FC /* LanguageModelChartState.h in Headers */,
1ED0DE2A1432A0D200C20FBE /* RuleTableLoaderCompact.h in Headers */,
1ED0DE2D1432A0D200C20FBE /* RuleTableLoaderFactory.h in Headers */,
1ED0DE301432A0D200C20FBE /* RuleTableLoaderStandard.h in Headers */,
);
runOnlyForDeploymentPostprocessing = 0;
};
@ -1188,6 +1216,10 @@
1E46B5A613BA5C7F0084F898 /* RuleCubeItem.cpp in Sources */,
1EA6AB4A13BCC838004465AF /* ChartRuleLookupManager.cpp in Sources */,
1ED22EE513DD96B0000DE8C9 /* DotChartInMemory.cpp in Sources */,
1E9CB84514166D4A00EDB7FC /* LanguageModelChartState.cpp in Sources */,
1ED0DE291432A0D200C20FBE /* RuleTableLoaderCompact.cpp in Sources */,
1ED0DE2C1432A0D200C20FBE /* RuleTableLoaderFactory.cpp in Sources */,
1ED0DE2F1432A0D200C20FBE /* RuleTableLoaderStandard.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
@ -1211,7 +1243,8 @@
LM_KEN,
LM_IRST,
LM_RAND,
HAVE_CONFIG_H,
"_FILE_OFFSET_BITS=64",
_LARGE_FILES,
);
HEADER_SEARCH_PATHS = (
../irstlm/include,
@ -1242,7 +1275,8 @@
LM_KEN,
LM_IRST,
LM_RAND,
HAVE_CONFIG_H,
"_FILE_OFFSET_BITS=64",
_LARGE_FILES,
);
HEADER_SEARCH_PATHS = (
../irstlm/include,

View File

@ -183,7 +183,7 @@ size_t ChartCell::GetSize() const
return ret;
}
void ChartCell::GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream, const std::map<int,bool> &reachable) const
void ChartCell::GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream, const std::map<const ChartHypothesis *,bool> &reachable) const
{
std::map<Word, ChartHypothesisCollection>::const_iterator iterOutside;
for (iterOutside = m_hypoColl.begin(); iterOutside != m_hypoColl.end(); ++iterOutside) {

View File

@ -90,7 +90,7 @@ public:
return m_coverage < compare.m_coverage;
}
void GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream, const std::map<int,bool> &reachable) const;
void GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream, const std::map<const ChartHypothesis *,bool> &reachable) const;
};

View File

@ -35,7 +35,6 @@
namespace Moses
{
unsigned int ChartHypothesis::s_HypothesesCreated = 0;
#ifdef USE_HYPO_POOL
ObjectPool<ChartHypothesis> ChartHypothesis::s_objectPool("ChartHypothesis", 300000);
@ -45,8 +44,7 @@ ObjectPool<ChartHypothesis> ChartHypothesis::s_objectPool("ChartHypothesis", 300
ChartHypothesis::ChartHypothesis(const ChartTranslationOption &transOpt,
const RuleCubeItem &item,
ChartManager &manager)
:m_id(++s_HypothesesCreated)
,m_targetPhrase(*(item.GetTranslationDimension().GetTargetPhrase()))
:m_targetPhrase(*(item.GetTranslationDimension().GetTargetPhrase()))
,m_transOpt(transOpt)
,m_contextPrefix(Output, manager.GetTranslationSystem()->GetLanguageModels().GetMaxNGramOrder())
,m_contextSuffix(Output, manager.GetTranslationSystem()->GetLanguageModels().GetMaxNGramOrder())
@ -362,13 +360,13 @@ TO_STRING_BODY(ChartHypothesis)
std::ostream& operator<<(std::ostream& out, const ChartHypothesis& hypo)
{
out << hypo.GetId();
out << &hypo;
// recombination
if (hypo.GetWinningHypothesis() != NULL &&
hypo.GetWinningHypothesis()->GetId() != hypo.GetId())
hypo.GetWinningHypothesis() != &hypo)
{
out << "->" << hypo.GetWinningHypothesis()->GetId();
out << "->" << hypo.GetWinningHypothesis();
}
out << " " << hypo.GetCurrTargetPhrase()
@ -378,7 +376,7 @@ std::ostream& operator<<(std::ostream& out, const ChartHypothesis& hypo)
HypoList::const_iterator iter;
for (iter = hypo.GetPrevHypos().begin(); iter != hypo.GetPrevHypos().end(); ++iter) {
const ChartHypothesis &prevHypo = **iter;
out << " " << prevHypo.GetId();
out << " " << &prevHypo;
}
out << " [total=" << hypo.GetTotalScore() << "]";

View File

@ -21,6 +21,10 @@
#pragma once
#if HAVE_CONFIG_H
#include "config.h"
#endif
#include <vector>
#include "Util.h"
#include "WordsRange.h"
@ -43,14 +47,10 @@ class ChartHypothesis
friend std::ostream& operator<<(std::ostream&, const ChartHypothesis&);
protected:
#ifdef USE_HYPO_POOL
static ObjectPool<ChartHypothesis> s_objectPool;
#endif
static unsigned int s_HypothesesCreated;
int m_id; /**< numeric ID of this hypothesis, used for logging */
const TargetPhrase &m_targetPhrase;
const ChartTranslationOption &m_transOpt;
@ -77,13 +77,6 @@ protected:
ChartHypothesis(const ChartHypothesis &copy); // not implemented
public:
static void ResetHypoCount() {
s_HypothesesCreated = 0;
}
static unsigned int GetHypoCount() {
return s_HypothesesCreated;
}
#ifdef USE_HYPO_POOL
void *operator new(size_t /* num_bytes */) {
void *ptr = s_objectPool.getPtr();
@ -104,9 +97,8 @@ public:
~ChartHypothesis();
int GetId()const {
return m_id;
}
const ChartHypothesis *GetId() const { return this; }
const ChartTranslationOption &GetTranslationOption()const {
return m_transOpt;
}

View File

@ -254,7 +254,7 @@ void ChartHypothesisCollection::CleanupArcList()
}
}
void ChartHypothesisCollection::GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream, const std::map<int,bool> &reachable) const
void ChartHypothesisCollection::GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream, const std::map<const ChartHypothesis *, bool> &reachable) const
{
HCType::const_iterator iter;
for (iter = m_hypos.begin() ; iter != m_hypos.end() ; ++iter) {

View File

@ -115,7 +115,7 @@ public:
float GetBestScore() const { return m_bestScore; }
void GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream, const std::map<int,bool> &reachable) const;
void GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream, const std::map<const ChartHypothesis *,bool> &reachable) const;
};

View File

@ -106,7 +106,6 @@ void ChartManager::ProcessSentence()
}
IFVERBOSE(1) {
cerr << "Num of hypo = " << ChartHypothesis::GetHypoCount() << " --- cells:" << endl;
for (size_t startPos = 0; startPos < size; ++startPos) {
cerr.width(3);
@ -201,7 +200,7 @@ void ChartManager::GetSearchGraph(long translationId, std::ostream &outputSearch
size_t size = m_source.GetSize();
// which hypotheses are reachable?
std::map<int,bool> reachable;
std::map<const ChartHypothesis *,bool> reachable;
WordsRange fullRange(0, size-1);
const ChartCell &lastCell = m_hypoStackColl.Get(fullRange);
const ChartHypothesis *hypo = lastCell.GetBestHypothesis();
@ -224,7 +223,7 @@ void ChartManager::GetSearchGraph(long translationId, std::ostream &outputSearch
}
}
void ChartManager::FindReachableHypotheses( const ChartHypothesis *hypo, std::map<int,bool> &reachable ) const
void ChartManager::FindReachableHypotheses( const ChartHypothesis *hypo, std::map<const ChartHypothesis *,bool> &reachable ) const
{
// do not recurse, if already visited
if (reachable.find(hypo->GetId()) != reachable.end())

View File

@ -57,7 +57,7 @@ public:
void CalcNBest(size_t count, ChartTrellisPathList &ret,bool onlyDistinct=0) const;
void GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream) const;
void FindReachableHypotheses( const ChartHypothesis *hypo, std::map<int,bool> &reachable ) const; /* auxiliary function for GetSearchGraph */
void FindReachableHypotheses( const ChartHypothesis *hypo, std::map<const ChartHypothesis *,bool> &reachable ) const; /* auxiliary function for GetSearchGraph */
const InputType& GetSource() const {
return m_source;

View File

@ -29,19 +29,6 @@ using namespace std;
namespace Moses
{
Factor::Factor(FactorDirection /* direction */, FactorType /* factorType */, const std::string *factorString, size_t id)
://m_direction(direction)
//,m_factorType(factorType)
m_ptrString(factorString)
,m_id(id)
{}
Factor::Factor(FactorDirection /* direction */, FactorType /* factorType */, const std::string *factorString)
//:m_direction(direction)
//,m_factorType(factorType)
:m_ptrString(factorString)
,m_id(NOT_FOUND)
{}
TO_STRING_BODY(Factor)

View File

@ -22,27 +22,20 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#ifndef moses_Factor_h
#define moses_Factor_h
#include <sstream>
#include <iostream>
#include <list>
#include <vector>
#include <map>
#include <ostream>
#include <string>
#include "TypeDef.h"
#include "Util.h"
#include "hash.h"
namespace Moses
{
class FactorFriend;
class FactorCollection;
/** Represents a factor (word, POS, etc) on the E or F side
*
* A Factor object is a tuple of direction (Input or Output,
* corresponding to French or English), a type (surface form,
* POS, stem, etc), and the value of the factor.
/** Represents a factor (word, POS, etc).
*
* A Factor has a contiguous identifier and string value.
*/
class Factor
{
@ -50,77 +43,46 @@ class Factor
// only these classes are allowed to instantiate this class
friend class FactorCollection;
friend class FactorFriend;
protected:
//FactorDirection m_direction;
//FactorType m_factorType;
const std::string *m_ptrString;
const size_t m_id;
// FactorCollection writes here.
std::string m_string;
size_t m_id;
//! protected constructor. only friend class, FactorCollection, is allowed to create Factor objects
Factor(FactorDirection direction, FactorType factorType, const std::string *factorString, size_t id);
//! no id set. do not used to create new factors, only used for seeing if factor exists
Factor(FactorDirection direction, FactorType factorType, const std::string *factorString);
Factor() {}
// Needed for STL containers. They'll delegate through FactorFriend, which is never exposed publicly.
Factor(const Factor &factor) : m_string(factor.m_string), m_id(factor.m_id) {}
// Not implemented. Shouldn't be called.
Factor &operator=(const Factor &factor);
public:
//! returns whether this factor is part of the source ('Input') or target ('Output') language
//inline FactorDirection GetFactorDirection() const
//{
// return m_direction;
//}
//! index, FactorType. For example, 0=surface, 1=POS. The actual mapping is user defined
//inline FactorType GetFactorType() const
//{
// return m_factorType;
//}
//! original string representation of the factor
inline const std::string &GetString() const {
return *m_ptrString;
return m_string;
}
//! contiguous ID
inline size_t GetId() const {
return m_id;
}
/*
//! Alternative comparison between factors. Not yet used
inline unsigned int GetHash() const
{
unsigned int h=quick_hash((const char*)&m_direction, sizeof(FactorDirection), 0xc7e7f2fd);
h=quick_hash((const char*)&m_factorType, sizeof(FactorType), h);
h=quick_hash((const char*)&m_ptrString, sizeof(const std::string *), h);
return h;
}
*/
/** transitive comparison between 2 factors.
* -1 = less than
* +1 = more than
* 0 = same
* Used by operator< & operator==, as well as other classes
*/
inline int Compare(const Factor &compare) const {
if (m_ptrString < compare.m_ptrString)
if (this < &compare)
return -1;
if (m_ptrString > compare.m_ptrString)
if (this > &compare)
return 1;
/*
if (m_direction < compare.m_direction)
return -1;
if (m_direction > compare.m_direction)
return 1;
if (m_factorType < compare.m_factorType)
return -1;
if (m_factorType > compare.m_factorType)
return 1;
*/
return 0;
}
//! transitive comparison used for adding objects into FactorCollection
inline bool operator<(const Factor &compare) const {
return Compare(compare) < 0;
return this < &compare;
}
// quick equality comparison. Not used

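The switch to address comparison above relies on interning: FactorCollection creates each distinct string exactly once, so pointer identity coincides with string identity. A sketch of the invariant (CheckInterning is a hypothetical helper, not Moses code):

#include <cassert>

#include "FactorCollection.h"

void CheckInterning() {
  Moses::FactorCollection &fc = Moses::FactorCollection::Instance();
  const Moses::Factor *a = fc.AddFactor("house");
  const Moses::Factor *b = fc.AddFactor("house"); // same string, second call
  assert(a == b);              // interning: one Factor per distinct string
  assert(a->Compare(*b) == 0); // so address-based Compare() is consistent
}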
View File

@ -19,12 +19,12 @@ License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <iostream>
#include <fstream>
#ifdef HAVE_BOOST
#include <boost/version.hpp>
#endif
#include <ostream>
#include <string>
#include <vector>
#include "FactorCollection.h"
#include "LanguageModel.h"
#include "Util.h"
using namespace std;
@ -33,79 +33,54 @@ namespace Moses
{
FactorCollection FactorCollection::s_instance;
void FactorCollection::LoadVocab(FactorDirection direction, FactorType factorType, const string &filePath)
const Factor *FactorCollection::AddFactor(const string &factorString)
{
ifstream inFile(filePath.c_str());
string line;
// Sorry this is so complicated. Can't we just require everybody to use Boost >= 1.42? The issue is that I can't check BOOST_VERSION unless we have Boost.
#ifdef WITH_THREADS
boost::upgrade_lock<boost::shared_mutex> lock(m_accessLock);
boost::upgrade_to_unique_lock<boost::shared_mutex> uniqueLock(lock);
#endif
while( !getline(inFile, line, '\n').eof()) {
vector<string> token = Tokenize( line );
if (token.size() < 2) {
continue;
}
// looks like good line
AddFactor(direction, factorType, token[1]);
#if BOOST_VERSION < 104200
FactorFriend to_ins;
to_ins.in.m_string.assign(factorString.data(), factorString.size());
#endif // BOOST_VERSION
{
boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
#if BOOST_VERSION >= 104200
// If this line doesn't compile, upgrade your Boost.
Set::const_iterator i = m_set.find(factorString, HashFactor(), EqualsFactor());
#else // BOOST_VERSION
Set::const_iterator i = m_set.find(to_ins);
#endif // BOOST_VERSION
if (i != m_set.end()) return &i->in;
}
boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
#if BOOST_VERSION >= 104200
FactorFriend to_ins;
to_ins.in.m_string.assign(factorString.data(), factorString.size());
#endif // BOOST_VERSION
#else // WITH_THREADS
FactorFriend to_ins;
to_ins.in.m_string.assign(factorString.data(), factorString.size());
#endif // WITH_THREADS
to_ins.in.m_id = m_factorId;
std::pair<Set::iterator, bool> ret(m_set.insert(to_ins));
if (ret.second) {
m_factorId++;
}
return &ret.first->in;
}
bool FactorCollection::Exists(FactorDirection direction, FactorType factorType, const string &factorString)
{
#ifdef WITH_THREADS
boost::shared_lock<boost::shared_mutex> lock(m_accessLock);
#endif
// find string id
const string *ptrString=&(*m_factorStringCollection.insert(factorString).first);
FactorSet::const_iterator iterFactor;
Factor search(direction, factorType, ptrString); // id not used for searching
iterFactor = m_collection.find(search);
return iterFactor != m_collection.end();
}
const Factor *FactorCollection::AddFactor(FactorDirection direction
, FactorType factorType
, const string &factorString)
{
#ifdef WITH_THREADS
boost::upgrade_lock<boost::shared_mutex> lock(m_accessLock);
boost::upgrade_to_unique_lock<boost::shared_mutex> uniqueLock(lock);
#endif
// find string id
const string *ptrString=&(*m_factorStringCollection.insert(factorString).first);
pair<FactorSet::iterator, bool> ret = m_collection.insert( Factor(direction, factorType, ptrString, m_factorId) );
if (ret.second)
++m_factorId; // new factor, make sure next new factor has a different id
const Factor *factor = &(*ret.first);
return factor;
}
FactorCollection::~FactorCollection()
{
//FactorSet::iterator iter;
//for (iter = m_collection.begin() ; iter != m_collection.end() ; iter++)
//{
// delete (*iter);
//}
}
FactorCollection::~FactorCollection() {}
TO_STRING_BODY(FactorCollection);
// friend
ostream& operator<<(ostream& out, const FactorCollection& factorCollection)
{
FactorSet::const_iterator iterFactor;
for (iterFactor = factorCollection.m_collection.begin() ; iterFactor != factorCollection.m_collection.end() ; ++iterFactor) {
const Factor &factor = *iterFactor;
out << factor;
#ifdef WITH_THREADS
boost::shared_lock<boost::shared_mutex> lock(factorCollection.m_accessLock);
#endif
for (FactorCollection::Set::const_iterator i = factorCollection.m_set.begin(); i != factorCollection.m_set.end(); ++i) {
out << i->in;
}
return out;
}
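The locking discipline above (shared lock to look up, exclusive lock to insert, with no second find because insert is idempotent) is the classic read-mostly interning pattern. A reduced sketch, assuming Boost threads and illustrative names:

#include <cstddef>
#include <map>
#include <string>
#include <utility>

#include <boost/thread/locks.hpp>
#include <boost/thread/shared_mutex.hpp>

class Interner {
 public:
  std::size_t Id(const std::string &s) {
    {
      boost::shared_lock<boost::shared_mutex> read_lock(lock_); // many readers
      std::map<std::string, std::size_t>::const_iterator i = ids_.find(s);
      if (i != ids_.end()) return i->second;
    }
    boost::unique_lock<boost::shared_mutex> write_lock(lock_);  // one writer
    // If another thread inserted s between the two locks, insert() is a
    // no-op and we return the id that thread chose.
    return ids_.insert(std::make_pair(s, ids_.size())).first->second;
  }
 private:
  std::map<std::string, std::size_t> ids_;
  boost::shared_mutex lock_;
};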

View File

@ -22,22 +22,39 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#ifndef moses_FactorCollection_h
#define moses_FactorCollection_h
#include <set>
#include <string>
#if HAVE_CONFIG_H
#include "config.h"
#endif
#ifdef WITH_THREADS
#include <boost/thread/shared_mutex.hpp>
#endif
#ifdef HAVE_BOOST
#include "util/murmur_hash.hh"
#include <boost/unordered_set.hpp>
#else
#include <set>
#endif
#include <functional>
#include <string>
#include "Factor.h"
namespace Moses
{
class LanguageModel;
typedef std::set<Factor> FactorSet;
typedef std::set<std::string> StringSet;
/* We don't want Factor to be copyable by anybody. But we also want to store
* it in an STL container. The solution is that Factor's copy constructor is
* private and friended to FactorFriend. The STL containers can delegate
* copying, so friending the container isn't sufficient. STL containers see
* FactorFriend's public copy constructor and everybody else sees Factor's
* private copy constructor.
*/
struct FactorFriend {
Factor in;
};
/** collection of factors
*
@ -51,16 +68,44 @@ class FactorCollection
{
friend std::ostream& operator<<(std::ostream&, const FactorCollection&);
protected:
#ifdef HAVE_BOOST
struct HashFactor : public std::unary_function<const FactorFriend &, std::size_t> {
std::size_t operator()(const std::string &str) const {
return util::MurmurHashNative(str.data(), str.size());
}
std::size_t operator()(const FactorFriend &factor) const {
return (*this)(factor.in.GetString());
}
};
struct EqualsFactor : public std::binary_function<const FactorFriend &, const FactorFriend &, bool> {
bool operator()(const FactorFriend &left, const FactorFriend &right) const {
return left.in.GetString() == right.in.GetString();
}
bool operator()(const FactorFriend &left, const std::string &right) const {
return left.in.GetString() == right;
}
bool operator()(const std::string &left, const FactorFriend &right) const {
return left == right.in.GetString();
}
};
typedef boost::unordered_set<FactorFriend, HashFactor, EqualsFactor> Set;
#else
struct LessFactor : public std::binary_function<const FactorFriend &, const FactorFriend &, bool> {
bool operator()(const FactorFriend &left, const FactorFriend &right) const {
return left.in.GetString() < right.in.GetString();
}
};
typedef std::set<FactorFriend, LessFactor> Set;
#endif
Set m_set;
static FactorCollection s_instance;
#ifdef WITH_THREADS
//reader-writer lock
boost::shared_mutex m_accessLock;
mutable boost::shared_mutex m_accessLock;
#endif
size_t m_factorId; /**< unique, contiguous ids, starting from 0, for each factor */
FactorSet m_collection; /**< collection of all factors */
StringSet m_factorStringCollection; /**< collection of unique string used by factors */
size_t m_factorId; /**< unique, contiguous ids, starting from 0, for each factor */
//! constructor. only the 1 static variable can be created
FactorCollection()
@ -72,17 +117,17 @@ public:
return s_instance;
}
//! Destructor
~FactorCollection();
//! Test to see whether a factor exists
bool Exists(FactorDirection direction, FactorType factorType, const std::string &factorString);
/** returns a factor with the same direction, factorType and factorString.
* If the factor already exists in the collection, return the existing factor; if not, create a new one
*/
const Factor *AddFactor(FactorDirection direction, FactorType factorType, const std::string &factorString);
//! Load list of factors. Deprecated
void LoadVocab(FactorDirection direction, FactorType factorType, const std::string &filePath);
const Factor *AddFactor(const std::string &factorString);
// TODO: remove calls to this function, replacing them with the simpler AddFactor(factorString)
const Factor *AddFactor(FactorDirection /*direction*/, FactorType /*factorType*/, const std::string &factorString) {
return AddFactor(factorString);
}
TO_STRING();
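The FactorFriend indirection documented above is reusable on its own. A minimal self-contained sketch of the trick (Guarded/Wrapper are illustrative names, not Moses code):

#include <set>

struct Wrapper;                 // plays the role of FactorFriend

class Guarded {                 // plays the role of Factor
  friend struct Wrapper;
  int value_;
  Guarded() : value_(0) {}
  // Private copy constructor: only Wrapper's implicitly-defined members,
  // which are friends, may call it.
  Guarded(const Guarded &other) : value_(other.value_) {}
 public:
  int Value() const { return value_; }
};

struct Wrapper {
  Guarded in;                   // Wrapper's public copy constructor delegates here
};

struct LessWrapper {
  bool operator()(const Wrapper &a, const Wrapper &b) const {
    return a.in.Value() < b.in.Value();
  }
};

void Demo() {
  std::set<Wrapper, LessWrapper> s;
  s.insert(Wrapper());          // the container copies Wrapper, never Guarded directly
}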

View File

@ -88,9 +88,9 @@ void LanguageModel::CalcScore(const Phrase &phrase
, float &ngramScore
, size_t &oovCount) const
{
fullScore = 0;
ngramScore = 0;
fullScore = 0;
ngramScore = 0;
oovCount = 0;
size_t phraseSize = phrase.GetSize();
@ -159,19 +159,19 @@ FFState* LanguageModel::Evaluate(
{
// In this function, we only compute the LM scores of n-grams that overlap a
// phrase boundary. Phrase-internal scores are taken directly from the
// translation option.
// translation option.
// In the case of unigram language models, there is no overlap, so we don't
// need to do anything.
if(GetNGramOrder() <= 1)
return NULL;
clock_t t=0;
clock_t t = 0;
IFVERBOSE(2) {
t = clock(); // track time
t = clock(); // track time
}
// Empty phrase added? nothing to be done
// Empty phrase added? nothing to be done
if (hypo.GetCurrTargetLength() == 0)
return ps ? m_implementation->NewState(ps) : NULL;
@ -202,7 +202,7 @@ FFState* LanguageModel::Evaluate(
// add last factor
contextFactor.back() = &hypo.GetWord(currPos);
lmScore += m_implementation->GetValueGivenState(contextFactor, *res).score;
lmScore += m_implementation->GetValueGivenState(contextFactor, *res).score;
}
// end of sentence
@ -217,10 +217,10 @@ FFState* LanguageModel::Evaluate(
else
contextFactor[i] = &hypo.GetWord((size_t)currPos);
}
lmScore += m_implementation->GetValueForgotState(contextFactor, *res).score;
}
else
{
lmScore += m_implementation->GetValueForgotState(contextFactor, *res).score;
}
else
{
if (endPos < currEndPos) {
//need to get the LM state (otherwise the last LM state is fine)
for (size_t currPos = endPos+1; currPos <= currEndPos; currPos++) {
@ -258,147 +258,4 @@ float LanguageModel::GetOOVWeight() const {
return 0;
}
}
FFState* LanguageModel::EvaluateChart(
const ChartHypothesis& hypo,
int featureID,
ScoreComponentCollection* out) const
{
// data structure for factored context phrase (history and predicted word)
vector<const Word*> contextFactor;
contextFactor.reserve(GetNGramOrder());
// initialize language model context state
FFState *lmState = m_implementation->NewState( m_implementation->GetNullContextState() );
// initial language model scores
float prefixScore = 0.0; // not yet final for initial words (lack context)
float finalizedScore = 0.0; // finalized, has sufficient context
// get index map for underlying hypotheses
const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
hypo.GetCurrTargetPhrase().GetAlignmentInfo().GetNonTermIndexMap();
// loop over rule
for (size_t phrasePos = 0, wordPos = 0;
phrasePos < hypo.GetCurrTargetPhrase().GetSize();
phrasePos++)
{
// consult rule for either word or non-terminal
const Word &word = hypo.GetCurrTargetPhrase().GetWord(phrasePos);
// regular word
if (!word.IsNonTerminal())
{
ShiftOrPush(contextFactor, word);
// beginning of sentence symbol <s>? -> just update state
if (word == m_implementation->GetSentenceStartArray())
{
assert(phrasePos == 0);
delete lmState;
lmState = m_implementation->NewState( m_implementation->GetBeginSentenceState() );
}
// score a regular word added by the rule
else
{
updateChartScore( &prefixScore, &finalizedScore, UntransformLMScore(m_implementation->GetValueGivenState(contextFactor, *lmState).score), ++wordPos );
}
}
// non-terminal, add phrase from underlying hypothesis
else
{
// look up underlying hypothesis
size_t nonTermIndex = nonTermIndexMap[phrasePos];
const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndex);
size_t subPhraseLength = prevHypo->GetNumTargetTerminals();
// special case: rule starts with non-terminal -> copy everything
if (phrasePos == 0) {
// get prefixScore and finalizedScore
const LanguageModelChartState* prevState =
dynamic_cast<const LanguageModelChartState*>(prevHypo->GetFFState( featureID ));
prefixScore = prevState->GetPrefixScore();
finalizedScore = prevHypo->GetScoreBreakdown().GetScoresForProducer(this)[0] - prefixScore;
// get language model state
delete lmState;
lmState = m_implementation->NewState( prevState->GetRightContext() );
// push suffix
int suffixPos = prevHypo->GetSuffix().GetSize() - (GetNGramOrder()-1);
if (suffixPos < 0) suffixPos = 0; // push all words if less than order
for(;(size_t)suffixPos < prevHypo->GetSuffix().GetSize(); suffixPos++)
{
const Word &word = prevHypo->GetSuffix().GetWord(suffixPos);
ShiftOrPush(contextFactor, word);
wordPos++;
}
}
// internal non-terminal
else
{
// score its prefix
for(size_t prefixPos = 0;
prefixPos < GetNGramOrder()-1 // up to LM order window
&& prefixPos < subPhraseLength; // up to length
prefixPos++)
{
const Word &word = prevHypo->GetPrefix().GetWord(prefixPos);
ShiftOrPush(contextFactor, word);
updateChartScore( &prefixScore, &finalizedScore, UntransformLMScore(m_implementation->GetValueGivenState(contextFactor, *lmState).score), ++wordPos );
}
// check if we are dealing with a large sub-phrase
if (subPhraseLength > GetNGramOrder() - 1)
{
// add its finalized language model score
const LanguageModelChartState* prevState =
dynamic_cast<const LanguageModelChartState*>(prevHypo->GetFFState( featureID ));
finalizedScore +=
prevHypo->GetScoreBreakdown().GetScoresForProducer(this)[0] // full score
- prevState->GetPrefixScore(); // - prefix score
// copy language model state
delete lmState;
lmState = m_implementation->NewState( prevState->GetRightContext() );
// push its suffix
size_t remainingWords = subPhraseLength - (GetNGramOrder()-1);
if (remainingWords > GetNGramOrder()-1) {
// only what is needed for the history window
remainingWords = GetNGramOrder()-1;
}
for(size_t suffixPos = prevHypo->GetSuffix().GetSize() - remainingWords;
suffixPos < prevHypo->GetSuffix().GetSize();
suffixPos++) {
const Word &word = prevHypo->GetSuffix().GetWord(suffixPos);
ShiftOrPush(contextFactor, word);
}
wordPos += subPhraseLength;
}
}
}
}
// assign combined score to score breakdown
out->Assign(this, prefixScore + finalizedScore);
// create and return feature function state
LanguageModelChartState *res = new LanguageModelChartState( prefixScore, lmState, hypo );
return res;
}
void LanguageModel::updateChartScore( float *prefixScore, float *finalizedScore, float score, size_t wordPos ) const {
if (wordPos < GetNGramOrder()) {
*prefixScore += score;
}
else {
*finalizedScore += score;
}
}
}
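The prefixScore/finalizedScore split implemented by the EvaluateChart/updateChartScore code removed here, and reintroduced in LanguageModelImplementation below, is easiest to see on a worked example (illustrative, assuming a trigram LM):

// updateChartScore() with GetNGramOrder() == 3 and rule output "the cat sat":
//   wordPos 1: p(the)           -> prefixScore    (no left context yet)
//   wordPos 2: p(cat | the)     -> prefixScore    (only one context word)
//   wordPos 3: p(sat | the cat) -> finalizedScore (full n-1 words of context)
// When the hypothesis is later embedded in a larger rule, the prefix part is
// re-scored against the newly available left context, while the finalized
// part carries over unchanged via the stored right-context state.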

View File

@ -132,12 +132,12 @@ public:
const FFState* prev_state,
ScoreComponentCollection* accumulator) const;
virtual FFState* EvaluateChart(
FFState* EvaluateChart(
const ChartHypothesis& cur_hypo,
int featureID,
ScoreComponentCollection* accumulator) const;
void updateChartScore( float *prefixScore, float *finalScore, float score, size_t wordPos ) const;
ScoreComponentCollection* accumulator) const {
return m_implementation->EvaluateChart(cur_hypo, featureID, accumulator, this);
}
#ifdef WITH_THREADS
// if multi-threaded return boost ptr

View File

@ -0,0 +1,49 @@
//
// LanguageModelChartState.cpp
// moses
//
// Created by Hieu Hoang on 06/09/2011.
// Copyright 2011 __MyCompanyName__. All rights reserved.
//
#include "LanguageModelChartState.h"
#include "ChartHypothesis.h"
#include "ChartManager.h"
namespace Moses
{
int LanguageModelChartState::Compare(const FFState& o) const
{
const LanguageModelChartState &other =
dynamic_cast<const LanguageModelChartState &>( o );
// prefix
if (m_hypo->GetCurrSourceRange().GetStartPos() > 0) // not for "<s> ..."
{
int ret = m_hypo->GetPrefix().Compare(other.GetHypothesis()->GetPrefix());
if (ret != 0)
return ret;
}
// suffix
size_t inputSize = m_hypo->GetManager().GetSource().GetSize();
if (m_hypo->GetCurrSourceRange().GetEndPos() < inputSize - 1)// not for "... </s>"
{
int ret = other.GetRightContext()->Compare( *m_lmRightContext );
if (ret != 0)
return ret;
}
// size_t inputSize = m_hypo->GetManager().GetSource().GetSize();
// if (m_hypo->GetCurrSourceRange().GetEndPos() < inputSize - 1)
// {
// int ret2 = m_hypo->GetSuffix().Compare(other.GetHypothesis()->GetSuffix());
// if (ret != 0)
// return ret;
// }
return 0;
}
} // namespace
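Stated concretely (a trigram LM assumed), the Compare() above means two chart hypotheses over the same span recombine only if their target-side prefixes and their LM right contexts agree, with two relaxations at the sentence edges:

// - A span starting at input position 0 skips the prefix test: its prefix
//   is already fully scored against <s> and can never be re-scored.
// - A span ending at the last input word skips the right-context test:
//   nothing can follow it, so the right context is irrelevant.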

View File

@ -22,68 +22,39 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#pragma once
#include "FFState.h"
#include "ChartManager.h"
#include "ChartHypothesis.h"
namespace Moses
{
class ChartHypothesis;
class Phrase;
//! Feature function state for chart-based language model scoring
class LanguageModelChartState : public FFState
{
private:
float m_prefixScore;
FFState* m_lmRightContext;
const ChartHypothesis *m_hypo;
float m_prefixScore;
FFState* m_lmRightContext;
const ChartHypothesis *m_hypo;
public:
LanguageModelChartState(float prefixScore,
FFState *lmRightContext,
const ChartHypothesis &hypo)
:m_prefixScore(prefixScore)
,m_lmRightContext(lmRightContext)
,m_hypo(&hypo)
{}
~LanguageModelChartState() {
LanguageModelChartState(float prefixScore,
FFState *lmRightContext,
const ChartHypothesis &hypo)
:m_prefixScore(prefixScore)
,m_lmRightContext(lmRightContext)
,m_hypo(&hypo)
{}
~LanguageModelChartState() {
delete m_lmRightContext;
}
float GetPrefixScore() const { return m_prefixScore; }
FFState* GetRightContext() const { return m_lmRightContext; }
const ChartHypothesis* GetHypothesis() const { return m_hypo; }
float GetPrefixScore() const { return m_prefixScore; }
FFState* GetRightContext() const { return m_lmRightContext; }
const ChartHypothesis* GetHypothesis() const { return m_hypo; }
int Compare(const FFState& o) const
{
const LanguageModelChartState &other =
dynamic_cast<const LanguageModelChartState &>( o );
// prefix
if (m_hypo->GetCurrSourceRange().GetStartPos() > 0) // not for "<s> ..."
{
int ret = m_hypo->GetPrefix().Compare(other.GetHypothesis()->GetPrefix());
if (ret != 0)
return ret;
}
// suffix
size_t inputSize = m_hypo->GetManager().GetSource().GetSize();
if (m_hypo->GetCurrSourceRange().GetEndPos() < inputSize - 1)// not for "... </s>"
{
int ret = other.GetRightContext()->Compare( *m_lmRightContext );
if (ret != 0)
return ret;
}
// size_t inputSize = m_hypo->GetManager().GetSource().GetSize();
// if (m_hypo->GetCurrSourceRange().GetEndPos() < inputSize - 1)
// {
// int ret2 = m_hypo->GetSuffix().Compare(other.GetHypothesis()->GetSuffix());
// if (ret != 0)
// return ret;
// }
return 0;
}
int Compare(const FFState& o) const;
};
}

View File

@ -25,9 +25,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <fstream>
#include "dictionary.h"
#include "n_gram.h"
#include "lmtable.h"
#include "lmmacro.h"
#include "lmContainer.h"
#include "LanguageModelIRST.h"
#include "TypeDef.h"
@ -71,58 +69,26 @@ bool LanguageModelIRST::Load(const std::string &filePath,
m_nGramOrder = nGramOrder;
m_filePath = filePath;
//checking the language model type
int lmtype = getLanguageModelType(m_filePath);
std::cerr << "IRSTLM Language Model Type of " << filePath << " is " << lmtype << std::endl;
if (lmtype == _IRSTLM_LMMACRO) {
// case lmmacro: LM is of type lmmacro, create an object of lmmacro
m_lmtb = new lmmacro();
d=((lmmacro *)m_lmtb)->getDict();
((lmmacro*) m_lmtb)->load(m_filePath);
} else if (lmtype == _IRSTLM_LMTABLE) {
// case (standard) lmmacro: LM is of type lmtable: create an object of lmtable
std::cerr << "Loading LM file (no MAP)\n";
m_lmtb = (lmtable *)new lmtable();
d=((lmtable *)m_lmtb)->getDict();
// Load the (possibly binary) model
// Open the input file (possibly gzipped)
InputFileStream inp(m_filePath);
#ifdef WIN32
m_lmtb->load(inp); //don't use memory map
#else
if (m_filePath.compare(m_filePath.size()-3,3,".mm")==0)
m_lmtb->load(inp,m_filePath.c_str(),NULL,1);
else
m_lmtb->load(inp,m_filePath.c_str(),NULL,0);
#endif
} else {
std::cerr << "This language model type is unknown!" << std::endl;
exit(1);
}
if (lmtype == _IRSTLM_LMMACRO) {
m_lmtb->getDict()->incflag(1);
}
m_lmtb = m_lmtb->CreateLanguageModel(m_filePath);
m_lmtb->setMaxLoadedLevel(1000);
m_lmtb->load(m_filePath);
d=m_lmtb->getDict();
d->incflag(1);
m_lmtb_size=m_lmtb->maxlevel();
// LM can be ok, just outputs warnings
// Mauro: in the original, the following two instructions are wrongly switched:
m_unknownId = d->oovcode(); // at the level of micro tags
m_empty = -1; // code for an empty position
CreateFactors(factorCollection);
VERBOSE(1, "IRST: m_unknownId=" << m_unknownId << std::endl);
//install caches to save time (only if PS_CACHE_ENABLE is defined through compilation flags)
int ml = ((lmtable *)m_lmtb)->maxlevel();
m_lmtb->init_caches(ml>2?ml-1:2);
m_lmtb->init_caches(m_lmtb_size>2?m_lmtb_size-1:2);
if (m_lmtb_dub > 0) m_lmtb->setlogOOVpenalty(m_lmtb_dub);
@ -135,6 +101,7 @@ void LanguageModelIRST::CreateFactors(FactorCollection &factorCollection)
// code copied & pasted from the SRI LM class; should be refactored into a template function
std::map<size_t, int> lmIdMap;
size_t maxFactorId = 0; // to create lookup vector later on
m_empty = -1; // code for an empty position
dict_entry *entry;
dictionary_iter iter(d); // at the level of micro tags
@ -160,15 +127,12 @@ void LanguageModelIRST::CreateFactors(FactorCollection &factorCollection)
// add to lookup vector in object
m_lmIdLookup.resize(maxFactorId+1);
fill(m_lmIdLookup.begin(), m_lmIdLookup.end(), m_unknownId);
fill(m_lmIdLookup.begin(), m_lmIdLookup.end(), m_empty);
map<size_t, int>::iterator iterMap;
for (iterMap = lmIdMap.begin() ; iterMap != lmIdMap.end() ; ++iterMap) {
m_lmIdLookup[iterMap->first] = iterMap->second;
}
}
int LanguageModelIRST::GetLmID( const std::string &str ) const
@ -177,14 +141,26 @@ int LanguageModelIRST::GetLmID( const std::string &str ) const
}
int LanguageModelIRST::GetLmID( const Factor *factor ) const
{
{
size_t factorId = factor->GetId();
if (factorId >= m_lmIdLookup.size()) {
if ((factorId >= m_lmIdLookup.size()) || (m_lmIdLookup[factorId] == m_empty)) {
if (d->incflag()==1) {
std::string s = factor->GetString();
int code = d->encode(s.c_str());
//////////
///since Moses does not distinguish between the factorIDs of source words
///and those of target words, a target word whose target code has not yet
///been computed may nevertheless have a known factorID (and hence one
///smaller than m_lmIdLookup.size()).
///These cases of an undetermined target code therefore need to be
///identified. Currently, this check is implemented by setting to m_empty
///all entries that have not yet
//received an actual target code
///////////
///OLD PROBLEM - SOLVED
////////////
/// THE PROBLEM WAS HERE
/// m_lmIdLookup.push_back(code);
@ -192,7 +168,7 @@ int LanguageModelIRST::GetLmID( const Factor *factor ) const
/// AT POSITION (factorID-1) instead of at position factorID, where we later go and read it (see case C
/// That way it works ....
/// I had a doubt about what sits in the first positions of m_lmIdLookup
/// so I checked
/// so
/// and found that one entry in every two stays empty
/// because factorID grows in steps of two (since it encodes both source and target), "emptying" position (factorID-1)
/// this causes no correctness problem, only a "waste" of memory
@ -202,12 +178,14 @@ int LanguageModelIRST::GetLmID( const Factor *factor ) const
////////////////
//resize and fill with m_unknownId
m_lmIdLookup.resize(factorId+1, m_unknownId);
if (factorId >= m_lmIdLookup.size()){
//resize and fill with m_empty
//grow the array by more than needed to avoid too many resize operations.
m_lmIdLookup.resize(factorId+10, m_empty);
}
//insert new code
m_lmIdLookup[factorId] = code;
return code;
} else {


@ -29,8 +29,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "Util.h"
#include "LanguageModelSingleFactor.h"
class lmtable; // irst lm table
class lmmacro; // irst lm for macro tags
class lmContainer; // irst lm container for any lm type
class ngram;
class dictionary;
@ -45,9 +44,10 @@ class LanguageModelIRST : public LanguageModelPointerState
{
protected:
mutable std::vector<int> m_lmIdLookup;
lmtable* m_lmtb;
lmContainer* m_lmtb;
int m_unknownId;
int m_unknownId; //code of OOV
int m_empty; //code of an empty position
int m_lmtb_sentenceStart; //lmtb symbols to initialize ngram with
int m_lmtb_sentenceEnd; //lmt symbol to initialize ngram with
int m_lmtb_size; //max ngram stored in the table


@ -33,11 +33,27 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "FactorCollection.h"
#include "Phrase.h"
#include "StaticData.h"
#include "LanguageModelChartState.h"
#include "ChartHypothesis.h"
using namespace std;
namespace Moses
{
void LanguageModelImplementation::ShiftOrPush(std::vector<const Word*> &contextFactor, const Word &word) const
{
if (contextFactor.size() < GetNGramOrder()) {
contextFactor.push_back(&word);
} else {
// shift
for (size_t currNGramOrder = 0 ; currNGramOrder < GetNGramOrder() - 1 ; currNGramOrder++) {
contextFactor[currNGramOrder] = contextFactor[currNGramOrder + 1];
}
contextFactor[GetNGramOrder() - 1] = &word;
}
}
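// A worked example of the window maintained above: with GetNGramOrder() == 3,
// pushing a, b, c, d in turn leaves contextFactor as [a], [a,b], [a,b,c],
// then [b,c,d], i.e. a sliding window over the most recent n words.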
LMResult LanguageModelImplementation::GetValueGivenState(
const std::vector<const Word*> &contextFactor,
FFState &state) const
@ -52,4 +68,142 @@ void LanguageModelImplementation::GetState(
GetValueForgotState(contextFactor, state);
}
FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection* out, const LanguageModel *scorer) const {
// data structure for factored context phrase (history and predicted word)
vector<const Word*> contextFactor;
contextFactor.reserve(GetNGramOrder());
// initialize language model context state
FFState *lmState = NewState( GetNullContextState() );
// initial language model scores
float prefixScore = 0.0; // not yet final for initial words (lack context)
float finalizedScore = 0.0; // finalized, has sufficient context
// get index map for underlying hypotheses
const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
hypo.GetCurrTargetPhrase().GetAlignmentInfo().GetNonTermIndexMap();
// loop over rule
for (size_t phrasePos = 0, wordPos = 0;
phrasePos < hypo.GetCurrTargetPhrase().GetSize();
phrasePos++)
{
// consult rule for either word or non-terminal
const Word &word = hypo.GetCurrTargetPhrase().GetWord(phrasePos);
// regular word
if (!word.IsNonTerminal())
{
ShiftOrPush(contextFactor, word);
// beginning of sentence symbol <s>? -> just update state
if (word == GetSentenceStartArray())
{
assert(phrasePos == 0);
delete lmState;
lmState = NewState( GetBeginSentenceState() );
}
// score a regular word added by the rule
else
{
updateChartScore( &prefixScore, &finalizedScore, UntransformLMScore(GetValueGivenState(contextFactor, *lmState).score), ++wordPos );
}
}
// non-terminal, add phrase from underlying hypothesis
else
{
// look up underlying hypothesis
size_t nonTermIndex = nonTermIndexMap[phrasePos];
const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndex);
size_t subPhraseLength = prevHypo->GetNumTargetTerminals();
// special case: rule starts with non-terminal -> copy everything
if (phrasePos == 0) {
// get prefixScore and finalizedScore
const LanguageModelChartState* prevState =
dynamic_cast<const LanguageModelChartState*>(prevHypo->GetFFState( featureID ));
prefixScore = prevState->GetPrefixScore();
finalizedScore = prevHypo->GetScoreBreakdown().GetScoresForProducer(scorer)[0] - prefixScore;
// get language model state
delete lmState;
lmState = NewState( prevState->GetRightContext() );
// push suffix
int suffixPos = prevHypo->GetSuffix().GetSize() - (GetNGramOrder()-1);
if (suffixPos < 0) suffixPos = 0; // push all words if less than order
for(;(size_t)suffixPos < prevHypo->GetSuffix().GetSize(); suffixPos++)
{
const Word &word = prevHypo->GetSuffix().GetWord(suffixPos);
ShiftOrPush(contextFactor, word);
wordPos++;
}
}
// internal non-terminal
else
{
// score its prefix
for(size_t prefixPos = 0;
prefixPos < GetNGramOrder()-1 // up to LM order window
&& prefixPos < subPhraseLength; // up to length
prefixPos++)
{
const Word &word = prevHypo->GetPrefix().GetWord(prefixPos);
ShiftOrPush(contextFactor, word);
updateChartScore( &prefixScore, &finalizedScore, UntransformLMScore(GetValueGivenState(contextFactor, *lmState).score), ++wordPos );
}
// check if we are dealing with a large sub-phrase
if (subPhraseLength > GetNGramOrder() - 1)
{
// add its finalized language model score
const LanguageModelChartState* prevState =
dynamic_cast<const LanguageModelChartState*>(prevHypo->GetFFState( featureID ));
finalizedScore +=
prevHypo->GetScoreBreakdown().GetScoresForProducer(scorer)[0] // full score
- prevState->GetPrefixScore(); // - prefix score
// copy language model state
delete lmState;
lmState = NewState( prevState->GetRightContext() );
// push its suffix
size_t remainingWords = subPhraseLength - (GetNGramOrder()-1);
if (remainingWords > GetNGramOrder()-1) {
// only what is needed for the history window
remainingWords = GetNGramOrder()-1;
}
for(size_t suffixPos = prevHypo->GetSuffix().GetSize() - remainingWords;
suffixPos < prevHypo->GetSuffix().GetSize();
suffixPos++) {
const Word &word = prevHypo->GetSuffix().GetWord(suffixPos);
ShiftOrPush(contextFactor, word);
}
wordPos += subPhraseLength;
}
}
}
}
// assign combined score to score breakdown
out->Assign(scorer, prefixScore + finalizedScore);
// create and return feature function state
LanguageModelChartState *res = new LanguageModelChartState( prefixScore, lmState, hypo );
return res;
}
void LanguageModelImplementation::updateChartScore( float *prefixScore, float *finalizedScore, float score, size_t wordPos ) const {
if (wordPos < GetNGramOrder()) {
*prefixScore += score;
}
else {
*finalizedScore += score;
}
}
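// e.g. with a trigram LM (order 3), words at wordPos 1 and 2 accumulate into
// prefixScore, since their left context is still incomplete; from wordPos 3
// onwards the context is full and scores go to finalizedScore.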
}


@ -33,6 +33,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
namespace Moses
{
class LanguageModel;
class FactorCollection;
class Factor;
class Phrase;
@ -59,6 +61,7 @@ private:
#else
// default constructor is ok
#endif
void ShiftOrPush(std::vector<const Word*> &contextFactor, const Word &word) const;
protected:
std::string m_filePath; //! for debugging purposes
@ -96,6 +99,10 @@ public:
virtual const FFState *GetBeginSentenceState() const = 0;
virtual FFState *NewState(const FFState *from = NULL) const = 0;
virtual FFState* EvaluateChart(const ChartHypothesis& cur_hypo, int featureID, ScoreComponentCollection* accumulator, const LanguageModel *feature) const;
void updateChartScore( float *prefixScore, float *finalScore, float score, size_t wordPos ) const;
//! max n-gram order of LM
size_t GetNGramOrder() const {
return m_nGramOrder;


@ -35,6 +35,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "Phrase.h"
#include "InputFileStream.h"
#include "StaticData.h"
#include "ChartHypothesis.h"
using namespace std;
@ -46,15 +47,39 @@ LanguageModelKenBase::~LanguageModelKenBase() {}
namespace
{
class LanguageModelChartStateKenLM : public FFState
{
private:
lm::ngram::ChartState m_state;
const ChartHypothesis *m_hypo;
public:
explicit LanguageModelChartStateKenLM(const ChartHypothesis &hypo)
:m_hypo(&hypo)
{}
const ChartHypothesis* GetHypothesis() const { return m_hypo; }
const lm::ngram::ChartState &GetChartState() const { return m_state; }
lm::ngram::ChartState &GetChartState() { return m_state; }
int Compare(const FFState& o) const
{
const LanguageModelChartStateKenLM &other = static_cast<const LanguageModelChartStateKenLM&>(o);
int ret = m_state.Compare(other.m_state);
return ret;
}
};
class MappingBuilder : public lm::ngram::EnumerateVocab
{
public:
MappingBuilder(FactorType factorType, FactorCollection &factorCollection, std::vector<lm::WordIndex> &mapping)
: m_factorType(factorType), m_factorCollection(factorCollection), m_mapping(mapping) {}
MappingBuilder(FactorCollection &factorCollection, std::vector<lm::WordIndex> &mapping)
: m_factorCollection(factorCollection), m_mapping(mapping) {}
void Add(lm::WordIndex index, const StringPiece &str) {
m_word.assign(str.data(), str.size());
std::size_t factorId = m_factorCollection.AddFactor(Output, m_factorType, m_word)->GetId();
std::size_t factorId = m_factorCollection.AddFactor(str.data())->GetId();
if (m_mapping.size() <= factorId) {
// 0 is <unk> :-)
m_mapping.resize(factorId + 1);
@ -63,8 +88,6 @@ public:
}
private:
std::string m_word;
FactorType m_factorType;
FactorCollection &m_factorCollection;
std::vector<lm::WordIndex> &m_mapping;
};
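// MappingBuilder is handed to KenLM as the vocabulary-enumeration callback
// (lm::ngram::Config::enumerate_vocab), so that m_lmIdLookup[factorId] later
// yields the KenLM word index for a Moses factor; ids never enumerated stay 0,
// i.e. <unk>.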
@ -73,14 +96,15 @@ struct KenLMState : public FFState {
lm::ngram::State state;
int Compare(const FFState &o) const {
const KenLMState &other = static_cast<const KenLMState &>(o);
if (state.valid_length_ < other.state.valid_length_) return -1;
if (state.valid_length_ > other.state.valid_length_) return 1;
return std::memcmp(state.history_, other.state.history_, sizeof(lm::WordIndex) * state.valid_length_);
if (state.length < other.state.length) return -1;
if (state.length > other.state.length) return 1;
return std::memcmp(state.words, other.state.words, sizeof(lm::WordIndex) * state.length);
}
};
/** Implementation of single factor LM using Ken's code.
*/
/*
* An implementation of single factor LM using Ken's code.
*/
template <class Model> class LanguageModelKen : public LanguageModelKenBase
{
private:
@ -121,8 +145,58 @@ public:
void CleanUpAfterSentenceProcessing() {}
void InitializeBeforeSentenceProcessing() {}
FFState *EvaluateChart(const ChartHypothesis& cur_hypo,
int featureID,
ScoreComponentCollection *accumulator,
const LanguageModel *feature) const;
};
template <class Model>
FFState *LanguageModelKen<Model>::EvaluateChart(
const ChartHypothesis& hypo,
int featureID,
ScoreComponentCollection *accumulator,
const LanguageModel *feature) const
{
LanguageModelChartStateKenLM *newState = new LanguageModelChartStateKenLM(hypo);
lm::ngram::RuleScore<Model> ruleScore(*m_ngram, newState->GetChartState());
const AlignmentInfo::NonTermIndexMap &nonTermIndexMap = hypo.GetCurrTargetPhrase().GetAlignmentInfo().GetNonTermIndexMap();
const size_t size = hypo.GetCurrTargetPhrase().GetSize();
size_t phrasePos = 0;
// Special cases for first word.
if (size) {
const Word &word = hypo.GetCurrTargetPhrase().GetWord(0);
if (word == GetSentenceStartArray()) {
// Begin of sentence
ruleScore.BeginSentence();
phrasePos++;
} else if (word.IsNonTerminal()) {
// Non-terminal is first so we can copy instead of rescoring.
const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndexMap[phrasePos]);
const lm::ngram::ChartState &prevState = static_cast<const LanguageModelChartStateKenLM*>(prevHypo->GetFFState(featureID))->GetChartState();
ruleScore.BeginNonTerminal(prevState, prevHypo->GetScoreBreakdown().GetScoresForProducer(feature)[0]);
phrasePos++;
}
}
for (; phrasePos < size; phrasePos++) {
const Word &word = hypo.GetCurrTargetPhrase().GetWord(phrasePos);
if (word.IsNonTerminal()) {
const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndexMap[phrasePos]);
const lm::ngram::ChartState &prevState = static_cast<const LanguageModelChartStateKenLM*>(prevHypo->GetFFState(featureID))->GetChartState();
ruleScore.NonTerminal(prevState, prevHypo->GetScoreBreakdown().GetScoresForProducer(feature)[0]);
} else {
std::size_t factor = word.GetFactor(GetFactorType())->GetId();
lm::WordIndex new_word = (factor >= m_lmIdLookup.size() ? 0 : m_lmIdLookup[factor]);
ruleScore.Terminal(new_word);
}
}
accumulator->Assign(feature, ruleScore.Finish());
return newState;
}
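// For reference, the lm/left.hh scoring pattern used above, as a minimal
// sketch (model, word and state names here are illustrative):
//   lm::ngram::ChartState state;
//   lm::ngram::RuleScore<Model> scorer(model, state);
//   scorer.BeginSentence();                      // only if the rule emits <s>
//   scorer.Terminal(wordIndex);                  // one call per terminal
//   scorer.NonTerminal(childState, childScore);  // one call per non-terminal
//   float score = scorer.Finish();               // seals the state, returns the score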
template <class Model> void LanguageModelKen<Model>::TranslateIDs(const std::vector<const Word*> &contextFactor, lm::WordIndex *indices) const
{
FactorType factorType = GetFactorType();
@ -148,16 +222,16 @@ template <class Model> bool LanguageModelKen<Model>::Load(const std::string &fil
FactorType factorType,
size_t /*nGramOrder*/)
{
m_factorType = factorType;
m_filePath = filePath;
FactorCollection &factorCollection = FactorCollection::Instance();
m_sentenceStart = factorCollection.AddFactor(Output, m_factorType, BOS_);
m_sentenceStart = factorCollection.AddFactor(BOS_);
m_sentenceStartArray[m_factorType] = m_sentenceStart;
m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, EOS_);
m_sentenceEnd = factorCollection.AddFactor(EOS_);
m_sentenceEndArray[m_factorType] = m_sentenceEnd;
MappingBuilder builder(m_factorType, factorCollection, m_lmIdLookup);
MappingBuilder builder(factorCollection, m_lmIdLookup);
lm::ngram::Config config;
IFVERBOSE(1) {


@ -25,6 +25,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <string>
#include "LanguageModelSingleFactor.h"
#include "kenlm/lm/left.hh"
namespace Moses
{
@ -41,7 +42,7 @@ namespace Moses
// scoring functions which provide more info than the common interface of LanguageModel
virtual LMKenResult GetKenFullScoreGivenState(const std::vector<const Word*> &contextFactor, FFState &state) const = 0;
virtual LMKenResult GetKenFullScoreForgotState(const std::vector<const Word*> &contextFactor, FFState &outState) const = 0;
};
// Doesn't actually load; moses wants the Load method for that. It needs the file to autodetect binary format.


@ -102,6 +102,10 @@ libmoses_la_HEADERS = \
RuleCube.h \
RuleCubeItem.h \
RuleCubeQueue.h \
RuleTableLoader.h \
RuleTableLoaderCompact.h \
RuleTableLoaderFactory.h \
RuleTableLoaderStandard.h \
ScoreComponentCollection.h \
ScoreProducer.h \
Search.h \
@ -230,6 +234,7 @@ libmoses_la_SOURCES = \
LMList.cpp \
LVoc.cpp \
LanguageModel.cpp \
LanguageModelChartState.cpp \
LanguageModelFactory.cpp \
LanguageModelImplementation.cpp \
LanguageModelInternal.cpp \
@ -267,6 +272,9 @@ libmoses_la_SOURCES = \
RuleCube.cpp \
RuleCubeItem.cpp \
RuleCubeQueue.cpp \
RuleTableLoaderCompact.cpp \
RuleTableLoaderFactory.cpp \
RuleTableLoaderStandard.cpp \
ScoreComponentCollection.cpp \
ScoreProducer.cpp \
Search.cpp \


@ -25,6 +25,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <ext/hash_set>
#endif
#include <algorithm>
#include <limits>
#include <cmath>
#include "Manager.h"
@ -257,6 +258,152 @@ void Manager::CalcNBest(size_t count, TrellisPathList &ret,bool onlyDistinct) co
}
}
struct SGNReverseCompare {
bool operator() (const SearchGraphNode& s1, const SearchGraphNode& s2) const {
return s1.hypo->GetId() > s2.hypo->GetId();
}
};
/**
* Implements lattice sampling, as in Chatterjee & Cancedda, EMNLP 2010
**/
void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
vector<SearchGraphNode> searchGraph;
GetSearchGraph(searchGraph);
//Calculation of the sigmas of each hypothesis and edge. In C&C notation this is
//the "log of the cumulative unnormalized probability of all the paths in the
// lattice for the hypothesis to a final node"
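//In log space this is the backward recurrence
//  sigma(h) = log( sum over outgoing edges (h,h') of exp( edgeScore(h,h') + sigma(h') ) )
//with sigma(h) = 0 for final hypotheses; it is accumulated below via log_sum().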
typedef pair<int, int> Edge;
map<const Hypothesis*, float> sigmas;
map<Edge, float> edgeScores;
map<const Hypothesis*, set<const Hypothesis*> > outgoingHyps;
map<int,const Hypothesis*> idToHyp;
map<int,float> fscores;
//Iterating through the hypos in reverse order of id gives a reverse
//topological order. We rely on the fact that hypo ids are given out
//sequentially, as the search proceeds.
//NB: Could just sort by stack.
sort(searchGraph.begin(), searchGraph.end(), SGNReverseCompare());
//first task is to fill in the outgoing hypos and edge scores.
for (vector<SearchGraphNode>::const_iterator i = searchGraph.begin();
i != searchGraph.end(); ++i) {
const Hypothesis* hypo = i->hypo;
idToHyp[hypo->GetId()] = hypo;
fscores[hypo->GetId()] = i->fscore;
if (hypo->GetId()) {
//back to current
const Hypothesis* prevHypo = i->hypo->GetPrevHypo();
outgoingHyps[prevHypo].insert(hypo);
edgeScores[Edge(prevHypo->GetId(),hypo->GetId())] =
hypo->GetScore() - prevHypo->GetScore();
}
//forward from current
if (i->forward >= 0) {
map<int,const Hypothesis*>::const_iterator idToHypIter = idToHyp.find(i->forward);
assert(idToHypIter != idToHyp.end());
const Hypothesis* nextHypo = idToHypIter->second;
outgoingHyps[hypo].insert(nextHypo);
map<int,float>::const_iterator fscoreIter = fscores.find(nextHypo->GetId());
assert(fscoreIter != fscores.end());
edgeScores[Edge(hypo->GetId(),nextHypo->GetId())] =
i->fscore - fscoreIter->second;
}
}
//then run through again to calculate sigmas
for (vector<SearchGraphNode>::const_iterator i = searchGraph.begin();
i != searchGraph.end(); ++i) {
if (i->forward == -1) {
sigmas[i->hypo] = 0;
} else {
map<const Hypothesis*, set<const Hypothesis*> >::const_iterator outIter =
outgoingHyps.find(i->hypo);
assert(outIter != outgoingHyps.end());
float sigma = 0;
for (set<const Hypothesis*>::const_iterator j = outIter->second.begin();
j != outIter->second.end(); ++j) {
map<const Hypothesis*, float>::const_iterator succIter = sigmas.find(*j);
assert(succIter != sigmas.end());
map<Edge,float>::const_iterator edgeScoreIter =
edgeScores.find(Edge(i->hypo->GetId(),(*j)->GetId()));
assert(edgeScoreIter != edgeScores.end());
float term = edgeScoreIter->second + succIter->second; // Add sigma(*j)
if (sigma == 0) {
sigma = term;
} else {
sigma = log_sum(sigma,term);
}
}
sigmas[i->hypo] = sigma;
}
}
//The actual sampling!
const Hypothesis* startHypo = searchGraph.back().hypo;
assert(startHypo->GetId() == 0);
for (size_t i = 0; i < count; ++i) {
vector<const Hypothesis*> path;
path.push_back(startHypo);
while(1) {
map<const Hypothesis*, set<const Hypothesis*> >::const_iterator outIter =
outgoingHyps.find(path.back());
if (outIter == outgoingHyps.end() || !outIter->second.size()) {
//end of the path
break;
}
//score the possibles
vector<const Hypothesis*> candidates;
vector<float> candidateScores;
float scoreTotal = 0;
for (set<const Hypothesis*>::const_iterator j = outIter->second.begin();
j != outIter->second.end(); ++j) {
candidates.push_back(*j);
assert(sigmas.find(*j) != sigmas.end());
Edge edge(path.back()->GetId(),(*j)->GetId());
assert(edgeScores.find(edge) != edgeScores.end());
candidateScores.push_back(sigmas[*j] + edgeScores[edge]);
if (scoreTotal == 0) {
scoreTotal = candidateScores.back();
} else {
scoreTotal = log_sum(candidateScores.back(), scoreTotal);
}
}
//normalise
transform(candidateScores.begin(), candidateScores.end(), candidateScores.begin(), bind2nd(minus<float>(),scoreTotal));
//copy(candidateScores.begin(),candidateScores.end(),ostream_iterator<float>(cerr," "));
//cerr << endl;
//draw the sample
float random = log((float)rand()/RAND_MAX);
size_t position = 1;
float sum = candidateScores[0];
for (; position < candidateScores.size() && sum < random; ++position) {
sum = log_sum(sum,candidateScores[position]);
}
//cerr << "Random: " << random << " Chose " << position-1 << endl;
const Hypothesis* chosen = candidates[position-1];
path.push_back(chosen);
}
//cerr << "Path: " << endl;
//for (size_t j = 0; j < path.size(); ++j) {
// cerr << path[j]->GetId() << " " << path[j]->GetScoreBreakdown() << endl;
//}
//cerr << endl;
//Convert the hypos to TrellisPath
ret.Add(new TrellisPath(path));
//cerr << ret.at(ret.GetSize()-1).GetScoreBreakdown() << endl;
}
}
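// A sketch of how this is typically driven (the switch is registered in
// Parameter.cpp; the file name and sample count below are illustrative):
//   moses -f moses.ini -lattice-samples samples.txt 1000 < input.txt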


@ -130,6 +130,7 @@ public:
const Hypothesis *GetBestHypothesis() const;
const Hypothesis *GetActualBestHypothesis() const;
void CalcNBest(size_t count, TrellisPathList &ret,bool onlyDistinct=0) const;
void CalcLatticeSamples(size_t count, TrellisPathList &ret) const;
void PrintAllDerivations(long translationId, std::ostream& outputStream) const;
void printDivergentHypothesis(long translationId, const Hypothesis* hypo, const std::vector <const TargetPhrase*> & remainingPhrases, float remainingScore , std::ostream& outputStream) const;
void printThisHypothesis(long translationId, const Hypothesis* hypo, const std::vector <const TargetPhrase* > & remainingPhrases, float remainingScore , std::ostream& outputStream) const;


@ -63,6 +63,7 @@ Parameter::Parameter()
AddParam("max-trans-opt-per-coverage", "maximum number of translation options per input span (after applying mapping steps)");
AddParam("max-phrase-length", "maximum phrase length (default 20)");
AddParam("n-best-list", "file and size of n-best-list to be generated; specify - as the file in order to write to STDOUT");
AddParam("lattice-samples", "generate samples from lattice, in same format as nbest list. Uses the file and size arguments, as in n-best-list");
AddParam("n-best-factor", "factor to compute the maximum number of contenders (=factor*nbest-size). value 0 means infinity, i.e. no threshold. default is 0");
AddParam("print-all-derivations", "to print all derivations in search graph");
AddParam("output-factors", "list of factors in the output");


@ -24,7 +24,8 @@
#include <string>
#include <iterator>
#include <algorithm>
#include <sys/stat.h>
#include "RuleTableLoader.h"
#include "RuleTableLoaderFactory.h"
#include "PhraseDictionarySCFG.h"
#include "FactorCollection.h"
#include "Word.h"
@ -33,9 +34,6 @@
#include "StaticData.h"
#include "WordsRange.h"
#include "UserMessage.h"
#include "ChartTranslationOptionList.h"
#include "DotChart.h"
#include "FactorCollection.h"
#include "ChartRuleLookupManagerMemory.h"
using namespace std;
@ -43,36 +41,6 @@ using namespace std;
namespace Moses
{
inline void TransformString(vector< vector<string>* > &phraseVector)
{
// for target phrase
for (size_t pos = 0 ; pos < phraseVector.size() ; ++pos) {
assert(phraseVector[pos]->size() == 1);
string &str = (*phraseVector[pos])[0];
if (str.substr(0, 1) == "[" && str.substr(str.size()-1, 1) == "]") {
// non-term
str = str.substr(1, str.size() - 2);
}
}
}
void PhraseDictionarySCFG::CreateSourceLabels(vector<Word> &sourceLabels
, const vector<string> &sourceLabelsStr) const
{
FactorCollection &factorCollection = FactorCollection::Instance();
for (size_t ind = 0; ind < sourceLabelsStr.size(); ++ind) {
sourceLabels.push_back(Word());
Word &word = sourceLabels.back();
// TODO - no factors
const Factor *factor = factorCollection.AddFactor(Input, 0, sourceLabelsStr[ind]);
word[0] = factor;
word.SetIsNonTerminal(true);
}
}
bool PhraseDictionarySCFG::Load(const std::vector<FactorType> &input
, const std::vector<FactorType> &output
, const string &filePath
@ -88,100 +56,13 @@ bool PhraseDictionarySCFG::Load(const std::vector<FactorType> &input
// data from file
InputFileStream inFile(filePath);
bool ret = Load(input, output, inFile, weight, tableLimit, languageModels, wpProducer);
std::auto_ptr<RuleTableLoader> loader =
RuleTableLoaderFactory::Create(filePath);
bool ret = loader->Load(input, output, inFile, weight, tableLimit,
languageModels, wpProducer, *this);
return ret;
}
bool PhraseDictionarySCFG::Load(const std::vector<FactorType> &input
, const std::vector<FactorType> &output
, std::istream &inStream
, const std::vector<float> &weight
, size_t /* tableLimit */
, const LMList &languageModels
, const WordPenaltyProducer* wpProducer)
{
PrintUserTime("Start loading new format pt model");
const StaticData &staticData = StaticData::Instance();
const std::string& factorDelimiter = staticData.GetFactorDelimiter();
string line;
size_t count = 0;
while(getline(inStream, line)) {
vector<string> tokens;
vector<float> scoreVector;
TokenizeMultiCharSeparator(tokens, line , "|||" );
if (tokens.size() != 4 && tokens.size() != 5) {
stringstream strme;
strme << "Syntax error at " << m_filePath << ":" << count;
UserMessage::Add(strme.str());
abort();
}
const string &sourcePhraseString = tokens[0]
, &targetPhraseString = tokens[1]
, &scoreString = tokens[2]
, &alignString = tokens[3];
bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
TRACE_ERR( m_filePath << ":" << count << ": pt entry contains empty target, skipping\n");
continue;
}
Tokenize<float>(scoreVector, scoreString);
if (scoreVector.size() != m_numScoreComponent) {
stringstream strme;
strme << "Size of scoreVector != number (" <<scoreVector.size() << "!=" <<m_numScoreComponent<<") of score components on line " << count;
UserMessage::Add(strme.str());
abort();
}
assert(scoreVector.size() == m_numScoreComponent);
// parse source & find pt node
// constituent labels
Word sourceLHS, targetLHS;
// source
Phrase sourcePhrase(Input, 0);
sourcePhrase.CreateFromStringNewFormat(Input, input, sourcePhraseString, factorDelimiter, sourceLHS);
// create target phrase obj
TargetPhrase *targetPhrase = new TargetPhrase(Output);
targetPhrase->CreateFromStringNewFormat(Output, output, targetPhraseString, factorDelimiter, targetLHS);
// rest of target phrase
targetPhrase->SetAlignmentInfo(alignString);
targetPhrase->SetTargetLHS(targetLHS);
//targetPhrase->SetDebugOutput(string("New Format pt ") + line);
// component score, for n-best output
std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore);
std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);
targetPhrase->SetScoreChart(GetFeature(), scoreVector, weight, languageModels,wpProducer);
TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(sourcePhrase, *targetPhrase);
AddEquivPhrase(phraseColl, targetPhrase);
count++;
}
// cleanup cache
// prune each target phrase collection
if (m_tableLimit) {
m_collection.Sort(m_tableLimit);
}
return true;
}
TargetPhraseCollection &PhraseDictionarySCFG::GetOrCreateTargetPhraseCollection(const Phrase &source, const TargetPhrase &target)
{
PhraseDictionaryNodeSCFG &currNode = GetOrCreateNode(source, target);
@ -220,38 +101,6 @@ PhraseDictionaryNodeSCFG &PhraseDictionarySCFG::GetOrCreateNode(const Phrase &so
return *currNode;
}
void PhraseDictionarySCFG::AddEquivPhrase(const Phrase & /* source */, const TargetPhrase & /* targetPhrase */)
{
assert(false); // TODO
}
void PhraseDictionarySCFG::AddEquivPhrase(TargetPhraseCollection &targetPhraseColl, TargetPhrase *targetPhrase)
{
targetPhraseColl.Add(targetPhrase);
}
const TargetPhraseCollection *PhraseDictionarySCFG::GetTargetPhraseCollection(const Phrase & /* source */) const
{
// exactly like CreateTargetPhraseCollection, but don't create
assert(false);
return NULL;
/*
const size_t size = source.GetSize();
const PhraseDictionaryNodeSCFG *currNode = &m_collection;
for (size_t pos = 0 ; pos < size ; ++pos)
{
const Word& word = source.GetWord(pos);
currNode = currNode->GetChild(word);
if (currNode == NULL)
return NULL;
}
return currNode->GetTargetPhraseCollection();
*/
}
void PhraseDictionarySCFG::InitializeForInput(const InputType& /* input */)
{
// Nothing to do: sentence-specific state is stored in ChartRuleLookupManager
@ -274,6 +123,14 @@ ChartRuleLookupManager *PhraseDictionarySCFG::CreateRuleLookupManager(
return new ChartRuleLookupManagerMemory(sentence, cellCollection, *this);
}
void PhraseDictionarySCFG::SortAndPrune()
{
if (GetTableLimit())
{
m_collection.Sort(GetTableLimit());
}
}
TO_STRING_BODY(PhraseDictionarySCFG);
// friend
@ -294,6 +151,4 @@ ostream& operator<<(ostream& out, const PhraseDictionarySCFG& phraseDict)
return out;
}
}


@ -1,24 +1,21 @@
// $Id$
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
***********************************************************************/
#pragma once
@ -29,51 +26,22 @@
namespace Moses
{
class ChartTranslationOptionList;
class DottedRuleStack;
class DottedRuleColl;
/*** Implementation of a phrase table in a trie. Looking up a phrase of
* length n words requires n look-ups to find the TargetPhraseCollection.
/*** Implementation of a SCFG rule table in a trie. Looking up a rule of
* length n symbols requires n look-ups to find the TargetPhraseCollection.
*/
class PhraseDictionarySCFG : public PhraseDictionary
{
typedef PhraseDictionary MyBase;
friend std::ostream& operator<<(std::ostream&, const PhraseDictionarySCFG&);
friend class RuleTableLoader;
protected:
PhraseDictionaryNodeSCFG m_collection;
std::string m_filePath;
public:
PhraseDictionarySCFG(size_t numScoreComponents,
PhraseDictionaryFeature* feature)
: PhraseDictionary(numScoreComponents, feature) {}
TargetPhraseCollection &GetOrCreateTargetPhraseCollection(const Phrase &source, const TargetPhrase &target);
PhraseDictionaryNodeSCFG &GetOrCreateNode(const Phrase &source, const TargetPhrase &target);
bool Load(const std::vector<FactorType> &input
, const std::vector<FactorType> &output
, std::istream &inStream
, const std::vector<float> &weight
, size_t tableLimit
, const LMList &languageModels
, const WordPenaltyProducer* wpProducer);
void CreateSourceLabels(std::vector<Word> &sourceLabels
, const std::vector<std::string> &sourceLabelsStr) const;
Word CreateCoveredWord(const Word &origSourceLabel, const InputType &src, const WordsRange &range) const;
public:
PhraseDictionarySCFG(size_t numScoreComponent, PhraseDictionaryFeature* feature)
: MyBase(numScoreComponent, feature) {
}
virtual ~PhraseDictionarySCFG();
std::string GetScoreProducerDescription(unsigned) const {
return "Hieu's Reordering Model";
}
PhraseTableImplementation GetPhraseTableImplementation() const {
return Memory;
}
bool Load(const std::vector<FactorType> &input
, const std::vector<FactorType> &output
, const std::string &filePath
@ -82,21 +50,21 @@ public:
, const LMList &languageModels
, const WordPenaltyProducer* wpProducer);
const PhraseDictionaryNodeSCFG &GetRootNode() const {
return m_collection;
const std::string &GetFilePath() const { return m_filePath; }
const PhraseDictionaryNodeSCFG &GetRootNode() const { return m_collection; }
// Required by PhraseDictionary.
const TargetPhraseCollection *GetTargetPhraseCollection(const Phrase &) const
{
assert(false);
return NULL;
}
const TargetPhraseCollection *GetTargetPhraseCollection(const Phrase &source) const;
void AddEquivPhrase(const Phrase &source, const TargetPhrase &targetPhrase);
void AddEquivPhrase(TargetPhraseCollection &targetPhraseColl, TargetPhrase *targetPhrase);
// for mert
void SetWeightTransModel(const std::vector<float> &weightT);
TO_STRING();
// Required by PhraseDictionary.
void AddEquivPhrase(const Phrase &, const TargetPhrase &)
{
assert(false);
}
void InitializeForInput(const InputType& i);
@ -105,6 +73,20 @@ public:
ChartRuleLookupManager *CreateRuleLookupManager(
const InputType &,
const ChartCellCollection &);
TO_STRING();
private:
TargetPhraseCollection &GetOrCreateTargetPhraseCollection(
const Phrase &source, const TargetPhrase &target);
PhraseDictionaryNodeSCFG &GetOrCreateNode(const Phrase &source,
const TargetPhrase &target);
void SortAndPrune();
PhraseDictionaryNodeSCFG m_collection;
std::string m_filePath;
};
} // namespace Moses


@ -0,0 +1,65 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#pragma once
#include "PhraseDictionarySCFG.h"
#include "TypeDef.h"
#include <istream>
#include <vector>
namespace Moses
{
class LMList;
class WordPenaltyProducer;
// Abstract base class defining RuleTableLoader interface. Friend of
// PhraseDictionarySCFG.
class RuleTableLoader
{
public:
virtual ~RuleTableLoader() {}
virtual bool Load(const std::vector<FactorType> &input,
const std::vector<FactorType> &output,
std::istream &inStream,
const std::vector<float> &weight,
size_t tableLimit,
const LMList &languageModels,
const WordPenaltyProducer* wpProducer,
PhraseDictionarySCFG &) = 0;
protected:
// Provide access to PhraseDictionarySCFG's private SortAndPrune function.
void SortAndPrune(PhraseDictionarySCFG &ruleTable) {
ruleTable.SortAndPrune();
}
// Provide access to PhraseDictionarySCFG's private
// GetOrCreateTargetPhraseCollection function.
TargetPhraseCollection &GetOrCreateTargetPhraseCollection(
PhraseDictionarySCFG &ruleTable, const Phrase &source,
const TargetPhrase &target) {
return ruleTable.GetOrCreateTargetPhraseCollection(source, target);
}
};
} // namespace Moses


@ -0,0 +1,239 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "RuleTableLoaderCompact.h"
#include "AlignmentInfoCollection.h"
#include "DummyScoreProducers.h"
#include "InputFileStream.h"
#include "LMList.h"
#include "PhraseDictionarySCFG.h"
#include "UserMessage.h"
#include "Util.h"
#include "Word.h"
#include <istream>
#include <sstream>
namespace Moses
{
bool RuleTableLoaderCompact::Load(const std::vector<FactorType> &input,
const std::vector<FactorType> &output,
std::istream &inStream,
const std::vector<float> &weight,
size_t /* tableLimit */,
const LMList &languageModels,
const WordPenaltyProducer* wpProducer,
PhraseDictionarySCFG &ruleTable)
{
PrintUserTime("Start loading compact rule table");
LineReader reader(inStream);
// Read and check version number.
reader.ReadLine();
if (reader.m_line != "1") {
std::stringstream msg;
msg << "Unexpected compact rule table format: " << reader.m_line;
UserMessage::Add(msg.str());
return false;
}
// Load vocabulary.
std::vector<Word> vocab;
LoadVocabularySection(reader, input, vocab);
// Load source phrases.
std::vector<Phrase> sourcePhrases;
std::vector<size_t> sourceLhsIds;
LoadPhraseSection(reader, Input, vocab, sourcePhrases, sourceLhsIds);
// Load target phrases.
std::vector<Phrase> targetPhrases;
std::vector<size_t> targetLhsIds;
LoadPhraseSection(reader, Output, vocab, targetPhrases, targetLhsIds);
// Load alignments.
std::vector<const AlignmentInfo *> alignmentSets;
LoadAlignmentSection(reader, alignmentSets);
// Load rules.
if (!LoadRuleSection(reader, vocab, sourcePhrases, targetPhrases,
targetLhsIds, alignmentSets, languageModels,
wpProducer, weight, ruleTable)) {
return false;
}
// Sort and prune each target phrase collection.
SortAndPrune(ruleTable);
return true;
}
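// A sketch of the section layout this loader consumes, reconstructed from the
// parsing code below; all counts and values are illustrative, not taken from
// the source:
//
//   1                 format version
//   3                 vocabulary size, then one symbol per line ([X] = non-terminal)
//   [X]
//   le
//   house
//   1                 source phrase count, then "lhs-id rhs-id ..." lines
//   0 1
//   1                 target phrase count, same layout
//   0 2
//   1                 alignment set count, then lines of "src-tgt" pairs
//   0-0
//   1                 rule count, then "src-id tgt-id align-id scores... : ..."
//   0 0 0 0.5 0.3 0.2 0.1 0.4 : 1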
void RuleTableLoaderCompact::LoadVocabularySection(
LineReader &reader,
const std::vector<FactorType> &factorTypes,
std::vector<Word> &vocabulary)
{
// Read symbol count.
reader.ReadLine();
const size_t vocabSize = std::atoi(reader.m_line.c_str());
// Read symbol lines and create Word objects.
vocabulary.resize(vocabSize);
for (size_t i = 0; i < vocabSize; ++i) {
reader.ReadLine();
const size_t len = reader.m_line.size();
bool isNonTerm = (reader.m_line[0] == '[' && reader.m_line[len-1] == ']');
if (isNonTerm) {
reader.m_line = reader.m_line.substr(1, len-2);
}
vocabulary[i].CreateFromString(Input, factorTypes, reader.m_line, isNonTerm);
}
}
void RuleTableLoaderCompact::LoadPhraseSection(
LineReader &reader,
FactorDirection direction,
const std::vector<Word> &vocab,
std::vector<Phrase> &rhsPhrases,
std::vector<size_t> &lhsIds)
{
// Read phrase count.
reader.ReadLine();
const size_t phraseCount = std::atoi(reader.m_line.c_str());
// Read the lines, storing a Phrase object for each RHS and a vocab ID for each LHS.
rhsPhrases.resize(phraseCount, Phrase(direction, 0));
lhsIds.resize(phraseCount);
std::vector<size_t> tokenPositions;
for (size_t i = 0; i < phraseCount; ++i) {
reader.ReadLine();
tokenPositions.clear();
FindTokens(tokenPositions, reader.m_line);
const char *charLine = reader.m_line.c_str();
lhsIds[i] = std::atoi(charLine+tokenPositions[0]);
for (size_t j = 1; j < tokenPositions.size(); ++j) {
rhsPhrases[i].AddWord(vocab[std::atoi(charLine+tokenPositions[j])]);
}
}
}
void RuleTableLoaderCompact::LoadAlignmentSection(
LineReader &reader, std::vector<const AlignmentInfo *> &alignmentSets)
{
// Read alignment set count.
reader.ReadLine();
const size_t alignmentSetCount = std::atoi(reader.m_line.c_str());
alignmentSets.resize(alignmentSetCount);
std::set<std::pair<size_t,size_t> > alignmentInfo;
std::vector<std::string> tokens;
std::vector<size_t> points;
for (size_t i = 0; i < alignmentSetCount; ++i) {
// Read alignment set, lookup in collection, and store pointer.
alignmentInfo.clear();
tokens.clear();
reader.ReadLine();
Tokenize(tokens, reader.m_line);
std::vector<std::string>::const_iterator p;
for (p = tokens.begin(); p != tokens.end(); ++p) {
points.clear();
Tokenize<size_t>(points, *p, "-");
std::pair<size_t, size_t> alignmentPair(points[0], points[1]);
alignmentInfo.insert(alignmentPair);
}
alignmentSets[i] = AlignmentInfoCollection::Instance().Add(alignmentInfo);
}
}
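// e.g. the line "0-0 1-2" yields the alignment set {(0,0), (1,2)}; sets are
// pooled through AlignmentInfoCollection so identical sets can share one object.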
bool RuleTableLoaderCompact::LoadRuleSection(
LineReader &reader,
const std::vector<Word> &vocab,
const std::vector<Phrase> &sourcePhrases,
const std::vector<Phrase> &targetPhrases,
const std::vector<size_t> &targetLhsIds,
const std::vector<const AlignmentInfo *> &alignmentSets,
const LMList &languageModels,
const WordPenaltyProducer *wpProducer,
const std::vector<float> &weights,
PhraseDictionarySCFG &ruleTable)
{
// Read rule count.
reader.ReadLine();
const size_t ruleCount = std::atoi(reader.m_line.c_str());
// Read rules and add to table.
const size_t numScoreComponents =
ruleTable.GetFeature()->GetNumScoreComponents();
std::vector<float> scoreVector(numScoreComponents);
std::vector<size_t> tokenPositions;
for (size_t i = 0; i < ruleCount; ++i) {
reader.ReadLine();
tokenPositions.clear();
FindTokens(tokenPositions, reader.m_line);
const char *charLine = reader.m_line.c_str();
// The first three tokens are IDs for the source phrase, target phrase,
// and alignment set.
const int sourcePhraseId = std::atoi(charLine+tokenPositions[0]);
const int targetPhraseId = std::atoi(charLine+tokenPositions[1]);
const int alignmentSetId = std::atoi(charLine+tokenPositions[2]);
const Phrase &sourcePhrase = sourcePhrases[sourcePhraseId];
const Phrase &targetPhrasePhrase = targetPhrases[targetPhraseId];
const Word &targetLhs = vocab[targetLhsIds[targetPhraseId]];
const AlignmentInfo *alignmentInfo = alignmentSets[alignmentSetId];
// Then there should be one score for each score component.
for (size_t j = 0; j < numScoreComponents; ++j) {
float score = std::atof(charLine+tokenPositions[3+j]);
scoreVector[j] = FloorScore(TransformScore(score));
}
if (reader.m_line[tokenPositions[3+numScoreComponents]] != ':') {
std::stringstream msg;
msg << "Size of scoreVector != number ("
<< scoreVector.size() << "!=" << numScoreComponents
<< ") of score components on line " << reader.m_lineNum;
UserMessage::Add(msg.str());
return false;
}
// The remaining columns are currently ignored.
// Create and score target phrase.
TargetPhrase *targetPhrase = new TargetPhrase(targetPhrasePhrase);
targetPhrase->SetAlignmentInfo(alignmentInfo);
targetPhrase->SetTargetLHS(targetLhs);
targetPhrase->SetScoreChart(ruleTable.GetFeature(), scoreVector, weights,
languageModels, wpProducer);
// Insert rule into table.
TargetPhraseCollection &coll = GetOrCreateTargetPhraseCollection(
ruleTable, sourcePhrase, *targetPhrase);
coll.Add(targetPhrase);
}
return true;
}
}


@ -0,0 +1,107 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#pragma once
#include "Phrase.h"
#include "RuleTableLoader.h"
#include "TypeDef.h"
#include "Word.h"
#include <istream>
#include <string>
#include <vector>
namespace Moses
{
class LMList;
class PhraseDictionarySCFG;
class WordPenaltyProducer;
class RuleTableLoaderCompact : public RuleTableLoader
{
public:
bool Load(const std::vector<FactorType> &input,
const std::vector<FactorType> &output,
std::istream &inStream,
const std::vector<float> &weight,
size_t tableLimit,
const LMList &languageModels,
const WordPenaltyProducer* wpProducer,
PhraseDictionarySCFG &);
private:
struct LineReader {
LineReader(std::istream &input) : m_input(input), m_lineNum(0) {}
void ReadLine() {
std::getline(m_input, m_line);
// Assume everything's hunky-dory.
++m_lineNum;
}
std::istream &m_input;
std::string m_line;
size_t m_lineNum;
};
void LoadVocabularySection(LineReader &,
const std::vector<FactorType> &,
std::vector<Word> &);
void LoadPhraseSection(LineReader &,
FactorDirection,
const std::vector<Word> &,
std::vector<Phrase> &,
std::vector<size_t> &);
void LoadAlignmentSection(LineReader &,
std::vector<const AlignmentInfo *> &);
bool LoadRuleSection(LineReader &,
const std::vector<Word> &,
const std::vector<Phrase> &,
const std::vector<Phrase> &,
const std::vector<size_t> &,
const std::vector<const AlignmentInfo *> &,
const LMList &languageModels,
const WordPenaltyProducer *wpProducer,
const std::vector<float> &weights,
PhraseDictionarySCFG &ruleTable);
// Like Tokenize() but records starting positions of tokens (instead of
// copying substrings) and assumes delimiter is ASCII space character.
void FindTokens(std::vector<size_t> &output, const std::string &str) const
{
// Skip delimiters at beginning.
size_t lastPos = str.find_first_not_of(' ', 0);
// Find first "non-delimiter".
size_t pos = str.find_first_of(' ', lastPos);
while (std::string::npos != pos || std::string::npos != lastPos) {
// Found a token, add it to the vector.
output.push_back(lastPos);
// Skip delimiters. Note the "not_of"
lastPos = str.find_first_not_of(' ', pos);
// Find next "non-delimiter"
pos = str.find_first_of(' ', lastPos);
}
}
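// e.g. FindTokens(out, " 12 7 3") fills out with {1, 4, 6}, so that
// std::atoi(str.c_str() + out[0]) recovers 12 without copying any substring.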
};
} // namespace Moses


@ -0,0 +1,55 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "RuleTableLoaderFactory.h"
#include "InputFileStream.h"
#include "RuleTableLoaderCompact.h"
#include "RuleTableLoaderStandard.h"
#include "UserMessage.h"
#include "Util.h"
#include <sstream>
namespace Moses
{
// Determines the rule table type by peeking inside the file then creates
// a suitable RuleTableLoader object.
std::auto_ptr<RuleTableLoader> RuleTableLoaderFactory::Create(
const std::string &path)
{
InputFileStream input(path);
std::string line;
std::getline(input, line);
std::vector<std::string> tokens;
Tokenize(tokens, line);
if (tokens.size() == 1) {
if (tokens[0] == "1") {
return std::auto_ptr<RuleTableLoader>(new RuleTableLoaderCompact());
}
std::stringstream msg;
msg << "Unsupported compact rule table format: " << tokens[0];
UserMessage::Add(msg.str());
return std::auto_ptr<RuleTableLoader>();
}
return std::auto_ptr<RuleTableLoader>(new RuleTableLoaderStandard());
}
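// Hypothetical usage, mirroring PhraseDictionarySCFG::Load:
//   std::auto_ptr<RuleTableLoader> loader = RuleTableLoaderFactory::Create(path);
//   if (loader.get()) loader->Load(input, output, inStream, weight, tableLimit,
//                                  languageModels, wpProducer, ruleTable);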
} // namespace Moses


@ -0,0 +1,37 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#pragma once
#include <memory>
#include <string>
namespace Moses
{
class RuleTableLoader;
// Creates a RuleTableLoader object suitable for loading the specified file.
class RuleTableLoaderFactory
{
public:
static std::auto_ptr<RuleTableLoader> Create(const std::string &);
};
}


@ -0,0 +1,133 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "RuleTableLoaderStandard.h"
#include <fstream>
#include <string>
#include <iterator>
#include <algorithm>
#include <sys/stat.h>
#include "PhraseDictionarySCFG.h"
#include "FactorCollection.h"
#include "Word.h"
#include "Util.h"
#include "InputFileStream.h"
#include "StaticData.h"
#include "WordsRange.h"
#include "UserMessage.h"
#include "ChartTranslationOptionList.h"
#include "DotChart.h"
#include "FactorCollection.h"
using namespace std;
namespace Moses
{
bool RuleTableLoaderStandard::Load(const std::vector<FactorType> &input
, const std::vector<FactorType> &output
, std::istream &inStream
, const std::vector<float> &weight
, size_t /* tableLimit */
, const LMList &languageModels
, const WordPenaltyProducer* wpProducer
, PhraseDictionarySCFG &ruleTable)
{
PrintUserTime("Start loading new format pt model");
const StaticData &staticData = StaticData::Instance();
const std::string& factorDelimiter = staticData.GetFactorDelimiter();
string line;
size_t count = 0;
while(getline(inStream, line)) {
vector<string> tokens;
vector<float> scoreVector;
TokenizeMultiCharSeparator(tokens, line , "|||" );
if (tokens.size() != 4 && tokens.size() != 5) {
stringstream strme;
strme << "Syntax error at " << ruleTable.GetFilePath() << ":" << count;
UserMessage::Add(strme.str());
abort();
}
const string &sourcePhraseString = tokens[0]
, &targetPhraseString = tokens[1]
, &scoreString = tokens[2]
, &alignString = tokens[3];
bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
TRACE_ERR( ruleTable.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n");
continue;
}
Tokenize<float>(scoreVector, scoreString);
const size_t numScoreComponents = ruleTable.GetFeature()->GetNumScoreComponents();
if (scoreVector.size() != numScoreComponents) {
stringstream strme;
strme << "Size of scoreVector != number (" << scoreVector.size() << "!="
<< numScoreComponents << ") of score components on line " << count;
UserMessage::Add(strme.str());
abort();
}
assert(scoreVector.size() == numScoreComponents);
// parse source & find pt node
// constituent labels
Word sourceLHS, targetLHS;
// source
Phrase sourcePhrase(Input, 0);
sourcePhrase.CreateFromStringNewFormat(Input, input, sourcePhraseString, factorDelimiter, sourceLHS);
// create target phrase obj
TargetPhrase *targetPhrase = new TargetPhrase(Output);
targetPhrase->CreateFromStringNewFormat(Output, output, targetPhraseString, factorDelimiter, targetLHS);
// rest of target phrase
targetPhrase->SetAlignmentInfo(alignString);
targetPhrase->SetTargetLHS(targetLHS);
//targetPhrase->SetDebugOutput(string("New Format pt ") + line);
// component score, for n-best output
std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore);
std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);
targetPhrase->SetScoreChart(ruleTable.GetFeature(), scoreVector, weight, languageModels,wpProducer);
TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(ruleTable, sourcePhrase, *targetPhrase);
phraseColl.Add(targetPhrase);
count++;
}
// sort and prune each target phrase collection
SortAndPrune(ruleTable);
return true;
}
}


@ -0,0 +1,40 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#pragma once
#include "RuleTableLoader.h"
namespace Moses
{
class RuleTableLoaderStandard : public RuleTableLoader
{
public:
bool Load(const std::vector<FactorType> &input,
const std::vector<FactorType> &output,
std::istream &inStream,
const std::vector<float> &weight,
size_t tableLimit,
const LMList &languageModels,
const WordPenaltyProducer* wpProducer,
PhraseDictionarySCFG &);
};
} // namespace Moses


@ -55,6 +55,10 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "SyntacticLanguageModel.h"
#endif
#ifdef WITH_THREADS
#include <boost/thread.hpp>
#endif
using namespace std;
namespace Moses
@ -184,7 +188,7 @@ bool StaticData::LoadData(Parameter *parameter)
m_nBestSize = Scan<size_t>( m_parameter->GetParam("n-best-list")[1] );
m_onlyDistinctNBest=(m_parameter->GetParam("n-best-list").size()>2 && m_parameter->GetParam("n-best-list")[2]=="distinct");
} else if (m_parameter->GetParam("n-best-list").size() == 1) {
UserMessage::Add(string("ERROR: wrong format for switch -n-best-list file size"));
UserMessage::Add(string("wrong format for switch -n-best-list file size"));
return false;
} else {
m_nBestSize = 0;
@ -195,6 +199,17 @@ bool StaticData::LoadData(Parameter *parameter)
m_nBestFactor = 20;
}
//lattice samples
if (m_parameter->GetParam("lattice-samples").size() ==2 ) {
m_latticeSamplesFilePath = m_parameter->GetParam("lattice-samples")[0];
m_latticeSamplesSize = Scan<size_t>(m_parameter->GetParam("lattice-samples")[1]);
} else if (m_parameter->GetParam("lattice-samples").size() != 0 ) {
UserMessage::Add(string("wrong format for switch -lattice-samples file size"));
return false;
} else {
m_latticeSamplesSize = 0;
}
// word graph
if (m_parameter->GetParam("output-word-graph").size() == 2)
m_outputWordGraph = true;
@ -412,6 +427,35 @@ bool StaticData::LoadData(Parameter *parameter)
m_lmcache_cleanup_threshold = (m_parameter->GetParam("clean-lm-cache").size() > 0) ?
Scan<size_t>(m_parameter->GetParam("clean-lm-cache")[0]) : 1;
m_threadCount = 1;
const std::vector<std::string> &threadInfo = m_parameter->GetParam("threads");
if (!threadInfo.empty()) {
if (threadInfo[0] == "all") {
#ifdef WITH_THREADS
m_threadCount = boost::thread::hardware_concurrency();
if (!m_threadCount) {
UserMessage::Add("-threads all specified but Boost doesn't know how many cores there are");
return false;
}
#else
UserMessage::Add("-threads all specified but moses not built with thread support");
return false;
#endif
} else {
m_threadCount = Scan<int>(threadInfo[0]);
if (m_threadCount < 1) {
UserMessage::Add("Specify at least one thread.");
return false;
}
#ifndef WITH_THREADS
if (m_threadCount > 1) {
UserMessage::Add(std::string("Error: Thread count of ") + threadInfo[0] + " but moses not built with thread support");
return false;
}
#endif
}
}
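// e.g. "-threads 4" requests four worker threads (more than one requires a
// build with thread support), while "-threads all" asks
// boost::thread::hardware_concurrency() for the core count.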
// Read in constraint decoding file, if provided
if(m_parameter->GetParam("constraint").size()) {
if (m_parameter->GetParam("search-algorithm").size() > 0


@ -128,6 +128,7 @@ protected:
m_maxHypoStackSize //! hypothesis-stack size that triggers pruning
, m_minHypoStackDiversity //! minimum number of hypothesis in stack for each source word coverage
, m_nBestSize
, m_latticeSamplesSize
, m_nBestFactor
, m_maxNoTransOptPerCoverage
, m_maxNoPartTransOpt
@ -137,7 +138,7 @@ protected:
std::string
m_constraintFileName;
std::string m_nBestFilePath;
std::string m_nBestFilePath, m_latticeSamplesFilePath;
bool m_fLMsLoaded, m_labeledNBestList,m_nBestIncludesAlignment;
bool m_dropUnknown; //! false = treat unknown words as unknowns, and translate them as themselves; true = drop (ignore) them
bool m_wordDeletionEnabled;
@ -226,6 +227,8 @@ protected:
UnknownLHSList m_unknownLHS;
WordAlignmentSort m_wordAlignmentSort;
int m_threadCount;
StaticData();
@ -434,12 +437,20 @@ public:
return m_nBestFilePath;
}
bool IsNBestEnabled() const {
return (!m_nBestFilePath.empty()) || m_mbr || m_useLatticeMBR || m_outputSearchGraph || m_useConsensusDecoding || !m_latticeSamplesFilePath.empty()
#ifdef HAVE_PROTOBUF
|| m_outputSearchGraphPB
#endif
;
}
size_t GetLatticeSamplesSize() const {
return m_latticeSamplesSize;
}
const std::string& GetLatticeSamplesFilePath() const {
return m_latticeSamplesFilePath;
}
size_t GetNBestFactor() const {
return m_nBestFactor;
}
@ -652,6 +663,10 @@ public:
WordAlignmentSort GetWordAlignmentSort() const {
return m_wordAlignmentSort;
}
int ThreadCount() const {
return m_threadCount;
}
};
}

View File

@ -66,6 +66,17 @@ TargetPhrase::TargetPhrase(FactorDirection direction)
printalign=StaticData::Instance().PrintAlignmentInfo();
}
TargetPhrase::TargetPhrase(const Phrase &phrase)
: Phrase(phrase)
, m_transScore(0.0)
, m_fullScore(0.0)
, m_sourcePhrase(phrase.GetDirection(),0)
, m_alignmentInfo(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo())
{
wordalignflag=StaticData::Instance().UseAlignmentInfo();
printalign=StaticData::Instance().PrintAlignmentInfo();
}
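// Note: this promoting constructor leaves both scores at zero and shares the
// collection's empty AlignmentInfo, so callers presumably fill in scores and
// alignment afterwards via the setters on TargetPhrase.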
TargetPhrase::~TargetPhrase()
{
}

View File

@ -64,6 +64,7 @@ protected:
public:
TargetPhrase(FactorDirection direction=Output);
TargetPhrase(FactorDirection direction, std::string out_string);
TargetPhrase(const Phrase &);
~TargetPhrase();
//! used by the unknown word handler- these targets
@ -151,6 +152,9 @@ public:
void SetAlignmentInfo(const std::string &alignString);
void SetAlignmentInfo(const std::set<std::pair<size_t,size_t> > &alignmentInfo);
void SetAlignmentInfo(const AlignmentInfo *alignmentInfo) {
m_alignmentInfo = alignmentInfo;
}
const AlignmentInfo &GetAlignmentInfo() const
{ return *m_alignmentInfo; }
@ -168,9 +172,8 @@ public:
return printalign;
}
void CreateCountInfo(const std::string &countStr);
TO_STRING();
};

View File

@ -41,6 +41,25 @@ TrellisPath::TrellisPath(const Hypothesis *hypo)
}
}
void TrellisPath::InitScore() {
m_totalScore = m_path[0]->GetWinningHypo()->GetTotalScore();
m_scoreBreakdown= m_path[0]->GetWinningHypo()->GetScoreBreakdown();
//calc score
size_t sizePath = m_path.size();
for (size_t pos = 0 ; pos < sizePath ; pos++) {
const Hypothesis *hypo = m_path[pos];
const Hypothesis *winningHypo = hypo->GetWinningHypo();
if (hypo != winningHypo) {
m_totalScore = m_totalScore - winningHypo->GetTotalScore() + hypo->GetTotalScore();
m_scoreBreakdown.MinusEquals(winningHypo->GetScoreBreakdown());
m_scoreBreakdown.PlusEquals(hypo->GetScoreBreakdown());
}
}
}
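// Descriptive note: m_totalScore starts from the winning path's total, and
// every edge whose hypothesis differs from its winning hypothesis swaps its
// own score in:
//   total = winning_total + sum over changed edges of (score(hypo) - score(winningHypo))
// The score breakdown receives the same MinusEquals/PlusEquals treatment, so
// only changed edges are rescored rather than the whole path.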
TrellisPath::TrellisPath(const TrellisPath &copy, size_t edgeIndex, const Hypothesis *arc)
:m_prevEdgeChanged(edgeIndex)
{
@ -60,22 +79,20 @@ TrellisPath::TrellisPath(const TrellisPath &copy, size_t edgeIndex, const Hypoth
prevHypo = prevHypo->GetPrevHypo();
}
// Calc score
InitScore();
}
TrellisPath::TrellisPath(const vector<const Hypothesis*> edges)
:m_prevEdgeChanged(NOT_FOUND)
{
m_path.resize(edges.size());
copy(edges.rbegin(),edges.rend(),m_path.begin());
InitScore();
}
void TrellisPath::CreateDeviantPaths(TrellisPathCollection &pathColl) const
{
const size_t sizePath = m_path.size();

View File

@ -41,6 +41,7 @@ class TrellisPathList;
class TrellisPath
{
friend std::ostream& operator<<(std::ostream&, const TrellisPath&);
friend class Manager;
protected:
std::vector<const Hypothesis *> m_path; //< list of hypotheses/arcs
@ -51,6 +52,11 @@ protected:
ScoreComponentCollection m_scoreBreakdown;
float m_totalScore;
//Used by Manager::LatticeSample()
TrellisPath(const std::vector<const Hypothesis*> edges);
void InitScore();
public:
TrellisPath(); // not implemented

View File

@ -373,6 +373,18 @@ inline std::string GetFirstString(const std::string& str, int& first_pos, const
return first_str;
}
template<class T>
T log_sum (T log_a, T log_b)
{
T v;
if (log_a < log_b) {
v = log_b+log ( 1 + exp ( log_a-log_b ));
} else {
v = log_a+log ( 1 + exp ( log_b-log_a ));
}
return ( v );
}
}
#endif
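The template computes log(exp(log_a) + exp(log_b)) without overflow by factoring out the larger term: log_sum(a, b) = max + log(1 + exp(min - max)). A minimal sanity check of that identity (a sketch assuming this Util.h is on the include path; values illustrative):

#include <cassert>
#include <cmath>
#include "Util.h"

int main() {
  // 0.25 + 0.75 = 1.0 in probability space, so the log-space sum is 0
  float s = Moses::log_sum(std::log(0.25f), std::log(0.75f));
  assert(std::fabs(s) < 1e-5f);
  return 0;
}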

View File

@ -14,34 +14,50 @@
function die () {
echo "$@" >&2
# Try to be as helpful as possible by detecting OS and making recommendations
if (( $(lsb_release -a | fgrep -ci "ubuntu") > 0 )); then
echo >&2
echo >&2 "Need to install build autotools on Ubuntu? Use:"
echo >&2 "sudo aptitude install autoconf automake libtool build-essential"
fi
if (( $(uname -a | fgrep -ci "darwin") > 0 )); then
echo >&2
echo >&2 "Having problems on Mac OSX?"
echo >&2 "You might have an old version of aclocal/automake. You'll need to upgrade these."
fi
exit 1
}
if [ -z "$ACLOCAL" ]
then
if [ -z "$ACLOCAL" ]; then
ACLOCAL=`which aclocal`
[ -n "$ACLOCAL" ] || die "aclocal not found on your system. Please install it or set $ACLOCAL"
fi
if [ -z "$AUTOMAKE" ]
then
if [ -z "$AUTOMAKE" ]; then
AUTOMAKE=`which automake`
[ -n "$AUTOMAKE" ] || die "automake not found on your system. Please install it or set $AUTOMAKE"
fi
if [ -z "$AUTOCONF" ]
then
if [ -z "$AUTOCONF" ]; then
AUTOCONF=`which autoconf`
[ -n "$AUTOCONF" ] || die "autoconf not found on your system. Please install it or set $AUTOCONF"
fi
if [ -z "$LIBTOOLIZE" ]
then
if [ -z "$LIBTOOLIZE" ]; then
LIBTOOLIZE=`which libtoolize`
if [ -z "$LIBTOOLIZE" ]
then
if [ -z "$LIBTOOLIZE" ]; then
LIBTOOLIZE=`which glibtoolize`
fi
[ -n "$LIBTOOLIZE" ] || die "libtoolize/glibtoolize not found on your system. Please install it or set $LIBTOOLIZE"
fi
echo >&2 "Detected aclocal: $($ACLOCAL --version | head -n1)"
echo >&2 "Detected autoconf: $($AUTOCONF --version | head -n1)"
echo >&2 "Detected automake: $($AUTOMAKE --version | head -n1)"
echo >&2 "Detected libtoolize: $($LIBTOOLIZE --version | head -n1)"
echo "Calling $ACLOCAL..."
$ACLOCAL -I m4 || die "aclocal failed"
@ -53,9 +69,17 @@ echo "Calling $LIBTOOLIZE"
$LIBTOOLIZE || die "libtoolize failed"
cores=$(grep -c ^processor /proc/cpuinfo 2>/dev/null)
if [ -z "$cores" ] || [ "$cores" -eq 0 ]; then
cores=2 # assume 2 cores if we can't figure it out
echo >&2 "Assuming 2 cores"
else
echo >&2 "Detected $cores cores"
fi
echo
echo "You should now be able to configure and build:"
echo " ./configure [--with-srilm=/path/to/srilm] [--with-irstlm=/path/to/irstlm] [--with-randlm=/path/to/randlm] [--without-kenlm] [--with-synlm] [--with-xmlrpc-c=/path/to/xmlrpc-c-config]"
echo " make -j 4"
echo " make -j ${cores}"
echo

View File

@ -23,10 +23,15 @@ my @tests = qw (
chart.target-syntax.ondisk
chart.hierarchical
chart.hierarchical-withsrilm
#chart.hierarchical-withkenlm
chart.hierarchical.ondisk
phrase.basic-surface-only
phrase.basic-surface-only-withirstlm
phrase.basic-surface-only-withirstlm-binlm
#phrase.basic-surface-only-withkenlm
#phrase.basic-surface-only-withkenlm.bin
phrase.basic-lm-oov
#phrase.basic-lm-oov-withkenlm
phrase.ptable-filtering
phrase.multi-factor
phrase.multi-factor-drop
@ -54,6 +59,7 @@ use POSIX qw ( strftime );
my $decoderPhrase = "$Bin/../moses-cmd/src/moses";
my $decoderChart = "$Bin/../moses-chart-cmd/src/moses_chart";
my $scoreExe = "$Bin/../scripts/training/phrase-extract/score";
my $kenlmBinarizer = "$Bin/../kenlm/build_binary";
my $test_dir;
my $BIN_TEST = $script_dir;
my $data_dir;
@ -105,6 +111,10 @@ foreach my $test (@tests)
{
$cmd .= "$BIN_TEST/run-test-mert.perl $test_run";
}
elsif ($test =~ /^kenlmbin/)
{
$cmd .= "$BIN_TEST/run-kenlm-binarizer.perl --binarizer=$kenlmBinarizer";
}
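# Illustrative: a test named, say, "kenlmbin.probing" (name hypothetical)
# would match /^kenlmbin/ above and invoke run-kenlm-binarizer.perl with the
# build_binary path computed earlier.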
else
{
print "FAIL";

View File

@ -0,0 +1,22 @@
#!/usr/bin/perl
BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, "$script_dir/../perllib"; }
use RegTestUtils;
# Emit stable KEY = value lines (load times, per-sentence scores) for the
# regression harness to compare
$x=0;
while (<>) {
chomp;
if (/^Finished loading LanguageModels/) {
my $time = RegTestUtils::readTime($_);
print "LMLOAD_TIME ~ $time\n";
}
if (/^Finished loading phrase tables/) {
my $time = RegTestUtils::readTime($_);
print "PTLOAD_TIME ~ $time\n";
}
next unless /^BEST TRANSLATION:/;
my $pscore = RegTestUtils::readHypoScore($_);
$x++;
print "SCORE_$x = $pscore\n";
}
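Presumably this filter is run over the decoder's log to turn the timing and score lines into stable KEY = value pairs that the regression harness can diff; a sketch of the invocation (script name hypothetical, since file names are not shown in this view):

perl filter-decoder-output.perl < decoder.log > results.txt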

View File

@ -0,0 +1,7 @@
#!/usr/bin/perl
# Number each translation so runs can be diffed line by line
$x=0;
while (<>) {
chomp;
$x++;
print "TRANSLATION_$x=$_\n";
}

Some files were not shown because too many files have changed in this diff.