Conflicts:
	moses/TranslationOptionCollection.cpp
	moses/TranslationOptionCollectionLattice.cpp
	moses/TranslationOptionCollectionLattice.h
	moses/TranslationOptionList.h
commit be5799dca3 by Ulrich Germann, 2015-02-06 01:30:00 +00:00
750 changed files with 39439 additions and 24068 deletions

@ -1,4 +1,4 @@
Please see the Moses website on how to compile and run Moses
http://www.statmt.org/moses/?n=Development.GetStarted
blah blah blah

Jamroot

@ -151,6 +151,10 @@ if [ option.get "with-probing-pt" : : "yes" ]
requirements += <library>boost_serialization ;
}
if [ option.get "with-vw" ] {
requirements += <define>HAVE_VW ;
}
project : default-build
<threading>multi
<warnings>on
@ -173,7 +177,8 @@ project : requirements
;
#Add directories here if you want their incidental targets too (i.e. tests).
build-projects lm util phrase-extract search moses moses/LM mert moses-cmd mira scripts regression-testing ;
build-projects lm util phrase-extract phrase-extract/syntax-common search moses moses/LM mert moses-cmd scripts regression-testing ;
# contrib/mira
if [ option.get "with-mm" : : "yes" ]
{
@ -225,11 +230,14 @@ phrase-extract//extract-ghkm
phrase-extract//pcfg-extract
phrase-extract//pcfg-score
phrase-extract//extract-mixed-syntax
phrase-extract//score-stsg
phrase-extract//filter-rule-table
biconcor
mira//mira
#contrib/mira//mira
contrib/server//mosesserver
mm
rephraser
contrib/c++tokenizer//tokenizer
;
@ -249,6 +257,7 @@ if [ path.exists $(TOP)/dist ] && $(prefix) != dist {
echo ;
}
#local temp = [ _shell "bash source ./s.sh" ] ;
local temp = [ _shell "mkdir bin" ] ;
local temp = [ _shell "rm bin/moses_chart" ] ;
local temp = [ _shell "cd bin && ln -s moses moses_chart" ] ;

@ -1,5 +1,5 @@
fakelib OnDiskPt : OnDiskWrapper.cpp SourcePhrase.cpp TargetPhrase.cpp Word.cpp Phrase.cpp PhraseNode.cpp TargetPhraseCollection.cpp Vocab.cpp OnDiskQuery.cpp ../moses//headers ;
exe CreateOnDiskPt : Main.cpp ../moses//moses OnDiskPt ;
exe queryOnDiskPt : queryOnDiskPt.cpp ../moses//moses OnDiskPt ;
exe CreateOnDiskPt : Main.cpp ..//boost_filesystem ../moses//moses OnDiskPt ;
exe queryOnDiskPt : queryOnDiskPt.cpp ..//boost_filesystem ../moses//moses OnDiskPt ;

@ -26,7 +26,6 @@
#include <cassert>
#include "moses/InputFileStream.h"
#include "moses/Util.h"
#include "moses/UserMessage.h"
#include "OnDiskWrapper.h"
#include "SourcePhrase.h"
#include "TargetPhrase.h"
@ -154,19 +153,19 @@ OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhr
break;
}
case 4: {
// store only the 3rd one (rule count)
float val = Moses::Scan<float>(tok);
misc[0] = val;
break;
// store only the 3rd one (rule count)
float val = Moses::Scan<float>(tok);
misc[0] = val;
break;
}
case 5: {
// sparse features
sparseFeatures << tok << " ";
// sparse features
sparseFeatures << tok << " ";
break;
}
case 6: {
property << tok << " ";
break;
property << tok << " ";
break;
}
default:
cerr << "ERROR in line " << line << endl;
@ -219,7 +218,7 @@ OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
phrase.AddWord(word);
if (retSourceTarget == 1) {
out = word;
out = word;
}
}
@ -230,7 +229,7 @@ OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
phrase.AddWord(word);
if (retSourceTarget == 2) {
out = word;
out = word;
}
}

@ -59,28 +59,28 @@ bool OnDiskWrapper::OpenForLoad(const std::string &filePath)
{
m_fileSource.open((filePath + "/Source.dat").c_str(), ios::in | ios::binary);
UTIL_THROW_IF(!m_fileSource.is_open(),
util::FileOpenException,
"Couldn't open file " << filePath << "/Source.dat");
util::FileOpenException,
"Couldn't open file " << filePath << "/Source.dat");
m_fileTargetInd.open((filePath + "/TargetInd.dat").c_str(), ios::in | ios::binary);
UTIL_THROW_IF(!m_fileTargetInd.is_open(),
util::FileOpenException,
"Couldn't open file " << filePath << "/TargetInd.dat");
util::FileOpenException,
"Couldn't open file " << filePath << "/TargetInd.dat");
m_fileTargetColl.open((filePath + "/TargetColl.dat").c_str(), ios::in | ios::binary);
UTIL_THROW_IF(!m_fileTargetColl.is_open(),
util::FileOpenException,
"Couldn't open file " << filePath << "/TargetColl.dat");
util::FileOpenException,
"Couldn't open file " << filePath << "/TargetColl.dat");
m_fileVocab.open((filePath + "/Vocab.dat").c_str(), ios::in);
UTIL_THROW_IF(!m_fileVocab.is_open(),
util::FileOpenException,
"Couldn't open file " << filePath << "/Vocab.dat");
util::FileOpenException,
"Couldn't open file " << filePath << "/Vocab.dat");
m_fileMisc.open((filePath + "/Misc.dat").c_str(), ios::in);
UTIL_THROW_IF(!m_fileMisc.is_open(),
util::FileOpenException,
"Couldn't open file " << filePath << "/Misc.dat");
util::FileOpenException,
"Couldn't open file " << filePath << "/Misc.dat");
// set up root node
LoadMisc();
@ -124,46 +124,46 @@ void OnDiskWrapper::BeginSave(const std::string &filePath
m_fileSource.open((filePath + "/Source.dat").c_str(), ios::out | ios::in | ios::binary | ios::ate | ios::trunc);
UTIL_THROW_IF(!m_fileSource.is_open(),
util::FileOpenException,
"Couldn't open file " << filePath << "/Source.dat");
util::FileOpenException,
"Couldn't open file " << filePath << "/Source.dat");
m_fileTargetInd.open((filePath + "/TargetInd.dat").c_str(), ios::out | ios::binary | ios::ate | ios::trunc);
UTIL_THROW_IF(!m_fileTargetInd.is_open(),
util::FileOpenException,
"Couldn't open file " << filePath << "/TargetInd.dat");
util::FileOpenException,
"Couldn't open file " << filePath << "/TargetInd.dat");
m_fileTargetColl.open((filePath + "/TargetColl.dat").c_str(), ios::out | ios::binary | ios::ate | ios::trunc);
UTIL_THROW_IF(!m_fileTargetColl.is_open(),
util::FileOpenException,
"Couldn't open file " << filePath << "/TargetColl.dat");
util::FileOpenException,
"Couldn't open file " << filePath << "/TargetColl.dat");
m_fileVocab.open((filePath + "/Vocab.dat").c_str(), ios::out | ios::ate | ios::trunc);
UTIL_THROW_IF(!m_fileVocab.is_open(),
util::FileOpenException,
"Couldn't open file " << filePath << "/Vocab.dat");
util::FileOpenException,
"Couldn't open file " << filePath << "/Vocab.dat");
m_fileMisc.open((filePath + "/Misc.dat").c_str(), ios::out | ios::ate | ios::trunc);
UTIL_THROW_IF(!m_fileMisc.is_open(),
util::FileOpenException,
"Couldn't open file " << filePath << "/Misc.dat");
util::FileOpenException,
"Couldn't open file " << filePath << "/Misc.dat");
// offset by 1. 0 offset is reserved
char c = 0xff;
m_fileSource.write(&c, 1);
UTIL_THROW_IF2(1 != m_fileSource.tellp(),
"Couldn't write to stream m_fileSource");
"Couldn't write to stream m_fileSource");
m_fileTargetInd.write(&c, 1);
UTIL_THROW_IF2(1 != m_fileTargetInd.tellp(),
"Couldn't write to stream m_fileTargetInd");
"Couldn't write to stream m_fileTargetInd");
m_fileTargetColl.write(&c, 1);
UTIL_THROW_IF2(1 != m_fileTargetColl.tellp(),
"Couldn't write to stream m_fileTargetColl");
"Couldn't write to stream m_fileTargetColl");
// set up root node
UTIL_THROW_IF2(GetNumCounts() != 1,
"Not sure what this is...");
"Not sure what this is...");
vector<float> counts(GetNumCounts());
counts[0] = DEFAULT_COUNT;
@ -212,8 +212,8 @@ UINT64 OnDiskWrapper::GetMisc(const std::string &key) const
std::map<std::string, UINT64>::const_iterator iter;
iter = m_miscInfo.find(key);
UTIL_THROW_IF2(iter == m_miscInfo.end()
, "Couldn't find value for key " << key
);
, "Couldn't find value for key " << key
);
return iter->second;
}
@ -238,7 +238,7 @@ Word *OnDiskWrapper::ConvertFromMoses(const std::vector<Moses::FactorType> &fact
break;
}
UTIL_THROW_IF2(factor == NULL,
"Expecting factor " << factorType << " at position " << ind);
"Expecting factor " << factorType << " at position " << ind);
strme << "|" << factor->GetString();
} // for (size_t factorType

@ -61,10 +61,12 @@ public:
, int numSourceFactors, int numTargetFactors, int numScores);
void EndSave();
Vocab &GetVocab()
{ return m_vocab; }
const Vocab &GetVocab() const
{ return m_vocab; }
Vocab &GetVocab() {
return m_vocab;
}
const Vocab &GetVocab() const {
return m_vocab;
}
size_t GetSourceWordSize() const;
size_t GetTargetWordSize() const;

@ -35,8 +35,8 @@ void Phrase::AddWord(WordPtr word)
void Phrase::AddWord(WordPtr word, size_t pos)
{
UTIL_THROW_IF2(!(pos < m_words.size()),
"Trying to get word " << pos << " when phrase size is " << m_words.size());
UTIL_THROW_IF2(!(pos < m_words.size()),
"Trying to get word " << pos << " when phrase size is " << m_words.size());
m_words.insert(m_words.begin() + pos + 1, word);
}

@ -166,10 +166,10 @@ char *TargetPhrase::WriteOtherInfoToMemory(OnDiskWrapper &onDiskWrapper, size_t
size_t propSize = m_property.size();
size_t memNeeded = sizeof(UINT64) // file pos (phrase id)
+ sizeof(UINT64) + 2 * sizeof(UINT64) * numAlign // align
+ sizeof(float) * numScores // scores
+ sizeof(UINT64) + sparseFeatureSize // sparse features string
+ sizeof(UINT64) + propSize; // property string
+ sizeof(UINT64) + 2 * sizeof(UINT64) * numAlign // align
+ sizeof(float) * numScores // scores
+ sizeof(UINT64) + sparseFeatureSize // sparse features string
+ sizeof(UINT64) + propSize; // property string
char *mem = (char*) malloc(memNeeded);
//memset(mem, 0, memNeeded);
@ -350,13 +350,13 @@ UINT64 TargetPhrase::ReadStringFromFile(std::fstream &fileTPColl, std::string &o
bytesRead += sizeof(UINT64);
if (strSize) {
char *mem = (char*) malloc(strSize + 1);
mem[strSize] = '\0';
fileTPColl.read(mem, strSize);
outStr = string(mem);
free(mem);
char *mem = (char*) malloc(strSize + 1);
mem[strSize] = '\0';
fileTPColl.read(mem, strSize);
outStr = string(mem);
free(mem);
bytesRead += strSize;
bytesRead += strSize;
}
return bytesRead;

@ -113,14 +113,12 @@ public:
virtual void DebugPrint(std::ostream &out, const Vocab &vocab) const;
void SetProperty(const std::string &value)
{
m_property = value;
void SetProperty(const std::string &value) {
m_property = value;
}
void SetSparseFeatures(const std::string &value)
{
m_sparseFeatures = value;
void SetSparseFeatures(const std::string &value) {
m_sparseFeatures = value;
}
};

@ -18,6 +18,7 @@
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <boost/algorithm/string/predicate.hpp>
#include "moses/FactorCollection.h"
#include "moses/Util.h"
#include "moses/Word.h"
@ -27,6 +28,7 @@
#include "util/exception.hh"
using namespace std;
using namespace boost::algorithm;
namespace OnDiskPt
{
@ -41,7 +43,7 @@ Word::~Word()
void Word::CreateFromString(const std::string &inString, Vocab &vocab)
{
if (inString.substr(0, 1) == "[" && inString.substr(inString.size() - 1, 1) == "]") {
if (starts_with(inString, "[") && ends_with(inString, "]")) {
// non-term
m_isNonTerminal = true;
string str = inString.substr(1, inString.size() - 2);
@ -105,18 +107,17 @@ void Word::ConvertToMoses(
overwrite = Moses::Word(m_isNonTerminal);
if (m_isNonTerminal) {
const std::string &tok = vocab.GetString(m_vocabId);
overwrite.SetFactor(0, factorColl.AddFactor(tok, m_isNonTerminal));
}
else {
// TODO: this conversion should have been done at load time.
util::TokenIter<util::SingleCharacter> tok(vocab.GetString(m_vocabId), '|');
const std::string &tok = vocab.GetString(m_vocabId);
overwrite.SetFactor(0, factorColl.AddFactor(tok, m_isNonTerminal));
} else {
// TODO: this conversion should have been done at load time.
util::TokenIter<util::SingleCharacter> tok(vocab.GetString(m_vocabId), '|');
for (std::vector<Moses::FactorType>::const_iterator t = outputFactorsVec.begin(); t != outputFactorsVec.end(); ++t, ++tok) {
UTIL_THROW_IF2(!tok, "Too few factors in \"" << vocab.GetString(m_vocabId) << "\"; was expecting " << outputFactorsVec.size());
overwrite.SetFactor(*t, factorColl.AddFactor(*tok, m_isNonTerminal));
}
UTIL_THROW_IF2(tok, "Too many factors in \"" << vocab.GetString(m_vocabId) << "\"; was expecting " << outputFactorsVec.size());
for (std::vector<Moses::FactorType>::const_iterator t = outputFactorsVec.begin(); t != outputFactorsVec.end(); ++t, ++tok) {
UTIL_THROW_IF2(!tok, "Too few factors in \"" << vocab.GetString(m_vocabId) << "\"; was expecting " << outputFactorsVec.size());
overwrite.SetFactor(*t, factorColl.AddFactor(*tok, m_isNonTerminal));
}
UTIL_THROW_IF2(tok, "Too many factors in \"" << vocab.GetString(m_vocabId) << "\"; was expecting " << outputFactorsVec.size());
}
}
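
The change above swaps the manual substr() comparisons for Boost string predicates. A minimal equivalent sketch (the function name is illustrative, not part of the commit):

#include <boost/algorithm/string/predicate.hpp>
#include <string>

// old: s.substr(0, 1) == "[" && s.substr(s.size() - 1, 1) == "]"
// new, as adopted by this commit:
bool IsBracketedNonTerminal(const std::string &s)
{
  return boost::algorithm::starts_with(s, "[")
         && boost::algorithm::ends_with(s, "]");
}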

@ -7,7 +7,8 @@ size_t lookup( string );
vector<string> tokenize( const char input[] );
SuffixArray suffixArray;
int main(int argc, char* argv[]) {
int main(int argc, char* argv[])
{
// handle parameters
string query;
string fileNameSuffix;
@ -95,14 +96,14 @@ int main(int argc, char* argv[]) {
}
cout << lookup( query ) << endl;
}
}
else if (queryFlag) {
} else if (queryFlag) {
cout << lookup( query ) << endl;
}
return 0;
}
size_t lookup( string query ) {
size_t lookup( string query )
{
cerr << "query is " << query << endl;
vector< string > queryString = tokenize( query.c_str() );
return suffixArray.Count( queryString );

@ -0,0 +1,13 @@
with-re2 = [ option.get "with-re2" ] ;
if $(with-re2) {
lib re2 : : <search>$(with-re2)/lib ;
external-lib glib-2.0 ;
glib-cflags = [ _shell "pkg-config --cflags glib-2.0" ] ;
includes += <include>$(with-re2)/include ;
exe tokenizer : tokenizer.cpp tokenizer_main.cpp Parameters.cpp re2 glib-2.0 : <cflags>-std=c++0x <cflags>$(glib-cflags) $(includes) ;
}
else {
alias tokenizer ;
}

@ -0,0 +1,27 @@
#include "Parameters.h"
#ifdef TOKENIZER_NAMESPACE
namespace TOKENIZER_NAMESPACE {
#endif
Parameters::Parameters()
:cfg_path(0),
next_cfg_p(false),
next_output_p(false),
verbose_p(false),
detag_p(false),
alltag_p(false),
escape_p(true),
aggro_p(false),
supersub_p(false),
url_p(true),
downcase_p(false),
penn_p(false),
words_p(false)
{
}
#ifdef TOKENIZER_NAMESPACE
}
#endif

@ -0,0 +1,37 @@
#pragma once
#include <string>
#include <vector>
#ifdef TOKENIZER_NAMESPACE
namespace TOKENIZER_NAMESPACE {
#endif
struct Parameters
{
std::string lang_iso;
std::vector<std::string> args;
std::string out_path;
const char *cfg_path;
bool next_cfg_p;
bool next_output_p;
bool verbose_p;
bool detag_p;
bool alltag_p;
bool escape_p;
bool aggro_p;
bool supersub_p;
bool url_p;
bool downcase_p;
bool penn_p;
bool words_p;
Parameters();
};
#ifdef TOKENIZER_NAMESPACE
}
#endif

File diff suppressed because it is too large.

@ -0,0 +1,125 @@
#include <string>
#include <iostream>
#include <cstdlib>
#include <fstream>
#include <sstream>
#include <unordered_map>
#include <set>
#include <vector>
#include <iterator>
#include <stdexcept>
#include <re2/re2.h>
#include <unistd.h>
#ifdef TOKENIZER_NAMESPACE
namespace TOKENIZER_NAMESPACE {
#endif
//
// @about
// Tokenizer implements the process of Koehn's tokenizer.perl via RE2
//
class Tokenizer {
private:
static std::string cfg_dir;
std::set<std::string> nbpre_num_set;
std::set<std::string> nbpre_gen_set;
std::set<std::wstring> nbpre_num_ucs4;
std::set<std::wstring> nbpre_gen_ucs4;
std::vector<re2::RE2 *> prot_pat_vec;
protected:
// language
std::string lang_iso;
bool english_p; // is lang_iso "en"
bool latin_p; // is lang_iso "fr" or "it"
bool skip_xml_p;
bool skip_alltags_p;
bool non_escape_p;
bool aggressive_hyphen_p;
bool supersub_p;
bool url_p;
bool downcase_p;
bool normalize_p;
bool penn_p;
bool verbose_p;
std::pair<int,int> load_prefixes(std::ifstream& ifs); // used by init(), parameterized by lang_iso
// escapes specials into entities from the set &|"'[] (after tokenization, when enabled)
bool escape(std::string& inplace);
// in-place 1 line tokenizer, replaces input string, depends on wrapper to set-up invariants
void protected_tokenize(std::string& inplace);
public:
// cfg_dir is assumed shared by all languages
static void set_config_dir(const std::string& _cfg_dir);
// no throw
Tokenizer(const std::string& _lang_iso = "en",
bool _skip_xml_p = true, // skips isolated (linewise) tags in any case
bool _skip_alltags_p = true, // skip all xml style tags
bool _non_escape_p = false, // default is to call escape method before return
bool _aggressive_hyphen_p = false, // hyphens become tokens when true
bool _supersub_p = false, // handle super/subscript numerics
bool _url_p = true,
bool _downcase_p = false,
bool _normalize_p = true,
bool _penn_p = false, // Treebank-3 compatible tokenization when true
bool _verbose_p = false);
// frees dynamically compiled expressions
~Tokenizer();
// required before other methods, may throw
void init();
// streaming tokenizer reads from is, writes to os, preserving line breaks
std::size_t tokenize(std::istream& is, std::ostream& os);
// tokenize padded line buffer to return string
std::string tokenize(const std::string& buf);
void tokenize(const std::string& buf, std::string& outs) {
outs = tokenize(buf);
}
// tokenize to a vector
std::vector<std::string> tokens(const std::string& in) {
std::istringstream tokss(tokenize(in));
std::vector<std::string> outv;
std::copy(std::istream_iterator<std::string>(tokss),
std::istream_iterator<std::string>(),
std::back_inserter(outv));
return outv;
}
// streaming detokenizer reads from is, writes to os, preserving breaks
std::size_t detokenize(std::istream& is, std::ostream &os);
// detokenize padded line buffer to return string
std::string detokenize(const std::string& buf);
void detokenize(const std::string& buf, std::string& outs) {
outs = detokenize(buf);
}
// detokenize from a vector
std::string detokenize(const std::vector<std::string>& inv) {
std::ostringstream oss;
std::copy(inv.begin(), inv.end(), std::ostream_iterator<std::string>(oss," "));
return detokenize(oss.str());
}
}; // end class Tokenizer
#ifdef TOKENIZER_NAMESPACE
};
#endif
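
A minimal usage sketch for the Tokenizer class declared above (hypothetical driver; the config path and input text are assumptions, and the class sits in TOKENIZER_NAMESPACE when that macro is defined):

#include <iostream>
#include <sstream>
#include "tokenizer.h"

int main()
{
  // Directory holding the nonbreaking_prefix.* pattern files.
  Tokenizer::set_config_dir("./scripts/share");
  Tokenizer tok("en");         // defaults: skip XML, escape entities, handle URLs
  tok.init();                  // compiles the RE2 patterns; may throw
  std::istringstream in("Hello, world (from Moses)!\n");
  tok.tokenize(in, std::cout); // streaming form, preserves line breaks
  return 0;
}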

@ -0,0 +1,245 @@
#include "tokenizer.h"
#include "Parameters.h"
#include <memory>
#include <vector>
#include <cctype>
#include <cstring>
#ifdef TOKENIZER_NAMESPACE
using namespace TOKENIZER_NAMESPACE ;
#endif
void
usage(const char *path)
{
std::cerr << "Usage: " << path << "[-{v|x|p|a|e|s|u]* [LL] [-{c|o} PATH]* INFILE*" << std::endl;
std::cerr << " -a -- aggressive hyphenization" << std::endl;
std::cerr << " -e -- escape entities" << std::endl;
std::cerr << " -c DIR -- config (pattern) file directory" << std::endl;
std::cerr << " -d -- downcase" << std::endl;
std::cerr << " -o OUT -- output file path" << std::endl;
std::cerr << " -p -- penn treebank style" << std::endl;
std::cerr << " -s -- super- and sub-script conjoining" << std::endl;
std::cerr << " -u -- disable url handling" << std::endl;
std::cerr << " -v -- verbose" << std::endl;
std::cerr << " -w -- word filter" << std::endl;
std::cerr << " -x -- skip xml tag lines" << std::endl;
std::cerr << " -y -- skip all xml tags" << std::endl;
std::cerr << "Default is -c ., stdin, stdout." << std::endl;
std::cerr << "LL in en,fr,it affect contraction." << std::endl;
}
std::string token_word(const std::string& in) {
int pos = -1;
int digits_prefixed = 0;
int nalpha = 0;
int len = in.size();
std::vector<char> cv;
int last_quirk = -1;
while (++pos < len) {
char ch = in.at(pos);
if (std::isdigit(ch)) {
if (digits_prefixed > 0) {
last_quirk = pos;
break;
}
digits_prefixed--;
cv.push_back(std::tolower(ch));
} else if (std::isalpha(ch)) {
if (digits_prefixed < 0)
digits_prefixed = -digits_prefixed;
cv.push_back(std::tolower(ch));
nalpha++;
} else {
if (digits_prefixed < 0)
digits_prefixed = -digits_prefixed;
last_quirk = pos;
if ((ch == '-' || ch == '\'') && pos != 0) {
cv.push_back(ch);
} else {
break;
}
}
}
if (last_quirk == pos || (digits_prefixed > 0 && nalpha == 0))
cv.clear(); // invalid word
return std::string(cv.begin(),cv.end());
}
int
copy_words(Tokenizer& tize, std::istream& ifs, std::ostream& ofs) {
int nlines = 0;
std::string line;
while (ifs.good() && std::getline(ifs,line)) {
if (line.empty()) continue;
std::vector<std::string> tokens(tize.tokens(line));
int count = 0;
for (auto& token: tokens) {
std::string word(token_word(token));
if (word.empty()) continue;
ofs << word << ' ';
count++;
}
if (count) {
ofs << std::endl;
nlines++;
}
}
return nlines;
}
int main(int ac, char **av)
{
int rc = 0;
Parameters params;
const char *prog = av[0];
while (++av,--ac) {
if (**av == '-') {
switch (av[0][1]) {
case 'a':
params.aggro_p = true;
break;
case 'h':
usage(prog);
exit(0);
case 'c':
params.next_cfg_p = true;
break;
case 'd':
params.downcase_p = true;
break;
case 'e':
params.escape_p = false;
break;
case 'o':
params.next_output_p = true;
break;
case 'p':
params.penn_p = true;
break;
case 's':
params.supersub_p = true;
break;
case 'u':
params.url_p = false;
break;
case 'v':
params.verbose_p = true;
break;
case 'w':
params.words_p = true;
break;
case 'x':
params.detag_p = true;
break;
case 'y':
params.alltag_p = true;
break;
case 'l':
// ignored
break;
default:
std::cerr << "Unknown option: " << *av << std::endl;
::exit(1);
}
} else if (params.lang_iso.empty() && strlen(*av) == 2) {
params.lang_iso = *av;
} else if (params.next_output_p) {
params.next_output_p = false;
params.out_path = *av;
} else if (params.next_cfg_p) {
params.next_cfg_p = false;
params.cfg_path = *av;
} else {
params.args.push_back(std::string(*av));
}
}
if (!params.cfg_path) {
params.cfg_path = getenv("TOKENIZER_SHARED_DIR");
}
if (!params.cfg_path) {
if (!::access("../share/.",X_OK)) {
if (!::access("../share/moses/.",X_OK)) {
params.cfg_path = "../share/moses";
} else {
params.cfg_path = "../share";
}
} else if (!::access("./scripts/share/.",X_OK)) {
params.cfg_path = "./scripts/share";
} else if (!::access("./nonbreaking_prefix.en",R_OK)) {
params.cfg_path = ".";
} else {
const char *slash = std::strrchr(prog,'/');
if (slash) {
std::string cfg_dir_str(prog,slash-prog);
std::string cfg_shr_str(cfg_dir_str);
cfg_shr_str.append("/shared");
std::string cfg_mos_str(cfg_shr_str);
cfg_mos_str.append("/moses");
if (!::access(cfg_mos_str.c_str(),X_OK)) {
params.cfg_path = strdup(cfg_mos_str.c_str());
} else if (!::access(cfg_shr_str.c_str(),X_OK)) {
params.cfg_path = strdup(cfg_shr_str.c_str());
} else if (!::access(cfg_dir_str.c_str(),X_OK)) {
params.cfg_path = strdup(cfg_dir_str.c_str());
}
}
}
}
if (params.cfg_path) {
if (params.verbose_p) {
std::cerr << "config path: " << params.cfg_path << std::endl;
}
Tokenizer::set_config_dir(std::string(params.cfg_path));
}
std::unique_ptr<std::ofstream> pofs = 0;
if (!params.out_path.empty()) {
pofs.reset(new std::ofstream(params.out_path.c_str()));
}
std::ostream& ofs(pofs ? *pofs : std::cout);
Tokenizer tize(params.lang_iso,params.detag_p,params.alltag_p,!params.escape_p,params.aggro_p,params.supersub_p,params.url_p,params.downcase_p,params.penn_p,params.verbose_p);
tize.init();
size_t nlines = 0;
if (params.words_p) {
if (params.args.empty()) {
nlines += copy_words(tize,std::cin,ofs);
} else {
for (std::string& arg : params.args) {
try {
std::ifstream ifs(arg.c_str());
nlines += copy_words(tize,ifs,ofs);
} catch (...) {
std::cerr << "Exception extracting words from path " << arg << std::endl;
}
}
}
} else if (params.args.empty()) {
nlines = tize.tokenize(std::cin,ofs);
} else {
for (std::string& arg : params.args) {
try {
std::ifstream ifs(arg.c_str());
nlines = tize.tokenize(ifs,ofs);
} catch (...) {
std::cerr << "Exception tokenizing from path " << arg << std::endl;
}
}
}
if (params.verbose_p)
std::cerr << "%%% tokenized lines: " << nlines << std::endl;
return rc;
}

@ -143,8 +143,8 @@ vector< vector<const Word*> > MosesDecoder::runDecoder(const std::string& source
string filename)
{
// run the decoder
m_manager = new Moses::Manager(*m_sentence, search);
m_manager->ProcessSentence();
m_manager = new Moses::Manager(*m_sentence);
m_manager->Decode();
TrellisPathList nBestList;
m_manager->CalcNBest(nBestSize, nBestList, distinct);
@ -221,7 +221,7 @@ vector< vector<const Word*> > MosesDecoder::runChartDecoder(const std::string& s
{
// run the decoder
m_chartManager = new ChartManager(*m_sentence);
m_chartManager->ProcessSentence();
m_chartManager->Decode();
ChartKBestExtractor::KBestVec nBestList;
m_chartManager->CalcNBest(nBestSize, nBestList, distinct);
@ -229,7 +229,7 @@ vector< vector<const Word*> > MosesDecoder::runChartDecoder(const std::string& s
for (ChartKBestExtractor::KBestVec::const_iterator p = nBestList.begin();
p != nBestList.end(); ++p) {
const ChartKBestExtractor::Derivation &derivation = **p;
featureValues.push_back(derivation.scoreBreakdown);
featureValues.push_back(*ChartKBestExtractor::GetOutputScoreBreakdown(derivation));
float bleuScore, dynBleuScore, realBleuScore;
dynBleuScore = getBleuScore(featureValues.back());
Phrase outputPhrase = ChartKBestExtractor::GetOutputPhrase(derivation);
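
The hunks above show an API change that recurs throughout this commit: Manager no longer takes a search-algorithm argument, and ProcessSentence() is renamed Decode() (ChartManager likewise). A sketch of the new call sequence, assuming an already-populated Moses::Sentence:

#include "moses/Manager.h"
#include "moses/Sentence.h"

void DecodeOne(Moses::Sentence &sentence)
{
  // old: Moses::Manager manager(sentence, staticData.GetSearchAlgorithm());
  //      manager.ProcessSentence();
  Moses::Manager manager(sentence); // search algorithm now resolved internally
  manager.Decode();
  const Moses::Hypothesis *best = manager.GetBestHypothesis();
  (void) best; // e.g. feed into n-best extraction as in runDecoder() above
}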

contrib/mira/Jamfile

@ -0,0 +1,15 @@
lib mira_lib :
[ glob *.cpp : *Test.cpp Main.cpp ]
../../mert//mert_lib ../../moses//moses ../../OnDiskPt//OnDiskPt ../..//boost_program_options ;
exe mira : Main.cpp mira_lib ../../mert//mert_lib ../../moses//moses ../../OnDiskPt//OnDiskPt ../..//boost_program_options ../..//boost_filesystem ;
alias programs : mira ;
import testing ;
unit-test mira_test : [ glob *Test.cpp ] mira_lib ..//boost_unit_test_framework ;
explicit mira_test ;

@ -1236,15 +1236,15 @@ int main(int argc, char** argv)
cerr << "Rank " << rank << ", epoch " << epoch << ", model score hope: " << modelScoresHope[0][0] << endl;
cerr << "Rank " << rank << ", epoch " << epoch << ", model score fear: " << modelScoresFear[0][0] << endl;
update_status = ((MiraOptimiser*) optimiser)->updateWeightsAnalytically(
weightUpdate, featureValuesHope[0][0], featureValuesFear[0][0],
bleuScoresHope[0][0], bleuScoresFear[0][0], modelScoresHope[0][0],
modelScoresFear[0][0], learning_rate, rank, epoch);
weightUpdate, featureValuesHope[0][0], featureValuesFear[0][0],
bleuScoresHope[0][0], bleuScoresFear[0][0], modelScoresHope[0][0],
modelScoresFear[0][0], learning_rate, rank, epoch);
} else {
cerr << "Rank " << rank << ", epoch " << epoch << ", model score hope: " << modelScoresHope[0][0] << endl;
cerr << "Rank " << rank << ", epoch " << epoch << ", model score fear: " << modelScoresFear[0][0] << endl;
update_status = optimiser->updateWeightsHopeFear(weightUpdate, featureValuesHope,
featureValuesFear, bleuScoresHope, bleuScoresFear, modelScoresHope,
modelScoresFear, learning_rate, rank, epoch);
featureValuesFear, bleuScoresHope, bleuScoresFear, modelScoresHope,
modelScoresFear, learning_rate, rank, epoch);
}
} else {
// model_hope_fear

@ -5,12 +5,12 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.2091728208" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@ -48,6 +48,7 @@
<listOptionValue builtIn="false" value="boost_filesystem"/>
<listOptionValue builtIn="false" value="pthread"/>
<listOptionValue builtIn="false" value="z"/>
<listOptionValue builtIn="false" value="bz2"/>
<listOptionValue builtIn="false" value="dl"/>
<listOptionValue builtIn="false" value="rt"/>
</option>
@ -86,12 +87,12 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.185559773" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@ -156,4 +157,5 @@
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.internal.ui.text.commentOwnerProjectMappings"/>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
</cproject>

@ -25,6 +25,7 @@
<option id="gnu.cpp.compiler.exe.debug.option.debugging.level.730994342" name="Debug Level" superClass="gnu.cpp.compiler.exe.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.include.paths.1461708548" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/include&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../..&quot;"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1669405610" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>

@ -3,6 +3,7 @@
<name>extract-ghkm</name>
<comment></comment>
<projects>
<project>moses</project>
</projects>
<buildSpec>
<buildCommand>

@ -59,18 +59,11 @@
</tool>
</toolChain>
</folderInfo>
<folderInfo id="cdt.managedbuild.config.gnu.macosx.exe.debug.351042750.38452119" name="/" resourcePath="wrappers">
<toolChain id="cdt.managedbuild.toolchain.gnu.macosx.exe.debug.1621748368" name="MacOSX GCC" superClass="cdt.managedbuild.toolchain.gnu.macosx.exe.debug" unusedChildren="">
<tool id="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug.2002161718" name="MacOS X C Linker" superClass="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug.1252826468"/>
<tool id="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug.2138497585" name="MacOS X C++ Linker" superClass="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug.1024598065"/>
<tool id="cdt.managedbuild.tool.gnu.assembler.macosx.exe.debug.86927135" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.macosx.exe.debug.62265891"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.macosx.base.315991018" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.macosx.base.775866405"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.macosx.exe.debug.1319557326" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.macosx.exe.debug.1024092140"/>
<tool id="cdt.managedbuild.tool.gnu.c.compiler.macosx.exe.debug.1042051280" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.macosx.exe.debug.34201722"/>
</toolChain>
</folderInfo>
<fileInfo id="cdt.managedbuild.config.gnu.macosx.exe.debug.351042750.279457772" name="corpus_count_test.cc" rcbsApplicability="disable" resourcePath="builder/corpus_count_test.cc" toolsToInvoke="cdt.managedbuild.tool.gnu.cpp.compiler.macosx.exe.debug.1024092140.654966100">
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.macosx.exe.debug.1024092140.654966100" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.macosx.exe.debug.1024092140"/>
</fileInfo>
<sourceEntries>
<entry excluding="wrappers|left_test.cc|model_test.cc" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
<entry excluding="builder/corpus_count_test.cc|builder/adjust_counts_test.cc|wrappers|left_test.cc|model_test.cc" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
</sourceEntries>
</configuration>
</storageModule>

@ -8,10 +8,12 @@
#include <stdio.h>
#include <algorithm>
#include <fstream>
#include <boost/algorithm/string/predicate.hpp>
#include "EnOpenNLPChunker.h"
#include "moses/Util.h"
using namespace std;
using namespace boost::algorithm;
EnOpenNLPChunker::EnOpenNLPChunker(const std::string &openNLPPath)
:m_openNLPPath(openNLPPath)
@ -85,7 +87,7 @@ void EnOpenNLPChunker::MosesReformat(const string &line, std::ostream &out, cons
inLabel = true;
}
}
else if (tok.substr(tok.size()-1, 1) == "]") {
else if (ends_with(tok, "]")) {
// end of chunk
if (tok.size() > 1) {
if (tok.substr(1,1) == "_") {

@ -60,6 +60,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/AlignmentInfoTest.cpp</locationURI>
</link>
<link>
<name>BaseManager.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/BaseManager.cpp</locationURI>
</link>
<link>
<name>BaseManager.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/BaseManager.h</locationURI>
</link>
<link>
<name>BitmapContainer.cpp</name>
<type>1</type>
@ -325,6 +335,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FloydWarshall.h</locationURI>
</link>
<link>
<name>ForestInput.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/ForestInput.cpp</locationURI>
</link>
<link>
<name>ForestInput.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/ForestInput.h</locationURI>
</link>
<link>
<name>GenerationDictionary.cpp</name>
<type>1</type>
@ -570,16 +590,6 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Phrase.h</locationURI>
</link>
<link>
<name>PhraseOrientation.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/PhraseOrientation.cpp</locationURI>
</link>
<link>
<name>PhraseOrientation.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/PhraseOrientation.h</locationURI>
</link>
<link>
<name>PrefixTree.h</name>
<type>1</type>
@ -760,6 +770,16 @@
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>TabbedSentence.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TabbedSentence.cpp</locationURI>
</link>
<link>
<name>TabbedSentence.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TabbedSentence.h</locationURI>
</link>
<link>
<name>TargetPhrase.cpp</name>
<type>1</type>
@ -935,16 +955,6 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/UniqueObject.h</locationURI>
</link>
<link>
<name>UserMessage.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/UserMessage.cpp</locationURI>
</link>
<link>
<name>UserMessage.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/UserMessage.h</locationURI>
</link>
<link>
<name>Util.cpp</name>
<type>1</type>
@ -1010,6 +1020,11 @@
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>extract-ghkm</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>gzfilebuf.h</name>
<type>1</type>
@ -1105,6 +1120,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/DistortionScoreProducer.h</locationURI>
</link>
<link>
<name>FF/DynamicCacheBasedLanguageModel.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/DynamicCacheBasedLanguageModel.cpp</locationURI>
</link>
<link>
<name>FF/DynamicCacheBasedLanguageModel.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/DynamicCacheBasedLanguageModel.h</locationURI>
</link>
<link>
<name>FF/ExternalFeature.cpp</name>
<type>1</type>
@ -1470,6 +1495,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/TreeStructureFeature.h</locationURI>
</link>
<link>
<name>FF/UnalignedWordCountFeature.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/UnalignedWordCountFeature.cpp</locationURI>
</link>
<link>
<name>FF/UnalignedWordCountFeature.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/UnalignedWordCountFeature.h</locationURI>
</link>
<link>
<name>FF/UnknownWordPenaltyProducer.cpp</name>
<type>1</type>
@ -1840,6 +1875,11 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/CubeQueue.h</locationURI>
</link>
<link>
<name>Syntax/F2S</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>Syntax/KBestExtractor.cpp</name>
<type>1</type>
@ -1850,6 +1890,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/KBestExtractor.h</locationURI>
</link>
<link>
<name>Syntax/Manager.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/Manager.cpp</locationURI>
</link>
<link>
<name>Syntax/Manager.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/Manager.h</locationURI>
</link>
<link>
<name>Syntax/NonTerminalMap.h</name>
<type>1</type>
@ -1935,6 +1985,11 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/SymbolHasher.h</locationURI>
</link>
<link>
<name>Syntax/T2S</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>TranslationModel/BilingualDynSuffixArray.cpp</name>
<type>1</type>
@ -1995,6 +2050,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/PhraseDictionaryDynSuffixArray.h</locationURI>
</link>
<link>
<name>TranslationModel/PhraseDictionaryDynamicCacheBased.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/PhraseDictionaryDynamicCacheBased.cpp</locationURI>
</link>
<link>
<name>TranslationModel/PhraseDictionaryDynamicCacheBased.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/PhraseDictionaryDynamicCacheBased.h</locationURI>
</link>
<link>
<name>TranslationModel/PhraseDictionaryMemory.cpp</name>
<type>1</type>
@ -2135,6 +2200,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/bin/lm.log</locationURI>
</link>
<link>
<name>extract-ghkm/PhraseOrientation.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/PhraseOrientation.cpp</locationURI>
</link>
<link>
<name>extract-ghkm/PhraseOrientation.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/PhraseOrientation.h</locationURI>
</link>
<link>
<name>FF/LexicalReordering/LexicalReordering.cpp</name>
<type>1</type>
@ -2275,6 +2350,141 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/LM/oxlm/SourceOxLM.h</locationURI>
</link>
<link>
<name>Syntax/F2S/DerivationWriter.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/DerivationWriter.cpp</locationURI>
</link>
<link>
<name>Syntax/F2S/DerivationWriter.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/DerivationWriter.h</locationURI>
</link>
<link>
<name>Syntax/F2S/Forest.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/Forest.cpp</locationURI>
</link>
<link>
<name>Syntax/F2S/Forest.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/Forest.h</locationURI>
</link>
<link>
<name>Syntax/F2S/GlueRuleSynthesizer.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/GlueRuleSynthesizer.cpp</locationURI>
</link>
<link>
<name>Syntax/F2S/GlueRuleSynthesizer.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/GlueRuleSynthesizer.h</locationURI>
</link>
<link>
<name>Syntax/F2S/HyperPath.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/HyperPath.cpp</locationURI>
</link>
<link>
<name>Syntax/F2S/HyperPath.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/HyperPath.h</locationURI>
</link>
<link>
<name>Syntax/F2S/HyperPathLoader.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/HyperPathLoader.cpp</locationURI>
</link>
<link>
<name>Syntax/F2S/HyperPathLoader.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/HyperPathLoader.h</locationURI>
</link>
<link>
<name>Syntax/F2S/HyperTree.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/HyperTree.cpp</locationURI>
</link>
<link>
<name>Syntax/F2S/HyperTree.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/HyperTree.h</locationURI>
</link>
<link>
<name>Syntax/F2S/HyperTreeCreator.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/HyperTreeCreator.h</locationURI>
</link>
<link>
<name>Syntax/F2S/HyperTreeLoader.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/HyperTreeLoader.cpp</locationURI>
</link>
<link>
<name>Syntax/F2S/HyperTreeLoader.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/HyperTreeLoader.h</locationURI>
</link>
<link>
<name>Syntax/F2S/Manager-inl.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/Manager-inl.h</locationURI>
</link>
<link>
<name>Syntax/F2S/Manager.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/Manager.h</locationURI>
</link>
<link>
<name>Syntax/F2S/PHyperedgeToSHyperedgeBundle.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/PHyperedgeToSHyperedgeBundle.h</locationURI>
</link>
<link>
<name>Syntax/F2S/PVertexToStackMap.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/PVertexToStackMap.h</locationURI>
</link>
<link>
<name>Syntax/F2S/RuleMatcher.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/RuleMatcher.h</locationURI>
</link>
<link>
<name>Syntax/F2S/RuleMatcherCallback.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/RuleMatcherCallback.h</locationURI>
</link>
<link>
<name>Syntax/F2S/RuleMatcherHyperTree-inl.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/RuleMatcherHyperTree-inl.h</locationURI>
</link>
<link>
<name>Syntax/F2S/RuleMatcherHyperTree.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/RuleMatcherHyperTree.h</locationURI>
</link>
<link>
<name>Syntax/F2S/TopologicalSorter.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/TopologicalSorter.cpp</locationURI>
</link>
<link>
<name>Syntax/F2S/TopologicalSorter.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/TopologicalSorter.h</locationURI>
</link>
<link>
<name>Syntax/F2S/TreeFragmentTokenizer.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/TreeFragmentTokenizer.cpp</locationURI>
</link>
<link>
<name>Syntax/F2S/TreeFragmentTokenizer.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/TreeFragmentTokenizer.h</locationURI>
</link>
<link>
<name>Syntax/S2T/DerivationWriter.cpp</name>
<type>1</type>
@ -2380,6 +2590,96 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/SChart.h</locationURI>
</link>
<link>
<name>Syntax/T2S/GlueRuleSynthesizer.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/T2S/GlueRuleSynthesizer.cpp</locationURI>
</link>
<link>
<name>Syntax/T2S/GlueRuleSynthesizer.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/T2S/GlueRuleSynthesizer.h</locationURI>
</link>
<link>
<name>Syntax/T2S/HyperTree.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/T2S/HyperTree.h</locationURI>
</link>
<link>
<name>Syntax/T2S/InputTree.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/T2S/InputTree.h</locationURI>
</link>
<link>
<name>Syntax/T2S/InputTreeBuilder.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/T2S/InputTreeBuilder.cpp</locationURI>
</link>
<link>
<name>Syntax/T2S/InputTreeBuilder.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/T2S/InputTreeBuilder.h</locationURI>
</link>
<link>
<name>Syntax/T2S/InputTreeToForest.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/T2S/InputTreeToForest.cpp</locationURI>
</link>
<link>
<name>Syntax/T2S/InputTreeToForest.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/T2S/InputTreeToForest.h</locationURI>
</link>
<link>
<name>Syntax/T2S/Manager-inl.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/T2S/Manager-inl.h</locationURI>
</link>
<link>
<name>Syntax/T2S/Manager.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/T2S/Manager.h</locationURI>
</link>
<link>
<name>Syntax/T2S/RuleMatcher.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/T2S/RuleMatcher.h</locationURI>
</link>
<link>
<name>Syntax/T2S/RuleMatcherSCFG-inl.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/T2S/RuleMatcherSCFG-inl.h</locationURI>
</link>
<link>
<name>Syntax/T2S/RuleMatcherSCFG.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/T2S/RuleMatcherSCFG.h</locationURI>
</link>
<link>
<name>Syntax/T2S/RuleTrie.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/T2S/RuleTrie.cpp</locationURI>
</link>
<link>
<name>Syntax/T2S/RuleTrie.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/T2S/RuleTrie.h</locationURI>
</link>
<link>
<name>Syntax/T2S/RuleTrieCreator.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/T2S/RuleTrieCreator.h</locationURI>
</link>
<link>
<name>Syntax/T2S/RuleTrieLoader.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/T2S/RuleTrieLoader.cpp</locationURI>
</link>
<link>
<name>Syntax/T2S/RuleTrieLoader.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/T2S/RuleTrieLoader.h</locationURI>
</link>
<link>
<name>TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.cpp</name>
<type>1</type>

@ -13,7 +13,12 @@ text = u"il a souhaité que la présidence trace à nice le chemin pour l' aveni
params = {"text":text, "align":"true", "report-all-factors":"true"}
result = proxy.translate(params)
print result['text']
if 'id' in result:
print "Segment ID:%s" % (result['id'])
if 'align' in result:
print "Phrase alignments:"
aligns = result['align']

@ -283,7 +283,7 @@ public:
stringstream in(source + "\n");
tinput.Read(in,inputFactorOrder);
ChartManager manager(tinput);
manager.ProcessSentence();
manager.Decode();
const ChartHypothesis *hypo = manager.GetBestHypothesis();
outputChartHypo(out,hypo);
if (addGraphInfo) {
@ -301,8 +301,8 @@ public:
inputFactorOrder = staticData.GetInputFactorOrder();
stringstream in(source + "\n");
sentence.Read(in,inputFactorOrder);
Manager manager(sentence, staticData.GetSearchAlgorithm());
manager.ProcessSentence();
Manager manager(sentence);
manager.Decode();
const Hypothesis* hypo = manager.GetBestHypothesis();
vector<xmlrpc_c::value> alignInfo;
@ -312,7 +312,7 @@ public:
}
if (addWordAlignInfo) {
stringstream wordAlignment;
IOWrapper::OutputAlignment(wordAlignment, hypo);
hypo->OutputAlignment(wordAlignment);
vector<xmlrpc_c::value> alignments;
string alignmentPair;
while (wordAlignment >> alignmentPair) {
@ -475,7 +475,8 @@ public:
if ((int)edges.size() > 0) {
stringstream wordAlignment;
IOWrapper::OutputAlignment(wordAlignment, edges[0]);
const Hypothesis *edge = edges[0];
edge->OutputAlignment(wordAlignment);
vector<xmlrpc_c::value> alignments;
string alignmentPair;
while (wordAlignment >> alignmentPair) {
@ -493,7 +494,7 @@ public:
{
// should the score breakdown be reported in a more structured manner?
ostringstream buf;
IOWrapper::OutputAllFeatureScores(path.GetScoreBreakdown(),buf);
path.GetScoreBreakdown().OutputAllFeatureScores(buf);
nBestXMLItem["fvals"] = xmlrpc_c::value_string(buf.str());
}

@ -184,7 +184,7 @@ rule boost ( min-version ) {
boost-lib system SYSTEM_DYN_LINK ;
boost-lib thread THREAD_DYN_DLL : boost_system ;
boost-lib program_options PROGRAM_OPTIONS_DYN_LINK ;
boost-lib unit_test_framework TEST_DYN_LINK ;
boost-lib unit_test_framework DELETE_ME_TEST_DYN_LINK ;
boost-lib iostreams IOSTREAMS_DYN_LINK ;
boost-lib filesystem FILE_SYSTEM_DYN_LINK ;
# if $(BOOST-VERSION) >= 104800 {

@ -137,6 +137,7 @@ int main(int argc, char *argv[]) {
case 't': // legacy
case 'T':
config.temporary_directory_prefix = optarg;
util::NormalizeTempPrefix(config.temporary_directory_prefix);
break;
case 'm': // legacy
config.building_memory = ParseUInt(optarg) * 1048576;

@ -4,6 +4,7 @@
#include <algorithm>
#include <iostream>
#include <limits>
namespace lm { namespace builder {
@ -108,9 +109,10 @@ class StatCollector {
// order but we don't care because the data is going to be sorted again.
class CollapseStream {
public:
CollapseStream(const util::stream::ChainPosition &position, uint64_t prune_threshold) :
CollapseStream(const util::stream::ChainPosition &position, uint64_t prune_threshold, const std::vector<bool>& prune_words) :
current_(NULL, NGram::OrderFromSize(position.GetChain().EntrySize())),
prune_threshold_(prune_threshold),
prune_words_(prune_words),
block_(position) {
StartBlock();
}
@ -132,6 +134,15 @@ class CollapseStream {
current_.Mark();
}
if(!prune_words_.empty()) {
for(WordIndex* i = current_.begin(); i != current_.end(); i++) {
if(prune_words_[*i]) {
current_.Mark();
break;
}
}
}
}
current_.NextInMemory();
@ -146,6 +157,15 @@ class CollapseStream {
if(current_.Count() <= prune_threshold_) {
current_.Mark();
}
if(!prune_words_.empty()) {
for(WordIndex* i = current_.begin(); i != current_.end(); i++) {
if(prune_words_[*i]) {
current_.Mark();
break;
}
}
}
return *this;
}
@ -164,6 +184,15 @@ class CollapseStream {
if(current_.Count() <= prune_threshold_) {
current_.Mark();
}
if(!prune_words_.empty()) {
for(WordIndex* i = current_.begin(); i != current_.end(); i++) {
if(prune_words_[*i]) {
current_.Mark();
break;
}
}
}
}
@ -179,6 +208,7 @@ class CollapseStream {
// Goes backwards in the block
uint8_t *copy_from_;
uint64_t prune_threshold_;
const std::vector<bool>& prune_words_;
util::stream::Link block_;
};
@ -192,8 +222,19 @@ void AdjustCounts::Run(const util::stream::ChainPositions &positions) {
if (order == 1) {
// Only unigrams. Just collect stats.
for (NGramStream full(positions[0]); full; ++full)
stats.AddFull(full->Count());
for (NGramStream full(positions[0]); full; ++full) {
// Do not prune <s> </s> <unk>
if(*full->begin() > 2) {
if(full->Count() <= prune_thresholds_[0])
full->Mark();
if(!prune_words_.empty() && prune_words_[*full->begin()])
full->Mark();
}
stats.AddFull(full->UnmarkedCount(), full->IsMarked());
}
stats.CalculateDiscounts(discount_config_);
return;
@ -202,56 +243,67 @@ void AdjustCounts::Run(const util::stream::ChainPositions &positions) {
NGramStreams streams;
streams.Init(positions, positions.size() - 1);
CollapseStream full(positions[positions.size() - 1], prune_thresholds_.back());
CollapseStream full(positions[positions.size() - 1], prune_thresholds_.back(), prune_words_);
// Initialization: <unk> has count 0 and so does <s>.
NGramStream *lower_valid = streams.begin();
const NGramStream *const streams_begin = streams.begin();
streams[0]->Count() = 0;
*streams[0]->begin() = kUNK;
stats.Add(0, 0);
(++streams[0])->Count() = 0;
*streams[0]->begin() = kBOS;
// not in stats because it will get put in later.
// <s> is not in stats yet because it will get put in later.
std::vector<uint64_t> lower_counts(positions.size(), 0);
// This keeps track of actual counts for lower orders. It is not output
// (only adjusted counts are), but used to determine pruning.
std::vector<uint64_t> actual_counts(positions.size(), 0);
// Something of a hack: don't prune <s>.
actual_counts[0] = std::numeric_limits<uint64_t>::max();
// iterate over full (the stream of the highest order ngrams)
for (; full; ++full) {
// Iterate over full (the stream of the highest order ngrams)
for (; full; ++full) {
const WordIndex *different = FindDifference(*full, **lower_valid);
std::size_t same = full->end() - 1 - different;
// Increment the adjusted count.
if (same) ++streams[same - 1]->Count();
// Output all the valid ones that changed.
// STEP 1: Output all the n-grams that changed.
for (; lower_valid >= &streams[same]; --lower_valid) {
// mjd: review this!
uint64_t order = (*lower_valid)->Order();
uint64_t realCount = lower_counts[order - 1];
if(order > 1 && prune_thresholds_[order - 1] && realCount <= prune_thresholds_[order - 1])
uint64_t order_minus_1 = lower_valid - streams_begin;
if(actual_counts[order_minus_1] <= prune_thresholds_[order_minus_1])
(*lower_valid)->Mark();
stats.Add(lower_valid - streams.begin(), (*lower_valid)->UnmarkedCount(), (*lower_valid)->IsMarked());
if(!prune_words_.empty()) {
for(WordIndex* i = (*lower_valid)->begin(); i != (*lower_valid)->end(); i++) {
if(prune_words_[*i]) {
(*lower_valid)->Mark();
break;
}
}
}
stats.Add(order_minus_1, (*lower_valid)->UnmarkedCount(), (*lower_valid)->IsMarked());
++*lower_valid;
}
// Count the true occurrences of lower-order n-grams
for (std::size_t i = 0; i < lower_counts.size(); ++i) {
if (i >= same) {
lower_counts[i] = 0;
}
lower_counts[i] += full->UnmarkedCount();
}
// STEP 2: Update n-grams that still match.
// n-grams that match get count from the full entry.
for (std::size_t i = 0; i < same; ++i) {
actual_counts[i] += full->UnmarkedCount();
}
// Increment the number of unique extensions for the longest match.
if (same) ++streams[same - 1]->Count();
// STEP 3: Initialize new n-grams.
// This is here because bos is also const WordIndex *, so copy gets
// consistent argument types.
const WordIndex *full_end = full->end();
// Initialize and mark as valid up to bos.
const WordIndex *bos;
for (bos = different; (bos > full->begin()) && (*bos != kBOS); --bos) {
++lower_valid;
std::copy(bos, full_end, (*lower_valid)->begin());
(*lower_valid)->Count() = 1;
NGramStream &to = *++lower_valid;
std::copy(bos, full_end, to->begin());
to->Count() = 1;
actual_counts[lower_valid - streams_begin] = full->UnmarkedCount();
}
// Now bos indicates where <s> is or is the 0th word of full.
if (bos != full->begin()) {
@ -259,19 +311,32 @@ void AdjustCounts::Run(const util::stream::ChainPositions &positions) {
NGramStream &to = *++lower_valid;
std::copy(bos, full_end, to->begin());
// mjd: what is this doing?
to->Count() = full->UnmarkedCount();
// Anything that begins with <s> has full non adjusted count.
to->Count() = full->UnmarkedCount();
actual_counts[lower_valid - streams_begin] = full->UnmarkedCount();
} else {
stats.AddFull(full->UnmarkedCount(), full->IsMarked());
stats.AddFull(full->UnmarkedCount(), full->IsMarked());
}
assert(lower_valid >= &streams[0]);
}
// Output everything valid.
// The above loop outputs n-grams when it observes changes. This outputs
// the last n-grams.
for (NGramStream *s = streams.begin(); s <= lower_valid; ++s) {
if((*s)->Count() <= prune_thresholds_[(*s)->Order() - 1])
uint64_t lower_count = actual_counts[(*s)->Order() - 1];
if(lower_count <= prune_thresholds_[(*s)->Order() - 1])
(*s)->Mark();
stats.Add(s - streams.begin(), (*s)->UnmarkedCount(), (*s)->IsMarked());
if(!prune_words_.empty()) {
for(WordIndex* i = (*s)->begin(); i != (*s)->end(); i++) {
if(prune_words_[*i]) {
(*s)->Mark();
break;
}
}
}
stats.Add(s - streams.begin(), lower_count, (*s)->IsMarked());
++*s;
}
// Poison everyone! Except the N-grams which were already poisoned by the input.

@ -46,9 +46,11 @@ class AdjustCounts {
const std::vector<uint64_t> &prune_thresholds,
std::vector<uint64_t> &counts,
std::vector<uint64_t> &counts_pruned,
const std::vector<bool> &prune_words,
const DiscountConfig &discount_config,
std::vector<Discount> &discounts)
: prune_thresholds_(prune_thresholds), counts_(counts), counts_pruned_(counts_pruned), discount_config_(discount_config), discounts_(discounts)
: prune_thresholds_(prune_thresholds), counts_(counts), counts_pruned_(counts_pruned),
prune_words_(prune_words), discount_config_(discount_config), discounts_(discounts)
{}
void Run(const util::stream::ChainPositions &positions);
@ -57,6 +59,7 @@ class AdjustCounts {
const std::vector<uint64_t> &prune_thresholds_;
std::vector<uint64_t> &counts_;
std::vector<uint64_t> &counts_pruned_;
const std::vector<bool> &prune_words_;
DiscountConfig discount_config_;
std::vector<Discount> &discounts_;

@ -174,8 +174,9 @@ std::size_t CorpusCount::VocabUsage(std::size_t vocab_estimate) {
return ngram::GrowableVocab<ngram::WriteUniqueWords>::MemUsage(vocab_estimate);
}
CorpusCount::CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::size_t entries_per_block, WarningAction disallowed_symbol)
CorpusCount::CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::vector<bool> &prune_words, const std::string& prune_vocab_filename, std::size_t entries_per_block, WarningAction disallowed_symbol)
: from_(from), vocab_write_(vocab_write), token_count_(token_count), type_count_(type_count),
prune_words_(prune_words), prune_vocab_filename_(prune_vocab_filename),
dedupe_mem_size_(Dedupe::Size(entries_per_block, kProbingMultiplier)),
dedupe_mem_(util::MallocOrThrow(dedupe_mem_size_)),
disallowed_symbol_action_(disallowed_symbol) {
@ -223,6 +224,31 @@ void CorpusCount::Run(const util::stream::ChainPosition &position) {
} catch (const util::EndOfFileException &e) {}
token_count_ = count;
type_count_ = vocab.Size();
// Create list of unigrams that are supposed to be pruned
if (!prune_vocab_filename_.empty()) {
try {
util::FilePiece prune_vocab_file(prune_vocab_filename_.c_str());
prune_words_.resize(vocab.Size(), true);
try {
while (true) {
StringPiece line(prune_vocab_file.ReadLine());
for (util::TokenIter<util::BoolCharacter, true> w(line, delimiters); w; ++w)
prune_words_[vocab.Index(*w)] = false;
}
} catch (const util::EndOfFileException &e) {}
// Never prune <unk>, <s>, </s>
prune_words_[kUNK] = false;
prune_words_[kBOS] = false;
prune_words_[kEOS] = false;
} catch (const util::Exception &e) {
std::cerr << e.what() << std::endl;
abort();
}
}
}
} // namespace builder
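The block above is the whole of the new vocabulary-limiting behavior: every type defaults to prunable, listed words are whitelisted, and the three sentinels are always kept. A rough standalone sketch of the same policy, with the vocabulary lookup stubbed out as an assumption:

    #include <cstddef>
    #include <fstream>
    #include <string>
    #include <vector>

    typedef unsigned int WordIndex;  // assumption: mirrors lm::WordIndex

    // `lookup` stands in for vocab.Index(); kUNK/kBOS/kEOS are assumed to be
    // 0, 1, 2 here, matching the usual KenLM sentinel order.
    void LoadPruneWords(const std::string &path,
                        WordIndex (*lookup)(const std::string &),
                        std::size_t vocab_size, std::vector<bool> &prune_words) {
      prune_words.assign(vocab_size, true);       // default: prune everything
      std::ifstream in(path.c_str());
      std::string word;
      while (in >> word) prune_words[lookup(word)] = false;   // keep listed words
      prune_words[0] = prune_words[1] = prune_words[2] = false; // <unk>, <s>, </s>
    }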

View File

@ -8,6 +8,7 @@
#include <cstddef>
#include <string>
#include <stdint.h>
#include <vector>
namespace util {
class FilePiece;
@ -29,7 +30,7 @@ class CorpusCount {
// token_count: out.
// type_count aka vocabulary size. Initialize to an estimate. It is set to the exact value.
CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::vector<bool> &prune_words, const std::string& prune_vocab_filename, std::size_t entries_per_block, WarningAction disallowed_symbol);
void Run(const util::stream::ChainPosition &position);
@ -38,6 +39,8 @@ class CorpusCount {
int vocab_write_;
uint64_t &token_count_;
WordIndex &type_count_;
std::vector<bool>& prune_words_;
const std::string& prune_vocab_filename_;
std::size_t dedupe_mem_size_;
util::scoped_malloc dedupe_mem_;

View File

@ -2,16 +2,20 @@
#define LM_BUILDER_HEADER_INFO_H
#include <string>
#include <vector>
#include <stdint.h>
// Some configuration info that is used to add
// comments to the beginning of an ARPA file
struct HeaderInfo {
std::string input_file;
uint64_t token_count;
std::vector<uint64_t> counts_pruned;
HeaderInfo() {}
HeaderInfo(const std::string& input_file_in, uint64_t token_count_in, const std::vector<uint64_t> &counts_pruned_in)
: input_file(input_file_in), token_count(token_count_in), counts_pruned(counts_pruned_in) {}
// TODO: Add smoothing type
// TODO: More info if multiple models were interpolated
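Since counts_pruned is now part of the header, a caller fills it from the post-pruning totals before handing the struct to the output layer. A small usage sketch with placeholder values:

    std::vector<uint64_t> pruned;
    pruned.push_back(500000);   // unigrams surviving pruning (placeholder)
    pruned.push_back(1200000);  // bigrams surviving pruning (placeholder)
    HeaderInfo header("corpus.txt", 10000000 /* tokens */, pruned);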

View File

@ -51,15 +51,13 @@ class PruneNGramStream {
PruneNGramStream &operator++() {
assert(block_);
if (current_.Order() > 1) {
if(currentCount_ > 0) {
if(dest_.Base() < current_.Base()) {
memcpy(dest_.Base(), current_.Base(), current_.TotalSize());
}
dest_.NextInMemory();
if(current_.Order() == 1 && *current_.begin() <= 2)
dest_.NextInMemory();
else if(currentCount_ > 0) {
if(dest_.Base() < current_.Base()) {
memcpy(dest_.Base(), current_.Base(), current_.TotalSize());
}
} else {
        dest_.NextInMemory();
}
current_.NextInMemory();
@ -78,7 +76,7 @@ class PruneNGramStream {
return *this;
}
private:
void StartBlock() {
for (; ; ++block_) {
@ -215,14 +213,33 @@ class MergeRight {
PruneNGramStream grams(primary);
// Without interpolation, the interpolation weight goes to <unk>.
      if (grams->Order() == 1) {
BufferEntry sums(*static_cast<const BufferEntry*>(summed.Get()));
// Special case for <unk>
assert(*grams->begin() == kUNK);
float gamma_assign;
if (interpolate_unigrams_) {
// Default: treat <unk> like a zeroton.
gamma_assign = sums.gamma;
grams->Value().uninterp.prob = 0.0;
} else {
// SRI: give all the interpolation mass to <unk>
gamma_assign = 0.0;
grams->Value().uninterp.prob = sums.gamma;
}
grams->Value().uninterp.gamma = gamma_assign;
++grams;
        // Special case for <s>: probability 1.0. This allows <s> to be
        // explicitly scored as part of the sentence without impacting
        // probability, and computes q correctly as b(<s>).
assert(*grams->begin() == kBOS);
grams->Value().uninterp.prob = 1.0;
grams->Value().uninterp.gamma = 0.0;
while (++grams) {
grams->Value().uninterp.prob = discount_.Apply(grams->Count()) / sums.denominator;
          grams->Value().uninterp.gamma = gamma_assign;
}
++summed;
return;
@ -256,10 +273,11 @@ void InitialProbabilities(
util::stream::Chains &primary,
util::stream::Chains &second_in,
util::stream::Chains &gamma_out,
const std::vector<uint64_t> &prune_thresholds) {
const std::vector<uint64_t> &prune_thresholds,
bool prune_vocab) {
for (size_t i = 0; i < primary.size(); ++i) {
util::stream::ChainConfig gamma_config = config.adder_out;
    if(prune_vocab || prune_thresholds[i] > 0)
gamma_config.entry_size = sizeof(HashBufferEntry);
else
gamma_config.entry_size = sizeof(BufferEntry);
@ -267,12 +285,12 @@ void InitialProbabilities(
util::stream::ChainPosition second(second_in[i].Add());
second_in[i] >> util::stream::kRecycle;
gamma_out.push_back(gamma_config);
    gamma_out[i] >> AddRight(discounts[i], second, prune_vocab || prune_thresholds[i] > 0);
primary[i] >> MergeRight(config.interpolate_unigrams, gamma_out[i].Add(), discounts[i]);
// Don't bother with the OnlyGamma thread for something to discard.
    if (i) gamma_out[i] >> OnlyGamma(prune_vocab || prune_thresholds[i] > 0);
}
}
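The unigram special case above supports two conventions. By default (interpolate_unigrams), <unk> is treated like a zeroton: its raw probability is 0 and the leftover mass gamma is spread across the unigram distribution. With interpolation off, the SRI convention applies: all leftover mass goes directly to <unk> and gamma is 0. A hypothetical helper that isolates just that choice, for illustration only:

    struct UnigramAssignment { float unk_prob; float gamma; };

    UnigramAssignment AssignUnigram(bool interpolate_unigrams, float summed_gamma) {
      UnigramAssignment out;
      if (interpolate_unigrams) {
        out.unk_prob = 0.0f;          // <unk> as zeroton
        out.gamma = summed_gamma;     // mass spread over all unigrams
      } else {
        out.unk_prob = summed_gamma;  // SRI: all mass to <unk>
        out.gamma = 0.0f;
      }
      return out;
    }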

View File

@ -33,7 +33,8 @@ void InitialProbabilities(
util::stream::Chains &primary,
util::stream::Chains &second_in,
util::stream::Chains &gamma_out,
    const std::vector<uint64_t> &prune_thresholds,
    bool prune_vocab);
} // namespace builder
} // namespace lm

View File

@ -65,9 +65,10 @@ class OutputProbBackoff {
template <class Output> class Callback {
public:
    Callback(float uniform_prob, const util::stream::ChainPositions &backoffs, const std::vector<uint64_t> &prune_thresholds, bool prune_vocab)
: backoffs_(backoffs.size()), probs_(backoffs.size() + 2),
prune_thresholds_(prune_thresholds),
prune_vocab_(prune_vocab),
output_(backoffs.size() + 1 /* order */) {
probs_[0] = uniform_prob;
for (std::size_t i = 0; i < backoffs.size(); ++i) {
@ -77,7 +78,7 @@ template <class Output> class Callback {
~Callback() {
for (std::size_t i = 0; i < backoffs_.size(); ++i) {
        if(prune_vocab_ || prune_thresholds_[i + 1] > 0)
while(backoffs_[i])
++backoffs_[i];
@ -94,8 +95,8 @@ template <class Output> class Callback {
probs_[order_minus_1 + 1] = pay.complete.prob;
float out_backoff;
      if (order_minus_1 < backoffs_.size() && *(gram.end() - 1) != kUNK && *(gram.end() - 1) != kEOS && backoffs_[order_minus_1]) {
        if(prune_vocab_ || prune_thresholds_[order_minus_1 + 1] > 0) {
//Compute hash value for current context
uint64_t current_hash = util::MurmurHashNative(gram.begin(), gram.Order() * sizeof(WordIndex));
@ -129,15 +130,17 @@ template <class Output> class Callback {
std::vector<float> probs_;
const std::vector<uint64_t>& prune_thresholds_;
bool prune_vocab_;
Output output_;
};
} // namespace
Interpolate::Interpolate(uint64_t vocab_size, const util::stream::ChainPositions &backoffs, const std::vector<uint64_t>& prune_thresholds, bool prune_vocab, bool output_q)
: uniform_prob_(1.0 / static_cast<float>(vocab_size)), // Includes <unk> but excludes <s>.
backoffs_(backoffs),
prune_thresholds_(prune_thresholds),
prune_vocab_(prune_vocab),
output_q_(output_q) {}
// perform order-wise interpolation
@ -145,11 +148,11 @@ void Interpolate::Run(const util::stream::ChainPositions &positions) {
assert(positions.size() == backoffs_.size() + 1);
if (output_q_) {
typedef Callback<OutputQ> C;
    C callback(uniform_prob_, backoffs_, prune_thresholds_, prune_vocab_);
JointOrder<C, SuffixOrder>(positions, callback);
} else {
typedef Callback<OutputProbBackoff> C;
    C callback(uniform_prob_, backoffs_, prune_thresholds_, prune_vocab_);
JointOrder<C, SuffixOrder>(positions, callback);
}
}

View File

@ -18,7 +18,7 @@ class Interpolate {
public:
// Normally vocab_size is the unigram count-1 (since p(<s>) = 0) but might
// be larger when the user specifies a consistent vocabulary size.
    explicit Interpolate(uint64_t vocab_size, const util::stream::ChainPositions &backoffs, const std::vector<uint64_t> &prune_thresholds, bool prune_vocab, bool output_q_);
void Run(const util::stream::ChainPositions &positions);
@ -26,6 +26,7 @@ class Interpolate {
float uniform_prob_;
util::stream::ChainPositions backoffs_;
const std::vector<uint64_t> prune_thresholds_;
bool prune_vocab_;
bool output_q_;
};

View File

@ -4,6 +4,11 @@
#include "lm/builder/ngram_stream.hh"
#include "lm/lm_exception.hh"
#ifdef DEBUG
#include "util/fixed_array.hh"
#include <iostream>
#endif
#include <string.h>
namespace lm { namespace builder {
@ -17,21 +22,40 @@ template <class Callback, class Compare> void JointOrder(const util::stream::Cha
unsigned int order;
for (order = 0; order < positions.size() && streams[order]; ++order) {}
assert(order); // should always have <unk>.
// Debugging only: call comparison function to sanity check order.
#ifdef DEBUG
util::FixedArray<Compare> less_compare(order);
for (unsigned i = 0; i < order; ++i)
less_compare.push_back(i + 1);
#endif // DEBUG
unsigned int current = 0;
while (true) {
    // Does the context match the lower one?
if (!memcmp(streams[static_cast<int>(current) - 1]->begin(), streams[current]->begin() + Compare::kMatchOffset, sizeof(WordIndex) * current)) {
callback.Enter(current, *streams[current]);
// Transition to looking for extensions.
if (++current < order) continue;
}
#ifdef DEBUG
// match_check[current - 1] matches current-grams
// The lower-order stream (which skips fewer current-grams) should always be <= the higher order-stream (which can skip current-grams).
else if (!less_compare[current - 1](streams[static_cast<int>(current) - 1]->begin(), streams[current]->begin() + Compare::kMatchOffset)) {
std::cerr << "Stream out of order detected" << std::endl;
abort();
}
#endif // DEBUG
// No extension left.
while(true) {
assert(current > 0);
--current;
callback.Exit(current, *streams[current]);
if (++streams[current]) break;
UTIL_THROW_IF(order != current + 1, FormatLoadException, "Detected n-gram without matching suffix");
order = current;
if (!order) return;
}

View File

@ -1,4 +1,6 @@
#include "lm/builder/output.hh"
#include "lm/builder/pipeline.hh"
#include "lm/builder/print.hh"
#include "lm/lm_exception.hh"
#include "util/file.hh"
#include "util/file_piece.hh"
@ -51,8 +53,7 @@ std::vector<uint64_t> ParsePruning(const std::vector<std::string> &param, std::s
// throw if more pruning thresholds were specified than the model has orders
UTIL_THROW_IF(prune_thresholds.size() > order, util::Exception, "You specified pruning thresholds for orders 1 through " << prune_thresholds.size() << " but the model only has order " << order);
// threshold for unigram can only be 0 (no pruning)
UTIL_THROW_IF(prune_thresholds[0] != 0, util::Exception, "Unigram pruning is not implemented, so the first pruning threshold must be 0.");
// check that the thresholds are non-decreasing
uint64_t lower_threshold = 0;
for (std::vector<uint64_t>::iterator it = prune_thresholds.begin(); it != prune_thresholds.end(); ++it) {
@ -93,6 +94,7 @@ int main(int argc, char *argv[]) {
discount_fallback_default.push_back("0.5");
discount_fallback_default.push_back("1");
discount_fallback_default.push_back("1.5");
bool verbose_header;
options.add_options()
("help,h", po::bool_switch(), "Show this help message")
@ -111,11 +113,12 @@ int main(int argc, char *argv[]) {
("vocab_estimate", po::value<lm::WordIndex>(&pipeline.vocab_estimate)->default_value(1000000), "Assume this vocabulary size for purposes of calculating memory in step 1 (corpus count) and pre-sizing the hash table")
("vocab_file", po::value<std::string>(&pipeline.vocab_file)->default_value(""), "Location to write a file containing the unique vocabulary strings delimited by null bytes")
("vocab_pad", po::value<uint64_t>(&pipeline.vocab_size_for_unk)->default_value(0), "If the vocabulary is smaller than this value, pad with <unk> to reach this size. Requires --interpolate_unigrams")
("verbose_header", po::bool_switch(&pipeline.verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc.")
("verbose_header", po::bool_switch(&verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc.")
("text", po::value<std::string>(&text), "Read text from a file instead of stdin")
("arpa", po::value<std::string>(&arpa), "Write ARPA to a file instead of stdout")
("collapse_values", po::bool_switch(&pipeline.output_q), "Collapse probability and backoff into a single value, q that yields the same sentence-level probabilities. See http://kheafield.com/professional/edinburgh/rest_paper.pdf for more details, including a proof.")
("prune", po::value<std::vector<std::string> >(&pruning)->multitoken(), "Prune n-grams with count less than or equal to the given threshold. Specify one value for each order i.e. 0 0 1 to prune singleton trigrams and above. The sequence of values must be non-decreasing and the last value applies to any remaining orders. Unigram pruning is not implemented, so the first value must be zero. Default is to not prune, which is equivalent to --prune 0.")
("prune", po::value<std::vector<std::string> >(&pruning)->multitoken(), "Prune n-grams with count less than or equal to the given threshold. Specify one value for each order i.e. 0 0 1 to prune singleton trigrams and above. The sequence of values must be non-decreasing and the last value applies to any remaining orders. Default is to not prune, which is equivalent to --prune 0.")
("limit_vocab_file", po::value<std::string>(&pipeline.prune_vocab_file)->default_value(""), "Read allowed vocabulary separated by whitespace. N-grams that contain vocabulary items not in this list will be pruned. Can be combined with --prune arg")
("discount_fallback", po::value<std::vector<std::string> >(&discount_fallback)->multitoken()->implicit_value(discount_fallback_default, "0.5 1 1.5"), "The closed-form estimate for Kneser-Ney discounts does not work without singletons or doubletons. It can also fail if these values are out of range. This option falls back to user-specified discounts when the closed-form estimate fails. Note that this option is generally a bad idea: you should deduplicate your corpus instead. However, class-based models need custom discounts because they lack singleton unigrams. Provide up to three discounts (for adjusted counts 1, 2, and 3+), which will be applied to all orders where the closed-form estimates fail.");
po::variables_map vm;
po::store(po::parse_command_line(argc, argv, options), vm);
@ -181,6 +184,13 @@ int main(int argc, char *argv[]) {
// parse pruning thresholds. These depend on order, so it is not done as a notifier.
pipeline.prune_thresholds = ParsePruning(pruning, pipeline.order);
if (!vm["limit_vocab_file"].as<std::string>().empty()) {
pipeline.prune_vocab = true;
}
else {
pipeline.prune_vocab = false;
}
util::NormalizeTempPrefix(pipeline.sort.temp_prefix);
@ -202,7 +212,9 @@ int main(int argc, char *argv[]) {
// Read from stdin
try {
lm::builder::Output output;
output.Add(new lm::builder::PrintARPA(out.release(), verbose_header));
lm::builder::Pipeline(pipeline, in.release(), output);
} catch (const util::MallocException &e) {
std::cerr << e.what() << std::endl;
std::cerr << "Try rerunning with a more conservative -S setting than " << vm["memory"].as<std::string>() << std::endl;

14
lm/builder/output.cc Normal file
View File

@ -0,0 +1,14 @@
#include "lm/builder/output.hh"
#include "util/stream/multi_stream.hh"
#include <boost/ref.hpp>
namespace lm { namespace builder {
OutputHook::~OutputHook() {}
void OutputHook::Apply(util::stream::Chains &chains) {
chains >> boost::ref(*this);
}
}} // namespaces

89
lm/builder/output.hh Normal file
View File

@ -0,0 +1,89 @@
#ifndef LM_BUILDER_OUTPUT_H
#define LM_BUILDER_OUTPUT_H
#include "lm/builder/header_info.hh"
#include "util/file.hh"
#include <boost/ptr_container/ptr_vector.hpp>
#include <boost/utility.hpp>
#include <map>
namespace util { namespace stream { class Chains; class ChainPositions; } }
/* Outputs from lmplz: ARPA, sharded files, etc. */
namespace lm { namespace builder {
// These are different types of hooks. Values should be consecutive to enable a vector lookup.
enum HookType {
COUNT_HOOK, // Raw N-gram counts, highest order only.
PROB_PARALLEL_HOOK, // Probability and backoff (or just q). Output must process the orders in parallel or there will be a deadlock.
PROB_SEQUENTIAL_HOOK, // Probability and backoff (or just q). Output can process orders any way it likes. This requires writing the data to disk then reading. Useful for ARPA files, which put unigrams first etc.
NUMBER_OF_HOOKS // Keep this last so we know how many values there are.
};
class Output;
class OutputHook {
public:
explicit OutputHook(HookType hook_type) : type_(hook_type), master_(NULL) {}
virtual ~OutputHook();
virtual void Apply(util::stream::Chains &chains);
virtual void Run(const util::stream::ChainPositions &positions) = 0;
protected:
const HeaderInfo &GetHeader() const;
int GetVocabFD() const;
private:
friend class Output;
const HookType type_;
const Output *master_;
};
class Output : boost::noncopyable {
public:
Output() {}
// Takes ownership.
void Add(OutputHook *hook) {
hook->master_ = this;
outputs_[hook->type_].push_back(hook);
}
bool Have(HookType hook_type) const {
return !outputs_[hook_type].empty();
}
void SetVocabFD(int to) { vocab_fd_ = to; }
int GetVocabFD() const { return vocab_fd_; }
void SetHeader(const HeaderInfo &header) { header_ = header; }
const HeaderInfo &GetHeader() const { return header_; }
void Apply(HookType hook_type, util::stream::Chains &chains) {
for (boost::ptr_vector<OutputHook>::iterator entry = outputs_[hook_type].begin(); entry != outputs_[hook_type].end(); ++entry) {
entry->Apply(chains);
}
}
private:
boost::ptr_vector<OutputHook> outputs_[NUMBER_OF_HOOKS];
int vocab_fd_;
HeaderInfo header_;
};
inline const HeaderInfo &OutputHook::GetHeader() const {
return master_->GetHeader();
}
inline int OutputHook::GetVocabFD() const {
return master_->GetVocabFD();
}
}} // namespaces
#endif // LM_BUILDER_OUTPUT_H
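The Output/OutputHook pair above is the new extension point: a hook registers for a HookType, gets applied to the chains, and can read the header and vocab fd from its owning Output. A hedged sketch of a custom hook follows; the class is invented for illustration, and the stream-draining idiom mirrors Print::Run shown later in this commit.

    #include "lm/builder/output.hh"
    #include "lm/builder/ngram_stream.hh"  // assumption: NGramStream(s) live here
    #include <iostream>

    class HeaderEcho : public lm::builder::OutputHook {
     public:
      HeaderEcho() : lm::builder::OutputHook(lm::builder::PROB_SEQUENTIAL_HOOK) {}
      void Run(const util::stream::ChainPositions &positions) {
        // GetHeader() and GetVocabFD() come from the owning Output.
        std::cerr << "Input: " << GetHeader().input_file
                  << " tokens: " << GetHeader().token_count << '\n';
        // Drain the streams so the pipeline can finish.
        lm::builder::NGramStreams streams(positions);
        for (lm::builder::NGramStream *s = streams.begin(); s != streams.end(); ++s)
          while (*s) ++*s;
      }
    };

    // Usage: output.Add(new HeaderEcho());  // Output takes ownership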

View File

@ -5,7 +5,7 @@
#include "lm/builder/hash_gamma.hh"
#include "lm/builder/initial_probabilities.hh"
#include "lm/builder/interpolate.hh"
#include "lm/builder/print.hh"
#include "lm/builder/output.hh"
#include "lm/builder/sort.hh"
#include "lm/sizes.hh"
@ -16,6 +16,7 @@
#include <algorithm>
#include <iostream>
#include <fstream>
#include <vector>
namespace lm { namespace builder {
@ -36,7 +37,7 @@ void PrintStatistics(const std::vector<uint64_t> &counts, const std::vector<uint
class Master {
public:
explicit Master(PipelineConfig &config)
: config_(config), chains_(config.order), files_(config.order) {
config_.minimum_block = std::max(NGram::TotalSize(config_.order), config_.minimum_block);
}
@ -200,14 +201,14 @@ class Master {
std::cerr << std::endl;
}
PipelineConfig &config_;
util::stream::Chains chains_;
// Often only unigrams, but sometimes all orders.
util::FixedArray<util::stream::FileBuffer> files_;
};
void CountText(int text_file /* input */, int vocab_file /* output */, Master &master, uint64_t &token_count, std::string &text_file_name, std::vector<bool> &prune_words) {
const PipelineConfig &config = master.Config();
std::cerr << "=== 1/5 Counting and sorting n-grams ===" << std::endl;
@ -225,7 +226,7 @@ void CountText(int text_file /* input */, int vocab_file /* output */, Master &m
WordIndex type_count = config.vocab_estimate;
util::FilePiece text(text_file, NULL, &std::cerr);
text_file_name = text.FileName();
  CorpusCount counter(text, vocab_file, token_count, type_count, prune_words, config.prune_vocab_file, chain.BlockSize() / chain.EntrySize(), config.disallowed_symbol_action);
chain >> boost::ref(counter);
util::stream::Sort<SuffixOrder, AddCombiner> sorter(chain, config.sort, SuffixOrder(config.order), AddCombiner());
@ -236,7 +237,7 @@ void CountText(int text_file /* input */, int vocab_file /* output */, Master &m
}
void InitialProbabilities(const std::vector<uint64_t> &counts, const std::vector<uint64_t> &counts_pruned, const std::vector<Discount> &discounts, Master &master, Sorts<SuffixOrder> &primary,
                          util::FixedArray<util::stream::FileBuffer> &gammas, const std::vector<uint64_t> &prune_thresholds, bool prune_vocab) {
const PipelineConfig &config = master.Config();
util::stream::Chains second(config.order);
@ -250,7 +251,7 @@ void InitialProbabilities(const std::vector<uint64_t> &counts, const std::vector
}
util::stream::Chains gamma_chains(config.order);
  InitialProbabilities(config.initial_probs, discounts, master.MutableChains(), second, gamma_chains, prune_thresholds, prune_vocab);
// Don't care about gamma for 0.
gamma_chains[0] >> util::stream::kRecycle;
gammas.Init(config.order - 1);
@ -271,8 +272,7 @@ void InterpolateProbabilities(const std::vector<uint64_t> &counts, Master &maste
for (std::size_t i = 0; i < config.order - 1; ++i) {
util::stream::ChainConfig read_backoffs(config.read_backoffs);
// Add 1 because here we are skipping unigrams
    if(config.prune_vocab || config.prune_thresholds[i + 1] > 0)
read_backoffs.entry_size = sizeof(HashGamma);
else
read_backoffs.entry_size = sizeof(float);
@ -280,14 +280,14 @@ void InterpolateProbabilities(const std::vector<uint64_t> &counts, Master &maste
gamma_chains.push_back(read_backoffs);
gamma_chains.back() >> gammas[i].Source();
}
  master >> Interpolate(std::max(master.Config().vocab_size_for_unk, counts[0] - 1 /* <s> is not included */), util::stream::ChainPositions(gamma_chains), config.prune_thresholds, config.prune_vocab, config.output_q);
gamma_chains >> util::stream::kRecycle;
master.BufferFinal(counts);
}
} // namespace
void Pipeline(PipelineConfig &config, int text_file, Output &output) {
// Some fail-fast sanity checks.
if (config.sort.buffer_size * 4 > config.TotalMemory()) {
config.sort.buffer_size = config.TotalMemory() / 4;
@ -310,27 +310,30 @@ void Pipeline(PipelineConfig config, int text_file, int out_arpa) {
util::scoped_fd vocab_file(config.vocab_file.empty() ?
util::MakeTemp(config.TempPrefix()) :
util::CreateOrThrow(config.vocab_file.c_str()));
output.SetVocabFD(vocab_file.get());
uint64_t token_count;
std::string text_file_name;
std::vector<bool> prune_words;
CountText(text_file, vocab_file.get(), master, token_count, text_file_name, prune_words);
std::vector<uint64_t> counts;
std::vector<uint64_t> counts_pruned;
std::vector<Discount> discounts;
    master >> AdjustCounts(config.prune_thresholds, counts, counts_pruned, prune_words, config.discount, discounts);
{
util::FixedArray<util::stream::FileBuffer> gammas;
Sorts<SuffixOrder> primary;
      InitialProbabilities(counts, counts_pruned, discounts, master, primary, gammas, config.prune_thresholds, config.prune_vocab);
InterpolateProbabilities(counts_pruned, master, primary, gammas);
}
std::cerr << "=== 5/5 Writing ARPA model ===" << std::endl;
VocabReconstitute vocab(vocab_file.get());
UTIL_THROW_IF(vocab.Size() != counts[0], util::Exception, "Vocab words don't match up. Is there a null byte in the input?");
output.SetHeader(HeaderInfo(text_file_name, token_count, counts_pruned));
output.Apply(PROB_SEQUENTIAL_HOOK, master.MutableChains());
master >> util::stream::kRecycle;
master.MutableChains().Wait(true);
} catch (const util::Exception &e) {
std::cerr << e.what() << std::endl;

View File

@ -14,6 +14,8 @@
namespace lm { namespace builder {
class Output;
struct PipelineConfig {
std::size_t order;
std::string vocab_file;
@ -21,9 +23,6 @@ struct PipelineConfig {
InitialProbabilitiesConfig initial_probs;
util::stream::ChainConfig read_backoffs;
// Include a header in the ARPA with some statistics?
bool verbose_header;
// Estimated vocabulary size. Used for sizing CorpusCount memory and
// initial probing hash table sizing, also in CorpusCount.
lm::WordIndex vocab_estimate;
@ -37,6 +36,8 @@ struct PipelineConfig {
// n-gram count thresholds for pruning. 0 values means no pruning for
// corresponding n-gram order
std::vector<uint64_t> prune_thresholds; //mjd
bool prune_vocab;
std::string prune_vocab_file;
// What to do with discount failures.
DiscountConfig discount;
@ -67,7 +68,7 @@ struct PipelineConfig {
};
// Takes ownership of text_file.
void Pipeline(PipelineConfig &config, int text_file, Output &output);
}} // namespaces
#endif // LM_BUILDER_PIPELINE_H
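Putting the new signature together with the Output class, a minimal caller looks like the lmplz wiring shown earlier in this commit; out_fd and text_fd are assumed to be open file descriptors, and the config fields are placeholders.

    lm::builder::PipelineConfig config;
    // ... populate config.order, config.sort, config.prune_thresholds,
    // config.prune_vocab, etc. as lmplz does ...
    lm::builder::Output output;
    output.Add(new lm::builder::PrintARPA(out_fd, true /* verbose header */));
    lm::builder::Pipeline(config, text_fd, output);  // takes ownership of text_fd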

View File

@ -24,35 +24,34 @@ VocabReconstitute::VocabReconstitute(int fd) {
map_.push_back(i);
}
void PrintARPA::Run(const util::stream::ChainPositions &positions) {
  UTIL_TIMER("(%w s) Wrote ARPA file\n");
VocabReconstitute vocab(GetVocabFD());
// Write header. TODO: integers in FakeOFStream.
{
std::stringstream stream;
if (verbose_header_) {
stream << "# Input file: " << GetHeader().input_file << '\n';
stream << "# Token count: " << GetHeader().token_count << '\n';
stream << "# Smoothing: Modified Kneser-Ney" << '\n';
}
stream << "\\data\\\n";
for (size_t i = 0; i < positions.size(); ++i) {
stream << "ngram " << (i+1) << '=' << GetHeader().counts_pruned[i] << '\n';
}
stream << '\n';
std::string as_string(stream.str());
util::WriteOrThrow(out_fd_.get(), as_string.data(), as_string.size());
}
util::FakeOFStream out(out_fd_.get());
for (unsigned order = 1; order <= positions.size(); ++order) {
out << "\\" << order << "-grams:" << '\n';
for (NGramStream stream(positions[order - 1]); stream; ++stream) {
// Correcting for numerical precision issues. Take that IRST.
      out << stream->Value().complete.prob << '\t' << vocab.Lookup(*stream->begin());
for (const WordIndex *i = stream->begin() + 1; i != stream->end(); ++i) {
        out << ' ' << vocab.Lookup(*i);
}
if (order != positions.size())
out << '\t' << stream->Value().complete.backoff;

View File

@ -3,7 +3,8 @@
#include "lm/builder/ngram.hh"
#include "lm/builder/ngram_stream.hh"
#include "lm/builder/header_info.hh"
#include "lm/builder/output.hh"
#include "util/fake_ofstream.hh"
#include "util/file.hh"
#include "util/mmap.hh"
#include "util/string_piece.hh"
@ -43,60 +44,71 @@ class VocabReconstitute {
};
// Not defined, only specialized.
template <class T> void PrintPayload(util::FakeOFStream &to, const Payload &payload);
template <> inline void PrintPayload<uint64_t>(util::FakeOFStream &to, const Payload &payload) {
  // TODO slow
  to << boost::lexical_cast<std::string>(payload.count);
}
template <> inline void PrintPayload<Uninterpolated>(util::FakeOFStream &to, const Payload &payload) {
to << log10(payload.uninterp.prob) << ' ' << log10(payload.uninterp.gamma);
}
template <> inline void PrintPayload<ProbBackoff>(util::FakeOFStream &to, const Payload &payload) {
to << payload.complete.prob << ' ' << payload.complete.backoff;
}
// template parameter is the type stored.
template <class V> class Print {
public:
static void DumpSeparateFiles(const VocabReconstitute &vocab, const std::string &file_base, util::stream::Chains &chains) {
for (unsigned int i = 0; i < chains.size(); ++i) {
std::string file(file_base + boost::lexical_cast<std::string>(i));
chains[i] >> Print(vocab, util::CreateOrThrow(file.c_str()));
}
}
explicit Print(const VocabReconstitute &vocab, int fd) : vocab_(vocab), to_(fd) {}
void Run(const util::stream::ChainPositions &chains) {
util::scoped_fd fd(to_);
util::FakeOFStream out(to_);
NGramStreams streams(chains);
for (NGramStream *s = streams.begin(); s != streams.end(); ++s) {
      DumpStream(*s, out);
}
}
void Run(const util::stream::ChainPosition &position) {
util::scoped_fd fd(to_);
util::FakeOFStream out(to_);
NGramStream stream(position);
    DumpStream(stream, out);
}
private:
  void DumpStream(NGramStream &stream, util::FakeOFStream &to) {
for (; stream; ++stream) {
      PrintPayload<V>(to, stream->Value());
for (const WordIndex *w = stream->begin(); w != stream->end(); ++w) {
        to << ' ' << vocab_.Lookup(*w) << '=' << *w;
}
      to << '\n';
}
}
const VocabReconstitute &vocab_;
  int to_;
};
class PrintARPA : public OutputHook {
public:
  // Takes ownership of out_fd upon Run().
explicit PrintARPA(int fd, bool verbose_header)
: OutputHook(PROB_SEQUENTIAL_HOOK), out_fd_(fd), verbose_header_(verbose_header) {}
void Run(const util::stream::ChainPositions &positions);
private:
util::scoped_fd out_fd_;
bool verbose_header_;
};
}} // namespaces

View File

@ -15,7 +15,7 @@ Config::Config() :
unknown_missing_logprob(-100.0),
probing_multiplier(1.5),
building_memory(1073741824ULL), // 1 GB
  temporary_directory_prefix(""),
arpa_complain(ALL),
write_mmap(NULL),
write_method(WRITE_AFTER),

View File

@ -66,9 +66,9 @@ struct Config {
// Template for temporary directory appropriate for passing to mkdtemp.
// The characters XXXXXX are appended before passing to mkdtemp. Only
// applies to trie. If empty, defaults to write_mmap. If that's NULL,
// defaults to input file name.
std::string temporary_directory_prefix;
// Level of complaining to do when loading from ARPA instead of binary format.
enum ARPALoadComplain {ALL, EXPENSIVE, NONE};

View File

@ -1,13 +1,13 @@
#ifndef LM_MAX_ORDER_H
#define LM_MAX_ORDER_H
/* IF YOUR BUILD SYSTEM PASSES -DKENLM_MAX_ORDER, THEN CHANGE THE BUILD SYSTEM.
* If not, this is the default maximum order.
* Having this limit means that State can be
* (kMaxOrder - 1) * sizeof(float) bytes instead of
* sizeof(float*) + (kMaxOrder - 1) * sizeof(float) + malloc overhead
*/
#ifndef KENLM_ORDER_MESSAGE
#define KENLM_ORDER_MESSAGE "If your build system supports changing KENLM_MAX_ORDER, change it there and recompile. In the KenLM tarball or Moses, use e.g. `bjam --max-kenlm-order=6 -a'. Otherwise, edit lm/max_order.hh."
#endif
#endif // LM_MAX_ORDER_H

View File

@ -577,7 +577,7 @@ template <class Quant, class Bhiksha> uint8_t *TrieSearch<Quant, Bhiksha>::Setup
template <class Quant, class Bhiksha> void TrieSearch<Quant, Bhiksha>::InitializeFromARPA(const char *file, util::FilePiece &f, std::vector<uint64_t> &counts, const Config &config, SortedVocabulary &vocab, BinaryFormat &backing) {
std::string temporary_prefix;
  if (!config.temporary_directory_prefix.empty()) {
temporary_prefix = config.temporary_directory_prefix;
} else if (config.write_mmap) {
temporary_prefix = config.write_mmap;

View File

@ -263,13 +263,13 @@ void Data::createShards(size_t shard_count, float shard_size, const string& scor
{
UTIL_THROW_IF(shard_count == 0, util::Exception, "Must have at least 1 shard");
UTIL_THROW_IF(shard_size < 0 || shard_size > 1,
                util::Exception,
                "Shard size must be between 0 and 1, inclusive. Currently " << shard_size);
size_t data_size = m_score_data->size();
UTIL_THROW_IF(data_size != m_feature_data->size(),
                util::Exception,
                "Error");
shard_size *= data_size;
const float coeff = static_cast<float>(data_size) / shard_count;

View File

@ -61,7 +61,8 @@ void SparseVector::set(const string& name, FeatureStatsType value)
m_fvector[id] = value;
}
void SparseVector::set(size_t id, FeatureStatsType value)
{
assert(m_id_to_name.size() > id);
m_fvector[id] = value;
}
@ -204,7 +205,7 @@ FeatureStats::FeatureStats(const size_t size)
FeatureStats::~FeatureStats()
{
  delete [] m_array;
}
void FeatureStats::Copy(const FeatureStats &stats)

View File

@ -31,9 +31,11 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
using namespace std;
namespace MosesTuning
{
std::ostream& operator<<(std::ostream& out, const WordVec& wordVec)
{
out << "[";
for (size_t i = 0; i < wordVec.size(); ++i) {
out << wordVec[i]->first;
@ -44,7 +46,8 @@ std::ostream& operator<<(std::ostream& out, const WordVec& wordVec) {
}
void ReferenceSet::Load(const vector<string>& files, Vocab& vocab)
{
for (size_t i = 0; i < files.size(); ++i) {
util::FilePiece fh(files[i].c_str());
size_t sentenceId = 0;
@ -55,14 +58,15 @@ void ReferenceSet::Load(const vector<string>& files, Vocab& vocab) {
} catch (util::EndOfFileException &e) {
break;
}
      AddLine(sentenceId, line, vocab);
      ++sentenceId;
}
}
}
void ReferenceSet::AddLine(size_t sentenceId, const StringPiece& line, Vocab& vocab)
{
//cerr << line << endl;
NgramCounter ngramCounts;
list<WordVec> openNgrams;
@ -74,14 +78,14 @@ void ReferenceSet::AddLine(size_t sentenceId, const StringPiece& line, Vocab& vo
openNgrams.push_front(WordVec());
for (list<WordVec>::iterator k = openNgrams.begin(); k != openNgrams.end(); ++k) {
k->push_back(nextTok);
      ++ngramCounts[*k];
}
if (openNgrams.size() >= kBleuNgramOrder) openNgrams.pop_back();
}
//merge into overall ngram map
for (NgramCounter::const_iterator ni = ngramCounts.begin();
       ni != ngramCounts.end(); ++ni) {
size_t count = ni->second;
//cerr << *ni << " " << count << endl;
if (ngramCounts_.size() <= sentenceId) ngramCounts_.resize(sentenceId+1);
@ -104,8 +108,9 @@ void ReferenceSet::AddLine(size_t sentenceId, const StringPiece& line, Vocab& vo
//cerr << endl;
}
size_t ReferenceSet::NgramMatches(size_t sentenceId, const WordVec& ngram, bool clip) const
{
const NgramMap& ngramCounts = ngramCounts_.at(sentenceId);
NgramMap::const_iterator ngi = ngramCounts.find(ngram);
if (ngi == ngramCounts.end()) return 0;
@ -114,7 +119,8 @@ size_t ReferenceSet::NgramMatches(size_t sentenceId, const WordVec& ngram, bool
VertexState::VertexState(): bleuStats(kBleuNgramOrder), targetLength(0) {}
void HgBleuScorer::UpdateMatches(const NgramCounter& counts, vector<FeatureStatsType>& bleuStats ) const
{
for (NgramCounter::const_iterator ngi = counts.begin(); ngi != counts.end(); ++ngi) {
//cerr << "Checking: " << *ngi << " matches " << references_.NgramMatches(sentenceId_,*ngi,false) << endl;
size_t order = ngi->first.size();
@ -124,7 +130,8 @@ void HgBleuScorer::UpdateMatches(const NgramCounter& counts, vector<FeatureStats
}
}
size_t HgBleuScorer::GetTargetLength(const Edge& edge) const
{
size_t targetLength = 0;
for (size_t i = 0; i < edge.Words().size(); ++i) {
const Vocab::Entry* word = edge.Words()[i];
@ -137,7 +144,8 @@ size_t HgBleuScorer::GetTargetLength(const Edge& edge) const {
return targetLength;
}
FeatureStatsType HgBleuScorer::Score(const Edge& edge, const Vertex& head, vector<FeatureStatsType>& bleuStats)
{
NgramCounter ngramCounts;
size_t childId = 0;
size_t wordId = 0;
@ -147,7 +155,7 @@ FeatureStatsType HgBleuScorer::Score(const Edge& edge, const Vertex& head, vecto
bool inRightContext = false;
list<WordVec> openNgrams;
const Vocab::Entry* currentWord = NULL;
  while (wordId < edge.Words().size()) {
currentWord = edge.Words()[wordId];
if (currentWord != NULL) {
++wordId;
@ -214,7 +222,7 @@ FeatureStatsType HgBleuScorer::Score(const Edge& edge, const Vertex& head, vecto
}
if (openNgrams.size() >= kBleuNgramOrder) openNgrams.pop_back();
}
//Collect matches
//This edge
//cerr << "edge ngrams" << endl;
@ -227,26 +235,27 @@ FeatureStatsType HgBleuScorer::Score(const Edge& edge, const Vertex& head, vecto
bleuStats[j] += vertexStates_[edge.Children()[i]].bleuStats[j];
}
}
FeatureStatsType sourceLength = head.SourceCovered();
size_t referenceLength = references_.Length(sentenceId_);
  FeatureStatsType effectiveReferenceLength =
sourceLength / totalSourceLength_ * referenceLength;
bleuStats[bleuStats.size()-1] = effectiveReferenceLength;
  //backgroundBleu_[backgroundBleu_.size()-1] =
// backgroundRefLength_ * sourceLength / totalSourceLength_;
FeatureStatsType bleu = sentenceLevelBackgroundBleu(bleuStats, backgroundBleu_);
return bleu;
}
void HgBleuScorer::UpdateState(const Edge& winnerEdge, size_t vertexId, const vector<FeatureStatsType>& bleuStats)
{
//TODO: Maybe more efficient to absorb into the Score() method
VertexState& vertexState = vertexStates_[vertexId];
//cerr << "Updating state for " << vertexId << endl;
//leftContext
int wi = 0;
const VertexState* childState = NULL;
@ -263,9 +272,9 @@ void HgBleuScorer::UpdateState(const Edge& winnerEdge, size_t vertexId, const ve
//start of child state
childState = &(vertexStates_[winnerEdge.Children()[childi++]]);
contexti = 0;
}
}
if ((size_t)contexti < childState->leftContext.size()) {
      vertexState.leftContext.push_back(childState->leftContext[contexti++]);
} else {
//end of child context
childState = NULL;
@ -314,7 +323,8 @@ typedef pair<const Edge*,FeatureStatsType> BackPointer;
* Recurse through back pointers
**/
static void GetBestHypothesis(size_t vertexId, const Graph& graph, const vector<BackPointer>& bps,
                              HgHypothesis* bestHypo)
{
//cerr << "Expanding " << vertexId << " Score: " << bps[vertexId].second << endl;
//UTIL_THROW_IF(bps[vertexId].second == kMinScore+1, HypergraphException, "Landed at vertex " << vertexId << " which is a dead end");
if (!bps[vertexId].first) return;
@ -334,14 +344,14 @@ static void GetBestHypothesis(size_t vertexId, const Graph& graph, const vector<
}
}
void Viterbi(const Graph& graph, const SparseVector& weights, float bleuWeight, const ReferenceSet& references , size_t sentenceId, const std::vector<FeatureStatsType>& backgroundBleu, HgHypothesis* bestHypo)
{
BackPointer init(NULL,kMinScore);
vector<BackPointer> backPointers(graph.VertexSize(),init);
HgBleuScorer bleuScorer(references, graph, sentenceId, backgroundBleu);
vector<FeatureStatsType> winnerStats(kBleuNgramOrder*2+1);
for (size_t vi = 0; vi < graph.VertexSize(); ++vi) {
//cerr << "vertex id " << vi << endl;
// cerr << "vertex id " << vi << endl;
FeatureStatsType winnerScore = kMinScore;
const Vertex& vertex = graph.GetVertex(vi);
const vector<const Edge*>& incoming = vertex.GetIncoming();
@ -349,7 +359,7 @@ void Viterbi(const Graph& graph, const SparseVector& weights, float bleuWeight,
//UTIL_THROW(HypergraphException, "Vertex " << vi << " has no incoming edges");
//If no incoming edges, vertex is a dead end
backPointers[vi].first = NULL;
      backPointers[vi].second = kMinScore;
} else {
//cerr << "\nVertex: " << vi << endl;
for (size_t ei = 0; ei < incoming.size(); ++ei) {
@ -357,15 +367,15 @@ void Viterbi(const Graph& graph, const SparseVector& weights, float bleuWeight,
FeatureStatsType incomingScore = incoming[ei]->GetScore(weights);
for (size_t i = 0; i < incoming[ei]->Children().size(); ++i) {
size_t childId = incoming[ei]->Children()[i];
          //UTIL_THROW_IF(backPointers[childId].second == kMinScore,
          //  HypergraphException, "Graph was not topologically sorted. curr=" << vi << " prev=" << childId);
          incomingScore = max(incomingScore + backPointers[childId].second, kMinScore);
}
vector<FeatureStatsType> bleuStats(kBleuNgramOrder*2+1);
// cerr << "Score: " << incomingScore << " Bleu: ";
// if (incomingScore > nonbleuscore) {nonbleuscore = incomingScore; nonbleuid = ei;}
// cerr << "Score: " << incomingScore << " Bleu: ";
// if (incomingScore > nonbleuscore) {nonbleuscore = incomingScore; nonbleuid = ei;}
FeatureStatsType totalScore = incomingScore;
        if (bleuWeight) {
FeatureStatsType bleuScore = bleuScorer.Score(*(incoming[ei]), vertex, bleuStats);
if (isnan(bleuScore)) {
cerr << "WARN: bleu score undefined" << endl;
@ -379,7 +389,7 @@ void Viterbi(const Graph& graph, const SparseVector& weights, float bleuWeight,
}
//UTIL_THROW_IF(isnan(bleuScore), util::Exception, "Bleu score undefined, smoothing problem?");
totalScore += bleuWeight * bleuScore;
          // cerr << bleuScore << " Total: " << incomingScore << endl << endl;
//cerr << "is " << incomingScore << " bs " << bleuScore << endl;
}
if (totalScore >= winnerScore) {
@ -394,9 +404,12 @@ void Viterbi(const Graph& graph, const SparseVector& weights, float bleuWeight,
//update with winner
//if (bleuWeight) {
//TODO: Not sure if we need this when computing max-model solution
if (backPointers[vi].first) {
bleuScorer.UpdateState(*(backPointers[vi].first), vi, winnerStats);
}
}
// cerr << "backpointer[" << vi << "] = (" << backPointers[vi].first << "," << backPointers[vi].second << ")" << endl;
}
//expand back pointers
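The switch from throwing on unexpected back-pointer values to clamping with max(..., kMinScore) treats kMinScore as a saturating minus-infinity: dead-end children no longer abort the pass, and repeated accumulation cannot underflow past the floor. A one-line sketch of the invariant, with an invented name and assuming the FeatureStatsType and kMinScore definitions from this file:

    #include <algorithm>

    FeatureStatsType Accumulate(FeatureStatsType incoming, FeatureStatsType child) {
      return std::max(incoming + child, kMinScore);  // saturate at the floor
    }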

View File

@ -27,7 +27,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "BleuScorer.h"
#include "Hypergraph.h"
namespace MosesTuning {
namespace MosesTuning
{
std::ostream& operator<<(std::ostream& out, const WordVec& wordVec);
@ -47,18 +48,21 @@ struct NgramEquals : public std::binary_function<const WordVec&, const WordVec&,
typedef boost::unordered_map<WordVec, size_t, NgramHash, NgramEquals> NgramCounter;
class ReferenceSet {
class ReferenceSet
{
public:
void AddLine(size_t sentenceId, const StringPiece& line, Vocab& vocab);
void Load(const std::vector<std::string>& files, Vocab& vocab);
size_t NgramMatches(size_t sentenceId, const WordVec&, bool clip) const;
size_t Length(size_t sentenceId) const {return lengths_[sentenceId];}
size_t Length(size_t sentenceId) const {
return lengths_[sentenceId];
}
private:
//ngrams to (clipped,unclipped) counts
@ -80,31 +84,32 @@ struct VertexState {
/**
 * Used to score a rule (i.e. edge) when we are applying it.
**/
class HgBleuScorer {
public:
HgBleuScorer(const ReferenceSet& references, const Graph& graph, size_t sentenceId, const std::vector<FeatureStatsType>& backgroundBleu):
class HgBleuScorer
{
public:
HgBleuScorer(const ReferenceSet& references, const Graph& graph, size_t sentenceId, const std::vector<FeatureStatsType>& backgroundBleu):
references_(references), sentenceId_(sentenceId), graph_(graph), backgroundBleu_(backgroundBleu),
backgroundRefLength_(backgroundBleu[kBleuNgramOrder*2]) {
vertexStates_.resize(graph.VertexSize());
totalSourceLength_ = graph.GetVertex(graph.VertexSize()-1).SourceCovered();
}
backgroundRefLength_(backgroundBleu[kBleuNgramOrder*2]) {
vertexStates_.resize(graph.VertexSize());
totalSourceLength_ = graph.GetVertex(graph.VertexSize()-1).SourceCovered();
}
FeatureStatsType Score(const Edge& edge, const Vertex& head, std::vector<FeatureStatsType>& bleuStats) ;
FeatureStatsType Score(const Edge& edge, const Vertex& head, std::vector<FeatureStatsType>& bleuStats) ;
void UpdateState(const Edge& winnerEdge, size_t vertexId, const std::vector<FeatureStatsType>& bleuStats);
void UpdateState(const Edge& winnerEdge, size_t vertexId, const std::vector<FeatureStatsType>& bleuStats);
private:
const ReferenceSet& references_;
std::vector<VertexState> vertexStates_;
size_t sentenceId_;
size_t totalSourceLength_;
const Graph& graph_;
std::vector<FeatureStatsType> backgroundBleu_;
FeatureStatsType backgroundRefLength_;
private:
const ReferenceSet& references_;
std::vector<VertexState> vertexStates_;
size_t sentenceId_;
size_t totalSourceLength_;
const Graph& graph_;
std::vector<FeatureStatsType> backgroundBleu_;
FeatureStatsType backgroundRefLength_;
void UpdateMatches(const NgramCounter& counter, std::vector<FeatureStatsType>& bleuStats) const;
size_t GetTargetLength(const Edge& edge) const;
void UpdateMatches(const NgramCounter& counter, std::vector<FeatureStatsType>& bleuStats) const;
size_t GetTargetLength(const Edge& edge) const;
};
struct HgHypothesis {

View File

@ -15,7 +15,7 @@ BOOST_AUTO_TEST_CASE(viterbi_simple_lattice)
Vocab vocab;
WordVec words;
string wordStrings[] =
{"<s>", "</s>", "a", "b", "c", "d", "e", "f", "g"};
{"<s>", "</s>", "a", "b", "c", "d", "e", "f", "g"};
for (size_t i = 0; i < 9; ++i) {
words.push_back(&(vocab.FindOrAdd((wordStrings[i]))));
}
@ -102,7 +102,7 @@ BOOST_AUTO_TEST_CASE(viterbi_3branch_lattice)
Vocab vocab;
WordVec words;
string wordStrings[] =
{"<s>", "</s>", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"};
{"<s>", "</s>", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"};
for (size_t i = 0; i < 13; ++i) {
words.push_back(&(vocab.FindOrAdd((wordStrings[i]))));
}

View File

@ -34,11 +34,13 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
using namespace std;
namespace fs = boost::filesystem;
namespace MosesTuning
{
static const ValType BLEU_RATIO = 5;
ValType HopeFearDecoder::Evaluate(const AvgWeightVector& wv)
{
vector<ValType> stats(scorer_->NumberOfScores(),0);
for(reset(); !finished(); next()) {
vector<ValType> sent;
@ -51,13 +53,14 @@ ValType HopeFearDecoder::Evaluate(const AvgWeightVector& wv) {
}
NbestHopeFearDecoder::NbestHopeFearDecoder(
  const vector<string>& featureFiles,
  const vector<string>& scoreFiles,
  bool streaming,
  bool no_shuffle,
  bool safe_hope,
  Scorer* scorer
) : safe_hope_(safe_hope)
{
scorer_ = scorer;
if (streaming) {
train_.reset(new StreamingHypPackEnumerator(featureFiles, scoreFiles));
@ -67,25 +70,29 @@ NbestHopeFearDecoder::NbestHopeFearDecoder(
}
void NbestHopeFearDecoder::next()
{
train_->next();
}
bool NbestHopeFearDecoder::finished()
{
return train_->finished();
}
void NbestHopeFearDecoder::reset()
{
train_->reset();
}
void NbestHopeFearDecoder::HopeFear(
  const std::vector<ValType>& backgroundBleu,
  const MiraWeightVector& wv,
  HopeFearData* hopeFear
)
{
// Hope / fear decode
ValType hope_scale = 1.0;
size_t hope_index=0, fear_index=0, model_index=0;
@ -134,7 +141,8 @@ void NbestHopeFearDecoder::HopeFear(
hopeFear->hopeFearEqual = (hope_index == fear_index);
}
void NbestHopeFearDecoder::MaxModel(const AvgWeightVector& wv, std::vector<ValType>* stats)
{
// Find max model
size_t max_index=0;
ValType max_score=0;
@ -152,18 +160,19 @@ void NbestHopeFearDecoder::MaxModel(const AvgWeightVector& wv, std::vector<ValTy
HypergraphHopeFearDecoder::HypergraphHopeFearDecoder
(
  const string& hypergraphDir,
  const vector<string>& referenceFiles,
  size_t num_dense,
  bool streaming,
  bool no_shuffle,
  bool safe_hope,
  size_t hg_pruning,
  const MiraWeightVector& wv,
  Scorer* scorer
) :
  num_dense_(num_dense)
{
UTIL_THROW_IF(streaming, util::Exception, "Streaming not currently supported for hypergraphs");
UTIL_THROW_IF(!fs::exists(hypergraphDir), HypergraphException, "Directory '" << hypergraphDir << "' does not exist");
@ -177,16 +186,17 @@ HypergraphHopeFearDecoder::HypergraphHopeFearDecoder
static const string kWeights = "weights";
fs::directory_iterator dend;
size_t fileCount = 0;
cerr << "Reading hypergraphs" << endl;
for (fs::directory_iterator di(hypergraphDir); di != dend; ++di) {
const fs::path& hgpath = di->path();
if (hgpath.filename() == kWeights) continue;
// cerr << "Reading " << hgpath.filename() << endl;
Graph graph(vocab_);
size_t id = boost::lexical_cast<size_t>(hgpath.stem().string());
util::scoped_fd fd(util::OpenReadOrThrow(hgpath.string().c_str()));
//util::FilePiece file(di->path().string().c_str());
    util::FilePiece file(fd.release());
ReadGraph(file,graph);
//cerr << "ref length " << references_.Length(id) << endl;
@ -195,7 +205,7 @@ HypergraphHopeFearDecoder::HypergraphHopeFearDecoder
prunedGraph.reset(new Graph(vocab_));
graph.Prune(prunedGraph.get(), weights, edgeCount);
graphs_[id] = prunedGraph;
//cerr << "Pruning to v=" << graphs_[id]->VertexSize() << " e=" << graphs_[id]->EdgeSize() << endl;
// cerr << "Pruning to v=" << graphs_[id]->VertexSize() << " e=" << graphs_[id]->EdgeSize() << endl;
++fileCount;
if (fileCount % 10 == 0) cerr << ".";
if (fileCount % 400 == 0) cerr << " [count=" << fileCount << "]\n";
@ -210,23 +220,27 @@ HypergraphHopeFearDecoder::HypergraphHopeFearDecoder
}
void HypergraphHopeFearDecoder::reset()
{
sentenceIdIter_ = sentenceIds_.begin();
}
void HypergraphHopeFearDecoder::next()
{
sentenceIdIter_++;
}
bool HypergraphHopeFearDecoder::finished()
{
return sentenceIdIter_ == sentenceIds_.end();
}
void HypergraphHopeFearDecoder::HopeFear(
  const vector<ValType>& backgroundBleu,
  const MiraWeightVector& wv,
  HopeFearData* hopeFear
)
{
size_t sentenceId = *sentenceIdIter_;
SparseVector weights;
wv.ToSparse(&weights);
@ -246,12 +260,12 @@ void HypergraphHopeFearDecoder::HopeFear(
Viterbi(graph, weights, 0, references_, sentenceId, backgroundBleu, &modelHypo);
    // Outer loop rescales the contribution of model score to 'hope' in antagonistic cases
// where model score is having far more influence than BLEU
    // hope_bleu *= BLEU_RATIO; // We only care about cases where model has MUCH more influence than BLEU
    // if(safe_hope_ && safe_loop==0 && abs(hope_model)>1e-8 && abs(hope_bleu)/abs(hope_model)<hope_scale)
    //   hope_scale = abs(hope_bleu) / abs(hope_model);
    // else break;
//TODO: Don't currently get model and bleu so commented this out for now.
break;
}
@ -310,21 +324,23 @@ void HypergraphHopeFearDecoder::HopeFear(
if (hopeFear->hopeFearEqual) {
for (size_t i = 0; i < fearStats.size(); ++i) {
if (fearStats[i] != hopeFear->hopeStats[i]) {
        hopeFear->hopeFearEqual = false;
        break;
}
}
}
hopeFear->hopeFearEqual = hopeFear->hopeFearEqual && (hopeFear->fearFeatures == hopeFear->hopeFeatures);
}
void HypergraphHopeFearDecoder::MaxModel(const AvgWeightVector& wv, vector<ValType>* stats)
{
assert(!finished());
HgHypothesis bestHypo;
size_t sentenceId = *sentenceIdIter_;
SparseVector weights;
wv.ToSparse(&weights);
vector<ValType> bg(scorer_->NumberOfScores());
//cerr << "Calculating bleu on " << sentenceId << endl;
Viterbi(*(graphs_[sentenceId]), weights, 0, references_, sentenceId, bg, &bestHypo);
stats->resize(bestHypo.bleuStats.size());
/*

View File

@ -35,7 +35,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
// the n-best list and lattice/hypergraph implementations
//
namespace MosesTuning {
namespace MosesTuning
{
class Scorer;
@ -44,7 +45,7 @@ struct HopeFearData {
MiraFeatureVector modelFeatures;
MiraFeatureVector hopeFeatures;
MiraFeatureVector fearFeatures;
std::vector<float> modelStats;
std::vector<float> hopeStats;
@ -55,7 +56,8 @@ struct HopeFearData {
};
//Abstract base class
class HopeFearDecoder {
class HopeFearDecoder
{
public:
//iterator methods
virtual void reset() = 0;
@ -68,14 +70,14 @@ public:
* Calculate hope, fear and model hypotheses
**/
virtual void HopeFear(
const std::vector<ValType>& backgroundBleu,
const MiraWeightVector& wv,
HopeFearData* hopeFear
) = 0;
/** Max score decoding */
virtual void MaxModel(const AvgWeightVector& wv, std::vector<ValType>* stats)
= 0;
/** Calculate bleu on training set */
ValType Evaluate(const AvgWeightVector& wv);
@ -86,25 +88,26 @@ protected:
/** Gets hope-fear from nbest lists */
class NbestHopeFearDecoder : public virtual HopeFearDecoder {
class NbestHopeFearDecoder : public virtual HopeFearDecoder
{
public:
NbestHopeFearDecoder(const std::vector<std::string>& featureFiles,
const std::vector<std::string>& scoreFiles,
bool streaming,
bool no_shuffle,
bool safe_hope,
Scorer* scorer
);
virtual void reset();
virtual void next();
virtual bool finished();
virtual void HopeFear(
const std::vector<ValType>& backgroundBleu,
const MiraWeightVector& wv,
HopeFearData* hopeFear
);
virtual void MaxModel(const AvgWeightVector& wv, std::vector<ValType>* stats);
@ -117,29 +120,30 @@ private:
/** Gets hope-fear from hypergraphs */
class HypergraphHopeFearDecoder : public virtual HopeFearDecoder {
class HypergraphHopeFearDecoder : public virtual HopeFearDecoder
{
public:
HypergraphHopeFearDecoder(
const std::string& hypergraphDir,
const std::vector<std::string>& referenceFiles,
size_t num_dense,
bool streaming,
bool no_shuffle,
bool safe_hope,
size_t hg_pruning,
const MiraWeightVector& wv,
Scorer* scorer_
);
virtual void reset();
virtual void next();
virtual bool finished();
virtual void HopeFear(
const std::vector<ValType>& backgroundBleu,
const MiraWeightVector& wv,
HopeFearData* hopeFear
);
virtual void MaxModel(const AvgWeightVector& wv, std::vector<ValType>* stats);

View File

@ -55,7 +55,8 @@ void HwcmScorer::setReferenceFiles(const vector<string>& referenceFiles)
}
void HwcmScorer::extractHeadWordChain(TreePointer tree, vector<string> & history, vector<map<string, int> > & hwc) {
void HwcmScorer::extractHeadWordChain(TreePointer tree, vector<string> & history, vector<map<string, int> > & hwc)
{
if (tree->GetLength() > 0) {
string head = getHead(tree);
@ -64,8 +65,7 @@ void HwcmScorer::extractHeadWordChain(TreePointer tree, vector<string> & history
for (std::vector<TreePointer>::const_iterator it = tree->GetChildren().begin(); it != tree->GetChildren().end(); ++it) {
extractHeadWordChain(*it, history, hwc);
}
}
else {
} else {
vector<string> new_history(kHwcmOrder);
new_history[0] = head;
hwc[0][head]++;
@ -85,11 +85,11 @@ void HwcmScorer::extractHeadWordChain(TreePointer tree, vector<string> & history
}
}
string HwcmScorer::getHead(TreePointer tree) {
string HwcmScorer::getHead(TreePointer tree)
{
// assumption (only true for dependency parses): each constituent has a preterminal label, and the corresponding terminal is its head
// if a constituent has multiple preterminals, the first one is picked; if it has none, an empty string is returned
for (std::vector<TreePointer>::const_iterator it = tree->GetChildren().begin(); it != tree->GetChildren().end(); ++it)
{
for (std::vector<TreePointer>::const_iterator it = tree->GetChildren().begin(); it != tree->GetChildren().end(); ++it) {
TreePointer child = *it;
if (child->GetLength() == 1 && child->GetChildren()[0]->IsTerminal()) {
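For context, the head-word chains collected here generalize n-grams along the head annotation instead of the surface order: with kHwcmOrder = 3 and a dependency-style tree in which "saw" heads "man" and "man" heads "the", the extracted chains are "saw" (hwc[0]), "saw man" (hwc[1]) and "saw man the" (hwc[2]), and the per-length counts are then matched against the reference chains like ordinary n-gram statistics. (Illustrative words only; they do not come from this code.)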

View File

@ -31,18 +31,22 @@ using namespace std;
static const string kBOS = "<s>";
static const string kEOS = "</s>";
namespace MosesTuning {
namespace MosesTuning
{
StringPiece NextLine(util::FilePiece& from) {
StringPiece NextLine(util::FilePiece& from)
{
StringPiece line;
while ((line = from.ReadLine()).starts_with("#"));
return line;
}
Vocab::Vocab() : eos_( FindOrAdd(kEOS)), bos_(FindOrAdd(kBOS)){
Vocab::Vocab() : eos_( FindOrAdd(kEOS)), bos_(FindOrAdd(kBOS))
{
}
const Vocab::Entry &Vocab::FindOrAdd(const StringPiece &str) {
const Vocab::Entry &Vocab::FindOrAdd(const StringPiece &str)
{
#if BOOST_VERSION >= 104200
Map::const_iterator i= map_.find(str, Hash(), Equals());
#else
@ -62,7 +66,8 @@ double_conversion::StringToDoubleConverter converter(double_conversion::StringTo
/**
* Reads an incoming edge. Returns edge and source words covered.
**/
static pair<Edge*,size_t> ReadEdge(util::FilePiece &from, Graph &graph) {
static pair<Edge*,size_t> ReadEdge(util::FilePiece &from, Graph &graph)
{
Edge* edge = graph.NewEdge();
StringPiece line = from.ReadLine(); //Don't allow comments within edge lists
util::TokenIter<util::MultiCharacter> pipes(line, util::MultiCharacter(" ||| "));
@ -82,7 +87,7 @@ static pair<Edge*,size_t> ReadEdge(util::FilePiece &from, Graph &graph) {
edge->AddWord(&found);
}
}
//Features
++pipes;
for (util::TokenIter<util::SingleCharacter, true> i(*pipes, util::SingleCharacter(' ')); i; ++i) {
@ -100,17 +105,18 @@ static pair<Edge*,size_t> ReadEdge(util::FilePiece &from, Graph &graph) {
//Covered words
++pipes;
size_t sourceCovered = boost::lexical_cast<size_t>(*pipes);
return pair<Edge*,size_t>(edge,sourceCovered);
}
void Graph::Prune(Graph* pNewGraph, const SparseVector& weights, size_t minEdgeCount) const {
void Graph::Prune(Graph* pNewGraph, const SparseVector& weights, size_t minEdgeCount) const
{
Graph& newGraph = *pNewGraph;
//TODO: Optimise case where no pruning required
//For debug
/*
map<const Edge*, string> edgeIds;
for (size_t i = 0; i < edges_.Size(); ++i) {
@ -136,7 +142,7 @@ void Graph::Prune(Graph* pNewGraph, const SparseVector& weights, size_t minEdgeC
//Compute backward scores
for (size_t vi = 0; vi < vertices_.Size(); ++vi) {
// cerr << "Vertex " << vi << endl;
// cerr << "Vertex " << vi << endl;
const Vertex& vertex = vertices_[vi];
const vector<const Edge*>& incoming = vertex.GetIncoming();
if (!incoming.size()) {
@ -150,7 +156,7 @@ void Graph::Prune(Graph* pNewGraph, const SparseVector& weights, size_t minEdgeC
//cerr << "\tChild " << incoming[ei]->Children()[i] << endl;
size_t childId = incoming[ei]->Children()[i];
UTIL_THROW_IF(vertexBackwardScores[childId] == kMinScore,
HypergraphException, "Graph was not topologically sorted. curr=" << vi << " prev=" << childId);
HypergraphException, "Graph was not topologically sorted. curr=" << vi << " prev=" << childId);
outgoing[childId].push_back(incoming[ei]);
incomingScore += vertexBackwardScores[childId];
}
@ -172,7 +178,7 @@ void Graph::Prune(Graph* pNewGraph, const SparseVector& weights, size_t minEdgeC
} else {
for (size_t ei = 0; ei < outgoing[vi].size(); ++ei) {
//cerr << "Edge " << edgeIds[outgoing[vi][ei]] << endl;
FeatureStatsType outgoingScore = 0;
//add score of head
outgoingScore += vertexForwardScores[edgeHeads[outgoing[vi][ei]]];
//cerr << "Forward score " << outgoingScore << endl;
@ -204,11 +210,11 @@ void Graph::Prune(Graph* pNewGraph, const SparseVector& weights, size_t minEdgeC
}
FeatureStatsType score = edgeForwardScores[edge] + edgeBackwardScores[edge];
edgeScores.insert(pair<FeatureStatsType, const Edge*>(score,edge));
// cerr << edgeIds[edge] << " " << score << endl;
}
multimap<FeatureStatsType, const Edge*>::const_reverse_iterator ei = edgeScores.rbegin();
size_t edgeCount = 1;
while(edgeCount < minEdgeCount && ei != edgeScores.rend()) {
@ -235,10 +241,10 @@ void Graph::Prune(Graph* pNewGraph, const SparseVector& weights, size_t minEdgeC
map<size_t,size_t> oldIdToNew;
size_t vi = 0;
for (set<size_t>::const_iterator i = retainedVertices.begin(); i != retainedVertices.end(); ++i, ++vi) {
//cerr << *i << " New: " << vi << endl;
// cerr << *i << " New: " << vi << endl;
oldIdToNew[*i] = vi;
Vertex* vertex = newGraph.NewVertex();
vertex->SetSourceCovered(vertices_[*i].SourceCovered());
}
for (set<const Edge*>::const_iterator i = retainedEdges.begin(); i != retainedEdges.end(); ++i) {
@ -255,6 +261,7 @@ void Graph::Prune(Graph* pNewGraph, const SparseVector& weights, size_t minEdgeC
newHead.AddEdge(newEdge);
}
/*
cerr << "New graph" << endl;
for (size_t vi = 0; vi < newGraph.VertexSize(); ++vi) {
@ -274,21 +281,22 @@ void Graph::Prune(Graph* pNewGraph, const SparseVector& weights, size_t minEdgeC
}
cerr << endl;
}
*/
}
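Read in the max-product (Viterbi) semiring that Prune works in, the bookkeeping above has a compact form: edgeBackwardScores[e] = w.f(e) + sum over children c of vertexBackwardScores[c]; vertexBackwardScores[v] = max over incoming edges e of edgeBackwardScores[e]; and each edge is ranked by edgeForwardScores[e] + edgeBackwardScores[e], i.e. the score of the best complete derivation passing through it. Edges are then retained in descending score order until minEdgeCount is reached, and the surviving vertices are renumbered via oldIdToNew.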
/**
* Read from "Kenneth's hypergraph" aka cdec target_graph format (with comments)
**/
void ReadGraph(util::FilePiece &from, Graph &graph) {
void ReadGraph(util::FilePiece &from, Graph &graph)
{
//First line should contain field names
StringPiece line = from.ReadLine();
UTIL_THROW_IF(line.compare("# target ||| features ||| source-covered") != 0, HypergraphException, "Incorrect format spec on first line: '" << line << "'");
line = NextLine(from);
//Then expect numbers of vertices
util::TokenIter<util::SingleCharacter, false> i(line, util::SingleCharacter(' '));
unsigned long int vertices = boost::lexical_cast<unsigned long int>(*i);
@ -303,9 +311,11 @@ void ReadGraph(util::FilePiece &from, Graph &graph) {
for (unsigned long int e = 0; e < edge_count; ++e) {
pair<Edge*,size_t> edge = ReadEdge(from, graph);
vertex->AddEdge(edge.first);
//Note: the file format attaches this to the edge, but it's really a property
//of the vertex.
if (!e) {vertex->SetSourceCovered(edge.second);}
if (!e) {
vertex->SetSourceCovered(edge.second);
}
}
}
}
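For reference, the layout ReadGraph accepts can be sketched as follows; this fragment is illustrative only (the feature names and values and the bracketed child-reference notation are assumptions, and the per-vertex count line is inferred from the loop structure above):

# target ||| features ||| source-covered
2 2
1
<s> a ||| lm=-1.5 ||| 1
1
[0] b </s> ||| lm=-0.7 tm=0.3 ||| 2

The first line must be the literal field-name header checked above, '#'-prefixed comment lines are skipped by NextLine, the next line carries the vertex and edge totals, and each vertex block then gives its incoming-edge count followed by that many "target ||| features ||| source-covered" lines.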

View File

@ -37,81 +37,88 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "FeatureStats.h"
namespace MosesTuning {
namespace MosesTuning
{
typedef unsigned int WordIndex;
const WordIndex kMaxWordIndex = UINT_MAX;
const FeatureStatsType kMinScore = -std::numeric_limits<FeatureStatsType>::max();
const FeatureStatsType kMinScore = -1e10;
template <class T> class FixedAllocator : boost::noncopyable {
public:
FixedAllocator() : current_(NULL), end_(NULL) {}
template <class T> class FixedAllocator : boost::noncopyable
{
public:
FixedAllocator() : current_(NULL), end_(NULL) {}
void Init(std::size_t count) {
assert(!current_);
array_.reset(new T[count]);
current_ = array_.get();
end_ = current_ + count;
}
T &operator[](std::size_t idx) {
return array_.get()[idx];
}
const T &operator[](std::size_t idx) const {
return array_.get()[idx];
}
T *New() {
T *ret = current_++;
UTIL_THROW_IF(ret >= end_, util::Exception, "Allocating past end");
return ret;
}
std::size_t Capacity() const {
return end_ - array_.get();
}
std::size_t Size() const {
return current_ - array_.get();
}
private:
boost::scoped_array<T> array_;
T *current_, *end_;
};
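FixedAllocator is a fixed-capacity bump allocator: Init reserves the whole pool once, New hands out the next unused slot in O(1), and nothing is freed individually. A usage sketch (Widget is a placeholder type, not part of the code above):

struct Widget { int id; };
FixedAllocator<Widget> pool;
pool.Init(1024);        // reserve 1024 slots up front; Capacity() == 1024
Widget* w = pool.New(); // bump-allocate the next slot; Size() == 1
w->id = 7;              // New() throws util::Exception once the pool is exhausted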
class Vocab {
public:
Vocab();
class Vocab
{
public:
Vocab();
typedef std::pair<const char *const, WordIndex> Entry;
const Entry &FindOrAdd(const StringPiece &str);
const Entry& Bos() const {return bos_;}
const Entry& Bos() const {
return bos_;
}
const Entry& Eos() const {return eos_;}
const Entry& Eos() const {
return eos_;
}
private:
util::Pool piece_backing_;
struct Hash : public std::unary_function<const char *, std::size_t> {
std::size_t operator()(StringPiece str) const {
return util::MurmurHashNative(str.data(), str.size());
}
};
struct Equals : public std::binary_function<const char *, const char *, bool> {
bool operator()(StringPiece first, StringPiece second) const {
return first == second;
}
};
typedef boost::unordered_map<const char *, WordIndex, Hash, Equals> Map;
Map map_;
Entry eos_;
Entry bos_;
};
@ -125,121 +132,141 @@ typedef boost::shared_ptr<SparseVector> FeaturePtr;
/**
* An edge has 1 head vertex, 0..n child (tail) vertices, a list of words and a feature vector.
**/
class Edge {
public:
Edge() {features_.reset(new SparseVector());}
class Edge
{
public:
Edge() {
features_.reset(new SparseVector());
}
void AddWord(const Vocab::Entry *word) {
words_.push_back(word);
}
void AddChild(size_t child) {
children_.push_back(child);
}
void AddFeature(const StringPiece& name, FeatureStatsType value) {
//TODO StringPiece interface
features_->set(name.as_string(),value);
}
const WordVec &Words() const {
return words_;
}
const FeaturePtr& Features() const {
return features_;
}
void SetFeatures(const FeaturePtr& features) {
features_ = features;
}
const std::vector<size_t>& Children() const {
return children_;
}
FeatureStatsType GetScore(const SparseVector& weights) const {
return inner_product(*(features_.get()), weights);
}
private:
// NULL for non-terminals.
std::vector<const Vocab::Entry*> words_;
std::vector<size_t> children_;
boost::shared_ptr<SparseVector> features_;
};
/*
* A vertex has 0..n incoming edges
**/
class Vertex {
public:
Vertex() : sourceCovered_(0) {}
class Vertex
{
public:
Vertex() : sourceCovered_(0) {}
void AddEdge(const Edge* edge) {incoming_.push_back(edge);}
void AddEdge(const Edge* edge) {
incoming_.push_back(edge);
}
void SetSourceCovered(size_t sourceCovered) {sourceCovered_ = sourceCovered;}
void SetSourceCovered(size_t sourceCovered) {
sourceCovered_ = sourceCovered;
}
const std::vector<const Edge*>& GetIncoming() const {return incoming_;}
const std::vector<const Edge*>& GetIncoming() const {
return incoming_;
}
size_t SourceCovered() const {return sourceCovered_;}
size_t SourceCovered() const {
return sourceCovered_;
}
private:
std::vector<const Edge*> incoming_;
size_t sourceCovered_;
};
class Graph : boost::noncopyable {
public:
Graph(Vocab& vocab) : vocab_(vocab) {}
class Graph : boost::noncopyable
{
public:
Graph(Vocab& vocab) : vocab_(vocab) {}
void SetCounts(std::size_t vertices, std::size_t edges) {
vertices_.Init(vertices);
edges_.Init(edges);
}
Vocab &MutableVocab() { return vocab_; }
Vocab &MutableVocab() {
return vocab_;
}
Edge *NewEdge() {
return edges_.New();
}
Vertex *NewVertex() {
return vertices_.New();
}
const Vertex &GetVertex(std::size_t index) const {
return vertices_[index];
}
Edge &GetEdge(std::size_t index) {
return edges_[index];
}
/* Creates a pruned copy of this graph with minEdgeCount edges. Uses
the scores in the max-product semiring to rank edges, as suggested by
Colin Cherry */
void Prune(Graph* newGraph, const SparseVector& weights, size_t minEdgeCount) const;
std::size_t VertexSize() const { return vertices_.Size(); }
std::size_t EdgeSize() const { return edges_.Size(); }
std::size_t VertexSize() const {
return vertices_.Size();
}
std::size_t EdgeSize() const {
return edges_.Size();
}
bool IsBoundary(const Vocab::Entry* word) const {
return word->second == vocab_.Bos().second || word->second == vocab_.Eos().second;
}
private:
FixedAllocator<Edge> edges_;
FixedAllocator<Vertex> vertices_;
Vocab& vocab_;
};
class HypergraphException : public util::Exception {
public:
HypergraphException() {}
~HypergraphException() throw() {}
class HypergraphException : public util::Exception
{
public:
HypergraphException() {}
~HypergraphException() throw() {}
};

View File

@ -8,12 +8,12 @@
using namespace std;
using namespace MosesTuning;
BOOST_AUTO_TEST_CASE(prune)
{
Vocab vocab;
WordVec words;
string wordStrings[] =
{"<s>", "</s>", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"};
{"<s>", "</s>", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"};
for (size_t i = 0; i < 13; ++i) {
words.push_back(&(vocab.FindOrAdd((wordStrings[i]))));
}
@ -105,7 +105,7 @@ BOOST_AUTO_TEST_CASE(prune)
BOOST_CHECK_EQUAL(5, pruned.EdgeSize());
BOOST_CHECK_EQUAL(4, pruned.VertexSize());
//edges retained should be best path (<s> ab jk </s>) and hi
BOOST_CHECK_EQUAL(1, pruned.GetVertex(0).GetIncoming().size());
BOOST_CHECK_EQUAL(2, pruned.GetVertex(1).GetIncoming().size());
@ -115,37 +115,37 @@ BOOST_AUTO_TEST_CASE(prune)
const Edge* edge;
edge = pruned.GetVertex(0).GetIncoming()[0];
BOOST_CHECK_EQUAL(1, edge->Words().size());
BOOST_CHECK_EQUAL(words[0], edge->Words()[0]);
edge = pruned.GetVertex(1).GetIncoming()[0];
BOOST_CHECK_EQUAL(3, edge->Words().size());
BOOST_CHECK_EQUAL((Vocab::Entry*)NULL, edge->Words()[0]);
BOOST_CHECK_EQUAL(words[2]->first, edge->Words()[1]->first);
BOOST_CHECK_EQUAL(words[3]->first, edge->Words()[2]->first);
edge = pruned.GetVertex(1).GetIncoming()[1];
BOOST_CHECK_EQUAL(3, edge->Words().size());
BOOST_CHECK_EQUAL((Vocab::Entry*)NULL, edge->Words()[0]);
BOOST_CHECK_EQUAL(words[9]->first, edge->Words()[1]->first);
BOOST_CHECK_EQUAL(words[10]->first, edge->Words()[2]->first);
edge = pruned.GetVertex(2).GetIncoming()[0];
BOOST_CHECK_EQUAL(3, edge->Words().size());
BOOST_CHECK_EQUAL((Vocab::Entry*)NULL, edge->Words()[0]);
BOOST_CHECK_EQUAL(words[11]->first, edge->Words()[1]->first);
BOOST_CHECK_EQUAL(words[12]->first, edge->Words()[2]->first);
edge = pruned.GetVertex(3).GetIncoming()[0];
BOOST_CHECK_EQUAL(2, edge->Words().size());
BOOST_CHECK_EQUAL((Vocab::Entry*)NULL, edge->Words()[0]);
BOOST_CHECK_EQUAL(words[1]->first, edge->Words()[1]->first);
// BOOST_CHECK_EQUAL(words[0], pruned.GetVertex(0).GetIncoming()[0].Words()[0]);
}

View File

@ -174,19 +174,19 @@ float InterpolatedScorer::calculateScore(const std::vector<ScoreStatsType>& tota
float InterpolatedScorer::getReferenceLength(const std::vector<ScoreStatsType>& totals) const
{
size_t scorerNum = 0;
size_t last = 0;
float refLen = 0;
for (ScopedVector<Scorer>::const_iterator itsc = m_scorers.begin();
itsc != m_scorers.end(); ++itsc) {
int numScoresScorer = (*itsc)->NumberOfScores();
std::vector<ScoreStatsType> totals_scorer(totals.begin()+last, totals.begin()+last+numScoresScorer);
refLen += (*itsc)->getReferenceLength(totals_scorer) * m_scorer_weights[scorerNum];
last += numScoresScorer;
scorerNum++;
}
return refLen;
}
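In symbols, with interpolation weights w_k and s_k denoting the slice of totals that belongs to scorer k (NumberOfScores() entries starting at offset last): refLen(totals) = sum over k of w_k * refLen_k(s_k), which is exactly the weighted sum accumulated above.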
void InterpolatedScorer::setReferenceFiles(const vector<string>& referenceFiles)
{

View File

@ -34,7 +34,8 @@ namespace MosesTuning
#define CHILD_STDOUT_WRITE pipefds_output[1]
MeteorScorer::MeteorScorer(const string& config)
: StatisticsBasedScorer("METEOR",config) {
: StatisticsBasedScorer("METEOR",config)
{
meteor_jar = getConfig("jar", "");
meteor_lang = getConfig("lang", "en");
meteor_task = getConfig("task", "tune");
@ -88,7 +89,8 @@ MeteorScorer::MeteorScorer(const string& config)
m_from_meteor = new ifdstream(CHILD_STDOUT_READ);
}
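The constructor above wires the scorer to an external Meteor process through two pipes, one per direction. A minimal sketch of the underlying POSIX pattern (not the Moses code itself; the jar name, arguments, and the protocol line are placeholders):

#include <unistd.h>
int main() {
  int to_child[2], from_child[2];
  pipe(to_child);                        // parent writes -> child's stdin
  pipe(from_child);                      // child's stdout -> parent reads
  if (fork() == 0) {                     // child: become the Meteor process
    dup2(to_child[0], STDIN_FILENO);
    dup2(from_child[1], STDOUT_FILENO);
    execlp("java", "java", "-jar", "meteor.jar", (char*)0); // placeholder args
  }
  // parent: write a request line to to_child[1], then read the
  // statistics line back from from_child[0].
  return 0;
}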
MeteorScorer::~MeteorScorer() {
MeteorScorer::~MeteorScorer()
{
// Cleanup IO
delete m_to_meteor;
delete m_from_meteor;
@ -171,7 +173,8 @@ float MeteorScorer::calculateScore(const vector<ScoreStatsType>& comps) const
// Meteor unsupported, throw error if used
MeteorScorer::MeteorScorer(const string& config)
: StatisticsBasedScorer("METEOR",config) {
: StatisticsBasedScorer("METEOR",config)
{
throw runtime_error("Meteor unsupported, requires GLIBCXX");
}

View File

@ -20,7 +20,7 @@ class ifdstream;
class ScoreStats;
/**
* Meteor scoring
* Meteor scoring
*
* https://github.com/mjdenkowski/meteor
* http://statmt.org/wmt11/pdf/WMT07.pdf

View File

@ -9,7 +9,8 @@ namespace MosesTuning
{
void MiraFeatureVector::InitSparse(const SparseVector& sparse, size_t ignoreLimit) {
void MiraFeatureVector::InitSparse(const SparseVector& sparse, size_t ignoreLimit)
{
vector<size_t> sparseFeats = sparse.feats();
bool bFirst = true;
size_t lastFeat = 0;
@ -40,7 +41,8 @@ MiraFeatureVector::MiraFeatureVector(const FeatureDataItem& vec)
InitSparse(vec.sparse);
}
MiraFeatureVector::MiraFeatureVector(const SparseVector& sparse, size_t num_dense) {
MiraFeatureVector::MiraFeatureVector(const SparseVector& sparse, size_t num_dense)
{
m_dense.resize(num_dense);
//Assume that features with id [0,num_dense) are the dense features
for (size_t id = 0; id < num_dense; ++id) {
@ -162,7 +164,8 @@ MiraFeatureVector operator-(const MiraFeatureVector& a, const MiraFeatureVector&
return MiraFeatureVector(dense,sparseFeats,sparseVals);
}
bool operator==(const MiraFeatureVector& a,const MiraFeatureVector& b) {
bool operator==(const MiraFeatureVector& a,const MiraFeatureVector& b)
{
ValType eps = 1e-8;
//dense features
if (a.m_dense.size() != b.m_dense.size()) return false;

View File

@ -93,7 +93,8 @@ void MiraWeightVector::update(size_t index, ValType delta)
m_lastUpdated[index] = m_numUpdates;
}
void MiraWeightVector::ToSparse(SparseVector* sparse) const {
void MiraWeightVector::ToSparse(SparseVector* sparse) const
{
for (size_t i = 0; i < m_weights.size(); ++i) {
if(abs(m_weights[i])>1e-8) {
sparse->set(i,m_weights[i]);
@ -171,7 +172,8 @@ size_t AvgWeightVector::size() const
return m_wv.m_weights.size();
}
void AvgWeightVector::ToSparse(SparseVector* sparse) const {
void AvgWeightVector::ToSparse(SparseVector* sparse) const
{
for (size_t i = 0; i < size(); ++i) {
ValType w = weight(i);
if(abs(w)>1e-8) {

View File

@ -168,8 +168,8 @@ statscore_t Optimizer::LineOptimize(const Point& origin, const Point& direction,
// The rightmost bestindex is the one with the highest slope.
// They should be equal, but a small difference can remain due to rounding error.
UTIL_THROW_IF(abs(leftmost->first-gradient.rbegin()->first) >= 0.0001,
util::Exception, "Error");
break;
}
@ -191,8 +191,8 @@ statscore_t Optimizer::LineOptimize(const Point& origin, const Point& direction,
if (tit == previnserted) {
// The threshold is the same as before; this can happen if, for example, two candidates are identical.
UTIL_THROW_IF(previnserted->second.back().first != newd.first,
util::Exception,
"Error");
previnserted->second.back()=newd; // just replace the 1 best for sentence S
// previnsert doesn't change
} else {
@ -207,8 +207,8 @@ statscore_t Optimizer::LineOptimize(const Point& origin, const Point& direction,
// We append the diffs in previnsert to tit before destroying previnsert.
tit->second.insert(tit->second.end(),previnserted->second.begin(),previnserted->second.end());
UTIL_THROW_IF(tit->second.back().first != newd.first,
util::Exception,
"Error");
tit->second.back()=newd; // change diff for sentence S
thresholdmap.erase(previnserted); // erase old previnsert
previnserted = tit; // point previnsert to the new threshold
@ -216,8 +216,8 @@ statscore_t Optimizer::LineOptimize(const Point& origin, const Point& direction,
}
UTIL_THROW_IF(previnserted == thresholdmap.end(),
util::Exception,
"Error");
} else { //normal insertion process
previnserted = AddThreshold(thresholdmap, leftmostx, newd);
}
@ -254,8 +254,8 @@ statscore_t Optimizer::LineOptimize(const Point& origin, const Point& direction,
// We skipped the first element of thresholdlist, but GetIncStatScore returns 1 more for first1best.
UTIL_THROW_IF(scores.size() != thresholdmap.size(),
util::Exception,
"Error");
for (unsigned int sc = 0; sc != scores.size(); sc++) {
//cerr << "x=" << thrit->first << " => " << scores[sc] << endl;

View File

@ -40,8 +40,8 @@ Point::Point(const vector<parameter_t>& init,
m_max[i] = max[i];
}
} else {
UTIL_THROW_IF(init.size() != m_pdim, util::Exception, "Error");
UTIL_THROW_IF(m_opt_indices.size() != Point::m_dim, util::Exception, "Error");
for (unsigned int i = 0; i < Point::m_dim; i++) {
operator[](i) = init[m_opt_indices[i]];
m_min[i] = min[m_opt_indices[i]];

View File

@ -35,7 +35,7 @@ PreProcessFilter::PreProcessFilter(const string& filterCommand)
m_fromFilter(NULL)
{
#if defined __MINGW32__
//TODO(jie): replace this function with boost implementation
#else
// Child error signal install
// sigaction is the replacement for the traditional signal() method

View File

@ -25,9 +25,9 @@ const int kUnknownToken = -1;
Scorer::Scorer(const string& name, const string& config)
: m_name(name),
m_vocab(mert::VocabularyFactory::GetVocabulary()),
#if defined(__GLIBCXX__) || defined(__GLIBCPP__)
m_filter(NULL),
#endif
m_score_data(NULL),
m_enable_preserve_case(true)
{

View File

@ -23,7 +23,7 @@ namespace MosesTuning
*/
class StatisticsBasedScorer : public Scorer
{
friend class HopeFearDecoder;
public:
StatisticsBasedScorer(const std::string& name, const std::string& config);

View File

@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
Contact: christophe.servan@lium.univ-lemans.fr
The tercpp tool and library are free software: you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the licence, or
(at your option) any later version.
@ -23,15 +23,15 @@ Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
using namespace std;
namespace TERCpp
{
string alignmentStruct::toString()
{
stringstream s;
// s << "nword : " << vectorToString(nwords)<<endl;
// s << "alignment" << vectorToString(alignment)<<endl;
// s << "afterShift" << vectorToString(alignment)<<endl;
s << "Nothing to be printed" <<endl;
return s.str();
}
s << "Nothing to be printed" <<endl;
return s.str();
}
// alignmentStruct::alignmentStruct()
// {
@ -99,7 +99,7 @@ namespace TERCpp
// return s.str();
// }
/* The distance of the shift. */
// int alignmentStruct::distance()
// {
// if (moveto < start)

View File

@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
Contact: christophe.servan@lium.univ-lemans.fr
The tercpp tool and library are free software: you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the licence, or
(at your option) any later version.
@ -34,10 +34,10 @@ using namespace Tools;
namespace TERCpp
{
class alignmentStruct
{
private:
public:
// alignmentStruct();
// alignmentStruct (int _start, int _end, int _moveto, int _newloc);
@ -53,14 +53,14 @@ namespace TERCpp
// int end;
// int moveto;
// int newloc;
vector<string> nwords; // The words we shifted
vector<char> alignment ; // for pra_more output
vector<vecInt> aftershift; // for pra_more output
// This is used to store the cost of a shift, so we don't have to
// calculate it multiple times.
double cost;
string toString();
};
}
#endif

View File

@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
Contact: christophe.servan@lium.univ-lemans.fr
The tercpp tool and library are free software: you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the licence, or
(at your option) any later version.
@ -36,10 +36,10 @@ using namespace Tools;
namespace TERCpp
{
class bestShiftStruct
{
private:
public:
// alignmentStruct();
// alignmentStruct (int _start, int _end, int _moveto, int _newloc);
@ -55,16 +55,16 @@ namespace TERCpp
// int end;
// int moveto;
// int newloc;
terShift m_best_shift;
terAlignment m_best_align;
bool m_empty;
// vector<string> nwords; // The words we shifted
// char* alignment ; // for pra_more output
// vector<vecInt> aftershift; // for pra_more output
// This is used to store the cost of a shift, so we don't have to
// calculate it multiple times.
// double cost;
};
}
#endif

View File

@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
Contact: christophe.servan@lium.univ-lemans.fr
The tercpp tool and library are free software: you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the licence, or
(at your option) any later version.
@ -28,156 +28,142 @@ using namespace std;
namespace HashMapSpace
{
// hashMap::hashMap();
/* hashMap::~hashMap()
{
// vector<stringHasher>::const_iterator del = m_hasher.begin();
for ( vector<stringHasher>::const_iterator del=m_hasher.begin(); del != m_hasher.end(); del++ )
{
delete(*del);
}
}*/
/**
* int hashMap::trouve ( long searchKey )
* @param searchKey
* @return
*/
int hashMap::trouve ( long searchKey )
{
long foundKey;
// vector<stringHasher>::const_iterator l_hasher=m_hasher.begin();
for ( vector<stringHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) {
foundKey= ( *l_hasher ).getHashKey();
if ( searchKey == foundKey ) {
return 1;
}
}
return 0;
}
int hashMap::trouve ( string key )
{
long searchKey=hashValue ( key );
long foundKey;;
// vector<stringHasher>::const_iterator l_hasher=m_hasher.begin();
for ( vector<stringHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) {
foundKey= ( *l_hasher ).getHashKey();
if ( searchKey == foundKey ) {
return 1;
}
}
return 0;
}
/**
* long hashMap::hashValue ( string key )
* @param key
* @return
*/
long hashMap::hashValue ( string key )
{
locale loc; // the "C" locale
const collate<char>& coll = use_facet<collate<char> >(loc);
return coll.hash(key.data(),key.data()+key.length());
// boost::hash<string> hasher;
// return hasher ( key );
}
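The same locale-based hashing can be exercised standalone; std::collate<char>::hash maps the byte range [low, high) to a long. A self-contained sketch:

#include <locale>
#include <string>
// Mirrors hashMap::hashValue: hash a string with the default locale's
// collate facet. The value is locale-dependent, which is fine for the
// in-memory lookup table used here.
long HashKey(const std::string& key) {
  std::locale loc; // the "C" locale
  const std::collate<char>& coll = std::use_facet<std::collate<char> >(loc);
  return coll.hash(key.data(), key.data() + key.length());
}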
/**
* void hashMap::addHasher ( string key, string value )
* @param key
* @param value
*/
void hashMap::addHasher ( string key, string value )
{
if ( trouve ( hashValue ( key ) ) ==0 ) {
// cerr << "ICI1" <<endl;
stringHasher H ( hashValue ( key ),key,value );
// cerr <<" "<< hashValue ( key )<<" "<< key<<" "<<value <<endl;
// cerr << "ICI2" <<endl;
m_hasher.push_back ( H );
}
}
stringHasher hashMap::getHasher ( string key )
{
long searchKey=hashValue ( key );
long foundKey;
stringHasher defaut(0,"","");
// vector<stringHasher>::const_iterator l_hasher=m_hasher.begin();
for ( vector<stringHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) {
foundKey= ( *l_hasher ).getHashKey();
if ( searchKey == foundKey ) {
return ( *l_hasher );
}
}
return defaut;
}
string hashMap::getValue ( string key )
{
long searchKey=hashValue ( key );
long foundKey;
// vector<stringHasher>::const_iterator l_hasher=m_hasher.begin();
for ( vector<stringHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) {
foundKey= ( *l_hasher ).getHashKey();
if ( searchKey == foundKey ) {
// cerr <<"value found : " << key<<"|"<< ( *l_hasher ).getValue()<<endl;
return ( *l_hasher ).getValue();
}
}
return "";
}
string hashMap::searchValue ( string value )
{
// long searchKey=hashValue ( key );
// long foundKey;
string foundValue;
// vector<stringHasher>::const_iterator l_hasher=m_hasher.begin();
for ( vector<stringHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) {
foundValue= ( *l_hasher ).getValue();
if ( foundValue.compare ( value ) == 0 ) {
return ( *l_hasher ).getKey();
}
}
return "";
}
void hashMap::setValue ( string key , string value )
{
long searchKey=hashValue ( key );
long foundKey;
// vector<stringHasher>::const_iterator l_hasher=m_hasher.begin();
for ( vector<stringHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) {
foundKey= ( *l_hasher ).getHashKey();
if ( searchKey == foundKey ) {
( *l_hasher ).setValue ( value );
// return ( *l_hasher ).getValue();
}
}
}
/**
*
*/
void hashMap::printHash()
{
for ( vector<stringHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) {
cout << ( *l_hasher ).getHashKey() <<" | "<< ( *l_hasher ).getKey() << " | " << ( *l_hasher ).getValue() << endl;
}
}

View File

@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
Contact: christophe.servan@lium.univ-lemans.fr
The tercpp tool and library are free software: you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the licence, or
(at your option) any later version.
@ -35,27 +35,27 @@ using namespace std;
namespace HashMapSpace
{
class hashMap
{
private:
vector<stringHasher> m_hasher;
public:
// ~hashMap();
long hashValue ( string key );
int trouve ( long searchKey );
int trouve ( string key );
void addHasher ( string key, string value );
stringHasher getHasher ( string key );
string getValue ( string key );
string searchValue ( string key );
void setValue ( string key , string value );
void printHash();
vector<stringHasher> getHashMap();
string printStringHash();
string printStringHash2();
string printStringHashForLexicon();
};
}

View File

@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
Contact: christophe.servan@lium.univ-lemans.fr
The tercpp tool and library are free software: you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the licence, or
(at your option) any later version.
@ -28,117 +28,108 @@ using namespace std;
namespace HashMapSpace
{
// hashMapInfos::hashMap();
/* hashMapInfos::~hashMap()
{
// vector<infosHasher>::const_iterator del = m_hasher.begin();
for ( vector<infosHasher>::const_iterator del=m_hasher.begin(); del != m_hasher.end(); del++ )
{
delete(*del);
}
}*/
/**
* int hashMapInfos::trouve ( long searchKey )
* @param searchKey
* @return
*/
int hashMapInfos::trouve ( long searchKey )
{
long foundKey;
// vector<infosHasher>::const_iterator l_hasher=m_hasher.begin();
for ( vector<infosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) {
foundKey= ( *l_hasher ).getHashKey();
if ( searchKey == foundKey ) {
return 1;
}
}
return 0;
}
int hashMapInfos::trouve ( string key )
{
long searchKey=hashValue ( key );
long foundKey;;
// vector<infosHasher>::const_iterator l_hasher=m_hasher.begin();
for ( vector<infosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) {
foundKey= ( *l_hasher ).getHashKey();
if ( searchKey == foundKey ) {
return 1;
}
}
return 0;
}
/**
* long hashMapInfos::hashValue ( string key )
* @param key
* @return
*/
long hashMapInfos::hashValue ( string key )
{
locale loc; // the "C" locale
const collate<char>& coll = use_facet<collate<char> >(loc);
return coll.hash(key.data(),key.data()+key.length());
// boost::hash<string> hasher;
// return hasher ( key );
}
/**
* void hashMapInfos::addHasher ( string key, string value )
* @param key
* @param value
*/
void hashMapInfos::addHasher ( string key, vector<int> value )
{
if ( trouve ( hashValue ( key ) ) ==0 ) {
// cerr << "ICI1" <<endl;
infosHasher H ( hashValue ( key ),key,value );
// cerr <<" "<< hashValue ( key )<<" "<< key<<" "<<value <<endl;
// cerr << "ICI2" <<endl;
m_hasher.push_back ( H );
}
}
void hashMapInfos::addValue ( string key, vector<int> value )
{
addHasher ( key, value );
}
infosHasher hashMapInfos::getHasher ( string key )
{
long searchKey=hashValue ( key );
long foundKey;
// vector<infosHasher>::const_iterator l_hasher=m_hasher.begin();
for ( vector<infosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) {
foundKey= ( *l_hasher ).getHashKey();
if ( searchKey == foundKey ) {
return ( *l_hasher );
}
}
vector<int> temp;
infosHasher defaut(0,"",temp);
return defaut;
}
vector<int> hashMapInfos::getValue ( string key )
{
long searchKey=hashValue ( key );
long foundKey;
vector<int> retour;
// vector<infosHasher>::const_iterator l_hasher=m_hasher.begin();
for ( vector<infosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) {
foundKey= ( *l_hasher ).getHashKey();
if ( searchKey == foundKey ) {
// cerr <<"value found : " << key<<"|"<< ( *l_hasher ).getValue()<<endl;
return ( *l_hasher ).getValue();
}
}
return retour;
}
// string hashMapInfos::searchValue ( string value )
// {
// // long searchKey=hashValue ( key );
@ -158,42 +149,38 @@ namespace HashMapSpace
// }
//
void hashMapInfos::setValue ( string key , vector<int> value )
{
long searchKey=hashValue ( key );
long foundKey;
// vector<infosHasher>::const_iterator l_hasher=m_hasher.begin();
for ( vector<infosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) {
foundKey= ( *l_hasher ).getHashKey();
if ( searchKey == foundKey ) {
( *l_hasher ).setValue ( value );
// return ( *l_hasher ).getValue();
}
}
}
string hashMapInfos::toString ()
{
stringstream to_return;
for ( vector<infosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ ) {
to_return << (*l_hasher).toString();
// cout << ( *l_hasher ).getHashKey() <<" | "<< ( *l_hasher ).getKey() << " | " << ( *l_hasher ).getValue() << endl;
}
return to_return.str();
}
/**
*
*/
void hashMapInfos::printHash()
{
for ( vector<infosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) {
// cout << ( *l_hasher ).getHashKey() <<" | "<< ( *l_hasher ).getKey() << " | " << ( *l_hasher ).getValue() << endl;
}
}
}

View File

@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
Contact: christophe.servan@lium.univ-lemans.fr
The tercpp tool and library are free software: you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the licence, or
(at your option) any later version.
@ -34,29 +34,29 @@ using namespace std;
namespace HashMapSpace
{
class hashMapInfos
{
private:
vector<infosHasher> m_hasher;
public:
// ~hashMap();
long hashValue ( string key );
int trouve ( long searchKey );
int trouve ( string key );
void addHasher ( string key, vector<int> value );
void addValue ( string key, vector<int> value );
infosHasher getHasher ( string key );
vector<int> getValue ( string key );
// string searchValue ( string key );
void setValue ( string key , vector<int> value );
void printHash();
string toString();
vector<infosHasher> getHashMap();
string printStringHash();
string printStringHash2();
string printStringHashForLexicon();
};
}

View File

@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
Contact: christophe.servan@lium.univ-lemans.fr
The tercpp tool and library are free software: you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the licence, or
(at your option) any later version.
@ -27,179 +27,166 @@ using namespace std;
namespace HashMapSpace
{
// hashMapStringInfos::hashMap();
/* hashMapStringInfos::~hashMap()
{
// vector<stringInfosHasher>::const_iterator del = m_hasher.begin();
for ( vector<stringInfosHasher>::const_iterator del=m_hasher.begin(); del != m_hasher.end(); del++ )
{
delete(*del);
}
}*/
/**
* int hashMapStringInfos::trouve ( long searchKey )
* @param searchKey
* @return
*/
int hashMapStringInfos::trouve ( long searchKey )
{
long foundKey;
// vector<stringInfosHasher>::const_iterator l_hasher=m_hasher.begin();
for ( vector<stringInfosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ ) {
foundKey = ( *l_hasher ).getHashKey();
if ( searchKey == foundKey ) {
return 1;
}
}
return 0;
}
int hashMapStringInfos::trouve ( string key )
{
long searchKey = hashValue ( key );
long foundKey;;
// vector<stringInfosHasher>::const_iterator l_hasher=m_hasher.begin();
for ( vector<stringInfosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ ) {
foundKey = ( *l_hasher ).getHashKey();
if ( searchKey == foundKey ) {
return 1;
}
}
return 0;
}
/**
* long hashMapStringInfos::hashValue ( string key )
* @param key
* @return
*/
long hashMapStringInfos::hashValue ( string key )
{
locale loc; // the "C" locale
const collate<char>& coll = use_facet<collate<char> > ( loc );
return coll.hash ( key.data(), key.data() + key.length() );
// boost::hash<string> hasher;
// return hasher ( key );
}
/**
* void hashMapStringInfos::addHasher ( string key, string value )
* @param key
* @param value
*/
void hashMapStringInfos::addHasher ( string key, vector<string> value )
{
if ( trouve ( hashValue ( key ) ) == 0 ) {
// cerr << "ICI1" <<endl;
stringInfosHasher H ( hashValue ( key ), key, value );
// cerr <<" "<< hashValue ( key )<<" "<< key<<" "<<value <<endl;
// cerr << "ICI2" <<endl;
m_hasher.push_back ( H );
}
}
void hashMapStringInfos::addValue ( string key, vector<string> value )
{
addHasher ( key, value );
}
stringInfosHasher hashMapStringInfos::getHasher ( string key )
{
long searchKey = hashValue ( key );
long foundKey;
// vector<stringInfosHasher>::const_iterator l_hasher=m_hasher.begin();
for ( vector<stringInfosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ ) {
foundKey = ( *l_hasher ).getHashKey();
if ( searchKey == foundKey ) {
return ( *l_hasher );
}
}
vector<string> tmp;
stringInfosHasher defaut ( 0, "", tmp );
return defaut;
}
vector<string> hashMapStringInfos::getValue ( string key )
{
long searchKey = hashValue ( key );
long foundKey;
vector<string> retour;
// vector<stringInfosHasher>::const_iterator l_hasher=m_hasher.begin();
for ( vector<stringInfosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ ) {
foundKey = ( *l_hasher ).getHashKey();
if ( searchKey == foundKey ) {
// cerr <<"value found : " << key<<"|"<< ( *l_hasher ).getValue()<<endl;
return ( *l_hasher ).getValue();
}
}
return retour;
}
// string hashMapStringInfos::searchValue ( string value )
// {
// // long searchKey=hashValue ( key );
// // long foundKey;
// vector<int> foundValue;
//
// // vector<stringInfosHasher>::const_iterator l_hasher=m_hasher.begin();
// for ( vector<stringInfosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ )
// {
// foundValue= ( *l_hasher ).getValue();
// /* if ( foundValue.compare ( value ) == 0 )
// {
// return ( *l_hasher ).getKey();
// }*/
// }
// return "";
// }
//
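Taken together, addValue()/addHasher() give insert-if-absent semantics and getValue()/trouve() give lookup, all keyed on the collate hash. A hypothetical round-trip, assuming a header named "hashMapStringInfos.h" on the include path (keys and values invented for illustration):

#include <iostream>
#include <string>
#include <vector>
#include "hashMapStringInfos.h"

using namespace std;
using namespace HashMapSpace;

int main()
{
  hashMapStringInfos hm;
  vector<string> infos;
  infos.push_back ( "NN" );
  infos.push_back ( "lemma" );
  hm.addValue ( "word", infos );              // inserts: "word" not present yet
  hm.addValue ( "word", infos );              // no-op: trouve() finds the hash key
  cout << hm.trouve ( "word" ) << endl;       // 1
  cout << hm.getValue ( "word" ).size() << endl;  // 2
  return 0;
}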
void hashMapStringInfos::setValue ( string key , vector<string> value )
{
  long searchKey = hashValue ( key );
  long foundKey;
  // vector<stringInfosHasher>::const_iterator l_hasher=m_hasher.begin();
  for ( vector<stringInfosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ ) {
    foundKey = ( *l_hasher ).getHashKey();
    if ( searchKey == foundKey ) {
      ( *l_hasher ).setValue ( value );
      // return ( *l_hasher ).getValue();
    }
  }
}
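Note that every lookup in this class is a linear scan over m_hasher, so each operation is O(n). A sketch of the same setValue()/addValue() semantics on top of std::unordered_map (C++11; an alternative design, not what tercpp does — the class name and everything in it are hypothetical):

#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

using namespace std;

// Hypothetical drop-in for the scan-based storage: the container computes
// the key's hash itself, so no explicit hashValue() is needed.
class hashMapStringInfosAlt
{
private:
  unordered_map<string, vector<string> > m_hasher;

public:
  void setValue ( string key, vector<string> value )
  {
    unordered_map<string, vector<string> >::iterator it = m_hasher.find ( key );
    if ( it != m_hasher.end() ) {
      it->second = value;                         // overwrite only if present, as setValue() does
    }
  }
  void addValue ( string key, vector<string> value )
  {
    m_hasher.insert ( make_pair ( key, value ) ); // no-op if key exists, as addHasher() does
  }
};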
string hashMapStringInfos::toString ()
{
  stringstream to_return;
  for ( vector<stringInfosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ ) {
    to_return << (*l_hasher).toString();
    // cout << ( *l_hasher ).getHashKey() <<" | "<< ( *l_hasher ).getKey() << " | " << ( *l_hasher ).getValue() << endl;
  }
  return to_return.str();
}
/**
 *
 */
void hashMapStringInfos::printHash()
{
  for ( vector<stringInfosHasher>:: iterator l_hasher = m_hasher.begin() ; l_hasher != m_hasher.end() ; l_hasher++ ) {
    // cout << ( *l_hasher ).getHashKey() <<" | "<< ( *l_hasher ).getKey() << " | " << ( *l_hasher ).getValue() << endl;
  }
}
vector< stringInfosHasher > hashMapStringInfos::getHashMap()
{
  return m_hasher;
}

View File

@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
Contact: christophe.servan@lium.univ-lemans.fr
The tercpp tool and library are free software: you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the licence, or
(at your option) any later version.
@ -34,29 +34,29 @@ using namespace std;
namespace HashMapSpace
{
class hashMapStringInfos
{
private:
  vector<stringInfosHasher> m_hasher;

public:
  // ~hashMap();
  long hashValue ( string key );
  int trouve ( long searchKey );
  int trouve ( string key );
  void addHasher ( string key, vector<string> value );
  void addValue ( string key, vector<string> value );
  stringInfosHasher getHasher ( string key );
  vector<string> getValue ( string key );
  // string searchValue ( string key );
  void setValue ( string key , vector<string> value );
  void printHash();
  string toString();
  vector<stringInfosHasher> getHashMap();
  string printStringHash();
  string printStringHash2();
  string printStringHashForLexicon();
};
}

View File

@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
Contact: christophe.servan@lium.univ-lemans.fr
The tercpp tool and library are free software: you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the licence, or
(at your option) any later version.
@ -27,35 +27,35 @@ using namespace Tools;
namespace HashMapSpace
{
infosHasher::infosHasher (long cle,string cleTxt, vector<int> valueVecInt )
{
  m_hashKey=cle;
  m_key=cleTxt;
  m_value=valueVecInt;
}
// infosHasher::~infosHasher(){};*/
long infosHasher::getHashKey()
{
  return m_hashKey;
}
string infosHasher::getKey()
{
  return m_key;
}
vector<int> infosHasher::getValue()
{
  return m_value;
}
void infosHasher::setValue ( vector<int> value )
{
  m_value=value;
}
string infosHasher::toString()
{
  stringstream to_return;
  to_return << m_hashKey << "\t" << m_key << "\t" << vectorToString(m_value,"\t") << endl;
  return to_return.str();
}
// typedef stdext::hash_map<std::string,string, stringhasher> HASH_S_S;

View File

@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
Contact: christophe.servan@lium.univ-lemans.fr
The tercpp tool and library are free software: you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the licence, or
(at your option) any later version.
@ -31,23 +31,23 @@ Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
using namespace std;
namespace HashMapSpace
{
class infosHasher
{
private:
  long m_hashKey;
  string m_key;
  vector<int> m_value;

public:
  infosHasher ( long cle, string cleTxt, vector<int> valueVecInt );
  long getHashKey();
  string getKey();
  vector<int> getValue();
  void setValue ( vector<int> value );
  string toString();
};
}
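infosHasher is just a (hash, key, values) record. A tiny illustrative construction, assuming a header named "infosHasher.h" on the include path (all values invented):

#include <iostream>
#include <vector>
#include "infosHasher.h"

using namespace std;
using namespace HashMapSpace;

int main()
{
  vector<int> positions;
  positions.push_back ( 3 );
  positions.push_back ( 7 );
  infosHasher h ( 12345L, "token", positions );  // hash key, surface key, payload
  cout << h.getKey() << " : " << h.getValue().size() << endl;  // token : 2
  return 0;
}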

View File

@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
Contact: christophe.servan@lium.univ-lemans.fr
The tercpp tool and library are free software: you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the licence, or
(at your option) any later version.
@ -26,29 +26,29 @@ using namespace std;
namespace HashMapSpace
{
stringHasher::stringHasher ( long cle, string cleTxt, string valueTxt )
{
  m_hashKey=cle;
  m_key=cleTxt;
  m_value=valueTxt;
}
// stringHasher::~stringHasher(){};*/
long stringHasher::getHashKey()
{
  return m_hashKey;
}
string stringHasher::getKey()
{
  return m_key;
}
string stringHasher::getValue()
{
  return m_value;
}
void stringHasher::setValue ( string value )
{
  m_value=value;
}
// typedef stdext::hash_map<string, string, stringhasher> HASH_S_S;

View File

@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
Contact: christophe.servan@lium.univ-lemans.fr
The tercpp tool and library are free software: you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the licence, or
(at your option) any later version.
@ -28,22 +28,22 @@ using namespace std;
namespace HashMapSpace
{
class stringHasher
{
private:
  long m_hashKey;
  string m_key;
  string m_value;

public:
  stringHasher ( long cle, string cleTxt, string valueTxt );
  long getHashKey();
  string getKey();
  string getValue();
  void setValue ( string value );
};
}

View File

@ -5,7 +5,7 @@ Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France
Contact: christophe.servan@lium.univ-lemans.fr
The tercpp tool and library are free software: you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the licence, or
(at your option) any later version.
@ -27,35 +27,35 @@ using namespace Tools;
namespace HashMapSpace
{
stringInfosHasher::stringInfosHasher ( long cle, string cleTxt, vector<string> valueVecInt )
{
  m_hashKey=cle;
  m_key=cleTxt;
  m_value=valueVecInt;
}
// stringInfosHasher::~stringInfosHasher(){};*/
long stringInfosHasher::getHashKey()
{
  return m_hashKey;
}
string stringInfosHasher::getKey()
{
  return m_key;
}
vector<string> stringInfosHasher::getValue()
{
  return m_value;
}
void stringInfosHasher::setValue ( vector<string> value )
{
  m_value=value;
}
string stringInfosHasher::toString()
{
  stringstream to_return;
  to_return << m_hashKey << "\t" << m_key << "\t" << vectorToString(m_value,"\t") << endl;
  return to_return.str();
}
// typedef stdext::hash_map<string, string, stringhasher> HASH_S_S;
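stringInfosHasher mirrors infosHasher with a vector<string> payload, and setValue() replaces that payload wholesale. An illustrative round-trip, assuming a header named "stringInfosHasher.h" on the include path (key and values invented):

#include <iostream>
#include <string>
#include <vector>
#include "stringInfosHasher.h"

using namespace std;
using namespace HashMapSpace;

int main()
{
  vector<string> vals;
  vals.push_back ( "house" );
  stringInfosHasher h ( 42L, "maison", vals );
  vals.push_back ( "home" );
  h.setValue ( vals );                           // replace the payload wholesale
  cout << h.getKey() << " -> " << h.getValue().size() << endl;  // maison -> 2
  return 0;
}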

Some files were not shown because too many files have changed in this diff.