Added option to specify directory or prefix for temporary files created during phrase table compacting

This commit is contained in:
Marcin Junczys-Dowmunt 2013-01-22 21:11:02 +01:00
parent cfe7d00ea2
commit fcf75fae18
9 changed files with 112 additions and 38 deletions

View File

@ -15,6 +15,7 @@ void printHelp(char **argv)
" options: \n"
"\t-in string -- input table file name\n"
"\t-out string -- prefix of binary table file\n"
"\t-T string -- path to temporary directory (uses /tmp by default)\n"
#ifdef WITH_THREADS
"\t-threads int|all -- number of threads used for conversion\n"
#endif
@ -44,6 +45,7 @@ int main(int argc, char** argv)
std::string inFilePath;
std::string outFilePath("out");
std::string tempfilePath;
size_t orderBits = 10;
size_t fingerPrintBits = 16;
@ -72,6 +74,10 @@ int main(int argc, char** argv)
++i;
outFilePath = argv[i];
}
else if("-T" == arg && i+1 < argc) {
++i;
tempfilePath = argv[i];
}
else if("-landmark" == arg && i+1 < argc)
{
++i;
@ -121,7 +127,7 @@ int main(int argc, char** argv)
outFilePath += ".minlexr";
LexicalReorderingTableCreator(
inFilePath, outFilePath,
inFilePath, outFilePath, tempfilePath,
orderBits, fingerPrintBits,
multipleScoreTrees, quantize
#ifdef WITH_THREADS

View File

@ -14,6 +14,7 @@ void printHelp(char **argv) {
" options: \n"
"\t-in string -- input table file name\n"
"\t-out string -- prefix of binary table file\n"
"\t-T string -- path to temporary directory (uses /tmp by default)\n"
"\t-nscores int -- number of score components in phrase table\n"
"\t-no-alignment-info -- do not include alignment info in the binary phrase table\n"
#ifdef WITH_THREADS
@ -49,6 +50,7 @@ int main(int argc, char **argv) {
std::string inFilePath;
std::string outFilePath("out");
std::string tempfilePath;
PhraseTableCreator::Coding coding = PhraseTableCreator::PREnc;
size_t numScoreComponent = 5;
@ -77,6 +79,10 @@ int main(int argc, char **argv) {
++i;
outFilePath = argv[i];
}
else if("-T" == arg && i+1 < argc) {
++i;
tempfilePath = argv[i];
}
else if("-encoding" == arg && i+1 < argc) {
++i;
std::string val(argv[i]);
@ -166,7 +172,8 @@ int main(int argc, char **argv) {
if(outFilePath.rfind(".minphr") != outFilePath.size() - 7)
outFilePath += ".minphr";
PhraseTableCreator(inFilePath, outFilePath, numScoreComponent, sortScoreIndex,
PhraseTableCreator(inFilePath, outFilePath, tempfilePath,
numScoreComponent, sortScoreIndex,
coding, orderBits, fingerprintBits,
useAlignmentInfo, multipleScoreTrees,
quantize, maxRank, warnMe

View File

@ -23,21 +23,23 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "ThrowingFwrite.h"
#include "moses/Util.h"
#include "util/file.hh"
namespace Moses {
LexicalReorderingTableCreator::LexicalReorderingTableCreator(
std::string inPath, std::string outPath,
std::string inPath, std::string outPath, std::string tempfilePath,
size_t orderBits, size_t fingerPrintBits, bool multipleScoreTrees,
size_t quantize
#ifdef WITH_THREADS
, size_t threads
#endif
)
: m_inPath(inPath), m_outPath(outPath), m_orderBits(orderBits),
m_fingerPrintBits(fingerPrintBits), m_numScoreComponent(0),
m_multipleScoreTrees(multipleScoreTrees), m_quantize(quantize),
m_separator(" ||| "), m_hash(m_orderBits, m_fingerPrintBits),
m_lastFlushedLine(-1)
: m_inPath(inPath), m_outPath(outPath), m_tempfilePath(tempfilePath),
m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits),
m_numScoreComponent(0), m_multipleScoreTrees(multipleScoreTrees),
m_quantize(quantize), m_separator(" ||| "),
m_hash(m_orderBits, m_fingerPrintBits), m_lastFlushedLine(-1)
#ifdef WITH_THREADS
, m_threads(threads)
#endif
@ -48,12 +50,31 @@ LexicalReorderingTableCreator::LexicalReorderingTableCreator(
std::cerr << "Pass 1/2: Creating phrase index + Counting scores" << std::endl;
m_hash.BeginSave(m_outFile);
if(tempfilePath.size()) {
MmapAllocator<unsigned char> allocEncoded(util::FMakeTemp(tempfilePath));
m_encodedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>(allocEncoded);
}
else {
m_encodedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>();
}
EncodeScores();
std::cerr << "Intermezzo: Calculating Huffman code sets" << std::endl;
CalcHuffmanCodes();
std::cerr << "Pass 2/2: Compressing scores" << std::endl;
if(tempfilePath.size()) {
MmapAllocator<unsigned char> allocCompressed(util::FMakeTemp(tempfilePath));
m_compressedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>(allocCompressed);
}
else {
m_compressedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>();
}
CompressScores();
std::cerr << "Saving to " << m_outPath << std::endl;
@ -88,6 +109,9 @@ LexicalReorderingTableCreator::~LexicalReorderingTableCreator()
delete m_scoreTrees[i];
delete m_scoreCounters[i];
}
delete m_encodedScores;
delete m_compressedScores;
}
@ -134,12 +158,12 @@ void LexicalReorderingTableCreator::CompressScores()
#ifdef WITH_THREADS
boost::thread_group threads;
for (size_t i = 0; i < m_threads; ++i) {
CompressionTaskReordering* ct = new CompressionTaskReordering(m_encodedScores, *this);
CompressionTaskReordering* ct = new CompressionTaskReordering(*m_encodedScores, *this);
threads.create_thread(*ct);
}
threads.join_all();
#else
CompressionTaskReordering* ct = new CompressionTaskReordering(m_encodedScores, *this);
CompressionTaskReordering* ct = new CompressionTaskReordering(*m_encodedScores, *this);
(*ct)();
delete ct;
#endif
@ -153,7 +177,7 @@ void LexicalReorderingTableCreator::Save()
for(size_t i = 0; i < m_scoreTrees.size(); i++)
m_scoreTrees[i]->Save(m_outFile);
m_compressedScores.save(m_outFile);
m_compressedScores->save(m_outFile);
}
std::string LexicalReorderingTableCreator::MakeSourceTargetKey(std::string &source, std::string &target)
@ -218,7 +242,7 @@ void LexicalReorderingTableCreator::FlushEncodedQueue(bool force) {
m_lastFlushedLine++;
m_lastRange.push_back(pi.GetSrc());
m_encodedScores.push_back(pi.GetTrg());
m_encodedScores->push_back(pi.GetTrg());
if((pi.GetLine()+1) % 100000 == 0)
std::cerr << ".";
@ -293,7 +317,7 @@ void LexicalReorderingTableCreator::FlushCompressedQueue(bool force)
m_queue.pop();
m_lastFlushedLine++;
m_compressedScores.push_back(pi.GetTrg());
m_compressedScores->push_back(pi.GetTrg());
if((pi.GetLine()+1) % 100000 == 0)
std::cerr << ".";

View File

@ -30,6 +30,7 @@ class LexicalReorderingTableCreator {
private:
std::string m_inPath;
std::string m_outPath;
std::string m_tempfilePath;
std::FILE* m_outFile;
@ -51,8 +52,8 @@ class LexicalReorderingTableCreator {
std::vector<ScoreCounter*> m_scoreCounters;
std::vector<ScoreTree*> m_scoreTrees;
StringVector<unsigned char, unsigned long, MmapAllocator> m_encodedScores;
StringVector<unsigned char, unsigned long, MmapAllocator> m_compressedScores;
StringVector<unsigned char, unsigned long, MmapAllocator>* m_encodedScores;
StringVector<unsigned char, unsigned long, MmapAllocator>* m_compressedScores;
std::priority_queue<PackedItem> m_queue;
long m_lastFlushedLine;
@ -84,6 +85,7 @@ class LexicalReorderingTableCreator {
public:
LexicalReorderingTableCreator(std::string inPath,
std::string outPath,
std::string tempfilePath,
size_t orderBits = 10,
size_t fingerPrintBits = 16,
bool multipleScoreTrees = true,

View File

@ -66,7 +66,7 @@ namespace Moses
m_data_offset(0), m_fixed(false), m_count(new size_t(0))
{ }
MmapAllocator(std::FILE* f_ptr, size_t data_offset = 0) throw()
MmapAllocator(std::FILE* f_ptr, size_t data_offset) throw()
: m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)),
m_page_size(sysconf(_SC_PAGE_SIZE)), m_map_size(0), m_data_ptr(0),
m_data_offset(data_offset), m_fixed(true), m_count(new size_t(0))

View File

@ -66,7 +66,7 @@ class PackedArray
std::memcpy(m_storage, c.m_storage, m_storageSize * sizeof(D));
}
~PackedArray()
virtual ~PackedArray()
{
delete [] m_storage;
m_size = 0;

View File

@ -25,6 +25,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "ConsistentPhrases.h"
#include "ThrowingFwrite.h"
#include "util/file.hh"
namespace Moses
{
@ -40,6 +42,7 @@ std::string PhraseTableCreator::m_separator = " ||| ";
PhraseTableCreator::PhraseTableCreator(std::string inPath,
std::string outPath,
std::string tempfilePath,
size_t numScoreComponent,
size_t sortScoreIndex,
Coding coding,
@ -54,7 +57,7 @@ PhraseTableCreator::PhraseTableCreator(std::string inPath,
, size_t threads
#endif
)
: m_inPath(inPath), m_outPath(outPath),
: m_inPath(inPath), m_outPath(outPath), m_tempfilePath(tempfilePath),
m_outFile(std::fopen(m_outPath.c_str(), "w")), m_numScoreComponent(numScoreComponent),
m_sortScoreIndex(sortScoreIndex), m_warnMe(warnMe),
m_coding(coding), m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits),
@ -108,7 +111,15 @@ PhraseTableCreator::PhraseTableCreator(std::string inPath,
// 1st pass
std::cerr << "Pass " << cur_pass << "/" << all_passes << ": Creating source phrase index + Encoding target phrases" << std::endl;
m_srcHash.BeginSave(m_outFile);
m_srcHash.BeginSave(m_outFile);
if(tempfilePath.size()) {
MmapAllocator<unsigned char> allocEncoded(util::FMakeTemp(tempfilePath));
m_encodedTargetPhrases = new StringVector<unsigned char, unsigned long, MmapAllocator>(allocEncoded);
}
else {
m_encodedTargetPhrases = new StringVector<unsigned char, unsigned long, MmapAllocator>();
}
EncodeTargetPhrases();
cur_pass++;
@ -118,6 +129,14 @@ PhraseTableCreator::PhraseTableCreator(std::string inPath,
// 2nd pass
std::cerr << "Pass " << cur_pass << "/" << all_passes << ": Compressing target phrases" << std::endl;
if(tempfilePath.size()) {
MmapAllocator<unsigned char> allocCompressed(util::FMakeTemp(tempfilePath));
m_compressedTargetPhrases = new StringVector<unsigned char, unsigned long, MmapAllocator>(allocCompressed);
}
else {
m_compressedTargetPhrases = new StringVector<unsigned char, unsigned long, MmapAllocator>();
}
CompressTargetPhrases();
std::cerr << "Saving to " << m_outPath << std::endl;
@ -135,6 +154,9 @@ PhraseTableCreator::~PhraseTableCreator()
delete m_scoreTrees[i];
delete m_scoreCounters[i];
}
delete m_encodedTargetPhrases;
delete m_compressedTargetPhrases;
}
void PhraseTableCreator::PrintInfo()
@ -230,7 +252,7 @@ void PhraseTableCreator::Save()
m_alignTree->Save(m_outFile);
// Save compressed target phrase collections
m_compressedTargetPhrases.save(m_outFile);
m_compressedTargetPhrases->save(m_outFile);
}
void PhraseTableCreator::LoadLexicalTable(std::string filePath)
@ -355,12 +377,12 @@ void PhraseTableCreator::CompressTargetPhrases()
#ifdef WITH_THREADS
boost::thread_group threads;
for (size_t i = 0; i < m_threads; ++i) {
CompressionTask* ct = new CompressionTask(m_encodedTargetPhrases, *this);
CompressionTask* ct = new CompressionTask(*m_encodedTargetPhrases, *this);
threads.create_thread(*ct);
}
threads.join_all();
#else
CompressionTask* ct = new CompressionTask(m_encodedTargetPhrases, *this);
CompressionTask* ct = new CompressionTask(*m_encodedTargetPhrases, *this);
(*ct)();
delete ct;
#endif
@ -940,7 +962,7 @@ void PhraseTableCreator::FlushEncodedQueue(bool force)
targetPhraseCollection << *it;
m_lastSourceRange.push_back(MakeSourceKey(m_lastFlushedSourcePhrase));
m_encodedTargetPhrases.push_back(targetPhraseCollection.str());
m_encodedTargetPhrases->push_back(targetPhraseCollection.str());
m_lastFlushedSourceNum++;
if(m_lastFlushedSourceNum % 100000 == 0)
@ -982,7 +1004,7 @@ void PhraseTableCreator::FlushEncodedQueue(bool force)
m_lastCollection.begin(); it != m_lastCollection.end(); it++)
targetPhraseCollection << *it;
m_encodedTargetPhrases.push_back(targetPhraseCollection.str());
m_encodedTargetPhrases->push_back(targetPhraseCollection.str());
m_lastCollection.clear();
}
@ -1019,7 +1041,7 @@ void PhraseTableCreator::FlushCompressedQueue(bool force)
m_queue.pop();
m_lastFlushedLine++;
m_compressedTargetPhrases.push_back(pi.GetTrg());
m_compressedTargetPhrases->push_back(pi.GetTrg());
if((pi.GetLine()+1) % 100000 == 0)
std::cerr << ".";

View File

@ -196,6 +196,7 @@ class PhraseTableCreator
private:
std::string m_inPath;
std::string m_outPath;
std::string m_tempfilePath;
std::FILE* m_outFile;
@ -252,10 +253,10 @@ class PhraseTableCreator
std::vector<size_t> m_lexicalTableIndex;
std::vector<SrcTrg> m_lexicalTable;
StringVector<unsigned char, unsigned long, MmapAllocator>
StringVector<unsigned char, unsigned long, MmapAllocator>*
m_encodedTargetPhrases;
StringVector<unsigned char, unsigned long, MmapAllocator>
StringVector<unsigned char, unsigned long, MmapAllocator>*
m_compressedTargetPhrases;
boost::unordered_map<std::string, unsigned> m_targetSymbolsMap;
@ -346,6 +347,7 @@ class PhraseTableCreator
PhraseTableCreator(std::string inPath,
std::string outPath,
std::string tempfilePath,
size_t numScoreComponent = 5,
size_t sortScoreIndex = 2,
Coding coding = PREnc,

View File

@ -79,11 +79,12 @@ template <typename ValueT = unsigned char, typename PosT = unsigned int,
class StringVector
{
protected:
std::vector<ValueT, Allocator<ValueT> > m_charArray;
MonotonicVector<PosT, unsigned int, 32, Allocator> m_positions;
bool m_sorted;
bool m_memoryMapped;
std::vector<ValueT, Allocator<ValueT> >* m_charArray;
MonotonicVector<PosT, unsigned int, 32> m_positions;
virtual const ValueT* value_ptr(PosT i) const;
public:
@ -148,12 +149,18 @@ class StringVector
typedef StringIterator string_iterator;
StringVector();
StringVector(Allocator<ValueT> alloc);
virtual ~StringVector()
{
delete m_charArray;
}
void swap(StringVector<ValueT, PosT, Allocator> &c)
{
m_positions.commit();
m_positions.swap(c.m_positions);
m_charArray.swap(c.m_charArray);
m_charArray->swap(*c.m_charArray);
bool temp = m_sorted;
m_sorted = c.m_sorted;
@ -176,7 +183,7 @@ class StringVector
// Resets the container to its empty state: removes every stored value from
// the character array, marks the vector as sorted again, and replaces the
// position index with a freshly constructed MonotonicVector.
void clear()
{
m_charArray.clear();
m_charArray->clear();
m_sorted = true;
m_positions = MonotonicVector<PosT, unsigned int, 32>();
}
@ -201,7 +208,7 @@ class StringVector
size += std::fread(&m_sorted, sizeof(bool), 1, in) * sizeof(bool);
size += m_positions.load(in, m_memoryMapped);
size += loadCharArray(m_charArray, in, m_memoryMapped);
size += loadCharArray(*m_charArray, in, m_memoryMapped);
return size;
}
@ -272,7 +279,7 @@ class StringVector
size_t valSize = size2();
byteSize += ThrowingFwrite(&valSize, sizeof(size_t), 1, out) * sizeof(size_t);
byteSize += ThrowingFwrite(&m_charArray[0], sizeof(ValueT), valSize, out) * sizeof(ValueT);
byteSize += ThrowingFwrite(&(*m_charArray)[0], sizeof(ValueT), valSize, out) * sizeof(ValueT);
return byteSize;
}
@ -374,7 +381,11 @@ OStream& operator<<(OStream &os, ValueIteratorRange<ValueIteratorT> cr)
// Default constructor: creates a vector flagged as sorted and not
// memory-mapped. The character array is allocated on the heap with a
// default-constructed allocator and released again in the destructor.
template<typename ValueT, typename PosT, template <typename> class Allocator>
StringVector<ValueT, PosT, Allocator>::StringVector()
: m_sorted(true), m_memoryMapped(false) { }
: m_sorted(true), m_memoryMapped(false), m_charArray(new std::vector<ValueT, Allocator<ValueT> >()) { }
/**
 * Constructs a vector flagged as sorted and not memory-mapped whose
 * character array is created with the caller-supplied allocator instead
 * of a default-constructed one.
 *
 * @param alloc allocator handed to the underlying std::vector
 */
template<typename ValueT, typename PosT, template <typename> class Allocator>
StringVector<ValueT, PosT, Allocator>::StringVector(Allocator<ValueT> alloc)
  : m_sorted(true),
    m_memoryMapped(false),
    m_charArray(new std::vector<ValueT, Allocator<ValueT> >(alloc))
{
}
template<typename ValueT, typename PosT, template <typename> class Allocator>
template <typename StringT>
@ -384,7 +395,7 @@ void StringVector<ValueT, PosT, Allocator>::push_back(StringT s)
m_sorted = false;
m_positions.push_back(size2());
std::copy(s.begin(), s.end(), std::back_inserter(m_charArray));
std::copy(s.begin(), s.end(), std::back_inserter(*m_charArray));
}
template<typename ValueT, typename PosT, template <typename> class Allocator>
@ -435,7 +446,7 @@ PosT StringVector<ValueT, PosT, Allocator>::size() const
// Returns the total number of values currently held in the underlying
// character array. push_back records this value as the start offset of
// the string it is about to append.
template<typename ValueT, typename PosT, template <typename> class Allocator>
PosT StringVector<ValueT, PosT, Allocator>::size2() const
{
return m_charArray.size();
return m_charArray->size();
}
template<typename ValueT, typename PosT, template <typename> class Allocator>
@ -468,7 +479,7 @@ PosT StringVector<ValueT, PosT, Allocator>::length(PosT i) const
// Returns a pointer to the first value of the i-th string, i.e. the element
// of the character array located at the offset stored in m_positions[i].
// No bounds checking is performed here.
template<typename ValueT, typename PosT, template <typename> class Allocator>
const ValueT* StringVector<ValueT, PosT, Allocator>::value_ptr(PosT i) const
{
return &m_charArray[m_positions[i]];
return &(*m_charArray)[m_positions[i]];
}
template<typename ValueT, typename PosT, template <typename> class Allocator>