mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-27 05:55:02 +03:00
251 lines
6.1 KiB
C++
251 lines
6.1 KiB
C++
// -*- c++ -*-
|
|
// Memory-mapped corpus track. The corpus (each Token occupying a fixed number
// of bytes, which must be compatible with the memory alignment in the OS) is
// stored as one huge array. The "index" maps from sentence IDs to positions
// within that array.
|
|
|
|
// (c) 2007-2010 Ulrich Germann. All rights reserved
|
|
|
|
#ifndef __ug_mm_ttrack
|
|
#define __ug_mm_ttrack
|
|
|
|
#include <sstream>
|
|
#include <string>
|
|
#include <stdexcept>
|
|
|
|
#include <boost/iostreams/device/mapped_file.hpp>
|
|
#include <boost/shared_ptr.hpp>
|
|
|
|
#include "tpt_typedefs.h"
|
|
#include "tpt_tokenindex.h"
|
|
#include "ug_ttrack_base.h"
|
|
#include "num_read_write.h"
|
|
#include "ug_load_primer.h"
|
|
|
|
namespace ugdiss
|
|
{
|
|
using namespace std;
|
|
namespace bio=boost::iostreams;
|
|
|
|
template<typename TKN=id_type>
|
|
class mmTtrack : public Ttrack<TKN>
|
|
{
|
|
public:
|
|
typedef TKN Token;
|
|
|
|
private:
|
|
bio::mapped_file_source file;
|
|
Token const* data; // pointer to first word of first sentence
|
|
id_type const* index; /* pointer to index (change data type for corpora
|
|
* of more than four billion words)
|
|
*/
|
|
public:
|
|
mmTtrack(string fname);
|
|
mmTtrack();
|
|
|
|
// return pointer to beginning of sentence
|
|
Token const* sntStart(size_t sid) const;
|
|
|
|
// return pointer to end of sentence
|
|
Token const* sntEnd(size_t sid) const;
|
|
|
|
// return size of corpus (in number of sentences)
|
|
size_t size() const;
|
|
|
|
// return size of corpus (in number of sentences)
|
|
size_t numTokens() const;
|
|
|
|
// open an mmTtrack file
|
|
void open(string fname);
|
|
|
|
// FUNCTIONS FOR BUILDING CORPUS TRACKS
|
|
// write a blank file header at the beginning of a new ttrack file
|
|
void write_blank_file_header(ostream& out) const;
|
|
|
|
// write the sentence index /idx/ and fill the file header
|
|
void write_index_and_finalize(ostream& out,
|
|
vector<id_type> const& idx,
|
|
count_type tokenCount) const;
|
|
|
|
// copy a contiguous sequence of sentences to another stream
|
|
// return the number of tokens copied
|
|
id_type copySentences(ostream& trg, id_type start, id_type stop) const;
|
|
|
|
/** find the sentence id of a given token */
|
|
id_type findSid(TKN const* t) const;
|
|
|
|
id_type findSid(id_type tokenOffset) const;
|
|
|
|
/// re-assign ids based on the id maps in /f/
|
|
void remap(string const fname, vector<id_type const*> const & f) const;
|
|
|
|
};
|
|
|
|
/// re-assign ids based on the id maps in /f/
|
|
template<typename TKN>
|
|
void
|
|
mmTtrack<TKN>::
|
|
remap(string const fname, vector<id_type const*> const & f) const
|
|
{
|
|
bio::mapped_file myfile(fname);
|
|
assert(myfile.is_open());
|
|
Moses::prime(myfile);
|
|
filepos_type idxOffset;
|
|
char* p = myfile.data();
|
|
id_type numSent,numWords;
|
|
p = numread(p,idxOffset);
|
|
p = numread(p,numSent);
|
|
p = numread(p,numWords);
|
|
data = reinterpret_cast<TKN*>(p);
|
|
for (size_t i = 0; i < numWords; ++i)
|
|
data[i] = data[i].remap(f);
|
|
myfile.close();
|
|
}
|
|
|
|
|
|
template<typename TKN>
|
|
size_t
|
|
mmTtrack<TKN>::
|
|
size() const
|
|
{
|
|
return this->numSent;
|
|
}
|
|
|
|
template<typename TKN>
|
|
size_t
|
|
mmTtrack<TKN>::
|
|
numTokens() const
|
|
{
|
|
return this->numWords;
|
|
}
|
|
|
|
template<typename TKN>
|
|
TKN const*
|
|
mmTtrack<TKN>::
|
|
sntStart(size_t sid) const // return pointer to beginning of sentence
|
|
{
|
|
if (sid >= this->numSent)
|
|
{
|
|
cerr << "Fatal error: requested sentence #"<<sid<<" is beyond corpus size ("
|
|
<< this->numSent <<")" << endl;
|
|
}
|
|
assert(sid < this->numSent);
|
|
return data+index[sid];
|
|
}
|
|
|
|
template<typename TKN>
|
|
TKN const*
|
|
mmTtrack<TKN>::
|
|
sntEnd(size_t sid) const // return pointer to end of sentence
|
|
{
|
|
assert(sid < this->numSent);
|
|
return data+index[sid+1];
|
|
}
|
|
|
|
template<typename TKN>
|
|
mmTtrack<TKN>::
|
|
mmTtrack()
|
|
{
|
|
data = NULL;
|
|
index = NULL;
|
|
this->numSent = this->numWords = 0;
|
|
}
|
|
|
|
template<typename TKN>
|
|
mmTtrack<TKN>::
|
|
mmTtrack(string fname)
|
|
{
|
|
open(fname);
|
|
}
|
|
|
|
template<typename TKN>
|
|
void
|
|
mmTtrack<TKN>::
|
|
open(string fname)
|
|
{
|
|
if (access(fname.c_str(),F_OK))
|
|
{
|
|
ostringstream msg;
|
|
msg << "mmTtrack<>::open: File '" << fname << "' does not exist.";
|
|
throw std::runtime_error(msg.str().c_str());
|
|
}
|
|
file.open(fname);
|
|
if (!file.is_open())
|
|
{
|
|
cerr << "Error opening file " << fname << endl;
|
|
assert(0);
|
|
}
|
|
filepos_type idxOffset;
|
|
char const* p = file.data();
|
|
p = numread(p,idxOffset);
|
|
p = numread(p,this->numSent);
|
|
p = numread(p,this->numWords);
|
|
data = reinterpret_cast<Token const*>(p);
|
|
index = reinterpret_cast<id_type const*>(file.data()+idxOffset);
|
|
}
|
|
|
|
template<typename TKN>
|
|
id_type
|
|
mmTtrack<TKN>::
|
|
findSid(TKN const* t) const
|
|
{
|
|
id_type tokenPos = t-data;
|
|
id_type const* p = upper_bound(index,index+this->numSent,tokenPos);
|
|
assert(p>index);
|
|
return p-index-1;
|
|
}
|
|
|
|
template<typename TKN>
|
|
id_type
|
|
mmTtrack<TKN>::
|
|
findSid(id_type tokenPos) const
|
|
{
|
|
id_type const* p = upper_bound(index,index+this->numSent,tokenPos);
|
|
assert(p>index);
|
|
return p-index-1;
|
|
}
|
|
|
|
template<typename TKN>
|
|
void
|
|
mmTtrack<TKN>::
|
|
write_blank_file_header(ostream& out) const
|
|
{
|
|
numwrite(out,filepos_type(0)); // place holder for index start
|
|
numwrite(out,id_type(0)); // place holder for index size
|
|
numwrite(out,id_type(0)); // place holder for token count
|
|
}
|
|
|
|
template<typename TKN>
|
|
void
|
|
mmTtrack<TKN>::
|
|
write_index_and_finalize(ostream& out,
|
|
vector<id_type>const& idx,
|
|
id_type tokenCount) const
|
|
{
|
|
id_type idxSize = idx.size();
|
|
filepos_type idxStart = out.tellp();
|
|
for (size_t i = 0; i < idx.size(); ++i)
|
|
numwrite(out,idx[i]);
|
|
out.seekp(0);
|
|
numwrite(out,idxStart);
|
|
numwrite(out,idxSize-1);
|
|
numwrite(out,tokenCount);
|
|
}
|
|
|
|
template<typename TKN>
|
|
id_type
|
|
mmTtrack<TKN>::
|
|
copySentences(ostream& trg, id_type start, id_type stop) const
|
|
{
|
|
assert(stop > start);
|
|
TKN const* a = sntStart(start);
|
|
TKN const* z = sntEnd(stop-1);
|
|
size_t len = (z-a)*sizeof(TKN);
|
|
if (!len) return 0;
|
|
trg.write(reinterpret_cast<char const*>(a),len);
|
|
return z-a;
|
|
}
|
|
|
|
}
|
|
#endif
|