// -*- c++ -*- // Memory-mapped corpus track. The corpus (each Token occupying a fixed number // of bytes (must be compatible with the memory alignment in the OS) is stored // as one huge array. The "index" maps from sentence IDs to positions within // that array. // (c) 2007-2010 Ulrich Germann. All rights reserved #ifndef __ug_mm_ttrack #define __ug_mm_ttrack #include #include #include #include #include #include "tpt_typedefs.h" #include "tpt_tokenindex.h" #include "ug_ttrack_base.h" #include "num_read_write.h" #include "ug_load_primer.h" namespace ugdiss { using namespace std; namespace bio=boost::iostreams; template class mmTtrack : public Ttrack { public: typedef TKN Token; private: bio::mapped_file_source file; Token const* data; // pointer to first word of first sentence id_type const* index; /* pointer to index (change data type for corpora * of more than four billion words) */ public: mmTtrack(string fname); mmTtrack(); // return pointer to beginning of sentence Token const* sntStart(size_t sid) const; // return pointer to end of sentence Token const* sntEnd(size_t sid) const; // return size of corpus (in number of sentences) size_t size() const; // return size of corpus (in number of sentences) size_t numTokens() const; // open an mmTtrack file void open(string fname); // FUNCTIONS FOR BUILDING CORPUS TRACKS // write a blank file header at the beginning of a new ttrack file void write_blank_file_header(ostream& out) const; // write the sentence index /idx/ and fill the file header void write_index_and_finalize(ostream& out, vector const& idx, count_type tokenCount) const; // copy a contiguous sequence of sentences to another stream // return the number of tokens copied id_type copySentences(ostream& trg, id_type start, id_type stop) const; /** find the sentence id of a given token */ id_type findSid(TKN const* t) const; id_type findSid(id_type tokenOffset) const; /// re-assign ids based on the id maps in /f/ void remap(string const fname, vector const & f) const; }; /// re-assign ids based on the id maps in /f/ template void mmTtrack:: remap(string const fname, vector const & f) const { bio::mapped_file myfile(fname); assert(myfile.is_open()); Moses::prime(myfile); filepos_type idxOffset; char* p = myfile.data(); id_type numSent,numWords; p = numread(p,idxOffset); p = numread(p,numSent); p = numread(p,numWords); data = reinterpret_cast(p); for (size_t i = 0; i < numWords; ++i) data[i] = data[i].remap(f); myfile.close(); } template size_t mmTtrack:: size() const { return this->numSent; } template size_t mmTtrack:: numTokens() const { return this->numWords; } template TKN const* mmTtrack:: sntStart(size_t sid) const // return pointer to beginning of sentence { if (sid >= this->numSent) { cerr << "Fatal error: requested sentence #"<numSent <<")" << endl; } assert(sid < this->numSent); return data+index[sid]; } template TKN const* mmTtrack:: sntEnd(size_t sid) const // return pointer to end of sentence { assert(sid < this->numSent); return data+index[sid+1]; } template mmTtrack:: mmTtrack() { data = NULL; index = NULL; this->numSent = this->numWords = 0; } template mmTtrack:: mmTtrack(string fname) { open(fname); } template void mmTtrack:: open(string fname) { if (access(fname.c_str(),F_OK)) { ostringstream msg; msg << "mmTtrack<>::open: File '" << fname << "' does not exist."; throw std::runtime_error(msg.str().c_str()); } file.open(fname); if (!file.is_open()) { cerr << "Error opening file " << fname << endl; assert(0); } filepos_type idxOffset; char const* p = file.data(); p = numread(p,idxOffset); p = numread(p,this->numSent); p = numread(p,this->numWords); data = reinterpret_cast(p); index = reinterpret_cast(file.data()+idxOffset); } template id_type mmTtrack:: findSid(TKN const* t) const { id_type tokenPos = t-data; id_type const* p = upper_bound(index,index+this->numSent,tokenPos); assert(p>index); return p-index-1; } template id_type mmTtrack:: findSid(id_type tokenPos) const { id_type const* p = upper_bound(index,index+this->numSent,tokenPos); assert(p>index); return p-index-1; } template void mmTtrack:: write_blank_file_header(ostream& out) const { numwrite(out,filepos_type(0)); // place holder for index start numwrite(out,id_type(0)); // place holder for index size numwrite(out,id_type(0)); // place holder for token count } template void mmTtrack:: write_index_and_finalize(ostream& out, vectorconst& idx, id_type tokenCount) const { id_type idxSize = idx.size(); filepos_type idxStart = out.tellp(); for (size_t i = 0; i < idx.size(); ++i) numwrite(out,idx[i]); out.seekp(0); numwrite(out,idxStart); numwrite(out,idxSize-1); numwrite(out,tokenCount); } template id_type mmTtrack:: copySentences(ostream& trg, id_type start, id_type stop) const { assert(stop > start); TKN const* a = sntStart(start); TKN const* z = sntEnd(stop-1); size_t len = (z-a)*sizeof(TKN); if (!len) return 0; trg.write(reinterpret_cast(a),len); return z-a; } } #endif