diff --git a/lm/read_arpa.cc b/lm/read_arpa.cc index b709fef94..4723ab3a2 100644 --- a/lm/read_arpa.cc +++ b/lm/read_arpa.cc @@ -1,6 +1,7 @@ #include "lm/read_arpa.hh" #include "lm/blank.hh" +#include "util/file.hh" #include #include diff --git a/lm/trie_sort.cc b/lm/trie_sort.cc index 8663e94e1..dc542bb32 100644 --- a/lm/trie_sort.cc +++ b/lm/trie_sort.cc @@ -65,13 +65,13 @@ class PartialViewProxy { typedef util::ProxyIterator PartialIter; -FILE *DiskFlush(const void *mem_begin, const void *mem_end, const util::TempMaker &maker) { - util::scoped_fd file(maker.Make()); +FILE *DiskFlush(const void *mem_begin, const void *mem_end, const std::string &temp_prefix) { + util::scoped_fd file(util::MakeTemp(temp_prefix)); util::WriteOrThrow(file.get(), mem_begin, (uint8_t*)mem_end - (uint8_t*)mem_begin); return util::FDOpenOrThrow(file); } -FILE *WriteContextFile(uint8_t *begin, uint8_t *end, const util::TempMaker &maker, std::size_t entry_size, unsigned char order) { +FILE *WriteContextFile(uint8_t *begin, uint8_t *end, const std::string &temp_prefix, std::size_t entry_size, unsigned char order) { const size_t context_size = sizeof(WordIndex) * (order - 1); // Sort just the contexts using the same memory. PartialIter context_begin(PartialViewProxy(begin + sizeof(WordIndex), entry_size, context_size)); @@ -84,7 +84,7 @@ FILE *WriteContextFile(uint8_t *begin, uint8_t *end, const util::TempMaker &make #endif (context_begin, context_end, util::SizedCompare(EntryCompare(order - 1))); - util::scoped_FILE out(maker.MakeFile()); + util::scoped_FILE out(util::FMakeTemp(temp_prefix)); // Write out to file and uniqueify at the same time. Could have used unique_copy if there was an appropriate OutputIterator. 
if (context_begin == context_end) return out.release(); @@ -114,12 +114,12 @@ struct FirstCombine { } }; -template FILE *MergeSortedFiles(FILE *first_file, FILE *second_file, const util::TempMaker &maker, std::size_t weights_size, unsigned char order, const Combine &combine) { +template FILE *MergeSortedFiles(FILE *first_file, FILE *second_file, const std::string &temp_prefix, std::size_t weights_size, unsigned char order, const Combine &combine) { std::size_t entry_size = sizeof(WordIndex) * order + weights_size; RecordReader first, second; first.Init(first_file, entry_size); second.Init(second_file, entry_size); - util::scoped_FILE out_file(maker.MakeFile()); + util::scoped_FILE out_file(util::FMakeTemp(temp_prefix)); EntryCompare less(order); while (first && second) { if (less(first.Data(), second.Data())) { @@ -177,9 +177,8 @@ void RecordReader::Rewind() { } SortedFiles::SortedFiles(const Config &config, util::FilePiece &f, std::vector &counts, size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab) { - util::TempMaker maker(file_prefix); PositiveProbWarn warn(config.positive_log_probability); - unigram_.reset(maker.Make()); + unigram_.reset(util::MakeTemp(file_prefix)); { // In case appears. size_t size_out = (counts[0] + 1) * sizeof(ProbBackoff); @@ -202,7 +201,7 @@ SortedFiles::SortedFiles(const Config &config, util::FilePiece &f, std::vector &counts, const util::TempMaker &maker, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size) { +void SortedFiles::ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector &counts, const std::string &file_prefix, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size) { ReadNGramHeader(f, order); const size_t count = counts[order - 1]; // Size of weights. Does it include backoff? 
@@ -261,8 +260,8 @@ void SortedFiles::ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vo std::sort #endif (NGramIter(proxy_begin), NGramIter(proxy_end), util::SizedCompare(EntryCompare(order))); - files.push_back(DiskFlush(begin, out_end, maker)); - contexts.push_back(WriteContextFile(begin, out_end, maker, entry_size, order)); + files.push_back(DiskFlush(begin, out_end, file_prefix)); + contexts.push_back(WriteContextFile(begin, out_end, file_prefix, entry_size, order)); done += (out_end - begin) / entry_size; } @@ -270,10 +269,10 @@ void SortedFiles::ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vo // All individual files created. Merge them. while (files.size() > 1) { - files.push_back(MergeSortedFiles(files[0], files[1], maker, weights_size, order, ThrowCombine())); + files.push_back(MergeSortedFiles(files[0], files[1], file_prefix, weights_size, order, ThrowCombine())); files_closer.PopFront(); files_closer.PopFront(); - contexts.push_back(MergeSortedFiles(contexts[0], contexts[1], maker, 0, order - 1, FirstCombine())); + contexts.push_back(MergeSortedFiles(contexts[0], contexts[1], file_prefix, 0, order - 1, FirstCombine())); contexts_closer.PopFront(); contexts_closer.PopFront(); } diff --git a/lm/trie_sort.hh b/lm/trie_sort.hh index 2197b80ce..1afd9562b 100644 --- a/lm/trie_sort.hh +++ b/lm/trie_sort.hh @@ -18,7 +18,6 @@ namespace util { class FilePiece; -class TempMaker; } // namespace util namespace lm { @@ -101,7 +100,7 @@ class SortedFiles { } private: - void ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector &counts, const util::TempMaker &maker, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size); + void ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector &counts, const std::string &prefix, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size); util::scoped_fd unigram_; diff --git a/util/exception.cc 
b/util/exception.cc index 3806e6de4..557c39862 100644 --- a/util/exception.cc +++ b/util/exception.cc @@ -79,11 +79,6 @@ ErrnoException::ErrnoException() throw() : errno_(errno) { ErrnoException::~ErrnoException() throw() {} -EndOfFileException::EndOfFileException() throw() { - *this << "End of file"; -} -EndOfFileException::~EndOfFileException() throw() {} - OverflowException::OverflowException() throw() {} OverflowException::~OverflowException() throw() {} diff --git a/util/exception.hh b/util/exception.hh index 0165a7a3a..74046cf90 100644 --- a/util/exception.hh +++ b/util/exception.hh @@ -44,7 +44,7 @@ class Exception : public std::exception { }; /* This implements the normal operator<< for Exception and all its children. - * SNIFAE means it only applies to Exception. Think of this as an ersatz + * SFINAE means it only applies to Exception. Think of this as an ersatz * boost::enable_if. */ template typename Except::template ExceptionTag::Identity operator<<(Except &e, const Data &data) { @@ -62,30 +62,26 @@ template typename Except::template ExceptionTag= 3 #define UTIL_UNLIKELY(x) __builtin_expect (!!(x), 0) @@ -93,15 +89,16 @@ template typename Except::template ExceptionTag #include +#include #include #include +#include #include #include #include @@ -40,6 +42,18 @@ scoped_FILE::~scoped_FILE() { } } +// Note that ErrnoException records errno before NameFromFD is called. 
+FDException::FDException(int fd) throw() : fd_(fd), name_guess_(NameFromFD(fd)) { + *this << "in " << name_guess_ << ' '; +} + +FDException::~FDException() throw() {} + +EndOfFileException::EndOfFileException() throw() { + *this << "End of file"; +} +EndOfFileException::~EndOfFileException() throw() {} + int OpenReadOrThrow(const char *name) { int ret; #if defined(_WIN32) || defined(_WIN64) @@ -78,8 +92,14 @@ uint64_t SizeFile(int fd) { #endif } +uint64_t SizeOrThrow(int fd) { + uint64_t ret = SizeFile(fd); + UTIL_THROW_IF_ARG(ret == kBadSize, FDException, (fd), "Failed to size"); + return ret; +} + void ResizeOrThrow(int fd, uint64_t to) { - UTIL_THROW_IF( + UTIL_THROW_IF_ARG( #if defined(_WIN32) || defined(_WIN64) _chsize_s #elif defined(OS_ANDROID) @@ -87,7 +107,7 @@ void ResizeOrThrow(int fd, uint64_t to) { #else ftruncate #endif - (fd, to), ErrnoException, "Resizing to " << to << " bytes failed"); + (fd, to), FDException, (fd), "while resizing to " << to << " bytes"); } std::size_t PartialRead(int fd, void *to, std::size_t amount) { @@ -95,9 +115,13 @@ std::size_t PartialRead(int fd, void *to, std::size_t amount) { amount = min(static_cast(INT_MAX), amount); int ret = _read(fd, to, amount); #else - ssize_t ret = read(fd, to, amount); + errno = 0; + ssize_t ret; + do { + ret = read(fd, to, amount); + } while (ret == -1 && errno == EINTR); #endif - UTIL_THROW_IF(ret < 0, ErrnoException, "Reading " << amount << " from fd " << fd << " failed."); + UTIL_THROW_IF_ARG(ret < 0, FDException, (fd), "while reading " << amount << " bytes"); return static_cast(ret); } @@ -105,7 +129,7 @@ void ReadOrThrow(int fd, void *to_void, std::size_t amount) { uint8_t *to = static_cast(to_void); while (amount) { std::size_t ret = PartialRead(fd, to, amount); - UTIL_THROW_IF(ret == 0, EndOfFileException, " in fd " << fd << " but there should be " << amount << " more bytes to read."); + UTIL_THROW_IF(ret == 0, EndOfFileException, " in " << NameFromFD(fd) << " but there should be " << 
amount << " more bytes to read."); amount -= ret; to += ret; } @@ -123,29 +147,59 @@ std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount) { return amount; } +void PReadOrThrow(int fd, void *to_void, std::size_t size, uint64_t off) { + uint8_t *to = static_cast(to_void); +#if defined(_WIN32) || defined(_WIN64) + UTIL_THROW(Exception, "TODO: PReadOrThrow for windows using ReadFile http://stackoverflow.com/questions/766477/are-there-equivalents-to-pread-on-different-platforms"); +#else + for (;size ;) { + ssize_t ret; + errno = 0; + do { +#ifdef OS_ANDROID + ret = pread64(fd, to, size, off); +#else + ret = pread(fd, to, size, off); +#endif + } while (ret == -1 && errno == EINTR); + if (ret <= 0) { + UTIL_THROW_IF(ret == 0, EndOfFileException, " for reading " << size << " bytes at " << off << " from " << NameFromFD(fd)); + UTIL_THROW_ARG(FDException, (fd), "while reading " << size << " bytes at offset " << off); + } + size -= ret; + off += ret; + to += ret; + } +#endif +} + void WriteOrThrow(int fd, const void *data_void, std::size_t size) { const uint8_t *data = static_cast(data_void); while (size) { #if defined(_WIN32) || defined(_WIN64) int ret = write(fd, data, min(static_cast(INT_MAX), size)); #else - ssize_t ret = write(fd, data, size); + errno = 0; + ssize_t ret; + do { + ret = write(fd, data, size); + } while (ret == -1 && errno == EINTR); #endif - if (ret < 1) UTIL_THROW(util::ErrnoException, "Write failed"); + UTIL_THROW_IF_ARG(ret < 1, FDException, (fd), "while writing " << size << " bytes"); data += ret; size -= ret; } } void WriteOrThrow(FILE *to, const void *data, std::size_t size) { - assert(size); - UTIL_THROW_IF(1 != std::fwrite(data, size, 1, to), util::ErrnoException, "Short write; requested size " << size); + if (!size) return; + UTIL_THROW_IF(1 != std::fwrite(data, size, 1, to), ErrnoException, "Short write; requested size " << size); } void FSyncOrThrow(int fd) { // Apparently windows doesn't have fsync? 
#if !defined(_WIN32) && !defined(_WIN64) - UTIL_THROW_IF(-1 == fsync(fd), ErrnoException, "Sync of " << fd << " failed."); + UTIL_THROW_IF_ARG(-1 == fsync(fd), FDException, (fd), "Syncing"); #endif } @@ -164,7 +218,7 @@ typedef CheckOffT::True IgnoredType; // Can't we all just get along? void InternalSeek(int fd, int64_t off, int whence) { - UTIL_THROW_IF( + UTIL_THROW_IF_ARG( #if defined(_WIN32) || defined(_WIN64) (__int64)-1 == _lseeki64(fd, off, whence), #elif defined(OS_ANDROID) @@ -172,7 +226,7 @@ void InternalSeek(int fd, int64_t off, int whence) { #else (off_t)-1 == lseek(fd, off, whence), #endif - ErrnoException, "Seek failed"); + FDException, (fd), "while seeking to " << off << " whence " << whence); } } // namespace @@ -190,22 +244,18 @@ void SeekEnd(int fd) { std::FILE *FDOpenOrThrow(scoped_fd &file) { std::FILE *ret = fdopen(file.get(), "r+b"); - if (!ret) UTIL_THROW(util::ErrnoException, "Could not fdopen descriptor " << file.get()); + UTIL_THROW_IF_ARG(!ret, FDException, (file.get()), "Could not fdopen for write"); file.release(); return ret; } std::FILE *FDOpenReadOrThrow(scoped_fd &file) { std::FILE *ret = fdopen(file.get(), "rb"); - if (!ret) UTIL_THROW(util::ErrnoException, "Could not fdopen descriptor " << file.get()); + UTIL_THROW_IF_ARG(!ret, FDException, (file.get()), "Could not fdopen for read"); file.release(); return ret; } -TempMaker::TempMaker(const std::string &prefix) : base_(prefix) { - base_ += "XXXXXX"; -} - // Sigh. Windows temporary file creation is full of race conditions. #if defined(_WIN32) || defined(_WIN64) /* mkstemp extracted from libc/sysdeps/posix/tempname.c. 
Copyright @@ -322,23 +372,76 @@ int mkstemp_and_unlink(char *tmpl) { int ret = mkstemp(tmpl); if (ret != -1) { - UTIL_THROW_IF(unlink(tmpl), util::ErrnoException, "Failed to delete " << tmpl); + UTIL_THROW_IF(unlink(tmpl), ErrnoException, "while deleting delete " << tmpl); } return ret; } #endif -int TempMaker::Make() const { - std::string name(base_); +int MakeTemp(const std::string &base) { + std::string name(base); + name += "XXXXXX"; name.push_back(0); int ret; - UTIL_THROW_IF(-1 == (ret = mkstemp_and_unlink(&name[0])), util::ErrnoException, "Failed to make a temporary based on " << base_); + UTIL_THROW_IF(-1 == (ret = mkstemp_and_unlink(&name[0])), ErrnoException, "while making a temporary based on " << base); return ret; } -std::FILE *TempMaker::MakeFile() const { - util::scoped_fd file(Make()); +std::FILE *FMakeTemp(const std::string &base) { + util::scoped_fd file(MakeTemp(base)); return FDOpenOrThrow(file); } +int DupOrThrow(int fd) { + int ret = dup(fd); + UTIL_THROW_IF_ARG(ret == -1, FDException, (fd), "in duplicating the file descriptor"); + return ret; +} + +namespace { +// Try to name things but be willing to fail too. +bool TryName(int fd, std::string &out) { +#if defined(_WIN32) || defined(_WIN64) + return false; +#else + std::string name("/proc/self/fd/"); + std::ostringstream convert; + convert << fd; + name += convert.str(); + + struct stat sb; + if (-1 == lstat(name.c_str(), &sb)) + return false; + out.resize(sb.st_size + 1); + ssize_t ret = readlink(name.c_str(), &out[0], sb.st_size + 1); + if (-1 == ret) + return false; + if (ret > sb.st_size) { + // Increased in size?! + return false; + } + out.resize(ret); + // Don't use the non-file names. 
+ if (!out.empty() && out[0] != '/') + return false; + return true; +#endif +} +} // namespace + +std::string NameFromFD(int fd) { + std::string ret; + if (TryName(fd, ret)) return ret; + switch (fd) { + case 0: return "stdin"; + case 1: return "stdout"; + case 2: return "stderr"; + } + ret = "fd "; + std::ostringstream convert; + convert << fd; + ret += convert.str(); + return ret; +} + } // namespace util diff --git a/util/file.hh b/util/file.hh index c24580d60..471198b1c 100644 --- a/util/file.hh +++ b/util/file.hh @@ -1,6 +1,8 @@ #ifndef UTIL_FILE__ #define UTIL_FILE__ +#include "util/exception.hh" + #include #include #include @@ -17,7 +19,7 @@ class scoped_fd { ~scoped_fd(); - void reset(int to) { + void reset(int to = -1) { scoped_fd other(fd_); fd_ = to; } @@ -63,6 +65,32 @@ class scoped_FILE { std::FILE *file_; }; +/* Thrown for any operation where the fd is known. */ +class FDException : public ErrnoException { + public: + explicit FDException(int fd) throw(); + + virtual ~FDException() throw(); + + // This may no longer be valid if the exception was thrown past open. + int FD() const { return fd_; } + + // Guess from NameFromFD. + const std::string &NameGuess() const { return name_guess_; } + + private: + int fd_; + + std::string name_guess_; +}; + +// End of file reached. +class EndOfFileException : public Exception { + public: + EndOfFileException() throw(); + ~EndOfFileException() throw(); +}; + // Open for read only. int OpenReadOrThrow(const char *name); // Create file if it doesn't exist, truncate if it does. Opened for write. @@ -71,12 +99,15 @@ int CreateOrThrow(const char *name); // Return value for SizeFile when it can't size properly. 
const uint64_t kBadSize = (uint64_t)-1; uint64_t SizeFile(int fd); +uint64_t SizeOrThrow(int fd); void ResizeOrThrow(int fd, uint64_t to); std::size_t PartialRead(int fd, void *to, std::size_t size); void ReadOrThrow(int fd, void *to, std::size_t size); std::size_t ReadOrEOF(int fd, void *to_void, std::size_t size); +// Positioned: unix only for now. +void PReadOrThrow(int fd, void *to, std::size_t size, uint64_t off); void WriteOrThrow(int fd, const void *data_void, std::size_t size); void WriteOrThrow(FILE *to, const void *data, std::size_t size); @@ -91,17 +122,18 @@ void SeekEnd(int fd); std::FILE *FDOpenOrThrow(scoped_fd &file); std::FILE *FDOpenReadOrThrow(scoped_fd &file); -class TempMaker { - public: - explicit TempMaker(const std::string &prefix); +// Temporary files +int MakeTemp(const std::string &prefix); +std::FILE *FMakeTemp(const std::string &prefix); - // These will already be unlinked for you. - int Make() const; - std::FILE *MakeFile() const; +// dup an fd. +int DupOrThrow(int fd); - private: - std::string base_; -}; +/* Attempt to get the file name from an fd. This won't always work (e.g. on + * Windows or for a pipe), and the file might have been renamed. It's intended + * for diagnostics and logging only.
+ */ +std::string NameFromFD(int fd); } // namespace util diff --git a/util/file_piece.cc b/util/file_piece.cc index 5a208eff7..5783c5fd0 100644 --- a/util/file_piece.cc +++ b/util/file_piece.cc @@ -34,10 +34,17 @@ FilePiece::FilePiece(const char *name, std::ostream *show_progress, std::size_t Initialize(name, show_progress, min_buffer); } -FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, std::size_t min_buffer) : +namespace { +std::string NamePossiblyFind(int fd, const char *name) { + if (name) return name; + return NameFromFD(fd); +} +} // namespace + +FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, std::size_t min_buffer) : file_(fd), total_size_(SizeFile(file_.get())), page_(SizePage()), - progress_(total_size_, total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name) { - Initialize(name, show_progress, min_buffer); + progress_(total_size_, total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + NamePossiblyFind(fd, name)) { + Initialize(NamePossiblyFind(fd, name).c_str(), show_progress, min_buffer); } FilePiece::~FilePiece() {} diff --git a/util/file_piece.hh b/util/file_piece.hh index 39bd15811..533109769 100644 --- a/util/file_piece.hh +++ b/util/file_piece.hh @@ -29,7 +29,7 @@ class FilePiece { // 1 MB default. explicit FilePiece(const char *file, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576); // Takes ownership of fd. name is used for messages. - explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576); + explicit FilePiece(int fd, const char *name = NULL, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576); ~FilePiece(); diff --git a/util/file_piece_test.cc b/util/file_piece_test.cc index e79ece7ab..91e4c5599 100644 --- a/util/file_piece_test.cc +++ b/util/file_piece_test.cc @@ -1,6 +1,7 @@ // Tests might fail if you have creative characters in your path. Sue me. 
#include "util/file_piece.hh" +#include "util/file.hh" #include "util/scoped.hh" #define BOOST_TEST_MODULE FilePieceTest diff --git a/util/scoped.hh b/util/scoped.hh index d62c6df16..37bc4744f 100644 --- a/util/scoped.hh +++ b/util/scoped.hh @@ -23,7 +23,7 @@ class scoped_malloc { void call_realloc(std::size_t to) { void *ret; - UTIL_THROW_IF(!(ret = std::realloc(p_, to)) && to, util::ErrnoException, "realloc to " << to << " bytes failed."); + UTIL_THROW_IF(!(ret = std::realloc(p_, to)) && to, ErrnoException, "realloc to " << to << " bytes failed."); p_ = ret; }