Merge branch 'master' into weight-new

This commit is contained in:
Hieu Hoang 2013-01-17 18:05:02 +00:00
commit aadefc6df9
21 changed files with 246 additions and 100 deletions

2
.gitignore vendored
View File

@ -65,3 +65,5 @@ contrib/other-builds/*.xcodeproj/project.xcworkspace/
contrib/other-builds/*.xcodeproj/xcuserdata/
*/*.xcodeproj/project.xcworkspace
*/*.xcodeproj/xcuserdata
mert/sentence-bleu

View File

@ -110,11 +110,13 @@ ALTERNATIVE WAYS TO BUILD ON UNIX AND OTHER PLATFORMS
Microsoft Windows
-----------------
Moses is primarily targeted at gcc on UNIX. Windows users should consult
http://ssli.ee.washington.edu/people/amittai/Moses-on-Win7.pdf .
Moses is primarily targeted at gcc on UNIX. Windows users should
install using Cygwin. Outdated instructions can be found here:
http://ssli.ee.washington.edu/people/amittai/Moses-on-Win7.pdf .
Binaries for all external libraries needed can be downloaded from
http://www.statmt.org/moses/?n=Moses.LibrariesUsed
Only the decoder is developed and tested under Windows. There are difficulties
using the training scripts under Windows, even with Cygwin.
Only the decoder is developed and tested under Windows. There are
difficulties using the training scripts under Windows, even with
Cygwin, but it can be done.

View File

@ -1,6 +1,7 @@
#include "lm/read_arpa.hh"
#include "lm/blank.hh"
#include "util/file.hh"
#include <cmath>
#include <cstdlib>

View File

@ -65,13 +65,13 @@ class PartialViewProxy {
typedef util::ProxyIterator<PartialViewProxy> PartialIter;
FILE *DiskFlush(const void *mem_begin, const void *mem_end, const util::TempMaker &maker) {
util::scoped_fd file(maker.Make());
FILE *DiskFlush(const void *mem_begin, const void *mem_end, const std::string &temp_prefix) {
util::scoped_fd file(util::MakeTemp(temp_prefix));
util::WriteOrThrow(file.get(), mem_begin, (uint8_t*)mem_end - (uint8_t*)mem_begin);
return util::FDOpenOrThrow(file);
}
FILE *WriteContextFile(uint8_t *begin, uint8_t *end, const util::TempMaker &maker, std::size_t entry_size, unsigned char order) {
FILE *WriteContextFile(uint8_t *begin, uint8_t *end, const std::string &temp_prefix, std::size_t entry_size, unsigned char order) {
const size_t context_size = sizeof(WordIndex) * (order - 1);
// Sort just the contexts using the same memory.
PartialIter context_begin(PartialViewProxy(begin + sizeof(WordIndex), entry_size, context_size));
@ -84,7 +84,7 @@ FILE *WriteContextFile(uint8_t *begin, uint8_t *end, const util::TempMaker &make
#endif
(context_begin, context_end, util::SizedCompare<EntryCompare, PartialViewProxy>(EntryCompare(order - 1)));
util::scoped_FILE out(maker.MakeFile());
util::scoped_FILE out(util::FMakeTemp(temp_prefix));
// Write out to file and uniqueify at the same time. Could have used unique_copy if there was an appropriate OutputIterator.
if (context_begin == context_end) return out.release();
@ -114,12 +114,12 @@ struct FirstCombine {
}
};
template <class Combine> FILE *MergeSortedFiles(FILE *first_file, FILE *second_file, const util::TempMaker &maker, std::size_t weights_size, unsigned char order, const Combine &combine) {
template <class Combine> FILE *MergeSortedFiles(FILE *first_file, FILE *second_file, const std::string &temp_prefix, std::size_t weights_size, unsigned char order, const Combine &combine) {
std::size_t entry_size = sizeof(WordIndex) * order + weights_size;
RecordReader first, second;
first.Init(first_file, entry_size);
second.Init(second_file, entry_size);
util::scoped_FILE out_file(maker.MakeFile());
util::scoped_FILE out_file(util::FMakeTemp(temp_prefix));
EntryCompare less(order);
while (first && second) {
if (less(first.Data(), second.Data())) {
@ -177,9 +177,8 @@ void RecordReader::Rewind() {
}
SortedFiles::SortedFiles(const Config &config, util::FilePiece &f, std::vector<uint64_t> &counts, size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab) {
util::TempMaker maker(file_prefix);
PositiveProbWarn warn(config.positive_log_probability);
unigram_.reset(maker.Make());
unigram_.reset(util::MakeTemp(file_prefix));
{
// In case <unk> appears.
size_t size_out = (counts[0] + 1) * sizeof(ProbBackoff);
@ -202,7 +201,7 @@ SortedFiles::SortedFiles(const Config &config, util::FilePiece &f, std::vector<u
if (!mem.get()) UTIL_THROW(util::ErrnoException, "malloc failed for sort buffer size " << buffer);
for (unsigned char order = 2; order <= counts.size(); ++order) {
ConvertToSorted(f, vocab, counts, maker, order, warn, mem.get(), buffer);
ConvertToSorted(f, vocab, counts, file_prefix, order, warn, mem.get(), buffer);
}
ReadEnd(f);
}
@ -227,7 +226,7 @@ class Closer {
};
} // namespace
void SortedFiles::ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector<uint64_t> &counts, const util::TempMaker &maker, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size) {
void SortedFiles::ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector<uint64_t> &counts, const std::string &file_prefix, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size) {
ReadNGramHeader(f, order);
const size_t count = counts[order - 1];
// Size of weights. Does it include backoff?
@ -261,8 +260,8 @@ void SortedFiles::ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vo
std::sort
#endif
(NGramIter(proxy_begin), NGramIter(proxy_end), util::SizedCompare<EntryCompare>(EntryCompare(order)));
files.push_back(DiskFlush(begin, out_end, maker));
contexts.push_back(WriteContextFile(begin, out_end, maker, entry_size, order));
files.push_back(DiskFlush(begin, out_end, file_prefix));
contexts.push_back(WriteContextFile(begin, out_end, file_prefix, entry_size, order));
done += (out_end - begin) / entry_size;
}
@ -270,10 +269,10 @@ void SortedFiles::ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vo
// All individual files created. Merge them.
while (files.size() > 1) {
files.push_back(MergeSortedFiles(files[0], files[1], maker, weights_size, order, ThrowCombine()));
files.push_back(MergeSortedFiles(files[0], files[1], file_prefix, weights_size, order, ThrowCombine()));
files_closer.PopFront();
files_closer.PopFront();
contexts.push_back(MergeSortedFiles(contexts[0], contexts[1], maker, 0, order - 1, FirstCombine()));
contexts.push_back(MergeSortedFiles(contexts[0], contexts[1], file_prefix, 0, order - 1, FirstCombine()));
contexts_closer.PopFront();
contexts_closer.PopFront();
}

View File

@ -18,7 +18,6 @@
namespace util {
class FilePiece;
class TempMaker;
} // namespace util
namespace lm {
@ -101,7 +100,7 @@ class SortedFiles {
}
private:
void ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector<uint64_t> &counts, const util::TempMaker &maker, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size);
void ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector<uint64_t> &counts, const std::string &prefix, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size);
util::scoped_fd unigram_;

View File

@ -360,17 +360,19 @@ split-indomain-target
pass-unless: output-splitter
template: $output-splitter -model IN1.$output-extension < IN > OUT
train
in: indomain-stem outdomain-stem settings
in: indomain-stem outdomain-stem
out: model
rerun-on-change: settings
ignore-unless: indomain-stem
default-name: mml/model
template: $moses-script-dir/ems/support/mml-train.perl -in-source IN.$input-extension -in-target IN.$output-extension -out-source IN1.$input-extension -out-target IN1.$output-extension -model OUT -lm-training "$lm-training" -order $order -lm-settings "$lm-settings" -lm-binarizer $lm-binarizer IN2
template: $moses-script-dir/ems/support/mml-train.perl -in-source IN.$input-extension -in-target IN.$output-extension -out-source IN1.$input-extension -out-target IN1.$output-extension -model OUT -lm-training "$lm-training" -order $order -lm-settings "$lm-settings" -lm-binarizer $lm-binarizer $settings
train-in-mono
in: indomain-source indomain-target outdomain-stem settings
in: indomain-source indomain-target outdomain-stem
out: model
rerun-on-change: settings
ignore-if: indomain-stem
default-name: mml/model
template: $moses-script-dir/ems/support/mml-train.perl -in-source IN -in-target IN1 -out-source IN2.$input-extension -out-target IN2.$output-extension -model OUT -lm-training "$lm-training" -order $order -lm-settings "$lm-settings" -lm-binarizer $lm-binarizer IN3
template: $moses-script-dir/ems/support/mml-train.perl -in-source IN -in-target IN1 -out-source IN2.$input-extension -out-target IN2.$output-extension -model OUT -lm-training "$lm-training" -order $order -lm-settings "$lm-settings" -lm-binarizer $lm-binarizer $settings
[TRAINING] single
consolidate

View File

@ -1350,6 +1350,7 @@ sub check_if_crashed {
'no such file or directory','unknown option',
'died at','exit code','permission denied',
'segmentation fault','abort',
'no space left on device',
'can\'t locate', 'unrecognized option') {
if (/$pattern/i) {
my $not_error = 0;

View File

@ -161,12 +161,16 @@ $catOCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR | gzip -c > $extract.o.sorted.gz \n
@children = ();
if ($makeTTable)
{
print STDERR "merging extract / extract.inv\n";
$pid = RunFork($catCmd);
push(@children, $pid);
$pid = RunFork($catInvCmd);
push(@children, $pid);
}
else {
print STDERR "skipping extract, doing only extract.o\n";
}
my $numStr = NumStr(0);
if (-e "$TMPDIR/extract.$numStr.o.gz")

View File

@ -13,7 +13,7 @@ die("detruecase.perl < in > out")
'in=s' => \$INFILE);
my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1);
my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1);
my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"&quot;"=>1,"&apos;"=>1,"&#91;"=>1,"&#93;"=>1);
# lowercase even in headline
my %ALWAYS_LOWER;

View File

@ -20,7 +20,7 @@ die("train-truecaser.perl --model truecaser --corpus cased [--possiblyUseFirstTo
&& defined($CORPUS) && defined($MODEL);
my %CASING;
my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1);
my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1);
my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"&apos;"=>1,"&quot;"=>1,"&#91;"=>1,"&#93;"=>1);
open(CORPUS,$CORPUS) || die("ERROR: could not open '$CORPUS'");
binmode(CORPUS, ":utf8");
while(<CORPUS>) {

View File

@ -26,7 +26,7 @@ while(<MODEL>) {
close(MODEL);
my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1);
my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1);
my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"&apos;"=>1,"&quot;"=>1,"&#91;"=>1,"&#93;"=>1);
while(<STDIN>) {
chop;

View File

@ -245,7 +245,12 @@ if ($STEPS[1] || $STEPS[2])
}
print STDERR "Using single-thread GIZA\n";
} else {
$GIZA = "$_EXTERNAL_BINDIR/mgiza";
# accept either "mgiza" or "mgizapp" and either "snt2cooc.out" or "snt2cooc"
if (-x "$_EXTERNAL_BINDIR/mgiza") {
$GIZA = "$_EXTERNAL_BINDIR/mgiza";
} elsif (-x "$_EXTERNAL_BINDIR/mgizapp") {
$GIZA = "$_EXTERNAL_BINDIR/mgizapp";
}
if (-x "$_EXTERNAL_BINDIR/snt2cooc") {
$SNT2COOC = "$_EXTERNAL_BINDIR/snt2cooc";
} elsif (-x "$_EXTERNAL_BINDIR/snt2cooc.out") { # Important for users that use MGIZA and copy only the "mgiza" file to $_EXTERNAL_BINDIR
@ -1420,8 +1425,8 @@ sub extract_phrase {
$cmd .= " orientation";
$cmd .= get_extract_reordering_flags();
$cmd .= " --NoTTable" if !$ttable_flag;
$cmd .= " ".$_EXTRACT_OPTIONS if defined($_EXTRACT_OPTIONS);
}
$cmd .= " ".$_EXTRACT_OPTIONS if defined($_EXTRACT_OPTIONS);
}
$cmd .= " --GZOutput ";

View File

@ -1,6 +1,7 @@
#!/usr/bin/perl -w
use strict;
use FindBin qw($RealBin);
use Getopt::Long "GetOptions";
my ($IN,$OUT,$MXPOST);
@ -14,8 +15,8 @@ if (!&GetOptions('mxpost=s' => \$MXPOST) ||
my $pipeline = "perl -ne 'chop; tr/\\x20-\\x7f/\?/c; print \$_.\"\\n\";' | tee debug | ";
$pipeline .= "$MXPOST/mxpost $MXPOST/tagger.project |";
open(TAGGER,"cat $IN | $pipeline");
open(OUT,">$OUT");
open(TAGGER,"$RealBin/../../tokenizer/deescape-special-chars.perl < $IN | $pipeline");
open(OUT,"| $RealBin/../../tokenizer/escape-special-chars.perl > $OUT");
while(<TAGGER>) {
foreach my $word_pos (split) {
$word_pos =~ s/\/([^\/]+)$/_$1/;

View File

@ -79,11 +79,6 @@ ErrnoException::ErrnoException() throw() : errno_(errno) {
ErrnoException::~ErrnoException() throw() {}
EndOfFileException::EndOfFileException() throw() {
*this << "End of file";
}
EndOfFileException::~EndOfFileException() throw() {}
OverflowException::OverflowException() throw() {}
OverflowException::~OverflowException() throw() {}

View File

@ -44,7 +44,7 @@ class Exception : public std::exception {
};
/* This implements the normal operator<< for Exception and all its children.
* SNIFAE means it only applies to Exception. Think of this as an ersatz
* SFINAE means it only applies to Exception. Think of this as an ersatz
* boost::enable_if.
*/
template <class Except, class Data> typename Except::template ExceptionTag<Except&>::Identity operator<<(Except &e, const Data &data) {
@ -62,30 +62,26 @@ template <class Except, class Data> typename Except::template ExceptionTag<Excep
#endif
#endif
#define UTIL_SET_LOCATION(UTIL_e, child, condition) do { \
(UTIL_e).SetLocation(__FILE__, __LINE__, UTIL_FUNC_NAME, (child), (condition)); \
} while (0)
/* Create an instance of Exception, add the message Modify, and throw it.
* Modify is appended to the what() message and can contain << for ostream
* operations.
*
* do .. while kludge to swallow trailing ; character
* http://gcc.gnu.org/onlinedocs/cpp/Swallowing-the-Semicolon.html .
* Arg can be a constructor argument to the exception.
*/
#define UTIL_THROW(Exception, Modify) do { \
Exception UTIL_e; \
UTIL_SET_LOCATION(UTIL_e, #Exception, NULL); \
#define UTIL_THROW_BACKEND(Condition, Exception, Arg, Modify) do { \
Exception UTIL_e Arg; \
UTIL_e.SetLocation(__FILE__, __LINE__, UTIL_FUNC_NAME, #Exception, Condition); \
UTIL_e << Modify; \
throw UTIL_e; \
} while (0)
#define UTIL_THROW_VAR(Var, Modify) do { \
Exception &UTIL_e = (Var); \
UTIL_SET_LOCATION(UTIL_e, NULL, NULL); \
UTIL_e << Modify; \
throw UTIL_e; \
} while (0)
#define UTIL_THROW_ARG(Exception, Arg, Modify) \
UTIL_THROW_BACKEND(NULL, Exception, Arg, Modify)
/* Throw a default-constructed Exception with Modify appended to its message.
 * Deliberately no trailing semicolon: the caller's own ';' terminates the
 * do { ... } while (0) produced by UTIL_THROW_BACKEND, so the macro expands
 * to exactly one statement and is safe inside an unbraced if / else.
 */
#define UTIL_THROW(Exception, Modify) \
UTIL_THROW_BACKEND(NULL, Exception, , Modify)
#if __GNUC__ >= 3
#define UTIL_UNLIKELY(x) __builtin_expect (!!(x), 0)
@ -93,15 +89,16 @@ template <class Except, class Data> typename Except::template ExceptionTag<Excep
#define UTIL_UNLIKELY(x) (x)
#endif
#define UTIL_THROW_IF(Condition, Exception, Modify) do { \
#define UTIL_THROW_IF_ARG(Condition, Exception, Arg, Modify) do { \
if (UTIL_UNLIKELY(Condition)) { \
Exception UTIL_e; \
UTIL_SET_LOCATION(UTIL_e, #Exception, #Condition); \
UTIL_e << Modify; \
throw UTIL_e; \
UTIL_THROW_BACKEND(#Condition, Exception, Arg, Modify); \
} \
} while (0)
#define UTIL_THROW_IF(Condition, Exception, Modify) \
UTIL_THROW_IF_ARG(Condition, Exception, , Modify)
// Exception that records errno and adds it to the message.
class ErrnoException : public Exception {
public:
ErrnoException() throw();
@ -114,12 +111,7 @@ class ErrnoException : public Exception {
int errno_;
};
class EndOfFileException : public Exception {
public:
EndOfFileException() throw();
~EndOfFileException() throw();
};
// Utilities for overflow checking.
class OverflowException : public Exception {
public:
OverflowException() throw();

View File

@ -7,9 +7,11 @@
#include <cstdlib>
#include <cstdio>
#include <sstream>
#include <iostream>
#include <assert.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
@ -40,6 +42,18 @@ scoped_FILE::~scoped_FILE() {
}
}
// Builds an exception for a failed operation on descriptor fd: remembers fd,
// stores the NameFromFD(fd) guess, and starts the message with
// "in <name guess> ".
// Note that ErrnoException records errno before NameFromFD is called.
// NOTE(review): declared throw() although NameFromFD returns a std::string by
// value, which presumably can allocate -- confirm this is acceptable.
FDException::FDException(int fd) throw() : fd_(fd), name_guess_(NameFromFD(fd)) {
*this << "in " << name_guess_ << ' ';
}
FDException::~FDException() throw() {}
EndOfFileException::EndOfFileException() throw() {
*this << "End of file";
}
EndOfFileException::~EndOfFileException() throw() {}
int OpenReadOrThrow(const char *name) {
int ret;
#if defined(_WIN32) || defined(_WIN64)
@ -78,8 +92,14 @@ uint64_t SizeFile(int fd) {
#endif
}
// Returns the result of SizeFile(fd).  Throws FDException (constructed with
// fd) when SizeFile reports kBadSize instead of a size.
uint64_t SizeOrThrow(int fd) {
uint64_t ret = SizeFile(fd);
// The condition text is stringified into the exception by the macro.
UTIL_THROW_IF_ARG(ret == kBadSize, FDException, (fd), "Failed to size");
return ret;
}
void ResizeOrThrow(int fd, uint64_t to) {
UTIL_THROW_IF(
UTIL_THROW_IF_ARG(
#if defined(_WIN32) || defined(_WIN64)
_chsize_s
#elif defined(OS_ANDROID)
@ -87,7 +107,7 @@ void ResizeOrThrow(int fd, uint64_t to) {
#else
ftruncate
#endif
(fd, to), ErrnoException, "Resizing to " << to << " bytes failed");
(fd, to), FDException, (fd), "while resizing to " << to << " bytes");
}
std::size_t PartialRead(int fd, void *to, std::size_t amount) {
@ -95,9 +115,13 @@ std::size_t PartialRead(int fd, void *to, std::size_t amount) {
amount = min(static_cast<std::size_t>(INT_MAX), amount);
int ret = _read(fd, to, amount);
#else
ssize_t ret = read(fd, to, amount);
errno = 0;
ssize_t ret;
do {
ret = read(fd, to, amount);
} while (ret == -1 && errno == EINTR);
#endif
UTIL_THROW_IF(ret < 0, ErrnoException, "Reading " << amount << " from fd " << fd << " failed.");
UTIL_THROW_IF_ARG(ret < 0, FDException, (fd), "while reading " << amount << " bytes");
return static_cast<std::size_t>(ret);
}
@ -105,7 +129,7 @@ void ReadOrThrow(int fd, void *to_void, std::size_t amount) {
uint8_t *to = static_cast<uint8_t*>(to_void);
while (amount) {
std::size_t ret = PartialRead(fd, to, amount);
UTIL_THROW_IF(ret == 0, EndOfFileException, " in fd " << fd << " but there should be " << amount << " more bytes to read.");
UTIL_THROW_IF(ret == 0, EndOfFileException, " in " << NameFromFD(fd) << " but there should be " << amount << " more bytes to read.");
amount -= ret;
to += ret;
}
@ -123,29 +147,59 @@ std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount) {
return amount;
}
// Positioned read: fills to_void with exactly size bytes taken from fd
// starting at byte offset off.
//   fd      descriptor to read from
//   to_void destination buffer; must have room for size bytes
//   size    number of bytes to read
//   off     offset in the file where reading starts
// Throws EndOfFileException if pread returns 0 before size bytes were read,
// and FDException if pread fails with an error other than EINTR.
// On Windows this is not implemented and always throws Exception.
void PReadOrThrow(int fd, void *to_void, std::size_t size, uint64_t off) {
uint8_t *to = static_cast<uint8_t*>(to_void);
#if defined(_WIN32) || defined(_WIN64)
UTIL_THROW(Exception, "TODO: PReadOrThrow for windows using ReadFile http://stackoverflow.com/questions/766477/are-there-equivalents-to-pread-on-different-platforms");
#else
// Keep going after short reads until the whole request is satisfied.
for (;size ;) {
ssize_t ret;
errno = 0;
// Retry the call when it was interrupted (EINTR).
do {
#ifdef OS_ANDROID
ret = pread64(fd, to, size, off);
#else
ret = pread(fd, to, size, off);
#endif
} while (ret == -1 && errno == EINTR);
if (ret <= 0) {
// 0 means end of file; anything else left here is a negative error return.
UTIL_THROW_IF(ret == 0, EndOfFileException, " for reading " << size << " bytes at " << off << " from " << NameFromFD(fd));
UTIL_THROW_ARG(FDException, (fd), "while reading " << size << " bytes at offset " << off);
}
// Advance past the bytes just read.
size -= ret;
off += ret;
to += ret;
}
#endif
}
void WriteOrThrow(int fd, const void *data_void, std::size_t size) {
const uint8_t *data = static_cast<const uint8_t*>(data_void);
while (size) {
#if defined(_WIN32) || defined(_WIN64)
int ret = write(fd, data, min(static_cast<std::size_t>(INT_MAX), size));
#else
ssize_t ret = write(fd, data, size);
errno = 0;
ssize_t ret;
do {
ret = write(fd, data, size);
} while (ret == -1 && errno == EINTR);
#endif
if (ret < 1) UTIL_THROW(util::ErrnoException, "Write failed");
UTIL_THROW_IF_ARG(ret < 1, FDException, (fd), "while writing " << size << " bytes");
data += ret;
size -= ret;
}
}
void WriteOrThrow(FILE *to, const void *data, std::size_t size) {
assert(size);
UTIL_THROW_IF(1 != std::fwrite(data, size, 1, to), util::ErrnoException, "Short write; requested size " << size);
if (!size) return;
UTIL_THROW_IF(1 != std::fwrite(data, size, 1, to), ErrnoException, "Short write; requested size " << size);
}
void FSyncOrThrow(int fd) {
// Apparently windows doesn't have fsync?
#if !defined(_WIN32) && !defined(_WIN64)
UTIL_THROW_IF(-1 == fsync(fd), ErrnoException, "Sync of " << fd << " failed.");
UTIL_THROW_IF_ARG(-1 == fsync(fd), FDException, (fd), "Syncing");
#endif
}
@ -164,7 +218,7 @@ typedef CheckOffT<sizeof(off_t)>::True IgnoredType;
// Can't we all just get along?
void InternalSeek(int fd, int64_t off, int whence) {
UTIL_THROW_IF(
UTIL_THROW_IF_ARG(
#if defined(_WIN32) || defined(_WIN64)
(__int64)-1 == _lseeki64(fd, off, whence),
#elif defined(OS_ANDROID)
@ -172,7 +226,7 @@ void InternalSeek(int fd, int64_t off, int whence) {
#else
(off_t)-1 == lseek(fd, off, whence),
#endif
ErrnoException, "Seek failed");
FDException, (fd), "while seeking to " << off << " whence " << whence);
}
} // namespace
@ -190,22 +244,18 @@ void SeekEnd(int fd) {
std::FILE *FDOpenOrThrow(scoped_fd &file) {
std::FILE *ret = fdopen(file.get(), "r+b");
if (!ret) UTIL_THROW(util::ErrnoException, "Could not fdopen descriptor " << file.get());
UTIL_THROW_IF_ARG(!ret, FDException, (file.get()), "Could not fdopen for write");
file.release();
return ret;
}
std::FILE *FDOpenReadOrThrow(scoped_fd &file) {
std::FILE *ret = fdopen(file.get(), "rb");
if (!ret) UTIL_THROW(util::ErrnoException, "Could not fdopen descriptor " << file.get());
UTIL_THROW_IF_ARG(!ret, FDException, (file.get()), "Could not fdopen for read");
file.release();
return ret;
}
TempMaker::TempMaker(const std::string &prefix) : base_(prefix) {
base_ += "XXXXXX";
}
// Sigh. Windows temporary file creation is full of race conditions.
#if defined(_WIN32) || defined(_WIN64)
/* mkstemp extracted from libc/sysdeps/posix/tempname.c. Copyright
@ -322,23 +372,76 @@ int
mkstemp_and_unlink(char *tmpl) {
int ret = mkstemp(tmpl);
if (ret != -1) {
UTIL_THROW_IF(unlink(tmpl), util::ErrnoException, "Failed to delete " << tmpl);
UTIL_THROW_IF(unlink(tmpl), ErrnoException, "while deleting delete " << tmpl);
}
return ret;
}
#endif
int TempMaker::Make() const {
std::string name(base_);
int MakeTemp(const std::string &base) {
std::string name(base);
name += "XXXXXX";
name.push_back(0);
int ret;
UTIL_THROW_IF(-1 == (ret = mkstemp_and_unlink(&name[0])), util::ErrnoException, "Failed to make a temporary based on " << base_);
UTIL_THROW_IF(-1 == (ret = mkstemp_and_unlink(&name[0])), ErrnoException, "while making a temporary based on " << base);
return ret;
}
std::FILE *TempMaker::MakeFile() const {
util::scoped_fd file(Make());
std::FILE *FMakeTemp(const std::string &base) {
util::scoped_fd file(MakeTemp(base));
return FDOpenOrThrow(file);
}
// Duplicates fd with dup() and returns the new descriptor.
// Throws FDException (constructed with the original fd) when dup returns -1.
int DupOrThrow(int fd) {
int ret = dup(fd);
UTIL_THROW_IF_ARG(ret == -1, FDException, (fd), "in duplicating the file descriptor");
return ret;
}
namespace {
// Try to name things but be willing to fail too.
//
// Resolves /proc/self/fd/<fd> with readlink and stores the target in out.
// Returns true only when a target was read and it is an absolute path
// (targets such as "pipe:[123]" or "socket:[456]" are rejected).  Always
// returns false on Windows.  On failure the contents of out are unspecified.
bool TryName(int fd, std::string &out) {
#if defined(_WIN32) || defined(_WIN64)
  return false;
#else
  std::string name("/proc/self/fd/");
  std::ostringstream convert;
  convert << fd;
  name += convert.str();

  struct stat sb;
  if (-1 == lstat(name.c_str(), &sb))
    return false;

  // st_size is only a starting hint: it can be 0 or stale for /proc links,
  // so grow the buffer until readlink leaves spare room, which proves the
  // target was not truncated.
  out.resize(static_cast<std::size_t>(sb.st_size) + 1);
  for (;;) {
    ssize_t ret = readlink(name.c_str(), &out[0], out.size());
    if (-1 == ret)
      return false;
    if (static_cast<std::size_t>(ret) < out.size()) {
      out.resize(static_cast<std::size_t>(ret));
      break;
    }
    // Buffer was filled completely; the target may be longer.  Double it.
    out.resize(out.size() * 2);
  }

  // Don't use the non-file names.
  if (!out.empty() && out[0] != '/')
    return false;
  return true;
#endif
}
} // namespace
// Produce a human-readable label for fd, for use in messages.
// Preference order: the name TryName finds, then the conventional names of
// descriptors 0/1/2, and finally the literal text "fd <number>".
std::string NameFromFD(int fd) {
  std::string guess;
  if (TryName(fd, guess)) {
    return guess;
  }
  if (fd == 0) return "stdin";
  if (fd == 1) return "stdout";
  if (fd == 2) return "stderr";
  // Nothing better available: fall back to the raw descriptor number.
  std::ostringstream number;
  number << fd;
  return std::string("fd ") + number.str();
}
} // namespace util

View File

@ -1,6 +1,8 @@
#ifndef UTIL_FILE__
#define UTIL_FILE__
#include "util/exception.hh"
#include <cstddef>
#include <cstdio>
#include <string>
@ -17,7 +19,7 @@ class scoped_fd {
~scoped_fd();
void reset(int to) {
void reset(int to = -1) {
scoped_fd other(fd_);
fd_ = to;
}
@ -63,6 +65,32 @@ class scoped_FILE {
std::FILE *file_;
};
/* Thrown for any operation where the fd is known.
 * Extends ErrnoException with the descriptor involved and a guess at the
 * name of the file behind it.
 */
class FDException : public ErrnoException {
public:
// fd is the descriptor the failing operation was using.
explicit FDException(int fd) throw();
virtual ~FDException() throw();
// This may no longer be valid if the exception was thrown past open.
int FD() const { return fd_; }
// Guess from NameFromFD.
const std::string &NameGuess() const { return name_guess_; }
private:
// Descriptor passed to the constructor.
int fd_;
// Name recorded when the exception was constructed.
std::string name_guess_;
};
// End of file reached.
// Plain Exception subclass (not errno-based); throwers append the details
// of what they were reading to the message.
class EndOfFileException : public Exception {
public:
EndOfFileException() throw();
~EndOfFileException() throw();
};
// Open for read only.
int OpenReadOrThrow(const char *name);
// Create file if it doesn't exist, truncate if it does. Opened for write.
@ -71,12 +99,15 @@ int CreateOrThrow(const char *name);
// Return value for SizeFile when it can't size properly.
const uint64_t kBadSize = (uint64_t)-1;
uint64_t SizeFile(int fd);
uint64_t SizeOrThrow(int fd);
void ResizeOrThrow(int fd, uint64_t to);
std::size_t PartialRead(int fd, void *to, std::size_t size);
void ReadOrThrow(int fd, void *to, std::size_t size);
std::size_t ReadOrEOF(int fd, void *to_void, std::size_t size);
// Positioned: unix only for now.
void PReadOrThrow(int fd, void *to, std::size_t size, uint64_t off);
void WriteOrThrow(int fd, const void *data_void, std::size_t size);
void WriteOrThrow(FILE *to, const void *data, std::size_t size);
@ -91,17 +122,18 @@ void SeekEnd(int fd);
std::FILE *FDOpenOrThrow(scoped_fd &file);
std::FILE *FDOpenReadOrThrow(scoped_fd &file);
class TempMaker {
public:
explicit TempMaker(const std::string &prefix);
// Temporary files
int MakeTemp(const std::string &prefix);
std::FILE *FMakeTemp(const std::string &prefix);
// These will already be unlinked for you.
int Make() const;
std::FILE *MakeFile() const;
// dup an fd.
int DupOrThrow(int fd);
private:
std::string base_;
};
/* Attempt to get the file name from an fd. This won't always work (e.g. on
* Windows or for a pipe). The file might have been renamed. It's intended
* for diagnostics and logging only.
*/
std::string NameFromFD(int fd);
} // namespace util

View File

@ -34,10 +34,17 @@ FilePiece::FilePiece(const char *name, std::ostream *show_progress, std::size_t
Initialize(name, show_progress, min_buffer);
}
FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, std::size_t min_buffer) :
namespace {
// Pick the label used in messages: the caller-supplied name when one was
// given, otherwise whatever NameFromFD reports for fd.
std::string NamePossiblyFind(int fd, const char *name) {
  return name ? std::string(name) : NameFromFD(fd);
}
} // namespace
FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, std::size_t min_buffer) :
file_(fd), total_size_(SizeFile(file_.get())), page_(SizePage()),
progress_(total_size_, total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name) {
Initialize(name, show_progress, min_buffer);
progress_(total_size_, total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + NamePossiblyFind(fd, name)) {
Initialize(NamePossiblyFind(fd, name).c_str(), show_progress, min_buffer);
}
FilePiece::~FilePiece() {}

View File

@ -29,7 +29,7 @@ class FilePiece {
// 1 MB default.
explicit FilePiece(const char *file, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576);
// Takes ownership of fd. name is used for messages.
explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576);
explicit FilePiece(int fd, const char *name = NULL, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576);
~FilePiece();

View File

@ -1,6 +1,7 @@
// Tests might fail if you have creative characters in your path. Sue me.
#include "util/file_piece.hh"
#include "util/file.hh"
#include "util/scoped.hh"
#define BOOST_TEST_MODULE FilePieceTest

View File

@ -23,7 +23,7 @@ class scoped_malloc {
void call_realloc(std::size_t to) {
void *ret;
UTIL_THROW_IF(!(ret = std::realloc(p_, to)) && to, util::ErrnoException, "realloc to " << to << " bytes failed.");
UTIL_THROW_IF(!(ret = std::realloc(p_, to)) && to, ErrnoException, "realloc to " << to << " bytes failed.");
p_ = ret;
}