mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-29 15:04:05 +03:00
Merge branch 'master' into weight-new
This commit is contained in:
commit
aadefc6df9
2
.gitignore
vendored
2
.gitignore
vendored
@ -65,3 +65,5 @@ contrib/other-builds/*.xcodeproj/project.xcworkspace/
|
||||
contrib/other-builds/*.xcodeproj/xcuserdata/
|
||||
*/*.xcodeproj/project.xcworkspace
|
||||
*/*.xcodeproj/xcuserdata
|
||||
|
||||
mert/sentence-bleu
|
||||
|
@ -110,11 +110,13 @@ ALTERNATIVE WAYS TO BUILD ON UNIX AND OTHER PLATFORMS
|
||||
|
||||
Microsoft Windows
|
||||
-----------------
|
||||
Moses is primarily targeted at gcc on UNIX. Windows users should consult
|
||||
Moses is primarily targeted at gcc on UNIX. Windows users should
|
||||
install using Cygwin. Outdated instructions can be found here:
|
||||
http://ssli.ee.washington.edu/people/amittai/Moses-on-Win7.pdf .
|
||||
|
||||
Binaries for all external libraries needed can be downloaded from
|
||||
http://www.statmt.org/moses/?n=Moses.LibrariesUsed
|
||||
|
||||
Only the decoder is developed and tested under Windows. There are difficulties
|
||||
using the training scripts under Windows, even with Cygwin.
|
||||
Only the decoder is developed and tested under Windows. There are
|
||||
difficulties using the training scripts under Windows, even with
|
||||
Cygwin, but it can be done.
|
||||
|
@ -1,6 +1,7 @@
|
||||
#include "lm/read_arpa.hh"
|
||||
|
||||
#include "lm/blank.hh"
|
||||
#include "util/file.hh"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
|
@ -65,13 +65,13 @@ class PartialViewProxy {
|
||||
|
||||
typedef util::ProxyIterator<PartialViewProxy> PartialIter;
|
||||
|
||||
FILE *DiskFlush(const void *mem_begin, const void *mem_end, const util::TempMaker &maker) {
|
||||
util::scoped_fd file(maker.Make());
|
||||
FILE *DiskFlush(const void *mem_begin, const void *mem_end, const std::string &temp_prefix) {
|
||||
util::scoped_fd file(util::MakeTemp(temp_prefix));
|
||||
util::WriteOrThrow(file.get(), mem_begin, (uint8_t*)mem_end - (uint8_t*)mem_begin);
|
||||
return util::FDOpenOrThrow(file);
|
||||
}
|
||||
|
||||
FILE *WriteContextFile(uint8_t *begin, uint8_t *end, const util::TempMaker &maker, std::size_t entry_size, unsigned char order) {
|
||||
FILE *WriteContextFile(uint8_t *begin, uint8_t *end, const std::string &temp_prefix, std::size_t entry_size, unsigned char order) {
|
||||
const size_t context_size = sizeof(WordIndex) * (order - 1);
|
||||
// Sort just the contexts using the same memory.
|
||||
PartialIter context_begin(PartialViewProxy(begin + sizeof(WordIndex), entry_size, context_size));
|
||||
@ -84,7 +84,7 @@ FILE *WriteContextFile(uint8_t *begin, uint8_t *end, const util::TempMaker &make
|
||||
#endif
|
||||
(context_begin, context_end, util::SizedCompare<EntryCompare, PartialViewProxy>(EntryCompare(order - 1)));
|
||||
|
||||
util::scoped_FILE out(maker.MakeFile());
|
||||
util::scoped_FILE out(util::FMakeTemp(temp_prefix));
|
||||
|
||||
// Write out to file and uniqueify at the same time. Could have used unique_copy if there was an appropriate OutputIterator.
|
||||
if (context_begin == context_end) return out.release();
|
||||
@ -114,12 +114,12 @@ struct FirstCombine {
|
||||
}
|
||||
};
|
||||
|
||||
template <class Combine> FILE *MergeSortedFiles(FILE *first_file, FILE *second_file, const util::TempMaker &maker, std::size_t weights_size, unsigned char order, const Combine &combine) {
|
||||
template <class Combine> FILE *MergeSortedFiles(FILE *first_file, FILE *second_file, const std::string &temp_prefix, std::size_t weights_size, unsigned char order, const Combine &combine) {
|
||||
std::size_t entry_size = sizeof(WordIndex) * order + weights_size;
|
||||
RecordReader first, second;
|
||||
first.Init(first_file, entry_size);
|
||||
second.Init(second_file, entry_size);
|
||||
util::scoped_FILE out_file(maker.MakeFile());
|
||||
util::scoped_FILE out_file(util::FMakeTemp(temp_prefix));
|
||||
EntryCompare less(order);
|
||||
while (first && second) {
|
||||
if (less(first.Data(), second.Data())) {
|
||||
@ -177,9 +177,8 @@ void RecordReader::Rewind() {
|
||||
}
|
||||
|
||||
SortedFiles::SortedFiles(const Config &config, util::FilePiece &f, std::vector<uint64_t> &counts, size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab) {
|
||||
util::TempMaker maker(file_prefix);
|
||||
PositiveProbWarn warn(config.positive_log_probability);
|
||||
unigram_.reset(maker.Make());
|
||||
unigram_.reset(util::MakeTemp(file_prefix));
|
||||
{
|
||||
// In case <unk> appears.
|
||||
size_t size_out = (counts[0] + 1) * sizeof(ProbBackoff);
|
||||
@ -202,7 +201,7 @@ SortedFiles::SortedFiles(const Config &config, util::FilePiece &f, std::vector<u
|
||||
if (!mem.get()) UTIL_THROW(util::ErrnoException, "malloc failed for sort buffer size " << buffer);
|
||||
|
||||
for (unsigned char order = 2; order <= counts.size(); ++order) {
|
||||
ConvertToSorted(f, vocab, counts, maker, order, warn, mem.get(), buffer);
|
||||
ConvertToSorted(f, vocab, counts, file_prefix, order, warn, mem.get(), buffer);
|
||||
}
|
||||
ReadEnd(f);
|
||||
}
|
||||
@ -227,7 +226,7 @@ class Closer {
|
||||
};
|
||||
} // namespace
|
||||
|
||||
void SortedFiles::ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector<uint64_t> &counts, const util::TempMaker &maker, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size) {
|
||||
void SortedFiles::ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector<uint64_t> &counts, const std::string &file_prefix, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size) {
|
||||
ReadNGramHeader(f, order);
|
||||
const size_t count = counts[order - 1];
|
||||
// Size of weights. Does it include backoff?
|
||||
@ -261,8 +260,8 @@ void SortedFiles::ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vo
|
||||
std::sort
|
||||
#endif
|
||||
(NGramIter(proxy_begin), NGramIter(proxy_end), util::SizedCompare<EntryCompare>(EntryCompare(order)));
|
||||
files.push_back(DiskFlush(begin, out_end, maker));
|
||||
contexts.push_back(WriteContextFile(begin, out_end, maker, entry_size, order));
|
||||
files.push_back(DiskFlush(begin, out_end, file_prefix));
|
||||
contexts.push_back(WriteContextFile(begin, out_end, file_prefix, entry_size, order));
|
||||
|
||||
done += (out_end - begin) / entry_size;
|
||||
}
|
||||
@ -270,10 +269,10 @@ void SortedFiles::ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vo
|
||||
// All individual files created. Merge them.
|
||||
|
||||
while (files.size() > 1) {
|
||||
files.push_back(MergeSortedFiles(files[0], files[1], maker, weights_size, order, ThrowCombine()));
|
||||
files.push_back(MergeSortedFiles(files[0], files[1], file_prefix, weights_size, order, ThrowCombine()));
|
||||
files_closer.PopFront();
|
||||
files_closer.PopFront();
|
||||
contexts.push_back(MergeSortedFiles(contexts[0], contexts[1], maker, 0, order - 1, FirstCombine()));
|
||||
contexts.push_back(MergeSortedFiles(contexts[0], contexts[1], file_prefix, 0, order - 1, FirstCombine()));
|
||||
contexts_closer.PopFront();
|
||||
contexts_closer.PopFront();
|
||||
}
|
||||
|
@ -18,7 +18,6 @@
|
||||
|
||||
namespace util {
|
||||
class FilePiece;
|
||||
class TempMaker;
|
||||
} // namespace util
|
||||
|
||||
namespace lm {
|
||||
@ -101,7 +100,7 @@ class SortedFiles {
|
||||
}
|
||||
|
||||
private:
|
||||
void ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector<uint64_t> &counts, const util::TempMaker &maker, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size);
|
||||
void ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector<uint64_t> &counts, const std::string &prefix, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size);
|
||||
|
||||
util::scoped_fd unigram_;
|
||||
|
||||
|
@ -360,17 +360,19 @@ split-indomain-target
|
||||
pass-unless: output-splitter
|
||||
template: $output-splitter -model IN1.$output-extension < IN > OUT
|
||||
train
|
||||
in: indomain-stem outdomain-stem settings
|
||||
in: indomain-stem outdomain-stem
|
||||
out: model
|
||||
rerun-on-change: settings
|
||||
ignore-unless: indomain-stem
|
||||
default-name: mml/model
|
||||
template: $moses-script-dir/ems/support/mml-train.perl -in-source IN.$input-extension -in-target IN.$output-extension -out-source IN1.$input-extension -out-target IN1.$output-extension -model OUT -lm-training "$lm-training" -order $order -lm-settings "$lm-settings" -lm-binarizer $lm-binarizer IN2
|
||||
template: $moses-script-dir/ems/support/mml-train.perl -in-source IN.$input-extension -in-target IN.$output-extension -out-source IN1.$input-extension -out-target IN1.$output-extension -model OUT -lm-training "$lm-training" -order $order -lm-settings "$lm-settings" -lm-binarizer $lm-binarizer $settings
|
||||
train-in-mono
|
||||
in: indomain-source indomain-target outdomain-stem settings
|
||||
in: indomain-source indomain-target outdomain-stem
|
||||
out: model
|
||||
rerun-on-change: settings
|
||||
ignore-if: indomain-stem
|
||||
default-name: mml/model
|
||||
template: $moses-script-dir/ems/support/mml-train.perl -in-source IN -in-target IN1 -out-source IN2.$input-extension -out-target IN2.$output-extension -model OUT -lm-training "$lm-training" -order $order -lm-settings "$lm-settings" -lm-binarizer $lm-binarizer IN3
|
||||
template: $moses-script-dir/ems/support/mml-train.perl -in-source IN -in-target IN1 -out-source IN2.$input-extension -out-target IN2.$output-extension -model OUT -lm-training "$lm-training" -order $order -lm-settings "$lm-settings" -lm-binarizer $lm-binarizer $settings
|
||||
|
||||
[TRAINING] single
|
||||
consolidate
|
||||
|
@ -1350,6 +1350,7 @@ sub check_if_crashed {
|
||||
'no such file or directory','unknown option',
|
||||
'died at','exit code','permission denied',
|
||||
'segmentation fault','abort',
|
||||
'no space left on device',
|
||||
'can\'t locate', 'unrecognized option') {
|
||||
if (/$pattern/i) {
|
||||
my $not_error = 0;
|
||||
|
@ -161,12 +161,16 @@ $catOCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR | gzip -c > $extract.o.sorted.gz \n
|
||||
@children = ();
|
||||
if ($makeTTable)
|
||||
{
|
||||
print STDERR "merging extract / extract.inv\n";
|
||||
$pid = RunFork($catCmd);
|
||||
push(@children, $pid);
|
||||
|
||||
$pid = RunFork($catInvCmd);
|
||||
push(@children, $pid);
|
||||
}
|
||||
else {
|
||||
print STDERR "skipping extract, doing only extract.o\n";
|
||||
}
|
||||
|
||||
my $numStr = NumStr(0);
|
||||
if (-e "$TMPDIR/extract.$numStr.o.gz")
|
||||
|
@ -13,7 +13,7 @@ die("detruecase.perl < in > out")
|
||||
'in=s' => \$INFILE);
|
||||
|
||||
my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1);
|
||||
my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1);
|
||||
my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"&quot;"=>1,"&apos;"=>1,"&#91;"=>1,"&#93;"=>1);
|
||||
|
||||
# lowercase even in headline
|
||||
my %ALWAYS_LOWER;
|
||||
|
@ -20,7 +20,7 @@ die("train-truecaser.perl --model truecaser --corpus cased [--possiblyUseFirstTo
|
||||
&& defined($CORPUS) && defined($MODEL);
|
||||
my %CASING;
|
||||
my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1);
|
||||
my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1);
|
||||
my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"&apos;"=>1,"&quot;"=>1,"&#91;"=>1,"&#93;"=>1);
|
||||
open(CORPUS,$CORPUS) || die("ERROR: could not open '$CORPUS'");
|
||||
binmode(CORPUS, ":utf8");
|
||||
while(<CORPUS>) {
|
||||
|
@ -26,7 +26,7 @@ while(<MODEL>) {
|
||||
close(MODEL);
|
||||
|
||||
my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1);
|
||||
my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1);
|
||||
my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"&apos;"=>1,"&quot;"=>1,"&#91;"=>1,"&#93;"=>1);
|
||||
|
||||
while(<STDIN>) {
|
||||
chop;
|
||||
|
@ -245,7 +245,12 @@ if ($STEPS[1] || $STEPS[2])
|
||||
}
|
||||
print STDERR "Using single-thread GIZA\n";
|
||||
} else {
|
||||
$GIZA = "$_EXTERNAL_BINDIR/mgiza";
|
||||
# accept either "mgiza" or "mgizapp" and either "snt2cooc.out" or "snt2cooc"
|
||||
if (-x "$_EXTERNAL_BINDIR/mgiza") {
|
||||
$GIZA = "$_EXTERNAL_BINDIR/mgiza";
|
||||
} elsif (-x "$_EXTERNAL_BINDIR/mgizapp") {
|
||||
$GIZA = "$_EXTERNAL_BINDIR/mgizapp";
|
||||
}
|
||||
if (-x "$_EXTERNAL_BINDIR/snt2cooc") {
|
||||
$SNT2COOC = "$_EXTERNAL_BINDIR/snt2cooc";
|
||||
} elsif (-x "$_EXTERNAL_BINDIR/snt2cooc.out") { # Important for users that use MGIZA and copy only the "mgiza" file to $_EXTERNAL_BINDIR
|
||||
@ -1420,8 +1425,8 @@ sub extract_phrase {
|
||||
$cmd .= " orientation";
|
||||
$cmd .= get_extract_reordering_flags();
|
||||
$cmd .= " --NoTTable" if !$ttable_flag;
|
||||
$cmd .= " ".$_EXTRACT_OPTIONS if defined($_EXTRACT_OPTIONS);
|
||||
}
|
||||
$cmd .= " ".$_EXTRACT_OPTIONS if defined($_EXTRACT_OPTIONS);
|
||||
}
|
||||
|
||||
$cmd .= " --GZOutput ";
|
||||
|
@ -1,6 +1,7 @@
|
||||
#!/usr/bin/perl -w
|
||||
|
||||
use strict;
|
||||
use FindBin qw($RealBin);
|
||||
|
||||
use Getopt::Long "GetOptions";
|
||||
my ($IN,$OUT,$MXPOST);
|
||||
@ -14,8 +15,8 @@ if (!&GetOptions('mxpost=s' => \$MXPOST) ||
|
||||
|
||||
my $pipeline = "perl -ne 'chop; tr/\\x20-\\x7f/\?/c; print \$_.\"\\n\";' | tee debug | ";
|
||||
$pipeline .= "$MXPOST/mxpost $MXPOST/tagger.project |";
|
||||
open(TAGGER,"cat $IN | $pipeline");
|
||||
open(OUT,">$OUT");
|
||||
open(TAGGER,"$RealBin/../../tokenizer/deescape-special-chars.perl < $IN | $pipeline");
|
||||
open(OUT,"| $RealBin/../../tokenizer/escape-special-chars.perl > $OUT");
|
||||
while(<TAGGER>) {
|
||||
foreach my $word_pos (split) {
|
||||
$word_pos =~ s/\/([^\/]+)$/_$1/;
|
||||
|
@ -79,11 +79,6 @@ ErrnoException::ErrnoException() throw() : errno_(errno) {
|
||||
|
||||
ErrnoException::~ErrnoException() throw() {}
|
||||
|
||||
EndOfFileException::EndOfFileException() throw() {
|
||||
*this << "End of file";
|
||||
}
|
||||
EndOfFileException::~EndOfFileException() throw() {}
|
||||
|
||||
OverflowException::OverflowException() throw() {}
|
||||
OverflowException::~OverflowException() throw() {}
|
||||
|
||||
|
@ -44,7 +44,7 @@ class Exception : public std::exception {
|
||||
};
|
||||
|
||||
/* This implements the normal operator<< for Exception and all its children.
|
||||
* SNIFAE means it only applies to Exception. Think of this as an ersatz
|
||||
* SFINAE means it only applies to Exception. Think of this as an ersatz
|
||||
* boost::enable_if.
|
||||
*/
|
||||
template <class Except, class Data> typename Except::template ExceptionTag<Except&>::Identity operator<<(Except &e, const Data &data) {
|
||||
@ -62,30 +62,26 @@ template <class Except, class Data> typename Except::template ExceptionTag<Excep
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define UTIL_SET_LOCATION(UTIL_e, child, condition) do { \
|
||||
(UTIL_e).SetLocation(__FILE__, __LINE__, UTIL_FUNC_NAME, (child), (condition)); \
|
||||
} while (0)
|
||||
|
||||
/* Create an instance of Exception, add the message Modify, and throw it.
|
||||
* Modify is appended to the what() message and can contain << for ostream
|
||||
* operations.
|
||||
*
|
||||
* do .. while kludge to swallow trailing ; character
|
||||
* http://gcc.gnu.org/onlinedocs/cpp/Swallowing-the-Semicolon.html .
|
||||
* Arg can be a constructor argument to the exception.
|
||||
*/
|
||||
#define UTIL_THROW(Exception, Modify) do { \
|
||||
Exception UTIL_e; \
|
||||
UTIL_SET_LOCATION(UTIL_e, #Exception, NULL); \
|
||||
#define UTIL_THROW_BACKEND(Condition, Exception, Arg, Modify) do { \
|
||||
Exception UTIL_e Arg; \
|
||||
UTIL_e.SetLocation(__FILE__, __LINE__, UTIL_FUNC_NAME, #Exception, Condition); \
|
||||
UTIL_e << Modify; \
|
||||
throw UTIL_e; \
|
||||
} while (0)
|
||||
|
||||
#define UTIL_THROW_VAR(Var, Modify) do { \
|
||||
Exception &UTIL_e = (Var); \
|
||||
UTIL_SET_LOCATION(UTIL_e, NULL, NULL); \
|
||||
UTIL_e << Modify; \
|
||||
throw UTIL_e; \
|
||||
} while (0)
|
||||
#define UTIL_THROW_ARG(Exception, Arg, Modify) \
|
||||
UTIL_THROW_BACKEND(NULL, Exception, Arg, Modify)
|
||||
|
||||
#define UTIL_THROW(Exception, Modify) \
|
||||
UTIL_THROW_BACKEND(NULL, Exception, , Modify);
|
||||
|
||||
#if __GNUC__ >= 3
|
||||
#define UTIL_UNLIKELY(x) __builtin_expect (!!(x), 0)
|
||||
@ -93,15 +89,16 @@ template <class Except, class Data> typename Except::template ExceptionTag<Excep
|
||||
#define UTIL_UNLIKELY(x) (x)
|
||||
#endif
|
||||
|
||||
#define UTIL_THROW_IF(Condition, Exception, Modify) do { \
|
||||
#define UTIL_THROW_IF_ARG(Condition, Exception, Arg, Modify) do { \
|
||||
if (UTIL_UNLIKELY(Condition)) { \
|
||||
Exception UTIL_e; \
|
||||
UTIL_SET_LOCATION(UTIL_e, #Exception, #Condition); \
|
||||
UTIL_e << Modify; \
|
||||
throw UTIL_e; \
|
||||
UTIL_THROW_BACKEND(#Condition, Exception, Arg, Modify); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define UTIL_THROW_IF(Condition, Exception, Modify) \
|
||||
UTIL_THROW_IF_ARG(Condition, Exception, , Modify)
|
||||
|
||||
// Exception that records errno and adds it to the message.
|
||||
class ErrnoException : public Exception {
|
||||
public:
|
||||
ErrnoException() throw();
|
||||
@ -114,12 +111,7 @@ class ErrnoException : public Exception {
|
||||
int errno_;
|
||||
};
|
||||
|
||||
class EndOfFileException : public Exception {
|
||||
public:
|
||||
EndOfFileException() throw();
|
||||
~EndOfFileException() throw();
|
||||
};
|
||||
|
||||
// Utilities for overflow checking.
|
||||
class OverflowException : public Exception {
|
||||
public:
|
||||
OverflowException() throw();
|
||||
|
151
util/file.cc
151
util/file.cc
@ -7,9 +7,11 @@
|
||||
|
||||
#include <cstdlib>
|
||||
#include <cstdio>
|
||||
#include <sstream>
|
||||
#include <iostream>
|
||||
|
||||
#include <assert.h>
|
||||
#include <errno.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
@ -40,6 +42,18 @@ scoped_FILE::~scoped_FILE() {
|
||||
}
|
||||
}
|
||||
|
||||
// Note that ErrnoException records errno before NameFromFD is called.
|
||||
FDException::FDException(int fd) throw() : fd_(fd), name_guess_(NameFromFD(fd)) {
|
||||
*this << "in " << name_guess_ << ' ';
|
||||
}
|
||||
|
||||
FDException::~FDException() throw() {}
|
||||
|
||||
EndOfFileException::EndOfFileException() throw() {
|
||||
*this << "End of file";
|
||||
}
|
||||
EndOfFileException::~EndOfFileException() throw() {}
|
||||
|
||||
int OpenReadOrThrow(const char *name) {
|
||||
int ret;
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
@ -78,8 +92,14 @@ uint64_t SizeFile(int fd) {
|
||||
#endif
|
||||
}
|
||||
|
||||
uint64_t SizeOrThrow(int fd) {
|
||||
uint64_t ret = SizeFile(fd);
|
||||
UTIL_THROW_IF_ARG(ret == kBadSize, FDException, (fd), "Failed to size");
|
||||
return ret;
|
||||
}
|
||||
|
||||
void ResizeOrThrow(int fd, uint64_t to) {
|
||||
UTIL_THROW_IF(
|
||||
UTIL_THROW_IF_ARG(
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
_chsize_s
|
||||
#elif defined(OS_ANDROID)
|
||||
@ -87,7 +107,7 @@ void ResizeOrThrow(int fd, uint64_t to) {
|
||||
#else
|
||||
ftruncate
|
||||
#endif
|
||||
(fd, to), ErrnoException, "Resizing to " << to << " bytes failed");
|
||||
(fd, to), FDException, (fd), "while resizing to " << to << " bytes");
|
||||
}
|
||||
|
||||
std::size_t PartialRead(int fd, void *to, std::size_t amount) {
|
||||
@ -95,9 +115,13 @@ std::size_t PartialRead(int fd, void *to, std::size_t amount) {
|
||||
amount = min(static_cast<std::size_t>(INT_MAX), amount);
|
||||
int ret = _read(fd, to, amount);
|
||||
#else
|
||||
ssize_t ret = read(fd, to, amount);
|
||||
errno = 0;
|
||||
ssize_t ret;
|
||||
do {
|
||||
ret = read(fd, to, amount);
|
||||
} while (ret == -1 && errno == EINTR);
|
||||
#endif
|
||||
UTIL_THROW_IF(ret < 0, ErrnoException, "Reading " << amount << " from fd " << fd << " failed.");
|
||||
UTIL_THROW_IF_ARG(ret < 0, FDException, (fd), "while reading " << amount << " bytes");
|
||||
return static_cast<std::size_t>(ret);
|
||||
}
|
||||
|
||||
@ -105,7 +129,7 @@ void ReadOrThrow(int fd, void *to_void, std::size_t amount) {
|
||||
uint8_t *to = static_cast<uint8_t*>(to_void);
|
||||
while (amount) {
|
||||
std::size_t ret = PartialRead(fd, to, amount);
|
||||
UTIL_THROW_IF(ret == 0, EndOfFileException, " in fd " << fd << " but there should be " << amount << " more bytes to read.");
|
||||
UTIL_THROW_IF(ret == 0, EndOfFileException, " in " << NameFromFD(fd) << " but there should be " << amount << " more bytes to read.");
|
||||
amount -= ret;
|
||||
to += ret;
|
||||
}
|
||||
@ -123,29 +147,59 @@ std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount) {
|
||||
return amount;
|
||||
}
|
||||
|
||||
void PReadOrThrow(int fd, void *to_void, std::size_t size, uint64_t off) {
|
||||
uint8_t *to = static_cast<uint8_t*>(to_void);
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
UTIL_THROW(Exception, "TODO: PReadOrThrow for windows using ReadFile http://stackoverflow.com/questions/766477/are-there-equivalents-to-pread-on-different-platforms");
|
||||
#else
|
||||
for (;size ;) {
|
||||
ssize_t ret;
|
||||
errno = 0;
|
||||
do {
|
||||
#ifdef OS_ANDROID
|
||||
ret = pread64(fd, to, size, off);
|
||||
#else
|
||||
ret = pread(fd, to, size, off);
|
||||
#endif
|
||||
} while (ret == -1 && errno == EINTR);
|
||||
if (ret <= 0) {
|
||||
UTIL_THROW_IF(ret == 0, EndOfFileException, " for reading " << size << " bytes at " << off << " from " << NameFromFD(fd));
|
||||
UTIL_THROW_ARG(FDException, (fd), "while reading " << size << " bytes at offset " << off);
|
||||
}
|
||||
size -= ret;
|
||||
off += ret;
|
||||
to += ret;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void WriteOrThrow(int fd, const void *data_void, std::size_t size) {
|
||||
const uint8_t *data = static_cast<const uint8_t*>(data_void);
|
||||
while (size) {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
int ret = write(fd, data, min(static_cast<std::size_t>(INT_MAX), size));
|
||||
#else
|
||||
ssize_t ret = write(fd, data, size);
|
||||
errno = 0;
|
||||
ssize_t ret;
|
||||
do {
|
||||
ret = write(fd, data, size);
|
||||
} while (ret == -1 && errno == EINTR);
|
||||
#endif
|
||||
if (ret < 1) UTIL_THROW(util::ErrnoException, "Write failed");
|
||||
UTIL_THROW_IF_ARG(ret < 1, FDException, (fd), "while writing " << size << " bytes");
|
||||
data += ret;
|
||||
size -= ret;
|
||||
}
|
||||
}
|
||||
|
||||
void WriteOrThrow(FILE *to, const void *data, std::size_t size) {
|
||||
assert(size);
|
||||
UTIL_THROW_IF(1 != std::fwrite(data, size, 1, to), util::ErrnoException, "Short write; requested size " << size);
|
||||
if (!size) return;
|
||||
UTIL_THROW_IF(1 != std::fwrite(data, size, 1, to), ErrnoException, "Short write; requested size " << size);
|
||||
}
|
||||
|
||||
void FSyncOrThrow(int fd) {
|
||||
// Apparently windows doesn't have fsync?
|
||||
#if !defined(_WIN32) && !defined(_WIN64)
|
||||
UTIL_THROW_IF(-1 == fsync(fd), ErrnoException, "Sync of " << fd << " failed.");
|
||||
UTIL_THROW_IF_ARG(-1 == fsync(fd), FDException, (fd), "Syncing");
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -164,7 +218,7 @@ typedef CheckOffT<sizeof(off_t)>::True IgnoredType;
|
||||
|
||||
// Can't we all just get along?
|
||||
void InternalSeek(int fd, int64_t off, int whence) {
|
||||
UTIL_THROW_IF(
|
||||
UTIL_THROW_IF_ARG(
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
(__int64)-1 == _lseeki64(fd, off, whence),
|
||||
#elif defined(OS_ANDROID)
|
||||
@ -172,7 +226,7 @@ void InternalSeek(int fd, int64_t off, int whence) {
|
||||
#else
|
||||
(off_t)-1 == lseek(fd, off, whence),
|
||||
#endif
|
||||
ErrnoException, "Seek failed");
|
||||
FDException, (fd), "while seeking to " << off << " whence " << whence);
|
||||
}
|
||||
} // namespace
|
||||
|
||||
@ -190,22 +244,18 @@ void SeekEnd(int fd) {
|
||||
|
||||
std::FILE *FDOpenOrThrow(scoped_fd &file) {
|
||||
std::FILE *ret = fdopen(file.get(), "r+b");
|
||||
if (!ret) UTIL_THROW(util::ErrnoException, "Could not fdopen descriptor " << file.get());
|
||||
UTIL_THROW_IF_ARG(!ret, FDException, (file.get()), "Could not fdopen for write");
|
||||
file.release();
|
||||
return ret;
|
||||
}
|
||||
|
||||
std::FILE *FDOpenReadOrThrow(scoped_fd &file) {
|
||||
std::FILE *ret = fdopen(file.get(), "rb");
|
||||
if (!ret) UTIL_THROW(util::ErrnoException, "Could not fdopen descriptor " << file.get());
|
||||
UTIL_THROW_IF_ARG(!ret, FDException, (file.get()), "Could not fdopen for read");
|
||||
file.release();
|
||||
return ret;
|
||||
}
|
||||
|
||||
TempMaker::TempMaker(const std::string &prefix) : base_(prefix) {
|
||||
base_ += "XXXXXX";
|
||||
}
|
||||
|
||||
// Sigh. Windows temporary file creation is full of race conditions.
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
/* mkstemp extracted from libc/sysdeps/posix/tempname.c. Copyright
|
||||
@ -322,23 +372,76 @@ int
|
||||
mkstemp_and_unlink(char *tmpl) {
|
||||
int ret = mkstemp(tmpl);
|
||||
if (ret != -1) {
|
||||
UTIL_THROW_IF(unlink(tmpl), util::ErrnoException, "Failed to delete " << tmpl);
|
||||
UTIL_THROW_IF(unlink(tmpl), ErrnoException, "while deleting delete " << tmpl);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
int TempMaker::Make() const {
|
||||
std::string name(base_);
|
||||
int MakeTemp(const std::string &base) {
|
||||
std::string name(base);
|
||||
name += "XXXXXX";
|
||||
name.push_back(0);
|
||||
int ret;
|
||||
UTIL_THROW_IF(-1 == (ret = mkstemp_and_unlink(&name[0])), util::ErrnoException, "Failed to make a temporary based on " << base_);
|
||||
UTIL_THROW_IF(-1 == (ret = mkstemp_and_unlink(&name[0])), ErrnoException, "while making a temporary based on " << base);
|
||||
return ret;
|
||||
}
|
||||
|
||||
std::FILE *TempMaker::MakeFile() const {
|
||||
util::scoped_fd file(Make());
|
||||
std::FILE *FMakeTemp(const std::string &base) {
|
||||
util::scoped_fd file(MakeTemp(base));
|
||||
return FDOpenOrThrow(file);
|
||||
}
|
||||
|
||||
// Duplicate fd with dup() and return the new descriptor.  If dup() returns
// -1, an FDException constructed from the original fd is thrown.
int DupOrThrow(int fd) {
  // The local must stay named `ret`: the macro stringifies the condition into
  // the exception's location information.
  const int ret = dup(fd);
  UTIL_THROW_IF_ARG(ret == -1, FDException, (fd), "in duplicating the file descriptor");
  return ret;
}
|
||||
|
||||
namespace {
|
||||
// Try to name things but be willing to fail too.
//
// Resolves the symlink /proc/self/fd/<fd> into `out`.  Returns true when the
// link could be read and its target is empty or starts with '/'.  Returns
// false on Windows, when the link cannot be examined or read, or when the
// target does not start with '/'.  `out` may have been modified even when
// false is returned.
bool TryName(int fd, std::string &out) {
#if defined(_WIN32) || defined(_WIN64)
  return false;
#else
  std::string name("/proc/self/fd/");
  std::ostringstream convert;
  convert << fd;
  name += convert.str();

  struct stat sb;
  if (-1 == lstat(name.c_str(), &sb))
    return false;

  // st_size is only a first guess at the buffer size: for procfs links it
  // need not equal the length of the target, so a longer target must not be
  // treated as an error.
  out.resize(sb.st_size > 0 ? static_cast<std::size_t>(sb.st_size) + 1 : 64);
  for (;;) {
    ssize_t ret = readlink(name.c_str(), &out[0], out.size());
    if (-1 == ret)
      return false;
    if (static_cast<std::size_t>(ret) < out.size()) {
      // readlink left room to spare, so the target was not truncated.
      out.resize(ret);
      break;
    }
    // The buffer was filled completely; the target may have been cut off.
    // Grow exponentially and read again.
    out.resize(out.size() * 2);
  }

  // Don't use the non-file names.
  if (!out.empty() && out[0] != '/')
    return false;
  return true;
#endif
}
|
||||
} // namespace
|
||||
|
||||
std::string NameFromFD(int fd) {
|
||||
std::string ret;
|
||||
if (TryName(fd, ret)) return ret;
|
||||
switch (fd) {
|
||||
case 0: return "stdin";
|
||||
case 1: return "stdout";
|
||||
case 2: return "stderr";
|
||||
}
|
||||
ret = "fd ";
|
||||
std::ostringstream convert;
|
||||
convert << fd;
|
||||
ret += convert.str();
|
||||
return ret;
|
||||
}
|
||||
|
||||
} // namespace util
|
||||
|
52
util/file.hh
52
util/file.hh
@ -1,6 +1,8 @@
|
||||
#ifndef UTIL_FILE__
|
||||
#define UTIL_FILE__
|
||||
|
||||
#include "util/exception.hh"
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
@ -17,7 +19,7 @@ class scoped_fd {
|
||||
|
||||
~scoped_fd();
|
||||
|
||||
void reset(int to) {
|
||||
void reset(int to = -1) {
|
||||
scoped_fd other(fd_);
|
||||
fd_ = to;
|
||||
}
|
||||
@ -63,6 +65,32 @@ class scoped_FILE {
|
||||
std::FILE *file_;
|
||||
};
|
||||
|
||||
/* Thrown for any operation where the fd is known. */
|
||||
class FDException : public ErrnoException {
|
||||
public:
|
||||
explicit FDException(int fd) throw();
|
||||
|
||||
virtual ~FDException() throw();
|
||||
|
||||
// This may no longer be valid if the exception was thrown past open.
|
||||
int FD() const { return fd_; }
|
||||
|
||||
// Guess from NameFromFD.
|
||||
const std::string &NameGuess() const { return name_guess_; }
|
||||
|
||||
private:
|
||||
int fd_;
|
||||
|
||||
std::string name_guess_;
|
||||
};
|
||||
|
||||
// End of file reached.
|
||||
class EndOfFileException : public Exception {
|
||||
public:
|
||||
EndOfFileException() throw();
|
||||
~EndOfFileException() throw();
|
||||
};
|
||||
|
||||
// Open for read only.
|
||||
int OpenReadOrThrow(const char *name);
|
||||
// Create file if it doesn't exist, truncate if it does. Opened for write.
|
||||
@ -71,12 +99,15 @@ int CreateOrThrow(const char *name);
|
||||
// Return value for SizeFile when it can't size properly.
|
||||
const uint64_t kBadSize = (uint64_t)-1;
|
||||
uint64_t SizeFile(int fd);
|
||||
uint64_t SizeOrThrow(int fd);
|
||||
|
||||
void ResizeOrThrow(int fd, uint64_t to);
|
||||
|
||||
std::size_t PartialRead(int fd, void *to, std::size_t size);
|
||||
void ReadOrThrow(int fd, void *to, std::size_t size);
|
||||
std::size_t ReadOrEOF(int fd, void *to_void, std::size_t size);
|
||||
// Positioned: unix only for now.
|
||||
void PReadOrThrow(int fd, void *to, std::size_t size, uint64_t off);
|
||||
|
||||
void WriteOrThrow(int fd, const void *data_void, std::size_t size);
|
||||
void WriteOrThrow(FILE *to, const void *data, std::size_t size);
|
||||
@ -91,17 +122,18 @@ void SeekEnd(int fd);
|
||||
std::FILE *FDOpenOrThrow(scoped_fd &file);
|
||||
std::FILE *FDOpenReadOrThrow(scoped_fd &file);
|
||||
|
||||
class TempMaker {
|
||||
public:
|
||||
explicit TempMaker(const std::string &prefix);
|
||||
// Temporary files
|
||||
int MakeTemp(const std::string &prefix);
|
||||
std::FILE *FMakeTemp(const std::string &prefix);
|
||||
|
||||
// These will already be unlinked for you.
|
||||
int Make() const;
|
||||
std::FILE *MakeFile() const;
|
||||
// dup an fd.
|
||||
int DupOrThrow(int fd);
|
||||
|
||||
private:
|
||||
std::string base_;
|
||||
};
|
||||
/* Attempt to get the file name from fd. This won't always work (e.g. on Windows or
|
||||
* a pipe). The file might have been renamed. It's intended for diagnostics
|
||||
* and logging only.
|
||||
*/
|
||||
std::string NameFromFD(int fd);
|
||||
|
||||
} // namespace util
|
||||
|
||||
|
@ -34,10 +34,17 @@ FilePiece::FilePiece(const char *name, std::ostream *show_progress, std::size_t
|
||||
Initialize(name, show_progress, min_buffer);
|
||||
}
|
||||
|
||||
FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, std::size_t min_buffer) :
|
||||
namespace {
|
||||
std::string NamePossiblyFind(int fd, const char *name) {
|
||||
if (name) return name;
|
||||
return NameFromFD(fd);
|
||||
}
|
||||
} // namespace
|
||||
|
||||
FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, std::size_t min_buffer) :
|
||||
file_(fd), total_size_(SizeFile(file_.get())), page_(SizePage()),
|
||||
progress_(total_size_, total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name) {
|
||||
Initialize(name, show_progress, min_buffer);
|
||||
progress_(total_size_, total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + NamePossiblyFind(fd, name)) {
|
||||
Initialize(NamePossiblyFind(fd, name).c_str(), show_progress, min_buffer);
|
||||
}
|
||||
|
||||
FilePiece::~FilePiece() {}
|
||||
|
@ -29,7 +29,7 @@ class FilePiece {
|
||||
// 1 MB default.
|
||||
explicit FilePiece(const char *file, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576);
|
||||
// Takes ownership of fd. name is used for messages.
|
||||
explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576);
|
||||
explicit FilePiece(int fd, const char *name = NULL, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576);
|
||||
|
||||
~FilePiece();
|
||||
|
||||
|
@ -1,6 +1,7 @@
|
||||
// Tests might fail if you have creative characters in your path. Sue me.
|
||||
#include "util/file_piece.hh"
|
||||
|
||||
#include "util/file.hh"
|
||||
#include "util/scoped.hh"
|
||||
|
||||
#define BOOST_TEST_MODULE FilePieceTest
|
||||
|
@ -23,7 +23,7 @@ class scoped_malloc {
|
||||
|
||||
void call_realloc(std::size_t to) {
|
||||
void *ret;
|
||||
UTIL_THROW_IF(!(ret = std::realloc(p_, to)) && to, util::ErrnoException, "realloc to " << to << " bytes failed.");
|
||||
UTIL_THROW_IF(!(ret = std::realloc(p_, to)) && to, ErrnoException, "realloc to " << to << " bytes failed.");
|
||||
p_ = ret;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user