pro extraction mainline and stub of feature data iterator

This commit is contained in:
bhaddow 2011-11-14 18:32:36 +00:00
parent 4bb9ecb8eb
commit 4cf6e0320a
5 changed files with 228 additions and 3 deletions

View File

@ -17,6 +17,7 @@ AC_PROG_LIBTOOL
AX_XMLRPC_C
BOOST_REQUIRE([1.36.0])
BOOST_SMART_PTR
BOOST_PROGRAM_OPTIONS
AC_ARG_WITH(protobuf,
[AC_HELP_STRING([--with-protobuf=PATH], [(optional) path to Google protobuf])],

View File

@ -0,0 +1,40 @@
// $Id$
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2011- University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "FeatureDataIterator.h"
using namespace std;
/**
 * Construct an iterator for the named feature data file.
 *
 * This is still a stub: the file is not opened and `filename` is not used.
 * The stream pointer is set to null so that the object, and any copy made of
 * it, never carries an indeterminate pointer value.
 **/
FeatureDataIterator::FeatureDataIterator(const string filename)
  : in_(0)
{
  (void)filename; // unused until loading is implemented
}
// Advance step of the iterator.
// Stub: the body is empty, so incrementing currently changes nothing.
void FeatureDataIterator::increment() {
}
/**
 * Equality test between two iterators.
 *
 * Stub: no read position is tracked yet, so every iterator compares equal to
 * every other one. Previously the function had no return statement, which is
 * undefined behaviour for a non-void function. Returning true gives a defined
 * result and means a loop of the form `it != FeatureDataIterator::end()`
 * finishes immediately instead of walking data that cannot be read yet.
 *
 * @param rhs the iterator to compare against (currently ignored)
 * @return always true until real position tracking is implemented
 **/
bool FeatureDataIterator::equal(const FeatureDataIterator& rhs) const {
  (void)rhs; // unused until position tracking is implemented
  return true;
}
/**
 * Access the feature data at the current position.
 *
 * Stub: nothing is loaded yet, so a reference to a shared, permanently empty
 * vector is returned. Previously the function had no return statement, which
 * is undefined behaviour for a function returning a reference.
 *
 * @return reference to an empty vector that lives for the whole program
 **/
const vector<FeatureDataItem>& FeatureDataIterator::dereference() const {
  // Function-local static: outlives every caller, so the reference stays valid.
  static const vector<FeatureDataItem> kNoItems;
  return kNoItems;
}

View File

@ -0,0 +1,94 @@
// $Id$
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2011- University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef _FEATURE_DATA_ITERATOR_
#define _FEATURE_DATA_ITERATOR_
/**
* For loading from the feature data file.
**/
#include <fstream>
#include <map>
#include <vector>
#include <boost/iterator/iterator_facade.hpp>
// Minimal sparse vector.
// Values are floats held in a std::map keyed by a size_t; names are related to
// those keys through two lookup tables that are static, i.e. shared by all
// SparseVector instances.
// NOTE(review): only declarations are visible here; the notes on individual
// members describe the declared signatures, not verified behaviour.
class SparseVector {
public:
// Storage type: size_t key -> float value.
typedef std::map<size_t,float> fvector_t;
// Lookup table type: name -> size_t id.
typedef std::map<std::string, size_t> name2id_t;
// Lookup table type: indexable list of names (presumably indexed by id --
// TODO confirm against the implementation).
typedef std::vector<std::string> id2name_t;
// Look up a value by name. Result for an unknown name is not specified
// here -- check the definition.
float get(std::string name) const;
// Look up a value by numeric id.
float get(size_t id) const;
// Store a value under the given name.
void set(std::string name, float value);
// Clear this vector.
void clear();
// Size of this vector (presumably the number of stored entries -- TODO confirm).
size_t size() const;
// Write the vector to `out`; `sep` defaults to a single space.
void write(std::ostream& out, const std::string& sep = " ") const;
// In-place subtraction of `rhs`; returns a reference to a SparseVector.
SparseVector& operator-=(const SparseVector& rhs);
private:
// Shared (static) name -> id table.
static name2id_t name2id_;
// Shared (static) list of names.
static id2name_t id2name_;
// Per-instance value storage.
fvector_t fvector_;
};
// Binary subtraction of two sparse vectors; the result is returned by value
// and both operands are taken by const reference.
// Only declared here -- presumably built on SparseVector::operator-=; TODO confirm.
SparseVector operator-(const SparseVector& lhs, const SparseVector& rhs);
// One item of feature data: a dense float vector plus a sparse vector.
// Both members are public; the class declares no functions of its own.
class FeatureDataItem {
public:
// Dense feature values.
std::vector<float> dense;
// Sparse feature values.
SparseVector sparse;
};
// Forward-traversal iterator built on boost::iterator_facade. Dereferencing
// yields a const std::vector<FeatureDataItem>.
class FeatureDataIterator :
public boost::iterator_facade<FeatureDataIterator,
const std::vector<FeatureDataItem>,
boost::forward_traversal_tag>
{
public:
// Construct from a file name (taken by value). Not marked explicit, so a
// std::string converts implicitly to an iterator.
FeatureDataIterator(const std::string filename);
// End sentinel: an iterator constructed from the empty file name.
static FeatureDataIterator end() {
return FeatureDataIterator("");
}
private:
// Grants the facade access to the three private core operations below.
friend class boost::iterator_core_access;
// Core operation: advance the iterator.
void increment();
// Core operation: compare with another iterator.
bool equal(const FeatureDataIterator& rhs) const;
// Core operation: access the current value.
const std::vector<FeatureDataItem>& dereference() const;
// Input stream pointer.
// NOTE(review): raw pointer; nothing visible in this header establishes who
// allocates or frees it -- confirm ownership before adding a destructor.
std::ifstream* in_;
};
#endif

View File

@ -1,5 +1,5 @@
lib_LTLIBRARIES = libmert.la
bin_PROGRAMS = mert extractor evaluator
bin_PROGRAMS = mert extractor evaluator pro
AM_CPPFLAGS = -W -Wall -Wno-unused -ffor-scope -DTRACE_ENABLE $(BOOST_CPPFLAGS)
libmert_la_SOURCES = \
@ -7,6 +7,7 @@ Util.cpp \
Timer.cpp \
ScoreStats.cpp ScoreArray.cpp ScoreData.cpp \
FeatureStats.cpp FeatureArray.cpp FeatureData.cpp \
FeatureDataIterator.cpp \
Data.cpp \
BleuScorer.cpp \
Point.cpp \
@ -29,9 +30,13 @@ CderScorer.cpp \
MergeScorer.cpp
mert_SOURCES = mert.cpp $(top_builddir)/moses/src/ThreadPool.cpp
extractor_SOURCES = extractor.cpp
evaluator_SOURCES = evaluator.cpp
extractor_SOURCES = extractor.cpp
evaluator_SOURCES = evaluator.cpp
pro_SOURCES = pro.cpp
extractor_LDADD = libmert.la -lm -lz
mert_LDADD = libmert.la -lm -lz $(BOOST_THREAD_LDFLAGS) $(BOOST_THREAD_LIBS)
evaluator_LDADD = libmert.la -lm -lz
pro_LDADD = libmert.la @KENLM_LDFLAGS@ $(BOOST_LDFLAGS) $(BOOST_PROGRAM_OPTIONS_LDFLAGS) $(BOOST_PROGRAM_OPTIONS_LIBS)
pro_DEPENDENCIES = $(top_srcdir)/kenlm/libkenlm.la

View File

@ -0,0 +1,85 @@
// $Id$
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2011- University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
/**
* This is part of the PRO implementation. It converts the features and scores
* files into a form suitable for input into the megam maxent trainer.
*
* For details of PRO, refer to Hopkins & May (EMNLP 2011)
**/
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <string>
#include <vector>
#include <boost/program_options.hpp>
#include "FeatureDataIterator.h"
using namespace std;
namespace po = boost::program_options;
int main(int argc, char** argv)
{
bool help;
vector<string> scoreFiles;
vector<string> featureFiles;
int seed;
po::options_description desc("Allowed options");
desc.add_options()
("help,h", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
("scfile,S", po::value<vector<string> >(&scoreFiles), "Scorer data files")
("ffile,F", po::value<vector<string> > (&featureFiles), "Feature data files")
("random-seed,r", po::value<int>(&seed), "Seed for random number generation")
;
po::options_description cmdline_options;
cmdline_options.add(desc);
po::variables_map vm;
po::store(po::command_line_parser(argc,argv).
options(cmdline_options).run(), vm);
po::notify(vm);
if (help) {
cout << "Usage: " + string(argv[0]) + " [options]" << endl;
cout << desc << endl;
return 0;
}
if (vm.count("random-seed")) {
cerr << "Initialising random seed to " << seed << endl;
srand(seed);
} else {
cerr << "Initialising random seed from system clock" << endl;
srand(time(NULL));
}
FeatureDataIterator fi(featureFiles[0]);
for (; fi != FeatureDataIterator::end(); ++fi) {
const vector<FeatureDataItem>& featureData = *fi;
}
}