merged in TER Scorer from mert-other_metrics (at r4140)

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4146 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
machacekmatous 2011-08-16 16:21:31 +00:00
parent 6dae77c3eb
commit 3ef02eb7e6
48 changed files with 10901 additions and 4 deletions

View File

@ -9,7 +9,25 @@ Point.cpp \
PerScorer.cpp \
Scorer.cpp \
Optimizer.cpp \
mert.cpp
mert.cpp \
TERsrc/alignmentStruct.cpp \
TERsrc/hashMap.cpp \
TERsrc/hashMapStringInfos.cpp \
TERsrc/segmentStructure.cpp \
TERsrc/stringHasher.cpp \
TERsrc/terAlignment.cpp \
TERsrc/terShift.cpp \
TERsrc/tinyxml.cpp \
TERsrc/tinyxmlparser.cpp \
TERsrc/documentStructure.cpp \
TERsrc/hashMapInfos.cpp \
TERsrc/infosHasher.cpp \
TERsrc/stringInfosHasher.cpp \
TERsrc/tercalc.cpp \
TERsrc/tinystr.cpp \
TERsrc/tinyxmlerror.cpp \
TERsrc/tools.cpp \
TerScorer.cpp
extractor_SOURCES = Util.cpp \
Timer.cpp \
@ -21,8 +39,25 @@ Point.cpp \
PerScorer.cpp \
Scorer.cpp \
Optimizer.cpp \
extractor.cpp
extractor.cpp \
TERsrc/alignmentStruct.cpp \
TERsrc/hashMap.cpp \
TERsrc/hashMapStringInfos.cpp \
TERsrc/segmentStructure.cpp \
TERsrc/stringHasher.cpp \
TERsrc/terAlignment.cpp \
TERsrc/terShift.cpp \
TERsrc/tinyxml.cpp \
TERsrc/tinyxmlparser.cpp \
TERsrc/documentStructure.cpp \
TERsrc/hashMapInfos.cpp \
TERsrc/infosHasher.cpp \
TERsrc/stringInfosHasher.cpp \
TERsrc/tercalc.cpp \
TERsrc/tinystr.cpp \
TERsrc/tinyxmlerror.cpp \
TERsrc/tools.cpp \
TerScorer.cpp
mert_CPPFLAGS = -W -Wall -Wno-unused -ffor-scope -DTRACE_ENABLE
extractor_CPPFLAGS = -W -Wall -Wno-unused -ffor-scope -DTRACE_ENABLE

View File

@ -14,6 +14,7 @@
#include "Scorer.h"
#include "BleuScorer.h"
#include "PerScorer.h"
#include "TerScorer.h"
using namespace std;
@ -25,6 +26,7 @@ public:
vector<string> types;
types.push_back(string("BLEU"));
types.push_back(string("PER"));
types.push_back(string("TER"));
return types;
}
@ -33,6 +35,8 @@ public:
return (BleuScorer*) new BleuScorer(config);
} else if (type == "PER") {
return (PerScorer*) new PerScorer(config);
} else if (type == "TER") {
return (TerScorer*) new TerScorer(config);
} else {
throw runtime_error("Unknown scorer type: " + type);
}

View File

@ -0,0 +1,115 @@
#include "alignmentStruct.h"
using namespace std;
namespace TERCpp
{
// Textual description of this alignment record.
// The fields are not dumped yet: the result is always the fixed placeholder
// line below, terminated by a newline.
string alignmentStruct::toString()
{
  stringstream description;
  description << "Nothing to be printed";
  description << endl;
  return description.str();
}
// alignmentStruct::alignmentStruct()
// {
// // vector<string> ref;
// // vector<string> hyp;
// // vector<string> aftershift;
//
// // alignmentStruct[] allshifts = null;
//
// numEdits=0;
// numWords=0;
// bestRef="";
//
// numIns=0;
// numDel=0;
// numSub=0;
// numSft=0;
// numWsf=0;
// }
// alignmentStruct::alignmentStruct ()
// {
// start = 0;
// end = 0;
// moveto = 0;
// newloc = 0;
// cost=1.0;
// }
// alignmentStruct::alignmentStruct (int _start, int _end, int _moveto, int _newloc)
// {
// start = _start;
// end = _end;
// moveto = _moveto;
// newloc = _newloc;
// cost=1.0;
// }
// alignmentStruct::alignmentStruct (int _start, int _end, int _moveto, int _newloc, vector<string> _shifted)
// {
// start = _start;
// end = _end;
// moveto = _moveto;
// newloc = _newloc;
// shifted = _shifted;
// cost=1.0;
// }
// string alignmentStruct::vectorToString(vector<string> vec)
// {
// string retour("");
// for (vector<string>::iterator vecIter=vec.begin();vecIter!=vec.end(); vecIter++)
// {
// retour+=(*vecIter)+"\t";
// }
// return retour;
// }
// string alignmentStruct::toString()
// {
// stringstream s;
// s.str("");
// s << "[" << start << ", " << end << ", " << moveto << "/" << newloc << "]";
// if ((int)shifted.size() > 0)
// {
// s << " (" << vectorToString(shifted) << ")";
// }
// return s.str();
// }
/* The distance of the shift. */
// int alignmentStruct::distance()
// {
// if (moveto < start)
// {
// return start - moveto;
// }
// else if (moveto > end)
// {
// return moveto - end;
// }
// else
// {
// return moveto - start;
// }
// }
//
// bool alignmentStruct::leftShift()
// {
// return (moveto < start);
// }
//
// int alignmentStruct::size()
// {
// return (end - start) + 1;
// }
// alignmentStruct alignmentStruct::operator=(alignmentStruct t)
// {
//
// return t;
// }
}

View File

@ -0,0 +1,46 @@
#ifndef __TERCPPALIGNMENTSTRUCT_H__
#define __TERCPPALIGNMENTSTRUCT_H__
#include <vector>
#include <stdio.h>
#include <string>
#include <sstream>
#include "tools.h"
using namespace std;
using namespace Tools;
namespace TERCpp
{
// Plain record with public data members and a single toString() method.
// NOTE(review): no constructor is declared here, so `cost` has no explicit
// initial value; whoever creates the object is expected to assign it.
class alignmentStruct
{
private:
public:
// Disabled declarations (kept for reference, not compiled):
// alignmentStruct();
// alignmentStruct (int _start, int _end, int _moveto, int _newloc);
// alignmentStruct (int _start, int _end, int _moveto, int _newloc, vector<string> _shifted);
// string toString();
// int distance() ;
// bool leftShift();
// int size();
// alignmentStruct operator=(alignmentStruct t);
// string vectorToString(vector<string> vec);
// int start;
// int end;
// int moveto;
// int newloc;
vector<string> nwords; // The words we shifted
vector<char> alignment ; // for pra_more output
vector<vecInt> aftershift; // for pra_more output
// This is used to store the cost of a shift, so we don't have to
// calculate it multiple times.
double cost;
// Returns a printable description. The definition in alignmentStruct.cpp
// currently returns a fixed placeholder line rather than the field values.
string toString();
};
}
#endif

View File

@ -0,0 +1,50 @@
#ifndef __BESTSHIFTSTRUCT_H__
#define __BESTSHIFTSTRUCT_H__
#include <vector>
#include <stdio.h>
#include <string>
#include <sstream>
#include "tools.h"
#include "terShift.h"
#include "terAlignment.h"
using namespace std;
using namespace Tools;
namespace TERCpp
{
// Bundles one terShift with one terAlignment plus an "empty" flag.
// All members are public and no constructor is declared here.
// NOTE(review): presumably m_empty tells the caller that no shift was stored
// in this record - confirm against the code that fills it.
class bestShiftStruct
{
private:
public:
// Disabled declarations copied from alignmentStruct (not compiled):
// alignmentStruct();
// alignmentStruct (int _start, int _end, int _moveto, int _newloc);
// alignmentStruct (int _start, int _end, int _moveto, int _newloc, vector<string> _shifted);
// string toString();
// int distance() ;
// bool leftShift();
// int size();
// alignmentStruct operator=(alignmentStruct t);
// string vectorToString(vector<string> vec);
// int start;
// int end;
// int moveto;
// int newloc;
terShift m_best_shift;      // the shift held by this record
terAlignment m_best_align;  // the alignment held alongside that shift
bool m_empty;               // see NOTE(review) on the class
// vector<string> nwords; // The words we shifted
// char* alignment ; // for pra_more output
// vector<vecInt> aftershift; // for pra_more output
// This is used to store the cost of a shift, so we don't have to
// calculate it multiple times.
// double cost;
};
}
#endif

View File

@ -0,0 +1,181 @@
#include "documentStructure.h"
using namespace std;
namespace TERCpp
{
// Textual description of this document.
// No field is dumped yet: the result is always the fixed placeholder line
// below, terminated by a newline.
string documentStructure::toString()
{
  stringstream description;
  description << "Nothing to be printed";
  description << endl;
  return description.str();
}
// Returns a copy of the document identifier.
string documentStructure::getDocId()
{
  return this->docId;
}
// Returns a pointer to the internal segment list (not a copy), so callers can
// read and modify the stored segments in place.
vector< segmentStructure >* documentStructure::getSegments()
{
  vector< segmentStructure >* const segments = &seg;
  return segments;
}
// Returns a copy of the system identifier.
string documentStructure::getSysId()
{
  return this->sysId;
}
// float documentStructure::getAverageLength()
// {
// return averageLength;
// }
// void documentStructure::setAverageLength(float f)
// {
// averageLength=f;
// }
// Appends a copy of the given segment at the end of the segment list.
void documentStructure::addSegments ( segmentStructure s )
{
  this->seg.push_back ( s );
}
// Builds a segment from an id and its text and appends it to the segment list.
void documentStructure::addSegments ( string id, string text )
{
  seg.push_back ( segmentStructure ( id, text ) );
}
// Returns a pointer to the most recently added segment.
// The lookup goes through vector::at(), so the index is range-checked.
segmentStructure* documentStructure::getLastSegments()
{
  const int lastIndex = ( int ) seg.size() - 1;
  return & seg.at ( lastIndex );
}
// Replaces the document identifier.
void documentStructure::setDocId ( string s )
{
  this->docId = s;
}
// Replaces the system identifier.
void documentStructure::setSysId ( string s )
{
  this->sysId = s;
}
// Looks up a segment by its identifier.
//
// @param id  segment id to search for (exact string match)
// @return    pointer to the first stored segment whose id equals `id`
//
// When no segment matches, the function reports the missing id together with
// the ids that are available on stderr and terminates the program. The exit
// status is non-zero so that calling scripts can detect the failure (it used
// to be 0, which signalled success).
segmentStructure* documentStructure::getSegment ( string id )
{
  for ( int i = 0; i < ( int ) seg.size(); i++ ) {
    if ( id.compare ( seg.at ( i ).getSegId() ) == 0 ) {
      return & ( seg.at ( i ) );
    }
  }
  cerr << "ERROR : documentStructure::getSegment : Segment " << id << " does not exist" <<endl;
  cerr << "Segment size " << seg.size()<< endl;
  // List every known id to help diagnose the mismatch.
  for (int i=0; i<(int)seg.size(); i++) {
    cerr <<seg.at(i).getSegId()<<endl;
  }
  exit(1);
}
// Number of segments currently stored, as an int.
int documentStructure::getSize()
{
  const int segmentCount = ( int ) seg.size();
  return segmentCount;
}
// documentStructure::documentStructure()
// {
// // vector<string> ref;
// // vector<string> hyp;
// // vector<string> aftershift;
//
// // documentStructure[] allshifts = null;
//
// numEdits=0;
// numWords=0;
// bestRef="";
//
// numIns=0;
// numDel=0;
// numSub=0;
// numSft=0;
// numWsf=0;
// }
// documentStructure::documentStructure ()
// {
// start = 0;
// end = 0;
// moveto = 0;
// newloc = 0;
// cost=1.0;
// }
// documentStructure::documentStructure (int _start, int _end, int _moveto, int _newloc)
// {
// start = _start;
// end = _end;
// moveto = _moveto;
// newloc = _newloc;
// cost=1.0;
// }
// documentStructure::documentStructure (int _start, int _end, int _moveto, int _newloc, vector<string> _shifted)
// {
// start = _start;
// end = _end;
// moveto = _moveto;
// newloc = _newloc;
// shifted = _shifted;
// cost=1.0;
// }
// string documentStructure::vectorToString(vector<string> vec)
// {
// string retour("");
// for (vector<string>::iterator vecIter=vec.begin();vecIter!=vec.end(); vecIter++)
// {
// retour+=(*vecIter)+"\t";
// }
// return retour;
// }
// string documentStructure::toString()
// {
// stringstream s;
// s.str("");
// s << "[" << start << ", " << end << ", " << moveto << "/" << newloc << "]";
// if ((int)shifted.size() > 0)
// {
// s << " (" << vectorToString(shifted) << ")";
// }
// return s.str();
// }
/* The distance of the shift. */
// int documentStructure::distance()
// {
// if (moveto < start)
// {
// return start - moveto;
// }
// else if (moveto > end)
// {
// return moveto - end;
// }
// else
// {
// return moveto - start;
// }
// }
//
// bool documentStructure::leftShift()
// {
// return (moveto < start);
// }
//
// int documentStructure::size()
// {
// return (end - start) + 1;
// }
// documentStructure documentStructure::operator=(documentStructure t)
// {
//
// return t;
// }
}

View File

@ -0,0 +1,60 @@
#ifndef __DOCUMENTSTRUCTURE_H__
#define __DOCUMENTSTRUCTURE_H__
#include <vector>
#include <stdio.h>
#include <string>
#include <sstream>
#include "tools.h"
#include "segmentStructure.h"
using namespace std;
using namespace Tools;
namespace TERCpp
{
// A document made of an ordered list of segments, tagged with a document id
// and a system id.
class documentStructure
{
private:
string docId;                  // document identifier
string sysId;                  // system identifier
vector<segmentStructure> seg;  // segments, in insertion order
public:
// Return copies of the stored identifiers.
string getDocId();
string getSysId();
// Returns a pointer to the internal segment vector (not a copy).
vector<segmentStructure>* getSegments();
// Returns a pointer to the last stored segment (range-checked access).
segmentStructure* getLastSegments();
// Replace the stored identifiers.
void setDocId ( string s );
void setSysId ( string s );
// Append a segment, either given directly or built from (id, text).
void addSegments ( segmentStructure s );
void addSegments ( string id, string text );
// Returns the first segment whose id equals `id`; when none matches, the
// implementation prints the known ids to stderr and terminates the program.
segmentStructure* getSegment ( string id );
// Number of stored segments.
int getSize();
// Disabled declarations copied from alignmentStruct (not compiled):
// alignmentStruct();
// alignmentStruct (int _start, int _end, int _moveto, int _newloc);
// alignmentStruct (int _start, int _end, int _moveto, int _newloc, vector<string> _shifted);
// string toString();
// int distance() ;
// bool leftShift();
// int size();
// alignmentStruct operator=(alignmentStruct t);
// string vectorToString(vector<string> vec);
// int start;
// int end;
// int moveto;
// int newloc;
// vector<string> nwords; // The words we shifted
// vector<char> alignment ; // for pra_more output
// vector<vecInt> aftershift; // for pra_more output
// This is used to store the cost of a shift, so we don't have to
// calculate it multiple times.
// double cost;
// Returns a printable description; the implementation currently returns a
// fixed placeholder line.
string toString();
};
}
#endif

153
mert/TERsrc/hashMap.cpp Normal file
View File

@ -0,0 +1,153 @@
#include "hashMap.h"
// Implementation of HashMapSpace::hashMap: a vector-backed string-to-string table whose entries are matched by the hash of their key.
using namespace std;
namespace HashMapSpace
{
// hashMap::hashMap();
/* hashMap::~hashMap()
{
// vector<stringHasher>::const_iterator del = m_hasher.begin();
for ( vector<stringHasher>::const_iterator del=m_hasher.begin(); del != m_hasher.end(); del++ )
{
delete(*del);
}
}*/
/**
 * Tells whether an entry with the given hash value is stored
 * ("trouve" is French for "found").
 * @param searchKey hash value to look for
 * @return 1 when some entry carries that hash value, 0 otherwise
 */
int hashMap::trouve ( long searchKey )
{
  for ( size_t idx = 0; idx < m_hasher.size(); ++idx ) {
    if ( m_hasher[idx].getHashKey() == searchKey ) {
      return 1;
    }
  }
  return 0;
}
// Tells whether an entry exists for the given textual key.
// Only hash values are compared: a different key that hashes to the same
// value is reported as present too.
// Returns 1 when found, 0 otherwise.
int hashMap::trouve ( string key )
{
  const long wanted = hashValue ( key );
  return trouve ( wanted );
}
/**
 * Computes the hash of a key with the collate<char> facet of a
 * default-constructed locale.
 * @param key text to hash
 * @return hash of the characters of key
 */
long hashMap::hashValue ( string key )
{
  const char* const first = key.data();
  const char* const last = first + key.length();
  locale current;
  return use_facet<collate<char> > ( current ).hash ( first, last );
}
/**
 * Inserts a (key, value) entry unless an entry with the same hash value is
 * already stored; in that case the table is left untouched.
 * @param key   textual key
 * @param value value associated with the key
 */
void hashMap::addHasher ( string key, string value )
{
  const long hashed = hashValue ( key );
  if ( trouve ( hashed ) != 0 ) {
    return;
  }
  m_hasher.push_back ( stringHasher ( hashed, key, value ) );
}
// Returns a copy of the entry whose hash equals the hash of `key`.
// When nothing matches, an entry built from (0, "", "") is returned instead.
stringHasher hashMap::getHasher ( string key )
{
  const long wanted = hashValue ( key );
  for ( size_t idx = 0; idx < m_hasher.size(); ++idx ) {
    if ( m_hasher[idx].getHashKey() == wanted ) {
      return m_hasher[idx];
    }
  }
  return stringHasher ( 0, "", "" );
}
// Returns the value stored for `key` (matched by hash value), or an empty
// string when no entry matches.
string hashMap::getValue ( string key )
{
  const long wanted = hashValue ( key );
  for ( size_t idx = 0; idx < m_hasher.size(); ++idx ) {
    stringHasher& entry = m_hasher[idx];
    if ( entry.getHashKey() == wanted ) {
      return entry.getValue();
    }
  }
  return "";
}
// Reverse lookup: returns the key of the first entry whose value equals
// `value`, or an empty string when no entry holds that value.
string hashMap::searchValue ( string value )
{
  for ( size_t idx = 0; idx < m_hasher.size(); ++idx ) {
    stringHasher& entry = m_hasher[idx];
    if ( entry.getValue() == value ) {
      return entry.getKey();
    }
  }
  return "";
}
// Overwrites the value of every entry whose hash equals the hash of `key`.
// Nothing happens when no entry matches (no entry is created).
void hashMap::setValue ( string key , string value )
{
  const long wanted = hashValue ( key );
  for ( size_t idx = 0; idx < m_hasher.size(); ++idx ) {
    if ( m_hasher[idx].getHashKey() != wanted ) {
      continue;
    }
    m_hasher[idx].setValue ( value );
  }
}
/**
*
*/
void hashMap::printHash()
{
for ( vector<stringHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ ) {
cout << ( *l_hasher ).getHashKey() <<" | "<< ( *l_hasher ).getKey() << " | " << ( *l_hasher ).getValue() << endl;
}
}
// long hashValue(string key){}
}

44
mert/TERsrc/hashMap.h Normal file
View File

@ -0,0 +1,44 @@
/*
* Generic hashmap manipulation functions
*/
#ifndef __HASHMAP_H__
#define __HASHMAP_H__
#include <boost/functional/hash.hpp>
#include "stringHasher.h"
#include <vector>
#include <string>
#include <sstream>
#include <fstream>
#include <locale>
using namespace std;
namespace HashMapSpace
{
// String-to-string table stored as a flat vector of stringHasher entries.
// Lookups scan the vector and match entries by the hash value of the key,
// not by the key text itself.
class hashMap
{
private:
vector<stringHasher> m_hasher;  // entries, in insertion order
public:
// ~hashMap();
// Hash of a key (collate<char> facet of a default-constructed locale).
long hashValue ( string key );
// Return 1 when an entry with that hash / key is stored, 0 otherwise.
int trouve ( long searchKey );
int trouve ( string key );
// Adds an entry unless one with the same hash value already exists.
void addHasher ( string key, string value );
// Returns a copy of the matching entry, or an entry built from (0, "", "").
stringHasher getHasher ( string key );
// Returns the value stored for key, or "" when absent.
string getValue ( string key );
// Reverse lookup: the argument is the value to search for; returns the key
// of the first entry holding it, or "" when absent.
string searchValue ( string key );
// Overwrites the value of matching entries; does nothing when absent.
void setValue ( string key , string value );
// Prints "hash | key | value" for every entry on standard output.
void printHash();
// NOTE(review): no definition of the four members below appears in the
// visible part of hashMap.cpp - confirm they are defined before use.
vector<stringHasher> getHashMap();
string printStringHash();
string printStringHash2();
string printStringHashForLexicon();
};
}
#endif

View File

@ -0,0 +1,162 @@
#include "hashMapInfos.h"
// Implementation of HashMapSpace::hashMapInfos: a vector-backed table mapping a string key to a vector<int>, with entries matched by the hash of their key.
using namespace std;
namespace HashMapSpace
{
// hashMapInfos::hashMap();
/* hashMapInfos::~hashMap()
{
// vector<infosHasher>::const_iterator del = m_hasher.begin();
for ( vector<infosHasher>::const_iterator del=m_hasher.begin(); del != m_hasher.end(); del++ )
{
delete(*del);
}
}*/
/**
 * Tells whether an entry with the given hash value is stored
 * ("trouve" is French for "found").
 * @param searchKey hash value to look for
 * @return 1 when some entry carries that hash value, 0 otherwise
 */
int hashMapInfos::trouve ( long searchKey )
{
  for ( size_t idx = 0; idx < m_hasher.size(); ++idx ) {
    if ( m_hasher[idx].getHashKey() == searchKey ) {
      return 1;
    }
  }
  return 0;
}
// Tells whether an entry exists for the given textual key.
// Only hash values are compared: a different key that hashes to the same
// value is reported as present too.
// Returns 1 when found, 0 otherwise.
int hashMapInfos::trouve ( string key )
{
  const long wanted = hashValue ( key );
  return trouve ( wanted );
}
/**
 * Computes the hash of a key with the collate<char> facet of a
 * default-constructed locale.
 * @param key text to hash
 * @return hash of the characters of key
 */
long hashMapInfos::hashValue ( string key )
{
  const char* const first = key.data();
  const char* const last = first + key.length();
  locale current;
  return use_facet<collate<char> > ( current ).hash ( first, last );
}
/**
 * Inserts a (key, value) entry unless an entry with the same hash value is
 * already stored; in that case the table is left untouched.
 * @param key   textual key
 * @param value integer list associated with the key
 */
void hashMapInfos::addHasher ( string key, vector<int> value )
{
  const long hashed = hashValue ( key );
  if ( trouve ( hashed ) != 0 ) {
    return;
  }
  m_hasher.push_back ( infosHasher ( hashed, key, value ) );
}
// Alias of addHasher(): inserts the entry unless its hash is already present.
void hashMapInfos::addValue ( string key, vector<int> value )
{
  this->addHasher ( key, value );
}
// Returns a copy of the entry whose hash equals the hash of `key`.
// When nothing matches, an entry built from (0, "", empty vector) is
// returned instead.
infosHasher hashMapInfos::getHasher ( string key )
{
  const long wanted = hashValue ( key );
  for ( size_t idx = 0; idx < m_hasher.size(); ++idx ) {
    if ( m_hasher[idx].getHashKey() == wanted ) {
      return m_hasher[idx];
    }
  }
  return infosHasher ( 0, "", vector<int>() );
}
// Returns the integer list stored for `key` (matched by hash value), or an
// empty vector when no entry matches.
vector<int> hashMapInfos::getValue ( string key )
{
  const long wanted = hashValue ( key );
  for ( size_t idx = 0; idx < m_hasher.size(); ++idx ) {
    infosHasher& entry = m_hasher[idx];
    if ( entry.getHashKey() == wanted ) {
      return entry.getValue();
    }
  }
  return vector<int>();
}
// string hashMapInfos::searchValue ( string value )
// {
// // long searchKey=hashValue ( key );
// // long foundKey;
// vector<int> foundValue;
//
// // vector<infosHasher>::const_iterator l_hasher=m_hasher.begin();
// for ( vector<infosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ )
// {
// foundValue= ( *l_hasher ).getValue();
// /* if ( foundValue.compare ( value ) == 0 )
// {
// return ( *l_hasher ).getKey();
// }*/
// }
// return "";
// }
//
// Overwrites the value of every entry whose hash equals the hash of `key`.
// Nothing happens when no entry matches (no entry is created).
void hashMapInfos::setValue ( string key , vector<int> value )
{
  const long wanted = hashValue ( key );
  for ( size_t idx = 0; idx < m_hasher.size(); ++idx ) {
    if ( m_hasher[idx].getHashKey() != wanted ) {
      continue;
    }
    m_hasher[idx].setValue ( value );
  }
}
/**
 * Intended to dump the table, but currently prints nothing: the output
 * statement is disabled, so the function only walks over the entries.
 */
void hashMapInfos::printHash()
{
  size_t idx = 0;
  while ( idx < m_hasher.size() ) {
    // cout << m_hasher[idx].getHashKey() <<" | "<< m_hasher[idx].getKey() << " | " << m_hasher[idx].getValue() << endl;
    ++idx;
  }
}
// long hashValue(string key){}
}

View File

@ -0,0 +1,44 @@
/*
* Generic hashmap manipulation functions
*/
#ifndef __HASHMAPINFOS_H__
#define __HASHMAPINFOS_H__
#include <boost/functional/hash.hpp>
#include "infosHasher.h"
#include <vector>
#include <string>
#include <sstream>
#include <fstream>
using namespace std;
namespace HashMapSpace
{
// Table mapping a string key to a vector<int>, stored as a flat vector of
// infosHasher entries. Lookups scan the vector and match entries by the hash
// value of the key, not by the key text itself.
class hashMapInfos
{
private:
vector<infosHasher> m_hasher;  // entries, in insertion order
public:
// ~hashMap();
// Hash of a key (collate<char> facet of a default-constructed locale).
long hashValue ( string key );
// Return 1 when an entry with that hash / key is stored, 0 otherwise.
int trouve ( long searchKey );
int trouve ( string key );
// Add an entry unless one with the same hash value already exists
// (addValue simply forwards to addHasher).
void addHasher ( string key, vector<int> value );
void addValue ( string key, vector<int> value );
// Returns a copy of the matching entry, or an entry built from
// (0, "", empty vector).
infosHasher getHasher ( string key );
// Returns the stored vector for key, or an empty vector when absent.
vector<int> getValue ( string key );
// string searchValue ( string key );
// Overwrites the value of matching entries; does nothing when absent.
void setValue ( string key , vector<int> value );
// Currently produces no output (its print statement is disabled).
void printHash();
// NOTE(review): no definition of the four members below appears in the
// visible part of hashMapInfos.cpp - confirm they are defined before use.
vector<infosHasher> getHashMap();
string printStringHash();
string printStringHash2();
string printStringHashForLexicon();
};
}
#endif

View File

@ -0,0 +1,167 @@
#include "hashMapStringInfos.h"
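// Implementation of HashMapSpace::hashMapStringInfos: a vector-backed table mapping a string key to a vector<string>, with entries matched by the hash of their key.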
using namespace std;
namespace HashMapSpace
{
// hashMapStringInfos::hashMap();
/* hashMapStringInfos::~hashMap()
{
// vector<stringInfosHasher>::const_iterator del = m_hasher.begin();
for ( vector<stringInfosHasher>::const_iterator del=m_hasher.begin(); del != m_hasher.end(); del++ )
{
delete(*del);
}
}*/
/**
 * Tells whether an entry with the given hash value is stored
 * ("trouve" is French for "found").
 * @param searchKey hash value to look for
 * @return 1 when some entry carries that hash value, 0 otherwise
 */
int hashMapStringInfos::trouve ( long searchKey )
{
  for ( size_t idx = 0; idx < m_hasher.size(); ++idx ) {
    if ( m_hasher[idx].getHashKey() == searchKey ) {
      return 1;
    }
  }
  return 0;
}
// Tells whether an entry exists for the given textual key.
// Only hash values are compared: a different key that hashes to the same
// value is reported as present too.
// Returns 1 when found, 0 otherwise.
int hashMapStringInfos::trouve ( string key )
{
  const long wanted = hashValue ( key );
  return trouve ( wanted );
}
/**
 * Computes the hash of a key with the collate<char> facet of a
 * default-constructed locale.
 * @param key text to hash
 * @return hash of the characters of key
 */
long hashMapStringInfos::hashValue ( string key )
{
  const char* const first = key.data();
  const char* const last = first + key.length();
  locale current;
  return use_facet<collate<char> > ( current ).hash ( first, last );
}
/**
 * Inserts a (key, value) entry unless an entry with the same hash value is
 * already stored; in that case the table is left untouched.
 * @param key   textual key
 * @param value string list associated with the key
 */
void hashMapStringInfos::addHasher ( string key, vector<string> value )
{
  const long hashed = hashValue ( key );
  if ( trouve ( hashed ) != 0 ) {
    return;
  }
  stringInfosHasher entry ( hashed, key, value );
  m_hasher.push_back ( entry );
}
// Alias of addHasher(): inserts the entry unless its hash is already present.
void hashMapStringInfos::addValue ( string key, vector<string> value )
{
  this->addHasher ( key, value );
}
// Returns a copy of the entry whose hash equals the hash of `key`.
// When nothing matches, an entry built from (0, "", empty vector) is
// returned instead.
stringInfosHasher hashMapStringInfos::getHasher ( string key )
{
  const long wanted = hashValue ( key );
  for ( size_t idx = 0; idx < m_hasher.size(); ++idx ) {
    if ( m_hasher[idx].getHashKey() == wanted ) {
      return m_hasher[idx];
    }
  }
  vector<string> noValues;
  stringInfosHasher fallback ( 0, "", noValues );
  return fallback;
}
// Returns the string list stored for `key` (matched by hash value), or an
// empty vector when no entry matches.
vector<string> hashMapStringInfos::getValue ( string key )
{
  const long wanted = hashValue ( key );
  for ( size_t idx = 0; idx < m_hasher.size(); ++idx ) {
    stringInfosHasher& entry = m_hasher[idx];
    if ( entry.getHashKey() == wanted ) {
      return entry.getValue();
    }
  }
  return vector<string>();
}
// string hashMapStringInfos::searchValue ( string value )
// {
// // long searchKey=hashValue ( key );
// // long foundKey;
// vector<int> foundValue;
//
// // vector<stringInfosHasher>::const_iterator l_hasher=m_hasher.begin();
// for ( vector<stringInfosHasher>:: iterator l_hasher=m_hasher.begin() ; l_hasher!=m_hasher.end() ; l_hasher++ )
// {
// foundValue= ( *l_hasher ).getValue();
// /* if ( foundValue.compare ( value ) == 0 )
// {
// return ( *l_hasher ).getKey();
// }*/
// }
// return "";
// }
//
// Overwrites the value of every entry whose hash equals the hash of `key`.
// Nothing happens when no entry matches (no entry is created).
void hashMapStringInfos::setValue ( string key , vector<string> value )
{
  const long wanted = hashValue ( key );
  for ( size_t idx = 0; idx < m_hasher.size(); ++idx ) {
    if ( m_hasher[idx].getHashKey() != wanted ) {
      continue;
    }
    m_hasher[idx].setValue ( value );
  }
}
/**
 * Intended to dump the table, but currently prints nothing: the output
 * statement is disabled, so the function only walks over the entries.
 */
void hashMapStringInfos::printHash()
{
  size_t idx = 0;
  while ( idx < m_hasher.size() ) {
    // cout << m_hasher[idx].getHashKey() <<" | "<< m_hasher[idx].getKey() << " | " << m_hasher[idx].getValue() << endl;
    ++idx;
  }
}
// Returns a copy of all stored entries, in insertion order.
vector< stringInfosHasher > hashMapStringInfos::getHashMap()
{
  vector< stringInfosHasher > snapshot ( m_hasher );
  return snapshot;
}
// long hashValue(string key){}
}

View File

@ -0,0 +1,44 @@
/*
* Generic hashmap manipulation functions
*/
#ifndef __HASHMAPSTRINGINFOS_H__
#define __HASHMAPSTRINGINFOS_H__
#include <boost/functional/hash.hpp>
#include "stringInfosHasher.h"
#include <vector>
#include <string>
#include <sstream>
#include <fstream>
using namespace std;
namespace HashMapSpace
{
// Table mapping a string key to a vector<string>, stored as a flat vector of
// stringInfosHasher entries. Lookups scan the vector and match entries by the
// hash value of the key, not by the key text itself.
class hashMapStringInfos
{
private:
vector<stringInfosHasher> m_hasher;  // entries, in insertion order
public:
// ~hashMap();
// Hash of a key (collate<char> facet of a default-constructed locale).
long hashValue ( string key );
// Return 1 when an entry with that hash / key is stored, 0 otherwise.
int trouve ( long searchKey );
int trouve ( string key );
// Add an entry unless one with the same hash value already exists
// (addValue simply forwards to addHasher).
void addHasher ( string key, vector<string> value );
void addValue ( string key, vector<string> value );
// Returns a copy of the matching entry, or an entry built from
// (0, "", empty vector).
stringInfosHasher getHasher ( string key );
// Returns the stored vector for key, or an empty vector when absent.
vector<string> getValue ( string key );
// string searchValue ( string key );
// Overwrites the value of matching entries; does nothing when absent.
void setValue ( string key , vector<string> value );
// Currently produces no output (its print statement is disabled).
void printHash();
// Returns a copy of all stored entries.
vector<stringInfosHasher> getHashMap();
// NOTE(review): no definition of the three members below appears in the
// visible part of hashMapStringInfos.cpp - confirm they are defined before use.
string printStringHash();
string printStringHash2();
string printStringHashForLexicon();
};
}
#endif

View File

@ -0,0 +1,35 @@
#include "infosHasher.h"
// Implementation of HashMapSpace::infosHasher: one table entry holding a hash value, the original key text and a vector<int> value.
using namespace std;
namespace HashMapSpace
{
// Builds an entry from a hash value ("cle" is French for "key"), the key text
// and the associated integer list.
infosHasher::infosHasher (long cle,string cleTxt, vector<int> valueVecInt )
  : m_hashKey ( cle ),
    m_key ( cleTxt ),
    m_value ( valueVecInt )
{
}
// infosHasher::~infosHasher(){};*/
long infosHasher::getHashKey()
{
return m_hashKey;
}
// Copy of the key text this entry was created with.
string infosHasher::getKey()
{
  return this->m_key;
}
// Copy of the integer list currently stored in this entry.
vector<int> infosHasher::getValue()
{
  return this->m_value;
}
// Replaces the integer list stored in this entry.
void infosHasher::setValue ( vector<int> value )
{
  this->m_value = value;
}
// typedef stdext::hash_map<std::string,string, stringhasher> HASH_S_S;
}

31
mert/TERsrc/infosHasher.h Normal file
View File

@ -0,0 +1,31 @@
#ifndef __INFOSHASHER_H__
#define __INFOSHASHER_H__
#include <string>
// #include <ext/hash_map>
#include <stdio.h>
#include <iostream>
#include <vector>
using namespace std;
namespace HashMapSpace
{
// One entry of a hashMapInfos table: a hash value, the key text it was
// computed from, and the vector<int> value attached to that key.
class infosHasher
{
private:
long m_hashKey;       // hash value of the key
string m_key;         // key text
vector<int> m_value;  // value attached to the key
public:
// Stores the three arguments as given; the hash is not recomputed here.
infosHasher ( long cle, string cleTxt, vector<int> valueVecInt );
// Accessors; each returns a copy of the stored member.
long getHashKey();
string getKey();
vector<int> getValue();
// Replaces the stored value.
void setValue ( vector<int> value );
};
}
#endif

View File

@ -0,0 +1,332 @@
#include "multiEvaluation.h"
// #include <iostream>
// #include <boost/filesystem/fstream.hpp>
// #include <boost/archive/xml_oarchive.hpp>
// #include <boost/archive/xml_iarchive.hpp>
// #include <boost/serialization/nvp.hpp>
// Implementation of TERCpp::multiEvaluation: loads the reference documents and one hypothesis document, evaluates them and writes a per-segment TER summary log.
namespace TERCpp
{
// Default constructor: every boolean evaluation option starts switched off.
multiEvaluation::multiEvaluation()
{
  param& options = evalParameters;
  options.debugMode = false;
  options.caseOn = false;
  options.noPunct = false;
  options.normalize = false;
  options.tercomLike = false;
  options.sgmlInputs = false;
  options.noTxtIds = false;
}
// Builds an evaluator from caller-supplied parameters.
// The boolean options are first switched off, then the whole parameter set is
// replaced by a copy of `p` obtained through Tools::copyParam.
multiEvaluation::multiEvaluation ( param p )
{
  param& options = evalParameters;
  options.debugMode = false;
  options.caseOn = false;
  options.noPunct = false;
  options.normalize = false;
  options.tercomLike = false;
  options.sgmlInputs = false;
  options.noTxtIds = false;
  options = Tools::copyParam ( p );
}
void multiEvaluation::addReferences()
{
referencesTxt.loadRefFiles ( evalParameters );
}
// void multiEvaluation::addReferences(vector< string > vecRefecrences)
// {
// for (int i=0; i< (int) vecRefecrences.size(); i++)
// {
// referencesTxt.loadFile(vecRefecrences.at(i));
// }
// }
void multiEvaluation::setHypothesis()
{
multiTxtDocument l_multiTxtTmp;
l_multiTxtTmp.loadHypFile ( evalParameters );
hypothesisTxt = (*(l_multiTxtTmp.getDocument ( "0" )));
}
// Replaces the evaluation parameters by a copy of `p` (via Tools::copyParam).
void multiEvaluation::setParameters ( param p )
{
  this->evalParameters = Tools::copyParam ( p );
}
void multiEvaluation::launchTxtEvaluation()
{
if (evalParameters.debugMode) {
cerr <<"DEBUG tercpp : multiEvaluation::launchTxtEvaluation : before testing references and hypothesis size "<<endl<<"END DEBUG"<<endl;
}
if ( referencesTxt.getSize() == 0 ) {
cerr << "ERROR : multiEvaluation::launchTxtEvaluation : there is no references" << endl;
exit ( 0 );
}
if ( hypothesisTxt.getSize() == 0 ) {
cerr << "ERROR : multiEvaluation::launchTxtEvaluation : there is no hypothesis" << endl;
exit ( 0 );
}
if (evalParameters.debugMode) {
cerr <<"DEBUG tercpp : multiEvaluation::launchTxtEvaluation : testing references and hypothesis size "<<endl<<" number of references : "<< referencesTxt.getSize()<<endl;
vector <string> s =referencesTxt.getListDocuments();
cerr << " avaiable ids : ";
for (vector <string>::iterator iterS=s.begin(); iterS!=s.end(); iterS++) {
cerr << " " << (*iterS);
}
cerr << endl;
for (vector <string>::iterator iterSBis=s.begin(); iterSBis!=s.end(); iterSBis++) {
cerr << " reference : "+(*iterSBis)+"; size : "<< (referencesTxt.getDocument((*iterSBis)))->getSize() << endl;
}
cerr << " hypothesis size : "<< hypothesisTxt.getSize() << endl<<"END DEBUG"<<endl;
}
int incDocRefences = 0;
stringstream l_stream;
vector<float> editsResults;
vector<float> wordsResults;
int tot_ins = 0;
int tot_del = 0;
int tot_sub = 0;
int tot_sft = 0;
int tot_wsf = 0;
float tot_err = 0;
float tot_wds = 0;
// vector<stringInfosHasher> setOfHypothesis = hashHypothesis.getHashMap();
ofstream outputSum ( ( evalParameters.hypothesisFile + ".output.sum.log" ).c_str() );
outputSum << "Hypothesis File: " + evalParameters.hypothesisFile + "\nReference File: " + evalParameters.referenceFile + "\n" + "Ave-Reference File: " << endl;
char outputCharBuffer[200];
sprintf ( outputCharBuffer, "%19s | %4s | %4s | %4s | %4s | %4s | %6s | %8s | %8s", "Sent Id", "Ins", "Del", "Sub", "Shft", "WdSh", "NumEr", "AvNumWd", "TER");
outputSum << outputCharBuffer << endl;
outputSum << "-------------------------------------------------------------------------------------" << endl;
vector <string> referenceList =referencesTxt.getListDocuments();
for (vector <string>::iterator referenceListIter=referenceList.begin(); referenceListIter!=referenceList.end(); referenceListIter++) {
// cerr << " " << (*referenceListIter);
documentStructure l_reference = (*(referencesTxt.getDocument ( (*referenceListIter) )));
evaluate ( l_reference, hypothesisTxt );
// evaluate ( l_reference);
}
// for ( incDocRefences = 0; incDocRefences < referencesTxt.getSize();incDocRefences++ )
// {
// l_stream.str ( "" );
// l_stream << incDocRefences;
// }
for ( vector<segmentStructure>::iterator segHypIt = hypothesisTxt.getSegments()->begin(); segHypIt != hypothesisTxt.getSegments()->end(); segHypIt++ ) {
terAlignment l_result = segHypIt->getAlignment();
string bestDocId = segHypIt->getBestDocId();
string l_id=segHypIt->getSegId();
editsResults.push_back(l_result.numEdits);
wordsResults.push_back(l_result.numWords);
l_result.scoreDetails();
tot_ins += l_result.numIns;
tot_del += l_result.numDel;
tot_sub += l_result.numSub;
tot_sft += l_result.numSft;
tot_wsf += l_result.numWsf;
tot_err += l_result.numEdits;
tot_wds += l_result.averageWords;
char outputCharBufferTmp[200];
sprintf(outputCharBufferTmp, "%19s | %4d | %4d | %4d | %4d | %4d | %6.1f | %8.3f | %8.3f",(l_id+":"+bestDocId).c_str(), l_result.numIns, l_result.numDel, l_result.numSub, l_result.numSft, l_result.numWsf, l_result.numEdits, l_result.averageWords, l_result.scoreAv()*100.0);
outputSum<< outputCharBufferTmp<<endl;
if (evalParameters.debugMode) {
cerr <<"DEBUG tercpp : multiEvaluation::launchTxtEvaluation : Evaluation "<<endl<< l_result.toString() <<endl<<"END DEBUG"<<endl;
}
}
cout << "Total TER: " << scoreTER ( editsResults, wordsResults );
char outputCharBufferTmp[200];
outputSum << "-------------------------------------------------------------------------------------" << endl;
sprintf ( outputCharBufferTmp, "%19s | %4d | %4d | %4d | %4d | %4d | %6.1f | %8.3f | %8.3f", "TOTAL", tot_ins, tot_del, tot_sub, tot_sft, tot_wsf, tot_err, tot_wds, tot_err*100.0 / tot_wds );
outputSum << outputCharBufferTmp << endl;
outputSum.close();
}
// Scores every segment of docStructhypothesis against the segment with the
// same id in docStructReference.
//   docStructReference  : reference document; each matched reference segment
//                         also receives the alignment computed for it.
//   docStructhypothesis : hypothesis document, updated in place. A segment
//                         keeps the new alignment (and this reference's doc id
//                         as best doc id) when it has no score yet or when the
//                         new scoreAv() is lower than the stored one.
// Terminates the process when a reference segment reports an average length
// of zero.
void multiEvaluation::evaluate ( documentStructure& docStructReference, documentStructure& docStructhypothesis )
{
  if (evalParameters.debugMode) {
    cerr <<"DEBUG tercpp : multiEvaluation::evaluate : launching evaluate on "<<endl<<" references size : "<< docStructReference.getSize() << endl << " hypothesis size : "<< docStructhypothesis.getSize() << endl<<"END DEBUG"<<endl;
  }
  if (evalParameters.debugMode) {
    cerr <<"DEBUG tercpp : multiEvaluation::evaluate : testing hypothesis "<<endl;
    cerr <<" segId : "<< docStructhypothesis.getSegments()->at(0).getSegId() << endl<<"END DEBUG"<<endl;
  }
  for ( vector<segmentStructure>::iterator segHypIt = docStructhypothesis.getSegments()->begin(); segHypIt != docStructhypothesis.getSegments()->end(); segHypIt++ ) {
    // A fresh calculator is still created for every segment, but it is now
    // released as soon as the alignment has been copied out; previously one
    // terCalc was leaked per hypothesis segment.
    terCalc * l_evalTER = new terCalc();
    segmentStructure * l_segRef = docStructReference.getSegment ( segHypIt->getSegId() );
    terAlignment l_result = l_evalTER->TER ( segHypIt->getContent(), l_segRef->getContent());
    delete l_evalTER;
    l_result.averageWords = l_segRef->getAverageLength();
    if (l_result.averageWords==0.0) {
      cerr << "ERROR : tercpp : multiEvaluation::evaluate : averageWords is equal to zero" <<endl;
      exit(0);
    }
    l_segRef->setAlignment ( l_result );
    // An alignment with zero words and zero edits is treated as "not scored
    // yet"; otherwise the lower average score wins.
    terAlignment l_current = segHypIt->getAlignment();
    const bool l_unscored = ( l_current.numWords == 0 ) && ( l_current.numEdits == 0 );
    if ( l_unscored || ( l_result.scoreAv() < l_current.scoreAv() ) ) {
      segHypIt->setAlignment ( l_result );
      segHypIt->setBestDocId ( docStructReference.getDocId() );
    }
    if (evalParameters.debugMode) {
      cerr <<"DEBUG tercpp : multiEvaluation::evaluate : testing "<<endl<<" hypothesis : "<< segHypIt->getSegId() <<endl;
      cerr << "hypothesis score : "<< segHypIt->getAlignment().scoreAv() <<endl;
      cerr << "BestDoc Id : "<< segHypIt->getBestDocId() <<endl;
      cerr << "new score : "<< l_result.scoreAv() <<endl;
      cerr << "new BestDoc Id : "<< docStructReference.getDocId() <<endl;
      cerr << endl<<"END DEBUG"<<endl;
    }
  }
  if (evalParameters.debugMode) {
    cerr <<"DEBUG tercpp : multiEvaluation::evaluate : "<<endl<<"End of function"<<endl<<"END DEBUG"<<endl;
  }
}
// Formats the overall TER from two parallel per-segment vectors.
//   numEdits : edit count of each segment.
//   numWords : word count of each segment (same length as numEdits).
// Returns "<ratio> (<edits>/<words>)" followed by a newline, where ratio is
// edits/words, or 1 when there are edits but no words, or 0 when both are
// empty. Terminates the process when the two vectors differ in size.
string multiEvaluation::scoreTER ( vector<float> numEdits, vector<float> numWords )
{
  if ( numWords.size() != numEdits.size() ) {
    cerr << "ERROR : tercpp:score, diffrent size of hyp and ref" << endl;
    exit ( 0 );
  }
  double totalEdits = 0.0;
  double totalWords = 0.0;
  for ( vector<float>::size_type idx = 0; idx < numEdits.size(); ++idx ) {
    totalEdits += numEdits[idx];
    totalWords += numWords[idx];
  }
  double ratio;
  if ( ! ( totalWords <= 0.0 ) ) {
    ratio = totalEdits / totalWords;
  } else if ( totalEdits > 0.0 ) {
    ratio = 1.0;
  } else {
    ratio = 0.0;
  }
  stringstream report;
  report << ratio << " (" << totalEdits << "/" << totalWords << ")" << endl;
  return report.str();
}
// Evaluates the SGML hypothesis document against every SGML reference
// document (looked up by the ids "0", "1", ...), writes a per-segment table
// plus a TOTAL row to "<hypothesisFile>.output.sum.log" and prints the overall
// "Total TER" line on stdout.
// Terminates the process when there are no references or no hypothesis.
void multiEvaluation::launchSGMLEvaluation()
{
  if (evalParameters.debugMode) {
    cerr <<"DEBUG tercpp : multiEvaluation::launchSGMLEvaluation : before testing references and hypothesis size "<<endl<<"END DEBUG"<<endl;
  }
  if ( referencesSGML.getSize() == 0 ) {
    cerr << "ERROR : multiEvaluation::launchSGMLEvaluation : there is no references" << endl;
    exit ( 0 );
  }
  if ( hypothesisSGML.getSize() == 0 ) {
    cerr << "ERROR : multiEvaluation::launchSGMLEvaluation : there is no hypothesis" << endl;
    exit ( 0 );
  }
  if (evalParameters.debugMode) {
    cerr <<"DEBUG tercpp : multiEvaluation::launchSGMLEvaluation : testing references and hypothesis size "<<endl<<" references size : "<< referencesSGML.getSize() << endl << " hypothesis size : "<< hypothesisSGML.getSize() << endl<<"END DEBUG"<<endl;
  }
  int incDocRefences = 0;
  stringstream l_stream;
  vector<float> editsResults;
  vector<float> wordsResults;
  int tot_ins = 0;
  int tot_del = 0;
  int tot_sub = 0;
  int tot_sft = 0;
  int tot_wsf = 0;
  float tot_err = 0;
  float tot_wds = 0;
  ofstream outputSum ( ( evalParameters.hypothesisFile + ".output.sum.log" ).c_str() );
  outputSum << "Hypothesis File: " + evalParameters.hypothesisFile + "\nReference File: " + evalParameters.referenceFile + "\n" + "Ave-Reference File: " << endl;
  // All formatting below is length-bounded (snprintf) so that no row can
  // write past the end of its buffer.
  char outputCharBuffer[200];
  snprintf ( outputCharBuffer, sizeof ( outputCharBuffer ), "%19s | %4s | %4s | %4s | %4s | %4s | %6s | %8s | %8s", "Sent Id", "Ins", "Del", "Sub", "Shft", "WdSh", "NumEr", "AvNumWd", "TER");
  outputSum << outputCharBuffer << endl;
  outputSum << "-------------------------------------------------------------------------------------" << endl;
  for ( incDocRefences = 0; incDocRefences < referencesSGML.getSize(); incDocRefences++ ) {
    l_stream.str ( "" );
    l_stream << incDocRefences;
    // Work on a copy of the reference document, as before.
    documentStructure l_reference = (*(referencesSGML.getDocument ( l_stream.str() )));
    evaluate ( l_reference, hypothesisSGML );
  }
  for ( vector<segmentStructure>::iterator segHypIt = hypothesisSGML.getSegments()->begin(); segHypIt != hypothesisSGML.getSegments()->end(); segHypIt++ ) {
    terAlignment l_result = segHypIt->getAlignment();
    string bestDocId = segHypIt->getBestDocId();
    string l_id=segHypIt->getSegId();
    editsResults.push_back(l_result.numEdits);
    wordsResults.push_back(l_result.averageWords);
    l_result.scoreDetails();
    tot_ins += l_result.numIns;
    tot_del += l_result.numDel;
    tot_sub += l_result.numSub;
    tot_sft += l_result.numSft;
    tot_wsf += l_result.numWsf;
    tot_err += l_result.numEdits;
    tot_wds += l_result.averageWords;
    // "%19s" is only a minimum width: the id comes from the input file and
    // can be arbitrarily long, so the row buffer is sized from it instead of
    // using a fixed 200-byte array.
    const string l_rowId = l_id + ":" + bestDocId;
    vector<char> l_rowBuffer ( l_rowId.size() + 200 );
    snprintf ( &l_rowBuffer[0], l_rowBuffer.size(), "%19s | %4d | %4d | %4d | %4d | %4d | %6.1f | %8.3f | %8.3f", l_rowId.c_str(), l_result.numIns, l_result.numDel, l_result.numSub, l_result.numSft, l_result.numWsf, l_result.numEdits, l_result.averageWords, l_result.scoreAv()*100.0);
    outputSum << &l_rowBuffer[0] << endl;
    if (evalParameters.debugMode) {
      cerr <<"DEBUG tercpp : multiEvaluation::launchSGMLEvaluation : Evaluation "<<endl<< l_result.toString() <<endl<<"END DEBUG"<<endl;
    }
  }
  cout << "Total TER: " << scoreTER ( editsResults, wordsResults );
  char outputCharBufferTmp[200];
  outputSum << "-------------------------------------------------------------------------------------" << endl;
  snprintf ( outputCharBufferTmp, sizeof ( outputCharBufferTmp ), "%19s | %4d | %4d | %4d | %4d | %4d | %6.1f | %8.3f | %8.3f", "TOTAL", tot_ins, tot_del, tot_sub, tot_sft, tot_wsf, tot_err, tot_wds, tot_err*100.0 / tot_wds );
  outputSum << outputCharBufferTmp << endl;
  outputSum.close();
}
void multiEvaluation::addSGMLReferences()
{
xmlStructure refStruct;
refStruct.xmlParams=copyParam(evalParameters);
referencesSGML=refStruct.dump_to_SGMLDocument(evalParameters.referenceFile);
}
void multiEvaluation::setSGMLHypothesis()
{
SGMLDocument sgmlHyp;
xmlStructure hypStruct;
hypStruct.xmlParams=copyParam(evalParameters);
hypStruct.xmlParams.tercomLike=false;
sgmlHyp=hypStruct.dump_to_SGMLDocument(evalParameters.hypothesisFile);
hypothesisSGML=(*(sgmlHyp.getFirstDocument()));
}
}

View File

@ -0,0 +1,44 @@
#ifndef __MULTIEVAL_DOCUMENT_H__
#define __MULTIEVAL_DOCUMENT_H__
#include "multiTxtDocument.h"
#include "tools.h"
#include <iostream>
#include <string>
#include "xmlStructure.h"
#include "sgmlDocument.h"
using namespace Tools;
namespace TERCpp
{
// Drives a TER evaluation of one hypothesis document against a set of
// reference documents, held either as plain-text or as SGML documents.
class multiEvaluation
{
public:
  // Construction and configuration.
  multiEvaluation();
  multiEvaluation ( param p );
  void setParameters ( param p );

  // Plain-text input and evaluation.
  void addReferences();
  void setHypothesis();
  void launchTxtEvaluation();

  // SGML input and evaluation.
  void addSGMLReferences();
  void setSGMLHypothesis();
  void launchSGMLEvaluation();

  // Scores the hypothesis document against one reference document; the
  // hypothesis document is updated in place.
  void evaluate ( documentStructure & docStructReference, documentStructure & docStructhypothesis );
  // Formats the overall score from parallel per-segment edit / word counts.
  string scoreTER ( vector<float> numEdits, vector<float> numWords );

private:
  param evalParameters;              // evaluation settings (file names, flags)
  multiTxtDocument referencesTxt;    // plain-text references
  documentStructure hypothesisTxt;   // plain-text hypothesis
  SGMLDocument referencesSGML;       // SGML references
  documentStructure hypothesisSGML;  // SGML hypothesis
};
}
#endif // __MULTIEVAL_DOCUMENT_H__

View File

@ -0,0 +1,347 @@
#include "multiTxtDocument.h"
// #include <iostream>
// #include <boost/filesystem/fstream.hpp>
// #include <boost/archive/xml_oarchive.hpp>
// #include <boost/archive/xml_iarchive.hpp>
// #include <boost/serialization/nvp.hpp>
// helper functions to allow us to load and save sandwiches to/from xml
namespace TERCpp
{
// Default constructor. It performs no explicit initialisation: the
// assignments below are disabled.
multiTxtDocument::multiTxtDocument()
{
// docType="";
// setId="";
// srcLang="";
// tgtLang="";
}
// multiTxtDocument::multiTxtDocument ( string FileName )
// {
// this=xmlStruct.copy_to_multiTxtDocument(FileName);
// }
// xmlStructure multiTxtDocument::getStructure()
// {
// return xmlStruct;
// }
// string multiTxtDocument::getDocType()
// {
// return docType;
// }
// string multiTxtDocument::getSetId()
// {
// return setId;
// }
// string multiTxtDocument::getSrcLang()
// {
// return srcLang;
// }
// string multiTxtDocument::getTgtLang()
// {
// return tgtLang;
// }
// void multiTxtDocument::setDocType ( string s )
// {
// docType=s;
// }
// void multiTxtDocument::setSetId ( string s )
// {
// setId=s;
// }
// void multiTxtDocument::setSrcLang ( string s )
// {
// srcLang=s;
// }
// void multiTxtDocument::setTgtLang ( string s )
// {
// tgtLang=s;
// }
// Appends a copy of doc to the end of the document list.
void multiTxtDocument::addDocument ( documentStructure doc )
{
  documents.insert ( documents.end(), doc );
}
// Returns a pointer to the most recently added document. The access goes
// through the bounds-checked at(), so an empty list raises std::out_of_range.
documentStructure* multiTxtDocument::getLastDocument()
{
  const int lastIndex = ( int ) documents.size() - 1;
  documentStructure & lastDocument = documents.at ( lastIndex );
  return &lastDocument;
}
// Returns a copy of the whole document list.
vector< documentStructure > multiTxtDocument::getDocuments()
{
  vector< documentStructure > snapshot ( documents );
  return snapshot;
}
// Returns the doc ids of all stored documents, in storage order.
vector< string > multiTxtDocument::getListDocuments()
{
  vector< string > docIds;
  for ( vector< documentStructure >::size_type pos = 0; pos < documents.size(); ++pos ) {
    docIds.push_back ( documents[pos].getDocId() );
  }
  return docIds;
}
// Returns a pointer to the first stored document whose doc id equals docId.
// Prints an error and terminates the process when no document matches.
documentStructure* multiTxtDocument::getDocument ( string docId )
{
  for ( vector< documentStructure >::iterator candidate = documents.begin(); candidate != documents.end(); ++candidate ) {
    if ( candidate->getDocId() == docId ) {
      return & ( *candidate );
    }
  }
  cerr << "ERROR : multiTxtDocument::getDocument : document " << docId << " does not exist !" << endl;
  exit ( 0 );
}
// Reads one text file, one segment per line, and appends it as a new document
// whose doc id is the number of documents stored before the call.
//   fileName   : path of the file to read.
//   caseOn     : when false, every line is lower-cased.
//   noPunct    : when true, punctuation is removed (removePunctTercom when
//                tercomLike is false, removePunct otherwise).
//   debugMode  : only echoed in the debug trace; tracing itself is controlled
//                by multiTxtDocumentParams.debugMode.
//   noTxtIds   : when true, segments are numbered 1, 2, ... in file order;
//                otherwise the id is the trailing "(...)" part of the line.
//   tercomLike : when false, lines are passed through tokenizePunct first.
// Terminates the process when the file cannot be opened or when a line has no
// "(" although ids are expected.
void multiTxtDocument::loadFile ( string fileName, bool caseOn, bool noPunct, bool debugMode, bool noTxtIds, bool tercomLike )
{
  if ( multiTxtDocumentParams.debugMode ) {
    cerr << "DEBUG tercpp : multiTxtDocument::loadFile : loading files " << endl << fileName << endl << "END DEBUG" << endl;
    cerr << "DEBUG tercpp : multiTxtDocument::loadFile : testing params " << endl << Tools::printParams ( multiTxtDocumentParams ) << endl << "END DEBUG" << endl;
    cerr << "DEBUG tercpp : multiTxtDocument::loadFile : testing others params " << endl << "caseOn : " << caseOn << endl << "noPunct : " << noPunct << endl << "debugMode : " << debugMode << endl << "noTxtIds : " << noTxtIds << endl << "tercomLike : " << tercomLike << endl << "END DEBUG" << endl;
  }
  ifstream fichierLoad ( fileName.c_str(), ios::in );
  if ( !fichierLoad ) {
    cerr << "ERROR : multiTxtDocument::loadFile : can't open file : " + fileName + " !" << endl;
    exit ( 0 );
  }
  string line;
  documentStructure l_doc;
  int l_ids = 1;
  stringstream l_stream;
  while ( getline ( fichierLoad, line ) ) {
    string l_key;
    string line_mod;
    l_stream.str ( "" );
    if ( noTxtIds ) {
      l_stream << l_ids;
      l_key = l_stream.str();
      line_mod = line;
      l_ids++;
    } else {
      const string::size_type l_idPos = line.rfind ( "(" );
      if ( l_idPos == string::npos ) {
        cerr << "ERROR : multiTxtDocument::loadFile : Id not found, maybe you should use the --noTxtIds Option ? " << endl;
        exit ( 0 );
      }
      // The key is everything from the last "(" to the end of the line.
      l_key = line.substr ( l_idPos );
      // The text is everything before the "(", minus at most one separating
      // blank. The former "rfind - 1" length unconditionally dropped one
      // character (eating the last letter of "word(id)") and wrapped around
      // when "(" was at column 0, keeping the whole line as text.
      line_mod = line.substr ( 0, l_idPos );
      if ( !line_mod.empty() ) {
        const char l_last = line_mod[line_mod.size() - 1];
        if ( l_last == ' ' || l_last == '\t' ) {
          line_mod.erase ( line_mod.size() - 1 );
        }
      }
    }
    if ( multiTxtDocumentParams.debugMode ) {
      cerr << "DEBUG multiTxtDocument::loadFile : line NOT tokenized |" << line_mod << "|" << endl << "END DEBUG" << endl;
    }
    if ( !tercomLike ) {
      if ( multiTxtDocumentParams.debugMode ) {
        cerr << "DEBUG tercpp : multiTxtDocument::loadFile : " << endl << "TERCOM AT FALSE " << endl << "END DEBUG" << endl;
      }
      line_mod = tokenizePunct ( line_mod );
    }
    if ( !caseOn ) {
      if ( multiTxtDocumentParams.debugMode ) {
        cerr << "DEBUG tercpp : multiTxtDocument::loadFile : " << endl << "CASEON AT FALSE " << endl << "END DEBUG" << endl;
      }
      line_mod = lowerCase ( line_mod );
    }
    if ( noPunct ) {
      if ( multiTxtDocumentParams.debugMode ) {
        cerr << "DEBUG tercpp : multiTxtDocument::loadFile : " << endl << "NOPUNCT AT TRUE " << endl << "END DEBUG" << endl;
      }
      // NOTE(review): the Tercom-named helper is used when tercomLike is
      // false; kept exactly as it was - confirm this pairing is intended.
      if ( !tercomLike ) {
        line_mod = removePunctTercom ( line_mod );
      } else {
        line_mod = removePunct ( line_mod );
      }
    }
    if ( multiTxtDocumentParams.debugMode ) {
      cerr << "DEBUG multiTxtDocument::loadFile : line tokenized |" << line_mod << "|" << endl << "END DEBUG" << endl;
    }
    vector<string> vecDocLine = stringToVector ( line_mod, " " );
    if ( multiTxtDocumentParams.debugMode ) {
      cerr << "DEBUG tercpp multiTxtDocument::loadFile : " << l_key << "|" << vectorToString ( vecDocLine ) << "|" << endl << "Vector Size : " << vecDocLine.size() << endl << "Line length : " << ( int ) line_mod.length() << endl << "END DEBUG" << endl;
    }
    segmentStructure l_seg ( l_key, vecDocLine );
    l_doc.addSegments ( l_seg );
  }
  fichierLoad.close(); // close the file
  // The new document is identified by its position in the document list.
  l_stream.str ( "" );
  l_stream << ( int ) documents.size();
  l_doc.setDocId ( l_stream.str() );
  addDocument ( l_doc );
  if ( multiTxtDocumentParams.debugMode ) {
    cerr << "DEBUG multiTxtDocument::loadFile : document " << l_doc.getDocId() << " added !!!" << endl << "END DEBUG" << endl;
  }
}
// void save_sandwich(const multiTxtDocument &sw, const std::string &file_name);
// multiTxtDocument load_sandwich(const std::string &file_name);
// int callmultiTxtDocument()
// {
// // xml filename
// const std::string fn="JasonsSarnie.xml";
//
// // create a new sandwich and lets take a look at it!
// multiTxtDocument *s = new multiTxtDocument("Granary", "Brie", "Bacon", false); // mmmmm, Brie and bacon! ;)
// std::cout << "Created the following sandwich:" << std::endl;
// s->output();
//
// // Now lets save the sandwich out to an XML file....
// std::cout << std::endl << "Saving the sandwich to xml...." << std::endl;
// save_sandwich(*s, fn);
//
// // And then load it into another multiTxtDocument variable and take a look at what we've got
// std::cout << "Attempting to load the saved sandwich..." << std::endl;
// multiTxtDocument s2 = load_sandwich(fn);
// std::cout << "Contents of loaded multiTxtDocument:" << std::endl;
// s2.output();
//
// delete s;
// std::string dummy;
// std::getline(std::cin, dummy);
//
// }
/*
// Save a multiTxtDocument to XML...
void save_sandwich(const multiTxtDocument &sw, const std::string &file_name)
{
// Create a filestream object
boost::filesystem::fstream ofs(file_name, std::ios::trunc | std::ios::out);
// Now create an XML output file using our filestream
boost::archive::xml_oarchive xml(ofs);
// call serialization::make_nvp, passing our sandwich.
// make_nvp will eventually call the sandwich instance (sw) serialize function
// causing the contents of sw to be output to the xml file
xml << boost::serialization::make_nvp("multiTxtDocument", sw);
}
// The load function works in almost the exact same way as save_sandwich,
// The only differences are:
// 1. we create an XML input stream - the original example in AD's link created another xml_oarchive, causing a runtime error...doh!
// 2. the call to make_nvp populates the sandwich instance(sw) which is then returned...
multiTxtDocument load_sandwich(const std::string &file_name)
{
multiTxtDocument sw;
boost::filesystem::fstream ifs(file_name, std::ios::binary | std::ios::in);
boost::archive::xml_iarchive xml(ifs);
xml >> boost::serialization::make_nvp("multiTxtDocument", sw);
return sw;
}*/
void multiTxtDocument::setAverageLength()
{
if ( multiTxtDocumentParams.debugMode ) {
cerr << "DEBUG tercpp : multiTxtDocument::setAverageLength : Starting calculate Average length " << endl << "END DEBUG" << endl;
}
vecFloat l_avLength((*documents.begin()).getSize(),0.0);
vector< documentStructure >::iterator iter=documents.begin();
// for (vector< documentStructure >::iterator iter=documents.begin(); iter!=documents.end(); iter++)
// {
// string l_id=(*iter).getDocId();
// to_return.push_back(l_id);
vector< segmentStructure > * l_vecSeg=(*iter).getSegments();
// vector< segmentStructure >::iterator iterSeg=l_vecSeg->begin();
for (vector< segmentStructure >::iterator iterSeg=l_vecSeg->begin(); iterSeg!=l_vecSeg->end(); iterSeg++) {
segmentStructure l_seg=(*iterSeg);
// if ( multiTxtDocumentParams.debugMode )
// {
// cerr << "DEBUG tercpp : multiTxtDocument::setAverageLength : Average length: " << l_seg.getAverageLength() << endl << "END DEBUG" << endl;
// }
if (l_seg.getAverageLength()==0.0) {
float l_average=0.0;
for (int l_iter =0; l_iter < (int)documents.size(); l_iter++) {
l_average+=(float)(documents.at(l_iter).getSegment(l_seg.getSegId()))->getSize();
}
l_average=l_average/(float)documents.size();
l_seg.setAverageLength(l_average);
for (iter=documents.begin(); iter!=documents.end(); iter++) {
// if ( multiTxtDocumentParams.debugMode )
// {
// cerr << "DEBUG tercpp : multiTxtDocument::setAverageLength : average length BEFORE assignation: DocId, SegId, Average: " << (*iter).getDocId() << "\t"<< (*iter).getSegment(l_seg.getSegId())->getSegId() << "\t"<< (*iter).getSegment(l_seg.getSegId())->getAverageLength() << endl << "END DEBUG" << endl;
// }
(*iter).getSegment(l_seg.getSegId())->setAverageLength(l_average);
if ( multiTxtDocumentParams.debugMode ) {
cerr << "DEBUG tercpp : multiTxtDocument::setAverageLength : average length AFTER assignation: DocId, SegId, Average: " << (*iter).getDocId() << "\t"<< (*iter).getSegment(l_seg.getSegId())->getSegId() << "\t"<< (*iter).getSegment(l_seg.getSegId())->getAverageLength() << endl << "END DEBUG" << endl;
}
}
}
iter=documents.begin();
// if ( multiTxtDocumentParams.debugMode )
// {
// cerr << "DEBUG tercpp : multiTxtDocument::setAverageLength : average length verification: DocId, SegId, Average: " << (*iter).getDocId() << "\t"<< (*iter).getSegment(l_seg.getSegId())->getSegId() << "\t"<< (*iter).getSegment(l_seg.getSegId())->getAverageLength() << endl << "END DEBUG" << endl;
// }
}
if ( multiTxtDocumentParams.debugMode ) {
cerr << "DEBUG tercpp : multiTxtDocument::setAverageLength : End calculate Average length " << endl << "END DEBUG" << endl;
}
// }
}
// Loads every file named in the comma-separated list fileName (each through
// loadFile with the same flags) and then fills in the average segment lengths.
void multiTxtDocument::loadFiles ( string fileName, bool caseOn, bool noPunct, bool debugMode, bool noTxtIds, bool tercomLike )
{
  if ( multiTxtDocumentParams.debugMode ) {
    cerr << "DEBUG tercpp : multiTxtDocument::loadFiles : loading files " << endl << fileName << endl << "END DEBUG" << endl;
  }
  const vector<string> paths = stringToVector ( fileName, "," );
  for ( vector<string>::const_iterator path = paths.begin(); path != paths.end(); ++path ) {
    loadFile ( *path, caseOn, noPunct, debugMode, noTxtIds, tercomLike );
  }
  setAverageLength();
}
// Stores a copy of p as the current parameters and loads the single reference
// file it names.
void multiTxtDocument::loadRefFile ( param p )
{
  multiTxtDocumentParams = Tools::copyParam ( p );
  const param & cfg = multiTxtDocumentParams;
  if ( cfg.debugMode ) {
    cerr << "DEBUG tercpp : multiTxtDocument::loadRefFile : loading references " << endl << cfg.referenceFile << endl << "END DEBUG" << endl;
  }
  loadFile ( cfg.referenceFile, cfg.caseOn, cfg.noPunct, cfg.debugMode, cfg.noTxtIds, cfg.tercomLike );
}
// Stores a copy of p as the current parameters and loads the reference files
// it names (referenceFile is handed to loadFiles, which splits it on commas).
void multiTxtDocument::loadRefFiles ( param p )
{
  multiTxtDocumentParams = Tools::copyParam ( p );
  const param & cfg = multiTxtDocumentParams;
  if ( cfg.debugMode ) {
    cerr << "DEBUG tercpp : multiTxtDocument::loadRefFiles : loading references " << endl << cfg.referenceFile << endl << "END DEBUG" << endl;
  }
  loadFiles ( cfg.referenceFile, cfg.caseOn, cfg.noPunct, cfg.debugMode, cfg.noTxtIds, cfg.tercomLike );
}
// Stores a copy of p as the current parameters, forces tercomLike to false
// and loads the single hypothesis file it names.
void multiTxtDocument::loadHypFile ( param p )
{
  multiTxtDocumentParams = Tools::copyParam ( p );
  multiTxtDocumentParams.tercomLike = false;
  const param & cfg = multiTxtDocumentParams;
  if ( cfg.debugMode ) {
    cerr << "DEBUG tercpp : multiTxtDocument::loadHypFile : loading hypothesis " << endl << cfg.hypothesisFile << endl << "END DEBUG" << endl;
  }
  loadFile ( cfg.hypothesisFile, cfg.caseOn, cfg.noPunct, cfg.debugMode, cfg.noTxtIds, cfg.tercomLike );
}
// Stores a copy of p as the current parameters, forces tercomLike to false
// and loads the hypothesis file it names.
// NOTE(review): despite the plural name this calls loadFile, not loadFiles,
// so hypothesisFile is treated as one path - confirm this is intended.
void multiTxtDocument::loadHypFiles ( param p )
{
  multiTxtDocumentParams = Tools::copyParam ( p );
  multiTxtDocumentParams.tercomLike = false;
  const param & cfg = multiTxtDocumentParams;
  if ( cfg.debugMode ) {
    cerr << "DEBUG tercpp : multiTxtDocument::loadHypFiles : loading hypothesis " << endl << cfg.hypothesisFile << endl << "END DEBUG" << endl;
  }
  loadFile ( cfg.hypothesisFile, cfg.caseOn, cfg.noPunct, cfg.debugMode, cfg.noTxtIds, cfg.tercomLike );
}
// Returns the number of stored documents.
int multiTxtDocument::getSize()
{
  return static_cast<int> ( documents.size() );
}
}

View File

@ -0,0 +1,81 @@
#ifndef __MULTITXT_DOCUMENT_H__
#define __MULTITXT_DOCUMENT_H__
#include "documentStructure.h"
#include "tools.h"
// #include "xmlStructure.h"
#include <iostream>
#include <string>
namespace TERCpp
{
// An ordered collection of documentStructure objects loaded from text files,
// together with the parameters used for the most recent load.
class multiTxtDocument
{
public:
  multiTxtDocument();

  // Document storage and lookup.
  void addDocument ( documentStructure doc );
  documentStructure* getLastDocument();
  documentStructure* getDocument ( string docId );
  vector<documentStructure> getDocuments ();
  vector<string> getListDocuments ();
  int getSize();

  // Loading from explicit flags: one file, or several files in one string.
  void loadFile ( string fileName, bool caseOn, bool noPunct, bool debugMode, bool noTxtIds, bool tercomLike );
  void loadFiles ( string fileName, bool caseOn, bool noPunct, bool debugMode, bool noTxtIds, bool tercomLike );

  // Loading driven by a parameter set (reference or hypothesis file).
  void loadRefFile ( param p );
  void loadRefFiles ( param p );
  void loadHypFile ( param p );
  void loadHypFiles ( param p );

  // Fills in the per-segment average length across the stored documents.
  void setAverageLength();

private:
  param multiTxtDocumentParams;
  vector<documentStructure> documents;
};
}
#endif // __MULTITXT_DOCUMENT_H__

View File

@ -0,0 +1,82 @@
#include "segmentStructure.h"
using namespace std;
namespace TERCpp
{
// Returns a copy of the segment's content vector.
vecString segmentStructure::getContent()
{
  return this->content;
}
// Returns the segment id.
string segmentStructure::getSegId()
{
  return this->segId;
}
// Placeholder: always returns an empty string.
string segmentStructure::toString()
{
  return string();
}
// Replaces the content with vecS and resets the average length to 0.
void segmentStructure::addContent ( vecString vecS )
{
  averageLength = 0.0;
  content = vecS;
}
// Sets the segment id.
void segmentStructure::setSegId ( string s )
{
  this->segId = s;
}
// Builds a segment from an id and an already split content vector; the
// average length starts at 0.
segmentStructure::segmentStructure ( string id, vecString vecS )
  : segId ( id ), content ( vecS ), averageLength ( 0.0 )
{
}
// Builds a segment from an id and a text line, which is split on single
// spaces; the average length starts at 0.
segmentStructure::segmentStructure ( string id, string txt )
  : segId ( id ), content ( stringToVector ( txt, " " ) ), averageLength ( 0.0 )
{
}
// Replaces the content with s split on single spaces and resets the average
// length to 0.
void segmentStructure::addContent ( string s )
{
  averageLength = 0.0;
  content = stringToVector ( s, " " );
}
// Builds an empty segment with an empty id.
segmentStructure::segmentStructure()
{
  segId = "";
  // Start from the same defined value as the other constructors: the average
  // length is read through getAverageLength() and compared with 0.0, and was
  // previously left uninitialised here.
  averageLength = 0.0;
}
// Returns a copy of the alignment stored for this segment.
terAlignment segmentStructure::getAlignment()
{
  return this->evaluation;
}
// Stores a copy of l_align as this segment's alignment.
void segmentStructure::setAlignment ( terAlignment& l_align )
{
  this->evaluation = l_align;
}
// Returns the stored best-document id.
string segmentStructure::getBestDocId()
{
  return this->bestDocId;
}
// Sets the best-document id.
void segmentStructure::setBestDocId ( string s )
{
  this->bestDocId = s;
}
float segmentStructure::getAverageLength()
{
return averageLength;
}
void segmentStructure::setAverageLength(float f)
{
averageLength=f;
}
// Returns the number of entries in the content vector.
int segmentStructure::getSize()
{
  return static_cast<int> ( content.size() );
}
}

View File

@ -0,0 +1,73 @@
#ifndef __SEGMENTSTRUCTURE_H__
#define __SEGMENTSTRUCTURE_H__
#include <vector>
#include <stdio.h>
#include <string>
#include <sstream>
#include "tools.h"
#include "tercalc.h"
using namespace std;
using namespace Tools;
namespace TERCpp
{
// One segment of a document: an id, its content as a vector of strings, the
// alignment stored for it, the id of the best document and an average length.
class segmentStructure
{
private:
  string segId;
  vecString content;
  terAlignment evaluation;
  string bestDocId;
  float averageLength;

public:
  // Construction.
  segmentStructure();
  segmentStructure ( string id, vecString vecS );
  segmentStructure ( string id, string txt );

  // Identification and content.
  string getSegId();
  void setSegId ( string s );
  vecString getContent();
  void addContent ( vecString vecS );
  void addContent ( string s );
  int getSize();

  // Evaluation state.
  float getAverageLength();
  void setAverageLength(float f);
  terAlignment getAlignment();
  void setAlignment(terAlignment& l_align);
  string getBestDocId();
  void setBestDocId ( string s );

  string toString();

  vector<string> nwords; // The words we shifted
  vector<char> alignment ; // for pra_more output
  vector<vecInt> aftershift; // for pra_more output
  // This is used to store the cost of a shift, so we don't have to
  // calculate it multiple times.
  double cost;
};
}
#endif

149
mert/TERsrc/sgmlDocument.cpp Executable file
View File

@ -0,0 +1,149 @@
#include "sgmlDocument.h"
// #include <iostream>
// #include <boost/filesystem/fstream.hpp>
// #include <boost/archive/xml_oarchive.hpp>
// #include <boost/archive/xml_iarchive.hpp>
// #include <boost/serialization/nvp.hpp>
// helper functions to allow us to load and save sandwiches to/from xml
namespace TERCpp
{
// Default constructor: all four descriptive fields start out empty.
SGMLDocument::SGMLDocument()
{
  docType = setId = srcLang = tgtLang = "";
}
// SGMLDocument::SGMLDocument ( string FileName )
// {
// this=xmlStruct.copy_to_SGMLDocument(FileName);
// }
// xmlStructure SGMLDocument::getStructure()
// {
// return xmlStruct;
// }
// Returns the stored document type.
string SGMLDocument::getDocType()
{
  return this->docType;
}
// Returns the stored set id.
string SGMLDocument::getSetId()
{
  return this->setId;
}
// Returns the stored source language.
string SGMLDocument::getSrcLang()
{
  return this->srcLang;
}
// Returns the stored target language.
string SGMLDocument::getTgtLang()
{
  return this->tgtLang;
}
// Sets the document type.
void SGMLDocument::setDocType ( string s )
{
  this->docType = s;
}
// Sets the set id.
void SGMLDocument::setSetId ( string s )
{
  this->setId = s;
}
// Sets the source language.
void SGMLDocument::setSrcLang ( string s )
{
  this->srcLang = s;
}
// Sets the target language.
void SGMLDocument::setTgtLang ( string s )
{
  this->tgtLang = s;
}
// Appends a copy of doc to the end of the document list.
void SGMLDocument::addDocument ( documentStructure doc )
{
  this->documents.push_back ( doc );
}
// Returns a pointer to the most recently added document; the access goes
// through at().
documentStructure* SGMLDocument::getLastDocument()
{
  const int lastIndex = ( int ) documents.size() - 1;
  return & ( documents.at ( lastIndex ) );
}
// Returns a pointer to the first stored document; the access goes through
// at().
documentStructure* SGMLDocument::getFirstDocument()
{
  const int firstIndex = 0;
  return & ( documents.at ( firstIndex ) );
}
// Returns the number of stored documents.
int SGMLDocument::getSize()
{
  return static_cast<int> ( documents.size() );
}
// Returns a pointer to the first stored document whose doc id equals docId.
// Prints an error and terminates the process when no document matches.
documentStructure* SGMLDocument::getDocument(string docId)
{
  const int total = ( int ) documents.size();
  int pos = 0;
  while ( pos < total ) {
    if ( documents.at ( pos ).getDocId() == docId ) {
      return & ( documents.at ( pos ) );
    }
    ++pos;
  }
  cerr << "ERROR : SGMLDocument::getDocument : document " << docId << " does not exist !" << endl;
  exit ( 0 );
}
// void save_sandwich(const SGMLDocument &sw, const std::string &file_name);
// SGMLDocument load_sandwich(const std::string &file_name);
// int callSGMLDocument()
// {
// // xml filename
// const std::string fn="JasonsSarnie.xml";
//
// // create a new sandwich and lets take a look at it!
// SGMLDocument *s = new SGMLDocument("Granary", "Brie", "Bacon", false); // mmmmm, Brie and bacon! ;)
// std::cout << "Created the following sandwich:" << std::endl;
// s->output();
//
// // Now lets save the sandwich out to an XML file....
// std::cout << std::endl << "Saving the sandwich to xml...." << std::endl;
// save_sandwich(*s, fn);
//
// // And then load it into another SGMLDocument variable and take a look at what we've got
// std::cout << "Attempting to load the saved sandwich..." << std::endl;
// SGMLDocument s2 = load_sandwich(fn);
// std::cout << "Contents of loaded SGMLDocument:" << std::endl;
// s2.output();
//
// delete s;
// std::string dummy;
// std::getline(std::cin, dummy);
//
// }
/*
// Save a SGMLDocument to XML...
void save_sandwich(const SGMLDocument &sw, const std::string &file_name)
{
// Create a filestream object
boost::filesystem::fstream ofs(file_name, std::ios::trunc | std::ios::out);
// Now create an XML output file using our filestream
boost::archive::xml_oarchive xml(ofs);
// call serialization::make_nvp, passing our sandwich.
// make_nvp will eventually call the sandwich instance (sw) serialize function
// causing the contents of sw to be output to the xml file
xml << boost::serialization::make_nvp("SGMLDocument", sw);
}
// The load function works in almost the exact same way as save_sandwich,
// The only differences are:
// 1. we create an XML input stream - the original example in AD's link created another xml_oarchive, causing a runtime error...doh!
// 2. the call to make_nvp populates the sandwich instance(sw) which is then returned...
SGMLDocument load_sandwich(const std::string &file_name)
{
SGMLDocument sw;
boost::filesystem::fstream ifs(file_name, std::ios::binary | std::ios::in);
boost::archive::xml_iarchive xml(ifs);
xml >> boost::serialization::make_nvp("SGMLDocument", sw);
return sw;
}*/
}

69
mert/TERsrc/sgmlDocument.h Executable file
View File

@ -0,0 +1,69 @@
#ifndef __SGML_DOCUMENT_H__
#define __SGML_DOCUMENT_H__
#include "documentStructure.h"
// #include "xmlStructure.h"
#include <iostream>
#include <string>
namespace TERCpp
{
// SGMLDocument: holds four set-level string attributes (document type, set id,
// source language, target language) and an ordered list of documentStructure
// entries that can be appended to and looked up.
class SGMLDocument
{
public:
// Creates an empty set with blank attributes.
SGMLDocument();
// SGMLDocument(string FileName);
// Attribute setters: each stores a copy of s.
void setDocType ( string s );
void setSetId ( string s );
void setSrcLang ( string s );
void setTgtLang ( string s );
// Attribute getters: each returns a copy of the stored value.
string getDocType();
string getSetId();
string getSrcLang();
string getTgtLang();
// xmlStructure getStructure();
// Appends a copy of doc to the document list.
void addDocument ( documentStructure doc );
// Pointers into the internal vector; they become invalid once the vector
// reallocates (e.g. after a later addDocument call).
documentStructure* getLastDocument();
documentStructure* getFirstDocument();
// Number of stored documents.
int getSize();
// Finds the document whose id equals docId; the implementation terminates
// the process when no such document exists.
documentStructure* getDocument(string docId);
private:
string docType;
string setId;
string srcLang;
string tgtLang;
// xmlStructure xmlStruct;
// Documents in insertion order.
vector<documentStructure> documents;
};
}
#endif // __SGML_DOCUMENT_H__

View File

@ -0,0 +1,35 @@
#include "stringHasher.h"
// stringHasher is a hash-table entry: a precomputed hash key plus the key text and its string value
using namespace std;
namespace HashMapSpace
{
// Builds an entry from an already-computed hash (cle), the key text and the value text.
stringHasher::stringHasher ( long cle, string cleTxt, string valueTxt )
    : m_hashKey ( cle ), m_key ( cleTxt ), m_value ( valueTxt )
{
}

// Returns the numeric hash stored at construction time.
long stringHasher::getHashKey()
{
    return m_hashKey;
}

// Returns a copy of the key text.
string stringHasher::getKey()
{
    return m_key;
}

// Returns a copy of the current value.
string stringHasher::getValue()
{
    return m_value;
}

// Overwrites the value; the hash and the key text are left untouched.
void stringHasher::setValue ( string value )
{
    m_value = value;
}
}

View File

@ -0,0 +1,30 @@
#ifndef __STRINGHASHER_H__
#define __STRINGHASHER_H__
#include <string>
//#include <ext/hash_map>
#include <iostream>
using namespace std;
namespace HashMapSpace
{
// One hash-table entry mapping a string key to a string value.
// The numeric hash is supplied by the caller; this class only stores it.
class stringHasher
{
private:
long m_hashKey;   // hash supplied at construction
string m_key;     // key text
string m_value;   // value associated with the key
public:
// cle: precomputed hash, cleTxt: key text, valueTxt: initial value.
stringHasher ( long cle, string cleTxt, string valueTxt );
long getHashKey();
string getKey();
string getValue();
// Replaces the stored value only.
void setValue ( string value );
};
}
#endif

View File

@ -0,0 +1,35 @@
#include "stringInfosHasher.h"
// stringInfosHasher is a hash-table entry: a precomputed hash key plus the key text and a vector<string> value
using namespace std;
namespace HashMapSpace
{
// Builds an entry from an already-computed hash (cle), the key text and the list of strings.
stringInfosHasher::stringInfosHasher ( long cle, string cleTxt, vector<string> valueVecInt )
    : m_hashKey ( cle ), m_key ( cleTxt ), m_value ( valueVecInt )
{
}

// Returns the numeric hash stored at construction time.
long stringInfosHasher::getHashKey()
{
    return m_hashKey;
}

// Returns a copy of the key text.
string stringInfosHasher::getKey()
{
    return m_key;
}

// Returns a copy of the stored list of strings.
vector<string> stringInfosHasher::getValue()
{
    return m_value;
}

// Overwrites the stored list; the hash and the key text are left untouched.
void stringInfosHasher::setValue ( vector<string> value )
{
    m_value = value;
}
}

View File

@ -0,0 +1,30 @@
#ifndef __STRINGINFOSHASHER_H__
#define __STRINGINFOSHASHER_H__
#include <string>
// #include <ext/hash_map>
#include <iostream>
#include <vector>
using namespace std;
namespace HashMapSpace
{
// One hash-table entry mapping a string key to a vector of strings.
// The numeric hash is supplied by the caller; this class only stores it.
class stringInfosHasher
{
private:
long m_hashKey;           // hash supplied at construction
string m_key;             // key text
vector<string> m_value;   // values associated with the key
public:
// cle: precomputed hash, cleTxt: key text, valueVecInt: initial list of values.
stringInfosHasher ( long cle, string cleTxt, vector<string> valueVecInt );
long getHashKey();
string getKey();
vector<string> getValue();
// Replaces the stored list only.
void setValue ( vector<string> value );
};
}
#endif

View File

@ -0,0 +1,131 @@
#include "terAlignment.h"
using namespace std;
namespace TERCpp
{
/// Creates an empty alignment: all counters are zero and the vectors are empty.
/// averageWords is zeroed as well; it is read by scoreAv() and toString(), so
/// leaving it uninitialised would make those calls read an indeterminate value.
terAlignment::terAlignment()
    : numEdits ( 0 ), numWords ( 0 ), averageWords ( 0 ), bestRef ( "" ),
      numIns ( 0 ), numDel ( 0 ), numSub ( 0 ), numSft ( 0 ), numWsf ( 0 )
{
}
/// Renders the alignment as multi-line text: reference, hypothesis, shifted
/// hypothesis, the alignment operations (only when there are any), the list of
/// shifts and the averaged score.
/// @return the formatted description (no trailing newline).
string terAlignment::toString()
{
    stringstream s;
    s << "Original Ref: " << join ( " ", ref ) << endl;
    s << "Original Hyp: " << join ( " ", hyp ) << endl;
    s << "Hyp After Shift: " << join ( " ", aftershift ) << endl;
    // The previous test was sizeof(alignment) > 0, which is always true for a
    // vector object; what matters is whether it holds any operations.
    if ( !alignment.empty() ) {
        s << "Alignment: (";
        for ( int i = 0; i < ( int ) alignment.size(); ++i ) {
            s << alignment[i];
        }
        s << ")" << endl;
    }
    if ( allshifts.empty() ) {
        s << "NumShifts: 0";
    } else {
        s << "NumShifts: " << ( int ) allshifts.size();
        for ( int i = 0; i < ( int ) allshifts.size(); ++i ) {
            s << endl << " ";
            s << allshifts[i].toString();
        }
    }
    s << endl << "Score: " << scoreAv() << " (" << numEdits << "/" << averageWords << ")";
    return s.str();
}
/// Concatenates the elements of arr, inserting delim between consecutive
/// elements (never before the first or after the last).
/// @return the joined text; empty when arr is empty.
string terAlignment::join ( string delim, vector<string> arr )
{
    stringstream joined;
    for ( size_t idx = 0; idx < arr.size(); ++idx ) {
        if ( idx != 0 ) {
            joined << delim;
        }
        joined << arr[idx];
    }
    return joined.str();
}
/// Edit rate against numWords.
/// With no words to normalise by, the result is 1.0 if any edit was made and
/// 0.0 otherwise; in all other cases it is numEdits / numWords.
double terAlignment::score()
{
    if ( numWords <= 0.0 ) {
        return ( numEdits > 0.0 ) ? 1.0 : 0.0;
    }
    return ( double ) numEdits / numWords;
}
/// Edit rate against averageWords.
/// With a non-positive averageWords, the result is 1.0 if any edit was made
/// and 0.0 otherwise; in all other cases it is numEdits / averageWords.
double terAlignment::scoreAv()
{
    if ( averageWords <= 0.0 ) {
        return ( numEdits > 0.0 ) ? 1.0 : 0.0;
    }
    return ( double ) numEdits / averageWords;
}
void terAlignment::scoreDetails()
{
numIns = numDel = numSub = numWsf = numSft = 0;
if((int)allshifts.size()>0) {
for(int i = 0; i < (int)allshifts.size(); ++i) {
numWsf += allshifts[i].size();
}
numSft = allshifts.size();
}
if((int)alignment.size()>0 ) {
for(int i = 0; i < (int)alignment.size(); ++i) {
switch (alignment[i]) {
case 'S':
case 'T':
numSub++;
break;
case 'D':
numDel++;
break;
case 'I':
numIns++;
break;
}
}
}
// if(numEdits != numSft + numDel + numIns + numSub)
// System.out.println("** Error, unmatch edit erros " + numEdits +
// " vs " + (numSft + numDel + numIns + numSub));
}
}

View File

@ -0,0 +1,51 @@
#ifndef __TERCPPTERALIGNMENT_H__
#define __TERCPPTERALIGNMENT_H__
#include <vector>
#include <stdio.h>
#include <string.h>
#include "tools.h"
#include "terShift.h"
using namespace std;
// using namespace HashMapSpace;
namespace TERCpp
{
// Result of aligning a hypothesis against a reference: the word sequences,
// the applied shifts, the per-position alignment operations and edit counters.
// All data members are public and are filled in by the code that builds the alignment.
class terAlignment
{
private:
public:
terAlignment();
// Multi-line human-readable dump of the alignment and its averaged score.
string toString();
// Recomputes numIns/numDel/numSub/numSft/numWsf from alignment and allshifts.
void scoreDetails();
vector<string> ref;          // reference words
vector<string> hyp;          // hypothesis words
vector<string> aftershift;   // hypothesis words after the shifts were applied
vector<terShift> allshifts;  // shifts applied to the hypothesis
double numEdits;             // numerator of score() and scoreAv()
double numWords;             // denominator of score()
double averageWords;         // denominator of scoreAv()
// One character per aligned position; scoreDetails() counts 'S'/'T' as
// substitutions, 'D' as deletions and 'I' as insertions.
vector<char> alignment;
string bestRef;
int numIns;
int numDel;
int numSub;
int numSft;   // number of shifts
int numWsf;   // sum of the sizes of all shifts
// Joins arr with delim between consecutive elements.
string join ( string delim, vector<string> arr );
// numEdits / numWords, with 1.0 or 0.0 when numWords is not positive.
double score();
// numEdits / averageWords, with 1.0 or 0.0 when averageWords is not positive.
double scoreAv();
};
}
#endif

View File

@ -0,0 +1,40 @@
/*
* xmlStructure: helpers that dump a TinyXML tree to stdout or copy it into an SGMLDocument
*/
#ifndef __XMLSTRUCTURE_H__
#define __XMLSTRUCTURE_H__
#include "sgmlDocument.h"
#include "documentStructure.h"
#include "stdio.h"
#include <iostream>
#include <string>
#include "tinyxml.h"
using namespace std;
namespace TERCpp
{
// Declares helpers operating on TinyXML nodes: printing a tree to stdout and
// filling an SGMLDocument from a tree or from a file name.
// NOTE(review): the implementations are not visible from this header, so the
// comments below describe the declarations only.
class xmlStructure
{
private:
unsigned int NUM_INDENTS_PER_SPACE;
// void dump_attribs_to_SGMLDocuments ( SGMLDocument* arg1, const TiXmlElement* arg2 );
// Presumably copies the attributes of pElement into sgmlDoc - TODO confirm in the .cpp.
void dump_attribs_to_SGMLDocuments ( SGMLDocument* sgmlDoc, TiXmlElement* pElement, unsigned int indent );
public:
xmlStructure();
// Return an indentation string for the given depth; ownership of the
// returned pointer is not documented here - verify before storing it.
const char * getIndent( unsigned int numIndents );
const char * getIndentAlt( unsigned int numIndents );
int dump_attribs_to_stdout(TiXmlElement* pElement, unsigned int indent);
void dump_to_stdout( TiXmlNode* pParent, unsigned int indent );
void dump_to_stdout(const char* pFilename);
void copy_to_SGMLDocument(SGMLDocument* sgmlDoc ,TiXmlNode* pParent, unsigned int indent );
SGMLDocument dump_to_SGMLDocument(string FileName);
};
}
#endif

100
mert/TERsrc/terShift.cpp Normal file
View File

@ -0,0 +1,100 @@
#include "terShift.h"
using namespace std;
namespace TERCpp
{
// terShift::terShift()
// {
// // vector<string> ref;
// // vector<string> hyp;
// // vector<string> aftershift;
//
// // terShift[] allshifts = null;
//
// numEdits=0;
// numWords=0;
// bestRef="";
//
// numIns=0;
// numDel=0;
// numSub=0;
// numSft=0;
// numWsf=0;
// }
/// Default shift: all positions are zero and the cost is 1.0.
terShift::terShift ()
    : start ( 0 ), end ( 0 ), moveto ( 0 ), newloc ( 0 ), cost ( 1.0 )
{
}
/// Shift described by its positions only; the cost is 1.0 and no words are recorded.
terShift::terShift ( int _start, int _end, int _moveto, int _newloc )
    : start ( _start ), end ( _end ), moveto ( _moveto ), newloc ( _newloc ), cost ( 1.0 )
{
}
/// Shift described by its positions plus a copy of the shifted words; the cost is 1.0.
terShift::terShift ( int _start, int _end, int _moveto, int _newloc, vector<string> _shifted )
    : start ( _start ), end ( _end ), moveto ( _moveto ), newloc ( _newloc ),
      shifted ( _shifted ), cost ( 1.0 )
{
}
// string terShift::vectorToString(vector<string> vec)
// {
// string retour("");
// for (vector<string>::iterator vecIter=vec.begin();vecIter!=vec.end(); vecIter++)
// {
// retour+=(*vecIter)+"\t";
// }
// return retour;
// }
/// Formats the shift as "[start, end, moveto/newloc]", followed by the shifted
/// words in parentheses when any were recorded.
string terShift::toString()
{
    stringstream out;
    out << "[" << start << ", " << end << ", " << moveto << "/" << newloc << "]";
    if ( !shifted.empty() ) {
        out << " (" << vectorToString ( shifted ) << ")";
    }
    return out.str();
}
/* The distance of the shift. */
/// Distance covered by the shift: measured from start when moving left of the
/// span, from end when moving right of it, and from start when the target
/// lies inside the span.
int terShift::distance()
{
    if ( moveto < start ) {
        return start - moveto;
    }
    if ( moveto > end ) {
        return moveto - end;
    }
    return moveto - start;
}
/// True when the target position lies before the start of the shifted span.
bool terShift::leftShift()
{
    const bool movesLeft = moveto < start;
    return movesLeft;
}
/// Number of positions in the shifted span; both start and end are inclusive.
int terShift::size()
{
    const int span = end - start;
    return span + 1;
}
// terShift terShift::operator=(terShift t)
// {
//
// return t;
// }
}

45
mert/TERsrc/terShift.h Normal file
View File

@ -0,0 +1,45 @@
#ifndef __TERCPPTERSHIFT_H__
#define __TERCPPTERSHIFT_H__
#include <vector>
#include <stdio.h>
#include <string>
#include <sstream>
#include "tools.h"
using namespace std;
using namespace Tools;
namespace TERCpp
{
// Describes one shift: the inclusive span [start, end] and the positions
// moveto/newloc it is moved to, plus optional bookkeeping used for output.
// All data members are public.
class terShift
{
private:
public:
// All positions zero, cost 1.0.
terShift();
terShift ( int _start, int _end, int _moveto, int _newloc );
terShift ( int _start, int _end, int _moveto, int _newloc, vector<string> _shifted );
// "[start, end, moveto/newloc]" optionally followed by the shifted words.
string toString();
// Distance between the span and moveto.
int distance() ;
// True when moveto < start.
bool leftShift();
// end - start + 1.
int size();
// terShift operator=(terShift t);
// string vectorToString(vector<string> vec);
int start;    // first position of the span (inclusive)
int end;      // last position of the span (inclusive)
int moveto;
int newloc;
vector<string> shifted; // The words we shifted
vector<char> alignment ; // for pra_more output
vector<string> aftershift; // for pra_more output
// This is used to store the cost of a shift, so we don't have to
// calculate it multiple times.
double cost;
};
}
#endif

1035
mert/TERsrc/tercalc.cpp Normal file

File diff suppressed because it is too large Load Diff

82
mert/TERsrc/tercalc.h Normal file
View File

@ -0,0 +1,82 @@
#ifndef _TERCPPTERCALC_H___
#define _TERCPPTERCALC_H___
#include <vector>
#include <stdio.h>
#include <string.h>
#include <sstream>
#include "hashMap.h"
#include "hashMapInfos.h"
#include "hashMapStringInfos.h"
#include "terAlignment.h"
#include "tools.h"
#include "terShift.h"
#include "alignmentStruct.h"
#include "bestShiftStruct.h"
using namespace std;
using namespace Tools;
using namespace HashMapSpace;
namespace TERCpp
{
// typedef size_t WERelement[2];
// Alignment vector containing the hash of each word and its evaluation (0=ok, 1=sub, 2=ins, 3=del)
typedef vector<terShift> vecTerShift;
/**
@author
*/
// Declares the TER computation: edit-distance alignment, shift search and the
// cost parameters that drive them.
// NOTE(review): the implementation (tercalc.cpp) is not visible from this
// header, so the comments below describe the declarations only.
class terCalc
{
private :
// Alignment vector containing the hash of each word and its evaluation (0=ok, 1=sub, 2=ins, 3=del)
WERalignment l_WERalignment;
// Hash map containing the hash values of each word
hashMap bagOfWords;
int MAX_SHIFT_SIZE;
/* Variables for some internal counting. */
int NUM_SEGMENTS_SCORED;
int NUM_SHIFTS_CONSIDERED;
int NUM_BEAM_SEARCH_CALLS;
int MAX_SHIFT_DIST;
bool PRINT_DEBUG;
/* These are resized by the MIN_EDIT_DIST code if they aren't big enough */
// NOTE(review): as declared here these are fixed 1000x1000 arrays stored in
// every terCalc object (about 9 MB), so they cannot actually be resized;
// confirm that MinEditDist guards against inputs longer than 1000 tokens and
// that terCalc instances are not created on a small stack.
double S[1000][1000];
char P[1000][1000];
vector<vecInt> refSpans;
vector<vecInt> hypSpans;
int BEAM_WIDTH;
public:
// Per-operation costs; presumably consumed by the edit-distance code - TODO confirm.
int shift_cost;
int insert_cost;
int delete_cost;
int substitute_cost;
int match_cost;
double INF;
terCalc();
// ~terCalc();
// size_t* hashVec ( vector<string> s );
void setDebugMode ( bool b );
int WERCalculation ( size_t * ref, size_t * hyp );
int WERCalculation ( vector<string> ref, vector<string> hyp );
int WERCalculation ( vector<int> ref, vector<int> hyp );
// string vectorToString(vector<string> vec);
// vector<string> subVector(vector<string> vec, int start, int end);
hashMapInfos BuildWordMatches ( vector<string> hyp, vector<string> ref );
terAlignment MinEditDist ( vector<string> hyp, vector<string> ref, vector<vecInt> curHypSpans );
bool spanIntersection ( vecInt refSpan, vecInt hypSpan );
// TER entry points; note the argument order is (hyp, ref).
terAlignment TER ( vector<string> hyp, vector<string> ref , float avRefLength );
terAlignment TER ( vector<string> hyp, vector<string> ref );
terAlignment TER ( vector<int> hyp, vector<int> ref );
bestShiftStruct CalcBestShift ( vector<string> cur, vector<string> hyp, vector<string> ref, hashMapInfos rloc, terAlignment cur_align );
void FindAlignErr ( terAlignment align, bool* herr, bool* rerr, int* ralign );
vector<vecTerShift> GatherAllPossShifts ( vector<string> hyp, vector<string> ref, hashMapInfos rloc, terAlignment align, bool* herr, bool* rerr, int* ralign );
alignmentStruct PerformShift ( vector<string> words, terShift s );
alignmentStruct PerformShift ( vector<string> words, int start, int end, int newloc );
};
}
#endif

111
mert/TERsrc/tinystr.cpp Normal file
View File

@ -0,0 +1,111 @@
/*
www.sourceforge.net/projects/tinyxml
Original file by Yves Berquin.
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any
damages arising from the use of this software.
Permission is granted to anyone to use this software for any
purpose, including commercial applications, and to alter it and
redistribute it freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must
not claim that you wrote the original software. If you use this
software in a product, an acknowledgment in the product documentation
would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and
must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source
distribution.
*/
/*
* THIS FILE WAS ALTERED BY Tyge Løvset, 7. April 2005.
*/
#ifndef TIXML_USE_STL
#include "tinystr.h"
// Error value for find primitive
const TiXmlString::size_type TiXmlString::npos = static_cast< TiXmlString::size_type >(-1);
// Null rep.
TiXmlString::Rep TiXmlString::nullrep_ = { 0, 0, { '\0' } };
// Grows the buffer to hold at least cap characters, keeping the current
// content. Requests that do not exceed the current capacity are ignored.
void TiXmlString::reserve (size_type cap)
{
    if (cap <= capacity()) {
        return;
    }
    // Build the larger buffer in a temporary, copy the content over, then
    // trade buffers; the temporary's destructor releases the old one.
    TiXmlString grown;
    grown.init(length(), cap);
    memcpy(grown.start(), data(), length());
    swap(grown);
}
// Replaces the content with the first len characters of str.
// The existing buffer is reused when it is large enough but not more than
// 3*(len + 8); otherwise a fresh buffer of exactly len is allocated.
TiXmlString& TiXmlString::assign(const char* str, size_type len)
{
    const size_type cap = capacity();
    const bool reuseBuffer = (len <= cap) && (cap <= 3*(len + 8));
    if (reuseBuffer) {
        // memmove: str may point into this string's own buffer.
        memmove(start(), str, len);
        set_size(len);
        return *this;
    }
    TiXmlString fresh;
    fresh.init(len);
    memcpy(fresh.start(), str, len);
    swap(fresh);
    return *this;
}
// Appends the first len characters of str, growing the buffer when needed.
// On growth the new capacity is the required size plus the old capacity.
TiXmlString& TiXmlString::append(const char* str, size_type len)
{
    const size_type required = length() + len;
    if (required > capacity()) {
        reserve (required + capacity());
    }
    memmove(finish(), str, len);
    set_size(required);
    return *this;
}
// Concatenation of two TiXmlStrings; the result is sized up front.
TiXmlString operator + (const TiXmlString & a, const TiXmlString & b)
{
    TiXmlString joined;
    joined.reserve(a.length() + b.length());
    joined.append(a.data(), a.length());
    joined.append(b.data(), b.length());
    return joined;
}
// Concatenation of a TiXmlString and a null-terminated C string.
TiXmlString operator + (const TiXmlString & a, const char* b)
{
    const TiXmlString::size_type rhsLength = static_cast<TiXmlString::size_type>( strlen(b) );
    TiXmlString joined;
    joined.reserve(a.length() + rhsLength);
    joined.append(a.data(), a.length());
    joined.append(b, rhsLength);
    return joined;
}
// Concatenation of a null-terminated C string and a TiXmlString.
TiXmlString operator + (const char* a, const TiXmlString & b)
{
    const TiXmlString::size_type lhsLength = static_cast<TiXmlString::size_type>( strlen(a) );
    TiXmlString joined;
    joined.reserve(lhsLength + b.length());
    joined.append(a, lhsLength);
    joined.append(b.data(), b.length());
    return joined;
}
#endif // TIXML_USE_STL

337
mert/TERsrc/tinystr.h Normal file
View File

@ -0,0 +1,337 @@
/*
www.sourceforge.net/projects/tinyxml
Original file by Yves Berquin.
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any
damages arising from the use of this software.
Permission is granted to anyone to use this software for any
purpose, including commercial applications, and to alter it and
redistribute it freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must
not claim that you wrote the original software. If you use this
software in a product, an acknowledgment in the product documentation
would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and
must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source
distribution.
*/
/*
* THIS FILE WAS ALTERED BY Tyge Lovset, 7. April 2005.
*
* - completely rewritten. compact, clean, and fast implementation.
* - sizeof(TiXmlString) = pointer size (4 bytes on 32-bit systems)
* - fixed reserve() to work as per specification.
* - fixed buggy compares operator==(), operator<(), and operator>()
* - fixed operator+=() to take a const ref argument, following spec.
* - added "copy" constructor with length, and most compare operators.
* - added swap(), clear(), size(), capacity(), operator+().
*/
#ifndef TIXML_USE_STL
#ifndef TIXML_STRING_INCLUDED
#define TIXML_STRING_INCLUDED
#include <assert.h>
#include <string.h>
/* The support for explicit isn't that universal, and it isn't really
required - it is used to check that the TiXmlString class isn't incorrectly
used. Be nice to old compilers and macro it here:
*/
#if defined(_MSC_VER) && (_MSC_VER >= 1200 )
// Microsoft visual studio, version 6 and higher.
#define TIXML_EXPLICIT explicit
#elif defined(__GNUC__) && (__GNUC__ >= 3 )
// GCC version 3 and higher.s
#define TIXML_EXPLICIT explicit
#else
#define TIXML_EXPLICIT
#endif
/*
TiXmlString is an emulation of a subset of the std::string template.
Its purpose is to allow compiling TinyXML on compilers with no or poor STL support.
Only the member functions relevant to the TinyXML project have been implemented.
The buffer allocation is made by a simplistic power of 2 like mechanism : if we increase
a string and there's no more room, we allocate a buffer twice as big as we need.
*/
// Minimal string class used when TIXML_USE_STL is not defined.
// The object holds a single pointer (rep_) to a heap block made of a Rep
// header (size, capacity) followed by the characters; empty strings share
// the static nullrep_ instead of allocating.
class TiXmlString
{
public :
// The size type used
typedef size_t size_type;
// Error value for find primitive
static const size_type npos; // = -1;
// TiXmlString empty constructor
TiXmlString () : rep_(&nullrep_) {
}
// TiXmlString copy constructor
TiXmlString ( const TiXmlString & copy) : rep_(0) {
init(copy.length());
memcpy(start(), copy.data(), length());
}
// TiXmlString constructor, based on a string
// (copy must be a valid null-terminated string: strlen is called on it)
TIXML_EXPLICIT TiXmlString ( const char * copy) : rep_(0) {
init( static_cast<size_type>( strlen(copy) ));
memcpy(start(), copy, length());
}
// TiXmlString constructor, based on a string
// (copies exactly len characters; no terminator is required in str)
TIXML_EXPLICIT TiXmlString ( const char * str, size_type len) : rep_(0) {
init(len);
memcpy(start(), str, len);
}
// TiXmlString destructor
~TiXmlString () {
quit();
}
// = operator
TiXmlString& operator = (const char * copy) {
return assign( copy, (size_type)strlen(copy));
}
// = operator
TiXmlString& operator = (const TiXmlString & copy) {
return assign(copy.start(), copy.length());
}
// += operator. Maps to append
TiXmlString& operator += (const char * suffix) {
return append(suffix, static_cast<size_type>( strlen(suffix) ));
}
// += operator. Maps to append
TiXmlString& operator += (char single) {
return append(&single, 1);
}
// += operator. Maps to append
TiXmlString& operator += (const TiXmlString & suffix) {
return append(suffix.data(), suffix.length());
}
// Convert a TiXmlString into a null-terminated char *
const char * c_str () const {
return rep_->str;
}
// Convert a TiXmlString into a char * (need not be null terminated).
const char * data () const {
return rep_->str;
}
// Return the length of a TiXmlString
size_type length () const {
return rep_->size;
}
// Alias for length()
size_type size () const {
return rep_->size;
}
// Checks if a TiXmlString is empty
bool empty () const {
return rep_->size == 0;
}
// Return capacity of string
size_type capacity () const {
return rep_->capacity;
}
// single char extraction
// (bounds are checked by assert only, i.e. not when NDEBUG is defined)
const char& at (size_type index) const {
assert( index < length() );
return rep_->str[ index ];
}
// [] operator
// (const member that returns a mutable reference into the buffer)
char& operator [] (size_type index) const {
assert( index < length() );
return rep_->str[ index ];
}
// find a char in a string. Return TiXmlString::npos if not found
size_type find (char lookup) const {
return find(lookup, 0);
}
// find a char in a string from an offset. Return TiXmlString::npos if not found
// (the scan stops at the first '\0', so characters after an embedded NUL are not searched)
size_type find (char tofind, size_type offset) const {
if (offset >= length()) return npos;
for (const char* p = c_str() + offset; *p != '\0'; ++p) {
if (*p == tofind) return static_cast< size_type >( p - c_str() );
}
return npos;
}
void clear () {
//Lee:
//The original was just too strange, though correct:
// TiXmlString().swap(*this);
//Instead use the quit & re-init:
quit();
init(0,0);
}
/* Function to reserve a big amount of data when we know we'll need it. Be aware that this
function DOES NOT clear the content of the TiXmlString if any exists.
*/
void reserve (size_type cap);
TiXmlString& assign (const char* str, size_type len);
TiXmlString& append (const char* str, size_type len);
// Exchanges the buffers of the two strings; no characters are copied.
void swap (TiXmlString& other) {
Rep* r = rep_;
rep_ = other.rep_;
other.rep_ = r;
}
private:
// Allocates a buffer whose size and capacity are both sz.
void init(size_type sz) {
init(sz, sz);
}
// Sets the logical length and writes the terminator right after it.
void set_size(size_type sz) {
rep_->str[ rep_->size = sz ] = '\0';
}
char* start() const {
return rep_->str;
}
char* finish() const {
return rep_->str + rep_->size;
}
// Header of the heap block; str[1] is the first byte of a longer character
// area that init() allocates together with the header.
struct Rep {
size_type size, capacity;
char str[1];
};
void init(size_type sz, size_type cap) {
if (cap) {
// Lee: the original form:
// rep_ = static_cast<Rep*>(operator new(sizeof(Rep) + cap));
// doesn't work in some cases of new being overloaded. Switching
// to the normal allocation, although use an 'int' for systems
// that are overly picky about structure alignment.
const size_type bytesNeeded = sizeof(Rep) + cap;
const size_type intsNeeded = ( bytesNeeded + sizeof(int) - 1 ) / sizeof( int );
rep_ = reinterpret_cast<Rep*>( new int[ intsNeeded ] );
rep_->str[ rep_->size = sz ] = '\0';
rep_->capacity = cap;
} else {
// Zero capacity: share the static empty representation, nothing is allocated.
rep_ = &nullrep_;
}
}
// Releases the buffer unless it is the shared static nullrep_.
void quit() {
if (rep_ != &nullrep_) {
// The rep_ is really an array of ints. (see the allocator, above).
// Cast it back before delete, so the compiler won't incorrectly call destructors.
delete [] ( reinterpret_cast<int*>( rep_ ) );
}
}
Rep * rep_;
static Rep nullrep_;
} ;
// Comparison operators for TiXmlString. All of them compare through strcmp
// on c_str(), so the comparison stops at the first '\0' in either operand.
// Equality of two TiXmlStrings: lengths are compared first, then the characters.
inline bool operator == (const TiXmlString & a, const TiXmlString & b)
{
return ( a.length() == b.length() ) // optimization on some platforms
&& ( strcmp(a.c_str(), b.c_str()) == 0 ); // actual compare
}
// Ordering as defined by strcmp.
inline bool operator < (const TiXmlString & a, const TiXmlString & b)
{
return strcmp(a.c_str(), b.c_str()) < 0;
}
inline bool operator != (const TiXmlString & a, const TiXmlString & b)
{
return !(a == b);
}
// The remaining orderings are all derived from operator<.
inline bool operator > (const TiXmlString & a, const TiXmlString & b)
{
return b < a;
}
inline bool operator <= (const TiXmlString & a, const TiXmlString & b)
{
return !(b < a);
}
inline bool operator >= (const TiXmlString & a, const TiXmlString & b)
{
return !(a < b);
}
// Mixed comparisons with a null-terminated C string (b / a must not be null:
// strcmp is called on it).
inline bool operator == (const TiXmlString & a, const char* b)
{
return strcmp(a.c_str(), b) == 0;
}
inline bool operator == (const char* a, const TiXmlString & b)
{
return b == a;
}
inline bool operator != (const TiXmlString & a, const char* b)
{
return !(a == b);
}
inline bool operator != (const char* a, const TiXmlString & b)
{
return !(b == a);
}
TiXmlString operator + (const TiXmlString & a, const TiXmlString & b);
TiXmlString operator + (const TiXmlString & a, const char* b);
TiXmlString operator + (const char* a, const TiXmlString & b);
/*
TiXmlOutStream is an emulation of std::ostream. It is based on TiXmlString.
Only the operators that we need for TinyXML have been developped.
*/
// A TiXmlString with stream-style insertion: each << appends its argument to
// the string and returns the stream so that insertions can be chained.
class TiXmlOutStream : public TiXmlString
{
public :
// TiXmlOutStream << operator.
TiXmlOutStream & operator << (const TiXmlString & in) {
*this += in;
return *this;
}
// TiXmlOutStream << operator.
// (in must be a null-terminated string)
TiXmlOutStream & operator << (const char * in) {
*this += in;
return *this;
}
} ;
#endif // TIXML_STRING_INCLUDED
#endif // TIXML_USE_STL

1778
mert/TERsrc/tinyxml.cpp Normal file

File diff suppressed because it is too large Load Diff

2043
mert/TERsrc/tinyxml.h Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,52 @@
/*
www.sourceforge.net/projects/tinyxml
Original code (2.0 and earlier )copyright (c) 2000-2006 Lee Thomason (www.grinninglizard.com)
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any
damages arising from the use of this software.
Permission is granted to anyone to use this software for any
purpose, including commercial applications, and to alter it and
redistribute it freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must
not claim that you wrote the original software. If you use this
software in a product, an acknowledgment in the product documentation
would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and
must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source
distribution.
*/
#include "tinyxml.h"
// The goal of the separate error file is to make the first
// step towards localization. tinyxml (currently) only supports
// English error messages, but they could now be translated.
//
// It also cleans up the code a bit.
//
// Table of human-readable error messages (17 entries).
// NOTE(review): presumably indexed by the TinyXML error codes declared in
// tinyxml.h, so the order here must stay in sync with that enum and with
// TIXML_ERROR_STRING_COUNT - confirm before adding or reordering entries.
const char* TiXmlBase::errorString[ TIXML_ERROR_STRING_COUNT ] = {
"No error",
"Error",
"Failed to open file",
"Memory allocation failed.",
"Error parsing Element.",
"Failed to read Element name",
"Error reading Element value.",
"Error reading Attributes.",
"Error: empty tag.",
"Error reading end tag.",
"Error parsing Unknown.",
"Error parsing Comment.",
"Error parsing Declaration.",
"Error document empty.",
"Error null (0) or unexpected EOF found in input stream.",
"Error parsing CDATA.",
"Error when TiXmlDocument added to document, because TiXmlDocument can only be at the root.",
};

File diff suppressed because it is too large Load Diff

543
mert/TERsrc/tools.cpp Normal file
View File

@ -0,0 +1,543 @@
#include "tools.h"
using namespace std;
using namespace boost::xpressive;
namespace Tools
{
// Joins the elements of vec with a tab between consecutive elements.
// Returns an empty string for an empty vector; there is no trailing tab.
string vectorToString ( vector<string> vec )
{
    string joined;
    for ( size_t idx = 0; idx < vec.size(); ++idx ) {
        if ( idx > 0 ) {
            joined += "\t";
        }
        joined += vec[idx];
    }
    return joined;
}
// Joins the elements of vec, inserting the separator s between consecutive
// elements. Returns an empty string for an empty vector.
string vectorToString ( vector< string > vec, string s )
{
    string joined;
    for ( size_t idx = 0; idx < vec.size(); ++idx ) {
        if ( idx > 0 ) {
            joined += s;
        }
        joined += vec[idx];
    }
    return joined;
}
// Returns the elements of vec at positions [start, end); end is clamped to
// the size of vec, so asking past the end yields a shorter result.
// A reversed range (start > end) is a caller error: it is reported on stderr
// and the process terminates with a failure status. The message now states
// the condition that was actually detected (it used to say "end > start").
// A negative start is not handled here and makes vec.at() throw std::out_of_range.
vector<string> subVector ( vector<string> vec, int start, int end )
{
    if ( start > end ) {
        cerr << "ERREUR : TERcalc::subVector : start > end" << endl;
        exit ( 1 );
    }
    vector<string> slice;
    for ( int i = start; ( i < end ) && ( i < ( int ) vec.size() ); ++i ) {
        slice.push_back ( vec.at ( i ) );
    }
    return slice;
}
// Returns the elements of vec at positions [start, end); end is clamped to
// the size of vec, so asking past the end yields a shorter result.
// A reversed range (start > end) is a caller error: it is reported on stderr
// and the process terminates with a failure status. The message now states
// the condition that was actually detected (it used to say "end > start").
// A negative start is not handled here and makes vec.at() throw std::out_of_range.
vector<int> subVector ( vector<int> vec, int start, int end )
{
    if ( start > end ) {
        cerr << "ERREUR : TERcalc::subVector : start > end" << endl;
        exit ( 1 );
    }
    vector<int> slice;
    for ( int i = start; ( i < end ) && ( i < ( int ) vec.size() ); ++i ) {
        slice.push_back ( vec.at ( i ) );
    }
    return slice;
}
// Returns the elements of vec at positions [start, end); end is clamped to
// the size of vec, so asking past the end yields a shorter result.
// A reversed range (start > end) is a caller error: it is reported on stderr
// and the process terminates with a failure status. The message now states
// the condition that was actually detected (it used to say "end > start").
// A negative start is not handled here and makes vec.at() throw std::out_of_range.
vector<float> subVector ( vector<float> vec, int start, int end )
{
    if ( start > end ) {
        cerr << "ERREUR : TERcalc::subVector : start > end" << endl;
        exit ( 1 );
    }
    vector<float> slice;
    for ( int i = start; ( i < end ) && ( i < ( int ) vec.size() ); ++i ) {
        slice.push_back ( vec.at ( i ) );
    }
    return slice;
}
// Returns a new vector holding the same strings as vec, in the same order.
vector<string> copyVector ( vector<string> vec )
{
    return vector<string> ( vec.begin(), vec.end() );
}
// Returns a new vector holding the same integers as vec, in the same order.
vector<int> copyVector ( vector<int> vec )
{
    return vector<int> ( vec.begin(), vec.end() );
}
// Returns a new vector holding the same floats as vec, in the same order.
vector<float> copyVector ( vector<float> vec )
{
    return vector<float> ( vec.begin(), vec.end() );
}
// Splits s at every character that appears in tok.
// Empty fields are kept: adjacent delimiters, or a delimiter at either end,
// produce empty strings, and an empty s yields a single empty field.
// A character that occurs several times in tok closes a field once per occurrence.
vector<string> stringToVector ( string s, string tok )
{
    vector<string> fields;
    string current;
    for ( size_t pos = 0; pos < s.size(); ++pos ) {
        const char ch = s[pos];
        bool isDelimiter = false;
        for ( size_t d = 0; d < tok.size(); ++d ) {
            if ( ch != tok[d] ) {
                continue;
            }
            fields.push_back ( current );
            current = "";
            isDelimiter = true;
        }
        if ( !isDelimiter ) {
            current.push_back ( ch );
        }
    }
    // Whatever follows the last delimiter (possibly nothing) is the final field.
    fields.push_back ( current );
    return fields;
}
// Splits s at every character that appears in tok and converts each non-empty
// field with atoi. Empty fields are skipped, so an empty s yields an empty vector.
vector<int> stringToVectorInt ( string s, string tok )
{
    vector<int> numbers;
    string current;
    for ( size_t pos = 0; pos < s.size(); ++pos ) {
        const char ch = s[pos];
        bool isDelimiter = false;
        for ( size_t d = 0; d < tok.size(); ++d ) {
            if ( ch != tok[d] ) {
                continue;
            }
            if ( !current.empty() ) {
                numbers.push_back ( atoi ( current.c_str() ) );
            }
            current = "";
            isDelimiter = true;
        }
        if ( !isDelimiter ) {
            current.push_back ( ch );
        }
    }
    // Convert the text after the last delimiter, if any.
    if ( !current.empty() ) {
        numbers.push_back ( atoi ( current.c_str() ) );
    }
    return numbers;
}
// Splits s on any character contained in tok and converts every non-empty
// field to float with atof. Empty fields (consecutive delimiters) are skipped.
vector<float> stringToVectorFloat ( string s, string tok )
{
    vector<float> to_return;
    string to_push ( "" );
    bool pushed = false;
    string::iterator sIt;
    for ( sIt = s.begin(); sIt < s.end(); sIt++ ) {
        pushed = false;
        for ( string::iterator sTok = tok.begin(); sTok < tok.end(); sTok++ ) {
            if ( ( *sIt ) == ( *sTok ) ) {
                if ( ( int ) to_push.length() > 0 ) {
                    to_return.push_back ( atof ( to_push.c_str() ) );
                }
                to_push = "";
                pushed = true;
            }
        }
        if ( !pushed ) {
            to_push.push_back ( ( *sIt ) );
        }
    }
    // Flush whatever follows the last delimiter. This must use atof like the
    // fields above: atoi would drop the fractional part of the final value.
    if ( ( int ) to_push.length() > 0 ) {
        to_return.push_back ( atof ( to_push.c_str() ) );
    }
    return to_return;
}
// Returns str with the ASCII letters 'A'..'Z' converted to lower case.
// Every other byte is left untouched.
string lowerCase ( string str )
{
    for ( string::iterator it = str.begin(); it != str.end(); ++it ) {
        const bool isAsciiUpper = !( ( *it < 'A' ) || ( *it > 'Z' ) );
        if ( isAsciiUpper ) {
            *it = *it + ( 'a' - 'A' );
        }
    }
    return str;
}
string removePunctTercom ( string str )
{
string str_mod = str;
sregex rex;
string replace;
rex = sregex::compile ( "^[ ]+" );
replace = "";
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "[\"]" );
replace = ( " " );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "[,]" );
replace = " ";
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "([^0-9])([\\.])([^0-9])" );
replace = ( "$1 $3" );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "([^0-9])([\\.])([^0-9])" );
replace = ( "$1 $3" );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "([^0-9])([\\.])([^0-9])" );
replace = ( "$1 $3" );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "([\\.]$)" );
replace = ( " " );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "[\\?]" );
replace = ( " " );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "[\\;]" );
replace = ( " " );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "[\\:]" );
replace = ( " " );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "[\\!]" );
replace = ( " " );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "[\\(]" );
replace = ( " " );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "[\\)]" );
replace = ( " " );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "[ ]+" );
replace = " ";
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "[ ]+$" );
replace = "";
str_mod = regex_replace ( str_mod, rex, replace );
return str_mod;
}
string removePunct ( string str )
{
string str_mod = str;
sregex rex;
string replace;
rex = sregex::compile ( "^[ ]+" );
replace = "";
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "[\"]" );
replace = ( " " );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "[,]" );
replace = " ";
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "([^0-9])([\\.])([^0-9])" );
replace = ( "$1 $3" );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "([^0-9])([\\.])([^0-9])" );
replace = ( "$1 $3" );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "([^0-9])([\\.])([^0-9])" );
replace = ( "$1 $3" );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "([\\.]$)" );
replace = ( " " );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "[\\?]" );
replace = ( " " );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "[\\;]" );
replace = ( " " );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "[\\:]" );
replace = ( " " );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "[\\!]" );
replace = ( " " );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "[\\(]" );
replace = ( " " );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "[\\)]" );
replace = ( " " );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "[ ]+" );
replace = " ";
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "[ ]+$" );
replace = "";
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "^[ ]+" );
replace = "";
str_mod = regex_replace ( str_mod, rex, replace );
return str_mod;
}
string tokenizePunct ( string str )
{
string str_mod = str;
sregex rex = sregex::compile ( "(([^0-9])([\\,])([^0-9]))" );
string replace ( "$2 $3 $4" );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "(([^0-9])([\\.])([^0-9]))" );
replace = ( "$2 $3 $4" );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "( ([A-Z]|[a-z]) ([\\.]) )" );
replace = ( " $2. " );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "( ([A-Z]|[a-z]) ([\\.])$)" );
replace = ( " $2. " );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "(^([A-Z]|[a-z]) ([\\.]) )" );
replace = ( " $2. " );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "(([A-Z]|[a-z])([\\.]) ([A-Z]|[a-z])([\\.]) )" );
replace = ( "$2.$4. " );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "[\\?]" );
replace = ( " ? " );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "[\\;]" );
replace = ( " ; " );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "(([^0-9])([\\:])([^0-9]))" );
replace = ( "$2 $3 $4" );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "[\\!]" );
replace = ( " ! " );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "[\\(]" );
replace = ( " ( " );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "[\\)]" );
replace = ( " ) " );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "[\"]" );
replace = ( " \" " );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "(num_ \\( ([^\\)]+) \\))" );
replace = ( "num_($2)" );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "(ordinal_ \\( ([^\\)]*) \\))" );
replace = ( "ordinal_($2)" );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "(^([Mm]) \\.)" );
replace = ( "$2." );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "( ([Mm]) \\.)" );
replace = ( " $2." );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "(^([Dd]r) \\.)" );
replace = ( "$2." );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "( ([Dd]r) \\.)" );
replace = ( " $2." );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "(^([Mm]r) \\.)" );
replace = ( "$2." );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "( ([Mm]r) \\.)" );
replace = ( " $2." );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "(^([Mm]rs) \\.)" );
replace = ( "$2." );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "( ([Mm]rs) \\.)" );
replace = ( " $2." );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "(^([Nn]o) \\.)" );
replace = ( "$2." );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "( ([Nn]o) \\.)" );
replace = ( " $2." );
str_mod = regex_replace ( str_mod, rex, replace );
// rex = sregex::compile ( "(^(([Jj]an)|([Ff]ev)|([Mm]ar)|([Aa]pr)|([Jj]un)|([Jj]ul)|([Aa]ug)|([Ss]ept)|([Oo]ct)|([Nn]ov)|([Dd]ec)) \\.)" );
// replace = ( "$2." );
// str_mod = regex_replace ( str_mod, rex, replace );
//
// rex = sregex::compile ( "( (([Jj]an)|([Ff]ev)|([Mm]ar)|([Aa]pr)|([Jj]un)|([Jj]ul)|([Aa]ug)|([Ss]ept)|([Oo]ct)|([Nn]ov)|([Dd]ec)) \\.)" );
// replace = ( " $2." );
// str_mod = regex_replace ( str_mod, rex, replace );
//
// rex = sregex::compile ( "(^(([Gg]en)|([Cc]ol)) \\.)" );
// replace = ( "$2." );
// str_mod = regex_replace ( str_mod, rex, replace );
//
// rex = sregex::compile ( "( (([Gg]en)|([Cc]ol)) \\.)" );
// replace = ( " $2." );
// str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "(^(([A-Z][a-z])) \\. )" );
replace = ( "$2. " );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "( (([A-Z][a-z])) \\. )" );
replace = ( " $2. " );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "(^(([A-Z][a-z][a-z])) \\. )" );
replace = ( "$2. " );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "( (([A-Z][a-z][a-z])) \\. )" );
replace = ( " $2. " );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "[ ]+" );
replace = " ";
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "^[ ]+" );
replace = "";
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "[ ]+$" );
replace = "";
str_mod = regex_replace ( str_mod, rex, replace );
return str_mod;
}
string normalizeStd ( string str )
{
string str_mod = str;
sregex rex = sregex::compile ( "(<skipped>)" );
string replace ( "" );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "-\n" );
replace = ( "" );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "\n" );
replace = ( " " );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "&quot;" );
replace = ( "\"" );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "&amp;" );
replace = ( "& " );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "&lt;" );
replace = ( "<" );
str_mod = regex_replace ( str_mod, rex, replace );
rex = sregex::compile ( "&gt;" );
replace = ( ">" );
str_mod = regex_replace ( str_mod, rex, replace );
return str_mod;
}
// Returns a copy of p.
// The original assigned each of the eleven fields of param one by one;
// copy-construction duplicates the same fields in a single step.
param copyParam ( param p )
{
    param to_return ( p );
    return to_return;
}
// Formats every field of p as a "name = value" line and returns the text.
// Booleans are written with the stream's default formatting.
string printParams ( param p )
{
    ostringstream out;
    out << "caseOn = " << p.caseOn << "\n"
        << "noPunct = " << p.noPunct << "\n"
        << "debugMode = " << p.debugMode << "\n"
        << "hypothesisFile = " << p.hypothesisFile << "\n"
        << "referenceFile = " << p.referenceFile << "\n"
        << "normalize = " << p.normalize << "\n"
        << "noTxtIds = " << p.noTxtIds << "\n"
        << "outputFileExtension = " << p.outputFileExtension << "\n"
        << "outputFileName = " << p.outputFileName << "\n"
        << "sgmlInputs = " << p.sgmlInputs << "\n"
        << "tercomLike = " << p.tercomLike << "\n";
    return out.str();
}
}

66
mert/TERsrc/tools.h Normal file
View File

@ -0,0 +1,66 @@
#ifndef __TERCPPTOOLS_H__
#define __TERCPPTOOLS_H__
#include <vector>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <sstream>
#include <boost/xpressive/xpressive.hpp>
using namespace std;
// Shared typedefs, the run-time option structure and the string/vector helper
// functions used by the TER implementation.
namespace Tools
{
// Short names for commonly used vector types.
typedef vector<double> vecDouble;
typedef vector<char> vecChar;
typedef vector<int> vecInt;
typedef vector<float> vecFloat;
typedef vector<string> vecString;
// NOTE(review): judging by the names, one alignmentElement is one entry of a
// WER alignment; the exact content is not visible in this header - confirm.
typedef vector<string> alignmentElement;
typedef vector<alignmentElement> WERalignment;
// Run-time options. The struct has no constructor, so the bool fields are
// uninitialized until they are assigned.
struct param {
bool debugMode;
string referenceFile; // path of the reference file (the earlier comment said "path to the resources" - TODO confirm)
string hypothesisFile; // path of the hypothesis file (the earlier comment said "path to the configuration files" - TODO confirm)
string outputFileExtension;
string outputFileName;
bool noPunct;
bool caseOn;
bool normalize;
bool tercomLike;
bool sgmlInputs;
bool noTxtIds;
};
// param = { false, "","","","" };
// class tools{
// private:
// public:
// Joins the elements of vec into one string; the second overload takes an
// extra string s (presumably the separator - confirm against tools.cpp).
string vectorToString ( vector<string> vec );
string vectorToString ( vector<string> vec, string s );
// Return the elements at positions [start, end), truncated at the end of
// vec; start > end prints an error and exits the process.
vector<string> subVector ( vector<string> vec, int start, int end );
vector<int> subVector ( vector<int> vec, int start, int end );
vector<float> subVector ( vector<float> vec, int start, int end );
// Return an element-wise copy of vec.
vector<string> copyVector ( vector<string> vec );
vector<int> copyVector ( vector<int> vec );
vector<float> copyVector ( vector<float> vec );
// Split s on any character contained in tok. stringToVector keeps empty
// fields; the Int/Float variants skip them and convert the rest to numbers.
vector<string> stringToVector ( string s, string tok );
vector<int> stringToVectorInt ( string s, string tok );
vector<float> stringToVectorFloat ( string s, string tok );
// Lower-cases the ASCII letters A-Z only.
string lowerCase(string str);
// Regex-based text clean-up helpers (Boost.Xpressive).
string removePunct(string str);
string tokenizePunct(string str);
string removePunctTercom(string str);
string normalizeStd(string str);
// Formats every field of p as one "name = value" line.
string printParams(param p);
// };
// Returns a field-by-field copy of p.
param copyParam(param p);
}
#endif

View File

@ -0,0 +1,332 @@
#include "xmlStructure.h"
// Implementation of xmlStructure: dumps a TinyXML tree to stdout and copies it into an SGMLDocument.
using namespace std;
namespace TERCpp
{
// tutorial demo program
// ----------------------------------------------------------------------
// STDOUT dump and indenting utility functions
// ----------------------------------------------------------------------
// const unsigned int NUM_INDENTS_PER_SPACE=2;
// Sets the indentation step used by getIndent / getIndentAlt to 2.
xmlStructure::xmlStructure()
    : NUM_INDENTS_PER_SPACE ( 2 )
{
}
// Returns a pointer to the last numIndents * NUM_INDENTS_PER_SPACE characters
// of a fixed padding string, capped at the whole string.
const char * xmlStructure::getIndent ( unsigned int numIndents )
{
    static const char * pINDENT = " + ";
    static const unsigned int LENGTH = strlen ( pINDENT );
    const unsigned int wanted = numIndents * NUM_INDENTS_PER_SPACE;
    const unsigned int shown = ( wanted > LENGTH ) ? LENGTH : wanted;
    return pINDENT + ( LENGTH - shown );
}
// Same as getIndent, but the padding string has no "+" marker.
const char * xmlStructure::getIndentAlt ( unsigned int numIndents )
{
    static const char * pINDENT = " ";
    static const unsigned int LENGTH = strlen ( pINDENT );
    const unsigned int wanted = numIndents * NUM_INDENTS_PER_SPACE;
    const unsigned int shown = ( wanted > LENGTH ) ? LENGTH : wanted;
    return pINDENT + ( LENGTH - shown );
}
// Prints every attribute of pElement on its own line, prefixed with the
// indentation for the given depth, and returns the number of attributes.
// A null element prints nothing and yields 0.
int xmlStructure::dump_attribs_to_stdout ( TiXmlElement* pElement, unsigned int indent )
{
    if ( !pElement ) {
        return 0;
    }
    TiXmlAttribute* attr = pElement->FirstAttribute();
    const char* prefix = getIndent ( indent );
    printf ( "\n" );
    int count = 0;
    for ( ; attr; attr = attr->Next() ) {
        printf ( "%s%s: value=[%s]", prefix, attr->Name(), attr->Value() );
        // Also show the numeric reading when the value parses as a number.
        int asInt;
        if ( attr->QueryIntValue ( &asInt ) == TIXML_SUCCESS ) {
            printf ( " int=%d", asInt );
        }
        double asDouble;
        if ( attr->QueryDoubleValue ( &asDouble ) == TIXML_SUCCESS ) {
            printf ( " d=%1.1f", asDouble );
        }
        printf ( "\n" );
        ++count;
    }
    return count;
}
// Prints a one-line description of pParent, indented for the given depth,
// then recurses into its children one level deeper.
// The default value of indent is supplied here, on the definition.
void xmlStructure::dump_to_stdout ( TiXmlNode* pParent, unsigned int indent = 0 )
{
    if ( !pParent ) {
        return;
    }
    const int nodeType = pParent->Type();
    printf ( "%s", getIndent ( indent ) );
    switch ( nodeType ) {
    case TiXmlNode::DOCUMENT:
        printf ( "Document" );
        break;
    case TiXmlNode::ELEMENT: {
        printf ( "Element [%s]", pParent->Value() );
        // The attributes are listed first; the summary follows them.
        const int attrCount = dump_attribs_to_stdout ( pParent->ToElement(), indent + 1 );
        if ( attrCount == 0 ) {
            printf ( " (No attributes)" );
        } else if ( attrCount == 1 ) {
            printf ( "%s1 attribute", getIndentAlt ( indent ) );
        } else {
            printf ( "%s%d attributes", getIndentAlt ( indent ), attrCount );
        }
        break;
    }
    case TiXmlNode::COMMENT:
        printf ( "Comment: [%s]", pParent->Value() );
        break;
    case TiXmlNode::UNKNOWN:
        printf ( "Unknown" );
        break;
    case TiXmlNode::TEXT: {
        TiXmlText* textNode = pParent->ToText();
        printf ( "Text: [%s]", textNode->Value() );
        break;
    }
    case TiXmlNode::DECLARATION:
        printf ( "Declaration" );
        break;
    default:
        break;
    }
    printf ( "\n" );
    TiXmlNode* child = pParent->FirstChild();
    while ( child != 0 ) {
        dump_to_stdout ( child, indent + 1 );
        child = child->NextSibling();
    }
}
// Loads the named file and dumps its structure to stdout; a load failure is
// reported on stdout as well.
void xmlStructure::dump_to_stdout ( const char* pFilename )
{
    TiXmlDocument doc ( pFilename );
    if ( !doc.LoadFile() ) {
        printf ( "Failed to load file \"%s\"\n", pFilename );
        return;
    }
    printf ( "\n%s:\n", pFilename );
    // Calls the node overload, relying on its default indent.
    dump_to_stdout ( &doc );
}
// Loads the file and copies its content into a new SGMLDocument.
// A load failure prints an error on stderr and terminates the process.
// NOTE(review): the process exits with status 0 even though this is an error path.
SGMLDocument xmlStructure::dump_to_SGMLDocument ( string FileName )
{
    TiXmlDocument doc ( FileName.c_str() );
    SGMLDocument to_return;
    if ( !doc.LoadFile() ) {
        cerr << "ERROR : xmlStructure::dump_to_SGMLDocument : Failed to load file " << FileName << endl;
        exit ( 0 );
    }
    copy_to_SGMLDocument ( &to_return, &doc, ( unsigned int ) 0 );
    return to_return;
}
// Recursively walks the TinyXML tree rooted at pParent and fills sgmlDoc.
//   sgmlDoc : document being built; must not be null (it is dereferenced unchecked)
//   pParent : current node; a null node is ignored
//   indent  : depth of pParent in the tree, 0 for the root
// "refset"/"tstset" elements set the document type, "doc" elements append a
// document, "seg" elements append a segment to the last document, and text
// nodes found at depth 5 become the content of the last segment after the
// processing selected by xmlParams.
void xmlStructure::copy_to_SGMLDocument ( SGMLDocument* sgmlDoc, TiXmlNode* pParent, unsigned int indent )
{
if ( !pParent )
return;
TiXmlNode* pChild;
TiXmlText* pText;
int t = pParent->Type();
// printf ( "%s", getIndent ( indent ) );
// int num;
string elementValue;
switch ( t ) {
case TiXmlNode::DOCUMENT:
// printf ( "Document" );
break;
case TiXmlNode::ELEMENT:
// NOTE(review): unlike the other trace printfs in this function, this one is
// still active, so every element name is written to stdout while parsing -
// confirm whether that is intended.
printf ( "Element [%s]", pParent->Value() );
elementValue = pParent->Value();
if ( ( ( int ) elementValue.compare ( "refset" ) == 0 ) || ( ( int ) elementValue.compare ( "tstset" ) == 0 ) ) {
sgmlDoc->setDocType ( elementValue );
} else if ( ( int ) elementValue.compare ( "doc" ) == 0 ) {
documentStructure tmp_doc;
sgmlDoc->addDocument ( tmp_doc );
} else if ( ( int ) elementValue.compare ( "seg" ) == 0 ) {
segmentStructure tmp_seg;
( sgmlDoc->getLastDocument() )->addSegments ( tmp_seg );
}
// The attributes of this element are stored on the objects created above.
dump_attribs_to_SGMLDocuments ( sgmlDoc, pParent->ToElement(), indent + 1 );
// num = dump_attribs_to_stdout ( pParent->ToElement(), indent + 1 );
// switch ( num )
// {
// case 0:
// printf ( " (No attributes)" );
// break;
// case 1:
// printf ( "%s1 attribute", getIndentAlt ( indent ) );
// break;
// default:
// printf ( "%s%d attributes", getIndentAlt ( indent ), num );
// break;
// }
break;
// case TiXmlNode::COMMENT:
// printf ( "Comment: [%s]", pParent->Value() );
// break;
//
// case TiXmlNode::UNKNOWN:
// printf ( "Unknown" );
// break;
case TiXmlNode::TEXT:
pText = pParent->ToText();
// printf ( "Text: [%s]", pText->Value() );
// Only text at depth 5 is kept - presumably the text directly inside a
// <seg> element of the expected SGML layout (TODO confirm).
if ( indent == 5 ) {
documentStructure * l_tmp_doc = sgmlDoc->getLastDocument();
segmentStructure * l_tmp_seg = l_tmp_doc->getLastSegments();
string l_text = pText->Value();
string line_mod=l_text;
// Tokenize punctuation unless tercomLike is set.
if ( !xmlParams.tercomLike ) {
if ( xmlParams.debugMode ) {
cerr << "DEBUG xmlStructure::copy_to_SGMLDocument : line NOT tokenized |" << line_mod << "|" << endl << "END DEBUG" << endl;
}
if ( xmlParams.debugMode ) {
cerr << "DEBUG tercpp : xmlStructure::copy_to_SGMLDocument : " << endl << "TERCOM AT FALSE " << endl << "END DEBUG" << endl;
}
line_mod = tokenizePunct ( line_mod );
}
// Lower-case the line unless case sensitivity is requested.
if ( !xmlParams.caseOn ) {
if ( xmlParams.debugMode ) {
cerr << "DEBUG tercpp : xmlStructure::copy_to_SGMLDocument : " << endl << "CASEON AT FALSE " << endl << "END DEBUG" << endl;
}
line_mod = lowerCase ( line_mod );
}
if ( xmlParams.noPunct ) {
if ( xmlParams.debugMode ) {
cerr << "DEBUG tercpp : xmlStructure::copy_to_SGMLDocument : " << endl << "NOPUNCT AT TRUE " << endl << "END DEBUG" << endl;
}
// NOTE(review): removePunctTercom is used when tercomLike is false and
// removePunct when it is true; going by the names this looks inverted -
// confirm before relying on it.
if ( !xmlParams.tercomLike ) {
line_mod = removePunctTercom ( line_mod );
} else {
line_mod = removePunct ( line_mod );
}
}
if ( xmlParams.debugMode ) {
cerr << "DEBUG xmlStructure::copy_to_SGMLDocument : line tokenized |" << line_mod << "|" << endl << "END DEBUG" << endl;
}
l_tmp_seg->addContent ( line_mod );
}
break;
// case TiXmlNode::DECLARATION:
// printf ( "Declaration" );
// break;
default:
break;
}
// printf ( "\n" );
// Recurse into every child, one level deeper.
for ( pChild = pParent->FirstChild(); pChild != 0; pChild = pChild->NextSibling() ) {
copy_to_SGMLDocument ( sgmlDoc, pChild, indent + 1 );
}
}
// Copies the recognized attributes of pElement into sgmlDoc. Which attributes
// are recognized depends on the depth passed in indent:
//   1 : setid / srclang / tgtlang are stored on the SGML document,
//   2 : docid / sysid are stored on the last document,
//   4 : id is stored on the last segment of the last document.
// Attributes at any other depth are ignored, as is a null element.
void xmlStructure::dump_attribs_to_SGMLDocuments ( SGMLDocument * sgmlDoc, TiXmlElement* pElement, unsigned int indent )
{
    if ( !pElement ) {
        return;
    }
    for ( TiXmlAttribute* attr = pElement->FirstAttribute(); attr; attr = attr->Next() ) {
        const string attrName = attr->Name();
        if ( indent == 1 ) {
            if ( attrName == "setid" ) {
                sgmlDoc->setSetId ( attr->Value() );
            }
            if ( attrName == "srclang" ) {
                sgmlDoc->setSrcLang ( attr->Value() );
            }
            if ( attrName == "tgtlang" ) {
                sgmlDoc->setTgtLang ( attr->Value() );
            }
        } else if ( indent == 2 ) {
            // The last document is fetched for every attribute, matched or not.
            documentStructure * currentDoc = sgmlDoc->getLastDocument();
            if ( attrName == "docid" ) {
                currentDoc->setDocId ( attr->Value() );
            }
            if ( attrName == "sysid" ) {
                currentDoc->setSysId ( attr->Value() );
            }
        } else if ( indent == 4 ) {
            // The last segment is fetched for every attribute, matched or not.
            documentStructure * currentDoc = sgmlDoc->getLastDocument();
            segmentStructure * currentSeg = currentDoc->getLastSegments();
            if ( attrName == "id" ) {
                currentSeg->setSegId ( attr->Value() );
            }
        }
    }
}
// std::size_t hashValue(std::string key){}
}

View File

@ -0,0 +1,40 @@
/*
 * Declaration of xmlStructure, which dumps a TinyXML tree to stdout or copies it into an SGMLDocument.
 */
#ifndef __XMLSTRUCTURE_H__
#define __XMLSTRUCTURE_H__
#include "sgmlDocument.h"
#include "documentStructure.h"
#include "stdio.h"
#include <iostream>
#include <string>
#include "tinyxml.h"
using namespace std;
namespace TERCpp
{
// Helper around TinyXML: prints an XML tree to stdout and converts an
// XML/SGML file into an SGMLDocument.
class xmlStructure
{
private:
// Number of padding characters per nesting level; set to 2 by the constructor.
unsigned int NUM_INDENTS_PER_SPACE;
// void dump_attribs_to_SGMLDocuments ( SGMLDocument* arg1, const TiXmlElement* arg2 );
// Stores the recognized attributes of pElement into sgmlDoc; indent is the
// depth of the element and selects which attributes are recognized.
void dump_attribs_to_SGMLDocuments ( SGMLDocument* sgmlDoc, TiXmlElement* pElement, unsigned int indent );
public:
xmlStructure();
// Return the padding string for the given nesting level, with (getIndent)
// or without (getIndentAlt) the "+" marker.
const char * getIndent ( unsigned int numIndents );
const char * getIndentAlt ( unsigned int numIndents );
// Prints the attributes of pElement and returns how many there are.
int dump_attribs_to_stdout ( TiXmlElement* pElement, unsigned int indent );
// Prints the tree rooted at pParent. The default value for indent is
// supplied on the out-of-class definition, not in this declaration.
void dump_to_stdout ( TiXmlNode* pParent, unsigned int indent );
// Loads pFilename and prints its tree.
void dump_to_stdout ( const char* pFilename );
// Recursively copies the tree rooted at pParent into sgmlDoc.
void copy_to_SGMLDocument ( SGMLDocument* sgmlDoc , TiXmlNode* pParent, unsigned int indent );
// Loads FileName and returns its content as an SGMLDocument; the process
// exits if the file cannot be loaded.
SGMLDocument dump_to_SGMLDocument ( string FileName );
// Options read by copy_to_SGMLDocument (tercomLike, caseOn, noPunct,
// debugMode). The constructor does not initialize them, so callers must set
// them before parsing.
param xmlParams;
};
}
#endif

107
mert/TerScorer.cpp Normal file
View File

@ -0,0 +1,107 @@
#include "TerScorer.h"
#include "TERsrc/tercalc.h"
#include "TERsrc/terAlignment.h"
const int TerScorer::LENGTH = 2;
using namespace TERCpp;
using namespace std;
// Loads every reference file. Each file contributes one entry to
// m_multi_references: the list of encoded token sequences, one per line.
// Afterwards m_references holds the sentences of the first file.
// Throws runtime_error when a file cannot be opened; with an empty file list
// the final at(0) throws std::out_of_range.
void TerScorer::setReferenceFiles ( const vector<string>& referenceFiles )
{
    for ( size_t fileIdx = 0; fileIdx < referenceFiles.size(); ++fileIdx ) {
        const string& path = referenceFiles.at ( fileIdx );
        // m_references is reused as the per-file buffer.
        m_references.clear();
        _reftokens.clear();
        _reflengths.clear();
        ifstream in ( path.c_str() );
        if ( !in ) {
            throw runtime_error ( "Unable to open " + path );
        }
        string line;
        while ( getline ( in, line ) ) {
            vector<int> tokens;
            encode ( line, tokens );
            m_references.push_back ( tokens );
            TRACE_ERR ( "." );
        }
        m_multi_references.push_back ( m_references );
    }
    TRACE_ERR ( endl );
    m_references=m_multi_references.at(0);
}
// Computes the TER statistics of hypothesis `text` for sentence `sid`.
// The hypothesis is scored against every reference; the alignment with the
// lowest scoreAv() is kept and written to `entry` as
// "numEdits averageWords scoreAv".
// Throws runtime_error when sid is not present in one of the reference sets.
void TerScorer::prepareStats ( size_t sid, const string& text, ScoreStats& entry )
{
    terAlignment result;
    result.numEdits = 0.0 ;
    result.numWords = 0.0 ;
    result.averageWords = 0.0;

    // The average reference length is the same for every reference, so it is
    // computed once. The pass also validates sid against every reference set.
    double averageLength = 0.0;
    for ( size_t refIdx = 0; refIdx < m_multi_references.size(); ++refIdx ) {
        if ( sid >= m_multi_references.at ( refIdx ).size() ) {
            stringstream msg;
            msg << "Sentence id (" << sid << ") not found in reference set";
            throw runtime_error ( msg.str() );
        }
        averageLength += ( double ) m_multi_references.at ( refIdx ).at ( sid ).size();
    }
    if ( !m_multi_references.empty() ) {
        averageLength = averageLength / ( double ) m_multi_references.size();
    }

    for ( size_t refIdx = 0; refIdx < m_multi_references.size(); ++refIdx ) {
        vector<int> testtokens;
        vector<int> reftokens;
        reftokens = m_multi_references.at ( refIdx ).at ( sid );
        // NOTE(review): encode looks loop-invariant; it stays inside the loop
        // as before until it is confirmed to have no per-call side effects.
        encode ( text, testtokens );
        // A fresh calculator per reference, as before, but with automatic
        // storage: the heap-allocated one was never deleted.
        terCalc evaluation;
        evaluation.setDebugMode ( false );
        terAlignment tmp_result = evaluation.TER ( reftokens, testtokens );
        tmp_result.averageWords = averageLength;
        // Keep the first alignment, then any alignment with a lower score.
        if ( ( result.numEdits == 0.0 ) && ( result.averageWords == 0.0 ) ) {
            result = tmp_result;
        } else if ( result.scoreAv() > tmp_result.scoreAv() ) {
            result = tmp_result;
        }
    }
    ostringstream stats;
    stats << result.numEdits << " " << result.averageWords << " " << result.scoreAv() << " " ;
    string stats_str = stats.str();
    entry.set ( stats_str );
}
// Returns 1 - comps[0] / comps[1], or 1 when comps[1] is zero.
// comps[0] and comps[1] are presumably the accumulated edit count and
// reference length written by prepareStats - TODO confirm with the caller.
float TerScorer::calculateScore ( const vector<int>& comps )
{
    const float referenceLength = 1.0 * comps[1];
    if ( referenceLength == 0 ) {
        // shouldn't happen!
        return 1.0;
    }
    const float negatedEdits = -1.0 * comps[0];
    return ( 1.0 + ( negatedEdits / referenceLength ) );
}
// Returns 1 - comps[0] / comps[1], or 1 when comps[1] is zero.
// comps[0] and comps[1] are presumably the accumulated edit count and
// reference length written by prepareStats - TODO confirm with the caller.
float TerScorer::calculateScore ( const vector<float>& comps )
{
    const float referenceLength = 1.0 * comps[1];
    if ( referenceLength == 0 ) {
        // shouldn't happen!
        return 1.0;
    }
    const float negatedEdits = -1.0 * comps[0];
    return ( 1.0 + ( negatedEdits / referenceLength ) );
}

67
mert/TerScorer.h Normal file
View File

@ -0,0 +1,67 @@
#ifndef __TERSCORER_H__
#define __TERSCORER_H__
// #include <stdio.h>
#include <algorithm>
#include <cmath>
#include <iostream>
#include <iterator>
#include <set>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
#include <limits.h>
#include "Types.h"
#include "ScoreData.h"
#include "Scorer.h"
#include "TERsrc/tercalc.h"
#include "TERsrc/terAlignment.h"
using namespace std;
using namespace TERCpp;
// enum TerReferenceLengthStrategy { TER_AVERAGE, TER_SHORTEST, TER_CLOSEST };
/**
* TER scoring: a StatisticsBasedScorer registered under the name "TER".
* Per-sentence statistics come from the TERCpp implementation (terCalc).
**/
class TerScorer: public StatisticsBasedScorer
{
public:
TerScorer(const string& config = "") : StatisticsBasedScorer("TER",config) {}
// Loads one or more reference files, one sentence per line.
virtual void setReferenceFiles(const vector<string>& referenceFiles);
// Writes the statistics of hypothesis `text` for sentence `sid` into `entry`.
virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry);
// Defined in TerScorer.cpp.
static const int LENGTH;
virtual void whoami() {
cerr << "I AM TerScorer" << std::endl;
}
// Returns LENGTH + 1. Note that it also writes a trace line to cerr on
// every call.
size_t NumberOfScores() {
cerr << "TerScorer: " << (LENGTH + 1) << endl;
return (LENGTH + 1);
};
// protected:
// Turn accumulated statistics into a score: 1 - comps[0] / comps[1].
float calculateScore(const vector<int>& comps);
float calculateScore(const vector<float>& comps);
private:
// NOTE(review): javaEnv and tercomEnv are not referenced by the TerScorer
// code visible here - possibly leftovers; confirm before removing.
string javaEnv;
string tercomEnv;
//no copy
TerScorer(const TerScorer&);
~TerScorer() {};
TerScorer& operator=(const TerScorer&);
// data extracted from reference files
// NOTE(review): _reflengths and _reftokens are only cleared, never filled,
// by the setReferenceFiles implementation visible here.
vector<size_t> _reflengths;
vector<multiset<int> > _reftokens;
// Encoded sentences of a single reference file (the first one once
// setReferenceFiles has returned).
vector<vector<int> > m_references;
// Encoded sentences of every reference file, indexed [file][sentence].
vector<vector<vector<int> > > m_multi_references;
// NOTE(review): m_pid is not referenced by the TerScorer code visible here.
string m_pid;
};
#endif //__TERSCORER_H

View File

@ -10,7 +10,7 @@ $extractor --binary --ffile FEATSTAT.3 --scfile SCORESTAT.3 --sctype BLEU --prev
$extractor --nbest NBEST --reference REF.0,REF.1,REF.2 --ffile FEATSTAT.4 --scfile SCORESTAT.4 --sctype BLEU --prev-ffile FEATSTAT,FEATSTAT.3 --prev-scfile SCORESTAT,SCORESTAT.3
$mert --ifile init.opt --scfile SCORESTAT --ffile FEATSTAT -d $size --verbose 4 -n 5
$mert -r 1234 --ifile init.opt --scfile SCORESTAT --ffile FEATSTAT -d $size --verbose 4 -n 5
exit

View File

@ -1 +1,3 @@
1 1 1 1 1 1 1 1 0.3 0.2 0.3 0.2 0 0 1
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1