submitted working scripts for hierarchical training (msd)

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/branches/hierarchical-reo@2796 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
sarst 2010-01-29 22:38:18 +00:00
parent ad3b0760b2
commit bf70dd4767
3 changed files with 2070 additions and 108 deletions

View File

@ -10,9 +10,7 @@
#include <time.h>
#include <cstring>
#include <map>
#include <set>
#include <vector>
#include "hierarchical.h"
using namespace std;
@ -28,20 +26,6 @@ using namespace std;
}
#define LINE_MAX_LENGTH 60000
// HPhraseVertex represents a point in the alignment matrix
typedef pair <int, int> HPhraseVertex;
// Phrase represents a bi-phrase; each bi-phrase is defined by two points in the alignment matrix:
// bottom-left and top-right
typedef pair<HPhraseVertex, HPhraseVertex> HPhrase;
// HPhraseVector is a vector of HPhrases
typedef vector < HPhrase > HPhraseVector;
// SentenceVertices represents, from all extracted phrases, all vertices that have the same positioning
// The key of the map is the English index and the value is a set of the foreign ones
typedef map <int, set<int> > HSenteceVertices;
class SentenceAlignment {
public:
vector<string> english;
@ -59,20 +43,11 @@ void addPhrase( SentenceAlignment &, int, int, int, int );
vector<string> tokenize( char [] );
bool isAligned ( SentenceAlignment &, int, int );
// Reordering
// Hierarchical reordering
void HRextract( SentenceAlignment & );
void HRaddPhrase( SentenceAlignment &, int, int, int, int, string &, string & );
void PBextract( SentenceAlignment & );
enum REO_MODEL_NAME {REO_HIER, REO_PHRASE, REO_WORD};
enum REO_MODEL_TYPE {REO_MSD, REO_MSLR, REO_MONO, REO_LR};
bool allModelsOutputFlag = false;
REO_MODEL_NAME modelName;
REO_MODEL_TYPE modelType;
map < REO_MODEL_NAME, REO_MODEL_TYPE > selectedModels;
ofstream extractFile;
ofstream extractFileInv;
ofstream extractFileOrientation;
@ -80,6 +55,8 @@ int maxPhraseLength;
int phraseCount = 0;
char* fileNameExtract;
bool orientationFlag = false;
bool HRorientationFlag = false;
bool PBorientationFlag = false;
bool onlyOutputSpanInfo = false;
bool noFileLimit = false;
bool zipFiles = false;
@ -91,59 +68,25 @@ int main(int argc, char* argv[])
<< "phrase extraction from an aligned parallel corpus\n";
time_t starttime = time(NULL);
if (argc < 6) {
cerr << "syntax: phrase-extract en de align extract max-length [orientation | --OnlyOutputSpanInfo | --NoFileLimit | --ProperConditioning | --hierarchical-reo]\n";
exit(1);
}
char* &fileNameE = argv[1];
char* &fileNameF = argv[2];
char* &fileNameA = argv[3];
fileNameExtract = argv[4];
maxPhraseLength = atoi(argv[5]);
for(int i=6;i<argc;i++) {
if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
onlyOutputSpanInfo = true;
}
else if (strcmp(argv[i],"--NoFileLimit") == 0) {
noFileLimit = true;
}
else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
orientationFlag = true;
selectedModels.insert(make_pair(REO_WORD, REO_MSD));
}
else if(strcmp(argv[i],"--model") == 0){
char* modelParams = argv[++i];
char* modelName = strtok(modelParams, "-");
char* modelType = strtok(modelParams, "-");
REO_MODEL_NAME intModelName;
REO_MODEL_TYPE intModelType;
if(strcmp(modelName, "word") == 0)
intModelName = REO_WORD;
else if(strcmp(modelName, "phrase") == 0)
intModelName = REO_PHRASE;
else if(strcmp(modelName, "phrase") == 0)
intModelName = REO_HIER;
else{
cerr << "extract: syntax error, unknown reordering model: " << modelName << endl;
exit(1);
}
if(strcmp(modelType, "msd") == 0)
intModelType = REO_MSD;
else if(strcmp(modelType, "mslr") == 0)
intModelType = REO_MSLR;
else if(strcmp(modelType, "mono") == 0)
intModelType = REO_MONO;
else if(strcmp(modelType, "leftright") == 0)
intModelType = REO_LR;
else{
cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
exit(1);
}
}
if (argc < 6) {
cerr << "syntax: phrase-extract en de align extract max-length [orientation | --OnlyOutputSpanInfo | --NoFileLimit | --ProperConditioning | --hierarchical-reo]\n";
exit(1);
}
char* &fileNameE = argv[1];
char* &fileNameF = argv[2];
char* &fileNameA = argv[3];
fileNameExtract = argv[4];
maxPhraseLength = atoi(argv[5]);
for(int i=6;i<argc;i++) {
if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
onlyOutputSpanInfo = true;
}
else if (strcmp(argv[i],"--NoFileLimit") == 0) {
noFileLimit = true;
}
else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
orientationFlag = true;
}
else if (strcmp(argv[i],"--ZipFiles") == 0) {
zipFiles = true;
@ -151,6 +94,12 @@ int main(int argc, char* argv[])
else if (strcmp(argv[i],"--ProperConditioning") == 0) {
properConditioning = true;
}
else if (strcmp(argv[i],"--hierarchical-reo") == 0) {
HRorientationFlag = true;
}
else if (strcmp(argv[i],"--phrase-based-reo") == 0) {
PBorientationFlag = true;
}
else {
cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
exit(1);
@ -192,7 +141,6 @@ int main(int argc, char* argv[])
HRextract(sentence);
else if(PBorientationFlag)
PBextract(sentence);
else
extract(sentence);
if (properConditioning) extractBase(sentence);
}
@ -286,7 +234,7 @@ void extract( SentenceAlignment &sentence ) {
(endF<countF &&
endF<startF+maxPhraseLength && // within length limit
(endF==maxF || sentence.alignedCountF[endF]==0)); // unaligned
endF++)
endF++)
addPhrase(sentence,startE,endE,startF,endF);
}
}
@ -369,11 +317,8 @@ void addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, i
// orientation to previous E
bool connectedLeftTop = isAligned( sentence, startF-1, startE-1 );
bool connectedRightTop = isAligned( sentence, endF+1, startE-1 );
if ( connectedLeftTop && !connectedRightTop)
{
if()
if ( connectedLeftTop && !connectedRightTop)
extractFileOrientation << "mono";
}
else if (!connectedLeftTop && connectedRightTop)
extractFileOrientation << "swap";
else

File diff suppressed because it is too large Load Diff

View File

@ -1372,8 +1372,8 @@ sub get_reordering {
}
my $smooth = $___REORDERING_SMOOTH;
my @REORDERING_SMOOTH_PREVIOUS = ($smooth,$smooth,$smooth,$smooth);
my @REORDERING_SMOOTH_FOLLOWING = ($smooth,$smooth,$smooth,$smooth);
my @REORDERING_SMOOTH_PREVIOUS = ($smooth,$smooth,$smooth,0);
my @REORDERING_SMOOTH_FOLLOWING = ($smooth,$smooth,$smooth,0);
my (%SMOOTH_PREVIOUS,%SMOOTH_FOLLOWING);
if ($smooth =~ /(.+)u$/) {
@ -1501,19 +1501,19 @@ sub store_reordering_f {
$f_current,
$mono_previous_f/$total_previous_f,
$swap_previous_f/$total_previous_f,
$left_previous_f+$right_previous_f/$total_previous_f);
($left_previous_f+$right_previous_f)/$total_previous_f);
}
elsif ($model->{"orient"} eq "monotonicity") {
printf { $model->{"filehandle"} } ("%s ||| %g %g\n",
$f_current,
$mono_previous_f/$total_previous_f,
$swap_previous_f+$left_previous_f+$right_previous_f/$total_previous_f);
($swap_previous_f+$left_previous_f+$right_previous_f)/$total_previous_f);
}
elsif ($model->{"orient"} eq "leftright") {
printf { $model->{"filehandle"} } ("%s ||| %g %g\n",
$f_current,
$mono_previous_f+$left_previous_f/$total_previous_f,
$swap_previous_f+$right_previous_f/$total_previous_f);
($mono_previous_f+$left_previous_f)/$total_previous_f,
($swap_previous_f+$right_previous_f)/$total_previous_f);
}
}
}
@ -1525,14 +1525,14 @@ sub store_reordering_fe {
foreach my $model (@REORDERING_MODELS) {
next if ($model->{"lang"} ne "fe");
if ($model->{"orient"} eq "mslr") {
printf { $model->{"filehandle"} } ("%s ||| %s ||| %g %g %g %g",
printf { $model->{"filehandle"} } ("%s ||| %s ||| %g %g %g %g ",
$f_current, $e_current,
$mono_previous_fe/$total_previous_fe,
$swap_previous_fe/$total_previous_fe,
$left_previous_fe/$total_previous_fe,
$right_previous_fe/$total_previous_fe);
if ($model->{"dir"} eq "bidirectional") {
printf { $model->{"filehandle"} } (" ||| %g %g %g %g",
printf { $model->{"filehandle"} } ("%g %g %g %g",
$mono_following_fe/$total_following_fe,
$swap_following_fe/$total_following_fe,
$left_following_fe/$total_following_fe,
@ -1541,40 +1541,40 @@ sub store_reordering_fe {
printf { $model->{"filehandle"} } ("\n");
}
elsif ($model->{"orient"} eq "msd") {
printf { $model->{"filehandle"} } ("%s ||| %s ||| %g %g %g",
printf { $model->{"filehandle"} } ("%s ||| %s ||| %g %g %g ",
$f_current, $e_current,
$mono_previous_fe/$total_previous_fe,
$swap_previous_fe/$total_previous_fe,
$left_previous_fe+$right_previous_fe/$total_previous_fe);
($left_previous_fe+$right_previous_fe)/$total_previous_fe);
if ($model->{"dir"} eq "bidirectional") {
printf { $model->{"filehandle"} } (" ||| %g %g %g",
printf { $model->{"filehandle"} } ("%g %g %g",
$mono_following_fe/$total_following_fe,
$swap_following_fe/$total_following_fe,
$left_following_fe+$right_following_fe/$total_following_fe);
($left_following_fe+$right_following_fe)/$total_following_fe);
}
printf { $model->{"filehandle"} } ("\n");
}
elsif ($model->{"orient"} eq "monotonicity") {
printf { $model->{"filehandle"} } ("%s ||| %s ||| %g %g",
printf { $model->{"filehandle"} } ("%s %s ||| %g %g ",
$f_current, $e_current,
$mono_previous_fe/$total_previous_fe,
$swap_previous_fe+$left_previous_fe+$right_previous_fe/$total_previous_fe);
($swap_previous_fe+$left_previous_fe+$right_previous_fe)/$total_previous_fe);
if ($model->{"dir"} eq "bidirectional") {
printf { $model->{"filehandle"} } (" ||| %g %g",
printf { $model->{"filehandle"} } ("%g %g",
$mono_following_fe/$total_following_fe,
$swap_following_fe+$left_following_fe+$right_following_fe/$total_following_fe);
($swap_following_fe+$left_following_fe+$right_following_fe)/$total_following_fe);
}
printf { $model->{"filehandle"} } ("\n");
}
elsif ($model->{"orient"} eq "leftright") {
printf { $model->{"filehandle"} } ("%s ||| %s ||| %g %g",
printf { $model->{"filehandle"} } ("%s ||| %s ||| %g %g ",
$f_current, $e_current,
$mono_previous_fe+$left_previous_fe/$total_previous_fe,
$swap_previous_fe+$right_previous_fe/$total_previous_fe);
($mono_previous_fe+$left_previous_fe)/$total_previous_fe,
($swap_previous_fe+$right_previous_fe)/$total_previous_fe);
if ($model->{"dir"} eq "bidirectional") {
printf { $model->{"filehandle"} } (" ||| %g %g",
$mono_following_fe+$left_following_fe/$total_following_fe,
$swap_following_fe+$right_following_fe/$total_following_fe);
printf { $model->{"filehandle"} } ("%g %g",
($mono_following_fe+$left_following_fe)/$total_following_fe,
($swap_following_fe+$right_following_fe)/$total_following_fe);
}
printf { $model->{"filehandle"} } ("\n");
}