mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-29 06:52:34 +03:00
submitted working scripts for hierarchical training (msd)
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/branches/hierarchical-reo@2796 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
parent
ad3b0760b2
commit
bf70dd4767
@ -10,9 +10,7 @@
|
||||
#include <time.h>
|
||||
#include <cstring>
|
||||
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <vector>
|
||||
#include "hierarchical.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -28,20 +26,6 @@ using namespace std;
|
||||
}
|
||||
#define LINE_MAX_LENGTH 60000
|
||||
|
||||
// HPhraseVertex represents a point in the alignment matrix
|
||||
typedef pair <int, int> HPhraseVertex;
|
||||
|
||||
// Phrase represents a bi-phrase; each bi-phrase is defined by two points in the alignment matrix:
|
||||
// bottom-left and top-right
|
||||
typedef pair<HPhraseVertex, HPhraseVertex> HPhrase;
|
||||
|
||||
// HPhraseVector is a vector of HPhrases
|
||||
typedef vector < HPhrase > HPhraseVector;
|
||||
|
||||
// SentenceVertices represents, from all extracted phrases, all vertices that have the same positioning
|
||||
// The key of the map is the English index and the value is a set of the foreign ones
|
||||
typedef map <int, set<int> > HSenteceVertices;
|
||||
|
||||
class SentenceAlignment {
|
||||
public:
|
||||
vector<string> english;
|
||||
@ -59,20 +43,11 @@ void addPhrase( SentenceAlignment &, int, int, int, int );
|
||||
vector<string> tokenize( char [] );
|
||||
bool isAligned ( SentenceAlignment &, int, int );
|
||||
|
||||
// Reordering
|
||||
// Hierarchical reordering
|
||||
void HRextract( SentenceAlignment & );
|
||||
void HRaddPhrase( SentenceAlignment &, int, int, int, int, string &, string & );
|
||||
void PBextract( SentenceAlignment & );
|
||||
|
||||
enum REO_MODEL_NAME {REO_HIER, REO_PHRASE, REO_WORD};
|
||||
enum REO_MODEL_TYPE {REO_MSD, REO_MSLR, REO_MONO, REO_LR};
|
||||
|
||||
bool allModelsOutputFlag = false;
|
||||
REO_MODEL_NAME modelName;
|
||||
REO_MODEL_TYPE modelType;
|
||||
|
||||
map < REO_MODEL_NAME, REO_MODEL_TYPE > selectedModels;
|
||||
|
||||
ofstream extractFile;
|
||||
ofstream extractFileInv;
|
||||
ofstream extractFileOrientation;
|
||||
@ -80,6 +55,8 @@ int maxPhraseLength;
|
||||
int phraseCount = 0;
|
||||
char* fileNameExtract;
|
||||
bool orientationFlag = false;
|
||||
bool HRorientationFlag = false;
|
||||
bool PBorientationFlag = false;
|
||||
bool onlyOutputSpanInfo = false;
|
||||
bool noFileLimit = false;
|
||||
bool zipFiles = false;
|
||||
@ -91,59 +68,25 @@ int main(int argc, char* argv[])
|
||||
<< "phrase extraction from an aligned parallel corpus\n";
|
||||
time_t starttime = time(NULL);
|
||||
|
||||
if (argc < 6) {
|
||||
cerr << "syntax: phrase-extract en de align extract max-length [orientation | --OnlyOutputSpanInfo | --NoFileLimit | --ProperConditioning | --hierarchical-reo]\n";
|
||||
exit(1);
|
||||
}
|
||||
char* &fileNameE = argv[1];
|
||||
char* &fileNameF = argv[2];
|
||||
char* &fileNameA = argv[3];
|
||||
fileNameExtract = argv[4];
|
||||
maxPhraseLength = atoi(argv[5]);
|
||||
|
||||
for(int i=6;i<argc;i++) {
|
||||
if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
|
||||
onlyOutputSpanInfo = true;
|
||||
}
|
||||
else if (strcmp(argv[i],"--NoFileLimit") == 0) {
|
||||
noFileLimit = true;
|
||||
}
|
||||
else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
|
||||
orientationFlag = true;
|
||||
selectedModels.insert(make_pair(REO_WORD, REO_MSD));
|
||||
}
|
||||
else if(strcmp(argv[i],"--model") == 0){
|
||||
char* modelParams = argv[++i];
|
||||
char* modelName = strtok(modelParams, "-");
|
||||
char* modelType = strtok(modelParams, "-");
|
||||
|
||||
REO_MODEL_NAME intModelName;
|
||||
REO_MODEL_TYPE intModelType;
|
||||
|
||||
if(strcmp(modelName, "word") == 0)
|
||||
intModelName = REO_WORD;
|
||||
else if(strcmp(modelName, "phrase") == 0)
|
||||
intModelName = REO_PHRASE;
|
||||
else if(strcmp(modelName, "phrase") == 0)
|
||||
intModelName = REO_HIER;
|
||||
else{
|
||||
cerr << "extract: syntax error, unknown reordering model: " << modelName << endl;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if(strcmp(modelType, "msd") == 0)
|
||||
intModelType = REO_MSD;
|
||||
else if(strcmp(modelType, "mslr") == 0)
|
||||
intModelType = REO_MSLR;
|
||||
else if(strcmp(modelType, "mono") == 0)
|
||||
intModelType = REO_MONO;
|
||||
else if(strcmp(modelType, "leftright") == 0)
|
||||
intModelType = REO_LR;
|
||||
else{
|
||||
cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
if (argc < 6) {
|
||||
cerr << "syntax: phrase-extract en de align extract max-length [orientation | --OnlyOutputSpanInfo | --NoFileLimit | --ProperConditioning | --hierarchical-reo]\n";
|
||||
exit(1);
|
||||
}
|
||||
char* &fileNameE = argv[1];
|
||||
char* &fileNameF = argv[2];
|
||||
char* &fileNameA = argv[3];
|
||||
fileNameExtract = argv[4];
|
||||
maxPhraseLength = atoi(argv[5]);
|
||||
|
||||
for(int i=6;i<argc;i++) {
|
||||
if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
|
||||
onlyOutputSpanInfo = true;
|
||||
}
|
||||
else if (strcmp(argv[i],"--NoFileLimit") == 0) {
|
||||
noFileLimit = true;
|
||||
}
|
||||
else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
|
||||
orientationFlag = true;
|
||||
}
|
||||
else if (strcmp(argv[i],"--ZipFiles") == 0) {
|
||||
zipFiles = true;
|
||||
@ -151,6 +94,12 @@ int main(int argc, char* argv[])
|
||||
else if (strcmp(argv[i],"--ProperConditioning") == 0) {
|
||||
properConditioning = true;
|
||||
}
|
||||
else if (strcmp(argv[i],"--hierarchical-reo") == 0) {
|
||||
HRorientationFlag = true;
|
||||
}
|
||||
else if (strcmp(argv[i],"--phrase-based-reo") == 0) {
|
||||
PBorientationFlag = true;
|
||||
}
|
||||
else {
|
||||
cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
|
||||
exit(1);
|
||||
@ -192,7 +141,6 @@ int main(int argc, char* argv[])
|
||||
HRextract(sentence);
|
||||
else if(PBorientationFlag)
|
||||
PBextract(sentence);
|
||||
else
|
||||
extract(sentence);
|
||||
if (properConditioning) extractBase(sentence);
|
||||
}
|
||||
@ -286,7 +234,7 @@ void extract( SentenceAlignment &sentence ) {
|
||||
(endF<countF &&
|
||||
endF<startF+maxPhraseLength && // within length limit
|
||||
(endF==maxF || sentence.alignedCountF[endF]==0)); // unaligned
|
||||
endF++)
|
||||
endF++)
|
||||
addPhrase(sentence,startE,endE,startF,endF);
|
||||
}
|
||||
}
|
||||
@ -369,11 +317,8 @@ void addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, i
|
||||
// orientation to previous E
|
||||
bool connectedLeftTop = isAligned( sentence, startF-1, startE-1 );
|
||||
bool connectedRightTop = isAligned( sentence, endF+1, startE-1 );
|
||||
if ( connectedLeftTop && !connectedRightTop)
|
||||
{
|
||||
if()
|
||||
if ( connectedLeftTop && !connectedRightTop)
|
||||
extractFileOrientation << "mono";
|
||||
}
|
||||
else if (!connectedLeftTop && connectedRightTop)
|
||||
extractFileOrientation << "swap";
|
||||
else
|
||||
|
2017
scripts/training/train-factored-phrase-model-hierreo.perl
Normal file
2017
scripts/training/train-factored-phrase-model-hierreo.perl
Normal file
File diff suppressed because it is too large
Load Diff
@ -1372,8 +1372,8 @@ sub get_reordering {
|
||||
}
|
||||
|
||||
my $smooth = $___REORDERING_SMOOTH;
|
||||
my @REORDERING_SMOOTH_PREVIOUS = ($smooth,$smooth,$smooth,$smooth);
|
||||
my @REORDERING_SMOOTH_FOLLOWING = ($smooth,$smooth,$smooth,$smooth);
|
||||
my @REORDERING_SMOOTH_PREVIOUS = ($smooth,$smooth,$smooth,0);
|
||||
my @REORDERING_SMOOTH_FOLLOWING = ($smooth,$smooth,$smooth,0);
|
||||
|
||||
my (%SMOOTH_PREVIOUS,%SMOOTH_FOLLOWING);
|
||||
if ($smooth =~ /(.+)u$/) {
|
||||
@ -1501,19 +1501,19 @@ sub store_reordering_f {
|
||||
$f_current,
|
||||
$mono_previous_f/$total_previous_f,
|
||||
$swap_previous_f/$total_previous_f,
|
||||
$left_previous_f+$right_previous_f/$total_previous_f);
|
||||
($left_previous_f+$right_previous_f)/$total_previous_f);
|
||||
}
|
||||
elsif ($model->{"orient"} eq "monotonicity") {
|
||||
printf { $model->{"filehandle"} } ("%s ||| %g %g\n",
|
||||
$f_current,
|
||||
$mono_previous_f/$total_previous_f,
|
||||
$swap_previous_f+$left_previous_f+$right_previous_f/$total_previous_f);
|
||||
($swap_previous_f+$left_previous_f+$right_previous_f)/$total_previous_f);
|
||||
}
|
||||
elsif ($model->{"orient"} eq "leftright") {
|
||||
printf { $model->{"filehandle"} } ("%s ||| %g %g\n",
|
||||
$f_current,
|
||||
$mono_previous_f+$left_previous_f/$total_previous_f,
|
||||
$swap_previous_f+$right_previous_f/$total_previous_f);
|
||||
($mono_previous_f+$left_previous_f)/$total_previous_f,
|
||||
($swap_previous_f+$right_previous_f)/$total_previous_f);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1525,14 +1525,14 @@ sub store_reordering_fe {
|
||||
foreach my $model (@REORDERING_MODELS) {
|
||||
next if ($model->{"lang"} ne "fe");
|
||||
if ($model->{"orient"} eq "mslr") {
|
||||
printf { $model->{"filehandle"} } ("%s ||| %s ||| %g %g %g %g",
|
||||
printf { $model->{"filehandle"} } ("%s ||| %s ||| %g %g %g %g ",
|
||||
$f_current, $e_current,
|
||||
$mono_previous_fe/$total_previous_fe,
|
||||
$swap_previous_fe/$total_previous_fe,
|
||||
$left_previous_fe/$total_previous_fe,
|
||||
$right_previous_fe/$total_previous_fe);
|
||||
if ($model->{"dir"} eq "bidirectional") {
|
||||
printf { $model->{"filehandle"} } (" ||| %g %g %g %g",
|
||||
printf { $model->{"filehandle"} } ("%g %g %g %g",
|
||||
$mono_following_fe/$total_following_fe,
|
||||
$swap_following_fe/$total_following_fe,
|
||||
$left_following_fe/$total_following_fe,
|
||||
@ -1541,40 +1541,40 @@ sub store_reordering_fe {
|
||||
printf { $model->{"filehandle"} } ("\n");
|
||||
}
|
||||
elsif ($model->{"orient"} eq "msd") {
|
||||
printf { $model->{"filehandle"} } ("%s ||| %s ||| %g %g %g",
|
||||
printf { $model->{"filehandle"} } ("%s ||| %s ||| %g %g %g ",
|
||||
$f_current, $e_current,
|
||||
$mono_previous_fe/$total_previous_fe,
|
||||
$swap_previous_fe/$total_previous_fe,
|
||||
$left_previous_fe+$right_previous_fe/$total_previous_fe);
|
||||
($left_previous_fe+$right_previous_fe)/$total_previous_fe);
|
||||
if ($model->{"dir"} eq "bidirectional") {
|
||||
printf { $model->{"filehandle"} } (" ||| %g %g %g",
|
||||
printf { $model->{"filehandle"} } ("%g %g %g",
|
||||
$mono_following_fe/$total_following_fe,
|
||||
$swap_following_fe/$total_following_fe,
|
||||
$left_following_fe+$right_following_fe/$total_following_fe);
|
||||
($left_following_fe+$right_following_fe)/$total_following_fe);
|
||||
}
|
||||
printf { $model->{"filehandle"} } ("\n");
|
||||
}
|
||||
elsif ($model->{"orient"} eq "monotonicity") {
|
||||
printf { $model->{"filehandle"} } ("%s ||| %s ||| %g %g",
|
||||
printf { $model->{"filehandle"} } ("%s %s ||| %g %g ",
|
||||
$f_current, $e_current,
|
||||
$mono_previous_fe/$total_previous_fe,
|
||||
$swap_previous_fe+$left_previous_fe+$right_previous_fe/$total_previous_fe);
|
||||
($swap_previous_fe+$left_previous_fe+$right_previous_fe)/$total_previous_fe);
|
||||
if ($model->{"dir"} eq "bidirectional") {
|
||||
printf { $model->{"filehandle"} } (" ||| %g %g",
|
||||
printf { $model->{"filehandle"} } ("%g %g",
|
||||
$mono_following_fe/$total_following_fe,
|
||||
$swap_following_fe+$left_following_fe+$right_following_fe/$total_following_fe);
|
||||
($swap_following_fe+$left_following_fe+$right_following_fe)/$total_following_fe);
|
||||
}
|
||||
printf { $model->{"filehandle"} } ("\n");
|
||||
}
|
||||
elsif ($model->{"orient"} eq "leftright") {
|
||||
printf { $model->{"filehandle"} } ("%s ||| %s ||| %g %g",
|
||||
printf { $model->{"filehandle"} } ("%s ||| %s ||| %g %g ",
|
||||
$f_current, $e_current,
|
||||
$mono_previous_fe+$left_previous_fe/$total_previous_fe,
|
||||
$swap_previous_fe+$right_previous_fe/$total_previous_fe);
|
||||
($mono_previous_fe+$left_previous_fe)/$total_previous_fe,
|
||||
($swap_previous_fe+$right_previous_fe)/$total_previous_fe);
|
||||
if ($model->{"dir"} eq "bidirectional") {
|
||||
printf { $model->{"filehandle"} } (" ||| %g %g",
|
||||
$mono_following_fe+$left_following_fe/$total_following_fe,
|
||||
$swap_following_fe+$right_following_fe/$total_following_fe);
|
||||
printf { $model->{"filehandle"} } ("%g %g",
|
||||
($mono_following_fe+$left_following_fe)/$total_following_fe,
|
||||
($swap_following_fe+$right_following_fe)/$total_following_fe);
|
||||
}
|
||||
printf { $model->{"filehandle"} } ("\n");
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user