extract can now read an instance weights file (weights for each sentence) via --InstanceWeights.

Still have to parallelise.
This commit is contained in:
Barry Haddow 2012-12-21 15:39:25 +00:00
parent 8fe900d312
commit 861792bfc5
5 changed files with 44 additions and 7 deletions

View File

@ -46,6 +46,7 @@ class PhraseExtractionOptions {
bool includeSentenceIdFlag; //include sentence id in extract file
bool onlyOutputSpanInfo;
bool gzOutput;
std::string instanceWeightsFile; //weights for each sentence
public:
PhraseExtractionOptions(const int initmaxPhraseLength):
@ -99,7 +100,11 @@ public:
}
// Stores the given value in the gzOutput option flag.
void initGzOutput (const bool initgzOutput){
gzOutput= initgzOutput;
}
}
// Records the path of the instance weights file (weights for each sentence).
//
// initInstanceWeightsFile: C string holding the file name. A null pointer is
// accepted and leaves the stored path empty, because building a std::string
// from a null char pointer is undefined behaviour.
void initInstanceWeightsFile(const char* initInstanceWeightsFile) {
  if (initInstanceWeightsFile) {
    instanceWeightsFile = initInstanceWeightsFile;
  } else {
    instanceWeightsFile.clear();
  }
}
// functions for getting values
bool isAllModelsOutputFlag() const {
return allModelsOutputFlag;
@ -136,7 +141,10 @@ public:
}
// Returns the current value of the gzOutput option flag.
bool isGzOutput () const {
return gzOutput;
}
}
// Returns a copy of the stored instance weights file name.
std::string getInstanceWeightsFile() const {
  return std::string(instanceWeightsFile);
}
};
}

View File

@ -54,10 +54,11 @@ bool SentenceAlignment::processSourceSentence(const char * sourceString, int, bo
return true;
}
bool SentenceAlignment::create( char targetString[], char sourceString[], char alignmentString[], int sentenceID, bool boundaryRules)
bool SentenceAlignment::create( char targetString[], char sourceString[], char alignmentString[], char weightString[], int sentenceID, bool boundaryRules)
{
using namespace std;
this->sentenceID = sentenceID;
this->weightString = std::string(weightString);
// process sentence strings and store in target and source members.
if (!processTargetSentence(targetString, sentenceID, boundaryRules)) {

View File

@ -35,6 +35,7 @@ public:
std::vector<int> alignedCountS;
std::vector<std::vector<int> > alignedToT;
int sentenceID;
std::string weightString;
virtual ~SentenceAlignment();
@ -43,7 +44,7 @@ public:
virtual bool processSourceSentence(const char *, int, bool boundaryRules);
bool create(char targetString[], char sourceString[],
char alignmentString[], int sentenceID, bool boundaryRules);
char alignmentString[], char weightString[], int sentenceID, bool boundaryRules);
};

View File

@ -114,7 +114,7 @@ int main(int argc, char* argv[])
if (argc < 6) {
cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] ";
cerr<<"| --OnlyOutputSpanInfo | --NoTTable | --GZOutput | --IncludeSentenceId | --SentenceOffset n ]\n";
cerr<<"| --OnlyOutputSpanInfo | --NoTTable | --GZOutput | --IncludeSentenceId | --SentenceOffset n | --InstanceWeights filename ]\n";
exit(1);
}
@ -144,6 +144,12 @@ int main(int argc, char* argv[])
sentenceOffset = atoi(argv[++i]);
} else if (strcmp(argv[i], "--GZOutput") == 0) {
options.initGzOutput(true);
} else if (strcmp(argv[i], "--InstanceWeights") == 0) {
if (i+1 >= argc) {
cerr << "extract: syntax error, used switch --InstanceWeights without file name" << endl;
exit(1);
}
options.initInstanceWeightsFile(argv[++i]);
} else if(strcmp(argv[i],"--model") == 0) {
if (i+1 >= argc) {
cerr << "extract: syntax error, no model's information provided to the option --model " << endl;
@ -220,6 +226,13 @@ int main(int argc, char* argv[])
istream *fFileP = &fFile;
istream *aFileP = &aFile;
istream *iwFileP = NULL;
auto_ptr<Moses::InputFileStream> instanceWeightsFile;
if (options.getInstanceWeightsFile().length()) {
instanceWeightsFile.reset(new Moses::InputFileStream(options.getInstanceWeightsFile()));
iwFileP = instanceWeightsFile.get();
}
// open output files
if (options.isTranslationFlag()) {
string fileNameExtractInv = fileNameExtract + ".inv" + (options.isGzOutput()?".gz":"");
@ -238,10 +251,14 @@ int main(int argc, char* argv[])
char englishString[LINE_MAX_LENGTH];
char foreignString[LINE_MAX_LENGTH];
char alignmentString[LINE_MAX_LENGTH];
char weightString[LINE_MAX_LENGTH];
SAFE_GETLINE((*eFileP), englishString, LINE_MAX_LENGTH, '\n', __FILE__);
if (eFileP->eof()) break;
SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n', __FILE__);
SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
if (iwFileP) {
SAFE_GETLINE((*iwFileP), weightString, LINE_MAX_LENGTH, '\n', __FILE__);
}
SentenceAlignment sentence;
// cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
//az: output src, tgt, and alignment line
@ -251,7 +268,7 @@ int main(int argc, char* argv[])
cout << "LOG: ALT: " << alignmentString << endl;
cout << "LOG: PHRASES_BEGIN:" << endl;
}
if (sentence.create( englishString, foreignString, alignmentString, i, false)) {
if (sentence.create( englishString, foreignString, alignmentString, weightString, i, false)) {
ExtractTask *task = new ExtractTask(i-1, sentence, options, extractFile , extractFileInv, extractFileOrientation);
task->Run();
delete task;
@ -695,6 +712,16 @@ for(int fi=startF; fi<=endF; fi++) {
if (m_options.isOrientationFlag())
outextractstrOrientation << orientationInfo;
if (m_options.getInstanceWeightsFile().length()) {
if (m_options.isTranslationFlag()) {
outextractstr << " ||| " << sentence.weightString;
outextractstrInv << " ||| " << sentence.weightString;
}
if (m_options.isOrientationFlag()) {
outextractstrOrientation << " ||| " << sentence.weightString;
}
}
if (m_options.isIncludeSentenceIdFlag()) {
outextractstr << " ||| " << sentence.sentenceID;
}

View File

@ -337,7 +337,7 @@ int main(int argc, char* argv[])
cout << "LOG: PHRASES_BEGIN:" << endl;
}
if (sentence.create(targetString, sourceString, alignmentString, i, options.boundaryRules)) {
if (sentence.create(targetString, sourceString, alignmentString,"", i, options.boundaryRules)) {
if (options.unknownWordLabelFlag) {
collectWordLabelCounts(sentence);
}