mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-28 14:32:38 +03:00
optionally ignore punctuation for glm
This commit is contained in:
parent
30cf66d180
commit
6ce2fed6ff
@ -9,19 +9,25 @@ using namespace std;
|
||||
namespace Moses
|
||||
{
|
||||
// Constructor for the unlimited global lexical model feature.
// It registers the feature with StatelessFeatureFunction under the name "glm",
// initialises the sparse producer weight to 1, stores the ignorePunctuation
// flag, calls LoadData() with the given factor lists and, only when the flag
// is set, fills m_punctuationHash.
//
//   inFactors         - input factor types, forwarded to LoadData()
//   outFactors        - output factor types, forwarded to LoadData()
//   ignorePunctuation - when true, the punctuation table below is built
//
// NOTE(review): this listing is a rendered diff hunk; superseded and new lines
// appear side by side (two tails of the parameter list, two tails of the
// initialiser list, two "Creating ..." log statements). Only one of each
// pair belongs to the compiled source.
GlobalLexicalModelUnlimited::GlobalLexicalModelUnlimited(const vector< FactorType >& inFactors,
|
||||
const vector< FactorType >& outFactors)
|
||||
const vector< FactorType >& outFactors,
|
||||
bool ignorePunctuation)
|
||||
: StatelessFeatureFunction("glm",ScoreProducer::unlimited),
|
||||
m_sparseProducerWeight(1)
|
||||
m_sparseProducerWeight(1),
|
||||
m_ignorePunctuation(ignorePunctuation)
|
||||
{
|
||||
std::cerr << "Creating global lexical model unlimited...\n";
|
||||
// This variant of the message has no trailing newline; the status line is
// terminated by the `cerr << endl` at the end of the constructor.
std::cerr << "Creating global lexical model unlimited.. ";
|
||||
|
||||
// load model
|
||||
LoadData( inFactors, outFactors );
|
||||
|
||||
// compile a list of punctuation characters
|
||||
/* char punctuation[] = "\"'!?¿·()#_,.:;•&@‑/\\0123456789~=";
|
||||
for (size_t i=0; i < sizeof(punctuation)-1; ++i)
|
||||
m_punctuationHash[punctuation[i]] = 1;*/
|
||||
// The punctuation table is only populated when the feature was asked to
// ignore punctuation; otherwise m_punctuationHash stays empty.
if (m_ignorePunctuation) {
|
||||
cerr << "ignoring punctuation";
|
||||
// Every byte of the literal except the terminating NUL becomes a key of
// m_punctuationHash. The literal also lists the digits 0-9, so they are
// entered in the table alongside the punctuation marks.
// NOTE(review): the literal contains non-ASCII characters; if this file is
// UTF-8 encoded each of them spans several chars, so their individual bytes
// (not the characters) are what gets inserted - confirm this is intended.
char punctuation[] = "\"'!?¿·()#_,.:;•&@‑/\\0123456789~=";
|
||||
for (size_t i=0; i < sizeof(punctuation)-1; ++i)
|
||||
m_punctuationHash[punctuation[i]] = 1;
|
||||
}
|
||||
// Ends (and flushes) the status line started above.
cerr << endl;
|
||||
}
|
||||
|
||||
GlobalLexicalModelUnlimited::~GlobalLexicalModelUnlimited(){}
|
||||
@ -46,22 +52,26 @@ void GlobalLexicalModelUnlimited::Evaluate(const TargetPhrase& targetPhrase, Sco
|
||||
for(size_t targetIndex = 0; targetIndex < targetPhrase.GetSize(); targetIndex++ ) {
|
||||
string targetString = targetPhrase.GetWord(targetIndex).GetString(0); // TODO: change for other factors
|
||||
|
||||
// check if first char is punctuation
|
||||
/* char firstChar = targetString.at(0);
|
||||
CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
|
||||
if(charIterator != m_punctuationHash.end())
|
||||
continue;*/
|
||||
if (m_ignorePunctuation) {
|
||||
// check if first char is punctuation
|
||||
char firstChar = targetString.at(0);
|
||||
CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
|
||||
if(charIterator != m_punctuationHash.end())
|
||||
continue;
|
||||
}
|
||||
|
||||
// set< const Word*, WordComparer > alreadyScored; // do not score a word twice
|
||||
StringHash alreadyScored;
|
||||
for(size_t inputIndex = 0; inputIndex < input.GetSize(); inputIndex++ ) {
|
||||
string inputString = input.GetWord(inputIndex).GetString(0); // TODO: change for other factors
|
||||
|
||||
// check if first char is punctuation
|
||||
/* firstChar = inputString.at(0);
|
||||
CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
|
||||
if(charIterator != m_punctuationHash.end())
|
||||
continue;*/
|
||||
if (m_ignorePunctuation) {
|
||||
// check if first char is punctuation
|
||||
char firstChar = inputString.at(0);
|
||||
CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
|
||||
if(charIterator != m_punctuationHash.end())
|
||||
continue;
|
||||
}
|
||||
|
||||
//if ( alreadyScored.find( &inputWord ) == alreadyScored.end() ) {
|
||||
if ( alreadyScored.find(inputString) == alreadyScored.end()) {
|
||||
|
@ -33,7 +33,7 @@ class InputType;
|
||||
|
||||
class GlobalLexicalModelUnlimited : public StatelessFeatureFunction
|
||||
{
|
||||
// typedef std::map< char, short > CharHash;
|
||||
typedef std::map< char, short > CharHash;
|
||||
typedef std::map< std::string, short > StringHash;
|
||||
|
||||
struct ThreadLocalStorage
|
||||
@ -51,7 +51,8 @@ private:
|
||||
private:
|
||||
const Sentence *m_input;
|
||||
|
||||
// CharHash m_punctuationHash;
|
||||
CharHash m_punctuationHash;
|
||||
bool m_ignorePunctuation;
|
||||
|
||||
std::vector< FactorType > m_inputFactors;
|
||||
std::vector< FactorType > m_outputFactors;
|
||||
@ -63,7 +64,8 @@ private:
|
||||
|
||||
public:
|
||||
GlobalLexicalModelUnlimited(const std::vector< FactorType >& inFactors,
|
||||
const std::vector< FactorType >& outFactors);
|
||||
const std::vector< FactorType >& outFactors,
|
||||
bool ignorePunctuation);
|
||||
|
||||
virtual ~GlobalLexicalModelUnlimited();
|
||||
|
||||
|
@ -972,14 +972,25 @@ bool StaticData::LoadGlobalLexicalModelUnlimited()
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < weight.size(); i++ ) {
|
||||
vector< string > factors = Tokenize(modelSpec[i],"-");
|
||||
bool ignorePunctuation = false;
|
||||
vector< string > factors;
|
||||
vector< string > factors_punctuation = Tokenize(modelSpec[i]," ");
|
||||
|
||||
// read optional punctuation specification
|
||||
if (factors_punctuation.size() > 0) {
|
||||
factors = Tokenize(factors_punctuation[0],"-");
|
||||
ignorePunctuation = Scan<int>(factors_punctuation[1]);
|
||||
}
|
||||
else
|
||||
factors = Tokenize(modelSpec[i],"-");
|
||||
|
||||
if ( factors.size() != 2 ) {
|
||||
std::cerr << "wrong factor definition for global lexical model unlimited: " << modelSpec[i] << endl;
|
||||
return false;
|
||||
}
|
||||
const vector<FactorType> inputFactors = Tokenize<FactorType>(factors[0],",");
|
||||
const vector<FactorType> outputFactors = Tokenize<FactorType>(factors[1],",");
|
||||
m_globalLexicalModelsUnlimited.push_back(new GlobalLexicalModelUnlimited(inputFactors, outputFactors));
|
||||
m_globalLexicalModelsUnlimited.push_back(new GlobalLexicalModelUnlimited(inputFactors, outputFactors, ignorePunctuation));
|
||||
m_globalLexicalModelsUnlimited[i]->SetSparseProducerWeight(weight[i]);
|
||||
}
|
||||
return true;
|
||||
|
Loading…
Reference in New Issue
Block a user