mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-10-05 15:58:03 +03:00
daily automatic beautifier
This commit is contained in:
parent
1a795f549e
commit
c3424ce541
@ -142,7 +142,7 @@ void SuffixArray::Create(const string& fileName )
|
||||
}
|
||||
|
||||
// very specific code to deal with common crawl document ids
|
||||
bool SuffixArray::ProcessDocumentLine( const char *line, const size_t sentenceId )
|
||||
bool SuffixArray::ProcessDocumentLine( const char *line, const size_t sentenceId )
|
||||
{
|
||||
size_t i;
|
||||
// first 32 characters are hex-hash
|
||||
@ -158,7 +158,7 @@ bool SuffixArray::ProcessDocumentLine( const char *line, const size_t sentenceId
|
||||
if (line[i] != '.' && (line[i] < '0' || line[i] > '9')) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
i++;
|
||||
|
||||
// last token is url (=name)
|
||||
@ -337,7 +337,7 @@ void SuffixArray::List(INDEX start, INDEX end)
|
||||
}
|
||||
}
|
||||
|
||||
void SuffixArray::PrintSentenceMatches( const std::vector< WORD > &phrase )
|
||||
void SuffixArray::PrintSentenceMatches( const std::vector< WORD > &phrase )
|
||||
{
|
||||
cout << "QUERY\t";
|
||||
for(size_t i=0; i<phrase.size(); i++) {
|
||||
@ -358,7 +358,7 @@ void SuffixArray::PrintSentenceMatches( const std::vector< WORD > &phrase )
|
||||
|
||||
// loop through all matches
|
||||
cout << (lastMatch-firstMatch+1) << " matches" << endl;
|
||||
for(INDEX i=firstMatch; i<=lastMatch;i++) {
|
||||
for(INDEX i=firstMatch; i<=lastMatch; i++) {
|
||||
// get sentence information
|
||||
INDEX pos = GetPosition( i );
|
||||
INDEX start = pos - GetWordInSentence( pos );
|
||||
@ -394,8 +394,7 @@ SuffixArray::INDEX SuffixArray::GetDocument( INDEX sentence ) const
|
||||
}
|
||||
if (sentence < m_document[mid]) {
|
||||
max = mid-1;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
min = mid+1;
|
||||
}
|
||||
}
|
||||
@ -416,13 +415,13 @@ void SuffixArray::Save(const string& fileName ) const
|
||||
fwrite( m_sentenceLength, sizeof(char), m_sentenceCount, pFile); // sentence length
|
||||
|
||||
char useDocument = m_useDocument; // not sure if that is needed
|
||||
fwrite( &useDocument, sizeof(char), 1, pFile );
|
||||
fwrite( &useDocument, sizeof(char), 1, pFile );
|
||||
if (m_useDocument) {
|
||||
fwrite( &m_documentCount, sizeof(INDEX), 1, pFile );
|
||||
fwrite( m_document, sizeof(INDEX), m_documentCount, pFile );
|
||||
fwrite( m_documentName, sizeof(INDEX), m_documentCount, pFile );
|
||||
fwrite( &m_documentNameLength, sizeof(INDEX), 1, pFile );
|
||||
fwrite( m_documentNameBuffer, sizeof(char), m_documentNameLength, pFile );
|
||||
fwrite( &m_documentCount, sizeof(INDEX), 1, pFile );
|
||||
fwrite( m_document, sizeof(INDEX), m_documentCount, pFile );
|
||||
fwrite( m_documentName, sizeof(INDEX), m_documentCount, pFile );
|
||||
fwrite( &m_documentNameLength, sizeof(INDEX), 1, pFile );
|
||||
fwrite( m_documentNameBuffer, sizeof(char), m_documentNameLength, pFile );
|
||||
}
|
||||
fclose( pFile );
|
||||
|
||||
@ -436,8 +435,8 @@ void SuffixArray::Load(const string& fileName )
|
||||
|
||||
cerr << "loading from " << fileName << endl;
|
||||
|
||||
fread( &m_size, sizeof(INDEX), 1, pFile )
|
||||
|| Error("could not read m_size from", fileName);
|
||||
fread( &m_size, sizeof(INDEX), 1, pFile )
|
||||
|| Error("could not read m_size from", fileName);
|
||||
cerr << "words in corpus: " << m_size << endl;
|
||||
|
||||
m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size );
|
||||
@ -449,47 +448,47 @@ void SuffixArray::Load(const string& fileName )
|
||||
CheckAllocation(m_wordInSentence != NULL, "m_wordInSentence");
|
||||
CheckAllocation(m_sentence != NULL, "m_sentence");
|
||||
fread( m_array, sizeof(WORD_ID), m_size, pFile ) // corpus
|
||||
|| Error("could not read m_array from", fileName);
|
||||
|| Error("could not read m_array from", fileName);
|
||||
fread( m_index, sizeof(INDEX), m_size, pFile ) // suffix array
|
||||
|| Error("could not read m_index from", fileName);
|
||||
|| Error("could not read m_index from", fileName);
|
||||
fread( m_wordInSentence, sizeof(char), m_size, pFile) // word index
|
||||
|| Error("could not read m_wordInSentence from", fileName);
|
||||
|| Error("could not read m_wordInSentence from", fileName);
|
||||
fread( m_sentence, sizeof(INDEX), m_size, pFile ) // sentence index
|
||||
|| Error("could not read m_sentence from", fileName);
|
||||
|| Error("could not read m_sentence from", fileName);
|
||||
|
||||
fread( &m_sentenceCount, sizeof(INDEX), 1, pFile )
|
||||
|| Error("could not read m_sentenceCount from", fileName);
|
||||
|| Error("could not read m_sentenceCount from", fileName);
|
||||
cerr << "sentences in corpus: " << m_sentenceCount << endl;
|
||||
|
||||
m_sentenceLength = (char*) calloc( sizeof( char ), m_sentenceCount );
|
||||
CheckAllocation(m_sentenceLength != NULL, "m_sentenceLength");
|
||||
fread( m_sentenceLength, sizeof(char), m_sentenceCount, pFile) // sentence length
|
||||
|| Error("could not read m_sentenceLength from", fileName);
|
||||
|| Error("could not read m_sentenceLength from", fileName);
|
||||
|
||||
if (m_useDocument) { // do not read it when you do not need it
|
||||
char useDocument;
|
||||
fread( &useDocument, sizeof(char), 1, pFile )
|
||||
|| Error("could not read m_useDocument from", fileName);
|
||||
|| Error("could not read m_useDocument from", fileName);
|
||||
if (!useDocument) {
|
||||
cerr << "Error: stored suffix array does not have a document index\n";
|
||||
exit(1);
|
||||
}
|
||||
fread( &m_documentCount, sizeof(INDEX), 1, pFile )
|
||||
|| Error("could not read m_documentCount from", fileName);
|
||||
fread( &m_documentCount, sizeof(INDEX), 1, pFile )
|
||||
|| Error("could not read m_documentCount from", fileName);
|
||||
m_document = (INDEX*) calloc( sizeof( INDEX ), m_documentCount );
|
||||
m_documentName = (INDEX*) calloc( sizeof( INDEX ), m_documentCount );
|
||||
CheckAllocation(m_document != NULL, "m_document");
|
||||
CheckAllocation(m_documentName != NULL, "m_documentName");
|
||||
fread( m_document, sizeof(INDEX), m_documentCount, pFile )
|
||||
|| Error("could not read m_document from", fileName);
|
||||
fread( m_documentName, sizeof(INDEX), m_documentCount, pFile )
|
||||
|| Error("could not read m_documentName from", fileName);
|
||||
fread( m_document, sizeof(INDEX), m_documentCount, pFile )
|
||||
|| Error("could not read m_document from", fileName);
|
||||
fread( m_documentName, sizeof(INDEX), m_documentCount, pFile )
|
||||
|| Error("could not read m_documentName from", fileName);
|
||||
fread( &m_documentNameLength, sizeof(INDEX), 1, pFile )
|
||||
|| Error("could not read m_documentNameLength from", fileName);
|
||||
|| Error("could not read m_documentNameLength from", fileName);
|
||||
m_documentNameBuffer = (char*) calloc( sizeof( char ), m_documentNameLength );
|
||||
CheckAllocation(m_documentNameBuffer != NULL, "m_documentNameBuffer");
|
||||
fread( m_documentNameBuffer, sizeof(char), m_documentNameLength, pFile )
|
||||
|| Error("could not read m_document from", fileName);
|
||||
|| Error("could not read m_document from", fileName);
|
||||
}
|
||||
|
||||
fclose( pFile );
|
||||
@ -497,16 +496,16 @@ void SuffixArray::Load(const string& fileName )
|
||||
m_vcb.Load( fileName + ".src-vcb" );
|
||||
}
|
||||
|
||||
void SuffixArray::CheckAllocation( bool check, const char *dataStructure ) const
|
||||
void SuffixArray::CheckAllocation( bool check, const char *dataStructure ) const
|
||||
{
|
||||
if (check) return;
|
||||
cerr << "Error: could not allocate memory for " << dataStructure << endl;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
bool SuffixArray::Error( const char *message, const string &fileName) const
|
||||
bool SuffixArray::Error( const char *message, const string &fileName) const
|
||||
{
|
||||
cerr << "Error: " << message << " " << fileName << endl;
|
||||
exit(1);
|
||||
return true; // yeah, i know.
|
||||
return true; // yeah, i know.
|
||||
}
|
||||
|
@ -19,7 +19,7 @@ int main(int argc, char* argv[])
|
||||
bool createFlag = false;
|
||||
bool queryFlag = false;
|
||||
bool querySentenceFlag = false;
|
||||
|
||||
|
||||
int stdioFlag = false; // receive requests from STDIN, respond to STDOUT
|
||||
string info = "usage: biconcor\n\t[--load model-file]\n\t[--save model-file]\n\t[--create corpus]\n\t[--query string]\n\t[--stdio]\n";
|
||||
while(1) {
|
||||
@ -113,16 +113,13 @@ int main(int argc, char* argv[])
|
||||
if (querySentenceFlag) {
|
||||
vector< string > queryString = util::tokenize( query.c_str() );
|
||||
suffixArray.PrintSentenceMatches( queryString );
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
cout << lookup( query ) << endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (queryFlag) {
|
||||
} else if (queryFlag) {
|
||||
cout << lookup( query ) << endl;
|
||||
}
|
||||
else if (querySentenceFlag) {
|
||||
} else if (querySentenceFlag) {
|
||||
vector< string > queryString = util::tokenize( query.c_str() );
|
||||
suffixArray.PrintSentenceMatches( queryString );
|
||||
}
|
||||
|
@ -4,109 +4,106 @@ namespace MosesTuning
|
||||
{
|
||||
|
||||
InternalTree::InternalTree(const std::string & line, const bool terminal):
|
||||
m_isTerminal(terminal)
|
||||
{
|
||||
m_isTerminal(terminal)
|
||||
{
|
||||
|
||||
size_t found = line.find_first_of("[] ");
|
||||
size_t found = line.find_first_of("[] ");
|
||||
|
||||
if (found == line.npos) {
|
||||
m_value = line;
|
||||
}
|
||||
if (found == line.npos) {
|
||||
m_value = line;
|
||||
}
|
||||
|
||||
else {
|
||||
AddSubTree(line, 0);
|
||||
}
|
||||
else {
|
||||
AddSubTree(line, 0);
|
||||
}
|
||||
}
|
||||
|
||||
size_t InternalTree::AddSubTree(const std::string & line, size_t pos) {
|
||||
size_t InternalTree::AddSubTree(const std::string & line, size_t pos)
|
||||
{
|
||||
|
||||
std::string value;
|
||||
char token = 0;
|
||||
std::string value;
|
||||
char token = 0;
|
||||
|
||||
while (token != ']' && pos != std::string::npos)
|
||||
{
|
||||
size_t oldpos = pos;
|
||||
pos = line.find_first_of("[] ", pos);
|
||||
if (pos == std::string::npos) break;
|
||||
token = line[pos];
|
||||
value = line.substr(oldpos,pos-oldpos);
|
||||
while (token != ']' && pos != std::string::npos) {
|
||||
size_t oldpos = pos;
|
||||
pos = line.find_first_of("[] ", pos);
|
||||
if (pos == std::string::npos) break;
|
||||
token = line[pos];
|
||||
value = line.substr(oldpos,pos-oldpos);
|
||||
|
||||
if (token == '[') {
|
||||
if (m_value.size() > 0) {
|
||||
m_children.push_back(boost::make_shared<InternalTree>(value,false));
|
||||
pos = m_children.back()->AddSubTree(line, pos+1);
|
||||
}
|
||||
else {
|
||||
if (value.size() > 0) {
|
||||
m_value = value;
|
||||
}
|
||||
pos = AddSubTree(line, pos+1);
|
||||
}
|
||||
}
|
||||
else if (token == ' ' || token == ']') {
|
||||
if (value.size() > 0 && !(m_value.size() > 0)) {
|
||||
m_value = value;
|
||||
}
|
||||
else if (value.size() > 0) {
|
||||
m_isTerminal = false;
|
||||
m_children.push_back(boost::make_shared<InternalTree>(value,true));
|
||||
}
|
||||
if (token == ' ') {
|
||||
pos++;
|
||||
}
|
||||
}
|
||||
|
||||
if (m_children.size() > 0) {
|
||||
m_isTerminal = false;
|
||||
if (token == '[') {
|
||||
if (m_value.size() > 0) {
|
||||
m_children.push_back(boost::make_shared<InternalTree>(value,false));
|
||||
pos = m_children.back()->AddSubTree(line, pos+1);
|
||||
} else {
|
||||
if (value.size() > 0) {
|
||||
m_value = value;
|
||||
}
|
||||
pos = AddSubTree(line, pos+1);
|
||||
}
|
||||
} else if (token == ' ' || token == ']') {
|
||||
if (value.size() > 0 && !(m_value.size() > 0)) {
|
||||
m_value = value;
|
||||
} else if (value.size() > 0) {
|
||||
m_isTerminal = false;
|
||||
m_children.push_back(boost::make_shared<InternalTree>(value,true));
|
||||
}
|
||||
if (token == ' ') {
|
||||
pos++;
|
||||
}
|
||||
}
|
||||
|
||||
if (pos == std::string::npos) {
|
||||
return line.size();
|
||||
if (m_children.size() > 0) {
|
||||
m_isTerminal = false;
|
||||
}
|
||||
return std::min(line.size(),pos+1);
|
||||
}
|
||||
|
||||
if (pos == std::string::npos) {
|
||||
return line.size();
|
||||
}
|
||||
return std::min(line.size(),pos+1);
|
||||
|
||||
}
|
||||
|
||||
std::string InternalTree::GetString(bool start) const {
|
||||
std::string InternalTree::GetString(bool start) const
|
||||
{
|
||||
|
||||
std::string ret = "";
|
||||
if (!start) {
|
||||
ret += " ";
|
||||
}
|
||||
std::string ret = "";
|
||||
if (!start) {
|
||||
ret += " ";
|
||||
}
|
||||
|
||||
if (!m_isTerminal) {
|
||||
ret += "[";
|
||||
}
|
||||
if (!m_isTerminal) {
|
||||
ret += "[";
|
||||
}
|
||||
|
||||
ret += m_value;
|
||||
for (std::vector<TreePointer>::const_iterator it = m_children.begin(); it != m_children.end(); ++it)
|
||||
{
|
||||
ret += (*it)->GetString(false);
|
||||
}
|
||||
ret += m_value;
|
||||
for (std::vector<TreePointer>::const_iterator it = m_children.begin(); it != m_children.end(); ++it) {
|
||||
ret += (*it)->GetString(false);
|
||||
}
|
||||
|
||||
if (!m_isTerminal) {
|
||||
ret += "]";
|
||||
}
|
||||
return ret;
|
||||
if (!m_isTerminal) {
|
||||
ret += "]";
|
||||
}
|
||||
return ret;
|
||||
|
||||
}
|
||||
|
||||
|
||||
void InternalTree::Combine(const std::vector<TreePointer> &previous) {
|
||||
void InternalTree::Combine(const std::vector<TreePointer> &previous)
|
||||
{
|
||||
|
||||
std::vector<TreePointer>::iterator it;
|
||||
bool found = false;
|
||||
leafNT next_leafNT(this);
|
||||
for (std::vector<TreePointer>::const_iterator it_prev = previous.begin(); it_prev != previous.end(); ++it_prev) {
|
||||
found = next_leafNT(it);
|
||||
if (found) {
|
||||
*it = *it_prev;
|
||||
}
|
||||
else {
|
||||
std::cerr << "Warning: leaf nonterminal not found in rule; why did this happen?\n";
|
||||
}
|
||||
std::vector<TreePointer>::iterator it;
|
||||
bool found = false;
|
||||
leafNT next_leafNT(this);
|
||||
for (std::vector<TreePointer>::const_iterator it_prev = previous.begin(); it_prev != previous.end(); ++it_prev) {
|
||||
found = next_leafNT(it);
|
||||
if (found) {
|
||||
*it = *it_prev;
|
||||
} else {
|
||||
std::cerr << "Warning: leaf nonterminal not found in rule; why did this happen?\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -18,60 +18,60 @@ typedef int NTLabel;
|
||||
|
||||
class InternalTree
|
||||
{
|
||||
std::string m_value;
|
||||
std::vector<TreePointer> m_children;
|
||||
bool m_isTerminal;
|
||||
std::string m_value;
|
||||
std::vector<TreePointer> m_children;
|
||||
bool m_isTerminal;
|
||||
public:
|
||||
InternalTree(const std::string & line, const bool terminal = false);
|
||||
InternalTree(const InternalTree & tree):
|
||||
m_value(tree.m_value),
|
||||
m_isTerminal(tree.m_isTerminal) {
|
||||
const std::vector<TreePointer> & children = tree.m_children;
|
||||
for (std::vector<TreePointer>::const_iterator it = children.begin(); it != children.end(); it++) {
|
||||
m_children.push_back(boost::make_shared<InternalTree>(**it));
|
||||
}
|
||||
}
|
||||
size_t AddSubTree(const std::string & line, size_t start);
|
||||
InternalTree(const std::string & line, const bool terminal = false);
|
||||
InternalTree(const InternalTree & tree):
|
||||
m_value(tree.m_value),
|
||||
m_isTerminal(tree.m_isTerminal) {
|
||||
const std::vector<TreePointer> & children = tree.m_children;
|
||||
for (std::vector<TreePointer>::const_iterator it = children.begin(); it != children.end(); it++) {
|
||||
m_children.push_back(boost::make_shared<InternalTree>(**it));
|
||||
}
|
||||
}
|
||||
size_t AddSubTree(const std::string & line, size_t start);
|
||||
|
||||
std::string GetString(bool start = true) const;
|
||||
void Combine(const std::vector<TreePointer> &previous);
|
||||
const std::string & GetLabel() const {
|
||||
return m_value;
|
||||
}
|
||||
std::string GetString(bool start = true) const;
|
||||
void Combine(const std::vector<TreePointer> &previous);
|
||||
const std::string & GetLabel() const {
|
||||
return m_value;
|
||||
}
|
||||
|
||||
size_t GetLength() const {
|
||||
return m_children.size();
|
||||
}
|
||||
std::vector<TreePointer> & GetChildren() {
|
||||
return m_children;
|
||||
}
|
||||
size_t GetLength() const {
|
||||
return m_children.size();
|
||||
}
|
||||
std::vector<TreePointer> & GetChildren() {
|
||||
return m_children;
|
||||
}
|
||||
|
||||
bool IsTerminal() const {
|
||||
return m_isTerminal;
|
||||
}
|
||||
bool IsTerminal() const {
|
||||
return m_isTerminal;
|
||||
}
|
||||
|
||||
bool IsLeafNT() const {
|
||||
return (!m_isTerminal && m_children.size() == 0);
|
||||
}
|
||||
bool IsLeafNT() const {
|
||||
return (!m_isTerminal && m_children.size() == 0);
|
||||
}
|
||||
};
|
||||
|
||||
// Python-like generator that yields next nonterminal leaf on every call
|
||||
$generator(leafNT) {
|
||||
std::vector<TreePointer>::iterator it;
|
||||
InternalTree* tree;
|
||||
leafNT(InternalTree* root = 0): tree(root) {}
|
||||
$emit(std::vector<TreePointer>::iterator)
|
||||
for (it = tree->GetChildren().begin(); it !=tree->GetChildren().end(); ++it) {
|
||||
if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
|
||||
$yield(it);
|
||||
}
|
||||
else if ((*it)->GetLength() > 0) {
|
||||
if ((*it).get()) { // normal pointer to same object that TreePointer points to
|
||||
$restart(tree = (*it).get());
|
||||
}
|
||||
}
|
||||
$generator(leafNT)
|
||||
{
|
||||
std::vector<TreePointer>::iterator it;
|
||||
InternalTree* tree;
|
||||
leafNT(InternalTree* root = 0): tree(root) {}
|
||||
$emit(std::vector<TreePointer>::iterator)
|
||||
for (it = tree->GetChildren().begin(); it !=tree->GetChildren().end(); ++it) {
|
||||
if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) {
|
||||
$yield(it);
|
||||
} else if ((*it)->GetLength() > 0) {
|
||||
if ((*it).get()) { // normal pointer to same object that TreePointer points to
|
||||
$restart(tree = (*it).get());
|
||||
}
|
||||
}
|
||||
$stop;
|
||||
}
|
||||
$stop;
|
||||
};
|
||||
|
||||
}
|
Loading…
Reference in New Issue
Block a user