mirror of https://github.com/moses-smt/mosesdecoder.git
synced 2024-11-13 00:59:02 +03:00
remove zmert and cmert
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3397 1f5c12ca-751b-0410-a591-d2e778427230
parent 8616a2bdee
commit 321f528ff5
@@ -26,7 +26,7 @@ RELEASEDIR=$(TARGETDIR)/scripts-$(TS)
 
 all: compile
 
-SUBDIRS=cmert-0.5 phrase-extract symal mbr lexical-reordering
+SUBDIRS=phrase-extract symal mbr lexical-reordering
 SUBDIRS_CLEAN=$(SUBDIRS) memscore
 
 compile: compile-memscore
@@ -63,23 +63,6 @@ training/absolutize_moses_model.pl
 training/build-generation-table.perl
 training/clean-corpus-n.perl
 training/clone_moses_model.pl
-training/cmert-0.5/bleu.py
-training/cmert-0.5/dataset.py
-training/cmert-0.5/log.py
-training/cmert-0.5/mert
-training/cmert-0.5/enhanced-mert
-training/cmert-0.5/reduce-field.pl
-training/cmert-0.5/extend-field.pl
-training/cmert-0.5/python/psyco/classes.py
-training/cmert-0.5/python/psyco/core.py
-training/cmert-0.5/python/psyco/__init__.py
-training/cmert-0.5/python/psyco/kdictproxy.py
-training/cmert-0.5/python/psyco/logger.py
-training/cmert-0.5/python/psyco/profiler.py
-training/cmert-0.5/python/psyco/_psyco.so
-training/cmert-0.5/python/psyco/support.py
-training/cmert-0.5/README
-training/cmert-0.5/score-nbest.py
 training/mbr/mbr
 training/filter-model-given-input.pl
 training/filter-rule-table.py
@@ -87,7 +70,6 @@ training/lexical-reordering/score
 training/memscore/memscore
-training/zmert-moses.pl
 training/mert-moses.pl
 training/mert-moses-new.pl
 training/phrase-extract/extract
 training/phrase-extract/extract-rules
 training/phrase-extract/score
@@ -12,21 +12,6 @@ training/absolutize_moses_model.pl
 training/build-generation-table.perl
 training/clean-corpus-n.perl
 training/clone_moses_model.pl
-training/cmert-0.5/bleu.py
-training/cmert-0.5/dataset.py
-training/cmert-0.5/log.py
-training/cmert-0.5/mert.exe
-training/cmert-0.5/enhanced-mert
-training/cmert-0.5/python/psyco/classes.py
-training/cmert-0.5/python/psyco/core.py
-training/cmert-0.5/python/psyco/__init__.py
-training/cmert-0.5/python/psyco/kdictproxy.py
-training/cmert-0.5/python/psyco/logger.py
-training/cmert-0.5/python/psyco/profiler.py
-training/cmert-0.5/python/psyco/_psyco.so
-training/cmert-0.5/python/psyco/support.py
-training/cmert-0.5/README
-training/cmert-0.5/score-nbest.py
 training/combine_factors.pl
 training/filter-model-given-input.pl
 training/lexical-reordering/score.exe
@@ -1,15 +0,0 @@
OBJS=mert.o data.o point.o score.o
CFLAGS=-O3
LDFLAGS=
LDLIBS=-lm

all: mert

clean:
	rm -f *.o

mert: $(OBJS)
	$(CXX) $(OBJS) $(LDLIBS) -o $@

mert_p: $(OBJS)
	$(CXX) $(LDFLAGS) $(OBJS) $(LDLIBS) -o $@
@@ -1,10 +0,0 @@
CMERT 0.5
5 Nov 2005
Copyright (c) 2005 David Chiang. All rights reserved (for now).

Minimalist installation instructions:

- make
- set #! lines and sys.path lines in Python scripts
- see run-cmert for example
@@ -1,179 +0,0 @@
#!/usr/bin/python

# $Id$

'''Provides:

cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test().
cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked().
score_cooked(alltest, n=4): Score a list of cooked test sentences.

score_set(s, testid, refids, n=4): Interface with dataset.py; calculate BLEU score of testid against refids.

The reason for breaking the BLEU computation into three phases cook_refs(), cook_test(), and score_cooked() is to allow the caller to calculate BLEU scores for multiple test sets as efficiently as possible.
'''

import optparse
import sys, math, re, xml.sax.saxutils
sys.path.append('/fs/clip-mteval/Programs/hiero')
import dataset
import log

# Added to bypass NIST-style pre-processing of hyp and ref files -- wade
nonorm = 0

preserve_case = False
eff_ref_len = "shortest"

normalize1 = [
    ('<skipped>', ''),          # strip "skipped" tags
    (r'-\n', ''),               # strip end-of-line hyphenation and join lines
    (r'\n', ' '),               # join lines
#    (r'(\d)\s+(?=\d)', r'\1'), # join digits
]
normalize1 = [(re.compile(pattern), replace) for (pattern, replace) in normalize1]

normalize2 = [
    (r'([\{-\~\[-\` -\&\(-\+\:-\@\/])',r' \1 '), # tokenize punctuation. apostrophe is missing
    (r'([^0-9])([\.,])',r'\1 \2 '),              # tokenize period and comma unless preceded by a digit
    (r'([\.,])([^0-9])',r' \1 \2'),              # tokenize period and comma unless followed by a digit
    (r'([0-9])(-)',r'\1 \2 ')                    # tokenize dash when preceded by a digit
]
normalize2 = [(re.compile(pattern), replace) for (pattern, replace) in normalize2]

def normalize(s):
    '''Normalize and tokenize text. This is lifted from NIST mteval-v11a.pl.'''
    # Added to bypass NIST-style pre-processing of hyp and ref files -- wade
    if (nonorm):
        return s.split()
    if type(s) is not str:
        s = " ".join(s)
    # language-independent part:
    for (pattern, replace) in normalize1:
        s = re.sub(pattern, replace, s)
    s = xml.sax.saxutils.unescape(s, {'&quot;':'"'})
    # language-dependent part (assuming Western languages):
    s = " %s " % s
    if not preserve_case:
        s = s.lower() # this might not be identical to the original
    for (pattern, replace) in normalize2:
        s = re.sub(pattern, replace, s)
    return s.split()

def count_ngrams(words, n=4):
    counts = {}
    for k in xrange(1,n+1):
        for i in xrange(len(words)-k+1):
            ngram = tuple(words[i:i+k])
            counts[ngram] = counts.get(ngram, 0)+1
    return counts

def cook_refs(refs, n=4):
    '''Takes a list of reference sentences for a single segment
    and returns an object that encapsulates everything that BLEU
    needs to know about them.'''

    refs = [normalize(ref) for ref in refs]
    maxcounts = {}
    for ref in refs:
        counts = count_ngrams(ref, n)
        for (ngram,count) in counts.iteritems():
            maxcounts[ngram] = max(maxcounts.get(ngram,0), count)
    return ([len(ref) for ref in refs], maxcounts)

def cook_test(test, (reflens, refmaxcounts), n=4):
    '''Takes a test sentence and returns an object that
    encapsulates everything that BLEU needs to know about it.'''

    test = normalize(test)
    result = {}
    result["testlen"] = len(test)

    # Calculate effective reference sentence length.

    if eff_ref_len == "shortest":
        result["reflen"] = min(reflens)
    elif eff_ref_len == "average":
        result["reflen"] = float(sum(reflens))/len(reflens)
    elif eff_ref_len == "closest":
        min_diff = None
        for reflen in reflens:
            if min_diff is None or abs(reflen-len(test)) < min_diff:
                min_diff = abs(reflen-len(test))
                result['reflen'] = reflen

    result["guess"] = [max(len(test)-k+1,0) for k in xrange(1,n+1)]

    result['correct'] = [0]*n
    counts = count_ngrams(test, n)
    for (ngram, count) in counts.iteritems():
        result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram,0), count)

    return result

def score_cooked(allcomps, n=4):
    totalcomps = {'testlen':0, 'reflen':0, 'guess':[0]*n, 'correct':[0]*n}
    for comps in allcomps:
        for key in ['testlen','reflen']:
            totalcomps[key] += comps[key]
        for key in ['guess','correct']:
            for k in xrange(n):
                totalcomps[key][k] += comps[key][k]
    logbleu = 0.0
    for k in xrange(n):
        if totalcomps['correct'][k] == 0:
            return 0.0
        log.write("%d-grams: %f\n" % (k,float(totalcomps['correct'][k])/totalcomps['guess'][k]))
        logbleu += math.log(totalcomps['correct'][k])-math.log(totalcomps['guess'][k])
    logbleu /= float(n)
    log.write("Effective reference length: %d test length: %d\n" % (totalcomps['reflen'], totalcomps['testlen']))
    logbleu += min(0,1-float(totalcomps['reflen'])/totalcomps['testlen'])
    return math.exp(logbleu)

def score_set(set, testid, refids, n=4):
    alltest = []
    for seg in set.segs():
        try:
            test = seg.versions[testid].words
        except KeyError:
            log.write("Warning: missing test sentence\n")
            continue
        try:
            refs = [seg.versions[refid].words for refid in refids]
        except KeyError:
            log.write("Warning: missing reference sentence, %s\n" % seg.id)
        refs = cook_refs(refs, n)
        alltest.append(cook_test(test, refs, n))
    log.write("%d sentences\n" % len(alltest))
    return score_cooked(alltest, n)

if __name__ == "__main__":
    import psyco
    psyco.full()

    import getopt
    raw_test = False
    (opts,args) = getopt.getopt(sys.argv[1:], "rc", [])
    for (opt,parm) in opts:
        if opt == "-r":
            raw_test = True
        elif opt == "-c":
            preserve_case = True

    s = dataset.Dataset()
    if args[0] == '-':
        infile = sys.stdin
    else:
        infile = args[0]
    if raw_test:
        (root, testids) = s.read_raw(infile, docid='whatever', sysid='testsys')
    else:
        (root, testids) = s.read(infile)
    print "Test systems: %s" % ", ".join(testids)
    (root, refids) = s.read(args[1])
    print "Reference systems: %s" % ", ".join(refids)

    for testid in testids:
        print "BLEU score: ", score_set(s, testid, refids)
@@ -1,93 +0,0 @@
// $Id$
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "data.h"
#include "point.h"

extern int comps_n;

data_t *read_data(void) {
  FILE *fp;
  static char buf[1000];
  char *tok, *s;
  int field;
  int sent_i, cand_i, cands_n;
  int total_cands_n;
  data_t *data;
  candidate_t *cands;

  data = malloc(sizeof(data_t));

  data->sents_max = 100;
  data->sents_n = 0;
  data->cands_n = malloc(data->sents_max*sizeof(int));

  total_cands_n = 0;

  fp = fopen("cands.opt", "r");
  while (fgets(buf, sizeof(buf), fp) != NULL) {
    // should we check to make sure every sentence is accounted for?
    sscanf(buf, "%d %d", &sent_i, &cands_n);
    if (sent_i >= data->sents_n)
      data->sents_n = sent_i+1;
    if (sent_i >= data->sents_max) {
      data->sents_max = (sent_i+1)*2;
      data->cands_n = realloc(data->cands_n, data->sents_max*sizeof(int));
    }
    data->cands_n[sent_i] = cands_n;
    total_cands_n += cands_n;
  }
  fclose(fp);

  /* create master array for candidates and then set data->sents
     to point into it */
  cands = malloc(total_cands_n * sizeof(candidate_t));
  data->sents = malloc(data->sents_n * sizeof(candidate_t *));
  total_cands_n = 0;
  for (sent_i=0; sent_i<data->sents_n; sent_i++) {
    data->sents[sent_i] = cands+total_cands_n;
    total_cands_n += data->cands_n[sent_i];
  }


  cand_i = 0;
  fp = fopen("feats.opt", "r");
  while (fgets(buf, sizeof(buf), fp) != NULL) {
    cands[cand_i].features = malloc(dim*sizeof(float));
    cands[cand_i].comps = malloc(comps_n*sizeof(int));

    field = 0;
    s = buf;
    while ((tok = strsep(&s, " \t\n")) != NULL) {
      if (!*tok) // empty token
        continue;
      // read dim floats and then comps_n ints
      if (field < dim)
        cands[cand_i].features[field] = -strtod(tok, NULL); // Venugopal format uses costs
      else if (field < dim+comps_n)
        cands[cand_i].comps[field-dim] = strtol(tok, NULL, 10);
      else {
        fprintf(stderr, "read_data(): too many fields in line in feats.opt\n");
        return NULL;
      }
      field++;
    }
    if (field != dim+comps_n) {
      fprintf(stderr, "read_data(): wrong number of fields in line in feats.opt - expected %d + %d and found %d on line %d\n",dim,comps_n,field,cand_i);
      return NULL;
    }
    cand_i++;
  }

  if (cand_i != total_cands_n) {
    fprintf(stderr, "read_data(): wrong number of lines in cands.opt\n");
    return NULL;
  }

  fclose(fp);

  return data;
}
@@ -1,18 +0,0 @@
// $Id$
#ifndef DATA_H
#define DATA_H

typedef struct {
  float *features;
  int *comps;
  float m, b; // slope and intercept, used as scratch space
} candidate_t;

typedef struct {
  candidate_t **sents;
  int sents_n, sents_max, *cands_n;
} data_t;

data_t *read_data(void);

#endif
@@ -1,392 +0,0 @@
#!/usr/bin/python2.3

# $Id$
'''Decoder interface:

Dataset.process() expects a function, which in turn takes a Sentence as input
and produces a Sentence or list of Sentences as output.

The input Sentence will be marked with the <seg> tag it was found in
the input file with.

The output Sentences should be marked with <seg> tags if they are to
be marked as such in the output file.
'''

import sys, sgmllib, xml.sax.saxutils, log

def attrs_to_str(d):
    if len(d) == 0:
        return ""
    l = [""]+["%s=%s" % (name, xml.sax.saxutils.quoteattr(value)) for (name, value) in d]
    return " ".join(l)

def attrs_to_dict(a):
    d = {}
    for (name, value) in a:
        if d.has_key(name.lower()):
            raise ValueError, "duplicate attribute names"
        d[name.lower()] = value
    return d

def strip_newlines(s):
    return " ".join(s.split())

class Sentence(object):
    def __init__(self, words=None, meta=None):
        if words is not None:
            self.words = list(words)
        else:
            self.words = []
        if meta is not None:
            self.meta = meta
        else:
            self.meta = []

    def mark(self, tag, attrs):
        self.meta.append((tag, attrs, 0, len(self.words)))

    def getmark(self):
        if len(self.meta) > 0:
            (tag, attrs, i, j) = self.meta[-1]
            if i == 0 and j == len(self.words):
                return (tag, attrs)
            else:
                return None
        else:
            return None

    def unmark(self):
        mark = self.getmark()
        if mark is not None:
            self.meta = self.meta[:-1]
        return mark

    def __cmp__(self, other):
        return cmp((self.words, self.meta), (other.words, other.meta))

    def __str__(self):
        def cmp_spans((tag1,attr1,i1,j1),(tag2,attr2,i2,j2)):
            if i1==i2<=j1==j2:
                return 0
            elif i2<=i1<=j1<=j2:
                return -1
            elif i1<=i2<=j2<=j1:
                return 1
            else:
                return cmp((i1,j1),(i2,j2)) # don't care
        # this guarantees that equal spans will come out nested
        # we want the later spans to be outer
        # this relies on stable sort
        open = [[] for i in xrange(len(self.words)+1)]
        # there seems to be a bug still with empty spans
        empty = [[] for i in xrange(len(self.words)+1)]
        close = [[] for j in xrange(len(self.words)+1)]
        for (tag,attrs,i,j) in sorted(self.meta, cmp=cmp_spans):
            if i == j:
                # do we want these to nest?
                empty[i].append("<%s%s/>" % (tag, attrs_to_str(attrs)))
            open[i].append("<%s%s>" % (tag, attrs_to_str(attrs)))
            close[j].append("</%s>" % tag)

        result = []
        if len(empty[0]) > 0:
            result.extend(empty[0])
        for i in xrange(len(self.words)):
            if i > 0:
                result.append(" ")
            result.extend(reversed(open[i]))
            result.append(self.words[i])
            result.extend(close[i+1])
            if len(empty[i+1]) > 0:
                result.extend(empty[i+1])

        return "".join(result)

    def __add__(self, other):
        if type(other) in (list, tuple):
            return Sentence(self.words + list(other), self.meta)
        else:
            othermeta = [(tag, attrs, i+len(self.words), j+len(self.words)) for (tag, attrs, i, j) in other.meta]
            return Sentence(self.words + other.words, self.meta+othermeta)

def read_raw(f):
    """Read a raw file into a list of Sentences."""
    if type(f) is str:
        f = file(f, "r")
    inputs = []
    i = 0
    for line in f:
        sent = process_sgml_line(line, i)
        sent.mark('seg', [('id',str(i))])
        inputs.append(sent)
        i += 1
    return inputs

class Dataset(object):
    def __init__(self, id=None):
        self.id = id
        self.docs = {}
        self.sysids = []
        self.langs = {}

    def read(self, f):
        '''Read a file into the dataset. Returns (root, sysids)'''
        if type(f) is str:
            f = file(f, "r")
        p = DatasetParser(self)
        p.feed(f.read())
        p.close()
        return (p.root,p.sysids)

    def read_raw(self, f, docid, setid=None, sysid=None, lang=None):
        """Read a raw file into the dataset."""
        if setid is not None:
            if self.id is not None and self.id != setid:
                raise ValueError, "Set ID does not match"
            else:
                self.id = setid
        if sysid not in self.sysids:
            self.sysids.append(sysid)
        self.langs[sysid] = lang
        if type(f) is str:
            f = file(f, "r")
        doc = self.docs.setdefault(docid, Document(docid))
        i = 0
        for line in f:
            if len(doc.segs)-1 < i:
                doc.segs.append(Segment(i))
            if doc.segs[i].versions.has_key(sysid):
                raise ValueError, "multiple versions from same system"
            doc.segs[i].versions[sysid] = process_sgml_line(line, i)
            doc.segs[i].versions[sysid].mark('seg', [('id',str(i))])
            i += 1
        return (None, [sysid])

    def write(self, f, tag, sysids=None):
        if type(f) is str:
            f = file(f, "w")
        f.write(self.string(tag, sysids))

    def write_raw(self, f, sysid=None):
        if type(f) is str:
            f = file(f, "w")
        for seg in self.segs():
            f.write(" ".join(seg.versions[sysid].words))
            f.write("\n")

    def string(self, tag, sysids=None):
        if sysids is None:
            sysids = self.sysids
        elif type(sysids) is str:
            sysids = [sysids]
        attrs = [('setid', self.id)]
        if self.langs.has_key(None):
            attrs.append(('srclang', self.langs[None]))
        trglangs = [self.langs[sysid] for sysid in sysids if sysid is not None]
        for lang in trglangs[1:]:
            if lang != trglangs[0]:
                raise ValueError, "Inconsistent target language"
        if len(trglangs) >= 1:
            attrs.append(('trglang', trglangs[0]))

        return "<%s%s>\n%s</%s>\n" % (tag,
                                      attrs_to_str(attrs),
                                      "".join([doc.string(sysid) for doc in self.docs.values() for sysid in sysids]),
                                      tag)

    def process(self, processor, sysid, lang, srcsysid=None):
        if sysid in self.sysids:
            raise ValueError, "sysid already in use"
        else:
            self.sysids.append(sysid)
        self.langs[sysid] = lang
        for seg in self.segs():
            if log.level >= 2:
                sys.stderr.write("Input: %s\n" % str(seg.versions[srcsysid]))
            seg.versions[sysid] = processor(seg.versions[srcsysid])
            if log.level >= 2:
                if type(seg.versions[sysid]) is not list:
                    sys.stderr.write("Output: %s\n" % str(seg.versions[sysid]))
                else:
                    sys.stderr.write("Output (1st): %s\n" % str(seg.versions[sysid][0]))

    def segs(self):
        for doc in self.docs.values():
            for seg in doc.segs:
                yield seg

class Document(object):
    def __init__(self, id):
        self.id = id
        self.segs = []

    def string(self, sysid):
        attrs = [('docid', self.id)]
        if sysid is not None:
            attrs.append(('sysid', sysid))
        return "<doc%s>\n%s</doc>\n" % (attrs_to_str(attrs),
                                        "".join([seg.string(sysid) for seg in self.segs]))

class Segment(object):
    def __init__(self, id=None):
        self.id = id
        self.versions = {}

    def string(self, sysid):
        v = self.versions[sysid]
        if type(v) is not list:
            v = [v]
        output = []
        for i in xrange(len(v)):
            output.append(str(v[i]))
            output.append('\n')
        return "".join(output)

def process_sgml_line(line, id=None):
    p = DatasetParser(None)
    p.pos = 0
    p.words = []
    p.meta = []
    p.feed(line)
    p.close()
    sent = Sentence(p.words, p.meta)
    return sent

class DatasetParser(sgmllib.SGMLParser):
    def __init__(self, set):
        sgmllib.SGMLParser.__init__(self)
        self.words = None
        self.sysids = []
        self.set = set
        self.mystack = []

    def handle_starttag(self, tag, method, attrs):
        thing = method(attrs)
        self.mystack.append(thing)

    def handle_endtag(self, tag, method):
        thing = self.mystack.pop()
        method(thing)

    def unknown_starttag(self, tag, attrs):
        thing = self.start(tag, attrs)
        self.mystack.append(thing)

    def unknown_endtag(self, tag):
        thing = self.mystack.pop()
        self.end(tag, thing)

    def start_srcset(self, attrs):
        attrs = attrs_to_dict(attrs)
        if self.set.id is None:
            self.set.id = attrs['setid']
        if 0 and self.set.id != attrs['setid']:
            raise ValueError, "Set ID does not match"
        self.lang = attrs['srclang']
        self.root = 'srcset'
        return None

    def start_refset(self, attrs):
        attrs = attrs_to_dict(attrs)
        if self.set.id is None:
            self.set.id = attrs['setid']
        if 0 and self.set.id != attrs['setid']:
            raise ValueError, "Set ID does not match"
        if self.set.langs.setdefault(None, attrs['srclang']) != attrs['srclang']:
            raise ValueError, "Source language does not match"
        self.lang = attrs['trglang']
        self.root = 'refset'
        return None

    def start_tstset(self, attrs):
        attrs = attrs_to_dict(attrs)
        if self.set.id is None:
            self.set.id = attrs['setid']
        if 0 and self.set.id != attrs['setid']:
            raise ValueError, "Set ID does not match"
        if 0 and self.set.langs.setdefault(None, attrs['srclang']) != attrs['srclang']:
            raise ValueError, "Source language does not match"
        self.lang = attrs['trglang']
        self.root = 'tstset'
        return None

    def end_srcset(self, thing):
        for sysid in self.sysids:
            if sysid not in self.set.sysids:
                self.set.sysids.append(sysid)
            self.set.langs[sysid] = self.lang
    end_refset = end_tstset = end_srcset

    def start_doc(self, attrs):
        attrs = attrs_to_dict(attrs)
        self.doc = self.set.docs.setdefault(attrs['docid'], Document(attrs['docid']))
        self.seg_i = 0
        if self.root == 'srcset':
            self.sysid = None
        else:
            self.sysid = attrs['sysid']
        if self.sysid not in self.sysids:
            self.sysids.append(self.sysid)
        return None

    def end_doc(self, thing):
        pass

    def start_seg(self, attrs):
        thing = ('seg', attrs, 0, None)
        attrs = attrs_to_dict(attrs)
        if len(self.doc.segs)-1 < self.seg_i:
            self.doc.segs.append(Segment(attrs.get('id', None)))
        self.seg = self.doc.segs[self.seg_i]
        if 0 and self.seg.id is not None and attrs.has_key('id') and self.seg.id != attrs['id']:
            raise ValueError, "segment ids do not match (%s != %s)" % (str(self.seg.id), str(attrs.get('id', None)))
        if self.seg.versions.has_key(self.sysid):
            raise ValueError, "multiple versions from same system"
        self.pos = 0
        self.words = []
        self.meta = []
        return thing

    def end_seg(self, thing):
        (tag, attrs, i, j) = thing
        self.meta.append((tag, attrs, i, self.pos))
        self.seg_i += 1
        self.seg.versions[self.sysid] = Sentence(self.words, self.meta)
        self.words = None

    """# Special case for start and end of sentence
    def start_s(self, attrs):
        if self.words is not None:
            self.pos += 1
            self.words.append('<s>')
        return None

    def end_s(self, thing):
        if self.words is not None:
            self.pos += 1
            self.words.append('</s>')"""

    def start(self, tag, attrs):
        if self.words is not None:
            return (tag, attrs, self.pos, None)
        else:
            return None

    def end(self, tag, thing):
        if self.words is not None:
            (tag, attrs, i, j) = thing
            self.meta.append((tag, attrs, i, self.pos))

    def handle_data(self, s):
        if self.words is not None:
            words = s.split()
            self.pos += len(words)
            self.words.extend(words)

if __name__ == "__main__":
    s = Dataset()

    for filename in sys.argv[1:]:
        s.read_raw(filename, 'whatever', 'whatever', filename, 'English')
    s.write(sys.stdout, 'tstset')
@@ -1,119 +0,0 @@
#! /bin/bash

PrintUsageAndDie(){
    echo "USAGE: enhanced-cmert.sh -d size [-active] [-help]"
    echo "  perform cmert on a subset of the feature scores"
    echo "  the ratios among not activated weights are not modified"
    echo "  Parameters (*=optional):"
    echo "  -d: the number of original features"
    echo "  -rootdir: the scripts root dir"
    echo "  -activate (*): comma-separated (or blank-separated) list of the indexes of active features"
    echo "                 if not set, all features are optimized"
    echo "  -debug (*): debug information"
    echo "  -help (*): print this help"
    echo
    echo "Example: see examples in the directory example which are created with the script readme.txt"
    exit
}

normalize_weights(){
    perl -ne '{$tot=0;chomp;split;grep($tot+=($_>0)?$_:-$_,@_); grep($_/=$tot,@_); for ($i=0;$i<scalar(@_);$i++){printf STDOUT "%.6f ",$_[$i];};printf STDOUT "\n";}'
}

activeflag=0;
help=0
debug=""

if [ $# -lt 1 ] ; then PrintUsageAndDie ; fi

while [ $# -gt 0 ]
do
    case $1 in
        -help) help=1 ; shift 1 ; ;;
        -d) size=$2 ; shift 2 ; ;;
        -rootdir) SCRIPTS_ROOTDIR=$2 ; shift 2 ; ;;
        -debug) debug="-debug"; shift 1 ; ;;
        -activate) activeflag=1 ; activefields=$2 ; shift 2 ; ;;
        *) shift $# ; ;;
    esac
done

if [ $help == 1 ] ; then PrintUsageAndDie ; fi

# call the basic mert command
if [ $activeflag == 0 ] ; then
    $SCRIPTS_ROOTDIR/training/cmert-0.5/mert -d $size
    exit
fi

# else
if [ $debug ] ; then echo "names of active fields: $activefields" ; fi

# get indexes of active fields from file "names.txt"
oldname="__FALSE_NAME__"
name="__FALSE_NAME__"
separator="_"

i=1 lastj=1
for name in `cat names.txt` ; do
    if [ $name == $oldname ] ; then i=$(( i + 1 )) ; else i=1 ; fi
    arrayname[$lastj]=$name
    arrayname2[$lastj]=$name$separator$i
    lastj=$(( lastj + 1 ))
    oldname=$name
done

# map feature names into feature indexes
out=""
for name in `echo $activefields | tr ',' ' ' ` ; do
    match=0; j=1
    while [ $j -lt $lastj ] ; do

        if [ ${arrayname[$j]} == $name -o ${arrayname2[$j]} == "$name" ] ; then
            match=$j
            if [ $out ] ; then out="$out,$j" ; else out="$j" ; fi
        fi

        j=$(( j + 1 ))
    done

    if [ $match -eq 0 ] ; then echo "feature $name you are asking for is not present" ; fi

done

activefields=`echo $out | tr ',' '\012' | sort -nu | tr '\012' ',' | perl -pe 's/\,$//' `

if [ $debug ] ; then echo "indexes of active fields: $activefields" ; fi

# filter active fields, perform cmert and ...
tmpdir=tmp$$
mkdir -p $tmpdir

for file in feats.opt init.opt ; do
    mv $file $tmpdir
done

cat $tmpdir/init.opt | tail -1 > $tmpdir/weight.opt

cat $tmpdir/init.opt | perl $SCRIPTS_ROOTDIR/training/cmert-0.5/reduce-field.pl $debug -weight $tmpdir/weight.opt -d $size -activate $activefields | perl -pe 's/^\S+ /1 /' > init.opt
cat $tmpdir/feats.opt | perl $SCRIPTS_ROOTDIR/training/cmert-0.5/reduce-field.pl $debug -weight $tmpdir/weight.opt -d $size -activate $activefields > feats.opt

active=`cat init.opt | head -1 | awk '{print NF}'`

$SCRIPTS_ROOTDIR/training/cmert-0.5/mert -d $active 2> reduced_cmert.log

for file in feats.opt init.opt; do
    mv $file reduced_$file
    mv $tmpdir/$file $file
done

mv weights.txt reduced_weights.txt
cat reduced_weights.txt | perl $SCRIPTS_ROOTDIR/training/cmert-0.5/extend-field.pl $debug -weight $tmpdir/weight.opt -d $size -activate $activefields | normalize_weights > weights.txt
rm -r $tmpdir

bestpointline=`echo "Best point:"`
bestpointline="$bestpointline "`cat weights.txt`
bestpointline="$bestpointline => "`cat reduced_cmert.log | grep -i "Best point:" | awk '{print $NF}'`
echo $bestpointline > /dev/stderr

exit
@@ -1 +0,0 @@
0 10
@@ -1,10 +0,0 @@
4.0 383.916 60.6749 113.308 28.7833 94.443 -27.9971 66.0 49 66 27 65 16 64 10 63 67
6.0 370.709 67.0555 105.849 37.0838 85.7675 -29.9969 64.0 49 64 29 63 17 62 10 61 67
10.0 415.511 57.7613 97.1628 27.7191 83.3125 -28.997 68.0 54 68 30 67 19 66 13 65 67
6.0 412.823 59.5607 99.215 28.2344 82.0559 -28.997 67.0 53 67 32 66 20 65 13 64 67
4.0 422.048 56.6241 97.204 28.6241 80.8079 -28.997 67.0 52 67 30 66 19 65 13 64 67
4.0 392.685 60.6979 105.33 28.4244 90.094 -28.997 66.0 51 66 29 65 17 64 11 63 67
6.0 365.877 69.0651 108.001 37.33 83.4477 -31.9967 63.0 49 63 29 62 17 61 10 60 67
6.0 418.054 57.5832 97.2047 26.9759 83.6841 -29.9969 68.0 54 68 32 67 20 66 13 65 67
6.0 375.021 64.0915 103.471 38.6084 84.3162 -29.9969 63.0 49 63 28 62 16 61 10 60 67
6.0 364.308 71.1182 110.425 35.7858 82.8551 -30.9968 63.0 49 63 29 62 17 61 10 60 67
@@ -1,3 +0,0 @@
0 0 0 0 0 0 -1 -1
1 2 2 2 2 2 1 1
1 1 0.3 0.2 0.2 0.3 0 0
@@ -1 +0,0 @@
d lm tm tm tm tm tm w
@@ -1,12 +0,0 @@
mkdir -p example1
../enhanced-mert -d 8 >& cmert.log
mv cmert.log weights.txt example1

mkdir -p example2
../enhanced-mert -d 8 -activate lm,tm_2,tm_5,w >& cmert.log
mv cmert.log weights.txt reduced_* example2

mkdir -p example3
../enhanced-mert -d 8 -activate d,tm_1,tm_5 >& cmert.log
mv cmert.log weights.txt reduced_* example3
@@ -1,80 +0,0 @@
#! /usr/bin/perl

sub PrintArgsAndDie () {
    print stderr "USAGE: extend-field.pl [-h] \n";
    print stderr "This script extends the number of active fields for the mert procedure. (See the dual script reduce-field.pl)\n";
    exit(1);
}

my $weightfile="";
my $size=-1;
my $activefields="";

while (@ARGV){
    if ($ARGV[0] eq "-h"){
        &PrintArgsAndDie();
    }
    if ($ARGV[0] eq "-debug"){
        $debug=1;
        shift(@ARGV);
    }
    if ($ARGV[0] eq "-weight"){
        $weightfile=$ARGV[1];
        shift(@ARGV); shift(@ARGV);
    }
    if ($ARGV[0] eq "-d"){
        $size=$ARGV[1];
        shift(@ARGV); shift(@ARGV);
    }
    if ($ARGV[0] eq "-activate"){
        $activefields=$ARGV[1];
        shift(@ARGV); shift(@ARGV);
    }

}

die "Cannot open/find weight file ($weightfile)\n" if ! -e $weightfile;

my @weight=();
open(IN,$weightfile);
chomp($weight=<IN>);
close(IN);
push @weight,(0,split(/[ \t]+/,$weight));
my @active=();
my @invertedactive=();

if ($activefields eq ""){
    for (my $i=1; $i<=$size; $i++){ $active[$i]=1; };
}else{
    @active=split(/,/,$activefields);
}

for (my $i=0; $i<=$size; $i++){ $invertedactive[$i]=0; };
for (my $i=0; $i<scalar(@active); $i++){ $invertedactive[$active[$i]]=1; };
my $j=0;
for (my $i=1; $i<=$size; $i++){ if (!$invertedactive[$i]){$notactive[$j]=$i; $j++}};

if ($debug>0){
    print STDERR "ORIGINAL SIZE: $size\n";
    print STDERR "ORIGINAL WEIGHTS: @weight\n";
    print STDERR "ORIGINAL ACTIVE: @active\n";
    print STDERR "ORIGINAL NOTACTIVE: @notactive\n";
    print STDERR "ORIGINAL INVERTEDACTIVE: @invertedactive\n";
}

while(chomp($_=<STDIN>)){
    @field=split(/[ \t]+/,$_);

    my $j=1;
    for (my $i=1; $i<=$size; $i++){
        if ($invertedactive[$i]){
            print STDOUT "$field[$j] ";
            print STDERR "j:$j i:$i -> $field[$j]\n" if $debug>0;
            $j++;
        }else{
            printf STDOUT "%.6f ",$field[0]*$weight[$i];
            print STDERR "i:$i -> $field[0] $weight[$i]\n" if $debug>0;
        }
    };
    print STDOUT "\n";
}
@@ -1,19 +0,0 @@
#!/usr/bin/python

# $Id$
import sys

level = 1
file = sys.stderr

def writeln(s=""):
    file.write("%s\n" % s)
    file.flush()

def write(s):
    file.write(s)
    file.flush()
@@ -1,27 +0,0 @@
#!/usr/bin/perl -w

if ($#ARGV != 2) {
    die "usage: makeinitopt <ranges> <weightfile> <rangefile>"
}
$s = $ARGV[0];
$woutput = $ARGV[1];
$routput = $ARGV[2];
open WOUT, ">$woutput" || die "couldn't open $woutput";
open ROUT, ">$routput" || die "couldn't open $routput";

@w = ();
@lo = ();
@hi = ();
foreach $x (split(/;/, $s)) {
    if ($x =~ /(.*),(-?[\d.]+)-(-?[\d.]+)/) {
        push(@w, $1);
        push(@lo, $2);
        push(@hi, $3);
    } else {
        print STDERR "bad weight range: $x\n";
    }
}

print WOUT join(" ", @w), "\n";
print ROUT join(" ", @lo), "\n";
print ROUT join(" ", @hi), "\n";
@@ -1,88 +0,0 @@
#!/bin/sh

WORKDIR=$1
if [ ! -d $WORKDIR ]; then
    mkdir -p $WORKDIR
fi

SRCFILE=$2
REFPREFIX=$3
REFFILES=$REFPREFIX[0-9]*
NBEST=$4
DECODER=$5
DECODEROPTS=$6
RANGES=$7
START=$8

# default pwdcmd is pwd
# pwdcmd is pawd if exists
PWDCMD="pwd"
___PWDCMD=`which pawd | head -1 | awk '{print $1}'`
if [ $___PWDCMD -a -e $___PWDCMD ] ; then PWDCMD=$___PWDCMD ; fi;

RUNDIR=`$PWDCMD`

makeinitopt "$RANGES" $WORKDIR/weights.txt $WORKDIR/ranges.txt
DIM=`cat $WORKDIR/weights.txt | awk '{print NF; exit}'`
echo $DIM dimensions

PATH=/group/project/statmt/pkoehn/user/abhishek:/group/project/statmt/pkoehn/user/abhishek/cmert-0.5:$PATH
export PATH

date

echo Reference sets: $REFFILES

if [ "x$START" == "x" ]; then
    START=1
fi

I=$START
PREVLINECOUNT=0
#$DECODEROPTS =~ s / \-f / -config /;
#$DECODEROPTS =~ s/^\-f /-config /;
filename=$WORKDIR/run$I.best$NBEST.out

while true; do
    echo Run decoder

    WEIGHTS=`cat $WORKDIR/weights.txt`

    ### Changes - AA 29/11/05
    #echo "$DECODER $NBEST \"$WEIGHTS\" $WORKDIR/run$I \"$DECODEROPTS\" < $SRCFILE > $WORKDIR/run$I.nbest"
    #$DECODER $NBEST \"$WEIGHTS\" $WORKDIR/run$I \"$DECODEROPTS\" < $SRCFILE > $WORKDIR/run$I.nbest

    echo "$DECODER $DECODEROPTS \"$WEIGHTS\" -n-best-list $filename $NBEST < $SRCFILE > $WORKDIR/run$I.nbest"
    $DECODER $DECODEROPTS "$WEIGHTS" -n-best-list $filename $NBEST < $SRCFILE > $WORKDIR/run$I.nbest

    echo Calculate BLEU component scores

    sort -mn -t\| -k 1,1 $WORKDIR/run*.nbest | score-nbest.py $REFFILES $WORKDIR/

    #LINECOUNT=`cat $WORKDIR/feats.opt | awk '{n++} END {print n}'`
    LINECOUNT=`cat $WORKDIR/cands.opt | awk '{n += $2} END {print n}'`
    echo $LINECOUNT accumulated translations
    if [ $LINECOUNT -le $PREVLINECOUNT ]; then
        echo "Training finished"
        date
        break
    fi

    echo Optimize feature weights

    cd $WORKDIR
    cat ranges.txt weights.txt > init.opt
    rm -f weights.txt
    mert -d$DIM
    cd $RUNDIR

    if [ "x`cat $WORKDIR/weights.txt`" == "x" ]; then
        echo Optimization failed
        break
    fi

    I=`expr $I + 1`
    PREVLINECOUNT=$LINECOUNT

    date
done
@@ -1,432 +0,0 @@
// $Id$
#include <stdlib.h>
#include <unistd.h>
#include <math.h>

#include "data.h"
#include "point.h"
#include "score.h"

int verbose = 2;

float min_interval = 1e-3;

typedef struct {
  float x;
  int cand;
  int *delta_comps;
} intersection_t;

intersection_t *new_intersection(float x, int cand, int *comps1, int *comps2) {
  intersection_t *inter;
  int i;
  inter = malloc(sizeof(intersection_t));
  inter->x = x;
  inter->cand = cand; // this is not used but sometimes it's handy
  inter->delta_comps = malloc(comps_n * sizeof(int));
  for (i=0; i<comps_n; i++)
    inter->delta_comps[i] = comps1[i]-comps2[i];
  return inter;
}

void intersection_delete(intersection_t *inter) {
  free(inter->delta_comps);
  free(inter);
}

int compare_intersections(intersection_t **i1, intersection_t **i2) {
  if ((*i1)->x == (*i2)->x)
    return 0;
  else if ((*i1)->x < (*i2)->x)
    return -1;
  else
    return 1;
}

float slow_bleu(data_t *data, point_t *point) {
  int sent_i, cand_i, cand_n, i;
  candidate_t *cands;
  float p, best_p;
  int best;
  int *comps;
  float score;
  int ties, totalties;

  comps = calloc(comps_n, sizeof(int));

  totalties = 0;

  for (sent_i = 0; sent_i < data->sents_n; sent_i++) {
    cands = data->sents[sent_i];
    cand_n = data->cands_n[sent_i];

    ties = 0;

    best = 0;
    best_p = point_dotproduct(point, cands[0].features);
    for (cand_i = 1; cand_i < cand_n; cand_i++) {
      p = point_dotproduct(point, cands[cand_i].features);
      if (p > best_p) {
        best_p = p;
        best = cand_i;
        ties = 0;
      } else if (p == best_p) {
        ties++;
      }
    }
    totalties += ties;
    comps_addto(comps, cands[best].comps);
  }
  //point_print(point, stderr, 1);
  //fprintf(stderr, "\n");
  //fprintf(stderr, "slow bleu => %f\n", compute_score(comps));
  score = compute_score(comps);
  free(comps);
  return score;
}

/* Global optimization along a line (Och, 2004) */
point_t *line_optimize(data_t *data, point_t *origin, point_t *dir) {
  int sent_i, cand_i, cand_n, intersection_i;
  candidate_t *cands;
  static intersection_t **intersections = NULL;
  intersection_t *inter;
  static int intersection_max;
  int intersection_n = 0;
  int prev, leftmost;
  float x, leftmost_x, prev_x, best_x;
  float score, best_score;
  int *comps;
  point_t *point;
  int first;

  if (!origin->has_score)
    point_set_score(origin, slow_bleu(data, origin));

  if (verbose >= 2) {
    fprintf(stderr, "starting point: ");
    point_print(origin, stderr, 1);
    fprintf(stderr, "\n     direction: ");
    point_print(dir, stderr, 1);
    fprintf(stderr, "\n");
  }

  comps = calloc(comps_n, sizeof(int));

  if (intersections == NULL) {
    intersection_max = 10;
    intersections = malloc(intersection_max*sizeof(intersection_t *));
  }

  for (sent_i = 0; sent_i < data->sents_n; sent_i++) {
    cands = data->sents[sent_i];
    cand_n = data->cands_n[sent_i];

    if (verbose >= 3)
      fprintf(stderr, "sentence %d\n", sent_i);

    if (cand_n < 1)
      continue;

    /* calculate slopes and intercepts */
    for (cand_i = 0; cand_i < cand_n; cand_i++) {
      cands[cand_i].m = point_dotproduct(dir, cands[cand_i].features);
      cands[cand_i].b = point_dotproduct(origin, cands[cand_i].features);
    }

    /* find intersection points */

    /* find best candidate for x -> -inf */
    prev = -1;
    for (cand_i = 0; cand_i < cand_n; cand_i++)
      if (prev < 0 ||
          cands[cand_i].m < cands[prev].m ||
          cands[cand_i].m == cands[prev].m && cands[prev].b < cands[cand_i].b)
        prev = cand_i;

    if (verbose >= 3) {
      fprintf(stderr, "x->-inf cand %d\n", prev);
    }

    comps_addto(comps, cands[prev].comps);

    first = 1;
    while (1) {
      // find leftmost intersection
      leftmost = -1;
      for (cand_i = 0; cand_i < cand_n; cand_i++) {
        if (cands[prev].m == cands[cand_i].m) {
          if (cands[cand_i].b > cands[cand_i].b)
            fprintf(stderr, "two parallel lines and discarding the higher -- this shouldn't happen\n");
          continue; // no intersection
        }

        /* optimization: piecewise linear function must be concave up.
           Maybe it would be still faster to sort by slope beforehand */
        if (cands[cand_i].m < cands[prev].m)
          continue;

        x = -(cands[prev].b-cands[cand_i].b)/(cands[prev].m-cands[cand_i].m);

        if (leftmost < 0 || x < leftmost_x) {
          leftmost = cand_i;
          leftmost_x = x;
        }
      }

      if (leftmost < 0)
        break; // no more intersections

      /* Require that the intersection point be at least min_interval
         to the right of the previous one. If not, we replace the
         previous intersection point with this one. Yes, it can even
         happen that the new intersection point is slightly to the
         left of the old one, because of numerical imprecision. We
         don't check that the new point is also min_interval to the
         right of the penultimate one. In that case, the points would
         switch places in the sort, resulting in a bogus score for
         that interval. */

      if (first || leftmost_x - prev_x > min_interval) {
        if (intersection_n == intersection_max) {
          intersection_max *= 2;
          intersections = realloc(intersections, intersection_max*sizeof(intersection_t));
          if (intersections == NULL)
            fprintf(stderr, "couldn't realloc intersections\n");
        }
        intersections[intersection_n++] = new_intersection(leftmost_x, leftmost, cands[leftmost].comps, cands[prev].comps);
      } else {
        // replace the old one
        inter = new_intersection(leftmost_x, leftmost, cands[leftmost].comps, cands[prev].comps);
        comps_addto(inter->delta_comps, intersections[intersection_n-1]->delta_comps);
        intersection_delete(intersections[intersection_n-1]);
        intersections[intersection_n-1] = inter;
      }

      if (verbose >= 3)
        fprintf(stderr, "found intersection point: %f, right cand %d\n", leftmost_x, leftmost);
      prev = leftmost;
      prev_x = leftmost_x;
      first = 0;
    }
  }

  best_score = compute_score(comps);
  //fprintf(stderr, "x->-inf => %f\n", best_score);

  if (intersection_n == 0)
    best_x = 0.0;
  else {
    qsort(intersections, intersection_n, sizeof(intersection_t *), (int(*)(const void *, const void *))compare_intersections);
    best_x = intersections[0]->x - 1000.0; // whatever
  }
  for (intersection_i = 0; intersection_i < intersection_n; intersection_i++) {
    comps_addto(comps, intersections[intersection_i]->delta_comps);
    score = compute_score(comps);
    //fprintf(stderr, "x=%f => %f\n", intersections[intersection_i]->x, score);
    if (score > best_score) {
      best_score = score;
      if (intersection_i+1 < intersection_n)
        // what if interval is zero-width?
        best_x = 0.5*(intersections[intersection_i]->x + intersections[intersection_i+1]->x);
      else
        best_x = intersections[intersection_i]->x + 0.1; // whatever
    }
  }
  //fprintf(stderr, "best_x = %f\n", best_x);
  point = point_copy(dir);
  point_multiplyby(point, best_x);
  point_addto(point, origin);
  point_set_score(point, best_score);

  if (verbose >= 2) {
    fprintf(stderr, "  ending point: ");
    point_print(point, stderr, 1);
    fprintf(stderr, "\n");
    //check_comps(data, point, comps);
  }

  for (intersection_i = 0; intersection_i < intersection_n; intersection_i++)
    intersection_delete(intersections[intersection_i]);
  free(comps);

  if (best_score < origin->score) {
    /* this can happen in the case of a tie between two candidates with different bleu component scores. just trash the point and return the starting point */
    point_delete(point);
    return point_copy(origin);
  }

  return point;
}

point_t *optimize_powell(data_t *data, point_t *point) {
  int i;
  point_t **u, **p;
  float biggestwin, totalwin, extrapolatedwin;
  int biggestwin_i;
  point_t *point_e;

  u = malloc(dim*sizeof(point_t *));
  p = malloc(dim*sizeof(point_t *));

  point = point_copy(point);
  if (!point->has_score)
    point_set_score(point, slow_bleu(data, point));

  for (i=0; i<dim; i++) {
    u[i] = new_point();
    u[i]->weights[i] = 1.0;
  }

  while (1) {
    p[0] = line_optimize(data, point, u[0]);
    biggestwin_i = 0;
    biggestwin = p[0]->score - point->score;
    for (i=1; i<dim; i++) {
      p[i] = line_optimize(data, p[i-1], u[i]);
      if (p[i]->score - p[i-1]->score > biggestwin) {
        biggestwin_i = i;
        biggestwin = p[i]->score - p[i-1]->score;
      }
    }

    totalwin = p[dim-1]->score - point->score;

    if (totalwin < 0.000001)
      break;

    // last point minus first point
    point_multiplyby(point, -1.0);
    point_addto(point, p[dim-1]);

    point_e = point_copy(point);
    point_addto(point_e, p[dim-1]);
    point_set_score(point_e, slow_bleu(data, point_e));
    extrapolatedwin = point_e->score - point->score; // point->score is the original point

    if (extrapolatedwin > 0 &&
        2*(2*totalwin - extrapolatedwin) *
        powf(totalwin - biggestwin, 2.0f) <
        powf(extrapolatedwin, 2.0f)*biggestwin) {
      // replace dominant direction vector with sum vector
      point_delete(u[biggestwin_i]);
      point_normalize(point);
      u[biggestwin_i] = point;
    }

    point_delete(point_e);

    // optimization continues with last point
    point = p[dim-1];

    for (i=0; i<dim-1; i++)
      if (i != biggestwin_i)
        point_delete(p[i]);
  }

  for (i=0; i<dim; i++)
    point_delete(u[i]);

  free(u);
  free(p);

  point_normalize(point);
  return point;
}

point_t *optimize_koehn(data_t *data, point_t *point) {
  point_t *dir, **newpoints;
  int dir_i;
  int best_dir = -1;
  dir = new_point();
  newpoints = malloc(dim*sizeof(point_t *));

  point = point_copy(point);

  while (1) {
    for (dir_i = 0; dir_i < dim; dir_i++) {
      dir->weights[dir_i] = 1.0;
      newpoints[dir_i] = line_optimize(data, point, dir);
      if (best_dir < 0 || newpoints[dir_i]->score > newpoints[best_dir]->score)
        best_dir = dir_i;
      dir->weights[dir_i] = 0.0;
    }
    if (point->has_score && newpoints[best_dir]->score - point->score < 0.000001)
      break;

    point_delete(point);
    point = newpoints[best_dir];

    // discard the other points
    for (dir_i = 0; dir_i < dim; dir_i++)
      if (dir_i != best_dir)
        point_delete(newpoints[dir_i]);
  }

  point_delete(dir);
  free(newpoints);

  point_normalize(point);
  return point;
}

void usage(void) {
  fprintf(stderr, "usage: mert -d <dimensions>\n");
  exit(1);
}

int main (int argc, char **argv) {
  int point_i;
  int points_n = 20;
  point_t *min, *max;
  data_t *data;
  point_t *bestpoint, *newpoint, *startpoint;
  int i, c;
  FILE *fp;

  while ((c = getopt(argc, argv, "d:n:")) != -1) {
    switch (c) {
    case 'd':
      dim = strtol(optarg, NULL, 10);
      break;
    case 'n':
      points_n = strtol(optarg, NULL, 10);
      break;
    default:
      usage();
    }
  }
  argc -= optind;
  argv += optind;

  if (dim < 0)
    usage();

  if ((data = read_data()) == NULL) exit(1);

  fp = fopen("init.opt", "r");
  if ((min = read_point(fp)) == NULL) exit(1);
  if ((max = read_point(fp)) == NULL) exit(1);
  if ((startpoint = read_point(fp)) == NULL) exit(1);
  fclose(fp);

  bestpoint = NULL;
  for (point_i=0; point_i<points_n; point_i++) {
    fprintf(stderr, "*** point %d ***\n", point_i);
    if (point_i == 0)
      newpoint = startpoint;
    else
      newpoint = random_point(min, max);
    newpoint = optimize_koehn(data, newpoint);
    if (bestpoint == NULL || newpoint->score > bestpoint->score)
      bestpoint = newpoint; // who cares about the leak
  }
  fprintf(stderr, "Best point: ");
  point_print(bestpoint, stderr, 1);
  fprintf(stderr, "\n");

  fp = fopen("weights.txt", "w");
  point_print(bestpoint, fp, 0);
  fprintf(fp, "\n");
  fclose(fp);
}
@@ -1,117 +0,0 @@
// $Id$
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

#include "point.h"

int dim = -1;

point_t *new_point() {
  point_t *point;
  point = malloc(sizeof(point_t));
  point->score = 0.0;
  point->weights = calloc(dim, sizeof(float));
  point->has_score = 0;
  return point;
}

void point_set_score(point_t *point, float score) {
  point->has_score = 1;
  point->score = score;
}

void point_delete(point_t *point) {
  free(point->weights);
  free(point);
}

point_t *random_point(point_t *min, point_t *max) {
  int i;
  point_t *point = new_point();
  for (i=0; i<dim; i++)
    point->weights[i] = min->weights[i] + (float)random()/RAND_MAX * (max->weights[i]-min->weights[i]);
  return point;
}

point_t *point_copy(point_t *point) {
  point_t *newpoint;
  int i;
  newpoint = new_point();
  newpoint->score = point->score;
  newpoint->has_score = point->has_score;
  for (i=0; i<dim; i++)
    newpoint->weights[i] = point->weights[i];
  return newpoint;
}

float point_dotproduct(point_t *point, float *y) {
  float result;
  int i;
  result = 0.0;
  for (i=0; i<dim; i++)
    result += point->weights[i] * y[i];
  return result;
}

/* Destructive operations */
void point_multiplyby(point_t *point, float k) {
  int i;
  for (i=0; i<dim; i++)
    point->weights[i] *= k;
}

void point_addto(point_t *point1, point_t *point2) {
  int i;
  for (i=0; i<dim; i++)
    point1->weights[i] += point2->weights[i];
}

void point_normalize(point_t *point) {
  int i;
  float norm = 0.0;
  for (i=0; i<dim; i++)
    //norm += point->weights[i] * point->weights[i];
    norm += fabs(point->weights[i]);
  // norm = sqrt(norm);
  for (i=0; i<dim; i++)
    point->weights[i] /= norm;
}

void point_print(point_t *point, FILE *fp, int with_score) {
  int i;
  fprintf(fp, "%f", point->weights[0]);
  for (i=1; i<dim; i++)
    fprintf(fp, " %f", point->weights[i]);
  if (point->has_score && with_score)
    fprintf(fp, " => %f", point->score);
}

point_t *read_point(FILE *fp) {
  static char buf[1000];
  char *tok, *s;
  int field;
  point_t *point;

  point = new_point();

  fgets(buf, sizeof(buf), fp);
  s = buf;
  field = 0;
  while ((tok = strsep(&s, " \t\n")) != NULL) {
    if (!*tok) // empty token
      continue;
    if (field >= dim) {
      fprintf(stderr, "read_point(): too many fields in line\n");
      return NULL;
    } else
      point->weights[field] = strtod(tok, NULL);
    field++;
  }
  if (field < dim) {
    fprintf(stderr, "read_point(): wrong number of fields in line\n");
    return NULL;
  }
  return point;
}
@@ -1,26 +0,0 @@
// $Id$
#ifndef POINT_H
#define POINT_H

typedef struct {
  float *weights;
  int has_score;
  float score;
} point_t;

extern int dim;

point_t *new_point();
void point_set_score(point_t *point, float score);
void point_delete(point_t *point);
point_t *point_copy(point_t *point);
point_t *random_point(point_t *min, point_t *max);
float point_dotproduct(point_t *point, float *y);
void point_multiplyby(point_t *point, float k);
void point_normalize(point_t *point);
void point_addto(point_t *point1, point_t *point2);
#include <stdio.h>
point_t *read_point(FILE *fp);
void point_print(point_t *point, FILE *fp, int with_score);

#endif
@@ -1,57 +0,0 @@
###########################################################################
#
#  Psyco top-level file of the Psyco package.
#   Copyright (C) 2001-2002  Armin Rigo et.al.

"""Psyco -- the Python Specializing Compiler.

Typical usage: add the following lines to your application's main module:

try:
    import psyco
    psyco.profile()
except:
    print 'Psyco not found, ignoring it'
"""
###########################################################################


#
# This module is present to make 'psyco' a package and to
# publish the main functions and variables.
#
# More documentation can be found in core.py.
#


# Try to import the dynamic-loading _psyco and report errors
try:
    import _psyco
except ImportError, e:
    extramsg = ''
    import sys, imp
    try:
        file, filename, (suffix, mode, type) = imp.find_module('_psyco', __path__)
    except ImportError:
        ext = [suffix for suffix, mode, type in imp.get_suffixes()
               if type == imp.C_EXTENSION]
        if ext:
            extramsg = (" (cannot locate the compiled extension '_psyco%s' "
                        "in the package path '%s')" % (ext[0], '; '.join(__path__)))
    else:
        extramsg = (" (check that the compiled extension '%s' is for "
                    "the correct Python version; this is Python %s)" %
                    (filename, sys.version.split()[0]))
    raise ImportError, str(e) + extramsg

# Publish important data by importing them in the package
from support import __version__, error, warning, _getrealframe, _getemulframe
from support import version_info, __version__ as hexversion
from core import full, profile, background, runonly, stop, cannotcompile
from core import log, bind, unbind, proxy, unproxy, dumpcodebuf
from _psyco import setfilter

try:
    from _psyco import compact, compacttype  # Python 2.2 and above only
except ImportError:
    pass
Binary file not shown.
@ -1,53 +0,0 @@
###########################################################################
#
# Psyco class support module.
# Copyright (C) 2001-2002 Armin Rigo et.al.

"""Psyco class support module.

'psyco.classes.psyobj' is an alternate Psyco-optimized root for classes.
Any class inheriting from it or using the metaclass '__metaclass__' might
get optimized specifically for Psyco. It is equivalent to calling
psyco.bind() on the class object after its creation.

Note that this module has no effect with Python version 2.1 or earlier.

Importing everything from psyco.classes in a module will import the
'__metaclass__' name, so all classes defined after a

    from psyco.classes import *

will automatically use the Psyco-optimized metaclass.
"""
###########################################################################

__all__ = ['psyobj', 'psymetaclass', '__metaclass__']


# Python version check
try:
    from _psyco import compacttype
except ImportError:
    class psyobj:        # compatibility
        pass
    psymetaclass = None
else:
    # version >= 2.2 only

    import core
    from types import FunctionType

    class psymetaclass(compacttype):
        "Psyco-optimized meta-class. Turns all methods into Psyco proxies."

        def __new__(cls, name, bases, dict):
            bindlist = dict.get('__psyco__bind__')
            if bindlist is None:
                bindlist = [key for key, value in dict.items()
                            if isinstance(value, FunctionType)]
            for attr in bindlist:
                dict[attr] = core.proxy(dict[attr])
            return super(psymetaclass, cls).__new__(cls, name, bases, dict)

    psyobj = psymetaclass("psyobj", (), {})
__metaclass__ = psymetaclass
@ -1,232 +0,0 @@
###########################################################################
#
# Psyco main functions.
# Copyright (C) 2001-2002 Armin Rigo et.al.

"""Psyco main functions.

Here are the routines that you can use from your applications.
These are mostly interfaces to the C core, but they depend on
the Python version.

You can use these functions from the 'psyco' module instead of
'psyco.core', e.g.

    import psyco
    psyco.log('/tmp/psyco.log')
    psyco.profile()
"""
###########################################################################

import _psyco
import types, new
from support import *


# Default charge profiler values
default_watermark           = 0.09   # between 0.0 (0%) and 1.0 (100%)
default_halflife            = 0.5    # seconds
default_pollfreq_profile    = 20     # Hz
default_pollfreq_background = 100    # Hz -- a maximum for sleep's resolution
default_parentframe         = 0.25   # should not be more than 0.5 (50%)


def full(memory=None, time=None, memorymax=None, timemax=None):
    """Compile as much as possible.

    Typical use is for small scripts performing intensive computations
    or string handling."""
    import profiler
    if PYTHON_SUPPORT:
        p = profiler.FullCompiler()
    else:
        p = profiler.ActiveProfiler(0.0, 0.5)
    p.run(memory, time, memorymax, timemax)


def profile(watermark   = default_watermark,
            halflife    = default_halflife,
            pollfreq    = default_pollfreq_profile,
            parentframe = default_parentframe,
            memory=None, time=None, memorymax=None, timemax=None):
    """Turn on profiling.

    The 'watermark' parameter controls how easily running functions will
    be compiled. The smaller the value, the more functions are compiled."""
    import profiler
    p = profiler.ActivePassiveProfiler(watermark, halflife,
                                       pollfreq, parentframe)
    p.run(memory, time, memorymax, timemax)


def background(watermark   = default_watermark,
               halflife    = default_halflife,
               pollfreq    = default_pollfreq_background,
               parentframe = default_parentframe,
               memory=None, time=None, memorymax=None, timemax=None):
    """Turn on passive profiling.

    This is a very lightweight mode in which only intensively computing
    functions can be detected. The smaller the 'watermark', the more functions
    are compiled."""
    import profiler
    p = profiler.PassiveProfiler(watermark, halflife, pollfreq, parentframe)
    p.run(memory, time, memorymax, timemax)


def runonly(memory=None, time=None, memorymax=None, timemax=None):
    """Nonprofiler.

    XXX check if this is useful and document."""
    if PYTHON_SUPPORT:
        import profiler
        p = profiler.RunOnly()
        p.run(memory, time, memorymax, timemax)


def stop():
    """Turn off all automatic compilation. bind() calls remain in effect."""
    import profiler
    profiler.go([])


def log(logfile='', mode='w', top=10):
    """Enable logging to the given file.

    If the file name is unspecified, a default name is built by appending
    a 'log-psyco' extension to the main script name.

    Mode is 'a' to append to a possibly existing file or 'w' to overwrite
    an existing file. Note that the log file may grow quickly in 'a' mode."""
    import profiler, logger
    if not logfile:
        import os
        logfile, dummy = os.path.splitext(sys.argv[0])
        if os.path.basename(logfile):
            logfile += '.'
        logfile += 'log-psyco'
    if hasattr(_psyco, 'VERBOSE_LEVEL'):
        print >> sys.stderr, 'psyco: logging to', logfile
    # logger.current should be a real file object; subtle problems
    # will show up if its write() and flush() methods are written
    # in Python, as Psyco will invoke them while compiling.
    logger.current = open(logfile, mode)
    logger.print_charges = top
    profiler.logger = logger
    logger.writedate('Logging started')
    cannotcompile(logger.psycowrite)
    _psyco.statwrite(logger=logger.psycowrite)


def bind(x, rec=None):
    """Enable compilation of the given function, method, or class object.

    If C is a class (or anything with a '__dict__' attribute), bind(C) will
    rebind all functions and methods found in C.__dict__ (which means, for
    classes, all methods defined in the class but not in its parents).

    The optional second argument specifies the number of recursive
    compilation levels: all functions called by func are compiled
    up to the given depth of indirection."""
    if isinstance(x, types.MethodType):
        x = x.im_func
    if isinstance(x, types.FunctionType):
        if rec is None:
            x.func_code = _psyco.proxycode(x)
        else:
            x.func_code = _psyco.proxycode(x, rec)
        return
    if hasattr(x, '__dict__'):
        funcs = [o for o in x.__dict__.values()
                 if isinstance(o, types.MethodType)
                 or isinstance(o, types.FunctionType)]
        if not funcs:
            raise error, ("nothing bindable found in %s object" %
                          type(x).__name__)
        for o in funcs:
            bind(o, rec)
        return
    raise TypeError, "cannot bind %s objects" % type(x).__name__


def unbind(x):
    """Reverse of bind()."""
    if isinstance(x, types.MethodType):
        x = x.im_func
    if isinstance(x, types.FunctionType):
        try:
            f = _psyco.unproxycode(x.func_code)
        except error:
            pass
        else:
            x.func_code = f.func_code
        return
    if hasattr(x, '__dict__'):
        for o in x.__dict__.values():
            if (isinstance(o, types.MethodType)
                or isinstance(o, types.FunctionType)):
                unbind(o)
        return
    raise TypeError, "cannot unbind %s objects" % type(x).__name__


def proxy(x, rec=None):
    """Return a Psyco-enabled copy of the function.

    The original function is still available for non-compiled calls.
    The optional second argument specifies the number of recursive
    compilation levels: all functions called by func are compiled
    up to the given depth of indirection."""
    if isinstance(x, types.FunctionType):
        if rec is None:
            code = _psyco.proxycode(x)
        else:
            code = _psyco.proxycode(x, rec)
        return new.function(code, x.func_globals, x.func_name)
    if isinstance(x, types.MethodType):
        p = proxy(x.im_func, rec)
        return new.instancemethod(p, x.im_self, x.im_class)
    raise TypeError, "cannot proxy %s objects" % type(x).__name__
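# Added usage sketch (not in the original file): unlike bind(), proxy()
# leaves the original function untouched:
#
#     fast_f = psyco.proxy(f)   # compiled copy of f
#     f(args)                   # still interpreted by CPython
#     fast_f(args)              # runs through Psyco's compiled code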


def unproxy(proxy):
    """Return a new copy of the original function or method behind a proxy.
    The result behaves like the original function in that calling it
    does not trigger compilation nor execution of any compiled code."""
    if isinstance(proxy, types.FunctionType):
        return _psyco.unproxycode(proxy.func_code)
    if isinstance(proxy, types.MethodType):
        f = unproxy(proxy.im_func)
        return new.instancemethod(f, proxy.im_self, proxy.im_class)
    raise TypeError, "%s objects cannot be proxies" % type(proxy).__name__


def cannotcompile(x):
    """Instruct Psyco never to compile the given function, method
    or code object."""
    if isinstance(x, types.MethodType):
        x = x.im_func
    if isinstance(x, types.FunctionType):
        x = x.func_code
    if isinstance(x, types.CodeType):
        _psyco.cannotcompile(x)
    else:
        raise TypeError, "unexpected %s object" % type(x).__name__


def dumpcodebuf():
    """Write in file psyco.dump a copy of the emitted machine code,
    provided Psyco was compiled with a non-zero CODE_DUMP.
    See py-utils/httpxam.py to examine psyco.dump."""
    if hasattr(_psyco, 'dumpcodebuf'):
        _psyco.dumpcodebuf()


###########################################################################
# Psyco variables
#   error        * the error raised by Psyco
#   warning      * the warning raised by Psyco
#   __in_psyco__ * a new built-in variable which is always zero, but which
#                    Psyco special-cases by returning 1 instead. So
#                    __in_psyco__ can be used in a function to know if
#                    that function is being executed by Psyco or not.
@ -1,133 +0,0 @@
###########################################################################
#
# Support code for the 'psyco.compact' type.

from __future__ import generators

try:
    from UserDict import DictMixin
except ImportError:

    # backported from Python 2.3 to Python 2.2
    class DictMixin:
        # Mixin defining all dictionary methods for classes that already have
        # a minimum dictionary interface including getitem, setitem, delitem,
        # and keys. Without knowledge of the subclass constructor, the mixin
        # does not define __init__() or copy(). In addition to the four base
        # methods, progressively more efficiency comes with defining
        # __contains__(), __iter__(), and iteritems().

        # second level definitions support higher levels
        def __iter__(self):
            for k in self.keys():
                yield k
        def has_key(self, key):
            try:
                value = self[key]
            except KeyError:
                return False
            return True
        def __contains__(self, key):
            return self.has_key(key)

        # third level takes advantage of second level definitions
        def iteritems(self):
            for k in self:
                yield (k, self[k])
        def iterkeys(self):
            return self.__iter__()

        # fourth level uses definitions from lower levels
        def itervalues(self):
            for _, v in self.iteritems():
                yield v
        def values(self):
            return [v for _, v in self.iteritems()]
        def items(self):
            return list(self.iteritems())
        def clear(self):
            for key in self.keys():
                del self[key]
        def setdefault(self, key, default):
            try:
                return self[key]
            except KeyError:
                self[key] = default
                return default
        def pop(self, key, *args):
            if len(args) > 1:
                raise TypeError, "pop expected at most 2 arguments, got "\
                                 + repr(1 + len(args))
            try:
                value = self[key]
            except KeyError:
                if args:
                    return args[0]
                raise
            del self[key]
            return value
        def popitem(self):
            try:
                k, v = self.iteritems().next()
            except StopIteration:
                raise KeyError, 'container is empty'
            del self[k]
            return (k, v)
        def update(self, other):
            # Make progressively weaker assumptions about "other"
            if hasattr(other, 'iteritems'):  # iteritems saves memory and lookups
                for k, v in other.iteritems():
                    self[k] = v
            elif hasattr(other, '__iter__'):  # iter saves memory
                for k in other:
                    self[k] = other[k]
            else:
                for k in other.keys():
                    self[k] = other[k]
        def get(self, key, default=None):
            try:
                return self[key]
            except KeyError:
                return default
        def __repr__(self):
            return repr(dict(self.iteritems()))
        def __cmp__(self, other):
            if other is None:
                return 1
            if isinstance(other, DictMixin):
                other = dict(other.iteritems())
            return cmp(dict(self.iteritems()), other)
        def __len__(self):
            return len(self.keys())


###########################################################################

from _psyco import compact  # Python 2.2 and above only
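# Added note: compactdictproxy (below) adapts a 'compact' object to the full
# mapping interface. DictMixin supplies all the derived dict methods on top
# of the three slot primitives __getslot__/__setslot__/__delslot__ and the
# __members__ key list exposed by the compact type.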


class compactdictproxy(DictMixin):

    def __init__(self, ko):
        self._ko = ko    # compact object of which 'self' is the dict

    def __getitem__(self, key):
        return compact.__getslot__(self._ko, key)

    def __setitem__(self, key, value):
        compact.__setslot__(self._ko, key, value)

    def __delitem__(self, key):
        compact.__delslot__(self._ko, key)

    def keys(self):
        return compact.__members__.__get__(self._ko)

    def clear(self):
        keys = self.keys()
        keys.reverse()
        for key in keys:
            del self[key]

    def __repr__(self):
        keys = ', '.join(self.keys())
        return '<compactdictproxy object {%s}>' % (keys,)
@ -1,90 +0,0 @@
###########################################################################
#
# Psyco logger.
# Copyright (C) 2001-2002 Armin Rigo et.al.

"""Psyco logger.

See log() in core.py.
"""
###########################################################################


import _psyco
from time import time, localtime, strftime


current = None
print_charges = 10
dump_delay = 0.2
dump_last = 0.0

def write(s, level):
    t = time()
    f = t-int(t)
    current.write("%s.%02d %-*s%s\n" % (
        strftime("%X", localtime(int(t))),
        int(f*100.0), 63-level, s,
        "%"*level))
    current.flush()
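# Added note: each log line is "HH:MM:SS.cc message" followed by 'level'
# '%' characters; since the message field is padded to width 63-level, the
# '%' marks always end at the same column, so a higher level shows up as a
# longer run of '%' at a fixed right margin.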

def psycowrite(s):
    t = time()
    f = t-int(t)
    current.write("%s.%02d %-*s%s\n" % (
        strftime("%X", localtime(int(t))),
        int(f*100.0), 60, s.strip(),
        "% %"))
    current.flush()

##def writelines(lines, level=0):
##    if lines:
##        t = time()
##        f = t-int(t)
##        timedesc = strftime("%x %X", localtime(int(t)))
##        print >> current, "%s.%03d %-*s %s" % (
##            timedesc, int(f*1000),
##            50-level, lines[0],
##            "+"*level)
##        timedesc = " " * (len(timedesc)+5)
##        for line in lines[1:]:
##            print >> current, timedesc, line

def writememory():
    write("memory usage: %d+ kb" % _psyco.memory(), 1)

def dumpcharges():
    global dump_last
    if print_charges:
        t = time()
        if not (dump_last <= t < dump_last+dump_delay):
            if t <= dump_last+1.5*dump_delay:
                dump_last += dump_delay
            else:
                dump_last = t
            #write("%s: charges:" % who, 0)
            lst = _psyco.stattop(print_charges)
            if lst:
                f = t-int(t)
                lines = ["%s.%02d ______\n" % (
                    strftime("%X", localtime(int(t))),
                    int(f*100.0))]
                i = 1
                for co, charge in lst:
                    detail = co.co_filename
                    if len(detail) > 19:
                        detail = '...' + detail[-17:]
                    lines.append(" #%-3d |%4.1f %%| %-26s%20s:%d\n" %
                                 (i, charge*100.0, co.co_name, detail,
                                  co.co_firstlineno))
                    i += 1
                current.writelines(lines)
                current.flush()

def writefinalstats():
    dumpcharges()
    writememory()
    writedate("program exit")

def writedate(msg):
    write('%s, %s' % (msg, strftime("%x")), 20)
@ -1,388 +0,0 @@
###########################################################################
#
# Psyco profiler (Python part).
# Copyright (C) 2001-2002 Armin Rigo et.al.

"""Psyco profiler (Python part).

The implementation of the non-time-critical parts of the profiler.
See profile() and full() in core.py for the easy interface.
"""
###########################################################################

import _psyco
from support import *
import math, time, types, atexit
now = time.time
try:
    import thread
except ImportError:
    import dummy_thread as thread


# current profiler instance
current = None

# enabled profilers, in order of priority
profilers = []

# logger module (when enabled by core.log())
logger = None

# a lock for a thread-safe go()
go_lock = thread.allocate_lock()

def go(stop=0):
    # run the highest-priority profiler in 'profilers'
    global current
    go_lock.acquire()
    try:
        prev = current
        if stop:
            del profilers[:]
        if prev:
            if profilers and profilers[0] is prev:
                return   # best profiler already running
            prev.stop()
            current = None
        for p in profilers[:]:
            if p.start():
                current = p
                if logger:  # and p is not prev:
                    logger.write("%s: starting" % p.__class__.__name__, 5)
                return
    finally:
        go_lock.release()
    # no profiler is running now
    if stop:
        if logger:
            logger.writefinalstats()
    else:
        tag2bind()

atexit.register(go, 1)


def buildfncache(globals, cache):
    if hasattr(types.IntType, '__dict__'):
        clstypes = (types.ClassType, types.TypeType)
    else:
        clstypes = types.ClassType
    for x in globals.values():
        if isinstance(x, types.MethodType):
            x = x.im_func
        if isinstance(x, types.FunctionType):
            cache[x.func_code] = x, ''
        elif isinstance(x, clstypes):
            for y in x.__dict__.values():
                if isinstance(y, types.MethodType):
                    y = y.im_func
                if isinstance(y, types.FunctionType):
                    cache[y.func_code] = y, x.__name__

# code-to-function mapping (cache)
function_cache = {}

def trytobind(co, globals, log=1):
    try:
        f, clsname = function_cache[co]
    except KeyError:
        buildfncache(globals, function_cache)
        try:
            f, clsname = function_cache[co]
        except KeyError:
            if logger:
                logger.write('warning: cannot find function %s in %s' %
                             (co.co_name, globals.get('__name__', '?')), 3)
            return  # give up
    if logger and log:
        modulename = globals.get('__name__', '?')
        if clsname:
            modulename += '.' + clsname
        logger.write('bind function: %s.%s' % (modulename, co.co_name), 1)
    f.func_code = _psyco.proxycode(f)


if PYTHON_SUPPORT:
    # the list of code objects that have been tagged
    tagged_codes = []

    def tag(co, globals):
        if logger:
            try:
                f, clsname = function_cache[co]
            except KeyError:
                buildfncache(globals, function_cache)
                try:
                    f, clsname = function_cache[co]
                except KeyError:
                    clsname = ''  # give up
            modulename = globals.get('__name__', '?')
            if clsname:
                modulename += '.' + clsname
            logger.write('tag function: %s.%s' % (modulename, co.co_name), 1)
        tagged_codes.append((co, globals))
        _psyco.turbo_frame(co)
        _psyco.turbo_code(co)

    def tag2bind():
        if tagged_codes:
            if logger:
                logger.write('profiling stopped, binding %d functions' %
                             len(tagged_codes), 2)
            for co, globals in tagged_codes:
                trytobind(co, globals, 0)
            function_cache.clear()
            del tagged_codes[:]

else:
    # tagging is impossible, always bind
    tag = trytobind
    def tag2bind():
        pass


class Profiler:
    MemoryTimerResolution = 0.103

    def run(self, memory, time, memorymax, timemax):
        self.memory = memory
        self.memorymax = memorymax
        self.time = time
        if timemax is None:
            self.endtime = None
        else:
            self.endtime = now() + timemax
        self.alarms = []
        profilers.append(self)
        go()

    def start(self):
        curmem = _psyco.memory()
        memlimits = []
        if self.memorymax is not None:
            if curmem >= self.memorymax:
                if logger:
                    logger.writememory()
                return self.limitreached('memorymax')
            memlimits.append(self.memorymax)
        if self.memory is not None:
            if self.memory <= 0:
                if logger:
                    logger.writememory()
                return self.limitreached('memory')
            memlimits.append(curmem + self.memory)
        self.memory_at_start = curmem

        curtime = now()
        timelimits = []
        if self.endtime is not None:
            if curtime >= self.endtime:
                return self.limitreached('timemax')
            timelimits.append(self.endtime - curtime)
        if self.time is not None:
            if self.time <= 0.0:
                return self.limitreached('time')
            timelimits.append(self.time)
        self.time_at_start = curtime

        try:
            self.do_start()
        except error, e:
            if logger:
                logger.write('%s: disabled by psyco.error:' % (
                    self.__class__.__name__), 4)
                logger.write('  %s' % str(e), 3)
            return 0

        if memlimits:
            self.memlimits_args = (time.sleep, (self.MemoryTimerResolution,),
                                   self.check_memory, (min(memlimits),))
            self.alarms.append(_psyco.alarm(*self.memlimits_args))
        if timelimits:
            self.alarms.append(_psyco.alarm(time.sleep, (min(timelimits),),
                                            self.time_out))
        return 1

    def stop(self):
        for alarm in self.alarms:
            alarm.stop(0)
        for alarm in self.alarms:
            alarm.stop(1)   # wait for parallel threads to stop
        del self.alarms[:]
        if self.time is not None:
            self.time -= now() - self.time_at_start
        if self.memory is not None:
            self.memory -= _psyco.memory() - self.memory_at_start

        try:
            self.do_stop()
        except error:
            return 0
        return 1

    def check_memory(self, limit):
        if _psyco.memory() < limit:
            return self.memlimits_args
        go()

    def time_out(self):
        self.time = 0.0
        go()

    def limitreached(self, limitname):
        try:
            profilers.remove(self)
        except ValueError:
            pass
        if logger:
            logger.write('%s: disabled (%s limit reached)' % (
                self.__class__.__name__, limitname), 4)
        return 0


class FullCompiler(Profiler):

    def do_start(self):
        _psyco.profiling('f')

    def do_stop(self):
        _psyco.profiling('.')


class RunOnly(Profiler):

    def do_start(self):
        _psyco.profiling('n')

    def do_stop(self):
        _psyco.profiling('.')


class ChargeProfiler(Profiler):

    def __init__(self, watermark, parentframe):
        self.watermark = watermark
        self.parent2 = parentframe * 2.0
        self.lock = thread.allocate_lock()

    def init_charges(self):
        _psyco.statwrite(watermark = self.watermark,
                         parent2 = self.parent2)

    def do_stop(self):
        _psyco.profiling('.')
        _psyco.statwrite(callback = None)


class ActiveProfiler(ChargeProfiler):

    def active_start(self):
        _psyco.profiling('p')

    def do_start(self):
        self.init_charges()
        self.active_start()
        _psyco.statwrite(callback = self.charge_callback)

    def charge_callback(self, frame, charge):
        tag(frame.f_code, frame.f_globals)


class PassiveProfiler(ChargeProfiler):

    initial_charge_unit = _psyco.statread('unit')
    reset_stats_after = 120   # half-lives (maximum 200!)
    reset_limit = initial_charge_unit * (2.0 ** reset_stats_after)

    def __init__(self, watermark, halflife, pollfreq, parentframe):
        ChargeProfiler.__init__(self, watermark, parentframe)
        self.pollfreq = pollfreq
        # self.progress is slightly more than 1.0, and computed so that
        # do_profile() will double the charge_unit every 'halflife' seconds.
        self.progress = 2.0 ** (1.0 / (halflife * pollfreq))
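        # Added worked example (not in the original source): with the default
        # halflife = 0.5 s and pollfreq = 20 Hz, progress = 2**(1/10), about
        # 1.072 per poll, which compounds to a factor of 2 after 10 polls.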

    def reset(self):
        _psyco.statwrite(unit = self.initial_charge_unit, callback = None)
        _psyco.statreset()
        if logger:
            logger.write("%s: resetting stats" % self.__class__.__name__, 1)

    def passive_start(self):
        self.passivealarm_args = (time.sleep, (1.0 / self.pollfreq,),
                                  self.do_profile)
        self.alarms.append(_psyco.alarm(*self.passivealarm_args))

    def do_start(self):
        tag2bind()
        self.init_charges()
        self.passive_start()

    def do_profile(self):
        _psyco.statcollect()
        if logger:
            logger.dumpcharges()
        nunit = _psyco.statread('unit') * self.progress
        if nunit > self.reset_limit:
            self.reset()
        else:
            _psyco.statwrite(unit = nunit, callback = self.charge_callback)
        return self.passivealarm_args

    def charge_callback(self, frame, charge):
        trytobind(frame.f_code, frame.f_globals)


class ActivePassiveProfiler(PassiveProfiler, ActiveProfiler):

    def do_start(self):
        self.init_charges()
        self.active_start()
        self.passive_start()

    def charge_callback(self, frame, charge):
        tag(frame.f_code, frame.f_globals)


#
# we register our own version of sys.settrace(), sys.setprofile()
# and thread.start_new_thread().
#

def psyco_settrace(*args, **kw):
    "This is the Psyco-aware version of sys.settrace()."
    result = original_settrace(*args, **kw)
    go()
    return result

def psyco_setprofile(*args, **kw):
    "This is the Psyco-aware version of sys.setprofile()."
    result = original_setprofile(*args, **kw)
    go()
    return result

def psyco_thread_stub(callable, args, kw):
    _psyco.statcollect()
    if kw is None:
        return callable(*args)
    else:
        return callable(*args, **kw)

def psyco_start_new_thread(callable, args, kw=None):
    "This is the Psyco-aware version of thread.start_new_thread()."
    return original_start_new_thread(psyco_thread_stub, (callable, args, kw))

original_settrace         = sys.settrace
original_setprofile       = sys.setprofile
original_start_new_thread = thread.start_new_thread
sys.settrace   = psyco_settrace
sys.setprofile = psyco_setprofile
if PYTHON_SUPPORT:
    thread.start_new_thread = psyco_start_new_thread
    # hack to patch threading._start_new_thread if the module is
    # already loaded
    if (sys.modules.has_key('threading') and
        hasattr(sys.modules['threading'], '_start_new_thread')):
        sys.modules['threading']._start_new_thread = psyco_start_new_thread
@ -1,196 +0,0 @@
###########################################################################
#
# Psyco general support module.
# Copyright (C) 2001-2002 Armin Rigo et.al.

"""Psyco general support module.

For internal use.
"""
###########################################################################

import sys, _psyco, __builtin__

error = _psyco.error
class warning(Warning):
    pass

_psyco.NoLocalsWarning = warning

def warn(msg):
    from warnings import warn
    warn(msg, warning, stacklevel=2)

#
# Version checks
#
__version__ = 0x010500f0
if _psyco.PSYVER != __version__:
    raise error, "version mismatch between Psyco parts, reinstall it"

version_info = (__version__ >> 24,
                (__version__ >> 16) & 0xff,
                (__version__ >> 8) & 0xff,
                {0xa0: 'alpha',
                 0xb0: 'beta',
                 0xc0: 'candidate',
                 0xf0: 'final'}[__version__ & 0xf0],
                __version__ & 0xf)
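# Added worked example: __version__ == 0x010500f0 decodes to
# (1, 5, 0, 'final', 0) -- one byte each for major/minor/micro, then the
# release-level nibble and the serial nibble.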


VERSION_LIMITS = [0x02010000,   # 2.1
                  0x02020000,   # 2.2
                  0x02020200,   # 2.2.2
                  0x02030000,   # 2.3
                  0x02040000]   # 2.4

if ([v for v in VERSION_LIMITS if v <= sys.hexversion] !=
    [v for v in VERSION_LIMITS if v <= _psyco.PYVER  ]):
    if sys.hexversion < VERSION_LIMITS[0]:
        warn("Psyco requires Python version 2.1 or later")
    else:
        warn("Psyco version does not match Python version. "
             "Psyco must be updated or recompiled")

PYTHON_SUPPORT = hasattr(_psyco, 'turbo_code')


if hasattr(_psyco, 'ALL_CHECKS') and hasattr(_psyco, 'VERBOSE_LEVEL'):
    print >> sys.stderr, ('psyco: running in debugging mode on %s' %
                          _psyco.PROCESSOR)


###########################################################################
# sys._getframe() gives strange results on a mixed Psyco- and Python-style
# stack frame. Psyco provides a replacement that partially emulates Python
# frames from Psyco frames. The new sys._getframe() may return objects of
# a custom "Psyco frame" type, which with Python >=2.2 is a subtype of the
# normal frame type.
#
# The same problems require some other built-in functions to be replaced
# as well. Note that the local variables are not available in any
# dictionary with Psyco.


class Frame:
    pass


class PythonFrame(Frame):

    def __init__(self, frame):
        self.__dict__.update({
            '_frame': frame,
            })

    def __getattr__(self, attr):
        if attr == 'f_back':
            try:
                result = embedframe(_psyco.getframe(self._frame))
            except ValueError:
                result = None
            except error:
                warn("f_back is skipping dead Psyco frames")
                result = self._frame.f_back
            self.__dict__['f_back'] = result
            return result
        else:
            return getattr(self._frame, attr)

    def __setattr__(self, attr, value):
        setattr(self._frame, attr, value)

    def __delattr__(self, attr):
        delattr(self._frame, attr)


class PsycoFrame(Frame):

    def __init__(self, tag):
        self.__dict__.update({
            '_tag'     : tag,
            'f_code'   : tag[0],
            'f_globals': tag[1],
            })

    def __getattr__(self, attr):
        if attr == 'f_back':
            try:
                result = embedframe(_psyco.getframe(self._tag))
            except ValueError:
                result = None
        elif attr == 'f_lineno':
            result = self.f_code.co_firstlineno   # better than nothing
        elif attr == 'f_builtins':
            result = self.f_globals['__builtins__']
        elif attr == 'f_restricted':
            result = self.f_builtins is not __builtins__
        elif attr == 'f_locals':
            raise AttributeError, ("local variables of functions run by Psyco "
                                   "cannot be accessed in any way, sorry")
        else:
            raise AttributeError, ("emulated Psyco frames have "
                                   "no '%s' attribute" % attr)
        self.__dict__[attr] = result
        return result

    def __setattr__(self, attr, value):
        raise AttributeError, "Psyco frame objects are read-only"

    def __delattr__(self, attr):
        if attr == 'f_trace':
            # for bdb which relies on CPython frames exhibiting a slightly
            # buggy behavior: you can 'del f.f_trace' as often as you like
            # even without having set it previously.
            return
        raise AttributeError, "Psyco frame objects are read-only"


def embedframe(result):
    if type(result) is type(()):
        return PsycoFrame(result)
    else:
        return PythonFrame(result)

def _getframe(depth=0):
    """Return a frame object from the call stack. This is a replacement for
    sys._getframe() which is aware of Psyco frames.

    The returned objects are instances of either PythonFrame or PsycoFrame
    instead of being real Python-level frame objects, so that they can emulate
    the common attributes of frame objects.

    The original sys._getframe() ignoring Psyco frames altogether is stored in
    psyco._getrealframe(). See also psyco._getemulframe()."""
    # 'depth+1' to account for this _getframe() Python function
    return embedframe(_psyco.getframe(depth+1))

def _getemulframe(depth=0):
    """As _getframe(), but the returned objects are real Python frame objects
    emulating Psyco frames. Some of their attributes can be wrong or missing,
    however."""
    # 'depth+1' to account for this _getemulframe() Python function
    return _psyco.getframe(depth+1, 1)

def patch(name, module=__builtin__):
    f = getattr(_psyco, name)
    org = getattr(module, name)
    if org is not f:
        setattr(module, name, f)
        setattr(_psyco, 'original_' + name, org)

_getrealframe = sys._getframe
sys._getframe = _getframe
patch('globals')
patch('eval')
patch('execfile')
patch('locals')
patch('vars')
patch('dir')
patch('input')
_psyco.original_raw_input = raw_input
__builtin__.__in_psyco__ = 0==1   # False

if hasattr(_psyco, 'compact'):
    import kdictproxy
    _psyco.compactdictproxy = kdictproxy.compactdictproxy
@ -1,88 +0,0 @@
#! /usr/bin/perl

sub PrintArgsAndDie () {
    print STDERR "USAGE: reduce-field.pl [-h]\n";
    print STDERR "This script reduces the number of active fields for the mert procedure.\n";
    exit(1);
}

my $weightfile="";
my $size=-1;
my $activefields="";
my $debug=0;

while (@ARGV){
    if ($ARGV[0] eq "-h"){
        &PrintArgsAndDie();
    }
    if ($ARGV[0] eq "-debug"){
        $debug=1;
        shift(@ARGV);
    }
    if ($ARGV[0] eq "-weight"){
        $weightfile=$ARGV[1];
        shift(@ARGV); shift(@ARGV);
    }
    if ($ARGV[0] eq "-d"){
        $size=$ARGV[1];
        shift(@ARGV); shift(@ARGV);
    }
    if ($ARGV[0] eq "-activate"){
        $activefields=$ARGV[1];
        shift(@ARGV); shift(@ARGV);
    }
}

die "Cannot open/find weight file ($weightfile)\n" if ! -e $weightfile;

my @weight=();
open(IN,$weightfile);
chomp(my $weight=<IN>);
close(IN);
push @weight,split(/[ \t]+/,"1 $weight");
my @active=();
my @notactive=();
my @invertedactive=();

if ($activefields eq ""){
    for (my $i=1; $i<=$size; $i++){ $active[$i]=1; };
}else{
    @active=split(/,/,$activefields);
}

for (my $i=0; $i<=$size; $i++){ $invertedactive[$i]=0; };
for (my $i=0; $i<scalar(@active); $i++){ $invertedactive[$active[$i]]=1; };
my $j=0;
for (my $i=1; $i<=$size; $i++){ if (!$invertedactive[$i]){$notactive[$j]=$i; $j++}};

if ($debug>0){
    print STDERR "ORIGINAL SIZE: $size\n";
    print STDERR "ORIGINAL WEIGHTS: @weight\n";
    print STDERR "ORIGINAL ACTIVE: @active\n";
    print STDERR "ORIGINAL NOTACTIVE: @notactive\n";
    print STDERR "ORIGINAL INVERTEDACTIVE: @invertedactive\n";
}
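# Added note: each input line holds one n-best entry's feature scores. The
# scores of the inactive fields are collapsed into a single pre-weighted
# value (weight * score, summed) printed as the first column; the active
# fields follow unchanged, so mert only has to optimize the active weights.
# The leading "1" prepended to the weight vector above is the weight of
# that collapsed column.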

while (defined($_=<STDIN>)){
    chomp;
    my @field=(0,split(/[ \t]+/,$_));

    my $notactivedweightedsum=0.0;
    my $j;
    for (my $i=0; $i<scalar(@notactive); $i++){
        $j=$notactive[$i];
        $notactivedweightedsum+=($weight[$j]*$field[$j]);
        printf STDERR "notactive -> i:$i j:$j -> $weight[$j] - $field[$j] -> $notactivedweightedsum\n" if $debug>0;
    };

    printf STDOUT "%.3f",$notactivedweightedsum;
    printf STDERR "sum not active features: %.3f\n",$notactivedweightedsum if $debug>0;
    for (my $i=0; $i<scalar(@active); $i++){
        print STDOUT " $field[$active[$i]]";
        printf STDERR "active -> i:$i j:$active[$i] -> $field[$active[$i]]\n" if $debug>0;
    };
    for (my $i=scalar(@active)+scalar(@notactive)+1; $i< scalar(@field); $i++){
        print STDOUT " $field[$i]";
        printf STDERR "extra -> i:$i -> $field[$i]\n" if $debug>0;
    };
    print STDOUT "\n";
}
@ -1,8 +0,0 @@
#!/bin/sh

unset LANG
export PATH=$PATH:/group/project/statmt/pkoehn/user/abhishek:/group/project/statmt/pkoehn/user/abhishek/cmert-0.5
export EVAL=/group/project/statmt/pkoehn/user/abhishek/WST05/fr-en-train/dev

mert-driver cmert-work $EVAL/low.test400.fr.rest $EVAL/low.test400.en 100 pharaoh.2005-07-21 "-config /group/project/statmt/pkoehn/user/abhishek/WST05/fr-en-train/model/pharaoh.ini -dl 4 -b 0.1 -ttable-limit 100" "0.2,0-1;0.2,0.2-0.2;0.2,0-1;0.2,0-1;0.2,0-1;0.2,0-1;0.2,-1-1;0.2,-1-1"
@ -1,109 +0,0 @@
#!/usr/bin/python

# $Id$
"""Convert n-best list in mert.perl format to format required by
Venugopal's MER trainer. This entails calculating the BLEU component scores."""

"""usage: score-nbest.py <reffile>+ <outprefix>

The input should be sorted by sentence number and piped into stdin
Run it like this: sort -mnk 1,1 *.nbest | score-nbest.py ...
"""

import sys, itertools, re
import bleu
# Comment out this line when moving to python 2.4
from sets import Set as set

def process(sentnum, testsents):
    candsfile.write("%d %d\n" % (cur_sentnum, len(testsents)))
    for (sent, vector) in testsents:
        comps = bleu.cook_test(sent, cookedrefs[sentnum])

        if comps['testlen'] != comps['guess'][0]:
            sys.stderr.write("ERROR: test length != guessed 1-grams\n")
        featsfile.write("%s %s %d\n" % (" ".join([str(v) for v in vector]),
                                        " ".join(["%d %d" % (c, g) for (c, g) in zip(comps['correct'], comps['guess'])]),
                                        comps['reflen']))
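# Added note on the output format (exactly what the writes above produce):
# <outprefix>cands.opt gets one line per sentence, "sentnum num-candidates";
# <outprefix>feats.opt gets one line per candidate holding its feature
# vector, then a correct/guess count pair for each n-gram order, then the
# reference length -- the raw components from which BLEU can be recomputed
# for any choice of candidates.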

if __name__ == "__main__":

    import os
    machtype = os.environ.get("MACHTYPE")
    if machtype == "i386":
        #import psyco
        #psyco.full()
        sys.stderr.write("psyco library is NOT imported. Uncomment code in score-nbest.py if you wish to enable it\n")
    else:
        sys.stderr.write("psyco library is not imported because it is not available for %s \n" % machtype)

    import getopt
    (opts, args) = getopt.getopt(sys.argv[1:], "casen", [])

    for (opt, parm) in opts:
        if opt == "-c":
            bleu.preserve_case = True
        if opt == "-a":
            bleu.eff_ref_len = "average"
        if opt == "-s":
            bleu.eff_ref_len = "shortest"
        if opt == "-e":
            bleu.eff_ref_len = "closest"
        if opt == "-n":
            bleu.nonorm = 1

    print args
    cookedrefs = []
    reffiles = [file(name) for name in args[:-1]]
    print reffiles
    for refs in itertools.izip(*reffiles):
        cookedrefs.append(bleu.cook_refs(refs))

    outprefix = args[-1]

    featsfile = file(outprefix+"feats.opt", "w")
    candsfile = file(outprefix+"cands.opt", "w")

    cur_sentnum = None
    testsents = set()
    progress = 0

    infile = sys.stdin

    # function that recognizes floats
    re_float = re.compile(r'^-?[-0-9.e\+]+$')
    is_float = lambda(x): re_float.match(x)

    for line in infile:
        try:
            ## Changed to add a further field - AA 29/11/05
            #(sentnum, sent, vector) = line.split('|||')
            (sentnum, sent, vector, prob) = line.split('|||')
        except:
            sys.stderr.write("ERROR: bad input line %s\n" % line)
            continue  # skip malformed lines instead of reusing stale fields
        sentnum = int(sentnum)
        sent = " ".join(sent.split())
        # filter out score labels (keep only floats) and convert numbers to floats
        vector = tuple(map(lambda(s): -float(s), filter(is_float, vector.split())))

        if sentnum != cur_sentnum:
            if cur_sentnum is not None:
                process(cur_sentnum, testsents)
            cur_sentnum = sentnum
            testsents = set()
        testsents.add((sent, vector))

        if progress % 10000 == 0:
            sys.stdout.write(".")
            sys.stdout.flush()
        progress += 1
    process(cur_sentnum, testsents)

    sys.stdout.write("\n")
    featsfile.close()
    candsfile.close()
@ -1,34 +0,0 @@
// $Id$
#include <math.h>
#include <stdio.h>

#include "score.h"

int comps_n = 9;

void comps_addto(int *comps1, int *comps2) {
  int i;
  for (i=0; i<comps_n; i++)
    comps1[i] += comps2[i];
}
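/* Added note: compute_score() below turns a component vector into BLEU.
   With comps_n = 9 there are n = 4 n-gram orders; comps[2i] / comps[2i+1]
   are the matched and guessed counts for order i+1, and comps[comps_n-1]
   is the reference length. The result is
       exp( (1/n) * sum_i log(correct_i / guess_i) + min(0, 1 - r/c) )
   i.e. the geometric mean of the n-gram precisions times the standard
   brevity penalty, with c = comps[1] the candidate's 1-gram count. */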

float compute_score(int *comps) {
  float logbleu = 0.0, brevity;
  int i;
  int n = (comps_n-1)/2;

  /*for (i=0; i<comps_n; i++)
    fprintf(stderr, " %d", comps[i]);
  fprintf(stderr, "\n");*/

  for (i=0; i<n; i++) {
    if (comps[2*i] == 0)
      return 0.0;
    logbleu += log(comps[2*i])-log(comps[2*i+1]);
  }
  logbleu /= n;
  brevity = 1.0-(float)comps[comps_n-1]/comps[1]; // comps[comps_n-1] is the ref length, comps[1] is the test length
  if (brevity < 0.0)
    logbleu += brevity;
  return exp(logbleu);
}
@ -1,10 +0,0 @@
// $Id$
#ifndef SCORE_H
#define SCORE_H

extern int comps_n;

void comps_addto(int *comps1, int *comps2);
float compute_score(int *comps);

#endif
File diff suppressed because it is too large
BIN zmert/zmert.jar
Binary file not shown.