remove zmert and cmert

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3397 1f5c12ca-751b-0410-a591-d2e778427230
bhaddow 2010-08-10 14:28:03 +00:00
parent 8616a2bdee
commit 321f528ff5
38 changed files with 1 addition and 4295 deletions

View File

@@ -26,7 +26,7 @@ RELEASEDIR=$(TARGETDIR)/scripts-$(TS)
all: compile
SUBDIRS=cmert-0.5 phrase-extract symal mbr lexical-reordering
SUBDIRS=phrase-extract symal mbr lexical-reordering
SUBDIRS_CLEAN=$(SUBDIRS) memscore
compile: compile-memscore

View File

@@ -63,23 +63,6 @@ training/absolutize_moses_model.pl
training/build-generation-table.perl
training/clean-corpus-n.perl
training/clone_moses_model.pl
training/cmert-0.5/bleu.py
training/cmert-0.5/dataset.py
training/cmert-0.5/log.py
training/cmert-0.5/mert
training/cmert-0.5/enhanced-mert
training/cmert-0.5/reduce-field.pl
training/cmert-0.5/extend-field.pl
training/cmert-0.5/python/psyco/classes.py
training/cmert-0.5/python/psyco/core.py
training/cmert-0.5/python/psyco/__init__.py
training/cmert-0.5/python/psyco/kdictproxy.py
training/cmert-0.5/python/psyco/logger.py
training/cmert-0.5/python/psyco/profiler.py
training/cmert-0.5/python/psyco/_psyco.so
training/cmert-0.5/python/psyco/support.py
training/cmert-0.5/README
training/cmert-0.5/score-nbest.py
training/mbr/mbr
training/filter-model-given-input.pl
training/filter-rule-table.py
@@ -87,7 +70,6 @@ training/lexical-reordering/score
training/memscore/memscore
training/zmert-moses.pl
training/mert-moses.pl
training/mert-moses-new.pl
training/phrase-extract/extract
training/phrase-extract/extract-rules
training/phrase-extract/score

View File

@@ -12,21 +12,6 @@ training/absolutize_moses_model.pl
training/build-generation-table.perl
training/clean-corpus-n.perl
training/clone_moses_model.pl
training/cmert-0.5/bleu.py
training/cmert-0.5/dataset.py
training/cmert-0.5/log.py
training/cmert-0.5/mert.exe
training/cmert-0.5/enhanced-mert
training/cmert-0.5/python/psyco/classes.py
training/cmert-0.5/python/psyco/core.py
training/cmert-0.5/python/psyco/__init__.py
training/cmert-0.5/python/psyco/kdictproxy.py
training/cmert-0.5/python/psyco/logger.py
training/cmert-0.5/python/psyco/profiler.py
training/cmert-0.5/python/psyco/_psyco.so
training/cmert-0.5/python/psyco/support.py
training/cmert-0.5/README
training/cmert-0.5/score-nbest.py
training/combine_factors.pl
training/filter-model-given-input.pl
training/lexical-reordering/score.exe

View File

@@ -1,15 +0,0 @@
OBJS=mert.o data.o point.o score.o
CFLAGS=-O3
LDFLAGS=
LDLIBS=-lm
all: mert
clean:
rm -f *.o
mert: $(OBJS)
$(CXX) $(OBJS) $(LDLIBS) -o $@
mert_p: $(OBJS)
$(CXX) $(LDFLAGS) $(OBJS) $(LDLIBS) -o $@

View File

@@ -1,10 +0,0 @@
CMERT 0.5
5 Nov 2005
Copyright (c) 2005 David Chiang. All rights reserved (for now).
Minimalist installation instructions:
- make
- set #! lines and sys.path lines in Python scripts
- see run-cmert for example

View File

@@ -1,179 +0,0 @@
#!/usr/bin/python
# $Id$
'''Provides:
cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test().
cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked().
score_cooked(alltest, n=4): Score a list of cooked test sentences.
score_set(s, testid, refids, n=4): Interface with dataset.py; calculate BLEU score of testid against refids.
The reason for breaking the BLEU computation into three phases cook_refs(), cook_test(), and score_cooked() is to allow the caller to calculate BLEU scores for multiple test sets as efficiently as possible.
'''
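# A minimal usage sketch of the three-phase interface described above, with
# made-up sentences: cook the references once, then score any number of test
# sentences against them.
#
#   refs = cook_refs(["the cat sat on the mat", "a cat is on the mat"])
#   cooked = [cook_test("the cat is on the mat", refs)]
#   print "BLEU:", score_cooked(cooked)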
import optparse
import sys, math, re, xml.sax.saxutils
sys.path.append('/fs/clip-mteval/Programs/hiero')
import dataset
import log
# Added to bypass NIST-style pre-processing of hyp and ref files -- wade
nonorm = 0
preserve_case = False
eff_ref_len = "shortest"
normalize1 = [
('<skipped>', ''), # strip "skipped" tags
(r'-\n', ''), # strip end-of-line hyphenation and join lines
(r'\n', ' '), # join lines
# (r'(\d)\s+(?=\d)', r'\1'), # join digits
]
normalize1 = [(re.compile(pattern), replace) for (pattern, replace) in normalize1]
normalize2 = [
(r'([\{-\~\[-\` -\&\(-\+\:-\@\/])',r' \1 '), # tokenize punctuation. apostrophe is missing
(r'([^0-9])([\.,])',r'\1 \2 '), # tokenize period and comma unless preceded by a digit
(r'([\.,])([^0-9])',r' \1 \2'), # tokenize period and comma unless followed by a digit
(r'([0-9])(-)',r'\1 \2 ') # tokenize dash when preceded by a digit
]
normalize2 = [(re.compile(pattern), replace) for (pattern, replace) in normalize2]
def normalize(s):
'''Normalize and tokenize text. This is lifted from NIST mteval-v11a.pl.'''
# Added to bypass NIST-style pre-processing of hyp and ref files -- wade
if (nonorm):
return s.split()
if type(s) is not str:
s = " ".join(s)
# language-independent part:
for (pattern, replace) in normalize1:
s = re.sub(pattern, replace, s)
s = xml.sax.saxutils.unescape(s, {'&quot;':'"'})
# language-dependent part (assuming Western languages):
s = " %s " % s
if not preserve_case:
s = s.lower() # this might not be identical to the original
for (pattern, replace) in normalize2:
s = re.sub(pattern, replace, s)
return s.split()
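# A worked example of the normalization above under the default settings
# (nonorm = 0, preserve_case = False); the sentence is made up:
#
#   normalize("Hello, World! It costs $5.75.")
#   # -> ['hello', ',', 'world', '!', 'it', 'costs', '$', '5.75', '.']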
def count_ngrams(words, n=4):
counts = {}
for k in xrange(1,n+1):
for i in xrange(len(words)-k+1):
ngram = tuple(words[i:i+k])
counts[ngram] = counts.get(ngram, 0)+1
return counts
def cook_refs(refs, n=4):
'''Takes a list of reference sentences for a single segment
and returns an object that encapsulates everything that BLEU
needs to know about them.'''
refs = [normalize(ref) for ref in refs]
maxcounts = {}
for ref in refs:
counts = count_ngrams(ref, n)
for (ngram,count) in counts.iteritems():
maxcounts[ngram] = max(maxcounts.get(ngram,0), count)
return ([len(ref) for ref in refs], maxcounts)
def cook_test(test, (reflens, refmaxcounts), n=4):
'''Takes a test sentence and returns an object that
encapsulates everything that BLEU needs to know about it.'''
test = normalize(test)
result = {}
result["testlen"] = len(test)
# Calculate effective reference sentence length.
if eff_ref_len == "shortest":
result["reflen"] = min(reflens)
elif eff_ref_len == "average":
result["reflen"] = float(sum(reflens))/len(reflens)
elif eff_ref_len == "closest":
min_diff = None
for reflen in reflens:
if min_diff is None or abs(reflen-len(test)) < min_diff:
min_diff = abs(reflen-len(test))
result['reflen'] = reflen
result["guess"] = [max(len(test)-k+1,0) for k in xrange(1,n+1)]
result['correct'] = [0]*n
counts = count_ngrams(test, n)
for (ngram, count) in counts.iteritems():
result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram,0), count)
return result
def score_cooked(allcomps, n=4):
totalcomps = {'testlen':0, 'reflen':0, 'guess':[0]*n, 'correct':[0]*n}
for comps in allcomps:
for key in ['testlen','reflen']:
totalcomps[key] += comps[key]
for key in ['guess','correct']:
for k in xrange(n):
totalcomps[key][k] += comps[key][k]
logbleu = 0.0
for k in xrange(n):
if totalcomps['correct'][k] == 0:
return 0.0
log.write("%d-grams: %f\n" % (k,float(totalcomps['correct'][k])/totalcomps['guess'][k]))
logbleu += math.log(totalcomps['correct'][k])-math.log(totalcomps['guess'][k])
logbleu /= float(n)
log.write("Effective reference length: %d test length: %d\n" % (totalcomps['reflen'], totalcomps['testlen']))
logbleu += min(0,1-float(totalcomps['reflen'])/totalcomps['testlen'])
return math.exp(logbleu)
def score_set(set, testid, refids, n=4):
alltest = []
for seg in set.segs():
try:
test = seg.versions[testid].words
except KeyError:
log.write("Warning: missing test sentence\n")
continue
try:
refs = [seg.versions[refid].words for refid in refids]
except KeyError:
log.write("Warning: missing reference sentence, %s\n" % seg.id)
refs = cook_refs(refs, n)
alltest.append(cook_test(test, refs, n))
log.write("%d sentences\n" % len(alltest))
return score_cooked(alltest, n)
if __name__ == "__main__":
import psyco
psyco.full()
import getopt
raw_test = False
(opts,args) = getopt.getopt(sys.argv[1:], "rc", [])
for (opt,parm) in opts:
if opt == "-r":
raw_test = True
elif opt == "-c":
preserve_case = True
s = dataset.Dataset()
if args[0] == '-':
infile = sys.stdin
else:
infile = args[0]
if raw_test:
(root, testids) = s.read_raw(infile, docid='whatever', sysid='testsys')
else:
(root, testids) = s.read(infile)
print "Test systems: %s" % ", ".join(testids)
(root, refids) = s.read(args[1])
print "Reference systems: %s" % ", ".join(refids)
for testid in testids:
print "BLEU score: ", score_set(s, testid, refids)

View File

@@ -1,93 +0,0 @@
// $Id$
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "data.h"
#include "point.h"
extern int comps_n;
data_t *read_data(void) {
FILE *fp;
static char buf[1000];
char *tok, *s;
int field;
int sent_i, cand_i, cands_n;
int total_cands_n;
data_t *data;
candidate_t *cands;
data = malloc(sizeof(data_t));
data->sents_max = 100;
data->sents_n = 0;
data->cands_n = malloc(data->sents_max*sizeof(int));
total_cands_n = 0;
fp = fopen("cands.opt", "r");
while (fgets(buf, sizeof(buf), fp) != NULL) {
// should we check to make sure every sentence is accounted for?
sscanf(buf, "%d %d", &sent_i, &cands_n);
if (sent_i >= data->sents_n)
data->sents_n = sent_i+1;
if (sent_i >= data->sents_max) {
data->sents_max = (sent_i+1)*2;
data->cands_n = realloc(data->cands_n, data->sents_max*sizeof(int));
}
data->cands_n[sent_i] = cands_n;
total_cands_n += cands_n;
}
fclose(fp);
/* create master array for candidates and then set data->sents
to point into it */
cands = malloc(total_cands_n * sizeof(candidate_t));
data->sents = malloc(data->sents_n * sizeof(candidate_t *));
total_cands_n = 0;
for (sent_i=0; sent_i<data->sents_n; sent_i++) {
data->sents[sent_i] = cands+total_cands_n;
total_cands_n += data->cands_n[sent_i];
}
cand_i = 0;
fp = fopen("feats.opt", "r");
while (fgets(buf, sizeof(buf), fp) != NULL) {
cands[cand_i].features = malloc(dim*sizeof(float));
cands[cand_i].comps = malloc(comps_n*sizeof(int));
field = 0;
s = buf;
while ((tok = strsep(&s, " \t\n")) != NULL) {
if (!*tok) // empty token
continue;
// read dim floats and then comps_n ints
if (field < dim)
cands[cand_i].features[field] = -strtod(tok, NULL); // Venugopal format uses costs
else if (field < dim+comps_n)
cands[cand_i].comps[field-dim] = strtol(tok, NULL, 10);
else {
fprintf(stderr, "read_data(): too many fields in line in feats.opt\n");
return NULL;
}
field++;
}
if (field != dim+comps_n) {
fprintf(stderr, "read_data(): wrong number of fields in line in feats.opt - expected %d + %d and found %d on line %d\n",dim,comps_n,field,cand_i);
return NULL;
}
cand_i++;
}
if (cand_i != total_cands_n) {
fprintf(stderr, "read_data(): wrong number of lines in cands.opt\n");
return NULL;
}
fclose(fp);
return data;
}

View File

@@ -1,18 +0,0 @@
// $Id$
#ifndef DATA_H
#define DATA_H
typedef struct {
float *features;
int *comps;
float m, b; // slope and intercept, used as scratch space
} candidate_t;
typedef struct {
candidate_t **sents;
int sents_n, sents_max, *cands_n;
} data_t;
data_t *read_data(void);
#endif

View File

@@ -1,392 +0,0 @@
#!/usr/bin/python2.3
# $Id$
'''Decoder interface:
Dataset.process() expects a function, which in turn takes a Sentence as input
and produces a Sentence or list of Sentences as output.
The input Sentence will be marked with the <seg> tag it was found in
the input file with.
The output Sentences should be marked with <seg> tags if they are to
be marked as such in the output file.
'''
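# A minimal sketch of a processor compatible with the interface above (the
# function and file names are made up): it copies the input words and
# re-attaches the <seg> tag so the segment id survives into the output file.
#
#   def copy_processor(sent):
#       out = Sentence(sent.words)
#       mark = sent.getmark()
#       if mark is not None:
#           out.mark(*mark)    # keep the original <seg id=...> tag
#       return out
#
#   d = Dataset()
#   d.read_raw("hyps.txt", "doc1", sysid="src", lang="English")
#   d.process(copy_processor, sysid="out", lang="English", srcsysid="src")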
import sys, sgmllib, xml.sax.saxutils, log
def attrs_to_str(d):
if len(d) == 0:
return ""
l = [""]+["%s=%s" % (name, xml.sax.saxutils.quoteattr(value)) for (name, value) in d]
return " ".join(l)
def attrs_to_dict(a):
d = {}
for (name, value) in a:
if d.has_key(name.lower()):
raise ValueError, "duplicate attribute names"
d[name.lower()] = value
return d
def strip_newlines(s):
return " ".join(s.split())
class Sentence(object):
def __init__(self, words=None, meta=None):
if words is not None:
self.words = list(words)
else:
self.words = []
if meta is not None:
self.meta = meta
else:
self.meta = []
def mark(self, tag, attrs):
self.meta.append((tag, attrs, 0, len(self.words)))
def getmark(self):
if len(self.meta) > 0:
(tag, attrs, i, j) = self.meta[-1]
if i == 0 and j == len(self.words):
return (tag, attrs)
else:
return None
else:
return None
def unmark(self):
mark = self.getmark()
if mark is not None:
self.meta = self.meta[:-1]
return mark
def __cmp__(self, other):
return cmp((self.words, self.meta), (other.words, other.meta))
def __str__(self):
def cmp_spans((tag1,attr1,i1,j1),(tag2,attr2,i2,j2)):
if i1==i2<=j1==j2:
return 0
elif i2<=i1<=j1<=j2:
return -1
elif i1<=i2<=j2<=j1:
return 1
else:
return cmp((i1,j1),(i2,j2)) # don't care
# this guarantees that equal spans will come out nested
# we want the later spans to be outer
# this relies on stable sort
open = [[] for i in xrange(len(self.words)+1)]
# there seems to be a bug still with empty spans
empty = [[] for i in xrange(len(self.words)+1)]
close = [[] for j in xrange(len(self.words)+1)]
for (tag,attrs,i,j) in sorted(self.meta, cmp=cmp_spans):
if i == j:
# do we want these to nest?
empty[i].append("<%s%s/>" % (tag, attrs_to_str(attrs)))
open[i].append("<%s%s>" % (tag, attrs_to_str(attrs)))
close[j].append("</%s>" % tag)
result = []
if len(empty[0]) > 0:
result.extend(empty[0])
for i in xrange(len(self.words)):
if i > 0:
result.append(" ")
result.extend(reversed(open[i]))
result.append(self.words[i])
result.extend(close[i+1])
if len(empty[i+1]) > 0:
result.extend(empty[i+1])
return "".join(result)
def __add__(self, other):
if type(other) in (list, tuple):
return Sentence(self.words + list(other), self.meta)
else:
othermeta = [(tag, attrs, i+len(self.words), j+len(self.words)) for (tag, attrs, i, j) in other.meta]
return Sentence(self.words + other.words, self.meta+othermeta)
def read_raw(f):
"""Read a raw file into a list of Sentences."""
if type(f) is str:
f = file(f, "r")
inputs = []
i = 0
for line in f:
sent = process_sgml_line(line, i)
sent.mark('seg', [('id',str(i))])
inputs.append(sent)
i += 1
return inputs
class Dataset(object):
def __init__(self, id=None):
self.id = id
self.docs = {}
self.sysids = []
self.langs = {}
def read(self, f):
'''Read a file into the dataset. Returns (root, sysids)'''
if type(f) is str:
f = file(f, "r")
p = DatasetParser(self)
p.feed(f.read())
p.close()
return (p.root,p.sysids)
def read_raw(self, f, docid, setid=None, sysid=None, lang=None):
"""Read a raw file into the dataset."""
if setid is not None:
if self.id is not None and self.id != setid:
raise ValueError, "Set ID does not match"
else:
self.id = setid
if sysid not in self.sysids:
self.sysids.append(sysid)
self.langs[sysid] = lang
if type(f) is str:
f = file(f, "r")
doc = self.docs.setdefault(docid, Document(docid))
i = 0
for line in f:
if len(doc.segs)-1 < i:
doc.segs.append(Segment(i))
if doc.segs[i].versions.has_key(sysid):
raise ValueError, "multiple versions from same system"
doc.segs[i].versions[sysid] = process_sgml_line(line, i)
doc.segs[i].versions[sysid].mark('seg', [('id',str(i))])
i += 1
return (None, [sysid])
def write(self, f, tag, sysids=None):
if type(f) is str:
f = file(f, "w")
f.write(self.string(tag, sysids))
def write_raw(self, f, sysid=None):
if type(f) is str:
f = file(f, "w")
for seg in self.segs():
f.write(" ".join(seg.versions[sysid].words))
f.write("\n")
def string(self, tag, sysids=None):
if sysids is None:
sysids = self.sysids
elif type(sysids) is str:
sysids = [sysids]
attrs = [('setid', self.id)]
if self.langs.has_key(None):
attrs.append(('srclang', self.langs[None]))
trglangs = [self.langs[sysid] for sysid in sysids if sysid is not None]
for lang in trglangs[1:]:
if lang != trglangs[0]:
raise ValueError, "Inconsistent target language"
if len(trglangs) >= 1:
attrs.append(('trglang', trglangs[0]))
return "<%s%s>\n%s</%s>\n" % (tag,
attrs_to_str(attrs),
"".join([doc.string(sysid) for doc in self.docs.values() for sysid in sysids]),
tag)
def process(self, processor, sysid, lang, srcsysid=None):
if sysid in self.sysids:
raise ValueError, "sysid already in use"
else:
self.sysids.append(sysid)
self.langs[sysid] = lang
for seg in self.segs():
if log.level >= 2:
sys.stderr.write("Input: %s\n" % str(seg.versions[srcsysid]))
seg.versions[sysid] = processor(seg.versions[srcsysid])
if log.level >= 2:
if type(seg.versions[sysid]) is not list:
sys.stderr.write("Output: %s\n" % str(seg.versions[sysid]))
else:
sys.stderr.write("Output (1st): %s\n" % str(seg.versions[sysid][0]))
def segs(self):
for doc in self.docs.values():
for seg in doc.segs:
yield seg
class Document(object):
def __init__(self, id):
self.id = id
self.segs = []
def string(self, sysid):
attrs = [('docid', self.id)]
if sysid is not None:
attrs.append(('sysid', sysid))
return "<doc%s>\n%s</doc>\n" % (attrs_to_str(attrs),
"".join([seg.string(sysid) for seg in self.segs]))
class Segment(object):
def __init__(self, id=None):
self.id = id
self.versions = {}
def string(self, sysid):
v = self.versions[sysid]
if type(v) is not list:
v = [v]
output = []
for i in xrange(len(v)):
output.append(str(v[i]))
output.append('\n')
return "".join(output)
def process_sgml_line(line, id=None):
p = DatasetParser(None)
p.pos = 0
p.words = []
p.meta = []
p.feed(line)
p.close()
sent = Sentence(p.words, p.meta)
return sent
class DatasetParser(sgmllib.SGMLParser):
def __init__(self, set):
sgmllib.SGMLParser.__init__(self)
self.words = None
self.sysids = []
self.set = set
self.mystack = []
def handle_starttag(self, tag, method, attrs):
thing = method(attrs)
self.mystack.append(thing)
def handle_endtag(self, tag, method):
thing = self.mystack.pop()
method(thing)
def unknown_starttag(self, tag, attrs):
thing = self.start(tag, attrs)
self.mystack.append(thing)
def unknown_endtag(self, tag):
thing = self.mystack.pop()
self.end(tag, thing)
def start_srcset(self, attrs):
attrs = attrs_to_dict(attrs)
if self.set.id is None:
self.set.id = attrs['setid']
if 0 and self.set.id != attrs['setid']:
raise ValueError, "Set ID does not match"
self.lang = attrs['srclang']
self.root = 'srcset'
return None
def start_refset(self, attrs):
attrs = attrs_to_dict(attrs)
if self.set.id is None:
self.set.id = attrs['setid']
if 0 and self.set.id != attrs['setid']:
raise ValueError, "Set ID does not match"
if self.set.langs.setdefault(None, attrs['srclang']) != attrs['srclang']:
raise ValueError, "Source language does not match"
self.lang = attrs['trglang']
self.root = 'refset'
return None
def start_tstset(self, attrs):
attrs = attrs_to_dict(attrs)
if self.set.id is None:
self.set.id = attrs['setid']
if 0 and self.set.id != attrs['setid']:
raise ValueError, "Set ID does not match"
if 0 and self.set.langs.setdefault(None, attrs['srclang']) != attrs['srclang']:
raise ValueError, "Source language does not match"
self.lang = attrs['trglang']
self.root = 'tstset'
return None
def end_srcset(self, thing):
for sysid in self.sysids:
if sysid not in self.set.sysids:
self.set.sysids.append(sysid)
self.set.langs[sysid] = self.lang
end_refset = end_tstset = end_srcset
def start_doc(self, attrs):
attrs = attrs_to_dict(attrs)
self.doc = self.set.docs.setdefault(attrs['docid'], Document(attrs['docid']))
self.seg_i = 0
if self.root == 'srcset':
self.sysid = None
else:
self.sysid = attrs['sysid']
if self.sysid not in self.sysids:
self.sysids.append(self.sysid)
return None
def end_doc(self, thing):
pass
def start_seg(self, attrs):
thing = ('seg', attrs, 0, None)
attrs = attrs_to_dict(attrs)
if len(self.doc.segs)-1 < self.seg_i:
self.doc.segs.append(Segment(attrs.get('id', None)))
self.seg = self.doc.segs[self.seg_i]
if 0 and self.seg.id is not None and attrs.has_key('id') and self.seg.id != attrs['id']:
raise ValueError, "segment ids do not match (%s != %s)" % (str(self.seg.id), str(attrs.get('id', None)))
if self.seg.versions.has_key(self.sysid):
raise ValueError, "multiple versions from same system"
self.pos = 0
self.words = []
self.meta = []
return thing
def end_seg(self, thing):
(tag, attrs, i, j) = thing
self.meta.append((tag, attrs, i, self.pos))
self.seg_i += 1
self.seg.versions[self.sysid] = Sentence(self.words, self.meta)
self.words = None
"""# Special case for start and end of sentence
def start_s(self, attrs):
if self.words is not None:
self.pos += 1
self.words.append('<s>')
return None
def end_s(self, thing):
if self.words is not None:
self.pos += 1
self.words.append('</s>')"""
def start(self, tag, attrs):
if self.words is not None:
return (tag, attrs, self.pos, None)
else:
return None
def end(self, tag, thing):
if self.words is not None:
(tag, attrs, i, j) = thing
self.meta.append((tag, attrs, i, self.pos))
def handle_data(self, s):
if self.words is not None:
words = s.split()
self.pos += len(words)
self.words.extend(words)
if __name__ == "__main__":
s = Dataset()
for filename in sys.argv[1:]:
s.read_raw(filename, 'whatever', 'whatever', filename, 'English')
s.write(sys.stdout, 'tstset')

View File

@@ -1,119 +0,0 @@
#! /bin/bash
PrintUsageAndDie(){
echo "USAGE: enhanced-cmert.sh -d size [-active] [-help]"
echo " perform cmert on a subset of the feature scores"
echo " the ratios among not activated weights are not modified"
echo " Parameters (*=optional):"
echo " -d: the number of original features"
echo " -rootdir: the scripts root dir"
echo " -activate (*): comma-separated (or blank-separated) list of the indexes of active features"
echo " if not set, all features are optimized"
echo " -debug(*): debug information"
echo " -help(*): print his help"
echo
echo "Example: see examples in the directory example which are created with the script readme.txt"
exit
}
normalize_weights(){
perl -ne '{$tot=0;chomp;split;grep($tot+=($_>0)?$_:-$_,@_); grep($_/=$tot,@_); for ($i=0;$i<scalar(@_);$i++){printf STDOUT "%.6f ",$_[$i];};printf STDOUT "\n";}'
}
activeflag=0;
help=0
debug=""
if [ $# -lt 1 ] ; then PrintUsageAndDie ; fi
while [ $# -gt 0 ]
do
case $1 in
-help) help=1 ; shift 1 ; ;;
-d) size=$2 ; shift 2 ; ;;
-rootdir) SCRIPTS_ROOTDIR=$2 ; shift 2 ; ;;
-debug) debug="-debug"; shift 1 ; ;;
-activate) activeflag=1 ; activefields=$2 ; shift 2 ; ;;
*) shift $# ; ;;
esac
done
if [ $help == 1 ] ; then PrintUsageAndDie ; fi
# call the basic mert command
if [ $activeflag == 0 ] ; then
$SCRIPTS_ROOTDIR/training/cmert-0.5/mert -d $size
exit
fi
# else
if [ $debug ] ; then echo "names of active fields: $activefields" ; fi
#get indexes of active fields from file "names.txt"
oldname="__FALSE_NAME__"
name="__FALSE_NAME__"
separator="_"
i=1 lastj=1
for name in `cat names.txt` ; do
if [ $name == $oldname ] ; then i=$(( i + 1 )) ; else i=1 ; fi
arrayname[$lastj]=$name
arrayname2[$lastj]=$name$separator$i
lastj=$(( lastj + 1 ))
oldname=$name
done
#map feature names into feature indexes
out=""
for name in `echo $activefields | tr ',' ' ' ` ; do
match=0; j=1
while [ $j -lt $lastj ] ; do
if [ ${arrayname[$j]} == $name -o ${arrayname2[$j]} == "$name" ] ; then
match=$j
if [ $out ] ; then out="$out,$j" ; else out="$j" ; fi
fi
j=$(( j + 1 ))
done
if [ $match -eq 0 ] ; then echo "feature $name you are asking for is not present" ; fi
done
activefields=`echo $out | tr ',' '\012' | sort -nu | tr '\012' ',' | perl -pe 's/\,$//' `
if [ $debug ] ; then echo "indexes of active fields: $activefields" ; fi
#filter active fields, perform cmert and ...
tmpdir=tmp$$
mkdir -p $tmpdir
for file in feats.opt init.opt ; do
mv $file $tmpdir
done
cat $tmpdir/init.opt | tail -1 > $tmpdir/weight.opt
cat $tmpdir/init.opt | perl $SCRIPTS_ROOTDIR/training/cmert-0.5/reduce-field.pl $debug -weight $tmpdir/weight.opt -d $size -activate $activefields | perl -pe 's/^\S+ /1 /' > init.opt
cat $tmpdir/feats.opt | perl $SCRIPTS_ROOTDIR/training/cmert-0.5/reduce-field.pl $debug -weight $tmpdir/weight.opt -d $size -activate $activefields > feats.opt
active=`cat init.opt | head -1 | awk '{print NF}'`
$SCRIPTS_ROOTDIR/training/cmert-0.5/mert -d $active 2> reduced_cmert.log
for file in feats.opt init.opt; do
mv $file reduced_$file
mv $tmpdir/$file $file
done
mv weights.txt reduced_weights.txt
cat reduced_weights.txt | perl $SCRIPTS_ROOTDIR/training/cmert-0.5/extend-field.pl $debug -weight $tmpdir/weight.opt -d $size -activate $activefields | normalize_weights > weights.txt
rm -r $tmpdir
bestpointline=`echo "Best point:"`
bestpointline="$bestpointline "`cat weights.txt`
bestpointline="$bestpointline => "`cat reduced_cmert.log | grep -i "Best point:" | awk '{print $NF}'`
echo $bestpointline > /dev/stderr
exit

View File

@@ -1 +0,0 @@
0 10

View File

@@ -1,10 +0,0 @@
4.0 383.916 60.6749 113.308 28.7833 94.443 -27.9971 66.0 49 66 27 65 16 64 10 63 67
6.0 370.709 67.0555 105.849 37.0838 85.7675 -29.9969 64.0 49 64 29 63 17 62 10 61 67
10.0 415.511 57.7613 97.1628 27.7191 83.3125 -28.997 68.0 54 68 30 67 19 66 13 65 67
6.0 412.823 59.5607 99.215 28.2344 82.0559 -28.997 67.0 53 67 32 66 20 65 13 64 67
4.0 422.048 56.6241 97.204 28.6241 80.8079 -28.997 67.0 52 67 30 66 19 65 13 64 67
4.0 392.685 60.6979 105.33 28.4244 90.094 -28.997 66.0 51 66 29 65 17 64 11 63 67
6.0 365.877 69.0651 108.001 37.33 83.4477 -31.9967 63.0 49 63 29 62 17 61 10 60 67
6.0 418.054 57.5832 97.2047 26.9759 83.6841 -29.9969 68.0 54 68 32 67 20 66 13 65 67
6.0 375.021 64.0915 103.471 38.6084 84.3162 -29.9969 63.0 49 63 28 62 16 61 10 60 67
6.0 364.308 71.1182 110.425 35.7858 82.8551 -30.9968 63.0 49 63 29 62 17 61 10 60 67

View File

@@ -1,3 +0,0 @@
0 0 0 0 0 0 -1 -1
1 2 2 2 2 2 1 1
1 1 0.3 0.2 0.2 0.3 0 0

View File

@@ -1 +0,0 @@
d lm tm tm tm tm tm w

View File

@@ -1,12 +0,0 @@
mkdir -p example1
../enhanced-mert -d 8 >& cmert.log
mv cmert.log weights.txt example1
mkdir -p example2
../enhanced-mert -d 8 -activate lm,tm_2,tm_5,w >& cmert.log
mv cmert.log weights.txt reduced_* example2
mkdir -p example3
../enhanced-mert -d 8 -activate d,tm_1,tm_5 >& cmert.log
mv cmert.log weights.txt reduced_* example3

View File

@@ -1,80 +0,0 @@
#! /usr/bin/perl
sub PrintArgsAndDie () {
print stderr "USAGE: extend-field.pl [-h] \n";
print stderr "This scripts extend the number of active fields for the mert procedure. (See the dual script reduce-field.pl)\n";
exit(1);
}
my $weightfile="";
my $size=-1;
my $activefields="";
while (@ARGV){
if ($ARGV[0] eq "-h"){
&PrintArgsAndDie();
}
if ($ARGV[0] eq "-debug"){
$debug=1;
shift(@ARGV);
}
if ($ARGV[0] eq "-weight"){
$weightfile=$ARGV[1];
shift(@ARGV); shift(@ARGV);
}
if ($ARGV[0] eq "-d"){
$size=$ARGV[1];
shift(@ARGV); shift(@ARGV);
}
if ($ARGV[0] eq "-activate"){
$activefields=$ARGV[1];
shift(@ARGV); shift(@ARGV);
}
}
die "Cannot open/find weight file ($weightfile)\n" if ! -e $weightfile;
my @weight=();
open(IN,$weightfile);
chomp($weight=<IN>);
close(IN);
push @weight,(0,split(/[ \t]+/,$weight));
my @active=();
my @invertedactive=();
if ($activefields eq ""){
for (my $i=1; $i<=$size; $i++){ $active[$i]=1; };
}else{
@active=split(/,/,$activefields);
}
for (my $i=0; $i<=$size; $i++){ $invertedactive[$i]=0; };
for (my $i=0; $i<scalar(@active); $i++){ $invertedactive[$active[$i]]=1; };
my $j=0;
for (my $i=1; $i<=$size; $i++){ if (!$invertedactive[$i]){$notactive[$j]=$i; $j++}};
if ($debug>0){
print STDERR "ORIGINAL SIZE: $size\n";
print STDERR "ORIGINAL WEIGHTS: @weight\n";
print STDERR "ORIGINAL ACTIVE: @active\n";
print STDERR "ORIGINAL NOTACTIVE: @notactive\n";
print STDERR "ORIGINAL INVERTEDACTIVE: @invertedactive\n";
}
while(chomp($_=<STDIN>)){
@field=split(/[ \t]+/,$_);
my $j=1;
for (my $i=1; $i<=$size; $i++){
if ($invertedactive[$i]){
print STDOUT "$field[$j] ";
print STDERR "j:$j i:$i -> $field[$j]\n" if $debug>0;
$j++;
}else{
printf STDOUT "%.6f ",$field[0]*$weight[$i];
print STDERR "i:$i -> $field[0] $weight[$i]\n" if $debug>0;
}
};
print STDOUT "\n";
}

View File

@@ -1,19 +0,0 @@
#!/usr/bin/python
# $Id$
import sys
level = 1
file = sys.stderr
def writeln(s=""):
file.write("%s\n" % s)
file.flush()
def write(s):
file.write(s)
file.flush()

View File

@@ -1,27 +0,0 @@
#!/usr/bin/perl -w
if ($#ARGV != 2) {
die "usage: makeinitopt <ranges> <weightfile> <rangefile>"
}
$s = $ARGV[0];
$woutput = $ARGV[1];
$routput = $ARGV[2];
open WOUT, ">$woutput" || die "couldn't open $woutput";
open ROUT, ">$routput" || die "couldn't open $routput";
@w = ();
@lo = ();
@hi = ();
foreach $x (split(/;/, $s)) {
if ($x =~ /(.*),(-?[\d.]+)-(-?[\d.]+)/) {
push(@w, $1);
push(@lo, $2);
push(@hi, $3);
} else {
print STDERR "bad weight range: $x\n";
}
}
print WOUT join(" ", @w), "\n";
print ROUT join(" ", @lo), "\n";
print ROUT join(" ", @hi), "\n";

View File

@@ -1,88 +0,0 @@
#!/bin/sh
WORKDIR=$1
if [ ! -d $WORKDIR ]; then
mkdir -p $WORKDIR
fi
SRCFILE=$2
REFPREFIX=$3
REFFILES=$REFPREFIX[0-9]*
NBEST=$4
DECODER=$5
DECODEROPTS=$6
RANGES=$7
START=$8
#default pwdcmd is pwd
#pwdcmd is pawd if exists
PWDCMD="pwd"
___PWDCMD=`which pawd | head -1 | awk '{print $1}'`
if [ $___PWDCMD -a -e $___PWDCMD ] ; then PWDCMD=$___PWDCMD ; fi;
RUNDIR=`$PWDCMD`
makeinitopt "$RANGES" $WORKDIR/weights.txt $WORKDIR/ranges.txt
DIM=`cat $WORKDIR/weights.txt | awk '{print NF; exit}'`
echo $DIM dimensions
PATH=/group/project/statmt/pkoehn/user/abhishek:/group/project/statmt/pkoehn/user/abhishek/cmert-0.5:$PATH
export PATH
date
echo Reference sets: $REFFILES
if [ "x$START" == "x" ]; then
START=1
fi
I=$START
PREVLINECOUNT=0
#$DECODEROPTS =~ s / \-f / -config /;
#$DECODEROPTS =~ s/^\-f /-config /;
filename=$WORKDIR/run$I.best$NBEST.out
while true; do
echo Run decoder
WEIGHTS=`cat $WORKDIR/weights.txt`
###Changes - AA 29/11/05
#echo "$DECODER $NBEST \"$WEIGHTS\" $WORKDIR/run$I \"$DECODEROPTS\" < $SRCFILE > $WORKDIR/run$I.nbest"
#$DECODER $NBEST \"$WEIGHTS\" $WORKDIR/run$I \"$DECODEROPTS\" < $SRCFILE > $WORKDIR/run$I.nbest
echo "$DECODER $DECODEROPTS \"$WEIGHTS\" -n-best-list $filename $NBEST < $SRCFILE > $WORKDIR/run$I.nbest"
$DECODER $DECODEROPTS "$WEIGHTS" -n-best-list $filename $NBEST < $SRCFILE > $WORKDIR/run$I.nbest
echo Calculate BLEU component scores
sort -mn -t\| -k 1,1 $WORKDIR/run*.nbest | score-nbest.py $REFFILES $WORKDIR/
#LINECOUNT=`cat $WORKDIR/feats.opt | awk '{n++} END {print n}'`
LINECOUNT=`cat $WORKDIR/cands.opt | awk '{n += $2} END {print n}'`
echo $LINECOUNT accumulated translations
if [ $LINECOUNT -le $PREVLINECOUNT ]; then
echo "Training finished"
date
break
fi
echo Optimize feature weights
cd $WORKDIR
cat ranges.txt weights.txt > init.opt
rm -f weights.txt
mert -d$DIM
cd $RUNDIR
if [ "x`cat $WORKDIR/weights.txt`" == "x" ]; then
echo Optimization failed
break
fi
I=`expr $I + 1`
PREVLINECOUNT=$LINECOUNT
date
done

View File

@@ -1,432 +0,0 @@
// $Id$
#include <stdlib.h>
#include <unistd.h>
#include <math.h>
#include "data.h"
#include "point.h"
#include "score.h"
int verbose = 2;
float min_interval = 1e-3;
typedef struct {
float x;
int cand;
int *delta_comps;
} intersection_t;
intersection_t *new_intersection(float x, int cand, int *comps1, int *comps2) {
intersection_t *inter;
int i;
inter = malloc(sizeof(intersection_t));
inter->x = x;
inter->cand = cand; // this is not used but sometimes it's handy
inter->delta_comps = malloc(comps_n * sizeof(int));
for (i=0; i<comps_n; i++)
inter->delta_comps[i] = comps1[i]-comps2[i];
return inter;
}
void intersection_delete(intersection_t *inter) {
free(inter->delta_comps);
free(inter);
}
int compare_intersections(intersection_t **i1, intersection_t **i2) {
if ((*i1)->x == (*i2)->x)
return 0;
else if ((*i1)->x < (*i2)->x)
return -1;
else
return 1;
}
float slow_bleu(data_t *data, point_t *point) {
int sent_i, cand_i, cand_n, i;
candidate_t *cands;
float p, best_p;
int best;
int *comps;
float score;
int ties, totalties;
comps = calloc(comps_n, sizeof(int));
totalties = 0;
for (sent_i = 0; sent_i < data->sents_n; sent_i++) {
cands = data->sents[sent_i];
cand_n = data->cands_n[sent_i];
ties = 0;
best = 0;
best_p = point_dotproduct(point, cands[0].features);
for (cand_i = 1; cand_i < cand_n; cand_i++) {
p = point_dotproduct(point, cands[cand_i].features);
if (p > best_p) {
best_p = p;
best = cand_i;
ties = 0;
} else if (p == best_p) {
ties++;
}
}
totalties += ties;
comps_addto(comps, cands[best].comps);
}
//point_print(point, stderr, 1);
//fprintf(stderr, "\n");
//fprintf(stderr, "slow bleu => %f\n", compute_score(comps));
score = compute_score(comps);
free(comps);
return score;
}
/* Global optimization along a line (Och, 2004) */
point_t *line_optimize(data_t *data, point_t *origin, point_t *dir) {
int sent_i, cand_i, cand_n, intersection_i;
candidate_t *cands;
static intersection_t **intersections = NULL;
intersection_t *inter;
static int intersection_max;
int intersection_n = 0;
int prev, leftmost;
float x, leftmost_x, prev_x, best_x;
float score, best_score;
int *comps;
point_t *point;
int first;
if (!origin->has_score)
point_set_score(origin, slow_bleu(data, origin));
if (verbose >= 2) {
fprintf(stderr, "starting point: ");
point_print(origin, stderr, 1);
fprintf(stderr, "\n direction: ");
point_print(dir, stderr, 1);
fprintf(stderr, "\n");
}
comps = calloc(comps_n, sizeof(int));
if (intersections == NULL) {
intersection_max = 10;
intersections = malloc(intersection_max*sizeof(intersection_t *));
}
for (sent_i = 0; sent_i < data->sents_n; sent_i++) {
cands = data->sents[sent_i];
cand_n = data->cands_n[sent_i];
if (verbose >= 3)
fprintf(stderr, "sentence %d\n", sent_i);
if (cand_n < 1)
continue;
/* calculate slopes and intercepts */
for (cand_i = 0; cand_i < cand_n; cand_i++) {
cands[cand_i].m = point_dotproduct(dir, cands[cand_i].features);
cands[cand_i].b = point_dotproduct(origin, cands[cand_i].features);
}
/* find intersection points */
/* find best candidate for x -> -inf */
prev = -1;
for (cand_i = 0; cand_i < cand_n; cand_i++)
if (prev < 0 ||
cands[cand_i].m < cands[prev].m ||
cands[cand_i].m == cands[prev].m && cands[prev].b < cands[cand_i].b)
prev = cand_i;
if (verbose >= 3) {
fprintf(stderr, "x->-inf cand %d\n", prev);
}
comps_addto(comps, cands[prev].comps);
first = 1;
while (1) {
// find leftmost intersection
leftmost = -1;
for (cand_i = 0; cand_i < cand_n; cand_i++) {
if (cands[prev].m == cands[cand_i].m) {
if (cands[cand_i].b > cands[cand_i].b)
fprintf(stderr, "two parallel lines and discarding the higher -- this shouldn't happen\n");
continue; // no intersection
}
/* optimization: piecewise linear function must be concave up.
Maybe it would be still faster to sort by slope beforehand */
if (cands[cand_i].m < cands[prev].m)
continue;
x = -(cands[prev].b-cands[cand_i].b)/(cands[prev].m-cands[cand_i].m);
if (leftmost < 0 || x < leftmost_x) {
leftmost = cand_i;
leftmost_x = x;
}
}
if (leftmost < 0)
break; // no more intersections
/* Require that the intersection point be at least min_interval
to the right of the previous one. If not, we replace the
previous intersection point with this one. Yes, it can even
happen that the new intersection point is slightly to the
left of the old one, because of numerical imprecision. We
don't check that the new point is also min_interval to the
right of the penultimate one. In that case, the points would
switch places in the sort, resulting in a bogus score for
that interval. */
if (first || leftmost_x - prev_x > min_interval) {
if (intersection_n == intersection_max) {
intersection_max *= 2;
intersections = realloc(intersections, intersection_max*sizeof(intersection_t));
if (intersections == NULL)
fprintf(stderr, "couldn't realloc intersections\n");
}
intersections[intersection_n++] = new_intersection(leftmost_x, leftmost, cands[leftmost].comps, cands[prev].comps);
} else {
// replace the old one
inter = new_intersection(leftmost_x, leftmost, cands[leftmost].comps, cands[prev].comps);
comps_addto(inter->delta_comps, intersections[intersection_n-1]->delta_comps);
intersection_delete(intersections[intersection_n-1]);
intersections[intersection_n-1] = inter;
}
if (verbose >= 3)
fprintf(stderr, "found intersection point: %f, right cand %d\n", leftmost_x, leftmost);
prev = leftmost;
prev_x = leftmost_x;
first = 0;
}
}
best_score = compute_score(comps);
//fprintf(stderr, "x->-inf => %f\n", best_score);
if (intersection_n == 0)
best_x = 0.0;
else {
qsort(intersections, intersection_n, sizeof(intersection_t *), (int(*)(const void *, const void *))compare_intersections);
best_x = intersections[0]->x - 1000.0; // whatever
}
for (intersection_i = 0; intersection_i < intersection_n; intersection_i++) {
comps_addto(comps, intersections[intersection_i]->delta_comps);
score = compute_score(comps);
//fprintf(stderr, "x=%f => %f\n", intersections[intersection_i]->x, score);
if (score > best_score) {
best_score = score;
if (intersection_i+1 < intersection_n)
// what if interval is zero-width?
best_x = 0.5*(intersections[intersection_i]->x + intersections[intersection_i+1]->x);
else
best_x = intersections[intersection_i]->x + 0.1; // whatever
}
}
//fprintf(stderr, "best_x = %f\n", best_x);
point = point_copy(dir);
point_multiplyby(point, best_x);
point_addto(point, origin);
point_set_score(point, best_score);
if (verbose >= 2) {
fprintf(stderr, " ending point: ");
point_print(point, stderr, 1);
fprintf(stderr, "\n");
//check_comps(data, point, comps);
}
for (intersection_i = 0; intersection_i < intersection_n; intersection_i++)
intersection_delete(intersections[intersection_i]);
free(comps);
if (best_score < origin->score) {
/* this can happen in the case of a tie between two candidates with different bleu component scores. just trash the point and return the starting point */
point_delete(point);
return point_copy(origin);
}
return point;
}
point_t *optimize_powell(data_t *data, point_t *point) {
int i;
point_t **u, **p;
float biggestwin, totalwin, extrapolatedwin;
int biggestwin_i;
point_t *point_e;
u = malloc(dim*sizeof(point_t *));
p = malloc(dim*sizeof(point_t *));
point = point_copy(point);
if (!point->has_score)
point_set_score(point, slow_bleu(data, point));
for (i=0; i<dim; i++) {
u[i] = new_point();
u[i]->weights[i] = 1.0;
}
while (1) {
p[0] = line_optimize(data, point, u[0]);
biggestwin_i = 0;
biggestwin = p[0]->score - point->score;
for (i=1; i<dim; i++) {
p[i] = line_optimize(data, p[i-1], u[i]);
if (p[i]->score - p[i-1]->score > biggestwin) {
biggestwin_i = i;
biggestwin = p[i]->score - p[i-1]->score;
}
}
totalwin = p[dim-1]->score - point->score;
if (totalwin < 0.000001)
break;
// last point minus first point
point_multiplyby(point, -1.0);
point_addto(point, p[dim-1]);
point_e = point_copy(point);
point_addto(point_e, p[dim-1]);
point_set_score(point_e, slow_bleu(data, point_e));
extrapolatedwin = point_e->score - point->score; // point->score is the original point
if (extrapolatedwin > 0 &&
2*(2*totalwin - extrapolatedwin) *
powf(totalwin - biggestwin, 2.0f) <
powf(extrapolatedwin, 2.0f)*biggestwin) {
// replace dominant direction vector with sum vector
point_delete(u[biggestwin_i]);
point_normalize(point);
u[biggestwin_i] = point;
}
point_delete(point_e);
// optimization continues with last point
point = p[dim-1];
for (i=0; i<dim-1; i++)
if (i != biggestwin_i)
point_delete(p[i]);
}
for (i=0; i<dim; i++)
point_delete(u[i]);
free(u);
free(p);
point_normalize(point);
return point;
}
point_t *optimize_koehn(data_t *data, point_t *point) {
point_t *dir, **newpoints;
int dir_i;
int best_dir = -1;
dir = new_point();
newpoints = malloc(dim*sizeof(point_t *));
point = point_copy(point);
while (1) {
for (dir_i = 0; dir_i < dim; dir_i++) {
dir->weights[dir_i] = 1.0;
newpoints[dir_i] = line_optimize(data, point, dir);
if (best_dir < 0 || newpoints[dir_i]->score > newpoints[best_dir]->score)
best_dir = dir_i;
dir->weights[dir_i] = 0.0;
}
if (point->has_score && newpoints[best_dir]->score - point->score < 0.000001)
break;
point_delete(point);
point = newpoints[best_dir];
// discard the other points
for (dir_i = 0; dir_i < dim; dir_i++)
if (dir_i != best_dir)
point_delete(newpoints[dir_i]);
}
point_delete(dir);
free(newpoints);
point_normalize(point);
return point;
}
void usage(void) {
fprintf(stderr, "usage: mert -d <dimensions>\n");
exit(1);
}
int main (int argc, char **argv) {
int point_i;
int points_n = 20;
point_t *min, *max;
data_t *data;
point_t *bestpoint, *newpoint, *startpoint;
int i, c;
FILE *fp;
while ((c = getopt(argc, argv, "d:n:")) != -1) {
switch (c) {
case 'd':
dim = strtol(optarg, NULL, 10);
break;
case 'n':
points_n = strtol(optarg, NULL, 10);
break;
default:
usage();
}
}
argc -= optind;
argv += optind;
if (dim < 0)
usage();
if ((data = read_data()) == NULL) exit(1);
fp = fopen("init.opt", "r");
if ((min = read_point(fp)) == NULL) exit(1);
if ((max = read_point(fp)) == NULL) exit(1);
if ((startpoint = read_point(fp)) == NULL) exit(1);
fclose(fp);
bestpoint = NULL;
for (point_i=0; point_i<points_n; point_i++) {
fprintf(stderr, "*** point %d ***\n", point_i);
if (point_i == 0)
newpoint = startpoint;
else
newpoint = random_point(min, max);
newpoint = optimize_koehn(data, newpoint);
if (bestpoint == NULL || newpoint->score > bestpoint->score)
bestpoint = newpoint; // who cares about the leak
}
fprintf(stderr, "Best point: ");
point_print(bestpoint, stderr, 1);
fprintf(stderr, "\n");
fp = fopen("weights.txt", "w");
point_print(bestpoint, fp, 0);
fprintf(fp, "\n");
fclose(fp);
}

View File

@@ -1,117 +0,0 @@
// $Id$
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include "point.h"
int dim = -1;
point_t *new_point() {
point_t *point;
point = malloc(sizeof(point_t));
point->score = 0.0;
point->weights = calloc(dim, sizeof(float));
point->has_score = 0;
return point;
}
void point_set_score(point_t *point, float score) {
point->has_score = 1;
point->score = score;
}
void point_delete(point_t *point) {
free(point->weights);
free(point);
}
point_t *random_point(point_t *min, point_t *max) {
int i;
point_t *point = new_point();
for (i=0; i<dim; i++)
point->weights[i] = min->weights[i] + (float)random()/RAND_MAX * (max->weights[i]-min->weights[i]);
return point;
}
point_t *point_copy(point_t *point) {
point_t *newpoint;
int i;
newpoint = new_point();
newpoint->score = point->score;
newpoint->has_score = point->has_score;
for (i=0; i<dim; i++)
newpoint->weights[i] = point->weights[i];
return newpoint;
}
float point_dotproduct(point_t *point, float *y) {
float result;
int i;
result = 0.0;
for (i=0; i<dim; i++)
result += point->weights[i] * y[i];
return result;
}
/* Destructive operations */
void point_multiplyby(point_t *point, float k) {
int i;
for (i=0; i<dim; i++)
point->weights[i] *= k;
}
void point_addto(point_t *point1, point_t *point2) {
int i;
for (i=0; i<dim; i++)
point1->weights[i] += point2->weights[i];
}
void point_normalize(point_t *point) {
int i;
float norm = 0.0;
for (i=0; i<dim; i++)
//norm += point->weights[i] * point->weights[i];
norm += fabs(point->weights[i]);
// norm = sqrt(norm);
for (i=0; i<dim; i++)
point->weights[i] /= norm;
}
void point_print(point_t *point, FILE *fp, int with_score) {
int i;
fprintf(fp, "%f", point->weights[0]);
for (i=1; i<dim; i++)
fprintf(fp, " %f", point->weights[i]);
if (point->has_score && with_score)
fprintf(fp, " => %f", point->score);
}
point_t *read_point(FILE *fp) {
static char buf[1000];
char *tok, *s;
int field;
point_t *point;
point = new_point();
fgets(buf, sizeof(buf), fp);
s = buf;
field = 0;
while ((tok = strsep(&s, " \t\n")) != NULL) {
if (!*tok) // empty token
continue;
if (field >= dim) {
fprintf(stderr, "read_point(): too many fields in line\n");
return NULL;
} else
point->weights[field] = strtod(tok, NULL);
field++;
}
if (field < dim) {
fprintf(stderr, "read_point(): wrong number of fields in line\n");
return NULL;
}
return point;
}

View File

@@ -1,26 +0,0 @@
// $Id$
#ifndef POINT_H
#define POINT_H
typedef struct {
float *weights;
int has_score;
float score;
} point_t;
extern int dim;
point_t *new_point();
void point_set_score(point_t *point, float score);
void point_delete(point_t *point);
point_t *point_copy(point_t *point);
point_t *random_point(point_t *min, point_t *max);
float point_dotproduct(point_t *point, float *y);
void point_multiplyby(point_t *point, float k);
void point_normalize(point_t *point);
void point_addto(point_t *point1, point_t *point2);
#include <stdio.h>
point_t *read_point(FILE *fp);
void point_print(point_t *point, FILE *fp, int with_score);
#endif

View File

@@ -1,57 +0,0 @@
###########################################################################
#
# Psyco top-level file of the Psyco package.
# Copyright (C) 2001-2002 Armin Rigo et.al.
"""Psyco -- the Python Specializing Compiler.
Typical usage: add the following lines to your application's main module:
try:
import psyco
psyco.profile()
except:
print 'Psyco not found, ignoring it'
"""
###########################################################################
#
# This module is present to make 'psyco' a package and to
# publish the main functions and variables.
#
# More documentation can be found in core.py.
#
# Try to import the dynamic-loading _psyco and report errors
try:
import _psyco
except ImportError, e:
extramsg = ''
import sys, imp
try:
file, filename, (suffix, mode, type) = imp.find_module('_psyco', __path__)
except ImportError:
ext = [suffix for suffix, mode, type in imp.get_suffixes()
if type == imp.C_EXTENSION]
if ext:
extramsg = (" (cannot locate the compiled extension '_psyco%s' "
"in the package path '%s')" % (ext[0], '; '.join(__path__)))
else:
extramsg = (" (check that the compiled extension '%s' is for "
"the correct Python version; this is Python %s)" %
(filename, sys.version.split()[0]))
raise ImportError, str(e) + extramsg
# Publish important data by importing them in the package
from support import __version__, error, warning, _getrealframe, _getemulframe
from support import version_info, __version__ as hexversion
from core import full, profile, background, runonly, stop, cannotcompile
from core import log, bind, unbind, proxy, unproxy, dumpcodebuf
from _psyco import setfilter
try:
from _psyco import compact, compacttype # Python 2.2 and above only
except ImportError:
pass

View File

@@ -1,53 +0,0 @@
###########################################################################
#
# Psyco class support module.
# Copyright (C) 2001-2002 Armin Rigo et.al.
"""Psyco class support module.
'psyco.classes.psyobj' is an alternate Psyco-optimized root for classes.
Any class inheriting from it or using the metaclass '__metaclass__' might
get optimized specifically for Psyco. It is equivalent to call
psyco.bind() on the class object after its creation.
Note that this module has no effect with Python version 2.1 or earlier.
Importing everything from psyco.classes in a module will import the
'__metaclass__' name, so all classes defined after a
from psyco.classes import *
will automatically use the Psyco-optimized metaclass.
"""
###########################################################################
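# A minimal sketch of the two idioms described in the docstring above; the
# class names are made up:
#
#   from psyco.classes import psyobj
#   class Grid(psyobj):            # this one class is Psyco-optimized
#       pass
#
#   from psyco.classes import *    # or: the imported __metaclass__ makes every
#   class Cell:                    # class defined after it Psyco-optimized
#       pass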
__all__ = ['psyobj', 'psymetaclass', '__metaclass__']
# Python version check
try:
from _psyco import compacttype
except ImportError:
class psyobj: # compatibility
pass
psymetaclass = None
else:
# version >= 2.2 only
import core
from types import FunctionType
class psymetaclass(compacttype):
"Psyco-optimized meta-class. Turns all methods into Psyco proxies."
def __new__(cls, name, bases, dict):
bindlist = dict.get('__psyco__bind__')
if bindlist is None:
bindlist = [key for key, value in dict.items()
if isinstance(value, FunctionType)]
for attr in bindlist:
dict[attr] = core.proxy(dict[attr])
return super(psymetaclass, cls).__new__(cls, name, bases, dict)
psyobj = psymetaclass("psyobj", (), {})
__metaclass__ = psymetaclass

View File

@@ -1,232 +0,0 @@
###########################################################################
#
# Psyco main functions.
# Copyright (C) 2001-2002 Armin Rigo et.al.
"""Psyco main functions.
Here are the routines that you can use from your applications.
These are mostly interfaces to the C core, but they depend on
the Python version.
You can use these functions from the 'psyco' module instead of
'psyco.core', e.g.
import psyco
psyco.log('/tmp/psyco.log')
psyco.profile()
"""
###########################################################################
import _psyco
import types, new
from support import *
# Default charge profiler values
default_watermark = 0.09 # between 0.0 (0%) and 1.0 (100%)
default_halflife = 0.5 # seconds
default_pollfreq_profile = 20 # Hz
default_pollfreq_background = 100 # Hz -- a maximum for sleep's resolution
default_parentframe = 0.25 # should not be more than 0.5 (50%)
def full(memory=None, time=None, memorymax=None, timemax=None):
"""Compile as much as possible.
Typical use is for small scripts performing intensive computations
or string handling."""
import profiler
if PYTHON_SUPPORT:
p = profiler.FullCompiler()
else:
p = profiler.ActiveProfiler(0.0, 0.5)
p.run(memory, time, memorymax, timemax)
def profile(watermark = default_watermark,
halflife = default_halflife,
pollfreq = default_pollfreq_profile,
parentframe = default_parentframe,
memory=None, time=None, memorymax=None, timemax=None):
"""Turn on profiling.
The 'watermark' parameter controls how easily running functions will
be compiled. The smaller the value, the more functions are compiled."""
import profiler
p = profiler.ActivePassiveProfiler(watermark, halflife,
pollfreq, parentframe)
p.run(memory, time, memorymax, timemax)
def background(watermark = default_watermark,
halflife = default_halflife,
pollfreq = default_pollfreq_background,
parentframe = default_parentframe,
memory=None, time=None, memorymax=None, timemax=None):
"""Turn on passive profiling.
This is a very lightweight mode in which only intensively computing
functions can be detected. The smaller the 'watermark', the more functions
are compiled."""
import profiler
p = profiler.PassiveProfiler(watermark, halflife, pollfreq, parentframe)
p.run(memory, time, memorymax, timemax)
def runonly(memory=None, time=None, memorymax=None, timemax=None):
"""Nonprofiler.
XXX check if this is useful and document."""
if PYTHON_SUPPORT:
import profiler
p = profiler.RunOnly()
p.run(memory, time, memorymax, timemax)
def stop():
"""Turn off all automatic compilation. bind() calls remain in effect."""
import profiler
profiler.go([])
def log(logfile='', mode='w', top=10):
"""Enable logging to the given file.
If the file name is unspecified, a default name is built by appending
a 'log-psyco' extension to the main script name.
Mode is 'a' to append to a possibly existing file or 'w' to overwrite
an existing file. Note that the log file may grow quickly in 'a' mode."""
import profiler, logger
if not logfile:
import os
logfile, dummy = os.path.splitext(sys.argv[0])
if os.path.basename(logfile):
logfile += '.'
logfile += 'log-psyco'
if hasattr(_psyco, 'VERBOSE_LEVEL'):
print >> sys.stderr, 'psyco: logging to', logfile
# logger.current should be a real file object; subtle problems
# will show up if its write() and flush() methods are written
# in Python, as Psyco will invoke them while compiling.
logger.current = open(logfile, mode)
logger.print_charges = top
profiler.logger = logger
logger.writedate('Logging started')
cannotcompile(logger.psycowrite)
_psyco.statwrite(logger=logger.psycowrite)
def bind(x, rec=None):
"""Enable compilation of the given function, method, or class object.
If C is a class (or anything with a '__dict__' attribute), bind(C) will
rebind all functions and methods found in C.__dict__ (which means, for
classes, all methods defined in the class but not in its parents).
The optional second argument specifies the number of recursive
compilation levels: all functions called by func are compiled
up to the given depth of indirection."""
if isinstance(x, types.MethodType):
x = x.im_func
if isinstance(x, types.FunctionType):
if rec is None:
x.func_code = _psyco.proxycode(x)
else:
x.func_code = _psyco.proxycode(x, rec)
return
if hasattr(x, '__dict__'):
funcs = [o for o in x.__dict__.values()
if isinstance(o, types.MethodType)
or isinstance(o, types.FunctionType)]
if not funcs:
raise error, ("nothing bindable found in %s object" %
type(x).__name__)
for o in funcs:
bind(o, rec)
return
raise TypeError, "cannot bind %s objects" % type(x).__name__
def unbind(x):
"""Reverse of bind()."""
if isinstance(x, types.MethodType):
x = x.im_func
if isinstance(x, types.FunctionType):
try:
f = _psyco.unproxycode(x.func_code)
except error:
pass
else:
x.func_code = f.func_code
return
if hasattr(x, '__dict__'):
for o in x.__dict__.values():
if (isinstance(o, types.MethodType)
or isinstance(o, types.FunctionType)):
unbind(o)
return
raise TypeError, "cannot unbind %s objects" % type(x).__name__
def proxy(x, rec=None):
"""Return a Psyco-enabled copy of the function.
The original function is still available for non-compiled calls.
The optional second argument specifies the number of recursive
compilation levels: all functions called by func are compiled
up to the given depth of indirection."""
if isinstance(x, types.FunctionType):
if rec is None:
code = _psyco.proxycode(x)
else:
code = _psyco.proxycode(x, rec)
return new.function(code, x.func_globals, x.func_name)
if isinstance(x, types.MethodType):
p = proxy(x.im_func, rec)
return new.instancemethod(p, x.im_self, x.im_class)
raise TypeError, "cannot proxy %s objects" % type(x).__name__
def unproxy(proxy):
"""Return a new copy of the original function of method behind a proxy.
The result behaves like the original function in that calling it
does not trigger compilation nor execution of any compiled code."""
if isinstance(proxy, types.FunctionType):
return _psyco.unproxycode(proxy.func_code)
if isinstance(proxy, types.MethodType):
f = unproxy(proxy.im_func)
return new.instancemethod(f, proxy.im_self, proxy.im_class)
raise TypeError, "%s objects cannot be proxies" % type(proxy).__name__
def cannotcompile(x):
"""Instruct Psyco never to compile the given function, method
or code object."""
if isinstance(x, types.MethodType):
x = x.im_func
if isinstance(x, types.FunctionType):
x = x.func_code
if isinstance(x, types.CodeType):
_psyco.cannotcompile(x)
else:
raise TypeError, "unexpected %s object" % type(x).__name__
def dumpcodebuf():
"""Write in file psyco.dump a copy of the emitted machine code,
provided Psyco was compiled with a non-zero CODE_DUMP.
See py-utils/httpxam.py to examine psyco.dump."""
if hasattr(_psyco, 'dumpcodebuf'):
_psyco.dumpcodebuf()
###########################################################################
# Psyco variables
# error * the error raised by Psyco
# warning * the warning raised by Psyco
# __in_psyco__ * a new built-in variable which is always zero, but which
# Psyco special-cases by returning 1 instead. So
# __in_psyco__ can be used in a function to know if
# that function is being executed by Psyco or not.

View File

@@ -1,133 +0,0 @@
###########################################################################
#
# Support code for the 'psyco.compact' type.
from __future__ import generators
try:
from UserDict import DictMixin
except ImportError:
# backported from Python 2.3 to Python 2.2
class DictMixin:
# Mixin defining all dictionary methods for classes that already have
# a minimum dictionary interface including getitem, setitem, delitem,
# and keys. Without knowledge of the subclass constructor, the mixin
# does not define __init__() or copy(). In addition to the four base
# methods, progressively more efficiency comes with defining
# __contains__(), __iter__(), and iteritems().
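# A minimal made-up example of that interface: the four base methods are
# defined, everything else (get, items, update, has_key, ...) comes from
# the mixin.
#
#   class LowerDict(DictMixin):
#       def __init__(self): self._d = {}
#       def __getitem__(self, key): return self._d[key.lower()]
#       def __setitem__(self, key, value): self._d[key.lower()] = value
#       def __delitem__(self, key): del self._d[key.lower()]
#       def keys(self): return self._d.keys()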
# second level definitions support higher levels
def __iter__(self):
for k in self.keys():
yield k
def has_key(self, key):
try:
value = self[key]
except KeyError:
return False
return True
def __contains__(self, key):
return self.has_key(key)
# third level takes advantage of second level definitions
def iteritems(self):
for k in self:
yield (k, self[k])
def iterkeys(self):
return self.__iter__()
# fourth level uses definitions from lower levels
def itervalues(self):
for _, v in self.iteritems():
yield v
def values(self):
return [v for _, v in self.iteritems()]
def items(self):
return list(self.iteritems())
def clear(self):
for key in self.keys():
del self[key]
def setdefault(self, key, default):
try:
return self[key]
except KeyError:
self[key] = default
return default
def pop(self, key, *args):
if len(args) > 1:
raise TypeError, "pop expected at most 2 arguments, got "\
+ repr(1 + len(args))
try:
value = self[key]
except KeyError:
if args:
return args[0]
raise
del self[key]
return value
def popitem(self):
try:
k, v = self.iteritems().next()
except StopIteration:
raise KeyError, 'container is empty'
del self[k]
return (k, v)
def update(self, other):
# Make progressively weaker assumptions about "other"
if hasattr(other, 'iteritems'): # iteritems saves memory and lookups
for k, v in other.iteritems():
self[k] = v
elif hasattr(other, '__iter__'): # iter saves memory
for k in other:
self[k] = other[k]
else:
for k in other.keys():
self[k] = other[k]
def get(self, key, default=None):
try:
return self[key]
except KeyError:
return default
def __repr__(self):
return repr(dict(self.iteritems()))
def __cmp__(self, other):
if other is None:
return 1
if isinstance(other, DictMixin):
other = dict(other.iteritems())
return cmp(dict(self.iteritems()), other)
def __len__(self):
return len(self.keys())
###########################################################################
from _psyco import compact # Python 2.2 and above only
class compactdictproxy(DictMixin):
def __init__(self, ko):
self._ko = ko # compact object of which 'self' is the dict
def __getitem__(self, key):
return compact.__getslot__(self._ko, key)
def __setitem__(self, key, value):
compact.__setslot__(self._ko, key, value)
def __delitem__(self, key):
compact.__delslot__(self._ko, key)
def keys(self):
return compact.__members__.__get__(self._ko)
def clear(self):
keys = self.keys()
keys.reverse()
for key in keys:
del self[key]
def __repr__(self):
keys = ', '.join(self.keys())
return '<compactdictproxy object {%s}>' % (keys,)

View File

@ -1,90 +0,0 @@
###########################################################################
#
# Psyco logger.
# Copyright (C) 2001-2002 Armin Rigo et.al.

"""Psyco logger.
See log() in core.py.
"""
###########################################################################

import _psyco
from time import time, localtime, strftime

current = None
print_charges = 10
dump_delay = 0.2
dump_last = 0.0

def write(s, level):
    t = time()
    f = t-int(t)
    current.write("%s.%02d %-*s%s\n" % (
        strftime("%X", localtime(int(t))),
        int(f*100.0), 63-level, s,
        "%"*level))
    current.flush()

def psycowrite(s):
    t = time()
    f = t-int(t)
    current.write("%s.%02d %-*s%s\n" % (
        strftime("%X", localtime(int(t))),
        int(f*100.0), 60, s.strip(),
        "% %"))
    current.flush()

##def writelines(lines, level=0):
## if lines:
## t = time()
## f = t-int(t)
## timedesc = strftime("%x %X", localtime(int(t)))
## print >> current, "%s.%03d %-*s %s" % (
## timedesc, int(f*1000),
## 50-level, lines[0],
## "+"*level)
## timedesc = " " * (len(timedesc)+5)
## for line in lines[1:]:
## print >> current, timedesc, line

def writememory():
    write("memory usage: %d+ kb" % _psyco.memory(), 1)

def dumpcharges():
    global dump_last
    if print_charges:
        t = time()
        if not (dump_last <= t < dump_last+dump_delay):
            if t <= dump_last+1.5*dump_delay:
                dump_last += dump_delay
            else:
                dump_last = t
            #write("%s: charges:" % who, 0)
            lst = _psyco.stattop(print_charges)
            if lst:
                f = t-int(t)
                lines = ["%s.%02d ______\n" % (
                    strftime("%X", localtime(int(t))),
                    int(f*100.0))]
                i = 1
                for co, charge in lst:
                    detail = co.co_filename
                    if len(detail) > 19:
                        detail = '...' + detail[-17:]
                    lines.append(" #%-3d |%4.1f %%| %-26s%20s:%d\n" %
                                 (i, charge*100.0, co.co_name, detail,
                                  co.co_firstlineno))
                    i += 1
                current.writelines(lines)
                current.flush()

def writefinalstats():
    dumpcharges()
    writememory()
    writedate("program exit")

def writedate(msg):
    write('%s, %s' % (msg, strftime("%x")), 20)

View File

@ -1,388 +0,0 @@
###########################################################################
#
# Psyco profiler (Python part).
# Copyright (C) 2001-2002 Armin Rigo et.al.

"""Psyco profiler (Python part).
The implementation of the non-time-critical parts of the profiler.
See profile() and full() in core.py for the easy interface.
"""
###########################################################################

import _psyco
from support import *
import math, time, types, atexit
now = time.time
try:
    import thread
except ImportError:
    import dummy_thread as thread

# current profiler instance
current = None

# enabled profilers, in order of priority
profilers = []

# logger module (when enabled by core.log())
logger = None

# a lock for a thread-safe go()
go_lock = thread.allocate_lock()

def go(stop=0):
    # run the highest-priority profiler in 'profilers'
    global current
    go_lock.acquire()
    try:
        prev = current
        if stop:
            del profilers[:]
        if prev:
            if profilers and profilers[0] is prev:
                return # best profiler already running
            prev.stop()
            current = None
        for p in profilers[:]:
            if p.start():
                current = p
                if logger: # and p is not prev:
                    logger.write("%s: starting" % p.__class__.__name__, 5)
                return
    finally:
        go_lock.release()
    # no profiler is running now
    if stop:
        if logger:
            logger.writefinalstats()
    else:
        tag2bind()

atexit.register(go, 1)

def buildfncache(globals, cache):
    if hasattr(types.IntType, '__dict__'):
        clstypes = (types.ClassType, types.TypeType)
    else:
        clstypes = types.ClassType
    for x in globals.values():
        if isinstance(x, types.MethodType):
            x = x.im_func
        if isinstance(x, types.FunctionType):
            cache[x.func_code] = x, ''
        elif isinstance(x, clstypes):
            for y in x.__dict__.values():
                if isinstance(y, types.MethodType):
                    y = y.im_func
                if isinstance(y, types.FunctionType):
                    cache[y.func_code] = y, x.__name__

# code-to-function mapping (cache)
function_cache = {}

def trytobind(co, globals, log=1):
    try:
        f, clsname = function_cache[co]
    except KeyError:
        buildfncache(globals, function_cache)
        try:
            f, clsname = function_cache[co]
        except KeyError:
            if logger:
                logger.write('warning: cannot find function %s in %s' %
                             (co.co_name, globals.get('__name__', '?')), 3)
            return # give up
    if logger and log:
        modulename = globals.get('__name__', '?')
        if clsname:
            modulename += '.' + clsname
        logger.write('bind function: %s.%s' % (modulename, co.co_name), 1)
    f.func_code = _psyco.proxycode(f)

if PYTHON_SUPPORT:
    # the list of code objects that have been tagged
    tagged_codes = []

    def tag(co, globals):
        if logger:
            try:
                f, clsname = function_cache[co]
            except KeyError:
                buildfncache(globals, function_cache)
                try:
                    f, clsname = function_cache[co]
                except KeyError:
                    clsname = '' # give up
            modulename = globals.get('__name__', '?')
            if clsname:
                modulename += '.' + clsname
            logger.write('tag function: %s.%s' % (modulename, co.co_name), 1)
        tagged_codes.append((co, globals))
        _psyco.turbo_frame(co)
        _psyco.turbo_code(co)

    def tag2bind():
        if tagged_codes:
            if logger:
                logger.write('profiling stopped, binding %d functions' %
                             len(tagged_codes), 2)
            for co, globals in tagged_codes:
                trytobind(co, globals, 0)
            function_cache.clear()
            del tagged_codes[:]

else:
    # tagging is impossible, always bind
    tag = trytobind
    def tag2bind():
        pass

class Profiler:
    MemoryTimerResolution = 0.103

    def run(self, memory, time, memorymax, timemax):
        self.memory = memory
        self.memorymax = memorymax
        self.time = time
        if timemax is None:
            self.endtime = None
        else:
            self.endtime = now() + timemax
        self.alarms = []
        profilers.append(self)
        go()

    def start(self):
        curmem = _psyco.memory()
        memlimits = []
        if self.memorymax is not None:
            if curmem >= self.memorymax:
                if logger:
                    logger.writememory()
                return self.limitreached('memorymax')
            memlimits.append(self.memorymax)
        if self.memory is not None:
            if self.memory <= 0:
                if logger:
                    logger.writememory()
                return self.limitreached('memory')
            memlimits.append(curmem + self.memory)
            self.memory_at_start = curmem
        curtime = now()
        timelimits = []
        if self.endtime is not None:
            if curtime >= self.endtime:
                return self.limitreached('timemax')
            timelimits.append(self.endtime - curtime)
        if self.time is not None:
            if self.time <= 0.0:
                return self.limitreached('time')
            timelimits.append(self.time)
            self.time_at_start = curtime
        try:
            self.do_start()
        except error, e:
            if logger:
                logger.write('%s: disabled by psyco.error:' % (
                    self.__class__.__name__), 4)
                logger.write(' %s' % str(e), 3)
            return 0
        if memlimits:
            self.memlimits_args = (time.sleep, (self.MemoryTimerResolution,),
                                   self.check_memory, (min(memlimits),))
            self.alarms.append(_psyco.alarm(*self.memlimits_args))
        if timelimits:
            self.alarms.append(_psyco.alarm(time.sleep, (min(timelimits),),
                                            self.time_out))
        return 1

    def stop(self):
        for alarm in self.alarms:
            alarm.stop(0)
        for alarm in self.alarms:
            alarm.stop(1) # wait for parallel threads to stop
        del self.alarms[:]
        if self.time is not None:
            self.time -= now() - self.time_at_start
        if self.memory is not None:
            self.memory -= _psyco.memory() - self.memory_at_start
        try:
            self.do_stop()
        except error:
            return 0
        return 1

    def check_memory(self, limit):
        if _psyco.memory() < limit:
            return self.memlimits_args
        go()

    def time_out(self):
        self.time = 0.0
        go()

    def limitreached(self, limitname):
        try:
            profilers.remove(self)
        except ValueError:
            pass
        if logger:
            logger.write('%s: disabled (%s limit reached)' % (
                self.__class__.__name__, limitname), 4)
        return 0

class FullCompiler(Profiler):

    def do_start(self):
        _psyco.profiling('f')

    def do_stop(self):
        _psyco.profiling('.')

class RunOnly(Profiler):

    def do_start(self):
        _psyco.profiling('n')

    def do_stop(self):
        _psyco.profiling('.')

class ChargeProfiler(Profiler):

    def __init__(self, watermark, parentframe):
        self.watermark = watermark
        self.parent2 = parentframe * 2.0
        self.lock = thread.allocate_lock()

    def init_charges(self):
        _psyco.statwrite(watermark = self.watermark,
                         parent2 = self.parent2)

    def do_stop(self):
        _psyco.profiling('.')
        _psyco.statwrite(callback = None)

class ActiveProfiler(ChargeProfiler):

    def active_start(self):
        _psyco.profiling('p')

    def do_start(self):
        self.init_charges()
        self.active_start()
        _psyco.statwrite(callback = self.charge_callback)

    def charge_callback(self, frame, charge):
        tag(frame.f_code, frame.f_globals)

class PassiveProfiler(ChargeProfiler):

    initial_charge_unit = _psyco.statread('unit')
    reset_stats_after = 120 # half-lives (maximum 200!)
    reset_limit = initial_charge_unit * (2.0 ** reset_stats_after)

    def __init__(self, watermark, halflife, pollfreq, parentframe):
        ChargeProfiler.__init__(self, watermark, parentframe)
        self.pollfreq = pollfreq
        # self.progress is slightly more than 1.0, and computed so that
        # do_profile() will double the change_unit every 'halflife' seconds.
        self.progress = 2.0 ** (1.0 / (halflife * pollfreq))

    def reset(self):
        _psyco.statwrite(unit = self.initial_charge_unit, callback = None)
        _psyco.statreset()
        if logger:
            logger.write("%s: resetting stats" % self.__class__.__name__, 1)

    def passive_start(self):
        self.passivealarm_args = (time.sleep, (1.0 / self.pollfreq,),
                                  self.do_profile)
        self.alarms.append(_psyco.alarm(*self.passivealarm_args))

    def do_start(self):
        tag2bind()
        self.init_charges()
        self.passive_start()

    def do_profile(self):
        _psyco.statcollect()
        if logger:
            logger.dumpcharges()
        nunit = _psyco.statread('unit') * self.progress
        if nunit > self.reset_limit:
            self.reset()
        else:
            _psyco.statwrite(unit = nunit, callback = self.charge_callback)
        return self.passivealarm_args

    def charge_callback(self, frame, charge):
        trytobind(frame.f_code, frame.f_globals)

class ActivePassiveProfiler(PassiveProfiler, ActiveProfiler):

    def do_start(self):
        self.init_charges()
        self.active_start()
        self.passive_start()

    def charge_callback(self, frame, charge):
        tag(frame.f_code, frame.f_globals)

#
# we register our own version of sys.settrace(), sys.setprofile()
# and thread.start_new_thread().
#

def psyco_settrace(*args, **kw):
    "This is the Psyco-aware version of sys.settrace()."
    result = original_settrace(*args, **kw)
    go()
    return result

def psyco_setprofile(*args, **kw):
    "This is the Psyco-aware version of sys.setprofile()."
    result = original_setprofile(*args, **kw)
    go()
    return result

def psyco_thread_stub(callable, args, kw):
    _psyco.statcollect()
    if kw is None:
        return callable(*args)
    else:
        return callable(*args, **kw)

def psyco_start_new_thread(callable, args, kw=None):
    "This is the Psyco-aware version of thread.start_new_thread()."
    return original_start_new_thread(psyco_thread_stub, (callable, args, kw))

original_settrace = sys.settrace
original_setprofile = sys.setprofile
original_start_new_thread = thread.start_new_thread
sys.settrace = psyco_settrace
sys.setprofile = psyco_setprofile

if PYTHON_SUPPORT:
    thread.start_new_thread = psyco_start_new_thread
    # hack to patch threading._start_new_thread if the module is
    # already loaded
    if (sys.modules.has_key('threading') and
        hasattr(sys.modules['threading'], '_start_new_thread')):
        sys.modules['threading']._start_new_thread = psyco_start_new_thread

View File

@ -1,196 +0,0 @@
###########################################################################
#
# Psyco general support module.
# Copyright (C) 2001-2002 Armin Rigo et.al.

"""Psyco general support module.
For internal use.
"""
###########################################################################

import sys, _psyco, __builtin__

error = _psyco.error

class warning(Warning):
    pass

_psyco.NoLocalsWarning = warning

def warn(msg):
    from warnings import warn
    warn(msg, warning, stacklevel=2)

#
# Version checks
#
__version__ = 0x010500f0
if _psyco.PSYVER != __version__:
    raise error, "version mismatch between Psyco parts, reinstall it"

version_info = (__version__ >> 24,
                (__version__ >> 16) & 0xff,
                (__version__ >> 8) & 0xff,
                {0xa0: 'alpha',
                 0xb0: 'beta',
                 0xc0: 'candidate',
                 0xf0: 'final'}[__version__ & 0xf0],
                __version__ & 0xf)

VERSION_LIMITS = [0x02010000, # 2.1
                  0x02020000, # 2.2
                  0x02020200, # 2.2.2
                  0x02030000, # 2.3
                  0x02040000] # 2.4

if ([v for v in VERSION_LIMITS if v <= sys.hexversion] !=
    [v for v in VERSION_LIMITS if v <= _psyco.PYVER ]):
    if sys.hexversion < VERSION_LIMITS[0]:
        warn("Psyco requires Python version 2.1 or later")
    else:
        warn("Psyco version does not match Python version. "
             "Psyco must be updated or recompiled")

PYTHON_SUPPORT = hasattr(_psyco, 'turbo_code')

if hasattr(_psyco, 'ALL_CHECKS') and hasattr(_psyco, 'VERBOSE_LEVEL'):
    print >> sys.stderr, ('psyco: running in debugging mode on %s' %
                          _psyco.PROCESSOR)

###########################################################################
# sys._getframe() gives strange results on a mixed Psyco- and Python-style
# stack frame. Psyco provides a replacement that partially emulates Python
# frames from Psyco frames. The new sys._getframe() may return objects of
# a custom "Psyco frame" type, which with Python >=2.2 is a subtype of the
# normal frame type.
#
# The same problems require some other built-in functions to be replaced
# as well. Note that the local variables are not available in any
# dictionary with Psyco.

class Frame:
    pass

class PythonFrame(Frame):

    def __init__(self, frame):
        self.__dict__.update({
            '_frame': frame,
            })

    def __getattr__(self, attr):
        if attr == 'f_back':
            try:
                result = embedframe(_psyco.getframe(self._frame))
            except ValueError:
                result = None
            except error:
                warn("f_back is skipping dead Psyco frames")
                result = self._frame.f_back
            self.__dict__['f_back'] = result
            return result
        else:
            return getattr(self._frame, attr)

    def __setattr__(self, attr, value):
        setattr(self._frame, attr, value)

    def __delattr__(self, attr):
        delattr(self._frame, attr)

class PsycoFrame(Frame):

    def __init__(self, tag):
        self.__dict__.update({
            '_tag' : tag,
            'f_code' : tag[0],
            'f_globals': tag[1],
            })

    def __getattr__(self, attr):
        if attr == 'f_back':
            try:
                result = embedframe(_psyco.getframe(self._tag))
            except ValueError:
                result = None
        elif attr == 'f_lineno':
            result = self.f_code.co_firstlineno # better than nothing
        elif attr == 'f_builtins':
            result = self.f_globals['__builtins__']
        elif attr == 'f_restricted':
            result = self.f_builtins is not __builtins__
        elif attr == 'f_locals':
            raise AttributeError, ("local variables of functions run by Psyco "
                                   "cannot be accessed in any way, sorry")
        else:
            raise AttributeError, ("emulated Psyco frames have "
                                   "no '%s' attribute" % attr)
        self.__dict__[attr] = result
        return result

    def __setattr__(self, attr, value):
        raise AttributeError, "Psyco frame objects are read-only"

    def __delattr__(self, attr):
        if attr == 'f_trace':
            # for bdb which relies on CPython frames exhibiting a slightly
            # buggy behavior: you can 'del f.f_trace' as often as you like
            # even without having set it previously.
            return
        raise AttributeError, "Psyco frame objects are read-only"

def embedframe(result):
    if type(result) is type(()):
        return PsycoFrame(result)
    else:
        return PythonFrame(result)

def _getframe(depth=0):
    """Return a frame object from the call stack. This is a replacement for
    sys._getframe() which is aware of Psyco frames.
    The returned objects are instances of either PythonFrame or PsycoFrame
    instead of being real Python-level frame object, so that they can emulate
    the common attributes of frame objects.
    The original sys._getframe() ignoring Psyco frames altogether is stored in
    psyco._getrealframe(). See also psyco._getemulframe()."""
    # 'depth+1' to account for this _getframe() Python function
    return embedframe(_psyco.getframe(depth+1))

def _getemulframe(depth=0):
    """As _getframe(), but the returned objects are real Python frame objects
    emulating Psyco frames. Some of their attributes can be wrong or missing,
    however."""
    # 'depth+1' to account for this _getemulframe() Python function
    return _psyco.getframe(depth+1, 1)

def patch(name, module=__builtin__):
    f = getattr(_psyco, name)
    org = getattr(module, name)
    if org is not f:
        setattr(module, name, f)
        setattr(_psyco, 'original_' + name, org)

_getrealframe = sys._getframe
sys._getframe = _getframe
patch('globals')
patch('eval')
patch('execfile')
patch('locals')
patch('vars')
patch('dir')
patch('input')
_psyco.original_raw_input = raw_input
__builtin__.__in_psyco__ = 0==1 # False

if hasattr(_psyco, 'compact'):
    import kdictproxy
    _psyco.compactdictproxy = kdictproxy.compactdictproxy

View File

@ -1,88 +0,0 @@
#! /usr/bin/perl

sub PrintArgsAndDie () {
  print stderr "USAGE: reduce-field.pl [-h] \n";
  print stderr "This scripts reduce the number of active fields for the mert procedure.\n";
  exit(1);
}

my $weightfile="";
my $size=-1;
my $activefields="";
my $debug=0;

while (@ARGV){
  if ($ARGV[0] eq "-h"){
    &PrintArgsAndDie();
  }
  if ($ARGV[0] eq "-debug"){
    $debug=1;
    shift(@ARGV);
  }
  if ($ARGV[0] eq "-weight"){
    $weightfile=$ARGV[1];
    shift(@ARGV); shift(@ARGV);
  }
  if ($ARGV[0] eq "-d"){
    $size=$ARGV[1];
    shift(@ARGV); shift(@ARGV);
  }
  if ($ARGV[0] eq "-activate"){
    $activefields=$ARGV[1];
    shift(@ARGV); shift(@ARGV);
  }
}

die "Cannot open/find weight file ($weightfile)\n" if ! -e $weightfile;

my @weight=();
open(IN,$weightfile);
chomp($weight=<IN>);
close(IN);
push @weight,split(/[ \t]+/,"1 $weight");

my @active=();
my @invertedactive=();
if ($activefields eq ""){
  for (my $i=1; $i<=$size; $i++){ $active[$i]=1; };
}else{
  @active=split(/,/,$activefields);
}

for (my $i=0; $i<=$size; $i++){ $invertedactive[$i]=0; };
for (my $i=0; $i<scalar(@active); $i++){ $invertedactive[$active[$i]]=1; };

my $j=0;
for (my $i=1; $i<=$size; $i++){ if (!$invertedactive[$i]){$notactive[$j]=$i; $j++}};

if ($debug>0){
  print STDERR "ORIGINAL SIZE: $size\n";
  print STDERR "ORIGINAL WEIGHTS: @weight\n";
  print STDERR "ORIGINAL ACTIVE: @active\n";
  print STDERR "ORIGINAL NOTACTIVE: @notactive\n";
  print STDERR "ORIGINAL INVERTEDACTIVE: @invertedactive\n";
}

while(chomp($_=<STDIN>)){
  my @field=(0,split(/[ \t]+/,$_));
  my $notactivedweightedsum=0.0;
  my $j;
  for (my $i=0; $i<scalar(@notactive); $i++){
    $j=$notactive[$i];
    $notactivedweightedsum+=($weight[$j]*$field[$j]);
    printf STDERR "notactive -> i:$i j:$j -> $weight[$j] - $field[$j] -> $notactivedweightedsum\n" if $debug>0;
  };
  printf STDOUT "%.3f",$notactivedweightedsum;
  printf STDERR "sum not active features: %.3f\n",$notactivedweightedsum if $debug>0;
  for (my $i=0; $i<scalar(@active); $i++){
    print STDOUT " $field[$active[$i]]";
    printf STDERR "active -> i:$i j:$active[$i] -> $field[$active[$i]]\n" if $debug>0;
  };
  for (my $i=scalar(@active)+scalar(@notactive)+1; $i< scalar(@field); $i++){
    print STDOUT " $field[$i]";
    printf STDERR "extra -> i:$i -> $field[$i]\n" if $debug>0;
  };
  print STDOUT "\n";
}

View File

@ -1,8 +0,0 @@
#!/bin/sh
unset LANG
export PATH=$PATH:/group/project/statmt/pkoehn/user/abhishek:/group/project/statmt/pkoehn/user/abhishek/cmert-0.5
export EVAL=/group/project/statmt/pkoehn/user/abhishek/WST05/fr-en-train/dev
mert-driver cmert-work $EVAL/low.test400.fr.rest $EVAL/low.test400.en 100 pharaoh.2005-07-21 "-config /group/project/statmt/pkoehn/user/abhishek/WST05/fr-en-train/model/pharaoh.ini -dl 4 -b 0.1 -ttable-limit 100" "0.2,0-1;0.2,0.2-0.2;0.2,0-1;0.2,0-1;0.2,0-1;0.2,0-1;0.2,-1-1;0.2,-1-1"

View File

@ -1,109 +0,0 @@
#!/usr/bin/python

# $Id$

"""Convert n-best list in mert.perl format to format required by
Venugopal's MER trainer. This entails calculating the BLEU component scores."""

"""usage: score-nbest.py <reffile>+ <outprefix>
The input should be sorted by sentence number and piped into stdin
Run it like this: sort -mnk 1,1 *.nbest | score-nbest.py ...
"""

import sys, itertools, re
import bleu
#Comment out this line when moving to python 2.4
from sets import Set as set

def process(sentnum, testsents):
    candsfile.write("%d %d\n" % (cur_sentnum, len(testsents)))
    for (sent,vector) in testsents:
        comps = bleu.cook_test(sent, cookedrefs[sentnum])
        if comps['testlen'] != comps['guess'][0]:
            sys.stderr.write("ERROR: test length != guessed 1-grams\n")
        featsfile.write("%s %s %d\n" % (" ".join([str(v) for v in vector]),
                                        " ".join(["%d %d" % (c,g) for (c,g) in zip(comps['correct'], comps['guess'])]),
                                        comps['reflen']))

if __name__ == "__main__":
    import os
    machtype=os.environ.get("MACHTYPE")
    if machtype == "i386":
        #import psyco
        #psyco.full()
        sys.stderr.write("psyco library is NOT imported. Uncomment code in score-nbest.py if you wish to enable it\n")
    else:
        sys.stderr.write("psyco library is not imported because it is not available for %s \n" % machtype)

    import getopt
    (opts,args) = getopt.getopt(sys.argv[1:], "casen", [])
    for (opt,parm) in opts:
        if opt == "-c":
            bleu.preserve_case = True
        if opt == "-a":
            bleu.eff_ref_len = "average"
        if opt == "-s":
            bleu.eff_ref_len = "shortest"
        if opt == "-e":
            bleu.eff_ref_len = "closest"
        if opt == "-n":
            bleu.nonorm = 1

    print args
    cookedrefs = []
    reffiles = [file(name) for name in args[:-1]]
    print reffiles
    for refs in itertools.izip(*reffiles):
        cookedrefs.append(bleu.cook_refs(refs))

    outprefix = args[-1]
    featsfile = file(outprefix+"feats.opt", "w")
    candsfile = file(outprefix+"cands.opt", "w")

    cur_sentnum = None
    testsents = set()
    progress = 0
    infile = sys.stdin

    # function that recognizes floats
    re_float=re.compile(r'^-?[-0-9.e\+]+$')
    is_float=lambda(x):re_float.match(x)

    for line in infile:
        try:
            ##Changed to add a further field - AA 29/11/05
            #(sentnum, sent, vector) = line.split('|||')
            (sentnum, sent, vector, prob ) = line.split('|||')
        except:
            sys.stderr.write("ERROR: bad input line %s\n" % line)
        sentnum = int(sentnum)
        sent = " ".join(sent.split())
        # filter out score labels (keep only floats) and convert numbers to floats
        vector = tuple(map(lambda(s): -float(s), filter(is_float, vector.split())))
        if sentnum != cur_sentnum:
            if cur_sentnum is not None:
                process(cur_sentnum, testsents)
            cur_sentnum = sentnum
            testsents = set()
        testsents.add((sent,vector))
        if progress % 10000 == 0:
            sys.stdout.write(".")
            sys.stdout.flush()
        progress += 1

    process(cur_sentnum, testsents)
    sys.stdout.write("\n")
    featsfile.close()
    candsfile.close()

View File

@ -1,34 +0,0 @@
// $Id$

#include <math.h>
#include <stdio.h>

#include "score.h"

int comps_n = 9;

void comps_addto(int *comps1, int *comps2) {
  int i;
  for (i=0; i<comps_n; i++)
    comps1[i] += comps2[i];
}

float compute_score(int *comps) {
  float logbleu = 0.0, brevity;
  int i;
  int n = (comps_n-1)/2;

  /*for (i=0; i<comps_n; i++)
      fprintf(stderr, " %d", comps[i]);
    fprintf(stderr, "\n");*/

  for (i=0; i<n; i++) {
    if (comps[2*i] == 0)
      return 0.0;
    logbleu += log(comps[2*i])-log(comps[2*i+1]);
  }
  logbleu /= n;
  brevity = 1.0-(float)comps[comps_n-1]/comps[1]; // comps[comps_n-1] is the ref length, comps[1] is the test length
  if (brevity < 0.0)
    logbleu += brevity;
  return exp(logbleu);
}

View File

@ -1,10 +0,0 @@
// $Id$
#ifndef SCORE_H
#define SCORE_H
extern int comps_n;
void comps_addto(int *comps1, int *comps2);
float compute_score(int *comps);
#endif

File diff suppressed because it is too large

Binary file not shown.

Binary file not shown.