mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-09-19 23:27:46 +03:00
ef028446f3
This is not pleasant to read (and much, much less pleasant to write!) but sort of necessary in an open project. Right now it's quite hard to figure out what is licensed how, which doesn't matter much to most people but can suddenly become very important when people want to know what they're being allowed to do. I kept the notices as short as I could. As far as I could see, everything without a clear license notice is LGPL v2.1 or later.
1464 lines
53 KiB
PHP
1464 lines
53 KiB
PHP
<?php
|
|
|
|
/*
|
|
This file is part of moses. Its use is licensed under the GNU Lesser General
|
|
Public License version 2.1 or, at your option, any later version.
|
|
*/
|
|
|
|
# main page frame, triggers the loading of parts
|
|
function show_analysis() {
|
|
global $task,$user,$setup,$id,$set;
|
|
global $dir;
|
|
|
|
head("Analysis: $task ($user), Set $set, Run $id");
|
|
|
|
?><script>
|
|
function show(field,sort,count,filter) {
|
|
var url = '?analysis=' + field + '_show'
|
|
+ '&setup=<?php print $setup ?>'
|
|
+ '&id=<?php print $id ?>'
|
|
+ '&set=<?php print $set ?>'
|
|
+ '&sort=' + sort
|
|
+ '&count=' + count
|
|
+ '&filter=' + filter;
|
|
new Ajax.Updater(field, url, { method: 'get', evalScripts: true });
|
|
}
|
|
function ngram_show(type,order,count,sort,smooth) {
|
|
var url = '?analysis=ngram_' + type + '_show'
|
|
+ '&setup=<?php print $setup ?>'
|
|
+ '&id=<?php print $id ?>'
|
|
+ '&set=<?php print $set ?>'
|
|
+ '&order=' + order
|
|
+ '&smooth=' + smooth
|
|
+ '&sort=' + sort
|
|
+ '&count=' + count;
|
|
var field = (type == "precision" ? "nGramPrecision" : "nGramRecall") + order;
|
|
new Ajax.Updater(field, url, { method: 'get', evalScripts: true });
|
|
}
|
|
function generic_show(field,parameters) {
|
|
var url = '?analysis=' + field + '_show'
|
|
+ '&setup=<?php print $setup ?>'
|
|
+ '&id=<?php print $id ?>'
|
|
+ '&set=<?php print $set ?>'
|
|
+ '&' + parameters;
|
|
new Ajax.Updater(field, url, { method: 'get', evalScripts: true });
|
|
}
|
|
function highlight_phrase(sentence,phrase) {
|
|
var input = "input-"+sentence+"-"+phrase;
|
|
$(input).setStyle({ borderColor: 'red' });
|
|
var output = "output-"+sentence+"-"+phrase;
|
|
$(output).setStyle({ borderColor: 'red' });
|
|
}
|
|
function show_word_info(sentence,cc,tc,te) {
|
|
var info = "info-"+sentence;
|
|
document.getElementById(info).innerHTML = ''+cc+' occurrences in corpus, '+tc+' distinct translations, translation entropy: '+te;
|
|
$(info).setStyle({ opacity: 1 });
|
|
}
|
|
function lowlight_phrase(sentence,phrase) {
|
|
var input = "input-"+sentence+"-"+phrase;
|
|
$(input).setStyle({ borderColor: 'black' });
|
|
var output = "output-"+sentence+"-"+phrase;
|
|
$(output).setStyle({ borderColor: 'black' });
|
|
}
|
|
function hide_word_info(sentence) {
|
|
var info = "info-"+sentence;
|
|
$(info).setStyle({ opacity: 0 });
|
|
}
|
|
function show_biconcor(sentence,phrase) {
|
|
var div = "biconcor-"+sentence;
|
|
var url = '?analysis=biconcor'
|
|
+ '&setup=<?php print $setup ?>&id=<?php print get_biconcor_version($dir,$set,$id); ?>&set=<?php print $set ?>'
|
|
+ '&sentence=' + sentence
|
|
+ '&phrase=' + encodeURIComponent(phrase);
|
|
document.getElementById(div).innerHTML = "<center><img src=\"spinner.gif\" width=48 height=48></center>";
|
|
$(div).setStyle({ borderStyle: 'solid', 'border-width': '3px', borderColor: 'black' });
|
|
new Ajax.Updater(div, url, { method: 'get', evalScripts: true });
|
|
}
|
|
function close_biconcor(sentence) {
|
|
var div = "biconcor-"+sentence;
|
|
document.getElementById(div).innerHTML = "";
|
|
$(div).setStyle({ borderStyle: 'none', 'border-width': '0px', borderColor: 'white' });
|
|
}
|
|
|
|
</script>
|
|
</head>
|
|
<body>
|
|
<div id="nGramSummary"><?php ngram_summary() ?></div>
|
|
<div id="CoverageDetails"></div>
|
|
<div id="PrecisionByCoverage"></div>
|
|
<div id="PrecisionRecallDetails"></div>
|
|
<div id="bleu">(loading...)</div>
|
|
<script language="javascript">
|
|
show('bleu','',5,'');
|
|
</script>
|
|
</body></html>
|
|
<?php
|
|
}
|
|
|
|
function precision_by_coverage() {
|
|
global $experiment,$evalset,$dir,$set,$id;
|
|
$img_width = 1000;
|
|
|
|
print "<h3>Precision of Input Words by Coverage</h3>";
|
|
print "The graphs display what ratio of words of a specific type are translated correctly (yellow), and what ratio is deleted (blue).";
|
|
print " The extend of the boxes is scaled on the x-axis by the number of tokens of the displayed type.";
|
|
|
|
// load data
|
|
$data = file(get_current_analysis_filename("precision","precision-by-corpus-coverage"));
|
|
$total = 0;
|
|
$log_info = array();
|
|
for($i=0;$i<count($data);$i++) {
|
|
$item = split("\t",$data[$i]);
|
|
$info[$item[0]]["precision"] = $item[1];
|
|
$info[$item[0]]["delete"] = $item[2];
|
|
$info[$item[0]]["length"] = $item[3];
|
|
$info[$item[0]]["total"] = $item[4];
|
|
$total += $item[4];
|
|
$log_count = -1;
|
|
if ($item[0]>0) {
|
|
$log_count = (int) (log($item[0])/log(2));
|
|
}
|
|
if (!array_key_exists($log_count,$log_info)) {
|
|
$log_info[$log_count]["precision"] = 0;
|
|
$log_info[$log_count]["delete"] = 0;
|
|
$log_info[$log_count]["length"] = 0;
|
|
$log_info[$log_count]["total"] = 0;
|
|
}
|
|
$log_info[$log_count]["precision"] += $item[1];
|
|
$log_info[$log_count]["delete"] += $item[2];
|
|
$log_info[$log_count]["length"] += $item[3];
|
|
$log_info[$log_count]["total"] += $item[4];
|
|
}
|
|
print "<h4>By log<sub>2</sub>-count in the training corpus</h4>";
|
|
precision_by_coverage_graph("byCoverage",$log_info,$total,$img_width,SORT_NUMERIC);
|
|
|
|
# load factored data
|
|
$d = dir("$dir/evaluation/$set.analysis.".get_precision_analysis_version($dir,$set,$id));
|
|
while (false !== ($file = $d->read())) {
|
|
if (preg_match('/precision-by-corpus-coverage.(.+)$/',$file, $match)) {
|
|
precision_by_coverage_factored($img_width,$total,$file,$match[1]);
|
|
}
|
|
}
|
|
}
|
|
|
|
function precision_by_coverage_factored($img_width,$total,$file,$factor_id) {
|
|
global $dir,$set,$id;
|
|
$data = file(get_current_analysis_filename("precision",$file));
|
|
for($i=0;$i<count($data);$i++) {
|
|
$item = split("\t",$data[$i]);
|
|
$factor = $item[0];
|
|
$count = $item[1];
|
|
$info_factored[$factor][$count]["precision"] = $item[2];
|
|
$info_factored[$factor][$count]["delete"] = $item[3];
|
|
$info_factored[$factor][$count]["length"] = $item[4];
|
|
$info_factored[$factor][$count]["total"] = $item[5];
|
|
$info_factored_sum[$factor]["precision"] += $item[2];
|
|
$info_factored_sum[$factor]["delete"] += $item[3];
|
|
$info_factored_sum[$factor]["length"] += $item[4];
|
|
$info_factored_sum[$factor]["total"] += $item[5];
|
|
$total_factored[$factor] += $item[5];
|
|
$log_count = -1;
|
|
if ($count>0) {
|
|
$log_count = (int) (log($count)/log(2));
|
|
}
|
|
$log_info_factored[$factor][$log_count]["precision"] += $item[2];
|
|
$log_info_factored[$factor][$log_count]["delete"] += $item[3];
|
|
$log_info_factored[$factor][$log_count]["length"] += $item[4];
|
|
$log_info_factored[$factor][$log_count]["total"] += $item[5];
|
|
}
|
|
print "<h4>By factor ".factor_name("input",$factor_id)."</h4>";
|
|
precision_by_coverage_graph("byFactor",$info_factored_sum,$total,$img_width,SORT_STRING);
|
|
|
|
print "<h4>For each factor, by log<sub>2</sub>-count in the corpus</h4>";
|
|
foreach ($log_info_factored as $factor => $info) {
|
|
if ($total_factored[$factor]/$total > 0.01) {
|
|
print "<table style=\"display:inline;\"><tr><td align=center><font size=-2><b>$factor</b></font></td></tr><tr><td align=center>";
|
|
precision_by_coverage_graph("byCoverageFactor$factor",$info,$total_factored[$factor],10+2*$img_width*$total_factored[$factor]/$total,SORT_NUMERIC);
|
|
print "</td></tr></table>";
|
|
}
|
|
}
|
|
}
|
|
|
|
function precision_by_word($type) {
|
|
global $dir,$set,$id;
|
|
$byCoverage = -2;
|
|
$byFactor = "false";
|
|
if ($type == "byCoverage") {
|
|
$byCoverage = (int) $_GET["type"];
|
|
}
|
|
else if ($type == "byFactor") {
|
|
$byFactor = $_GET["type"];
|
|
}
|
|
else if (preg_match("/byCoverageFactor(.+)/",$type,$match)) {
|
|
$byCoverage = (int) $_GET["type"];
|
|
$byFactor = $match[1];
|
|
}
|
|
|
|
$data = file(get_current_analysis_filename("precision","precision-by-input-word"));
|
|
for($i=0;$i<count($data);$i++) {
|
|
$line = rtrim($data[$i]);
|
|
$item = split("\t",$line);
|
|
|
|
//# filter for count
|
|
$count = $item[4];
|
|
$log_count = -1;
|
|
if ($count>0) {
|
|
$log_count = (int) (log($count)/log(2));
|
|
}
|
|
if ($byCoverage != -2 && $byCoverage != $log_count) {
|
|
continue;
|
|
}
|
|
|
|
//# filter for factor
|
|
$word = $item[5];
|
|
if ($byFactor != "false" && $byFactor != $item[6]) {
|
|
continue;
|
|
}
|
|
|
|
$info[$word]["precision"] = $item[0];
|
|
$info[$word]["delete"] = $item[1];
|
|
$info[$word]["length"] = $item[2];
|
|
$info[$word]["total"] = $item[3];
|
|
$total += $item[3];
|
|
}
|
|
|
|
print "<table border=1><tr><td align=center>Count</td><td align=center colspan=2>Precision</td><td align=center colspan=2>Delete</td><td align=center>Length</td></tr>\n";
|
|
foreach ($info as $word => $wordinfo) {
|
|
print "<tr><td align=center><a href=\"javascript:show('bleu','order',5,'".base64_encode($word)."')\">$word</a></td>";
|
|
printf("<td align=right>%.1f%s</td><td align=right><font size=-1>%.1f/%d</font></td>",$wordinfo["precision"]/$wordinfo["total"]*100,"%",$wordinfo["precision"],$wordinfo["total"]);
|
|
printf("<td align=right>%.1f%s</td><td align=right><font size=-1>%d/%d</font></td>",$wordinfo["delete"]/$wordinfo["total"]*100,"%",$wordinfo["delete"],$wordinfo["total"]);
|
|
printf("<td align=right>%.3f</td>",$wordinfo["length"]/$wordinfo["total"]);
|
|
print "</tr>";
|
|
}
|
|
print "</table>\n";
|
|
}
|
|
|
|
function precision_by_coverage_latex($name,$log_info,$total,$img_width,$sort_type) {
|
|
$keys = array_keys($log_info);
|
|
sort($keys,$sort_type);
|
|
|
|
$img_width /= 100;
|
|
print "<div id=\"LatexToggle$name\" onClick=\"document.getElementById('Latex$name').style.display = 'block'; this.style.display = 'none';\" style=\"display:none;\"><font size=-2>(show LaTeX)</font></div>\n";
|
|
print "<div id=\"Latex$name\" style=\"display:none;\">\n";
|
|
print "<code>\\begin{tikzpicture}<br>";
|
|
|
|
print "% co-ordinates for precision<br>";
|
|
for($line=0;$line<=9;$line++) {
|
|
$height = 1.8-$line/10*1.8;
|
|
print "\\draw[thin,lightgray] (0.2,-$height) ";
|
|
print "node[anchor=east,black] {".$line."0\\%} -- ";
|
|
print "($img_width,-$height) ;<br>\n";
|
|
}
|
|
print "% co-ordinates for deletion<br>\n";
|
|
for($line=0;$line<=3;$line++) {
|
|
$height = 2+$line/10*1.80;
|
|
print "\\draw[thin,lightgray] (0.2,-$height) ";
|
|
if ($line != 0) {
|
|
print "node[anchor=east,black] {".$line."0\\%} ";
|
|
}
|
|
print "-- ($img_width,-$height) ;<br>\n";
|
|
}
|
|
|
|
print "% boxes<br>\n";
|
|
$total_so_far = 0;
|
|
foreach ($keys as $i) {
|
|
$prec_ratio = $log_info[$i]["precision"]/$log_info[$i]["total"];
|
|
$x = .2+($img_width-.2) * $total_so_far/$total;
|
|
$y = 1.80-($prec_ratio*1.80);
|
|
$width = $img_width * $log_info[$i]["total"]/$total;
|
|
$height = $prec_ratio*1.80;
|
|
|
|
$width += $x;
|
|
$height += $y;
|
|
|
|
print "\\filldraw[very thin,gray] ($x,-$y) rectangle($width,-$height) ;<br>";
|
|
print "\\draw[very thin,black] ($x,-$y) rectangle($width,-$height);<br>";
|
|
if ($width-$x>.1) {
|
|
print "\\draw (".(($x+$width)/2).",-1.8) node[anchor=north,black] {".$i."};<br>";
|
|
}
|
|
|
|
|
|
$del_ratio = $log_info[$i]["delete"]/$log_info[$i]["total"];
|
|
$height = $del_ratio*1.80;
|
|
|
|
$height += 2;
|
|
|
|
print "\\filldraw[very thin,lightgray] ($x,-2) rectangle($width,-$height);<br>\n";
|
|
print "\\draw[very thin,black] ($x,-2) rectangle($width,-$height);<br>\n";
|
|
|
|
$total_so_far += $log_info[$i]["total"];
|
|
}
|
|
print "\\end{tikzpicture}</code>";
|
|
print "</div>";
|
|
}
|
|
|
|
function precision_by_coverage_graph($name,$log_info,$total,$img_width,$sort_type) {
|
|
|
|
$keys = array_keys($log_info);
|
|
sort($keys,$sort_type);
|
|
|
|
print "<div id=\"Toggle$name\" onClick=\"document.getElementById('Table$name').style.display = 'none'; document.getElementById('LatexToggle$name').style.display = 'none'; document.getElementById('Latex$name').style.display = 'none'; this.style.display = 'none';\" style=\"display:none;\"><font size=-2>(hide table)</font></div>\n";
|
|
precision_by_coverage_latex($name,$log_info,$total,$img_width,$sort_type);
|
|
|
|
print "<div id=\"Table$name\" style=\"display:none;\">\n";
|
|
print "<table border=1><tr><td align=center>Count</td><td align=center colspan=2>Precision</td><td align=center colspan=2>Delete</td><td align=center>Length</td></tr>\n";
|
|
foreach ($keys as $i) {
|
|
if (array_key_exists($i,$log_info)) {
|
|
print "<tr><td align=center>$i</td>";
|
|
printf("<td align=right>%.1f%s</td><td align=right><font size=-1>%.1f/%d</font></td>",$log_info[$i]["precision"]/$log_info[$i]["total"]*100,"%",$log_info[$i]["precision"],$log_info[$i]["total"]);
|
|
printf("<td align=right>%.1f%s</td><td align=right><font size=-1>%d/%d</font></td>",$log_info[$i]["delete"]/$log_info[$i]["total"]*100,"%",$log_info[$i]["delete"],$log_info[$i]["total"]);
|
|
printf("<td align=right>%.3f</td>",$log_info[$i]["length"]/$log_info[$i]["total"]);
|
|
print "<td><A HREF=\"javascript:generic_show('PrecisionByWord$name','type=$i')\">Ⓘ</A></td>";
|
|
print "</tr>";
|
|
}
|
|
}
|
|
print "</table><div id=\"PrecisionByWord$name\"></div></div>";
|
|
|
|
print "<div id=\"Graph$name\" onClick=\"document.getElementById('Table$name').style.display = 'block'; document.getElementById('LatexToggle$name').style.display = 'block'; document.getElementById('Toggle$name').style.display = 'block';\">";
|
|
print "<canvas id=\"$name\" width=$img_width height=300></canvas></div>";
|
|
print "<script language=\"javascript\">
|
|
var canvas = document.getElementById(\"$name\");
|
|
var ctx = canvas.getContext(\"2d\");
|
|
ctx.lineWidth = 0.5;
|
|
ctx.font = '9px serif';
|
|
";
|
|
for($line=0;$line<=9;$line++) {
|
|
$height = 180-$line/10*180;
|
|
print "ctx.moveTo(20, $height);\n";
|
|
print "ctx.lineTo($img_width, $height);\n";
|
|
if ($line != 0) {
|
|
print "ctx.fillText(\"${line}0\%\", 0, $height+4);";
|
|
}
|
|
}
|
|
for($line=0;$line<=3;$line++) {
|
|
$height = 200+$line/10*180;
|
|
print "ctx.moveTo(20, $height);\n";
|
|
print "ctx.lineTo($img_width, $height);\n";
|
|
if ($line != 0) {
|
|
print "ctx.fillText(\"${line}0\%\", 0, $height+4);";
|
|
}
|
|
}
|
|
print "ctx.strokeStyle = \"rgb(100,100,100)\"; ctx.stroke();\n";
|
|
|
|
$total_so_far = 0;
|
|
foreach ($keys as $i) {
|
|
$prec_ratio = $log_info[$i]["precision"]/$log_info[$i]["total"];
|
|
$x = (int)(20+($img_width-20) * $total_so_far / $total);
|
|
$y = (int)(180-($prec_ratio*180));
|
|
$width = (int)($img_width * $log_info[$i]["total"]/$total);
|
|
$height = (int)($prec_ratio*180);
|
|
print "ctx.fillStyle = \"rgb(200,200,0)\";";
|
|
print "ctx.fillRect ($x, $y, $width, $height);";
|
|
|
|
$del_ratio = $log_info[$i]["delete"]/$log_info[$i]["total"];
|
|
$height = (int)($del_ratio*180);
|
|
print "ctx.fillStyle = \"rgb(100,100,255)\";";
|
|
print "ctx.fillRect ($x, 200, $width, $height);";
|
|
|
|
$total_so_far += $log_info[$i]["total"];
|
|
|
|
if ($width>3) {
|
|
print "ctx.fillStyle = \"rgb(0,0,0)\";";
|
|
// print "ctx.rotate(-1.5707);";
|
|
print "ctx.fillText(\"$i\", $x+$width/2-3, 190);";
|
|
//print "ctx.rotate(1.5707);";
|
|
}
|
|
}
|
|
print "</script>";
|
|
}
|
|
|
|
//# stats on precision and recall
|
|
function precision_recall_details() {
|
|
?>
|
|
<table width=100%>
|
|
<tr>
|
|
<td width=25% valign=top><div id="nGramPrecision1">(loading...)</div></td>
|
|
<td width=25% valign=top><div id="nGramPrecision2">(loading...)</div></td>
|
|
<td width=25% valign=top><div id="nGramPrecision3">(loading...)</div></td>
|
|
<td width=25% valign=top><div id="nGramPrecision4">(loading...)</div></td>
|
|
</tr><tr>
|
|
<td width=25% valign=top><div id="nGramRecall1">(loading...)</div></td>
|
|
<td width=25% valign=top><div id="nGramRecall2">(loading...)</div></td>
|
|
<td width=25% valign=top><div id="nGramRecall3">(loading...)</div></td>
|
|
<td width=25% valign=top><div id="nGramRecall4">(loading...)</div></td>
|
|
</tr></table>
|
|
<script language="javascript">
|
|
ngram_show('precision',1,5,'',0);
|
|
ngram_show('precision',2,5,'',0);
|
|
ngram_show('precision',3,5,'',0);
|
|
ngram_show('precision',4,5,'',0);
|
|
ngram_show('recall',1,5,'',0);
|
|
ngram_show('recall',2,5,'',0);
|
|
ngram_show('recall',3,5,'',0);
|
|
ngram_show('recall',4,5,'',0);
|
|
</script>
|
|
<?php
|
|
}
|
|
|
|
//# stats on ngram precision
|
|
function ngram_summary() {
|
|
global $experiment,$evalset,$dir,$set,$id;
|
|
|
|
//# load data
|
|
$data = file(get_current_analysis_filename("basic","summary"));
|
|
for($i=0;$i<count($data);$i++) {
|
|
$item = split(": ",$data[$i]);
|
|
$info[$item[0]] = $item[1];
|
|
}
|
|
|
|
print "<table cellspacing=5 width=100%><tr><td valign=top align=center bgcolor=#eeeeee>";
|
|
//#foreach (array("precision","recall") as $type) {
|
|
print "<b>Precision of Output</b>\n";
|
|
$type = "precision";
|
|
print "<table><tr><td>$type</td><td>1-gram</td><td>2-gram</td><td>3-gram</td><td>4-gram</td></tr>\n";
|
|
printf("<tr><td>correct</td><td>%d</td><td>%d</td><td>%d</td><td>%d</td></tr>\n",
|
|
$info["$type-1-correct"],
|
|
$info["$type-2-correct"],
|
|
$info["$type-3-correct"],
|
|
$info["$type-4-correct"]);
|
|
printf("<tr><td> </td><td>%.1f%s</td><td>%.1f%s</td><td>%.1f%s</td><td>%.1f%s</td></tr>\n",
|
|
$info["$type-1-correct"]/$info["$type-1-total"]*100,'%',
|
|
$info["$type-2-correct"]/$info["$type-2-total"]*100,'%',
|
|
$info["$type-3-correct"]/$info["$type-3-total"]*100,'%',
|
|
$info["$type-4-correct"]/$info["$type-4-total"]*100,'%');
|
|
printf("<tr><td>wrong</td><td>%d</td><td>%d</td><td>%d</td><td>%d</td></tr>\n",
|
|
$info["$type-1-total"]-$info["$type-1-correct"],
|
|
$info["$type-2-total"]-$info["$type-2-correct"],
|
|
$info["$type-3-total"]-$info["$type-3-correct"],
|
|
$info["$type-4-total"]-$info["$type-4-correct"]);
|
|
print "</table>";
|
|
//}
|
|
|
|
print "<A HREF=\"javascript:generic_show('PrecisionRecallDetails','')\">details</A> ";
|
|
if (file_exists(get_current_analysis_filename("precision","precision-by-corpus-coverage"))) {
|
|
print "| <A HREF=\"javascript:generic_show('PrecisionByCoverage','')\">precision of input by coverage</A> ";
|
|
}
|
|
|
|
print "</td><td valign=top valign=top align=center bgcolor=#eeeeee>";
|
|
|
|
$each_score = explode(" ; ",$experiment[$id]->result[$set]);
|
|
$header = "";
|
|
$score_line = "";
|
|
for($i=0;$i<count($each_score);$i++) {
|
|
if (preg_match('/([\d\(\)\.\s]+) (BLEU[\-c]*)/',$each_score[$i],$match) ||
|
|
preg_match('/([\d\(\)\.\s]+) (IBM[\-c]*)/',$each_score[$i],$match) ||
|
|
preg_match('/([\d\(\)\.\s]+) (METEOR[\-c]*)/',$each_score[$i],$match)) {
|
|
$header .= "<td>$match[2]</td>";
|
|
$score_line .= "<td>$match[1]</td>";
|
|
}
|
|
}
|
|
print "<b>Metrics</b><table border=1><tr>".$header."</tr><tr>".$score_line."</tr></table>";
|
|
|
|
printf("<p>length-diff: %d (%.1f%s)",$info["precision-1-total"]-$info["recall-1-total"],($info["precision-1-total"]-$info["recall-1-total"])/$info["recall-1-total"]*100,"%");
|
|
|
|
// coverage
|
|
if (file_exists(get_current_analysis_filename("coverage","corpus-coverage-summary"))) {
|
|
print "</td><td valign=top align=center bgcolor=#eeeeee>";
|
|
print "<div id=\"CoverageSummary\">";
|
|
coverage_summary();
|
|
print "</div>";
|
|
}
|
|
|
|
// phrase segmentation
|
|
if (file_exists(get_current_analysis_filename("basic","segmentation")) ||
|
|
file_exists(get_current_analysis_filename("basic","rule"))) {
|
|
print "</td><td valign=top align=center bgcolor=#eeeeee>";
|
|
print "<div id=\"SegmentationSummary\">";
|
|
segmentation_summary();
|
|
print "</div>";
|
|
}
|
|
|
|
// rules
|
|
if (file_exists(get_current_analysis_filename("basic","rule"))) {
|
|
print "</td><td valign=top align=center bgcolor=#eeeeee>";
|
|
print "<div id=\"RuleSummary\">";
|
|
rule_summary();
|
|
print "</div>";
|
|
}
|
|
|
|
print "</td></tr></table>";
|
|
}
|
|
|
|
// details on ngram precision/recall
|
|
function ngram_show($type) {
|
|
global $set,$id,$dir;
|
|
|
|
// load data
|
|
$order = $_GET['order'];
|
|
$data = file(get_current_analysis_filename("basic","n-gram-$type.$order"));
|
|
for($i=0;$i<count($data);$i++) {
|
|
$item = split("\t",$data[$i]);
|
|
$line["total"] = $item[0];
|
|
$line["correct"] = $item[1];
|
|
$line["ngram"] = $item[2];
|
|
$ngram[] = $line;
|
|
}
|
|
|
|
// sort option
|
|
$sort = $_GET['sort'];
|
|
$smooth = $_GET['smooth'];
|
|
if ($sort == '') {
|
|
$sort = 'ratio_worst';
|
|
$smooth = 1;
|
|
}
|
|
|
|
// sort index
|
|
for($i=0;$i<count($ngram);$i++) {
|
|
if ($sort == "abs_worst") {
|
|
$ngram[$i]["index"] = $ngram[$i]["correct"] - $ngram[$i]["total"];
|
|
}
|
|
else if ($sort == "ratio_worst") {
|
|
$ngram[$i]["index"] = ($ngram[$i]["correct"] + $smooth) / ($ngram[$i]["total"] + $smooth);
|
|
}
|
|
}
|
|
|
|
// sort
|
|
function cmp($a, $b) {
|
|
if ($a["index"] == $b["index"]) {
|
|
return 0;
|
|
}
|
|
return ($a["index"] < $b["index"]) ? -1 : 1;
|
|
}
|
|
|
|
usort($ngram, 'cmp');
|
|
|
|
// display
|
|
$count = $_GET['count'];
|
|
if ($count == 0) { $count = 5; }
|
|
|
|
print "<B>$order-gram $type</B><br><font size=-1>sorted by ";
|
|
if ($sort == "ratio_worst") {
|
|
print "ratio ";
|
|
print "smooth-$smooth ";
|
|
print "<A HREF=\"javascript:ngram_show('$type',$order,$count,'ratio_worst',$smooth+1)\">+</A> ";
|
|
print "<A HREF=\"javascript:ngram_show('$type',$order,$count,'ratio_worst',$smooth-1)\">-</A> ";
|
|
}
|
|
else {
|
|
print "<A HREF=\"javascript:ngram_show('$type',$order,$count,'ratio_worst',1)\">ratio</A> ";
|
|
}
|
|
if ($sort == "abs_worst") {
|
|
print "absolute ";
|
|
}
|
|
else {
|
|
print "<A HREF=\"javascript:ngram_show('$type',$order,$count,'abs_worst',0)\">absolute</A> ";
|
|
}
|
|
print "showing $count ";
|
|
if ($count < 9999) {
|
|
print "<A HREF=\"javascript:ngram_show('$type',$order,$count+5,'$sort',$smooth)\">more</A> ";
|
|
print "<A HREF=\"javascript:ngram_show('$type',$order,9999,'$sort',$smooth)\">all</A> ";
|
|
}
|
|
else {
|
|
print "<A HREF=\"javascript:ngram_show('$type',$order,5,'$sort',$smooth)\">top5</A> ";
|
|
}
|
|
print "</font><br>\n";
|
|
|
|
print "<table width=100%>\n";
|
|
print "<tr><td>$order-gram</td><td>ok</td><td>x</td><td>ratio</td></tr>\n";
|
|
for($i=0;$i<$count && $i<count($ngram);$i++) {
|
|
$line = $ngram[$i];
|
|
print "<tr><td>".$line["ngram"]."</td>";
|
|
print "<td>".$line["correct"]."</td>";
|
|
print "<td>".($line["total"]-$line["correct"])."</td>";
|
|
printf("<td>%.3f</td></tr>",$line["correct"]/$line["total"]);
|
|
}
|
|
print "</table>\n";
|
|
}
|
|
|
|
// details on ngram coverage
|
|
function coverage_details() {
|
|
global $dir,$set,$id;
|
|
|
|
$count = array(); $token = array();
|
|
foreach (array("ttable","corpus") as $corpus) {
|
|
foreach (array("token","type") as $b) {
|
|
for($i=0;$i<=7;$i++) {
|
|
foreach (array("6+","2-5","1","0") as $range) {
|
|
$count[$corpus][$b][$i][$range] = 0;
|
|
}
|
|
$total[$corpus][$b][$i] = 0;
|
|
}
|
|
}
|
|
$data = file(filename_fallback_to_factored(get_current_analysis_filename("coverage","$corpus-coverage-summary")));
|
|
for($i=0;$i<count($data);$i++) {
|
|
$item = split("\t",$data[$i]);
|
|
if ($item[1]>5) {
|
|
$count[$corpus]["type"][$item[0]]["6+"] += $item[2];
|
|
$count[$corpus]["token"][$item[0]]["6+"] += $item[3];
|
|
}
|
|
else if ($item[1]>1) {
|
|
$count[$corpus]["type"][$item[0]]["2-5"] += $item[2];
|
|
$count[$corpus]["token"][$item[0]]["2-5"] += $item[3];
|
|
}
|
|
else if ($item[1]==1) {
|
|
$count[$corpus]["type"][$item[0]]["1"] += $item[2];
|
|
$count[$corpus]["token"][$item[0]]["1"] += $item[3];
|
|
}
|
|
else {
|
|
$count[$corpus]["type"][$item[0]]["0"] += $item[2];
|
|
$count[$corpus]["token"][$item[0]]["0"] += $item[3];
|
|
}
|
|
$total[$corpus]["type"][$item[0]] += $item[2];
|
|
$total[$corpus]["token"][$item[0]] += $item[3];
|
|
}
|
|
}
|
|
print "<b>coverage</b><br>\n";
|
|
print "<table width=100%><tr>";
|
|
foreach (array("token","type") as $by) {
|
|
for($i=1;$i<=4;$i++) {
|
|
print "<td align=center><b>$i-gram ($by)</b><br>\n";
|
|
print "<table><tr><td></td><td>model</td><td>corpus</td></tr>\n";
|
|
foreach (array("0","1","2-5","6+") as $range) {
|
|
print "<tr><td>$range</td>";
|
|
foreach (array("ttable","corpus") as $corpus) {
|
|
printf("<td align=right nowrap>%d (%.1f%s)</td>",$count[$corpus][$by][$i][$range],100*$count[$corpus][$by][$i][$range]/($total[$corpus][$by][$i]+0.0001),"%");
|
|
}
|
|
print "</tr>\n";
|
|
}
|
|
print "</table></td>\n";
|
|
}
|
|
print "</tr><tr>";
|
|
}
|
|
print "</tr></table>\n";
|
|
|
|
$data = file(filename_fallback_to_factored(get_current_analysis_filename("coverage","ttable-unknown")));
|
|
for($i=0;$i<count($data);$i++) {
|
|
list($word,$count) = split("\t",$data[$i]);
|
|
$item["word"] = $word;
|
|
$item["count"] = rtrim($count);
|
|
$unknown[] = $item;
|
|
}
|
|
|
|
function cmp($a,$b) {
|
|
if ($a["count"] > $b["count"]) {
|
|
return -1;
|
|
}
|
|
else if ($a["count"] < $b["count"]) {
|
|
return 1;
|
|
}
|
|
else {
|
|
return strcmp($a["word"],$b["word"]);
|
|
}
|
|
}
|
|
|
|
usort($unknown, 'cmp');
|
|
|
|
print "<b>unknown words (to model)</b><br>\n";
|
|
print "<table><tr><td valign=top><table>";
|
|
$state = 5;
|
|
foreach ($unknown as $item) {
|
|
if ($item["count"] < $state) {
|
|
if ($state == 5) { print "</table>"; }
|
|
print "</td><td valign=top><b>".$item["count"].":</b> ";
|
|
$state = $item["count"];
|
|
if ($state == 1) { print "<font size=-1>"; }
|
|
}
|
|
else if ($state<5) {
|
|
print ", ";
|
|
}
|
|
if ($state == 5) {
|
|
print "<tr><td>".$item["count"]."</td><td>".$item["word"]."</td></tr>";
|
|
}
|
|
else {
|
|
print $item["word"];
|
|
}
|
|
}
|
|
print "</font></td></tr></table>\n";
|
|
}
|
|
|
|
function filename_fallback_to_factored($file) {
|
|
if (file_exists($file)) {
|
|
return $file;
|
|
}
|
|
$path = pathinfo($file);
|
|
$dh = opendir($path['dirname']);
|
|
while (($factored_file = readdir($dh)) !== false) {
|
|
if (strlen($factored_file) > strlen($path['basename']) &&
|
|
substr($factored_file,0,strlen($path['basename'])) == $path['basename'] &&
|
|
preg_match("/0/",substr($factored_file,strlen($path['basename'])))) {
|
|
return $path['dirname']."/".$factored_file;
|
|
}
|
|
}
|
|
// found nothing...
|
|
return $file;
|
|
}
|
|
|
|
function factor_name($input_output,$factor_id) {
|
|
global $dir,$set,$id;
|
|
$file = get_current_analysis_filename("coverage","factor-names");
|
|
if (!file_exists($file)) {
|
|
return $factor_id;
|
|
}
|
|
$in_out_names = file($file);
|
|
$names = explode(",",trim($in_out_names[($input_output == "input")?0:1]));
|
|
return "'".$names[$factor_id]."' ($factor_id)";
|
|
}
|
|
|
|
// stats on ngram coverage
|
|
function coverage_summary() {
|
|
global $dir,$set,$id,$corpus;
|
|
|
|
if (array_key_exists("by",$_GET)) { $by = $_GET['by']; }
|
|
else { $by = 'token'; }
|
|
|
|
$total = array(); $count = array();
|
|
foreach (array("ttable","corpus") as $corpus) {
|
|
foreach (array("token","type") as $b) {
|
|
foreach (array("6+","2-5","1","0") as $c) {
|
|
$count[$corpus][$b][$c] = 0;
|
|
}
|
|
$total[$corpus][$b] = 0;
|
|
}
|
|
$data = file(filename_fallback_to_factored(get_current_analysis_filename("coverage","$corpus-coverage-summary")));
|
|
for($i=0;$i<count($data);$i++) {
|
|
$item = split("\t",$data[$i]);
|
|
if ($item[0] == 1) {
|
|
if ($item[1]>5) {
|
|
$count[$corpus]["type"]["6+"] += $item[2];
|
|
$count[$corpus]["token"]["6+"] += $item[3];
|
|
}
|
|
else if ($item[1]>1) {
|
|
$count[$corpus]["type"]["2-5"] += $item[2];
|
|
$count[$corpus]["token"]["2-5"] += $item[3];
|
|
}
|
|
else if ($item[1]==1) {
|
|
$count[$corpus]["type"]["1"] += $item[2];
|
|
$count[$corpus]["token"]["1"] += $item[3];
|
|
}
|
|
else {
|
|
$count[$corpus]["type"]["0"] += $item[2];
|
|
$count[$corpus]["token"]["0"] += $item[3];
|
|
}
|
|
$total[$corpus]["type"] += $item[2];
|
|
$total[$corpus]["token"] += $item[3];
|
|
}
|
|
}
|
|
}
|
|
|
|
print "<b>Coverage</b>\n";
|
|
print "<table><tr><td></td><td>model</td><td>corpus</td></tr>\n";
|
|
foreach (array("0","1","2-5","6+") as $range) {
|
|
print "<tr><td>$range</td>";
|
|
foreach (array("ttable","corpus") as $corpus) {
|
|
printf("<td align=right nowrap>%d (%.1f%s)</td>",$count[$corpus][$by][$range],100*$count[$corpus][$by][$range]/($total[$corpus][$by]+0.0001),"%");
|
|
}
|
|
print "</tr>\n";
|
|
}
|
|
print "</table>\n";
|
|
if ($by == 'token') { print "by token"; } else {
|
|
print "<A HREF=\"javascript:generic_show('CoverageSummary','by=token')\">by token</A> ";
|
|
}
|
|
print " / ";
|
|
if ($by == 'type') { print "by type"; } else {
|
|
print "<A HREF=\"javascript:generic_show('CoverageSummary','by=type')\">by type</A> ";
|
|
}
|
|
print " / ";
|
|
print "<div id=\"CoverageDetailsLink\"><A HREF=\"javascript:generic_show('CoverageDetails','')\">details</A></div> ";
|
|
}
|
|
|
|
|
|
// stats on segmenation (phrase-based)
|
|
function segmentation_summary() {
|
|
global $dir,$set,$id;
|
|
|
|
if (array_key_exists("by",$_GET)) { $by = $_GET['by']; }
|
|
else { $by = 'word'; }
|
|
|
|
$count = array();
|
|
for($i=0;$i<=4;$i++) {
|
|
$count[$i] = array();
|
|
for($j=0;$j<=4;$j++) {
|
|
$count[$i][$j] = 0;
|
|
}
|
|
}
|
|
|
|
$total = 0;
|
|
$file = get_current_analysis_filename("basic","segmentation");
|
|
if (file_exists($file)) {
|
|
$data = file($file);
|
|
for($i=0;$i<count($data);$i++) {
|
|
list($in,$out,$c) = split("\t",$data[$i]);
|
|
if ($by == "word") { $c *= $in; }
|
|
if ($in>4) { $in = 4; }
|
|
if ($out>4) { $out = 4; }
|
|
$total += $c;
|
|
$count[$in][$out] += $c;
|
|
}
|
|
}
|
|
else {
|
|
$data = file(get_current_analysis_filename("basic","rule"));
|
|
for($i=0;$i<count($data);$i++) {
|
|
$field = split("\t",$data[$i]);
|
|
$type = $field[0];
|
|
$rule = $field[1];
|
|
if (count($field) > 2) { $c = $field[2]; } else { $c = 0; }
|
|
if ($type == "rule") {
|
|
list($rule_in,$in,$nt,$rule_out,$out) = split(":",$rule);
|
|
if ($by == "word") { $c *= $in; }
|
|
if ($in>4) { $in = 4; }
|
|
if ($out>4) { $out = 4; }
|
|
$total += $c;
|
|
$count[$in][$out] += $c;
|
|
}
|
|
}
|
|
}
|
|
|
|
print "<b>Phrase Segmentation</b><br>\n";
|
|
print "<table>";
|
|
print "<tr><td></td><td align=center>1</td><td align=center>2</td><td align=center>3</td><td align=center>4+</td></tr>";
|
|
for($in=1;$in<=4;$in++) {
|
|
print "<tr><td nowrap>$in".($in==4?"+":"")." to</td>";
|
|
for($out=1;$out<=4;$out++) {
|
|
if (array_key_exists($in,$count) &&
|
|
array_key_exists($out,$count[$in])) {
|
|
$c = $count[$in][$out];
|
|
}
|
|
else { $c = 0; }
|
|
printf("<td align=right nowrap>%d (%.1f%s)</td>",$c,100*$c/$total,"%");
|
|
}
|
|
print "</tr>";
|
|
}
|
|
print "</table>\n";
|
|
if ($by == 'word') { print "by word"; } else {
|
|
print "<A HREF=\"javascript:generic_show('SegmentationSummary','by=word')\">by word</A> ";
|
|
}
|
|
print " / ";
|
|
if ($by == 'phrase') { print "by phrase"; } else {
|
|
print "<A HREF=\"javascript:generic_show('SegmentationSummary','by=phrase')\">by phrase</A> ";
|
|
}
|
|
}
|
|
|
|
// hierarchical rules used in translation
|
|
function rule_summary() {
|
|
global $dir,$set,$id;
|
|
$data = file(get_current_analysis_filename("basic","rule"));
|
|
$rule = array(); $count = array(); $count_nt = array(); $count_w = array();
|
|
$nt_count = 0; $total = 0;
|
|
foreach ($data as $item) {
|
|
$field = split("\t",$item);
|
|
$type = $field[0];
|
|
$d = $field[1];
|
|
if (count($field) > 2) { $d2 = $field[2]; } else { $d2 = 0; }
|
|
if ($type == "sentence-count") {
|
|
$sentence_count = $d;
|
|
}
|
|
else if ($type == "glue-rule") {
|
|
$glue_rule = $d / $sentence_count;
|
|
}
|
|
else if ($type == "depth") {
|
|
$depth = $d / $sentence_count;
|
|
}
|
|
else {
|
|
list($rule_in,$word_in,$nt,$rule_out,$word_out) = split(":",$d);
|
|
$rule_in = preg_replace("/a/","x",$rule_in);
|
|
$rule_in = preg_replace("/b/","y",$rule_in);
|
|
$rule_in = preg_replace("/c/","z",$rule_in);
|
|
$rule_out = preg_replace("/a/","x",$rule_out);
|
|
$rule_out = preg_replace("/b/","y",$rule_out);
|
|
$rule_out = preg_replace("/c/","z",$rule_out);
|
|
$nt_count += $d2 * $nt;
|
|
if (!array_key_exists($d,$rule)) { $rule[$d] = 0; }
|
|
$rule[$d] += $d2;
|
|
if (!array_key_exists($nt,$count)) { $count[$nt] = 0; }
|
|
$count[$nt] += $d2;
|
|
$just_nt = preg_replace("/\d/","",$rule_in)."-".preg_replace("/\d/","",$rule_out);
|
|
$no_wc = preg_replace("/\d/","W",$rule_in)."-".preg_replace("/\d/","",$rule_out);
|
|
if ($just_nt == "-") { $just_nt = "lexical"; }
|
|
if (!array_key_exists($just_nt,$count_nt)) { $count_nt[$just_nt] = 0; }
|
|
$count_nt[$just_nt] += $d2;
|
|
if (!array_key_exists($no_wc,$count_w)) { $count_w[$no_wc] = 0; }
|
|
$count_w[$no_wc] += $d2;
|
|
$total += $d2;
|
|
}
|
|
}
|
|
print "<b>Rules</b><br>\n";
|
|
printf("glue rule: %.2f<br>\n",$glue_rule);
|
|
printf("tree depth: %.2f<br>\n",$depth);
|
|
printf("nt/rule: %.2f<br>\n",$nt_count/$total);
|
|
print "<table>\n";
|
|
arsort($count_nt);
|
|
$i=0;
|
|
foreach ($count_nt as $rule => $count) {
|
|
if ($i++ < 5) { printf("<tr><td>%s</td><td align=right>%d</td><td align=right>%.1f%s</td></tr>\n",$rule,$count,$count/$total*100,'%'); }
|
|
}
|
|
if (count($count_nt)>5) { print "<tr><td align=center>...</td><td align=center>...</td><td align=center>...</td></tr>\n"; }
|
|
print "</table>\n";
|
|
}
|
|
|
|
// annotated sentences, navigation
|
|
function bleu_show() {
|
|
|
|
$count = $_GET['count'];
|
|
if ($count == 0) { $count = 5; }
|
|
|
|
$filter = "";
|
|
if (array_key_exists("filter",$_GET)) {
|
|
$filter = base64_decode($_GET['filter']);
|
|
}
|
|
|
|
print "<b>annotated sentences</b><br><font size=-1>sorted by: ";
|
|
|
|
if ($_GET['sort'] == "order" || $_GET['sort'] == "") { print "order "; }
|
|
else {
|
|
print "<A HREF=\"javascript:show('bleu','order',$count,'".base64_encode($filter)."')\">order</A> ";
|
|
}
|
|
if ($_GET['sort'] == "best") { print "best "; }
|
|
else {
|
|
print "<A HREF=\"javascript:show('bleu','best',$count,'".base64_encode($filter)."')\">best</A> ";
|
|
}
|
|
if ($_GET['sort'] == "25") { print "25% "; }
|
|
else {
|
|
print "<A HREF=\"javascript:show('bleu','25',$count,'".base64_encode($filter)."')\">25%</A> ";
|
|
}
|
|
if ($_GET['sort'] == "avg") { print "avg "; }
|
|
else {
|
|
print "<A HREF=\"javascript:show('bleu','avg',$count,'".base64_encode($filter)."')\">avg</A> ";
|
|
}
|
|
if ($_GET['sort'] == "75") { print "75% "; }
|
|
else {
|
|
print "<A HREF=\"javascript:show('bleu','75',$count,'".base64_encode($filter)."')\">75%</A> ";
|
|
}
|
|
if ($_GET['sort'] == "worst") { print "worst; "; }
|
|
else {
|
|
print "<A HREF=\"javascript:show('bleu','worst',$count,'".base64_encode($filter)."')\">worst</A>; ";
|
|
}
|
|
|
|
print "showing: $count ";
|
|
print "<A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',5+$count,'".base64_encode($filter)."')\">more</A> ";
|
|
print "<A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',9999,'".base64_encode($filter)."')\">all</A>";
|
|
|
|
if ($filter != "") {
|
|
print "; filter: '$filter'";
|
|
}
|
|
|
|
sentence_annotation($count,$filter);
|
|
print "<p align=center><A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',5+$count,'".base64_encode($filter)."')\">5 more</A> | ";
|
|
print "<A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',10+$count,'".base64_encode($filter)."')\">10 more</A> | ";
|
|
print "<A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',20+$count,'".base64_encode($filter)."')\">20 more</A> | ";
|
|
print "<A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',50+$count,'".base64_encode($filter)."')\">50 more</A> | ";
|
|
print "<A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',100+$count,'".base64_encode($filter)."')\">100 more</A> | ";
|
|
print "<A HREF=\"javascript:show('bleu','" . $_GET['sort'] . "',9999,'".base64_encode($filter)."')\">all</A> ";
|
|
}
|
|
|
|
// annotated sentences core: reads data, sorts sentences, displays them
|
|
function sentence_annotation($count,$filter) {
|
|
global $set,$id,$dir,$biconcor;
|
|
|
|
# get input
|
|
$filtered = array();
|
|
$file = get_current_analysis_filename("coverage","input-annotation");
|
|
if (file_exists($file)) {
|
|
$input = file($file);
|
|
# filter is so specified
|
|
if ($filter != "") {
|
|
for($i=0;$i<count($input);$i++) {
|
|
$item = explode("\t",$input[$i]);
|
|
$word = explode(" ",$item[0]);
|
|
$keep = 0;
|
|
for($j=0;$j<count($word);$j++) {
|
|
if ($word[$j] == $filter) {
|
|
$keep = 1;
|
|
}
|
|
}
|
|
if (!$keep) { $filtered[$i] = 1; }
|
|
}
|
|
}
|
|
}
|
|
|
|
# load bleu scores
|
|
$data = file(get_current_analysis_filename("basic","bleu-annotation"));
|
|
for($i=0;$i<count($data);$i++) {
|
|
$item = split("\t",$data[$i]);
|
|
if (! array_key_exists($item[1],$filtered)) {
|
|
$line["bleu"] = $item[0];
|
|
$line["id"] = $item[1];
|
|
$line["system"] = $item[2];
|
|
$line["reference"] = "";
|
|
for($j=3;$j<count($item);$j++) {
|
|
if ($j>3) { $line["reference"] .= "<br>"; };
|
|
$line["reference"] .= $item[$j];
|
|
}
|
|
$bleu[] = $line;
|
|
}
|
|
}
|
|
|
|
# sort and label additional sentences as filtered
|
|
global $sort;
|
|
function cmp($a, $b) {
|
|
global $sort;
|
|
if ($sort == "order") {
|
|
$a_idx = $a["id"];
|
|
$b_idx = $b["id"];
|
|
}
|
|
else if ($sort == "worst" || $sort == "75") {
|
|
$a_idx = $a["bleu"];
|
|
$b_idx = $b["bleu"];
|
|
if ($a_idx == $b_idx) {
|
|
$a_idx = $b["id"];
|
|
$b_idx = $a["id"];
|
|
}
|
|
}
|
|
else if ($sort == "best" || $sort == "avg" || $sort == "25") {
|
|
$a_idx = -$a["bleu"];
|
|
$b_idx = -$b["bleu"];
|
|
if ($a_idx == $b_idx) {
|
|
$a_idx = $a["id"];
|
|
$b_idx = $b["id"];
|
|
}
|
|
}
|
|
if ($a_idx == $b_idx) {
|
|
return 0;
|
|
}
|
|
return ($a_idx < $b_idx) ? -1 : 1;
|
|
}
|
|
$sort = $_GET['sort'];
|
|
if ($sort == '') {
|
|
$sort = "order";
|
|
}
|
|
usort($bleu, 'cmp');
|
|
|
|
$offset = 0;
|
|
if ($sort == "25" || $sort == "75") {
|
|
$offset = (int) (count($bleu)/4);
|
|
}
|
|
else if ($sort == "avg") {
|
|
$offset = (int) (count($bleu)/2);
|
|
}
|
|
|
|
$retained = array();
|
|
for($i=$offset;$i<$count+$offset && $i<count($bleu);$i++) {
|
|
$line = $bleu[$i];
|
|
$retained[$line["id"]] = 1;
|
|
}
|
|
|
|
# get segmentation (phrase alignment)
|
|
$file = get_current_analysis_filename("basic","segmentation-annotation");
|
|
if (file_exists($file)) {
|
|
$data = file($file);
|
|
for($i=0;$i<count($data);$i++) {
|
|
if ($filter == "" || array_key_exists($i,$retained)) {
|
|
$segment = 0;
|
|
foreach (split(" ",$data[$i]) as $item) {
|
|
list($in_start,$in_end,$out_start,$out_end) = split(":",$item);
|
|
$segment++;
|
|
$segmentation[$i]["input_start"][$in_start] = $segment;
|
|
$segmentation[$i]["input_end"][$in_end] = $segment;
|
|
$segmentation[$i]["output_start"][$out_start] = $segment;
|
|
$segmentation[$i]["output_end"][$out_end+0] = $segment;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
# get hierarchical data
|
|
$hierarchical = 0;
|
|
$file = get_current_analysis_filename("basic","input-tree");
|
|
if (file_exists($file)) {
|
|
$data = file($file);
|
|
$span = 0;
|
|
$last_sentence = -1;
|
|
$nt_count = array();
|
|
for($i=0;$i<count($data);$i++) {
|
|
list($sentence,$brackets,$nt,$words) = split("\t",$data[$i]);
|
|
if ($sentence != $last_sentence) { $span = 0; }
|
|
$last_sentence = $sentence;
|
|
if (array_key_exists($sentence,$retained)) {
|
|
$segmentation[$sentence][$span]["brackets"] = $brackets;
|
|
# $segmentation[$sentence][$span]["nt"] = $nt;
|
|
$segmentation[$sentence][$span]["words"] = rtrim($words);
|
|
if ($nt != "") { $nt_count[$nt]=1; }
|
|
$span++;
|
|
}
|
|
}
|
|
$hierarchical = 1;
|
|
# if (count($nt_count) <= 2) {
|
|
# foreach ($segmentation as $sentence => $segmentation_span) {
|
|
# foreach ($segmentation_span as $span => $type) {
|
|
# $segmentation[$sentence][$span]["nt"]="";
|
|
# }
|
|
# }
|
|
# }
|
|
}
|
|
$file = get_current_analysis_filename("basic","output-tree");
|
|
if (file_exists($file)) {
|
|
$data = file($file);
|
|
$span = 0;
|
|
$last_sentence = -1;
|
|
$nt_count = array();
|
|
for($i=0;$i<count($data);$i++) {
|
|
list($sentence,$brackets,$nt,$words) = split("\t",$data[$i]);
|
|
if ($sentence != $last_sentence) { $span = 0; }
|
|
$last_sentence = $sentence;
|
|
if (array_key_exists($sentence,$retained)) {
|
|
$segmentation_out[$sentence][$span]["brackets"] = $brackets;
|
|
$segmentation_out[$sentence][$span]["nt"] = $nt;
|
|
$segmentation_out[$sentence][$span]["words"] = rtrim($words);
|
|
if ($nt != "") { $nt_count[$nt]=1; }
|
|
$span++;
|
|
}
|
|
}
|
|
# no non-terminal markup, if there are two or less non-terminals (X,S)
|
|
if (count($nt_count) <= 2) {
|
|
foreach ($segmentation_out as $sentence => $segmentation_span) {
|
|
foreach ($segmentation_span as $span => $type) {
|
|
$segmentation_out[$sentence][$span]["nt"]="";
|
|
}
|
|
}
|
|
}
|
|
}
|
|
$file = get_current_analysis_filename("basic","node");
|
|
if (file_exists($file)) {
|
|
$data = file($file);
|
|
$n = 0;
|
|
$last_sentence = -1;
|
|
for($i=0;$i<count($data);$i++) {
|
|
list($sentence,$depth,$start_div,$end_div,$start_div_in,$end_div_in,$children) = split(" ",$data[$i]);
|
|
if ($sentence != $last_sentence) { $n = 0; }
|
|
$last_sentence = $sentence;
|
|
if (array_key_exists($sentence,$retained)) {
|
|
$node[$sentence][$n]['depth'] = $depth;
|
|
$node[$sentence][$n]['start_div'] = $start_div;
|
|
$node[$sentence][$n]['end_div'] = $end_div;
|
|
$node[$sentence][$n]['start_div_in'] = $start_div_in;
|
|
$node[$sentence][$n]['end_div_in'] = $end_div_in;
|
|
$node[$sentence][$n]['children'] = rtrim($children);
|
|
$n++;
|
|
}
|
|
}
|
|
}
|
|
|
|
# display
|
|
if ($filter != "") {
|
|
print " (".(count($input)-count($filtered))." retaining)";
|
|
}
|
|
print "</font><BR>\n";
|
|
|
|
$biconcor = get_biconcor_version($dir,$set,$id);
|
|
//print "<div id=\"debug\">$sort / $offset</div>";
|
|
for($i=$offset;$i<$count+$offset && $i<count($bleu);$i++) {
|
|
$line = $bleu[$i];
|
|
$search_graph_dir = get_current_analysis_filename("basic","search-graph");
|
|
if (file_exists($search_graph_dir) && file_exists($search_graph_dir."/graph.".$line["id"])) {
|
|
$state = return_state_for_link();
|
|
print "<FONT SIZE=-1><A TARGET=_blank HREF=\"?$state&analysis=sgviz&set=$set&id=$id&sentence=".$line["id"]."\">show search graph</a></FONT><br>\n";
|
|
}
|
|
|
|
if ($hierarchical) {
|
|
annotation_hierarchical($line["id"],$segmentation[$line["id"]],$segmentation_out[$line["id"]],$node[$line["id"]]);
|
|
}
|
|
if ($input) {
|
|
print "<div id=\"info-".$line["id"]."\" style=\"border-color:black; background:#ffff80; opacity:0; width:100%; border:1px;\">0 occ. in corpus, 0 translations, entropy: 0.00</div>\n";
|
|
if ($biconcor) {
|
|
print "<div id=\"biconcor-".$line["id"]."\" class=\"biconcor\"><font size=-2>(click on input phrase for bilingual concordancer)</font></div>";
|
|
}
|
|
if ($hierarchical) {
|
|
sentence_annotation_hierarchical("#".$line["id"],$line["id"],$input[$line["id"]],$segmentation[$line["id"]],"in");
|
|
}
|
|
else {
|
|
print "<font size=-2>[#".$line["id"]."]</font> ";
|
|
input_annotation($line["id"],$input[$line["id"]],$segmentation[$line["id"]],$filter);
|
|
}
|
|
}
|
|
//else {
|
|
// print "<font size=-2>[".$line["id"].":".$line["bleu"]."]</font> ";
|
|
//}
|
|
if ($hierarchical) {
|
|
sentence_annotation_hierarchical($line["bleu"],$line["id"],$line["system"],$segmentation_out[$line["id"]],"out");
|
|
}
|
|
else {
|
|
print "<font size=-2>[".$line["bleu"]."]</font> ";
|
|
output_annotation($line["id"],$line["system"],$segmentation[$line["id"]]);
|
|
}
|
|
print "<br><font size=-2>[ref]</font> ".$line["reference"]."<hr>";
|
|
}
|
|
}
|
|
|
|
function coverage($coverage_vector) {
|
|
# get information from line in input annotation file
|
|
$coverage = array();
|
|
foreach (split(" ",$coverage_vector) as $item) {
|
|
if (preg_match("/[\-:]/",$item)) {
|
|
$field = preg_split("/[\-:]/",$item);
|
|
$from = $field[0];
|
|
$to = $field[1];
|
|
if (count($field)>2){ $coverage[$from][$to]["corpus_count"]=$field[2]; }
|
|
if (count($field)>3){ $coverage[$from][$to]["ttable_count"]=$field[3]; }
|
|
if (count($field)>4){ $coverage[$from][$to]["ttabel_entropy"]=$field[4]; }
|
|
}
|
|
}
|
|
|
|
return $coverage;
|
|
}
|
|
|
|
// annotate an inpute sentence
|
|
function input_annotation($sentence,$input,$segmentation,$filter) {
|
|
global $biconcor;
|
|
list($words,$coverage_vector) = split("\t",$input);
|
|
|
|
# get information from line in input annotation file
|
|
$coverage = array();
|
|
foreach (split(" ",$coverage_vector) as $item) {
|
|
if (preg_match("/[\-:]/",$item)) {
|
|
list($from,$to,$corpus_count,$ttable_count,$ttable_entropy) = preg_split("/[\-:]/",$item);
|
|
$coverage[$from][$to]["corpus_count"] = $corpus_count;
|
|
$coverage[$from][$to]["ttable_count"] = $ttable_count;
|
|
$coverage[$from][$to]["ttable_entropy"] = $ttable_entropy;
|
|
}
|
|
}
|
|
$word = split(" ",$words);
|
|
|
|
# compute the display level for each input phrase
|
|
for($j=0;$j<count($word);$j++) {
|
|
$box[] = array();
|
|
$separable[] = 1;
|
|
}
|
|
$max_level = 0;
|
|
for($length=1;$length<=7;$length++) {
|
|
for($from=0;$from<count($word)-($length-1);$from++) {
|
|
$to = $from + ($length-1);
|
|
if (array_key_exists($from,$coverage) &&
|
|
array_key_exists($to,$coverage[$from]) &&
|
|
array_key_exists("corpus_count",$coverage[$from][$to])) {
|
|
$level=0;
|
|
$available = 0;
|
|
while(!$available) {
|
|
$available = 1;
|
|
$level++;
|
|
for($j=$from;$j<=$to;$j++) {
|
|
if (array_key_exists($level,$box) &&
|
|
array_key_exists($j,$box[$level])) {
|
|
$available = 0;
|
|
}
|
|
}
|
|
}
|
|
for($j=$from;$j<=$to;$j++) {
|
|
$box[$level][$j] = $to;
|
|
}
|
|
$max_level = max($max_level,$level);
|
|
for($j=$from+1;$j<=$to;$j++) {
|
|
$separable[$j] = 0;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
$separable[count($word)] = 1;
|
|
|
|
# display input phrases
|
|
$sep_start = 0;
|
|
for($sep_end=1;$sep_end<=count($word);$sep_end++) {
|
|
if ($separable[$sep_end] == 1) {
|
|
# one table for each separable block
|
|
print "<table cellpadding=1 cellspacing=0 border=0 style=\"display: inline-table;\">";
|
|
for($level=$max_level;$level>=1;$level--) {
|
|
# rows for phrase display
|
|
print "<tr style=\"height:5px;\">";
|
|
for($from=$sep_start;$from<$sep_end;$from++) {
|
|
if (array_key_exists($from,$box[$level])) {
|
|
$to = $box[$level][$from];
|
|
$size = $to - $from + 1;
|
|
if ($size == 1) {
|
|
print "<td><div style=\"height:0px; opacity:0; position:relative; z-index:-9;\">".$word[$from];
|
|
}
|
|
else {
|
|
$color = coverage_color($coverage[$from][$to]);
|
|
$phrase = "";
|
|
$highlightwords = "";
|
|
$lowlightwords = "";
|
|
for($j=$from;$j<=$to;$j++) {
|
|
if ($j>$from) { $phrase .= " "; }
|
|
$phrase .= $word[$j];
|
|
$highlightwords .= " document.getElementById('inputword-$sentence-$j').style.backgroundColor='#ffff80';";
|
|
$lowlightwords .= " document.getElementById('inputword-$sentence-$j').style.backgroundColor='".coverage_color($coverage[$j][$j])."';";
|
|
}
|
|
print "<td colspan=$size><div style=\"background-color: $color; height:3px;\" onmouseover=\"show_word_info($sentence,".$coverage[$from][$to]["corpus_count"].",".$coverage[$from][$to]["ttable_count"].",".$coverage[$from][$to]["ttable_entropy"]."); this.style.backgroundColor='#ffff80';$highlightwords\" onmouseout=\"hide_word_info($sentence); this.style.backgroundColor='$color';$lowlightwords;\"".($biconcor?" onclick=\"show_biconcor($sentence,'".base64_encode($phrase)."');\"":"").">";
|
|
}
|
|
print "</div></td>";
|
|
$from += $size-1;
|
|
}
|
|
else {
|
|
print "<td><div style=\"height:".($from==$to ? 0 : 3)."px;\"></div></td>";
|
|
}
|
|
}
|
|
print "</tr>\n";
|
|
}
|
|
# display input words
|
|
print "<tr><td colspan=".($sep_end-$sep_start)."><div style=\"position:relative; z-index:1;\">";
|
|
for($j=$sep_start;$j<$sep_end;$j++) {
|
|
if ($segmentation && array_key_exists($j,$segmentation["input_start"])) {
|
|
$id = $segmentation["input_start"][$j];
|
|
print "<span id=\"input-$sentence-$id\" style=\"border-color:#000000; border-style:solid; border-width:1px;\" onmouseover=\"highlight_phrase($sentence,$id);\" onmouseout=\"lowlight_phrase($sentence,$id);\">";
|
|
}
|
|
if (array_key_exists($j,$coverage)) {
|
|
$color = coverage_color($coverage[$j][$j]);
|
|
$cc = $coverage[$j][$j]["corpus_count"];
|
|
$tc = $coverage[$j][$j]["ttable_count"];
|
|
$te = $coverage[$j][$j]["ttable_entropy"];
|
|
}
|
|
else { # unknown words
|
|
$color = '#ffffff';
|
|
$cc = 0; $tc = 0; $te = 0;
|
|
}
|
|
print "<span id=\"inputword-$sentence-$j\" style=\"background-color: $color;\" onmouseover=\"show_word_info($sentence,$cc,$tc,$te); this.style.backgroundColor='#ffff80';\" onmouseout=\"hide_word_info($sentence); this.style.backgroundColor='$color';\"".($biconcor?" onclick=\"show_biconcor($sentence,'".base64_encode($word[$j])."');\"":"").">";
|
|
if ($word[$j] == $filter) {
|
|
print "<b><font color=#ff0000>".$word[$j]."</font></b>";
|
|
}
|
|
else {
|
|
print $word[$j];
|
|
}
|
|
print "</span>";
|
|
if ($segmentation && array_key_exists($j,$segmentation["input_end"])) {
|
|
print "</span>";
|
|
}
|
|
print " ";
|
|
}
|
|
print "</div></td></tr>\n";
|
|
print "</table>\n";
|
|
$sep_start = $sep_end;
|
|
}
|
|
}
|
|
print "<br>";
|
|
}
|
|
|
|
// color-coded coverage stats (corpus count, ttable count, entropy)
|
|
function coverage_color($phrase) {
|
|
$corpus_count = 255 - 10 * log(1 + $phrase["corpus_count"]);
|
|
if ($corpus_count < 128) { $corpus_count = 128; }
|
|
$cc_color = dechex($corpus_count / 16) . dechex($corpus_count % 16);
|
|
|
|
$ttable_count = 255 - 20 * log(1 + $phrase["ttable_count"]);
|
|
if ($ttable_count < 128) { $ttable_count = 128; }
|
|
$tc_color = dechex($ttable_count / 16) . dechex($ttable_count % 16);
|
|
|
|
$ttable_entropy = 255 - 32 * $phrase["ttable_entropy"];
|
|
if ($ttable_entropy < 128) { $ttable_entropy = 128; }
|
|
$te_color = dechex($ttable_entropy / 16) . dechex($ttable_entropy % 16);
|
|
|
|
// $color = "#". $cc_color . $te_color . $tc_color; # reddish browns with some green
|
|
// $color = "#". $cc_color . $tc_color . $te_color; # reddish brown with some blueish purple
|
|
$color = "#". $te_color . $cc_color . $tc_color; # pale green towards red
|
|
// $color = "#". $te_color . $tc_color . $cc_color; # pale purple towards red
|
|
// $color = "#". $tc_color . $te_color . $cc_color; // # blue-grey towards green
|
|
// $color = "#". $tc_color . $cc_color . $te_color; // # green-grey towards blue
|
|
|
|
return $color;
|
|
}
|
|
|
|
// annotate an output sentence
|
|
function output_annotation($sentence,$system,$segmentation) {
|
|
#$color = array("#FFC0C0","#FFC0FF","#C0C0FF","#C0FFFF","#C0FFC0");
|
|
$color = array("#c0c0c0","#e0e0ff","#b0b0ff","#8080ff","#4040ff");
|
|
$word = split(" ",$system);
|
|
|
|
for($j=0;$j<count($word);$j++) {
|
|
list($surface,$correct) = split("\|", $word[$j]);
|
|
if ($segmentation && array_key_exists($j,$segmentation["output_start"])) {
|
|
$id = $segmentation["output_start"][$j];
|
|
print "<span id=\"output-$sentence-$id\" style=\"border-color:#000000; border-style:solid; border-width:1px;\" onmouseover=\"highlight_phrase($sentence,$id);\" onmouseout=\"lowlight_phrase($sentence,$id);\">";
|
|
}
|
|
print "<span style=\"background-color: $color[$correct]\">$surface</span>";
|
|
if ($segmentation && array_key_exists($j,$segmentation["output_end"])) {
|
|
print "</span>";
|
|
}
|
|
print " ";
|
|
}
|
|
}
|
|
|
|
function annotation_hierarchical($sentence,$segmentation,$segmentation_out,$node) {
|
|
print "<script language=\"javascript\">\n";
|
|
print "max_depth[$sentence] = ".strlen($segmentation[0]["brackets"]).";\n";
|
|
print "span_count_out[$sentence] = ".count($segmentation_out).";\n";
|
|
print "span_count_in[$sentence] = ".count($segmentation).";\n";
|
|
print "nodeIn[$sentence] = [];\n";
|
|
print "nodeOut[$sentence] = [];\n";
|
|
print "nodeChildren[$sentence] = [];\n";
|
|
for($n=0;$n<count($node);$n++) {
|
|
print "nodeIn[$sentence].push({ start: ".$node[$n]['start_div_in'].", end: ".$node[$n]['end_div_in'].", depth: ".$node[$n]['depth']." });\n";
|
|
print "nodeOut[$sentence].push({ start: ".$node[$n]['start_div'].", end: ".$node[$n]['end_div'].", depth: ".$node[$n]['depth']." });\n";
|
|
print "nodeChildren[$sentence].push([".$node[$n]['children']."]);\n";
|
|
}
|
|
print "</script>\n";
|
|
}
|
|
|
|
function sentence_annotation_hierarchical($info,$sentence,$sequence,$segmentation,$in_out) {
|
|
$In_Out = $in_out == "out" ? "Out" : "In";
|
|
|
|
#list($words,$coverage_vector) = split("\t",$input);
|
|
$coverage = coverage($sequence);
|
|
$word = preg_split("/\s/",$sequence);
|
|
|
|
$color = array("#ffe0e0","#f0e0ff","#e0e0ff","#c0c0ff","#a0a0ff");
|
|
#$color = array("#FFC0C0","#FFC0FF","#C0C0FF","#C0FFFF","#C0FFC0");
|
|
#$color = array("#c0c0c0","#e0e0ff","#b0b0ff","#8080ff","#4040ff");
|
|
|
|
print "<table><tr><td>\n";
|
|
print "<div class=\"enclosing\"><font size=-2>[$info]</font></div>";
|
|
$word_count = 0;
|
|
for($span=0;$span<count($segmentation);$span++) {
|
|
print "<div class=\"enclosing\">";
|
|
for($depth=0;$depth<strlen($segmentation[$span]["brackets"]);$depth++) {
|
|
$class = substr($segmentation[$span]["brackets"],$depth,1);
|
|
$action = " id=\"$in_out-$sentence-$span-$depth\" onMouseOver=\"align$In_Out($sentence,$span,$depth);\" onMouseOut=\"unAlign($sentence);\"";
|
|
if ($class == " ") { $class = "empty"; $action = ""; }
|
|
else if ($class == "[") { $class = "opening"; }
|
|
else if ($class == "]") { $class = "closing"; }
|
|
else if ($class == "O") { $class = "leaf"; }
|
|
else if ($class == "-") { $class = "continued"; }
|
|
print "<div class=\"$class\"$action>";
|
|
}
|
|
|
|
$words = $segmentation[$span]["words"];
|
|
|
|
# non terminal
|
|
if (array_key_exists("nt",$segmentation[$span]) &&
|
|
$segmentation[$span]["nt"] != "") {
|
|
print $segmentation[$span]["nt"].": ";
|
|
}
|
|
|
|
# no nonterminal and no words => invisible bar
|
|
else if($words == "") {
|
|
print "<span style=\"opacity:0\">|</span>";
|
|
}
|
|
|
|
$span_word = array();
|
|
if ($words != "") { $span_word = split(" ",$words); }
|
|
for($w=0;$w<count($span_word);$w++) {
|
|
if ($w > 0) { print " "; }
|
|
if ($in_out == "in") {
|
|
#print "<span style=\"background-color: ".coverage_color($coverage[$word_count][$word_count]).";\">";
|
|
print $word[$word_count];
|
|
#print "</span>";
|
|
}
|
|
else {
|
|
list($surface,$correct) = split("\|", $word[$word_count]);
|
|
|
|
print "<span style=\"background-color: $color[$correct]\">$surface</span>";
|
|
}
|
|
$word_count++;
|
|
}
|
|
|
|
for($depth=0;$depth<strlen($segmentation[$span]["brackets"]);$depth++) {
|
|
print "</div>";
|
|
}
|
|
print "</div>"; # enclosing
|
|
}
|
|
print "</td></tr></table>\n";
|
|
}
|
|
|
|
function biconcor($query) {
|
|
global $set,$id,$dir;
|
|
$sentence = $_GET['sentence'];
|
|
$biconcor = get_biconcor_version($dir,$set,$id);
|
|
print "<center>
|
|
<form method=\"get\" id=\"BiconcorForm\" onsubmit=\"return false;\">
|
|
<img src=\"close.gif\" width=17 height=17 onClick=\"close_biconcor($sentence);\">
|
|
<input width=20 id=\"BiconcorQuery\" value=\"$query\">
|
|
<input type=submit onclick=\"show_biconcor($sentence,Base64.encode(document.getElementById('BiconcorQuery').value));\" value=\"look up\">
|
|
</form>
|
|
<div class=\"biconcor-content\">";
|
|
$cmd = "./biconcor -html -l $dir/model/biconcor.$biconcor -Q ".base64_encode($query)." 2>/dev/null";
|
|
# print $cmd."<p>";
|
|
system($cmd);
|
|
# print "<p>done.";
|
|
print "</div></center>";
|
|
|
|
}
|