mosesdecoder/scripts/tokenizer/escape-special-chars.perl

28 lines
586 B
Plaintext
Raw Normal View History

#!/usr/bin/perl -w
use strict;
# Escape one line of text so that characters with a special meaning to
# Moses are replaced by entity references.
#
# Takes:   a single line, with or without its trailing newline.
# Returns: the cleaned and escaped line, without a trailing newline.
sub escape_line {
    my ($line) = @_;

    # chomp (not chop): a final line that lacks a trailing newline must
    # not lose its last real character.
    chomp $line;

    # avoid general madness
    # NOTE(review): control characters (including tab, \011) are deleted
    # before whitespace is collapsed, so "a\tb" becomes "ab", not "a b".
    # Kept as-is to preserve existing output.
    $line =~ s/[\000-\037]//g;
    $line =~ s/\s+/ /g;
    $line =~ s/^ //g;
    $line =~ s/ $//g;

    # special characters in moses
    # '&' must be escaped first so the entities produced below are not
    # escaped a second time.
    $line =~ s/\&/\&amp;/g;   # escape escape
    $line =~ s/\|/\&#124;/g;  # factor separator
    $line =~ s/\</\&lt;/g;    # xml
    $line =~ s/\>/\&gt;/g;    # xml
    $line =~ s/\'/\&apos;/g;  # xml
    $line =~ s/\"/\&quot;/g;  # xml
    $line =~ s/\[/\&#91;/g;   # syntax non-terminal
    $line =~ s/\]/\&#93;/g;   # syntax non-terminal

    # restore xml instructions
    # Only the tag delimiters and attribute quotes of
    # <tag translation="..."> text </tag> are turned back into markup;
    # the captured attribute value and text stay escaped.
    $line =~ s/\&lt;(\S+) translation=&quot;(.+?)&quot;&gt; (.+?) &lt;\/(\S+)&gt;/\<$1 translation=\"$2\"> $3 <\/$4>/g;

    return $line;
}

# Filter: read lines from STDIN, write one escaped line per input line.
while (my $line = <STDIN>) {
    print escape_line($line), "\n";
}