# Origin: mosesdecoder/scripts/generic/strip-xml.perl
# (listed by the repository viewer as 46 lines, 761 B, plaintext)

#!/usr/bin/env perl

use warnings;
use strict;

# Filter: reads lines from STDIN, removes angle-bracket markup from each one,
# and writes the cleaned line to STDOUT followed by a newline.
while (my $line = <STDIN>) {
    chomp($line);
    print strip_xml($line), "\n";
}

# strip_xml($line)
#
# Returns a copy of $line with everything between "<" and ">" removed.
#
#   * Nesting is tracked with a depth counter, so "<a <b>>" is dropped as a
#     whole. An unclosed "<" suppresses the rest of the line.
#   * A "<" that immediately follows an emitted "|" is kept as a literal
#     character instead of opening markup.
#   * A ">" seen while not inside markup is kept as a literal character.
#   * Runs of spaces are collapsed to a single space and leading spaces are
#     dropped. Removing markup can still leave one trailing space.
sub strip_xml {
    my ($line) = @_;

    my $out       = '';
    my $inXML     = 0;    # current nesting depth of "<" ... ">"
    my $prevSpace = 1;    # starts true so leading spaces are discarded
    my $prevBar   = 0;    # true when the last emitted character was "|"

    for my $c (split //, $line) {
        if ($c eq "<" && !$prevBar) {
            ++$inXML;
        }
        elsif ($c eq ">" && $inXML > 0) {
            --$inXML;
        }
        elsif ($prevSpace && $c eq " ") {
            # Duplicate space: emit nothing. This branch is checked before
            # the depth test, so it also applies inside markup.
        }
        elsif ($inXML == 0) {
            # Plain text: remember what kind of character was just emitted.
            $prevSpace = ($c eq " ") ? 1 : 0;
            $prevBar   = ($c eq "|") ? 1 : 0;
            $out .= $c;
        }
        # Any other character inside markup is silently dropped.
    }

    return $out;
}