diff --git a/scripts/recaser/train-truecaser.perl b/scripts/recaser/train-truecaser.perl index 94ddbf2fa..589ee43e3 100755 --- a/scripts/recaser/train-truecaser.perl +++ b/scripts/recaser/train-truecaser.perl @@ -103,8 +103,20 @@ sub split_xml { while($line =~ /\S/) { # XML tag if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) { - $MARKUP[$i] .= $1." "; - $line = $2; + my $potential_xml = $1; + my $line_next = $2; + # exception for factor that is an XML tag + if ($line =~ /^\S/ && scalar(@WORD)>0 && $WORD[$i-1] =~ /\|$/) { + $WORD[$i-1] .= $potential_xml; + if ($line_next =~ /^(\|+)(.*)$/) { + $WORD[$i-1] .= $1; + $line_next = $2; + } + } + else { + $MARKUP[$i] .= $potential_xml." "; + } + $line = $line_next; } # non-XML text elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) {