split_xml should behave consistently between training and use

This commit is contained in:
alvations 2018-12-20 11:53:02 +08:00
parent 413ba6b583
commit 40748e528d

View File

@@ -103,8 +103,20 @@ sub split_xml {
while($line =~ /\S/) {
# XML tag
if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) {
$MARKUP[$i] .= $1." ";
$line = $2;
my $potential_xml = $1;
my $line_next = $2;
# exception for factor that is an XML tag
if ($line =~ /^\S/ && scalar(@WORD)>0 && $WORD[$i-1] =~ /\|$/) {
$WORD[$i-1] .= $potential_xml;
if ($line_next =~ /^(\|+)(.*)$/) {
$WORD[$i-1] .= $1;
$line_next = $2;
}
}
else {
$MARKUP[$i] .= $potential_xml." ";
}
$line = $line_next;
}
# non-XML text
elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) {