Merge pull request #168 from tofula/master

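Use named capture groups in the 'protected patterns' recognition regexp for safer matching.
A protected pattern may contain capture groups of its own, which would shift the numbered captures ($2 would no longer be the tail); named groups are unaffected by this.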
This commit is contained in:
Hieu Hoang 2016-12-23 10:26:19 +00:00 committed by GitHub
commit 59119c0044
2 changed files with 10 additions and 3 deletions

View File

@ -176,6 +176,13 @@ sub ngrams {
return { md5(encode_utf8($sent)) => 1 };
} else {
my @words = split /\s+/, $sent;
#factors
if ( $sent =~ m/[|]/) {
my $use_index = 0; # default factor is the first one
@words = map { ( split /[|]/, $_ ) [$use_index] } @words;
}
my $out;
if ($n == 1) {
foreach my $w (@words) {

View File

@ -243,9 +243,9 @@ sub tokenize
my @protected = ();
foreach my $protected_pattern (@protected_patterns) {
my $t = $text;
while ($t =~ /($protected_pattern)(.*)$/) {
push @protected, $1;
$t = $2;
while ($t =~ /(?<PATTERN>$protected_pattern)(?<TAIL>.*)$/) {
push @protected, $+{PATTERN};
$t = $+{TAIL};
}
}