diff --git a/regression-testing/run-test-detokenizer.t b/regression-testing/run-test-detokenizer.t
index f9cc3423a..9d677b43e 100644
--- a/regression-testing/run-test-detokenizer.t
+++ b/regression-testing/run-test-detokenizer.t
@@ -82,9 +82,7 @@ Moi, j'ai une apostrophe.
 EXP
 );
 
-# A (failing) French test involving an apostrophe on the second-last word
-{
-my $testCase =
+# A French test involving an apostrophe on the second-last word
 &addDetokenizerTest("TEST_FRENCH_APOSTROPHE_PENULTIMATE", "fr", <<'TOK'
 de musique rap issus de l' immigration
@@ -95,9 +93,6 @@ de musique rap issus de l'immigration
 EXP
 );
-$testCase->setExpectedToFail("A bug is causing this to be detokenized wrong.");
-}
-
 # A German test involving non-ASCII characters
 # Note: We don't specify a language because the detokenizer errors if you pass in a language for which it has no special rules, of which German is an example.
 &addDetokenizerTest("TEST_GERMAN_NONASCII", undef,
diff --git a/scripts/tokenizer/detokenizer.perl b/scripts/tokenizer/detokenizer.perl
index 0b8e5af73..f049b8080 100755
--- a/scripts/tokenizer/detokenizer.perl
+++ b/scripts/tokenizer/detokenizer.perl
@@ -92,7 +92,7 @@ sub detokenize {
       #left-shift floats in Czech
       $text=$text.$words[$i];
       $prependSpace = " ";
-    } elsif ((($language eq "fr") ||($language eq "it")) && ($i<(scalar(@words)-2)) && ($words[$i] =~ /[\p{IsAlpha}][\']$/) && ($words[$i+1] =~ /^[\p{IsAlpha}]/)) {
+    } elsif ((($language eq "fr") ||($language eq "it")) && ($i<=(scalar(@words)-2)) && ($words[$i] =~ /[\p{IsAlpha}][\']$/) && ($words[$i+1] =~ /^[\p{IsAlpha}]/)) {
      #right-shift the contraction for French and Italian
      $text = $text.$prependSpace.$words[$i];
      $prependSpace = "";
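
Note: the following is a minimal standalone sketch, not the Moses detokenizer itself, of the boundary condition this patch changes. With the old "<" test, a token ending in a letter plus an apostrophe was never glued to the token after it when it sat in second-last position, so "l' immigration" kept its space; with "<=", the right-shift also fires there. The sample @words come from the regression test above; everything else in the sketch is illustrative.

#!/usr/bin/env perl
# Sketch only: reproduces the fixed elision condition outside the full detokenize loop.
use strict;
use warnings;

my @words = qw(de musique rap issus de l' immigration);

my $text = "";
my $prependSpace = "";
for (my $i = 0; $i < scalar(@words); $i++) {
    if (($i <= (scalar(@words)-2))                  # "<" skipped the penultimate word; "<=" includes it
        && ($words[$i]   =~ /[\p{IsAlpha}][\']$/)   # current word ends in a letter + apostrophe
        && ($words[$i+1] =~ /^[\p{IsAlpha}]/)) {    # next word starts with a letter
        # right-shift the contraction: append the word and suppress the following space
        $text = $text.$prependSpace.$words[$i];
        $prependSpace = "";
    } else {
        $text = $text.$prependSpace.$words[$i];
        $prependSpace = " ";
    }
}
print "$text\n";   # prints "de musique rap issus de l'immigration";
                   # reverting "<=" to "<" prints "de musique rap issus de l' immigration"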