2011-08-05 18:32:39 +04:00
#!/usr/bin/perl -w
#
# Detokenization tests.
#
use strict ;
# This is here to suppress (false) warnings about OLDOUT and OLDERR being used only once. Maybe there is a less brutish way to suppress that, but I don't know it.
no warnings 'once' ;
use utf8 ;
use Cwd ( 'abs_path' ) ;
use File::Spec::Functions ;
use File::Basename ( 'dirname' ) ;
use IPC::Run3 ;
use Getopt::Long ;
2011-08-05 22:48:37 +04:00
use Test::More ;
2011-08-05 18:32:39 +04:00
GetOptions ( "detokenizer=s" = > \ ( my $ detokenizer ) ,
"results-dir=s" = > \ ( my $ results_dir )
) or exit 1 ;
unless ( defined $ results_dir ) {
print STDERR "Usage: run-test-detokenizer.t --results-dir <RESULTS-DIRECTORY> [--detokenizer <DETOKENIZER-SCRIPT>]\n" ;
exit 1 ;
}
die "ERROR: Results directory " . $ results_dir . " doesn't exist or is not a writable directory. Dying" unless ( - d $ results_dir && - w $ results_dir ) ;
$ detokenizer = catfile ( dirname ( dirname ( abs_path ( $ 0 ) ) ) , "scripts" , "tokenizer" , "detokenizer.perl" ) unless $ detokenizer ;
die "ERROR: Detokenizer script " . $ detokenizer . " does not exist. Dying" unless - f $ detokenizer ;
2011-08-05 22:48:37 +04:00
my @ testCases = ( ) ;
2011-08-05 18:32:39 +04:00
######################################
# Definitions of individual test cases
######################################
# A simple English test
2011-08-05 22:48:37 +04:00
& addDetokenizerTest ( "TEST_ENGLISH_EASY" , "en" ,
2011-08-05 20:37:49 +04:00
<< 'TOK'
2011-08-05 18:32:39 +04:00
This sentence is really simple , so it should not be hard to detokenize .
This one is no more difficult , but , hey , it is on a new line .
TOK
2011-08-05 20:37:49 +04:00
,
2011-08-05 18:32:39 +04:00
<< 'EXP'
This sentence is really simple , so it should not be hard to detokenize .
This one is no more difficult , but , hey , it is on a new line .
EXP
) ;
2011-08-05 20:23:47 +04:00
# An English test involving double-quotes
2011-08-05 22:48:37 +04:00
& addDetokenizerTest ( "TEST_ENGLISH_DOUBLEQUOTES" , "en" ,
2011-08-05 20:37:49 +04:00
<< 'TOK'
2011-08-05 20:23:47 +04:00
This is a somewhat " less simple " test .
TOK
2011-08-05 20:37:49 +04:00
,
2011-08-05 20:23:47 +04:00
<< 'EXP'
This is a somewhat "less simple" test .
EXP
) ;
2011-08-05 18:32:39 +04:00
# A simple French test
2011-08-05 22:48:37 +04:00
& addDetokenizerTest ( "TEST_FRENCH_EASY" , "fr" ,
2011-08-05 20:37:49 +04:00
<< 'TOK'
2011-08-05 20:23:47 +04:00
Voici une phrase simple .
2011-08-05 18:32:39 +04:00
TOK
2011-08-05 20:37:49 +04:00
,
2011-08-05 18:32:39 +04:00
<< 'EXP'
2011-08-05 20:23:47 +04:00
Voici une phrase simple .
2011-08-05 18:32:39 +04:00
EXP
) ;
2011-08-05 20:23:47 +04:00
# A French test involving an apostrophe
2011-08-05 22:48:37 +04:00
& addDetokenizerTest ( "TEST_FRENCH_APOSTROPHE" , "fr" ,
2011-08-05 20:37:49 +04:00
<< 'TOK'
2011-08-05 20:23:47 +04:00
Moi , j ' ai une apostrophe .
TOK
2011-08-05 20:37:49 +04:00
,
2011-08-05 20:23:47 +04:00
<< 'EXP'
Moi , j ' ai une apostrophe .
EXP
) ;
2011-08-05 22:48:37 +04:00
# A (failing) French test involving an apostrophe on the second-last word
{
my $ testCase =
& addDetokenizerTest ( "TEST_FRENCH_APOSTROPHE_PENULTIMATE" , "fr" ,
2011-08-05 20:37:49 +04:00
<< 'TOK'
2011-08-05 20:23:47 +04:00
de musique rap issus de l ' immigration
TOK
2011-08-05 20:37:49 +04:00
,
2011-08-05 20:23:47 +04:00
<< 'EXP'
de musique rap issus de l ' immigration
EXP
) ;
2011-08-05 22:48:37 +04:00
$ testCase - > setExpectedToFail ( "A bug is causing this to be detokenized wrong." ) ;
2011-08-05 20:23:47 +04:00
}
2011-08-05 23:14:01 +04:00
# A German test involving non-ASCII characters
# Note: We don't specify a language because the detokenizer errors if you pass in a language for which it has no special rules, of which German is an example.
& addDetokenizerTest ( "TEST_GERMAN_NONASCII" , undef ,
<< 'TOK'
Ich hoffe , daß Sie schöne Ferien hatten .
Frau Präsidentin ! Frau Díez González und ich hatten einige Anfragen
TOK
,
<< 'EXP'
Ich hoffe , daß Sie schöne Ferien hatten .
Frau Präsidentin ! Frau Díez González und ich hatten einige Anfragen
EXP
) ;
2011-08-05 18:32:39 +04:00
######################################
2011-08-05 22:48:37 +04:00
# Now run those babies ...
2011-08-05 18:32:39 +04:00
######################################
2011-08-05 22:48:37 +04:00
plan tests = > scalar ( @ testCases ) ;
2011-08-05 18:32:39 +04:00
2011-08-05 22:48:37 +04:00
foreach my $ testCase ( @ testCases ) {
& runDetokenizerTest ( $ testCase ) ;
}
2011-08-05 18:32:39 +04:00
############
## Utilities
############
2011-08-05 22:48:37 +04:00
# Creates a new detokenizer test case, adds it to the array of test cases to be run, and returns it.
sub addDetokenizerTest {
my ( $ testName , $ language , $ tokenizedText , $ rightAnswer ) = @ _ ;
my $ testCase = new DetokenizerTestCase ( $ testName , $ language , $ tokenizedText , $ rightAnswer ) ;
push ( @ testCases , $ testCase ) ;
return $ testCase ;
}
2011-08-05 18:32:39 +04:00
sub runDetokenizerTest {
2011-08-05 22:48:37 +04:00
my ( $ testCase ) = @ _ ;
2011-08-05 18:32:39 +04:00
2011-08-05 22:48:37 +04:00
my $ testOutputDir = catfile ( $ results_dir , $ testCase - > getName ( ) ) ;
2011-08-05 18:32:39 +04:00
my $ tokenizedFile = catfile ( $ testOutputDir , "input.txt" ) ;
my $ expectedFile = catfile ( $ testOutputDir , "expected.txt" ) ;
# Fail if we can't make the test output directory
unless ( mkdir ( $ testOutputDir ) ) {
2011-08-05 22:48:37 +04:00
fail ( $ testCase - > getName ( ) . ": Failed to create output directory " . $ testOutputDir . " [" . $! . "]" ) ;
2011-08-05 18:32:39 +04:00
exit ;
}
open TOK , ">" . $ tokenizedFile ;
binmode TOK , ":utf8" ;
2011-08-05 22:48:37 +04:00
print TOK $ testCase - > getTokenizedText ( ) ;
2011-08-05 18:32:39 +04:00
close TOK ;
open TRUTH , ">" . $ expectedFile ;
binmode TRUTH , ":utf8" ;
2011-08-05 22:48:37 +04:00
print TRUTH $ testCase - > getRightAnswer ( ) ;
2011-08-05 18:32:39 +04:00
close TRUTH ;
2011-08-05 22:48:37 +04:00
& runTest ( $ testCase - > getName ( ) , $ testOutputDir , $ tokenizedFile , sub {
2011-08-05 23:14:01 +04:00
return defined ( $ testCase - > getLanguage ( ) ) ? [ $ detokenizer , "-l" , $ testCase - > getLanguage ( ) ] : [ $ detokenizer ] ;
2011-08-05 18:32:39 +04:00
} , sub {
2011-08-05 22:48:37 +04:00
& verifyIdentical ( $ testCase - > getName ( ) , $ expectedFile , catfile ( $ testOutputDir , "stdout.txt" ) )
} , 1 , $ testCase - > getFailureExplanation ( ) ) ;
2011-08-05 18:32:39 +04:00
}
# $stdinFile, if defined, is a file to send to the command via STDIN
# $buildCommandRoutineReference is a reference to a zero-argument subroutine that returns the
2011-08-05 22:48:37 +04:00
# system command to run in the form of an array reference
2011-08-05 18:32:39 +04:00
# $validationRoutineReference is a reference to a zero-argument subroutine that makes some calls
# to ok() or similar to validate the contents of the output directory
# $separateStdoutFromStderr is an optional boolean argument; if omitted or false, the command's
# STDOUT and STDERR are mixed together in out output file called
# stdout-and-stderr.txt; otherwise, they are printed to separate output
# files called stdout.txt and stderr.txt, respectively
2011-08-05 22:48:37 +04:00
# $failureExplanation is an explanation of why the test is expected to fail. If the test is expected
# to pass, then this should be left undefined. Even in the case of a test that
# is expected to fail, the system command is still expected to exit normally --
# only the validation routine is expected to fail.
2011-08-05 18:32:39 +04:00
sub runTest {
2011-08-05 22:48:37 +04:00
my ( $ testName , $ outputDir , $ stdinFile , $ buildCommandRoutineReference , $ validationRoutineReference , $ separateStdoutFromStderr , $ failureExplanation ) = @ _ ;
2011-08-05 18:32:39 +04:00
# Note: You may need to upgrade your version of the Perl module Test::Simple in order to get this 'subtest' thing to work. (Perl modules are installed/upgraded using CPAN; google 'how do I upgrade a perl module')
subtest $ testName = > sub {
my ( $ stdoutFile , $ stderrFile ) ;
if ( $ separateStdoutFromStderr ) {
$ stdoutFile = catfile ( $ outputDir , "stdout.txt" ) ;
$ stderrFile = catfile ( $ outputDir , "stderr.txt" ) ;
} else {
$ stdoutFile = catfile ( $ outputDir , "stdout-and-stderr.txt" ) ;
$ stderrFile = $ stdoutFile ;
}
my $ commandRef = $ buildCommandRoutineReference - > ( ) ;
my $ exitStatus = & runVerbosely ( $ commandRef , $ stdinFile , $ stdoutFile , $ stderrFile ) ;
return unless is ( $ exitStatus , 0 , $ testName . ": command exited with status 0" ) ;
2011-08-05 22:48:37 +04:00
if ( defined $ failureExplanation ) {
TODO: {
local $ TODO = $ failureExplanation ;
$ validationRoutineReference - > ( ) ;
}
} else {
$ validationRoutineReference - > ( ) ;
}
2011-08-05 18:32:39 +04:00
} ;
}
# Announce that we're going to run the given command, then run it.
# $stdinFile, if defined, is a file to send to the command via STDIN
# $stdoutFile and $stderrFile, if defined, are file paths to which the command's standard output
# and standard error, respectively, are written. They can be the same file.
# The exit code of the command is returned.
sub runVerbosely {
my ( $ commandRef , $ stdinFile , $ stdoutFile , $ stderrFile ) = @ _ ;
my @ command = @ { $ commandRef } ;
note ( "Executing command:\n @command\n" ) ;
note ( "standard input coming from: " . $ stdinFile ) if defined $ stdinFile ;
note ( "standard output going to: " . $ stdoutFile ) if defined $ stdoutFile ;
note ( "standard error going to: " . $ stderrFile ) if defined $ stderrFile ;
run3 ( $ commandRef , $ stdinFile , $ stdoutFile , $ stderrFile ) ;
return $? ;
}
# Verify that the given output file is identical to the given reference file.
sub verifyIdentical {
my ( $ testName , $ referenceFile , $ outputFile ) = @ _ ;
open ( REF , $ referenceFile ) or return fail ( $ testName . ": Can't open reference file " . $ referenceFile . " [" . $! . "]." ) ;
open ( OUT , $ outputFile ) or return fail ( $ testName . ": Can't open output file " . $ outputFile . " [" . $! . "]." ) ;
my @ referenceFileAsArray = <REF> ;
my @ outputFileAsArray = <OUT> ;
close ( REF ) ;
close ( OUT ) ;
is_deeply ( \ @ outputFileAsArray , \ @ referenceFileAsArray , $ testName . ": Output file " . $ outputFile . " matches reference file " . $ referenceFile . "." ) ;
}
2011-08-05 22:48:37 +04:00
##%%%%%%%%%%%%%%%%%%%%%%%%%%%##
## DetokenizerTestCase class ##
package DetokenizerTestCase ;
# Constructor
sub new {
my $ class = shift ;
my $ self = {
_name = > shift ,
_language = > shift ,
_tokenizedText = > shift ,
_rightAnswer = > shift ,
_failureExplanation = > undef
} ;
bless $ self , $ class ;
}
sub getName {
my ( $ self ) = @ _ ;
return $ self - > { _name } ;
}
sub getLanguage {
my ( $ self ) = @ _ ;
return $ self - > { _language } ;
}
sub getTokenizedText {
my ( $ self ) = @ _ ;
return $ self - > { _tokenizedText } ;
}
sub getRightAnswer {
my ( $ self ) = @ _ ;
return $ self - > { _rightAnswer } ;
}
# Call this routine to indicate that this test case is expected to fail.
# (The detokenizer script is still expected to exit normally, but the output is not expected to
# match the right answer because of a bug or unimplemented use case.)
sub setExpectedToFail {
my ( $ self , $ failureExplanation ) = @ _ ;
$ self - > { _failureExplanation } = $ failureExplanation || "This test is expected to fail." ;
}
# Returns a string explaining why this test is expected to fail, or undef if this test is expected
# to pass.
sub getFailureExplanation {
my ( $ self ) = @ _ ;
return $ self - > { _failureExplanation } ;
}