uint -> size_t

Hieu Hoang 2011-12-12 19:13:32 +07:00
parent 9ec1bef6fb
commit 9861ecbbe5
191 changed files with 4496 additions and 4143 deletions
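The pattern applied throughout is mechanical: loop counters and index containers that used the non-standard uint typedef now use size_t, the unsigned type that the standard containers' size() actually returns. A minimal before/after sketch of the idea (illustrative only; sum_all is a made-up function, not code from this commit):

    #include <cstddef>
    #include <vector>

    // Before: uint is a platform typedef (e.g. from <sys/types.h>), not a
    // standard C++ type, and is typically 32 bits even on LP64 systems.
    //   for (uint i = 0; i < v.size(); i++) ...

    // After: size_t matches the return type of std::vector<T>::size(), so the
    // comparison needs no implicit conversion and cannot truncate large sizes.
    double sum_all(const std::vector<double> &v)
    {
      double sum = 0.0;
      for (size_t i = 0; i < v.size(); i++)
        sum += v[i];
      return sum;
    }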

View File

@@ -38,22 +38,21 @@
 typedef struct _cmd CMD;

-struct _cmd
-{
+struct _cmd {
     CMD * next;
     CMD * tail;    /* valid on in head */
     RULE * rule;   /* rule->actions contains shell script */
     LIST * shell;  /* $(SHELL) value */
     LOL args;      /* LISTs for $(<), $(>) */
     char * buf;    /* actual commands */
 };

 CMD * cmd_new
 (
     RULE * rule,     /* rule (referenced) */
     LIST * targets,  /* $(<) (freed) */
     LIST * sources,  /* $(>) (freed) */
     LIST * shell     /* $(SHELL) (freed) */
 );

 void cmd_free( CMD * );

View File

@@ -10,35 +10,33 @@
 #include <time.h>

-struct profile_info
-{
+struct profile_info {
     /* name of rule being called */
     char* name;
     /* cumulative time spent in rule */
     clock_t cumulative;
     /* time spent in rule proper */
     clock_t net;
     /* number of time rule was entered */
     unsigned long num_entries;
     /* number of the times this function is present in stack */
     unsigned long stack_count;
     /* bytes of memory allocated by the call */
     unsigned long memory;
 };
 typedef struct profile_info profile_info;

-struct profile_frame
-{
+struct profile_frame {
     /* permanent storage where data accumulates */
     profile_info* info;
     /* overhead for profiling in this call */
     clock_t overhead;
     /* time of last entry to rule */
     clock_t entry_time;
     /* stack frame of caller */
     struct profile_frame* caller;
     /* time spent in subrules */
     clock_t subrules;
 };
 typedef struct profile_frame profile_frame;

View File

@@ -18,22 +18,21 @@
 #include <time.h>

-typedef struct timing_info
-{
+typedef struct timing_info {
     double system;
     double user;
     time_t start;
     time_t end;
 } timing_info;

 void exec_cmd
 (
     char * string,
     void (* func)( void * closure, int status, timing_info *, char *, char * ),
     void * closure,
     LIST * shell,
     char * action,
     char * target
 );

 int exec_wait();

View File

@@ -33,14 +33,13 @@ int file_is_file(char* filename);
 int file_mkdir(char *pathname);

 typedef struct file_info_t file_info_t ;
-struct file_info_t
-{
+struct file_info_t {
     char * name;
     short is_file;
     short is_dir;
     unsigned long size;
     time_t time;
     LIST * files;
 };

View File

@@ -12,15 +12,14 @@
 typedef struct _PARSE PARSE;
 typedef struct frame FRAME;

-struct frame
-{
+struct frame {
     FRAME * prev;
     /* The nearest enclosing frame for which module->user_module is true. */
     FRAME * prev_user;
     LOL args[ 1 ];
     module_t * module;
     PARSE * procedure;
     char * rulename;
 };

View File

@@ -91,7 +91,7 @@
 #include <ctype.h>
 #include <malloc.h>
 #ifndef __MWERKS__
     #include <memory.h>
 #endif
 #include <signal.h>
 #include <string.h>
@@ -113,17 +113,17 @@
 /* AS400 cross-compile from NT. */
 #ifdef AS400
     #undef OSMINOR
     #undef OSMAJOR
     #define OSMAJOR "AS400=true"
     #define OSMINOR "OS=AS400"
     #define OS_AS400
 #endif

 /* Metrowerks Standard Library on Windows. */
 #ifdef __MSL__
     #undef HAVE_POPEN
 #endif

 # endif
@@ -182,7 +182,7 @@
 #define DOWNSHIFT_PATHS
 #ifdef __EMX__
     #define USE_FILEUNIX
 #endif
 #endif
@@ -218,181 +218,181 @@
 #define PATH_DELIM '/'

 #ifdef _AIX
     #define unix
     #define MAXLINE 23552 /* 24k - 1k, longest 'together' actions */
     #define OSMINOR "OS=AIX"
     #define OS_AIX
     #define NO_VFORK
 #endif
 #ifdef AMIGA
     #define OSMINOR "OS=AMIGA"
     #define OS_AMIGA
 #endif
 #ifdef __BEOS__
     #define unix
     #define OSMINOR "OS=BEOS"
     #define OS_BEOS
     #define NO_VFORK
 #endif
 #ifdef __bsdi__
     #define OSMINOR "OS=BSDI"
     #define OS_BSDI
 #endif
 #if defined (COHERENT) && defined (_I386)
     #define OSMINOR "OS=COHERENT"
     #define OS_COHERENT
     #define NO_VFORK
 #endif
 #if defined(__cygwin__) || defined(__CYGWIN__)
     #define OSMINOR "OS=CYGWIN"
     #define OS_CYGWIN
 #endif
 #if defined(__FreeBSD__) && !defined(__DragonFly__)
     #define OSMINOR "OS=FREEBSD"
     #define OS_FREEBSD
 #endif
 #ifdef __DragonFly__
     #define OSMINOR "OS=DRAGONFLYBSD"
     #define OS_DRAGONFLYBSD
 #endif
 #ifdef __DGUX__
     #define OSMINOR "OS=DGUX"
     #define OS_DGUX
 #endif
 #ifdef __hpux
     #define OSMINOR "OS=HPUX"
     #define OS_HPUX
 #endif
 #ifdef __OPENNT
     #define unix
     #define OSMINOR "OS=INTERIX"
     #define OS_INTERIX
     #define NO_VFORK
 #endif
 #ifdef __sgi
     #define OSMINOR "OS=IRIX"
     #define OS_IRIX
     #define NO_VFORK
 #endif
 #ifdef __ISC
     #define OSMINOR "OS=ISC"
     #define OS_ISC
     #define NO_VFORK
 #endif
 #ifdef linux
     #define OSMINOR "OS=LINUX"
     #define OS_LINUX
 #endif
 #ifdef __Lynx__
     #define OSMINOR "OS=LYNX"
     #define OS_LYNX
     #define NO_VFORK
     #define unix
 #endif
 #ifdef __MACHTEN__
     #define OSMINOR "OS=MACHTEN"
     #define OS_MACHTEN
 #endif
 #ifdef mpeix
     #define unix
     #define OSMINOR "OS=MPEIX"
     #define OS_MPEIX
     #define NO_VFORK
 #endif
 #ifdef __MVS__
     #define unix
     #define OSMINOR "OS=MVS"
     #define OS_MVS
 #endif
 #ifdef _ATT4
     #define OSMINOR "OS=NCR"
     #define OS_NCR
 #endif
 #ifdef __NetBSD__
     #define unix
     #define OSMINOR "OS=NETBSD"
     #define OS_NETBSD
     #define NO_VFORK
 #endif
 #ifdef __QNX__
     #define unix
     #ifdef __QNXNTO__
         #define OSMINOR "OS=QNXNTO"
         #define OS_QNXNTO
     #else
         #define OSMINOR "OS=QNX"
         #define OS_QNX
         #define NO_VFORK
         #define MAXLINE 996
     #endif
 #endif
 #ifdef NeXT
     #ifdef __APPLE__
         #define OSMINOR "OS=RHAPSODY"
         #define OS_RHAPSODY
     #else
         #define OSMINOR "OS=NEXT"
         #define OS_NEXT
     #endif
 #endif
 #ifdef __APPLE__
     #define unix
     #define OSMINOR "OS=MACOSX"
     #define OS_MACOSX
 #endif
 #ifdef __osf__
     #ifndef unix
         #define unix
     #endif
     #define OSMINOR "OS=OSF"
     #define OS_OSF
 #endif
 #ifdef _SEQUENT_
     #define OSMINOR "OS=PTX"
     #define OS_PTX
 #endif
 #ifdef M_XENIX
     #define OSMINOR "OS=SCO"
     #define OS_SCO
     #define NO_VFORK
 #endif
 #ifdef sinix
     #define unix
     #define OSMINOR "OS=SINIX"
     #define OS_SINIX
 #endif
 #ifdef sun
     #if defined(__svr4__) || defined(__SVR4)
         #define OSMINOR "OS=SOLARIS"
         #define OS_SOLARIS
     #else
         #define OSMINOR "OS=SUNOS"
         #define OS_SUNOS
     #endif
 #endif
 #ifdef ultrix
     #define OSMINOR "OS=ULTRIX"
     #define OS_ULTRIX
 #endif
 #ifdef _UNICOS
     #define OSMINOR "OS=UNICOS"
     #define OS_UNICOS
 #endif
 #if defined(__USLC__) && !defined(M_XENIX)
     #define OSMINOR "OS=UNIXWARE"
     #define OS_UNIXWARE
 #endif
 #ifdef __OpenBSD__
     #define OSMINOR "OS=OPENBSD"
     #define OS_OPENBSD
     #define unix
 #endif
 #if defined (__FreeBSD_kernel__) && !defined(__FreeBSD__)
     #define OSMINOR "OS=KFREEBSD"
     #define OS_KFREEBSD
 #endif
 #ifndef OSMINOR
     #define OSMINOR "OS=UNKNOWN"
 #endif
 /* All the UNIX includes */
@@ -401,7 +401,7 @@
 #include <sys/stat.h>
 #ifndef OS_MPEIX
     #include <sys/file.h>
 #endif
 #include <fcntl.h>
@@ -413,11 +413,11 @@
 #include <unistd.h>
 #ifndef OS_QNX
     #include <memory.h>
 #endif
 #ifndef OS_ULTRIX
     #include <stdlib.h>
 #endif
 #if !defined( OS_BSDI ) && \
@@ -429,7 +429,7 @@
     !defined( OS_RHAPSODY ) && \
     !defined( OS_MVS ) && \
     !defined( OS_OPENBSD )
     #include <malloc.h>
 #endif
 #endif
@@ -443,57 +443,57 @@
     defined( ppc ) || \
     defined( __powerpc__ ) || \
     defined( __ppc__ )
     #define OSPLAT "OSPLAT=PPC"
 #endif
 #if defined( _ALPHA_ ) || \
     defined( __alpha__ )
     #define OSPLAT "OSPLAT=AXP"
 #endif
 #if defined( _i386_ ) || \
     defined( __i386__ ) || \
     defined( __i386 ) || \
     defined( _M_IX86 )
     #define OSPLAT "OSPLAT=X86"
 #endif
 #if defined( __ia64__ ) || \
     defined( __IA64__ ) || \
     defined( __ia64 )
     #define OSPLAT "OSPLAT=IA64"
 #endif
 #if defined( __x86_64__ ) || \
     defined( __amd64__ ) || \
     defined( _M_AMD64 )
     #define OSPLAT "OSPLAT=X86_64"
 #endif
 #if defined( __sparc__ ) || \
     defined( __sparc )
     #define OSPLAT "OSPLAT=SPARC"
 #endif
 #ifdef __mips__
     #define OSPLAT "OSPLAT=MIPS"
 #endif
 #ifdef __arm__
     #define OSPLAT "OSPLAT=ARM"
 #endif
 #ifdef __s390__
     #define OSPLAT "OSPLAT=390"
 #endif
 #ifdef __hppa
     #define OSPLAT "OSPLAT=PARISC"
 #endif
 #ifndef OSPLAT
     #define OSPLAT ""
 #endif

 /*
@@ -501,16 +501,16 @@
  */
 #ifndef MAXLINE
     #define MAXLINE 102400 /* longest 'together' actions' */
 #endif
 #ifndef EXITOK
     #define EXITOK 0
     #define EXITBAD 1
 #endif
 #ifndef SPLITPATH
     #define SPLITPATH ':'
 #endif

 /* You probably do not need to muck with these. */
@@ -526,19 +526,18 @@
 #define DEBUG_MAX 14

-struct globs
-{
+struct globs {
     int noexec;
     int jobs;
     int quitquick;
     int newestfirst;         /* build newest sources first */
     int pipe_action;
     char debug[ DEBUG_MAX ];
     FILE * cmdout;           /* print cmds, not run them */
     long timeout;            /* number of seconds to limit actions to,
                               * default 0 for no limit.
                               */
     int dart;                /* output build and test results formatted for Dart */
 };

 extern struct globs globs;

View File

@@ -26,56 +26,56 @@
 /* Tokens. */
 #ifndef YYTOKENTYPE
 # define YYTOKENTYPE
    /* Put the tokens into the symbol table, so that GDB and other debuggers
       know about them. */
    enum yytokentype {
      _BANG_t = 258,
      _BANG_EQUALS_t = 259,
      _AMPER_t = 260,
      _AMPERAMPER_t = 261,
      _LPAREN_t = 262,
      _RPAREN_t = 263,
      _PLUS_EQUALS_t = 264,
      _COLON_t = 265,
      _SEMIC_t = 266,
      _LANGLE_t = 267,
      _LANGLE_EQUALS_t = 268,
      _EQUALS_t = 269,
      _RANGLE_t = 270,
      _RANGLE_EQUALS_t = 271,
      _QUESTION_EQUALS_t = 272,
      _LBRACKET_t = 273,
      _RBRACKET_t = 274,
      ACTIONS_t = 275,
      BIND_t = 276,
      CASE_t = 277,
      CLASS_t = 278,
      DEFAULT_t = 279,
      ELSE_t = 280,
      EXISTING_t = 281,
      FOR_t = 282,
      IF_t = 283,
      IGNORE_t = 284,
      IN_t = 285,
      INCLUDE_t = 286,
      LOCAL_t = 287,
      MODULE_t = 288,
      ON_t = 289,
      PIECEMEAL_t = 290,
      QUIETLY_t = 291,
      RETURN_t = 292,
      RULE_t = 293,
      SWITCH_t = 294,
      TOGETHER_t = 295,
      UPDATED_t = 296,
      WHILE_t = 297,
      _LBRACE_t = 298,
      _BAR_t = 299,
      _BARBAR_t = 300,
      _RBRACE_t = 301,
      ARG = 302,
      STRING = 303
    };
 #endif

 #define _BANG_t 258
 #define _BANG_EQUALS_t 259

View File

@@ -1,44 +1,44 @@
 { "!", _BANG_t },
 { "!=", _BANG_EQUALS_t },
 { "&", _AMPER_t },
 { "&&", _AMPERAMPER_t },
 { "(", _LPAREN_t },
 { ")", _RPAREN_t },
 { "+=", _PLUS_EQUALS_t },
 { ":", _COLON_t },
 { ";", _SEMIC_t },
 { "<", _LANGLE_t },
 { "<=", _LANGLE_EQUALS_t },
 { "=", _EQUALS_t },
 { ">", _RANGLE_t },
 { ">=", _RANGLE_EQUALS_t },
 { "?=", _QUESTION_EQUALS_t },
 { "[", _LBRACKET_t },
 { "]", _RBRACKET_t },
 { "actions", ACTIONS_t },
 { "bind", BIND_t },
 { "case", CASE_t },
 { "class", CLASS_t },
 { "default", DEFAULT_t },
 { "else", ELSE_t },
 { "existing", EXISTING_t },
 { "for", FOR_t },
 { "if", IF_t },
 { "ignore", IGNORE_t },
 { "in", IN_t },
 { "include", INCLUDE_t },
 { "local", LOCAL_t },
 { "module", MODULE_t },
 { "on", ON_t },
 { "piecemeal", PIECEMEAL_t },
 { "quietly", QUIETLY_t },
 { "return", RETURN_t },
 { "rule", RULE_t },
 { "switch", SWITCH_t },
 { "together", TOGETHER_t },
 { "updated", UPDATED_t },
 { "while", WHILE_t },
 { "{", _LBRACE_t },
 { "|", _BAR_t },
 { "||", _BARBAR_t },
 { "}", _RBRACE_t },

View File

@@ -56,9 +56,9 @@
 typedef struct _list LIST;

 struct _list {
     LIST *next;
     LIST *tail;   /* only valid in head node */
     char *string; /* private copy */
 };

 /*
@@ -70,8 +70,8 @@ typedef struct _lol LOL;
 # define LOL_MAX 19
 struct _lol {
     int count;
     LIST *list[ LOL_MAX ];
 };

 LIST * list_append( LIST *l, LIST *nl );

View File

@@ -14,12 +14,12 @@ int make( int n_targets, const char **targets, int anyhow );
 int make1( TARGET *t );

 typedef struct {
     int temp;
     int updating;
     int cantfind;
     int cantmake;
     int targets;
     int made;
 } COUNTS ;

View File

@@ -65,24 +65,24 @@ typedef unsigned int md5_word_t; /* 32-bit word */
 /* Define the state of the MD5 Algorithm. */
 typedef struct md5_state_s {
     md5_word_t count[2]; /* message length in bits, lsw first */
     md5_word_t abcd[4];  /* digest buffer */
     md5_byte_t buf[64];  /* accumulate block */
 } md5_state_t;

 #ifdef __cplusplus
 extern "C"
 {
 #endif

 /* Initialize the algorithm. */
 void md5_init(md5_state_t *pms);

 /* Append a string to the message. */
 void md5_append(md5_state_t *pms, const md5_byte_t *data, int nbytes);

 /* Finish the message and return the digest. */
 void md5_finish(md5_state_t *pms, md5_byte_t digest[16]);

 #ifdef __cplusplus
 }  /* end extern "C" */
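
The three functions above form the usual one-shot digest sequence: init, append data (possibly in several chunks), finish. A minimal usage sketch under the declarations shown here (the helper name print_md5 is illustrative, not part of the header):

    #include <stdio.h>
    #include <string.h>
    #include "md5.h"

    /* Hash a NUL-terminated string and print the digest as 32 hex digits. */
    void print_md5(const char * text)
    {
        md5_state_t state;
        md5_byte_t digest[16];
        int i;

        md5_init(&state);           /* reset the running state */
        md5_append(&state, (const md5_byte_t *)text, (int)strlen(text));
        md5_finish(&state, digest); /* pad, finalize, emit 16 bytes */

        for (i = 0; i < 16; ++i)
            printf("%02x", digest[i]);
        printf("\n");
    }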

View File

@@ -11,122 +11,122 @@ http://www.boost.org/LICENSE_1_0.txt)
 #ifdef OPT_BOEHM_GC

     /* Use Boehm GC memory allocator. */
     #include <gc.h>

     #define bjam_malloc_x(s)          memset(GC_malloc(s),0,s)
     #define bjam_malloc_atomic_x(s)   memset(GC_malloc_atomic(s),0,s)
     #define bjam_calloc_x(n,s)        memset(GC_malloc((n)*(s)),0,(n)*(s))
     #define bjam_calloc_atomic_x(n,s) memset(GC_malloc_atomic((n)*(s)),0,(n)*(s))
     #define bjam_realloc_x(p,s)       GC_realloc(p,s)
     #define bjam_free_x(p)            GC_free(p)
     #define bjam_mem_init_x()         GC_init(); GC_enable_incremental()

     #define bjam_malloc_raw_x(s)      malloc(s)
     #define bjam_calloc_raw_x(n,s)    calloc(n,s)
     #define bjam_realloc_raw_x(p,s)   realloc(p,s)
     #define bjam_free_raw_x(p)        free(p)

     #ifndef BJAM_NEWSTR_NO_ALLOCATE
         #define BJAM_NEWSTR_NO_ALLOCATE
     #endif

 #elif defined(OPT_DUMA)

     /* Use Duma memory debugging library. */
     #include <stdlib.h>

     #define _DUMA_CONFIG_H_
     #define DUMA_NO_GLOBAL_MALLOC_FREE
     #define DUMA_EXPLICIT_INIT
     #define DUMA_NO_THREAD_SAFETY
     #define DUMA_NO_CPP_SUPPORT
     /* #define DUMA_NO_LEAKDETECTION */
     /* #define DUMA_USE_FRAMENO */
     /* #define DUMA_PREFER_ATEXIT */
     /* #define DUMA_OLD_DEL_MACRO */
     /* #define DUMA_NO_HANG_MSG */
     #define DUMA_PAGE_SIZE 4096
     #define DUMA_MIN_ALIGNMENT 1
     /* #define DUMA_GNU_INIT_ATTR 0 */
     typedef unsigned int DUMA_ADDR;
     typedef unsigned int DUMA_SIZE;
     #include <duma.h>

     #define bjam_malloc_x(s)          malloc(s)
     #define bjam_calloc_x(n,s)        calloc(n,s)
     #define bjam_realloc_x(p,s)       realloc(p,s)
     #define bjam_free_x(p)            free(p)

     #ifndef BJAM_NEWSTR_NO_ALLOCATE
         #define BJAM_NEWSTR_NO_ALLOCATE
     #endif

 #else

     /* Standard C memory allocation. */
     #define bjam_malloc_x(s)          malloc(s)
     #define bjam_calloc_x(n,s)        calloc(n,s)
     #define bjam_realloc_x(p,s)       realloc(p,s)
     #define bjam_free_x(p)            free(p)

 #endif

 #ifndef bjam_malloc_atomic_x
     #define bjam_malloc_atomic_x(s)   bjam_malloc_x(s)
 #endif
 #ifndef bjam_calloc_atomic_x
     #define bjam_calloc_atomic_x(n,s) bjam_calloc_x(n,s)
 #endif
 #ifndef bjam_mem_init_x
     #define bjam_mem_init_x()
 #endif
 #ifndef bjam_mem_close_x
     #define bjam_mem_close_x()
 #endif
 #ifndef bjam_malloc_raw_x
     #define bjam_malloc_raw_x(s)      bjam_malloc_x(s)
 #endif
 #ifndef bjam_calloc_raw_x
     #define bjam_calloc_raw_x(n,s)    bjam_calloc_x(n,s)
 #endif
 #ifndef bjam_realloc_raw_x
     #define bjam_realloc_raw_x(p,s)   bjam_realloc_x(p,s)
 #endif
 #ifndef bjam_free_raw_x
     #define bjam_free_raw_x(p)        bjam_free_x(p)
 #endif

 #ifdef OPT_DEBUG_PROFILE

     /* Profile tracing of memory allocations. */
     #define BJAM_MALLOC(s)            (profile_memory(s), bjam_malloc_x(s))
     #define BJAM_MALLOC_ATOMIC(s)     (profile_memory(s), bjam_malloc_atomic_x(s))
     #define BJAM_CALLOC(n,s)          (profile_memory(n*s), bjam_calloc_x(n,s))
     #define BJAM_CALLOC_ATOMIC(n,s)   (profile_memory(n*s), bjam_calloc_atomic_x(n,s))
     #define BJAM_REALLOC(p,s)         (profile_memory(s), bjam_realloc_x(p,s))
     #define BJAM_FREE(p)              bjam_free_x(p)
     #define BJAM_MEM_INIT()           bjam_mem_init_x()
     #define BJAM_MEM_CLOSE()          bjam_mem_close_x()

     #define BJAM_MALLOC_RAW(s)        (profile_memory(s), bjam_malloc_raw_x(s))
     #define BJAM_CALLOC_RAW(n,s)      (profile_memory(n*s), bjam_calloc_raw_x(n,s))
     #define BJAM_REALLOC_RAW(p,s)     (profile_memory(s), bjam_realloc_raw_x(p,s))
     #define BJAM_FREE_RAW(p)          bjam_free_raw_x(p)

 #else

     /* No mem tracing. */
     #define BJAM_MALLOC(s)            bjam_malloc_x(s)
     #define BJAM_MALLOC_ATOMIC(s)     bjam_malloc_atomic_x(s)
     #define BJAM_CALLOC(n,s)          bjam_calloc_x(n,s)
     #define BJAM_CALLOC_ATOMIC(n,s)   bjam_calloc_atomic_x(n,s)
     #define BJAM_REALLOC(p,s)         bjam_realloc_x(p,s)
     #define BJAM_FREE(p)              bjam_free_x(p)
     #define BJAM_MEM_INIT()           bjam_mem_init_x()
     #define BJAM_MEM_CLOSE()          bjam_mem_close_x()

     #define BJAM_MALLOC_RAW(s)        bjam_malloc_raw_x(s)
     #define BJAM_CALLOC_RAW(n,s)      bjam_calloc_raw_x(n,s)
     #define BJAM_REALLOC_RAW(p,s)     bjam_realloc_raw_x(p,s)
     #define BJAM_FREE_RAW(p)          bjam_free_raw_x(p)

 #endif
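
One detail worth noting in the OPT_DEBUG_PROFILE block above: the wrappers use the comma operator so that profile_memory() runs for its side effect while the macro as a whole still yields the allocator's return value, letting BJAM_MALLOC(s) drop in anywhere malloc(s) would appear. A tiny standalone illustration of the same trick (the names count_alloc and TRACKED_MALLOC are made up, not bjam code):

    #include <cstdio>
    #include <cstdlib>

    static std::size_t g_total = 0;

    // Side effect only: record how many bytes were requested.
    static void count_alloc(std::size_t s) { g_total += s; }

    // Comma operator: evaluate count_alloc(s), discard its result,
    // then yield std::malloc(s) as the value of the whole expression.
    #define TRACKED_MALLOC(s) (count_alloc(s), std::malloc(s))

    int main()
    {
        void * p = TRACKED_MALLOC(128);
        std::printf("requested so far: %zu bytes\n", g_total);
        std::free(p);
        return 0;
    }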

View File

@@ -8,15 +8,14 @@
 #include "lists.h"

-struct module_t
-{
+struct module_t {
     char* name;
     struct hash* rules;
     struct hash* variables;
     struct hash* imported_modules;
     struct module_t* class_module;
     struct hash* native_rules;
     int user_module;
 };

 typedef struct module_t module_t ; /* MSVC debugger gets confused unless this is provided */

View File

@@ -7,20 +7,19 @@
 #include "rules.h"

-struct native_rule_t
-{
+struct native_rule_t {
     char* name;
     argument_list* arguments;
     PARSE* procedure;
     /* Version of the interface that the native rule provides.
        It's possible that we want to change the set parameter
        for existing native rule. In that case, version number
        should be incremented so that Boost.Build can check for
        version it relies on.

        Versions are numbered from 1.
     */
     int version;
 };

 /* MSVC debugger gets confused unless this is provided */

View File

@@ -11,10 +11,9 @@
  * \ -) "Command line option."
  */

-typedef struct bjam_option
-{
+typedef struct bjam_option {
     char flag;  /* filled in by getoption() */
     char *val;  /* set to random address if true */
 } bjam_option;

 # define N_OPTS 256

View File

@@ -14,13 +14,13 @@
 #define EXIT_TIMEOUT 2

 void out_action(
     const char * action,
     const char * target,
     const char * command,
     const char * out_data,
     const char * err_data,
     int exit_reason
 );

 char * outf_int( int value );
 char * outf_double( double value );

View File

@@ -26,31 +26,31 @@
  */

 struct _PARSE {
     LIST * (* func)( PARSE *, FRAME * );
     PARSE * left;
     PARSE * right;
     PARSE * third;
     char * string;
     char * string1;
     int num;
     int refs;
     /* module * module; */
     char * rulename;
     char * file;
     int line;
 };

 void parse_file( char *, FRAME * );
 void parse_save( PARSE * );

 PARSE * parse_make(
     LIST * (* func)( PARSE *, FRAME * ),
     PARSE * left,
     PARSE * right,
     PARSE * third,
     char * string,
     char * string1,
     int num );

 void parse_refer ( PARSE * );
 void parse_free ( PARSE * );

View File

@@ -28,17 +28,15 @@
 typedef struct _pathname PATHNAME;
 typedef struct _pathpart PATHPART;

-struct _pathpart
-{
+struct _pathpart {
     char * ptr;
     int len;
 };

-struct _pathname
-{
+struct _pathname {
     PATHPART part[6];
 #ifdef OS_VMS
     int parent;
 #endif

 #define f_grist part[0]

View File

@@ -9,13 +9,13 @@
 #define NSUBEXP 10
 typedef struct regexp {
     char *startp[NSUBEXP];
     char *endp[NSUBEXP];
     char regstart;   /* Internal use only. */
     char reganch;    /* Internal use only. */
     char *regmust;   /* Internal use only. */
     int regmlen;     /* Internal use only. */
     char program[1]; /* Unwarranted chumminess with compiler. */
 } regexp;

 regexp *regcomp( char *exp );

View File

@@ -53,19 +53,17 @@ typedef struct _settings SETTINGS ;
 /* RULE - a generic jam rule, the product of RULE and ACTIONS. */

 /* A rule's argument list. */
-struct argument_list
-{
+struct argument_list {
     int reference_count;
     LOL data[1];
 };

 /* Build actions corresponding to a rule. */
-struct rule_actions
-{
+struct rule_actions {
     int reference_count;
     char * command;  /* command string from ACTIONS */
     LIST * bindlist;
     int flags;       /* modifiers on ACTIONS */

 #define RULE_NEWSRCS 0x01 /* $(>) is updated sources only */
 #define RULE_TOGETHER 0x02 /* combine actions on single target */
@@ -78,67 +76,61 @@ struct rule_actions
 typedef struct rule_actions rule_actions;
 typedef struct argument_list argument_list;

-struct _rule
-{
+struct _rule {
     char * name;
     PARSE * procedure;         /* parse tree from RULE */
     argument_list * arguments; /* argument checking info, or NULL for unchecked
                                 */
     rule_actions * actions;    /* build actions, or NULL for no actions */
     module_t * module;         /* module in which this rule is executed */
     int exported;              /* nonzero if this rule is supposed to appear in
                                 * the global module and be automatically
                                 * imported into other modules
                                 */
 #ifdef HAVE_PYTHON
     PyObject * python_function;
 #endif
 };

 /* ACTIONS - a chain of ACTIONs. */
-struct _actions
-{
+struct _actions {
     ACTIONS * next;
     ACTIONS * tail; /* valid only for head */
     ACTION * action;
 };

 /* ACTION - a RULE instance with targets and sources. */
-struct _action
-{
+struct _action {
     RULE * rule;
     TARGETS * targets;
     TARGETS * sources; /* aka $(>) */
     char running;      /* has been started */
     char status;       /* see TARGET status */
 };

 /* SETTINGS - variables to set when executing a TARGET's ACTIONS. */
-struct _settings
-{
+struct _settings {
     SETTINGS * next;
     char * symbol; /* symbol name for var_set() */
     LIST * value;  /* symbol value for var_set() */
     int multiple;
 };

 /* TARGETS - a chain of TARGETs. */
-struct _targets
-{
+struct _targets {
     TARGETS * next;
     TARGETS * tail; /* valid only for head */
     TARGET * target;
 };

 /* TARGET - an entity (e.g. a file) that can be built. */
-struct _target
-{
+struct _target {
     char * name;
     char * boundname;    /* if search() relocates target */
     ACTIONS * actions;   /* rules to execute, if any */
     SETTINGS * settings; /* variables to define */

     short flags;         /* status info */

 #define T_FLAG_TEMP 0x0001 /* TEMPORARY applied */
 #define T_FLAG_NOCARE 0x0002 /* NOCARE applied */
@@ -148,28 +140,28 @@ struct _target
 #define T_FLAG_NOUPDATE 0x0020 /* NOUPDATE applied */
 #define T_FLAG_VISITED 0x0040  /* CWM: Used in debugging */

 /* This flag has been added to support a new built-in rule named "RMBAD". It is
  * used to force removal of outdated targets whose dependencies fail to build.
  */
 #define T_FLAG_RMOLD 0x0080 /* RMBAD applied */

 /* This flag was added to support a new built-in rule named "FAIL_EXPECTED" used
  * to indicate that the result of running a given action should be inverted,
  * i.e. ok <=> fail. This is useful for launching certain test runs from a
  * Jamfile.
  */
 #define T_FLAG_FAIL_EXPECTED 0x0100 /* FAIL_EXPECTED applied */

 #define T_FLAG_INTERNAL 0x0200 /* internal INCLUDES node */

 /* Indicates that the target must be a file. This prevents matching non-files,
  * like directories, when a target is searched.
  */
 #define T_FLAG_ISFILE 0x0400

 #define T_FLAG_PRECIOUS 0x0800

     char binding; /* how target relates to a real file or
                    * folder
                    */
@@ -178,32 +170,32 @@ struct _target
 #define T_BIND_PARENTS 2 /* using parent's timestamp */
 #define T_BIND_EXISTS 3  /* real file, timestamp valid */

     TARGETS * depends;    /* dependencies */
     TARGETS * dependants; /* the inverse of dependencies */
     TARGETS * rebuilds;   /* targets that should be force-rebuilt
                            * whenever this one is
                            */
     TARGET * includes;        /* internal includes node */
     TARGET * original_target; /* original_target->includes = this */
     char rescanned;

     time_t time; /* update time */
     time_t leaf; /* update time of leaf sources */

     char fate; /* make0()'s diagnosis */

 #define T_FATE_INIT 0     /* nothing done to target */
 #define T_FATE_MAKING 1   /* make0(target) on stack */
 #define T_FATE_STABLE 2   /* target did not need updating */
 #define T_FATE_NEWER 3    /* target newer than parent */
 #define T_FATE_SPOIL 4    /* >= SPOIL rebuilds parents */
 #define T_FATE_ISTMP 4    /* unneeded temp target oddly present */
 #define T_FATE_BUILD 5    /* >= BUILD rebuilds target */
 #define T_FATE_TOUCHED 5  /* manually touched with -t */
 #define T_FATE_REBUILD 6
 #define T_FATE_MISSING 7  /* is missing, needs updating */
 #define T_FATE_NEEDTMP 8  /* missing temp that must be rebuild */
 #define T_FATE_OUTDATED 9 /* is out of date, needs updating */
@@ -213,7 +205,7 @@ struct _target
 #define T_FATE_CANTFIND 11 /* no rules to make missing target */
 #define T_FATE_CANTMAKE 12 /* can not find dependencies */

     char progress; /* tracks make1() progress */

 #define T_MAKE_INIT 0    /* make1(target) not yet called */
 #define T_MAKE_ONSTACK 1 /* make1(target) on stack */
@@ -222,20 +214,20 @@ struct _target
 #define T_MAKE_DONE 4 /* make1(target) done */

 #ifdef OPT_SEMAPHORE
     #define T_MAKE_SEMAPHORE 5 /* Special target type for semaphores */
 #endif

 #ifdef OPT_SEMAPHORE
     TARGET * semaphore; /* used in serialization */
 #endif

     char status;       /* exec_cmd() result */
     int asynccnt;      /* child deps outstanding */
     TARGETS * parents; /* used by make1() for completion */
     char * cmds;       /* type-punned command list */
     char * failed;
 };

View File

@@ -29,15 +29,14 @@
 #define YYSTYPE YYSYMBOL

-typedef struct _YYSTYPE
-{
+typedef struct _YYSTYPE {
     int type;
     char * string;
     PARSE * parse;
     LIST * list;
     int number;
     char * file;
     int line;
 } YYSTYPE;

 extern YYSTYPE yylval;

View File

@@ -7,14 +7,13 @@
 # include <stddef.h>

-typedef struct string
-{
+typedef struct string {
     char* value;
     unsigned long size;
     unsigned long capacity;
     char opt[32];
 #ifndef NDEBUG
     char magic[4];
 #endif
 } string;

View File

@@ -50,10 +50,10 @@ Data::~Data() {
 //ADDED BY TS
 void Data::remove_duplicates() {
-  uint nSentences = featdata->size();
+  size_t nSentences = featdata->size();
   assert(scoredata->size() == nSentences);

-  for (uint s=0; s < nSentences; s++) {
+  for (size_t s=0; s < nSentences; s++) {
     FeatureArray& feat_array = featdata->get(s);
     ScoreArray& score_array = scoredata->get(s);
@@ -61,29 +61,29 @@ void Data::remove_duplicates() {
     assert(feat_array.size() == score_array.size());

     //serves as a hash-map:
-    std::map<double, std::vector<uint> > lookup;
+    std::map<double, std::vector<size_t> > lookup;
-    uint end_pos = feat_array.size() - 1;
+    size_t end_pos = feat_array.size() - 1;
-    uint nRemoved = 0;
+    size_t nRemoved = 0;
-    for (uint k=0; k <= end_pos; k++) {
+    for (size_t k=0; k <= end_pos; k++) {
       const FeatureStats& cur_feats = feat_array.get(k);
       double sum = 0.0;
-      for (uint l=0; l < cur_feats.size(); l++)
+      for (size_t l=0; l < cur_feats.size(); l++)
         sum += cur_feats.get(l);

       if (lookup.find(sum) != lookup.end()) {
         //std::cerr << "hit" << std::endl;
-        std::vector<uint>& cur_list = lookup[sum];
+        std::vector<size_t>& cur_list = lookup[sum];
-        uint l=0;
+        size_t l=0;
         for (l=0; l < cur_list.size(); l++) {
-          uint j=cur_list[l];
+          size_t j=cur_list[l];
           if (cur_feats == feat_array.get(j)
               && score_array.get(k) == score_array.get(j)) {
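
Beyond the type change, the loop above buckets candidates by the sum of their feature values (a cheap checksum) and only runs the exact equality test against earlier entries in the same bucket. A compact standalone sketch of the same dedup-by-checksum idea (count_unique is hypothetical, not Moses code):

    #include <cstddef>
    #include <map>
    #include <numeric>
    #include <vector>

    // Count distinct rows, using the row sum as a bucket key so the
    // expensive element-wise comparison runs only on likely duplicates.
    std::size_t count_unique(const std::vector<std::vector<double> > &rows)
    {
      std::map<double, std::vector<std::size_t> > lookup;
      std::size_t nUnique = 0;
      for (std::size_t k = 0; k < rows.size(); ++k) {
        double sum = std::accumulate(rows[k].begin(), rows[k].end(), 0.0);
        std::vector<std::size_t> &bucket = lookup[sum];
        bool duplicate = false;
        for (std::size_t l = 0; l < bucket.size(); ++l) {
          if (rows[bucket[l]] == rows[k]) {
            duplicate = true;
            break;
          }
        }
        if (!duplicate) {
          bucket.push_back(k);
          ++nUnique;
        }
      }
      return nUnique;
    }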

View File

@@ -129,7 +129,8 @@ IOWrapper::~IOWrapper()
   delete m_singleBestOutputCollector;
 }

-void IOWrapper::ResetTranslationId() {
+void IOWrapper::ResetTranslationId()
+{
   m_translationId = StaticData::Instance().GetStartTranslationId();
 }
@@ -369,18 +370,18 @@ void IOWrapper::OutputNBestList(const ChartTrellisPathList &nBestList, const Cha
     if (pds.size() > 0) {
       for( size_t i=0; i<pds.size(); i++ ) {
         size_t pd_numinputscore = pds[i]->GetNumInputScores();
         vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( pds[i] );
-        for (size_t j = 0; j<scores.size(); ++j){
-          if (labeledOutput && (i == 0) ){
-            if ((j == 0) || (j == pd_numinputscore)){
+        for (size_t j = 0; j<scores.size(); ++j) {
+          if (labeledOutput && (i == 0) ) {
+            if ((j == 0) || (j == pd_numinputscore)) {
               lastName = pds[i]->GetScoreProducerWeightShortName(j);
               out << " " << lastName << ":";
             }
           }
           out << " " << scores[j];
         }
       }
     }
@@ -394,18 +395,18 @@ void IOWrapper::OutputNBestList(const ChartTrellisPathList &nBestList, const Cha
     if (gds.size() > 0) {
       for( size_t i=0; i<gds.size(); i++ ) {
         size_t pd_numinputscore = gds[i]->GetNumInputScores();
         vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( gds[i] );
-        for (size_t j = 0; j<scores.size(); ++j){
-          if (labeledOutput && (i == 0) ){
-            if ((j == 0) || (j == pd_numinputscore)){
+        for (size_t j = 0; j<scores.size(); ++j) {
+          if (labeledOutput && (i == 0) ) {
+            if ((j == 0) || (j == pd_numinputscore)) {
               lastName = gds[i]->GetScoreProducerWeightShortName(j);
               out << " " << lastName << ":";
             }
           }
           out << " " << scores[j];
         }
       }
     }

View File

@@ -210,13 +210,13 @@ void OutputAlignment(ostream &out, const AlignmentInfo &ai, size_t sourceOffset,
 {
   typedef std::vector< const std::pair<size_t,size_t>* > AlignVec;
   AlignVec alignments = ai.GetSortedAlignments();

   AlignVec::const_iterator it;
   for (it = alignments.begin(); it != alignments.end(); ++it) {
     const std::pair<size_t,size_t> &alignment = **it;
     out << alignment.first + sourceOffset << "-" << alignment.second + targetOffset << " ";
   }
 }

 void OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges)
@@ -227,7 +227,7 @@ void OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges)
     const Hypothesis &edge = *edges[currEdge];
     const TargetPhrase &tp = edge.GetCurrTargetPhrase();
     size_t sourceOffset = edge.GetCurrSourceWordsRange().GetStartPos();

     OutputAlignment(out, tp.GetAlignmentInfo(), sourceOffset, targetOffset);

     targetOffset += tp.GetSize();
@@ -239,7 +239,7 @@ void OutputAlignment(OutputCollector* collector, size_t lineNo , const vector<co
 {
   ostringstream out;
   OutputAlignment(out, edges);

   collector->Write(lineNo,out.str());
 }
@@ -412,18 +412,18 @@ void OutputNBest(std::ostream& out, const Moses::TrellisPathList &nBestList, con
     if (pds.size() > 0) {
       for( size_t i=0; i<pds.size(); i++ ) {
         size_t pd_numinputscore = pds[i]->GetNumInputScores();
         vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( pds[i] );
-        for (size_t j = 0; j<scores.size(); ++j){
-          if (labeledOutput && (i == 0) ){
-            if ((j == 0) || (j == pd_numinputscore)){
+        for (size_t j = 0; j<scores.size(); ++j) {
+          if (labeledOutput && (i == 0) ) {
+            if ((j == 0) || (j == pd_numinputscore)) {
               lastName = pds[i]->GetScoreProducerWeightShortName(j);
               out << " " << lastName << ":";
             }
           }
           out << " " << scores[j];
         }
       }
     }
@@ -432,18 +432,18 @@ void OutputNBest(std::ostream& out, const Moses::TrellisPathList &nBestList, con
     if (gds.size() > 0) {
       for( size_t i=0; i<gds.size(); i++ ) {
         size_t pd_numinputscore = gds[i]->GetNumInputScores();
         vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( gds[i] );
-        for (size_t j = 0; j<scores.size(); ++j){
-          if (labeledOutput && (i == 0) ){
-            if ((j == 0) || (j == pd_numinputscore)){
+        for (size_t j = 0; j<scores.size(); ++j) {
+          if (labeledOutput && (i == 0) ) {
+            if ((j == 0) || (j == pd_numinputscore)) {
               lastName = gds[i]->GetScoreProducerWeightShortName(j);
               out << " " << lastName << ":";
             }
           }
           out << " " << scores[j];
         }
       }
     }
@@ -477,7 +477,7 @@ void OutputNBest(std::ostream& out, const Moses::TrellisPathList &nBestList, con
       const int sourceOffset = sourceRange.GetStartPos();
       const int targetOffset = targetRange.GetStartPos();
       const AlignmentInfo &ai = edge.GetCurrTargetPhrase().GetAlignmentInfo();

       OutputAlignment(out, ai, sourceOffset, targetOffset);
     }

View File

@@ -83,7 +83,7 @@ public:
     m_detailedTranslationCollector(detailedTranslationCollector),
     m_alignmentInfoCollector(alignmentInfoCollector) {}

   /** Translate one sentence
    * gets called by main function implemented at end of this source file */
   void Run() {
@@ -130,7 +130,7 @@ public:
         manager.SerializeSearchGraphPB(m_lineNumber, output);
       }
 #endif
     }

     // apply decision rule and output best translation(s)
     if (m_outputCollector) {
@@ -145,8 +145,7 @@ public:
       // MAP decoding: best hypothesis
       const Hypothesis* bestHypo = NULL;
-      if (!staticData.UseMBR())
-      {
+      if (!staticData.UseMBR()) {
         bestHypo = manager.GetBestHypothesis();
         if (bestHypo) {
           if (staticData.IsPathRecoveryEnabled()) {
@@ -165,11 +164,10 @@ public:
           }
         }
         out << endl;
       }

       // MBR decoding (n-best MBR, lattice MBR, consensus)
-      else
-      {
+      else {
         // we first need the n-best translations
         size_t nBestSize = staticData.GetMBRSize();
         if (nBestSize <= 0) {
@@ -205,7 +203,7 @@ public:
         }

         // consensus decoding
         else if (staticData.UseConsensusDecoding()) {
           const TrellisPath &conBestHypo = doConsensusDecoding(manager,nBestList);
           OutputBestHypo(conBestHypo, m_lineNumber,
                          staticData.GetReportSegmentation(),
@@ -214,8 +212,8 @@ public:
           IFVERBOSE(2) {
             PrintUserTime("finished Consensus decoding");
           }
         }

         // n-best MBR decoding
         else {
           const Moses::TrellisPath &mbrBestHypo = doMBR(nBestList);
@@ -482,7 +480,7 @@ int main(int argc, char** argv)
                              alignmentInfoCollector.get() );

     // execute task
 #ifdef WITH_THREADS
     pool.Submit(task);
 #else
     task->Run();
 #endif

View File

@@ -57,7 +57,7 @@ void PrintTranslationAnalysis(const TranslationSystem* system, std::ostream &os,
       }
     }
   }

   bool epsilon = false;
   if (target == "") {
     target="<EPSILON>";

View File

@ -1,17 +1,17 @@
/*********************************************************************** /***********************************************************************
Moses - statistical machine translation system Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh Copyright (C) 2006-2011 University of Edinburgh
This library is free software; you can redistribute it and/or This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version. version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful, This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details. Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ -42,10 +42,11 @@ void AlignmentInfo::BuildNonTermIndexMap()
for (p = begin(); p != end(); ++p) { for (p = begin(); p != end(); ++p) {
m_nonTermIndexMap[p->second] = i++; m_nonTermIndexMap[p->second] = i++;
} }
} }
bool compare_target(const std::pair<size_t,size_t> *a, const std::pair<size_t,size_t> *b) { bool compare_target(const std::pair<size_t,size_t> *a, const std::pair<size_t,size_t> *b)
{
if(a->second < b->second) return true; if(a->second < b->second) return true;
if(a->second == b->second) return (a->first < b->first); if(a->second == b->second) return (a->first < b->first);
return false; return false;
@ -55,34 +56,32 @@ bool compare_target(const std::pair<size_t,size_t> *a, const std::pair<size_t,si
std::vector< const std::pair<size_t,size_t>* > AlignmentInfo::GetSortedAlignments() const std::vector< const std::pair<size_t,size_t>* > AlignmentInfo::GetSortedAlignments() const
{ {
std::vector< const std::pair<size_t,size_t>* > ret; std::vector< const std::pair<size_t,size_t>* > ret;
CollType::const_iterator iter; CollType::const_iterator iter;
for (iter = m_collection.begin(); iter != m_collection.end(); ++iter) for (iter = m_collection.begin(); iter != m_collection.end(); ++iter) {
{
const std::pair<size_t,size_t> &alignPair = *iter; const std::pair<size_t,size_t> &alignPair = *iter;
ret.push_back(&alignPair); ret.push_back(&alignPair);
} }
const StaticData &staticData = StaticData::Instance(); const StaticData &staticData = StaticData::Instance();
WordAlignmentSort wordAlignmentSort = staticData.GetWordAlignmentSort(); WordAlignmentSort wordAlignmentSort = staticData.GetWordAlignmentSort();
switch (wordAlignmentSort) {
case NoSort:
break;
case TargetOrder:
std::sort(ret.begin(), ret.end(), compare_target);
break;
default:
CHECK(false);
}
return ret; return ret;
} }
std::ostream& operator<<(std::ostream &out, const AlignmentInfo &alignmentInfo) std::ostream& operator<<(std::ostream &out, const AlignmentInfo &alignmentInfo)
{ {
AlignmentInfo::const_iterator iter; AlignmentInfo::const_iterator iter;

View File

@ -1,17 +1,17 @@
/*********************************************************************** /***********************************************************************
Moses - statistical machine translation system Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh Copyright (C) 2006-2011 University of Edinburgh
This library is free software; you can redistribute it and/or This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version. version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful, This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details. Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ -37,12 +37,16 @@ class AlignmentInfo
friend struct AlignmentInfoOrderer; friend struct AlignmentInfoOrderer;
friend class AlignmentInfoCollection; friend class AlignmentInfoCollection;
public: public:
typedef std::vector<size_t> NonTermIndexMap; typedef std::vector<size_t> NonTermIndexMap;
typedef CollType::const_iterator const_iterator; typedef CollType::const_iterator const_iterator;
const_iterator begin() const {
return m_collection.begin();
}
const_iterator end() const {
return m_collection.end();
}
// Provides a map from target-side to source-side non-terminal indices. // Provides a map from target-side to source-side non-terminal indices.
// The target-side index should be the rule symbol index (counting terminals). // The target-side index should be the rule symbol index (counting terminals).
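An assumed example (not from the commit) of what BuildNonTermIndexMap() produces: if the alignment pairs, as (source, target) positions, are {(1,3), (2,0)}, the loop shown earlier visits them in the set's (source, target) order and sets m_nonTermIndexMap[3] = 0 and m_nonTermIndexMap[0] = 1 — i.e. the target symbol at position 0 corresponds to the second source-side non-terminal.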
@ -52,12 +56,11 @@ class AlignmentInfo
} }
std::vector< const std::pair<size_t,size_t>* > GetSortedAlignments() const; std::vector< const std::pair<size_t,size_t>* > GetSortedAlignments() const;
private: private:
// AlignmentInfo objects should only be created by an AlignmentInfoCollection // AlignmentInfo objects should only be created by an AlignmentInfoCollection
explicit AlignmentInfo(const std::set<std::pair<size_t,size_t> > &pairs) explicit AlignmentInfo(const std::set<std::pair<size_t,size_t> > &pairs)
: m_collection(pairs) : m_collection(pairs) {
{
BuildNonTermIndexMap(); BuildNonTermIndexMap();
} }
@ -69,8 +72,7 @@ class AlignmentInfo
// Define an arbitrary strict weak ordering between AlignmentInfo objects // Define an arbitrary strict weak ordering between AlignmentInfo objects
// for use by AlignmentInfoCollection. // for use by AlignmentInfoCollection.
struct AlignmentInfoOrderer struct AlignmentInfoOrderer {
{
bool operator()(const AlignmentInfo &a, const AlignmentInfo &b) const { bool operator()(const AlignmentInfo &a, const AlignmentInfo &b) const {
return a.m_collection < b.m_collection; return a.m_collection < b.m_collection;
} }

View File

@ -1,17 +1,17 @@
/*********************************************************************** /***********************************************************************
Moses - statistical machine translation system Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh Copyright (C) 2006-2011 University of Edinburgh
This library is free software; you can redistribute it and/or This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version. version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful, This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details. Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ -36,7 +36,7 @@ const AlignmentInfo &AlignmentInfoCollection::GetEmptyAlignmentInfo() const
} }
const AlignmentInfo *AlignmentInfoCollection::Add( const AlignmentInfo *AlignmentInfoCollection::Add(
const std::set<std::pair<size_t,size_t> > &pairs) const std::set<std::pair<size_t,size_t> > &pairs)
{ {
std::pair<AlignmentInfoSet::iterator, bool> ret = std::pair<AlignmentInfoSet::iterator, bool> ret =
m_collection.insert(AlignmentInfo(pairs)); m_collection.insert(AlignmentInfo(pairs));
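
Add() is the usual interning idiom: std::set<T>::insert returns an iterator to the element already stored when an equal one exists, so each distinct alignment set lives in the collection exactly once and callers share a stable pointer to it (set elements are never relocated). A minimal sketch of the same idiom on a toy type — the names are illustrative, not part of Moses:

#include <set>
#include <string>

std::set<std::string> g_pool;

// Repeated calls with equal strings return the same address,
// mirroring what AlignmentInfoCollection::Add does for AlignmentInfo.
const std::string *Intern(const std::string &s) {
  std::pair<std::set<std::string>::iterator, bool> r = g_pool.insert(s);
  return &*r.first;
}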

View File

@ -1,17 +1,17 @@
/*********************************************************************** /***********************************************************************
Moses - statistical machine translation system Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh Copyright (C) 2006-2011 University of Edinburgh
This library is free software; you can redistribute it and/or This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version. version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful, This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details. Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ -29,8 +29,10 @@ namespace Moses
// Singleton collection of all AlignmentInfo objects. // Singleton collection of all AlignmentInfo objects.
class AlignmentInfoCollection class AlignmentInfoCollection
{ {
public: public:
static AlignmentInfoCollection &Instance() { return s_instance; } static AlignmentInfoCollection &Instance() {
return s_instance;
}
// Returns a pointer to an AlignmentInfo object with the same source-target // Returns a pointer to an AlignmentInfo object with the same source-target
// alignment pairs as given in the argument. If the collection already // alignment pairs as given in the argument. If the collection already
@ -41,7 +43,7 @@ class AlignmentInfoCollection
// Returns a pointer to an empty AlignmentInfo object. // Returns a pointer to an empty AlignmentInfo object.
const AlignmentInfo &GetEmptyAlignmentInfo() const; const AlignmentInfo &GetEmptyAlignmentInfo() const;
private: private:
typedef std::set<AlignmentInfo, AlignmentInfoOrderer> AlignmentInfoSet; typedef std::set<AlignmentInfo, AlignmentInfoOrderer> AlignmentInfoSet;
// Only a single static variable should be created. // Only a single static variable should be created.

View File

@ -7,455 +7,454 @@
using namespace std; using namespace std;
namespace Moses { namespace Moses
{
BilingualDynSuffixArray::BilingualDynSuffixArray(): BilingualDynSuffixArray::BilingualDynSuffixArray():
m_maxPhraseLength(StaticData::Instance().GetMaxPhraseLength()), m_maxPhraseLength(StaticData::Instance().GetMaxPhraseLength()),
m_maxSampleSize(20) m_maxSampleSize(20)
{ {
m_srcSA = 0; m_srcSA = 0;
m_trgSA = 0; m_trgSA = 0;
m_srcCorpus = new std::vector<wordID_t>(); m_srcCorpus = new std::vector<wordID_t>();
m_trgCorpus = new std::vector<wordID_t>(); m_trgCorpus = new std::vector<wordID_t>();
m_srcVocab = new Vocab(false); m_srcVocab = new Vocab(false);
m_trgVocab = new Vocab(false); m_trgVocab = new Vocab(false);
m_scoreCmp = 0; m_scoreCmp = 0;
} }
BilingualDynSuffixArray::~BilingualDynSuffixArray() BilingualDynSuffixArray::~BilingualDynSuffixArray()
{ {
if(m_srcSA) delete m_srcSA; if(m_srcSA) delete m_srcSA;
if(m_trgSA) delete m_trgSA; if(m_trgSA) delete m_trgSA;
if(m_srcVocab) delete m_srcVocab; if(m_srcVocab) delete m_srcVocab;
if(m_trgVocab) delete m_trgVocab; if(m_trgVocab) delete m_trgVocab;
if(m_srcCorpus) delete m_srcCorpus; if(m_srcCorpus) delete m_srcCorpus;
if(m_trgCorpus) delete m_trgCorpus; if(m_trgCorpus) delete m_trgCorpus;
if(m_scoreCmp) delete m_scoreCmp; if(m_scoreCmp) delete m_scoreCmp;
} }
bool BilingualDynSuffixArray::Load( bool BilingualDynSuffixArray::Load(
const std::vector<FactorType>& inputFactors, const std::vector<FactorType>& inputFactors,
const std::vector<FactorType>& outputFactors, const std::vector<FactorType>& outputFactors,
std::string source, std::string target, std::string alignments, std::string source, std::string target, std::string alignments,
const std::vector<float> &weight) const std::vector<float> &weight)
{ {
m_inputFactors = inputFactors; m_inputFactors = inputFactors;
m_outputFactors = outputFactors; m_outputFactors = outputFactors;
m_scoreCmp = new ScoresComp(weight); m_scoreCmp = new ScoresComp(weight);
InputFileStream sourceStrme(source); InputFileStream sourceStrme(source);
InputFileStream targetStrme(target); InputFileStream targetStrme(target);
cerr << "Loading source corpus...\n"; cerr << "Loading source corpus...\n";
LoadCorpus(sourceStrme, m_inputFactors, *m_srcCorpus, m_srcSntBreaks, m_srcVocab); LoadCorpus(sourceStrme, m_inputFactors, *m_srcCorpus, m_srcSntBreaks, m_srcVocab);
cerr << "Loading target corpus...\n"; cerr << "Loading target corpus...\n";
LoadCorpus(targetStrme, m_outputFactors,*m_trgCorpus, m_trgSntBreaks, m_trgVocab); LoadCorpus(targetStrme, m_outputFactors,*m_trgCorpus, m_trgSntBreaks, m_trgVocab);
CHECK(m_srcSntBreaks.size() == m_trgSntBreaks.size()); CHECK(m_srcSntBreaks.size() == m_trgSntBreaks.size());
// build suffix arrays and auxiliary arrays // build suffix arrays and auxiliary arrays
cerr << "Building Source Suffix Array...\n"; cerr << "Building Source Suffix Array...\n";
m_srcSA = new DynSuffixArray(m_srcCorpus); m_srcSA = new DynSuffixArray(m_srcCorpus);
if(!m_srcSA) return false; if(!m_srcSA) return false;
cerr << "Building Target Suffix Array...\n"; cerr << "Building Target Suffix Array...\n";
//m_trgSA = new DynSuffixArray(m_trgCorpus); //m_trgSA = new DynSuffixArray(m_trgCorpus);
//if(!m_trgSA) return false; //if(!m_trgSA) return false;
cerr << "\t(Skipped. Not used)\n"; cerr << "\t(Skipped. Not used)\n";
InputFileStream alignStrme(alignments); InputFileStream alignStrme(alignments);
cerr << "Loading Alignment File...\n"; cerr << "Loading Alignment File...\n";
LoadRawAlignments(alignStrme); LoadRawAlignments(alignStrme);
//LoadAlignments(alignStrme); //LoadAlignments(alignStrme);
cerr << "Building frequent word cache...\n"; cerr << "Building frequent word cache...\n";
CacheFreqWords(); CacheFreqWords();
return true; return true;
} }
int BilingualDynSuffixArray::LoadRawAlignments(InputFileStream& align) int BilingualDynSuffixArray::LoadRawAlignments(InputFileStream& align)
{ {
// stores the alignments in the raw file format // stores the alignments in the raw file format
std::string line; std::string line;
std::vector<int> vtmp; std::vector<int> vtmp;
while(getline(align, line)) { while(getline(align, line)) {
Utils::splitToInt(line, vtmp, "- "); Utils::splitToInt(line, vtmp, "- ");
CHECK(vtmp.size() % 2 == 0); CHECK(vtmp.size() % 2 == 0);
std::vector<short> vAlgn; // store as short ints for memory std::vector<short> vAlgn; // store as short ints for memory
for (std::vector<int>::const_iterator itr = vtmp.begin(); for (std::vector<int>::const_iterator itr = vtmp.begin();
itr != vtmp.end(); ++itr) { itr != vtmp.end(); ++itr) {
vAlgn.push_back(short(*itr)); vAlgn.push_back(short(*itr));
} }
m_rawAlignments.push_back(vAlgn); m_rawAlignments.push_back(vAlgn);
} }
return m_rawAlignments.size(); return m_rawAlignments.size();
} }
int BilingualDynSuffixArray::LoadRawAlignments(string& align) { int BilingualDynSuffixArray::LoadRawAlignments(string& align)
{
// stores the alignments in the raw file format // stores the alignments in the raw file format
vector<int> vtmp; vector<int> vtmp;
Utils::splitToInt(align, vtmp, "- "); Utils::splitToInt(align, vtmp, "- ");
CHECK(vtmp.size() % 2 == 0); CHECK(vtmp.size() % 2 == 0);
vector<short> vAlgn; // store as short ints for memory vector<short> vAlgn; // store as short ints for memory
for (std::vector<int>::const_iterator itr = vtmp.begin(); for (std::vector<int>::const_iterator itr = vtmp.begin();
itr != vtmp.end(); ++itr) { itr != vtmp.end(); ++itr) {
vAlgn.push_back(short(*itr)); vAlgn.push_back(short(*itr));
} }
m_rawAlignments.push_back(vAlgn); m_rawAlignments.push_back(vAlgn);
return m_rawAlignments.size(); return m_rawAlignments.size();
} }
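
The raw format consumed by both overloads is the standard Moses word-alignment notation: one sentence pair per line, whitespace-separated srcIdx-trgIdx tokens such as "0-0 1-2 2-1". A self-contained sketch of the parse — a hypothetical helper, equivalent in spirit to Utils::splitToInt plus the pairing loop above:

#include <cstdlib>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

// "0-0 1-2 2-1" -> {(0,0), (1,2), (2,1)} as (source, target) word positions.
std::vector<std::pair<short, short> > ParseAlignmentLine(const std::string &line) {
  std::vector<std::pair<short, short> > out;
  std::istringstream in(line);
  std::string tok;
  while (in >> tok) {
    std::string::size_type dash = tok.find('-');
    if (dash == std::string::npos) continue; // skip malformed tokens
    short s = (short)std::atoi(tok.substr(0, dash).c_str());
    short t = (short)std::atoi(tok.substr(dash + 1).c_str());
    out.push_back(std::make_pair(s, t));
  }
  return out;
}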
int BilingualDynSuffixArray::LoadAlignments(InputFileStream& align) int BilingualDynSuffixArray::LoadAlignments(InputFileStream& align)
{ {
std::string line; std::string line;
std::vector<int> vtmp; std::vector<int> vtmp;
int sntIndex(0); int sntIndex(0);
while(getline(align, line)) {
Utils::splitToInt(line, vtmp, "- ");
CHECK(vtmp.size() % 2 == 0);
int sourceSize = GetSourceSentenceSize(sntIndex);
int targetSize = GetTargetSentenceSize(sntIndex);
SentenceAlignment curSnt(sntIndex, sourceSize, targetSize); // initialize empty sentence
for(int i=0; i < (int)vtmp.size(); i+=2) {
int sourcePos = vtmp[i];
int targetPos = vtmp[i+1];
CHECK(sourcePos < sourceSize);
CHECK(targetPos < targetSize);
curSnt.alignedList[sourcePos].push_back(targetPos); // list of target nodes for each source word
curSnt.numberAligned[targetPos]++; // cnt of how many source words connect to this target word
}
curSnt.srcSnt = m_srcCorpus + sntIndex; // point source and target sentence
curSnt.trgSnt = m_trgCorpus + sntIndex;
m_alignments.push_back(curSnt);
sntIndex++;
}
return m_alignments.size();
}
SentenceAlignment BilingualDynSuffixArray::GetSentenceAlignment(const int sntIndex, bool trg2Src) const SentenceAlignment BilingualDynSuffixArray::GetSentenceAlignment(const int sntIndex, bool trg2Src) const
{ {
// retrieves the alignments in the format used by SentenceAlignment.Extract() // retrieves the alignments in the format used by SentenceAlignment.Extract()
int sntGiven = trg2Src ? GetTargetSentenceSize(sntIndex) : GetSourceSentenceSize(sntIndex); int sntGiven = trg2Src ? GetTargetSentenceSize(sntIndex) : GetSourceSentenceSize(sntIndex);
int sntExtract = trg2Src ? GetSourceSentenceSize(sntIndex) : GetTargetSentenceSize(sntIndex); int sntExtract = trg2Src ? GetSourceSentenceSize(sntIndex) : GetTargetSentenceSize(sntIndex);
std::vector<short> alignment = m_rawAlignments.at(sntIndex); std::vector<short> alignment = m_rawAlignments.at(sntIndex);
SentenceAlignment curSnt(sntIndex, sntGiven, sntExtract); // initialize empty sentence SentenceAlignment curSnt(sntIndex, sntGiven, sntExtract); // initialize empty sentence
for(size_t i=0; i < alignment.size(); i+=2) { for(size_t i=0; i < alignment.size(); i+=2) {
int sourcePos = alignment[i]; int sourcePos = alignment[i];
int targetPos = alignment[i+1]; int targetPos = alignment[i+1];
if(trg2Src) { if(trg2Src) {
curSnt.alignedList[targetPos].push_back(sourcePos); // list of target nodes for each source word curSnt.alignedList[targetPos].push_back(sourcePos); // list of target nodes for each source word
curSnt.numberAligned[sourcePos]++; // cnt of how many source words connect to this target word curSnt.numberAligned[sourcePos]++; // cnt of how many source words connect to this target word
} else {
curSnt.alignedList[sourcePos].push_back(targetPos); // list of target nodes for each source word
curSnt.numberAligned[targetPos]++; // cnt of how many source words connect to this target word
}
}
curSnt.srcSnt = m_srcCorpus + sntIndex; // point source and target sentence
curSnt.trgSnt = m_trgCorpus + sntIndex;
return curSnt;
}
bool BilingualDynSuffixArray::ExtractPhrases(const int& sntIndex, const int& wordIndex, bool BilingualDynSuffixArray::ExtractPhrases(const int& sntIndex, const int& wordIndex,
const int& sourceSize, std::vector<PhrasePair*>& phrasePairs, bool trg2Src) const const int& sourceSize, std::vector<PhrasePair*>& phrasePairs, bool trg2Src) const
{ {
/* ExtractPhrases() can extract the matching phrases for both directions by using the trg2Src /* ExtractPhrases() can extract the matching phrases for both directions by using the trg2Src
* parameter */ * parameter */
SentenceAlignment curSnt = GetSentenceAlignment(sntIndex, trg2Src); SentenceAlignment curSnt = GetSentenceAlignment(sntIndex, trg2Src);
// get span of phrase in source sentence // get span of phrase in source sentence
int beginSentence = m_srcSntBreaks[sntIndex]; int beginSentence = m_srcSntBreaks[sntIndex];
int rightIdx = wordIndex - beginSentence int rightIdx = wordIndex - beginSentence
,leftIdx = rightIdx - sourceSize + 1; ,leftIdx = rightIdx - sourceSize + 1;
return curSnt.Extract(m_maxPhraseLength, phrasePairs, leftIdx, rightIdx); // extract all phrase Alignments in sentence return curSnt.Extract(m_maxPhraseLength, phrasePairs, leftIdx, rightIdx); // extract all phrase Alignments in sentence
} }
void BilingualDynSuffixArray::CleanUp() void BilingualDynSuffixArray::CleanUp()
{ {
//m_wordPairCache.clear(); //m_wordPairCache.clear();
} }
int BilingualDynSuffixArray::LoadCorpus(InputFileStream& corpus, const FactorList& factors, int BilingualDynSuffixArray::LoadCorpus(InputFileStream& corpus, const FactorList& factors,
std::vector<wordID_t>& cArray, std::vector<wordID_t>& sntArray, std::vector<wordID_t>& cArray, std::vector<wordID_t>& sntArray,
Vocab* vocab) Vocab* vocab)
{ {
std::string line, word; std::string line, word;
int sntIdx(0); int sntIdx(0);
// corpus.seekg(0); Seems needless -> commented out to allow loading of gzipped corpora (gzfilebuf doesn't support seeking). // corpus.seekg(0); Seems needless -> commented out to allow loading of gzipped corpora (gzfilebuf doesn't support seeking).
const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter(); const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
while(getline(corpus, line)) { while(getline(corpus, line)) {
sntArray.push_back(sntIdx); sntArray.push_back(sntIdx);
Phrase phrase(ARRAY_SIZE_INCR); Phrase phrase(ARRAY_SIZE_INCR);
// parse phrase // parse phrase
phrase.CreateFromString( factors, line, factorDelimiter); phrase.CreateFromString( factors, line, factorDelimiter);
// store words in vocabulary and corpus // store words in vocabulary and corpus
for( size_t i = 0; i < phrase.GetSize(); ++i) { for( size_t i = 0; i < phrase.GetSize(); ++i) {
cArray.push_back( vocab->GetWordID(phrase.GetWord(i)) ); cArray.push_back( vocab->GetWordID(phrase.GetWord(i)) );
} }
sntIdx += phrase.GetSize(); sntIdx += phrase.GetSize();
} }
//cArray.push_back(vocab->GetkOOVWordID); // signify end of corpus //cArray.push_back(vocab->GetkOOVWordID); // signify end of corpus
vocab->MakeClosed(); // avoid adding words vocab->MakeClosed(); // avoid adding words
return cArray.size(); return cArray.size();
} }
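
The layout LoadCorpus() builds is worth spelling out, because the m_srcSntBreaks arithmetic used throughout this file depends on it: the corpus is one flat vector of vocabulary ids, and the sentence array records only where each sentence starts. A toy illustration (the ids are invented for the example):

// cArray   = [12, 7, 31,  5, 12, 9, 44]  // "the cat sat" + "a cat ran fast"
// sntArray = [0,           3]            // sentence k occupies [sntArray[k], sntArray[k+1])
// Word i of sentence k is cArray[sntArray[k] + i]; the last sentence ends at cArray.size().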
bool BilingualDynSuffixArray::GetLocalVocabIDs(const Phrase& src, SAPhrase &output) const bool BilingualDynSuffixArray::GetLocalVocabIDs(const Phrase& src, SAPhrase &output) const
{ {
// looks up the SA vocab ids for the current src phrase // looks up the SA vocab ids for the current src phrase
size_t phraseSize = src.GetSize(); size_t phraseSize = src.GetSize();
for (size_t pos = 0; pos < phraseSize; ++pos) { for (size_t pos = 0; pos < phraseSize; ++pos) {
const Word &word = src.GetWord(pos); const Word &word = src.GetWord(pos);
wordID_t arrayId = m_srcVocab->GetWordID(word); wordID_t arrayId = m_srcVocab->GetWordID(word);
if (arrayId == m_srcVocab->GetkOOVWordID()) {
// oov
return false;
} else {
output.SetId(pos, arrayId);
//cerr << arrayId << " ";
}
}
return true;
}
pair<float, float> BilingualDynSuffixArray::GetLexicalWeight(const PhrasePair& phrasepair) const pair<float, float> BilingualDynSuffixArray::GetLexicalWeight(const PhrasePair& phrasepair) const
{ {
//return pair<float, float>(1, 1); //return pair<float, float>(1, 1);
float srcLexWeight(1.0), trgLexWeight(1.0); float srcLexWeight(1.0), trgLexWeight(1.0);
std::map<pair<wordID_t, wordID_t>, float> targetProbs; // collect sum of target probs given source words std::map<pair<wordID_t, wordID_t>, float> targetProbs; // collect sum of target probs given source words
//const SentenceAlignment& alignment = m_alignments[phrasepair.m_sntIndex]; //const SentenceAlignment& alignment = m_alignments[phrasepair.m_sntIndex];
const SentenceAlignment& alignment = GetSentenceAlignment(phrasepair.m_sntIndex); const SentenceAlignment& alignment = GetSentenceAlignment(phrasepair.m_sntIndex);
std::map<pair<wordID_t, wordID_t>, pair<float, float> >::const_iterator itrCache; std::map<pair<wordID_t, wordID_t>, pair<float, float> >::const_iterator itrCache;
// for each source word // for each source word
for(int srcIdx = phrasepair.m_startSource; srcIdx <= phrasepair.m_endSource; ++srcIdx) { for(int srcIdx = phrasepair.m_startSource; srcIdx <= phrasepair.m_endSource; ++srcIdx) {
float srcSumPairProbs(0); float srcSumPairProbs(0);
wordID_t srcWord = m_srcCorpus->at(srcIdx + m_srcSntBreaks[phrasepair.m_sntIndex]); // localIDs wordID_t srcWord = m_srcCorpus->at(srcIdx + m_srcSntBreaks[phrasepair.m_sntIndex]); // localIDs
const std::vector<int>& srcWordAlignments = alignment.alignedList.at(srcIdx); const std::vector<int>& srcWordAlignments = alignment.alignedList.at(srcIdx);
// for each target word aligned to this source word in this alignment // for each target word aligned to this source word in this alignment
if(srcWordAlignments.size() == 0) { // get p(NULL|src) if(srcWordAlignments.size() == 0) { // get p(NULL|src)
pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, m_srcVocab->GetkOOVWordID()); pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, m_srcVocab->GetkOOVWordID());
itrCache = m_wordPairCache.find(wordpair); itrCache = m_wordPairCache.find(wordpair);
if(itrCache == m_wordPairCache.end()) { // if not in cache if(itrCache == m_wordPairCache.end()) { // if not in cache
CacheWordProbs(srcWord); CacheWordProbs(srcWord);
itrCache = m_wordPairCache.find(wordpair); // search cache again itrCache = m_wordPairCache.find(wordpair); // search cache again
} }
CHECK(itrCache != m_wordPairCache.end()); CHECK(itrCache != m_wordPairCache.end());
srcSumPairProbs += itrCache->second.first; srcSumPairProbs += itrCache->second.first;
targetProbs[wordpair] = itrCache->second.second; targetProbs[wordpair] = itrCache->second.second;
} else { // extract p(trg|src)
for(size_t i = 0; i < srcWordAlignments.size(); ++i) { // for each aligned word
int trgIdx = srcWordAlignments[i];
wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]);
// get probability of this source->target word pair
pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, trgWord);
itrCache = m_wordPairCache.find(wordpair);
if(itrCache == m_wordPairCache.end()) { // if not in cache
CacheWordProbs(srcWord);
itrCache = m_wordPairCache.find(wordpair); // search cache again
}
CHECK(itrCache != m_wordPairCache.end());
srcSumPairProbs += itrCache->second.first;
targetProbs[wordpair] = itrCache->second.second;
}
}
float srcNormalizer = srcWordAlignments.size() < 2 ? 1.0 : 1.0 / float(srcWordAlignments.size());
srcLexWeight *= (srcNormalizer * srcSumPairProbs);
} // end for each source word
for(int trgIdx = phrasepair.m_startTarget; trgIdx <= phrasepair.m_endTarget; ++trgIdx) {
float trgSumPairProbs(0);
wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]);
for (std::map<pair<wordID_t, wordID_t>, float>::const_iterator trgItr = targetProbs.begin();
trgItr != targetProbs.end(); ++trgItr) {
if(trgItr->first.second == trgWord)
trgSumPairProbs += trgItr->second;
}
if(trgSumPairProbs == 0) continue; // currently don't store target-side SA
int noAligned = alignment.numberAligned.at(trgIdx);
float trgNormalizer = noAligned < 2 ? 1.0 : 1.0 / float(noAligned);
trgLexWeight *= (trgNormalizer * trgSumPairProbs);
}
// TODO::Need to get p(NULL|trg)
return pair<float, float>(srcLexWeight, trgLexWeight);
}
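
What GetLexicalWeight() computes is the standard lexical weighting of phrase-based SMT, applied in both directions; in the usual notation, with a the word alignment inside the phrase pair, w the word-translation probability, and unaligned words scored against NULL:

  lex(\bar{e} \mid \bar{f}, a) = \prod_{i=1}^{|\bar{e}|} \frac{1}{|\{ j : (i,j) \in a \}|} \sum_{(i,j) \in a} w(e_i \mid f_j)

The srcNormalizer / trgNormalizer factors above are the 1/|{ j : (i,j) in a }| term, and m_wordPairCache supplies w in each direction.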
void BilingualDynSuffixArray::CacheFreqWords() const { void BilingualDynSuffixArray::CacheFreqWords() const
{
std::multimap<int, wordID_t> wordCnts; std::multimap<int, wordID_t> wordCnts;
// for each source word in vocab // for each source word in vocab
Vocab::Word2Id::const_iterator it; Vocab::Word2Id::const_iterator it;
for(it = m_srcVocab->VocabStart(); it != m_srcVocab->VocabEnd(); ++it) { for(it = m_srcVocab->VocabStart(); it != m_srcVocab->VocabEnd(); ++it) {
// get its frequency // get its frequency
wordID_t srcWord = it->second; wordID_t srcWord = it->second;
std::vector<wordID_t> sword(1, srcWord), wrdIndices; std::vector<wordID_t> sword(1, srcWord), wrdIndices;
m_srcSA->GetCorpusIndex(&sword, &wrdIndices); m_srcSA->GetCorpusIndex(&sword, &wrdIndices);
if(wrdIndices.size() >= 1000) { // min count if(wrdIndices.size() >= 1000) { // min count
wordCnts.insert(make_pair(wrdIndices.size(), srcWord)); wordCnts.insert(make_pair(wrdIndices.size(), srcWord));
} }
} }
int numSoFar(0); int numSoFar(0);
std::multimap<int, wordID_t>::reverse_iterator ritr; std::multimap<int, wordID_t>::reverse_iterator ritr;
for(ritr = wordCnts.rbegin(); ritr != wordCnts.rend(); ++ritr) { for(ritr = wordCnts.rbegin(); ritr != wordCnts.rend(); ++ritr) {
m_freqWordsCached.insert(ritr->second); m_freqWordsCached.insert(ritr->second);
CacheWordProbs(ritr->second); CacheWordProbs(ritr->second);
if(++numSoFar == 50) break; // get top counts if(++numSoFar == 50) break; // get top counts
} }
cerr << "\tCached " << m_freqWordsCached.size() << " source words\n"; cerr << "\tCached " << m_freqWordsCached.size() << " source words\n";
} }
void BilingualDynSuffixArray::CacheWordProbs(wordID_t srcWord) const void BilingualDynSuffixArray::CacheWordProbs(wordID_t srcWord) const
{ {
std::map<wordID_t, int> counts; std::map<wordID_t, int> counts;
std::vector<wordID_t> sword(1, srcWord), wrdIndices; std::vector<wordID_t> sword(1, srcWord), wrdIndices;
bool ret = m_srcSA->GetCorpusIndex(&sword, &wrdIndices); bool ret = m_srcSA->GetCorpusIndex(&sword, &wrdIndices);
CHECK(ret); CHECK(ret);
std::vector<int> sntIndexes = GetSntIndexes(wrdIndices, 1, m_srcSntBreaks); std::vector<int> sntIndexes = GetSntIndexes(wrdIndices, 1, m_srcSntBreaks);
float denom(0); float denom(0);
// for each occurrence of this word // for each occurrence of this word
for(size_t snt = 0; snt < sntIndexes.size(); ++snt) { for(size_t snt = 0; snt < sntIndexes.size(); ++snt) {
int sntIdx = sntIndexes.at(snt); // get corpus index for sentence int sntIdx = sntIndexes.at(snt); // get corpus index for sentence
CHECK(sntIdx != -1); CHECK(sntIdx != -1);
int srcWrdSntIdx = wrdIndices.at(snt) - m_srcSntBreaks.at(sntIdx); // get word index in sentence int srcWrdSntIdx = wrdIndices.at(snt) - m_srcSntBreaks.at(sntIdx); // get word index in sentence
const std::vector<int> srcAlg = GetSentenceAlignment(sntIdx).alignedList.at(srcWrdSntIdx); // list of target words for this source word const std::vector<int> srcAlg = GetSentenceAlignment(sntIdx).alignedList.at(srcWrdSntIdx); // list of target words for this source word
if(srcAlg.size() == 0) { if(srcAlg.size() == 0) {
++counts[m_srcVocab->GetkOOVWordID()]; // if not aligned then align to NULL word ++counts[m_srcVocab->GetkOOVWordID()]; // if not aligned then align to NULL word
++denom; ++denom;
} else { //get target words aligned to srcword in this sentence
for(size_t i=0; i < srcAlg.size(); ++i) {
wordID_t trgWord = m_trgCorpus->at(srcAlg[i] + m_trgSntBreaks[sntIdx]);
++counts[trgWord];
++denom;
}
}
}
// now we've gotten counts of all target words aligned to this source word
// get probs and cache all pairs
for(std::map<wordID_t, int>::const_iterator itrCnt = counts.begin();
itrCnt != counts.end(); ++itrCnt) {
pair<wordID_t, wordID_t> wordPair = make_pair(srcWord, itrCnt->first);
float srcTrgPrb = float(itrCnt->second) / float(denom); // gives p(src->trg)
float trgSrcPrb = float(itrCnt->second) / float(counts.size()); // gives p(trg->src)
m_wordPairCache[wordPair] = pair<float, float>(srcTrgPrb, trgSrcPrb);
}
}
SAPhrase BilingualDynSuffixArray::TrgPhraseFromSntIdx(const PhrasePair& phrasepair) const SAPhrase BilingualDynSuffixArray::TrgPhraseFromSntIdx(const PhrasePair& phrasepair) const
{ {
// takes sentence indexes and looks up vocab IDs // takes sentence indexes and looks up vocab IDs
SAPhrase phraseIds(phrasepair.GetTargetSize()); SAPhrase phraseIds(phrasepair.GetTargetSize());
int sntIndex = phrasepair.m_sntIndex; int sntIndex = phrasepair.m_sntIndex;
int id(-1), pos(0); int id(-1), pos(0);
for(int i=phrasepair.m_startTarget; i <= phrasepair.m_endTarget; ++i) { // look up trg words for(int i=phrasepair.m_startTarget; i <= phrasepair.m_endTarget; ++i) { // look up trg words
id = m_trgCorpus->at(m_trgSntBreaks[sntIndex] + i); id = m_trgCorpus->at(m_trgSntBreaks[sntIndex] + i);
phraseIds.SetId(pos++, id); phraseIds.SetId(pos++, id);
} }
return phraseIds; return phraseIds;
}
TargetPhrase* BilingualDynSuffixArray::GetMosesFactorIDs(const SAPhrase& phrase) const
{
TargetPhrase* targetPhrase = new TargetPhrase(Output);
for(size_t i=0; i < phrase.words.size(); ++i) { // look up trg words
Word& word = m_trgVocab->GetWord( phrase.words[i]);
CHECK(word != m_trgVocab->GetkOOVWord());
targetPhrase->AddWord(word);
}
// scoring
return targetPhrase;
}
void BilingualDynSuffixArray::GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> > & target) const
{
//cerr << "phrase is \"" << src << endl; //cerr << "phrase is \"" << src << endl;
size_t sourceSize = src.GetSize(); size_t sourceSize = src.GetSize();
SAPhrase localIDs(sourceSize); SAPhrase localIDs(sourceSize);
if(!GetLocalVocabIDs(src, localIDs)) return; if(!GetLocalVocabIDs(src, localIDs)) return;
float totalTrgPhrases(0); float totalTrgPhrases(0);
std::map<SAPhrase, int> phraseCounts; std::map<SAPhrase, int> phraseCounts;
//std::map<SAPhrase, PhrasePair> phraseColl; // (one of) the word indexes this phrase was taken from //std::map<SAPhrase, PhrasePair> phraseColl; // (one of) the word indexes this phrase was taken from
std::map<SAPhrase, pair<float, float> > lexicalWeights; std::map<SAPhrase, pair<float, float> > lexicalWeights;
std::map<SAPhrase, pair<float, float> >::iterator itrLexW; std::map<SAPhrase, pair<float, float> >::iterator itrLexW;
std::vector<unsigned> wrdIndices; std::vector<unsigned> wrdIndices;
// extract sentence IDs from SA and return rightmost index of phrases // extract sentence IDs from SA and return rightmost index of phrases
if(!m_srcSA->GetCorpusIndex(&(localIDs.words), &wrdIndices)) return; if(!m_srcSA->GetCorpusIndex(&(localIDs.words), &wrdIndices)) return;
SampleSelection(wrdIndices); SampleSelection(wrdIndices);
std::vector<int> sntIndexes = GetSntIndexes(wrdIndices, sourceSize, m_srcSntBreaks); std::vector<int> sntIndexes = GetSntIndexes(wrdIndices, sourceSize, m_srcSntBreaks);
// for each sentence with this phrase // for each sentence with this phrase
for(size_t snt = 0; snt < sntIndexes.size(); ++snt) { for(size_t snt = 0; snt < sntIndexes.size(); ++snt) {
std::vector<PhrasePair*> phrasePairs; // to store all phrases possible from current sentence std::vector<PhrasePair*> phrasePairs; // to store all phrases possible from current sentence
int sntIndex = sntIndexes.at(snt); // get corpus index for sentence int sntIndex = sntIndexes.at(snt); // get corpus index for sentence
if(sntIndex == -1) continue; // bad flag set by GetSntIndexes() if(sntIndex == -1) continue; // bad flag set by GetSntIndexes()
ExtractPhrases(sntIndex, wrdIndices[snt], sourceSize, phrasePairs); ExtractPhrases(sntIndex, wrdIndices[snt], sourceSize, phrasePairs);
//cerr << "extracted " << phrasePairs.size() << endl; //cerr << "extracted " << phrasePairs.size() << endl;
totalTrgPhrases += phrasePairs.size(); // keep track of count of each extracted phrase pair totalTrgPhrases += phrasePairs.size(); // keep track of count of each extracted phrase pair
std::vector<PhrasePair*>::iterator iterPhrasePair; std::vector<PhrasePair*>::iterator iterPhrasePair;
for (iterPhrasePair = phrasePairs.begin(); iterPhrasePair != phrasePairs.end(); ++iterPhrasePair) { for (iterPhrasePair = phrasePairs.begin(); iterPhrasePair != phrasePairs.end(); ++iterPhrasePair) {
SAPhrase phrase = TrgPhraseFromSntIdx(**iterPhrasePair); SAPhrase phrase = TrgPhraseFromSntIdx(**iterPhrasePair);
phraseCounts[phrase]++; // count each unique phrase phraseCounts[phrase]++; // count each unique phrase
// NOTE::Correct but slow to extract lexical weight here. could do // NOTE::Correct but slow to extract lexical weight here. could do
// it later for only the top phrases chosen by phrase prob p(e|f) // it later for only the top phrases chosen by phrase prob p(e|f)
pair<float, float> lexWeight = GetLexicalWeight(**iterPhrasePair); // get lexical weighting for this phrase pair pair<float, float> lexWeight = GetLexicalWeight(**iterPhrasePair); // get lexical weighting for this phrase pair
itrLexW = lexicalWeights.find(phrase); // check if phrase already has lexical weight attached itrLexW = lexicalWeights.find(phrase); // check if phrase already has lexical weight attached
if(itrLexW == lexicalWeights.end()) if(itrLexW == lexicalWeights.end())
lexicalWeights[phrase] = lexWeight; // first weight seen for this phrase lexicalWeights[phrase] = lexWeight; // first weight seen for this phrase
else if(itrLexW->second.first < lexWeight.first) else if(itrLexW->second.first < lexWeight.first)
itrLexW->second = lexWeight; // keep the greater lexical weight itrLexW->second = lexWeight; // keep the greater lexical weight
} }
// done with sentence. delete SA phrase pairs // done with sentence. delete SA phrase pairs
RemoveAllInColl(phrasePairs); RemoveAllInColl(phrasePairs);
} // done with all sentences } // done with all sentences
// convert to moses phrase pairs // convert to moses phrase pairs
std::map<SAPhrase, int>::const_iterator iterPhrases; std::map<SAPhrase, int>::const_iterator iterPhrases;
std::multimap<Scores, const SAPhrase*, ScoresComp> phraseScores (*m_scoreCmp); std::multimap<Scores, const SAPhrase*, ScoresComp> phraseScores (*m_scoreCmp);
// get scores of all phrases // get scores of all phrases
for(iterPhrases = phraseCounts.begin(); iterPhrases != phraseCounts.end(); ++iterPhrases) { for(iterPhrases = phraseCounts.begin(); iterPhrases != phraseCounts.end(); ++iterPhrases) {
float trg2SrcMLE = float(iterPhrases->second) / totalTrgPhrases; float trg2SrcMLE = float(iterPhrases->second) / totalTrgPhrases;
itrLexW = lexicalWeights.find(iterPhrases->first); itrLexW = lexicalWeights.find(iterPhrases->first);
CHECK(itrLexW != lexicalWeights.end()); CHECK(itrLexW != lexicalWeights.end());
Scores scoreVector(3); Scores scoreVector(3);
scoreVector[0] = trg2SrcMLE; scoreVector[0] = trg2SrcMLE;
scoreVector[1] = itrLexW->second.first; scoreVector[1] = itrLexW->second.first;
scoreVector[2] = 2.718; // exp(1); scoreVector[2] = 2.718; // exp(1);
phraseScores.insert(make_pair(scoreVector, &iterPhrases->first)); phraseScores.insert(make_pair(scoreVector, &iterPhrases->first));
} }
// return top scoring phrases // return top scoring phrases
std::multimap<Scores, const SAPhrase*, ScoresComp>::reverse_iterator ritr; std::multimap<Scores, const SAPhrase*, ScoresComp>::reverse_iterator ritr;
for(ritr = phraseScores.rbegin(); ritr != phraseScores.rend(); ++ritr) { for(ritr = phraseScores.rbegin(); ritr != phraseScores.rend(); ++ritr) {
Scores scoreVector = ritr->first; Scores scoreVector = ritr->first;
TargetPhrase *targetPhrase = GetMosesFactorIDs(*ritr->second); TargetPhrase *targetPhrase = GetMosesFactorIDs(*ritr->second);
target.push_back(make_pair( scoreVector, targetPhrase)); target.push_back(make_pair( scoreVector, targetPhrase));
if(target.size() == m_maxSampleSize) break; if(target.size() == m_maxSampleSize) break;
} }
} }
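
The three features assembled above are a relative-frequency translation probability, the best lexical weight observed for the phrase, and a constant phrase penalty. The first is simply

  \hat{p}(\bar{e} \mid \bar{f}) = \frac{count(\bar{f}, \bar{e})}{\sum_{\bar{e}'} count(\bar{f}, \bar{e}')}

estimated over the sampled extractions (phraseCounts / totalTrgPhrases), and storing e ≈ 2.718 for the third feature makes it contribute exactly 1 in log space — the conventional phrase-penalty encoding.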
std::vector<int> BilingualDynSuffixArray::GetSntIndexes(std::vector<unsigned>& wrdIndices, std::vector<int> BilingualDynSuffixArray::GetSntIndexes(std::vector<unsigned>& wrdIndices,
const int sourceSize, const std::vector<unsigned>& sntBreaks) const const int sourceSize, const std::vector<unsigned>& sntBreaks) const
{ {
std::vector<unsigned>::const_iterator vit; std::vector<unsigned>::const_iterator vit;
std::vector<int> sntIndexes; std::vector<int> sntIndexes;
for(size_t i=0; i < wrdIndices.size(); ++i) { for(size_t i=0; i < wrdIndices.size(); ++i) {
vit = std::upper_bound(sntBreaks.begin(), sntBreaks.end(), wrdIndices[i]); vit = std::upper_bound(sntBreaks.begin(), sntBreaks.end(), wrdIndices[i]);
int index = int(vit - sntBreaks.begin()) - 1; int index = int(vit - sntBreaks.begin()) - 1;
// check for phrases that cross sentence boundaries // check for phrases that cross sentence boundaries
if(wrdIndices[i] - sourceSize + 1 < sntBreaks.at(index)) if(wrdIndices[i] - sourceSize + 1 < sntBreaks.at(index))
sntIndexes.push_back(-1); // set bad flag sntIndexes.push_back(-1); // set bad flag
else else
sntIndexes.push_back(index); // store the index of the sentence in the corpus sntIndexes.push_back(index); // store the index of the sentence in the corpus
} }
return sntIndexes; return sntIndexes;
} }
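
GetSntIndexes() works because sntBreaks is an ascending list of sentence start offsets, so locating a word's sentence is a single binary search. A standalone sketch of the same lookup (the helper name is hypothetical):

#include <algorithm>
#include <vector>

// sntBreaks[k] = corpus offset where sentence k starts, sorted ascending.
// Returns the index of the sentence containing corpus position wordIdx.
int SentenceOf(unsigned wordIdx, const std::vector<unsigned> &sntBreaks) {
  std::vector<unsigned>::const_iterator it =
    std::upper_bound(sntBreaks.begin(), sntBreaks.end(), wordIdx);
  return int(it - sntBreaks.begin()) - 1; // first break > wordIdx, then step back
}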
int BilingualDynSuffixArray::SampleSelection(std::vector<unsigned>& sample, int BilingualDynSuffixArray::SampleSelection(std::vector<unsigned>& sample,
int sampleSize) const int sampleSize) const
{ {
// only use top 'sampleSize' number of samples // only use top 'sampleSize' number of samples
if(sample.size() > sampleSize) if(sample.size() > sampleSize)
sample.erase(sample.begin()+sampleSize, sample.end()); sample.erase(sample.begin()+sampleSize, sample.end());
return sample.size(); return sample.size();
} }
void BilingualDynSuffixArray::addSntPair(string& source, string& target, string& alignment) { void BilingualDynSuffixArray::addSntPair(string& source, string& target, string& alignment)
{
vuint_t srcFactor, trgFactor; vuint_t srcFactor, trgFactor;
cerr << "source, target, alignment = " << source << ", " << target << ", " << alignment << endl; cerr << "source, target, alignment = " << source << ", " << target << ", " << alignment << endl;
const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter(); const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
const unsigned oldSrcCrpSize = m_srcCorpus->size(), oldTrgCrpSize = m_trgCorpus->size(); const unsigned oldSrcCrpSize = m_srcCorpus->size(), oldTrgCrpSize = m_trgCorpus->size();
cerr << "old source corpus size = " << oldSrcCrpSize << "\told target size = " << oldTrgCrpSize << endl; cerr << "old source corpus size = " << oldSrcCrpSize << "\told target size = " << oldTrgCrpSize << endl;
Phrase sphrase(ARRAY_SIZE_INCR); Phrase sphrase(ARRAY_SIZE_INCR);
@ -471,7 +470,7 @@ void BilingualDynSuffixArray::addSntPair(string& source, string& target, string&
cerr << "srcFactor[" << (srcFactor.size() - 1) << "] = " << srcFactor.back() << endl; cerr << "srcFactor[" << (srcFactor.size() - 1) << "] = " << srcFactor.back() << endl;
m_srcCorpus->push_back(srcFactor.back()); // add word to corpus m_srcCorpus->push_back(srcFactor.back()); // add word to corpus
} }
m_srcSntBreaks.push_back(oldSrcCrpSize); // former end of corpus is index of new sentence m_srcSntBreaks.push_back(oldSrcCrpSize); // former end of corpus is index of new sentence
m_srcVocab->MakeClosed(); m_srcVocab->MakeClosed();
Phrase tphrase(ARRAY_SIZE_INCR); Phrase tphrase(ARRAY_SIZE_INCR);
tphrase.CreateFromString(m_outputFactors, target, factorDelimiter); tphrase.CreateFromString(m_outputFactors, target, factorDelimiter);
@ -494,16 +493,17 @@ void BilingualDynSuffixArray::addSntPair(string& source, string& target, string&
LoadRawAlignments(alignment); LoadRawAlignments(alignment);
m_trgVocab->MakeClosed(); m_trgVocab->MakeClosed();
//for(size_t i=0; i < sphrase.GetSize(); ++i) //for(size_t i=0; i < sphrase.GetSize(); ++i)
//ClearWordInCache(sIDs[i]); //ClearWordInCache(sIDs[i]);
} }
void BilingualDynSuffixArray::ClearWordInCache(wordID_t srcWord) { void BilingualDynSuffixArray::ClearWordInCache(wordID_t srcWord)
{
if(m_freqWordsCached.find(srcWord) != m_freqWordsCached.end()) if(m_freqWordsCached.find(srcWord) != m_freqWordsCached.end())
return; return;
std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> >::iterator it, std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> >::iterator it,
first, last; first, last;
for(it = m_wordPairCache.begin(); it != m_wordPairCache.end(); ++it) { for(it = m_wordPairCache.begin(); it != m_wordPairCache.end(); ++it) {
if(it->first.first == srcWord) { // all source words grouped if(it->first.first == srcWord) { // all source words grouped
first = it; // copy first entry of srcWord first = it; // copy first entry of srcWord
last = it++; last = it++;
while(it != m_wordPairCache.end() && (it->first.first == srcWord)) { while(it != m_wordPairCache.end() && (it->first.first == srcWord)) {
@ -513,80 +513,77 @@ void BilingualDynSuffixArray::ClearWordInCache(wordID_t srcWord) {
m_wordPairCache.erase(first, last); m_wordPairCache.erase(first, last);
} }
} }
SentenceAlignment::SentenceAlignment(int sntIndex, int sourceSize, int targetSize) SentenceAlignment::SentenceAlignment(int sntIndex, int sourceSize, int targetSize)
:m_sntIndex(sntIndex) :m_sntIndex(sntIndex)
,numberAligned(targetSize, 0) ,numberAligned(targetSize, 0)
,alignedList(sourceSize) ,alignedList(sourceSize)
{ {
for(int i=0; i < sourceSize; ++i) { for(int i=0; i < sourceSize; ++i) {
std::vector<int> trgWrd; std::vector<int> trgWrd;
alignedList[i] = trgWrd; alignedList[i] = trgWrd;
} }
} }
bool SentenceAlignment::Extract(int maxPhraseLength, std::vector<PhrasePair*> &ret, int startSource, int endSource) const bool SentenceAlignment::Extract(int maxPhraseLength, std::vector<PhrasePair*> &ret, int startSource, int endSource) const
{ {
// foreign = target, F=T // foreign = target, F=T
// english = source, E=S // english = source, E=S
int countTarget = numberAligned.size(); int countTarget = numberAligned.size();
int minTarget = 9999; int minTarget = 9999;
int maxTarget = -1; int maxTarget = -1;
std::vector< int > usedTarget = numberAligned; std::vector< int > usedTarget = numberAligned;
for(int sourcePos = startSource; sourcePos <= endSource; sourcePos++) {
for(int ind=0; ind < (int)alignedList[sourcePos].size(); ind++) {
int targetPos = alignedList[sourcePos][ind];
// cout << "point (" << targetPos << ", " << sourcePos << ")\n";
if (targetPos<minTarget) {
minTarget = targetPos;
}
if (targetPos>maxTarget) {
maxTarget = targetPos;
}
usedTarget[ targetPos ]--;
} // for(int ind=0;ind<sentence
} // for(int sourcePos=startSource
// cout << "f projected ( " << minTarget << "-" << maxTarget << ", " << startSource << "," << endSource << ")\n";
if (maxTarget >= 0 && // aligned to any foreign words at all
maxTarget-minTarget < maxPhraseLength) {
// foreign phrase within limits
// check if foreign words are aligned to out of bound english words
bool out_of_bounds = false;
for(int targetPos=minTarget; targetPos <= maxTarget && !out_of_bounds; targetPos++) {
if (usedTarget[targetPos]>0) {
// cout << "out of bounds: " << targetPos << "\n";
out_of_bounds = true;
}
}
// cout << "doing if for ( " << minTarget << "-" << maxTarget << ", " << startSource << "," << endSource << ")\n";
if (!out_of_bounds) {
// start point of foreign phrase may retreat over unaligned
for(int startTarget = minTarget;
(startTarget >= 0 &&
startTarget > maxTarget-maxPhraseLength && // within length limit
(startTarget==minTarget || numberAligned[startTarget]==0)); // unaligned
startTarget--) {
// end point of foreign phrase may advance over unaligned
for (int endTarget=maxTarget;
(endTarget<countTarget &&
endTarget<startTarget+maxPhraseLength && // within length limit
(endTarget==maxTarget || numberAligned[endTarget]==0)); // unaligned
endTarget++) {
PhrasePair *phrasePair = new PhrasePair(startTarget,endTarget,startSource,endSource, m_sntIndex);
ret.push_back(phrasePair);
} // for (int endTarget=maxTarget;
} // for(int startTarget=minTarget;
} // if (!out_of_bounds)
} // if (maxTarget >= 0 &&
return (ret.size() > 0);
}
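
A worked trace of the consistency check (a toy example, not from the source): take alignment points (src,trg) = {(0,0), (1,2)}, a three-word target sentence, and the source span [0,1]. The projected target span is [0,2]; decrementing usedTarget leaves it all zeros, so no word in [0,2] aligns outside the source span and extraction proceeds. The unaligned target word at position 1 sits inside the span, so the retreat/advance loops emit just the one pair [0,2]x[0,1]; an unaligned word at a span edge is what would let startTarget retreat or endTarget advance and yield additional pairs.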
}// end namespace }// end namespace

View File

@ -2,70 +2,73 @@
#define moses_BilingualDynSuffixArray_h #define moses_BilingualDynSuffixArray_h
#include "TargetPhrase.h" #include "TargetPhrase.h"
#include "DynSuffixArray.h" #include "DynSuffixArray.h"
#include "DynSAInclude/vocab.h" #include "DynSAInclude/vocab.h"
#include "DynSAInclude/types.h" #include "DynSAInclude/types.h"
#include "DynSAInclude/utils.h" #include "DynSAInclude/utils.h"
#include "InputFileStream.h" #include "InputFileStream.h"
#include "FactorTypeSet.h" #include "FactorTypeSet.h"
namespace Moses { namespace Moses
{
class SAPhrase class SAPhrase
{ {
public: public:
std::vector<wordID_t> words; std::vector<wordID_t> words;
SAPhrase(size_t phraseSize) SAPhrase(size_t phraseSize)
:words(phraseSize) :words(phraseSize)
{} {}
void SetId(size_t pos, wordID_t id) void SetId(size_t pos, wordID_t id) {
{
CHECK(pos < words.size()); CHECK(pos < words.size());
words[pos] = id; words[pos] = id;
} }
bool operator<(const SAPhrase& phr2) const bool operator<(const SAPhrase& phr2) const {
{ return words < phr2.words; } return words < phr2.words;
}
}; };
class PhrasePair class PhrasePair
{ {
public: public:
int m_startTarget, m_endTarget, m_startSource, m_endSource, m_sntIndex; int m_startTarget, m_endTarget, m_startSource, m_endSource, m_sntIndex;
PhrasePair(int startTarget, int endTarget, int startSource, int endSource, int sntIndex) PhrasePair(int startTarget, int endTarget, int startSource, int endSource, int sntIndex)
: m_startTarget(startTarget) : m_startTarget(startTarget)
, m_endTarget(endTarget) , m_endTarget(endTarget)
, m_startSource(startSource) , m_startSource(startSource)
, m_endSource(endSource) , m_endSource(endSource)
, m_sntIndex(sntIndex) , m_sntIndex(sntIndex)
{} {}
size_t GetTargetSize() const size_t GetTargetSize() const {
{ return m_endTarget - m_startTarget + 1; } return m_endTarget - m_startTarget + 1;
}
}; };
class SentenceAlignment class SentenceAlignment
{ {
public: public:
SentenceAlignment(int sntIndex, int sourceSize, int targetSize); SentenceAlignment(int sntIndex, int sourceSize, int targetSize);
int m_sntIndex; int m_sntIndex;
std::vector<wordID_t>* trgSnt; std::vector<wordID_t>* trgSnt;
std::vector<wordID_t>* srcSnt; std::vector<wordID_t>* srcSnt;
std::vector<int> numberAligned; std::vector<int> numberAligned;
std::vector< std::vector<int> > alignedList; std::vector< std::vector<int> > alignedList;
bool Extract(int maxPhraseLength, std::vector<PhrasePair*> &ret, int startSource, int endSource) const; bool Extract(int maxPhraseLength, std::vector<PhrasePair*> &ret, int startSource, int endSource) const;
}; };
class ScoresComp { class ScoresComp
public: {
public:
ScoresComp(const std::vector<float>& weights): m_weights(weights) {} ScoresComp(const std::vector<float>& weights): m_weights(weights) {}
bool operator()(const Scores& s1, const Scores& s2) const { bool operator()(const Scores& s1, const Scores& s2) const {
return s1[0] < s2[0]; // just p(e|f) as approximation return s1[0] < s2[0]; // just p(e|f) as approximation
/*float score1(0), score2(0); /*float score1(0), score2(0);
int idx1(0), idx2(0); int idx1(0), idx2(0);
for (Scores::const_iterator itr = s1.begin(); for (Scores::const_iterator itr = s1.begin();
itr != s1.end(); ++itr) { itr != s1.end(); ++itr) {
score1 += log(*itr * m_weights.at(idx1++)); score1 += log(*itr * m_weights.at(idx1++));
} }
for (Scores::const_iterator itr = s2.begin(); for (Scores::const_iterator itr = s2.begin();
itr != s2.end(); ++itr) { itr != s2.end(); ++itr) {
@ -73,73 +76,72 @@ public:
} }
return score1 < score2;*/ return score1 < score2;*/
} }
private: private:
const std::vector<float>& m_weights; const std::vector<float>& m_weights;
}; };
class BilingualDynSuffixArray { class BilingualDynSuffixArray
public: {
BilingualDynSuffixArray(); public:
~BilingualDynSuffixArray(); BilingualDynSuffixArray();
bool Load( const std::vector<FactorType>& inputFactors, ~BilingualDynSuffixArray();
const std::vector<FactorType>& outputFactors, bool Load( const std::vector<FactorType>& inputFactors,
std::string source, std::string target, std::string alignments, const std::vector<FactorType>& outputFactors,
const std::vector<float> &weight); std::string source, std::string target, std::string alignments,
void GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> >& target) const; const std::vector<float> &weight);
void CleanUp(); void GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> >& target) const;
void CleanUp();
void addSntPair(string& source, string& target, string& alignment); void addSntPair(string& source, string& target, string& alignment);
private: private:
DynSuffixArray* m_srcSA; DynSuffixArray* m_srcSA;
DynSuffixArray* m_trgSA; DynSuffixArray* m_trgSA;
std::vector<wordID_t>* m_srcCorpus; std::vector<wordID_t>* m_srcCorpus;
std::vector<wordID_t>* m_trgCorpus; std::vector<wordID_t>* m_trgCorpus;
std::vector<FactorType> m_inputFactors; std::vector<FactorType> m_inputFactors;
std::vector<FactorType> m_outputFactors; std::vector<FactorType> m_outputFactors;
std::vector<unsigned> m_srcSntBreaks, m_trgSntBreaks; std::vector<unsigned> m_srcSntBreaks, m_trgSntBreaks;
Vocab* m_srcVocab, *m_trgVocab; Vocab* m_srcVocab, *m_trgVocab;
ScoresComp* m_scoreCmp; ScoresComp* m_scoreCmp;
std::vector<SentenceAlignment> m_alignments; std::vector<SentenceAlignment> m_alignments;
std::vector<std::vector<short> > m_rawAlignments; std::vector<std::vector<short> > m_rawAlignments;
mutable std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> > m_wordPairCache; mutable std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> > m_wordPairCache;
mutable std::set<wordID_t> m_freqWordsCached; mutable std::set<wordID_t> m_freqWordsCached;
const size_t m_maxPhraseLength, m_maxSampleSize; const size_t m_maxPhraseLength, m_maxSampleSize;
int LoadCorpus(InputFileStream&, const std::vector<FactorType>& factors, int LoadCorpus(InputFileStream&, const std::vector<FactorType>& factors,
std::vector<wordID_t>&, std::vector<wordID_t>&, std::vector<wordID_t>&, std::vector<wordID_t>&,
Vocab*); Vocab*);
int LoadAlignments(InputFileStream& aligs); int LoadAlignments(InputFileStream& aligs);
int LoadRawAlignments(InputFileStream& aligs); int LoadRawAlignments(InputFileStream& aligs);
int LoadRawAlignments(string& aligs); int LoadRawAlignments(string& aligs);
bool ExtractPhrases(const int&, const int&, const int&, std::vector<PhrasePair*>&, bool=false) const; bool ExtractPhrases(const int&, const int&, const int&, std::vector<PhrasePair*>&, bool=false) const;
SentenceAlignment GetSentenceAlignment(const int, bool=false) const; SentenceAlignment GetSentenceAlignment(const int, bool=false) const;
int SampleSelection(std::vector<unsigned>&, int = 300) const; int SampleSelection(std::vector<unsigned>&, int = 300) const;
std::vector<int> GetSntIndexes(std::vector<unsigned>&, int, const std::vector<unsigned>&) const; std::vector<int> GetSntIndexes(std::vector<unsigned>&, int, const std::vector<unsigned>&) const;
TargetPhrase* GetMosesFactorIDs(const SAPhrase&) const; TargetPhrase* GetMosesFactorIDs(const SAPhrase&) const;
SAPhrase TrgPhraseFromSntIdx(const PhrasePair&) const; SAPhrase TrgPhraseFromSntIdx(const PhrasePair&) const;
bool GetLocalVocabIDs(const Phrase&, SAPhrase &) const; bool GetLocalVocabIDs(const Phrase&, SAPhrase &) const;
void CacheWordProbs(wordID_t) const; void CacheWordProbs(wordID_t) const;
void CacheFreqWords() const; void CacheFreqWords() const;
void ClearWordInCache(wordID_t); void ClearWordInCache(wordID_t);
std::pair<float, float> GetLexicalWeight(const PhrasePair&) const; std::pair<float, float> GetLexicalWeight(const PhrasePair&) const;
int GetSourceSentenceSize(size_t sentenceId) const int GetSourceSentenceSize(size_t sentenceId) const {
{ return (sentenceId==m_srcSntBreaks.size()-1) ?
return (sentenceId==m_srcSntBreaks.size()-1) ? m_srcCorpus->size() - m_srcSntBreaks.at(sentenceId) :
m_srcCorpus->size() - m_srcSntBreaks.at(sentenceId) : m_srcSntBreaks.at(sentenceId+1) - m_srcSntBreaks.at(sentenceId);
m_srcSntBreaks.at(sentenceId+1) - m_srcSntBreaks.at(sentenceId); }
} int GetTargetSentenceSize(size_t sentenceId) const {
int GetTargetSentenceSize(size_t sentenceId) const return (sentenceId==m_trgSntBreaks.size()-1) ?
{ m_trgCorpus->size() - m_trgSntBreaks.at(sentenceId) :
return (sentenceId==m_trgSntBreaks.size()-1) ? m_trgSntBreaks.at(sentenceId+1) - m_trgSntBreaks.at(sentenceId);
m_trgCorpus->size() - m_trgSntBreaks.at(sentenceId) : }
m_trgSntBreaks.at(sentenceId+1) - m_trgSntBreaks.at(sentenceId);
}
}; };
} // end namespace } // end namespace
#endif #endif
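GetSourceSentenceSize() and GetTargetSentenceSize() above recover a sentence's length from the vector of sentence-break offsets: the size is the distance to the next break, or to the corpus end for the final sentence. A small self-contained illustration of the same arithmetic (names here are illustrative, not the Moses API):

#include <cassert>
#include <vector>

// sntBreaks[i] is the corpus offset at which sentence i starts; the size of
// sentence i is the distance to the next break, or to the corpus end for
// the final sentence.
int SentenceSize(const std::vector<unsigned> &sntBreaks,
                 size_t corpusSize, size_t sentenceId)
{
  return (sentenceId == sntBreaks.size() - 1) ?
         corpusSize - sntBreaks.at(sentenceId) :
         sntBreaks.at(sentenceId + 1) - sntBreaks.at(sentenceId);
}

int main()
{
  // a 10-word corpus holding sentences that start at offsets 0, 4 and 7
  std::vector<unsigned> breaks;
  breaks.push_back(0); breaks.push_back(4); breaks.push_back(7);
  assert(SentenceSize(breaks, 10, 0) == 4); // words 0..3
  assert(SentenceSize(breaks, 10, 1) == 3); // words 4..6
  assert(SentenceSize(breaks, 10, 2) == 3); // words 7..9 (last sentence)
  return 0;
}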

View File

@ -98,8 +98,7 @@ void ChartCell::ProcessSentence(const ChartTranslationOptionList &transOptList
// add all trans opt into queue. using only 1st child node. // add all trans opt into queue. using only 1st child node.
ChartTranslationOptionList::const_iterator iterList; ChartTranslationOptionList::const_iterator iterList;
for (iterList = transOptList.begin(); iterList != transOptList.end(); ++iterList) for (iterList = transOptList.begin(); iterList != transOptList.end(); ++iterList) {
{
const ChartTranslationOption &transOpt = **iterList; const ChartTranslationOption &transOpt = **iterList;
RuleCube *ruleCube = new RuleCube(transOpt, allChartCells, m_manager); RuleCube *ruleCube = new RuleCube(transOpt, allChartCells, m_manager);
queue.Add(ruleCube); queue.Add(ruleCube);
@ -107,8 +106,7 @@ void ChartCell::ProcessSentence(const ChartTranslationOptionList &transOptList
// pluck things out of queue and add to hypo collection // pluck things out of queue and add to hypo collection
const size_t popLimit = staticData.GetCubePruningPopLimit(); const size_t popLimit = staticData.GetCubePruningPopLimit();
for (size_t numPops = 0; numPops < popLimit && !queue.IsEmpty(); ++numPops) for (size_t numPops = 0; numPops < popLimit && !queue.IsEmpty(); ++numPops) {
{
ChartHypothesis *hypo = queue.Pop(); ChartHypothesis *hypo = queue.Pop();
AddHypothesis(hypo); AddHypothesis(hypo);
} }

View File

@ -34,7 +34,7 @@ class Word;
class ChartCellLabel class ChartCellLabel
{ {
public: public:
ChartCellLabel(const WordsRange &coverage, const Word &label, ChartCellLabel(const WordsRange &coverage, const Word &label,
const ChartHypothesisCollection *stack=NULL) const ChartHypothesisCollection *stack=NULL)
: m_coverage(coverage) : m_coverage(coverage)
@ -42,12 +42,17 @@ class ChartCellLabel
, m_stack(stack) , m_stack(stack)
{} {}
const WordsRange &GetCoverage() const { return m_coverage; } const WordsRange &GetCoverage() const {
const Word &GetLabel() const { return m_label; } return m_coverage;
const ChartHypothesisCollection *GetStack() const { return m_stack; } }
const Word &GetLabel() const {
return m_label;
}
const ChartHypothesisCollection *GetStack() const {
return m_stack;
}
bool operator<(const ChartCellLabel &other) const bool operator<(const ChartCellLabel &other) const {
{
// m_coverage and m_label uniquely identify a ChartCellLabel, so don't // m_coverage and m_label uniquely identify a ChartCellLabel, so don't
// need to compare m_stack. // need to compare m_stack.
if (m_coverage == other.m_coverage) { if (m_coverage == other.m_coverage) {
@ -56,7 +61,7 @@ class ChartCellLabel
return m_coverage < other.m_coverage; return m_coverage < other.m_coverage;
} }
private: private:
const WordsRange &m_coverage; const WordsRange &m_coverage;
const Word &m_label; const Word &m_label;
const ChartHypothesisCollection *m_stack; const ChartHypothesisCollection *m_stack;

View File

@ -34,40 +34,45 @@ class ChartHypothesisCollection;
class ChartCellLabelSet class ChartCellLabelSet
{ {
private: private:
typedef std::set<ChartCellLabel> SetType; typedef std::set<ChartCellLabel> SetType;
public: public:
typedef SetType::const_iterator const_iterator; typedef SetType::const_iterator const_iterator;
ChartCellLabelSet(const WordsRange &coverage) : m_coverage(coverage) {} ChartCellLabelSet(const WordsRange &coverage) : m_coverage(coverage) {}
const_iterator begin() const { return m_set.begin(); } const_iterator begin() const {
const_iterator end() const { return m_set.end(); } return m_set.begin();
}
const_iterator end() const {
return m_set.end();
}
void AddWord(const Word &w) void AddWord(const Word &w) {
{
ChartCellLabel cellLabel(m_coverage, w); ChartCellLabel cellLabel(m_coverage, w);
m_set.insert(cellLabel); m_set.insert(cellLabel);
} }
void AddConstituent(const Word &w, const ChartHypothesisCollection &stack) void AddConstituent(const Word &w, const ChartHypothesisCollection &stack) {
{
ChartCellLabel cellLabel(m_coverage, w, &stack); ChartCellLabel cellLabel(m_coverage, w, &stack);
m_set.insert(cellLabel); m_set.insert(cellLabel);
} }
bool Empty() const { return m_set.empty(); } bool Empty() const {
return m_set.empty();
}
size_t GetSize() const { return m_set.size(); } size_t GetSize() const {
return m_set.size();
}
const ChartCellLabel *Find(const Word &w) const const ChartCellLabel *Find(const Word &w) const {
{
SetType::const_iterator p = m_set.find(ChartCellLabel(m_coverage, w)); SetType::const_iterator p = m_set.find(ChartCellLabel(m_coverage, w));
return p == m_set.end() ? 0 : &(*p); return p == m_set.end() ? 0 : &(*p);
} }
private: private:
const WordsRange &m_coverage; const WordsRange &m_coverage;
SetType m_set; SetType m_set;
}; };

View File

@ -57,15 +57,14 @@ ChartHypothesis::ChartHypothesis(const ChartTranslationOption &transOpt,
const std::vector<HypothesisDimension> &childEntries = item.GetHypothesisDimensions(); const std::vector<HypothesisDimension> &childEntries = item.GetHypothesisDimensions();
m_prevHypos.reserve(childEntries.size()); m_prevHypos.reserve(childEntries.size());
std::vector<HypothesisDimension>::const_iterator iter; std::vector<HypothesisDimension>::const_iterator iter;
for (iter = childEntries.begin(); iter != childEntries.end(); ++iter) for (iter = childEntries.begin(); iter != childEntries.end(); ++iter) {
{
m_prevHypos.push_back(iter->GetHypothesis()); m_prevHypos.push_back(iter->GetHypothesis());
} }
} }
ChartHypothesis::~ChartHypothesis() ChartHypothesis::~ChartHypothesis()
{ {
// delete feature function states // delete feature function states
for (unsigned i = 0; i < m_ffStates.size(); ++i) { for (unsigned i = 0; i < m_ffStates.size(); ++i) {
delete m_ffStates[i]; delete m_ffStates[i];
} }
@ -98,8 +97,7 @@ void ChartHypothesis::CreateOutputPhrase(Phrase &outPhrase) const
size_t nonTermInd = nonTermIndexMap[pos]; size_t nonTermInd = nonTermIndexMap[pos];
const ChartHypothesis *prevHypo = m_prevHypos[nonTermInd]; const ChartHypothesis *prevHypo = m_prevHypos[nonTermInd];
prevHypo->CreateOutputPhrase(outPhrase); prevHypo->CreateOutputPhrase(outPhrase);
} } else {
else {
outPhrase.AddWord(word); outPhrase.AddWord(word);
} }
} }
@ -120,20 +118,19 @@ Phrase ChartHypothesis::GetOutputPhrase() const
*/ */
int ChartHypothesis::RecombineCompare(const ChartHypothesis &compare) const int ChartHypothesis::RecombineCompare(const ChartHypothesis &compare) const
{ {
int comp = 0; int comp = 0;
// -1 = this < compare // -1 = this < compare
// +1 = this > compare // +1 = this > compare
// 0 = this ==compare // 0 = this ==compare
for (unsigned i = 0; i < m_ffStates.size(); ++i) for (unsigned i = 0; i < m_ffStates.size(); ++i) {
{ if (m_ffStates[i] == NULL || compare.m_ffStates[i] == NULL)
if (m_ffStates[i] == NULL || compare.m_ffStates[i] == NULL)
comp = m_ffStates[i] - compare.m_ffStates[i]; comp = m_ffStates[i] - compare.m_ffStates[i];
else else
comp = m_ffStates[i]->Compare(*compare.m_ffStates[i]); comp = m_ffStates[i]->Compare(*compare.m_ffStates[i]);
if (comp != 0) if (comp != 0)
return comp; return comp;
} }
return 0; return 0;
@ -154,12 +151,12 @@ void ChartHypothesis::CalcScore()
const ScoreComponentCollection &scoreBreakdown = GetCurrTargetPhrase().GetScoreBreakdown(); const ScoreComponentCollection &scoreBreakdown = GetCurrTargetPhrase().GetScoreBreakdown();
m_scoreBreakdown.PlusEquals(scoreBreakdown); m_scoreBreakdown.PlusEquals(scoreBreakdown);
// compute values of stateless feature functions that were not // compute values of stateless feature functions that were not
// cached in the translation option-- there is no principled distinction // cached in the translation option-- there is no principled distinction
//const vector<const StatelessFeatureFunction*>& sfs = //const vector<const StatelessFeatureFunction*>& sfs =
// m_manager.GetTranslationSystem()->GetStatelessFeatureFunctions(); // m_manager.GetTranslationSystem()->GetStatelessFeatureFunctions();
// TODO! // TODO!
//for (unsigned i = 0; i < sfs.size(); ++i) { //for (unsigned i = 0; i < sfs.size(); ++i) {
// sfs[i]->ChartEvaluate(m_targetPhrase, &m_scoreBreakdown); // sfs[i]->ChartEvaluate(m_targetPhrase, &m_scoreBreakdown);
//} //}
@ -167,7 +164,7 @@ void ChartHypothesis::CalcScore()
const std::vector<const StatefulFeatureFunction*>& ffs = const std::vector<const StatefulFeatureFunction*>& ffs =
m_manager.GetTranslationSystem()->GetStatefulFeatureFunctions(); m_manager.GetTranslationSystem()->GetStatefulFeatureFunctions();
for (unsigned i = 0; i < ffs.size(); ++i) { for (unsigned i = 0; i < ffs.size(); ++i) {
m_ffStates[i] = ffs[i]->EvaluateChart(*this,i,&m_scoreBreakdown); m_ffStates[i] = ffs[i]->EvaluateChart(*this,i,&m_scoreBreakdown);
} }
m_totalScore = m_scoreBreakdown.GetWeightedScore(); m_totalScore = m_scoreBreakdown.GetWeightedScore();
@ -258,13 +255,12 @@ std::ostream& operator<<(std::ostream& out, const ChartHypothesis& hypo)
{ {
out << hypo.GetId(); out << hypo.GetId();
// recombination // recombination
if (hypo.GetWinningHypothesis() != NULL && if (hypo.GetWinningHypothesis() != NULL &&
hypo.GetWinningHypothesis() != &hypo) hypo.GetWinningHypothesis() != &hypo) {
{ out << "->" << hypo.GetWinningHypothesis()->GetId();
out << "->" << hypo.GetWinningHypothesis()->GetId(); }
}
out << " " << hypo.GetCurrTargetPhrase() out << " " << hypo.GetCurrTargetPhrase()
//<< " " << outPhrase //<< " " << outPhrase
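RecombineCompare() above is a lexicographic comparison over the hypotheses' feature-function states: the first state pair that compares unequal decides the result, and only hypotheses whose states all compare equal (result 0) are candidates for recombination. The same pattern over plain integers, as a sketch:

#include <cassert>
#include <vector>

// Lexicographic comparison, assuming a.size() == b.size(): the first
// pairwise difference decides the result; 0 means every component matched,
// i.e. the two hypotheses are candidates for recombination.
int LexCompare(const std::vector<int> &a, const std::vector<int> &b)
{
  for (size_t i = 0; i < a.size(); ++i) {
    int comp = (a[i] < b[i]) ? -1 : ((a[i] > b[i]) ? 1 : 0);
    if (comp != 0)
      return comp;
  }
  return 0;
}

int main()
{
  std::vector<int> x, y;
  x.push_back(1); x.push_back(2);
  y.push_back(1); y.push_back(3);
  assert(LexCompare(x, y) == -1); // decided by the second component
  assert(LexCompare(x, x) == 0);  // equal states: recombinable
  return 0;
}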

View File

@ -55,7 +55,7 @@ protected:
const ChartTranslationOption &m_transOpt; const ChartTranslationOption &m_transOpt;
WordsRange m_currSourceWordsRange; WordsRange m_currSourceWordsRange;
std::vector<const FFState*> m_ffStates; /*! stateful feature function states */ std::vector<const FFState*> m_ffStates; /*! stateful feature function states */
ScoreComponentCollection m_scoreBreakdown /*! detailed score break-down by components (for instance language model, word penalty, etc) */ ScoreComponentCollection m_scoreBreakdown /*! detailed score break-down by components (for instance language model, word penalty, etc) */
,m_lmNGram ,m_lmNGram
,m_lmPrefix; ,m_lmPrefix;
@ -94,7 +94,9 @@ public:
~ChartHypothesis(); ~ChartHypothesis();
unsigned GetId() const { return m_id; } unsigned GetId() const {
return m_id;
}
const ChartTranslationOption &GetTranslationOption()const { const ChartTranslationOption &GetTranslationOption()const {
return m_transOpt; return m_transOpt;
@ -108,15 +110,17 @@ public:
inline const ChartArcList* GetArcList() const { inline const ChartArcList* GetArcList() const {
return m_arcList; return m_arcList;
} }
inline const FFState* GetFFState( size_t featureID ) const { inline const FFState* GetFFState( size_t featureID ) const {
return m_ffStates[ featureID ]; return m_ffStates[ featureID ];
} }
inline const ChartManager& GetManager() const { return m_manager; } inline const ChartManager& GetManager() const {
return m_manager;
}
void CreateOutputPhrase(Phrase &outPhrase) const; void CreateOutputPhrase(Phrase &outPhrase) const;
Phrase GetOutputPhrase() const; Phrase GetOutputPhrase() const;
int RecombineCompare(const ChartHypothesis &compare) const; int RecombineCompare(const ChartHypothesis &compare) const;
void CalcScore(); void CalcScore();
@ -135,17 +139,17 @@ public:
return m_prevHypos; return m_prevHypos;
} }
const ChartHypothesis* GetPrevHypo(size_t pos) const { const ChartHypothesis* GetPrevHypo(size_t pos) const {
return m_prevHypos[pos]; return m_prevHypos[pos];
} }
const Word &GetTargetLHS() const { const Word &GetTargetLHS() const {
return GetCurrTargetPhrase().GetTargetLHS(); return GetCurrTargetPhrase().GetTargetLHS();
} }
const ChartHypothesis* GetWinningHypothesis() const { const ChartHypothesis* GetWinningHypothesis() const {
return m_winningHypo; return m_winningHypo;
} }
TO_STRING(); TO_STRING();

View File

@ -101,8 +101,7 @@ bool ChartHypothesisCollection::AddHypothesis(ChartHypothesis *hypo, ChartManage
VERBOSE(3,"worse than matching hyp " << hypoExisting->GetId() << ", recombining" << std::endl) VERBOSE(3,"worse than matching hyp " << hypoExisting->GetId() << ", recombining" << std::endl)
if (m_nBestIsEnabled) { if (m_nBestIsEnabled) {
hypoExisting->AddArc(hypo); hypoExisting->AddArc(hypo);
} } else {
else {
ChartHypothesis::Delete(hypo); ChartHypothesis::Delete(hypo);
} }
return false; return false;

View File

@ -43,7 +43,7 @@ public:
bool operator()(const ChartHypothesis* hypoA, const ChartHypothesis* hypoB) const { bool operator()(const ChartHypothesis* hypoA, const ChartHypothesis* hypoB) const {
// assert in same cell // assert in same cell
const WordsRange &rangeA = hypoA->GetCurrSourceRange() const WordsRange &rangeA = hypoA->GetCurrSourceRange()
, &rangeB = hypoB->GetCurrSourceRange(); , &rangeB = hypoB->GetCurrSourceRange();
CHECK(rangeA == rangeB); CHECK(rangeA == rangeB);
// shouldn't be mixing hypos with different lhs // shouldn't be mixing hypos with different lhs
@ -113,7 +113,9 @@ public:
return m_hyposOrdered; return m_hyposOrdered;
} }
float GetBestScore() const { return m_bestScore; } float GetBestScore() const {
return m_bestScore;
}
void GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream, const std::map<unsigned,bool> &reachable) const; void GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream, const std::map<unsigned,bool> &reachable) const;

View File

@ -231,17 +231,17 @@ void ChartManager::GetSearchGraph(long translationId, std::ostream &outputSearch
{ {
size_t size = m_source.GetSize(); size_t size = m_source.GetSize();
// which hypotheses are reachable? // which hypotheses are reachable?
std::map<unsigned,bool> reachable; std::map<unsigned,bool> reachable;
WordsRange fullRange(0, size-1); WordsRange fullRange(0, size-1);
const ChartCell &lastCell = m_hypoStackColl.Get(fullRange); const ChartCell &lastCell = m_hypoStackColl.Get(fullRange);
const ChartHypothesis *hypo = lastCell.GetBestHypothesis(); const ChartHypothesis *hypo = lastCell.GetBestHypothesis();
if (hypo == NULL) { if (hypo == NULL) {
// no hypothesis // no hypothesis
return; return;
} }
FindReachableHypotheses( hypo, reachable); FindReachableHypotheses( hypo, reachable);
for (size_t width = 1; width <= size; ++width) { for (size_t width = 1; width <= size; ++width) {
for (size_t startPos = 0; startPos <= size-width; ++startPos) { for (size_t startPos = 0; startPos <= size-width; ++startPos) {
@ -257,42 +257,40 @@ void ChartManager::GetSearchGraph(long translationId, std::ostream &outputSearch
void ChartManager::FindReachableHypotheses( const ChartHypothesis *hypo, std::map<unsigned,bool> &reachable ) const void ChartManager::FindReachableHypotheses( const ChartHypothesis *hypo, std::map<unsigned,bool> &reachable ) const
{ {
// do not recurse, if already visited // do not recurse, if already visited
if (reachable.find(hypo->GetId()) != reachable.end()) if (reachable.find(hypo->GetId()) != reachable.end()) {
{ return;
return; }
}
// recurse // recurse
reachable[ hypo->GetId() ] = true; reachable[ hypo->GetId() ] = true;
const std::vector<const ChartHypothesis*> &previous = hypo->GetPrevHypos(); const std::vector<const ChartHypothesis*> &previous = hypo->GetPrevHypos();
for(std::vector<const ChartHypothesis*>::const_iterator i = previous.begin(); i != previous.end(); ++i) for(std::vector<const ChartHypothesis*>::const_iterator i = previous.begin(); i != previous.end(); ++i) {
{ FindReachableHypotheses( *i, reachable );
FindReachableHypotheses( *i, reachable ); }
}
// also loop over recombined hypotheses (arcs) // also loop over recombined hypotheses (arcs)
const ChartArcList *arcList = hypo->GetArcList(); const ChartArcList *arcList = hypo->GetArcList();
if (arcList) { if (arcList) {
ChartArcList::const_iterator iterArc; ChartArcList::const_iterator iterArc;
for (iterArc = arcList->begin(); iterArc != arcList->end(); ++iterArc) { for (iterArc = arcList->begin(); iterArc != arcList->end(); ++iterArc) {
const ChartHypothesis &arc = **iterArc; const ChartHypothesis &arc = **iterArc;
FindReachableHypotheses( &arc, reachable ); FindReachableHypotheses( &arc, reachable );
} }
} }
} }
void ChartManager::CreateDeviantPaths( void ChartManager::CreateDeviantPaths(
boost::shared_ptr<const ChartTrellisPath> basePath, boost::shared_ptr<const ChartTrellisPath> basePath,
ChartTrellisDetourQueue &q) ChartTrellisDetourQueue &q)
{ {
CreateDeviantPaths(basePath, basePath->GetFinalNode(), q); CreateDeviantPaths(basePath, basePath->GetFinalNode(), q);
} }
void ChartManager::CreateDeviantPaths( void ChartManager::CreateDeviantPaths(
boost::shared_ptr<const ChartTrellisPath> basePath, boost::shared_ptr<const ChartTrellisPath> basePath,
const ChartTrellisNode &substitutedNode, const ChartTrellisNode &substitutedNode,
ChartTrellisDetourQueue &queue) ChartTrellisDetourQueue &queue)
{ {
const ChartArcList *arcList = substitutedNode.GetHypothesis().GetArcList(); const ChartArcList *arcList = substitutedNode.GetHypothesis().GetArcList();
if (arcList) { if (arcList) {

View File

@ -69,7 +69,7 @@ public:
void CalcNBest(size_t count, ChartTrellisPathList &ret,bool onlyDistinct=0) const; void CalcNBest(size_t count, ChartTrellisPathList &ret,bool onlyDistinct=0) const;
void GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream) const; void GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream) const;
void FindReachableHypotheses( const ChartHypothesis *hypo, std::map<unsigned,bool> &reachable ) const; /* auxiliary function for GetSearchGraph */ void FindReachableHypotheses( const ChartHypothesis *hypo, std::map<unsigned,bool> &reachable ) const; /* auxiliary function for GetSearchGraph */
const InputType& GetSource() const { const InputType& GetSource() const {
return m_source; return m_source;
@ -89,7 +89,9 @@ public:
m_sentenceStats = std::auto_ptr<SentenceStats>(new SentenceStats(source)); m_sentenceStats = std::auto_ptr<SentenceStats>(new SentenceStats(source));
} }
unsigned GetNextHypoId() { return m_hypothesisId++; } unsigned GetNextHypoId() {
return m_hypothesisId++;
}
}; };
} }

View File

@ -77,19 +77,19 @@ void ChartRuleLookupManagerMemory::GetChartRuleCollection(
// get list of all rules that apply to spans at same starting position // get list of all rules that apply to spans at same starting position
DottedRuleColl &dottedRuleCol = *m_dottedRuleColls[range.GetStartPos()]; DottedRuleColl &dottedRuleCol = *m_dottedRuleColls[range.GetStartPos()];
const DottedRuleList &expandableDottedRuleList = dottedRuleCol.GetExpandableDottedRuleList(); const DottedRuleList &expandableDottedRuleList = dottedRuleCol.GetExpandableDottedRuleList();
const ChartCellLabel &sourceWordLabel = GetCellCollection().Get(WordsRange(absEndPos, absEndPos)).GetSourceWordLabel(); const ChartCellLabel &sourceWordLabel = GetCellCollection().Get(WordsRange(absEndPos, absEndPos)).GetSourceWordLabel();
// loop through the rules // loop through the rules
// (note that expandableDottedRuleList can be expanded as the loop runs // (note that expandableDottedRuleList can be expanded as the loop runs
// through calls to ExtendPartialRuleApplication()) // through calls to ExtendPartialRuleApplication())
for (size_t ind = 0; ind < expandableDottedRuleList.size(); ++ind) { for (size_t ind = 0; ind < expandableDottedRuleList.size(); ++ind) {
// rule we are about to extend // rule we are about to extend
const DottedRuleInMemory &prevDottedRule = *expandableDottedRuleList[ind]; const DottedRuleInMemory &prevDottedRule = *expandableDottedRuleList[ind];
// we will now try to extend it, starting after where it ended // we will now try to extend it, starting after where it ended
size_t startPos = prevDottedRule.IsRoot() size_t startPos = prevDottedRule.IsRoot()
? range.GetStartPos() ? range.GetStartPos()
: prevDottedRule.GetWordsRange().GetEndPos() + 1; : prevDottedRule.GetWordsRange().GetEndPos() + 1;
// search for terminal symbol // search for terminal symbol
// (if only one more word position needs to be covered) // (if only one more word position needs to be covered)
@ -102,15 +102,15 @@ void ChartRuleLookupManagerMemory::GetChartRuleCollection(
// if we found a new rule -> create it and add it to the list // if we found a new rule -> create it and add it to the list
if (node != NULL) { if (node != NULL) {
// create the rule // create the rule
#ifdef USE_BOOST_POOL #ifdef USE_BOOST_POOL
DottedRuleInMemory *dottedRule = m_dottedRulePool.malloc(); DottedRuleInMemory *dottedRule = m_dottedRulePool.malloc();
new (dottedRule) DottedRuleInMemory(*node, sourceWordLabel, new (dottedRule) DottedRuleInMemory(*node, sourceWordLabel,
prevDottedRule); prevDottedRule);
#else #else
DottedRuleInMemory *dottedRule = new DottedRuleInMemory(*node, DottedRuleInMemory *dottedRule = new DottedRuleInMemory(*node,
sourceWordLabel, sourceWordLabel,
prevDottedRule); prevDottedRule);
#endif #endif
dottedRuleCol.Add(relEndPos+1, dottedRule); dottedRuleCol.Add(relEndPos+1, dottedRule);
} }
@ -136,9 +136,7 @@ void ChartRuleLookupManagerMemory::GetChartRuleCollection(
// word. // word.
endPos = absEndPos - 1; endPos = absEndPos - 1;
stackInd = relEndPos; stackInd = relEndPos;
} } else {
else
{
endPos = absEndPos; endPos = absEndPos;
stackInd = relEndPos + 1; stackInd = relEndPos + 1;
} }
@ -215,7 +213,7 @@ void ChartRuleLookupManagerMemory::ExtendPartialRuleApplication(
// We'll do whichever minimises the number of lookups: // We'll do whichever minimises the number of lookups:
if (numCombinations <= numChildren*2) { if (numCombinations <= numChildren*2) {
// loop over possible source non-terminal labels (as found in input tree) // loop over possible source non-terminal labels (as found in input tree)
NonTerminalSet::const_iterator p = sourceNonTerms.begin(); NonTerminalSet::const_iterator p = sourceNonTerms.begin();
NonTerminalSet::const_iterator sEnd = sourceNonTerms.end(); NonTerminalSet::const_iterator sEnd = sourceNonTerms.end();
for (; p != sEnd; ++p) { for (; p != sEnd; ++p) {
@ -242,14 +240,12 @@ void ChartRuleLookupManagerMemory::ExtendPartialRuleApplication(
new (rule) DottedRuleInMemory(*child, cellLabel, prevDottedRule); new (rule) DottedRuleInMemory(*child, cellLabel, prevDottedRule);
#else #else
DottedRuleInMemory *rule = new DottedRuleInMemory(*child, cellLabel, DottedRuleInMemory *rule = new DottedRuleInMemory(*child, cellLabel,
prevDottedRule); prevDottedRule);
#endif #endif
dottedRuleColl.Add(stackInd, rule); dottedRuleColl.Add(stackInd, rule);
} }
} }
} } else {
else
{
// loop over possible expansions of the rule // loop over possible expansions of the rule
PhraseDictionaryNodeSCFG::NonTerminalMap::const_iterator p; PhraseDictionaryNodeSCFG::NonTerminalMap::const_iterator p;
PhraseDictionaryNodeSCFG::NonTerminalMap::const_iterator end = PhraseDictionaryNodeSCFG::NonTerminalMap::const_iterator end =
@ -274,7 +270,7 @@ void ChartRuleLookupManagerMemory::ExtendPartialRuleApplication(
new (rule) DottedRuleInMemory(child, *cellLabel, prevDottedRule); new (rule) DottedRuleInMemory(child, *cellLabel, prevDottedRule);
#else #else
DottedRuleInMemory *rule = new DottedRuleInMemory(child, *cellLabel, DottedRuleInMemory *rule = new DottedRuleInMemory(child, *cellLabel,
prevDottedRule); prevDottedRule);
#endif #endif
dottedRuleColl.Add(stackInd, rule); dottedRuleColl.Add(stackInd, rule);
} }

View File

@ -30,7 +30,7 @@ namespace Moses
{ {
void ChartTranslationOption::CalcEstimateOfBestScore( void ChartTranslationOption::CalcEstimateOfBestScore(
const ChartCellCollection &allChartCells) const ChartCellCollection &allChartCells)
{ {
const TargetPhrase &targetPhrase = **(m_targetPhraseCollection.begin()); const TargetPhrase &targetPhrase = **(m_targetPhraseCollection.begin());
m_estimateOfBestScore = targetPhrase.GetFutureScore(); m_estimateOfBestScore = targetPhrase.GetFutureScore();

View File

@ -37,7 +37,7 @@ class ChartCellCollection;
// of translations and provides an estimate of the best score. // of translations and provides an estimate of the best score.
class ChartTranslationOption class ChartTranslationOption
{ {
public: public:
ChartTranslationOption(const TargetPhraseCollection &targetPhraseColl, ChartTranslationOption(const TargetPhraseCollection &targetPhraseColl,
const DottedRule &dottedRule, const DottedRule &dottedRule,
const WordsRange &wordsRange, const WordsRange &wordsRange,
@ -45,16 +45,17 @@ class ChartTranslationOption
: m_dottedRule(dottedRule) : m_dottedRule(dottedRule)
, m_targetPhraseCollection(targetPhraseColl) , m_targetPhraseCollection(targetPhraseColl)
, m_wordsRange(wordsRange) , m_wordsRange(wordsRange)
, m_estimateOfBestScore(0) , m_estimateOfBestScore(0) {
{
CalcEstimateOfBestScore(allChartCells); CalcEstimateOfBestScore(allChartCells);
} }
~ChartTranslationOption() {} ~ChartTranslationOption() {}
const DottedRule &GetDottedRule() const { return m_dottedRule; } const DottedRule &GetDottedRule() const {
return m_dottedRule;
}
const TargetPhraseCollection &GetTargetPhraseCollection() const { const TargetPhraseCollection &GetTargetPhraseCollection() const {
return m_targetPhraseCollection; return m_targetPhraseCollection;
} }
@ -65,9 +66,11 @@ class ChartTranslationOption
// return an estimate of the best score possible with this translation option. // return an estimate of the best score possible with this translation option.
// the estimate is the sum of the top target phrase's estimated score plus the // the estimate is the sum of the top target phrase's estimated score plus the
// scores of the best child hypotheses. // scores of the best child hypotheses.
inline float GetEstimateOfBestScore() const { return m_estimateOfBestScore; } inline float GetEstimateOfBestScore() const {
return m_estimateOfBestScore;
}
private: private:
// not implemented // not implemented
ChartTranslationOption &operator=(const ChartTranslationOption &); ChartTranslationOption &operator=(const ChartTranslationOption &);
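Per the comment above, the estimate of the best score is the top target phrase's estimated (future) score plus the best scores of the child hypotheses. Stated as a tiny sketch of that rule (a reading of the comment, not the actual CalcEstimateOfBestScore implementation):

#include <vector>

// estimate = future score of the best target phrase plus the best
// hypothesis score of each child cell
float EstimateOfBestScore(float topPhraseFutureScore,
                          const std::vector<float> &bestChildScores)
{
  float estimate = topPhraseFutureScore;
  for (size_t i = 0; i < bestChildScores.size(); ++i)
    estimate += bestChildScores[i];
  return estimate;
}

int main()
{
  std::vector<float> kids;
  kids.push_back(-1.5f);
  kids.push_back(-2.0f);
  float est = EstimateOfBestScore(-0.5f, kids); // -4.0
  return est < 0.0f ? 0 : 1;
}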

View File

@ -106,8 +106,8 @@ void ChartTranslationOptionCollection::ProcessUnknownWord(size_t startPos, size_
return; return;
} }
if (startPos == 0 || startPos == m_source.GetSize() - 1) if (startPos == 0 || startPos == m_source.GetSize() - 1) {
{ // don't create unknown words for <S> or </S> tags. Otherwise they can be moved. Should only be translated by glue rules // don't create unknown words for <S> or </S> tags. Otherwise they can be moved. Should only be translated by glue rules
return; return;
} }

View File

@ -74,9 +74,9 @@ protected:
public: public:
ChartTranslationOptionCollection(InputType const& source ChartTranslationOptionCollection(InputType const& source
, const TranslationSystem* system , const TranslationSystem* system
, const ChartCellCollection &hypoStackColl , const ChartCellCollection &hypoStackColl
, const std::vector<ChartRuleLookupManager*> &ruleLookupManagers); , const std::vector<ChartRuleLookupManager*> &ruleLookupManagers);
virtual ~ChartTranslationOptionCollection(); virtual ~ChartTranslationOptionCollection();
void CreateTranslationOptionsForRange(size_t startPos void CreateTranslationOptionsForRange(size_t startPos
, size_t endPos); , size_t endPos);

View File

@ -66,12 +66,11 @@ void ChartTranslationOptionList::Add(const TargetPhraseCollection &targetPhraseC
if (m_collection.size() < ruleLimit) { if (m_collection.size() < ruleLimit) {
// not yet filled out quota. add everything // not yet filled out quota. add everything
ChartTranslationOption *option = new ChartTranslationOption( ChartTranslationOption *option = new ChartTranslationOption(
targetPhraseCollection, dottedRule, m_range, chartCellColl); targetPhraseCollection, dottedRule, m_range, chartCellColl);
m_collection.push_back(option); m_collection.push_back(option);
float score = option->GetEstimateOfBestScore(); float score = option->GetEstimateOfBestScore();
m_scoreThreshold = (score < m_scoreThreshold) ? score : m_scoreThreshold; m_scoreThreshold = (score < m_scoreThreshold) ? score : m_scoreThreshold;
} } else {
else {
// full but not bursting. add if better than worst score // full but not bursting. add if better than worst score
ChartTranslationOption option(targetPhraseCollection, dottedRule, ChartTranslationOption option(targetPhraseCollection, dottedRule,
m_range, chartCellColl); m_range, chartCellColl);
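Add() above implements a simple beam over translation options: while the list is below the rule limit everything is accepted and the worst score seen so far becomes the threshold; once full, a candidate is only worth constructing if it beats that threshold. A standalone sketch of the policy (the "bursting" case and the eventual pruning in the real code are omitted):

#include <limits>
#include <vector>

// Simplified pruning policy: accept everything while under the limit and
// track the worst score as the threshold; once full, only consider scores
// that beat the threshold. The real list also lets the collection burst
// past the limit and prunes later; that part is left out here.
class OptionBeam
{
public:
  OptionBeam(size_t limit)
    : m_limit(limit)
    , m_threshold(std::numeric_limits<float>::infinity()) {}

  bool Add(float score) {
    if (m_scores.size() < m_limit) {
      m_scores.push_back(score); // quota not yet filled: add everything
      m_threshold = (score < m_threshold) ? score : m_threshold;
      return true;
    }
    if (score <= m_threshold)
      return false;              // no better than the worst: reject
    m_scores.push_back(score);   // better than the worst: keep it
    return true;
  }

private:
  size_t m_limit;
  float m_threshold;
  std::vector<float> m_scores;
};

int main()
{
  OptionBeam beam(2);
  beam.Add(-1.0f);                 // accepted, threshold -1
  beam.Add(-3.0f);                 // accepted, threshold -3
  bool kept = beam.Add(-2.0f);     // full: beats -3, kept
  bool dropped = !beam.Add(-5.0f); // full: worse than -3, rejected
  return (kept && dropped) ? 0 : 1;
}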

View File

@ -1,17 +1,17 @@
/*********************************************************************** /***********************************************************************
Moses - statistical machine translation system Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh Copyright (C) 2006-2011 University of Edinburgh
This library is free software; you can redistribute it and/or This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version. version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful, This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details. Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ -27,15 +27,15 @@ namespace Moses
{ {
ChartTrellisDetour::ChartTrellisDetour( ChartTrellisDetour::ChartTrellisDetour(
boost::shared_ptr<const ChartTrellisPath> basePath, boost::shared_ptr<const ChartTrellisPath> basePath,
const ChartTrellisNode &substitutedNode, const ChartTrellisNode &substitutedNode,
const ChartHypothesis &replacementHypo) const ChartHypothesis &replacementHypo)
: m_basePath(basePath) : m_basePath(basePath)
, m_substitutedNode(substitutedNode) , m_substitutedNode(substitutedNode)
, m_replacementHypo(replacementHypo) , m_replacementHypo(replacementHypo)
{ {
float diff = replacementHypo.GetTotalScore() float diff = replacementHypo.GetTotalScore()
- substitutedNode.GetHypothesis().GetTotalScore(); - substitutedNode.GetHypothesis().GetTotalScore();
m_totalScore = basePath->GetTotalScore() + diff; m_totalScore = basePath->GetTotalScore() + diff;
} }

View File

@ -1,17 +1,17 @@
/*********************************************************************** /***********************************************************************
Moses - statistical machine translation system Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh Copyright (C) 2006-2011 University of Edinburgh
This library is free software; you can redistribute it and/or This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version. version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful, This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details. Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ -29,20 +29,24 @@ class ChartTrellisPath;
class ChartTrellisDetour class ChartTrellisDetour
{ {
public: public:
ChartTrellisDetour(boost::shared_ptr<const ChartTrellisPath>, ChartTrellisDetour(boost::shared_ptr<const ChartTrellisPath>,
const ChartTrellisNode &, const ChartHypothesis &); const ChartTrellisNode &, const ChartHypothesis &);
const ChartTrellisPath &GetBasePath() const { return *m_basePath; } const ChartTrellisPath &GetBasePath() const {
return *m_basePath;
}
const ChartTrellisNode &GetSubstitutedNode() const { const ChartTrellisNode &GetSubstitutedNode() const {
return m_substitutedNode; return m_substitutedNode;
} }
const ChartHypothesis &GetReplacementHypo() const { const ChartHypothesis &GetReplacementHypo() const {
return m_replacementHypo; return m_replacementHypo;
} }
float GetTotalScore() const { return m_totalScore; } float GetTotalScore() const {
return m_totalScore;
}
private: private:
boost::shared_ptr<const ChartTrellisPath> m_basePath; boost::shared_ptr<const ChartTrellisPath> m_basePath;
const ChartTrellisNode &m_substitutedNode; const ChartTrellisNode &m_substitutedNode;
const ChartHypothesis &m_replacementHypo; const ChartHypothesis &m_replacementHypo;

View File

@ -1,17 +1,17 @@
/*********************************************************************** /***********************************************************************
Moses - statistical machine translation system Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh Copyright (C) 2006-2011 University of Edinburgh
This library is free software; you can redistribute it and/or This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version. version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful, This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details. Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ -21,13 +21,16 @@
#include "Util.h" #include "Util.h"
namespace Moses { namespace Moses
{
ChartTrellisDetourQueue::~ChartTrellisDetourQueue() { ChartTrellisDetourQueue::~ChartTrellisDetourQueue()
{
RemoveAllInColl(m_queue); RemoveAllInColl(m_queue);
} }
void ChartTrellisDetourQueue::Push(const ChartTrellisDetour *detour) { void ChartTrellisDetourQueue::Push(const ChartTrellisDetour *detour)
{
if (m_capacity == 0 || m_queue.size() < m_capacity) { if (m_capacity == 0 || m_queue.size() < m_capacity) {
m_queue.insert(detour); m_queue.insert(detour);
} else if (detour->GetTotalScore() > (*m_queue.rbegin())->GetTotalScore()) { } else if (detour->GetTotalScore() > (*m_queue.rbegin())->GetTotalScore()) {
@ -43,7 +46,8 @@ void ChartTrellisDetourQueue::Push(const ChartTrellisDetour *detour) {
} }
} }
const ChartTrellisDetour *ChartTrellisDetourQueue::Pop() { const ChartTrellisDetour *ChartTrellisDetourQueue::Pop()
{
QueueType::iterator p = m_queue.begin(); QueueType::iterator p = m_queue.begin();
const ChartTrellisDetour *top = *p; const ChartTrellisDetour *top = *p;
m_queue.erase(p); m_queue.erase(p);

View File

@ -1,17 +1,17 @@
/*********************************************************************** /***********************************************************************
Moses - statistical machine translation system Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh Copyright (C) 2006-2011 University of Edinburgh
This library is free software; you can redistribute it and/or This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version. version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful, This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details. Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ -23,19 +23,23 @@
#include <set> #include <set>
namespace Moses { namespace Moses
{
// A bounded priority queue of ChartTrellisDetour pointers. The top item is // A bounded priority queue of ChartTrellisDetour pointers. The top item is
// the best scoring detour. The queue assumes ownership of pushed items and // the best scoring detour. The queue assumes ownership of pushed items and
// relinquishes ownership when they are popped. Any remaining items at the // relinquishes ownership when they are popped. Any remaining items at the
// time of the queue's destruction are deleted. // time of the queue's destruction are deleted.
class ChartTrellisDetourQueue { class ChartTrellisDetourQueue
public: {
public:
// Create empty queue with fixed capacity of c. Capacity 0 means unbounded. // Create empty queue with fixed capacity of c. Capacity 0 means unbounded.
ChartTrellisDetourQueue(size_t c) : m_capacity(c) {} ChartTrellisDetourQueue(size_t c) : m_capacity(c) {}
~ChartTrellisDetourQueue(); ~ChartTrellisDetourQueue();
bool Empty() const { return m_queue.empty(); } bool Empty() const {
return m_queue.empty();
}
// Add the detour to the queue or delete it if the queue is full and the // Add the detour to the queue or delete it if the queue is full and the
// score is no better than the queue's worst score. // score is no better than the queue's worst score.
@ -45,7 +49,7 @@ class ChartTrellisDetourQueue {
// caller is responsible for deleting the object. // caller is responsible for deleting the object.
const ChartTrellisDetour *Pop(); const ChartTrellisDetour *Pop();
private: private:
struct DetourOrderer { struct DetourOrderer {
bool operator()(const ChartTrellisDetour* a, bool operator()(const ChartTrellisDetour* a,
const ChartTrellisDetour* b) const { const ChartTrellisDetour* b) const {
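ChartTrellisDetourQueue, shown above, is a bounded max-priority queue: when full, a push either displaces the current worst item or drops the newcomer, and Pop() always returns the best-scoring item. A minimal analogue over plain integer scores (the pointer-ownership handling of the real class is simplified away):

#include <functional>
#include <set>

// Bounded max-priority queue over plain scores: capacity 0 means unbounded;
// the best score sits at begin(), the worst at rbegin().
class BoundedQueue
{
public:
  BoundedQueue(size_t capacity) : m_capacity(capacity) {}

  void Push(int score) {
    if (m_capacity == 0 || m_queue.size() < m_capacity) {
      m_queue.insert(score);
    } else if (score > *m_queue.rbegin()) {
      // full: displace the current worst item
      m_queue.erase(--m_queue.end());
      m_queue.insert(score);
    }
    // otherwise: no better than the worst, drop it
  }

  int Pop() {
    std::multiset<int, std::greater<int> >::iterator p = m_queue.begin();
    int top = *p;
    m_queue.erase(p);
    return top;
  }

  bool Empty() const { return m_queue.empty(); }

private:
  size_t m_capacity;
  std::multiset<int, std::greater<int> > m_queue;
};

int main()
{
  BoundedQueue q(2);
  q.Push(5);
  q.Push(1);
  q.Push(7);          // displaces 1, the current worst
  q.Push(0);          // worse than everything kept: dropped
  int best = q.Pop(); // 7
  int next = q.Pop(); // 5
  return (best == 7 && next == 5 && q.Empty()) ? 0 : 1;
}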

View File

@ -31,16 +31,16 @@ namespace Moses
{ {
ChartTrellisNode::ChartTrellisNode(const ChartHypothesis &hypo) ChartTrellisNode::ChartTrellisNode(const ChartHypothesis &hypo)
: m_hypo(hypo) : m_hypo(hypo)
{ {
CreateChildren(); CreateChildren();
} }
ChartTrellisNode::ChartTrellisNode(const ChartTrellisDetour &detour, ChartTrellisNode::ChartTrellisNode(const ChartTrellisDetour &detour,
ChartTrellisNode *&deviationPoint) ChartTrellisNode *&deviationPoint)
: m_hypo((&detour.GetBasePath().GetFinalNode() == &detour.GetSubstitutedNode()) : m_hypo((&detour.GetBasePath().GetFinalNode() == &detour.GetSubstitutedNode())
? detour.GetReplacementHypo() ? detour.GetReplacementHypo()
: detour.GetBasePath().GetFinalNode().GetHypothesis()) : detour.GetBasePath().GetFinalNode().GetHypothesis())
{ {
if (&m_hypo == &detour.GetReplacementHypo()) { if (&m_hypo == &detour.GetReplacementHypo()) {
deviationPoint = this; deviationPoint = this;
@ -56,9 +56,9 @@ ChartTrellisNode::ChartTrellisNode(const ChartTrellisNode &root,
const ChartTrellisNode &substitutedNode, const ChartTrellisNode &substitutedNode,
const ChartHypothesis &replacementHypo, const ChartHypothesis &replacementHypo,
ChartTrellisNode *&deviationPoint) ChartTrellisNode *&deviationPoint)
: m_hypo((&root == &substitutedNode) : m_hypo((&root == &substitutedNode)
? replacementHypo ? replacementHypo
: root.GetHypothesis()) : root.GetHypothesis())
{ {
if (&root == &substitutedNode) { if (&root == &substitutedNode) {
deviationPoint = this; deviationPoint = this;
@ -124,8 +124,8 @@ void ChartTrellisNode::CreateChildren(const ChartTrellisNode &rootNode,
for (size_t ind = 0; ind < children.size(); ++ind) { for (size_t ind = 0; ind < children.size(); ++ind) {
const ChartTrellisNode *origChild = children[ind]; const ChartTrellisNode *origChild = children[ind];
ChartTrellisNode *child = new ChartTrellisNode(*origChild, substitutedNode, ChartTrellisNode *child = new ChartTrellisNode(*origChild, substitutedNode,
replacementHypo, replacementHypo,
deviationPoint); deviationPoint);
m_children.push_back(child); m_children.push_back(child);
} }
} }

View File

@ -32,7 +32,7 @@ class ChartTrellisDetour;
class ChartTrellisNode class ChartTrellisNode
{ {
public: public:
typedef std::vector<ChartTrellisNode*> NodeChildren; typedef std::vector<ChartTrellisNode*> NodeChildren;
ChartTrellisNode(const ChartHypothesis &hypo); ChartTrellisNode(const ChartHypothesis &hypo);
@ -40,15 +40,21 @@ class ChartTrellisNode
~ChartTrellisNode(); ~ChartTrellisNode();
const ChartHypothesis &GetHypothesis() const { return m_hypo; } const ChartHypothesis &GetHypothesis() const {
return m_hypo;
}
const NodeChildren &GetChildren() const { return m_children; } const NodeChildren &GetChildren() const {
return m_children;
}
const ChartTrellisNode &GetChild(size_t i) const { return *m_children[i]; } const ChartTrellisNode &GetChild(size_t i) const {
return *m_children[i];
}
Phrase GetOutputPhrase() const; Phrase GetOutputPhrase() const;
private: private:
ChartTrellisNode(const ChartTrellisNode &); // Not implemented ChartTrellisNode(const ChartTrellisNode &); // Not implemented
ChartTrellisNode& operator=(const ChartTrellisNode &); // Not implemented ChartTrellisNode& operator=(const ChartTrellisNode &); // Not implemented

View File

@ -30,17 +30,17 @@ namespace Moses
{ {
ChartTrellisPath::ChartTrellisPath(const ChartHypothesis &hypo) ChartTrellisPath::ChartTrellisPath(const ChartHypothesis &hypo)
: m_finalNode(new ChartTrellisNode(hypo)) : m_finalNode(new ChartTrellisNode(hypo))
, m_deviationPoint(NULL) , m_deviationPoint(NULL)
, m_scoreBreakdown(hypo.GetScoreBreakdown()) , m_scoreBreakdown(hypo.GetScoreBreakdown())
, m_totalScore(hypo.GetTotalScore()) , m_totalScore(hypo.GetTotalScore())
{ {
} }
ChartTrellisPath::ChartTrellisPath(const ChartTrellisDetour &detour) ChartTrellisPath::ChartTrellisPath(const ChartTrellisDetour &detour)
: m_finalNode(new ChartTrellisNode(detour, m_deviationPoint)) : m_finalNode(new ChartTrellisNode(detour, m_deviationPoint))
, m_scoreBreakdown(detour.GetBasePath().m_scoreBreakdown) , m_scoreBreakdown(detour.GetBasePath().m_scoreBreakdown)
, m_totalScore(0) , m_totalScore(0)
{ {
CHECK(m_deviationPoint); CHECK(m_deviationPoint);
ScoreComponentCollection scoreChange; ScoreComponentCollection scoreChange;

View File

@ -36,18 +36,24 @@ class ChartTrellisNode;
class ChartTrellisPath class ChartTrellisPath
{ {
public: public:
ChartTrellisPath(const ChartHypothesis &hypo); ChartTrellisPath(const ChartHypothesis &hypo);
ChartTrellisPath(const ChartTrellisDetour &detour); ChartTrellisPath(const ChartTrellisDetour &detour);
~ChartTrellisPath(); ~ChartTrellisPath();
const ChartTrellisNode &GetFinalNode() const { return *m_finalNode; } const ChartTrellisNode &GetFinalNode() const {
return *m_finalNode;
}
const ChartTrellisNode *GetDeviationPoint() const { return m_deviationPoint; } const ChartTrellisNode *GetDeviationPoint() const {
return m_deviationPoint;
}
//! get score for this path through the trellis //! get score for this path through the trellis
float GetTotalScore() const { return m_totalScore; } float GetTotalScore() const {
return m_totalScore;
}
Phrase GetOutputPhrase() const; Phrase GetOutputPhrase() const;
@ -56,7 +62,7 @@ class ChartTrellisPath
return m_scoreBreakdown; return m_scoreBreakdown;
} }
private: private:
ChartTrellisPath(const ChartTrellisPath &); // Not implemented ChartTrellisPath(const ChartTrellisPath &); // Not implemented
ChartTrellisPath &operator=(const ChartTrellisPath &); // Not implemented ChartTrellisPath &operator=(const ChartTrellisPath &); // Not implemented

View File

@ -32,26 +32,38 @@ class DottedRule
{ {
friend std::ostream& operator<<(std::ostream &, const DottedRule &); friend std::ostream& operator<<(std::ostream &, const DottedRule &);
public: public:
// used only to init dot stack. // used only to init dot stack.
DottedRule() DottedRule()
: m_cellLabel(NULL) : m_cellLabel(NULL)
, m_prev(NULL) {} , m_prev(NULL) {}
DottedRule(const ChartCellLabel &ccl, const DottedRule &prev) DottedRule(const ChartCellLabel &ccl, const DottedRule &prev)
: m_cellLabel(&ccl) : m_cellLabel(&ccl)
, m_prev(&prev) {} , m_prev(&prev) {}
const WordsRange &GetWordsRange() const { return m_cellLabel->GetCoverage(); } const WordsRange &GetWordsRange() const {
const Word &GetSourceWord() const { return m_cellLabel->GetLabel(); } return m_cellLabel->GetCoverage();
bool IsNonTerminal() const { return m_cellLabel->GetLabel().IsNonTerminal(); } }
const DottedRule *GetPrev() const { return m_prev; } const Word &GetSourceWord() const {
bool IsRoot() const { return m_prev == NULL; } return m_cellLabel->GetLabel();
const ChartCellLabel &GetChartCellLabel() const { return *m_cellLabel; } }
bool IsNonTerminal() const {
return m_cellLabel->GetLabel().IsNonTerminal();
}
const DottedRule *GetPrev() const {
return m_prev;
}
bool IsRoot() const {
return m_prev == NULL;
}
const ChartCellLabel &GetChartCellLabel() const {
return *m_cellLabel;
}
private: private:
const ChartCellLabel *m_cellLabel; // usually contains something, unless const ChartCellLabel *m_cellLabel; // usually contains something, unless
// it's the init processed rule // it's the init processed rule
const DottedRule *m_prev; const DottedRule *m_prev;
}; };
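A DottedRule, as defined above, is a node in a backward-linked chain: each partial rule application stores the cell label it just consumed plus a pointer to the previous dot position, and a NULL m_prev marks the root. Walking the chain therefore recovers the consumed symbols in reverse. A simplified illustration with plain strings in place of ChartCellLabel:

#include <iostream>
#include <string>
#include <vector>

// Each node records one consumed symbol and links back to the previous dot
// position; the chain's root has no predecessor and no symbol.
struct Dotted {
  const char *label;
  const Dotted *prev;
};

// Recover the consumed symbols in left-to-right order by walking backwards.
std::vector<std::string> Symbols(const Dotted *rule)
{
  std::vector<std::string> out;
  for (; rule != NULL && rule->label != NULL; rule = rule->prev)
    out.insert(out.begin(), rule->label);
  return out;
}

int main()
{
  Dotted root = { NULL, NULL };        // init entry for the dot stack
  Dotted first = { "X", &root };       // consumed non-terminal X
  Dotted second = { "house", &first }; // then terminal "house"
  std::vector<std::string> syms = Symbols(&second);
  for (size_t i = 0; i < syms.size(); ++i)
    std::cout << syms[i] << " ";       // prints: X house
  std::cout << "\n";
  return 0;
}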

View File

@ -1,17 +1,17 @@
/*********************************************************************** /***********************************************************************
Moses - statistical machine translation system Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh Copyright (C) 2006-2011 University of Edinburgh
This library is free software; you can redistribute it and/or This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version. version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful, This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details. Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

View File

@ -34,21 +34,23 @@ namespace Moses
class DottedRuleInMemory : public DottedRule
{
public:
  // used only to init dot stack.
  explicit DottedRuleInMemory(const PhraseDictionaryNodeSCFG &node)
    : DottedRule()
    , m_node(node) {}

  DottedRuleInMemory(const PhraseDictionaryNodeSCFG &node,
                     const ChartCellLabel &cellLabel,
                     const DottedRuleInMemory &prev)
    : DottedRule(cellLabel, prev)
    , m_node(node) {}

  const PhraseDictionaryNodeSCFG &GetLastNode() const {
    return m_node;
  }

private:
  const PhraseDictionaryNodeSCFG &m_node;
};


@ -34,26 +34,32 @@ namespace Moses
{
class DottedRuleOnDisk : public DottedRule
{
public:
  // used only to init dot stack.
  explicit DottedRuleOnDisk(const OnDiskPt::PhraseNode &lastNode)
    : DottedRule()
    , m_lastNode(lastNode)
    , m_done(false) {}

  DottedRuleOnDisk(const OnDiskPt::PhraseNode &lastNode,
                   const ChartCellLabel &cellLabel,
                   const DottedRuleOnDisk &prev)
    : DottedRule(cellLabel, prev)
    , m_lastNode(lastNode)
    , m_done(false) {}

  const OnDiskPt::PhraseNode &GetLastNode() const {
    return m_lastNode;
  }

  bool Done() const {
    return m_done;
  }
  void Done(bool value) const {
    m_done = value;
  }

private:
  const OnDiskPt::PhraseNode &m_lastNode;
  mutable bool m_done;
};
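Worth flagging in the class above: Done(bool) is a const member function that writes through a mutable field, letting the on-disk rule lookup mark a node as exhausted while only const references to it are handed around. A stripped-down sketch of that pattern (illustrative names, not from this commit):

// The mutable-flag idiom in isolation: logically-const objects can
// still carry "already expanded" bookkeeping that queries may update.
class Node
{
public:
  Node() : m_done(false) {}
  bool Done() const { return m_done; }
  void Done(bool value) const { m_done = value; }  // const, yet writes m_done
private:
  mutable bool m_done;  // bookkeeping, not part of the node's logical value
};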


@ -36,9 +36,9 @@ public:
                 const ChartHypothesis&,
                 int /* featureID */,
                 ScoreComponentCollection*) const {
    CHECK(0);  // feature function not valid in chart decoder
    return NULL;
  }
};

/** Doesn't do anything but provide a key into the global


@ -22,176 +22,179 @@
#include <ctime>
#include <iostream>

namespace randlm
{

template<typename T>
class CacheNode
{
public:
  typedef std::map<wordID_t, CacheNode<T>* > childMap;
  // initialise value to 'unknown' (i.e. not yet queried or cached).
  CacheNode(T unknown_value) : value_(unknown_value) {}
  childMap childs_;     // child pointers
  T value_;             // value stored
  const void* state_;   // state pointer
};

template<typename T>
class Cache
{
public:
  typedef typename std::map<wordID_t, CacheNode<T>* >::iterator childPtr;
  // unknown_value is used to indicate the ngram was not queried (yet)
  // null_value_ indicates it was queried but not found in model
  // space usage is handled by client.
  Cache(T unknown_value, T null_value) :
    cur_nodes_(0), unknown_value_(unknown_value), null_value_(null_value) {
    root_ = newNode();
  }
  ~Cache() {
    if(clear()) {
      delete root_;
      root_ = NULL;
    } else {
      std::cerr << "Error freeing cache memory.\n";
    }
  }
  bool setCacheNgram(const wordID_t* ngram, int len, T value, const void* state) {
    // inserts full ngram into cache
    CacheNode<T>* node = root_;
    for (int i = len - 1; i > -1; --i) {
      childPtr child = node->childs_.find(ngram[i]);
      if( child != node->childs_.end() ) {
        // current node is already prefix. Go to child node
        node = node->childs_[ngram[i]];
      } else {
        // no child for prefix. set new child link in current node
        CacheNode<T> * newChild = newNode(node);
        node->childs_[ngram[i]] = newChild;
        // go to new node
        node = newChild;
      }
    }
    node->value_ = value;
    node->state_ = state;
    return true;
  }
  bool checkCacheNgram(const wordID_t* ngram, int len, T* value, const void** state) {
    // finds value for this full ngram only (returns false if full ngram not in cache)
    CacheNode<T> * node = root_;
    for(int i = len - 1; i > -1; --i) {
      // go to deepest level node of ngram in cache
      childPtr child = node->childs_.find(ngram[i]);
      if( child != node->childs_.end() ) {
        // switch to child node
        node = node->childs_[ngram[i]];
      } else {
        // not cached
        return false;
      }
    }
    *value = node->value_;
    if(state) *state = node->state_;
    return *value != null_value_ && *value != unknown_value_;
  }
  int getCache2(const wordID_t* ngram, int len, T** values, int* found) {
    // set values array to point to cache value nodes
    CacheNode<T> * node = root_;
    *found = 0;
    //values[0] = &node->value_; // pointer to root node's value
    bool all_found = true;
    for(int i = len - 1; i > -1; --i) {
      // go to deepest level node of ngram in cache
      childPtr child = node->childs_.find(ngram[i]);
      if( child != node->childs_.end() ) {
        // switch to child node
        node = node->childs_[ngram[i]];
        // get pointer to value (index by length - 1)
        values[i] = &node->value_;
        // if null_value then assume all extensions impossible
        if (node->value_ == null_value_) {
          return len - 1 - i;  // max length possible
        }
        all_found = all_found && (node->value_ != unknown_value_);
        if (all_found)
          ++(*found);
      } else {
        // initialise uncached values
        CacheNode<T> * newChild = newNode(node);
        node->childs_[ngram[i]] = newChild;
        // go to new node
        node = newChild;
        values[i] = &node->value_;
      }
    }
    return len;  // all possible
  }
  int getCache(const wordID_t* ngram, int len, T** values, int* found) {
    // get pointers to values for ngram and constituents.
    // returns upper bound on longest subngram in model.
    // 'found' stores longest non-null and known value found.
    CacheNode<T> * node = root_;
    *found = 0;
    values[0] = &node->value_;  // pointer to root node's value
    bool all_found = true;
    for(int i = len - 1; i > -1; --i) {
      // go to deepest level node of ngram in cache
      childPtr child = node->childs_.find(ngram[i]);
      if( child != node->childs_.end() ) {
        // switch to child node
        node = node->childs_[ngram[i]];
        // get pointer to value (index by length - 1)
        values[len - i] = &node->value_;
        // if null_value then assume all extensions impossible
        if (node->value_ == null_value_)
          return len - 1 - i;  // max length possible
        all_found = all_found && (node->value_ != unknown_value_);
        if (all_found)
          ++(*found);
      } else {
        // initialise uncached values
        CacheNode<T> * newChild = newNode(node);
        node->childs_[ngram[i]] = newChild;
        // go to new node
        node = newChild;
        values[len - i] = &node->value_;
      }
    }
    return len;  // all possible
  }
  bool clear() {
    std::cerr << "Clearing cache with " << static_cast<float>(cur_nodes_ * nodeSize())
              / static_cast<float>(1ull << 20) << "MB" << std::endl;
    return clearNodes(root_);
  }
  int nodes() {
    // returns number of nodes
    return cur_nodes_;
  }
  int nodeSize() {
    return sizeof(CacheNode<T>) + sizeof(root_->childs_);
  }
private:
  CacheNode<T> * root_;
  count_t cur_nodes_;
  T unknown_value_;  // Used to initialise data at each node
  T null_value_;     // Indicates cached something not in model
  CacheNode<T>* newNode(CacheNode<T> * node = 0) {
    ++cur_nodes_;
    return new CacheNode<T>(unknown_value_);
  }
  bool clearNodes(CacheNode<T> * node) {
    //delete children from this node
    if(!node->childs_.empty()) {
      iterate(node->childs_, itr) {
        if(!clearNodes(itr->second))
          std::cerr << "Error emptying cache\n";
        delete itr->second;
        --cur_nodes_;
      }
      node->childs_.clear();
    }
    return true;
  }
};

} //end namespace

#endif //INC_RANDLM_CACHE_H
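To see how the two sentinels interact, here is a sketch of a round trip through this cache (the -1/-2 sentinel choices and the ids are invented for illustration; wordID_t is the vocab-id type used throughout this code):

// Assumes the header above is included.
randlm::Cache<int> cache(-1, -2);    // -1 = not yet queried, -2 = queried but absent
wordID_t ngram[3] = { 17, 4, 256 };  // ids of some trigram

cache.setCacheNgram(ngram, 3, 42, NULL);  // cache a real value for the full trigram

int value = 0;
// true only when the full ngram holds a non-sentinel value; a node that
// exists but still holds -1 or -2 fills 'value' yet returns false
bool hit = cache.checkCacheNgram(ngram, 3, &value, NULL);  // hit == true, value == 42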


@ -20,295 +20,306 @@
#include <cmath>
#include "file.h"

namespace randlm
{

// Class Filter wraps a contiguous array of data. Filter and its subclasses
// implement read/write/increment functionality on arrays with arbitrary sized addresses
// (i.e. an address may not use a full number of bytes). When converting to byte-based
// representation we assume "unused" bits are to the left.
// E.g. if the underlying data is stored in units T = uint16 and the 'width' = 11,
// to read 'address' = 3 we extract the bits at indices [33,43] (i.e. [11*3, 11*4 - 1])
// and return them right-aligned in a uint16 with the leading 5 bits masked out.
template<typename T>
class Filter
{
public:
  Filter(uint64_t addresses, int width) : addresses_(addresses), width_(width), data_(NULL) {
    // number of bits in T
    cell_width_ = sizeof(T) << 3;
    // current implementation has following constraints
    CHECK(cell_width_ > 0 && cell_width_ <= 64 && cell_width_ >= width);
    // used for >> division
    log_cell_width_ = static_cast<int>(floor(log(cell_width_)/log(2) + 0.000001));
    // size of underlying data in Ts
    cells_ = ((addresses * width) + cell_width_ - 1) >> log_cell_width_;
    // instantiate underlying data
    data_ = new T[cells_];
    CHECK(data_ != NULL);
    CHECK(reset());
    // 'first_bit' marks the first bit used by 'address' (left padded with zeros).
    first_bit_ = (width % cell_width_ == 0) ? 0 : cell_width_ - (width % cell_width_);
    // mask for full cell
    full_mask_ = static_cast<T>(0xffffffffffffffffull);
    // mask for bits that make up the address
    address_mask_ = full_mask_ >> first_bit_;
  }
  Filter(FileHandler* fin, bool loaddata = true) : data_(NULL) {
    CHECK(loadHeader(fin));
    if (loaddata)
      CHECK(loadData(fin));
  }
  virtual ~Filter() {
    delete[] data_;
  }
  bool reset() {
    for (uint64_t i = 0; i < cells_; ++i)
      data_[i] = 0;
    return true;
  }
  count_t size() {
    // return approx size of filter in MBs
    return cells_ * sizeof(T) >> 20;
  }
  // read / write functions
  inline bool read(uint64_t address, T* value) {
    CHECK(address <= addresses_);
    // copy address to 'value'
    uint64_t data_bit = address * width_;
    uint32_t data_cell = (data_bit >> log_cell_width_);  // % cells_;
    // 'offset' shows how address in 'data' and 'value' align
    int offset = (data_bit % cell_width_) - first_bit_;
    // they align so just copy across masking unneeded leading bits
    if (offset == 0) {
      *value = data_[data_cell] & address_mask_;
      return true;
    }
    // data address starts to left so shift it right
    if (offset < 0) {
      *value = (data_[data_cell] >> -offset) & address_mask_;
      return true;
    }
    // data address is to right so shift it left and look at one more cell to right
    *value = ((data_[data_cell] << offset)
              | (data_[data_cell + 1] >> (cell_width_ - offset))) & address_mask_;
    return true;
  }
  inline T read(uint64_t address) {
    CHECK(address <= addresses_);
    // return value at address
    T value = 0;
    uint64_t data_bit = address * width_;
    uint32_t data_cell = (data_bit >> log_cell_width_);  // % cells_;
    // 'offset' shows how address in 'data' and 'value' align
    int offset = (data_bit % cell_width_) - first_bit_;
    // they align so just copy across masking unneeded leading bits
    if (offset == 0) {
      value = data_[data_cell] & address_mask_;
    }
    // data address starts to left so shift it right
    else if (offset < 0) {
      value = (data_[data_cell] >> -offset) & address_mask_;
    }
    // data address is to right so shift it left and look at one more cell to right
    else
      value = ((data_[data_cell] << offset)
               | (data_[data_cell + 1] >> (cell_width_ - offset))) & address_mask_;
    return value;
  }
  inline bool write(uint64_t address, T value) {
    CHECK(address <= addresses_);
    CHECK(log2(value) <= width_);
    // write 'value' to address
    uint64_t data_bit = address * width_;
    uint32_t data_cell = (data_bit >> log_cell_width_);  // % cells_;
    // 'offset' shows how address in 'data' and 'value' align
    int offset = (data_bit % cell_width_) - first_bit_;
    // they align so just copy across masking unneeded leading zeros of value
    if (offset == 0) {
      data_[data_cell] = value | (data_[data_cell] & ~address_mask_);
      return true;
    }
    // address in data is to left so shift value left by -offset
    if (offset < 0) {
      data_[data_cell] = (value << -offset)
                         | (data_[data_cell] & ~(address_mask_ << -offset));
      return true;
    }
    // address in data is to right so shift value right by offset
    data_[data_cell] = (value >> offset) |
                       (data_[data_cell] & ~(address_mask_ >> offset));
    data_[data_cell + 1] = (value << (cell_width_ - offset)) |
                           (data_[data_cell + 1] & (full_mask_ >> offset));
    return true;
  }
  inline bool readWithFingerprint(uint64_t address, T finger, T* value) {
    // copy 'address' ^ 'finger' to 'value'
    uint64_t data_bit = address * width_;
    uint32_t data_cell = (data_bit >> log_cell_width_);  // % cells_;
    // 'offset' shows how address in 'data' and 'value' align
    int offset = (data_bit % cell_width_) - first_bit_;
    // they align so just copy across masking unneeded leading bits
    if (offset == 0) {
      *value = (finger ^ data_[data_cell]) & address_mask_;
      return true;
    }
    // data address starts to left so shift it right
    if (offset < 0) {
      *value = ((data_[data_cell] >> -offset) ^ finger) & address_mask_;
      return true;
    }
    // data address is to right so shift it left and look at one more cell to right
    *value = (((data_[data_cell] << offset)
               | (data_[data_cell + 1] >> (cell_width_ - offset))) ^ finger)
             & address_mask_;
    return true;
  }
  inline bool writeWithFingerprint(uint64_t address, T finger, T value) {
    // write 'value' ^ 'finger' to address
    finger &= address_mask_;  // make sure fingerprint is correct size
    uint64_t data_bit = address * width_;
    uint32_t data_cell = (data_bit >> log_cell_width_);  // % cells_;
    // 'offset' shows how address in 'data' and 'value' align
    int offset = (data_bit % cell_width_) - first_bit_;
    // they align so just copy across masking unneeded leading zeros of value
    if (offset == 0) {
      data_[data_cell] = (finger ^ value) | (data_[data_cell] & ~address_mask_);
      return true;
    }
    // address in data is to left so shift value left by -offset
    if (offset < 0) {
      data_[data_cell] = ((finger ^ value) << -offset)
                         | (data_[data_cell] & ~(address_mask_ << -offset));
      return true;
    }
    // address in data is to right so shift value right by offset
    data_[data_cell] = ((finger ^ value) >> offset) |
                       (data_[data_cell] & ~(address_mask_ >> offset));
    data_[data_cell + 1] = ((finger ^ value) << (cell_width_ - offset)) |
                           (data_[data_cell + 1] & (full_mask_ >> offset));
    return true;
  }
  // debugging
  void printFilter(const std::string & prefix = "", uint32_t truncate = 64) {
    std::cout << prefix;
    for (uint32_t i = 0; i < cells_ && i < truncate; ++i) {
      for (int j = cell_width_ - 1; j >= 0; --j)
        if (data_[i] & (1ull << j))
          std::cout << 1;
        else
          std::cout << 0;
      std::cout << "\n";
    }
    std::cout << std::endl;
  }
  // i/o
  uint64_t getAddresses() {
    return addresses_;
  }
  int getWidth() {
    return width_;
  }
  int getCellWidth() {
    return cell_width_;
  }
  uint32_t getCells() {
    return cells_;
  }
  virtual bool save(FileHandler* out) {
    CHECK(out != NULL);
    CHECK(out->write((char*)&cells_, sizeof(cells_)));
    CHECK(out->write((char*)&cell_width_, sizeof(cell_width_)));
    CHECK(out->write((char*)&log_cell_width_, sizeof(log_cell_width_)));
    CHECK(out->write((char*)&addresses_, sizeof(addresses_)));
    CHECK(out->write((char*)&width_, sizeof(width_)));
    CHECK(out->write((char*)&first_bit_, sizeof(first_bit_)));
    CHECK(out->write((char*)&full_mask_, sizeof(full_mask_)));
    CHECK(out->write((char*)&address_mask_, sizeof(address_mask_)));
    //CHECK(out->write((char*)data_, cells_ * sizeof(T)));
    const uint64_t jump = 524288032ul;  //(uint64_t)pow(2, 29);
    if((width_ == 1) || cells_ < jump)
      CHECK(out->write((char*)data_, cells_ * sizeof(T)));
    else {
      uint64_t idx(0);
      while(idx + jump < cells_) {
        CHECK(out->write((char*)&data_[idx], jump * sizeof(T)));
        idx += jump;
      }
      CHECK(out->write((char*)&data_[idx], (cells_ - idx) * sizeof(T)));
    }
    return true;
  }
protected:
  bool loadHeader(FileHandler* fin) {
    CHECK(fin != NULL);
    CHECK(fin->read((char*)&cells_, sizeof(cells_)));
    CHECK(fin->read((char*)&cell_width_, sizeof(cell_width_)));
    CHECK(cell_width_ == sizeof(T) << 3);  // make sure correct underlying data type
    CHECK(fin->read((char*)&log_cell_width_, sizeof(log_cell_width_)));
    CHECK(fin->read((char*)&addresses_, sizeof(addresses_)));
    CHECK(fin->read((char*)&width_, sizeof(width_)));
    CHECK(fin->read((char*)&first_bit_, sizeof(first_bit_)));
    CHECK(fin->read((char*)&full_mask_, sizeof(full_mask_)));
    CHECK(fin->read((char*)&address_mask_, sizeof(address_mask_)));
    return true;
  }
  bool loadData(FileHandler* fin) {
    // instantiate underlying array
    data_ = new T[cells_];
    CHECK(data_ != NULL);
    CHECK(fin->read((char*)data_, cells_ * sizeof(T)));
    //CHECK(fin->read((char*)&data_[0], ceil(float(cells_) / 2.0) * sizeof(T)));
    //CHECK(fin->read((char*)&data_[cells_ / 2], (cells_ / 2) * sizeof(T)));
    return true;
  }
  uint64_t cells_;      // number of Ts making up 'data_'
  int cell_width_;      // bits per cell (i.e. sizeof(T) << 3)
  int log_cell_width_;  // log of bits used for >> division
  uint64_t addresses_;  // number of addresses in the filter
  int width_;           // width in bits of each address
  int first_bit_;       // position of first bit in initial byte
  T full_mask_;         // all 1s
  T address_mask_;      // 1s in those positions that are part of address
  T* data_;             // the raw data as bytes
};

// Extension with bit test/setter methods added
class BitFilter : public Filter<uint8_t>
{
public:
  BitFilter(uint64_t bits) : Filter<uint8_t>(bits, 1) {}
  BitFilter(FileHandler* fin, bool loaddata = true)
    : Filter<uint8_t>(fin, loaddata) {
    if (loaddata)
      CHECK(load(fin));
  }
  // TODO: overload operator[]
  virtual bool testBit(uint64_t location) {
    // test bit referenced by location
    return data_[(location % addresses_) >> 3] & 1 << ((location % addresses_) % 8);
  }
  virtual bool setBit(uint64_t location) {
    // set bit referenced by location
    data_[(location % addresses_) >> 3] |= 1 << ((location % addresses_) % 8);
    return true;
  }
  virtual bool clearBit(uint64_t location) {
    // clear bit referenced by location
    data_[(location % addresses_) >> 3] &= ~(1 << ((location % addresses_) % 8));
    return true;
  }
  bool save(FileHandler* fout) {
    CHECK(Filter<uint8_t>::save(fout));
    std::cerr << "Saved BitFilter. Rho = " << rho() << "." << std::endl;
    return true;
  }
  float rho(uint64_t limit = 0) {
    // fraction of zero bits in (a prefix of) the filter
    uint64_t ones = 0;
    uint64_t range = limit > 0 ? std::min(limit,cells_) : cells_;
    for (uint64_t i = 0; i < range; ++i)
      for (int j = 0; j < 8; ++j)
        if (data_[i] & (1 << j))
          ++ones;
    return static_cast<float>((range << 3) - ones)/static_cast<float>(range << 3);
  }
protected:
  bool load(FileHandler* fin) {
    std::cerr << "Loaded BitFilter. Rho = " << rho() << "." << std::endl;
    return true;
  }
};

/*
// ResizedBitFilter deals with resizing to save memory
// whereas other filters should expect locations to be within range
// this filter will need to resize (and possibly rehash) locations

@ -380,9 +391,9 @@ namespace randlm {
      carry = incrementSubCell(data_bit, this->width_, &this->data_[data_cell]);
    }
    // last update must not have carried
    if (!carry)
      return true;
    // wrapped round so check whether need to reset to max count
    if (!wrap_around_)
      CHECK(this->write(address, this->address_mask_));
    return false;  // false to indicate that overflowed
@ -397,7 +408,7 @@ namespace randlm {
  }
  inline bool incrementSubCell(int bit, int len, T* cell) {
    // increment counter consisting of bits [startbit, startbit + len - 1]; rest stays unchanged
    *cell = ((((*cell >> (this->cell_width_ - bit - len)) + 1)
              & (this->full_mask_ >> (this->cell_width_ - len))) << (this->cell_width_ - bit - len))
            | (*cell & ~(((this->full_mask_ >> (this->cell_width_ - len)) << (this->cell_width_ - bit - len))));
    // indicate overflow as true

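The worked example in the header comment is easy to verify; the following freestanding sketch just replays the constructor's arithmetic for T = uint16_t and width = 11 (an illustration, not part of the commit):

#include <cassert>
#include <cstdint>

int main()
{
  const int cell_width = 16;   // bits in T = uint16_t
  const int width = 11;        // bits per stored value
  const uint64_t address = 3;

  // value 'address' occupies bit indices [width*address, width*(address+1) - 1]
  assert(width * address == 33);
  assert(width * (address + 1) - 1 == 43);

  // values come back right-aligned with the top first_bit bits masked off
  int first_bit = cell_width - (width % cell_width);  // 16 - 11 = 5
  uint16_t address_mask = 0xFFFFu >> first_bit;       // 0x07FF: eleven 1s
  assert(first_bit == 5 && address_mask == 0x07FF);
  return 0;
}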

@ -10,58 +10,66 @@ using namespace Moses;
typedef uint64_t P;  // largest input range is 2^64

template <typename T>
class HashBase
{
protected:
  T m_;        // range of hash output
  count_t H_;  // number of hash functions to instantiate
  virtual void initSeeds()=0;
  virtual void freeSeeds()=0;
public:
  HashBase(float m, count_t H=1):m_((T)m), H_(H) {
    //cerr << "range = (0..." << m_ << "]" << endl;
  }
  HashBase(FileHandler* fin) {
    load(fin);
  }
  virtual ~HashBase() {}
  virtual T hash(const char*s, count_t h)=0;  // string hashing
  virtual T hash(const wordID_t* id, const int len, count_t h)=0;  // vocab mapped hashing
  count_t size() {
    return H_;
  }
  virtual void save(FileHandler* fout) {
    CHECK(fout != 0);
    fout->write((char*)&m_, sizeof(m_));
    fout->write((char*)&H_, sizeof(H_));
  }
  virtual void load(FileHandler* fin) {
    CHECK(fin != 0);
    fin->read((char*)&m_, sizeof(m_));
    fin->read((char*)&H_, sizeof(H_));
  }
};

template <typename T>
class UnivHash_linear: public HashBase<T>
{
public:
  UnivHash_linear(float m, count_t H, P pr):
    HashBase<T>(m, H), pr_(pr) {
    //CHECK(isPrime(pr_));
    initSeeds();
  }
  UnivHash_linear(FileHandler* fin):
    HashBase<T>(fin) {
    load(fin);
  }
  ~UnivHash_linear() {
    freeSeeds();
  }
  T hash(const char* s, count_t h) {
    return 0;  // not implemented
  }
  T hash(const wordID_t* id, const int len, count_t h);
  T hash(const wordID_t id, const count_t pos,
         const T prevValue, count_t h);
  void save(FileHandler* fout);
  void load(FileHandler* fin);
private:
  T** a_, **b_;
  P pr_;
  void initSeeds();
  void freeSeeds();
};

/* UnivHash_noPrimes:
@ -71,74 +79,89 @@ class UnivHash_linear: public HashBase<T> {
 * # of hash functions = 2^(l-1)
 */
template <typename T>
class UnivHash_noPrimes: public HashBase<T>
{
public:
  UnivHash_noPrimes(float k, float l):
    HashBase<T>(k, 100), d_(count_t((l-k))) {
    if(((int)l >> 3) == sizeof(P)) p_ = (P) pow(2,l) - 1;
    else p_ = (P) pow(2,l);
    initSeeds();
  }
  UnivHash_noPrimes(FileHandler* fin):
    HashBase<T>(fin) {
    load(fin);
  }
  ~UnivHash_noPrimes() {
    freeSeeds();
  }
  T hash(const char* s, count_t h);
  T hash(const wordID_t* id, const int len, count_t h);
  T hash(const P x, count_t h);
  void save(FileHandler* fout);
  void load(FileHandler* fin);
private:
  count_t d_;  // l-k
  P p_, *a_;   // real-valued input range, storage
  void initSeeds();
  void freeSeeds() {
    delete[] a_;
  }
};

template <typename T>
class Hash_shiftAddXOR: public HashBase<T>
{
public:
  Hash_shiftAddXOR(float m, count_t H=5): HashBase<T>(m,H),
    l_(5), r_(2) {
    initSeeds();
  }
  ~Hash_shiftAddXOR() {
    freeSeeds();
  }
  T hash(const char* s, count_t h);
  T hash(const wordID_t* id, const int len, count_t h) {
    return 0;  // not implemented for this hash
  }
private:
  T* v_;  // random seed storage
  const unsigned short l_, r_;  // left-shift bits, right-shift bits
  void initSeeds();
  void freeSeeds() {
    delete[] v_;
  }
};

template <typename T>
class UnivHash_tableXOR: public HashBase<T>
{
public:
  UnivHash_tableXOR(float m, count_t H=5): HashBase<T>(m, H),
    table_(NULL), tblLen_(255*MAX_STR_LEN) {
    initSeeds();
  }
  ~UnivHash_tableXOR() {
    freeSeeds();
  }
  T hash(const char* s, count_t h);
  T hash(const wordID_t* id, const int len, count_t h) {
    return 0;  // not implemented for this hash
  }
private:
  T** table_;       // storage for random numbers
  count_t tblLen_;  // length of table
  void initSeeds();
  void freeSeeds();
};

// ShiftAddXor
template <typename T>
void Hash_shiftAddXOR<T>::initSeeds()
{
  v_ = new T[this->H_];
  for(count_t i=0; i < this->H_; i++)
    v_[i] = Utils::rand<T>() + 1;
}
template <typename T>
T Hash_shiftAddXOR<T>::hash(const char* s, count_t h=0)
{
  T value = v_[h];
  int pos(0);
  unsigned char c;
@ -150,40 +173,44 @@ T Hash_shiftAddXOR<T>::hash(const char* s, count_t h=0) {
// UnivHash_tableXOR
template <typename T>
void UnivHash_tableXOR<T>::initSeeds()
{
  // delete any values in table
  if(table_) freeSeeds();
  // instance of new table
  table_ = new T* [this->H_];
  // fill with random values
  for(count_t j=0; j < this->H_; j++) {
    table_[j] = new T[tblLen_];
    for(count_t i=0; i < tblLen_; i++) {
      table_[j][i] = Utils::rand<T>(this->m_-1);
    }
  }
}
template <typename T>
void UnivHash_tableXOR<T>::freeSeeds()
{
  for(count_t j = 0; j < this->H_; j++)
    delete[] table_[j];
  delete[] table_;
  table_ = NULL;
}
template <typename T>
T UnivHash_tableXOR<T>::hash(const char* s, count_t h = 0)
{
  T value = 0;
  count_t pos = 0, idx = 0;
  unsigned char c;
  while((c = *s++) && (++pos < MAX_STR_LEN))
    value ^= table_[h][idx += c];
  CHECK(value < this->m_);
  return value;
}
// UnivHash_noPrimes
template <typename T>
void UnivHash_noPrimes<T>::initSeeds()
{
  a_ = new P[this->H_];
  for(T i=0; i < this->H_; i++) {
    a_[i] = Utils::rand<P>();
@ -191,14 +218,16 @@ void UnivHash_noPrimes<T>::initSeeds() {
  }
}
template <typename T>
T UnivHash_noPrimes<T>::hash(const P x, count_t h=0)
{
  // h_a(x) = (ax mod 2^l) div 2^(l-k)
  T value = ((a_[h] * x) % p_) >> d_;
  return value % this->m_;
}
template <typename T>
T UnivHash_noPrimes<T>::hash(const wordID_t* id, const int len,
                             count_t h=0)
{
  T value = 0;
  int pos(0);
  while(pos < len) {
@ -208,39 +237,42 @@ T UnivHash_noPrimes<T>::hash(const wordID_t* id, const int len,
  return value % this->m_;
}
template <typename T>
T UnivHash_noPrimes<T>::hash(const char* s, count_t h=0)
{
  T value = 0;
  int pos(0);
  unsigned char c;
  while((c = *s++) && (++pos < MAX_STR_LEN)) {
    value ^= hash((P)c, h);
  }
  return value % this->m_;
}
template <typename T>
void UnivHash_noPrimes<T>::save(FileHandler* fout)
{
  HashBase<T>::save(fout);
  fout->write((char*)&p_, sizeof(p_));
  fout->write((char*)&d_, sizeof(d_));
  for(T i=0; i < this->H_; i++) {
    fout->write((char*)&a_[i], sizeof(a_[i]));
  }
}
template <typename T>
void UnivHash_noPrimes<T>::load(FileHandler* fin)
{
  a_ = new P[this->H_];
  // HashBase<T>::load(fin) already done in constructor
  fin->read((char*)&p_, sizeof(p_));
  fin->read((char*)&d_, sizeof(d_));
  for(T i=0; i < this->H_; i++) {
    fin->read((char*)&a_[i], sizeof(a_[i]));
  }
}
//UnivHash_linear
template <typename T>
void UnivHash_linear<T>::initSeeds()
{
  a_ = new T*[this->H_];
  b_ = new T*[this->H_];
  for(count_t i=0; i < this->H_; i++) {
@ -253,7 +285,8 @@ void UnivHash_linear<T>::initSeeds() {
  }
}
template <typename T>
void UnivHash_linear<T>::freeSeeds()
{
  for(count_t i=0; i < this->H_; i++) {
    delete[] a_[i];
    delete[] b_[i];
@ -263,8 +296,9 @@ void UnivHash_linear<T>::freeSeeds() {
  a_ = b_ = NULL;
}
template <typename T>
inline T UnivHash_linear<T>::hash(const wordID_t* id, const int len,
                                  count_t h=0)
{
  CHECK(h < this->H_);
  T value = 0;
  int pos(0);
@ -276,19 +310,21 @@ inline T UnivHash_linear<T>::hash(const wordID_t* id, const int len,
}
template <typename T>
inline T UnivHash_linear<T>::hash(const wordID_t id, const count_t pos,
                                  const T prevValue, count_t h=0)
{
  CHECK(h < this->H_);
  T value = prevValue + ((a_[h][pos] * id) + b_[h][pos]);  // % pr_;
  return value % this->m_;
}
template <typename T>
void UnivHash_linear<T>::save(FileHandler* fout)
{
  // int bytes = sizeof(a_[0][0]);
  HashBase<T>::save(fout);
  fout->write((char*)&pr_, sizeof(pr_));
  for(count_t i=0; i < this->H_; i++) {
    for(count_t j=0; j < MAX_NGRAM_ORDER; j++) {
      fout->write((char*)&a_[i][j], sizeof(a_[i][j]));
      fout->write((char*)&b_[i][j], sizeof(b_[i][j]));
      //cout << "a[" << i << "][" << j << "]=" << a_[i][j] << endl;
      //cout << "b[" << i << "][" << j << "]=" << b_[i][j] << endl;
@ -296,7 +332,8 @@ void UnivHash_linear<T>::save(FileHandler* fout) {
  }
}
template <typename T>
void UnivHash_linear<T>::load(FileHandler* fin)
{
  // HashBase<T>::load(fin) already done in constructor
  fin->read((char*)&pr_, sizeof(pr_));
  a_ = new T*[this->H_];
@ -305,8 +342,8 @@ void UnivHash_linear<T>::load(FileHandler* fin) {
    a_[i] = new T[MAX_NGRAM_ORDER];
    b_[i] = new T[MAX_NGRAM_ORDER];
    for(count_t j=0; j < MAX_NGRAM_ORDER; j++) {
      fin->read((char*)&a_[i][j], sizeof(a_[i][j]));
      fin->read((char*)&b_[i][j], sizeof(b_[i][j]));
      //cout << "a[" << i << "][" << j << "]=" << a_[i][j] << endl;
      //cout << "b[" << i << "][" << j << "]=" << b_[i][j] << endl;
    }
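The loop body of Hash_shiftAddXOR<T>::hash is cut off by the hunk above. For orientation, here is a freestanding sketch of the standard shift-add-XOR recurrence that the class name and its l_ = 5, r_ = 2 members point to; seed and range stand in for v_[h] and m_, so treat it as an illustration rather than the commit's exact loop:

#include <cstdint>
#include <cstdio>

// shift-add-XOR string hash: for each character c,
// value ^= (value << l) + (value >> r) + c, then reduce into [0, range)
uint64_t sax_hash(const char* s, uint64_t seed, uint64_t range)
{
  uint64_t value = seed;
  unsigned char c;
  while ((c = (unsigned char)*s++) != 0)
    value ^= (value << 5) + (value >> 2) + c;
  return value % range;
}

int main()
{
  std::printf("%llu\n", (unsigned long long)sax_hash("moses", 1u, 1u << 20));
  return 0;
}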


@ -16,27 +16,28 @@ using randlm::Cache;
const bool strict_checks_ = false;

template<typename T>
class OnlineRLM: public PerfectHash<T>
{
public:
  OnlineRLM(uint16_t MBs, int width, int bucketRange, count_t order,
            Vocab* v, float qBase = 8): PerfectHash<T>(MBs, width, bucketRange, qBase),
    vocab_(v), bAdapting_(false), order_(order), corpusSize_(0), alpha_(0) {
    CHECK(vocab_ != 0);
    //instantiate quantizer class here
    cache_ = new Cache<float>(8888.8888, 9999.9999);  // unknown_value, null_value
    alpha_ = new float[order_ + 1];
    for(count_t i = 0; i <= order_; ++i)
      alpha_[i] = i * log10(0.4);
    cerr << "Initializing auxiliary bit filters...\n";
    bPrefix_ = new BitFilter(this->cells_);
    bHit_ = new BitFilter(this->cells_);
  }
  OnlineRLM(FileHandler* fin, count_t order):
    PerfectHash<T>(fin), bAdapting_(true), order_(order), corpusSize_(0) {
    load(fin);
    cache_ = new Cache<float>(8888.8888, 9999.9999);  // unknown_value, null_value
    alpha_ = new float[order_ + 1];
    for(count_t i = 0; i <= order_; ++i)
      alpha_[i] = i * log10(0.4);
  }
  ~OnlineRLM() {
@ -52,14 +53,18 @@ public:
  bool insert(const std::vector<string>& ngram, const int value);
  bool update(const std::vector<string>& ngram, const int value);
  int query(const wordID_t* IDs, const int len);
  int sbsqQuery(const std::vector<string>& ngram, int* len,
                bool bStrict = false);
  int sbsqQuery(const wordID_t* IDs, const int len, int* codes,
                bool bStrict = false);
  void remove(const std::vector<string>& ngram);
  count_t heurDelete(count_t num2del, count_t order = 5);
  uint64_t corpusSize() {
    return corpusSize_;
  }
  void corpusSize(uint64_t c) {
    corpusSize_ = c;
  }
  void clearCache() {
    if(cache_) cache_->clear();
  }
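The alpha_ table built in the constructors above encodes a stupid-backoff-style penalty: alpha_[i] = i * log10(0.4), i.e. every level of backoff scales the probability by 0.4, applied in log10 space. A small worked check (the 0.4 constant is from the constructor; the rest is illustrative):

#include <cmath>
#include <cstdio>

int main()
{
  const int order = 5;
  float alpha[order + 1];
  for (int i = 0; i <= order; ++i)
    alpha[i] = i * std::log10(0.4f);  // 0, -0.398, -0.796, ...

  // backing off two levels costs alpha[2] ~ -0.796 in log10 space,
  // i.e. a factor of 0.4 * 0.4 = 0.16 on the probability itself
  std::printf("alpha[2] = %f -> x%.2f\n", alpha[2], std::pow(10.0f, alpha[2]));
  return 0;
}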
@ -77,7 +82,7 @@ protected:
  void markQueried(hpdEntry_t& value);
  bool markPrefix(const wordID_t* IDs, const int len, bool bSet);
private:
  const void* getContext(const wordID_t* ngram, int len);
  const bool bAdapting_;  // used to signal adaptation of model
  const count_t order_;   // LM order
  uint64_t corpusSize_;   // total training corpus size
@ -87,46 +92,48 @@ private:
  BitFilter* bHit_;
};

template<typename T>
bool OnlineRLM<T>::insert(const std::vector<string>& ngram, const int value)
{
  int len = ngram.size();
  wordID_t wrdIDs[len];
  uint64_t index(this->cells_ + 1);
  for(int i = 0; i < len; ++i)
    wrdIDs[i] = vocab_->GetWordID(ngram[i]);
  index = PerfectHash<T>::insert(wrdIDs, len, value);
  if(value > 1 && len < order_)
    markPrefix(wrdIDs, ngram.size(), true);  // mark context
  // keep track of total items from training data minus "<s>"
  if(ngram.size() == 1 && (!bAdapting_))  // hack to not change corpusSize when adapting
    corpusSize_ += (wrdIDs[0] != vocab_->GetBOSWordID()) ? value : 0;
  if(bAdapting_ && (index < this->cells_))  // mark to keep while adapting
    markQueried(index);
  return true;
}

template<typename T>
bool OnlineRLM<T>::update(const std::vector<string>& ngram, const int value)
{
  int len = ngram.size();
  wordID_t wrdIDs[len];
  uint64_t index(this->cells_ + 1);
  hpdEntry_t hpdItr;
  vocab_->MakeOpen();
  for(int i = 0; i < len; ++i)
    wrdIDs[i] = vocab_->GetWordID(ngram[i]);
  // if updating, minimize false positives by pre-checking if context already in model
  bool bIncluded(true);
  if(value > 1 && len < (int)order_)
    bIncluded = markPrefix(wrdIDs, ngram.size(), true);  // mark context
  if(bIncluded) {  // if context found
    bIncluded = PerfectHash<T>::update2(wrdIDs, len, value, hpdItr, index);
    if(index < this->cells_) {
      markQueried(index);
    } else if(hpdItr != this->dict_.end()) markQueried(hpdItr);
  }
  return bIncluded;
}

template<typename T>
int OnlineRLM<T>::query(const wordID_t* IDs, int len)
{
  uint64_t filterIdx = 0;
  hpdEntry_t hpdItr;
  int value(0);
@ -135,8 +142,7 @@ int OnlineRLM<T>::query(const wordID_t* IDs, int len) {
if(hpdItr != this->dict_.end()) { if(hpdItr != this->dict_.end()) {
//markQueried(hpdItr); // mark this event as "hit" //markQueried(hpdItr); // mark this event as "hit"
value -= ((value & this->hitMask_) != 0) ? this->hitMask_ : 0; // check for previous hit marks value -= ((value & this->hitMask_) != 0) ? this->hitMask_ : 0; // check for previous hit marks
} } else {
else {
CHECK(filterIdx < this->cells_); CHECK(filterIdx < this->cells_);
//markQueried(filterIdx); //markQueried(filterIdx);
} }
@ -144,15 +150,16 @@ int OnlineRLM<T>::query(const wordID_t* IDs, int len) {
return value > 0 ? value : 0; return value > 0 ? value : 0;
} }
template<typename T> template<typename T>
bool OnlineRLM<T>::markPrefix(const wordID_t* IDs, const int len, bool bSet) { bool OnlineRLM<T>::markPrefix(const wordID_t* IDs, const int len, bool bSet)
if(len <= 1) return true; // only do this for ngrams with context {
static Cache<int> pfCache(-1, -1); // local prefix cache if(len <= 1) return true; // only do this for ngrams with context
static Cache<int> pfCache(-1, -1); // local prefix cache
int code(0); int code(0);
if(!pfCache.checkCacheNgram(IDs, len - 1, &code, NULL)) { if(!pfCache.checkCacheNgram(IDs, len - 1, &code, NULL)) {
hpdEntry_t hpdItr; hpdEntry_t hpdItr;
uint64_t filterIndex(0); uint64_t filterIndex(0);
code = PerfectHash<T>::query(IDs, len - 1, hpdItr, filterIndex); // hash IDs[0..len-1] code = PerfectHash<T>::query(IDs, len - 1, hpdItr, filterIndex); // hash IDs[0..len-1]
if(code == -1) { // encountered false positive in pipeline if(code == -1) { // encountered false positive in pipeline
cerr << "WARNING: markPrefix(). The O-RLM is *not* well-formed.\n"; cerr << "WARNING: markPrefix(). The O-RLM is *not* well-formed.\n";
// add all prefixes or return false; // add all prefixes or return false;
return false; return false;
@ -161,10 +168,9 @@ bool OnlineRLM<T>::markPrefix(const wordID_t* IDs, const int len, bool bSet) {
CHECK(hpdItr == this->dict_.end()); CHECK(hpdItr == this->dict_.end());
if(bSet) bPrefix_->setBit(filterIndex); // mark index if(bSet) bPrefix_->setBit(filterIndex); // mark index
else bPrefix_->clearBit(filterIndex); // unset index else bPrefix_->clearBit(filterIndex); // unset index
} } else {
else {
CHECK(filterIndex == this->cells_ + 1); CHECK(filterIndex == this->cells_ + 1);
//how to handle hpd prefixes? //how to handle hpd prefixes?
} }
if(pfCache.nodes() > 10000) pfCache.clear(); if(pfCache.nodes() > 10000) pfCache.clear();
pfCache.setCacheNgram(IDs, len - 1, code, NULL); pfCache.setCacheNgram(IDs, len - 1, code, NULL);
@ -172,36 +178,40 @@ bool OnlineRLM<T>::markPrefix(const wordID_t* IDs, const int len, bool bSet) {
return true; return true;
} }
template<typename T> template<typename T>
void OnlineRLM<T>::markQueried(const uint64_t& index) { void OnlineRLM<T>::markQueried(const uint64_t& index)
{
bHit_->setBit(index); bHit_->setBit(index);
//cerr << "filter[" << index << "] = " << this->filter_->read(index) << endl; //cerr << "filter[" << index << "] = " << this->filter_->read(index) << endl;
} }
template<typename T> template<typename T>
void OnlineRLM<T>::markQueried(hpdEntry_t& value) { void OnlineRLM<T>::markQueried(hpdEntry_t& value)
// set high bit of counter to indicate "hit" status {
// set high bit of counter to indicate "hit" status
value->second |= this->hitMask_; value->second |= this->hitMask_;
} }
template<typename T> template<typename T>
void OnlineRLM<T>::remove(const std::vector<string>& ngram) { void OnlineRLM<T>::remove(const std::vector<string>& ngram)
{
wordID_t IDs[ngram.size()]; wordID_t IDs[ngram.size()];
for(count_t i = 0; i < ngram.size(); ++i) for(count_t i = 0; i < ngram.size(); ++i)
IDs[i] = vocab_->GetWordID(ngram[i]); IDs[i] = vocab_->GetWordID(ngram[i]);
PerfectHash<T>::remove(IDs, ngram.size()); PerfectHash<T>::remove(IDs, ngram.size());
} }
template<typename T> template<typename T>
count_t OnlineRLM<T>::heurDelete(count_t num2del, count_t order) { count_t OnlineRLM<T>::heurDelete(count_t num2del, count_t order)
{
count_t deleted = 0; count_t deleted = 0;
cout << "Deleting " << num2del << " of order "<< order << endl; cout << "Deleting " << num2del << " of order "<< order << endl;
// delete from filter first // delete from filter first
int full = *std::max_element(this->idxTracker_, this->idxTracker_ int full = *std::max_element(this->idxTracker_, this->idxTracker_
+ this->totBuckets_); + this->totBuckets_);
for(; full > 0; --full) // delete from fullest buckets first for(; full > 0; --full) // delete from fullest buckets first
for(int bk = 0; bk < this->totBuckets_; ++bk) { for(int bk = 0; bk < this->totBuckets_; ++bk) {
if(deleted >= num2del) break; if(deleted >= num2del) break;
if(this->idxTracker_[bk] == full) { // if full if(this->idxTracker_[bk] == full) { // if full
uint64_t first = bk * this->bucketRange_, uint64_t first = bk * this->bucketRange_,
last = first + this->bucketRange_; last = first + this->bucketRange_;
for(uint64_t row = first; row < last; ++row) { // check each row for(uint64_t row = first; row < last; ++row) { // check each row
if(!(bHit_->testBit(row) || bPrefix_->testBit(row) )) { if(!(bHit_->testBit(row) || bPrefix_->testBit(row) )) {
if(this->filter_->read(row) != 0) { if(this->filter_->read(row) != 0) {
PerfectHash<T>::remove(row); // remove from filter PerfectHash<T>::remove(row); // remove from filter
@ -220,15 +230,17 @@ count_t OnlineRLM<T>::heurDelete(count_t num2del, count_t order) {
} }
template<typename T> template<typename T>
int OnlineRLM<T>::sbsqQuery(const std::vector<string>& ngram, int* codes, int OnlineRLM<T>::sbsqQuery(const std::vector<string>& ngram, int* codes,
bool bStrict) { bool bStrict)
{
wordID_t IDs[ngram.size()]; wordID_t IDs[ngram.size()];
for(count_t i = 0; i < ngram.size(); ++i) for(count_t i = 0; i < ngram.size(); ++i)
IDs[i] = vocab_->GetWordID(ngram[i]); IDs[i] = vocab_->GetWordID(ngram[i]);
return sbsqQuery(IDs, ngram.size(), codes, bStrict); return sbsqQuery(IDs, ngram.size(), codes, bStrict);
} }
template<typename T> template<typename T>
int OnlineRLM<T>::sbsqQuery(const wordID_t* IDs, const int len, int* codes, int OnlineRLM<T>::sbsqQuery(const wordID_t* IDs, const int len, int* codes,
bool bStrict) { bool bStrict)
{
uint64_t filterIdx = 0; uint64_t filterIdx = 0;
int val(0), fnd(0); int val(0), fnd(0);
hpdEntry_t hpdItr; hpdEntry_t hpdItr;
@ -240,14 +252,13 @@ int OnlineRLM<T>::sbsqQuery(const wordID_t* IDs, const int len, int* codes,
if(hpdItr != this->dict_.end()) { if(hpdItr != this->dict_.end()) {
val -= ((val & this->hitMask_) != 0) ? this->hitMask_ : 0; // account for previous hit marks val -= ((val & this->hitMask_) != 0) ? this->hitMask_ : 0; // account for previous hit marks
} }
} } else if(bStrict) {
else if(bStrict) { break;
break;
} }
// add to value array // add to value array
codes[i] = val > 0 ? val : 0; codes[i] = val > 0 ? val : 0;
} }
while(bStrict && (fnd > 1)) { // do checks the other way while(bStrict && (fnd > 1)) { // do checks the other way
val = PerfectHash<T>::query(&IDs[len - fnd], fnd - 1, hpdItr, filterIdx); val = PerfectHash<T>::query(&IDs[len - fnd], fnd - 1, hpdItr, filterIdx);
if(val != -1) break; // if anything found if(val != -1) break; // if anything found
else --fnd; // else decrement found else --fnd; // else decrement found
@ -255,8 +266,9 @@ int OnlineRLM<T>::sbsqQuery(const wordID_t* IDs, const int len, int* codes,
return fnd; return fnd;
} }
template<typename T> template<typename T>
float OnlineRLM<T>::getProb(const wordID_t* ngram, int len, float OnlineRLM<T>::getProb(const wordID_t* ngram, int len,
const void** state) { const void** state)
{
static const float oovprob = log10(1.0 / (static_cast<float>(vocab_->Size()) - 1)); static const float oovprob = log10(1.0 / (static_cast<float>(vocab_->Size()) - 1));
float logprob(0); float logprob(0);
const void* context = (state) ? *state : 0; const void* context = (state) ? *state : 0;
@ -264,61 +276,61 @@ float OnlineRLM<T>::getProb(const wordID_t* ngram, int len,
if(!cache_->checkCacheNgram(ngram, len, &logprob, &context)) { if(!cache_->checkCacheNgram(ngram, len, &logprob, &context)) {
// get full prob and put in cache // get full prob and put in cache
int num_fnd(0), den_val(0); int num_fnd(0), den_val(0);
int in[len]; // in[] keeps counts of increasing order numerator int in[len]; // in[] keeps counts of increasing order numerator
for(int i = 0; i < len; ++i) in[i] = 0; for(int i = 0; i < len; ++i) in[i] = 0;
for(int i = len - 1; i >= 0; --i) { for(int i = len - 1; i >= 0; --i) {
if(ngram[i] == vocab_->GetkOOVWordID()) break; // no need to query if OOV if(ngram[i] == vocab_->GetkOOVWordID()) break; // no need to query if OOV
in[i] = query(&ngram[i], len - i); in[i] = query(&ngram[i], len - i);
if(in[i] > 0) { if(in[i] > 0) {
num_fnd = len - i; num_fnd = len - i;
} } else if(strict_checks_) break;
else if(strict_checks_) break;
} }
while(num_fnd > 1) { // get lower order count while(num_fnd > 1) { // get lower order count
//get sub-context of size one less than length found (excluding target) //get sub-context of size one less than length found (excluding target)
if(((den_val = query(&ngram[len - num_fnd], num_fnd - 1)) > 0) && if(((den_val = query(&ngram[len - num_fnd], num_fnd - 1)) > 0) &&
(den_val >= in[len - num_fnd]) && (in[len - num_fnd] > 0)) { (den_val >= in[len - num_fnd]) && (in[len - num_fnd] > 0)) {
break; break;
} } else --num_fnd; // else backoff to lower ngram order
else --num_fnd; // else backoff to lower ngram order
} }
if(num_fnd == 1 && (in[len - 1] < 1)) // sanity check for unigrams if(num_fnd == 1 && (in[len - 1] < 1)) // sanity check for unigrams
num_fnd = 0; num_fnd = 0;
switch(num_fnd) { // find prob (need to refactor into precomputation) switch(num_fnd) { // find prob (need to refactor into precomputation)
case 0: // OOV case 0: // OOV
logprob = alpha_[len] + oovprob; logprob = alpha_[len] + oovprob;
break; break;
case 1: // unigram found only case 1: // unigram found only
CHECK(in[len - 1] > 0); CHECK(in[len - 1] > 0);
logprob = alpha_[len - 1] + (corpusSize_ > 0 ? logprob = alpha_[len - 1] + (corpusSize_ > 0 ?
log10(static_cast<float>(in[len - 1]) / static_cast<float>(corpusSize_)) : 0); log10(static_cast<float>(in[len - 1]) / static_cast<float>(corpusSize_)) : 0);
//logprob = alpha_[len - 1] + //logprob = alpha_[len - 1] +
//log10(static_cast<float>(in[len - 1]) / static_cast<float>(corpusSize_)); //log10(static_cast<float>(in[len - 1]) / static_cast<float>(corpusSize_));
break; break;
default: default:
CHECK(den_val > 0); CHECK(den_val > 0);
//if(subgram == in[len - found]) ++subgram; // avoid returning zero probs???? //if(subgram == in[len - found]) ++subgram; // avoid returning zero probs????
logprob = alpha_[len - num_fnd] + logprob = alpha_[len - num_fnd] +
log10(static_cast<float>(in[len - num_fnd]) / static_cast<float>(den_val)); log10(static_cast<float>(in[len - num_fnd]) / static_cast<float>(den_val));
break; break;
} }
// need unique context // need unique context
context = getContext(&ngram[len - num_fnd], num_fnd); context = getContext(&ngram[len - num_fnd], num_fnd);
// put whatever was found in cache // put whatever was found in cache
cache_->setCacheNgram(ngram, len, logprob, context); cache_->setCacheNgram(ngram, len, logprob, context);
} // end checkCache } // end checkCache
return logprob; return logprob;
} }
template<typename T> template<typename T>
const void* OnlineRLM<T>::getContext(const wordID_t* ngram, int len) { const void* OnlineRLM<T>::getContext(const wordID_t* ngram, int len)
{
int dummy(0); int dummy(0);
float* addresses[len]; // only interested in addresses of cache float* addresses[len]; // only interested in addresses of cache
CHECK(cache_->getCache2(ngram, len, &addresses[0], &dummy) == len); CHECK(cache_->getCache2(ngram, len, &addresses[0], &dummy) == len);
// return address of cache node // return address of cache node
return (const void*)addresses[0]; return (const void*)addresses[0];
} }
template<typename T> template<typename T>
void OnlineRLM<T>::randDelete(int num2del) { void OnlineRLM<T>::randDelete(int num2del)
{
int deleted = 0; int deleted = 0;
for(uint64_t i = 0; i < this->cells_; i++) { for(uint64_t i = 0; i < this->cells_; i++) {
if(this->filter_->read(i) != 0) { if(this->filter_->read(i) != 0) {
@ -329,18 +341,20 @@ void OnlineRLM<T>::randDelete(int num2del) {
} }
} }
template<typename T> template<typename T>
int OnlineRLM<T>::countHits() { int OnlineRLM<T>::countHits()
{
int hit(0); int hit(0);
for(uint64_t i = 0; i < this->cells_; ++i) for(uint64_t i = 0; i < this->cells_; ++i)
if(bHit_->testBit(i)) ++hit; if(bHit_->testBit(i)) ++hit;
iterate(this->dict_, itr) iterate(this->dict_, itr)
if((itr->second & this->hitMask_) != 0) if((itr->second & this->hitMask_) != 0)
++hit; ++hit;
cerr << "Hit count = " << hit << endl; cerr << "Hit count = " << hit << endl;
return hit; return hit;
} }
template<typename T> template<typename T>
int OnlineRLM<T>::countPrefixes() { int OnlineRLM<T>::countPrefixes()
{
int pfx(0); int pfx(0);
for(uint64_t i = 0; i < this->cells_; ++i) for(uint64_t i = 0; i < this->cells_; ++i)
if(bPrefix_->testBit(i)) ++pfx; if(bPrefix_->testBit(i)) ++pfx;
@ -349,22 +363,24 @@ int OnlineRLM<T>::countPrefixes() {
return pfx; return pfx;
} }
template<typename T> template<typename T>
int OnlineRLM<T>::cleanUpHPD() { int OnlineRLM<T>::cleanUpHPD()
{
cerr << "HPD size before = " << this->dict_.size() << endl; cerr << "HPD size before = " << this->dict_.size() << endl;
std::vector<string> vDel, vtmp; std::vector<string> vDel, vtmp;
iterate(this->dict_, itr) { iterate(this->dict_, itr) {
if(((itr->second & this->hitMask_) == 0) && // if not hit during testing if(((itr->second & this->hitMask_) == 0) && // if not hit during testing
(Utils::splitToStr(itr->first, vtmp, "¬") >= 3)) { // and higher order ngram (Utils::splitToStr(itr->first, vtmp, "¬") >= 3)) { // and higher order ngram
vDel.push_back(itr->first); vDel.push_back(itr->first);
} }
} }
iterate(vDel, vitr) iterate(vDel, vitr)
this->dict_.erase(*vitr); this->dict_.erase(*vitr);
cerr << "HPD size after = " << this->dict_.size() << endl; cerr << "HPD size after = " << this->dict_.size() << endl;
return vDel.size(); return vDel.size();
} }
template<typename T> template<typename T>
void OnlineRLM<T>::clearMarkings() { void OnlineRLM<T>::clearMarkings()
{
cerr << "clearing all event hits\n"; cerr << "clearing all event hits\n";
bHit_->reset(); bHit_->reset();
count_t* value(0); count_t* value(0);
@ -374,7 +390,8 @@ void OnlineRLM<T>::clearMarkings() {
} }
} }
template<typename T> template<typename T>
void OnlineRLM<T>::save(FileHandler* fout) { void OnlineRLM<T>::save(FileHandler* fout)
{
cerr << "Saving ORLM...\n"; cerr << "Saving ORLM...\n";
// save vocab // save vocab
vocab_->Save(fout); vocab_->Save(fout);
@ -387,7 +404,8 @@ void OnlineRLM<T>::save(FileHandler* fout) {
cerr << "Finished saving ORLM." << endl; cerr << "Finished saving ORLM." << endl;
} }
template<typename T> template<typename T>
void OnlineRLM<T>::load(FileHandler* fin) { void OnlineRLM<T>::load(FileHandler* fin)
{
cerr << "Loading ORLM...\n"; cerr << "Loading ORLM...\n";
// load vocab first // load vocab first
vocab_ = new Vocab(fin); vocab_ = new Vocab(fin);
@ -402,12 +420,13 @@ void OnlineRLM<T>::load(FileHandler* fin) {
PerfectHash<T>::load(fin); PerfectHash<T>::load(fin);
} }
template<typename T> template<typename T>
void OnlineRLM<T>::removeNonMarked() { void OnlineRLM<T>::removeNonMarked()
{
cerr << "deleting all unused events\n"; cerr << "deleting all unused events\n";
int deleted(0); int deleted(0);
for(uint64_t i = 0; i < this->cells_; ++i) { for(uint64_t i = 0; i < this->cells_; ++i) {
if(!(bHit_->testBit(i) || bPrefix_->testBit(i)) if(!(bHit_->testBit(i) || bPrefix_->testBit(i))
&& (this->filter_->read(i) != 0)) { && (this->filter_->read(i) != 0)) {
PerfectHash<T>::remove(i); PerfectHash<T>::remove(i);
++deleted; ++deleted;
} }
@ -429,36 +448,36 @@ float OnlineRLM<T>::getProb2(const wordID_t* ngram, int len, const void** state)
// constrain cache queries using model assumptions // constrain cache queries using model assumptions
int denom_len = cache_->getCache(ngram, len - 1, &denom_codes[0], &denom_found); int denom_len = cache_->getCache(ngram, len - 1, &denom_codes[0], &denom_found);
cerr << "denom_len = " << denom_len << endl; cerr << "denom_len = " << denom_len << endl;
int num_len = cache_->getCache(&ngram[len - denom_len - 1], denom_len + 1, int num_len = cache_->getCache(&ngram[len - denom_len - 1], denom_len + 1,
&num_codes[0], &found); &num_codes[0], &found);
cerr << "num_len= " << num_len << endl; cerr << "num_len= " << num_len << endl;
// keep reducing ngram size until both denominator and numerator are found // keep reducing ngram size until both denominator and numerator are found
// allowed to leave kUnknownCode in cache because we check for this. // allowed to leave kUnknownCode in cache because we check for this.
found = num_len; // guaranteed to be <= denom_len + 1 found = num_len; // guaranteed to be <= denom_len + 1
// still check for OOV // still check for OOV
for (int i = len - found; i < len; ++i) for (int i = len - found; i < len; ++i)
if (ngram[i] == Vocab::kOOVWordID) { if (ngram[i] == Vocab::kOOVWordID) {
found = len - i - 1; found = len - i - 1;
} }
// check for relative estimator // check for relative estimator
while(found > 1) { while(found > 1) {
if(*denom_codes[found-1] == cache_unk_ && if(*denom_codes[found-1] == cache_unk_ &&
((*denom_codes[found-1] = query(&ngram[len-found], found-1)) == 0)) { ((*denom_codes[found-1] = query(&ngram[len-found], found-1)) == 0)) {
//!struct_->query(&ngram[len-*found], *found-1, kMainEventIdx, denom_codes[*found-1])) { //!struct_->query(&ngram[len-*found], *found-1, kMainEventIdx, denom_codes[*found-1])) {
*num_codes[found] = cache_unk_; *num_codes[found] = cache_unk_;
} else { } else {
if(*num_codes[found] != cache_unk_ || if(*num_codes[found] != cache_unk_ ||
((*num_codes[found] = query(&ngram[len-found], found)) <= *denom_codes[found-1])) ((*num_codes[found] = query(&ngram[len-found], found)) <= *denom_codes[found-1]))
// struct_->query(&ngram[len-*found], *found, kMainEventIdx, // struct_->query(&ngram[len-*found], *found, kMainEventIdx,
// num_codes[*found], *denom_codes[*found-1])) // num_codes[*found], *denom_codes[*found-1]))
break; break;
} }
--found; --found;
} }
// didn't find bigram numerator or unigram denominator // didn't find bigram numerator or unigram denominator
if (found == 1) if (found == 1)
found = *num_codes[1] != cache_unk_ found = *num_codes[1] != cache_unk_
|| ((*num_codes[1] = query(&ngram[len - 1], 1)) != 0); || ((*num_codes[1] = query(&ngram[len - 1], 1)) != 0);
//struct_->query(&ngram[len - 1], 1, kMainEventIdx, num_codes[1]); //struct_->query(&ngram[len - 1], 1, kMainEventIdx, num_codes[1]);
// .... // ....
// return estimate applying correct backoff score (precomputed) // return estimate applying correct backoff score (precomputed)
@ -469,20 +488,20 @@ float OnlineRLM<T>::getProb2(const wordID_t* ngram, int len, const void** state)
//log_prob = stupid_backoff_log10_[len] + uniform_log10prob_; //log_prob = stupid_backoff_log10_[len] + uniform_log10prob_;
break; break;
case 1: // unigram over whole corpus case 1: // unigram over whole corpus
log_prob = alpha_[len - 1] + log_prob = alpha_[len - 1] +
log10(static_cast<float>(*num_codes[1]) / static_cast<float>(corpusSize_)); log10(static_cast<float>(*num_codes[1]) / static_cast<float>(corpusSize_));
//log_prob = log_quantiser_->getLog10Value(*num_codes[1]) - corpus_size_log10_ //log_prob = log_quantiser_->getLog10Value(*num_codes[1]) - corpus_size_log10_
// + stupid_backoff_log10_[len - 1]; // precomputed // + stupid_backoff_log10_[len - 1]; // precomputed
break; break;
default: // otherwise use both statistics and (possibly zero) backoff weight default: // otherwise use both statistics and (possibly zero) backoff weight
log_prob = alpha_[len - found] + log_prob = alpha_[len - found] +
log10(static_cast<float>(*num_codes[found]) / static_cast<float>(*denom_codes[found-1])); log10(static_cast<float>(*num_codes[found]) / static_cast<float>(*denom_codes[found-1]));
//log_prob = log_quantiser_->getLog10Value(*num_codes[*found ]) //log_prob = log_quantiser_->getLog10Value(*num_codes[*found ])
// - log_quantiser_->getLog10Value(*denom_codes[*found - 1]) // - log_quantiser_->getLog10Value(*denom_codes[*found - 1])
// + stupid_backoff_log10_[len - *found]; // + stupid_backoff_log10_[len - *found];
} }
context_state = (const void*)num_codes[found == len ? found - 1 : found]; context_state = (const void*)num_codes[found == len ? found - 1 : found];
//probCache_->store(len, log_prob, context_state); //probCache_->store(len, log_prob, context_state);
if (state) if (state)
*state = context_state; *state = context_state;
return log_prob; return log_prob;


@ -1,10 +1,11 @@
#include "params.h" #include "params.h"
namespace Moses { namespace Moses
{
// parameter constants // parameter constants
const std::string Parameters::kNotSetValue = "__NOT_SET__"; const std::string Parameters::kNotSetValue = "__NOT_SET__";
const int Parameters::kBoolValue = 0; const int Parameters::kBoolValue = 0;
const int Parameters::kIntValue = 1; const int Parameters::kIntValue = 1;
const int Parameters::kFloatValue = 2; const int Parameters::kFloatValue = 2;
const int Parameters::kStringValue = 3; const int Parameters::kStringValue = 3;
@ -13,26 +14,30 @@ const int Parameters::kUndefinedValue = -1;
const std::string Parameters::kTrueValue = "1"; const std::string Parameters::kTrueValue = "1";
const std::string Parameters::kFalseValue = "0"; const std::string Parameters::kFalseValue = "0";
Parameters::Parameters(const ParamDefs * paramdefs, const count_t paramNum) { Parameters::Parameters(const ParamDefs * paramdefs, const count_t paramNum)
{
initialize(paramdefs, paramNum); initialize(paramdefs, paramNum);
} }
Parameters::Parameters(int argc, char ** argv, const ParamDefs * paramdefs, Parameters::Parameters(int argc, char ** argv, const ParamDefs * paramdefs,
const count_t paramNum) { const count_t paramNum)
{
initialize(paramdefs, paramNum); initialize(paramdefs, paramNum);
loadParams(argc, argv); loadParams(argc, argv);
} }
void Parameters::initialize(const ParamDefs * paramdefs, const count_t paramNum) { void Parameters::initialize(const ParamDefs * paramdefs, const count_t paramNum)
{
for( count_t i = 0; i < paramNum; i++ ) { for( count_t i = 0; i < paramNum; i++ ) {
params_[paramdefs[i].name] = paramdefs[i]; // assign name params_[paramdefs[i].name] = paramdefs[i]; // assign name
} }
cerr << "Default parameter values:\n"; cerr << "Default parameter values:\n";
iterate(params_, itr) iterate(params_, itr)
cerr << "\t" << itr->first << " --> " << itr->second.value << endl; cerr << "\t" << itr->first << " --> " << itr->second.value << endl;
} }
bool Parameters::loadParams(int argc, char ** argv) { bool Parameters::loadParams(int argc, char ** argv)
{
// load params from commandline args // load params from commandline args
//if( argc < 3 ) { //if( argc < 3 ) {
// fprintf(stderr, "ERROR: No parameters. Use \"-config\" or \"-f\" to specify configuration file.\n"); // fprintf(stderr, "ERROR: No parameters. Use \"-config\" or \"-f\" to specify configuration file.\n");
@ -66,7 +71,7 @@ bool Parameters::loadParams(int argc, char ** argv) {
std::string val = argv[i+1]; std::string val = argv[i+1];
Utils::trim(val); Utils::trim(val);
if( param == "config" ) if( param == "config" )
load_from_file = true; load_from_file = true;
if(!setParamValue(param, val)) { if(!setParamValue(param, val)) {
std::cerr << "Invalid Param name->value " << param << "->" << val << std::endl; std::cerr << "Invalid Param name->value " << param << "->" << val << std::endl;
return false; return false;
@ -80,35 +85,40 @@ bool Parameters::loadParams(int argc, char ** argv) {
return success; return success;
} }
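For orientation, a minimal driver for this class; the parameter table, header name, and main() wiring here are invented for illustration:

// params_demo.cpp -- hypothetical wiring, not part of this commit
#include <string>
#include "params.h"  // assumed include for Moses::Parameters

int main(int argc, char** argv)
{
  // each entry: name, default value, abbreviation, type, description
  const Moses::ParamDefs defs[] = {
    { "order",  "5", "o", Moses::Parameters::kIntValue, "LM order" },
    { "memory", "8", "m", Moses::Parameters::kIntValue, "size in MB" },
  };
  // parses e.g. "-o 4 -memory 16"; abbreviations are resolved
  // through normaliseParamName() below
  Moses::Parameters params(argc, argv, defs, NumOfParams(defs));
  std::string order = params.getParamValue("order");
  (void)order;
  return 0;
}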
std::string Parameters::normaliseParamName(const std::string & name) { std::string Parameters::normaliseParamName(const std::string & name)
{
// Map valid abbreviations to long names. Retain other names. // Map valid abbreviations to long names. Retain other names.
if( params_.find(name) == params_.end() ) if( params_.find(name) == params_.end() )
iterate(params_, i) iterate(params_, i)
if( i->second.abbrev == name ) if( i->second.abbrev == name )
return i->first; return i->first;
return name; return name;
} }
int Parameters::getValueType(const std::string& name) { int Parameters::getValueType(const std::string& name)
{
if(params_.find(name) != params_.end()) if(params_.find(name) != params_.end())
return params_[name].type; return params_[name].type;
return Parameters::kUndefinedValue; return Parameters::kUndefinedValue;
} }
bool Parameters::isValidParamName(const std::string & name) { bool Parameters::isValidParamName(const std::string & name)
return params_.find(name) != params_.end(); {
return params_.find(name) != params_.end();
} }
bool Parameters::setParamValue(const std::string& name, const std::string& val) { bool Parameters::setParamValue(const std::string& name, const std::string& val)
// TODO: Add basic type checking w verifyValueType() {
bool set = isValidParamName(name); // TODO: Add basic type checking w verifyValueType()
if(set) { bool set = isValidParamName(name);
params_[name].value = val; if(set) {
params_[name].value = val;
std::cerr << "PARAM SET: "<< name << "=" << val << std::endl; std::cerr << "PARAM SET: "<< name << "=" << val << std::endl;
} }
return( set ); return( set );
} }
std::string Parameters::getParamValue(const std::string& name) { std::string Parameters::getParamValue(const std::string& name)
{
std::string value = Parameters::kNotSetValue; std::string value = Parameters::kNotSetValue;
if(isValidParamName(name)) if(isValidParamName(name))
if(params_.find(name) != params_.end()) if(params_.find(name) != params_.end())
@ -117,43 +127,46 @@ std::string Parameters::getParamValue(const std::string& name) {
value = kFalseValue; value = kFalseValue;
return value; return value;
} }
std::string Parameters::getParam(const std::string& name) { std::string Parameters::getParam(const std::string& name)
{
return getParamValue(name); return getParamValue(name);
/*void* Parameters::getParam(const std::string& name) { /*void* Parameters::getParam(const std::string& name) {
void* paramVal = 0; void* paramVal = 0;
int type = getValueType(name); int type = getValueType(name);
const char* sval = getParamValue(name).c_str(); const char* sval = getParamValue(name).c_str();
switch(type) { switch(type) {
case kIntValue: { case kIntValue: {
int ival = atoi(sval); int ival = atoi(sval);
paramVal = (void*)&ival; paramVal = (void*)&ival;
break; break;
}
case kFloatValue: {
float fval = atof(sval);
paramVal = (void*)&fval;
break;
}
case kStringValue: {
paramVal = (void*)sval;
break;
}
case kBoolValue: {
bool bval = sval == Parameters::kTrueValue ? true : false;
paramVal = (void*)&bval;
break;
}
default: // --> Parameters::kUndefinedValue
paramVal = (void*)sval; // will set to Parameters::kNotSetValue
} }
case kFloatValue: { return paramVal;*/
float fval = atof(sval);
paramVal = (void*)&fval;
break;
}
case kStringValue: {
paramVal = (void*)sval;
break;
}
case kBoolValue: {
bool bval = sval == Parameters::kTrueValue ? true : false;
paramVal = (void*)&bval;
break;
}
default: // --> Parameters::kUndefinedValue
paramVal = (void*)sval; // will set to Parameters::kNotSetValue
}
return paramVal;*/
} }
bool Parameters::verifyValueType(const std::string& name, const std::string& val) { bool Parameters::verifyValueType(const std::string& name, const std::string& val)
{
// Implement basic type checking // Implement basic type checking
return true; return true;
} }
int Parameters::getParamCount() const { int Parameters::getParamCount() const
{
return params_.size(); return params_.size();
} }
@ -161,7 +174,8 @@ int Parameters::getParamCount() const {
* HAVE TO CHANGE loadParams() from file to not overwrite command lines but * HAVE TO CHANGE loadParams() from file to not overwrite command lines but
* override default if different*/ * override default if different*/
bool Parameters::loadParams(const std::string & file_path, bool Parameters::loadParams(const std::string & file_path,
std::set<std::string>& setParams) { std::set<std::string>& setParams)
{
// parameters loaded from file don't override cmd line parameters // parameters loaded from file don't override cmd line parameters
/*std::set<std::string>::iterator end = setParams.end(); /*std::set<std::string>::iterator end = setParams.end();
FileHandler file(file_path.c_str(), std::ios::in); FileHandler file(file_path.c_str(), std::ios::in);


@ -10,20 +10,22 @@
#include "utils.h" #include "utils.h"
#include "types.h" #include "types.h"
#define NumOfParams(paramArray) (sizeof(paramArray)/sizeof((paramArray)[0])) #define NumOfParams(paramArray) (sizeof(paramArray)/sizeof((paramArray)[0]))
namespace Moses { namespace Moses
{
typedef struct ParamDefs { typedef struct ParamDefs {
std::string name; std::string name;
std::string value; std::string value;
std::string abbrev; std::string abbrev;
int type; int type;
std::string description; std::string description;
} ParamDefs; } ParamDefs;
class Parameters { class Parameters
{
public: public:
static const std::string kNotSetValue; static const std::string kNotSetValue;
static const int kBoolValue; static const int kBoolValue;
static const int kIntValue; static const int kIntValue;
static const int kFloatValue; static const int kFloatValue;
@ -31,15 +33,15 @@ public:
static const int kUndefinedValue; static const int kUndefinedValue;
static const std::string kFalseValue; static const std::string kFalseValue;
static const std::string kTrueValue; static const std::string kTrueValue;
Parameters(const ParamDefs * paramdefs, const count_t paramNum); Parameters(const ParamDefs * paramdefs, const count_t paramNum);
Parameters(int argc, char** argv, const ParamDefs * paramdefs, const count_t paramNum); Parameters(int argc, char** argv, const ParamDefs * paramdefs, const count_t paramNum);
~Parameters() {} ~Parameters() {}
bool loadParams(int argc, char ** argv); bool loadParams(int argc, char ** argv);
bool loadParams(const std::string& param_file, std::set<std::string>&); bool loadParams(const std::string& param_file, std::set<std::string>&);
int getValueType(const std::string & name); int getValueType(const std::string & name);
bool setParamValue(const std::string& name, const std::string& value); bool setParamValue(const std::string& name, const std::string& value);
bool verifyValueType(const std::string& name, const std::string& value); bool verifyValueType(const std::string& name, const std::string& value);
bool isValidParamName(const std::string & name); bool isValidParamName(const std::string & name);
std::string getParamValue(const std::string& name); std::string getParamValue(const std::string& name);
//void* getParam(const std::string& name); //void* getParam(const std::string& name);


@ -8,17 +8,18 @@
#include "RandLMFilter.h" #include "RandLMFilter.h"
#include "quantizer.h" #include "quantizer.h"
/* /*
* PerfectHash handles setting up hash functions and storage * PerfectHash handles setting up hash functions and storage
* for LM data. * for LM data.
*/ */
using randlm::Filter; using randlm::Filter;
using randlm::BitFilter; using randlm::BitFilter;
typedef std::map<string, count_t> hpDict_t; typedef std::map<string, count_t> hpDict_t;
typedef hpDict_t::iterator hpdEntry_t; typedef hpDict_t::iterator hpdEntry_t;
static count_t collisions_ = 0; static count_t collisions_ = 0;
/* Based on Mortenson et al. 2006 */ /* Based on Mortenson et al. 2006 */
template<typename T> template<typename T>
class PerfectHash { class PerfectHash
{
public: public:
PerfectHash(uint16_t MBs, int width, int bucketRange, float qBase); PerfectHash(uint16_t MBs, int width, int bucketRange, float qBase);
PerfectHash(FileHandler* fin) { PerfectHash(FileHandler* fin) {
@ -39,11 +40,11 @@ protected:
uint8_t* idxTracker_; uint8_t* idxTracker_;
uint64_t insert(const wordID_t* IDs, const int len, const count_t value); uint64_t insert(const wordID_t* IDs, const int len, const count_t value);
bool update(const wordID_t* IDs, const int len, const count_t value, bool update(const wordID_t* IDs, const int len, const count_t value,
hpdEntry_t& hpdAddr, uint64_t& filterIdx); hpdEntry_t& hpdAddr, uint64_t& filterIdx);
bool update2(const wordID_t* IDs, const int len, const count_t value, bool update2(const wordID_t* IDs, const int len, const count_t value,
hpdEntry_t& hpdAddr, uint64_t& filterIdx); hpdEntry_t& hpdAddr, uint64_t& filterIdx);
int query(const wordID_t* IDs, const int len, int query(const wordID_t* IDs, const int len,
hpdEntry_t& hpdAddr, uint64_t& filterIdx); hpdEntry_t& hpdAddr, uint64_t& filterIdx);
virtual void remove(const wordID_t* IDs, const int len); virtual void remove(const wordID_t* IDs, const int len);
void remove(uint64_t index); void remove(uint64_t index);
void save(FileHandler* fout); void save(FileHandler* fout);
@ -52,32 +53,33 @@ protected:
//pointer to a specific entry in a hpDict_t //pointer to a specific entry in a hpDict_t
virtual void markQueried(hpdEntry_t&)=0; virtual void markQueried(hpdEntry_t&)=0;
private: private:
T nonZeroSignature(const wordID_t* IDs, const int len, count_t bucket); T nonZeroSignature(const wordID_t* IDs, const int len, count_t bucket);
string hpDictKeyValue(const wordID_t* IDs, const int len); string hpDictKeyValue(const wordID_t* IDs, const int len);
uint64_t memBound_; // total memory bound in bytes uint64_t memBound_; // total memory bound in bytes
uint16_t cellWidth_; // in bits uint16_t cellWidth_; // in bits
UnivHash_linear<count_t>* bucketHash_; UnivHash_linear<count_t>* bucketHash_;
UnivHash_linear<T>* fingerHash_; UnivHash_linear<T>* fingerHash_;
LogQtizer* qtizer_; LogQtizer* qtizer_;
}; };
template<typename T> template<typename T>
PerfectHash<T>::PerfectHash(uint16_t MBs, int width, int bucketRange, PerfectHash<T>::PerfectHash(uint16_t MBs, int width, int bucketRange,
float qBase): hitMask_(1 << 31), memBound_(MBs * (1ULL << 20)), float qBase): hitMask_(1 << 31), memBound_(MBs * (1ULL << 20)),
cellWidth_(width) { cellWidth_(width)
{
bucketRange_ = static_cast<uint8_t>(bucketRange); bucketRange_ = static_cast<uint8_t>(bucketRange);
if(bucketRange > 255) { if(bucketRange > 255) {
cerr << "ERROR: Max bucket range is > 2^8\n"; cerr << "ERROR: Max bucket range is > 2^8\n";
exit(1); exit(1);
} }
qtizer_ = new LogQtizer(qBase); qtizer_ = new LogQtizer(qBase);
int valBits = (int)ceil(log2((float)qtizer_->maxcode())); int valBits = (int)ceil(log2((float)qtizer_->maxcode()));
cerr << "BITS FOR VALUES ARRAY = " << valBits << endl; cerr << "BITS FOR VALUES ARRAY = " << valBits << endl;
uint64_t totalBits = memBound_ << 3; uint64_t totalBits = memBound_ << 3;
cells_ = (uint64_t) ceil((float)totalBits / (float)(cellWidth_ + valBits)); // upper bound on cells cells_ = (uint64_t) ceil((float)totalBits / (float)(cellWidth_ + valBits)); // upper bound on cells
cells_ += (cells_ % bucketRange_); // make cells multiple of bucket range cells_ += (cells_ % bucketRange_); // make cells multiple of bucket range
totBuckets_ = (cells_ / bucketRange_) - 1; // minus 1 so totBuckets * bucksize + bucksize = cells totBuckets_ = (cells_ / bucketRange_) - 1; // minus 1 so totBuckets * bucksize + bucksize = cells
filter_ = new Filter<T>(cells_, cellWidth_); filter_ = new Filter<T>(cells_, cellWidth_);
values_ = new Filter<T>(cells_, valBits); values_ = new Filter<T>(cells_, valBits);
idxTracker_ = new uint8_t[totBuckets_]; idxTracker_ = new uint8_t[totBuckets_];
for(int i=0; i < totBuckets_; ++i) idxTracker_[i] = 0; for(int i=0; i < totBuckets_; ++i) idxTracker_[i] = 0;
// initialize ranges for each hash function // initialize ranges for each hash function
@ -85,7 +87,8 @@ PerfectHash<T>::PerfectHash(uint16_t MBs, int width, int bucketRange,
fingerHash_ = new UnivHash_linear<T>(pow(2.0f, cellWidth_), MAX_HASH_FUNCS, PRIME); fingerHash_ = new UnivHash_linear<T>(pow(2.0f, cellWidth_), MAX_HASH_FUNCS, PRIME);
} }
template<typename T> template<typename T>
PerfectHash<T>::~PerfectHash() { PerfectHash<T>::~PerfectHash()
{
delete[] idxTracker_; delete[] idxTracker_;
delete filter_; delete filter_;
filter_ = NULL; filter_ = NULL;
@ -94,22 +97,22 @@ PerfectHash<T>::~PerfectHash() {
delete qtizer_; delete qtizer_;
delete values_; delete values_;
} }
template<typename T> template<typename T>
uint64_t PerfectHash<T>::insert(const wordID_t* IDs, const int len, uint64_t PerfectHash<T>::insert(const wordID_t* IDs, const int len,
const count_t value) { const count_t value)
{
count_t bucket = (bucketHash_->size() > 1 ? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len)); count_t bucket = (bucketHash_->size() > 1 ? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len));
if(idxTracker_[bucket] < (int)bucketRange_) { // if empty rows if(idxTracker_[bucket] < (int)bucketRange_) { // if empty rows
// restriction on fprint value is non-zero // restriction on fprint value is non-zero
T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS)); T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
uint64_t emptyidx = cells_ + 1; uint64_t emptyidx = cells_ + 1;
uint64_t index = bucket * bucketRange_, // starting bucket row uint64_t index = bucket * bucketRange_, // starting bucket row
lastrow = index + bucketRange_; // ending row lastrow = index + bucketRange_; // ending row
while(index < lastrow) { // unique so check each row for "matching" signature while(index < lastrow) { // unique so check each row for "matching" signature
T filterVal = filter_->read(index); T filterVal = filter_->read(index);
if((filterVal == 0) && (emptyidx == cells_ + 1)) { // record first empty row if((filterVal == 0) && (emptyidx == cells_ + 1)) { // record first empty row
emptyidx = index; emptyidx = index;
} } else if(filterVal == fp) {
else if(filterVal == fp) {
++collisions_; ++collisions_;
dict_[hpDictKeyValue(IDs, len)] = value; // store exact in hpd dict_[hpDictKeyValue(IDs, len)] = value; // store exact in hpd
return cells_ + 1; // finished return cells_ + 1; // finished
@ -122,20 +125,20 @@ uint64_t PerfectHash<T>::insert(const wordID_t* IDs, const int len,
values_->write(emptyidx, code); values_->write(emptyidx, code);
++idxTracker_[bucket]; // keep track of bucket size ++idxTracker_[bucket]; // keep track of bucket size
return emptyidx; return emptyidx;
} } else { // bucket is full
else { // bucket is full
dict_[hpDictKeyValue(IDs, len)] = value; // add to hpd dict_[hpDictKeyValue(IDs, len)] = value; // add to hpd
return cells_ + 1; return cells_ + 1;
} }
} }
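The index arithmetic above is the heart of the bucketed filter: every ngram hashes to a bucket of bucketRange_ consecutive rows, and its fingerprint must land somewhere in that window. A small standalone sketch of the layout (the sizes are arbitrary):

// bucket_demo.cpp -- standalone sketch of the row-range arithmetic
#include <cstdint>
#include <cstdio>

int main()
{
  const uint64_t bucketRange = 8;          // rows per bucket (bucketRange_)
  uint64_t bucket = 3;                     // bucket picked by bucketHash_
  uint64_t first = bucket * bucketRange;   // starting row: 24
  uint64_t last  = first + bucketRange;    // one past the last row: 32
  // insert() scans [first, last): the first zero row is a free slot; a row
  // already holding the same fingerprint counts as a collision and the
  // exact ngram is spilled to the dict_ map instead
  for (uint64_t row = first; row < last; ++row)
    printf("row %llu\n", (unsigned long long)row);
  return 0;
}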
template<typename T> template<typename T>
bool PerfectHash<T>::update(const wordID_t* IDs, const int len, bool PerfectHash<T>::update(const wordID_t* IDs, const int len,
const count_t value, hpdEntry_t& hpdAddr, uint64_t& filterIdx) { const count_t value, hpdEntry_t& hpdAddr, uint64_t& filterIdx)
{
// check if key is in high perf. dictionary // check if key is in high perf. dictionary
filterIdx = cells_ + 1; filterIdx = cells_ + 1;
string skey = hpDictKeyValue(IDs, len); string skey = hpDictKeyValue(IDs, len);
if((hpdAddr = dict_.find(skey)) != dict_.end()) { if((hpdAddr = dict_.find(skey)) != dict_.end()) {
hpdAddr->second = value; hpdAddr->second = value;
return true; return true;
} }
// else hash ngram // else hash ngram
@ -144,66 +147,67 @@ bool PerfectHash<T>::update(const wordID_t* IDs, const int len,
// restriction on fprint value is non-zero // restriction on fprint value is non-zero
T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS)); T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
uint64_t index = bucket * bucketRange_, // starting bucket row uint64_t index = bucket * bucketRange_, // starting bucket row
lastrow = index + bucketRange_; lastrow = index + bucketRange_;
while(index < lastrow) { // must check each row for matching fp event while(index < lastrow) { // must check each row for matching fp event
T filterVal = filter_->read(index); T filterVal = filter_->read(index);
if(filterVal == fp) { // found event w.h.p. if(filterVal == fp) { // found event w.h.p.
values_->write(index, (T)qtizer_->code(value)); values_->write(index, (T)qtizer_->code(value));
filterIdx = index; filterIdx = index;
return true; return true;
} }
++index; ++index;
} }
// could add if it gets here. // could add if it gets here.
return false; return false;
} }
template<typename T> template<typename T>
int PerfectHash<T>::query(const wordID_t* IDs, const int len, int PerfectHash<T>::query(const wordID_t* IDs, const int len,
hpdEntry_t& hpdAddr, uint64_t& filterIdx) { hpdEntry_t& hpdAddr, uint64_t& filterIdx)
{
// check if key is in high perf. dictionary // check if key is in high perf. dictionary
string skey = hpDictKeyValue(IDs, len); string skey = hpDictKeyValue(IDs, len);
if((hpdAddr = dict_.find(skey)) != dict_.end()) { if((hpdAddr = dict_.find(skey)) != dict_.end()) {
filterIdx = cells_ + 1; filterIdx = cells_ + 1;
return(hpdAddr->second); // returns copy of value return(hpdAddr->second); // returns copy of value
} } else { // check if key is in filter
else { // check if key is in filter // get bucket
// get bucket
//count_t bucket = bucketHash_->hash(IDs, len); //count_t bucket = bucketHash_->hash(IDs, len);
count_t bucket = (bucketHash_->size() > 1 ? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len)); count_t bucket = (bucketHash_->size() > 1 ? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len));
// restriction on fprint value is non-zero // restriction on fprint value is non-zero
T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS)); T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
// return value if ngram is in filter // return value if ngram is in filter
uint64_t index = bucket * bucketRange_, uint64_t index = bucket * bucketRange_,
lastrow = index + bucketRange_; lastrow = index + bucketRange_;
for(; index < lastrow; ++index) { for(; index < lastrow; ++index) {
if(filter_->read(index) == fp) { if(filter_->read(index) == fp) {
//cout << "fp = " << fp << "\tbucket = " << bucket << "\tfilter =" << //cout << "fp = " << fp << "\tbucket = " << bucket << "\tfilter =" <<
//filter_->read(index) << "\tcode = " << code << endl; //filter_->read(index) << "\tcode = " << code << endl;
filterIdx = index; filterIdx = index;
hpdAddr = dict_.end(); hpdAddr = dict_.end();
return (int)qtizer_->value(values_->read(index)); return (int)qtizer_->value(values_->read(index));
} }
} }
} }
return -1; return -1;
} }
template<typename T> template<typename T>
void PerfectHash<T>::remove(const wordID_t* IDs, const int len) { void PerfectHash<T>::remove(const wordID_t* IDs, const int len)
{
// delete key if in high perf. dictionary // delete key if in high perf. dictionary
string skey = hpDictKeyValue(IDs, len); string skey = hpDictKeyValue(IDs, len);
if(dict_.find(skey) != dict_.end()) if(dict_.find(skey) != dict_.end())
dict_.erase(skey); dict_.erase(skey);
else { // check if key is in filter else { // check if key is in filter
// get small representation for ngrams // get small representation for ngrams
//count_t bucket = bucketHash_->hash(IDs, len); //count_t bucket = bucketHash_->hash(IDs, len);
count_t bucket = (bucketHash_->size() > 1? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len)); count_t bucket = (bucketHash_->size() > 1? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len));
// retrieve non zero fingerprint for ngram // retrieve non zero fingerprint for ngram
T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS)); T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
// return value if ngram is in filter // return value if ngram is in filter
uint64_t index = bucket * bucketRange_, uint64_t index = bucket * bucketRange_,
lastrow = index + bucketRange_; lastrow = index + bucketRange_;
for(; index < lastrow; ++index) { for(; index < lastrow; ++index) {
if(filter_->read(index) == fp) { if(filter_->read(index) == fp) {
filter_->write(index, 0); filter_->write(index, 0);
values_->write(index, 0); values_->write(index, 0);
--idxTracker_[bucket]; // track bucket size reduction --idxTracker_[bucket]; // track bucket size reduction
@ -213,7 +217,8 @@ void PerfectHash<T>::remove(const wordID_t* IDs, const int len) {
} }
} }
template<typename T> // clear filter index template<typename T> // clear filter index
void PerfectHash<T>::remove(uint64_t index) { void PerfectHash<T>::remove(uint64_t index)
{
CHECK(index < cells_); CHECK(index < cells_);
CHECK(filter_->read(index) != 0); // slow CHECK(filter_->read(index) != 0); // slow
filter_->write(index, 0); filter_->write(index, 0);
@ -224,19 +229,21 @@ void PerfectHash<T>::remove(uint64_t index) {
} }
template<typename T> template<typename T>
T PerfectHash<T>::nonZeroSignature(const wordID_t* IDs, const int len, T PerfectHash<T>::nonZeroSignature(const wordID_t* IDs, const int len,
count_t bucket) { count_t bucket)
{
count_t h = bucket; count_t h = bucket;
T fingerprint(0); T fingerprint(0);
do { do {
fingerprint = fingerHash_->hash(IDs, len, h); fingerprint = fingerHash_->hash(IDs, len, h);
h += (h < fingerHash_->size() - 1 ? 1 : -h); // wrap around h += (h < fingerHash_->size() - 1 ? 1 : -h); // wrap around
} while((fingerprint == 0) && (h != bucket)); } while((fingerprint == 0) && (h != bucket));
if(fingerprint == 0) if(fingerprint == 0)
cerr << "WARNING: Unable to find non-zero signature for ngram\n" << endl; cerr << "WARNING: Unable to find non-zero signature for ngram\n" << endl;
return fingerprint; return fingerprint;
} }
template<typename T> template<typename T>
string PerfectHash<T>::hpDictKeyValue(const wordID_t* IDs, const int len) { string PerfectHash<T>::hpDictKeyValue(const wordID_t* IDs, const int len)
{
string skey(" "); string skey(" ");
for(int i = 0; i < len; ++i) for(int i = 0; i < len; ++i)
skey += Utils::IntToStr(IDs[i]) + "¬"; skey += Utils::IntToStr(IDs[i]) + "¬";
@ -244,17 +251,20 @@ string PerfectHash<T>::hpDictKeyValue(const wordID_t* IDs, const int len) {
return skey; return skey;
} }
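A small sketch of the key layout this produces (any trimming done in the lines elided by the hunk above is not reproduced): for word IDs {17, 4, 256} the key is a leading space followed by each ID and a '¬' separator.

// key_demo.cpp -- mirrors the key construction above (illustrative)
#include <cstdio>
#include <string>

std::string toyKey(const unsigned* ids, int len)
{
  std::string skey(" ");
  char buf[16];
  for (int i = 0; i < len; ++i) {
    snprintf(buf, sizeof(buf), "%u", ids[i]);  // stands in for Utils::IntToStr
    skey += buf;
    skey += "¬";  // same separator cleanUpHPD() splits on
  }
  return skey;
}

int main()
{
  unsigned ids[] = { 17, 4, 256 };
  printf("[%s]\n", toyKey(ids, 3).c_str());  // prints [ 17¬4¬256¬]
  return 0;
}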
template<typename T> template<typename T>
count_t PerfectHash<T>::hpDictMemUse() { count_t PerfectHash<T>::hpDictMemUse()
{
// return hpDict memory usage in MBs // return hpDict memory usage in MBs
return (count_t) sizeof(hpDict_t::value_type)* dict_.size() >> 20; return (count_t) sizeof(hpDict_t::value_type)* dict_.size() >> 20;
} }
template<typename T> template<typename T>
count_t PerfectHash<T>::bucketsMemUse() { count_t PerfectHash<T>::bucketsMemUse()
{
// return bucket memory usage in MBs // return bucket memory usage in MBs
return (count_t) (filter_->size() + values_->size()); return (count_t) (filter_->size() + values_->size());
} }
template<typename T> template<typename T>
void PerfectHash<T>::save(FileHandler* fout) { void PerfectHash<T>::save(FileHandler* fout)
{
CHECK(fout != 0); CHECK(fout != 0);
cerr << "\tSaving perfect hash parameters...\n"; cerr << "\tSaving perfect hash parameters...\n";
fout->write((char*)&hitMask_, sizeof(hitMask_)); fout->write((char*)&hitMask_, sizeof(hitMask_));
@ -275,11 +285,12 @@ void PerfectHash<T>::save(FileHandler* fout) {
count_t size = dict_.size(); count_t size = dict_.size();
fout->write((char*)&size, sizeof(count_t)); fout->write((char*)&size, sizeof(count_t));
*fout << endl; *fout << endl;
iterate(dict_, t) iterate(dict_, t)
*fout << t->first << "\t" << t->second << "\n"; *fout << t->first << "\t" << t->second << "\n";
} }
template<typename T> template<typename T>
void PerfectHash<T>::load(FileHandler* fin) { void PerfectHash<T>::load(FileHandler* fin)
{
CHECK(fin != 0); CHECK(fin != 0);
cerr << "\tLoading perfect hash parameters...\n"; cerr << "\tLoading perfect hash parameters...\n";
fin->read((char*)&hitMask_, sizeof(hitMask_)); fin->read((char*)&hitMask_, sizeof(hitMask_));
@ -315,12 +326,13 @@ void PerfectHash<T>::load(FileHandler* fin) {
cerr << "Finished loading ORLM." << endl; cerr << "Finished loading ORLM." << endl;
} }
template<typename T> template<typename T>
void PerfectHash<T>::analyze() { void PerfectHash<T>::analyze()
{
cerr << "Analyzing Dynamic Bloomier Filter...\n"; cerr << "Analyzing Dynamic Bloomier Filter...\n";
// see how many items in each bucket // see how many items in each bucket
uint8_t* bucketCnt = new uint8_t[totBuckets_]; uint8_t* bucketCnt = new uint8_t[totBuckets_];
unsigned largestBucket = 0, totalCellsSet = 0, unsigned largestBucket = 0, totalCellsSet = 0,
smallestBucket = bucketRange_, totalZeroes = 0; smallestBucket = bucketRange_, totalZeroes = 0;
int curBucket = -1, fullBuckets(0); int curBucket = -1, fullBuckets(0);
for(int i = 0; i < totBuckets_; ++i) bucketCnt[i] = 0; for(int i = 0; i < totBuckets_; ++i) bucketCnt[i] = 0;
for(uint64_t i =0; i < cells_; ++i) { for(uint64_t i =0; i < cells_; ++i) {
@ -328,16 +340,14 @@ void PerfectHash<T>::analyze() {
if(filter_->read(i) != 0) { if(filter_->read(i) != 0) {
++bucketCnt[curBucket]; ++bucketCnt[curBucket];
++totalCellsSet; ++totalCellsSet;
} } else ++totalZeroes;
else ++totalZeroes;
} }
count_t bi = 0, si = 0; count_t bi = 0, si = 0;
for(int i = 0; i < totBuckets_; ++i) { for(int i = 0; i < totBuckets_; ++i) {
if(bucketCnt[i] > largestBucket) { if(bucketCnt[i] > largestBucket) {
largestBucket = bucketCnt[i]; largestBucket = bucketCnt[i];
bi = i; bi = i;
} } else if(bucketCnt[i] < smallestBucket) {
else if(bucketCnt[i] < smallestBucket) {
smallestBucket = bucketCnt[i]; smallestBucket = bucketCnt[i];
si = i; si = i;
} }
@ -350,8 +360,8 @@ void PerfectHash<T>::analyze() {
} }
for(int i = 0; i < totBuckets_; ++i) { for(int i = 0; i < totBuckets_; ++i) {
if(bucketCnt[i] != idxTracker_[i]) if(bucketCnt[i] != idxTracker_[i])
cerr << "bucketCnt[" << i << "] = " << (int)bucketCnt[i] << cerr << "bucketCnt[" << i << "] = " << (int)bucketCnt[i] <<
"\tidxTracker_[" << i << "] = " << (int)idxTracker_[i] << endl; "\tidxTracker_[" << i << "] = " << (int)idxTracker_[i] << endl;
} }
cerr << "total cells= " << cells_ << endl; cerr << "total cells= " << cells_ << endl;
cerr << "total buckets= " << totBuckets_ << endl; cerr << "total buckets= " << totBuckets_ << endl;
@ -364,7 +374,7 @@ void PerfectHash<T>::analyze() {
cerr << "largest bucket (" << bi << ") size= " << largestBucket << endl; cerr << "largest bucket (" << bi << ") size= " << largestBucket << endl;
cerr << "smallest bucket (" << si << ") size= " << smallestBucket << endl; cerr << "smallest bucket (" << si << ") size= " << smallestBucket << endl;
cerr << "last bucket size= " << (int)bucketCnt[totBuckets_ - 1] << cerr << "last bucket size= " << (int)bucketCnt[totBuckets_ - 1] <<
" (idxTracker last bucket size = " << (int)idxTracker_[totBuckets_ - 1] << ")" << endl; " (idxTracker last bucket size = " << (int)idxTracker_[totBuckets_ - 1] << ")" << endl;
cerr << "total buckets full = " << fullBuckets << endl; cerr << "total buckets full = " << fullBuckets << endl;
cerr << "total collision errors= " << collisions_ << endl; cerr << "total collision errors= " << collisions_ << endl;
cerr << "high performance dictionary size= " << dict_.size() << endl; cerr << "high performance dictionary size= " << dict_.size() << endl;
@ -373,14 +383,15 @@ void PerfectHash<T>::analyze() {
cerr << "values MBs= " << values_->size() << endl; cerr << "values MBs= " << values_->size() << endl;
delete[] bucketCnt; delete[] bucketCnt;
} }
template<typename T> template<typename T>
bool PerfectHash<T>::update2(const wordID_t* IDs, const int len, bool PerfectHash<T>::update2(const wordID_t* IDs, const int len,
const count_t value, hpdEntry_t& hpdAddr, uint64_t& filterIdx) { const count_t value, hpdEntry_t& hpdAddr, uint64_t& filterIdx)
{
// check if key is in high perf. dictionary // check if key is in high perf. dictionary
filterIdx = cells_ + 1; filterIdx = cells_ + 1;
string skey = hpDictKeyValue(IDs, len); string skey = hpDictKeyValue(IDs, len);
if((hpdAddr = dict_.find(skey)) != dict_.end()) { if((hpdAddr = dict_.find(skey)) != dict_.end()) {
hpdAddr->second += value; hpdAddr->second += value;
return true; return true;
} }
// else hash ngram // else hash ngram
@ -389,18 +400,18 @@ bool PerfectHash<T>::update2(const wordID_t* IDs, const int len,
// restriction on fprint value is non-zero // restriction on fprint value is non-zero
T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS)); T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
uint64_t index = bucket * bucketRange_, // starting bucket row uint64_t index = bucket * bucketRange_, // starting bucket row
lastrow = index + bucketRange_; lastrow = index + bucketRange_;
while(index < lastrow) { // must check each row for matching fp event while(index < lastrow) { // must check each row for matching fp event
T filterVal = filter_->read(index); T filterVal = filter_->read(index);
if(filterVal == fp) { // found event w.h.p. if(filterVal == fp) { // found event w.h.p.
int oldval = (int)qtizer_->value(values_->read(index)); int oldval = (int)qtizer_->value(values_->read(index));
values_->write(index, (T)qtizer_->code(oldval + value)); values_->write(index, (T)qtizer_->code(oldval + value));
filterIdx = index; filterIdx = index;
return true; return true;
} }
++index; ++index;
} }
// add if it gets here. // add if it gets here.
insert(IDs, len, value); insert(IDs, len, value);
return false; return false;
} }


@@ -8,7 +8,8 @@
#include "types.h"

static const float kFloatErr = 0.00001f;

class LogQtizer
{
public:
  LogQtizer(float i): base_(pow(2, 1 / i)) {
    CHECK(base_ > 1);
@@ -16,8 +17,8 @@ public:
    float value = 1;  // code = 1 -> value = 1 for any base
    std::vector<float> code_to_value_vec;
    while (log2(value) < 30) { // assume 2^30 is largest count
      code_to_value_vec.push_back(value);
      value = pow(base_, ++max_code_);
    }
    code_to_value_vec.push_back(value);  // store max_code_ so in total [0, max_code_]
    // get valid range
@@ -40,22 +41,22 @@ public:
  int code(float value) {
    // should just be: return log_b(value)
    CHECK(!(value < min_value_ || value > max_value_));
    // but binary search removes errors due to floor operator above
    int code = static_cast<int>(std::lower_bound(code_to_value_, code_to_value_+ max_code_,
                                                 value) - code_to_value_);
    // make sure not overestimating
    code = code_to_value_[code] > value ? code - 1 : code;
    return code;
  }
  inline float value(int code) {
    // table look up for values
    return code_to_value_[code];
  }
  inline int maxcode() {
    return max_code_;
  }
  inline float logValue(int code) {
    // table look up for log of values
    return code_to_log_value_[code];
  }
  ~LogQtizer() {
@@ -69,15 +70,15 @@ public:
    fout->write((char*)&min_value_, sizeof(min_value_));
    for (int j = 0; j <= max_code_; ++j)
      fout->write((char*)&code_to_value_[j], sizeof(code_to_value_[j]));
    for (int j = 0; j <= max_code_; ++j)
      fout->write((char*)&code_to_log_value_[j], sizeof(code_to_log_value_[j]));
    std::cerr << "Saved log codebook with " << max_code_ + 1 << " codes." << std::endl;
  }
private:
  float base_;
  float* code_to_value_;
  float* code_to_log_value_;
  int max_code_;
  float max_value_;
  float min_value_;
  void load(FileHandler* fin) {
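
The quantizer above maps a count to the exponent of a base just above 1, so codes grow logarithmically with the value and a small codebook covers counts up to 2^30. A toy round-trip, assuming the same construction (this is a sketch of the idea, not the Moses class):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const float steps = 4;  // codes per doubling of the value
  const float base = std::pow(2.0f, 1.0f / steps);

  // codebook: code_to_value[k] == base^k, covering counts up to ~2^30
  std::vector<float> code_to_value;
  for (float v = 1; std::log2(v) < 30; v = std::pow(base, (float)code_to_value.size()))
    code_to_value.push_back(v);

  const float value = 1000.0f;
  // binary-search the codebook rather than computing floor(log_base(value)),
  // which can land in the wrong bin through floating-point error
  int code = (int)(std::lower_bound(code_to_value.begin(), code_to_value.end(), value)
                   - code_to_value.begin());
  if (code == (int)code_to_value.size() || code_to_value[code] > value)
    --code;  // never overestimate
  std::printf("value %.0f -> code %d -> dequantized %.2f\n",
              value, code, code_to_value[code]);
  return 0;
}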

View File

@@ -103,10 +103,11 @@ bool Vocab::Load(const std::string & vocab_path, const FactorDirection& directio
  std::cerr << "Loading vocab from " << vocab_path << std::endl;
  return Load(&vcbin, direction, factors, closed);
}

bool Vocab::Load(FileHandler* vcbin)
{
  FactorList factors;
  factors.push_back(0);
  return Load(vcbin, Input, factors);
}

bool Vocab::Load(FileHandler* vcbin, const FactorDirection& direction,
                 const FactorList& factors, bool closed)

View File

@@ -74,12 +74,12 @@ int DynSuffixArray::F_firstIdx(unsigned word)
  // return index of first row where word is found in m_F
  /*for(int i=0; i < m_F->size(); ++i) {
    if(m_F->at(i) == word) {
      return i;
    }
  }
  return -1;*/
  //NOTE: lower_bound is faster than linear search above but may cause issues
  //   if ordering of vocab is not consecutive (ie..after deletions)
  int low = std::lower_bound(m_F->begin(), m_F->end(), word) - m_F->begin();
  //cerr << "in F_firstIdx with word = " << word << " and low = " << low << " and F->size() =" << m_F->size() << endl;
  if(low >= m_F->size())
@@ -146,8 +146,8 @@ void DynSuffixArray::Reorder(unsigned j, unsigned jprime)
{
  set<pair<unsigned, unsigned> > seen;
  while(j != jprime) {
    // this 'seenit' check added for data with many loops. will remove after double
    // checking.
    bool seenit = seen.insert(std::make_pair(j, jprime)).second;
    if(seenit) {
      for(int i=1; i < m_SA->size(); ++i) {
@@ -163,9 +163,9 @@ void DynSuffixArray::Reorder(unsigned j, unsigned jprime)
    int new_j = LastFirstFunc(j);
    CHECK(j <= jprime);
    // for SA and L, the element at pos j is moved to pos j'
    m_L->insert(m_L->begin() + jprime + 1, m_L->at(j));
    m_L->erase(m_L->begin() + j);
    m_SA->insert(m_SA->begin() + jprime + 1, m_SA->at(j));
    m_SA->erase(m_SA->begin() + j);
    // all ISA values between (j...j'] decremented
    for(size_t i = 0; i < m_ISA->size(); ++i) {
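
Reorder moves row j to row j' while keeping m_L and m_SA aligned, then renumbers the inverse array. The vector mechanics of that move, in isolation (simplified; the real class also has to update ISA incrementally):

#include <cassert>
#include <vector>

// Move the element at row j to row jprime (j <= jprime). Insert before
// erase: erasing first would shift jprime and change the target position.
void moveRow(std::vector<int> &v, size_t j, size_t jprime) {
  assert(j <= jprime && jprime < v.size());
  int x = v[j];
  v.insert(v.begin() + jprime + 1, x);  // copy lands just after jprime
  v.erase(v.begin() + j);               // drop the original; the copy is now at jprime
}

Applying the same move to both arrays keeps them row-aligned, and every inverse-array entry pointing into (j, j'] then shifts down by one, which is exactly the decrement loop the hunk ends on.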

View File

@@ -33,9 +33,9 @@ namespace Moses
class FactorFriend;
class FactorCollection;

/** Represents a factor (word, POS, etc).
 *
 * A Factor has a contiguous identifier and string value.
 */
class Factor
{
@@ -45,17 +45,17 @@ class Factor
  friend class FactorCollection;
  friend class FactorFriend;

  // FactorCollection writes here.
  std::string m_string;
  size_t m_id;

  //! protected constructor. only friend class, FactorCollection, is allowed to create Factor objects
  Factor() {}

  // Needed for STL containers.  They'll delegate through FactorFriend, which is never exposed publicly.
  Factor(const Factor &factor) : m_string(factor.m_string), m_id(factor.m_id) {}

  // Not implemented.  Shouldn't be called.
  Factor &operator=(const Factor &factor);

public:

View File

@@ -33,7 +33,7 @@ FactorCollection FactorCollection::s_instance;
const Factor *FactorCollection::AddFactor(const StringPiece &factorString)
{
  // Sorry this is so complicated.  Can't we just require everybody to use Boost >= 1.42?  The issue is that I can't check BOOST_VERSION unless we have Boost.
#ifdef WITH_THREADS
#if BOOST_VERSION < 104200
  FactorFriend to_ins;
@@ -42,7 +42,7 @@ const Factor *FactorCollection::AddFactor(const StringPiece &factorString)
  {
    boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
#if BOOST_VERSION >= 104200
    // If this line doesn't compile, upgrade your Boost.
    Set::const_iterator i = m_set.find(factorString, HashFactor(), EqualsFactor());
#else // BOOST_VERSION
    Set::const_iterator i = m_set.find(to_ins);

View File

@@ -47,7 +47,7 @@ namespace Moses
 * private and friended to FactorFriend.  The STL containers can delegate
 * copying, so friending the container isn't sufficient.  STL containers see
 * FactorFriend's public copy constructor and everybody else sees Factor's
 * private copy constructor.
 */
struct FactorFriend {
  Factor in;
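
The comment spells out the trick: Factor's copy constructor stays private, and the friended wrapper is the only copyable handle, so STL containers can store factors by value without exposing copying to the rest of the code. Distilled to its essentials (hypothetical names, not the Moses classes):

#include <cstddef>
#include <string>

class Registry;
struct ItemFriend;

class Item {
  friend class Registry;   // only the registry creates items
  friend struct ItemFriend;
  std::string m_string;
  std::size_t m_id;
  Item() {}
  Item(const Item &o) : m_string(o.m_string), m_id(o.m_id) {}
  Item &operator=(const Item &); // not implemented; shouldn't be called
public:
  const std::string &GetString() const { return m_string; }
};

// The wrapper's implicitly-defined default and copy constructors are
// generated inside a friend, so they may call Item's private ones; a
// container such as boost::unordered_set<ItemFriend> copies the wrapper,
// never Item directly.
struct ItemFriend {
  Item in;
};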

View File

@@ -30,20 +30,24 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
using namespace std;

namespace Moses
{
LanguageModel::LanguageModel()
{
  m_enableOOVFeature = StaticData::Instance().GetLMEnableOOVFeature();
}

void LanguageModel::Init(ScoreIndexManager &scoreIndexManager)
{
  scoreIndexManager.AddScoreProducer(this);
}

LanguageModel::~LanguageModel() {}

// don't inline virtual funcs...
size_t LanguageModel::GetNumScoreComponents() const
{
  if (m_enableOOVFeature) {
    return 2;
  } else {
@@ -51,13 +55,15 @@ size_t LanguageModel::GetNumScoreComponents() const {
  }
}

float LanguageModel::GetWeight() const
{
  size_t lmIndex = StaticData::Instance().GetScoreIndexManager().
                   GetBeginIndex(GetScoreBookkeepingID());
  return StaticData::Instance().GetAllWeights()[lmIndex];
}

float LanguageModel::GetOOVWeight() const
{
  if (!m_enableOOVFeature) return 0;
  size_t lmIndex = StaticData::Instance().GetScoreIndexManager().
                   GetBeginIndex(GetScoreBookkeepingID());

View File

@@ -35,7 +35,8 @@ class Phrase;
class ScoreIndexManager;

//! Abstract base class which represent a language model on a contiguous phrase
class LanguageModel : public StatefulFeatureFunction
{
protected:
  LanguageModel();
@@ -43,11 +44,11 @@ protected:
  void Init(ScoreIndexManager &scoreIndexManager);
  bool m_enableOOVFeature;

public:
  virtual ~LanguageModel();

  // Make another feature without copying the underlying model data.
  virtual LanguageModel *Duplicate(ScoreIndexManager &scoreIndexManager) const = 0;

  //! see ScoreProducer.h

View File

@@ -10,10 +10,12 @@
namespace Moses
{

LanguageModelDMapLM::LanguageModelDMapLM() : m_lm(0)
{
}

LanguageModelDMapLM::~LanguageModelDMapLM()
{
  delete m_lm;
}

@@ -51,8 +53,8 @@ void LanguageModelDMapLM::CreateFactor(FactorCollection& factorCollection)
}

LMResult LanguageModelDMapLM::GetValueGivenState(
  const std::vector<const Word*>& contextFactor,
  FFState& state) const
{
  DMapLMState& cast_state = static_cast<DMapLMState&>(state);
  LMResult result;
@@ -65,8 +67,8 @@ LMResult LanguageModelDMapLM::GetValueGivenState(
}

LMResult LanguageModelDMapLM::GetValueForgotState(
  const std::vector<const Word*>& contextFactor,
  FFState& outState) const
{
  DMapLMState& cast_state = static_cast<DMapLMState&>(outState);
  LMResult result;
@@ -78,13 +80,13 @@ LMResult LanguageModelDMapLM::GetValueForgotState(
}

float LanguageModelDMapLM::GetValue(
  const std::vector<const Word*>& contextFactor,
  size_t target_order,
  size_t* succeeding_order) const
{
  FactorType factorType = GetFactorType();
  float score;

  std::string ngram_string("");
  ngram_string.append(((*contextFactor[0])[factorType])->GetString());
  for (size_t i = 1; i < contextFactor.size(); ++i) {
@@ -97,38 +99,44 @@ float LanguageModelDMapLM::GetValue(
  return score;
}

const FFState* LanguageModelDMapLM::GetNullContextState() const
{
  DMapLMState* state = new DMapLMState();
  state->m_last_succeeding_order = GetNGramOrder();
  return state;
}

FFState* LanguageModelDMapLM::GetNewSentenceState() const
{
  DMapLMState* state = new DMapLMState();
  state->m_last_succeeding_order = GetNGramOrder();
  return state;
}

const FFState* LanguageModelDMapLM::GetBeginSentenceState() const
{
  DMapLMState* state = new DMapLMState();
  state->m_last_succeeding_order = GetNGramOrder();
  return state;
}

FFState* LanguageModelDMapLM::NewState(const FFState* state) const
{
  DMapLMState* new_state = new DMapLMState();
  const DMapLMState* cast_state = static_cast<const DMapLMState*>(state);
  new_state->m_last_succeeding_order = cast_state->m_last_succeeding_order;
  return new_state;
}

void LanguageModelDMapLM::CleanUpAfterSentenceProcessing()
{
  m_lm->printStats();
  m_lm->resetStats();
  m_lm->clearCaches();
}

void LanguageModelDMapLM::InitializeBeforeSentenceProcessing()
{
}

} // namespace Moses

View File

@@ -12,20 +12,22 @@
#include "LM/SingleFactor.h"
#include "Util.h"

namespace Moses
{

class DMapLMState : public FFState
{
public:
  int Compare(const FFState &o) const {
    const DMapLMState& cast_other = static_cast<const DMapLMState&>(o);
    if (cast_other.m_last_succeeding_order < m_last_succeeding_order)
      return -1;
    else if (cast_other.m_last_succeeding_order > m_last_succeeding_order)
      return 1;
    else
      return 0;
  }
  uint8_t m_last_succeeding_order;
};

class LanguageModelDMapLM : public LanguageModelSingleFactor
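
Compare gives the decoder a total order over LM states; hypotheses whose states compare equal can be recombined. How a three-way Compare like the one above typically feeds an ordered container (a sketch with hypothetical names, not the Moses decoder's actual recombination code):

#include <cstdint>
#include <map>

struct LMState {
  uint8_t order;
  // three-way comparison: negative, zero, or positive, like the FFState API
  int Compare(const LMState &o) const {
    return (order < o.order) ? -1 : (order > o.order) ? 1 : 0;
  }
};

// Adapt Compare() to the strict-weak-ordering STL containers expect.
struct LessByCompare {
  bool operator()(const LMState &a, const LMState &b) const {
    return a.Compare(b) < 0;
  }
};

// Equal-state hypotheses collide on the same key and can be recombined,
// keeping only the best score.
std::map<LMState, float, LessByCompare> bestScorePerState;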

View File

@@ -69,7 +69,7 @@ bool LanguageModelIRST::Load(const std::string &filePath,
  m_filePath = filePath;

  m_lmtb = m_lmtb->CreateLanguageModel(m_filePath);
  m_lmtb->setMaxLoadedLevel(1000);
  m_lmtb->load(m_filePath);
  d=m_lmtb->getDict();
@@ -140,7 +140,7 @@ int LanguageModelIRST::GetLmID( const std::string &str ) const
}

int LanguageModelIRST::GetLmID( const Factor *factor ) const
{
  size_t factorId = factor->GetId();

  if ((factorId >= m_lmIdLookup.size()) || (m_lmIdLookup[factorId] == m_empty)) {
@@ -150,12 +150,12 @@ int LanguageModelIRST::GetLmID( const Factor *factor ) const
    //////////
    /// since Moses makes no distinction between the factorIDs of source words
    /// and target words, it can happen that a target word whose target code
    /// has not yet been computed nevertheless has a known factorID (and hence
    /// one smaller than m_lmIdLookup.size()).
    /// We therefore have to identify these cases where the target code is
    /// undetermined. Currently this check is implemented by setting to m_empty
    /// all entries that have not yet
    /// received an actual target code
    ///////////
@@ -167,7 +167,7 @@ int LanguageModelIRST::GetLmID( const Factor *factor ) const
    /// AT POSITION (factorID-1) instead of at position factorID, where we later read it (see case C).
    /// That way it works ...
    /// I have a doubt about what sits in the first positions of m_lmIdLookup,
    /// so
    /// I find that every other entry stays empty,
    /// because factorID grows in steps of two (it encodes both source and target), leaving position (factorID-1) "empty";
    /// this causes no correctness problem, only a "waste" of memory
@@ -177,10 +177,10 @@ int LanguageModelIRST::GetLmID( const Factor *factor ) const
    ////////////////

    if (factorId >= m_lmIdLookup.size()) {
      //resize and fill with m_empty
      //increment the array more than needed to avoid too many resizing operation.
      m_lmIdLookup.resize(factorId+10, m_empty);
    }

    //insert new code
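
The resize-with-slack pattern above, isolated (hypothetical names): grow the sparse lookup past the requested id and fill new slots with a sentinel, so most later lookups take the cheap path and reallocations stay rare.

#include <cstddef>
#include <vector>

int lookupOrInsert(std::vector<int> &table, size_t id, int code, int empty) {
  if (id >= table.size())
    table.resize(id + 10, empty);  // grow with slack, fill with the sentinel
  if (table[id] == empty)
    table[id] = code;              // first sighting: record the LM code
  return table[id];
}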

View File

@@ -68,8 +68,9 @@ void LanguageModelImplementation::GetState(
  GetValueForgotState(contextFactor, state);
}

// Calculate score of a phrase.
void LanguageModelImplementation::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const
{
  fullScore = 0;
  ngramScore = 0;
@@ -81,7 +82,7 @@ void LanguageModelImplementation::CalcScore(const Phrase &phrase, float &fullSco
  vector<const Word*> contextFactor;
  contextFactor.reserve(GetNGramOrder());
  std::auto_ptr<FFState> state(NewState((phrase.GetWord(0) == GetSentenceStartArray()) ?
                                        GetBeginSentenceState() : GetNullContextState()));
  size_t currPos = 0;
  while (currPos < phraseSize) {
    const Word &word = phrase.GetWord(currPos);
@@ -108,7 +109,7 @@ void LanguageModelImplementation::CalcScore(const Phrase &phrase, float &fullSco
        fullScore += result.score;
        if (contextFactor.size() == GetNGramOrder())
          ngramScore += result.score;
        if (result.unknown) ++oovCount;
      }
    }
@@ -116,7 +117,8 @@ void LanguageModelImplementation::CalcScore(const Phrase &phrase, float &fullSco
  }
}

FFState *LanguageModelImplementation::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out, const LanguageModel *feature) const
{
  // In this function, we only compute the LM scores of n-grams that overlap a
  // phrase boundary. Phrase-internal scores are taken directly from the
  // translation option.
@@ -178,9 +180,7 @@ FFState *LanguageModelImplementation::Evaluate(const Hypothesis &hypo, const FFS
        contextFactor[i] = &hypo.GetWord((size_t)currPos);
      }
      lmScore += GetValueForgotState(contextFactor, *res).score;
    } else {
      if (endPos < currEndPos) {
        //need to get the LM state (otherwise the last LM state is fine)
        for (size_t currPos = endPos+1; currPos <= currEndPos; currPos++) {
@@ -207,10 +207,11 @@ FFState *LanguageModelImplementation::Evaluate(const Hypothesis &hypo, const FFS
  return res;
}

namespace
{

// This is the FFState used by LanguageModelImplementation::EvaluateChart.
// Though svn blame goes back to heafield, don't blame me.  I just moved this from LanguageModelChartState.cpp and ChartHypothesis.cpp.
class LanguageModelChartState : public FFState
{
private:
@@ -223,12 +224,11 @@ private:
  const ChartHypothesis &m_hypo;

  /** Construct the prefix string of up to specified size
   * \param ret prefix string
   * \param size maximum size (typically max lm context window)
   */
  size_t CalcPrefix(const ChartHypothesis &hypo, int featureID, Phrase &ret, size_t size) const {
    const TargetPhrase &target = hypo.GetCurrTargetPhrase();
    const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
      target.GetAlignmentInfo().GetNonTermIndexMap();
@@ -257,13 +257,12 @@ private:
    return size;
  }

  /** Construct the suffix phrase of up to specified size
   * will always be called after the construction of prefix phrase
   * \param ret suffix phrase
   * \param size maximum size of suffix
   */
  size_t CalcSuffix(const ChartHypothesis &hypo, int featureID, Phrase &ret, size_t size) const {
    CHECK(m_contextPrefix.GetSize() <= m_numTargetTerminals);

    // special handling for small hypotheses
@@ -292,8 +291,7 @@ private:
        size_t nonTermInd = nonTermIndexMap[pos];
        const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermInd);
        size = static_cast<const LanguageModelChartState*>(prevHypo->GetFFState(featureID))->CalcSuffix(*prevHypo, featureID, ret, size);
      } else {
        ret.PrependWord(hypo.GetCurrTargetPhrase().GetWord(pos));
        size--;
      }
@@ -309,11 +307,10 @@ private:
public:
  LanguageModelChartState(const ChartHypothesis &hypo, int featureID, size_t order)
    :m_lmRightContext(NULL)
    ,m_contextPrefix(order - 1)
    ,m_contextSuffix( order - 1)
    ,m_hypo(hypo) {
    m_numTargetTerminals = hypo.GetCurrTargetPhrase().GetNumTerminals();

    for (std::vector<const ChartHypothesis*>::const_iterator i = hypo.GetPrevHypos().begin(); i != hypo.GetPrevHypos().end(); ++i) {
@@ -334,8 +331,12 @@ public:
    m_lmRightContext = rightState;
  }

  float GetPrefixScore() const {
    return m_prefixScore;
  }
  FFState* GetRightContext() const {
    return m_lmRightContext;
  }

  size_t GetNumTargetTerminals() const {
    return m_numTargetTerminals;
@@ -353,8 +354,7 @@ public:
      dynamic_cast<const LanguageModelChartState &>( o );

    // prefix
    if (m_hypo.GetCurrSourceRange().GetStartPos() > 0) { // not for "<s> ..."
      int ret = GetPrefix().Compare(other.GetPrefix());
      if (ret != 0)
        return ret;
@@ -362,8 +362,7 @@ public:
    // suffix
    size_t inputSize = m_hypo.GetManager().GetSource().GetSize();
    if (m_hypo.GetCurrSourceRange().GetEndPos() < inputSize - 1) { // not for "... </s>"
      int ret = other.GetRightContext()->Compare(*m_lmRightContext);
      if (ret != 0)
        return ret;
@@ -374,7 +373,8 @@ public:
} // namespace

FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection* out, const LanguageModel *scorer) const
{
  LanguageModelChartState *ret = new LanguageModelChartState(hypo, featureID, GetNGramOrder());
  // data structure for factored context phrase (history and predicted word)
  vector<const Word*> contextFactor;
@@ -394,33 +394,28 @@ FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo,
  // loop over rule
  for (size_t phrasePos = 0, wordPos = 0;
       phrasePos < hypo.GetCurrTargetPhrase().GetSize();
       phrasePos++) {
    // consult rule for either word or non-terminal
    const Word &word = hypo.GetCurrTargetPhrase().GetWord(phrasePos);

    // regular word
    if (!word.IsNonTerminal()) {
      ShiftOrPush(contextFactor, word);

      // beginning of sentence symbol <s>? -> just update state
      if (word == GetSentenceStartArray()) {
        CHECK(phrasePos == 0);
        delete lmState;
        lmState = NewState( GetBeginSentenceState() );
      }
      // score a regular word added by the rule
      else {
        updateChartScore( &prefixScore, &finalizedScore, UntransformLMScore(GetValueGivenState(contextFactor, *lmState).score), ++wordPos );
      }
    }

    // non-terminal, add phrase from underlying hypothesis
    else {
      // look up underlying hypothesis
      size_t nonTermIndex = nonTermIndexMap[phrasePos];
      const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndex);
@@ -444,8 +439,7 @@ FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo,
        // push suffix
        int suffixPos = prevState->GetSuffix().GetSize() - (GetNGramOrder()-1);
        if (suffixPos < 0) suffixPos = 0; // push all words if less than order
        for(; (size_t)suffixPos < prevState->GetSuffix().GetSize(); suffixPos++) {
          const Word &word = prevState->GetSuffix().GetWord(suffixPos);
          ShiftOrPush(contextFactor, word);
          wordPos++;
@@ -453,22 +447,19 @@ FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo,
      }

      // internal non-terminal
      else {
        // score its prefix
        for(size_t prefixPos = 0;
            prefixPos < GetNGramOrder()-1 // up to LM order window
            && prefixPos < subPhraseLength; // up to length
            prefixPos++) {
          const Word &word = prevState->GetPrefix().GetWord(prefixPos);
          ShiftOrPush(contextFactor, word);
          updateChartScore( &prefixScore, &finalizedScore, UntransformLMScore(GetValueGivenState(contextFactor, *lmState).score), ++wordPos );
        }

        // check if we are dealing with a large sub-phrase
        if (subPhraseLength > GetNGramOrder() - 1) {
          // add its finalized language model score
          finalizedScore +=
            prevHypo->GetScoreBreakdown().GetScoresForProducer(scorer)[0] // full score
@@ -503,11 +494,11 @@ FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo,
  return ret;
}

void LanguageModelImplementation::updateChartScore(float *prefixScore, float *finalizedScore, float score, size_t wordPos) const
{
  if (wordPos < GetNGramOrder()) {
    *prefixScore += score;
  } else {
    *finalizedScore += score;
  }
}
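
updateChartScore encodes the key invariant of chart-based LM scoring: the first order-1 words of a hypothesis were scored with incomplete left context, so their mass is kept provisional and may be rescored once a parent hypothesis supplies more history, while everything after that boundary is final. The split in isolation (a sketch, not the Moses API):

#include <cstddef>

struct ChartScore {
  float prefix;     // provisional: scored with incomplete left context
  float finalized;  // safe: a full (order-1)-word history was available
  ChartScore() : prefix(0), finalized(0) {}
};

void addWordScore(ChartScore &s, float score, size_t wordPos, size_t order) {
  if (wordPos < order) s.prefix += score;  // may be revised by a parent cell
  else                 s.finalized += score;
}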

View File

@@ -45,7 +45,7 @@ class Phrase;
struct LMResult {
  // log probability
  float score;
  // Is the word unknown?
  bool unknown;
};
@@ -126,54 +126,55 @@ public:
  virtual void CleanUpAfterSentenceProcessing() {};
};

class LMRefCount : public LanguageModel
{
public:
  LMRefCount(ScoreIndexManager &scoreIndexManager, LanguageModelImplementation *impl) : m_impl(impl) {
    Init(scoreIndexManager);
  }

  LanguageModel *Duplicate(ScoreIndexManager &scoreIndexManager) const {
    return new LMRefCount(scoreIndexManager, *this);
  }

  void InitializeBeforeSentenceProcessing() {
    m_impl->InitializeBeforeSentenceProcessing();
  }

  void CleanUpAfterSentenceProcessing() {
    m_impl->CleanUpAfterSentenceProcessing();
  }

  const FFState* EmptyHypothesisState(const InputType &/*input*/) const {
    return m_impl->NewState(m_impl->GetBeginSentenceState());
  }

  bool Useable(const Phrase &phrase) const {
    return m_impl->Useable(phrase);
  }

  void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const {
    return m_impl->CalcScore(phrase, fullScore, ngramScore, oovCount);
  }

  FFState* Evaluate(const Hypothesis& cur_hypo, const FFState* prev_state, ScoreComponentCollection* accumulator) const {
    return m_impl->Evaluate(cur_hypo, prev_state, accumulator, this);
  }

  FFState* EvaluateChart(const ChartHypothesis& cur_hypo, int featureID, ScoreComponentCollection* accumulator) const {
    return m_impl->EvaluateChart(cur_hypo, featureID, accumulator, this);
  }

  std::string GetScoreProducerDescription(unsigned int param) const {
    return m_impl->GetScoreProducerDescription(param);
  }

private:
  LMRefCount(ScoreIndexManager &scoreIndexManager, const LMRefCount &copy_from) : m_impl(copy_from.m_impl) {
    Init(scoreIndexManager);
  }

  boost::shared_ptr<LanguageModelImplementation> m_impl;
};

}
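
LMRefCount is a thin forwarding wrapper: every duplicate shares one LanguageModelImplementation through a shared_ptr, so per-thread feature objects never copy the underlying model data. The pattern in miniature (hypothetical names):

#include <boost/shared_ptr.hpp>

class Impl {  // stands in for the big, effectively immutable model data
};

class Feature {
public:
  explicit Feature(Impl *impl) : m_impl(impl) {}        // takes ownership
  Feature *Duplicate() const { return new Feature(*this); }  // shares m_impl
private:
  Feature(const Feature &copy_from) : m_impl(copy_from.m_impl) {}
  boost::shared_ptr<Impl> m_impl;  // reference-counted shared implementation
};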

View File

@@ -43,8 +43,10 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
using namespace std;

namespace Moses
{
namespace
{

struct KenLMState : public FFState {
  lm::ngram::State state;
@@ -59,67 +61,69 @@ struct KenLMState : public FFState {
/*
 * An implementation of single factor LM using Ken's code.
 */
template <class Model> class LanguageModelKen : public LanguageModel
{
public:
  LanguageModelKen(const std::string &file, ScoreIndexManager &manager, FactorType factorType, bool lazy);

  LanguageModel *Duplicate(ScoreIndexManager &scoreIndexManager) const;

  bool Useable(const Phrase &phrase) const {
    return (phrase.GetSize()>0 && phrase.GetFactor(0, m_factorType) != NULL);
  }

  std::string GetScoreProducerDescription(unsigned) const {
    std::ostringstream oss;
    oss << "LM_" << m_ngram->Order() << "gram";
    return oss.str();
  }

  const FFState *EmptyHypothesisState(const InputType &/*input*/) const {
    KenLMState *ret = new KenLMState();
    ret->state = m_ngram->BeginSentenceState();
    return ret;
  }

  void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const;

  FFState *Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const;

  FFState *EvaluateChart(const ChartHypothesis& cur_hypo, int featureID, ScoreComponentCollection *accumulator) const;

private:
  LanguageModelKen(ScoreIndexManager &manager, const LanguageModelKen<Model> &copy_from);

  lm::WordIndex TranslateID(const Word &word) const {
    std::size_t factor = word.GetFactor(m_factorType)->GetId();
    return (factor >= m_lmIdLookup.size() ? 0 : m_lmIdLookup[factor]);
  }

  // Convert last words of hypothesis into vocab ids, returning an end pointer.
  lm::WordIndex *LastIDs(const Hypothesis &hypo, lm::WordIndex *indices) const {
    lm::WordIndex *index = indices;
    lm::WordIndex *end = indices + m_ngram->Order() - 1;
    int position = hypo.GetCurrTargetWordsRange().GetEndPos();
    for (; ; ++index, --position) {
      if (position == -1) {
        *index = m_ngram->GetVocabulary().BeginSentence();
        return index + 1;
      }
      if (index == end) return index;
      *index = TranslateID(hypo.GetWord(position));
    }
  }

  boost::shared_ptr<Model> m_ngram;

  std::vector<lm::WordIndex> m_lmIdLookup;

  FactorType m_factorType;

  const Factor *m_beginSentenceFactor;
};
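
LastIDs walks backwards from the end of the hypothesis, emitting vocabulary ids until it has either collected order-1 words or fallen off the front of the sentence, in which case it appends the begin-of-sentence id and stops. The same walk over a plain vector (hypothetical types: ids are ints, BOS is the begin-of-sentence id, -1 is the position before the first word):

#include <cstddef>
#include <vector>

// Fill 'out' with up to 'n' ids of the words ending at 'pos', newest
// first; stop early at the sentence start. Returns how many were written.
size_t lastIDs(const std::vector<int> &sent, int pos, size_t n,
               int BOS, std::vector<int> &out) {
  out.clear();
  for (; ; --pos) {
    if (pos == -1) {            // ran off the front: emit <s> and stop
      out.push_back(BOS);
      return out.size();
    }
    if (out.size() == n) return out.size();  // collected order-1 words
    out.push_back(sent[pos]);   // newest word first
  }
}
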
class MappingBuilder : public lm::EnumerateVocab
{
public:
  MappingBuilder(FactorCollection &factorCollection, std::vector<lm::WordIndex> &mapping)
    : m_factorCollection(factorCollection), m_mapping(mapping) {}
@@ -138,11 +142,13 @@ private:
  std::vector<lm::WordIndex> &m_mapping;
};

template <class Model> LanguageModelKen<Model>::LanguageModelKen(const std::string &file, ScoreIndexManager &manager, FactorType factorType, bool lazy) : m_factorType(factorType)
{
  lm::ngram::Config config;

  IFVERBOSE(1) {
    config.messages = &std::cerr;
  }
  else {
    config.messages = NULL;
  }

  FactorCollection &collection = FactorCollection::Instance();
@@ -156,20 +162,23 @@ template <class Model> LanguageModelKen<Model>::LanguageModelKen(const std::stri
  Init(manager);
}

template <class Model> LanguageModel *LanguageModelKen<Model>::Duplicate(ScoreIndexManager &manager) const
{
  return new LanguageModelKen<Model>(manager, *this);
}

template <class Model> LanguageModelKen<Model>::LanguageModelKen(ScoreIndexManager &manager, const LanguageModelKen<Model> &copy_from) :
  m_ngram(copy_from.m_ngram),
  // TODO: don't copy this.
  m_lmIdLookup(copy_from.m_lmIdLookup),
  m_factorType(copy_from.m_factorType),
  m_beginSentenceFactor(copy_from.m_beginSentenceFactor)
{
  Init(manager);
}

template <class Model> void LanguageModelKen<Model>::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const
{
  fullScore = 0;
  ngramScore = 0;
  oovCount = 0;
@@ -186,13 +195,13 @@ template <class Model> void LanguageModelKen<Model>::CalcScore(const Phrase &phr
    *state0 = m_ngram->NullContextState();
    position = 0;
  }

  size_t ngramBoundary = m_ngram->Order() - 1;

  for (; position < phrase.GetSize(); ++position) {
    const Word &word = phrase.GetWord(position);
    if (word.IsNonTerminal()) {
      // If there's a non-terminal at 1 and we have a 5-gram LM, then positions 2 3 4 and 5 will be incomplete while position 6 is complete.
      ngramBoundary = m_ngram->Order() + position;
      *state0 = m_ngram->NullContextState();
    } else {
@@ -210,11 +219,12 @@ template <class Model> void LanguageModelKen<Model>::CalcScore(const Phrase &phr
  }
}

template <class Model> FFState *LanguageModelKen<Model>::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const
{
  const lm::ngram::State &in_state = static_cast<const KenLMState&>(*ps).state;

  std::auto_ptr<KenLMState> ret(new KenLMState());

  if (!hypo.GetCurrTargetLength()) {
    ret->state = in_state;
    return ret.release();
@@ -237,17 +247,17 @@ template <class Model> FFState *LanguageModelKen<Model>::Evaluate(const Hypothes
  }

  if (hypo.IsSourceCompleted()) {
    // Score end of sentence.
    std::vector<lm::WordIndex> indices(m_ngram->Order() - 1);
    const lm::WordIndex *last = LastIDs(hypo, &indices.front());
    score += m_ngram->FullScoreForgotState(&indices.front(), last, m_ngram->GetVocabulary().EndSentence(), ret->state).prob;
  } else if (adjust_end < end) {
    // Get state after adding a long phrase.
    std::vector<lm::WordIndex> indices(m_ngram->Order() - 1);
    const lm::WordIndex *last = LastIDs(hypo, &indices.front());
    m_ngram->GetState(&indices.front(), last, ret->state);
  } else if (state0 != &ret->state) {
    // Short enough phrase that we can just reuse the state.
    ret->state = *state0;
  }
@@ -265,32 +275,37 @@ template <class Model> FFState *LanguageModelKen<Model>::Evaluate(const Hypothes
  return ret.release();
}

class LanguageModelChartStateKenLM : public FFState
{
public:
  LanguageModelChartStateKenLM() {}

  const lm::ngram::ChartState &GetChartState() const {
    return m_state;
  }
  lm::ngram::ChartState &GetChartState() {
    return m_state;
  }

  int Compare(const FFState& o) const {
    const LanguageModelChartStateKenLM &other = static_cast<const LanguageModelChartStateKenLM&>(o);
    int ret = m_state.Compare(other.m_state);
    return ret;
  }

private:
  lm::ngram::ChartState m_state;
};

template <class Model> FFState *LanguageModelKen<Model>::EvaluateChart(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection *accumulator) const
{
  LanguageModelChartStateKenLM *newState = new LanguageModelChartStateKenLM();
  lm::ngram::RuleScore<Model> ruleScore(*m_ngram, newState->GetChartState());
  const AlignmentInfo::NonTermIndexMap &nonTermIndexMap = hypo.GetCurrTargetPhrase().GetAlignmentInfo().GetNonTermIndexMap();

  const size_t size = hypo.GetCurrTargetPhrase().GetSize();
  size_t phrasePos = 0;
  // Special cases for first word.
  if (size) {
    const Word &word = hypo.GetCurrTargetPhrase().GetWord(0);
    if (word.GetFactor(m_factorType) == m_beginSentenceFactor) {
@@ -298,7 +313,7 @@ template <class Model> FFState *LanguageModelKen<Model>::EvaluateChart(const Cha
      ruleScore.BeginSentence();
      phrasePos++;
    } else if (word.IsNonTerminal()) {
      // Non-terminal is first so we can copy instead of rescoring.
      const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndexMap[phrasePos]);
      const lm::ngram::ChartState &prevState = static_cast<const LanguageModelChartStateKenLM*>(prevHypo->GetFFState(featureID))->GetChartState();
      ruleScore.BeginNonTerminal(prevState, prevHypo->GetScoreBreakdown().GetScoresForProducer(this)[0]);
@@ -323,24 +338,25 @@ template <class Model> FFState *LanguageModelKen<Model>::EvaluateChart(const Cha
} // namespace

LanguageModel *ConstructKenLM(const std::string &file, ScoreIndexManager &manager, FactorType factorType, bool lazy)
{
  try {
    lm::ngram::ModelType model_type;
    if (lm::ngram::RecognizeBinary(file.c_str(), model_type)) {
      switch(model_type) {
      case lm::ngram::HASH_PROBING:
        return new LanguageModelKen<lm::ngram::ProbingModel>(file, manager, factorType, lazy);
      case lm::ngram::TRIE_SORTED:
        return new LanguageModelKen<lm::ngram::TrieModel>(file, manager, factorType, lazy);
      case lm::ngram::QUANT_TRIE_SORTED:
        return new LanguageModelKen<lm::ngram::QuantTrieModel>(file, manager, factorType, lazy);
      case lm::ngram::ARRAY_TRIE_SORTED:
        return new LanguageModelKen<lm::ngram::ArrayTrieModel>(file, manager, factorType, lazy);
      case lm::ngram::QUANT_ARRAY_TRIE_SORTED:
        return new LanguageModelKen<lm::ngram::QuantArrayTrieModel>(file, manager, factorType, lazy);
      default:
        std::cerr << "Unrecognized kenlm model type " << model_type << std::endl;
        abort();
      }
    } else {
      return new LanguageModelKen<lm::ngram::ProbingModel>(file, manager, factorType, lazy);
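
ConstructKenLM is a factory that probes the binary file for its on-disk format and then constructs the matching template instantiation behind the common LanguageModel interface. The same shape in miniature (a sketch; the enum values and model types here are hypothetical stand-ins, not kenlm's):

#include <string>

struct LMBase {
  virtual ~LMBase() {}
};

// One concrete class per on-disk representation, sharing an interface.
template <class Model> struct TypedLM : LMBase {
  explicit TypedLM(const std::string &file) { (void)file; /* load 'file' as Model */ }
};

struct Probing {};
struct Trie {};
enum ModelType { HASH_PROBING, TRIE_SORTED };

LMBase *constructLM(ModelType t, const std::string &file) {
  switch (t) {             // dispatch once, at load time
  case HASH_PROBING: return new TypedLM<Probing>(file);
  case TRIE_SORTED:  return new TypedLM<Trie>(file);
  }
  return 0;
}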

View File

@@ -26,12 +26,13 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "TypeDef.h"

namespace Moses
{

class ScoreIndexManager;
class LanguageModel;

// This will also load.
LanguageModel *ConstructKenLM(const std::string &file, ScoreIndexManager &manager, FactorType factorType, bool lazy);

} // namespace Moses

View File

@ -9,10 +9,11 @@
#include "LM/ORLM.h" #include "LM/ORLM.h"
using std::map; using std::map;
namespace Moses namespace Moses
{
bool LanguageModelORLM::Load(const std::string &filePath, FactorType factorType,
size_t nGramOrder)
{ {
bool LanguageModelORLM::Load(const std::string &filePath, FactorType factorType,
size_t nGramOrder) {
cerr << "Loading LanguageModelORLM..." << endl; cerr << "Loading LanguageModelORLM..." << endl;
m_filePath = filePath; m_filePath = filePath;
m_factorType = factorType; m_factorType = factorType;
@ -26,13 +27,14 @@ bool LanguageModelORLM::Load(const std::string &filePath, FactorType factorType,
CreateFactors(); CreateFactors();
return true; return true;
} }
void LanguageModelORLM::CreateFactors() { void LanguageModelORLM::CreateFactors()
{
FactorCollection &factorCollection = FactorCollection::Instance(); FactorCollection &factorCollection = FactorCollection::Instance();
size_t maxFactorId = 0; // to create lookup vector later on size_t maxFactorId = 0; // to create lookup vector later on
std::map<size_t, wordID_t> m_lmids_map; // map from factor id -> word id std::map<size_t, wordID_t> m_lmids_map; // map from factor id -> word id
for(std::map<Word, wordID_t>::const_iterator vIter = m_lm->vocab_->VocabStart(); for(std::map<Word, wordID_t>::const_iterator vIter = m_lm->vocab_->VocabStart();
vIter != m_lm->vocab_->VocabEnd(); vIter++){ vIter != m_lm->vocab_->VocabEnd(); vIter++) {
// get word from ORLM vocab and associate with (new) factor id // get word from ORLM vocab and associate with (new) factor id
size_t factorId = factorCollection.AddFactor(Output,m_factorType,vIter->first.ToString())->GetId(); size_t factorId = factorCollection.AddFactor(Output,m_factorType,vIter->first.ToString())->GetId();
m_lmids_map[factorId] = vIter->second; m_lmids_map[factorId] = vIter->second;
@ -50,7 +52,7 @@ void LanguageModelORLM::CreateFactors() {
maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId; maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
m_sentenceEndArray[m_factorType] = m_sentenceEnd; m_sentenceEndArray[m_factorType] = m_sentenceEnd;
// add to lookup vector in object // add to lookup vector in object
lm_ids_vec_.resize(maxFactorId+1); lm_ids_vec_.resize(maxFactorId+1);
// fill with OOV code // fill with OOV code
fill(lm_ids_vec_.begin(), lm_ids_vec_.end(), m_oov_id); fill(lm_ids_vec_.begin(), lm_ids_vec_.end(), m_oov_id);
@ -58,15 +60,18 @@ void LanguageModelORLM::CreateFactors() {
iter != m_lmids_map.end() ; ++iter) iter != m_lmids_map.end() ; ++iter)
lm_ids_vec_[iter->first] = iter->second; lm_ids_vec_[iter->first] = iter->second;
} }
wordID_t LanguageModelORLM::GetLmID(const std::string& str) const { wordID_t LanguageModelORLM::GetLmID(const std::string& str) const
{
return m_lm->vocab_->GetWordID(str); return m_lm->vocab_->GetWordID(str);
} }
wordID_t LanguageModelORLM::GetLmID(const Factor* factor) const { wordID_t LanguageModelORLM::GetLmID(const Factor* factor) const
{
size_t factorId = factor->GetId(); size_t factorId = factor->GetId();
return (factorId >= lm_ids_vec_.size()) ? m_oov_id : lm_ids_vec_[factorId]; return (factorId >= lm_ids_vec_.size()) ? m_oov_id : lm_ids_vec_[factorId];
} }
LMResult LanguageModelORLM::GetValue(const std::vector<const Word*> &contextFactor, LMResult LanguageModelORLM::GetValue(const std::vector<const Word*> &contextFactor,
State* finalState) const { State* finalState) const
{
FactorType factorType = GetFactorType(); FactorType factorType = GetFactorType();
// set up context // set up context
//std::vector<long unsigned int> factor(1,0); //std::vector<long unsigned int> factor(1,0);
@ -88,13 +93,14 @@ LMResult LanguageModelORLM::GetValue(const std::vector<const Word*> &contextFact
*/ */
return ret; return ret;
} }
bool LanguageModelORLM::UpdateORLM(const std::vector<string>& ngram, const int value) { bool LanguageModelORLM::UpdateORLM(const std::vector<string>& ngram, const int value)
{
/*cerr << "Inserting into ORLM: \""; /*cerr << "Inserting into ORLM: \"";
iterate(ngram, nit) iterate(ngram, nit)
cerr << *nit << " "; cerr << *nit << " ";
cerr << "\"\t" << value << endl; */ cerr << "\"\t" << value << endl; */
m_lm->vocab_->MakeOpen(); m_lm->vocab_->MakeOpen();
bool res = m_lm->update(ngram, value); bool res = m_lm->update(ngram, value);
m_lm->vocab_->MakeClosed(); m_lm->vocab_->MakeClosed();
return res; return res;
} }
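Note: CreateFactors() and GetLmID(const Factor*) above implement a common densification pattern: a sparse map from factor id to LM word id is flattened into a vector pre-filled with the OOV code, so the per-word lookup becomes O(1) indexing instead of a map search. A minimal self-contained sketch of that pattern follows; wordID_t and the OOV code are illustrative stand-ins, not the actual ORLM types.

// Densify a sparse factor-id -> LM-id map into a lookup vector,
// as CreateFactors() does above. Types and values are assumptions.
#include <cstddef>
#include <iostream>
#include <map>
#include <vector>

typedef unsigned int wordID_t;     // assumption: ORLM word id type
static const wordID_t kOovId = 0;  // assumption: OOV code

int main() {
  std::map<std::size_t, wordID_t> lmIds;  // factor id -> word id (sparse)
  lmIds[3] = 17;
  lmIds[42] = 5;

  // Size the vector to the largest factor id and pre-fill with OOV.
  std::size_t maxId = lmIds.rbegin()->first;
  std::vector<wordID_t> lookup(maxId + 1, kOovId);
  for (std::map<std::size_t, wordID_t>::const_iterator it = lmIds.begin();
       it != lmIds.end(); ++it)
    lookup[it->first] = it->second;

  // Out-of-range ids fall back to OOV, mirroring GetLmID(const Factor*).
  std::size_t queryId = 100;
  wordID_t id = (queryId >= lookup.size()) ? kOovId : lookup[queryId];
  std::cout << id << "\n";  // prints 0 (OOV)
  return 0;
}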

View File

@@ -15,7 +15,8 @@ namespace Moses
class Factor;
class Phrase;
class LanguageModelORLM : public LanguageModelPointerState
{
public:
  typedef count_t T; // type for ORLM filter
  LanguageModelORLM()
@@ -30,13 +31,15 @@ public:
    fout.close();
    delete m_lm;
  }
  void CleanUpAfterSentenceProcessing() {
    m_lm->clearCache(); // clear caches
  }
  void InitializeBeforeSentenceProcessing() { // nothing to do
    //m_lm->initThreadSpecificData(); // Creates thread specific data iff
    // compiled with multithreading.
  }
  bool UpdateORLM(const std::vector<string>& ngram, const int value);
protected:
  OnlineRLM<T>* m_lm;
  //MultiOnlineRLM<T>* m_lm;
  wordID_t m_oov_id;

View File

@@ -347,7 +347,8 @@ const FFState *LanguageModelParallelBackoff::GetBeginSentenceState() const
}
LanguageModelMultiFactor *NewParallelBackoff()
{
  return new LanguageModelParallelBackoff();
}

View File

@@ -38,7 +38,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
namespace Moses
{
namespace
{
using namespace std;
@@ -57,7 +57,7 @@ public:
  }
  void InitializeBeforeSentenceProcessing() {
    m_lm->initThreadSpecificData(); // Creates thread specific data iff
    // compiled with multithreading.
  }
protected:
  std::vector<randlm::WordID> m_randlm_ids_vec;
@@ -133,7 +133,7 @@ randlm::WordID LanguageModelRandLM::GetLmID( const std::string &str ) const
}
LMResult LanguageModelRandLM::GetValue(const vector<const Word*> &contextFactor,
                                       State* finalState) const
{
  FactorType factorType = GetFactorType();
  // set up context
@@ -156,7 +156,8 @@ LMResult LanguageModelRandLM::GetValue(const vector<const Word*> &contextFactor,
}
LanguageModelPointerState *NewRandLM()
{
  return new LanguageModelRandLM();
}

View File

@@ -46,7 +46,7 @@ void LMList::CalcScore(const Phrase &phrase, float &retFullScore, float &retNGra
    const float weightLM = lm.GetWeight();
    const float oovWeightLM = lm.GetOOVWeight();
    float fullScore, nGramScore;
    size_t oovCount;
    // do not process, if factors not defined yet (happens in partial translation options)
@@ -64,7 +64,7 @@ void LMList::CalcScore(const Phrase &phrase, float &retFullScore, float &retNGra
    } else {
      breakdown->Assign(&lm, nGramScore); // I'm not sure why += doesn't work here- it should be 0.0 right?
    }
    retFullScore += fullScore * weightLM;
    retNGramScore += nGramScore * weightLM;
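The CalcScore loop above folds each language model's contribution into running totals, scaled by that model's weight. A toy sketch of the accumulation follows; the struct and values are invented for illustration, not the Moses API.

// Weighted accumulation of per-LM scores, mirroring the loop above.
#include <iostream>
#include <vector>

struct LMScore { float fullScore; float nGramScore; float weight; };  // stand-in

int main() {
  std::vector<LMScore> lms;
  LMScore a = { -12.5f, -10.0f, 0.5f };
  LMScore b = { -8.0f,  -7.5f,  0.3f };
  lms.push_back(a);
  lms.push_back(b);

  float retFullScore = 0.0f, retNGramScore = 0.0f;
  for (std::vector<LMScore>::const_iterator i = lms.begin(); i != lms.end(); ++i) {
    retFullScore  += i->fullScore  * i->weight;  // weighted full score
    retNGramScore += i->nGramScore * i->weight;  // weighted n-gram-only score
  }
  std::cout << retFullScore << " " << retNGramScore << "\n";  // -8.65 -7.25
  return 0;
}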

View File

@@ -39,13 +39,13 @@ public:
  virtual FFState* Evaluate(const Hypothesis& cur_hypo,
                            const FFState* prev_state,
                            ScoreComponentCollection* accumulator) const;
  virtual FFState* EvaluateChart(const ChartHypothesis&,
                                 int /* featureID */,
                                 ScoreComponentCollection*) const {
    CHECK(0); // not valid for chart decoder
    return NULL;
  }
  virtual const FFState* EmptyHypothesisState(const InputType &input) const;

View File

@@ -267,8 +267,9 @@ struct SGNReverseCompare {
/**
 * Implements lattice sampling, as in Chatterjee & Cancedda, emnlp 2010
 **/
void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const
{
  vector<SearchGraphNode> searchGraph;
  GetSearchGraph(searchGraph);
@@ -282,15 +283,15 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
  map<int,const Hypothesis*> idToHyp;
  map<int,float> fscores;
  //Iterating through the hypos in reverse order of id gives a reverse
  //topological order. We rely on the fact that hypo ids are given out
  //sequentially, as the search proceeds.
  //NB: Could just sort by stack.
  sort(searchGraph.begin(), searchGraph.end(), SGNReverseCompare());
  //first task is to fill in the outgoing hypos and edge scores.
  for (vector<SearchGraphNode>::const_iterator i = searchGraph.begin();
       i != searchGraph.end(); ++i) {
    const Hypothesis* hypo = i->hypo;
    idToHyp[hypo->GetId()] = hypo;
    fscores[hypo->GetId()] = i->fscore;
@@ -298,7 +299,7 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
      //back to current
      const Hypothesis* prevHypo = i->hypo->GetPrevHypo();
      outgoingHyps[prevHypo].insert(hypo);
      edgeScores[Edge(prevHypo->GetId(),hypo->GetId())] =
        hypo->GetScore() - prevHypo->GetScore();
    }
    //forward from current
@@ -309,7 +310,7 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
      outgoingHyps[hypo].insert(nextHypo);
      map<int,float>::const_iterator fscoreIter = fscores.find(nextHypo->GetId());
      CHECK(fscoreIter != fscores.end());
      edgeScores[Edge(hypo->GetId(),nextHypo->GetId())] =
        i->fscore - fscoreIter->second;
    }
  }
@@ -317,26 +318,26 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
  //then run through again to calculate sigmas
  for (vector<SearchGraphNode>::const_iterator i = searchGraph.begin();
       i != searchGraph.end(); ++i) {
    if (i->forward == -1) {
      sigmas[i->hypo] = 0;
    } else {
      map<const Hypothesis*, set<const Hypothesis*> >::const_iterator outIter =
        outgoingHyps.find(i->hypo);
      CHECK(outIter != outgoingHyps.end());
      float sigma = 0;
      for (set<const Hypothesis*>::const_iterator j = outIter->second.begin();
           j != outIter->second.end(); ++j) {
        map<const Hypothesis*, float>::const_iterator succIter = sigmas.find(*j);
        CHECK(succIter != sigmas.end());
        map<Edge,float>::const_iterator edgeScoreIter =
          edgeScores.find(Edge(i->hypo->GetId(),(*j)->GetId()));
        CHECK(edgeScoreIter != edgeScores.end());
        float term = edgeScoreIter->second + succIter->second; // Add sigma(*j)
        if (sigma == 0) {
          sigma = term;
        } else {
          sigma = log_sum(sigma,term);
        }
@@ -352,7 +353,7 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
    vector<const Hypothesis*> path;
    path.push_back(startHypo);
    while(1) {
      map<const Hypothesis*, set<const Hypothesis*> >::const_iterator outIter =
        outgoingHyps.find(path.back());
      if (outIter == outgoingHyps.end() || !outIter->second.size()) {
        //end of the path
@@ -363,7 +364,7 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
      vector<float> candidateScores;
      float scoreTotal = 0;
      for (set<const Hypothesis*>::const_iterator j = outIter->second.begin();
           j != outIter->second.end(); ++j) {
        candidates.push_back(*j);
        CHECK(sigmas.find(*j) != sigmas.end());
        Edge edge(path.back()->GetId(),(*j)->GetId());
@@ -390,18 +391,18 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
      }
      //cerr << "Random: " << random << " Chose " << position-1 << endl;
      const Hypothesis* chosen = candidates[position-1];
      path.push_back(chosen);
    }
    //cerr << "Path: " << endl;
    //for (size_t j = 0; j < path.size(); ++j) {
    //  cerr << path[j]->GetId() << " " << path[j]->GetScoreBreakdown() << endl;
    //}
    //cerr << endl;
    //Convert the hypos to TrellisPath
    ret.Add(new TrellisPath(path));
    //cerr << ret.at(ret.GetSize()-1).GetScoreBreakdown() << endl;
  }
}
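The sampling step inside the while loop draws each successor with probability proportional to exp(edge score + sigma of the successor), i.e. proportional to the total weight of all paths continuing through that edge. A standalone sketch of that draw follows, with invented log-weights and plain rand() standing in for the decoder's random source.

// Sample an index with probability proportional to exp(logWeight),
// mirroring the candidateScores/scoreTotal draw above. Values invented.
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <vector>

int main() {
  std::srand(42);
  // edgeScore + sigma for three candidate successors (log scale, invented).
  double logWeights[] = { -1.2, -0.3, -2.5 };
  std::vector<double> w(logWeights, logWeights + 3);

  // Exponentiate and build a cumulative table.
  std::vector<double> cumulative;
  double total = 0;
  for (std::size_t i = 0; i < w.size(); ++i) {
    total += std::exp(w[i]);
    cumulative.push_back(total);
  }

  // Uniform draw in [0, total); the first bin it falls under is the sample.
  double r = (std::rand() / (RAND_MAX + 1.0)) * total;
  std::size_t chosen = 0;
  while (chosen + 1 < cumulative.size() && r >= cumulative[chosen]) ++chosen;
  std::cout << "chose successor " << chosen << "\n";
  return 0;
}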
@@ -676,17 +677,17 @@ void OutputSearchNode(long translationId, std::ostream &outputSearchGraphStream,
  else
    outputSearchGraphStream << " hyp=" << searchNode.hypo->GetId();
  outputSearchGraphStream << " stack=" << searchNode.hypo->GetWordsBitmap().GetNumWordsCovered()
                          << " back=" << prevHypo->GetId()
                          << " score=" << searchNode.hypo->GetScore()
                          << " transition=" << (searchNode.hypo->GetScore() - prevHypo->GetScore());
  if (searchNode.recombinationHypo != NULL)
    outputSearchGraphStream << " recombined=" << searchNode.recombinationHypo->GetId();
  outputSearchGraphStream << " forward=" << searchNode.forward << " fscore=" << searchNode.fscore
                          << " covered=" << searchNode.hypo->GetCurrSourceWordsRange().GetStartPos()
                          << "-" << searchNode.hypo->GetCurrSourceWordsRange().GetEndPos();
  // Modified so that -osgx is a superset of -osg (GST Oct 2011)
  ScoreComponentCollection scoreBreakdown = searchNode.hypo->GetScoreBreakdown();
@@ -694,10 +695,10 @@ void OutputSearchNode(long translationId, std::ostream &outputSearchGraphStream,
    outputSearchGraphStream << " scores=[ ";
    StaticData::Instance().GetScoreIndexManager().PrintLabeledScores( outputSearchGraphStream, scoreBreakdown );
    outputSearchGraphStream << " ]";
  outputSearchGraphStream << " out=" << searchNode.hypo->GetSourcePhraseStringRep() << "|" <<
    searchNode.hypo->GetCurrTargetPhrase().GetStringRep(outputFactorOrder) << endl;
  // outputSearchGraphStream << " out=" << searchNode.hypo->GetCurrTargetPhrase().GetStringRep(outputFactorOrder) << endl;
}
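For reference, a search-graph line assembled from the fields visible in the stream inserts above would look roughly like the following; every value here is invented, and only the fields shown in this excerpt are included.

// Prints one illustrative search-graph line; all values are made up.
#include <iostream>

int main() {
  std::cout << "hyp=12 stack=3 back=7 score=-4.21 transition=-1.05"
            << " recombined=15 forward=19 fscore=-9.87 covered=2-4"
            << " scores=[ lm=-2.3 tm=-1.9 ]"
            << " out=la maison|the house" << std::endl;
  return 0;
}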

View File

@@ -36,7 +36,7 @@ namespace PCN
typedef std::pair<std::pair<std::string, std::vector<float> >, size_t> CNAlt;
typedef std::vector<CNAlt> CNCol;
typedef std::vector<CNCol> CN;
/** Given a string ((('foo',0.1,1),('bar',0.9,2)),...) representation of a
 * word lattice in PCN format, return a CN object representing the lattice
 */
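A small sketch of how these typedefs compose: the confusion network is a vector of columns, and each column holds alternatives of (word, score vector, distance to the next column). The column contents below are invented, mirroring the docstring's example.

// Build and walk a one-column confusion network using the PCN typedefs.
#include <cstddef>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

typedef std::pair<std::pair<std::string, std::vector<float> >, size_t> CNAlt;
typedef std::vector<CNAlt> CNCol;
typedef std::vector<CNCol> CN;

int main() {
  // One column with two alternatives, like ((('foo',0.1,1),('bar',0.9,2))).
  CNCol col;
  col.push_back(CNAlt(std::make_pair("foo", std::vector<float>(1, 0.1f)), 1));
  col.push_back(CNAlt(std::make_pair("bar", std::vector<float>(1, 0.9f)), 2));
  CN lattice;
  lattice.push_back(col);

  for (size_t c = 0; c < lattice.size(); ++c)
    for (size_t a = 0; a < lattice[c].size(); ++a)
      std::cout << lattice[c][a].first.first << " -> next+"
                << lattice[c][a].second << "\n";
  return 0;
}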

View File

@@ -71,10 +71,10 @@ Parameter::Parameter()
  AddParam("report-all-factors-in-n-best", "Report all factors in n-best-lists. Default is false");
  AddParam("report-segmentation", "t", "report phrase segmentation in the output");
#ifdef HAVE_SYNLM
  AddParam("slmodel-file", "location of the syntactic language model file(s)");
  AddParam("weight-slm", "slm", "weight(s) for syntactic language model");
  AddParam("slmodel-factor", "factor to use with syntactic language model");
  AddParam("slmodel-beam", "beam width to use with syntactic language model's parser");
#endif
  AddParam("stack", "s", "maximum stack size for histogram pruning");
  AddParam("stack-diversity", "sd", "minimum number of hypothesis of each coverage in stack (default 0)");
@@ -277,14 +277,13 @@ bool Parameter::Validate()
  PARAM_MAP::const_iterator iterParams;
  for (iterParams = m_setting.begin(); iterParams != m_setting.end(); ++iterParams) {
    const std::string &key = iterParams->first;
    if (m_valid.find(key) == m_valid.end()) {
      UserMessage::Add("Unknown parameter " + key);
      noErrorFlag = false;
    }
  }
  // required parameters
  if (m_setting["ttable-file"].size() == 0) {
@@ -307,7 +306,7 @@ bool Parameter::Validate()
  }
  if (m_setting["lmodel-file"].size() * (m_setting.find("lmodel-oov-feature") != m_setting.end() ? 2 : 1)
      != m_setting["weight-l"].size()) {
    stringstream errorMsg("");
    errorMsg << "Config and parameters specify "
             << static_cast<int>(m_setting["lmodel-file"].size())
@@ -457,8 +456,7 @@ bool Parameter::ReadConfigFile(const string &filePath )
    if (line.size() == 0) {
      // blank line. do nothing.
    } else if (line[0]=='[') {
      // new parameter
      for (size_t currPos = 0 ; currPos < line.size() ; currPos++) {
        if (line[currPos] == ']') {
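The ReadConfigFile loop above follows the usual moses.ini convention: a [bracketed] line opens a new parameter, and the following non-blank lines are its values. A simplified stand-in for that loop, not the actual Parameter class:

// Parse a moses.ini-style config: [param] headers followed by value lines.
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>

int main() {
  std::istringstream config("[ttable-file]\nphrase-table.gz\n\n[stack]\n100\n");
  std::map<std::string, std::vector<std::string> > settings;
  std::string line, param;
  while (std::getline(config, line)) {
    if (line.empty()) continue;                   // blank line: do nothing
    if (line[0] == '[') {
      size_t close = line.find(']');              // new parameter header
      param = line.substr(1, close - 1);
      settings[param];                            // register even if no values
    } else if (!param.empty()) {
      settings[param].push_back(line);            // value for current param
    }
  }
  std::cout << settings["stack"][0] << "\n";      // prints 100
  return 0;
}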

View File

@@ -143,9 +143,9 @@ void Phrase::CreateFromString(const std::vector<FactorType> &factorOrder, const
  for (util::TokenIter<util::AnyCharacter, true> word_it(phraseString, util::AnyCharacter(" \t")); word_it; ++word_it) {
    Word &word = AddWord();
    size_t index = 0;
    for (util::TokenIter<util::MultiCharacter, false> factor_it(*word_it, util::MultiCharacter(factorDelimiter));
         factor_it && (index < factorOrder.size());
         ++factor_it, ++index) {
      word[factorOrder[index]] = factorCollection.AddFactor(*factor_it);
    }
    if (index != factorOrder.size()) {
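CreateFromString splits the input on whitespace into words, then splits each word on the factor delimiter into its factors. A toy equivalent using plain std::string in place of util::TokenIter, with an invented "word|POS" input:

// Factored-token parsing: whitespace separates words, "|" separates factors.
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

int main() {
  std::string phrase = "the|DT cat|NN";
  std::string delim = "|";
  std::istringstream words(phrase);
  std::string token;
  while (words >> token) {                          // split on whitespace
    std::vector<std::string> factors;
    std::string::size_type start = 0, pos;
    while ((pos = token.find(delim, start)) != std::string::npos) {
      factors.push_back(token.substr(start, pos - start));
      start = pos + delim.size();
    }
    factors.push_back(token.substr(start));         // trailing factor
    std::cout << "surface=" << factors[0] << " pos=" << factors[1] << "\n";
  }
  return 0;
}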

View File

@@ -61,7 +61,7 @@ public:
  /** Fills phrase with words from format string, typically from phrase table or sentence input
   * \param factorOrder factor types of each element in 2D string vector
   * \param phraseString formatted input string to parse
   * \param factorDelimiter delimiter between factors.
   */
  void CreateFromString(const std::vector<FactorType> &factorOrder, const StringPiece &phraseString, const StringPiece &factorDelimiter);

View File

@@ -136,7 +136,7 @@ PhraseDictionary* PhraseDictionaryFeature::LoadPhraseTable(const TranslationSyst
      m_filePath += ".gz";
      VERBOSE(2,"Using gzipped file" << std::endl);
    }
    PhraseDictionaryHiero* pdm = new PhraseDictionaryHiero(m_numScoreComponent,this);
    bool ret = pdm->Load(GetInput()
                         , GetOutput()
@@ -154,7 +154,7 @@ PhraseDictionary* PhraseDictionaryFeature::LoadPhraseTable(const TranslationSyst
      m_filePath += ".gz";
      VERBOSE(2,"Using gzipped file" << std::endl);
    }
    PhraseDictionaryALSuffixArray* pdm = new PhraseDictionaryALSuffixArray(m_numScoreComponent,this);
    bool ret = pdm->Load(GetInput()
                         , GetOutput()
@@ -255,18 +255,18 @@ PhraseDictionaryFeature::~PhraseDictionaryFeature()
std::string PhraseDictionaryFeature::GetScoreProducerDescription(unsigned idx) const
{
  if (idx < GetNumInputScores()) {
    return "InputScore";
  } else {
    return "PhraseModel";
  }
}
std::string PhraseDictionaryFeature::GetScoreProducerWeightShortName(unsigned idx) const
{
  if (idx < GetNumInputScores()) {
    return "I";
  } else {
    return "tm";
  }
}

View File

@@ -16,16 +16,16 @@
using namespace std;
namespace Moses
{
bool PhraseDictionaryALSuffixArray::Load(const std::vector<FactorType> &input
                                         , const std::vector<FactorType> &output
                                         , const std::string &filePath
                                         , const std::vector<float> &weight
                                         , size_t tableLimit
                                         , const LMList &languageModels
                                         , const WordPenaltyProducer* wpProducer)
{
  // file path is the directory of the rules for each sentence, NOT the file of all the rules
  m_filePath = filePath;
@@ -36,7 +36,7 @@ bool PhraseDictionaryALSuffixArray::Load(const std::vector<FactorType> &input
  m_languageModels = &languageModels;
  m_wpProducer = wpProducer;
  m_weight = &weight;
  return true;
}
@@ -44,20 +44,20 @@ void PhraseDictionaryALSuffixArray::InitializeForInput(InputType const& source)
{
  // clear out rules for previous sentence
  m_collection.Clear();
  // populate with rules for this sentence
  long translationId = source.GetTranslationId();
  string grammarFile = m_filePath + "/grammar.out." + SPrint(translationId);
  // data from file
  InputFileStream inFile(grammarFile);
  std::auto_ptr<RuleTableLoader> loader =
    RuleTableLoaderFactory::Create(grammarFile);
  bool ret = loader->Load(*m_input, *m_output, inFile, *m_weight, m_tableLimit,
                          *m_languageModels, m_wpProducer, *this);
  CHECK(ret);
}
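InitializeForInput above loads one grammar file per input sentence, keyed by the sentence's translation id under the configured directory. A minimal sketch of that path construction; the helper name is hypothetical:

// Build the per-sentence grammar path <dir>/grammar.out.<translationId>.
#include <iostream>
#include <sstream>
#include <string>

std::string grammarPath(const std::string &dir, long translationId) {
  std::ostringstream name;
  name << dir << "/grammar.out." << translationId;  // one file per sentence
  return name.str();
}

int main() {
  std::cout << grammarPath("rules", 7) << "\n";  // prints rules/grammar.out.7
  return 0;
}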

View File

@@ -11,13 +11,14 @@
#include "PhraseDictionarySCFG.h"
namespace Moses
{
class PhraseDictionaryALSuffixArray : public PhraseDictionarySCFG
{
public:
  PhraseDictionaryALSuffixArray(size_t numScoreComponent, PhraseDictionaryFeature* feature)
    : PhraseDictionarySCFG(numScoreComponent,feature) {}
  bool Load(const std::vector<FactorType> &input
            , const std::vector<FactorType> &output
@@ -34,9 +35,9 @@ protected:
  const LMList *m_languageModels;
  const WordPenaltyProducer *m_wpProducer;
  const std::vector<float> *m_weight;
};
}

View File

@@ -72,7 +72,7 @@ const TargetPhraseCollection *PhraseDictionaryDynSuffixArray::GetTargetPhraseCol
void PhraseDictionaryDynSuffixArray::insertSnt(string& source, string& target, string& alignment)
{
  m_biSA->addSntPair(source, target, alignment); // insert sentence pair into suffix arrays
  //StaticData::Instance().ClearTransOptionCache(); // clear translation option cache
}
void PhraseDictionaryDynSuffixArray::deleteSnt(unsigned /* idx */, unsigned /* num2Del */)
{

View File

@@ -15,30 +15,31 @@
using namespace std;
namespace Moses
{
bool PhraseDictionaryHiero::Load(const std::vector<FactorType> &input
                                 , const std::vector<FactorType> &output
                                 , const std::string &filePath
                                 , const std::vector<float> &weight
                                 , size_t tableLimit
                                 , const LMList &languageModels
                                 , const WordPenaltyProducer* wpProducer)
{
  m_filePath = filePath;
  m_tableLimit = tableLimit;
  // data from file
  InputFileStream inFile(filePath);
  std::auto_ptr<RuleTableLoader> loader =
    RuleTableLoaderFactory::Create(filePath);
  bool ret = loader->Load(input, output, inFile, weight, tableLimit,
                          languageModels, wpProducer, *this);
  return ret;
}
} // namespace

View File

@@ -11,13 +11,14 @@
#include "PhraseDictionarySCFG.h"
namespace Moses
{
class PhraseDictionaryHiero : public PhraseDictionarySCFG
{
public:
  PhraseDictionaryHiero(size_t numScoreComponent, PhraseDictionaryFeature* feature)
    : PhraseDictionarySCFG(numScoreComponent,feature) {}
  bool Load(const std::vector<FactorType> &input
            , const std::vector<FactorType> &output

Some files were not shown because too many files have changed in this diff.