uint -> size_t

Hieu Hoang 2011-12-12 19:13:32 +07:00
parent 9ec1bef6fb
commit 9861ecbbe5
191 changed files with 4496 additions and 4143 deletions
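The change itself is mechanical: the non-standard uint typedef is replaced by the standard size_t wherever container sizes and indices are handled (see the Data.cpp hunk below); most of the remaining hunks are whitespace and brace-style cleanups. A minimal sketch of why size_t is the safer choice, with illustrative values only:

// Sketch (not from this commit): size_t matches std::vector's size_type,
// so there is no narrowing or signed/unsigned mismatch on 64-bit
// platforms, where a 32-bit "uint" typedef could silently truncate.
#include <cstddef>
#include <vector>

int main()
{
    std::vector<double> feats(1000);
    size_t n = feats.size();        // vector<T>::size_type is size_t
    for (size_t i = 0; i < n; ++i)
        feats[i] = 0.0;
    return 0;
}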

View File

@ -38,22 +38,21 @@
typedef struct _cmd CMD;
struct _cmd {
CMD * next;
CMD * tail; /* valid on in head */
RULE * rule; /* rule->actions contains shell script */
LIST * shell; /* $(SHELL) value */
LOL args; /* LISTs for $(<), $(>) */
char * buf; /* actual commands */
};
CMD * cmd_new
(
RULE * rule, /* rule (referenced) */
LIST * targets, /* $(<) (freed) */
LIST * sources, /* $(>) (freed) */
LIST * shell /* $(SHELL) (freed) */
);
void cmd_free( CMD * );
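The comments in the cmd_new() signature encode an ownership contract: the rule is only referenced, while the three lists pass to the command and are released with it. A hedged usage sketch (the rule/targets/sources/shell variables are placeholders, not part of this header):

/* Sketch only: assumes rule, targets, sources and shell already exist. */
CMD * cmd = cmd_new( rule,      /* referenced: caller keeps ownership */
                     targets,   /* freed: ownership passes to the CMD */
                     sources,   /* freed */
                     shell );   /* freed */
/* ... hand cmd->buf to the shell ... */
cmd_free( cmd );                /* releases the CMD and the lists it took */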

View File

@ -10,35 +10,33 @@
#include <time.h>
struct profile_info {
/* name of rule being called */
char* name;
/* cumulative time spent in rule */
clock_t cumulative;
/* time spent in rule proper */
clock_t net;
/* number of time rule was entered */
unsigned long num_entries;
/* number of the times this function is present in stack */
unsigned long stack_count;
/* bytes of memory allocated by the call */
unsigned long memory;
};
typedef struct profile_info profile_info;
struct profile_frame {
/* permanent storage where data accumulates */
profile_info* info;
/* overhead for profiling in this call */
clock_t overhead;
/* time of last entry to rule */
clock_t entry_time;
/* stack frame of caller */
struct profile_frame* caller;
/* time spent in subrules */
clock_t subrules;
};
typedef struct profile_frame profile_frame;

View File

@ -18,22 +18,21 @@
#include <time.h>
typedef struct timing_info {
double system;
double user;
time_t start;
time_t end;
} timing_info;
void exec_cmd
(
char * string,
void (* func)( void * closure, int status, timing_info *, char *, char * ),
void * closure,
LIST * shell,
char * action,
char * target
);
int exec_wait();
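exec_cmd() reports completion through its callback parameter; here is a sketch of a conforming callback matching the prototype above (the names are hypothetical):

/* Hypothetical callback; only the signature comes from this header. */
static void on_cmd_done( void * closure, int status, timing_info * t,
                         char * cmd_stdout, char * cmd_stderr )
{
    /* status is the exit status; t->user and t->system hold CPU times. */
}

/* Usage sketch:
 *   exec_cmd( cmd_string, on_cmd_done, my_closure, shell, action, target );
 *   while ( exec_wait() )
 *       ;   (reap commands until none remain outstanding)
 */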

View File

@ -33,14 +33,13 @@ int file_is_file(char* filename);
int file_mkdir(char *pathname);
typedef struct file_info_t file_info_t ;
struct file_info_t {
char * name;
short is_file;
short is_dir;
unsigned long size;
time_t time;
LIST * files;
};

View File

@ -12,15 +12,14 @@
typedef struct _PARSE PARSE;
typedef struct frame FRAME;
struct frame {
FRAME * prev;
/* The nearest enclosing frame for which module->user_module is true. */
FRAME * prev_user;
LOL args[ 1 ];
module_t * module;
PARSE * procedure;
char * rulename;
};

View File

@ -91,7 +91,7 @@
#include <ctype.h>
#include <malloc.h>
#ifndef __MWERKS__
#include <memory.h>
#endif
#include <signal.h>
#include <string.h>
@ -113,17 +113,17 @@
/* AS400 cross-compile from NT. */
#ifdef AS400
#undef OSMINOR
#undef OSMAJOR
#define OSMAJOR "AS400=true"
#define OSMINOR "OS=AS400"
#define OS_AS400
#endif
/* Metrowerks Standard Library on Windows. */
#ifdef __MSL__
#undef HAVE_POPEN
#endif
# endif
@ -182,7 +182,7 @@
#define DOWNSHIFT_PATHS
#ifdef __EMX__
#define USE_FILEUNIX
#endif
#endif
@ -218,181 +218,181 @@
#define PATH_DELIM '/'
#ifdef _AIX
#define unix
#define MAXLINE 23552 /* 24k - 1k, longest 'together' actions */
#define OSMINOR "OS=AIX"
#define OS_AIX
#define NO_VFORK
#endif
#ifdef AMIGA
#define OSMINOR "OS=AMIGA"
#define OS_AMIGA
#endif
#ifdef __BEOS__
#define unix
#define OSMINOR "OS=BEOS"
#define OS_BEOS
#define NO_VFORK
#endif
#ifdef __bsdi__
#define OSMINOR "OS=BSDI"
#define OS_BSDI
#endif
#if defined (COHERENT) && defined (_I386)
#define OSMINOR "OS=COHERENT"
#define OS_COHERENT
#define NO_VFORK
#endif
#if defined(__cygwin__) || defined(__CYGWIN__)
#define OSMINOR "OS=CYGWIN"
#define OS_CYGWIN
#endif
#if defined(__FreeBSD__) && !defined(__DragonFly__)
#define OSMINOR "OS=FREEBSD"
#define OS_FREEBSD
#endif
#ifdef __DragonFly__
#define OSMINOR "OS=DRAGONFLYBSD"
#define OS_DRAGONFLYBSD
#endif
#ifdef __DGUX__
#define OSMINOR "OS=DGUX"
#define OS_DGUX
#endif
#ifdef __hpux
#define OSMINOR "OS=HPUX"
#define OS_HPUX
#endif
#ifdef __OPENNT
#define unix
#define OSMINOR "OS=INTERIX"
#define OS_INTERIX
#define NO_VFORK
#endif
#ifdef __sgi
#define OSMINOR "OS=IRIX"
#define OS_IRIX
#define NO_VFORK
#endif
#ifdef __ISC
#define OSMINOR "OS=ISC"
#define OS_ISC
#define NO_VFORK
#endif
#ifdef linux
#define OSMINOR "OS=LINUX"
#define OS_LINUX
#endif
#ifdef __Lynx__
#define OSMINOR "OS=LYNX"
#define OS_LYNX
#define NO_VFORK
#define unix
#endif
#ifdef __MACHTEN__
#define OSMINOR "OS=MACHTEN"
#define OS_MACHTEN
#endif
#ifdef mpeix
#define unix
#define OSMINOR "OS=MPEIX"
#define OS_MPEIX
#define NO_VFORK
#endif
#ifdef __MVS__
#define unix
#define OSMINOR "OS=MVS"
#define OS_MVS
#endif
#ifdef _ATT4
#define OSMINOR "OS=NCR"
#define OS_NCR
#endif
#ifdef __NetBSD__
#define unix
#define OSMINOR "OS=NETBSD"
#define OS_NETBSD
#define NO_VFORK
#endif
#ifdef __QNX__
#define unix
#ifdef __QNXNTO__
#define OSMINOR "OS=QNXNTO"
#define OS_QNXNTO
#else
#define OSMINOR "OS=QNX"
#define OS_QNX
#define NO_VFORK
#define MAXLINE 996
#endif
#endif
#ifdef NeXT
#ifdef __APPLE__
#define OSMINOR "OS=RHAPSODY"
#define OS_RHAPSODY
#else
#define OSMINOR "OS=NEXT"
#define OS_NEXT
#endif
#endif
#ifdef __APPLE__
#define unix
#define OSMINOR "OS=MACOSX"
#define OS_MACOSX
#endif
#ifdef __osf__
#ifndef unix
#define unix
#endif
#define OSMINOR "OS=OSF"
#define OS_OSF
#endif
#ifdef _SEQUENT_
#define OSMINOR "OS=PTX"
#define OS_PTX
#endif
#ifdef M_XENIX
#define OSMINOR "OS=SCO"
#define OS_SCO
#define NO_VFORK
#endif
#ifdef sinix
#define unix
#define OSMINOR "OS=SINIX"
#define OS_SINIX
#endif
#ifdef sun
#if defined(__svr4__) || defined(__SVR4)
#define OSMINOR "OS=SOLARIS"
#define OS_SOLARIS
#else
#define OSMINOR "OS=SUNOS"
#define OS_SUNOS
#endif
#endif
#ifdef ultrix
#define OSMINOR "OS=ULTRIX"
#define OS_ULTRIX
#endif
#ifdef _UNICOS
#define OSMINOR "OS=UNICOS"
#define OS_UNICOS
#endif
#if defined(__USLC__) && !defined(M_XENIX)
#define OSMINOR "OS=UNIXWARE"
#define OS_UNIXWARE
#endif
#ifdef __OpenBSD__
#define OSMINOR "OS=OPENBSD"
#define OS_OPENBSD
#define unix
#endif
#if defined (__FreeBSD_kernel__) && !defined(__FreeBSD__)
#define OSMINOR "OS=KFREEBSD"
#define OS_KFREEBSD
#endif
#ifndef OSMINOR
#define OSMINOR "OS=UNKNOWN"
#endif
/* All the UNIX includes */
@ -401,7 +401,7 @@
#include <sys/stat.h>
#ifndef OS_MPEIX
#include <sys/file.h>
#endif
#include <fcntl.h>
@ -413,11 +413,11 @@
#include <unistd.h>
#ifndef OS_QNX
#include <memory.h>
#endif
#ifndef OS_ULTRIX
#include <stdlib.h>
#endif
#if !defined( OS_BSDI ) && \
@ -429,7 +429,7 @@
!defined( OS_RHAPSODY ) && \
!defined( OS_MVS ) && \
!defined( OS_OPENBSD )
#include <malloc.h>
#endif
#endif
@ -443,57 +443,57 @@
defined( ppc ) || \
defined( __powerpc__ ) || \
defined( __ppc__ )
#define OSPLAT "OSPLAT=PPC"
#endif
#if defined( _ALPHA_ ) || \
defined( __alpha__ )
#define OSPLAT "OSPLAT=AXP"
#endif
#if defined( _i386_ ) || \
defined( __i386__ ) || \
defined( __i386 ) || \
defined( _M_IX86 )
#define OSPLAT "OSPLAT=X86"
#endif
#if defined( __ia64__ ) || \
defined( __IA64__ ) || \
defined( __ia64 )
#define OSPLAT "OSPLAT=IA64"
#endif
#if defined( __x86_64__ ) || \
defined( __amd64__ ) || \
defined( _M_AMD64 )
#define OSPLAT "OSPLAT=X86_64"
#endif
#if defined( __sparc__ ) || \
defined( __sparc )
#define OSPLAT "OSPLAT=SPARC"
#endif
#ifdef __mips__
#define OSPLAT "OSPLAT=MIPS"
#endif
#ifdef __arm__
#define OSPLAT "OSPLAT=ARM"
#endif
#ifdef __s390__
#define OSPLAT "OSPLAT=390"
#endif
#ifdef __hppa
#define OSPLAT "OSPLAT=PARISC"
#endif
#ifndef OSPLAT
#define OSPLAT ""
#endif
/*
@ -501,16 +501,16 @@
*/
#ifndef MAXLINE
#define MAXLINE 102400 /* longest 'together' actions' */
#endif
#ifndef EXITOK
#define EXITOK 0
#define EXITBAD 1
#endif
#ifndef SPLITPATH
#define SPLITPATH ':'
#endif
/* You probably do not need to muck with these. */
@ -526,19 +526,18 @@
#define DEBUG_MAX 14
struct globs {
int noexec;
int jobs;
int quitquick;
int newestfirst; /* build newest sources first */
int pipe_action;
char debug[ DEBUG_MAX ];
FILE * cmdout; /* print cmds, not run them */
long timeout; /* number of seconds to limit actions to,
* default 0 for no limit.
*/
int dart; /* output build and test results formatted for Dart */
};
extern struct globs globs;

View File

@ -26,56 +26,56 @@
/* Tokens. */
#ifndef YYTOKENTYPE
# define YYTOKENTYPE
/* Put the tokens into the symbol table, so that GDB and other debuggers
know about them. */
enum yytokentype {
_BANG_t = 258,
_BANG_EQUALS_t = 259,
_AMPER_t = 260,
_AMPERAMPER_t = 261,
_LPAREN_t = 262,
_RPAREN_t = 263,
_PLUS_EQUALS_t = 264,
_COLON_t = 265,
_SEMIC_t = 266,
_LANGLE_t = 267,
_LANGLE_EQUALS_t = 268,
_EQUALS_t = 269,
_RANGLE_t = 270,
_RANGLE_EQUALS_t = 271,
_QUESTION_EQUALS_t = 272,
_LBRACKET_t = 273,
_RBRACKET_t = 274,
ACTIONS_t = 275,
BIND_t = 276,
CASE_t = 277,
CLASS_t = 278,
DEFAULT_t = 279,
ELSE_t = 280,
EXISTING_t = 281,
FOR_t = 282,
IF_t = 283,
IGNORE_t = 284,
IN_t = 285,
INCLUDE_t = 286,
LOCAL_t = 287,
MODULE_t = 288,
ON_t = 289,
PIECEMEAL_t = 290,
QUIETLY_t = 291,
RETURN_t = 292,
RULE_t = 293,
SWITCH_t = 294,
TOGETHER_t = 295,
UPDATED_t = 296,
WHILE_t = 297,
_LBRACE_t = 298,
_BAR_t = 299,
_BARBAR_t = 300,
_RBRACE_t = 301,
ARG = 302,
STRING = 303
};
#endif
#define _BANG_t 258
#define _BANG_EQUALS_t 259

View File

@ -1,44 +1,44 @@
{ "!", _BANG_t },
{ "!=", _BANG_EQUALS_t },
{ "&", _AMPER_t },
{ "&&", _AMPERAMPER_t },
{ "(", _LPAREN_t },
{ ")", _RPAREN_t },
{ "+=", _PLUS_EQUALS_t },
{ ":", _COLON_t },
{ ";", _SEMIC_t },
{ "<", _LANGLE_t },
{ "<=", _LANGLE_EQUALS_t },
{ "=", _EQUALS_t },
{ ">", _RANGLE_t },
{ ">=", _RANGLE_EQUALS_t },
{ "?=", _QUESTION_EQUALS_t },
{ "[", _LBRACKET_t },
{ "]", _RBRACKET_t },
{ "actions", ACTIONS_t },
{ "bind", BIND_t },
{ "case", CASE_t },
{ "class", CLASS_t },
{ "default", DEFAULT_t },
{ "else", ELSE_t },
{ "existing", EXISTING_t },
{ "for", FOR_t },
{ "if", IF_t },
{ "ignore", IGNORE_t },
{ "in", IN_t },
{ "include", INCLUDE_t },
{ "local", LOCAL_t },
{ "module", MODULE_t },
{ "on", ON_t },
{ "piecemeal", PIECEMEAL_t },
{ "quietly", QUIETLY_t },
{ "return", RETURN_t },
{ "rule", RULE_t },
{ "switch", SWITCH_t },
{ "together", TOGETHER_t },
{ "updated", UPDATED_t },
{ "while", WHILE_t },
{ "{", _LBRACE_t },
{ "|", _BAR_t },
{ "||", _BARBAR_t },
{ "}", _RBRACE_t },

View File

@ -56,9 +56,9 @@
typedef struct _list LIST;
struct _list {
LIST *next;
LIST *tail; /* only valid in head node */
char *string; /* private copy */
};
/*
@ -70,8 +70,8 @@ typedef struct _lol LOL;
# define LOL_MAX 19
struct _lol {
int count;
LIST *list[ LOL_MAX ];
};
LIST * list_append( LIST *l, LIST *nl );

View File

@ -14,12 +14,12 @@ int make( int n_targets, const char **targets, int anyhow );
int make1( TARGET *t );
typedef struct {
int temp;
int updating;
int cantfind;
int cantmake;
int targets;
int made;
} COUNTS ;

View File

@ -65,24 +65,24 @@ typedef unsigned int md5_word_t; /* 32-bit word */
/* Define the state of the MD5 Algorithm. */
typedef struct md5_state_s {
md5_word_t count[2]; /* message length in bits, lsw first */
md5_word_t abcd[4]; /* digest buffer */
md5_byte_t buf[64]; /* accumulate block */
} md5_state_t;
#ifdef __cplusplus
extern "C"
{
#endif
/* Initialize the algorithm. */
void md5_init(md5_state_t *pms);
/* Append a string to the message. */
void md5_append(md5_state_t *pms, const md5_byte_t *data, int nbytes);
/* Finish the message and return the digest. */
void md5_finish(md5_state_t *pms, md5_byte_t digest[16]);
#ifdef __cplusplus
} /* end extern "C" */
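The three declarations follow the usual init/append/finish hashing pattern; a short usage sketch (the digest_of helper is hypothetical):

/* Hypothetical helper built on the interface declared above. */
#include <string.h>

static void digest_of( const char * msg, md5_byte_t digest[16] )
{
    md5_state_t state;
    md5_init( &state );
    md5_append( &state, (const md5_byte_t *)msg, (int)strlen( msg ) );
    md5_finish( &state, digest );
}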

View File

@ -11,122 +11,122 @@ http://www.boost.org/LICENSE_1_0.txt)
#ifdef OPT_BOEHM_GC
/* Use Boehm GC memory allocator. */
#include <gc.h>
#define bjam_malloc_x(s) memset(GC_malloc(s),0,s)
#define bjam_malloc_atomic_x(s) memset(GC_malloc_atomic(s),0,s)
#define bjam_calloc_x(n,s) memset(GC_malloc((n)*(s)),0,(n)*(s))
#define bjam_calloc_atomic_x(n,s) memset(GC_malloc_atomic((n)*(s)),0,(n)*(s))
#define bjam_realloc_x(p,s) GC_realloc(p,s)
#define bjam_free_x(p) GC_free(p)
#define bjam_mem_init_x() GC_init(); GC_enable_incremental()
#define bjam_malloc_raw_x(s) malloc(s)
#define bjam_calloc_raw_x(n,s) calloc(n,s)
#define bjam_realloc_raw_x(p,s) realloc(p,s)
#define bjam_free_raw_x(p) free(p)
#ifndef BJAM_NEWSTR_NO_ALLOCATE
#define BJAM_NEWSTR_NO_ALLOCATE
#endif
#elif defined(OPT_DUMA)
/* Use Duma memory debugging library. */
#include <stdlib.h>
#define _DUMA_CONFIG_H_
#define DUMA_NO_GLOBAL_MALLOC_FREE
#define DUMA_EXPLICIT_INIT
#define DUMA_NO_THREAD_SAFETY
#define DUMA_NO_CPP_SUPPORT
/* #define DUMA_NO_LEAKDETECTION */
/* #define DUMA_USE_FRAMENO */
/* #define DUMA_PREFER_ATEXIT */
/* #define DUMA_OLD_DEL_MACRO */
/* #define DUMA_NO_HANG_MSG */
#define DUMA_PAGE_SIZE 4096
#define DUMA_MIN_ALIGNMENT 1
/* #define DUMA_GNU_INIT_ATTR 0 */
typedef unsigned int DUMA_ADDR;
typedef unsigned int DUMA_SIZE;
#include <duma.h>
#define bjam_malloc_x(s) malloc(s)
#define bjam_calloc_x(n,s) calloc(n,s)
#define bjam_realloc_x(p,s) realloc(p,s)
#define bjam_free_x(p) free(p)
#ifndef BJAM_NEWSTR_NO_ALLOCATE
#define BJAM_NEWSTR_NO_ALLOCATE
#endif
#else
/* Standard C memory allocation. */
#define bjam_malloc_x(s) malloc(s)
#define bjam_calloc_x(n,s) calloc(n,s)
#define bjam_realloc_x(p,s) realloc(p,s)
#define bjam_free_x(p) free(p)
#endif
#ifndef bjam_malloc_atomic_x
#define bjam_malloc_atomic_x(s) bjam_malloc_x(s)
#endif
#ifndef bjam_calloc_atomic_x
#define bjam_calloc_atomic_x(n,s) bjam_calloc_x(n,s)
#endif
#ifndef bjam_mem_init_x
#define bjam_mem_init_x()
#endif
#ifndef bjam_mem_close_x
#define bjam_mem_close_x()
#endif
#ifndef bjam_malloc_raw_x
#define bjam_malloc_raw_x(s) bjam_malloc_x(s)
#endif
#ifndef bjam_calloc_raw_x
#define bjam_calloc_raw_x(n,s) bjam_calloc_x(n,s)
#endif
#ifndef bjam_realloc_raw_x
#define bjam_realloc_raw_x(p,s) bjam_realloc_x(p,s)
#endif
#ifndef bjam_free_raw_x
#define bjam_free_raw_x(p) bjam_free_x(p)
#endif
#ifdef OPT_DEBUG_PROFILE
/* Profile tracing of memory allocations. */
#define BJAM_MALLOC(s) (profile_memory(s), bjam_malloc_x(s))
#define BJAM_MALLOC_ATOMIC(s) (profile_memory(s), bjam_malloc_atomic_x(s))
#define BJAM_CALLOC(n,s) (profile_memory(n*s), bjam_calloc_x(n,s))
#define BJAM_CALLOC_ATOMIC(n,s) (profile_memory(n*s), bjam_calloc_atomic_x(n,s))
#define BJAM_REALLOC(p,s) (profile_memory(s), bjam_realloc_x(p,s))
#define BJAM_FREE(p) bjam_free_x(p)
#define BJAM_MEM_INIT() bjam_mem_init_x()
#define BJAM_MEM_CLOSE() bjam_mem_close_x()
#define BJAM_MALLOC_RAW(s) (profile_memory(s), bjam_malloc_raw_x(s))
#define BJAM_CALLOC_RAW(n,s) (profile_memory(n*s), bjam_calloc_raw_x(n,s))
#define BJAM_REALLOC_RAW(p,s) (profile_memory(s), bjam_realloc_raw_x(p,s))
#define BJAM_FREE_RAW(p) bjam_free_raw_x(p)
#else
/* No mem tracing. */
#define BJAM_MALLOC(s) bjam_malloc_x(s)
#define BJAM_MALLOC_ATOMIC(s) bjam_malloc_atomic_x(s)
#define BJAM_CALLOC(n,s) bjam_calloc_x(n,s)
#define BJAM_CALLOC_ATOMIC(n,s) bjam_calloc_atomic_x(n,s)
#define BJAM_REALLOC(p,s) bjam_realloc_x(p,s)
#define BJAM_FREE(p) bjam_free_x(p)
#define BJAM_MEM_INIT() bjam_mem_init_x()
#define BJAM_MEM_CLOSE() bjam_mem_close_x()
#define BJAM_MALLOC_RAW(s) bjam_malloc_raw_x(s)
#define BJAM_CALLOC_RAW(n,s) bjam_calloc_raw_x(n,s)
#define BJAM_REALLOC_RAW(p,s) bjam_realloc_raw_x(p,s)
#define BJAM_FREE_RAW(p) bjam_free_raw_x(p)
#endif
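Client code is expected to allocate through the BJAM_* wrappers so the Boehm GC, Duma, and profiling variants above can be swapped in at build time; a sketch (buffer size arbitrary):

/* Sketch: allocation through the wrappers defined above. */
char * buf = (char *)BJAM_MALLOC( 256 );  /* profiled / GC'd as configured */
if ( buf )
{
    /* ... use buf ... */
    BJAM_FREE( buf );
}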

View File

@ -8,15 +8,14 @@
#include "lists.h"
struct module_t {
char* name;
struct hash* rules;
struct hash* variables;
struct hash* imported_modules;
struct module_t* class_module;
struct hash* native_rules;
int user_module;
};
typedef struct module_t module_t ; /* MSVC debugger gets confused unless this is provided */

View File

@ -7,20 +7,19 @@
#include "rules.h"
struct native_rule_t {
char* name;
argument_list* arguments;
PARSE* procedure;
/* Version of the interface that the native rule provides.
It's possible that we want to change the set parameter
for existing native rule. In that case, version number
should be incremented so that Boost.Build can check for
version it relies on.
Versions are numbered from 1.
*/
int version;
};
/* MSVC debugger gets confused unless this is provided */

View File

@ -11,10 +11,9 @@
* \ -) "Command line option."
*/
typedef struct bjam_option {
char flag; /* filled in by getoption() */
char *val; /* set to random address if true */
} bjam_option;
# define N_OPTS 256

View File

@ -14,13 +14,13 @@
#define EXIT_TIMEOUT 2
void out_action(
const char * action,
const char * target,
const char * command,
const char * out_data,
const char * err_data,
int exit_reason
);
char * outf_int( int value );
char * outf_double( double value );

View File

@ -26,31 +26,31 @@
*/
struct _PARSE {
LIST * (* func)( PARSE *, FRAME * );
PARSE * left;
PARSE * right;
PARSE * third;
char * string;
char * string1;
int num;
int refs;
/* module * module; */
char * rulename;
char * file;
int line;
};
void parse_file( char *, FRAME * );
void parse_save( PARSE * );
PARSE * parse_make(
LIST * (* func)( PARSE *, FRAME * ),
PARSE * left,
PARSE * right,
PARSE * third,
char * string,
char * string1,
int num );
void parse_refer ( PARSE * );
void parse_free ( PARSE * );

View File

@ -28,17 +28,15 @@
typedef struct _pathname PATHNAME;
typedef struct _pathpart PATHPART;
struct _pathpart {
char * ptr;
int len;
};
struct _pathname {
PATHPART part[6];
#ifdef OS_VMS
int parent;
#endif
#define f_grist part[0]

View File

@ -9,13 +9,13 @@
#define NSUBEXP 10
typedef struct regexp {
char *startp[NSUBEXP];
char *endp[NSUBEXP];
char regstart; /* Internal use only. */
char reganch; /* Internal use only. */
char *regmust; /* Internal use only. */
int regmlen; /* Internal use only. */
char program[1]; /* Unwarranted chumminess with compiler. */
} regexp;
regexp *regcomp( char *exp );
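regcomp() is the only entry point visible in this hunk; a usage sketch, where regexec() is assumed from the wider Spencer-style regexp API rather than taken from this diff:

/* Sketch; regexec() is an assumption, only regcomp() is declared above. */
regexp * re = regcomp( "a.*b" );
if ( re && regexec( re, "a jam b" ) )
{
    /* re->startp[0] and re->endp[0] bracket the overall match */
}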

View File

@ -53,19 +53,17 @@ typedef struct _settings SETTINGS ;
/* RULE - a generic jam rule, the product of RULE and ACTIONS. */
/* A rule's argument list. */
struct argument_list {
int reference_count;
LOL data[1];
};
/* Build actions corresponding to a rule. */
struct rule_actions {
int reference_count;
char * command; /* command string from ACTIONS */
LIST * bindlist;
int flags; /* modifiers on ACTIONS */
#define RULE_NEWSRCS 0x01 /* $(>) is updated sources only */
#define RULE_TOGETHER 0x02 /* combine actions on single target */
@ -78,67 +76,61 @@ struct rule_actions
typedef struct rule_actions rule_actions;
typedef struct argument_list argument_list;
struct _rule {
char * name;
PARSE * procedure; /* parse tree from RULE */
argument_list * arguments; /* argument checking info, or NULL for unchecked
*/
rule_actions * actions; /* build actions, or NULL for no actions */
module_t * module; /* module in which this rule is executed */
int exported; /* nonzero if this rule is supposed to appear in
* the global module and be automatically
* imported into other modules
*/
#ifdef HAVE_PYTHON
PyObject * python_function;
#endif
};
/* ACTIONS - a chain of ACTIONs. */
struct _actions {
ACTIONS * next;
ACTIONS * tail; /* valid only for head */
ACTION * action;
};
/* ACTION - a RULE instance with targets and sources. */
struct _action {
RULE * rule;
TARGETS * targets;
TARGETS * sources; /* aka $(>) */
char running; /* has been started */
char status; /* see TARGET status */
};
/* SETTINGS - variables to set when executing a TARGET's ACTIONS. */
struct _settings {
SETTINGS * next;
char * symbol; /* symbol name for var_set() */
LIST * value; /* symbol value for var_set() */
int multiple;
};
/* TARGETS - a chain of TARGETs. */
struct _targets {
TARGETS * next;
TARGETS * tail; /* valid only for head */
TARGET * target;
};
/* TARGET - an entity (e.g. a file) that can be built. */
struct _target {
char * name;
char * boundname; /* if search() relocates target */
ACTIONS * actions; /* rules to execute, if any */
SETTINGS * settings; /* variables to define */
short flags; /* status info */
#define T_FLAG_TEMP 0x0001 /* TEMPORARY applied */
#define T_FLAG_NOCARE 0x0002 /* NOCARE applied */
@ -148,28 +140,28 @@ struct _target
#define T_FLAG_NOUPDATE 0x0020 /* NOUPDATE applied */
#define T_FLAG_VISITED 0x0040 /* CWM: Used in debugging */
/* This flag has been added to support a new built-in rule named "RMBAD". It is
* used to force removal of outdated targets whose dependencies fail to build.
*/
#define T_FLAG_RMOLD 0x0080 /* RMBAD applied */
/* This flag was added to support a new built-in rule named "FAIL_EXPECTED" used
* to indicate that the result of running a given action should be inverted,
* i.e. ok <=> fail. This is useful for launching certain test runs from a
* Jamfile.
*/
#define T_FLAG_FAIL_EXPECTED 0x0100 /* FAIL_EXPECTED applied */
#define T_FLAG_INTERNAL 0x0200 /* internal INCLUDES node */
/* Indicates that the target must be a file. This prevents matching non-files,
* like directories, when a target is searched.
*/
#define T_FLAG_ISFILE 0x0400
#define T_FLAG_PRECIOUS 0x0800
char binding; /* how target relates to a real file or
* folder
*/
@ -178,32 +170,32 @@ struct _target
#define T_BIND_PARENTS 2 /* using parent's timestamp */
#define T_BIND_EXISTS 3 /* real file, timestamp valid */
TARGETS * depends; /* dependencies */
TARGETS * dependants; /* the inverse of dependencies */
TARGETS * rebuilds; /* targets that should be force-rebuilt
* whenever this one is
*/
TARGET * includes; /* internal includes node */
TARGET * original_target; /* original_target->includes = this */
char rescanned;
time_t time; /* update time */
time_t leaf; /* update time of leaf sources */
char fate; /* make0()'s diagnosis */
#define T_FATE_INIT 0 /* nothing done to target */
#define T_FATE_MAKING 1 /* make0(target) on stack */
#define T_FATE_STABLE 2 /* target did not need updating */
#define T_FATE_NEWER 3 /* target newer than parent */
#define T_FATE_SPOIL 4 /* >= SPOIL rebuilds parents */
#define T_FATE_ISTMP 4 /* unneeded temp target oddly present */
#define T_FATE_BUILD 5 /* >= BUILD rebuilds target */
#define T_FATE_TOUCHED 5 /* manually touched with -t */
#define T_FATE_REBUILD 6
#define T_FATE_MISSING 7 /* is missing, needs updating */
#define T_FATE_NEEDTMP 8 /* missing temp that must be rebuild */
#define T_FATE_OUTDATED 9 /* is out of date, needs updating */
@ -213,7 +205,7 @@ struct _target
#define T_FATE_CANTFIND 11 /* no rules to make missing target */
#define T_FATE_CANTMAKE 12 /* can not find dependencies */
char progress; /* tracks make1() progress */
#define T_MAKE_INIT 0 /* make1(target) not yet called */
#define T_MAKE_ONSTACK 1 /* make1(target) on stack */
@ -222,20 +214,20 @@ struct _target
#define T_MAKE_DONE 4 /* make1(target) done */
#ifdef OPT_SEMAPHORE
#define T_MAKE_SEMAPHORE 5 /* Special target type for semaphores */
#endif
#ifdef OPT_SEMAPHORE
TARGET * semaphore; /* used in serialization */
#endif
char status; /* exec_cmd() result */
int asynccnt; /* child deps outstanding */
TARGETS * parents; /* used by make1() for completion */
char * cmds; /* type-punned command list */
char * failed;
};

View File

@ -29,15 +29,14 @@
#define YYSTYPE YYSYMBOL
typedef struct _YYSTYPE {
int type;
char * string;
PARSE * parse;
LIST * list;
int number;
char * file;
int line;
} YYSTYPE;
extern YYSTYPE yylval;

View File

@ -7,14 +7,13 @@
# include <stddef.h>
typedef struct string {
char* value;
unsigned long size;
unsigned long capacity;
char opt[32];
#ifndef NDEBUG
char magic[4];
#endif
} string;

View File

@ -50,10 +50,10 @@ Data::~Data() {
//ADDED BY TS
void Data::remove_duplicates() {
size_t nSentences = featdata->size();
assert(scoredata->size() == nSentences);
for (size_t s=0; s < nSentences; s++) {
FeatureArray& feat_array = featdata->get(s);
ScoreArray& score_array = scoredata->get(s);
@ -61,29 +61,29 @@ void Data::remove_duplicates() {
assert(feat_array.size() == score_array.size());
//serves as a hash-map:
std::map<double, std::vector<size_t> > lookup;
size_t end_pos = feat_array.size() - 1;
size_t nRemoved = 0;
for (size_t k=0; k <= end_pos; k++) {
const FeatureStats& cur_feats = feat_array.get(k);
double sum = 0.0;
for (size_t l=0; l < cur_feats.size(); l++)
sum += cur_feats.get(l);
if (lookup.find(sum) != lookup.end()) {
//std::cerr << "hit" << std::endl;
std::vector<size_t>& cur_list = lookup[sum];
size_t l=0;
for (l=0; l < cur_list.size(); l++) {
size_t j=cur_list[l];
if (cur_feats == feat_array.get(j)
&& score_array.get(k) == score_array.get(j)) {
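Beyond the uint -> size_t change, this hunk shows the deduplication strategy: the sum of a candidate's feature values serves as a cheap hash key, and only candidates whose sums collide are compared in full. A standalone sketch of the same idea (types simplified and hypothetical; the real code walks FeatureArray/ScoreArray):

#include <map>
#include <vector>

typedef std::vector<double> Feats;

// True if cur (whose feature sum is sum) duplicates an earlier row.
static bool is_duplicate(std::map<double, std::vector<size_t> >& lookup,
                         const std::vector<Feats>& rows,
                         const Feats& cur, double sum)
{
    std::vector<size_t>& bucket = lookup[sum];  // rows sharing this sum
    for (size_t l = 0; l < bucket.size(); ++l)
        if (rows[bucket[l]] == cur)             // full check on collision
            return true;
    bucket.push_back(rows.size());              // caller appends cur next
    return false;
}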

View File

@ -129,7 +129,8 @@ IOWrapper::~IOWrapper()
delete m_singleBestOutputCollector;
}
void IOWrapper::ResetTranslationId()
{
m_translationId = StaticData::Instance().GetStartTranslationId();
}
@ -369,18 +370,18 @@ void IOWrapper::OutputNBestList(const ChartTrellisPathList &nBestList, const Cha
if (pds.size() > 0) {
for( size_t i=0; i<pds.size(); i++ ) {
size_t pd_numinputscore = pds[i]->GetNumInputScores();
vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( pds[i] );
for (size_t j = 0; j<scores.size(); ++j) {
if (labeledOutput && (i == 0) ) {
if ((j == 0) || (j == pd_numinputscore)) {
lastName = pds[i]->GetScoreProducerWeightShortName(j);
out << " " << lastName << ":";
}
}
out << " " << scores[j];
}
}
}
@ -394,18 +395,18 @@ void IOWrapper::OutputNBestList(const ChartTrellisPathList &nBestList, const Cha
if (gds.size() > 0) {
for( size_t i=0; i<gds.size(); i++ ) {
size_t pd_numinputscore = gds[i]->GetNumInputScores();
vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( gds[i] );
for (size_t j = 0; j<scores.size(); ++j) {
if (labeledOutput && (i == 0) ) {
if ((j == 0) || (j == pd_numinputscore)) {
lastName = gds[i]->GetScoreProducerWeightShortName(j);
out << " " << lastName << ":";
}
}
out << " " << scores[j];
}
}
}

View File

@ -210,13 +210,13 @@ void OutputAlignment(ostream &out, const AlignmentInfo &ai, size_t sourceOffset,
{
typedef std::vector< const std::pair<size_t,size_t>* > AlignVec;
AlignVec alignments = ai.GetSortedAlignments();
AlignVec::const_iterator it;
for (it = alignments.begin(); it != alignments.end(); ++it) {
const std::pair<size_t,size_t> &alignment = **it;
out << alignment.first + sourceOffset << "-" << alignment.second + targetOffset << " ";
}
}
void OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges)
@ -227,7 +227,7 @@ void OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges)
const Hypothesis &edge = *edges[currEdge];
const TargetPhrase &tp = edge.GetCurrTargetPhrase();
size_t sourceOffset = edge.GetCurrSourceWordsRange().GetStartPos();
OutputAlignment(out, tp.GetAlignmentInfo(), sourceOffset, targetOffset);
targetOffset += tp.GetSize();
@ -239,7 +239,7 @@ void OutputAlignment(OutputCollector* collector, size_t lineNo , const vector<co
{
ostringstream out;
OutputAlignment(out, edges);
collector->Write(lineNo,out.str());
}
@ -412,18 +412,18 @@ void OutputNBest(std::ostream& out, const Moses::TrellisPathList &nBestList, con
if (pds.size() > 0) {
for( size_t i=0; i<pds.size(); i++ ) {
size_t pd_numinputscore = pds[i]->GetNumInputScores();
vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( pds[i] );
for (size_t j = 0; j<scores.size(); ++j) {
if (labeledOutput && (i == 0) ) {
if ((j == 0) || (j == pd_numinputscore)) {
lastName = pds[i]->GetScoreProducerWeightShortName(j);
out << " " << lastName << ":";
}
}
out << " " << scores[j];
}
}
}
@ -432,18 +432,18 @@ void OutputNBest(std::ostream& out, const Moses::TrellisPathList &nBestList, con
if (gds.size() > 0) {
for( size_t i=0; i<gds.size(); i++ ) {
size_t pd_numinputscore = gds[i]->GetNumInputScores();
vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( gds[i] );
for (size_t j = 0; j<scores.size(); ++j) {
if (labeledOutput && (i == 0) ) {
if ((j == 0) || (j == pd_numinputscore)) {
lastName = gds[i]->GetScoreProducerWeightShortName(j);
out << " " << lastName << ":";
}
}
out << " " << scores[j];
}
}
}
@ -477,7 +477,7 @@ void OutputNBest(std::ostream& out, const Moses::TrellisPathList &nBestList, con
const int sourceOffset = sourceRange.GetStartPos();
const int targetOffset = targetRange.GetStartPos();
const AlignmentInfo &ai = edge.GetCurrTargetPhrase().GetAlignmentInfo();
OutputAlignment(out, ai, sourceOffset, targetOffset);
}

View File

@ -83,7 +83,7 @@ public:
m_detailedTranslationCollector(detailedTranslationCollector),
m_alignmentInfoCollector(alignmentInfoCollector) {}
/** Translate one sentence
* gets called by main function implemented at end of this source file */
void Run() {
@ -130,7 +130,7 @@ public:
manager.SerializeSearchGraphPB(m_lineNumber, output);
}
#endif
}
// apply decision rule and output best translation(s)
if (m_outputCollector) {
@ -145,8 +145,7 @@ public:
// MAP decoding: best hypothesis
const Hypothesis* bestHypo = NULL;
if (!staticData.UseMBR()) {
bestHypo = manager.GetBestHypothesis();
if (bestHypo) {
if (staticData.IsPathRecoveryEnabled()) {
@ -165,11 +164,10 @@ public:
}
}
out << endl;
}
// MBR decoding (n-best MBR, lattice MBR, consensus)
else {
// we first need the n-best translations
size_t nBestSize = staticData.GetMBRSize();
if (nBestSize <= 0) {
@ -205,7 +203,7 @@ public:
}
// consensus decoding
else if (staticData.UseConsensusDecoding()) {
const TrellisPath &conBestHypo = doConsensusDecoding(manager,nBestList);
OutputBestHypo(conBestHypo, m_lineNumber,
staticData.GetReportSegmentation(),
@ -214,8 +212,8 @@ public:
IFVERBOSE(2) {
PrintUserTime("finished Consensus decoding");
}
}
// n-best MBR decoding
else {
const Moses::TrellisPath &mbrBestHypo = doMBR(nBestList);
@ -482,7 +480,7 @@ int main(int argc, char** argv)
alignmentInfoCollector.get() );
// execute task
#ifdef WITH_THREADS
pool.Submit(task);
#else
task->Run();
#endif

View File

@ -57,7 +57,7 @@ void PrintTranslationAnalysis(const TranslationSystem* system, std::ostream &os,
}
}
}
bool epsilon = false;
if (target == "") {
target="<EPSILON>";

View File

@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ -42,10 +42,11 @@ void AlignmentInfo::BuildNonTermIndexMap()
for (p = begin(); p != end(); ++p) {
m_nonTermIndexMap[p->second] = i++;
}
}
bool compare_target(const std::pair<size_t,size_t> *a, const std::pair<size_t,size_t> *b)
{
if(a->second < b->second) return true;
if(a->second == b->second) return (a->first < b->first);
return false;
@ -55,34 +56,32 @@ bool compare_target(const std::pair<size_t,size_t> *a, const std::pair<size_t,si
std::vector< const std::pair<size_t,size_t>* > AlignmentInfo::GetSortedAlignments() const
{
std::vector< const std::pair<size_t,size_t>* > ret;
CollType::const_iterator iter;
for (iter = m_collection.begin(); iter != m_collection.end(); ++iter) {
const std::pair<size_t,size_t> &alignPair = *iter;
ret.push_back(&alignPair);
}
const StaticData &staticData = StaticData::Instance();
WordAlignmentSort wordAlignmentSort = staticData.GetWordAlignmentSort();
switch (wordAlignmentSort) {
case NoSort:
break;
case TargetOrder:
std::sort(ret.begin(), ret.end(), compare_target);
break;
default:
CHECK(false);
}
return ret;
}
std::ostream& operator<<(std::ostream &out, const AlignmentInfo &alignmentInfo)
{
AlignmentInfo::const_iterator iter;

View File

@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ -37,12 +37,16 @@ class AlignmentInfo
friend struct AlignmentInfoOrderer;
friend class AlignmentInfoCollection;
public:
typedef std::vector<size_t> NonTermIndexMap;
typedef CollType::const_iterator const_iterator;
const_iterator begin() const {
return m_collection.begin();
}
const_iterator end() const {
return m_collection.end();
}
// Provides a map from target-side to source-side non-terminal indices.
// The target-side index should be the rule symbol index (counting terminals).
@ -52,12 +56,11 @@ class AlignmentInfo
}
std::vector< const std::pair<size_t,size_t>* > GetSortedAlignments() const;
private:
// AlignmentInfo objects should only be created by an AlignmentInfoCollection
explicit AlignmentInfo(const std::set<std::pair<size_t,size_t> > &pairs)
: m_collection(pairs) {
BuildNonTermIndexMap();
}
@ -69,8 +72,7 @@ class AlignmentInfo
// Define an arbitrary strict weak ordering between AlignmentInfo objects
// for use by AlignmentInfoCollection.
struct AlignmentInfoOrderer {
bool operator()(const AlignmentInfo &a, const AlignmentInfo &b) const {
return a.m_collection < b.m_collection;
}

View File

@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ -36,7 +36,7 @@ const AlignmentInfo &AlignmentInfoCollection::GetEmptyAlignmentInfo() const
}
const AlignmentInfo *AlignmentInfoCollection::Add(
const std::set<std::pair<size_t,size_t> > &pairs)
{
std::pair<AlignmentInfoSet::iterator, bool> ret =
m_collection.insert(AlignmentInfo(pairs));

View File

@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ -29,8 +29,10 @@ namespace Moses
// Singleton collection of all AlignmentInfo objects.
class AlignmentInfoCollection
{
public:
static AlignmentInfoCollection &Instance() {
return s_instance;
}
// Returns a pointer to an AlignmentInfo object with the same source-target
// alignment pairs as given in the argument. If the collection already
@ -41,7 +43,7 @@ class AlignmentInfoCollection
// Returns a pointer to an empty AlignmentInfo object.
const AlignmentInfo &GetEmptyAlignmentInfo() const;
private:
typedef std::set<AlignmentInfo, AlignmentInfoOrderer> AlignmentInfoSet;
// Only a single static variable should be created.

View File

@ -7,455 +7,454 @@
using namespace std;
namespace Moses
{
BilingualDynSuffixArray::BilingualDynSuffixArray():
m_maxPhraseLength(StaticData::Instance().GetMaxPhraseLength()),
m_maxSampleSize(20)
{
m_srcSA = 0;
m_trgSA = 0;
m_srcCorpus = new std::vector<wordID_t>();
m_trgCorpus = new std::vector<wordID_t>();
m_srcVocab = new Vocab(false);
m_trgVocab = new Vocab(false);
m_scoreCmp = 0;
}
BilingualDynSuffixArray::~BilingualDynSuffixArray()
{
if(m_srcSA) delete m_srcSA;
if(m_trgSA) delete m_trgSA;
if(m_srcVocab) delete m_srcVocab;
if(m_trgVocab) delete m_trgVocab;
if(m_srcCorpus) delete m_srcCorpus;
if(m_trgCorpus) delete m_trgCorpus;
if(m_scoreCmp) delete m_scoreCmp;
}
bool BilingualDynSuffixArray::Load(
const std::vector<FactorType>& inputFactors,
const std::vector<FactorType>& outputFactors,
std::string source, std::string target, std::string alignments,
const std::vector<float> &weight)
{
m_inputFactors = inputFactors;
m_outputFactors = outputFactors;
m_scoreCmp = new ScoresComp(weight);
InputFileStream sourceStrme(source);
InputFileStream targetStrme(target);
cerr << "Loading source corpus...\n";
LoadCorpus(sourceStrme, m_inputFactors, *m_srcCorpus, m_srcSntBreaks, m_srcVocab);
cerr << "Loading target corpus...\n";
LoadCorpus(targetStrme, m_outputFactors,*m_trgCorpus, m_trgSntBreaks, m_trgVocab);
CHECK(m_srcSntBreaks.size() == m_trgSntBreaks.size());
// build suffix arrays and auxilliary arrays
cerr << "Building Source Suffix Array...\n";
m_srcSA = new DynSuffixArray(m_srcCorpus);
if(!m_srcSA) return false;
cerr << "Building Target Suffix Array...\n";
//m_trgSA = new DynSuffixArray(m_trgCorpus);
//if(!m_trgSA) return false;
cerr << "\t(Skipped. Not used)\n";
InputFileStream alignStrme(alignments);
cerr << "Loading Alignment File...\n";
LoadRawAlignments(alignStrme);
//LoadAlignments(alignStrme);
cerr << "Building frequent word cache...\n";
CacheFreqWords();
return true;
}
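// The raw loaders keep each alignment line as a flat vector of
// source/target position pairs (stored as shorts to save memory);
// structured SentenceAlignment objects are rebuilt from them on demand.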
int BilingualDynSuffixArray::LoadRawAlignments(InputFileStream& align)
{
// stores the alignments in the raw file format
std::string line;
std::vector<int> vtmp;
while(getline(align, line)) {
Utils::splitToInt(line, vtmp, "- ");
CHECK(vtmp.size() % 2 == 0);
std::vector<short> vAlgn; // store as short ints for memory
for (std::vector<int>::const_iterator itr = vtmp.begin();
itr != vtmp.end(); ++itr) {
vAlgn.push_back(short(*itr));
}
m_rawAlignments.push_back(vAlgn);
}
return m_rawAlignments.size();
}
int BilingualDynSuffixArray::LoadRawAlignments(string& align)
{
// stores the alignments in the raw file format
vector<int> vtmp;
Utils::splitToInt(align, vtmp, "- ");
CHECK(vtmp.size() % 2 == 0);
vector<short> vAlgn; // store as short ints for memory
for (std::vector<int>::const_iterator itr = vtmp.begin();
itr != vtmp.end(); ++itr) {
vAlgn.push_back(short(*itr));
}
m_rawAlignments.push_back(vAlgn);
return m_rawAlignments.size();
}
int BilingualDynSuffixArray::LoadAlignments(InputFileStream& align)
{
std::string line;
std::vector<int> vtmp;
int sntIndex(0);
while(getline(align, line)) {
Utils::splitToInt(line, vtmp, "- ");
CHECK(vtmp.size() % 2 == 0);
int sourceSize = GetSourceSentenceSize(sntIndex);
int targetSize = GetTargetSentenceSize(sntIndex);
SentenceAlignment curSnt(sntIndex, sourceSize, targetSize); // initialize empty sentence
for(int i=0; i < (int)vtmp.size(); i+=2) {
int sourcePos = vtmp[i];
int targetPos = vtmp[i+1];
CHECK(sourcePos < sourceSize);
CHECK(targetPos < targetSize);
curSnt.alignedList[sourcePos].push_back(targetPos); // list of target nodes for each source word
curSnt.numberAligned[targetPos]++; // cnt of how many source words connect to this target word
}
curSnt.srcSnt = m_srcCorpus + sntIndex; // point source and target sentence
curSnt.trgSnt = m_trgCorpus + sntIndex;
m_alignments.push_back(curSnt);
sntIndex++;
}
return m_alignments.size();
}
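// Rebuilds the SentenceAlignment view of one sentence pair from the raw
// position pairs. With trg2Src set, the source and target roles are
// swapped so that the same extraction code can run in either direction.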
SentenceAlignment BilingualDynSuffixArray::GetSentenceAlignment(const int sntIndex, bool trg2Src) const
{
// retrieves the alignments in the format used by SentenceAlignment.Extract()
int sntGiven = trg2Src ? GetTargetSentenceSize(sntIndex) : GetSourceSentenceSize(sntIndex);
int sntExtract = trg2Src ? GetSourceSentenceSize(sntIndex) : GetTargetSentenceSize(sntIndex);
std::vector<short> alignment = m_rawAlignments.at(sntIndex);
SentenceAlignment curSnt(sntIndex, sntGiven, sntExtract); // initialize empty sentence
for(size_t i=0; i < alignment.size(); i+=2) {
int sourcePos = alignment[i];
int targetPos = alignment[i+1];
if(trg2Src) {
curSnt.alignedList[targetPos].push_back(sourcePos); // list of target nodes for each source word
curSnt.numberAligned[sourcePos]++; // cnt of how many source words connect to this target word
} else {
curSnt.alignedList[sourcePos].push_back(targetPos); // list of target nodes for each source word
curSnt.numberAligned[targetPos]++; // cnt of how many source words connect to this target word
}
}
curSnt.srcSnt = m_srcCorpus + sntIndex; // point source and target sentence
curSnt.trgSnt = m_trgCorpus + sntIndex;
return curSnt;
}
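// Collects all phrase pairs consistent with the word alignment of the
// given sentence that cover the matched source span; trg2Src selects the
// extraction direction, mirroring GetSentenceAlignment().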
bool BilingualDynSuffixArray::ExtractPhrases(const int& sntIndex, const int& wordIndex,
const int& sourceSize, std::vector<PhrasePair*>& phrasePairs, bool trg2Src) const
{
/* ExtractPhrases() can extract the matching phrases for both directions by using the trg2Src
* parameter */
SentenceAlignment curSnt = GetSentenceAlignment(sntIndex, trg2Src);
// get span of phrase in source sentence
int beginSentence = m_srcSntBreaks[sntIndex];
int rightIdx = wordIndex - beginSentence
,leftIdx = rightIdx - sourceSize + 1;
return curSnt.Extract(m_maxPhraseLength, phrasePairs, leftIdx, rightIdx); // extract all phrase Alignments in sentence
}
void BilingualDynSuffixArray::CleanUp()
{
//m_wordPairCache.clear();
}
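// Reads a corpus file, maps every word to its vocabulary ID, appends the
// IDs to cArray and the offset of each sentence start to sntArray, then
// closes the vocabulary so later lookups cannot add new entries.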
int BilingualDynSuffixArray::LoadCorpus(InputFileStream& corpus, const FactorList& factors,
std::vector<wordID_t>& cArray, std::vector<wordID_t>& sntArray,
Vocab* vocab)
{
std::string line, word;
int sntIdx(0);
// corpus.seekg(0); Seems needless -> commented out to allow loading of gzipped corpora (gzfilebuf doesn't support seeking).
const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
while(getline(corpus, line)) {
sntArray.push_back(sntIdx);
Phrase phrase(ARRAY_SIZE_INCR);
// parse phrase
phrase.CreateFromString( factors, line, factorDelimiter);
// store words in vocabulary and corpus
for( size_t i = 0; i < phrase.GetSize(); ++i) {
cArray.push_back( vocab->GetWordID(phrase.GetWord(i)) );
}
sntIdx += phrase.GetSize();
}
//cArray.push_back(vocab->GetkOOVWordID); // signify end of corpus
vocab->MakeClosed(); // avoid adding words
return cArray.size();
}
bool BilingualDynSuffixArray::GetLocalVocabIDs(const Phrase& src, SAPhrase &output) const
{
// looks up the SA vocab ids for the current src phrase
size_t phraseSize = src.GetSize();
for (size_t pos = 0; pos < phraseSize; ++pos) {
const Word &word = src.GetWord(pos);
wordID_t arrayId = m_srcVocab->GetWordID(word);
if (arrayId == m_srcVocab->GetkOOVWordID()) {
// oov
return false;
} else {
output.SetId(pos, arrayId);
//cerr << arrayId << " ";
}
}
return true;
}
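// Computes lexical weights for a phrase pair in both directions,
// essentially the standard phrase-table lexical weighting: for each
// source word the translation probabilities of its aligned target words
// (or of NULL when unaligned) are averaged and multiplied into the
// source-to-target weight, and the cached reverse probabilities are
// combined the same way for the target-to-source weight.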
pair<float, float> BilingualDynSuffixArray::GetLexicalWeight(const PhrasePair& phrasepair) const
{
//return pair<float, float>(1, 1);
float srcLexWeight(1.0), trgLexWeight(1.0);
std::map<pair<wordID_t, wordID_t>, float> targetProbs; // collect sum of target probs given source words
//const SentenceAlignment& alignment = m_alignments[phrasepair.m_sntIndex];
const SentenceAlignment& alignment = GetSentenceAlignment(phrasepair.m_sntIndex);
std::map<pair<wordID_t, wordID_t>, pair<float, float> >::const_iterator itrCache;
// for each source word
for(int srcIdx = phrasepair.m_startSource; srcIdx <= phrasepair.m_endSource; ++srcIdx) {
float srcSumPairProbs(0);
wordID_t srcWord = m_srcCorpus->at(srcIdx + m_srcSntBreaks[phrasepair.m_sntIndex]); // localIDs
const std::vector<int>& srcWordAlignments = alignment.alignedList.at(srcIdx);
// for each target word aligned to this source word in this alignment
if(srcWordAlignments.size() == 0) { // get p(NULL|src)
pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, m_srcVocab->GetkOOVWordID());
itrCache = m_wordPairCache.find(wordpair);
if(itrCache == m_wordPairCache.end()) { // if not in cache
CacheWordProbs(srcWord);
itrCache = m_wordPairCache.find(wordpair); // search cache again
}
CHECK(itrCache != m_wordPairCache.end());
srcSumPairProbs += itrCache->second.first;
targetProbs[wordpair] = itrCache->second.second;
} else { // extract p(trg|src)
for(size_t i = 0; i < srcWordAlignments.size(); ++i) { // for each aligned word
int trgIdx = srcWordAlignments[i];
wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]);
// get probability of this source->target word pair
pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, trgWord);
itrCache = m_wordPairCache.find(wordpair);
if(itrCache == m_wordPairCache.end()) { // if not in cache
CacheWordProbs(srcWord);
itrCache = m_wordPairCache.find(wordpair); // search cache again
}
CHECK(itrCache != m_wordPairCache.end());
srcSumPairProbs += itrCache->second.first;
targetProbs[wordpair] = itrCache->second.second;
}
}
float srcNormalizer = srcWordAlignments.size() < 2 ? 1.0 : 1.0 / float(srcWordAlignments.size());
srcLexWeight *= (srcNormalizer * srcSumPairProbs);
} // end for each source word
for(int trgIdx = phrasepair.m_startTarget; trgIdx <= phrasepair.m_endTarget; ++trgIdx) {
float trgSumPairProbs(0);
wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]);
for (std::map<pair<wordID_t, wordID_t>, float>::const_iterator trgItr
= targetProbs.begin(); trgItr != targetProbs.end(); ++trgItr) {
if(trgItr->first.second == trgWord)
trgSumPairProbs += trgItr->second;
}
if(trgSumPairProbs == 0) continue; // currently don't store target-side SA
int noAligned = alignment.numberAligned.at(trgIdx);
float trgNormalizer = noAligned < 2 ? 1.0 : 1.0 / float(noAligned);
trgLexWeight *= (trgNormalizer * trgSumPairProbs);
}
// TODO::Need to get p(NULL|trg)
return pair<float, float>(srcLexWeight, trgLexWeight);
}
void BilingualDynSuffixArray::CacheFreqWords() const
{
std::multimap<int, wordID_t> wordCnts;
// for each source word in vocab
Vocab::Word2Id::const_iterator it;
for(it = m_srcVocab->VocabStart(); it != m_srcVocab->VocabEnd(); ++it) {
// get its frequency
wordID_t srcWord = it->second;
std::vector<wordID_t> sword(1, srcWord), wrdIndices;
m_srcSA->GetCorpusIndex(&sword, &wrdIndices);
if(wrdIndices.size() >= 1000) { // min count
wordCnts.insert(make_pair(wrdIndices.size(), srcWord));
}
}
int numSoFar(0);
std::multimap<int, wordID_t>::reverse_iterator ritr;
for(ritr = wordCnts.rbegin(); ritr != wordCnts.rend(); ++ritr) {
m_freqWordsCached.insert(ritr->second);
CacheWordProbs(ritr->second);
if(++numSoFar == 50) break; // get top counts
}
cerr << "\tCached " << m_freqWordsCached.size() << " source words\n";
}
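// Estimates translation probabilities for one source word by relative
// frequency: all its occurrences are located through the suffix array,
// counts of the aligned target words (NULL when unaligned) are collected,
// and both p(trg|src) and an approximate reverse score (count over the
// number of distinct translations) go into the word-pair cache.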
void BilingualDynSuffixArray::CacheWordProbs(wordID_t srcWord) const
{
std::map<wordID_t, int> counts;
std::vector<wordID_t> sword(1, srcWord), wrdIndices;
bool ret = m_srcSA->GetCorpusIndex(&sword, &wrdIndices);
CHECK(ret);
std::vector<int> sntIndexes = GetSntIndexes(wrdIndices, 1, m_srcSntBreaks);
float denom(0);
// for each occurrence of this word
for(size_t snt = 0; snt < sntIndexes.size(); ++snt) {
int sntIdx = sntIndexes.at(snt); // get corpus index for sentence
CHECK(sntIdx != -1);
int srcWrdSntIdx = wrdIndices.at(snt) - m_srcSntBreaks.at(sntIdx); // get word index in sentence
const std::vector<int> srcAlg = GetSentenceAlignment(sntIdx).alignedList.at(srcWrdSntIdx); // list of target words for this source word
if(srcAlg.size() == 0) {
++counts[m_srcVocab->GetkOOVWordID()]; // if not aligned then align to NULL word
++denom;
} else { //get target words aligned to srcword in this sentence
for(size_t i=0; i < srcAlg.size(); ++i) {
wordID_t trgWord = m_trgCorpus->at(srcAlg[i] + m_trgSntBreaks[sntIdx]);
++counts[trgWord];
++denom;
}
}
}
// now we've gotten counts of all target words aligned to this source word
// get probs and cache all pairs
for(std::map<wordID_t, int>::const_iterator itrCnt = counts.begin();
itrCnt != counts.end(); ++itrCnt) {
pair<wordID_t, wordID_t> wordPair = make_pair(srcWord, itrCnt->first);
float srcTrgPrb = float(itrCnt->second) / float(denom); // gives p(src->trg)
float trgSrcPrb = float(itrCnt->second) / float(counts.size()); // gives p(trg->src)
m_wordPairCache[wordPair] = pair<float, float>(srcTrgPrb, trgSrcPrb);
}
}
SAPhrase BilingualDynSuffixArray::TrgPhraseFromSntIdx(const PhrasePair& phrasepair) const
{
// takes sentence indexes and looks up vocab IDs
SAPhrase phraseIds(phrasepair.GetTargetSize());
int sntIndex = phrasepair.m_sntIndex;
int id(-1), pos(0);
for(int i=phrasepair.m_startTarget; i <= phrasepair.m_endTarget; ++i) { // look up trg words
id = m_trgCorpus->at(m_trgSntBreaks[sntIndex] + i);
phraseIds.SetId(pos++, id);
}
return phraseIds;
}
TargetPhrase* BilingualDynSuffixArray::GetMosesFactorIDs(const SAPhrase& phrase) const
{
TargetPhrase* targetPhrase = new TargetPhrase(Output);
for(size_t i=0; i < phrase.words.size(); ++i) { // look up trg words
Word& word = m_trgVocab->GetWord( phrase.words[i]);
CHECK(word != m_trgVocab->GetkOOVWord());
targetPhrase->AddWord(word);
}
// scoring
return targetPhrase;
}
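// Query entry point for the phrase table: all corpus occurrences of the
// source phrase are located through the suffix array, a capped sample of
// them is kept (SampleSelection), target phrases are extracted from each
// matching sentence, and every distinct candidate is scored with its
// relative frequency, its best lexical weight, and a constant phrase
// penalty. The top m_maxSampleSize candidates are returned.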
void BilingualDynSuffixArray::GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> > & target) const
{
//cerr << "phrase is \"" << src << endl;
size_t sourceSize = src.GetSize();
SAPhrase localIDs(sourceSize);
if(!GetLocalVocabIDs(src, localIDs)) return;
float totalTrgPhrases(0);
std::map<SAPhrase, int> phraseCounts;
//std::map<SAPhrase, PhrasePair> phraseColl; // (one of) the word indexes this phrase was taken from
std::map<SAPhrase, pair<float, float> > lexicalWeights;
std::map<SAPhrase, pair<float, float> >::iterator itrLexW;
std::vector<unsigned> wrdIndices;
// extract sentence IDs from SA and return rightmost index of phrases
if(!m_srcSA->GetCorpusIndex(&(localIDs.words), &wrdIndices)) return;
SampleSelection(wrdIndices);
std::vector<int> sntIndexes = GetSntIndexes(wrdIndices, sourceSize, m_srcSntBreaks);
// for each sentence with this phrase
for(size_t snt = 0; snt < sntIndexes.size(); ++snt) {
std::vector<PhrasePair*> phrasePairs; // to store all phrases possible from current sentence
int sntIndex = sntIndexes.at(snt); // get corpus index for sentence
if(sntIndex == -1) continue; // bad flag set by GetSntIndexes()
ExtractPhrases(sntIndex, wrdIndices[snt], sourceSize, phrasePairs);
//cerr << "extracted " << phrasePairs.size() << endl;
totalTrgPhrases += phrasePairs.size(); // keep track of count of each extracted phrase pair
std::vector<PhrasePair*>::iterator iterPhrasePair;
for (iterPhrasePair = phrasePairs.begin(); iterPhrasePair != phrasePairs.end(); ++iterPhrasePair) {
SAPhrase phrase = TrgPhraseFromSntIdx(**iterPhrasePair);
phraseCounts[phrase]++; // count each unique phrase
// NOTE::Correct but slow to extract lexical weight here. could do
// it later for only the top phrases chosen by phrase prob p(e|f)
pair<float, float> lexWeight = GetLexicalWeight(**iterPhrasePair); // get lexical weighting for this phrase pair
itrLexW = lexicalWeights.find(phrase); // check if phrase already has lexical weight attached
if((itrLexW != lexicalWeights.end()) && (itrLexW->second.first < lexWeight.first))
itrLexW->second = lexWeight; // if this lex weight is greater save it
else lexicalWeights[phrase] = lexWeight; // else save
}
// done with sentence. delete SA phrase pairs
RemoveAllInColl(phrasePairs);
} // done with all sentences
// convert to moses phrase pairs
std::map<SAPhrase, int>::const_iterator iterPhrases;
std::multimap<Scores, const SAPhrase*, ScoresComp> phraseScores (*m_scoreCmp);
// get scores of all phrases
for(iterPhrases = phraseCounts.begin(); iterPhrases != phraseCounts.end(); ++iterPhrases) {
float trg2SrcMLE = float(iterPhrases->second) / totalTrgPhrases;
itrLexW = lexicalWeights.find(iterPhrases->first);
CHECK(itrLexW != lexicalWeights.end());
Scores scoreVector(3);
scoreVector[0] = trg2SrcMLE;
scoreVector[1] = itrLexW->second.first;
scoreVector[2] = 2.718; // exp(1);
phraseScores.insert(make_pair(scoreVector, &iterPhrases->first));
}
// return top scoring phrases
std::multimap<Scores, const SAPhrase*, ScoresComp>::reverse_iterator ritr;
for(ritr = phraseScores.rbegin(); ritr != phraseScores.rend(); ++ritr) {
Scores scoreVector = ritr->first;
TargetPhrase *targetPhrase = GetMosesFactorIDs(*ritr->second);
target.push_back(make_pair( scoreVector, targetPhrase));
if(target.size() == m_maxSampleSize) break;
}
}
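// Maps each corpus position returned by the suffix array to the index of
// the sentence containing it via upper_bound on the sentence-break
// offsets; matches whose span would cross a sentence boundary are flagged
// with -1 so the caller can skip them.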
std::vector<int> BilingualDynSuffixArray::GetSntIndexes(std::vector<unsigned>& wrdIndices,
const int sourceSize, const std::vector<unsigned>& sntBreaks) const
{
std::vector<unsigned>::const_iterator vit;
std::vector<int> sntIndexes;
for(size_t i=0; i < wrdIndices.size(); ++i) {
vit = std::upper_bound(sntBreaks.begin(), sntBreaks.end(), wrdIndices[i]);
int index = int(vit - sntBreaks.begin()) - 1;
// check for phrases that cross sentence boundaries
if(wrdIndices[i] - sourceSize + 1 < sntBreaks.at(index))
sntIndexes.push_back(-1); // set bad flag
else
sntIndexes.push_back(index); // store the index of the sentence in the corpus
}
return sntIndexes;
}
int BilingualDynSuffixArray::SampleSelection(std::vector<unsigned>& sample,
int sampleSize) const
{
// only use top 'sampleSize' number of samples
if(sample.size() > sampleSize)
sample.erase(sample.begin()+sampleSize, sample.end());
return sample.size();
}
void BilingualDynSuffixArray::addSntPair(string& source, string& target, string& alignment)
{
vuint_t srcFactor, trgFactor;
cerr << "source, target, alignment = " << source << ", " << target << ", " << alignment << endl;
const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
const unsigned oldSrcCrpSize = m_srcCorpus->size(), oldTrgCrpSize = m_trgCorpus->size();
cerr << "old source corpus size = " << oldSrcCrpSize << "\told target size = " << oldTrgCrpSize << endl;
Phrase sphrase(ARRAY_SIZE_INCR);
@@ -471,7 +470,7 @@ void BilingualDynSuffixArray::addSntPair(string& source, string& target, string&
cerr << "srcFactor[" << (srcFactor.size() - 1) << "] = " << srcFactor.back() << endl;
m_srcCorpus->push_back(srcFactor.back()); // add word to corpus
}
m_srcSntBreaks.push_back(oldSrcCrpSize); // former end of corpus is index of new sentence
m_srcVocab->MakeClosed();
Phrase tphrase(ARRAY_SIZE_INCR);
tphrase.CreateFromString(m_outputFactors, target, factorDelimiter);
@@ -494,16 +493,17 @@ void BilingualDynSuffixArray::addSntPair(string& source, string& target, string&
LoadRawAlignments(alignment);
m_trgVocab->MakeClosed();
//for(size_t i=0; i < sphrase.GetSize(); ++i)
//ClearWordInCache(sIDs[i]);
}
void BilingualDynSuffixArray::ClearWordInCache(wordID_t srcWord)
{
if(m_freqWordsCached.find(srcWord) != m_freqWordsCached.end())
return;
std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> >::iterator it,
first, last;
for(it = m_wordPairCache.begin(); it != m_wordPairCache.end(); ++it) {
if(it->first.first == srcWord) { // all source words grouped
first = it; // copy first entry of srcWord
last = it++;
while(it != m_wordPairCache.end() && (it->first.first == srcWord)) {
@@ -513,80 +513,77 @@ void BilingualDynSuffixArray::ClearWordInCache(wordID_t srcWord) {
m_wordPairCache.erase(first, last);
}
}
SentenceAlignment::SentenceAlignment(int sntIndex, int sourceSize, int targetSize)
:m_sntIndex(sntIndex)
,numberAligned(targetSize, 0)
,alignedList(sourceSize)
{
for(int i=0; i < sourceSize; ++i) {
std::vector<int> trgWrd;
alignedList[i] = trgWrd;
}
}
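// Standard consistency-based phrase extraction for one source span: the
// span is projected onto the target side, rejected if any target word in
// the projection is also aligned outside the span, and otherwise the
// target boundaries may grow over adjacent unaligned words, emitting one
// PhrasePair per valid target span.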
bool SentenceAlignment::Extract(int maxPhraseLength, std::vector<PhrasePair*> &ret, int startSource, int endSource) const
{
// foreign = target, F=T
// english = source, E=S
int countTarget = numberAligned.size();
int minTarget = 9999;
int maxTarget = -1;
std::vector< int > usedTarget = numberAligned;
for(int sourcePos = startSource; sourcePos <= endSource; sourcePos++) {
for(int ind=0; ind < (int)alignedList[sourcePos].size(); ind++) {
int targetPos = alignedList[sourcePos][ind];
// cout << "point (" << targetPos << ", " << sourcePos << ")\n";
if (targetPos<minTarget) {
minTarget = targetPos;
}
if (targetPos>maxTarget) {
maxTarget = targetPos;
}
usedTarget[ targetPos ]--;
} // for(int ind=0;ind<sentence
} // for(int sourcePos=startSource
// cout << "f projected ( " << minTarget << "-" << maxTarget << ", " << startSource << "," << endSource << ")\n";
if (maxTarget >= 0 && // aligned to any foreign words at all
maxTarget-minTarget < maxPhraseLength) {
// foreign phrase within limits
// check if foreign words are aligned to out of bound english words
bool out_of_bounds = false;
for(int targetPos=minTarget; targetPos <= maxTarget && !out_of_bounds; targetPos++) {
if (usedTarget[targetPos]>0) {
// cout << "out of bounds: " << targetPos << "\n";
out_of_bounds = true;
}
}
// cout << "doing if for ( " << minTarget << "-" << maxTarget << ", " << startSource << "," << endSource << ")\n";
if (!out_of_bounds) {
// start point of foreign phrase may retreat over unaligned
for(int startTarget = minTarget;
(startTarget >= 0 &&
startTarget > maxTarget-maxPhraseLength && // within length limit
(startTarget==minTarget || numberAligned[startTarget]==0)); // unaligned
startTarget--) {
// end point of foreign phrase may advance over unaligned
for (int endTarget=maxTarget;
(endTarget<countTarget &&
endTarget<startTarget+maxPhraseLength && // within length limit
(endTarget==maxTarget || numberAligned[endTarget]==0)); // unaligned
endTarget++) {
PhrasePair *phrasePair = new PhrasePair(startTarget,endTarget,startSource,endSource, m_sntIndex);
ret.push_back(phrasePair);
} // for (int endTarget=maxTarget;
} // for(int startTarget=minTarget;
} // if (!out_of_bounds)
} // if (maxTarget >= 0 &&
return (ret.size() > 0);
}
} // end namespace

View File

@@ -2,70 +2,73 @@
#define moses_BilingualDynSuffixArray_h
#include "TargetPhrase.h"
#include "DynSuffixArray.h"
#include "DynSAInclude/vocab.h"
#include "DynSAInclude/types.h"
#include "DynSAInclude/utils.h"
#include "InputFileStream.h"
#include "FactorTypeSet.h"
namespace Moses
{
class SAPhrase
{
public:
std::vector<wordID_t> words;
SAPhrase(size_t phraseSize)
:words(phraseSize)
{}
void SetId(size_t pos, wordID_t id) {
CHECK(pos < words.size());
words[pos] = id;
}
bool operator<(const SAPhrase& phr2) const {
return words < phr2.words;
}
};
class PhrasePair
{
public:
int m_startTarget, m_endTarget, m_startSource, m_endSource, m_sntIndex;
PhrasePair(int startTarget, int endTarget, int startSource, int endSource, int sntIndex)
: m_startTarget(startTarget)
, m_endTarget(endTarget)
, m_startSource(startSource)
, m_endSource(endSource)
, m_sntIndex(sntIndex)
{}
size_t GetTargetSize() const {
return m_endTarget - m_startTarget + 1;
}
};
class SentenceAlignment
{
public:
SentenceAlignment(int sntIndex, int sourceSize, int targetSize);
int m_sntIndex;
std::vector<wordID_t>* trgSnt;
std::vector<wordID_t>* srcSnt;
std::vector<int> numberAligned;
std::vector< std::vector<int> > alignedList;
bool Extract(int maxPhraseLength, std::vector<PhrasePair*> &ret, int startSource, int endSource) const;
};
class ScoresComp
{
public:
ScoresComp(const std::vector<float>& weights): m_weights(weights) {}
bool operator()(const Scores& s1, const Scores& s2) const {
return s1[0] < s2[0]; // just p(e|f) as approximation
/*float score1(0), score2(0);
int idx1(0), idx2(0);
for (Scores::const_iterator itr = s1.begin();
itr != s1.end(); ++itr) {
score1 += log(*itr * m_weights.at(idx1++));
}
for (Scores::const_iterator itr = s2.begin();
itr != s2.end(); ++itr) {
@@ -73,73 +76,72 @@ public:
}
return score1 < score2;*/
}
private:
const std::vector<float>& m_weights;
};
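// A phrase table backed by a dynamic suffix array over the source corpus:
// phrase pairs are extracted and scored at query time from the stored
// corpora and raw alignments, and new sentence pairs can be appended
// incrementally through addSntPair().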
class BilingualDynSuffixArray
{
public:
BilingualDynSuffixArray();
~BilingualDynSuffixArray();
bool Load( const std::vector<FactorType>& inputFactors,
const std::vector<FactorType>& outputFactors,
std::string source, std::string target, std::string alignments,
const std::vector<float> &weight);
void GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> >& target) const;
void CleanUp();
void addSntPair(string& source, string& target, string& alignment);
private:
DynSuffixArray* m_srcSA;
DynSuffixArray* m_trgSA;
std::vector<wordID_t>* m_srcCorpus;
std::vector<wordID_t>* m_trgCorpus;
std::vector<FactorType> m_inputFactors;
std::vector<FactorType> m_outputFactors;
std::vector<unsigned> m_srcSntBreaks, m_trgSntBreaks;
Vocab* m_srcVocab, *m_trgVocab;
ScoresComp* m_scoreCmp;
std::vector<SentenceAlignment> m_alignments;
std::vector<std::vector<short> > m_rawAlignments;
mutable std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> > m_wordPairCache;
mutable std::set<wordID_t> m_freqWordsCached;
const size_t m_maxPhraseLength, m_maxSampleSize;
int LoadCorpus(InputFileStream&, const std::vector<FactorType>& factors,
std::vector<wordID_t>&, std::vector<wordID_t>&,
Vocab*);
int LoadAlignments(InputFileStream& aligs);
int LoadRawAlignments(InputFileStream& aligs);
int LoadRawAlignments(string& aligs);
bool ExtractPhrases(const int&, const int&, const int&, std::vector<PhrasePair*>&, bool=false) const;
SentenceAlignment GetSentenceAlignment(const int, bool=false) const;
int SampleSelection(std::vector<unsigned>&, int = 300) const;
std::vector<int> GetSntIndexes(std::vector<unsigned>&, int, const std::vector<unsigned>&) const;
TargetPhrase* GetMosesFactorIDs(const SAPhrase&) const;
SAPhrase TrgPhraseFromSntIdx(const PhrasePair&) const;
bool GetLocalVocabIDs(const Phrase&, SAPhrase &) const;
void CacheWordProbs(wordID_t) const;
void CacheFreqWords() const;
void ClearWordInCache(wordID_t);
std::pair<float, float> GetLexicalWeight(const PhrasePair&) const;
int GetSourceSentenceSize(size_t sentenceId) const {
return (sentenceId==m_srcSntBreaks.size()-1) ?
m_srcCorpus->size() - m_srcSntBreaks.at(sentenceId) :
m_srcSntBreaks.at(sentenceId+1) - m_srcSntBreaks.at(sentenceId);
}
int GetTargetSentenceSize(size_t sentenceId) const {
return (sentenceId==m_trgSntBreaks.size()-1) ?
m_trgCorpus->size() - m_trgSntBreaks.at(sentenceId) :
m_trgSntBreaks.at(sentenceId+1) - m_trgSntBreaks.at(sentenceId);
}
};
} // end namespace
#endif

View File

@@ -98,8 +98,7 @@ void ChartCell::ProcessSentence(const ChartTranslationOptionList &transOptList
// add all trans opt into queue. using only 1st child node.
ChartTranslationOptionList::const_iterator iterList;
for (iterList = transOptList.begin(); iterList != transOptList.end(); ++iterList) {
const ChartTranslationOption &transOpt = **iterList;
RuleCube *ruleCube = new RuleCube(transOpt, allChartCells, m_manager);
queue.Add(ruleCube);
@@ -107,8 +106,7 @@ void ChartCell::ProcessSentence(const ChartTranslationOptionList &transOptList
// pluck things out of queue and add to hypo collection
const size_t popLimit = staticData.GetCubePruningPopLimit();
for (size_t numPops = 0; numPops < popLimit && !queue.IsEmpty(); ++numPops) {
ChartHypothesis *hypo = queue.Pop();
AddHypothesis(hypo);
}

View File

@@ -34,7 +34,7 @@ class Word;
class ChartCellLabel
{
public:
ChartCellLabel(const WordsRange &coverage, const Word &label,
const ChartHypothesisCollection *stack=NULL)
: m_coverage(coverage)
@@ -42,12 +42,17 @@ class ChartCellLabel
, m_stack(stack)
{}
const WordsRange &GetCoverage() const {
return m_coverage;
}
const Word &GetLabel() const {
return m_label;
}
const ChartHypothesisCollection *GetStack() const {
return m_stack;
}
bool operator<(const ChartCellLabel &other) const {
// m_coverage and m_label uniquely identify a ChartCellLabel, so don't
// need to compare m_stack.
if (m_coverage == other.m_coverage) {
@@ -56,7 +61,7 @@ class ChartCellLabel
return m_coverage < other.m_coverage;
}
private:
const WordsRange &m_coverage;
const Word &m_label;
const ChartHypothesisCollection *m_stack;

View File

@@ -34,40 +34,45 @@ class ChartHypothesisCollection;
class ChartCellLabelSet
{
private:
typedef std::set<ChartCellLabel> SetType;
public:
typedef SetType::const_iterator const_iterator;
ChartCellLabelSet(const WordsRange &coverage) : m_coverage(coverage) {}
const_iterator begin() const {
return m_set.begin();
}
const_iterator end() const {
return m_set.end();
}
void AddWord(const Word &w) {
ChartCellLabel cellLabel(m_coverage, w);
m_set.insert(cellLabel);
}
void AddConstituent(const Word &w, const ChartHypothesisCollection &stack) {
ChartCellLabel cellLabel(m_coverage, w, &stack);
m_set.insert(cellLabel);
}
bool Empty() const {
return m_set.empty();
}
size_t GetSize() const {
return m_set.size();
}
const ChartCellLabel *Find(const Word &w) const {
SetType::const_iterator p = m_set.find(ChartCellLabel(m_coverage, w));
return p == m_set.end() ? 0 : &(*p);
}
private:
const WordsRange &m_coverage;
SetType m_set;
};

View File

@@ -57,15 +57,14 @@ ChartHypothesis::ChartHypothesis(const ChartTranslationOption &transOpt,
const std::vector<HypothesisDimension> &childEntries = item.GetHypothesisDimensions();
m_prevHypos.reserve(childEntries.size());
std::vector<HypothesisDimension>::const_iterator iter;
for (iter = childEntries.begin(); iter != childEntries.end(); ++iter) {
m_prevHypos.push_back(iter->GetHypothesis());
}
}
ChartHypothesis::~ChartHypothesis()
{
// delete feature function states
for (unsigned i = 0; i < m_ffStates.size(); ++i) {
delete m_ffStates[i];
}
@@ -98,8 +97,7 @@ void ChartHypothesis::CreateOutputPhrase(Phrase &outPhrase) const
size_t nonTermInd = nonTermIndexMap[pos];
const ChartHypothesis *prevHypo = m_prevHypos[nonTermInd];
prevHypo->CreateOutputPhrase(outPhrase);
} else {
outPhrase.AddWord(word);
}
}
@@ -120,20 +118,19 @@
*/
int ChartHypothesis::RecombineCompare(const ChartHypothesis &compare) const
{
int comp = 0;
// -1 = this < compare
// +1 = this > compare
// 0 = this ==compare
for (unsigned i = 0; i < m_ffStates.size(); ++i) {
if (m_ffStates[i] == NULL || compare.m_ffStates[i] == NULL)
comp = m_ffStates[i] - compare.m_ffStates[i];
else
comp = m_ffStates[i]->Compare(*compare.m_ffStates[i]);
if (comp != 0)
return comp;
}
return 0;
@@ -154,12 +151,12 @@ void ChartHypothesis::CalcScore()
const ScoreComponentCollection &scoreBreakdown = GetCurrTargetPhrase().GetScoreBreakdown();
m_scoreBreakdown.PlusEquals(scoreBreakdown);
// compute values of stateless feature functions that were not
// cached in the translation option-- there is no principled distinction
//const vector<const StatelessFeatureFunction*>& sfs =
// m_manager.GetTranslationSystem()->GetStatelessFeatureFunctions();
// TODO!
//for (unsigned i = 0; i < sfs.size(); ++i) {
// sfs[i]->ChartEvaluate(m_targetPhrase, &m_scoreBreakdown);
//}
@@ -167,7 +164,7 @@ void ChartHypothesis::CalcScore()
const std::vector<const StatefulFeatureFunction*>& ffs =
m_manager.GetTranslationSystem()->GetStatefulFeatureFunctions();
for (unsigned i = 0; i < ffs.size(); ++i) {
m_ffStates[i] = ffs[i]->EvaluateChart(*this,i,&m_scoreBreakdown);
}
m_totalScore = m_scoreBreakdown.GetWeightedScore();
@@ -258,13 +255,12 @@ std::ostream& operator<<(std::ostream& out, const ChartHypothesis& hypo)
{
out << hypo.GetId();
// recombination
if (hypo.GetWinningHypothesis() != NULL &&
hypo.GetWinningHypothesis() != &hypo) {
out << "->" << hypo.GetWinningHypothesis()->GetId();
}
out << " " << hypo.GetCurrTargetPhrase()
//<< " " << outPhrase

View File

@@ -55,7 +55,7 @@ protected:
const ChartTranslationOption &m_transOpt;
WordsRange m_currSourceWordsRange;
std::vector<const FFState*> m_ffStates; /*! stateful feature function states */
ScoreComponentCollection m_scoreBreakdown /*! detailed score break-down by components (for instance language model, word penalty, etc) */
,m_lmNGram
,m_lmPrefix;
@@ -94,7 +94,9 @@ public:
~ChartHypothesis();
unsigned GetId() const {
return m_id;
}
const ChartTranslationOption &GetTranslationOption()const {
return m_transOpt;
@@ -108,15 +110,17 @@ public:
inline const ChartArcList* GetArcList() const {
return m_arcList;
}
inline const FFState* GetFFState( size_t featureID ) const {
return m_ffStates[ featureID ];
}
inline const ChartManager& GetManager() const {
return m_manager;
}
void CreateOutputPhrase(Phrase &outPhrase) const;
Phrase GetOutputPhrase() const;
int RecombineCompare(const ChartHypothesis &compare) const;
void CalcScore();
@@ -135,17 +139,17 @@ public:
return m_prevHypos;
}
const ChartHypothesis* GetPrevHypo(size_t pos) const {
return m_prevHypos[pos];
}
const Word &GetTargetLHS() const {
return GetCurrTargetPhrase().GetTargetLHS();
}
const ChartHypothesis* GetWinningHypothesis() const {
return m_winningHypo;
}
TO_STRING();

View File

@@ -101,8 +101,7 @@ bool ChartHypothesisCollection::AddHypothesis(ChartHypothesis *hypo, ChartManage
VERBOSE(3,"worse than matching hyp " << hypoExisting->GetId() << ", recombining" << std::endl)
if (m_nBestIsEnabled) {
hypoExisting->AddArc(hypo);
} else {
ChartHypothesis::Delete(hypo);
}
return false;

View File

@@ -43,7 +43,7 @@ public:
bool operator()(const ChartHypothesis* hypoA, const ChartHypothesis* hypoB) const {
// assert in same cell
const WordsRange &rangeA = hypoA->GetCurrSourceRange()
, &rangeB = hypoB->GetCurrSourceRange();
CHECK(rangeA == rangeB);
// shouldn't be mixing hypos with different lhs
@@ -113,7 +113,9 @@ public:
return m_hyposOrdered;
}
float GetBestScore() const {
return m_bestScore;
}
void GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream, const std::map<unsigned,bool> &reachable) const;

View File

@@ -231,17 +231,17 @@ void ChartManager::GetSearchGraph(long translationId, std::ostream &outputSearch
{
size_t size = m_source.GetSize();
// which hypotheses are reachable?
std::map<unsigned,bool> reachable;
WordsRange fullRange(0, size-1);
const ChartCell &lastCell = m_hypoStackColl.Get(fullRange);
const ChartHypothesis *hypo = lastCell.GetBestHypothesis();
if (hypo == NULL) {
// no hypothesis
return;
}
FindReachableHypotheses( hypo, reachable);
for (size_t width = 1; width <= size; ++width) {
for (size_t startPos = 0; startPos <= size-width; ++startPos) {
@@ -257,42 +257,40 @@ void ChartManager::GetSearchGraph(long translationId, std::ostream &outputSearch
void ChartManager::FindReachableHypotheses( const ChartHypothesis *hypo, std::map<unsigned,bool> &reachable ) const
{
// do not recurse, if already visited
if (reachable.find(hypo->GetId()) != reachable.end()) {
return;
}
// recurse
reachable[ hypo->GetId() ] = true;
const std::vector<const ChartHypothesis*> &previous = hypo->GetPrevHypos();
for(std::vector<const ChartHypothesis*>::const_iterator i = previous.begin(); i != previous.end(); ++i) {
FindReachableHypotheses( *i, reachable );
}
// also loop over recombined hypotheses (arcs)
const ChartArcList *arcList = hypo->GetArcList();
if (arcList) {
ChartArcList::const_iterator iterArc;
for (iterArc = arcList->begin(); iterArc != arcList->end(); ++iterArc) {
const ChartHypothesis &arc = **iterArc;
FindReachableHypotheses( &arc, reachable );
}
}
}
void ChartManager::CreateDeviantPaths(
boost::shared_ptr<const ChartTrellisPath> basePath,
ChartTrellisDetourQueue &q)
{
CreateDeviantPaths(basePath, basePath->GetFinalNode(), q);
}
void ChartManager::CreateDeviantPaths(
boost::shared_ptr<const ChartTrellisPath> basePath,
const ChartTrellisNode &substitutedNode,
ChartTrellisDetourQueue &queue)
{
const ChartArcList *arcList = substitutedNode.GetHypothesis().GetArcList();
if (arcList) {

View File

@ -69,7 +69,7 @@ public:
void CalcNBest(size_t count, ChartTrellisPathList &ret,bool onlyDistinct=0) const;
void GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream) const;
void FindReachableHypotheses( const ChartHypothesis *hypo, std::map<unsigned,bool> &reachable ) const; /* auxiliary function for GetSearchGraph */
void FindReachableHypotheses( const ChartHypothesis *hypo, std::map<unsigned,bool> &reachable ) const; /* auxiliary function for GetSearchGraph */
const InputType& GetSource() const {
return m_source;
@ -89,7 +89,9 @@ public:
m_sentenceStats = std::auto_ptr<SentenceStats>(new SentenceStats(source));
}
unsigned GetNextHypoId() { return m_hypothesisId++; }
unsigned GetNextHypoId() {
return m_hypothesisId++;
}
};
}

View File

@ -77,19 +77,19 @@ void ChartRuleLookupManagerMemory::GetChartRuleCollection(
// get list of all rules that apply to spans at same starting position
DottedRuleColl &dottedRuleCol = *m_dottedRuleColls[range.GetStartPos()];
const DottedRuleList &expandableDottedRuleList = dottedRuleCol.GetExpandableDottedRuleList();
const ChartCellLabel &sourceWordLabel = GetCellCollection().Get(WordsRange(absEndPos, absEndPos)).GetSourceWordLabel();
// loop through the rules
// (note that expandableDottedRuleList can be expanded as the loop runs
// (note that expandableDottedRuleList can be expanded as the loop runs
// through calls to ExtendPartialRuleApplication())
for (size_t ind = 0; ind < expandableDottedRuleList.size(); ++ind) {
// rule we are about to extend
const DottedRuleInMemory &prevDottedRule = *expandableDottedRuleList[ind];
// we will now try to extend it, starting after where it ended
size_t startPos = prevDottedRule.IsRoot()
? range.GetStartPos()
: prevDottedRule.GetWordsRange().GetEndPos() + 1;
? range.GetStartPos()
: prevDottedRule.GetWordsRange().GetEndPos() + 1;
// search for terminal symbol
// (if only one more word position needs to be covered)
@ -102,15 +102,15 @@ void ChartRuleLookupManagerMemory::GetChartRuleCollection(
// if we found a new rule -> create it and add it to the list
if (node != NULL) {
// create the rule
// create the rule
#ifdef USE_BOOST_POOL
DottedRuleInMemory *dottedRule = m_dottedRulePool.malloc();
new (dottedRule) DottedRuleInMemory(*node, sourceWordLabel,
prevDottedRule);
#else
DottedRuleInMemory *dottedRule = new DottedRuleInMemory(*node,
sourceWordLabel,
prevDottedRule);
sourceWordLabel,
prevDottedRule);
#endif
dottedRuleCol.Add(relEndPos+1, dottedRule);
}
@ -136,9 +136,7 @@ void ChartRuleLookupManagerMemory::GetChartRuleCollection(
// word.
endPos = absEndPos - 1;
stackInd = relEndPos;
}
else
{
} else {
endPos = absEndPos;
stackInd = relEndPos + 1;
}
@ -215,7 +213,7 @@ void ChartRuleLookupManagerMemory::ExtendPartialRuleApplication(
// We'll do whichever minimises the number of lookups:
if (numCombinations <= numChildren*2) {
// loop over possible source non-terminal labels (as found in input tree)
// loop over possible source non-terminal labels (as found in input tree)
NonTerminalSet::const_iterator p = sourceNonTerms.begin();
NonTerminalSet::const_iterator sEnd = sourceNonTerms.end();
for (; p != sEnd; ++p) {
@ -242,14 +240,12 @@ void ChartRuleLookupManagerMemory::ExtendPartialRuleApplication(
new (rule) DottedRuleInMemory(*child, cellLabel, prevDottedRule);
#else
DottedRuleInMemory *rule = new DottedRuleInMemory(*child, cellLabel,
prevDottedRule);
prevDottedRule);
#endif
dottedRuleColl.Add(stackInd, rule);
}
}
}
else
{
} else {
// loop over possible expansions of the rule
PhraseDictionaryNodeSCFG::NonTerminalMap::const_iterator p;
PhraseDictionaryNodeSCFG::NonTerminalMap::const_iterator end =
@ -274,7 +270,7 @@ void ChartRuleLookupManagerMemory::ExtendPartialRuleApplication(
new (rule) DottedRuleInMemory(child, *cellLabel, prevDottedRule);
#else
DottedRuleInMemory *rule = new DottedRuleInMemory(child, *cellLabel,
prevDottedRule);
prevDottedRule);
#endif
dottedRuleColl.Add(stackInd, rule);
}
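
The numCombinations <= numChildren*2 test above is a lookup-count heuristic: extending via the input's non-terminal labels costs roughly one trie lookup per source/target label pair, while extending via the trie node's children costs roughly two set lookups per child. A hedged sketch of that decision, with illustrative parameter names:

// Returns true if iterating the input's (source, target) non-terminal
// pairs needs fewer lookups than iterating the rule-trie node's children.
bool PreferInputLabels(size_t numSourceNT, size_t numTargetNT,
                       size_t numChildren)
{
  size_t numCombinations = numSourceNT * numTargetNT;
  return numCombinations <= numChildren * 2;
}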

View File

@ -30,7 +30,7 @@ namespace Moses
{
void ChartTranslationOption::CalcEstimateOfBestScore(
const ChartCellCollection &allChartCells)
const ChartCellCollection &allChartCells)
{
const TargetPhrase &targetPhrase = **(m_targetPhraseCollection.begin());
m_estimateOfBestScore = targetPhrase.GetFutureScore();

View File

@ -37,7 +37,7 @@ class ChartCellCollection;
// of translations and provides an estimate of the best score.
class ChartTranslationOption
{
public:
public:
ChartTranslationOption(const TargetPhraseCollection &targetPhraseColl,
const DottedRule &dottedRule,
const WordsRange &wordsRange,
@ -45,16 +45,17 @@ class ChartTranslationOption
: m_dottedRule(dottedRule)
, m_targetPhraseCollection(targetPhraseColl)
, m_wordsRange(wordsRange)
, m_estimateOfBestScore(0)
{
, m_estimateOfBestScore(0) {
CalcEstimateOfBestScore(allChartCells);
}
~ChartTranslationOption() {}
const DottedRule &GetDottedRule() const { return m_dottedRule; }
const DottedRule &GetDottedRule() const {
return m_dottedRule;
}
const TargetPhraseCollection &GetTargetPhraseCollection() const {
const TargetPhraseCollection &GetTargetPhraseCollection() const {
return m_targetPhraseCollection;
}
@ -65,9 +66,11 @@ class ChartTranslationOption
// return an estimate of the best score possible with this translation option.
// the estimate is the sum of the top target phrase's estimated score plus the
// scores of the best child hypotheses.
inline float GetEstimateOfBestScore() const { return m_estimateOfBestScore; }
inline float GetEstimateOfBestScore() const {
return m_estimateOfBestScore;
}
private:
private:
// not implemented
ChartTranslationOption &operator=(const ChartTranslationOption &);
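
The estimate described in the comment above is a simple sum; a sketch under that reading, with hypothetical arguments (the real code pulls the child scores from the chart cells covered by the dotted rule):

#include <vector>

// estimate = future score of the top target phrase
//          + best score of each covered chart cell (one per non-terminal)
float EstimateOfBestScore(float topPhraseFutureScore,
                          const std::vector<float> &bestChildScores)
{
  float estimate = topPhraseFutureScore;
  for (size_t i = 0; i < bestChildScores.size(); ++i)
    estimate += bestChildScores[i];
  return estimate;
}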

View File

@ -106,8 +106,8 @@ void ChartTranslationOptionCollection::ProcessUnknownWord(size_t startPos, size_
return;
}
if (startPos == 0 || startPos == m_source.GetSize() - 1)
{ // don't create unknown words for <S> or </S> tags. Otherwise they can be moved. Should only be translated by glue rules
if (startPos == 0 || startPos == m_source.GetSize() - 1) {
// don't create unknown words for <S> or </S> tags. Otherwise they can be moved. Should only be translated by glue rules
return;
}

View File

@ -74,9 +74,9 @@ protected:
public:
ChartTranslationOptionCollection(InputType const& source
, const TranslationSystem* system
, const ChartCellCollection &hypoStackColl
, const std::vector<ChartRuleLookupManager*> &ruleLookupManagers);
, const TranslationSystem* system
, const ChartCellCollection &hypoStackColl
, const std::vector<ChartRuleLookupManager*> &ruleLookupManagers);
virtual ~ChartTranslationOptionCollection();
void CreateTranslationOptionsForRange(size_t startPos
, size_t endPos);

View File

@ -66,12 +66,11 @@ void ChartTranslationOptionList::Add(const TargetPhraseCollection &targetPhraseC
if (m_collection.size() < ruleLimit) {
// not yet filled out quota. add everything
ChartTranslationOption *option = new ChartTranslationOption(
targetPhraseCollection, dottedRule, m_range, chartCellColl);
targetPhraseCollection, dottedRule, m_range, chartCellColl);
m_collection.push_back(option);
float score = option->GetEstimateOfBestScore();
m_scoreThreshold = (score < m_scoreThreshold) ? score : m_scoreThreshold;
}
else {
} else {
// full but not bursting. add if better than worst score
ChartTranslationOption option(targetPhraseCollection, dottedRule,
m_range, chartCellColl);
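
The Add logic above is a bounded-list pattern: accept everything until the rule limit is reached while tracking the worst score, then accept only options that beat that threshold. A simplified sketch of the same pattern (stand-in Option type, not the Moses classes; the real list also prunes once it overfills):

#include <vector>

struct Option { float score; };

void Add(std::vector<Option> &coll, const Option &opt,
         size_t ruleLimit, float &scoreThreshold)
{
  if (coll.size() < ruleLimit) {
    coll.push_back(opt);                   // under quota: take everything
    if (opt.score < scoreThreshold) scoreThreshold = opt.score;
  } else if (opt.score > scoreThreshold) {
    coll.push_back(opt);                   // over quota: must beat the worst
  }
}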

View File

@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ -27,15 +27,15 @@ namespace Moses
{
ChartTrellisDetour::ChartTrellisDetour(
boost::shared_ptr<const ChartTrellisPath> basePath,
const ChartTrellisNode &substitutedNode,
const ChartHypothesis &replacementHypo)
boost::shared_ptr<const ChartTrellisPath> basePath,
const ChartTrellisNode &substitutedNode,
const ChartHypothesis &replacementHypo)
: m_basePath(basePath)
, m_substitutedNode(substitutedNode)
, m_replacementHypo(replacementHypo)
{
float diff = replacementHypo.GetTotalScore()
- substitutedNode.GetHypothesis().GetTotalScore();
- substitutedNode.GetHypothesis().GetTotalScore();
m_totalScore = basePath->GetTotalScore() + diff;
}

View File

@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ -29,20 +29,24 @@ class ChartTrellisPath;
class ChartTrellisDetour
{
public:
public:
ChartTrellisDetour(boost::shared_ptr<const ChartTrellisPath>,
const ChartTrellisNode &, const ChartHypothesis &);
const ChartTrellisPath &GetBasePath() const { return *m_basePath; }
const ChartTrellisPath &GetBasePath() const {
return *m_basePath;
}
const ChartTrellisNode &GetSubstitutedNode() const {
return m_substitutedNode;
}
const ChartHypothesis &GetReplacementHypo() const {
return m_replacementHypo;
}
float GetTotalScore() const { return m_totalScore; }
float GetTotalScore() const {
return m_totalScore;
}
private:
private:
boost::shared_ptr<const ChartTrellisPath> m_basePath;
const ChartTrellisNode &m_substitutedNode;
const ChartHypothesis &m_replacementHypo;

View File

@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ -21,13 +21,16 @@
#include "Util.h"
namespace Moses {
namespace Moses
{
ChartTrellisDetourQueue::~ChartTrellisDetourQueue() {
ChartTrellisDetourQueue::~ChartTrellisDetourQueue()
{
RemoveAllInColl(m_queue);
}
void ChartTrellisDetourQueue::Push(const ChartTrellisDetour *detour) {
void ChartTrellisDetourQueue::Push(const ChartTrellisDetour *detour)
{
if (m_capacity == 0 || m_queue.size() < m_capacity) {
m_queue.insert(detour);
} else if (detour->GetTotalScore() > (*m_queue.rbegin())->GetTotalScore()) {
@ -43,7 +46,8 @@ void ChartTrellisDetourQueue::Push(const ChartTrellisDetour *detour) {
}
}
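
Push therefore implements a bounded best-first queue that owns its items: under capacity it always inserts, at capacity it replaces the worst item only when the newcomer scores higher, and rejected or displaced items are deleted. A self-contained sketch of the same contract with a stand-in Item type:

#include <set>

struct Item { float score; };
struct ByScoreDesc {
  bool operator()(const Item *a, const Item *b) const {
    return a->score > b->score;            // best item first
  }
};
typedef std::multiset<const Item*, ByScoreDesc> Queue;

void Push(Queue &q, size_t capacity, const Item *item)
{
  if (capacity == 0 || q.size() < capacity) {
    q.insert(item);                        // capacity 0 means unbounded
  } else if (item->score > (*q.rbegin())->score) {
    q.insert(item);
    Queue::iterator worst = q.end();
    --worst;                               // displaced worst item
    delete *worst;
    q.erase(worst);
  } else {
    delete item;                           // no better than the worst: drop
  }
}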
const ChartTrellisDetour *ChartTrellisDetourQueue::Pop() {
const ChartTrellisDetour *ChartTrellisDetourQueue::Pop()
{
QueueType::iterator p = m_queue.begin();
const ChartTrellisDetour *top = *p;
m_queue.erase(p);

View File

@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ -23,19 +23,23 @@
#include <set>
namespace Moses {
namespace Moses
{
// A bounded priority queue of ChartTrellisDetour pointers. The top item is
// the best scoring detour. The queue assumes ownership of pushed items and
// relinquishes ownership when they are popped. Any remaining items at the
// time of the queue's destruction are deleted.
class ChartTrellisDetourQueue {
public:
class ChartTrellisDetourQueue
{
public:
// Create empty queue with fixed capacity of c. Capacity 0 means unbounded.
ChartTrellisDetourQueue(size_t c) : m_capacity(c) {}
~ChartTrellisDetourQueue();
bool Empty() const { return m_queue.empty(); }
bool Empty() const {
return m_queue.empty();
}
// Add the detour to the queue or delete it if the queue is full and the
// score is no better than the queue's worst score.
@ -45,7 +49,7 @@ class ChartTrellisDetourQueue {
// caller is responsible for deleting the object.
const ChartTrellisDetour *Pop();
private:
private:
struct DetourOrderer {
bool operator()(const ChartTrellisDetour* a,
const ChartTrellisDetour* b) const {

View File

@ -31,16 +31,16 @@ namespace Moses
{
ChartTrellisNode::ChartTrellisNode(const ChartHypothesis &hypo)
: m_hypo(hypo)
: m_hypo(hypo)
{
CreateChildren();
}
ChartTrellisNode::ChartTrellisNode(const ChartTrellisDetour &detour,
ChartTrellisNode *&deviationPoint)
: m_hypo((&detour.GetBasePath().GetFinalNode() == &detour.GetSubstitutedNode())
? detour.GetReplacementHypo()
: detour.GetBasePath().GetFinalNode().GetHypothesis())
: m_hypo((&detour.GetBasePath().GetFinalNode() == &detour.GetSubstitutedNode())
? detour.GetReplacementHypo()
: detour.GetBasePath().GetFinalNode().GetHypothesis())
{
if (&m_hypo == &detour.GetReplacementHypo()) {
deviationPoint = this;
@ -56,9 +56,9 @@ ChartTrellisNode::ChartTrellisNode(const ChartTrellisNode &root,
const ChartTrellisNode &substitutedNode,
const ChartHypothesis &replacementHypo,
ChartTrellisNode *&deviationPoint)
: m_hypo((&root == &substitutedNode)
? replacementHypo
: root.GetHypothesis())
: m_hypo((&root == &substitutedNode)
? replacementHypo
: root.GetHypothesis())
{
if (&root == &substitutedNode) {
deviationPoint = this;
@ -124,8 +124,8 @@ void ChartTrellisNode::CreateChildren(const ChartTrellisNode &rootNode,
for (size_t ind = 0; ind < children.size(); ++ind) {
const ChartTrellisNode *origChild = children[ind];
ChartTrellisNode *child = new ChartTrellisNode(*origChild, substitutedNode,
replacementHypo,
deviationPoint);
replacementHypo,
deviationPoint);
m_children.push_back(child);
}
}

View File

@ -32,7 +32,7 @@ class ChartTrellisDetour;
class ChartTrellisNode
{
public:
public:
typedef std::vector<ChartTrellisNode*> NodeChildren;
ChartTrellisNode(const ChartHypothesis &hypo);
@ -40,15 +40,21 @@ class ChartTrellisNode
~ChartTrellisNode();
const ChartHypothesis &GetHypothesis() const { return m_hypo; }
const ChartHypothesis &GetHypothesis() const {
return m_hypo;
}
const NodeChildren &GetChildren() const { return m_children; }
const NodeChildren &GetChildren() const {
return m_children;
}
const ChartTrellisNode &GetChild(size_t i) const { return *m_children[i]; }
const ChartTrellisNode &GetChild(size_t i) const {
return *m_children[i];
}
Phrase GetOutputPhrase() const;
private:
private:
ChartTrellisNode(const ChartTrellisNode &); // Not implemented
ChartTrellisNode& operator=(const ChartTrellisNode &); // Not implemented

View File

@ -30,17 +30,17 @@ namespace Moses
{
ChartTrellisPath::ChartTrellisPath(const ChartHypothesis &hypo)
: m_finalNode(new ChartTrellisNode(hypo))
, m_deviationPoint(NULL)
, m_scoreBreakdown(hypo.GetScoreBreakdown())
, m_totalScore(hypo.GetTotalScore())
: m_finalNode(new ChartTrellisNode(hypo))
, m_deviationPoint(NULL)
, m_scoreBreakdown(hypo.GetScoreBreakdown())
, m_totalScore(hypo.GetTotalScore())
{
}
ChartTrellisPath::ChartTrellisPath(const ChartTrellisDetour &detour)
: m_finalNode(new ChartTrellisNode(detour, m_deviationPoint))
, m_scoreBreakdown(detour.GetBasePath().m_scoreBreakdown)
, m_totalScore(0)
: m_finalNode(new ChartTrellisNode(detour, m_deviationPoint))
, m_scoreBreakdown(detour.GetBasePath().m_scoreBreakdown)
, m_totalScore(0)
{
CHECK(m_deviationPoint);
ScoreComponentCollection scoreChange;

View File

@ -36,18 +36,24 @@ class ChartTrellisNode;
class ChartTrellisPath
{
public:
public:
ChartTrellisPath(const ChartHypothesis &hypo);
ChartTrellisPath(const ChartTrellisDetour &detour);
~ChartTrellisPath();
const ChartTrellisNode &GetFinalNode() const { return *m_finalNode; }
const ChartTrellisNode &GetFinalNode() const {
return *m_finalNode;
}
const ChartTrellisNode *GetDeviationPoint() const { return m_deviationPoint; }
const ChartTrellisNode *GetDeviationPoint() const {
return m_deviationPoint;
}
//! get score for this path through the trellis
float GetTotalScore() const { return m_totalScore; }
float GetTotalScore() const {
return m_totalScore;
}
Phrase GetOutputPhrase() const;
@ -56,7 +62,7 @@ class ChartTrellisPath
return m_scoreBreakdown;
}
private:
private:
ChartTrellisPath(const ChartTrellisPath &); // Not implemented
ChartTrellisPath &operator=(const ChartTrellisPath &); // Not implemented

View File

@ -32,26 +32,38 @@ class DottedRule
{
friend std::ostream& operator<<(std::ostream &, const DottedRule &);
public:
public:
// used only to init dot stack.
DottedRule()
: m_cellLabel(NULL)
, m_prev(NULL) {}
: m_cellLabel(NULL)
, m_prev(NULL) {}
DottedRule(const ChartCellLabel &ccl, const DottedRule &prev)
: m_cellLabel(&ccl)
, m_prev(&prev) {}
: m_cellLabel(&ccl)
, m_prev(&prev) {}
const WordsRange &GetWordsRange() const { return m_cellLabel->GetCoverage(); }
const Word &GetSourceWord() const { return m_cellLabel->GetLabel(); }
bool IsNonTerminal() const { return m_cellLabel->GetLabel().IsNonTerminal(); }
const DottedRule *GetPrev() const { return m_prev; }
bool IsRoot() const { return m_prev == NULL; }
const ChartCellLabel &GetChartCellLabel() const { return *m_cellLabel; }
const WordsRange &GetWordsRange() const {
return m_cellLabel->GetCoverage();
}
const Word &GetSourceWord() const {
return m_cellLabel->GetLabel();
}
bool IsNonTerminal() const {
return m_cellLabel->GetLabel().IsNonTerminal();
}
const DottedRule *GetPrev() const {
return m_prev;
}
bool IsRoot() const {
return m_prev == NULL;
}
const ChartCellLabel &GetChartCellLabel() const {
return *m_cellLabel;
}
private:
private:
const ChartCellLabel *m_cellLabel; // usually contains something, unless
// it's the init processed rule
// it's the init processed rule
const DottedRule *m_prev;
};
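
Because each DottedRule only stores a pointer to its predecessor, consumers recover a rule's history by walking GetPrev() back to the root marker; a small sketch assuming the accessors above:

// Number of symbols consumed so far by a partially-applied rule.
size_t NumSymbolsCovered(const DottedRule &rule)
{
  size_t count = 0;
  for (const DottedRule *p = &rule; !p->IsRoot(); p = p->GetPrev())
    ++count;
  return count;
}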

View File

@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

View File

@ -34,21 +34,23 @@ namespace Moses
class DottedRuleInMemory : public DottedRule
{
public:
public:
// used only to init dot stack.
explicit DottedRuleInMemory(const PhraseDictionaryNodeSCFG &node)
: DottedRule()
, m_node(node) {}
: DottedRule()
, m_node(node) {}
DottedRuleInMemory(const PhraseDictionaryNodeSCFG &node,
const ChartCellLabel &cellLabel,
const DottedRuleInMemory &prev)
: DottedRule(cellLabel, prev)
, m_node(node) {}
const PhraseDictionaryNodeSCFG &GetLastNode() const { return m_node; }
: DottedRule(cellLabel, prev)
, m_node(node) {}
private:
const PhraseDictionaryNodeSCFG &GetLastNode() const {
return m_node;
}
private:
const PhraseDictionaryNodeSCFG &m_node;
};

View File

@ -34,26 +34,32 @@ namespace Moses
{
class DottedRuleOnDisk : public DottedRule
{
public:
public:
// used only to init dot stack.
explicit DottedRuleOnDisk(const OnDiskPt::PhraseNode &lastNode)
: DottedRule()
, m_lastNode(lastNode)
, m_done(false) {}
: DottedRule()
, m_lastNode(lastNode)
, m_done(false) {}
DottedRuleOnDisk(const OnDiskPt::PhraseNode &lastNode,
const ChartCellLabel &cellLabel,
const DottedRuleOnDisk &prev)
: DottedRule(cellLabel, prev)
, m_lastNode(lastNode)
, m_done(false) {}
: DottedRule(cellLabel, prev)
, m_lastNode(lastNode)
, m_done(false) {}
const OnDiskPt::PhraseNode &GetLastNode() const { return m_lastNode; }
const OnDiskPt::PhraseNode &GetLastNode() const {
return m_lastNode;
}
bool Done() const { return m_done; }
void Done(bool value) const { m_done = value; }
bool Done() const {
return m_done;
}
void Done(bool value) const {
m_done = value;
}
private:
private:
const OnDiskPt::PhraseNode &m_lastNode;
mutable bool m_done;
};

View File

@ -36,9 +36,9 @@ public:
const ChartHypothesis&,
int /* featureID */,
ScoreComponentCollection*) const {
CHECK(0); // feature function not valid in chart decoder
return NULL;
}
CHECK(0); // feature function not valid in chart decoder
return NULL;
}
};
/** Doesn't do anything but provide a key into the global

View File

@ -22,176 +22,179 @@
#include <ctime>
#include <iostream>
namespace randlm {
template<typename T>
class CacheNode {
public:
typedef std::map<wordID_t, CacheNode<T>* > childMap;
// initialise value to 'unknown' (i.e. not yet queried or cached).
CacheNode(T unknown_value) : value_(unknown_value) {}
childMap childs_; // child pointers
T value_; // value stored
const void* state_; // state pointer
};
template<typename T>
class Cache {
public:
typedef typename std::map<wordID_t, CacheNode<T>* >::iterator childPtr;
// unknown_value is used to indicate the ngram was not queried (yet)
// null_value_ indicates it was queried but not found in model
// space usage is handled by client.
Cache(T unknown_value, T null_value) :
cur_nodes_(0), unknown_value_(unknown_value), null_value_(null_value) {
root_ = newNode();
}
~Cache() {
if(clear()) {
delete root_;
root_ = NULL;
} else {
std::cerr << "Error freeing cache memory.\n";
}
}
bool setCacheNgram(const wordID_t* ngram, int len, T value, const void* state) {
// inserts full ngram into cache
CacheNode<T>* node = root_;
for (int i = len - 1; i > -1; --i) {
childPtr child = node->childs_.find(ngram[i]);
if( child != node->childs_.end() ) {
// current node is already prefix. Go to child node
node = node->childs_[ngram[i]];
} else {
// no child for prefix. set new child link in current node
CacheNode<T> * newChild = newNode(node);
node->childs_[ngram[i]] = newChild;
// go to new node
node = newChild;
}
}
node->value_ = value;
node->state_ = state;
return true;
}
bool checkCacheNgram(const wordID_t* ngram, int len, T* value, const void** state) {
// finds value for this full ngram only (returns false if full ngram not in cache)
CacheNode<T> * node = root_;
for(int i = len - 1; i > -1; --i) {
// go to deepest level node of ngram in cache
childPtr child = node->childs_.find(ngram[i]);
if( child != node->childs_.end() ) {
// switch to child node
node = node->childs_[ngram[i]];
} else {
// not cached
return false;
}
}
*value = node->value_;
if(state) *state = node->state_;
return *value != null_value_ && *value != unknown_value_;
}
int getCache2(const wordID_t* ngram, int len, T** values, int* found) {
// set values array to point to cache value nodes
CacheNode<T> * node = root_;
*found = 0;
//values[0] = &node->value_; // pointer to root node's value
bool all_found = true;
for(int i = len - 1; i > -1; --i) {
// go to deepest level node of ngram in cache
childPtr child = node->childs_.find(ngram[i]);
if( child != node->childs_.end() ) {
// switch to child node
node = node->childs_[ngram[i]];
// get pointer to value (index by length - 1)
values[i] = &node->value_;
// if null_value then assume all extensions impossible
if (node->value_ == null_value_) {
return len - 1 - i; // max length possible
}
all_found = all_found && (node->value_ != unknown_value_);
if (all_found)
++(*found);
} else {
// initialise uncached values
CacheNode<T> * newChild = newNode(node);
node->childs_[ngram[i]] = newChild;
// go to new node
node = newChild;
values[i] = &node->value_;
}
}
return len; // all possible
}
int getCache(const wordID_t* ngram, int len, T** values, int* found) {
// get pointers to values for ngram and constituents.
// returns upper bound on longest subngram in model.
// 'found' stores longest non-null and known value found.
CacheNode<T> * node = root_;
*found = 0;
values[0] = &node->value_; // pointer to root node's value
bool all_found = true;
for(int i = len - 1; i > -1; --i) {
// go to deepest level node of ngram in cache
childPtr child = node->childs_.find(ngram[i]);
if( child != node->childs_.end() ) {
// switch to child node
node = node->childs_[ngram[i]];
// get pointer to value (index by length - 1)
values[len - i] = &node->value_;
// if null_value then assume all extensions impossible
if (node->value_ == null_value_)
return len - 1 - i; // max length possible
all_found = all_found && (node->value_ != unknown_value_);
if (all_found)
++(*found);
} else {
// initialise uncached values
CacheNode<T> * newChild = newNode(node);
node->childs_[ngram[i]] = newChild;
// go to new node
node = newChild;
values[len - i] = &node->value_;
}
}
return len; // all possible
}
bool clear() {
std::cerr << "Clearing cache with " << static_cast<float>(cur_nodes_ * nodeSize())
/ static_cast<float>(1ull << 20) << "MB" << std::endl;
return clearNodes(root_);
}
int nodes() {
// returns number of nodes
return cur_nodes_;
}
int nodeSize() {
return sizeof(CacheNode<T>) + sizeof(root_->childs_);
}
private:
CacheNode<T> * root_;
count_t cur_nodes_;
T unknown_value_; // Used to initialise data at each node
T null_value_; // Indicates cached something not in model
CacheNode<T>* newNode(CacheNode<T> * node = 0) {
++cur_nodes_;
return new CacheNode<T>(unknown_value_);
}
bool clearNodes(CacheNode<T> * node) {
//delete children from this node
if(!node->childs_.empty()) {
iterate(node->childs_, itr) {
if(!clearNodes(itr->second))
std::cerr << "Error emptying cache\n";
delete itr->second;
--cur_nodes_;
}
node->childs_.clear();
}
return true;
}
namespace randlm
{
};
template<typename T>
class CacheNode
{
public:
typedef std::map<wordID_t, CacheNode<T>* > childMap;
// initialise value to 'unknown' (i.e. not yet queried or cached).
CacheNode(T unknown_value) : value_(unknown_value) {}
childMap childs_; // child pointers
T value_; // value stored
const void* state_; // state pointer
};
template<typename T>
class Cache
{
public:
typedef typename std::map<wordID_t, CacheNode<T>* >::iterator childPtr;
// unknown_value is used to indicate the ngram was not queried (yet)
// null_value_ indicates it was queried but not found in model
// space usage is handled by client.
Cache(T unknown_value, T null_value) :
cur_nodes_(0), unknown_value_(unknown_value), null_value_(null_value) {
root_ = newNode();
}
~Cache() {
if(clear()) {
delete root_;
root_ = NULL;
} else {
std::cerr << "Error freeing cache memory.\n";
}
}
bool setCacheNgram(const wordID_t* ngram, int len, T value, const void* state) {
// inserts full ngram into cache
CacheNode<T>* node = root_;
for (int i = len - 1; i > -1; --i) {
childPtr child = node->childs_.find(ngram[i]);
if( child != node->childs_.end() ) {
// current node is already prefix. Go to child node
node = node->childs_[ngram[i]];
} else {
// no child for prefix. set new child link in current node
CacheNode<T> * newChild = newNode(node);
node->childs_[ngram[i]] = newChild;
// go to new node
node = newChild;
}
}
node->value_ = value;
node->state_ = state;
return true;
}
bool checkCacheNgram(const wordID_t* ngram, int len, T* value, const void** state) {
// finds value for this full ngram only (returns false if full ngram not in cache)
CacheNode<T> * node = root_;
for(int i = len - 1; i > -1; --i) {
// go to deepest level node of ngram in cache
childPtr child = node->childs_.find(ngram[i]);
if( child != node->childs_.end() ) {
// switch to child node
node = node->childs_[ngram[i]];
} else {
// not cached
return false;
}
}
*value = node->value_;
if(state) *state = node->state_;
return *value != null_value_ && *value != unknown_value_;
}
int getCache2(const wordID_t* ngram, int len, T** values, int* found) {
// set values array to point to cache value nodes
CacheNode<T> * node = root_;
*found = 0;
//values[0] = &node->value_; // pointer to root node's value
bool all_found = true;
for(int i = len - 1; i > -1; --i) {
// go to deepest level node of ngram in cache
childPtr child = node->childs_.find(ngram[i]);
if( child != node->childs_.end() ) {
// switch to child node
node = node->childs_[ngram[i]];
// get pointer to value (index by length - 1)
values[i] = &node->value_;
// if null_value then assume all extensions impossible
if (node->value_ == null_value_) {
return len - 1 - i; // max length possible
}
all_found = all_found && (node->value_ != unknown_value_);
if (all_found)
++(*found);
} else {
// initialise uncached values
CacheNode<T> * newChild = newNode(node);
node->childs_[ngram[i]] = newChild;
// go to new node
node = newChild;
values[i] = &node->value_;
}
}
return len; // all possible
}
int getCache(const wordID_t* ngram, int len, T** values, int* found) {
// get pointers to values for ngram and constituents.
// returns upper bound on longest subngram in model.
// 'found' stores longest non-null and known value found.
CacheNode<T> * node = root_;
*found = 0;
values[0] = &node->value_; // pointer to root node's value
bool all_found = true;
for(int i = len - 1; i > -1; --i) {
// go to deepest level node of ngram in cache
childPtr child = node->childs_.find(ngram[i]);
if( child != node->childs_.end() ) {
// switch to child node
node = node->childs_[ngram[i]];
// get pointer to value (index by length - 1)
values[len - i] = &node->value_;
// if null_value then assume all extensions impossible
if (node->value_ == null_value_)
return len - 1 - i; // max length possible
all_found = all_found && (node->value_ != unknown_value_);
if (all_found)
++(*found);
} else {
// initialise uncached values
CacheNode<T> * newChild = newNode(node);
node->childs_[ngram[i]] = newChild;
// go to new node
node = newChild;
values[len - i] = &node->value_;
}
}
return len; // all possible
}
bool clear() {
std::cerr << "Clearing cache with " << static_cast<float>(cur_nodes_ * nodeSize())
/ static_cast<float>(1ull << 20) << "MB" << std::endl;
return clearNodes(root_);
}
int nodes() {
// returns number of nodes
return cur_nodes_;
}
int nodeSize() {
return sizeof(CacheNode<T>) + sizeof(root_->childs_);
}
private:
CacheNode<T> * root_;
count_t cur_nodes_;
T unknown_value_; // Used to initialise data at each node
T null_value_; // Indicates cached something not in model
CacheNode<T>* newNode(CacheNode<T> * node = 0) {
++cur_nodes_;
return new CacheNode<T>(unknown_value_);
}
bool clearNodes(CacheNode<T> * node) {
//delete children from this node
if(!node->childs_.empty()) {
iterate(node->childs_, itr) {
if(!clearNodes(itr->second))
std::cerr << "Error emptying cache\n";
delete itr->second;
--cur_nodes_;
}
node->childs_.clear();
}
return true;
}
};
} //end namespace
#endif //INC_RANDLM_CACHE_H
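
A usage sketch of the three-state protocol above, assuming the cache is constructed as Cache<float>(-1.0f, -2.0f) so that -1 means 'never queried' and -2 means 'queried but absent'; the model lookup is a placeholder:

float LookupWithCache(randlm::Cache<float> &cache,
                      const wordID_t *ngram, int len)
{
  float score = 0.0f;
  const void *state = NULL;
  if (cache.checkCacheNgram(ngram, len, &score, &state))
    return score;                          // cached with a known value
  score = 0.5f;                            // placeholder for a real model query
  cache.setCacheNgram(ngram, len, score, NULL);
  return score;
}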

View File

@ -20,295 +20,306 @@
#include <cmath>
#include "file.h"
namespace randlm {
// Class Filter wraps a contiguous array of data. Filter and its subclasses
// implement read/write/increment functionality on arrays with arbitrary sized addresses
// (i.e. an address may not use a full number of bytes). When converting to byte-based
// representation we assume "unused" bits are to left.
// E.g. if the underlying data is stored in units T = uint16 and the 'width' = 11
// to read 'address' = 3 we extract bits at indices [33,43] (i.e. [11*3, 11*4 - 1])
// and store in a uint16 in positions 0000 0111 1111 1111 where the first 5 bits have
// been masked out.
template<typename T>
class Filter {
public:
Filter(uint64_t addresses, int width) : addresses_(addresses), width_(width), data_(NULL) {
// number of bits in T
cell_width_ = sizeof(T) << 3;
// current implementation has following constraints
CHECK(cell_width_ > 0 && cell_width_ <= 64 && cell_width_ >= width);
// used for >> division
log_cell_width_ = static_cast<int>(floor(log(cell_width_)/log(2) + 0.000001));
// size of underlying data in Ts
cells_ = ((addresses * width) + cell_width_ - 1) >> log_cell_width_;
// instantiate underlying data
data_ = new T[cells_];
CHECK(data_ != NULL);
CHECK(reset());
// 'first_bit' marks the first bit used by 'address' (left padded with zeros).
first_bit_ = (width % cell_width_ == 0) ? 0 : cell_width_ - (width % cell_width_);
// mask for full cell
full_mask_ = static_cast<T>(0xffffffffffffffffull);
// mask for bits that make up the address
address_mask_ = full_mask_ >> first_bit_;
}
Filter(FileHandler* fin, bool loaddata = true) : data_(NULL) {
CHECK(loadHeader(fin));
if (loaddata)
CHECK(loadData(fin));
}
virtual ~Filter() {
delete[] data_;
}
bool reset() {
for (uint64_t i = 0; i < cells_; ++i)
data_[i] = 0;
return true;
}
count_t size() {
// return approx size of filter in MBs
return cells_ * sizeof(T) >> 20;
}
// read / write functions
inline bool read(uint64_t address, T* value) {
CHECK(address <= addresses_);
// copy address to 'value'
uint64_t data_bit = address * width_;
uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
// 'offset' shows how address in 'data' and 'value' align
int offset = (data_bit % cell_width_) - first_bit_;
// they align so just copy across masking unneeded leading bits
if (offset == 0) {
*value = data_[data_cell] & address_mask_;
return true;
}
// data address starts to left so shift it right
if (offset < 0) {
*value = (data_[data_cell] >> -offset) & address_mask_;
return true;
}
// data address is to right so shift it left and look at one more cell to right
*value = ((data_[data_cell] << offset)
| (data_[data_cell + 1] >> (cell_width_ - offset))) & address_mask_ ;
return true;
}
inline T read(uint64_t address) {
CHECK(address <= addresses_);
// return value at address
T value = 0;
uint64_t data_bit = address * width_;
uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
// 'offset' shows how address in 'data' and 'value' align
int offset = (data_bit % cell_width_) - first_bit_;
// they align so just copy across masking unneeded leading bits
if (offset == 0) {
value = data_[data_cell] & address_mask_;
}
// data address starts to left so shift it right
else if (offset < 0) {
value = (data_[data_cell] >> -offset) & address_mask_;
}
// data address is to right so shift it left and look at one more cell to right
else
value = ((data_[data_cell] << offset)
| (data_[data_cell + 1] >> (cell_width_ - offset))) & address_mask_ ;
return value;
}
inline bool write(uint64_t address, T value) {
CHECK(address <= addresses_);
CHECK(log2(value) <= width_);
// write 'value' to address
uint64_t data_bit = address * width_;
uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
// 'offset' shows how address in 'data' and 'value' align
int offset = (data_bit % cell_width_) - first_bit_;
// they align so just copy across masking unneeded leading zeros of value
if (offset == 0) {
data_[data_cell] = value | (data_[data_cell] & ~address_mask_);
return true;
}
// address in data is to left so shift value left by -offset
if (offset < 0) {
data_[data_cell] = (value << -offset)
| (data_[data_cell] & ~(address_mask_ << -offset));
return true;
}
// address in data is to right so shift value right by offset
data_[data_cell] = (value >> offset) |
(data_[data_cell] & ~(address_mask_ >> offset));
data_[data_cell + 1] = (value << (cell_width_ - offset)) |
(data_[data_cell + 1] & (full_mask_ >> offset));
return true;
}
inline bool readWithFingerprint(uint64_t address, T finger, T* value) {
// copy 'address' ^ 'finger' to 'value'
uint64_t data_bit = address * width_;
uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
// 'offset' shows how address in 'data' and 'value' align
int offset = (data_bit % cell_width_) - first_bit_;
// they align so just copy across masking unneeded leading bits
if (offset == 0) {
*value = (finger ^ data_[data_cell]) & address_mask_;
return true;
}
// data address starts to left so shift it right
if (offset < 0) {
*value = ((data_[data_cell] >> -offset) ^ finger) & address_mask_;
return true;
}
// data address is to right so shift it left and look at one more cell to right
*value = (((data_[data_cell] << offset)
| (data_[data_cell + 1] >> (cell_width_ - offset))) ^ finger)
& address_mask_ ;
return true;
}
inline bool writeWithFingerprint(uint64_t address, T finger, T value) {
// write 'value' ^ 'finger' to address
finger &= address_mask_; // make sure fingerprint is correct size
uint64_t data_bit = address * width_;
uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
// 'offset' shows how address in 'data' and 'value' align
int offset = (data_bit % cell_width_) - first_bit_;
// they align so just copy across masking unneeded leading zeros of value
if (offset == 0) {
data_[data_cell] = (finger ^ value) | (data_[data_cell] & ~address_mask_);
return true;
}
// address in data is to left so shift value left by -offset
if (offset < 0) {
data_[data_cell] = ((finger ^ value) << -offset)
| (data_[data_cell] & ~(address_mask_ << -offset));
return true;
}
// address in data is to right so shift value right by offset
data_[data_cell] = ((finger ^ value) >> offset) |
(data_[data_cell] & ~(address_mask_ >> offset));
data_[data_cell + 1] = ((finger ^ value) << (cell_width_ - offset)) |
(data_[data_cell + 1] & (full_mask_ >> offset));
return true;
}
// debugging
void printFilter(const std::string & prefix = "", uint32_t truncate = 64){
std::cout << prefix;
for (uint32_t i = 0; i < cells_ && i < truncate; ++i) {
for (int j = cell_width_ - 1; j >= 0; --j)
if (data_[i] & (1ull << j))
std::cout << 1;
else
std::cout << 0;
std::cout << "\n";
}
std::cout << std::endl;
}
// i/o
uint64_t getAddresses() { return addresses_; }
int getWidth() { return width_; }
int getCellWidth() { return cell_width_; }
uint32_t getCells() { return cells_; }
virtual bool save(FileHandler* out) {
CHECK(out != NULL);
CHECK(out->write((char*)&cells_, sizeof(cells_)));
CHECK(out->write((char*)&cell_width_, sizeof(cell_width_)));
CHECK(out->write((char*)&log_cell_width_, sizeof(log_cell_width_)));
CHECK(out->write((char*)&addresses_, sizeof(addresses_)));
CHECK(out->write((char*)&width_, sizeof(width_)));
CHECK(out->write((char*)&first_bit_, sizeof(first_bit_)));
CHECK(out->write((char*)&full_mask_, sizeof(full_mask_)));
CHECK(out->write((char*)&address_mask_, sizeof(address_mask_)));
//CHECK(out->write((char*)data_, cells_ * sizeof(T)));
const uint64_t jump = 524288032ul; //(uint64_t)pow(2, 29);
if((width_ == 1) || cells_ < jump)
CHECK(out->write((char*)data_, cells_ * sizeof(T)));
else {
uint64_t idx(0);
while(idx + jump < cells_) {
CHECK(out->write((char*)&data_[idx], jump * sizeof(T)));
idx += jump;
}
CHECK(out->write((char*)&data_[idx], (cells_ - idx) * sizeof(T)));
}
return true;
}
protected:
bool loadHeader(FileHandler* fin) {
CHECK(fin != NULL);
CHECK(fin->read((char*)&cells_, sizeof(cells_)));
CHECK(fin->read((char*)&cell_width_, sizeof(cell_width_)));
CHECK(cell_width_ == sizeof(T) << 3); // make sure correct underlying data type
CHECK(fin->read((char*)&log_cell_width_, sizeof(log_cell_width_)));
CHECK(fin->read((char*)&addresses_, sizeof(addresses_)));
CHECK(fin->read((char*)&width_, sizeof(width_)));
CHECK(fin->read((char*)&first_bit_, sizeof(first_bit_)));
CHECK(fin->read((char*)&full_mask_, sizeof(full_mask_)));
CHECK(fin->read((char*)&address_mask_, sizeof(address_mask_)));
return true;
}
bool loadData(FileHandler* fin) {
// instantiate underlying array
data_ = new T[cells_];
CHECK(data_ != NULL);
CHECK(fin->read((char*)data_, cells_ * sizeof(T)));
//CHECK(fin->read((char*)&data_[0], ceil(float(cells_) / 2.0) * sizeof(T)));
//CHECK(fin->read((char*)&data_[cells_ / 2], (cells_ / 2) * sizeof(T)));
return true;
}
uint64_t cells_; // number of T cells making up 'data_'
int cell_width_; // bits per cell (i.e. sizeof(T) << 3)
int log_cell_width_; // log of bits used for >> division
uint64_t addresses_; // number of addresses in the filter
int width_; // width in bits of each address
int first_bit_; // position of first bit in initial byte
T full_mask_; // all 1s
T address_mask_; // 1s in those positions that are part of address
T* data_; // the raw data as bytes
};
namespace randlm
{
// Extension with bit test/setter methods added
class BitFilter : public Filter<uint8_t> {
public:
BitFilter(uint64_t bits) : Filter<uint8_t>(bits, 1) {}
BitFilter(FileHandler* fin, bool loaddata = true)
: Filter<uint8_t>(fin, loaddata) {
if (loaddata)
CHECK(load(fin));
}
// TODO: overload operator[]
virtual bool testBit(uint64_t location) {
// test bit referenced by location
return data_[(location % addresses_) >> 3] & 1 << ((location % addresses_) % 8);
}
virtual bool setBit(uint64_t location) {
// set bit referenced by location
data_[(location % addresses_) >> 3] |= 1 << ((location % addresses_) % 8);
// Class Filter wraps a contiguous array of data. Filter and its subclasses
// implement read/write/increment functionality on arrays with arbitrary sized addresses
// (i.e. an address may not use a full number of bytes). When converting to byte-based
// representation we assume "unused" bits are to left.
// E.g. if the underlying data is stored in units T = uint16 and the 'width' = 11
// to read 'address' = 3 we extract bits at indices [33,43] (i.e. [11*3, 11*4 - 1])
// and store in a uint16 in positions 0000 0111 1111 1111 where the first 5 bits have
// been masked out.
template<typename T>
class Filter
{
public:
Filter(uint64_t addresses, int width) : addresses_(addresses), width_(width), data_(NULL) {
// number of bits in T
cell_width_ = sizeof(T) << 3;
// current implementation has following constraints
CHECK(cell_width_ > 0 && cell_width_ <= 64 && cell_width_ >= width);
// used for >> division
log_cell_width_ = static_cast<int>(floor(log(cell_width_)/log(2) + 0.000001));
// size of underlying data in Ts
cells_ = ((addresses * width) + cell_width_ - 1) >> log_cell_width_;
// instantiate underlying data
data_ = new T[cells_];
CHECK(data_ != NULL);
CHECK(reset());
// 'first_bit' marks the first bit used by 'address' (left padded with zeros).
first_bit_ = (width % cell_width_ == 0) ? 0 : cell_width_ - (width % cell_width_);
// mask for full cell
full_mask_ = static_cast<T>(0xffffffffffffffffull);
// mask for bits that make up the address
address_mask_ = full_mask_ >> first_bit_;
}
Filter(FileHandler* fin, bool loaddata = true) : data_(NULL) {
CHECK(loadHeader(fin));
if (loaddata)
CHECK(loadData(fin));
}
virtual ~Filter() {
delete[] data_;
}
bool reset() {
for (uint64_t i = 0; i < cells_; ++i)
data_[i] = 0;
return true;
}
count_t size() {
// return approx size of filter in MBs
return cells_ * sizeof(T) >> 20;
}
// read / write functions
inline bool read(uint64_t address, T* value) {
CHECK(address <= addresses_);
// copy address to 'value'
uint64_t data_bit = address * width_;
uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
// 'offset' shows how address in 'data' and 'value' align
int offset = (data_bit % cell_width_) - first_bit_;
// they align so just copy across masking unneeded leading bits
if (offset == 0) {
*value = data_[data_cell] & address_mask_;
return true;
}
virtual bool clearBit(uint64_t location) {
// clear bit referenced by location
data_[(location % addresses_) >> 3] &= ~(1 << ((location % addresses_) % 8));
// data address starts to left so shift it right
if (offset < 0) {
*value = (data_[data_cell] >> -offset) & address_mask_;
return true;
}
bool save(FileHandler* fout) {
CHECK(Filter<uint8_t>::save(fout));
std::cerr << "Saved BitFilter. Rho = " << rho() << "." << std::endl;;
// data address is to right so shift it left and look at one more cell to right
*value = ((data_[data_cell] << offset)
| (data_[data_cell + 1] >> (cell_width_ - offset))) & address_mask_ ;
return true;
}
inline T read(uint64_t address) {
CHECK(address <= addresses_);
// return value at address
T value = 0;
uint64_t data_bit = address * width_;
uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
// 'offset' shows how address in 'data' and 'value' align
int offset = (data_bit % cell_width_) - first_bit_;
// they align so just copy across masking unneeded leading bits
if (offset == 0) {
value = data_[data_cell] & address_mask_;
}
// data address starts to left so shift it right
else if (offset < 0) {
value = (data_[data_cell] >> -offset) & address_mask_;
}
// data address is to right so shift it left and look at one more cell to right
else
value = ((data_[data_cell] << offset)
| (data_[data_cell + 1] >> (cell_width_ - offset))) & address_mask_ ;
return value;
}
inline bool write(uint64_t address, T value) {
CHECK(address <= addresses_);
CHECK(log2(value) <= width_);
// write 'value' to address
uint64_t data_bit = address * width_;
uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
// 'offset' shows how address in 'data' and 'value' align
int offset = (data_bit % cell_width_) - first_bit_;
// they align so just copy across masking unneeded leading zeros of value
if (offset == 0) {
data_[data_cell] = value | (data_[data_cell] & ~address_mask_);
return true;
}
float rho(uint64_t limit = 0) {
uint64_t ones = 0;
uint64_t range = limit > 0 ? std::min(limit,cells_) : cells_;
for (uint64_t i = 0; i < range; ++i)
for (int j = 0; j < 8; ++j)
if (data_[i] & (1 << j))
++ones;
return static_cast<float>((range << 3) - ones)/static_cast<float>(range << 3);
}
protected:
bool load(FileHandler* fin) {
std::cerr << "Loaded BitFilter. Rho = " << rho() << "." << std::endl;;
// address in data is to left so shift value left by -offset
if (offset < 0) {
data_[data_cell] = (value << -offset)
| (data_[data_cell] & ~(address_mask_ << -offset));
return true;
}
};
/*
// address in data is to right so shift value right by offset
data_[data_cell] = (value >> offset) |
(data_[data_cell] & ~(address_mask_ >> offset));
data_[data_cell + 1] = (value << (cell_width_ - offset)) |
(data_[data_cell + 1] & (full_mask_ >> offset));
return true;
}
inline bool readWithFingerprint(uint64_t address, T finger, T* value) {
// copy 'address' ^ 'finger' to 'value'
uint64_t data_bit = address * width_;
uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
// 'offset' shows how address in 'data' and 'value' align
int offset = (data_bit % cell_width_) - first_bit_;
// they align so just copy across masking unneeded leading bits
if (offset == 0) {
*value = (finger ^ data_[data_cell]) & address_mask_;
return true;
}
// data address starts to left so shift it right
if (offset < 0) {
*value = ((data_[data_cell] >> -offset) ^ finger) & address_mask_;
return true;
}
// data address is to right so shift it left and look at one more cell to right
*value = (((data_[data_cell] << offset)
| (data_[data_cell + 1] >> (cell_width_ - offset))) ^ finger)
& address_mask_ ;
return true;
}
inline bool writeWithFingerprint(uint64_t address, T finger, T value) {
// write 'value' ^ 'finger' to address
finger &= address_mask_; // make sure fingerprint is correct size
uint64_t data_bit = address * width_;
uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
// 'offset' shows how address in 'data' and 'value' align
int offset = (data_bit % cell_width_) - first_bit_;
// they align so just copy across masking unneeded leading zeros of value
if (offset == 0) {
data_[data_cell] = (finger ^ value) | (data_[data_cell] & ~address_mask_);
return true;
}
// address in data is to left so shift value left by -offset
if (offset < 0) {
data_[data_cell] = ((finger ^ value) << -offset)
| (data_[data_cell] & ~(address_mask_ << -offset));
return true;
}
// address in data is to right so shift value right by offset
data_[data_cell] = ((finger ^ value) >> offset) |
(data_[data_cell] & ~(address_mask_ >> offset));
data_[data_cell + 1] = ((finger ^ value) << (cell_width_ - offset)) |
(data_[data_cell + 1] & (full_mask_ >> offset));
return true;
}
// debugging
void printFilter(const std::string & prefix = "", uint32_t truncate = 64) {
std::cout << prefix;
for (uint32_t i = 0; i < cells_ && i < truncate; ++i) {
for (int j = cell_width_ - 1; j >= 0; --j)
if (data_[i] & (1ull << j))
std::cout << 1;
else
std::cout << 0;
std::cout << "\n";
}
std::cout << std::endl;
}
// i/o
uint64_t getAddresses() {
return addresses_;
}
int getWidth() {
return width_;
}
int getCellWidth() {
return cell_width_;
}
uint32_t getCells() {
return cells_;
}
virtual bool save(FileHandler* out) {
CHECK(out != NULL);
CHECK(out->write((char*)&cells_, sizeof(cells_)));
CHECK(out->write((char*)&cell_width_, sizeof(cell_width_)));
CHECK(out->write((char*)&log_cell_width_, sizeof(log_cell_width_)));
CHECK(out->write((char*)&addresses_, sizeof(addresses_)));
CHECK(out->write((char*)&width_, sizeof(width_)));
CHECK(out->write((char*)&first_bit_, sizeof(first_bit_)));
CHECK(out->write((char*)&full_mask_, sizeof(full_mask_)));
CHECK(out->write((char*)&address_mask_, sizeof(address_mask_)));
//CHECK(out->write((char*)data_, cells_ * sizeof(T)));
const uint64_t jump = 524288032ul; //(uint64_t)pow(2, 29);
if((width_ == 1) || cells_ < jump)
CHECK(out->write((char*)data_, cells_ * sizeof(T)));
else {
uint64_t idx(0);
while(idx + jump < cells_) {
CHECK(out->write((char*)&data_[idx], jump * sizeof(T)));
idx += jump;
}
CHECK(out->write((char*)&data_[idx], (cells_ - idx) * sizeof(T)));
}
return true;
}
protected:
bool loadHeader(FileHandler* fin) {
CHECK(fin != NULL);
CHECK(fin->read((char*)&cells_, sizeof(cells_)));
CHECK(fin->read((char*)&cell_width_, sizeof(cell_width_)));
CHECK(cell_width_ == sizeof(T) << 3); // make sure correct underlying data type
CHECK(fin->read((char*)&log_cell_width_, sizeof(log_cell_width_)));
CHECK(fin->read((char*)&addresses_, sizeof(addresses_)));
CHECK(fin->read((char*)&width_, sizeof(width_)));
CHECK(fin->read((char*)&first_bit_, sizeof(first_bit_)));
CHECK(fin->read((char*)&full_mask_, sizeof(full_mask_)));
CHECK(fin->read((char*)&address_mask_, sizeof(address_mask_)));
return true;
}
bool loadData(FileHandler* fin) {
// instantiate underlying array
data_ = new T[cells_];
CHECK(data_ != NULL);
CHECK(fin->read((char*)data_, cells_ * sizeof(T)));
//CHECK(fin->read((char*)&data_[0], ceil(float(cells_) / 2.0) * sizeof(T)));
//CHECK(fin->read((char*)&data_[cells_ / 2], (cells_ / 2) * sizeof(T)));
return true;
}
uint64_t cells_; // number of T cells making up 'data_'
int cell_width_; // bits per cell (i.e. sizeof(T) << 3)
int log_cell_width_; // log of bits used for >> division
uint64_t addresses_; // number of addresses in the filter
int width_; // width in bits of each address
int first_bit_; // position of first bit in initial byte
T full_mask_; // all 1s
T address_mask_; // 1s in those positions that are part of address
T* data_; // the raw data as bytes
};
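
To make the addressing arithmetic concrete, here is the header comment's example worked through by hand for T = uint16_t and width = 11 (values derived from the code above, not verified against a running build):

#include <stdint.h>

// cell_width   = 16, log_cell_width = 4
// first_bit    = 16 - (11 % 16) = 5
// address_mask = 0xFFFF >> 5 = 0x07FF   (the low 11 bits)
// For address 3: data_bit = 3*11 = 33, data_cell = 33 >> 4 = 2,
// offset = (33 % 16) - 5 = -4, so the data sits left of the target slot.
uint16_t ReadAddress3(const uint16_t *data)
{
  return (data[2] >> 4) & 0x07FF;          // equivalent to Filter<uint16_t>::read(3)
}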
// Extension with bit test/setter methods added
class BitFilter : public Filter<uint8_t>
{
public:
BitFilter(uint64_t bits) : Filter<uint8_t>(bits, 1) {}
BitFilter(FileHandler* fin, bool loaddata = true)
: Filter<uint8_t>(fin, loaddata) {
if (loaddata)
CHECK(load(fin));
}
// TODO: overload operator[]
virtual bool testBit(uint64_t location) {
// test bit referenced by location
return data_[(location % addresses_) >> 3] & 1 << ((location % addresses_) % 8);
}
virtual bool setBit(uint64_t location) {
// set bit referenced by location
data_[(location % addresses_) >> 3] |= 1 << ((location % addresses_) % 8);
return true;
}
virtual bool clearBit(uint64_t location) {
// clear bit referenced by location
data_[(location % addresses_) >> 3] &= ~(1 << ((location % addresses_) % 8));
return true;
}
bool save(FileHandler* fout) {
CHECK(Filter<uint8_t>::save(fout));
std::cerr << "Saved BitFilter. Rho = " << rho() << "." << std::endl;;
return true;
}
float rho(uint64_t limit = 0) {
uint64_t ones = 0;
uint64_t range = limit > 0 ? std::min(limit,cells_) : cells_;
for (uint64_t i = 0; i < range; ++i)
for (int j = 0; j < 8; ++j)
if (data_[i] & (1 << j))
++ones;
return static_cast<float>((range << 3) - ones)/static_cast<float>(range << 3);
}
protected:
bool load(FileHandler* fin) {
std::cerr << "Loaded BitFilter. Rho = " << rho() << "." << std::endl;;
return true;
}
};
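
A short usage sketch of BitFilter: it is simply a Filter with one-bit addresses, and rho() reports the fraction of zero bits, so a value near 1.0 indicates a sparsely populated filter (sizes are illustrative):

void BitFilterExample()
{
  randlm::BitFilter filter(1 << 20);       // 2^20 addressable bits
  filter.setBit(12345);
  bool hit  = filter.testBit(12345);       // true
  bool miss = filter.testBit(54321);       // false: never set
  float zeroFraction = filter.rho();       // near 1.0 for an almost-empty filter
  (void)hit; (void)miss; (void)zeroFraction;
}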
/*
// ResizedBitFilter deals with resizing to save memory
// whereas other filters should expect locations to be within range
// this filter will need to resize (and possibly rehash) locations
@ -380,9 +391,9 @@ namespace randlm {
carry = incrementSubCell(data_bit, this->width_, &this->data_[data_cell]);
}
// last update must not have carried
if (!carry)
if (!carry)
return true;
// wrapped round so check whether need to reset to max count
// wrapped round so check whether need to reset to max count
if (!wrap_around_)
CHECK(this->write(address, this->address_mask_));
return false; // false to indicate that overflowed
@ -397,7 +408,7 @@ namespace randlm {
}
inline bool incrementSubCell(int bit, int len, T* cell) {
// increment counter consisting of bits [startbit, startbit + len - 1] rest stays unchanged
*cell = ((((*cell >> (this->cell_width_ - bit - len)) + 1)
*cell = ((((*cell >> (this->cell_width_ - bit - len)) + 1)
& (this->full_mask_ >> (this->cell_width_ - len))) << (this->cell_width_ - bit - len))
| (*cell & ~(((this->full_mask_ >> (this->cell_width_ - len)) << (this->cell_width_ - bit - len))));
// indicate overflow as true

View File

@ -10,58 +10,66 @@ using namespace Moses;
typedef uint64_t P; // largest input range is 2^64
template <typename T>
class HashBase {
protected:
T m_; // range of hash output
count_t H_; // number of hash functions to instantiate
virtual void initSeeds()=0;
virtual void freeSeeds()=0;
public:
HashBase(float m, count_t H=1):m_((T)m), H_(H) {
//cerr << "range = (0..." << m_ << "]" << endl;
}
HashBase(FileHandler* fin) {
load(fin);
}
virtual ~HashBase(){}
virtual T hash(const char*s, count_t h)=0; // string hashing
virtual T hash(const wordID_t* id, const int len, count_t h)=0; // vocab mapped hashing
count_t size() { return H_;}
virtual void save(FileHandler* fout) {
CHECK(fout != 0);
fout->write((char*)&m_, sizeof(m_));
fout->write((char*)&H_, sizeof(H_));
}
virtual void load(FileHandler* fin) {
CHECK(fin != 0);
fin->read((char*)&m_, sizeof(m_));
fin->read((char*)&H_, sizeof(H_));
}
class HashBase
{
protected:
T m_; // range of hash output
count_t H_; // number of hash functions to instantiate
virtual void initSeeds()=0;
virtual void freeSeeds()=0;
public:
HashBase(float m, count_t H=1):m_((T)m), H_(H) {
//cerr << "range = (0..." << m_ << "]" << endl;
}
HashBase(FileHandler* fin) {
load(fin);
}
virtual ~HashBase() {}
virtual T hash(const char*s, count_t h)=0; // string hashing
virtual T hash(const wordID_t* id, const int len, count_t h)=0; // vocab mapped hashing
count_t size() {
return H_;
}
virtual void save(FileHandler* fout) {
CHECK(fout != 0);
fout->write((char*)&m_, sizeof(m_));
fout->write((char*)&H_, sizeof(H_));
}
virtual void load(FileHandler* fin) {
CHECK(fin != 0);
fin->read((char*)&m_, sizeof(m_));
fin->read((char*)&H_, sizeof(H_));
}
};
template <typename T>
class UnivHash_linear: public HashBase<T> {
public:
UnivHash_linear(float m, count_t H, P pr):
HashBase<T>(m, H), pr_(pr) {
//CHECK(isPrime(pr_));
initSeeds();
}
UnivHash_linear(FileHandler* fin):
HashBase<T>(fin) {
load(fin);
}
~UnivHash_linear() {freeSeeds();}
T hash(const char* s, count_t h){return 0;} //not implemented
T hash(const wordID_t* id, const int len, count_t h);
T hash(const wordID_t id, const count_t pos,
const T prevValue, count_t h);
void save(FileHandler* fout);
void load(FileHandler* fin);
private:
T** a_, **b_;
P pr_;
void initSeeds();
void freeSeeds();
class UnivHash_linear: public HashBase<T>
{
public:
UnivHash_linear(float m, count_t H, P pr):
HashBase<T>(m, H), pr_(pr) {
//CHECK(isPrime(pr_));
initSeeds();
}
UnivHash_linear(FileHandler* fin):
HashBase<T>(fin) {
load(fin);
}
~UnivHash_linear() {
freeSeeds();
}
T hash(const char* s, count_t h) {
return 0; //not implemented
}
T hash(const wordID_t* id, const int len, count_t h);
T hash(const wordID_t id, const count_t pos,
const T prevValue, count_t h);
void save(FileHandler* fout);
void load(FileHandler* fin);
private:
T** a_, **b_;
P pr_;
void initSeeds();
void freeSeeds();
};
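UnivHash_linear above hashes an n-gram by accumulating a per-position linear term a[h][pos] * id + b[h][pos] over its word IDs and reducing into the output range m_. A toy standalone version of that scheme; the prime, seeds, and table size below are invented for illustration:

#include <cstdint>
#include <iostream>

int main() {
  const uint64_t p = 2147483647ULL;       // prime larger than any word ID
  const uint64_t a[] = {911, 337, 733};   // per-position multipliers in [1, p)
  const uint64_t b[] = {11, 23, 5};       // per-position offsets in [0, p)
  const uint64_t m = 1ULL << 20;          // output range (number of cells)
  const uint32_t ngram[] = {42, 7, 1001}; // vocab-mapped word IDs
  uint64_t value = 0;
  for (int pos = 0; pos < 3; ++pos)       // accumulate position-wise terms
    value += (a[pos] * ngram[pos] + b[pos]) % p;
  std::cout << (value % m) << std::endl;  // final address in [0, m)
}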
/* UnivHash_noPrimes:
@ -71,74 +79,89 @@ class UnivHash_linear: public HashBase<T> {
* # of hash functions = 2^(l-1)
*/
template <typename T>
class UnivHash_noPrimes: public HashBase<T> {
public:
UnivHash_noPrimes(float k, float l):
HashBase<T>(k, 100), d_(count_t((l-k))) {
if(((int)l >> 3) == sizeof(P)) p_ = (P) pow(2,l) - 1;
else p_ = (P) pow(2,l);
initSeeds();
}
UnivHash_noPrimes(FileHandler* fin):
HashBase<T>(fin) {
load(fin);
}
~UnivHash_noPrimes() {freeSeeds();}
T hash(const char* s, count_t h);
T hash(const wordID_t* id, const int len, count_t h);
T hash(const P x, count_t h);
void save(FileHandler* fout);
void load(FileHandler* fin);
private:
count_t d_; // l-k
P p_, *a_; // real-valued input range, storage
void initSeeds();
void freeSeeds() {delete[] a_;}
class UnivHash_noPrimes: public HashBase<T>
{
public:
UnivHash_noPrimes(float k, float l):
HashBase<T>(k, 100), d_(count_t((l-k))) {
if(((int)l >> 3) == sizeof(P)) p_ = (P) pow(2,l) - 1;
else p_ = (P) pow(2,l);
initSeeds();
}
UnivHash_noPrimes(FileHandler* fin):
HashBase<T>(fin) {
load(fin);
}
~UnivHash_noPrimes() {
freeSeeds();
}
T hash(const char* s, count_t h);
T hash(const wordID_t* id, const int len, count_t h);
T hash(const P x, count_t h);
void save(FileHandler* fout);
void load(FileHandler* fin);
private:
count_t d_; // l-k
P p_, *a_; // real-valued input range, storage
void initSeeds();
void freeSeeds() {
delete[] a_;
}
};
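As the hash(const P x, ...) comment further below notes, UnivHash_noPrimes computes h_a(x) = (a*x mod 2^l) div 2^(l-k), the multiply-shift family that needs no prime modulus. With l = 64 the "mod 2^l" comes free from unsigned overflow, so the whole hash is one multiply and one shift. A sketch with an arbitrary multiplier and k:

#include <cstdint>
#include <iostream>

int main() {
  const uint64_t a = 0x9E3779B97F4A7C15ULL; // random odd multiplier
  const unsigned l = 64, k = 20;            // output range is [0, 2^k)
  const uint64_t x = 123456789ULL;
  const uint64_t h = (a * x) >> (l - k);    // (a*x mod 2^64) div 2^(64-k)
  std::cout << h << std::endl;
}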
template <typename T>
class Hash_shiftAddXOR: public HashBase<T> {
public:
Hash_shiftAddXOR(float m, count_t H=5): HashBase<T>(m,H),
l_(5), r_(2) {
initSeeds();
}
~Hash_shiftAddXOR() {freeSeeds();}
T hash(const char* s, count_t h);
T hash(const wordID_t* id, const int len, count_t h) { return 0; } // not implemented
private:
T* v_; // random seed storage
const unsigned short l_, r_; // left-shift bits, right-shift bits
void initSeeds();
void freeSeeds() {delete[] v_;}
class Hash_shiftAddXOR: public HashBase<T>
{
public:
Hash_shiftAddXOR(float m, count_t H=5): HashBase<T>(m,H),
l_(5), r_(2) {
initSeeds();
}
~Hash_shiftAddXOR() {
freeSeeds();
}
T hash(const char* s, count_t h);
T hash(const wordID_t* id, const int len, count_t h) { return 0; } // not implemented
private:
T* v_; // random seed storage
const unsigned short l_, r_; // left-shift bits, right-shift bits
void initSeeds();
void freeSeeds() {
delete[] v_;
}
};
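Hash_shiftAddXOR folds each character into a running seed with one shift-add-XOR step, using the fixed shifts l_ = 5 and r_ = 2 declared above. A self-contained version of that mixing loop (the seed value is arbitrary):

#include <cstdint>
#include <iostream>

uint64_t sax_hash(const char* s, uint64_t seed) {
  uint64_t h = seed;
  unsigned char c;
  while ((c = static_cast<unsigned char>(*s++)))
    h ^= (h << 5) + (h >> 2) + c; // shift-add-XOR mixing step
  return h;
}

int main() {
  std::cout << sax_hash("ngram", 0x12345ULL) << std::endl;
}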
template <typename T>
class UnivHash_tableXOR: public HashBase<T> {
public:
UnivHash_tableXOR(float m, count_t H=5): HashBase<T>(m, H),
table_(NULL), tblLen_(255*MAX_STR_LEN) {
initSeeds();
}
~UnivHash_tableXOR() {freeSeeds();}
T hash(const char* s, count_t h);
T hash(const wordID_t* id, const int len, count_t h) { return 0; } // not implemented
private:
T** table_; // storage for random numbers
count_t tblLen_; // length of table
void initSeeds();
void freeSeeds();
class UnivHash_tableXOR: public HashBase<T>
{
public:
UnivHash_tableXOR(float m, count_t H=5): HashBase<T>(m, H),
table_(NULL), tblLen_(255*MAX_STR_LEN) {
initSeeds();
}
~UnivHash_tableXOR() {
freeSeeds();
}
T hash(const char* s, count_t h);
T hash(const wordID_t* id, const int len, count_t h) { return 0; } // not implemented
private:
T** table_; // storage for random numbers
count_t tblLen_; // length of table
void initSeeds();
void freeSeeds();
};
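UnivHash_tableXOR is tabulation-style hashing: it XORs together precomputed random table entries selected by a running byte offset into the string, exactly as UnivHash_tableXOR::hash() does below. A standalone sketch; the table length mirrors 255 * MAX_STR_LEN, but the RNG and range are illustrative:

#include <cstdint>
#include <iostream>
#include <random>
#include <vector>

int main() {
  const uint64_t m = 1ULL << 20;  // output range; all entries stay below m
  const size_t tblLen = 255 * 64; // cf. 255 * MAX_STR_LEN above
  std::mt19937_64 rng(7);
  std::vector<uint64_t> table(tblLen);
  for (uint64_t& t : table) t = rng() % (m - 1);
  uint64_t value = 0;
  size_t idx = 0;
  for (const char* s = "ngram"; *s; ++s) {
    idx += static_cast<unsigned char>(*s); // running offset into the table
    value ^= table[idx];
  }
  std::cout << value << std::endl; // already < m because m is a power of two
}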
// ShiftAddXor
template <typename T>
void Hash_shiftAddXOR<T>::initSeeds() {
void Hash_shiftAddXOR<T>::initSeeds()
{
v_ = new T[this->H_];
for(count_t i=0; i < this->H_; i++)
v_[i] = Utils::rand<T>() + 1;
v_[i] = Utils::rand<T>() + 1;
}
template <typename T>
T Hash_shiftAddXOR<T>::hash(const char* s, count_t h=0) {
T Hash_shiftAddXOR<T>::hash(const char* s, count_t h=0)
{
T value = v_[h];
int pos(0);
unsigned char c;
@ -150,40 +173,44 @@ T Hash_shiftAddXOR<T>::hash(const char* s, count_t h=0) {
// UnivHash_tableXOR
template <typename T>
void UnivHash_tableXOR<T>::initSeeds() {
void UnivHash_tableXOR<T>::initSeeds()
{
// delete any values in table
if(table_) freeSeeds();
if(table_) freeSeeds();
// instantiate a new table
table_ = new T* [this->H_];
// fill with random values
for(count_t j=0; j < this->H_; j++) {
table_[j] = new T[tblLen_];
for(count_t i=0; i < tblLen_; i++) {
table_[j][i] = Utils::rand<T>(this->m_-1);
for(count_t i=0; i < tblLen_; i++) {
table_[j][i] = Utils::rand<T>(this->m_-1);
}
}
}
template <typename T>
void UnivHash_tableXOR<T>::freeSeeds() {
void UnivHash_tableXOR<T>::freeSeeds()
{
for(count_t j = 0; j < this->H_; j++)
delete[] table_[j];
delete[] table_;
table_ = NULL;
}
template <typename T>
T UnivHash_tableXOR<T>::hash(const char* s, count_t h = 0) {
T UnivHash_tableXOR<T>::hash(const char* s, count_t h = 0)
{
T value = 0;
count_t pos = 0, idx = 0;
unsigned char c;
while((c = *s++) && (++pos < MAX_STR_LEN))
value ^= table_[h][idx += c];
CHECK(value < this->m_);
CHECK(value < this->m_);
return value;
}
// UnivHash_noPrimes
template <typename T>
void UnivHash_noPrimes<T>::initSeeds() {
void UnivHash_noPrimes<T>::initSeeds()
{
a_ = new P[this->H_];
for(T i=0; i < this->H_; i++) {
a_[i] = Utils::rand<P>();
@ -191,14 +218,16 @@ void UnivHash_noPrimes<T>::initSeeds() {
}
}
template <typename T>
T UnivHash_noPrimes<T>::hash(const P x, count_t h=0) {
T UnivHash_noPrimes<T>::hash(const P x, count_t h=0)
{
// h_a(x) = (ax mod 2^l) div 2^(l-k)
T value = ((a_[h] * x) % p_) >> d_;
return value % this->m_;
}
template <typename T>
T UnivHash_noPrimes<T>::hash(const wordID_t* id, const int len,
count_t h=0) {
T UnivHash_noPrimes<T>::hash(const wordID_t* id, const int len,
count_t h=0)
{
T value = 0;
int pos(0);
while(pos < len) {
@ -208,39 +237,42 @@ T UnivHash_noPrimes<T>::hash(const wordID_t* id, const int len,
return value % this->m_;
}
template <typename T>
T UnivHash_noPrimes<T>::hash(const char* s, count_t h=0) {
T UnivHash_noPrimes<T>::hash(const char* s, count_t h=0)
{
T value = 0;
int pos(0);
unsigned char c;
while((c = *s++) && (++pos < MAX_STR_LEN)) {
value ^= hash((P)c, h);
value ^= hash((P)c, h);
}
return value % this->m_;
}
template <typename T>
void UnivHash_noPrimes<T>::save(FileHandler* fout) {
void UnivHash_noPrimes<T>::save(FileHandler* fout)
{
HashBase<T>::save(fout);
fout->write((char*)&p_, sizeof(p_));
fout->write((char*)&d_, sizeof(d_));
for(T i=0; i < this->H_; i++) {
for(T i=0; i < this->H_; i++) {
fout->write((char*)&a_[i], sizeof(a_[i]));
}
}
template <typename T>
void UnivHash_noPrimes<T>::load(FileHandler* fin) {
void UnivHash_noPrimes<T>::load(FileHandler* fin)
{
a_ = new P[this->H_];
// HashBase<T>::load(fin) already done in constructor
fin->read((char*)&p_, sizeof(p_));
fin->read((char*)&d_, sizeof(d_));
for(T i=0; i < this->H_; i++)
{
for(T i=0; i < this->H_; i++) {
fin->read((char*)&a_[i], sizeof(a_[i]));
}
}
//UnivHash_linear
template <typename T>
void UnivHash_linear<T>::initSeeds() {
void UnivHash_linear<T>::initSeeds()
{
a_ = new T*[this->H_];
b_ = new T*[this->H_];
for(count_t i=0; i < this->H_; i++) {
@ -253,7 +285,8 @@ void UnivHash_linear<T>::initSeeds() {
}
}
template <typename T>
void UnivHash_linear<T>::freeSeeds() {
void UnivHash_linear<T>::freeSeeds()
{
for(count_t i=0; i < this->H_; i++) {
delete[] a_[i];
delete[] b_[i];
@ -263,8 +296,9 @@ void UnivHash_linear<T>::freeSeeds() {
a_ = b_ = NULL;
}
template <typename T>
inline T UnivHash_linear<T>::hash(const wordID_t* id, const int len,
count_t h=0) {
inline T UnivHash_linear<T>::hash(const wordID_t* id, const int len,
count_t h=0)
{
CHECK(h < this->H_);
T value = 0;
int pos(0);
@ -276,19 +310,21 @@ inline T UnivHash_linear<T>::hash(const wordID_t* id, const int len,
}
template <typename T>
inline T UnivHash_linear<T>::hash(const wordID_t id, const count_t pos,
const T prevValue, count_t h=0) {
const T prevValue, count_t h=0)
{
CHECK(h < this->H_);
T value = prevValue + ((a_[h][pos] * id) + b_[h][pos]); // % pr_;
return value % this->m_;
}
template <typename T>
void UnivHash_linear<T>::save(FileHandler* fout) {
void UnivHash_linear<T>::save(FileHandler* fout)
{
// int bytes = sizeof(a_[0][0]);
HashBase<T>::save(fout);
fout->write((char*)&pr_, sizeof(pr_));
for(count_t i=0; i < this->H_; i++) {
for(count_t j=0; j < MAX_NGRAM_ORDER; j++) {
fout->write((char*)&a_[i][j], sizeof(a_[i][j]));
fout->write((char*)&a_[i][j], sizeof(a_[i][j]));
fout->write((char*)&b_[i][j], sizeof(b_[i][j]));
//cout << "a[" << i << "][" << j << "]=" << a_[i][j] << endl;
//cout << "b[" << i << "][" << j << "]=" << b_[i][j] << endl;
@ -296,7 +332,8 @@ void UnivHash_linear<T>::save(FileHandler* fout) {
}
}
template <typename T>
void UnivHash_linear<T>::load(FileHandler* fin) {
void UnivHash_linear<T>::load(FileHandler* fin)
{
// HashBase<T>::load(fin) already done in constructor
fin->read((char*)&pr_, sizeof(pr_));
a_ = new T*[this->H_];
@ -305,8 +342,8 @@ void UnivHash_linear<T>::load(FileHandler* fin) {
a_[i] = new T[MAX_NGRAM_ORDER];
b_[i] = new T[MAX_NGRAM_ORDER];
for(count_t j=0; j < MAX_NGRAM_ORDER; j++) {
fin->read((char*)&a_[i][j], sizeof(a_[i][j]));
fin->read((char*)&b_[i][j], sizeof(b_[i][j]));
fin->read((char*)&a_[i][j], sizeof(a_[i][j]));
fin->read((char*)&b_[i][j], sizeof(b_[i][j]));
//cout << "a[" << i << "][" << j << "]=" << a_[i][j] << endl;
//cout << "b[" << i << "][" << j << "]=" << b_[i][j] << endl;
}

View File

@ -16,27 +16,28 @@ using randlm::Cache;
const bool strict_checks_ = false;
template<typename T>
class OnlineRLM: public PerfectHash<T> {
class OnlineRLM: public PerfectHash<T>
{
public:
OnlineRLM(uint16_t MBs, int width, int bucketRange, count_t order,
Vocab* v, float qBase = 8): PerfectHash<T>(MBs, width, bucketRange, qBase),
OnlineRLM(uint16_t MBs, int width, int bucketRange, count_t order,
Vocab* v, float qBase = 8): PerfectHash<T>(MBs, width, bucketRange, qBase),
vocab_(v), bAdapting_(false), order_(order), corpusSize_(0), alpha_(0) {
CHECK(vocab_ != 0);
//instantiate quantizer class here
cache_ = new Cache<float>(8888.8888, 9999.9999); // unknown_value, null_value
alpha_ = new float[order_ + 1];
for(count_t i = 0; i <= order_; ++i)
for(count_t i = 0; i <= order_; ++i)
alpha_[i] = i * log10(0.4);
cerr << "Initialzing auxillary bit filters...\n";
bPrefix_ = new BitFilter(this->cells_);
bHit_ = new BitFilter(this->cells_);
}
OnlineRLM(FileHandler* fin, count_t order):
OnlineRLM(FileHandler* fin, count_t order):
PerfectHash<T>(fin), bAdapting_(true), order_(order), corpusSize_(0) {
load(fin);
cache_ = new Cache<float>(8888.8888, 9999.9999); // unknown_value, null_value
alpha_ = new float[order_ + 1];
for(count_t i = 0; i <= order_; ++i)
for(count_t i = 0; i <= order_; ++i)
alpha_[i] = i * log10(0.4);
}
~OnlineRLM() {
@ -52,14 +53,18 @@ public:
bool insert(const std::vector<string>& ngram, const int value);
bool update(const std::vector<string>& ngram, const int value);
int query(const wordID_t* IDs, const int len);
int sbsqQuery(const std::vector<string>& ngram, int* len,
bool bStrict = false);
int sbsqQuery(const wordID_t* IDs, const int len, int* codes,
bool bStrict = false);
int sbsqQuery(const std::vector<string>& ngram, int* len,
bool bStrict = false);
int sbsqQuery(const wordID_t* IDs, const int len, int* codes,
bool bStrict = false);
void remove(const std::vector<string>& ngram);
count_t heurDelete(count_t num2del, count_t order = 5);
uint64_t corpusSize() {return corpusSize_;}
void corpusSize(uint64_t c) {corpusSize_ = c;}
uint64_t corpusSize() {
return corpusSize_;
}
void corpusSize(uint64_t c) {
corpusSize_ = c;
}
void clearCache() {
if(cache_) cache_->clear();
}
@ -77,7 +82,7 @@ protected:
void markQueried(hpdEntry_t& value);
bool markPrefix(const wordID_t* IDs, const int len, bool bSet);
private:
const void* getContext(const wordID_t* ngram, int len);
const void* getContext(const wordID_t* ngram, int len);
const bool bAdapting_; // used to signal adaptation of model
const count_t order_; // LM order
uint64_t corpusSize_; // total training corpus size
@ -87,46 +92,48 @@ private:
BitFilter* bHit_;
};
template<typename T>
bool OnlineRLM<T>::insert(const std::vector<string>& ngram, const int value) {
bool OnlineRLM<T>::insert(const std::vector<string>& ngram, const int value)
{
int len = ngram.size();
wordID_t wrdIDs[len];
uint64_t index(this->cells_ + 1);
for(int i = 0; i < len; ++i)
for(int i = 0; i < len; ++i)
wrdIDs[i] = vocab_->GetWordID(ngram[i]);
index = PerfectHash<T>::insert(wrdIDs, len, value);
if(value > 1 && len < order_)
markPrefix(wrdIDs, ngram.size(), true); // mark context
// keep track of total items from training data minus "<s>"
if(ngram.size() == 1 && (!bAdapting_)) // hack to not change corpusSize when adapting
if(ngram.size() == 1 && (!bAdapting_)) // hack to not change corpusSize when adapting
corpusSize_ += (wrdIDs[0] != vocab_->GetBOSWordID()) ? value : 0;
if(bAdapting_ && (index < this->cells_)) // mark to keep while adapting
if(bAdapting_ && (index < this->cells_)) // mark to keep while adapting
markQueried(index);
return true;
}
template<typename T>
bool OnlineRLM<T>::update(const std::vector<string>& ngram, const int value) {
bool OnlineRLM<T>::update(const std::vector<string>& ngram, const int value)
{
int len = ngram.size();
wordID_t wrdIDs[len];
uint64_t index(this->cells_ + 1);
hpdEntry_t hpdItr;
vocab_->MakeOpen();
for(int i = 0; i < len; ++i)
for(int i = 0; i < len; ++i)
wrdIDs[i] = vocab_->GetWordID(ngram[i]);
// if updating, minimize false positives by pre-checking if context already in model
bool bIncluded(true);
// if updating, minimize false positives by pre-checking if context already in model
bool bIncluded(true);
if(value > 1 && len < (int)order_)
bIncluded = markPrefix(wrdIDs, ngram.size(), true); // mark context
if(bIncluded) { // if context found
if(bIncluded) { // if context found
bIncluded = PerfectHash<T>::update2(wrdIDs, len, value, hpdItr, index);
if(index < this->cells_) {
markQueried(index);
}
else if(hpdItr != this->dict_.end()) markQueried(hpdItr);
} else if(hpdItr != this->dict_.end()) markQueried(hpdItr);
}
return bIncluded;
}
template<typename T>
int OnlineRLM<T>::query(const wordID_t* IDs, int len) {
int OnlineRLM<T>::query(const wordID_t* IDs, int len)
{
uint64_t filterIdx = 0;
hpdEntry_t hpdItr;
int value(0);
@ -135,8 +142,7 @@ int OnlineRLM<T>::query(const wordID_t* IDs, int len) {
if(hpdItr != this->dict_.end()) {
//markQueried(hpdItr); // mark this event as "hit"
value -= ((value & this->hitMask_) != 0) ? this->hitMask_ : 0; // check for previous hit marks
}
else {
} else {
CHECK(filterIdx < this->cells_);
//markQueried(filterIdx);
}
@ -144,15 +150,16 @@ int OnlineRLM<T>::query(const wordID_t* IDs, int len) {
return value > 0 ? value : 0;
}
template<typename T>
bool OnlineRLM<T>::markPrefix(const wordID_t* IDs, const int len, bool bSet) {
if(len <= 1) return true; // only do this for ngrams with context
static Cache<int> pfCache(-1, -1); // local prefix cache
bool OnlineRLM<T>::markPrefix(const wordID_t* IDs, const int len, bool bSet)
{
if(len <= 1) return true; // only do this for ngrams with context
static Cache<int> pfCache(-1, -1); // local prefix cache
int code(0);
if(!pfCache.checkCacheNgram(IDs, len - 1, &code, NULL)) {
hpdEntry_t hpdItr;
if(!pfCache.checkCacheNgram(IDs, len - 1, &code, NULL)) {
hpdEntry_t hpdItr;
uint64_t filterIndex(0);
code = PerfectHash<T>::query(IDs, len - 1, hpdItr, filterIndex); // hash IDs[0..len-1]
if(code == -1) { // encountered false positive in pipeline
if(code == -1) { // encountered false positive in pipeline
cerr << "WARNING: markPrefix(). The O-RLM is *not* well-formed.\n";
// add all prefixes or return false;
return false;
@ -161,10 +168,9 @@ bool OnlineRLM<T>::markPrefix(const wordID_t* IDs, const int len, bool bSet) {
CHECK(hpdItr == this->dict_.end());
if(bSet) bPrefix_->setBit(filterIndex); // mark index
else bPrefix_->clearBit(filterIndex); // unset index
}
else {
} else {
CHECK(filterIndex == this->cells_ + 1);
//how to handle hpd prefixes?
//how to handle hpd prefixes?
}
if(pfCache.nodes() > 10000) pfCache.clear();
pfCache.setCacheNgram(IDs, len - 1, code, NULL);
@ -172,36 +178,40 @@ bool OnlineRLM<T>::markPrefix(const wordID_t* IDs, const int len, bool bSet) {
return true;
}
template<typename T>
void OnlineRLM<T>::markQueried(const uint64_t& index) {
void OnlineRLM<T>::markQueried(const uint64_t& index)
{
bHit_->setBit(index);
//cerr << "filter[" << index << "] = " << this->filter_->read(index) << endl;
}
template<typename T>
void OnlineRLM<T>::markQueried(hpdEntry_t& value) {
// set high bit of counter to indicate "hit" status
void OnlineRLM<T>::markQueried(hpdEntry_t& value)
{
// set high bit of counter to indicate "hit" status
value->second |= this->hitMask_;
}
template<typename T>
void OnlineRLM<T>::remove(const std::vector<string>& ngram) {
void OnlineRLM<T>::remove(const std::vector<string>& ngram)
{
wordID_t IDs[ngram.size()];
for(count_t i = 0; i < ngram.size(); ++i)
for(count_t i = 0; i < ngram.size(); ++i)
IDs[i] = vocab_->GetWordID(ngram[i]);
PerfectHash<T>::remove(IDs, ngram.size());
}
template<typename T>
count_t OnlineRLM<T>::heurDelete(count_t num2del, count_t order) {
count_t OnlineRLM<T>::heurDelete(count_t num2del, count_t order)
{
count_t deleted = 0;
cout << "Deleting " << num2del << " of order "<< order << endl;
// delete from filter first
int full = *std::max_element(this->idxTracker_, this->idxTracker_
+ this->totBuckets_);
int full = *std::max_element(this->idxTracker_, this->idxTracker_
+ this->totBuckets_);
for(; full > 0; --full) // delete from fullest buckets first
for(int bk = 0; bk < this->totBuckets_; ++bk) {
for(int bk = 0; bk < this->totBuckets_; ++bk) {
if(deleted >= num2del) break;
if(this->idxTracker_[bk] == full) { // if full
uint64_t first = bk * this->bucketRange_,
last = first + this->bucketRange_;
for(uint64_t row = first; row < last; ++row) { // check each row
last = first + this->bucketRange_;
for(uint64_t row = first; row < last; ++row) { // check each row
if(!(bHit_->testBit(row) || bPrefix_->testBit(row) )) {
if(this->filter_->read(row) != 0) {
PerfectHash<T>::remove(row); // remove from filter
@ -220,15 +230,17 @@ count_t OnlineRLM<T>::heurDelete(count_t num2del, count_t order) {
}
template<typename T>
int OnlineRLM<T>::sbsqQuery(const std::vector<string>& ngram, int* codes,
bool bStrict) {
bool bStrict)
{
wordID_t IDs[ngram.size()];
for(count_t i = 0; i < ngram.size(); ++i)
for(count_t i = 0; i < ngram.size(); ++i)
IDs[i] = vocab_->GetWordID(ngram[i]);
return sbsqQuery(IDs, ngram.size(), codes, bStrict);
}
template<typename T>
int OnlineRLM<T>::sbsqQuery(const wordID_t* IDs, const int len, int* codes,
bool bStrict) {
int OnlineRLM<T>::sbsqQuery(const wordID_t* IDs, const int len, int* codes,
bool bStrict)
{
uint64_t filterIdx = 0;
int val(0), fnd(0);
hpdEntry_t hpdItr;
@ -240,14 +252,13 @@ int OnlineRLM<T>::sbsqQuery(const wordID_t* IDs, const int len, int* codes,
if(hpdItr != this->dict_.end()) {
val -= ((val & this->hitMask_) != 0) ? this->hitMask_ : 0; // account for previous hit marks
}
}
else if(bStrict) {
break;
} else if(bStrict) {
break;
}
// add to value array
codes[i] = val > 0 ? val : 0;
}
while(bStrict && (fnd > 1)) { // do checks the other way
while(bStrict && (fnd > 1)) { // do checks the other way
val = PerfectHash<T>::query(&IDs[len - fnd], fnd - 1, hpdItr, filterIdx);
if(val != -1) break; // if anything found
else --fnd; // else decrement found
@ -255,8 +266,9 @@ int OnlineRLM<T>::sbsqQuery(const wordID_t* IDs, const int len, int* codes,
return fnd;
}
template<typename T>
float OnlineRLM<T>::getProb(const wordID_t* ngram, int len,
const void** state) {
float OnlineRLM<T>::getProb(const wordID_t* ngram, int len,
const void** state)
{
static const float oovprob = log10(1.0 / (static_cast<float>(vocab_->Size()) - 1));
float logprob(0);
const void* context = (state) ? *state : 0;
@ -264,61 +276,61 @@ float OnlineRLM<T>::getProb(const wordID_t* ngram, int len,
if(!cache_->checkCacheNgram(ngram, len, &logprob, &context)) {
// get full prob and put in cache
int num_fnd(0), den_val(0);
int in[len]; // in[] keeps counts of increasing order numerator
int in[len]; // in[] keeps counts of increasing order numerator
for(int i = 0; i < len; ++i) in[i] = 0;
for(int i = len - 1; i >= 0; --i) {
if(ngram[i] == vocab_->GetkOOVWordID()) break; // no need to query if OOV
in[i] = query(&ngram[i], len - i);
if(in[i] > 0) {
num_fnd = len - i;
}
else if(strict_checks_) break;
} else if(strict_checks_) break;
}
while(num_fnd > 1) { // get lower order count
//get sub-context of size one less than length found (excluding target)
//get sub-context of size one less than length found (excluding target)
if(((den_val = query(&ngram[len - num_fnd], num_fnd - 1)) > 0) &&
(den_val >= in[len - num_fnd]) && (in[len - num_fnd] > 0)) {
break;
}
else --num_fnd; // else backoff to lower ngram order
} else --num_fnd; // else backoff to lower ngram order
}
if(num_fnd == 1 && (in[len - 1] < 1)) // sanity check for unigrams
if(num_fnd == 1 && (in[len - 1] < 1)) // sanity check for unigrams
num_fnd = 0;
switch(num_fnd) { // find prob (need to refactor into precomputation)
case 0: // OOV
logprob = alpha_[len] + oovprob;
break;
case 1: // unigram found only
CHECK(in[len - 1] > 0);
logprob = alpha_[len - 1] + (corpusSize_ > 0 ?
log10(static_cast<float>(in[len - 1]) / static_cast<float>(corpusSize_)) : 0);
//logprob = alpha_[len - 1] +
//log10(static_cast<float>(in[len - 1]) / static_cast<float>(corpusSize_));
break;
default:
CHECK(den_val > 0);
//if(subgram == in[len - found]) ++subgram; // avoid returning zero probs????
logprob = alpha_[len - num_fnd] +
log10(static_cast<float>(in[len - num_fnd]) / static_cast<float>(den_val));
break;
case 0: // OOV
logprob = alpha_[len] + oovprob;
break;
case 1: // unigram found only
CHECK(in[len - 1] > 0);
logprob = alpha_[len - 1] + (corpusSize_ > 0 ?
log10(static_cast<float>(in[len - 1]) / static_cast<float>(corpusSize_)) : 0);
//logprob = alpha_[len - 1] +
//log10(static_cast<float>(in[len - 1]) / static_cast<float>(corpusSize_));
break;
default:
CHECK(den_val > 0);
//if(subgram == in[len - found]) ++subgram; // avoid returning zero probs????
logprob = alpha_[len - num_fnd] +
log10(static_cast<float>(in[len - num_fnd]) / static_cast<float>(den_val));
break;
}
// need unique context
context = getContext(&ngram[len - num_fnd], num_fnd);
// put whatever was found in cache
cache_->setCacheNgram(ngram, len, logprob, context);
} // end checkCache
return logprob;
return logprob;
}
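getProb() above is stupid backoff: fall back to shorter n-grams with a fixed penalty of log10(0.4) per order dropped (the precomputed alpha_ table), then score the longest order found with a relative frequency. A worked standalone example with made-up counts:

#include <cmath>
#include <iostream>

int main() {
  // A 3-gram query missed, its 2-gram suffix was found with count 4 and
  // its 1-word context with count 20, so we backed off exactly once.
  const int backoffs = 1;
  const double num = 4.0, den = 20.0; // count(w2 w3) / count(w2)
  const double logprob = backoffs * std::log10(0.4) + std::log10(num / den);
  std::cout << logprob << std::endl;  // -0.398 + -0.699 = about -1.097
}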
template<typename T>
const void* OnlineRLM<T>::getContext(const wordID_t* ngram, int len) {
const void* OnlineRLM<T>::getContext(const wordID_t* ngram, int len)
{
int dummy(0);
float* addresses[len]; // only interested in addresses of cache
CHECK(cache_->getCache2(ngram, len, &addresses[0], &dummy) == len);
// return address of cache node
return (const void*)addresses[0];
return (const void*)addresses[0];
}
template<typename T>
void OnlineRLM<T>::randDelete(int num2del) {
void OnlineRLM<T>::randDelete(int num2del)
{
int deleted = 0;
for(uint64_t i = 0; i < this->cells_; i++) {
if(this->filter_->read(i) != 0) {
@ -329,18 +341,20 @@ void OnlineRLM<T>::randDelete(int num2del) {
}
}
template<typename T>
int OnlineRLM<T>::countHits() {
int OnlineRLM<T>::countHits()
{
int hit(0);
for(uint64_t i = 0; i < this->cells_; ++i)
if(bHit_->testBit(i)) ++hit;
iterate(this->dict_, itr)
if((itr->second & this->hitMask_) != 0)
++hit;
if((itr->second & this->hitMask_) != 0)
++hit;
cerr << "Hit count = " << hit << endl;
return hit;
}
template<typename T>
int OnlineRLM<T>::countPrefixes() {
int OnlineRLM<T>::countPrefixes()
{
int pfx(0);
for(uint64_t i = 0; i < this->cells_; ++i)
if(bPrefix_->testBit(i)) ++pfx;
@ -349,22 +363,24 @@ int OnlineRLM<T>::countPrefixes() {
return pfx;
}
template<typename T>
int OnlineRLM<T>::cleanUpHPD() {
int OnlineRLM<T>::cleanUpHPD()
{
cerr << "HPD size before = " << this->dict_.size() << endl;
std::vector<string> vDel, vtmp;
iterate(this->dict_, itr) {
if(((itr->second & this->hitMask_) == 0) && // if not hit during testing
(Utils::splitToStr(itr->first, vtmp, "¬") >= 3)) { // and higher order ngram
(Utils::splitToStr(itr->first, vtmp, "¬") >= 3)) { // and higher order ngram
vDel.push_back(itr->first);
}
}
iterate(vDel, vitr)
this->dict_.erase(*vitr);
iterate(vDel, vitr)
this->dict_.erase(*vitr);
cerr << "HPD size after = " << this->dict_.size() << endl;
return vDel.size();
}
template<typename T>
void OnlineRLM<T>::clearMarkings() {
void OnlineRLM<T>::clearMarkings()
{
cerr << "clearing all event hits\n";
bHit_->reset();
count_t* value(0);
@ -374,7 +390,8 @@ void OnlineRLM<T>::clearMarkings() {
}
}
template<typename T>
void OnlineRLM<T>::save(FileHandler* fout) {
void OnlineRLM<T>::save(FileHandler* fout)
{
cerr << "Saving ORLM...\n";
// save vocab
vocab_->Save(fout);
@ -387,7 +404,8 @@ void OnlineRLM<T>::save(FileHandler* fout) {
cerr << "Finished saving ORLM." << endl;
}
template<typename T>
void OnlineRLM<T>::load(FileHandler* fin) {
void OnlineRLM<T>::load(FileHandler* fin)
{
cerr << "Loading ORLM...\n";
// load vocab first
vocab_ = new Vocab(fin);
@ -402,12 +420,13 @@ void OnlineRLM<T>::load(FileHandler* fin) {
PerfectHash<T>::load(fin);
}
template<typename T>
void OnlineRLM<T>::removeNonMarked() {
void OnlineRLM<T>::removeNonMarked()
{
cerr << "deleting all unused events\n";
int deleted(0);
for(uint64_t i = 0; i < this->cells_; ++i) {
if(!(bHit_->testBit(i) || bPrefix_->testBit(i))
&& (this->filter_->read(i) != 0)) {
if(!(bHit_->testBit(i) || bPrefix_->testBit(i))
&& (this->filter_->read(i) != 0)) {
PerfectHash<T>::remove(i);
++deleted;
}
@ -429,36 +448,36 @@ float OnlineRLM<T>::getProb2(const wordID_t* ngram, int len, const void** state)
// constrain cache queries using model assumptions
int denom_len = cache_->getCache(ngram, len - 1, &denom_codes[0], &denom_found);
cerr << "denom_len = " << denom_len << endl;
int num_len = cache_->getCache(&ngram[len - denom_len - 1], denom_len + 1,
int num_len = cache_->getCache(&ngram[len - denom_len - 1], denom_len + 1,
&num_codes[0], &found);
cerr << "num_len= " << num_len << endl;
// keep reducing ngram size until both denominator and numerator are found
// allowed to leave kUnknownCode in cache because we check for this.
found = num_len; // guaranteed to be <= denom_len + 1
// still check for OOV
for (int i = len - found; i < len; ++i)
if (ngram[i] == Vocab::kOOVWordID) {
for (int i = len - found; i < len; ++i)
if (ngram[i] == Vocab::kOOVWordID) {
found = len - i - 1;
}
// check for relative estimator
while(found > 1) {
if(*denom_codes[found-1] == cache_unk_ &&
((*denom_codes[found-1] = query(&ngram[len-found], found-1)) == 0)) {
if(*denom_codes[found-1] == cache_unk_ &&
((*denom_codes[found-1] = query(&ngram[len-found], found-1)) == 0)) {
//!struct_->query(&ngram[len-*found], *found-1, kMainEventIdx, denom_codes[*found-1])) {
*num_codes[found] = cache_unk_;
} else {
if(*num_codes[found] != cache_unk_ ||
((*num_codes[found] = query(&ngram[len-found], found)) <= *denom_codes[found-1]))
// struct_->query(&ngram[len-*found], *found, kMainEventIdx,
// struct_->query(&ngram[len-*found], *found, kMainEventIdx,
// num_codes[*found], *denom_codes[*found-1]))
break;
}
}
--found;
}
// didn't find bigram numerator or unigram denominator
// didn't find bigram numerator or unigram denominator
if (found == 1)
found = *num_codes[1] != cache_unk_
|| ((*num_codes[1] = query(&ngram[len - 1], 1)) != 0);
found = *num_codes[1] != cache_unk_
|| ((*num_codes[1] = query(&ngram[len - 1], 1)) != 0);
//struct_->query(&ngram[len - 1], 1, kMainEventIdx, num_codes[1]);
// ....
// return estimate applying correct backoff score (precomputed)
@ -469,20 +488,20 @@ float OnlineRLM<T>::getProb2(const wordID_t* ngram, int len, const void** state)
//log_prob = stupid_backoff_log10_[len] + uniform_log10prob_;
break;
case 1: // unigram over whole corpus
log_prob = alpha_[len - 1] +
log_prob = alpha_[len - 1] +
log10(static_cast<float>(*num_codes[1]) / static_cast<float>(corpusSize_));
//log_prob = log_quantiser_->getLog10Value(*num_codes[1]) - corpus_size_log10_
//log_prob = log_quantiser_->getLog10Value(*num_codes[1]) - corpus_size_log10_
// + stupid_backoff_log10_[len - 1]; // precomputed
break;
default: // otherwise use both statistics and (possibly zero) backoff weight
log_prob = alpha_[len - found] +
log_prob = alpha_[len - found] +
log10(static_cast<float>(*num_codes[found]) / static_cast<float>(*denom_codes[found-1]));
//log_prob = log_quantiser_->getLog10Value(*num_codes[*found ])
// - log_quantiser_->getLog10Value(*denom_codes[*found - 1])
//log_prob = log_quantiser_->getLog10Value(*num_codes[*found ])
// - log_quantiser_->getLog10Value(*denom_codes[*found - 1])
// + stupid_backoff_log10_[len - *found];
}
context_state = (const void*)num_codes[found == len ? found - 1 : found];
//probCache_->store(len, log_prob, context_state);
//probCache_->store(len, log_prob, context_state);
if (state)
*state = context_state;
return log_prob;

View File

@ -1,10 +1,11 @@
#include "params.h"
namespace Moses {
namespace Moses
{
// parameter constants
const std::string Parameters::kNotSetValue = "__NOT_SET__";
const int Parameters::kBoolValue = 0;
const int Parameters::kBoolValue = 0;
const int Parameters::kIntValue = 1;
const int Parameters::kFloatValue = 2;
const int Parameters::kStringValue = 3;
@ -13,26 +14,30 @@ const int Parameters::kUndefinedValue = -1;
const std::string Parameters::kTrueValue = "1";
const std::string Parameters::kFalseValue = "0";
Parameters::Parameters(const ParamDefs * paramdefs, const count_t paramNum) {
Parameters::Parameters(const ParamDefs * paramdefs, const count_t paramNum)
{
initialize(paramdefs, paramNum);
}
Parameters::Parameters(int argc, char ** argv, const ParamDefs * paramdefs,
const count_t paramNum) {
Parameters::Parameters(int argc, char ** argv, const ParamDefs * paramdefs,
const count_t paramNum)
{
initialize(paramdefs, paramNum);
loadParams(argc, argv);
}
void Parameters::initialize(const ParamDefs * paramdefs, const count_t paramNum) {
void Parameters::initialize(const ParamDefs * paramdefs, const count_t paramNum)
{
for( count_t i = 0; i < paramNum; i++ ) {
params_[paramdefs[i].name] = paramdefs[i]; // assign name
}
cerr << "Default parameter values:\n";
iterate(params_, itr)
cerr << "\t" << itr->first << " --> " << itr->second.value << endl;
iterate(params_, itr)
cerr << "\t" << itr->first << " --> " << itr->second.value << endl;
}
bool Parameters::loadParams(int argc, char ** argv) {
bool Parameters::loadParams(int argc, char ** argv)
{
// load params from commandline args
//if( argc < 3 ) {
// fprintf(stderr, "ERROR: No parameters. Use \"-config\" or \"-f\" to specify configuration file.\n");
@ -66,7 +71,7 @@ bool Parameters::loadParams(int argc, char ** argv) {
std::string val = argv[i+1];
Utils::trim(val);
if( param == "config" )
load_from_file = true;
load_from_file = true;
if(!setParamValue(param, val)) {
std::cerr << "Invalid Param name->value " << param << "->" << val << std::endl;
return false;
@ -80,35 +85,40 @@ bool Parameters::loadParams(int argc, char ** argv) {
return success;
}
std::string Parameters::normaliseParamName(const std::string & name) {
std::string Parameters::normaliseParamName(const std::string & name)
{
// Map valid abbreviations to long names. Retain other names.
if( params_.find(name) == params_.end() )
iterate(params_, i)
if( i->second.abbrev == name )
return i->first;
iterate(params_, i)
if( i->second.abbrev == name )
return i->first;
return name;
}
int Parameters::getValueType(const std::string& name) {
int Parameters::getValueType(const std::string& name)
{
if(params_.find(name) != params_.end())
return params_[name].type;
return Parameters::kUndefinedValue;
}
bool Parameters::isValidParamName(const std::string & name) {
return params_.find(name) != params_.end();
bool Parameters::isValidParamName(const std::string & name)
{
return params_.find(name) != params_.end();
}
bool Parameters::setParamValue(const std::string& name, const std::string& val) {
// TODO: Add basic type checking with verifyValueType()
bool set = isValidParamName(name);
if(set) {
params_[name].value = val;
bool Parameters::setParamValue(const std::string& name, const std::string& val)
{
// TODO: Add basic type checking with verifyValueType()
bool set = isValidParamName(name);
if(set) {
params_[name].value = val;
std::cerr << "PARAM SET: "<< name << "=" << val << std::endl;
}
return( set );
}
std::string Parameters::getParamValue(const std::string& name) {
std::string Parameters::getParamValue(const std::string& name)
{
std::string value = Parameters::kNotSetValue;
if(isValidParamName(name))
if(params_.find(name) != params_.end())
@ -117,43 +127,46 @@ std::string Parameters::getParamValue(const std::string& name) {
value = kFalseValue;
return value;
}
std::string Parameters::getParam(const std::string& name) {
std::string Parameters::getParam(const std::string& name)
{
return getParamValue(name);
/*void* Parameters::getParam(const std::string& name) {
void* paramVal = 0;
int type = getValueType(name);
const char* sval = getParamValue(name).c_str();
switch(type) {
case kIntValue: {
int ival = atoi(sval);
paramVal = (void*)&ival;
break;
/*void* Parameters::getParam(const std::string& name) {
void* paramVal = 0;
int type = getValueType(name);
const char* sval = getParamValue(name).c_str();
switch(type) {
case kIntValue: {
int ival = atoi(sval);
paramVal = (void*)&ival;
break;
}
case kFloatValue: {
float fval = atof(sval);
paramVal = (void*)&fval;
break;
}
case kStringValue: {
paramVal = (void*)sval;
break;
}
case kBoolValue: {
bool bval = sval == Parameters::kTrueValue ? true : false;
paramVal = (void*)&bval;
break;
}
default: // --> Parameters::kUndefinedValue
paramVal = (void*)sval; // will set to Parameters::kNotSetValue
}
case kFloatValue: {
float fval = atof(sval);
paramVal = (void*)&fval;
break;
}
case kStringValue: {
paramVal = (void*)sval;
break;
}
case kBoolValue: {
bool bval = sval == Parameters::kTrueValue ? true : false;
paramVal = (void*)&bval;
break;
}
default: // --> Parameters::kUndefinedValue
paramVal = (void*)sval; // will set to Parameters::kNotSetValue
}
return paramVal;*/
return paramVal;*/
}
bool Parameters::verifyValueType(const std::string& name, const std::string& val) {
bool Parameters::verifyValueType(const std::string& name, const std::string& val)
{
// Implement basic type checking
return true;
}
int Parameters::getParamCount() const {
int Parameters::getParamCount() const
{
return params_.size();
}
@ -161,7 +174,8 @@ int Parameters::getParamCount() const {
* HAVE TO CHANGE loadParams() from file to not overwrite command lines but
* override default if different*/
bool Parameters::loadParams(const std::string & file_path,
std::set<std::string>& setParams) {
std::set<std::string>& setParams)
{
// parameters loaded from file don't override cmd line parameters
/*std::set<std::string>::iterator end = setParams.end();
FileHandler file(file_path.c_str(), std::ios::in);

View File

@ -10,20 +10,22 @@
#include "utils.h"
#include "types.h"
#define NumOfParams(paramArray) (sizeof(paramArray)/sizeof((paramArray)[0]))
#define NumOfParams(paramArray) (sizeof(paramArray)/sizeof((paramArray)[0]))
namespace Moses {
namespace Moses
{
typedef struct ParamDefs {
std::string name;
std::string value;
std::string value;
std::string abbrev;
int type;
std::string description;
} ParamDefs;
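A hypothetical parameter table for this API, with the field order following the struct above (name, default value, abbreviation, type, description); every entry below is invented, and the sketch assumes params.h is included:

#include <iostream>
// #include "params.h" // assumed: provides Moses::Parameters and NumOfParams

const Moses::ParamDefs paramdefs[] = {
  // name      value  abbrev  type                              description
  { "order",   "5",   "n",    Moses::Parameters::kIntValue,    "LM order" },
  { "memory",  "64",  "m",    Moses::Parameters::kIntValue,    "MBs for model" },
  { "input",   "",    "i",    Moses::Parameters::kStringValue, "training text" }
};

int main(int argc, char** argv) {
  Moses::Parameters params(argc, argv, paramdefs, NumOfParams(paramdefs));
  std::cout << params.getParamValue("order") << std::endl; // "5" unless overridden
  return 0;
}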
class Parameters {
class Parameters
{
public:
static const std::string kNotSetValue;
static const std::string kNotSetValue;
static const int kBoolValue;
static const int kIntValue;
static const int kFloatValue;
@ -31,15 +33,15 @@ public:
static const int kUndefinedValue;
static const std::string kFalseValue;
static const std::string kTrueValue;
Parameters(const ParamDefs * paramdefs, const count_t paramNum);
Parameters(int argc, char** argv, const ParamDefs * paramdefs, const count_t paramNum);
~Parameters() {}
bool loadParams(int argc, char ** argv);
bool loadParams(const std::string& param_file, std::set<std::string>&);
int getValueType(const std::string & name);
bool setParamValue(const std::string& name, const std::string& value);
bool verifyValueType(const std::string& name, const std::string& value);
bool setParamValue(const std::string& name, const std::string& value);
bool verifyValueType(const std::string& name, const std::string& value);
bool isValidParamName(const std::string & name);
std::string getParamValue(const std::string& name);
//void* getParam(const std::string& name);

View File

@ -8,17 +8,18 @@
#include "RandLMFilter.h"
#include "quantizer.h"
/*
* PerfectHash handles setting up hash functions and storage
* for LM data.
*/
* PerfectHash handles setting up hash functions and storage
* for LM data.
*/
using randlm::Filter;
using randlm::BitFilter;
typedef std::map<string, count_t> hpDict_t;
typedef hpDict_t::iterator hpdEntry_t;
static count_t collisions_ = 0;
/* Based on Mortenson et al. 2006 */
/* Based on Mortenson et al. 2006 */
template<typename T>
class PerfectHash {
class PerfectHash
{
public:
PerfectHash(uint16_t MBs, int width, int bucketRange, float qBase);
PerfectHash(FileHandler* fin) {
@ -39,11 +40,11 @@ protected:
uint8_t* idxTracker_;
uint64_t insert(const wordID_t* IDs, const int len, const count_t value);
bool update(const wordID_t* IDs, const int len, const count_t value,
hpdEntry_t& hpdAddr, uint64_t& filterIdx);
hpdEntry_t& hpdAddr, uint64_t& filterIdx);
bool update2(const wordID_t* IDs, const int len, const count_t value,
hpdEntry_t& hpdAddr, uint64_t& filterIdx);
int query(const wordID_t* IDs, const int len,
hpdEntry_t& hpdAddr, uint64_t& filterIdx);
hpdEntry_t& hpdAddr, uint64_t& filterIdx);
int query(const wordID_t* IDs, const int len,
hpdEntry_t& hpdAddr, uint64_t& filterIdx);
virtual void remove(const wordID_t* IDs, const int len);
void remove(uint64_t index);
void save(FileHandler* fout);
@ -52,32 +53,33 @@ protected:
//pointer to a specific entry in a hpDict_t
virtual void markQueried(hpdEntry_t&)=0;
private:
T nonZeroSignature(const wordID_t* IDs, const int len, count_t bucket);
T nonZeroSignature(const wordID_t* IDs, const int len, count_t bucket);
string hpDictKeyValue(const wordID_t* IDs, const int len);
uint64_t memBound_; // total memory bound in bytes
uint16_t cellWidth_; // in bits
UnivHash_linear<count_t>* bucketHash_;
UnivHash_linear<count_t>* bucketHash_;
UnivHash_linear<T>* fingerHash_;
LogQtizer* qtizer_;
};
template<typename T>
PerfectHash<T>::PerfectHash(uint16_t MBs, int width, int bucketRange,
float qBase): hitMask_(1 << 31), memBound_(MBs * (1ULL << 20)),
cellWidth_(width) {
PerfectHash<T>::PerfectHash(uint16_t MBs, int width, int bucketRange,
float qBase): hitMask_(1 << 31), memBound_(MBs * (1ULL << 20)),
cellWidth_(width)
{
bucketRange_ = static_cast<uint8_t>(bucketRange);
if(bucketRange > 255) {
cerr << "ERROR: Max bucket range is > 2^8\n";
cerr << "ERROR: Max bucket range is > 2^8\n";
exit(1);
}
qtizer_ = new LogQtizer(qBase);
int valBits = (int)ceil(log2((float)qtizer_->maxcode()));
cerr << "BITS FOR VALUES ARRAY = " << valBits << endl;
uint64_t totalBits = memBound_ << 3;
cells_ = (uint64_t) ceil((float)totalBits / (float)(cellWidth_ + valBits)); // upper bound on cells
cells_ = (uint64_t) ceil((float)totalBits / (float)(cellWidth_ + valBits)); // upper bound on cells
cells_ += (cells_ % bucketRange_); // make cells multiple of bucket range
totBuckets_ = (cells_ / bucketRange_) - 1; // minus 1 so totBuckets * bucksize + bucksize = cells
filter_ = new Filter<T>(cells_, cellWidth_);
values_ = new Filter<T>(cells_, valBits);
values_ = new Filter<T>(cells_, valBits);
idxTracker_ = new uint8_t[totBuckets_];
for(int i=0; i < totBuckets_; ++i) idxTracker_[i] = 0;
// initialize ranges for each hash function
@ -85,7 +87,8 @@ PerfectHash<T>::PerfectHash(uint16_t MBs, int width, int bucketRange,
fingerHash_ = new UnivHash_linear<T>(pow(2.0f, cellWidth_), MAX_HASH_FUNCS, PRIME);
}
template<typename T>
PerfectHash<T>::~PerfectHash() {
PerfectHash<T>::~PerfectHash()
{
delete[] idxTracker_;
delete filter_;
filter_ = NULL;
@ -94,22 +97,22 @@ PerfectHash<T>::~PerfectHash() {
delete qtizer_;
delete values_;
}
template<typename T>
uint64_t PerfectHash<T>::insert(const wordID_t* IDs, const int len,
const count_t value) {
template<typename T>
uint64_t PerfectHash<T>::insert(const wordID_t* IDs, const int len,
const count_t value)
{
count_t bucket = (bucketHash_->size() > 1 ? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len));
if(idxTracker_[bucket] < (int)bucketRange_) { // if empty rows
if(idxTracker_[bucket] < (int)bucketRange_) { // if empty rows
// restriction on fprint value is non-zero
T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
uint64_t emptyidx = cells_ + 1;
uint64_t index = bucket * bucketRange_, // starting bucket row
lastrow = index + bucketRange_; // ending row
while(index < lastrow) { // unique so check each row for "matching" signature
lastrow = index + bucketRange_; // ending row
while(index < lastrow) { // unique so check each row for "matching" signature
T filterVal = filter_->read(index);
if((filterVal == 0) && (emptyidx == cells_ + 1)) { // record first empty row
if((filterVal == 0) && (emptyidx == cells_ + 1)) { // record first empty row
emptyidx = index;
}
else if(filterVal == fp) {
} else if(filterVal == fp) {
++collisions_;
dict_[hpDictKeyValue(IDs, len)] = value; // store exact in hpd
return cells_ + 1; // finished
@ -122,20 +125,20 @@ uint64_t PerfectHash<T>::insert(const wordID_t* IDs, const int len,
values_->write(emptyidx, code);
++idxTracker_[bucket]; // keep track of bucket size
return emptyidx;
}
else { // bucket is full
} else { // bucket is full
dict_[hpDictKeyValue(IDs, len)] = value; // add to hpd
return cells_ + 1;
}
}
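insert() above hashes the n-gram to a bucket, derives a non-zero fingerprint, and scans the bucket's rows: a matching fingerprint is a collision, so the exact n-gram spills into the high-performance dictionary; otherwise the first empty row receives the fingerprint and the quantized count. A self-contained toy of that row scan (all sizes and values are made up):

#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <vector>

int main() {
  const int bucketRange = 4, totBuckets = 8;
  std::vector<uint16_t> filter(bucketRange * totBuckets, 0); // fingerprints
  std::vector<uint8_t>  values(bucketRange * totBuckets, 0); // coded counts
  std::map<std::string, int> dict;                           // exact fallback

  auto insert = [&](uint64_t bucket, uint16_t fp, uint8_t code,
                    const std::string& key, int value) {
    const uint64_t first = bucket * bucketRange, last = first + bucketRange;
    uint64_t empty = last; // sentinel: no empty row seen yet
    for (uint64_t row = first; row < last; ++row) {
      if (filter[row] == fp) { dict[key] = value; return; } // collision
      if (filter[row] == 0 && empty == last) empty = row;   // first empty row
    }
    if (empty != last) { filter[empty] = fp; values[empty] = code; }
    else dict[key] = value;                                 // bucket full
  };

  insert(3, 0xBEEF, 7, "the quick fox", 3);
  insert(3, 0xBEEF, 9, "other ngram", 9); // same fingerprint -> dictionary
  std::cout << dict.size() << std::endl;  // prints 1
}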
template<typename T>
bool PerfectHash<T>::update(const wordID_t* IDs, const int len,
const count_t value, hpdEntry_t& hpdAddr, uint64_t& filterIdx) {
template<typename T>
bool PerfectHash<T>::update(const wordID_t* IDs, const int len,
const count_t value, hpdEntry_t& hpdAddr, uint64_t& filterIdx)
{
// check if key is in high perf. dictionary
filterIdx = cells_ + 1;
string skey = hpDictKeyValue(IDs, len);
if((hpdAddr = dict_.find(skey)) != dict_.end()) {
hpdAddr->second = value;
hpdAddr->second = value;
return true;
}
// else hash ngram
@ -144,66 +147,67 @@ bool PerfectHash<T>::update(const wordID_t* IDs, const int len,
// restriction on fprint value is non-zero
T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
uint64_t index = bucket * bucketRange_, // starting bucket row
lastrow = index + bucketRange_;
lastrow = index + bucketRange_;
while(index < lastrow) { // must check each row for matching fp event
T filterVal = filter_->read(index);
if(filterVal == fp) { // found event w.h.p.
values_->write(index, (T)qtizer_->code(value));
values_->write(index, (T)qtizer_->code(value));
filterIdx = index;
return true;
}
++index;
}
// could add if it gets here.
// could add if it gets here.
return false;
}
template<typename T>
int PerfectHash<T>::query(const wordID_t* IDs, const int len,
hpdEntry_t& hpdAddr, uint64_t& filterIdx) {
template<typename T>
int PerfectHash<T>::query(const wordID_t* IDs, const int len,
hpdEntry_t& hpdAddr, uint64_t& filterIdx)
{
// check if key is in high perf. dictionary
string skey = hpDictKeyValue(IDs, len);
if((hpdAddr = dict_.find(skey)) != dict_.end()) {
filterIdx = cells_ + 1;
return(hpdAddr->second); // returns copy of value
}
else { // check if key is in filter
// get bucket
} else { // check if key is in filter
// get bucket
//count_t bucket = bucketHash_->hash(IDs, len);
count_t bucket = (bucketHash_->size() > 1 ? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len));
// restriction on fprint value is non-zero
T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
// return value if ngram is in filter
uint64_t index = bucket * bucketRange_,
lastrow = index + bucketRange_;
lastrow = index + bucketRange_;
for(; index < lastrow; ++index) {
if(filter_->read(index) == fp) {
//cout << "fp = " << fp << "\tbucket = " << bucket << "\tfilter =" <<
//filter_->read(index) << "\tcode = " << code << endl;
//cout << "fp = " << fp << "\tbucket = " << bucket << "\tfilter =" <<
//filter_->read(index) << "\tcode = " << code << endl;
filterIdx = index;
hpdAddr = dict_.end();
return (int)qtizer_->value(values_->read(index));
return (int)qtizer_->value(values_->read(index));
}
}
}
return -1;
}
template<typename T>
void PerfectHash<T>::remove(const wordID_t* IDs, const int len) {
void PerfectHash<T>::remove(const wordID_t* IDs, const int len)
{
// delete key if in high perf. dictionary
string skey = hpDictKeyValue(IDs, len);
if(dict_.find(skey) != dict_.end())
dict_.erase(skey);
else { // check if key is in filter
// get small representation for ngrams
// get small representation for ngrams
//count_t bucket = bucketHash_->hash(IDs, len);
count_t bucket = (bucketHash_->size() > 1? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len));
// retrieve non zero fingerprint for ngram
T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
// return value if ngram is in filter
uint64_t index = bucket * bucketRange_,
lastrow = index + bucketRange_;
lastrow = index + bucketRange_;
for(; index < lastrow; ++index) {
if(filter_->read(index) == fp) {
if(filter_->read(index) == fp) {
filter_->write(index, 0);
values_->write(index, 0);
--idxTracker_[bucket]; // track bucket size reduction
@ -213,7 +217,8 @@ void PerfectHash<T>::remove(const wordID_t* IDs, const int len) {
}
}
template<typename T> // clear filter index
void PerfectHash<T>::remove(uint64_t index) {
void PerfectHash<T>::remove(uint64_t index)
{
CHECK(index < cells_);
CHECK(filter_->read(index) != 0); // slow
filter_->write(index, 0);
@ -224,19 +229,21 @@ void PerfectHash<T>::remove(uint64_t index) {
}
template<typename T>
T PerfectHash<T>::nonZeroSignature(const wordID_t* IDs, const int len,
count_t bucket) {
count_t bucket)
{
count_t h = bucket;
T fingerprint(0);
do {
fingerprint = fingerHash_->hash(IDs, len, h);
h += (h < fingerHash_->size() - 1 ? 1 : -h); // wrap around
h += (h < fingerHash_->size() - 1 ? 1 : -h); // wrap around
} while((fingerprint == 0) && (h != bucket));
if(fingerprint == 0)
if(fingerprint == 0)
cerr << "WARNING: Unable to find non-zero signature for ngram\n" << endl;
return fingerprint;
}
template<typename T>
string PerfectHash<T>::hpDictKeyValue(const wordID_t* IDs, const int len) {
string PerfectHash<T>::hpDictKeyValue(const wordID_t* IDs, const int len)
{
string skey(" ");
for(int i = 0; i < len; ++i)
skey += Utils::IntToStr(IDs[i]) + "¬";
@ -244,17 +251,20 @@ string PerfectHash<T>::hpDictKeyValue(const wordID_t* IDs, const int len) {
return skey;
}
template<typename T>
count_t PerfectHash<T>::hpDictMemUse() {
count_t PerfectHash<T>::hpDictMemUse()
{
// return hpDict memory usage in MBs
return (count_t) sizeof(hpDict_t::value_type)* dict_.size() >> 20;
}
template<typename T>
count_t PerfectHash<T>::bucketsMemUse() {
count_t PerfectHash<T>::bucketsMemUse()
{
// return bucket memory usage in MBs
return (count_t) (filter_->size() + values_->size());
return (count_t) (filter_->size() + values_->size());
}
template<typename T>
void PerfectHash<T>::save(FileHandler* fout) {
void PerfectHash<T>::save(FileHandler* fout)
{
CHECK(fout != 0);
cerr << "\tSaving perfect hash parameters...\n";
fout->write((char*)&hitMask_, sizeof(hitMask_));
@ -275,11 +285,12 @@ void PerfectHash<T>::save(FileHandler* fout) {
count_t size = dict_.size();
fout->write((char*)&size, sizeof(count_t));
*fout << endl;
iterate(dict_, t)
*fout << t->first << "\t" << t->second << "\n";
iterate(dict_, t)
*fout << t->first << "\t" << t->second << "\n";
}
template<typename T>
void PerfectHash<T>::load(FileHandler* fin) {
void PerfectHash<T>::load(FileHandler* fin)
{
CHECK(fin != 0);
cerr << "\tLoading perfect hash parameters...\n";
fin->read((char*)&hitMask_, sizeof(hitMask_));
@ -315,12 +326,13 @@ void PerfectHash<T>::load(FileHandler* fin) {
cerr << "Finished loading ORLM." << endl;
}
template<typename T>
void PerfectHash<T>::analyze() {
void PerfectHash<T>::analyze()
{
cerr << "Analyzing Dynamic Bloomier Filter...\n";
// see how many items in each bucket
uint8_t* bucketCnt = new uint8_t[totBuckets_];
unsigned largestBucket = 0, totalCellsSet = 0,
smallestBucket = bucketRange_, totalZeroes = 0;
unsigned largestBucket = 0, totalCellsSet = 0,
smallestBucket = bucketRange_, totalZeroes = 0;
int curBucket = -1, fullBuckets(0);
for(int i = 0; i < totBuckets_; ++i) bucketCnt[i] = 0;
for(uint64_t i =0; i < cells_; ++i) {
@ -328,16 +340,14 @@ void PerfectHash<T>::analyze() {
if(filter_->read(i) != 0) {
++bucketCnt[curBucket];
++totalCellsSet;
}
else ++totalZeroes;
} else ++totalZeroes;
}
count_t bi = 0, si = 0;
for(int i = 0; i < totBuckets_; ++i) {
if(bucketCnt[i] > largestBucket) {
largestBucket = bucketCnt[i];
bi = i;
}
else if(bucketCnt[i] < smallestBucket) {
} else if(bucketCnt[i] < smallestBucket) {
smallestBucket = bucketCnt[i];
si = i;
}
@ -350,8 +360,8 @@ void PerfectHash<T>::analyze() {
}
for(int i = 0; i < totBuckets_; ++i) {
if(bucketCnt[i] != idxTracker_[i])
cerr << "bucketCnt[" << i << "] = " << (int)bucketCnt[i] <<
"\tidxTracker_[" << i << "] = " << (int)idxTracker_[i] << endl;
cerr << "bucketCnt[" << i << "] = " << (int)bucketCnt[i] <<
"\tidxTracker_[" << i << "] = " << (int)idxTracker_[i] << endl;
}
cerr << "total cells= " << cells_ << endl;
cerr << "total buckets= " << totBuckets_ << endl;
@ -364,7 +374,7 @@ void PerfectHash<T>::analyze() {
cerr << "largest bucket (" << bi << ") size= " << largestBucket << endl;
cerr << "smallest bucket (" << si << ") size= " << smallestBucket << endl;
cerr << "last bucket size= " << (int)bucketCnt[totBuckets_ - 1] <<
" (idxTracker last bucket size = " << (int)idxTracker_[totBuckets_ - 1] << ")" << endl;
" (idxTracker last bucket size = " << (int)idxTracker_[totBuckets_ - 1] << ")" << endl;
cerr << "total buckets full = " << fullBuckets << endl;
cerr << "total collision errors= " << collisions_ << endl;
cerr << "high performance dictionary size= " << dict_.size() << endl;
@ -373,14 +383,15 @@ void PerfectHash<T>::analyze() {
cerr << "values MBs= " << values_->size() << endl;
delete[] bucketCnt;
}
template<typename T>
bool PerfectHash<T>::update2(const wordID_t* IDs, const int len,
const count_t value, hpdEntry_t& hpdAddr, uint64_t& filterIdx) {
template<typename T>
bool PerfectHash<T>::update2(const wordID_t* IDs, const int len,
const count_t value, hpdEntry_t& hpdAddr, uint64_t& filterIdx)
{
// check if key is in high perf. dictionary
filterIdx = cells_ + 1;
string skey = hpDictKeyValue(IDs, len);
if((hpdAddr = dict_.find(skey)) != dict_.end()) {
hpdAddr->second += value;
hpdAddr->second += value;
return true;
}
// else hash ngram
@ -389,18 +400,18 @@ bool PerfectHash<T>::update2(const wordID_t* IDs, const int len,
// restriction on fprint value is non-zero
T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
uint64_t index = bucket * bucketRange_, // starting bucket row
lastrow = index + bucketRange_;
lastrow = index + bucketRange_;
while(index < lastrow) { // must check each row for matching fp event
T filterVal = filter_->read(index);
if(filterVal == fp) { // found event w.h.p.
int oldval = (int)qtizer_->value(values_->read(index));
values_->write(index, (T)qtizer_->code(oldval + value));
int oldval = (int)qtizer_->value(values_->read(index));
values_->write(index, (T)qtizer_->code(oldval + value));
filterIdx = index;
return true;
}
++index;
}
// add if it gets here.
// add if it gets here.
insert(IDs, len, value);
return false;
}

View File

@ -8,7 +8,8 @@
#include "types.h"
static const float kFloatErr = 0.00001f;
class LogQtizer {
class LogQtizer
{
public:
LogQtizer(float i): base_(pow(2, 1 / i)) {
CHECK(base_ > 1);
@ -16,8 +17,8 @@ public:
float value = 1; // code = 1 -> value = 1 for any base
std::vector<float> code_to_value_vec;
while (log2(value) < 30) { // assume 2^30 is largest count
code_to_value_vec.push_back(value);
value = pow(base_, ++max_code_);
code_to_value_vec.push_back(value);
value = pow(base_, ++max_code_);
}
code_to_value_vec.push_back(value); // store max_code_ so in total [0, max_code_]
// get valid range
@ -40,22 +41,22 @@ public:
int code(float value) {
// should just be: return log_b(value)
CHECK(!(value < min_value_ || value > max_value_));
// but binary search removes errors due to floor operator above
int code = static_cast<int>(std::lower_bound(code_to_value_, code_to_value_+ max_code_,
value) - code_to_value_);
// make sure not overestimating
// but binary search removes errors due to floor operator above
int code = static_cast<int>(std::lower_bound(code_to_value_, code_to_value_+ max_code_,
value) - code_to_value_);
// make sure not overestimating
code = code_to_value_[code] > value ? code - 1 : code;
return code;
}
inline float value(int code) {
// table look up for values
// table look up for values
return code_to_value_[code];
}
inline int maxcode() {
return max_code_;
}
inline float logValue(int code) {
// table look up for log of values
// table look up for log of values
return code_to_log_value_[code];
}
~LogQtizer() {
@ -69,15 +70,15 @@ public:
fout->write((char*)&min_value_, sizeof(min_value_));
for (int j = 0; j <= max_code_; ++j)
fout->write((char*)&code_to_value_[j], sizeof(code_to_value_[j]));
for (int j = 0; j <= max_code_; ++j)
for (int j = 0; j <= max_code_; ++j)
fout->write((char*)&code_to_log_value_[j], sizeof(code_to_log_value_[j]));
std::cerr << "Saved log codebook with " << max_code_ + 1 << " codes." <<std::endl;
}
private:
float base_;
float* code_to_value_;
float* code_to_value_;
float* code_to_log_value_;
int max_code_;
int max_code_;
float max_value_;
float min_value_;
void load(FileHandler* fin) {

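LogQtizer above stores counts on a logarithmic scale: with base b = 2^(1/qBase), a count v is encoded as roughly log_b(v) and decoded as b^code, so every stored count carries a bounded relative error of about b - 1 (roughly 9% for the default qBase of 8). A standalone sketch of encode/decode under those assumptions:

#include <cmath>
#include <iostream>

int main() {
  const float qBase = 8.0f;                        // codes per doubling
  const float base = std::pow(2.0f, 1.0f / qBase); // ~1.09
  const float v = 1000.0f;
  const int code = static_cast<int>(std::log(v) / std::log(base)); // encode
  const float decoded = std::pow(base, static_cast<float>(code));  // decode
  std::cout << code << " " << decoded << std::endl; // 79 and ~939
}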
View File

@ -103,10 +103,11 @@ bool Vocab::Load(const std::string & vocab_path, const FactorDirection& directio
std::cerr << "Loading vocab from " << vocab_path << std::endl;
return Load(&vcbin, direction, factors, closed);
}
bool Vocab::Load(FileHandler* vcbin) {
bool Vocab::Load(FileHandler* vcbin)
{
FactorList factors;
factors.push_back(0);
return Load(vcbin, Input, factors);
return Load(vcbin, Input, factors);
}
bool Vocab::Load(FileHandler* vcbin, const FactorDirection& direction,
const FactorList& factors, bool closed)

View File

@ -74,12 +74,12 @@ int DynSuffixArray::F_firstIdx(unsigned word)
// return index of first row where word is found in m_F
/*for(int i=0; i < m_F->size(); ++i) {
if(m_F->at(i) == word) {
return i;
return i;
}
}
return -1;*/
//NOTE: lower_bound is faster than linear search above but may cause issues
// if ordering of vocab is not consecutive (i.e. after deletions)
//NOTE: lower_bound is faster than linear search above but may cause issues
// if ordering of vocab is not consecutive (i.e. after deletions)
int low = std::lower_bound(m_F->begin(), m_F->end(), word) - m_F->begin();
//cerr << "in F_firstIdx with word = " << word << " and low = " << low << " and F->size() =" << m_F->size() << endl;
if(low >= m_F->size())
@ -146,8 +146,8 @@ void DynSuffixArray::Reorder(unsigned j, unsigned jprime)
{
set<pair<unsigned, unsigned> > seen;
while(j != jprime) {
// this 'seenit' check added for data with many loops. will remove after double
// checking.
// this 'seenit' check added for data with many loops. will remove after double
// checking.
bool seenit = seen.insert(std::make_pair(j, jprime)).second;
if(seenit) {
for(int i=1; i < m_SA->size(); ++i) {
@ -163,9 +163,9 @@ void DynSuffixArray::Reorder(unsigned j, unsigned jprime)
int new_j = LastFirstFunc(j);
CHECK(j <= jprime);
// for SA and L, the element at pos j is moved to pos j'
m_L->insert(m_L->begin() + jprime + 1, m_L->at(j));
m_L->insert(m_L->begin() + jprime + 1, m_L->at(j));
m_L->erase(m_L->begin() + j);
m_SA->insert(m_SA->begin() + jprime + 1, m_SA->at(j));
m_SA->insert(m_SA->begin() + jprime + 1, m_SA->at(j));
m_SA->erase(m_SA->begin() + j);
// all ISA values between (j...j'] decremented
for(size_t i = 0; i < m_ISA->size(); ++i) {

View File

@ -33,9 +33,9 @@ namespace Moses
class FactorFriend;
class FactorCollection;
/** Represents a factor (word, POS, etc).
*
* A Factor has a contiguous identifier and string value.
*/
class Factor
{
@ -45,17 +45,17 @@ class Factor
friend class FactorCollection;
friend class FactorFriend;
// FactorCollection writes here.
std::string m_string;
size_t m_id;
//! protected constructor. only friend class, FactorCollection, is allowed to create Factor objects
Factor() {}
// Needed for STL containers. They'll delegate through FactorFriend, which is never exposed publicly.
Factor(const Factor &factor) : m_string(factor.m_string), m_id(factor.m_id) {}
// Not implemented. Shouldn't be called.
Factor &operator=(const Factor &factor);
public:

View File

@ -33,7 +33,7 @@ FactorCollection FactorCollection::s_instance;
const Factor *FactorCollection::AddFactor(const StringPiece &factorString)
{
// Sorry this is so complicated. Can't we just require everybody to use Boost >= 1.42? The issue is that I can't check BOOST_VERSION unless we have Boost.
#ifdef WITH_THREADS
#if BOOST_VERSION < 104200
FactorFriend to_ins;
@ -42,7 +42,7 @@ const Factor *FactorCollection::AddFactor(const StringPiece &factorString)
{
boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
#if BOOST_VERSION >= 104200
// If this line doesn't compile, upgrade your Boost.
Set::const_iterator i = m_set.find(factorString, HashFactor(), EqualsFactor());
#else // BOOST_VERSION
Set::const_iterator i = m_set.find(to_ins);

View File

@ -47,7 +47,7 @@ namespace Moses
* private and friended to FactorFriend. The STL containers can delegate
* copying, so friending the container isn't sufficient. STL containers see
* FactorFriend's public copy constructor and everybody else sees Factor's
* private copy constructor.
*/
struct FactorFriend {
Factor in;
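The FactorFriend comment describes a reusable idiom: keep the real class's copy constructor private so only the owning collection can mint instances, and give containers a thin friend wrapper whose implicitly generated copy constructor delegates to the private one. A stripped-down sketch with invented names:

#include <cstdio>
#include <vector>

class Collection; // the only class allowed to make Guarded objects

class Guarded {
  friend class Collection;
  friend struct Wrapper;
  int id_;
  Guarded() : id_(0) {}
  Guarded(const Guarded &o) : id_(o.id_) {} // private: outsiders cannot copy
public:
  int Id() const { return id_; }
};

// Containers copy Wrapper (public copy constructor), never Guarded directly;
// friendship lets Wrapper's implicit members reach Guarded's private ones.
struct Wrapper {
  Guarded in;
};

class Collection {
  std::vector<Wrapper> store_;
public:
  const Guarded *Add(int id) {
    Wrapper w;
    w.in.id_ = id;
    store_.push_back(w); // legal copy via the friend wrapper
    // (a real collection would use a node-based container so that
    // pointers stay valid as it grows)
    return &store_.back().in;
  }
};

int main() {
  Collection c;
  std::printf("added factor id %d\n", c.Add(42)->Id());
  return 0;
}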

View File

@ -30,20 +30,24 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
using namespace std;
namespace Moses
{
LanguageModel::LanguageModel()
{
m_enableOOVFeature = StaticData::Instance().GetLMEnableOOVFeature();
}
void LanguageModel::Init(ScoreIndexManager &scoreIndexManager)
{
scoreIndexManager.AddScoreProducer(this);
}
LanguageModel::~LanguageModel() {}
// don't inline virtual funcs...
size_t LanguageModel::GetNumScoreComponents() const
{
if (m_enableOOVFeature) {
return 2;
} else {
@ -51,13 +55,15 @@ size_t LanguageModel::GetNumScoreComponents() const {
}
}
float LanguageModel::GetWeight() const
{
size_t lmIndex = StaticData::Instance().GetScoreIndexManager().
GetBeginIndex(GetScoreBookkeepingID());
return StaticData::Instance().GetAllWeights()[lmIndex];
}
float LanguageModel::GetOOVWeight() const
{
if (!m_enableOOVFeature) return 0;
size_t lmIndex = StaticData::Instance().GetScoreIndexManager().
GetBeginIndex(GetScoreBookkeepingID());

View File

@ -35,7 +35,8 @@ class Phrase;
class ScoreIndexManager;
//! Abstract base class which represent a language model on a contiguous phrase
class LanguageModel : public StatefulFeatureFunction
{
protected:
LanguageModel();
@ -43,11 +44,11 @@ protected:
void Init(ScoreIndexManager &scoreIndexManager);
bool m_enableOOVFeature;
public:
virtual ~LanguageModel();
// Make another feature without copying the underlying model data.
virtual LanguageModel *Duplicate(ScoreIndexManager &scoreIndexManager) const = 0;
//! see ScoreProducer.h

View File

@ -10,10 +10,12 @@
namespace Moses
{
LanguageModelDMapLM::LanguageModelDMapLM() : m_lm(0)
{
}
LanguageModelDMapLM::~LanguageModelDMapLM()
{
delete m_lm;
}
@ -51,8 +53,8 @@ void LanguageModelDMapLM::CreateFactor(FactorCollection& factorCollection)
}
LMResult LanguageModelDMapLM::GetValueGivenState(
const std::vector<const Word*>& contextFactor,
FFState& state) const
{
DMapLMState& cast_state = static_cast<DMapLMState&>(state);
LMResult result;
@ -65,8 +67,8 @@ LMResult LanguageModelDMapLM::GetValueGivenState(
}
LMResult LanguageModelDMapLM::GetValueForgotState(
const std::vector<const Word*>& contextFactor,
FFState& outState) const
{
DMapLMState& cast_state = static_cast<DMapLMState&>(outState);
LMResult result;
@ -78,13 +80,13 @@ LMResult LanguageModelDMapLM::GetValueForgotState(
}
float LanguageModelDMapLM::GetValue(
const std::vector<const Word*>& contextFactor,
size_t target_order,
size_t* succeeding_order) const
{
FactorType factorType = GetFactorType();
float score;
std::string ngram_string("");
ngram_string.append(((*contextFactor[0])[factorType])->GetString());
for (size_t i = 1; i < contextFactor.size(); ++i) {
@ -97,38 +99,44 @@ float LanguageModelDMapLM::GetValue(
return score;
}
const FFState* LanguageModelDMapLM::GetNullContextState() const
{
DMapLMState* state = new DMapLMState();
state->m_last_succeeding_order = GetNGramOrder();
return state;
}
FFState* LanguageModelDMapLM::GetNewSentenceState() const
{
DMapLMState* state = new DMapLMState();
state->m_last_succeeding_order = GetNGramOrder();
return state;
}
const FFState* LanguageModelDMapLM::GetBeginSentenceState() const
{
DMapLMState* state = new DMapLMState();
state->m_last_succeeding_order = GetNGramOrder();
return state;
}
FFState* LanguageModelDMapLM::NewState(const FFState* state) const
{
DMapLMState* new_state = new DMapLMState();
const DMapLMState* cast_state = static_cast<const DMapLMState*>(state);
new_state->m_last_succeeding_order = cast_state->m_last_succeeding_order;
return new_state;
}
void LanguageModelDMapLM::CleanUpAfterSentenceProcessing()
{
m_lm->printStats();
m_lm->resetStats();
m_lm->clearCaches();
}
void LanguageModelDMapLM::InitializeBeforeSentenceProcessing()
{
}
} // namespace Moses

View File

@ -12,20 +12,22 @@
#include "LM/SingleFactor.h"
#include "Util.h"
namespace Moses
{
class DMapLMState : public FFState
{
public:
int Compare(const FFState &o) const {
const DMapLMState& cast_other = static_cast<const DMapLMState&>(o);
if (cast_other.m_last_succeeding_order < m_last_succeeding_order)
return -1;
else if (cast_other.m_last_succeeding_order > m_last_succeeding_order)
return 1;
else
return 0;
}
uint8_t m_last_succeeding_order;
};
class LanguageModelDMapLM : public LanguageModelSingleFactor
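For readers new to Moses feature states: Compare() above is what makes hypothesis recombination work; it must define a consistent ordering, and 0 means the two states are interchangeable for this feature. A tiny self-contained illustration of the contract (not the Moses header itself):

#include <stdint.h>
#include <cstdio>

// Minimal stand-in for an FFState: Compare returns <0, 0, >0 like strcmp;
// 0 tells the decoder the two hypotheses may be recombined.
struct ToyState {
  uint8_t last_succeeding_order;
  int Compare(const ToyState &o) const {
    if (last_succeeding_order < o.last_succeeding_order) return -1;
    if (last_succeeding_order > o.last_succeeding_order) return 1;
    return 0;
  }
};

int main() {
  ToyState a = {3}, b = {5};
  std::printf("a vs b: %d, a vs a: %d\n", a.Compare(b), a.Compare(a));
  return 0;
}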

View File

@ -69,7 +69,7 @@ bool LanguageModelIRST::Load(const std::string &filePath,
m_filePath = filePath;
m_lmtb = m_lmtb->CreateLanguageModel(m_filePath);
m_lmtb->setMaxLoadedLevel(1000);
m_lmtb->load(m_filePath);
d=m_lmtb->getDict();
@ -140,7 +140,7 @@ int LanguageModelIRST::GetLmID( const std::string &str ) const
}
int LanguageModelIRST::GetLmID( const Factor *factor ) const
{
size_t factorId = factor->GetId();
if ((factorId >= m_lmIdLookup.size()) || (m_lmIdLookup[factorId] == m_empty)) {
@ -150,12 +150,12 @@ int LanguageModelIRST::GetLmID( const Factor *factor ) const
//////////
///Since Moses does not distinguish between the factorIDs of source words
///and target words, a target word whose target code has not yet been
///computed may still have a known factorID (and therefore one smaller
///than m_lmIdLookup.size()). These undetermined target-code cases must
///therefore be identified. Currently, this check is implemented by
///setting to m_empty all entries that have not yet
//received an actual target code
///////////
@ -167,7 +167,7 @@ int LanguageModelIRST::GetLmID( const Factor *factor ) const
/// AT POSITION (factorID-1) instead of at position factorID, where we later go and read it (see case C
/// That way it works ....
/// I have a doubt about what sits in the first positions of m_lmIdLookup
/// so
/// I find that one entry in every two stays empty
/// because factorID grows in steps of two (since it encodes both source and target), leaving position (factorID-1) "empty"
/// this causes no correctness problems, only a "waste" of memory
@ -177,10 +177,10 @@ int LanguageModelIRST::GetLmID( const Factor *factor ) const
////////////////
if (factorId >= m_lmIdLookup.size()) {
//resize and fill with m_empty
//increment the array more than needed to avoid too many resizing operation.
m_lmIdLookup.resize(factorId+10, m_empty);
}
//insert new code
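The resize above deliberately overshoots (factorId+10) and fills the new slots with m_empty, so a run of fresh factor ids triggers one allocation instead of ten. A toy sketch of the same grow-on-demand lookup (names invented for illustration):

#include <cstdio>
#include <vector>

static const int kEmpty = -1;

// Grow-on-demand map from dense ids to codes, with slack to amortize resizes.
int GetCode(std::vector<int> &lookup, size_t id) {
  if (id >= lookup.size())
    lookup.resize(id + 10, kEmpty); // overshoot so the next few ids are free
  if (lookup[id] == kEmpty)
    lookup[id] = (int)id * 2; // stand-in for "ask the LM dictionary"
  return lookup[id];
}

int main() {
  std::vector<int> lookup;
  std::printf("%d %d\n", GetCode(lookup, 7), GetCode(lookup, 7));
  std::printf("table size after one growth: %u\n", (unsigned)lookup.size());
  return 0;
}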

View File

@ -68,8 +68,9 @@ void LanguageModelImplementation::GetState(
GetValueForgotState(contextFactor, state);
}
// Calculate score of a phrase.
void LanguageModelImplementation::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const
{
fullScore = 0;
ngramScore = 0;
@ -81,7 +82,7 @@ void LanguageModelImplementation::CalcScore(const Phrase &phrase, float &fullSco
vector<const Word*> contextFactor;
contextFactor.reserve(GetNGramOrder());
std::auto_ptr<FFState> state(NewState((phrase.GetWord(0) == GetSentenceStartArray()) ?
GetBeginSentenceState() : GetNullContextState()));
size_t currPos = 0;
while (currPos < phraseSize) {
const Word &word = phrase.GetWord(currPos);
@ -108,7 +109,7 @@ void LanguageModelImplementation::CalcScore(const Phrase &phrase, float &fullSco
fullScore += result.score;
if (contextFactor.size() == GetNGramOrder())
ngramScore += result.score;
if (result.unknown) ++oovCount;
}
}
@ -116,7 +117,8 @@ void LanguageModelImplementation::CalcScore(const Phrase &phrase, float &fullSco
}
}
FFState *LanguageModelImplementation::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out, const LanguageModel *feature) const
{
// In this function, we only compute the LM scores of n-grams that overlap a
// phrase boundary. Phrase-internal scores are taken directly from the
// translation option.
@ -178,9 +180,7 @@ FFState *LanguageModelImplementation::Evaluate(const Hypothesis &hypo, const FFS
contextFactor[i] = &hypo.GetWord((size_t)currPos);
}
lmScore += GetValueForgotState(contextFactor, *res).score;
} else {
if (endPos < currEndPos) {
//need to get the LM state (otherwise the last LM state is fine)
for (size_t currPos = endPos+1; currPos <= currEndPos; currPos++) {
@ -207,10 +207,11 @@ FFState *LanguageModelImplementation::Evaluate(const Hypothesis &hypo, const FFS
return res;
}
namespace
{
// This is the FFState used by LanguageModelImplementation::EvaluateChart.
// Though svn blame goes back to heafield, don't blame me. I just moved this from LanguageModelChartState.cpp and ChartHypothesis.cpp.
class LanguageModelChartState : public FFState
{
private:
@ -223,12 +224,11 @@ private:
const ChartHypothesis &m_hypo;
/** Construct the prefix string of up to specified size
* \param ret prefix string
* \param size maximum size (typically max lm context window)
*/
size_t CalcPrefix(const ChartHypothesis &hypo, int featureID, Phrase &ret, size_t size) const {
const TargetPhrase &target = hypo.GetCurrTargetPhrase();
const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
target.GetAlignmentInfo().GetNonTermIndexMap();
@ -257,13 +257,12 @@ private:
return size;
}
/** Construct the suffix phrase of up to specified size
* will always be called after the construction of prefix phrase
* \param ret suffix phrase
* \param size maximum size of suffix
*/
size_t CalcSuffix(const ChartHypothesis &hypo, int featureID, Phrase &ret, size_t size) const {
CHECK(m_contextPrefix.GetSize() <= m_numTargetTerminals);
// special handling for small hypotheses
@ -292,8 +291,7 @@ private:
size_t nonTermInd = nonTermIndexMap[pos];
const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermInd);
size = static_cast<const LanguageModelChartState*>(prevHypo->GetFFState(featureID))->CalcSuffix(*prevHypo, featureID, ret, size);
} else {
ret.PrependWord(hypo.GetCurrTargetPhrase().GetWord(pos));
size--;
}
@ -309,11 +307,10 @@ private:
public:
LanguageModelChartState(const ChartHypothesis &hypo, int featureID, size_t order)
:m_lmRightContext(NULL)
,m_contextPrefix(order - 1)
,m_contextSuffix( order - 1)
,m_hypo(hypo) {
m_numTargetTerminals = hypo.GetCurrTargetPhrase().GetNumTerminals();
for (std::vector<const ChartHypothesis*>::const_iterator i = hypo.GetPrevHypos().begin(); i != hypo.GetPrevHypos().end(); ++i) {
@ -334,8 +331,12 @@ public:
m_lmRightContext = rightState;
}
float GetPrefixScore() const {
return m_prefixScore;
}
FFState* GetRightContext() const {
return m_lmRightContext;
}
size_t GetNumTargetTerminals() const {
return m_numTargetTerminals;
@ -353,8 +354,7 @@ public:
dynamic_cast<const LanguageModelChartState &>( o );
// prefix
if (m_hypo.GetCurrSourceRange().GetStartPos() > 0) { // not for "<s> ..."
int ret = GetPrefix().Compare(other.GetPrefix());
if (ret != 0)
return ret;
@ -362,8 +362,7 @@ public:
// suffix
size_t inputSize = m_hypo.GetManager().GetSource().GetSize();
if (m_hypo.GetCurrSourceRange().GetEndPos() < inputSize - 1) { // not for "... </s>"
int ret = other.GetRightContext()->Compare(*m_lmRightContext);
if (ret != 0)
return ret;
@ -374,7 +373,8 @@ public:
} // namespace
FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection* out, const LanguageModel *scorer) const
{
LanguageModelChartState *ret = new LanguageModelChartState(hypo, featureID, GetNGramOrder());
// data structure for factored context phrase (history and predicted word)
vector<const Word*> contextFactor;
@ -394,33 +394,28 @@ FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo,
// loop over rule
for (size_t phrasePos = 0, wordPos = 0;
phrasePos < hypo.GetCurrTargetPhrase().GetSize();
phrasePos++) {
// consult rule for either word or non-terminal
const Word &word = hypo.GetCurrTargetPhrase().GetWord(phrasePos);
// regular word
if (!word.IsNonTerminal()) {
ShiftOrPush(contextFactor, word);
// beginning of sentence symbol <s>? -> just update state
if (word == GetSentenceStartArray()) {
CHECK(phrasePos == 0);
delete lmState;
lmState = NewState( GetBeginSentenceState() );
}
// score a regular word added by the rule
else {
updateChartScore( &prefixScore, &finalizedScore, UntransformLMScore(GetValueGivenState(contextFactor, *lmState).score), ++wordPos );
}
}
// non-terminal, add phrase from underlying hypothesis
else {
// look up underlying hypothesis
size_t nonTermIndex = nonTermIndexMap[phrasePos];
const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndex);
@ -444,8 +439,7 @@ FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo,
// push suffix
int suffixPos = prevState->GetSuffix().GetSize() - (GetNGramOrder()-1);
if (suffixPos < 0) suffixPos = 0; // push all words if less than order
for(; (size_t)suffixPos < prevState->GetSuffix().GetSize(); suffixPos++) {
const Word &word = prevState->GetSuffix().GetWord(suffixPos);
ShiftOrPush(contextFactor, word);
wordPos++;
@ -453,22 +447,19 @@ FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo,
}
// internal non-terminal
else {
// score its prefix
for(size_t prefixPos = 0;
prefixPos < GetNGramOrder()-1 // up to LM order window
&& prefixPos < subPhraseLength; // up to length
prefixPos++) {
const Word &word = prevState->GetPrefix().GetWord(prefixPos);
ShiftOrPush(contextFactor, word);
updateChartScore( &prefixScore, &finalizedScore, UntransformLMScore(GetValueGivenState(contextFactor, *lmState).score), ++wordPos );
}
// check if we are dealing with a large sub-phrase
if (subPhraseLength > GetNGramOrder() - 1) {
// add its finalized language model score
finalizedScore +=
prevHypo->GetScoreBreakdown().GetScoresForProducer(scorer)[0] // full score
@ -503,11 +494,11 @@ FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo,
return ret;
}
void LanguageModelImplementation::updateChartScore(float *prefixScore, float *finalizedScore, float score, size_t wordPos) const
{
if (wordPos < GetNGramOrder()) {
*prefixScore += score;
} else {
*finalizedScore += score;
}
}
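updateChartScore splits LM mass by position: words at wordPos < order may still gain left context when this hypothesis is plugged into a larger rule, so their score stays provisional in prefixScore; later words are finalized. A worked toy example, assuming a 3-gram model:

#include <cstdio>

// Mirror of the split above: wordPos counts words scored so far (1-based).
void Update(float *prefix, float *finalized, float score,
            unsigned wordPos, unsigned order) {
  if (wordPos < order) *prefix += score;
  else *finalized += score;
}

int main() {
  float prefix = 0, finalized = 0;
  const unsigned order = 3;
  float scores[] = {-1.2f, -0.7f, -2.0f, -0.4f};
  for (unsigned pos = 1; pos <= 4; ++pos)
    Update(&prefix, &finalized, scores[pos - 1], pos, order);
  // words 1-2 are provisional (may be rescored with fuller left context),
  // words 3-4 are final
  std::printf("prefix=%.1f finalized=%.1f\n", prefix, finalized);
  return 0;
}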

View File

@ -45,7 +45,7 @@ class Phrase;
struct LMResult {
// log probability
float score;
// Is the word unknown?
bool unknown;
};
@ -126,54 +126,55 @@ public:
virtual void CleanUpAfterSentenceProcessing() {};
};
class LMRefCount : public LanguageModel
{
public:
LMRefCount(ScoreIndexManager &scoreIndexManager, LanguageModelImplementation *impl) : m_impl(impl) {
Init(scoreIndexManager);
}
LanguageModel *Duplicate(ScoreIndexManager &scoreIndexManager) const {
return new LMRefCount(scoreIndexManager, *this);
}
void InitializeBeforeSentenceProcessing() {
m_impl->InitializeBeforeSentenceProcessing();
}
void CleanUpAfterSentenceProcessing() {
m_impl->CleanUpAfterSentenceProcessing();
}
const FFState* EmptyHypothesisState(const InputType &/*input*/) const {
return m_impl->NewState(m_impl->GetBeginSentenceState());
}
bool Useable(const Phrase &phrase) const {
return m_impl->Useable(phrase);
}
void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const {
return m_impl->CalcScore(phrase, fullScore, ngramScore, oovCount);
}
FFState* Evaluate(const Hypothesis& cur_hypo, const FFState* prev_state, ScoreComponentCollection* accumulator) const {
return m_impl->Evaluate(cur_hypo, prev_state, accumulator, this);
}
FFState* EvaluateChart(const ChartHypothesis& cur_hypo, int featureID, ScoreComponentCollection* accumulator) const {
return m_impl->EvaluateChart(cur_hypo, featureID, accumulator, this);
}
std::string GetScoreProducerDescription(unsigned int param) const {
return m_impl->GetScoreProducerDescription(param);
}
private:
LMRefCount(ScoreIndexManager &scoreIndexManager, const LMRefCount &copy_from) : m_impl(copy_from.m_impl) {
Init(scoreIndexManager);
}
boost::shared_ptr<LanguageModelImplementation> m_impl;
};
}
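LMRefCount is a small but important pattern: Duplicate() creates a fresh feature wrapper while the boost::shared_ptr keeps a single LanguageModelImplementation alive, so a large model is loaded once and shared. A reduced sketch of the ownership (hypothetical names, not the Moses classes):

#include <boost/shared_ptr.hpp>
#include <cstdio>

struct Impl {
  // pretend this holds gigabytes of n-gram tables
  float Score() const { return -4.2f; }
};

class Feature {
public:
  explicit Feature(Impl *impl) : m_impl(impl) {}
  // New wrapper, same underlying model: only the ref-count changes.
  Feature *Duplicate() const { return new Feature(*this); }
  float Score() const { return m_impl->Score(); }
private:
  boost::shared_ptr<Impl> m_impl;
};

int main() {
  Feature a(new Impl());
  Feature *b = a.Duplicate();
  std::printf("%f %f\n", a.Score(), b->Score()); // one Impl, two features
  delete b;
  return 0;
}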

View File

@ -43,8 +43,10 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
using namespace std;
namespace Moses
{
namespace
{
struct KenLMState : public FFState {
lm::ngram::State state;
@ -59,67 +61,69 @@ struct KenLMState : public FFState {
/*
* An implementation of single factor LM using Ken's code.
*/
template <class Model> class LanguageModelKen : public LanguageModel
{
public:
LanguageModelKen(const std::string &file, ScoreIndexManager &manager, FactorType factorType, bool lazy);
LanguageModel *Duplicate(ScoreIndexManager &scoreIndexManager) const;
bool Useable(const Phrase &phrase) const {
return (phrase.GetSize()>0 && phrase.GetFactor(0, m_factorType) != NULL);
}
std::string GetScoreProducerDescription(unsigned) const {
std::ostringstream oss;
oss << "LM_" << m_ngram->Order() << "gram";
return oss.str();
}
const FFState *EmptyHypothesisState(const InputType &/*input*/) const {
KenLMState *ret = new KenLMState();
ret->state = m_ngram->BeginSentenceState();
return ret;
}
void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const;
FFState *Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const;
FFState *EvaluateChart(const ChartHypothesis& cur_hypo, int featureID, ScoreComponentCollection *accumulator) const;
private:
LanguageModelKen(ScoreIndexManager &manager, const LanguageModelKen<Model> &copy_from);
lm::WordIndex TranslateID(const Word &word) const {
std::size_t factor = word.GetFactor(m_factorType)->GetId();
return (factor >= m_lmIdLookup.size() ? 0 : m_lmIdLookup[factor]);
}
// Convert last words of hypothesis into vocab ids, returning an end pointer.
lm::WordIndex *LastIDs(const Hypothesis &hypo, lm::WordIndex *indices) const {
lm::WordIndex *index = indices;
lm::WordIndex *end = indices + m_ngram->Order() - 1;
int position = hypo.GetCurrTargetWordsRange().GetEndPos();
for (; ; ++index, --position) {
if (position == -1) {
*index = m_ngram->GetVocabulary().BeginSentence();
return index + 1;
}
if (index == end) return index;
*index = TranslateID(hypo.GetWord(position));
}
}
boost::shared_ptr<Model> m_ngram;
std::vector<lm::WordIndex> m_lmIdLookup;
FactorType m_factorType;
const Factor *m_beginSentenceFactor;
};
class MappingBuilder : public lm::EnumerateVocab
{
public:
MappingBuilder(FactorCollection &factorCollection, std::vector<lm::WordIndex> &mapping)
: m_factorCollection(factorCollection), m_mapping(mapping) {}
@ -138,11 +142,13 @@ private:
std::vector<lm::WordIndex> &m_mapping;
};
template <class Model> LanguageModelKen<Model>::LanguageModelKen(const std::string &file, ScoreIndexManager &manager, FactorType factorType, bool lazy) : m_factorType(factorType)
{
lm::ngram::Config config;
IFVERBOSE(1) {
config.messages = &std::cerr;
}
else {
config.messages = NULL;
}
FactorCollection &collection = FactorCollection::Instance();
@ -156,20 +162,23 @@ template <class Model> LanguageModelKen<Model>::LanguageModelKen(const std::stri
Init(manager);
}
template <class Model> LanguageModel *LanguageModelKen<Model>::Duplicate(ScoreIndexManager &manager) const
{
return new LanguageModelKen<Model>(manager, *this);
}
template <class Model> LanguageModelKen<Model>::LanguageModelKen(ScoreIndexManager &manager, const LanguageModelKen<Model> &copy_from) :
m_ngram(copy_from.m_ngram),
// TODO: don't copy this.
m_lmIdLookup(copy_from.m_lmIdLookup),
m_factorType(copy_from.m_factorType),
m_beginSentenceFactor(copy_from.m_beginSentenceFactor)
{
Init(manager);
}
template <class Model> void LanguageModelKen<Model>::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const
{
fullScore = 0;
ngramScore = 0;
oovCount = 0;
@ -186,13 +195,13 @@ template <class Model> void LanguageModelKen<Model>::CalcScore(const Phrase &phr
*state0 = m_ngram->NullContextState();
position = 0;
}
size_t ngramBoundary = m_ngram->Order() - 1;
for (; position < phrase.GetSize(); ++position) {
const Word &word = phrase.GetWord(position);
if (word.IsNonTerminal()) {
// If there's a non-terminal at 1 and we have a 5-gram LM, then positions 2 3 4 and 5 will be incomplete while position 6 is complete.
ngramBoundary = m_ngram->Order() + position;
*state0 = m_ngram->NullContextState();
} else {
@ -210,11 +219,12 @@ template <class Model> void LanguageModelKen<Model>::CalcScore(const Phrase &phr
}
}
template <class Model> FFState *LanguageModelKen<Model>::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const
{
const lm::ngram::State &in_state = static_cast<const KenLMState&>(*ps).state;
std::auto_ptr<KenLMState> ret(new KenLMState());
if (!hypo.GetCurrTargetLength()) {
ret->state = in_state;
return ret.release();
@ -237,17 +247,17 @@ template <class Model> FFState *LanguageModelKen<Model>::Evaluate(const Hypothes
}
if (hypo.IsSourceCompleted()) {
// Score end of sentence.
// Score end of sentence.
std::vector<lm::WordIndex> indices(m_ngram->Order() - 1);
const lm::WordIndex *last = LastIDs(hypo, &indices.front());
score += m_ngram->FullScoreForgotState(&indices.front(), last, m_ngram->GetVocabulary().EndSentence(), ret->state).prob;
} else if (adjust_end < end) {
// Get state after adding a long phrase.
std::vector<lm::WordIndex> indices(m_ngram->Order() - 1);
const lm::WordIndex *last = LastIDs(hypo, &indices.front());
m_ngram->GetState(&indices.front(), last, ret->state);
} else if (state0 != &ret->state) {
// Short enough phrase that we can just reuse the state.
ret->state = *state0;
}
@ -265,32 +275,37 @@ template <class Model> FFState *LanguageModelKen<Model>::Evaluate(const Hypothes
return ret.release();
}
class LanguageModelChartStateKenLM : public FFState
{
public:
LanguageModelChartStateKenLM() {}
const lm::ngram::ChartState &GetChartState() const {
return m_state;
}
lm::ngram::ChartState &GetChartState() {
return m_state;
}
int Compare(const FFState& o) const {
const LanguageModelChartStateKenLM &other = static_cast<const LanguageModelChartStateKenLM&>(o);
int ret = m_state.Compare(other.m_state);
return ret;
}
private:
lm::ngram::ChartState m_state;
};
template <class Model> FFState *LanguageModelKen<Model>::EvaluateChart(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection *accumulator) const
{
LanguageModelChartStateKenLM *newState = new LanguageModelChartStateKenLM();
lm::ngram::RuleScore<Model> ruleScore(*m_ngram, newState->GetChartState());
const AlignmentInfo::NonTermIndexMap &nonTermIndexMap = hypo.GetCurrTargetPhrase().GetAlignmentInfo().GetNonTermIndexMap();
const size_t size = hypo.GetCurrTargetPhrase().GetSize();
size_t phrasePos = 0;
// Special cases for first word.
// Special cases for first word.
if (size) {
const Word &word = hypo.GetCurrTargetPhrase().GetWord(0);
if (word.GetFactor(m_factorType) == m_beginSentenceFactor) {
@ -298,7 +313,7 @@ template <class Model> FFState *LanguageModelKen<Model>::EvaluateChart(const Cha
ruleScore.BeginSentence();
phrasePos++;
} else if (word.IsNonTerminal()) {
// Non-terminal is first so we can copy instead of rescoring.
const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndexMap[phrasePos]);
const lm::ngram::ChartState &prevState = static_cast<const LanguageModelChartStateKenLM*>(prevHypo->GetFFState(featureID))->GetChartState();
ruleScore.BeginNonTerminal(prevState, prevHypo->GetScoreBreakdown().GetScoresForProducer(this)[0]);
@ -323,24 +338,25 @@ template <class Model> FFState *LanguageModelKen<Model>::EvaluateChart(const Cha
} // namespace
LanguageModel *ConstructKenLM(const std::string &file, ScoreIndexManager &manager, FactorType factorType, bool lazy)
{
try {
lm::ngram::ModelType model_type;
if (lm::ngram::RecognizeBinary(file.c_str(), model_type)) {
switch(model_type) {
case lm::ngram::HASH_PROBING:
return new LanguageModelKen<lm::ngram::ProbingModel>(file, manager, factorType, lazy);
case lm::ngram::TRIE_SORTED:
return new LanguageModelKen<lm::ngram::TrieModel>(file, manager, factorType, lazy);
case lm::ngram::QUANT_TRIE_SORTED:
return new LanguageModelKen<lm::ngram::QuantTrieModel>(file, manager, factorType, lazy);
case lm::ngram::ARRAY_TRIE_SORTED:
return new LanguageModelKen<lm::ngram::ArrayTrieModel>(file, manager, factorType, lazy);
case lm::ngram::QUANT_ARRAY_TRIE_SORTED:
return new LanguageModelKen<lm::ngram::QuantArrayTrieModel>(file, manager, factorType, lazy);
default:
std::cerr << "Unrecognized kenlm model type " << model_type << std::endl;
abort();
}
} else {
return new LanguageModelKen<lm::ngram::ProbingModel>(file, manager, factorType, lazy);

View File

@ -26,12 +26,13 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "TypeDef.h"
namespace Moses
{
class ScoreIndexManager;
class LanguageModel;
// This will also load.
LanguageModel *ConstructKenLM(const std::string &file, ScoreIndexManager &manager, FactorType factorType, bool lazy);
} // namespace Moses

View File

@ -9,10 +9,11 @@
#include "LM/ORLM.h"
using std::map;
namespace Moses
{
bool LanguageModelORLM::Load(const std::string &filePath, FactorType factorType,
size_t nGramOrder)
{
cerr << "Loading LanguageModelORLM..." << endl;
m_filePath = filePath;
m_factorType = factorType;
@ -26,13 +27,14 @@ bool LanguageModelORLM::Load(const std::string &filePath, FactorType factorType,
CreateFactors();
return true;
}
void LanguageModelORLM::CreateFactors()
{
FactorCollection &factorCollection = FactorCollection::Instance();
size_t maxFactorId = 0; // to create lookup vector later on
std::map<size_t, wordID_t> m_lmids_map; // map from factor id -> word id
for(std::map<Word, wordID_t>::const_iterator vIter = m_lm->vocab_->VocabStart();
vIter != m_lm->vocab_->VocabEnd(); vIter++) {
// get word from ORLM vocab and associate with (new) factor id
size_t factorId = factorCollection.AddFactor(Output,m_factorType,vIter->first.ToString())->GetId();
m_lmids_map[factorId] = vIter->second;
@ -50,7 +52,7 @@ void LanguageModelORLM::CreateFactors() {
maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
m_sentenceEndArray[m_factorType] = m_sentenceEnd;
// add to lookup vector in object
lm_ids_vec_.resize(maxFactorId+1);
// fill with OOV code
fill(lm_ids_vec_.begin(), lm_ids_vec_.end(), m_oov_id);
@ -58,15 +60,18 @@ void LanguageModelORLM::CreateFactors() {
iter != m_lmids_map.end() ; ++iter)
lm_ids_vec_[iter->first] = iter->second;
}
wordID_t LanguageModelORLM::GetLmID(const std::string& str) const
{
return m_lm->vocab_->GetWordID(str);
}
wordID_t LanguageModelORLM::GetLmID(const Factor* factor) const
{
size_t factorId = factor->GetId();
return (factorId >= lm_ids_vec_.size()) ? m_oov_id : lm_ids_vec_[factorId];
}
LMResult LanguageModelORLM::GetValue(const std::vector<const Word*> &contextFactor,
State* finalState) const
{
FactorType factorType = GetFactorType();
// set up context
//std::vector<long unsigned int> factor(1,0);
@ -88,13 +93,14 @@ LMResult LanguageModelORLM::GetValue(const std::vector<const Word*> &contextFact
*/
return ret;
}
bool LanguageModelORLM::UpdateORLM(const std::vector<string>& ngram, const int value)
{
/*cerr << "Inserting into ORLM: \"";
iterate(ngram, nit)
cerr << *nit << " ";
cerr << "\"\t" << value << endl; */
m_lm->vocab_->MakeOpen();
bool res = m_lm->update(ngram, value);
m_lm->vocab_->MakeClosed();
return res;
}

View File

@ -15,7 +15,8 @@ namespace Moses
class Factor;
class Phrase;
class LanguageModelORLM : public LanguageModelPointerState
{
public:
typedef count_t T; // type for ORLM filter
LanguageModelORLM()
@ -30,13 +31,15 @@ public:
fout.close();
delete m_lm;
}
void CleanUpAfterSentenceProcessing() {
m_lm->clearCache(); // clear caches
}
void InitializeBeforeSentenceProcessing() { // nothing to do
//m_lm->initThreadSpecificData(); // Creates thread specific data iff
// compiled with multithreading.
}
bool UpdateORLM(const std::vector<string>& ngram, const int value);
protected:
OnlineRLM<T>* m_lm;
//MultiOnlineRLM<T>* m_lm;
wordID_t m_oov_id;

View File

@ -347,7 +347,8 @@ const FFState *LanguageModelParallelBackoff::GetBeginSentenceState() const
}
LanguageModelMultiFactor *NewParallelBackoff()
{
return new LanguageModelParallelBackoff();
}

View File

@ -38,7 +38,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
namespace Moses
{
namespace
{
using namespace std;
@ -57,7 +57,7 @@ public:
}
void InitializeBeforeSentenceProcessing() {
m_lm->initThreadSpecificData(); // Creates thread specific data iff
// compiled with multithreading.
}
protected:
std::vector<randlm::WordID> m_randlm_ids_vec;
@ -133,7 +133,7 @@ randlm::WordID LanguageModelRandLM::GetLmID( const std::string &str ) const
}
LMResult LanguageModelRandLM::GetValue(const vector<const Word*> &contextFactor,
State* finalState) const
{
FactorType factorType = GetFactorType();
// set up context
@ -156,7 +156,8 @@ LMResult LanguageModelRandLM::GetValue(const vector<const Word*> &contextFactor,
}
LanguageModelPointerState *NewRandLM()
{
return new LanguageModelRandLM();
}

View File

@ -46,7 +46,7 @@ void LMList::CalcScore(const Phrase &phrase, float &retFullScore, float &retNGra
const float weightLM = lm.GetWeight();
const float oovWeightLM = lm.GetOOVWeight();
float fullScore, nGramScore;
size_t oovCount;
// do not process, if factors not defined yet (happens in partial translation options)
@ -64,7 +64,7 @@ void LMList::CalcScore(const Phrase &phrase, float &retFullScore, float &retNGra
} else {
breakdown->Assign(&lm, nGramScore); // I'm not sure why += doesn't work here- it should be 0.0 right?
}
retFullScore += fullScore * weightLM;
retNGramScore += nGramScore * weightLM;

View File

@ -39,13 +39,13 @@ public:
virtual FFState* Evaluate(const Hypothesis& cur_hypo,
const FFState* prev_state,
ScoreComponentCollection* accumulator) const;
virtual FFState* EvaluateChart(const ChartHypothesis&,
int /* featureID */,
ScoreComponentCollection*) const {
CHECK(0); // not valid for chart decoder
return NULL;
}
virtual const FFState* EmptyHypothesisState(const InputType &input) const;

View File

@ -267,8 +267,9 @@ struct SGNReverseCompare {
/**
* Implements lattice sampling, as in Chatterjee & Cancedda, emnlp 2010
**/
void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const
{
vector<SearchGraphNode> searchGraph;
GetSearchGraph(searchGraph);
@ -282,15 +283,15 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
map<int,const Hypothesis*> idToHyp;
map<int,float> fscores;
//Iterating through the hypos in reverse order of id gives a reverse
//topological order. We rely on the fact that hypo ids are given out
//sequentially, as the search proceeds.
//NB: Could just sort by stack.
sort(searchGraph.begin(), searchGraph.end(), SGNReverseCompare());
//first task is to fill in the outgoing hypos and edge scores.
for (vector<SearchGraphNode>::const_iterator i = searchGraph.begin();
i != searchGraph.end(); ++i) {
const Hypothesis* hypo = i->hypo;
idToHyp[hypo->GetId()] = hypo;
fscores[hypo->GetId()] = i->fscore;
@ -298,7 +299,7 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
//back to current
const Hypothesis* prevHypo = i->hypo->GetPrevHypo();
outgoingHyps[prevHypo].insert(hypo);
edgeScores[Edge(prevHypo->GetId(),hypo->GetId())] =
hypo->GetScore() - prevHypo->GetScore();
}
//forward from current
@ -309,7 +310,7 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
outgoingHyps[hypo].insert(nextHypo);
map<int,float>::const_iterator fscoreIter = fscores.find(nextHypo->GetId());
CHECK(fscoreIter != fscores.end());
edgeScores[Edge(hypo->GetId(),nextHypo->GetId())] =
i->fscore - fscoreIter->second;
}
}
@ -317,26 +318,26 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
//then run through again to calculate sigmas
for (vector<SearchGraphNode>::const_iterator i = searchGraph.begin();
i != searchGraph.end(); ++i) {
if (i->forward == -1) {
sigmas[i->hypo] = 0;
} else {
map<const Hypothesis*, set<const Hypothesis*> >::const_iterator outIter =
outgoingHyps.find(i->hypo);
CHECK(outIter != outgoingHyps.end());
float sigma = 0;
for (set<const Hypothesis*>::const_iterator j = outIter->second.begin();
j != outIter->second.end(); ++j) {
map<const Hypothesis*, float>::const_iterator succIter = sigmas.find(*j);
CHECK(succIter != sigmas.end());
map<Edge,float>::const_iterator edgeScoreIter =
edgeScores.find(Edge(i->hypo->GetId(),(*j)->GetId()));
CHECK(edgeScoreIter != edgeScores.end());
float term = edgeScoreIter->second + succIter->second; // Add sigma(*j)
if (sigma == 0) {
sigma = term;
} else {
sigma = log_sum(sigma,term);
}
@ -352,7 +353,7 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
vector<const Hypothesis*> path;
path.push_back(startHypo);
while(1) {
map<const Hypothesis*, set<const Hypothesis*> >::const_iterator outIter =
outgoingHyps.find(path.back());
if (outIter == outgoingHyps.end() || !outIter->second.size()) {
//end of the path
@ -363,7 +364,7 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
vector<float> candidateScores;
float scoreTotal = 0;
for (set<const Hypothesis*>::const_iterator j = outIter->second.begin();
j != outIter->second.end(); ++j) {
candidates.push_back(*j);
CHECK(sigmas.find(*j) != sigmas.end());
Edge edge(path.back()->GetId(),(*j)->GetId());
@ -390,18 +391,18 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
}
//cerr << "Random: " << random << " Chose " << position-1 << endl;
const Hypothesis* chosen = candidates[position-1];
path.push_back(chosen);
}
//cerr << "Path: " << endl;
//for (size_t j = 0; j < path.size(); ++j) {
// cerr << path[j]->GetId() << " " << path[j]->GetScoreBreakdown() << endl;
//}
//cerr << endl;
//Convert the hypos to TrellisPath
ret.Add(new TrellisPath(path));
//cerr << ret.at(ret.GetSize()-1).GetScoreBreakdown() << endl;
}
}
}
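The sampling pass above is easier to follow with the math spelled out: the backward sweep computes sigma(h) as the log-sum over outgoing edges of exp(edgeScore + sigma(successor)), i.e. the unnormalized log mass of all completions of h; the forward sweep then picks each edge with probability exp(edgeScore + sigma(successor) - sigma(h)). A compact sketch of the log-space accumulation (LogSum here plays the role of Moses' log_sum helper, assumed to be the usual stable log-add):

#include <cmath>
#include <cstdio>

// Numerically stable log(exp(a) + exp(b)).
double LogSum(double a, double b) {
  if (a < b) { double t = a; a = b; b = t; }
  return a + std::log(1.0 + std::exp(b - a));
}

int main() {
  // One node with two outgoing edges; sigma of each successor is already
  // known from the backward sweep.
  double edge[2] = {-1.0, -2.3};      // transition scores
  double sigmaSucc[2] = {-0.5, -0.1}; // mass of completions after each edge
  double sigma = edge[0] + sigmaSucc[0];
  sigma = LogSum(sigma, edge[1] + sigmaSucc[1]);
  for (int i = 0; i < 2; ++i)
    std::printf("P(edge %d) = %.3f\n", i,
                std::exp(edge[i] + sigmaSucc[i] - sigma));
  return 0;
}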
@ -676,17 +677,17 @@ void OutputSearchNode(long translationId, std::ostream &outputSearchGraphStream,
else
outputSearchGraphStream << " hyp=" << searchNode.hypo->GetId();
outputSearchGraphStream << " stack=" << searchNode.hypo->GetWordsBitmap().GetNumWordsCovered()
<< " back=" << prevHypo->GetId()
<< " score=" << searchNode.hypo->GetScore()
<< " transition=" << (searchNode.hypo->GetScore() - prevHypo->GetScore());
outputSearchGraphStream << " stack=" << searchNode.hypo->GetWordsBitmap().GetNumWordsCovered()
<< " back=" << prevHypo->GetId()
<< " score=" << searchNode.hypo->GetScore()
<< " transition=" << (searchNode.hypo->GetScore() - prevHypo->GetScore());
if (searchNode.recombinationHypo != NULL)
outputSearchGraphStream << " recombined=" << searchNode.recombinationHypo->GetId();
outputSearchGraphStream << " forward=" << searchNode.forward << " fscore=" << searchNode.fscore
<< " covered=" << searchNode.hypo->GetCurrSourceWordsRange().GetStartPos()
<< "-" << searchNode.hypo->GetCurrSourceWordsRange().GetEndPos();
outputSearchGraphStream << " forward=" << searchNode.forward << " fscore=" << searchNode.fscore
<< " covered=" << searchNode.hypo->GetCurrSourceWordsRange().GetStartPos()
<< "-" << searchNode.hypo->GetCurrSourceWordsRange().GetEndPos();
// Modified so that -osgx is a superset of -osg (GST Oct 2011)
ScoreComponentCollection scoreBreakdown = searchNode.hypo->GetScoreBreakdown();
@ -694,10 +695,10 @@ void OutputSearchNode(long translationId, std::ostream &outputSearchGraphStream,
outputSearchGraphStream << " scores=[ ";
StaticData::Instance().GetScoreIndexManager().PrintLabeledScores( outputSearchGraphStream, scoreBreakdown );
outputSearchGraphStream << " ]";
outputSearchGraphStream << " out=" << searchNode.hypo->GetSourcePhraseStringRep() << "|" <<
searchNode.hypo->GetCurrTargetPhrase().GetStringRep(outputFactorOrder) << endl;
// outputSearchGraphStream << " out=" << searchNode.hypo->GetCurrTargetPhrase().GetStringRep(outputFactorOrder) << endl;
}

View File

@ -36,7 +36,7 @@ namespace PCN
typedef std::pair<std::pair<std::string, std::vector<float> >, size_t> CNAlt;
typedef std::vector<CNAlt> CNCol;
typedef std::vector<CNCol> CN;
/** Given a string ((('foo',0.1,1),('bar',0.9,2)),...) representation of a
* word lattice in PCN format, return a CN object representing the lattice
*/

View File

@ -71,10 +71,10 @@ Parameter::Parameter()
AddParam("report-all-factors-in-n-best", "Report all factors in n-best-lists. Default is false");
AddParam("report-segmentation", "t", "report phrase segmentation in the output");
#ifdef HAVE_SYNLM
AddParam("slmodel-file", "location of the syntactic language model file(s)");
AddParam("weight-slm", "slm", "weight(s) for syntactic language model");
AddParam("slmodel-factor", "factor to use with syntactic language model");
AddParam("slmodel-beam", "beam width to use with syntactic language model's parser");
AddParam("slmodel-file", "location of the syntactic language model file(s)");
AddParam("weight-slm", "slm", "weight(s) for syntactic language model");
AddParam("slmodel-factor", "factor to use with syntactic language model");
AddParam("slmodel-beam", "beam width to use with syntactic language model's parser");
#endif
AddParam("stack", "s", "maximum stack size for histogram pruning");
AddParam("stack-diversity", "sd", "minimum number of hypothesis of each coverage in stack (default 0)");
@ -277,14 +277,13 @@ bool Parameter::Validate()
PARAM_MAP::const_iterator iterParams;
for (iterParams = m_setting.begin(); iterParams != m_setting.end(); ++iterParams) {
const std::string &key = iterParams->first;
if (m_valid.find(key) == m_valid.end()) {
UserMessage::Add("Unknown parameter " + key);
noErrorFlag = false;
}
}
// required parameters
if (m_setting["ttable-file"].size() == 0) {
@ -307,7 +306,7 @@ bool Parameter::Validate()
}
if (m_setting["lmodel-file"].size() * (m_setting.find("lmodel-oov-feature") != m_setting.end() ? 2 : 1)
!= m_setting["weight-l"].size()) {
!= m_setting["weight-l"].size()) {
stringstream errorMsg("");
errorMsg << "Config and parameters specify "
<< static_cast<int>(m_setting["lmodel-file"].size())
@ -457,8 +456,7 @@ bool Parameter::ReadConfigFile(const string &filePath )
if (line.size() == 0) {
// blank line. do nothing.
} else if (line[0]=='[') {
// new parameter
for (size_t currPos = 0 ; currPos < line.size() ; currPos++) {
if (line[currPos] == ']') {

View File

@ -143,9 +143,9 @@ void Phrase::CreateFromString(const std::vector<FactorType> &factorOrder, const
for (util::TokenIter<util::AnyCharacter, true> word_it(phraseString, util::AnyCharacter(" \t")); word_it; ++word_it) {
Word &word = AddWord();
size_t index = 0;
for (util::TokenIter<util::MultiCharacter, false> factor_it(*word_it, util::MultiCharacter(factorDelimiter));
factor_it && (index < factorOrder.size());
++factor_it, ++index) {
word[factorOrder[index]] = factorCollection.AddFactor(*factor_it);
}
if (index != factorOrder.size()) {
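The tokenizer pair above first splits the phrase string on whitespace, then splits each token on the factor delimiter (typically "|") and assigns piece i to factorOrder[i]. A plain-STL sketch of the inner split, without the util iterators (illustrative only):

#include <cstdio>
#include <string>
#include <vector>

// Split "surface|pos|lemma" into at most maxFactors pieces on 'delim'.
std::vector<std::string> SplitFactors(const std::string &token,
                                      char delim, size_t maxFactors) {
  std::vector<std::string> out;
  std::string::size_type start = 0;
  while (out.size() < maxFactors) {
    std::string::size_type end = token.find(delim, start);
    if (end == std::string::npos) {
      out.push_back(token.substr(start));
      break;
    }
    out.push_back(token.substr(start, end - start));
    start = end + 1;
  }
  return out;
}

int main() {
  std::vector<std::string> f = SplitFactors("houses|NNS|house", '|', 3);
  for (size_t i = 0; i < f.size(); ++i)
    std::printf("factor %u = %s\n", (unsigned)i, f[i].c_str());
  return 0;
}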

View File

@ -61,7 +61,7 @@ public:
/** Fills phrase with words from format string, typically from phrase table or sentence input
* \param factorOrder factor types of each element in 2D string vector
* \param phraseString formatted input string to parse
* \param factorDelimiter delimiter between factors.
*/
void CreateFromString(const std::vector<FactorType> &factorOrder, const StringPiece &phraseString, const StringPiece &factorDelimiter);

View File

@ -136,7 +136,7 @@ PhraseDictionary* PhraseDictionaryFeature::LoadPhraseTable(const TranslationSyst
m_filePath += ".gz";
VERBOSE(2,"Using gzipped file" << std::endl);
}
PhraseDictionaryHiero* pdm = new PhraseDictionaryHiero(m_numScoreComponent,this);
bool ret = pdm->Load(GetInput()
, GetOutput()
@ -154,7 +154,7 @@ PhraseDictionary* PhraseDictionaryFeature::LoadPhraseTable(const TranslationSyst
m_filePath += ".gz";
VERBOSE(2,"Using gzipped file" << std::endl);
}
PhraseDictionaryALSuffixArray* pdm = new PhraseDictionaryALSuffixArray(m_numScoreComponent,this);
bool ret = pdm->Load(GetInput()
, GetOutput()
@ -255,18 +255,18 @@ PhraseDictionaryFeature::~PhraseDictionaryFeature()
std::string PhraseDictionaryFeature::GetScoreProducerDescription(unsigned idx) const
{
if (idx < GetNumInputScores()) {
return "InputScore";
} else {
return "PhraseModel";
}
}
std::string PhraseDictionaryFeature::GetScoreProducerWeightShortName(unsigned idx) const
{
if (idx < GetNumInputScores()) {
return "I";
} else {
return "tm";
}
}

View File

@ -16,16 +16,16 @@
using namespace std;
namespace Moses
{
bool PhraseDictionaryALSuffixArray::Load(const std::vector<FactorType> &input
, const std::vector<FactorType> &output
, const std::string &filePath
, const std::vector<float> &weight
, size_t tableLimit
, const LMList &languageModels
, const WordPenaltyProducer* wpProducer)
{
// file path is the directory of the rules for eacg, NOT the file of all the rules
m_filePath = filePath;
@ -36,7 +36,7 @@ bool PhraseDictionaryALSuffixArray::Load(const std::vector<FactorType> &input
m_languageModels = &languageModels;
m_wpProducer = wpProducer;
m_weight = &weight;
return true;
}
@ -44,20 +44,20 @@ void PhraseDictionaryALSuffixArray::InitializeForInput(InputType const& source)
{
// clear out rules for previous sentence
m_collection.Clear();
// populate with rules for this sentence
long translationId = source.GetTranslationId();
string grammarFile = m_filePath + "/grammar.out." + SPrint(translationId);
// data from file
InputFileStream inFile(grammarFile);
std::auto_ptr<RuleTableLoader> loader =
RuleTableLoaderFactory::Create(grammarFile);
bool ret = loader->Load(*m_input, *m_output, inFile, *m_weight, m_tableLimit,
*m_languageModels, m_wpProducer, *this);
CHECK(ret);
}

View File

@ -11,13 +11,14 @@
#include "PhraseDictionarySCFG.h"
namespace Moses
{
class PhraseDictionaryALSuffixArray : public PhraseDictionarySCFG
{
public:
PhraseDictionaryALSuffixArray(size_t numScoreComponent, PhraseDictionaryFeature* feature)
: PhraseDictionarySCFG(numScoreComponent,feature) {}
bool Load(const std::vector<FactorType> &input
, const std::vector<FactorType> &output
@ -34,9 +35,9 @@ protected:
const LMList *m_languageModels;
const WordPenaltyProducer *m_wpProducer;
const std::vector<float> *m_weight;
};
}

View File

@ -72,7 +72,7 @@ const TargetPhraseCollection *PhraseDictionaryDynSuffixArray::GetTargetPhraseCol
void PhraseDictionaryDynSuffixArray::insertSnt(string& source, string& target, string& alignment)
{
m_biSA->addSntPair(source, target, alignment); // insert sentence pair into suffix arrays
//StaticData::Instance().ClearTransOptionCache(); // clear translation option cache
}
void PhraseDictionaryDynSuffixArray::deleteSnt(unsigned /* idx */, unsigned /* num2Del */)
{

View File

@ -15,30 +15,31 @@
using namespace std;
namespace Moses
{
bool PhraseDictionaryHiero::Load(const std::vector<FactorType> &input
, const std::vector<FactorType> &output
, const std::string &filePath
, const std::vector<float> &weight
, size_t tableLimit
, const LMList &languageModels
, const WordPenaltyProducer* wpProducer)
{
m_filePath = filePath;
m_tableLimit = tableLimit;
// data from file
InputFileStream inFile(filePath);
std::auto_ptr<RuleTableLoader> loader =
RuleTableLoaderFactory::Create(filePath);
bool ret = loader->Load(input, output, inFile, weight, tableLimit,
languageModels, wpProducer, *this);
return ret;
}
} // namespace

View File

@ -11,13 +11,14 @@
#include "PhraseDictionarySCFG.h"
namespace Moses
{
class PhraseDictionaryHiero : public PhraseDictionarySCFG
{
public:
PhraseDictionaryHiero(size_t numScoreComponent, PhraseDictionaryFeature* feature)
: PhraseDictionarySCFG(numScoreComponent,feature) {}
bool Load(const std::vector<FactorType> &input
, const std::vector<FactorType> &output

Some files were not shown because too many files have changed in this diff.