mirror of https://github.com/moses-smt/mosesdecoder.git

commit 9861ecbbe5 (parent 9ec1bef6fb)

    uint -> size_t
@@ -38,22 +38,21 @@

 typedef struct _cmd CMD;

-struct _cmd
-{
-    CMD * next;
-    CMD * tail; /* valid on in head */
-    RULE * rule; /* rule->actions contains shell script */
-    LIST * shell; /* $(SHELL) value */
-    LOL args; /* LISTs for $(<), $(>) */
-    char * buf; /* actual commands */
+struct _cmd {
+    CMD * next;
+    CMD * tail; /* valid on in head */
+    RULE * rule; /* rule->actions contains shell script */
+    LIST * shell; /* $(SHELL) value */
+    LOL args; /* LISTs for $(<), $(>) */
+    char * buf; /* actual commands */
 };

 CMD * cmd_new
 (
-    RULE * rule, /* rule (referenced) */
-    LIST * targets, /* $(<) (freed) */
-    LIST * sources, /* $(>) (freed) */
-    LIST * shell /* $(SHELL) (freed) */
+    RULE * rule, /* rule (referenced) */
+    LIST * targets, /* $(<) (freed) */
+    LIST * sources, /* $(>) (freed) */
+    LIST * shell /* $(SHELL) (freed) */
 );

 void cmd_free( CMD * );
@@ -10,35 +10,33 @@
 #include <time.h>


-struct profile_info
-{
-    /* name of rule being called */
-    char* name;
-    /* cumulative time spent in rule */
-    clock_t cumulative;
-    /* time spent in rule proper */
-    clock_t net;
-    /* number of time rule was entered */
-    unsigned long num_entries;
-    /* number of the times this function is present in stack */
-    unsigned long stack_count;
-    /* bytes of memory allocated by the call */
-    unsigned long memory;
+struct profile_info {
+    /* name of rule being called */
+    char* name;
+    /* cumulative time spent in rule */
+    clock_t cumulative;
+    /* time spent in rule proper */
+    clock_t net;
+    /* number of time rule was entered */
+    unsigned long num_entries;
+    /* number of the times this function is present in stack */
+    unsigned long stack_count;
+    /* bytes of memory allocated by the call */
+    unsigned long memory;
 };
 typedef struct profile_info profile_info;

-struct profile_frame
-{
-    /* permanent storage where data accumulates */
-    profile_info* info;
-    /* overhead for profiling in this call */
-    clock_t overhead;
-    /* time of last entry to rule */
-    clock_t entry_time;
-    /* stack frame of caller */
-    struct profile_frame* caller;
-    /* time spent in subrules */
-    clock_t subrules;
+struct profile_frame {
+    /* permanent storage where data accumulates */
+    profile_info* info;
+    /* overhead for profiling in this call */
+    clock_t overhead;
+    /* time of last entry to rule */
+    clock_t entry_time;
+    /* stack frame of caller */
+    struct profile_frame* caller;
+    /* time spent in subrules */
+    clock_t subrules;
 };
 typedef struct profile_frame profile_frame;

@@ -18,22 +18,21 @@

 #include <time.h>

-typedef struct timing_info
-{
-    double system;
-    double user;
-    time_t start;
-    time_t end;
+typedef struct timing_info {
+    double system;
+    double user;
+    time_t start;
+    time_t end;
 } timing_info;

 void exec_cmd
 (
-    char * string,
-    void (* func)( void * closure, int status, timing_info *, char *, char * ),
-    void * closure,
-    LIST * shell,
-    char * action,
-    char * target
+    char * string,
+    void (* func)( void * closure, int status, timing_info *, char *, char * ),
+    void * closure,
+    LIST * shell,
+    char * action,
+    char * target
 );

 int exec_wait();
@@ -33,14 +33,13 @@ int file_is_file(char* filename);
 int file_mkdir(char *pathname);

 typedef struct file_info_t file_info_t ;
-struct file_info_t
-{
-    char * name;
-    short is_file;
-    short is_dir;
-    unsigned long size;
-    time_t time;
-    LIST * files;
+struct file_info_t {
+    char * name;
+    short is_file;
+    short is_dir;
+    unsigned long size;
+    time_t time;
+    LIST * files;
 };


@@ -12,15 +12,14 @@
 typedef struct _PARSE PARSE;
 typedef struct frame FRAME;

-struct frame
-{
-    FRAME * prev;
-    /* The nearest enclosing frame for which module->user_module is true. */
-    FRAME * prev_user;
-    LOL args[ 1 ];
-    module_t * module;
-    PARSE * procedure;
-    char * rulename;
+struct frame {
+    FRAME * prev;
+    /* The nearest enclosing frame for which module->user_module is true. */
+    FRAME * prev_user;
+    LOL args[ 1 ];
+    module_t * module;
+    PARSE * procedure;
+    char * rulename;
 };


@@ -91,7 +91,7 @@
 #include <ctype.h>
 #include <malloc.h>
 #ifndef __MWERKS__
-#include <memory.h>
+#include <memory.h>
 #endif
 #include <signal.h>
 #include <string.h>
@@ -113,17 +113,17 @@
 /* AS400 cross-compile from NT. */

 #ifdef AS400
-#undef OSMINOR
-#undef OSMAJOR
-#define OSMAJOR "AS400=true"
-#define OSMINOR "OS=AS400"
-#define OS_AS400
+#undef OSMINOR
+#undef OSMAJOR
+#define OSMAJOR "AS400=true"
+#define OSMINOR "OS=AS400"
+#define OS_AS400
 #endif

 /* Metrowerks Standard Library on Windows. */

 #ifdef __MSL__
-#undef HAVE_POPEN
+#undef HAVE_POPEN
 #endif

 # endif
@@ -182,7 +182,7 @@
 #define DOWNSHIFT_PATHS

 #ifdef __EMX__
-#define USE_FILEUNIX
+#define USE_FILEUNIX
 #endif

 #endif
@@ -218,181 +218,181 @@
 #define PATH_DELIM '/'

 #ifdef _AIX
-#define unix
-#define MAXLINE 23552 /* 24k - 1k, longest 'together' actions */
-#define OSMINOR "OS=AIX"
-#define OS_AIX
-#define NO_VFORK
+#define unix
+#define MAXLINE 23552 /* 24k - 1k, longest 'together' actions */
+#define OSMINOR "OS=AIX"
+#define OS_AIX
+#define NO_VFORK
 #endif
 #ifdef AMIGA
-#define OSMINOR "OS=AMIGA"
-#define OS_AMIGA
+#define OSMINOR "OS=AMIGA"
+#define OS_AMIGA
 #endif
 #ifdef __BEOS__
-#define unix
-#define OSMINOR "OS=BEOS"
-#define OS_BEOS
-#define NO_VFORK
+#define unix
+#define OSMINOR "OS=BEOS"
+#define OS_BEOS
+#define NO_VFORK
 #endif
 #ifdef __bsdi__
-#define OSMINOR "OS=BSDI"
-#define OS_BSDI
+#define OSMINOR "OS=BSDI"
+#define OS_BSDI
 #endif
 #if defined (COHERENT) && defined (_I386)
-#define OSMINOR "OS=COHERENT"
-#define OS_COHERENT
-#define NO_VFORK
+#define OSMINOR "OS=COHERENT"
+#define OS_COHERENT
+#define NO_VFORK
 #endif
 #if defined(__cygwin__) || defined(__CYGWIN__)
-#define OSMINOR "OS=CYGWIN"
-#define OS_CYGWIN
+#define OSMINOR "OS=CYGWIN"
+#define OS_CYGWIN
 #endif
 #if defined(__FreeBSD__) && !defined(__DragonFly__)
-#define OSMINOR "OS=FREEBSD"
-#define OS_FREEBSD
+#define OSMINOR "OS=FREEBSD"
+#define OS_FREEBSD
 #endif
 #ifdef __DragonFly__
-#define OSMINOR "OS=DRAGONFLYBSD"
-#define OS_DRAGONFLYBSD
+#define OSMINOR "OS=DRAGONFLYBSD"
+#define OS_DRAGONFLYBSD
 #endif
 #ifdef __DGUX__
-#define OSMINOR "OS=DGUX"
-#define OS_DGUX
+#define OSMINOR "OS=DGUX"
+#define OS_DGUX
 #endif
 #ifdef __hpux
-#define OSMINOR "OS=HPUX"
-#define OS_HPUX
+#define OSMINOR "OS=HPUX"
+#define OS_HPUX
 #endif
 #ifdef __OPENNT
-#define unix
-#define OSMINOR "OS=INTERIX"
-#define OS_INTERIX
-#define NO_VFORK
+#define unix
+#define OSMINOR "OS=INTERIX"
+#define OS_INTERIX
+#define NO_VFORK
 #endif
 #ifdef __sgi
-#define OSMINOR "OS=IRIX"
-#define OS_IRIX
-#define NO_VFORK
+#define OSMINOR "OS=IRIX"
+#define OS_IRIX
+#define NO_VFORK
 #endif
 #ifdef __ISC
-#define OSMINOR "OS=ISC"
-#define OS_ISC
-#define NO_VFORK
+#define OSMINOR "OS=ISC"
+#define OS_ISC
+#define NO_VFORK
 #endif
 #ifdef linux
-#define OSMINOR "OS=LINUX"
-#define OS_LINUX
+#define OSMINOR "OS=LINUX"
+#define OS_LINUX
 #endif
 #ifdef __Lynx__
-#define OSMINOR "OS=LYNX"
-#define OS_LYNX
-#define NO_VFORK
-#define unix
+#define OSMINOR "OS=LYNX"
+#define OS_LYNX
+#define NO_VFORK
+#define unix
 #endif
 #ifdef __MACHTEN__
-#define OSMINOR "OS=MACHTEN"
-#define OS_MACHTEN
+#define OSMINOR "OS=MACHTEN"
+#define OS_MACHTEN
 #endif
 #ifdef mpeix
-#define unix
-#define OSMINOR "OS=MPEIX"
-#define OS_MPEIX
-#define NO_VFORK
+#define unix
+#define OSMINOR "OS=MPEIX"
+#define OS_MPEIX
+#define NO_VFORK
 #endif
 #ifdef __MVS__
-#define unix
-#define OSMINOR "OS=MVS"
-#define OS_MVS
+#define unix
+#define OSMINOR "OS=MVS"
+#define OS_MVS
 #endif
 #ifdef _ATT4
-#define OSMINOR "OS=NCR"
-#define OS_NCR
+#define OSMINOR "OS=NCR"
+#define OS_NCR
 #endif
 #ifdef __NetBSD__
-#define unix
-#define OSMINOR "OS=NETBSD"
-#define OS_NETBSD
-#define NO_VFORK
+#define unix
+#define OSMINOR "OS=NETBSD"
+#define OS_NETBSD
+#define NO_VFORK
 #endif
 #ifdef __QNX__
-#define unix
-#ifdef __QNXNTO__
-#define OSMINOR "OS=QNXNTO"
-#define OS_QNXNTO
-#else
-#define OSMINOR "OS=QNX"
-#define OS_QNX
-#define NO_VFORK
-#define MAXLINE 996
-#endif
+#define unix
+#ifdef __QNXNTO__
+#define OSMINOR "OS=QNXNTO"
+#define OS_QNXNTO
+#else
+#define OSMINOR "OS=QNX"
+#define OS_QNX
+#define NO_VFORK
+#define MAXLINE 996
+#endif
 #endif
 #ifdef NeXT
-#ifdef __APPLE__
-#define OSMINOR "OS=RHAPSODY"
-#define OS_RHAPSODY
-#else
-#define OSMINOR "OS=NEXT"
-#define OS_NEXT
-#endif
+#ifdef __APPLE__
+#define OSMINOR "OS=RHAPSODY"
+#define OS_RHAPSODY
+#else
+#define OSMINOR "OS=NEXT"
+#define OS_NEXT
+#endif
 #endif
 #ifdef __APPLE__
-#define unix
-#define OSMINOR "OS=MACOSX"
-#define OS_MACOSX
+#define unix
+#define OSMINOR "OS=MACOSX"
+#define OS_MACOSX
 #endif
 #ifdef __osf__
-#ifndef unix
-#define unix
-#endif
-#define OSMINOR "OS=OSF"
-#define OS_OSF
+#ifndef unix
+#define unix
+#endif
+#define OSMINOR "OS=OSF"
+#define OS_OSF
 #endif
 #ifdef _SEQUENT_
-#define OSMINOR "OS=PTX"
-#define OS_PTX
+#define OSMINOR "OS=PTX"
+#define OS_PTX
 #endif
 #ifdef M_XENIX
-#define OSMINOR "OS=SCO"
-#define OS_SCO
-#define NO_VFORK
+#define OSMINOR "OS=SCO"
+#define OS_SCO
+#define NO_VFORK
 #endif
 #ifdef sinix
-#define unix
-#define OSMINOR "OS=SINIX"
-#define OS_SINIX
+#define unix
+#define OSMINOR "OS=SINIX"
+#define OS_SINIX
 #endif
 #ifdef sun
-#if defined(__svr4__) || defined(__SVR4)
-#define OSMINOR "OS=SOLARIS"
-#define OS_SOLARIS
-#else
-#define OSMINOR "OS=SUNOS"
-#define OS_SUNOS
-#endif
+#if defined(__svr4__) || defined(__SVR4)
+#define OSMINOR "OS=SOLARIS"
+#define OS_SOLARIS
+#else
+#define OSMINOR "OS=SUNOS"
+#define OS_SUNOS
+#endif
 #endif
 #ifdef ultrix
-#define OSMINOR "OS=ULTRIX"
-#define OS_ULTRIX
+#define OSMINOR "OS=ULTRIX"
+#define OS_ULTRIX
 #endif
 #ifdef _UNICOS
-#define OSMINOR "OS=UNICOS"
-#define OS_UNICOS
+#define OSMINOR "OS=UNICOS"
+#define OS_UNICOS
 #endif
 #if defined(__USLC__) && !defined(M_XENIX)
-#define OSMINOR "OS=UNIXWARE"
-#define OS_UNIXWARE
+#define OSMINOR "OS=UNIXWARE"
+#define OS_UNIXWARE
 #endif
 #ifdef __OpenBSD__
-#define OSMINOR "OS=OPENBSD"
-#define OS_OPENBSD
-#define unix
+#define OSMINOR "OS=OPENBSD"
+#define OS_OPENBSD
+#define unix
 #endif
 #if defined (__FreeBSD_kernel__) && !defined(__FreeBSD__)
-#define OSMINOR "OS=KFREEBSD"
-#define OS_KFREEBSD
+#define OSMINOR "OS=KFREEBSD"
+#define OS_KFREEBSD
 #endif
 #ifndef OSMINOR
-#define OSMINOR "OS=UNKNOWN"
+#define OSMINOR "OS=UNKNOWN"
 #endif

 /* All the UNIX includes */
@@ -401,7 +401,7 @@
 #include <sys/stat.h>

 #ifndef OS_MPEIX
-#include <sys/file.h>
+#include <sys/file.h>
 #endif

 #include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#ifndef OS_QNX
|
||||
#include <memory.h>
|
||||
#include <memory.h>
|
||||
#endif
|
||||
|
||||
#ifndef OS_ULTRIX
|
||||
#include <stdlib.h>
|
||||
#include <stdlib.h>
|
||||
#endif
|
||||
|
||||
#if !defined( OS_BSDI ) && \
|
||||
@@ -429,7 +429,7 @@
 !defined( OS_RHAPSODY ) && \
 !defined( OS_MVS ) && \
 !defined( OS_OPENBSD )
-#include <malloc.h>
+#include <malloc.h>
 #endif

 #endif
@@ -443,57 +443,57 @@
 defined( ppc ) || \
 defined( __powerpc__ ) || \
 defined( __ppc__ )
-#define OSPLAT "OSPLAT=PPC"
+#define OSPLAT "OSPLAT=PPC"
 #endif

 #if defined( _ALPHA_ ) || \
 defined( __alpha__ )
-#define OSPLAT "OSPLAT=AXP"
+#define OSPLAT "OSPLAT=AXP"
 #endif

 #if defined( _i386_ ) || \
 defined( __i386__ ) || \
 defined( __i386 ) || \
 defined( _M_IX86 )
-#define OSPLAT "OSPLAT=X86"
+#define OSPLAT "OSPLAT=X86"
 #endif

 #if defined( __ia64__ ) || \
 defined( __IA64__ ) || \
 defined( __ia64 )
-#define OSPLAT "OSPLAT=IA64"
+#define OSPLAT "OSPLAT=IA64"
 #endif

 #if defined( __x86_64__ ) || \
 defined( __amd64__ ) || \
 defined( _M_AMD64 )
-#define OSPLAT "OSPLAT=X86_64"
+#define OSPLAT "OSPLAT=X86_64"
 #endif


 #if defined( __sparc__ ) || \
 defined( __sparc )
-#define OSPLAT "OSPLAT=SPARC"
+#define OSPLAT "OSPLAT=SPARC"
 #endif

 #ifdef __mips__
-#define OSPLAT "OSPLAT=MIPS"
+#define OSPLAT "OSPLAT=MIPS"
 #endif

 #ifdef __arm__
-#define OSPLAT "OSPLAT=ARM"
+#define OSPLAT "OSPLAT=ARM"
 #endif

 #ifdef __s390__
-#define OSPLAT "OSPLAT=390"
+#define OSPLAT "OSPLAT=390"
 #endif

 #ifdef __hppa
-#define OSPLAT "OSPLAT=PARISC"
+#define OSPLAT "OSPLAT=PARISC"
 #endif

 #ifndef OSPLAT
-#define OSPLAT ""
+#define OSPLAT ""
 #endif

 /*
@@ -501,16 +501,16 @@
 */

 #ifndef MAXLINE
-#define MAXLINE 102400 /* longest 'together' actions' */
+#define MAXLINE 102400 /* longest 'together' actions' */
 #endif

 #ifndef EXITOK
-#define EXITOK 0
-#define EXITBAD 1
+#define EXITOK 0
+#define EXITBAD 1
 #endif

 #ifndef SPLITPATH
-#define SPLITPATH ':'
+#define SPLITPATH ':'
 #endif

 /* You probably do not need to muck with these. */
@@ -526,19 +526,18 @@
 #define DEBUG_MAX 14


-struct globs
-{
-    int noexec;
-    int jobs;
-    int quitquick;
-    int newestfirst; /* build newest sources first */
-    int pipe_action;
-    char debug[ DEBUG_MAX ];
-    FILE * cmdout; /* print cmds, not run them */
-    long timeout; /* number of seconds to limit actions to,
+struct globs {
+    int noexec;
+    int jobs;
+    int quitquick;
+    int newestfirst; /* build newest sources first */
+    int pipe_action;
+    char debug[ DEBUG_MAX ];
+    FILE * cmdout; /* print cmds, not run them */
+    long timeout; /* number of seconds to limit actions to,
 * default 0 for no limit.
 */
-    int dart; /* output build and test results formatted for Dart */
+    int dart; /* output build and test results formatted for Dart */
 };

 extern struct globs globs;
@@ -26,56 +26,56 @@
 /* Tokens. */
 #ifndef YYTOKENTYPE
 # define YYTOKENTYPE
-/* Put the tokens into the symbol table, so that GDB and other debuggers
-   know about them. */
-enum yytokentype {
-    _BANG_t = 258,
-    _BANG_EQUALS_t = 259,
-    _AMPER_t = 260,
-    _AMPERAMPER_t = 261,
-    _LPAREN_t = 262,
-    _RPAREN_t = 263,
-    _PLUS_EQUALS_t = 264,
-    _COLON_t = 265,
-    _SEMIC_t = 266,
-    _LANGLE_t = 267,
-    _LANGLE_EQUALS_t = 268,
-    _EQUALS_t = 269,
-    _RANGLE_t = 270,
-    _RANGLE_EQUALS_t = 271,
-    _QUESTION_EQUALS_t = 272,
-    _LBRACKET_t = 273,
-    _RBRACKET_t = 274,
-    ACTIONS_t = 275,
-    BIND_t = 276,
-    CASE_t = 277,
-    CLASS_t = 278,
-    DEFAULT_t = 279,
-    ELSE_t = 280,
-    EXISTING_t = 281,
-    FOR_t = 282,
-    IF_t = 283,
-    IGNORE_t = 284,
-    IN_t = 285,
-    INCLUDE_t = 286,
-    LOCAL_t = 287,
-    MODULE_t = 288,
-    ON_t = 289,
-    PIECEMEAL_t = 290,
-    QUIETLY_t = 291,
-    RETURN_t = 292,
-    RULE_t = 293,
-    SWITCH_t = 294,
-    TOGETHER_t = 295,
-    UPDATED_t = 296,
-    WHILE_t = 297,
-    _LBRACE_t = 298,
-    _BAR_t = 299,
-    _BARBAR_t = 300,
-    _RBRACE_t = 301,
-    ARG = 302,
-    STRING = 303
-};
+/* Put the tokens into the symbol table, so that GDB and other debuggers
+   know about them. */
+enum yytokentype {
+    _BANG_t = 258,
+    _BANG_EQUALS_t = 259,
+    _AMPER_t = 260,
+    _AMPERAMPER_t = 261,
+    _LPAREN_t = 262,
+    _RPAREN_t = 263,
+    _PLUS_EQUALS_t = 264,
+    _COLON_t = 265,
+    _SEMIC_t = 266,
+    _LANGLE_t = 267,
+    _LANGLE_EQUALS_t = 268,
+    _EQUALS_t = 269,
+    _RANGLE_t = 270,
+    _RANGLE_EQUALS_t = 271,
+    _QUESTION_EQUALS_t = 272,
+    _LBRACKET_t = 273,
+    _RBRACKET_t = 274,
+    ACTIONS_t = 275,
+    BIND_t = 276,
+    CASE_t = 277,
+    CLASS_t = 278,
+    DEFAULT_t = 279,
+    ELSE_t = 280,
+    EXISTING_t = 281,
+    FOR_t = 282,
+    IF_t = 283,
+    IGNORE_t = 284,
+    IN_t = 285,
+    INCLUDE_t = 286,
+    LOCAL_t = 287,
+    MODULE_t = 288,
+    ON_t = 289,
+    PIECEMEAL_t = 290,
+    QUIETLY_t = 291,
+    RETURN_t = 292,
+    RULE_t = 293,
+    SWITCH_t = 294,
+    TOGETHER_t = 295,
+    UPDATED_t = 296,
+    WHILE_t = 297,
+    _LBRACE_t = 298,
+    _BAR_t = 299,
+    _BARBAR_t = 300,
+    _RBRACE_t = 301,
+    ARG = 302,
+    STRING = 303
+};
 #endif
 #define _BANG_t 258
 #define _BANG_EQUALS_t 259
@@ -1,44 +1,44 @@
-{ "!", _BANG_t },
-{ "!=", _BANG_EQUALS_t },
-{ "&", _AMPER_t },
-{ "&&", _AMPERAMPER_t },
-{ "(", _LPAREN_t },
-{ ")", _RPAREN_t },
-{ "+=", _PLUS_EQUALS_t },
-{ ":", _COLON_t },
-{ ";", _SEMIC_t },
-{ "<", _LANGLE_t },
-{ "<=", _LANGLE_EQUALS_t },
-{ "=", _EQUALS_t },
-{ ">", _RANGLE_t },
-{ ">=", _RANGLE_EQUALS_t },
-{ "?=", _QUESTION_EQUALS_t },
-{ "[", _LBRACKET_t },
-{ "]", _RBRACKET_t },
-{ "actions", ACTIONS_t },
-{ "bind", BIND_t },
-{ "case", CASE_t },
-{ "class", CLASS_t },
-{ "default", DEFAULT_t },
-{ "else", ELSE_t },
-{ "existing", EXISTING_t },
-{ "for", FOR_t },
-{ "if", IF_t },
-{ "ignore", IGNORE_t },
-{ "in", IN_t },
-{ "include", INCLUDE_t },
-{ "local", LOCAL_t },
-{ "module", MODULE_t },
-{ "on", ON_t },
-{ "piecemeal", PIECEMEAL_t },
-{ "quietly", QUIETLY_t },
-{ "return", RETURN_t },
-{ "rule", RULE_t },
-{ "switch", SWITCH_t },
-{ "together", TOGETHER_t },
-{ "updated", UPDATED_t },
-{ "while", WHILE_t },
-{ "{", _LBRACE_t },
-{ "|", _BAR_t },
-{ "||", _BARBAR_t },
-{ "}", _RBRACE_t },
+{ "!", _BANG_t },
+{ "!=", _BANG_EQUALS_t },
+{ "&", _AMPER_t },
+{ "&&", _AMPERAMPER_t },
+{ "(", _LPAREN_t },
+{ ")", _RPAREN_t },
+{ "+=", _PLUS_EQUALS_t },
+{ ":", _COLON_t },
+{ ";", _SEMIC_t },
+{ "<", _LANGLE_t },
+{ "<=", _LANGLE_EQUALS_t },
+{ "=", _EQUALS_t },
+{ ">", _RANGLE_t },
+{ ">=", _RANGLE_EQUALS_t },
+{ "?=", _QUESTION_EQUALS_t },
+{ "[", _LBRACKET_t },
+{ "]", _RBRACKET_t },
+{ "actions", ACTIONS_t },
+{ "bind", BIND_t },
+{ "case", CASE_t },
+{ "class", CLASS_t },
+{ "default", DEFAULT_t },
+{ "else", ELSE_t },
+{ "existing", EXISTING_t },
+{ "for", FOR_t },
+{ "if", IF_t },
+{ "ignore", IGNORE_t },
+{ "in", IN_t },
+{ "include", INCLUDE_t },
+{ "local", LOCAL_t },
+{ "module", MODULE_t },
+{ "on", ON_t },
+{ "piecemeal", PIECEMEAL_t },
+{ "quietly", QUIETLY_t },
+{ "return", RETURN_t },
+{ "rule", RULE_t },
+{ "switch", SWITCH_t },
+{ "together", TOGETHER_t },
+{ "updated", UPDATED_t },
+{ "while", WHILE_t },
+{ "{", _LBRACE_t },
+{ "|", _BAR_t },
+{ "||", _BARBAR_t },
+{ "}", _RBRACE_t },
@@ -56,9 +56,9 @@
 typedef struct _list LIST;

 struct _list {
-    LIST *next;
-    LIST *tail; /* only valid in head node */
-    char *string; /* private copy */
+    LIST *next;
+    LIST *tail; /* only valid in head node */
+    char *string; /* private copy */
 };

 /*
@@ -70,8 +70,8 @@ typedef struct _lol LOL;
 # define LOL_MAX 19

 struct _lol {
-    int count;
-    LIST *list[ LOL_MAX ];
+    int count;
+    LIST *list[ LOL_MAX ];
 };

 LIST * list_append( LIST *l, LIST *nl );
@@ -14,12 +14,12 @@ int make( int n_targets, const char **targets, int anyhow );
 int make1( TARGET *t );

 typedef struct {
-    int temp;
-    int updating;
-    int cantfind;
-    int cantmake;
-    int targets;
-    int made;
+    int temp;
+    int updating;
+    int cantfind;
+    int cantmake;
+    int targets;
+    int made;
 } COUNTS ;


@@ -65,24 +65,24 @@ typedef unsigned int md5_word_t; /* 32-bit word */

 /* Define the state of the MD5 Algorithm. */
 typedef struct md5_state_s {
-    md5_word_t count[2]; /* message length in bits, lsw first */
-    md5_word_t abcd[4]; /* digest buffer */
-    md5_byte_t buf[64]; /* accumulate block */
+    md5_word_t count[2]; /* message length in bits, lsw first */
+    md5_word_t abcd[4]; /* digest buffer */
+    md5_byte_t buf[64]; /* accumulate block */
 } md5_state_t;

 #ifdef __cplusplus
-extern "C"
+extern "C"
 {
 #endif

-/* Initialize the algorithm. */
-void md5_init(md5_state_t *pms);
+/* Initialize the algorithm. */
+void md5_init(md5_state_t *pms);

-/* Append a string to the message. */
-void md5_append(md5_state_t *pms, const md5_byte_t *data, int nbytes);
+/* Append a string to the message. */
+void md5_append(md5_state_t *pms, const md5_byte_t *data, int nbytes);

-/* Finish the message and return the digest. */
-void md5_finish(md5_state_t *pms, md5_byte_t digest[16]);
+/* Finish the message and return the digest. */
+void md5_finish(md5_state_t *pms, md5_byte_t digest[16]);

 #ifdef __cplusplus
 } /* end extern "C" */
@@ -11,122 +11,122 @@ http://www.boost.org/LICENSE_1_0.txt)

 #ifdef OPT_BOEHM_GC

-/* Use Boehm GC memory allocator. */
-#include <gc.h>
-#define bjam_malloc_x(s) memset(GC_malloc(s),0,s)
-#define bjam_malloc_atomic_x(s) memset(GC_malloc_atomic(s),0,s)
-#define bjam_calloc_x(n,s) memset(GC_malloc((n)*(s)),0,(n)*(s))
-#define bjam_calloc_atomic_x(n,s) memset(GC_malloc_atomic((n)*(s)),0,(n)*(s))
-#define bjam_realloc_x(p,s) GC_realloc(p,s)
-#define bjam_free_x(p) GC_free(p)
-#define bjam_mem_init_x() GC_init(); GC_enable_incremental()
+/* Use Boehm GC memory allocator. */
+#include <gc.h>
+#define bjam_malloc_x(s) memset(GC_malloc(s),0,s)
+#define bjam_malloc_atomic_x(s) memset(GC_malloc_atomic(s),0,s)
+#define bjam_calloc_x(n,s) memset(GC_malloc((n)*(s)),0,(n)*(s))
+#define bjam_calloc_atomic_x(n,s) memset(GC_malloc_atomic((n)*(s)),0,(n)*(s))
+#define bjam_realloc_x(p,s) GC_realloc(p,s)
+#define bjam_free_x(p) GC_free(p)
+#define bjam_mem_init_x() GC_init(); GC_enable_incremental()

-#define bjam_malloc_raw_x(s) malloc(s)
-#define bjam_calloc_raw_x(n,s) calloc(n,s)
-#define bjam_realloc_raw_x(p,s) realloc(p,s)
-#define bjam_free_raw_x(p) free(p)
+#define bjam_malloc_raw_x(s) malloc(s)
+#define bjam_calloc_raw_x(n,s) calloc(n,s)
+#define bjam_realloc_raw_x(p,s) realloc(p,s)
+#define bjam_free_raw_x(p) free(p)

-#ifndef BJAM_NEWSTR_NO_ALLOCATE
-#define BJAM_NEWSTR_NO_ALLOCATE
-#endif
+#ifndef BJAM_NEWSTR_NO_ALLOCATE
+#define BJAM_NEWSTR_NO_ALLOCATE
+#endif

 #elif defined(OPT_DUMA)

-/* Use Duma memory debugging library. */
-#include <stdlib.h>
-#define _DUMA_CONFIG_H_
-#define DUMA_NO_GLOBAL_MALLOC_FREE
-#define DUMA_EXPLICIT_INIT
-#define DUMA_NO_THREAD_SAFETY
-#define DUMA_NO_CPP_SUPPORT
-/* #define DUMA_NO_LEAKDETECTION */
-/* #define DUMA_USE_FRAMENO */
-/* #define DUMA_PREFER_ATEXIT */
-/* #define DUMA_OLD_DEL_MACRO */
-/* #define DUMA_NO_HANG_MSG */
-#define DUMA_PAGE_SIZE 4096
-#define DUMA_MIN_ALIGNMENT 1
-/* #define DUMA_GNU_INIT_ATTR 0 */
-typedef unsigned int DUMA_ADDR;
-typedef unsigned int DUMA_SIZE;
-#include <duma.h>
-#define bjam_malloc_x(s) malloc(s)
-#define bjam_calloc_x(n,s) calloc(n,s)
-#define bjam_realloc_x(p,s) realloc(p,s)
-#define bjam_free_x(p) free(p)
+/* Use Duma memory debugging library. */
+#include <stdlib.h>
+#define _DUMA_CONFIG_H_
+#define DUMA_NO_GLOBAL_MALLOC_FREE
+#define DUMA_EXPLICIT_INIT
+#define DUMA_NO_THREAD_SAFETY
+#define DUMA_NO_CPP_SUPPORT
+/* #define DUMA_NO_LEAKDETECTION */
+/* #define DUMA_USE_FRAMENO */
+/* #define DUMA_PREFER_ATEXIT */
+/* #define DUMA_OLD_DEL_MACRO */
+/* #define DUMA_NO_HANG_MSG */
+#define DUMA_PAGE_SIZE 4096
+#define DUMA_MIN_ALIGNMENT 1
+/* #define DUMA_GNU_INIT_ATTR 0 */
+typedef unsigned int DUMA_ADDR;
+typedef unsigned int DUMA_SIZE;
+#include <duma.h>
+#define bjam_malloc_x(s) malloc(s)
+#define bjam_calloc_x(n,s) calloc(n,s)
+#define bjam_realloc_x(p,s) realloc(p,s)
+#define bjam_free_x(p) free(p)

-#ifndef BJAM_NEWSTR_NO_ALLOCATE
-#define BJAM_NEWSTR_NO_ALLOCATE
-#endif
+#ifndef BJAM_NEWSTR_NO_ALLOCATE
+#define BJAM_NEWSTR_NO_ALLOCATE
+#endif

 #else

-/* Standard C memory allocation. */
-#define bjam_malloc_x(s) malloc(s)
-#define bjam_calloc_x(n,s) calloc(n,s)
-#define bjam_realloc_x(p,s) realloc(p,s)
-#define bjam_free_x(p) free(p)
+/* Standard C memory allocation. */
+#define bjam_malloc_x(s) malloc(s)
+#define bjam_calloc_x(n,s) calloc(n,s)
+#define bjam_realloc_x(p,s) realloc(p,s)
+#define bjam_free_x(p) free(p)

 #endif

 #ifndef bjam_malloc_atomic_x
-#define bjam_malloc_atomic_x(s) bjam_malloc_x(s)
+#define bjam_malloc_atomic_x(s) bjam_malloc_x(s)
 #endif
 #ifndef bjam_calloc_atomic_x
-#define bjam_calloc_atomic_x(n,s) bjam_calloc_x(n,s)
+#define bjam_calloc_atomic_x(n,s) bjam_calloc_x(n,s)
 #endif
 #ifndef bjam_mem_init_x
-#define bjam_mem_init_x()
+#define bjam_mem_init_x()
 #endif
 #ifndef bjam_mem_close_x
-#define bjam_mem_close_x()
+#define bjam_mem_close_x()
 #endif
 #ifndef bjam_malloc_raw_x
-#define bjam_malloc_raw_x(s) bjam_malloc_x(s)
+#define bjam_malloc_raw_x(s) bjam_malloc_x(s)
 #endif
 #ifndef bjam_calloc_raw_x
-#define bjam_calloc_raw_x(n,s) bjam_calloc_x(n,s)
+#define bjam_calloc_raw_x(n,s) bjam_calloc_x(n,s)
 #endif
 #ifndef bjam_realloc_raw_x
-#define bjam_realloc_raw_x(p,s) bjam_realloc_x(p,s)
+#define bjam_realloc_raw_x(p,s) bjam_realloc_x(p,s)
 #endif
 #ifndef bjam_free_raw_x
-#define bjam_free_raw_x(p) bjam_free_x(p)
+#define bjam_free_raw_x(p) bjam_free_x(p)
 #endif

 #ifdef OPT_DEBUG_PROFILE

-/* Profile tracing of memory allocations. */
-#define BJAM_MALLOC(s) (profile_memory(s), bjam_malloc_x(s))
-#define BJAM_MALLOC_ATOMIC(s) (profile_memory(s), bjam_malloc_atomic_x(s))
-#define BJAM_CALLOC(n,s) (profile_memory(n*s), bjam_calloc_x(n,s))
-#define BJAM_CALLOC_ATOMIC(n,s) (profile_memory(n*s), bjam_calloc_atomic_x(n,s))
-#define BJAM_REALLOC(p,s) (profile_memory(s), bjam_realloc_x(p,s))
-#define BJAM_FREE(p) bjam_free_x(p)
-#define BJAM_MEM_INIT() bjam_mem_init_x()
-#define BJAM_MEM_CLOSE() bjam_mem_close_x()
+/* Profile tracing of memory allocations. */
+#define BJAM_MALLOC(s) (profile_memory(s), bjam_malloc_x(s))
+#define BJAM_MALLOC_ATOMIC(s) (profile_memory(s), bjam_malloc_atomic_x(s))
+#define BJAM_CALLOC(n,s) (profile_memory(n*s), bjam_calloc_x(n,s))
+#define BJAM_CALLOC_ATOMIC(n,s) (profile_memory(n*s), bjam_calloc_atomic_x(n,s))
+#define BJAM_REALLOC(p,s) (profile_memory(s), bjam_realloc_x(p,s))
+#define BJAM_FREE(p) bjam_free_x(p)
+#define BJAM_MEM_INIT() bjam_mem_init_x()
+#define BJAM_MEM_CLOSE() bjam_mem_close_x()

-#define BJAM_MALLOC_RAW(s) (profile_memory(s), bjam_malloc_raw_x(s))
-#define BJAM_CALLOC_RAW(n,s) (profile_memory(n*s), bjam_calloc_raw_x(n,s))
-#define BJAM_REALLOC_RAW(p,s) (profile_memory(s), bjam_realloc_raw_x(p,s))
-#define BJAM_FREE_RAW(p) bjam_free_raw_x(p)
+#define BJAM_MALLOC_RAW(s) (profile_memory(s), bjam_malloc_raw_x(s))
+#define BJAM_CALLOC_RAW(n,s) (profile_memory(n*s), bjam_calloc_raw_x(n,s))
+#define BJAM_REALLOC_RAW(p,s) (profile_memory(s), bjam_realloc_raw_x(p,s))
+#define BJAM_FREE_RAW(p) bjam_free_raw_x(p)

 #else

-/* No mem tracing. */
-#define BJAM_MALLOC(s) bjam_malloc_x(s)
-#define BJAM_MALLOC_ATOMIC(s) bjam_malloc_atomic_x(s)
-#define BJAM_CALLOC(n,s) bjam_calloc_x(n,s)
-#define BJAM_CALLOC_ATOMIC(n,s) bjam_calloc_atomic_x(n,s)
-#define BJAM_REALLOC(p,s) bjam_realloc_x(p,s)
-#define BJAM_FREE(p) bjam_free_x(p)
-#define BJAM_MEM_INIT() bjam_mem_init_x()
-#define BJAM_MEM_CLOSE() bjam_mem_close_x()
+/* No mem tracing. */
+#define BJAM_MALLOC(s) bjam_malloc_x(s)
+#define BJAM_MALLOC_ATOMIC(s) bjam_malloc_atomic_x(s)
+#define BJAM_CALLOC(n,s) bjam_calloc_x(n,s)
+#define BJAM_CALLOC_ATOMIC(n,s) bjam_calloc_atomic_x(n,s)
+#define BJAM_REALLOC(p,s) bjam_realloc_x(p,s)
+#define BJAM_FREE(p) bjam_free_x(p)
+#define BJAM_MEM_INIT() bjam_mem_init_x()
+#define BJAM_MEM_CLOSE() bjam_mem_close_x()

-#define BJAM_MALLOC_RAW(s) bjam_malloc_raw_x(s)
-#define BJAM_CALLOC_RAW(n,s) bjam_calloc_raw_x(n,s)
-#define BJAM_REALLOC_RAW(p,s) bjam_realloc_raw_x(p,s)
-#define BJAM_FREE_RAW(p) bjam_free_raw_x(p)
+#define BJAM_MALLOC_RAW(s) bjam_malloc_raw_x(s)
+#define BJAM_CALLOC_RAW(n,s) bjam_calloc_raw_x(n,s)
+#define BJAM_REALLOC_RAW(p,s) bjam_realloc_raw_x(p,s)
+#define BJAM_FREE_RAW(p) bjam_free_raw_x(p)

 #endif

@@ -8,15 +8,14 @@

 #include "lists.h"

-struct module_t
-{
-    char* name;
-    struct hash* rules;
-    struct hash* variables;
-    struct hash* imported_modules;
-    struct module_t* class_module;
-    struct hash* native_rules;
-    int user_module;
+struct module_t {
+    char* name;
+    struct hash* rules;
+    struct hash* variables;
+    struct hash* imported_modules;
+    struct module_t* class_module;
+    struct hash* native_rules;
+    int user_module;
 };

 typedef struct module_t module_t ; /* MSVC debugger gets confused unless this is provided */
@@ -7,20 +7,19 @@

 #include "rules.h"

-struct native_rule_t
-{
-    char* name;
-    argument_list* arguments;
-    PARSE* procedure;
-    /* Version of the interface that the native rule provides.
-       It's possible that we want to change the set parameter
-       for existing native rule. In that case, version number
-       should be incremented so that Boost.Build can check for
-       version it relies on.
+struct native_rule_t {
+    char* name;
+    argument_list* arguments;
+    PARSE* procedure;
+    /* Version of the interface that the native rule provides.
+       It's possible that we want to change the set parameter
+       for existing native rule. In that case, version number
+       should be incremented so that Boost.Build can check for
+       version it relies on.

-       Versions are numbered from 1.
-    */
-    int version;
+       Versions are numbered from 1.
+    */
+    int version;
 };

 /* MSVC debugger gets confused unless this is provided */
@@ -11,10 +11,9 @@
 * \ -) "Command line option."
 */

-typedef struct bjam_option
-{
-    char flag; /* filled in by getoption() */
-    char *val; /* set to random address if true */
+typedef struct bjam_option {
+    char flag; /* filled in by getoption() */
+    char *val; /* set to random address if true */
 } bjam_option;

 # define N_OPTS 256
@@ -14,13 +14,13 @@
 #define EXIT_TIMEOUT 2

 void out_action(
-    const char * action,
-    const char * target,
-    const char * command,
-    const char * out_data,
-    const char * err_data,
-    int exit_reason
-);
+    const char * action,
+    const char * target,
+    const char * command,
+    const char * out_data,
+    const char * err_data,
+    int exit_reason
+);

 char * outf_int( int value );
 char * outf_double( double value );
@@ -26,31 +26,31 @@
 */

 struct _PARSE {
-    LIST * (* func)( PARSE *, FRAME * );
-    PARSE * left;
-    PARSE * right;
-    PARSE * third;
-    char * string;
-    char * string1;
-    int num;
-    int refs;
-    /* module * module; */
-    char * rulename;
-    char * file;
-    int line;
+    LIST * (* func)( PARSE *, FRAME * );
+    PARSE * left;
+    PARSE * right;
+    PARSE * third;
+    char * string;
+    char * string1;
+    int num;
+    int refs;
+    /* module * module; */
+    char * rulename;
+    char * file;
+    int line;
 };

 void parse_file( char *, FRAME * );
 void parse_save( PARSE * );

 PARSE * parse_make(
-    LIST * (* func)( PARSE *, FRAME * ),
-    PARSE * left,
-    PARSE * right,
-    PARSE * third,
-    char * string,
-    char * string1,
-    int num );
+    LIST * (* func)( PARSE *, FRAME * ),
+    PARSE * left,
+    PARSE * right,
+    PARSE * third,
+    char * string,
+    char * string1,
+    int num );

 void parse_refer ( PARSE * );
 void parse_free ( PARSE * );
@@ -28,17 +28,15 @@
 typedef struct _pathname PATHNAME;
 typedef struct _pathpart PATHPART;

-struct _pathpart
-{
-    char * ptr;
-    int len;
+struct _pathpart {
+    char * ptr;
+    int len;
 };

-struct _pathname
-{
-    PATHPART part[6];
+struct _pathname {
+    PATHPART part[6];
 #ifdef OS_VMS
-    int parent;
+    int parent;
 #endif

 #define f_grist part[0]
@@ -9,13 +9,13 @@

 #define NSUBEXP 10
 typedef struct regexp {
-    char *startp[NSUBEXP];
-    char *endp[NSUBEXP];
-    char regstart; /* Internal use only. */
-    char reganch; /* Internal use only. */
-    char *regmust; /* Internal use only. */
-    int regmlen; /* Internal use only. */
-    char program[1]; /* Unwarranted chumminess with compiler. */
+    char *startp[NSUBEXP];
+    char *endp[NSUBEXP];
+    char regstart; /* Internal use only. */
+    char reganch; /* Internal use only. */
+    char *regmust; /* Internal use only. */
+    int regmlen; /* Internal use only. */
+    char program[1]; /* Unwarranted chumminess with compiler. */
 } regexp;

 regexp *regcomp( char *exp );
@@ -53,19 +53,17 @@ typedef struct _settings SETTINGS ;
 /* RULE - a generic jam rule, the product of RULE and ACTIONS. */

 /* A rule's argument list. */
-struct argument_list
-{
-    int reference_count;
-    LOL data[1];
+struct argument_list {
+    int reference_count;
+    LOL data[1];
 };

 /* Build actions corresponding to a rule. */
-struct rule_actions
-{
-    int reference_count;
-    char * command; /* command string from ACTIONS */
-    LIST * bindlist;
-    int flags; /* modifiers on ACTIONS */
+struct rule_actions {
+    int reference_count;
+    char * command; /* command string from ACTIONS */
+    LIST * bindlist;
+    int flags; /* modifiers on ACTIONS */

 #define RULE_NEWSRCS 0x01 /* $(>) is updated sources only */
 #define RULE_TOGETHER 0x02 /* combine actions on single target */
@@ -78,67 +76,61 @@ struct rule_actions
 typedef struct rule_actions rule_actions;
 typedef struct argument_list argument_list;

-struct _rule
-{
-    char * name;
-    PARSE * procedure; /* parse tree from RULE */
-    argument_list * arguments; /* argument checking info, or NULL for unchecked
+struct _rule {
+    char * name;
+    PARSE * procedure; /* parse tree from RULE */
+    argument_list * arguments; /* argument checking info, or NULL for unchecked
 */
-    rule_actions * actions; /* build actions, or NULL for no actions */
-    module_t * module; /* module in which this rule is executed */
-    int exported; /* nonzero if this rule is supposed to appear in
+    rule_actions * actions; /* build actions, or NULL for no actions */
+    module_t * module; /* module in which this rule is executed */
+    int exported; /* nonzero if this rule is supposed to appear in
 * the global module and be automatically
 * imported into other modules
 */
 #ifdef HAVE_PYTHON
-    PyObject * python_function;
+    PyObject * python_function;
 #endif
 };

 /* ACTIONS - a chain of ACTIONs. */
-struct _actions
-{
-    ACTIONS * next;
-    ACTIONS * tail; /* valid only for head */
-    ACTION * action;
+struct _actions {
+    ACTIONS * next;
+    ACTIONS * tail; /* valid only for head */
+    ACTION * action;
 };

 /* ACTION - a RULE instance with targets and sources. */
-struct _action
-{
-    RULE * rule;
-    TARGETS * targets;
-    TARGETS * sources; /* aka $(>) */
-    char running; /* has been started */
-    char status; /* see TARGET status */
+struct _action {
+    RULE * rule;
+    TARGETS * targets;
+    TARGETS * sources; /* aka $(>) */
+    char running; /* has been started */
+    char status; /* see TARGET status */
 };

 /* SETTINGS - variables to set when executing a TARGET's ACTIONS. */
-struct _settings
-{
-    SETTINGS * next;
-    char * symbol; /* symbol name for var_set() */
-    LIST * value; /* symbol value for var_set() */
-    int multiple;
+struct _settings {
+    SETTINGS * next;
+    char * symbol; /* symbol name for var_set() */
+    LIST * value; /* symbol value for var_set() */
+    int multiple;
 };

 /* TARGETS - a chain of TARGETs. */
-struct _targets
-{
-    TARGETS * next;
-    TARGETS * tail; /* valid only for head */
-    TARGET * target;
+struct _targets {
+    TARGETS * next;
+    TARGETS * tail; /* valid only for head */
+    TARGET * target;
 };

 /* TARGET - an entity (e.g. a file) that can be built. */
-struct _target
-{
-    char * name;
-    char * boundname; /* if search() relocates target */
-    ACTIONS * actions; /* rules to execute, if any */
-    SETTINGS * settings; /* variables to define */
+struct _target {
+    char * name;
+    char * boundname; /* if search() relocates target */
+    ACTIONS * actions; /* rules to execute, if any */
+    SETTINGS * settings; /* variables to define */

-    short flags; /* status info */
+    short flags; /* status info */

 #define T_FLAG_TEMP 0x0001 /* TEMPORARY applied */
 #define T_FLAG_NOCARE 0x0002 /* NOCARE applied */
@@ -148,28 +140,28 @@ struct _target
 #define T_FLAG_NOUPDATE 0x0020 /* NOUPDATE applied */
 #define T_FLAG_VISITED 0x0040 /* CWM: Used in debugging */

-/* This flag has been added to support a new built-in rule named "RMBAD". It is
- * used to force removal of outdated targets whose dependencies fail to build.
- */
+/* This flag has been added to support a new built-in rule named "RMBAD". It is
+ * used to force removal of outdated targets whose dependencies fail to build.
+ */
 #define T_FLAG_RMOLD 0x0080 /* RMBAD applied */

-/* This flag was added to support a new built-in rule named "FAIL_EXPECTED" used
- * to indicate that the result of running a given action should be inverted,
- * i.e. ok <=> fail. This is useful for launching certain test runs from a
- * Jamfile.
- */
+/* This flag was added to support a new built-in rule named "FAIL_EXPECTED" used
+ * to indicate that the result of running a given action should be inverted,
+ * i.e. ok <=> fail. This is useful for launching certain test runs from a
+ * Jamfile.
+ */
 #define T_FLAG_FAIL_EXPECTED 0x0100 /* FAIL_EXPECTED applied */

 #define T_FLAG_INTERNAL 0x0200 /* internal INCLUDES node */

-/* Indicates that the target must be a file. This prevents matching non-files,
- * like directories, when a target is searched.
- */
+/* Indicates that the target must be a file. This prevents matching non-files,
+ * like directories, when a target is searched.
+ */
 #define T_FLAG_ISFILE 0x0400

-#define T_FLAG_PRECIOUS 0x0800
+#define T_FLAG_PRECIOUS 0x0800

-    char binding; /* how target relates to a real file or
+    char binding; /* how target relates to a real file or
 * folder
 */

@@ -178,32 +170,32 @@ struct _target
 #define T_BIND_PARENTS 2 /* using parent's timestamp */
 #define T_BIND_EXISTS 3 /* real file, timestamp valid */

-    TARGETS * depends; /* dependencies */
-    TARGETS * dependants; /* the inverse of dependencies */
-    TARGETS * rebuilds; /* targets that should be force-rebuilt
+    TARGETS * depends; /* dependencies */
+    TARGETS * dependants; /* the inverse of dependencies */
+    TARGETS * rebuilds; /* targets that should be force-rebuilt
 * whenever this one is
 */
-    TARGET * includes; /* internal includes node */
-    TARGET * original_target; /* original_target->includes = this */
-    char rescanned;
+    TARGET * includes; /* internal includes node */
+    TARGET * original_target; /* original_target->includes = this */
+    char rescanned;

-    time_t time; /* update time */
-    time_t leaf; /* update time of leaf sources */
+    time_t time; /* update time */
+    time_t leaf; /* update time of leaf sources */

-    char fate; /* make0()'s diagnosis */
+    char fate; /* make0()'s diagnosis */

 #define T_FATE_INIT 0 /* nothing done to target */
 #define T_FATE_MAKING 1 /* make0(target) on stack */

 #define T_FATE_STABLE 2 /* target did not need updating */
 #define T_FATE_NEWER 3 /* target newer than parent */


 #define T_FATE_SPOIL 4 /* >= SPOIL rebuilds parents */
 #define T_FATE_ISTMP 4 /* unneeded temp target oddly present */

 #define T_FATE_BUILD 5 /* >= BUILD rebuilds target */
 #define T_FATE_TOUCHED 5 /* manually touched with -t */
-#define T_FATE_REBUILD 6
+#define T_FATE_REBUILD 6
 #define T_FATE_MISSING 7 /* is missing, needs updating */
 #define T_FATE_NEEDTMP 8 /* missing temp that must be rebuild */
 #define T_FATE_OUTDATED 9 /* is out of date, needs updating */
@@ -213,7 +205,7 @@ struct _target
 #define T_FATE_CANTFIND 11 /* no rules to make missing target */
 #define T_FATE_CANTMAKE 12 /* can not find dependencies */

-    char progress; /* tracks make1() progress */
+    char progress; /* tracks make1() progress */

 #define T_MAKE_INIT 0 /* make1(target) not yet called */
 #define T_MAKE_ONSTACK 1 /* make1(target) on stack */
@@ -222,20 +214,20 @@ struct _target
 #define T_MAKE_DONE 4 /* make1(target) done */

 #ifdef OPT_SEMAPHORE
-#define T_MAKE_SEMAPHORE 5 /* Special target type for semaphores */
+#define T_MAKE_SEMAPHORE 5 /* Special target type for semaphores */
 #endif

 #ifdef OPT_SEMAPHORE
-    TARGET * semaphore; /* used in serialization */
+    TARGET * semaphore; /* used in serialization */
 #endif

-    char status; /* exec_cmd() result */
+    char status; /* exec_cmd() result */

-    int asynccnt; /* child deps outstanding */
-    TARGETS * parents; /* used by make1() for completion */
-    char * cmds; /* type-punned command list */
+    int asynccnt; /* child deps outstanding */
+    TARGETS * parents; /* used by make1() for completion */
+    char * cmds; /* type-punned command list */

-    char * failed;
+    char * failed;
 };

@@ -29,15 +29,14 @@

 #define YYSTYPE YYSYMBOL

-typedef struct _YYSTYPE
-{
-    int type;
-    char * string;
-    PARSE * parse;
-    LIST * list;
-    int number;
-    char * file;
-    int line;
+typedef struct _YYSTYPE {
+    int type;
+    char * string;
+    PARSE * parse;
+    LIST * list;
+    int number;
+    char * file;
+    int line;
 } YYSTYPE;

 extern YYSTYPE yylval;
@@ -7,14 +7,13 @@

 # include <stddef.h>

-typedef struct string
-{
-    char* value;
-    unsigned long size;
-    unsigned long capacity;
-    char opt[32];
+typedef struct string {
+    char* value;
+    unsigned long size;
+    unsigned long capacity;
+    char opt[32];
 #ifndef NDEBUG
-    char magic[4];
+    char magic[4];
 #endif
 } string;

@@ -50,10 +50,10 @@ Data::~Data() {
 //ADDED BY TS
 void Data::remove_duplicates() {

-  uint nSentences = featdata->size();
+  size_t nSentences = featdata->size();
   assert(scoredata->size() == nSentences);

-  for (uint s=0; s < nSentences; s++) {
+  for (size_t s=0; s < nSentences; s++) {

   FeatureArray& feat_array = featdata->get(s);
   ScoreArray& score_array = scoredata->get(s);
@@ -61,29 +61,29 @@ void Data::remove_duplicates() {
 assert(feat_array.size() == score_array.size());

 //serves as a hash-map:
-std::map<double, std::vector<uint> > lookup;
+std::map<double, std::vector<size_t> > lookup;

-uint end_pos = feat_array.size() - 1;
+size_t end_pos = feat_array.size() - 1;

-uint nRemoved = 0;
-for (uint k=0; k <= end_pos; k++) {
+size_t nRemoved = 0;
+for (size_t k=0; k <= end_pos; k++) {

 const FeatureStats& cur_feats = feat_array.get(k);

 double sum = 0.0;
-for (uint l=0; l < cur_feats.size(); l++)
+for (size_t l=0; l < cur_feats.size(); l++)
 sum += cur_feats.get(l);

 if (lookup.find(sum) != lookup.end()) {

 //std::cerr << "hit" << std::endl;

-std::vector<uint>& cur_list = lookup[sum];
+std::vector<size_t>& cur_list = lookup[sum];

-uint l=0;
+size_t l=0;
 for (l=0; l < cur_list.size(); l++) {

-uint j=cur_list[l];
+size_t j=cur_list[l];

 if (cur_feats == feat_array.get(j)
 && score_array.get(k) == score_array.get(j)) {
@@ -129,7 +129,8 @@ IOWrapper::~IOWrapper()
 delete m_singleBestOutputCollector;
 }

-void IOWrapper::ResetTranslationId() {
+void IOWrapper::ResetTranslationId()
+{
 m_translationId = StaticData::Instance().GetStartTranslationId();
 }

@@ -369,18 +370,18 @@ void IOWrapper::OutputNBestList(const ChartTrellisPathList &nBestList, const Cha
 if (pds.size() > 0) {

 for( size_t i=0; i<pds.size(); i++ ) {
-  size_t pd_numinputscore = pds[i]->GetNumInputScores();
-  vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( pds[i] );
-  for (size_t j = 0; j<scores.size(); ++j){
+  size_t pd_numinputscore = pds[i]->GetNumInputScores();
+  vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( pds[i] );
+  for (size_t j = 0; j<scores.size(); ++j) {

-    if (labeledOutput && (i == 0) ){
-      if ((j == 0) || (j == pd_numinputscore)){
-        lastName = pds[i]->GetScoreProducerWeightShortName(j);
-        out << " " << lastName << ":";
-      }
-    }
-    out << " " << scores[j];
-  }
+    if (labeledOutput && (i == 0) ) {
+      if ((j == 0) || (j == pd_numinputscore)) {
+        lastName = pds[i]->GetScoreProducerWeightShortName(j);
+        out << " " << lastName << ":";
+      }
+    }
+    out << " " << scores[j];
+  }
 }
 }

@@ -394,18 +395,18 @@ void IOWrapper::OutputNBestList(const ChartTrellisPathList &nBestList, const Cha
 if (gds.size() > 0) {

 for( size_t i=0; i<gds.size(); i++ ) {
-  size_t pd_numinputscore = gds[i]->GetNumInputScores();
-  vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( gds[i] );
-  for (size_t j = 0; j<scores.size(); ++j){
+  size_t pd_numinputscore = gds[i]->GetNumInputScores();
+  vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( gds[i] );
+  for (size_t j = 0; j<scores.size(); ++j) {

-    if (labeledOutput && (i == 0) ){
-      if ((j == 0) || (j == pd_numinputscore)){
-        lastName = gds[i]->GetScoreProducerWeightShortName(j);
-        out << " " << lastName << ":";
-      }
-    }
-    out << " " << scores[j];
-  }
+    if (labeledOutput && (i == 0) ) {
+      if ((j == 0) || (j == pd_numinputscore)) {
+        lastName = gds[i]->GetScoreProducerWeightShortName(j);
+        out << " " << lastName << ":";
+      }
+    }
+    out << " " << scores[j];
+  }
 }
 }

@@ -210,13 +210,13 @@ void OutputAlignment(ostream &out, const AlignmentInfo &ai, size_t sourceOffset,
 {
 typedef std::vector< const std::pair<size_t,size_t>* > AlignVec;
 AlignVec alignments = ai.GetSortedAlignments();

 AlignVec::const_iterator it;
 for (it = alignments.begin(); it != alignments.end(); ++it) {
 const std::pair<size_t,size_t> &alignment = **it;
 out << alignment.first + sourceOffset << "-" << alignment.second + targetOffset << " ";
 }

 }

 void OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges)
@@ -227,7 +227,7 @@ void OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges)
 const Hypothesis &edge = *edges[currEdge];
 const TargetPhrase &tp = edge.GetCurrTargetPhrase();
 size_t sourceOffset = edge.GetCurrSourceWordsRange().GetStartPos();

 OutputAlignment(out, tp.GetAlignmentInfo(), sourceOffset, targetOffset);

 targetOffset += tp.GetSize();
@@ -239,7 +239,7 @@ void OutputAlignment(OutputCollector* collector, size_t lineNo , const vector<co
 {
 ostringstream out;
 OutputAlignment(out, edges);

 collector->Write(lineNo,out.str());
 }

@@ -412,18 +412,18 @@ void OutputNBest(std::ostream& out, const Moses::TrellisPathList &nBestList, con
 if (pds.size() > 0) {

 for( size_t i=0; i<pds.size(); i++ ) {
-  size_t pd_numinputscore = pds[i]->GetNumInputScores();
-  vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( pds[i] );
-  for (size_t j = 0; j<scores.size(); ++j){
+  size_t pd_numinputscore = pds[i]->GetNumInputScores();
+  vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( pds[i] );
+  for (size_t j = 0; j<scores.size(); ++j) {

-    if (labeledOutput && (i == 0) ){
-      if ((j == 0) || (j == pd_numinputscore)){
-        lastName = pds[i]->GetScoreProducerWeightShortName(j);
-        out << " " << lastName << ":";
-      }
-    }
-    out << " " << scores[j];
-  }
+    if (labeledOutput && (i == 0) ) {
+      if ((j == 0) || (j == pd_numinputscore)) {
+        lastName = pds[i]->GetScoreProducerWeightShortName(j);
+        out << " " << lastName << ":";
+      }
+    }
+    out << " " << scores[j];
+  }
 }
 }

@@ -432,18 +432,18 @@ void OutputNBest(std::ostream& out, const Moses::TrellisPathList &nBestList, con
 if (gds.size() > 0) {

 for( size_t i=0; i<gds.size(); i++ ) {
-  size_t pd_numinputscore = gds[i]->GetNumInputScores();
-  vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( gds[i] );
-  for (size_t j = 0; j<scores.size(); ++j){
+  size_t pd_numinputscore = gds[i]->GetNumInputScores();
+  vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( gds[i] );
+  for (size_t j = 0; j<scores.size(); ++j) {

-    if (labeledOutput && (i == 0) ){
-      if ((j == 0) || (j == pd_numinputscore)){
-        lastName = gds[i]->GetScoreProducerWeightShortName(j);
-        out << " " << lastName << ":";
-      }
-    }
-    out << " " << scores[j];
-  }
+    if (labeledOutput && (i == 0) ) {
+      if ((j == 0) || (j == pd_numinputscore)) {
+        lastName = gds[i]->GetScoreProducerWeightShortName(j);
+        out << " " << lastName << ":";
+      }
+    }
+    out << " " << scores[j];
+  }
 }
 }

@ -477,7 +477,7 @@ void OutputNBest(std::ostream& out, const Moses::TrellisPathList &nBestList, con
|
||||
const int sourceOffset = sourceRange.GetStartPos();
|
||||
const int targetOffset = targetRange.GetStartPos();
|
||||
const AlignmentInfo &ai = edge.GetCurrTargetPhrase().GetAlignmentInfo();
|
||||
|
||||
|
||||
OutputAlignment(out, ai, sourceOffset, targetOffset);
|
||||
|
||||
}
|
||||
|
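The alignment output reformatted above is the usual Moses "src-trg" word-alignment notation, with each phrase's local pair indices shifted by the phrase's offsets within the sentence. A minimal standalone sketch of that printing step (not part of this commit; AlignVec here holds plain pairs rather than the pointers returned by GetSortedAlignments):

    #include <iostream>
    #include <utility>
    #include <vector>

    typedef std::vector<std::pair<size_t, size_t> > AlignVec;

    // Print each source-target link as "src-trg", shifting local phrase
    // positions into absolute sentence positions via the two offsets.
    void PrintAlignments(std::ostream &out, const AlignVec &alignments,
                         size_t sourceOffset, size_t targetOffset)
    {
      for (AlignVec::const_iterator it = alignments.begin();
           it != alignments.end(); ++it) {
        out << it->first + sourceOffset << "-" << it->second + targetOffset << " ";
      }
    }

    int main()
    {
      AlignVec a;
      a.push_back(std::make_pair(0, 0));
      a.push_back(std::make_pair(1, 2));
      PrintAlignments(std::cout, a, 5, 7);  // prints "5-7 6-9 "
      std::cout << std::endl;
      return 0;
    }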
@ -83,7 +83,7 @@ public:
    m_detailedTranslationCollector(detailedTranslationCollector),
    m_alignmentInfoCollector(alignmentInfoCollector) {}

  /** Translate one sentence
  /** Translate one sentence
   * gets called by main function implemented at end of this source file */
  void Run() {

@ -130,7 +130,7 @@ public:
          manager.SerializeSearchGraphPB(m_lineNumber, output);
        }
#endif
      }
    }

    // apply decision rule and output best translation(s)
    if (m_outputCollector) {
@ -145,8 +145,7 @@ public:

      // MAP decoding: best hypothesis
      const Hypothesis* bestHypo = NULL;
      if (!staticData.UseMBR())
      {
      if (!staticData.UseMBR()) {
        bestHypo = manager.GetBestHypothesis();
        if (bestHypo) {
          if (staticData.IsPathRecoveryEnabled()) {
@ -165,11 +164,10 @@ public:
          }
        }
        out << endl;
      }
    }

    // MBR decoding (n-best MBR, lattice MBR, consensus)
    else
    {
    else {
      // we first need the n-best translations
      size_t nBestSize = staticData.GetMBRSize();
      if (nBestSize <= 0) {
@ -205,7 +203,7 @@ public:
      }

      // consensus decoding
      else if (staticData.UseConsensusDecoding()) {
      else if (staticData.UseConsensusDecoding()) {
        const TrellisPath &conBestHypo = doConsensusDecoding(manager,nBestList);
        OutputBestHypo(conBestHypo, m_lineNumber,
                       staticData.GetReportSegmentation(),
@ -214,8 +212,8 @@ public:
        IFVERBOSE(2) {
          PrintUserTime("finished Consensus decoding");
        }
      }

      }

      // n-best MBR decoding
      else {
        const Moses::TrellisPath &mbrBestHypo = doMBR(nBestList);
@ -482,7 +480,7 @@ int main(int argc, char** argv)
                         alignmentInfoCollector.get() );
      // execute task
#ifdef WITH_THREADS
      pool.Submit(task);
      pool.Submit(task);
#else
      task->Run();
#endif
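The control flow being reformatted here picks one decision rule per sentence: plain MAP search unless MBR is enabled, in which case an n-best list feeds n-best MBR, lattice MBR, or consensus decoding. A schematic sketch of that dispatch (hypothetical enum and function, for illustration only; not the Moses API):

    #include <cstdio>

    // Hypothetical stand-ins for StaticData's flags and the decoders.
    enum DecisionRule { MAP, NBEST_MBR, LATTICE_MBR, CONSENSUS };

    const char *Decode(DecisionRule rule)
    {
      if (rule == MAP) return "best hypothesis from beam search";
      // All MBR variants first need an n-best list (or lattice) of candidates.
      if (rule == LATTICE_MBR) return "lattice MBR over the search graph";
      if (rule == CONSENSUS) return "consensus decoding over the n-best list";
      return "n-best MBR: candidate minimizing expected loss";
    }

    int main()
    {
      std::printf("%s\n", Decode(CONSENSUS));
      return 0;
    }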
@ -57,7 +57,7 @@ void PrintTranslationAnalysis(const TranslationSystem* system, std::ostream &os,
      }
    }
  }


  bool epsilon = false;
  if (target == "") {
    target="<EPSILON>";

@ -1,17 +1,17 @@
/***********************************************************************
 Moses - statistical machine translation system
 Copyright (C) 2006-2011 University of Edinburgh

 This library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.

 This library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 Lesser General Public License for more details.

 You should have received a copy of the GNU Lesser General Public
 License along with this library; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ -42,10 +42,11 @@ void AlignmentInfo::BuildNonTermIndexMap()
  for (p = begin(); p != end(); ++p) {
    m_nonTermIndexMap[p->second] = i++;
  }

}

bool compare_target(const std::pair<size_t,size_t> *a, const std::pair<size_t,size_t> *b) {
bool compare_target(const std::pair<size_t,size_t> *a, const std::pair<size_t,size_t> *b)
{
  if(a->second < b->second) return true;
  if(a->second == b->second) return (a->first < b->first);
  return false;
@ -55,34 +56,32 @@ bool compare_target(const std::pair<size_t,size_t> *a, const std::pair<size_t,si
std::vector< const std::pair<size_t,size_t>* > AlignmentInfo::GetSortedAlignments() const
{
  std::vector< const std::pair<size_t,size_t>* > ret;

  CollType::const_iterator iter;
  for (iter = m_collection.begin(); iter != m_collection.end(); ++iter)
  {
  for (iter = m_collection.begin(); iter != m_collection.end(); ++iter) {
    const std::pair<size_t,size_t> &alignPair = *iter;
    ret.push_back(&alignPair);
  }

  const StaticData &staticData = StaticData::Instance();
  WordAlignmentSort wordAlignmentSort = staticData.GetWordAlignmentSort();

  switch (wordAlignmentSort)
  {
  case NoSort:
    break;

  case TargetOrder:
    std::sort(ret.begin(), ret.end(), compare_target);
    break;

  default:
    CHECK(false);

  switch (wordAlignmentSort) {
  case NoSort:
    break;

  case TargetOrder:
    std::sort(ret.begin(), ret.end(), compare_target);
    break;

  default:
    CHECK(false);
  }

  return ret;

}

std::ostream& operator<<(std::ostream &out, const AlignmentInfo &alignmentInfo)
{
  AlignmentInfo::const_iterator iter;

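compare_target above orders alignment links by target position, breaking ties on the source position; GetSortedAlignments applies it only when the configured WordAlignmentSort is TargetOrder. A small sketch of the same ordering on plain pairs (illustrative only):

    #include <algorithm>
    #include <cstdio>
    #include <utility>
    #include <vector>

    typedef std::pair<size_t, size_t> AlignPair;  // (source, target)

    // Same ordering as compare_target: by target index, then source index.
    bool CompareTarget(const AlignPair &a, const AlignPair &b)
    {
      if (a.second != b.second) return a.second < b.second;
      return a.first < b.first;
    }

    int main()
    {
      std::vector<AlignPair> v;
      v.push_back(AlignPair(2, 1));
      v.push_back(AlignPair(0, 3));
      v.push_back(AlignPair(1, 1));
      std::sort(v.begin(), v.end(), CompareTarget);
      for (size_t i = 0; i < v.size(); ++i)
        std::printf("%zu-%zu ", v[i].first, v[i].second);  // prints "1-1 2-1 0-3 "
      std::printf("\n");
      return 0;
    }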
@ -1,17 +1,17 @@
/***********************************************************************
 Moses - statistical machine translation system
 Copyright (C) 2006-2011 University of Edinburgh

 This library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.

 This library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 Lesser General Public License for more details.

 You should have received a copy of the GNU Lesser General Public
 License along with this library; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ -37,12 +37,16 @@ class AlignmentInfo
  friend struct AlignmentInfoOrderer;
  friend class AlignmentInfoCollection;

 public:
public:
  typedef std::vector<size_t> NonTermIndexMap;
  typedef CollType::const_iterator const_iterator;

  const_iterator begin() const { return m_collection.begin(); }
  const_iterator end() const { return m_collection.end(); }
  const_iterator begin() const {
    return m_collection.begin();
  }
  const_iterator end() const {
    return m_collection.end();
  }

  // Provides a map from target-side to source-side non-terminal indices.
  // The target-side index should be the rule symbol index (counting terminals).
@ -52,12 +56,11 @@ class AlignmentInfo
  }

  std::vector< const std::pair<size_t,size_t>* > GetSortedAlignments() const;

 private:

private:
  // AlignmentInfo objects should only be created by an AlignmentInfoCollection
  explicit AlignmentInfo(const std::set<std::pair<size_t,size_t> > &pairs)
    : m_collection(pairs)
  {
    : m_collection(pairs) {
    BuildNonTermIndexMap();
  }

@ -69,8 +72,7 @@ class AlignmentInfo

// Define an arbitrary strict weak ordering between AlignmentInfo objects
// for use by AlignmentInfoCollection.
struct AlignmentInfoOrderer
{
struct AlignmentInfoOrderer {
  bool operator()(const AlignmentInfo &a, const AlignmentInfo &b) const {
    return a.m_collection < b.m_collection;
  }

@ -1,17 +1,17 @@
/***********************************************************************
 Moses - statistical machine translation system
 Copyright (C) 2006-2011 University of Edinburgh

 This library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.

 This library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 Lesser General Public License for more details.

 You should have received a copy of the GNU Lesser General Public
 License along with this library; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ -36,7 +36,7 @@ const AlignmentInfo &AlignmentInfoCollection::GetEmptyAlignmentInfo() const
}

const AlignmentInfo *AlignmentInfoCollection::Add(
    const std::set<std::pair<size_t,size_t> > &pairs)
  const std::set<std::pair<size_t,size_t> > &pairs)
{
  std::pair<AlignmentInfoSet::iterator, bool> ret =
    m_collection.insert(AlignmentInfo(pairs));

@ -1,17 +1,17 @@
/***********************************************************************
 Moses - statistical machine translation system
 Copyright (C) 2006-2011 University of Edinburgh

 This library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.

 This library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 Lesser General Public License for more details.

 You should have received a copy of the GNU Lesser General Public
 License along with this library; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ -29,8 +29,10 @@ namespace Moses
// Singleton collection of all AlignmentInfo objects.
class AlignmentInfoCollection
{
 public:
  static AlignmentInfoCollection &Instance() { return s_instance; }
public:
  static AlignmentInfoCollection &Instance() {
    return s_instance;
  }

  // Returns a pointer to an AlignmentInfo object with the same source-target
  // alignment pairs as given in the argument. If the collection already
@ -41,7 +43,7 @@ class AlignmentInfoCollection
  // Returns a pointer to an empty AlignmentInfo object.
  const AlignmentInfo &GetEmptyAlignmentInfo() const;

 private:
private:
  typedef std::set<AlignmentInfo, AlignmentInfoOrderer> AlignmentInfoSet;

  // Only a single static variable should be created.

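AlignmentInfoCollection interns AlignmentInfo objects: Add() inserts into a std::set and returns a pointer to the stored element, so identical alignment sets share one canonical object. A reduced sketch of that interning pattern with strings (illustrative only, not the Moses API):

    #include <cassert>
    #include <set>
    #include <string>
    #include <utility>

    class Interner
    {
    public:
      // Returns a pointer to the canonical copy; repeated inserts of an
      // equal value return the same address (set nodes are stable).
      const std::string *Add(const std::string &s) {
        std::pair<std::set<std::string>::iterator, bool> ret =
          m_collection.insert(s);
        return &(*ret.first);
      }
    private:
      std::set<std::string> m_collection;
    };

    int main()
    {
      Interner in;
      const std::string *a = in.Add("0-0 1-1");
      const std::string *b = in.Add("0-0 1-1");
      assert(a == b);  // deduplicated: one shared object
      return 0;
    }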
@ -7,455 +7,454 @@

using namespace std;

namespace Moses {
namespace Moses
{

BilingualDynSuffixArray::BilingualDynSuffixArray():
  m_maxPhraseLength(StaticData::Instance().GetMaxPhraseLength()),
  m_maxSampleSize(20)
{
  m_srcSA = 0;
  m_trgSA = 0;
  m_srcCorpus = new std::vector<wordID_t>();
  m_trgCorpus = new std::vector<wordID_t>();
  m_srcVocab = new Vocab(false);
  m_trgVocab = new Vocab(false);
  m_scoreCmp = 0;
  m_maxPhraseLength(StaticData::Instance().GetMaxPhraseLength()),
  m_maxSampleSize(20)
{
  m_srcSA = 0;
  m_trgSA = 0;
  m_srcCorpus = new std::vector<wordID_t>();
  m_trgCorpus = new std::vector<wordID_t>();
  m_srcVocab = new Vocab(false);
  m_trgVocab = new Vocab(false);
  m_scoreCmp = 0;
}

BilingualDynSuffixArray::~BilingualDynSuffixArray()
BilingualDynSuffixArray::~BilingualDynSuffixArray()
{
  if(m_srcSA) delete m_srcSA;
  if(m_trgSA) delete m_trgSA;
  if(m_srcVocab) delete m_srcVocab;
  if(m_trgVocab) delete m_trgVocab;
  if(m_srcCorpus) delete m_srcCorpus;
  if(m_trgCorpus) delete m_trgCorpus;
  if(m_scoreCmp) delete m_scoreCmp;
  if(m_srcSA) delete m_srcSA;
  if(m_trgSA) delete m_trgSA;
  if(m_srcVocab) delete m_srcVocab;
  if(m_trgVocab) delete m_trgVocab;
  if(m_srcCorpus) delete m_srcCorpus;
  if(m_trgCorpus) delete m_trgCorpus;
  if(m_scoreCmp) delete m_scoreCmp;
}

bool BilingualDynSuffixArray::Load(
  const std::vector<FactorType>& inputFactors,
  const std::vector<FactorType>& outputFactors,
  std::string source, std::string target, std::string alignments,
  const std::vector<float> &weight)
  const std::vector<FactorType>& inputFactors,
  const std::vector<FactorType>& outputFactors,
  std::string source, std::string target, std::string alignments,
  const std::vector<float> &weight)
{
  m_inputFactors = inputFactors;
  m_outputFactors = outputFactors;

  m_scoreCmp = new ScoresComp(weight);
  InputFileStream sourceStrme(source);
  InputFileStream targetStrme(target);
  cerr << "Loading source corpus...\n";
  LoadCorpus(sourceStrme, m_inputFactors, *m_srcCorpus, m_srcSntBreaks, m_srcVocab);
  cerr << "Loading target corpus...\n";
  LoadCorpus(targetStrme, m_outputFactors,*m_trgCorpus, m_trgSntBreaks, m_trgVocab);
  CHECK(m_srcSntBreaks.size() == m_trgSntBreaks.size());
  m_scoreCmp = new ScoresComp(weight);
  InputFileStream sourceStrme(source);
  InputFileStream targetStrme(target);
  cerr << "Loading source corpus...\n";
  LoadCorpus(sourceStrme, m_inputFactors, *m_srcCorpus, m_srcSntBreaks, m_srcVocab);
  cerr << "Loading target corpus...\n";
  LoadCorpus(targetStrme, m_outputFactors,*m_trgCorpus, m_trgSntBreaks, m_trgVocab);
  CHECK(m_srcSntBreaks.size() == m_trgSntBreaks.size());

  // build suffix arrays and auxiliary arrays
  cerr << "Building Source Suffix Array...\n";
  m_srcSA = new DynSuffixArray(m_srcCorpus);
  if(!m_srcSA) return false;
  cerr << "Building Target Suffix Array...\n";
  //m_trgSA = new DynSuffixArray(m_trgCorpus);
  //if(!m_trgSA) return false;
  // build suffix arrays and auxiliary arrays
  cerr << "Building Source Suffix Array...\n";
  m_srcSA = new DynSuffixArray(m_srcCorpus);
  if(!m_srcSA) return false;
  cerr << "Building Target Suffix Array...\n";
  //m_trgSA = new DynSuffixArray(m_trgCorpus);
  //if(!m_trgSA) return false;
  cerr << "\t(Skipped. Not used)\n";

  InputFileStream alignStrme(alignments);
  cerr << "Loading Alignment File...\n";
  LoadRawAlignments(alignStrme);
  //LoadAlignments(alignStrme);

  InputFileStream alignStrme(alignments);
  cerr << "Loading Alignment File...\n";
  LoadRawAlignments(alignStrme);
  //LoadAlignments(alignStrme);
  cerr << "Building frequent word cache...\n";
  CacheFreqWords();
  return true;
  return true;
}

int BilingualDynSuffixArray::LoadRawAlignments(InputFileStream& align)
int BilingualDynSuffixArray::LoadRawAlignments(InputFileStream& align)
{
  // stores the alignments in the raw file format
  std::string line;
  std::vector<int> vtmp;
  while(getline(align, line)) {
    Utils::splitToInt(line, vtmp, "- ");
    CHECK(vtmp.size() % 2 == 0);
    std::vector<short> vAlgn; // store as short ints for memory
    for (std::vector<int>::const_iterator itr = vtmp.begin();
         itr != vtmp.end(); ++itr) {
      vAlgn.push_back(short(*itr));
    }
    m_rawAlignments.push_back(vAlgn);
  }
  return m_rawAlignments.size();
  // stores the alignments in the raw file format
  std::string line;
  std::vector<int> vtmp;
  while(getline(align, line)) {
    Utils::splitToInt(line, vtmp, "- ");
    CHECK(vtmp.size() % 2 == 0);
    std::vector<short> vAlgn; // store as short ints for memory
    for (std::vector<int>::const_iterator itr = vtmp.begin();
         itr != vtmp.end(); ++itr) {
      vAlgn.push_back(short(*itr));
    }
    m_rawAlignments.push_back(vAlgn);
  }
  return m_rawAlignments.size();
}
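LoadRawAlignments keeps each sentence's alignment as a flat vector of shorts, parsed from lines such as "0-0 1-2 2-1" by splitting on "-" and space. A standalone sketch of that parse (illustrative only; the hypothetical ParseAlignment stands in for the loop around Utils::splitToInt):

    #include <cassert>
    #include <sstream>
    #include <string>
    #include <vector>

    // Split "0-0 1-2" on '-' and ' ' into a flat short list: 0,0,1,2.
    std::vector<short> ParseAlignment(const std::string &line)
    {
      std::string tmp(line);
      for (size_t i = 0; i < tmp.size(); ++i)
        if (tmp[i] == '-') tmp[i] = ' ';
      std::istringstream ss(tmp);
      std::vector<short> out;
      int v;
      while (ss >> v) out.push_back(short(v));  // shorts to save memory
      assert(out.size() % 2 == 0);  // pairs of (source, target)
      return out;
    }

    int main()
    {
      std::vector<short> a = ParseAlignment("0-0 1-2 2-1");
      assert(a.size() == 6 && a[2] == 1 && a[3] == 2);
      return 0;
    }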
int BilingualDynSuffixArray::LoadRawAlignments(string& align) {
  // stores the alignments in the raw file format
int BilingualDynSuffixArray::LoadRawAlignments(string& align)
{
  // stores the alignments in the raw file format
  vector<int> vtmp;
  Utils::splitToInt(align, vtmp, "- ");
  CHECK(vtmp.size() % 2 == 0);
  vector<short> vAlgn; // store as short ints for memory
  for (std::vector<int>::const_iterator itr = vtmp.begin();
       itr != vtmp.end(); ++itr) {
    vAlgn.push_back(short(*itr));
       itr != vtmp.end(); ++itr) {
    vAlgn.push_back(short(*itr));
  }
  m_rawAlignments.push_back(vAlgn);
  return m_rawAlignments.size();
}

int BilingualDynSuffixArray::LoadAlignments(InputFileStream& align)
int BilingualDynSuffixArray::LoadAlignments(InputFileStream& align)
{
  std::string line;
  std::vector<int> vtmp;
  int sntIndex(0);

  while(getline(align, line)) {
    Utils::splitToInt(line, vtmp, "- ");
    CHECK(vtmp.size() % 2 == 0);

    int sourceSize = GetSourceSentenceSize(sntIndex);
    int targetSize = GetTargetSentenceSize(sntIndex);
  std::string line;
  std::vector<int> vtmp;
  int sntIndex(0);

    SentenceAlignment curSnt(sntIndex, sourceSize, targetSize); // initialize empty sentence
    for(int i=0; i < (int)vtmp.size(); i+=2) {
      int sourcePos = vtmp[i];
      int targetPos = vtmp[i+1];
      CHECK(sourcePos < sourceSize);
      CHECK(targetPos < targetSize);

      curSnt.alignedList[sourcePos].push_back(targetPos); // list of target nodes for each source word
      curSnt.numberAligned[targetPos]++; // cnt of how many source words connect to this target word
    }
    curSnt.srcSnt = m_srcCorpus + sntIndex; // point source and target sentence
    curSnt.trgSnt = m_trgCorpus + sntIndex;
    m_alignments.push_back(curSnt);

    sntIndex++;
  }
  return m_alignments.size();
  while(getline(align, line)) {
    Utils::splitToInt(line, vtmp, "- ");
    CHECK(vtmp.size() % 2 == 0);

    int sourceSize = GetSourceSentenceSize(sntIndex);
    int targetSize = GetTargetSentenceSize(sntIndex);

    SentenceAlignment curSnt(sntIndex, sourceSize, targetSize); // initialize empty sentence
    for(int i=0; i < (int)vtmp.size(); i+=2) {
      int sourcePos = vtmp[i];
      int targetPos = vtmp[i+1];
      CHECK(sourcePos < sourceSize);
      CHECK(targetPos < targetSize);

      curSnt.alignedList[sourcePos].push_back(targetPos); // list of target nodes for each source word
      curSnt.numberAligned[targetPos]++; // cnt of how many source words connect to this target word
    }
    curSnt.srcSnt = m_srcCorpus + sntIndex; // point source and target sentence
    curSnt.trgSnt = m_trgCorpus + sntIndex;
    m_alignments.push_back(curSnt);

    sntIndex++;
  }
  return m_alignments.size();
}

SentenceAlignment BilingualDynSuffixArray::GetSentenceAlignment(const int sntIndex, bool trg2Src) const
SentenceAlignment BilingualDynSuffixArray::GetSentenceAlignment(const int sntIndex, bool trg2Src) const
{
  // retrieves the alignments in the format used by SentenceAlignment.Extract()
  int sntGiven = trg2Src ? GetTargetSentenceSize(sntIndex) : GetSourceSentenceSize(sntIndex);
  int sntExtract = trg2Src ? GetSourceSentenceSize(sntIndex) : GetTargetSentenceSize(sntIndex);
  std::vector<short> alignment = m_rawAlignments.at(sntIndex);
  SentenceAlignment curSnt(sntIndex, sntGiven, sntExtract); // initialize empty sentence
  for(size_t i=0; i < alignment.size(); i+=2) {
    int sourcePos = alignment[i];
    int targetPos = alignment[i+1];
    if(trg2Src) {
      curSnt.alignedList[targetPos].push_back(sourcePos); // list of target nodes for each source word
      curSnt.numberAligned[sourcePos]++; // cnt of how many source words connect to this target word
    }
    else {
      curSnt.alignedList[sourcePos].push_back(targetPos); // list of target nodes for each source word
      curSnt.numberAligned[targetPos]++; // cnt of how many source words connect to this target word
    }
  }
  curSnt.srcSnt = m_srcCorpus + sntIndex; // point source and target sentence
  curSnt.trgSnt = m_trgCorpus + sntIndex;

  return curSnt;
  // retrieves the alignments in the format used by SentenceAlignment.Extract()
  int sntGiven = trg2Src ? GetTargetSentenceSize(sntIndex) : GetSourceSentenceSize(sntIndex);
  int sntExtract = trg2Src ? GetSourceSentenceSize(sntIndex) : GetTargetSentenceSize(sntIndex);
  std::vector<short> alignment = m_rawAlignments.at(sntIndex);
  SentenceAlignment curSnt(sntIndex, sntGiven, sntExtract); // initialize empty sentence
  for(size_t i=0; i < alignment.size(); i+=2) {
    int sourcePos = alignment[i];
    int targetPos = alignment[i+1];
    if(trg2Src) {
      curSnt.alignedList[targetPos].push_back(sourcePos); // list of target nodes for each source word
      curSnt.numberAligned[sourcePos]++; // cnt of how many source words connect to this target word
    } else {
      curSnt.alignedList[sourcePos].push_back(targetPos); // list of target nodes for each source word
      curSnt.numberAligned[targetPos]++; // cnt of how many source words connect to this target word
    }
  }
  curSnt.srcSnt = m_srcCorpus + sntIndex; // point source and target sentence
  curSnt.trgSnt = m_trgCorpus + sntIndex;

  return curSnt;
}

bool BilingualDynSuffixArray::ExtractPhrases(const int& sntIndex, const int& wordIndex,
    const int& sourceSize, std::vector<PhrasePair*>& phrasePairs, bool trg2Src) const
bool BilingualDynSuffixArray::ExtractPhrases(const int& sntIndex, const int& wordIndex,
    const int& sourceSize, std::vector<PhrasePair*>& phrasePairs, bool trg2Src) const
{
  /* ExtractPhrases() can extract the matching phrases for both directions by using the trg2Src
   * parameter */
  SentenceAlignment curSnt = GetSentenceAlignment(sntIndex, trg2Src);
  // get span of phrase in source sentence
  int beginSentence = m_srcSntBreaks[sntIndex];
  int rightIdx = wordIndex - beginSentence
                 ,leftIdx = rightIdx - sourceSize + 1;
  return curSnt.Extract(m_maxPhraseLength, phrasePairs, leftIdx, rightIdx); // extract all phrase Alignments in sentence
  /* ExtractPhrases() can extract the matching phrases for both directions by using the trg2Src
   * parameter */
  SentenceAlignment curSnt = GetSentenceAlignment(sntIndex, trg2Src);
  // get span of phrase in source sentence
  int beginSentence = m_srcSntBreaks[sntIndex];
  int rightIdx = wordIndex - beginSentence
                 ,leftIdx = rightIdx - sourceSize + 1;
  return curSnt.Extract(m_maxPhraseLength, phrasePairs, leftIdx, rightIdx); // extract all phrase Alignments in sentence
}

void BilingualDynSuffixArray::CleanUp()
void BilingualDynSuffixArray::CleanUp()
{
  //m_wordPairCache.clear();
  //m_wordPairCache.clear();
}

int BilingualDynSuffixArray::LoadCorpus(InputFileStream& corpus, const FactorList& factors,
    std::vector<wordID_t>& cArray, std::vector<wordID_t>& sntArray,
    Vocab* vocab)
    std::vector<wordID_t>& cArray, std::vector<wordID_t>& sntArray,
    Vocab* vocab)
{
  std::string line, word;
  int sntIdx(0);
  std::string line, word;
  int sntIdx(0);
  // corpus.seekg(0); Seems needless -> commented out to allow loading of gzipped corpora (gzfilebuf doesn't support seeking).
  const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
  while(getline(corpus, line)) {
    sntArray.push_back(sntIdx);
    Phrase phrase(ARRAY_SIZE_INCR);
    // parse phrase
    phrase.CreateFromString( factors, line, factorDelimiter);
    // store words in vocabulary and corpus
    for( size_t i = 0; i < phrase.GetSize(); ++i) {
      cArray.push_back( vocab->GetWordID(phrase.GetWord(i)) );
    }
    sntIdx += phrase.GetSize();
  }
  //cArray.push_back(vocab->GetkOOVWordID); // signify end of corpus
  const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
  while(getline(corpus, line)) {
    sntArray.push_back(sntIdx);
    Phrase phrase(ARRAY_SIZE_INCR);
    // parse phrase
    phrase.CreateFromString( factors, line, factorDelimiter);
    // store words in vocabulary and corpus
    for( size_t i = 0; i < phrase.GetSize(); ++i) {
      cArray.push_back( vocab->GetWordID(phrase.GetWord(i)) );
    }
    sntIdx += phrase.GetSize();
  }
  //cArray.push_back(vocab->GetkOOVWordID); // signify end of corpus
  vocab->MakeClosed(); // avoid adding words
  return cArray.size();
  return cArray.size();
}

bool BilingualDynSuffixArray::GetLocalVocabIDs(const Phrase& src, SAPhrase &output) const
bool BilingualDynSuffixArray::GetLocalVocabIDs(const Phrase& src, SAPhrase &output) const
{
  // looks up the SA vocab ids for the current src phrase
  size_t phraseSize = src.GetSize();
  for (size_t pos = 0; pos < phraseSize; ++pos) {
    const Word &word = src.GetWord(pos);
    wordID_t arrayId = m_srcVocab->GetWordID(word);
    if (arrayId == m_srcVocab->GetkOOVWordID())
    { // oov
      return false;
    }
    else
    {
      output.SetId(pos, arrayId);
      //cerr << arrayId << " ";
    }
  }
  return true;
  // looks up the SA vocab ids for the current src phrase
  size_t phraseSize = src.GetSize();
  for (size_t pos = 0; pos < phraseSize; ++pos) {
    const Word &word = src.GetWord(pos);
    wordID_t arrayId = m_srcVocab->GetWordID(word);
    if (arrayId == m_srcVocab->GetkOOVWordID()) {
      // oov
      return false;
    } else {
      output.SetId(pos, arrayId);
      //cerr << arrayId << " ";
    }
  }
  return true;
}

pair<float, float> BilingualDynSuffixArray::GetLexicalWeight(const PhrasePair& phrasepair) const
pair<float, float> BilingualDynSuffixArray::GetLexicalWeight(const PhrasePair& phrasepair) const
{
  //return pair<float, float>(1, 1);
  float srcLexWeight(1.0), trgLexWeight(1.0);
  std::map<pair<wordID_t, wordID_t>, float> targetProbs; // collect sum of target probs given source words
  //const SentenceAlignment& alignment = m_alignments[phrasepair.m_sntIndex];
  const SentenceAlignment& alignment = GetSentenceAlignment(phrasepair.m_sntIndex);
  std::map<pair<wordID_t, wordID_t>, pair<float, float> >::const_iterator itrCache;
  // for each source word
  for(int srcIdx = phrasepair.m_startSource; srcIdx <= phrasepair.m_endSource; ++srcIdx) {
    float srcSumPairProbs(0);
    wordID_t srcWord = m_srcCorpus->at(srcIdx + m_srcSntBreaks[phrasepair.m_sntIndex]); // localIDs
    const std::vector<int>& srcWordAlignments = alignment.alignedList.at(srcIdx);
  //return pair<float, float>(1, 1);
  float srcLexWeight(1.0), trgLexWeight(1.0);
  std::map<pair<wordID_t, wordID_t>, float> targetProbs; // collect sum of target probs given source words
  //const SentenceAlignment& alignment = m_alignments[phrasepair.m_sntIndex];
  const SentenceAlignment& alignment = GetSentenceAlignment(phrasepair.m_sntIndex);
  std::map<pair<wordID_t, wordID_t>, pair<float, float> >::const_iterator itrCache;
  // for each source word
  for(int srcIdx = phrasepair.m_startSource; srcIdx <= phrasepair.m_endSource; ++srcIdx) {
    float srcSumPairProbs(0);
    wordID_t srcWord = m_srcCorpus->at(srcIdx + m_srcSntBreaks[phrasepair.m_sntIndex]); // localIDs
    const std::vector<int>& srcWordAlignments = alignment.alignedList.at(srcIdx);
    // for each target word aligned to this source word in this alignment
    if(srcWordAlignments.size() == 0) { // get p(NULL|src)
      pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, m_srcVocab->GetkOOVWordID());
      itrCache = m_wordPairCache.find(wordpair);
      if(itrCache == m_wordPairCache.end()) { // if not in cache
        CacheWordProbs(srcWord);
        itrCache = m_wordPairCache.find(wordpair); // search cache again
      }
      CHECK(itrCache != m_wordPairCache.end());
      srcSumPairProbs += itrCache->second.first;
      targetProbs[wordpair] = itrCache->second.second;
    }
    else { // extract p(trg|src)
      for(size_t i = 0; i < srcWordAlignments.size(); ++i) { // for each aligned word
        int trgIdx = srcWordAlignments[i];
        wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]);
        // get probability of this source->target word pair
        pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, trgWord);
        itrCache = m_wordPairCache.find(wordpair);
        if(itrCache == m_wordPairCache.end()) { // if not in cache
    if(srcWordAlignments.size() == 0) { // get p(NULL|src)
      pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, m_srcVocab->GetkOOVWordID());
      itrCache = m_wordPairCache.find(wordpair);
      if(itrCache == m_wordPairCache.end()) { // if not in cache
        CacheWordProbs(srcWord);
        itrCache = m_wordPairCache.find(wordpair); // search cache again
      }
      CHECK(itrCache != m_wordPairCache.end());
      srcSumPairProbs += itrCache->second.first;
      targetProbs[wordpair] = itrCache->second.second;
    } else { // extract p(trg|src)
      for(size_t i = 0; i < srcWordAlignments.size(); ++i) { // for each aligned word
        int trgIdx = srcWordAlignments[i];
        wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]);
        // get probability of this source->target word pair
        pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, trgWord);
        itrCache = m_wordPairCache.find(wordpair);
        if(itrCache == m_wordPairCache.end()) { // if not in cache
          CacheWordProbs(srcWord);
          itrCache = m_wordPairCache.find(wordpair); // search cache again
        }
        CHECK(itrCache != m_wordPairCache.end());
        srcSumPairProbs += itrCache->second.first;
        targetProbs[wordpair] = itrCache->second.second;
      }
    }
    float srcNormalizer = srcWordAlignments.size() < 2 ? 1.0 : 1.0 / float(srcWordAlignments.size());
    srcLexWeight *= (srcNormalizer * srcSumPairProbs);
  } // end for each source word
  for(int trgIdx = phrasepair.m_startTarget; trgIdx <= phrasepair.m_endTarget; ++trgIdx) {
    float trgSumPairProbs(0);
    wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]);
    for (std::map<pair<wordID_t, wordID_t>, float>::const_iterator trgItr
         = targetProbs.begin(); trgItr != targetProbs.end(); ++trgItr) {
      if(trgItr->first.second == trgWord)
        trgSumPairProbs += trgItr->second;
          itrCache = m_wordPairCache.find(wordpair); // search cache again
    }
    if(trgSumPairProbs == 0) continue; // currently don't store target-side SA
    int noAligned = alignment.numberAligned.at(trgIdx);
    float trgNormalizer = noAligned < 2 ? 1.0 : 1.0 / float(noAligned);
    trgLexWeight *= (trgNormalizer * trgSumPairProbs);
  }
  // TODO::Need to get p(NULL|trg)
  return pair<float, float>(srcLexWeight, trgLexWeight);
        CHECK(itrCache != m_wordPairCache.end());
        srcSumPairProbs += itrCache->second.first;
        targetProbs[wordpair] = itrCache->second.second;
      }
    }
    float srcNormalizer = srcWordAlignments.size() < 2 ? 1.0 : 1.0 / float(srcWordAlignments.size());
    srcLexWeight *= (srcNormalizer * srcSumPairProbs);
  } // end for each source word
  for(int trgIdx = phrasepair.m_startTarget; trgIdx <= phrasepair.m_endTarget; ++trgIdx) {
    float trgSumPairProbs(0);
    wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]);
    for (std::map<pair<wordID_t, wordID_t>, float>::const_iterator trgItr
         = targetProbs.begin(); trgItr != targetProbs.end(); ++trgItr) {
      if(trgItr->first.second == trgWord)
        trgSumPairProbs += trgItr->second;
    }
    if(trgSumPairProbs == 0) continue; // currently don't store target-side SA
    int noAligned = alignment.numberAligned.at(trgIdx);
    float trgNormalizer = noAligned < 2 ? 1.0 : 1.0 / float(noAligned);
    trgLexWeight *= (trgNormalizer * trgSumPairProbs);
  }
  // TODO::Need to get p(NULL|trg)
  return pair<float, float>(srcLexWeight, trgLexWeight);
}
void BilingualDynSuffixArray::CacheFreqWords() const {
void BilingualDynSuffixArray::CacheFreqWords() const
{
  std::multimap<int, wordID_t> wordCnts;
  // for each source word in vocab
  Vocab::Word2Id::const_iterator it;
  Vocab::Word2Id::const_iterator it;
  for(it = m_srcVocab->VocabStart(); it != m_srcVocab->VocabEnd(); ++it) {
    // get its frequency
    wordID_t srcWord = it->second;
    std::vector<wordID_t> sword(1, srcWord), wrdIndices;
    m_srcSA->GetCorpusIndex(&sword, &wrdIndices);
    if(wrdIndices.size() >= 1000) { // min count
    if(wrdIndices.size() >= 1000) { // min count
      wordCnts.insert(make_pair(wrdIndices.size(), srcWord));
    }
  }
  int numSoFar(0);
  std::multimap<int, wordID_t>::reverse_iterator ritr;
  for(ritr = wordCnts.rbegin(); ritr != wordCnts.rend(); ++ritr) {
  std::multimap<int, wordID_t>::reverse_iterator ritr;
  for(ritr = wordCnts.rbegin(); ritr != wordCnts.rend(); ++ritr) {
    m_freqWordsCached.insert(ritr->second);
    CacheWordProbs(ritr->second);
    if(++numSoFar == 50) break; // get top counts
  }
  cerr << "\tCached " << m_freqWordsCached.size() << " source words\n";
}
void BilingualDynSuffixArray::CacheWordProbs(wordID_t srcWord) const
void BilingualDynSuffixArray::CacheWordProbs(wordID_t srcWord) const
{
  std::map<wordID_t, int> counts;
  std::vector<wordID_t> sword(1, srcWord), wrdIndices;
  bool ret = m_srcSA->GetCorpusIndex(&sword, &wrdIndices);
  CHECK(ret);
  std::vector<int> sntIndexes = GetSntIndexes(wrdIndices, 1, m_srcSntBreaks);
  float denom(0);
  // for each occurrence of this word
  for(size_t snt = 0; snt < sntIndexes.size(); ++snt) {
    int sntIdx = sntIndexes.at(snt); // get corpus index for sentence
    CHECK(sntIdx != -1);
    int srcWrdSntIdx = wrdIndices.at(snt) - m_srcSntBreaks.at(sntIdx); // get word index in sentence
    const std::vector<int> srcAlg = GetSentenceAlignment(sntIdx).alignedList.at(srcWrdSntIdx); // list of target words for this source word
    if(srcAlg.size() == 0) {
      ++counts[m_srcVocab->GetkOOVWordID()]; // if not aligned then align to NULL word
      ++denom;
    }
    else { //get target words aligned to srcword in this sentence
      for(size_t i=0; i < srcAlg.size(); ++i) {
        wordID_t trgWord = m_trgCorpus->at(srcAlg[i] + m_trgSntBreaks[sntIdx]);
        ++counts[trgWord];
        ++denom;
      }
    }
  }
  // now we've gotten counts of all target words aligned to this source word
  // get probs and cache all pairs
  for(std::map<wordID_t, int>::const_iterator itrCnt = counts.begin();
      itrCnt != counts.end(); ++itrCnt) {
    pair<wordID_t, wordID_t> wordPair = make_pair(srcWord, itrCnt->first);
    float srcTrgPrb = float(itrCnt->second) / float(denom); // gives p(src->trg)
    float trgSrcPrb = float(itrCnt->second) / float(counts.size()); // gives p(trg->src)
    m_wordPairCache[wordPair] = pair<float, float>(srcTrgPrb, trgSrcPrb);
  }
  std::map<wordID_t, int> counts;
  std::vector<wordID_t> sword(1, srcWord), wrdIndices;
  bool ret = m_srcSA->GetCorpusIndex(&sword, &wrdIndices);
  CHECK(ret);
  std::vector<int> sntIndexes = GetSntIndexes(wrdIndices, 1, m_srcSntBreaks);
  float denom(0);
  // for each occurrence of this word
  for(size_t snt = 0; snt < sntIndexes.size(); ++snt) {
    int sntIdx = sntIndexes.at(snt); // get corpus index for sentence
    CHECK(sntIdx != -1);
    int srcWrdSntIdx = wrdIndices.at(snt) - m_srcSntBreaks.at(sntIdx); // get word index in sentence
    const std::vector<int> srcAlg = GetSentenceAlignment(sntIdx).alignedList.at(srcWrdSntIdx); // list of target words for this source word
    if(srcAlg.size() == 0) {
      ++counts[m_srcVocab->GetkOOVWordID()]; // if not aligned then align to NULL word
      ++denom;
    } else { //get target words aligned to srcword in this sentence
      for(size_t i=0; i < srcAlg.size(); ++i) {
        wordID_t trgWord = m_trgCorpus->at(srcAlg[i] + m_trgSntBreaks[sntIdx]);
        ++counts[trgWord];
        ++denom;
      }
    }
  }
  // now we've gotten counts of all target words aligned to this source word
  // get probs and cache all pairs
  for(std::map<wordID_t, int>::const_iterator itrCnt = counts.begin();
      itrCnt != counts.end(); ++itrCnt) {
    pair<wordID_t, wordID_t> wordPair = make_pair(srcWord, itrCnt->first);
    float srcTrgPrb = float(itrCnt->second) / float(denom); // gives p(src->trg)
    float trgSrcPrb = float(itrCnt->second) / float(counts.size()); // gives p(trg->src)
    m_wordPairCache[wordPair] = pair<float, float>(srcTrgPrb, trgSrcPrb);
  }
}

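CacheWordProbs turns alignment counts into the two lexical probabilities cached per word pair: the p(src->trg) estimate is the pair count over all links from the source word (unaligned occurrences counting as links to NULL), while the p(trg->src) estimate divides by the number of distinct target words seen. A minimal sketch of the counting step only (illustrative, with made-up target ids; -1 stands in for the NULL word):

    #include <cstdio>
    #include <map>

    int main()
    {
      // Aligned target ids observed for one source word; -1 marks NULL.
      int observed[] = { 7, 7, 9, -1 };
      std::map<int, int> counts;
      float denom = 0;
      for (size_t i = 0; i < sizeof(observed) / sizeof(int); ++i) {
        ++counts[observed[i]];
        ++denom;
      }
      for (std::map<int, int>::const_iterator it = counts.begin();
           it != counts.end(); ++it) {
        float pSrcTrg = float(it->second) / denom;          // p(src->trg)
        float pTrgSrc = float(it->second) / counts.size();  // crude p(trg->src)
        std::printf("trg=%d p(src->trg)=%.2f p(trg->src)=%.2f\n",
                    it->first, pSrcTrg, pTrgSrc);
      }
      return 0;
    }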
SAPhrase BilingualDynSuffixArray::TrgPhraseFromSntIdx(const PhrasePair& phrasepair) const
SAPhrase BilingualDynSuffixArray::TrgPhraseFromSntIdx(const PhrasePair& phrasepair) const
{
  // takes sentence indexes and looks up vocab IDs
  SAPhrase phraseIds(phrasepair.GetTargetSize());
  int sntIndex = phrasepair.m_sntIndex;
  int id(-1), pos(0);
  for(int i=phrasepair.m_startTarget; i <= phrasepair.m_endTarget; ++i) { // look up trg words
    id = m_trgCorpus->at(m_trgSntBreaks[sntIndex] + i);
    phraseIds.SetId(pos++, id);
  }
  return phraseIds;
}

TargetPhrase* BilingualDynSuffixArray::GetMosesFactorIDs(const SAPhrase& phrase) const
{
  TargetPhrase* targetPhrase = new TargetPhrase(Output);
  for(size_t i=0; i < phrase.words.size(); ++i) { // look up trg words
    Word& word = m_trgVocab->GetWord( phrase.words[i]);
    CHECK(word != m_trgVocab->GetkOOVWord());
    targetPhrase->AddWord(word);
  }
  // scoring
  return targetPhrase;
  // takes sentence indexes and looks up vocab IDs
  SAPhrase phraseIds(phrasepair.GetTargetSize());
  int sntIndex = phrasepair.m_sntIndex;
  int id(-1), pos(0);
  for(int i=phrasepair.m_startTarget; i <= phrasepair.m_endTarget; ++i) { // look up trg words
    id = m_trgCorpus->at(m_trgSntBreaks[sntIndex] + i);
    phraseIds.SetId(pos++, id);
  }
  return phraseIds;
}

void BilingualDynSuffixArray::GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> > & target) const
TargetPhrase* BilingualDynSuffixArray::GetMosesFactorIDs(const SAPhrase& phrase) const
{
  TargetPhrase* targetPhrase = new TargetPhrase(Output);
  for(size_t i=0; i < phrase.words.size(); ++i) { // look up trg words
    Word& word = m_trgVocab->GetWord( phrase.words[i]);
    CHECK(word != m_trgVocab->GetkOOVWord());
    targetPhrase->AddWord(word);
  }
  // scoring
  return targetPhrase;
}

void BilingualDynSuffixArray::GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> > & target) const
{
  //cerr << "phrase is \"" << src << endl;
  size_t sourceSize = src.GetSize();
  SAPhrase localIDs(sourceSize);
  if(!GetLocalVocabIDs(src, localIDs)) return;
  float totalTrgPhrases(0);
  std::map<SAPhrase, int> phraseCounts;
  //std::map<SAPhrase, PhrasePair> phraseColl; // (one of) the word indexes this phrase was taken from
  std::map<SAPhrase, pair<float, float> > lexicalWeights;
  std::map<SAPhrase, pair<float, float> >::iterator itrLexW;
  std::vector<unsigned> wrdIndices;
  // extract sentence IDs from SA and return rightmost index of phrases
  if(!m_srcSA->GetCorpusIndex(&(localIDs.words), &wrdIndices)) return;
  size_t sourceSize = src.GetSize();
  SAPhrase localIDs(sourceSize);
  if(!GetLocalVocabIDs(src, localIDs)) return;
  float totalTrgPhrases(0);
  std::map<SAPhrase, int> phraseCounts;
  //std::map<SAPhrase, PhrasePair> phraseColl; // (one of) the word indexes this phrase was taken from
  std::map<SAPhrase, pair<float, float> > lexicalWeights;
  std::map<SAPhrase, pair<float, float> >::iterator itrLexW;
  std::vector<unsigned> wrdIndices;
  // extract sentence IDs from SA and return rightmost index of phrases
  if(!m_srcSA->GetCorpusIndex(&(localIDs.words), &wrdIndices)) return;
  SampleSelection(wrdIndices);
  std::vector<int> sntIndexes = GetSntIndexes(wrdIndices, sourceSize, m_srcSntBreaks);
  // for each sentence with this phrase
  for(size_t snt = 0; snt < sntIndexes.size(); ++snt) {
    std::vector<PhrasePair*> phrasePairs; // to store all phrases possible from current sentence
    int sntIndex = sntIndexes.at(snt); // get corpus index for sentence
    if(sntIndex == -1) continue; // bad flag set by GetSntIndexes()
    ExtractPhrases(sntIndex, wrdIndices[snt], sourceSize, phrasePairs);
    //cerr << "extracted " << phrasePairs.size() << endl;
    totalTrgPhrases += phrasePairs.size(); // keep track of count of each extracted phrase pair
    std::vector<PhrasePair*>::iterator iterPhrasePair;
    for (iterPhrasePair = phrasePairs.begin(); iterPhrasePair != phrasePairs.end(); ++iterPhrasePair) {
      SAPhrase phrase = TrgPhraseFromSntIdx(**iterPhrasePair);
      phraseCounts[phrase]++; // count each unique phrase
      // NOTE::Correct but slow to extract lexical weight here. could do
  std::vector<int> sntIndexes = GetSntIndexes(wrdIndices, sourceSize, m_srcSntBreaks);
  // for each sentence with this phrase
  for(size_t snt = 0; snt < sntIndexes.size(); ++snt) {
    std::vector<PhrasePair*> phrasePairs; // to store all phrases possible from current sentence
    int sntIndex = sntIndexes.at(snt); // get corpus index for sentence
    if(sntIndex == -1) continue; // bad flag set by GetSntIndexes()
    ExtractPhrases(sntIndex, wrdIndices[snt], sourceSize, phrasePairs);
    //cerr << "extracted " << phrasePairs.size() << endl;
    totalTrgPhrases += phrasePairs.size(); // keep track of count of each extracted phrase pair
    std::vector<PhrasePair*>::iterator iterPhrasePair;
    for (iterPhrasePair = phrasePairs.begin(); iterPhrasePair != phrasePairs.end(); ++iterPhrasePair) {
      SAPhrase phrase = TrgPhraseFromSntIdx(**iterPhrasePair);
      phraseCounts[phrase]++; // count each unique phrase
      // NOTE::Correct but slow to extract lexical weight here. could do
      // it later for only the top phrases chosen by phrase prob p(e|f)
      pair<float, float> lexWeight = GetLexicalWeight(**iterPhrasePair); // get lexical weighting for this phrase pair
      itrLexW = lexicalWeights.find(phrase); // check if phrase already has lexical weight attached
      if((itrLexW != lexicalWeights.end()) && (itrLexW->second.first < lexWeight.first))
        itrLexW->second = lexWeight; // if this lex weight is greater save it
      else lexicalWeights[phrase] = lexWeight; // else save
    }
    // done with sentence. delete SA phrase pairs
    RemoveAllInColl(phrasePairs);
  } // done with all sentences
  // convert to moses phrase pairs
  std::map<SAPhrase, int>::const_iterator iterPhrases;
  std::multimap<Scores, const SAPhrase*, ScoresComp> phraseScores (*m_scoreCmp);
  // get scores of all phrases
  for(iterPhrases = phraseCounts.begin(); iterPhrases != phraseCounts.end(); ++iterPhrases) {
    float trg2SrcMLE = float(iterPhrases->second) / totalTrgPhrases;
    itrLexW = lexicalWeights.find(iterPhrases->first);
    CHECK(itrLexW != lexicalWeights.end());
    Scores scoreVector(3);
    scoreVector[0] = trg2SrcMLE;
    scoreVector[1] = itrLexW->second.first;
    scoreVector[2] = 2.718; // exp(1);
    phraseScores.insert(make_pair(scoreVector, &iterPhrases->first));
  }
  // return top scoring phrases
  std::multimap<Scores, const SAPhrase*, ScoresComp>::reverse_iterator ritr;
  for(ritr = phraseScores.rbegin(); ritr != phraseScores.rend(); ++ritr) {
    Scores scoreVector = ritr->first;
    TargetPhrase *targetPhrase = GetMosesFactorIDs(*ritr->second);
    target.push_back(make_pair( scoreVector, targetPhrase));
    if(target.size() == m_maxSampleSize) break;
  }
      pair<float, float> lexWeight = GetLexicalWeight(**iterPhrasePair); // get lexical weighting for this phrase pair
      itrLexW = lexicalWeights.find(phrase); // check if phrase already has lexical weight attached
      if((itrLexW != lexicalWeights.end()) && (itrLexW->second.first < lexWeight.first))
        itrLexW->second = lexWeight; // if this lex weight is greater save it
      else lexicalWeights[phrase] = lexWeight; // else save
    }
    // done with sentence. delete SA phrase pairs
    RemoveAllInColl(phrasePairs);
  } // done with all sentences
  // convert to moses phrase pairs
  std::map<SAPhrase, int>::const_iterator iterPhrases;
  std::multimap<Scores, const SAPhrase*, ScoresComp> phraseScores (*m_scoreCmp);
  // get scores of all phrases
  for(iterPhrases = phraseCounts.begin(); iterPhrases != phraseCounts.end(); ++iterPhrases) {
    float trg2SrcMLE = float(iterPhrases->second) / totalTrgPhrases;
    itrLexW = lexicalWeights.find(iterPhrases->first);
    CHECK(itrLexW != lexicalWeights.end());
    Scores scoreVector(3);
    scoreVector[0] = trg2SrcMLE;
    scoreVector[1] = itrLexW->second.first;
    scoreVector[2] = 2.718; // exp(1);
    phraseScores.insert(make_pair(scoreVector, &iterPhrases->first));
  }
  // return top scoring phrases
  std::multimap<Scores, const SAPhrase*, ScoresComp>::reverse_iterator ritr;
  for(ritr = phraseScores.rbegin(); ritr != phraseScores.rend(); ++ritr) {
    Scores scoreVector = ritr->first;
    TargetPhrase *targetPhrase = GetMosesFactorIDs(*ritr->second);
    target.push_back(make_pair( scoreVector, targetPhrase));
    if(target.size() == m_maxSampleSize) break;
  }
}

std::vector<int> BilingualDynSuffixArray::GetSntIndexes(std::vector<unsigned>& wrdIndices,
    const int sourceSize, const std::vector<unsigned>& sntBreaks) const
std::vector<int> BilingualDynSuffixArray::GetSntIndexes(std::vector<unsigned>& wrdIndices,
    const int sourceSize, const std::vector<unsigned>& sntBreaks) const
{
  std::vector<unsigned>::const_iterator vit;
  std::vector<int> sntIndexes;
  for(size_t i=0; i < wrdIndices.size(); ++i) {
    vit = std::upper_bound(sntBreaks.begin(), sntBreaks.end(), wrdIndices[i]);
    int index = int(vit - sntBreaks.begin()) - 1;
    // check for phrases that cross sentence boundaries
    if(wrdIndices[i] - sourceSize + 1 < sntBreaks.at(index))
      sntIndexes.push_back(-1); // set bad flag
    else
      sntIndexes.push_back(index); // store the index of the sentence in the corpus
  }
  return sntIndexes;
  std::vector<unsigned>::const_iterator vit;
  std::vector<int> sntIndexes;
  for(size_t i=0; i < wrdIndices.size(); ++i) {
    vit = std::upper_bound(sntBreaks.begin(), sntBreaks.end(), wrdIndices[i]);
    int index = int(vit - sntBreaks.begin()) - 1;
    // check for phrases that cross sentence boundaries
    if(wrdIndices[i] - sourceSize + 1 < sntBreaks.at(index))
      sntIndexes.push_back(-1); // set bad flag
    else
      sntIndexes.push_back(index); // store the index of the sentence in the corpus
  }
  return sntIndexes;
}

int BilingualDynSuffixArray::SampleSelection(std::vector<unsigned>& sample,
    int sampleSize) const
    int sampleSize) const
{
  // only use top 'sampleSize' number of samples
  if(sample.size() > sampleSize)
    sample.erase(sample.begin()+sampleSize, sample.end());
  return sample.size();
  if(sample.size() > sampleSize)
    sample.erase(sample.begin()+sampleSize, sample.end());
  return sample.size();
}

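GetSntIndexes maps a corpus position to its sentence via std::upper_bound over the sorted sentence-start offsets, then rejects matches whose phrase span would begin before the sentence does. A standalone sketch of the lookup step (illustrative only):

    #include <algorithm>
    #include <cassert>
    #include <vector>

    // Sentence index of corpus position pos, given sorted sentence starts.
    int SentenceOf(const std::vector<unsigned> &sntBreaks, unsigned pos)
    {
      std::vector<unsigned>::const_iterator vit =
        std::upper_bound(sntBreaks.begin(), sntBreaks.end(), pos);
      return int(vit - sntBreaks.begin()) - 1;  // last start <= pos
    }

    int main()
    {
      std::vector<unsigned> breaks;  // sentences start at 0, 5, 12
      breaks.push_back(0);
      breaks.push_back(5);
      breaks.push_back(12);
      assert(SentenceOf(breaks, 4) == 0);
      assert(SentenceOf(breaks, 5) == 1);
      assert(SentenceOf(breaks, 13) == 2);
      return 0;
    }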
void BilingualDynSuffixArray::addSntPair(string& source, string& target, string& alignment) {
|
||||
void BilingualDynSuffixArray::addSntPair(string& source, string& target, string& alignment)
|
||||
{
|
||||
vuint_t srcFactor, trgFactor;
|
||||
cerr << "source, target, alignment = " << source << ", " << target << ", " << alignment << endl;
|
||||
const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
|
||||
const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
|
||||
const unsigned oldSrcCrpSize = m_srcCorpus->size(), oldTrgCrpSize = m_trgCorpus->size();
|
||||
cerr << "old source corpus size = " << oldSrcCrpSize << "\told target size = " << oldTrgCrpSize << endl;
|
||||
Phrase sphrase(ARRAY_SIZE_INCR);
|
||||
@ -471,7 +470,7 @@ void BilingualDynSuffixArray::addSntPair(string& source, string& target, string&
|
||||
cerr << "srcFactor[" << (srcFactor.size() - 1) << "] = " << srcFactor.back() << endl;
|
||||
m_srcCorpus->push_back(srcFactor.back()); // add word to corpus
|
||||
}
|
||||
m_srcSntBreaks.push_back(oldSrcCrpSize); // former end of corpus is index of new sentence
|
||||
m_srcSntBreaks.push_back(oldSrcCrpSize); // former end of corpus is index of new sentence
|
||||
m_srcVocab->MakeClosed();
|
||||
Phrase tphrase(ARRAY_SIZE_INCR);
|
||||
tphrase.CreateFromString(m_outputFactors, target, factorDelimiter);
|
||||
@ -494,16 +493,17 @@ void BilingualDynSuffixArray::addSntPair(string& source, string& target, string&
|
||||
LoadRawAlignments(alignment);
|
||||
m_trgVocab->MakeClosed();
|
||||
//for(size_t i=0; i < sphrase.GetSize(); ++i)
|
||||
//ClearWordInCache(sIDs[i]);
|
||||
|
||||
//ClearWordInCache(sIDs[i]);
|
||||
|
||||
}
|
||||
void BilingualDynSuffixArray::ClearWordInCache(wordID_t srcWord) {
|
||||
void BilingualDynSuffixArray::ClearWordInCache(wordID_t srcWord)
|
||||
{
|
||||
if(m_freqWordsCached.find(srcWord) != m_freqWordsCached.end())
|
||||
return;
|
||||
std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> >::iterator it,
|
||||
first, last;
|
||||
std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> >::iterator it,
|
||||
first, last;
|
||||
for(it = m_wordPairCache.begin(); it != m_wordPairCache.end(); ++it) {
|
||||
if(it->first.first == srcWord) { // all source words grouped
|
||||
if(it->first.first == srcWord) { // all source words grouped
|
||||
first = it; // copy first entry of srcWord
|
||||
last = it++;
|
||||
while(it != m_wordPairCache.end() && (it->first.first == srcWord)) {
|
||||
@ -513,80 +513,77 @@ void BilingualDynSuffixArray::ClearWordInCache(wordID_t srcWord) {
|
||||
m_wordPairCache.erase(first, last);
|
||||
}
|
||||
}
|
||||
SentenceAlignment::SentenceAlignment(int sntIndex, int sourceSize, int targetSize)
|
||||
:m_sntIndex(sntIndex)
|
||||
,numberAligned(targetSize, 0)
|
||||
,alignedList(sourceSize)
|
||||
SentenceAlignment::SentenceAlignment(int sntIndex, int sourceSize, int targetSize)
|
||||
:m_sntIndex(sntIndex)
|
||||
,numberAligned(targetSize, 0)
|
||||
,alignedList(sourceSize)
{
for(int i=0; i < sourceSize; ++i) {
std::vector<int> trgWrd;
alignedList[i] = trgWrd;
}
for(int i=0; i < sourceSize; ++i) {
std::vector<int> trgWrd;
alignedList[i] = trgWrd;
}
}

bool SentenceAlignment::Extract(int maxPhraseLength, std::vector<PhrasePair*> &ret, int startSource, int endSource) const
{
// foreign = target, F=T
// english = source, E=S
int countTarget = numberAligned.size();

int minTarget = 9999;
int maxTarget = -1;
std::vector< int > usedTarget = numberAligned;
for(int sourcePos = startSource; sourcePos <= endSource; sourcePos++)
{
for(int ind=0; ind < (int)alignedList[sourcePos].size();ind++)
{
int targetPos = alignedList[sourcePos][ind];
// cout << "point (" << targetPos << ", " << sourcePos << ")\n";
if (targetPos<minTarget) { minTarget = targetPos; }
if (targetPos>maxTarget) { maxTarget = targetPos; }
usedTarget[ targetPos ]--;
} // for(int ind=0;ind<sentence
} // for(int sourcePos=startSource

// cout << "f projected ( " << minTarget << "-" << maxTarget << ", " << startSource << "," << endSource << ")\n";

if (maxTarget >= 0 && // aligned to any foreign words at all
maxTarget-minTarget < maxPhraseLength)
{ // foreign phrase within limits

// check if foreign words are aligned to out of bound english words
bool out_of_bounds = false;
for(int targetPos=minTarget; targetPos <= maxTarget && !out_of_bounds; targetPos++)
{
if (usedTarget[targetPos]>0)
{
// cout << "out of bounds: " << targetPos << "\n";
out_of_bounds = true;
}
}

// cout << "doing if for ( " << minTarget << "-" << maxTarget << ", " << startSource << "," << endSource << ")\n";
if (!out_of_bounds)
{
// start point of foreign phrase may retreat over unaligned
for(int startTarget = minTarget;
(startTarget >= 0 &&
startTarget > maxTarget-maxPhraseLength && // within length limit
(startTarget==minTarget || numberAligned[startTarget]==0)); // unaligned
startTarget--)
{
// end point of foreign phrase may advance over unaligned
for (int endTarget=maxTarget;
(endTarget<countTarget &&
endTarget<startTarget+maxPhraseLength && // within length limit
(endTarget==maxTarget || numberAligned[endTarget]==0)); // unaligned
endTarget++)
{
PhrasePair *phrasePair = new PhrasePair(startTarget,endTarget,startSource,endSource, m_sntIndex);
ret.push_back(phrasePair);
} // for (int endTarget=maxTarget;
} // for(int startTarget=minTarget;
} // if (!out_of_bounds)
} // if (maxTarget >= 0 &&
return (ret.size() > 0);

// foreign = target, F=T
// english = source, E=S
int countTarget = numberAligned.size();

int minTarget = 9999;
int maxTarget = -1;
std::vector< int > usedTarget = numberAligned;
for(int sourcePos = startSource; sourcePos <= endSource; sourcePos++) {
for(int ind=0; ind < (int)alignedList[sourcePos].size(); ind++) {
int targetPos = alignedList[sourcePos][ind];
// cout << "point (" << targetPos << ", " << sourcePos << ")\n";
if (targetPos<minTarget) {
minTarget = targetPos;
}
if (targetPos>maxTarget) {
maxTarget = targetPos;
}
usedTarget[ targetPos ]--;
} // for(int ind=0;ind<sentence
} // for(int sourcePos=startSource

// cout << "f projected ( " << minTarget << "-" << maxTarget << ", " << startSource << "," << endSource << ")\n";

if (maxTarget >= 0 && // aligned to any foreign words at all
maxTarget-minTarget < maxPhraseLength) {
// foreign phrase within limits

// check if foreign words are aligned to out of bound english words
bool out_of_bounds = false;
for(int targetPos=minTarget; targetPos <= maxTarget && !out_of_bounds; targetPos++) {
if (usedTarget[targetPos]>0) {
// cout << "out of bounds: " << targetPos << "\n";
out_of_bounds = true;
}
}

// cout << "doing if for ( " << minTarget << "-" << maxTarget << ", " << startSource << "," << endSource << ")\n";
if (!out_of_bounds) {
// start point of foreign phrase may retreat over unaligned
for(int startTarget = minTarget;
(startTarget >= 0 &&
startTarget > maxTarget-maxPhraseLength && // within length limit
(startTarget==minTarget || numberAligned[startTarget]==0)); // unaligned
startTarget--) {
// end point of foreign phrase may advance over unaligned
for (int endTarget=maxTarget;
(endTarget<countTarget &&
endTarget<startTarget+maxPhraseLength && // within length limit
(endTarget==maxTarget || numberAligned[endTarget]==0)); // unaligned
endTarget++) {
PhrasePair *phrasePair = new PhrasePair(startTarget,endTarget,startSource,endSource, m_sntIndex);
ret.push_back(phrasePair);
} // for (int endTarget=maxTarget;
} // for(int startTarget=minTarget;
} // if (!out_of_bounds)
} // if (maxTarget >= 0 &&
return (ret.size() > 0);

}

} // end namespace

@ -2,70 +2,73 @@
#define moses_BilingualDynSuffixArray_h

#include "TargetPhrase.h"
#include "DynSuffixArray.h"
#include "DynSuffixArray.h"
#include "DynSAInclude/vocab.h"
#include "DynSAInclude/types.h"
#include "DynSAInclude/utils.h"
#include "InputFileStream.h"
#include "FactorTypeSet.h"

namespace Moses {
namespace Moses
{

class SAPhrase
{
public:
std::vector<wordID_t> words;

SAPhrase(size_t phraseSize)
:words(phraseSize)
{}

void SetId(size_t pos, wordID_t id)
{
std::vector<wordID_t> words;

SAPhrase(size_t phraseSize)
:words(phraseSize)
{}

void SetId(size_t pos, wordID_t id) {
CHECK(pos < words.size());
words[pos] = id;
}
bool operator<(const SAPhrase& phr2) const
{ return words < phr2.words; }
words[pos] = id;
}
bool operator<(const SAPhrase& phr2) const {
return words < phr2.words;
}
};

class PhrasePair
{
public:
int m_startTarget, m_endTarget, m_startSource, m_endSource, m_sntIndex;
PhrasePair(int startTarget, int endTarget, int startSource, int endSource, int sntIndex)
: m_startTarget(startTarget)
, m_endTarget(endTarget)
, m_startSource(startSource)
, m_endSource(endSource)
, m_sntIndex(sntIndex)
{}
int m_startTarget, m_endTarget, m_startSource, m_endSource, m_sntIndex;
PhrasePair(int startTarget, int endTarget, int startSource, int endSource, int sntIndex)
: m_startTarget(startTarget)
, m_endTarget(endTarget)
, m_startSource(startSource)
, m_endSource(endSource)
, m_sntIndex(sntIndex)
{}

size_t GetTargetSize() const
{ return m_endTarget - m_startTarget + 1; }
size_t GetTargetSize() const {
return m_endTarget - m_startTarget + 1;
}
};

class SentenceAlignment

class SentenceAlignment
{
public:
SentenceAlignment(int sntIndex, int sourceSize, int targetSize);
int m_sntIndex;
std::vector<wordID_t>* trgSnt;
std::vector<wordID_t>* srcSnt;
std::vector<int> numberAligned;
std::vector< std::vector<int> > alignedList;
bool Extract(int maxPhraseLength, std::vector<PhrasePair*> &ret, int startSource, int endSource) const;
SentenceAlignment(int sntIndex, int sourceSize, int targetSize);
int m_sntIndex;
std::vector<wordID_t>* trgSnt;
std::vector<wordID_t>* srcSnt;
std::vector<int> numberAligned;
std::vector< std::vector<int> > alignedList;
bool Extract(int maxPhraseLength, std::vector<PhrasePair*> &ret, int startSource, int endSource) const;
};
class ScoresComp {
public:
class ScoresComp
{
public:
ScoresComp(const std::vector<float>& weights): m_weights(weights) {}
bool operator()(const Scores& s1, const Scores& s2) const {
bool operator()(const Scores& s1, const Scores& s2) const {
return s1[0] < s2[0]; // just p(e|f) as approximation
/*float score1(0), score2(0);
int idx1(0), idx2(0);
for (Scores::const_iterator itr = s1.begin();
for (Scores::const_iterator itr = s1.begin();
itr != s1.end(); ++itr) {
score1 += log(*itr * m_weights.at(idx1++));
score1 += log(*itr * m_weights.at(idx1++));
}
for (Scores::const_iterator itr = s2.begin();
itr != s2.end(); ++itr) {
@ -73,73 +76,72 @@ public:
}
return score1 < score2;*/
}
private:
private:
const std::vector<float>& m_weights;
};

class BilingualDynSuffixArray {
public:
BilingualDynSuffixArray();
~BilingualDynSuffixArray();
bool Load( const std::vector<FactorType>& inputFactors,
const std::vector<FactorType>& outputTactors,
std::string source, std::string target, std::string alignments,
const std::vector<float> &weight);
void GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> >& target) const;
void CleanUp();

class BilingualDynSuffixArray
{
public:
BilingualDynSuffixArray();
~BilingualDynSuffixArray();
bool Load( const std::vector<FactorType>& inputFactors,
const std::vector<FactorType>& outputTactors,
std::string source, std::string target, std::string alignments,
const std::vector<float> &weight);
void GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> >& target) const;
void CleanUp();
void addSntPair(string& source, string& target, string& alignment);
private:
DynSuffixArray* m_srcSA;
DynSuffixArray* m_trgSA;
std::vector<wordID_t>* m_srcCorpus;
std::vector<wordID_t>* m_trgCorpus;
DynSuffixArray* m_srcSA;
DynSuffixArray* m_trgSA;
std::vector<wordID_t>* m_srcCorpus;
std::vector<wordID_t>* m_trgCorpus;
std::vector<FactorType> m_inputFactors;
std::vector<FactorType> m_outputFactors;

std::vector<unsigned> m_srcSntBreaks, m_trgSntBreaks;
std::vector<unsigned> m_srcSntBreaks, m_trgSntBreaks;

Vocab* m_srcVocab, *m_trgVocab;
ScoresComp* m_scoreCmp;
Vocab* m_srcVocab, *m_trgVocab;
ScoresComp* m_scoreCmp;

std::vector<SentenceAlignment> m_alignments;
std::vector<std::vector<short> > m_rawAlignments;
std::vector<SentenceAlignment> m_alignments;
std::vector<std::vector<short> > m_rawAlignments;

mutable std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> > m_wordPairCache;
mutable std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> > m_wordPairCache;
mutable std::set<wordID_t> m_freqWordsCached;
const size_t m_maxPhraseLength, m_maxSampleSize;
const size_t m_maxPhraseLength, m_maxSampleSize;

int LoadCorpus(InputFileStream&, const std::vector<FactorType>& factors,
std::vector<wordID_t>&, std::vector<wordID_t>&,
Vocab*);
int LoadAlignments(InputFileStream& aligs);
int LoadRawAlignments(InputFileStream& aligs);
int LoadRawAlignments(string& aligs);
int LoadCorpus(InputFileStream&, const std::vector<FactorType>& factors,
std::vector<wordID_t>&, std::vector<wordID_t>&,
Vocab*);
int LoadAlignments(InputFileStream& aligs);
int LoadRawAlignments(InputFileStream& aligs);
int LoadRawAlignments(string& aligs);

bool ExtractPhrases(const int&, const int&, const int&, std::vector<PhrasePair*>&, bool=false) const;
SentenceAlignment GetSentenceAlignment(const int, bool=false) const;
int SampleSelection(std::vector<unsigned>&, int = 300) const;
bool ExtractPhrases(const int&, const int&, const int&, std::vector<PhrasePair*>&, bool=false) const;
SentenceAlignment GetSentenceAlignment(const int, bool=false) const;
int SampleSelection(std::vector<unsigned>&, int = 300) const;

std::vector<int> GetSntIndexes(std::vector<unsigned>&, int, const std::vector<unsigned>&) const;
TargetPhrase* GetMosesFactorIDs(const SAPhrase&) const;
SAPhrase TrgPhraseFromSntIdx(const PhrasePair&) const;
bool GetLocalVocabIDs(const Phrase&, SAPhrase &) const;
void CacheWordProbs(wordID_t) const;
std::vector<int> GetSntIndexes(std::vector<unsigned>&, int, const std::vector<unsigned>&) const;
TargetPhrase* GetMosesFactorIDs(const SAPhrase&) const;
SAPhrase TrgPhraseFromSntIdx(const PhrasePair&) const;
bool GetLocalVocabIDs(const Phrase&, SAPhrase &) const;
void CacheWordProbs(wordID_t) const;
void CacheFreqWords() const;
void ClearWordInCache(wordID_t);
std::pair<float, float> GetLexicalWeight(const PhrasePair&) const;
std::pair<float, float> GetLexicalWeight(const PhrasePair&) const;

int GetSourceSentenceSize(size_t sentenceId) const
{
return (sentenceId==m_srcSntBreaks.size()-1) ?
m_srcCorpus->size() - m_srcSntBreaks.at(sentenceId) :
m_srcSntBreaks.at(sentenceId+1) - m_srcSntBreaks.at(sentenceId);
}
int GetTargetSentenceSize(size_t sentenceId) const
{
return (sentenceId==m_trgSntBreaks.size()-1) ?
m_trgCorpus->size() - m_trgSntBreaks.at(sentenceId) :
m_trgSntBreaks.at(sentenceId+1) - m_trgSntBreaks.at(sentenceId);
}
int GetSourceSentenceSize(size_t sentenceId) const {
return (sentenceId==m_srcSntBreaks.size()-1) ?
m_srcCorpus->size() - m_srcSntBreaks.at(sentenceId) :
m_srcSntBreaks.at(sentenceId+1) - m_srcSntBreaks.at(sentenceId);
}
int GetTargetSentenceSize(size_t sentenceId) const {
return (sentenceId==m_trgSntBreaks.size()-1) ?
m_trgCorpus->size() - m_trgSntBreaks.at(sentenceId) :
m_trgSntBreaks.at(sentenceId+1) - m_trgSntBreaks.at(sentenceId);
}
};
} // end namespace
#endif

@ -98,8 +98,7 @@ void ChartCell::ProcessSentence(const ChartTranslationOptionList &transOptList

// add all trans opt into queue. using only 1st child node.
ChartTranslationOptionList::const_iterator iterList;
for (iterList = transOptList.begin(); iterList != transOptList.end(); ++iterList)
{
for (iterList = transOptList.begin(); iterList != transOptList.end(); ++iterList) {
const ChartTranslationOption &transOpt = **iterList;
RuleCube *ruleCube = new RuleCube(transOpt, allChartCells, m_manager);
queue.Add(ruleCube);
@ -107,8 +106,7 @@ void ChartCell::ProcessSentence(const ChartTranslationOptionList &transOptList

// pluck things out of queue and add to hypo collection
const size_t popLimit = staticData.GetCubePruningPopLimit();
for (size_t numPops = 0; numPops < popLimit && !queue.IsEmpty(); ++numPops)
{
for (size_t numPops = 0; numPops < popLimit && !queue.IsEmpty(); ++numPops) {
ChartHypothesis *hypo = queue.Pop();
AddHypothesis(hypo);
}

@ -34,7 +34,7 @@ class Word;

class ChartCellLabel
{
public:
public:
ChartCellLabel(const WordsRange &coverage, const Word &label,
const ChartHypothesisCollection *stack=NULL)
: m_coverage(coverage)
@ -42,12 +42,17 @@
, m_stack(stack)
{}

const WordsRange &GetCoverage() const { return m_coverage; }
const Word &GetLabel() const { return m_label; }
const ChartHypothesisCollection *GetStack() const { return m_stack; }
const WordsRange &GetCoverage() const {
return m_coverage;
}
const Word &GetLabel() const {
return m_label;
}
const ChartHypothesisCollection *GetStack() const {
return m_stack;
}

bool operator<(const ChartCellLabel &other) const
{
bool operator<(const ChartCellLabel &other) const {
// m_coverage and m_label uniquely identify a ChartCellLabel, so don't
// need to compare m_stack.
if (m_coverage == other.m_coverage) {
@ -56,7 +61,7 @@ class ChartCellLabel
return m_coverage < other.m_coverage;
}

private:
private:
const WordsRange &m_coverage;
const Word &m_label;
const ChartHypothesisCollection *m_stack;

@ -34,40 +34,45 @@ class ChartHypothesisCollection;

class ChartCellLabelSet
{
private:
private:
typedef std::set<ChartCellLabel> SetType;

public:
public:
typedef SetType::const_iterator const_iterator;

ChartCellLabelSet(const WordsRange &coverage) : m_coverage(coverage) {}

const_iterator begin() const { return m_set.begin(); }
const_iterator end() const { return m_set.end(); }
const_iterator begin() const {
return m_set.begin();
}
const_iterator end() const {
return m_set.end();
}

void AddWord(const Word &w)
{
void AddWord(const Word &w) {
ChartCellLabel cellLabel(m_coverage, w);
m_set.insert(cellLabel);
}

void AddConstituent(const Word &w, const ChartHypothesisCollection &stack)
{
void AddConstituent(const Word &w, const ChartHypothesisCollection &stack) {
ChartCellLabel cellLabel(m_coverage, w, &stack);
m_set.insert(cellLabel);
}

bool Empty() const { return m_set.empty(); }
bool Empty() const {
return m_set.empty();
}

size_t GetSize() const { return m_set.size(); }
size_t GetSize() const {
return m_set.size();
}

const ChartCellLabel *Find(const Word &w) const
{
const ChartCellLabel *Find(const Word &w) const {
SetType::const_iterator p = m_set.find(ChartCellLabel(m_coverage, w));
return p == m_set.end() ? 0 : &(*p);
}

private:
private:
const WordsRange &m_coverage;
SetType m_set;
};

@ -57,15 +57,14 @@ ChartHypothesis::ChartHypothesis(const ChartTranslationOption &transOpt,
const std::vector<HypothesisDimension> &childEntries = item.GetHypothesisDimensions();
m_prevHypos.reserve(childEntries.size());
std::vector<HypothesisDimension>::const_iterator iter;
for (iter = childEntries.begin(); iter != childEntries.end(); ++iter)
{
for (iter = childEntries.begin(); iter != childEntries.end(); ++iter) {
m_prevHypos.push_back(iter->GetHypothesis());
}
}

ChartHypothesis::~ChartHypothesis()
{
// delete feature function states
// delete feature function states
for (unsigned i = 0; i < m_ffStates.size(); ++i) {
delete m_ffStates[i];
}
@ -98,8 +97,7 @@ void ChartHypothesis::CreateOutputPhrase(Phrase &outPhrase) const
size_t nonTermInd = nonTermIndexMap[pos];
const ChartHypothesis *prevHypo = m_prevHypos[nonTermInd];
prevHypo->CreateOutputPhrase(outPhrase);
}
else {
} else {
outPhrase.AddWord(word);
}
}
@ -120,20 +118,19 @@ Phrase ChartHypothesis::GetOutputPhrase() const
*/
int ChartHypothesis::RecombineCompare(const ChartHypothesis &compare) const
{
int comp = 0;
int comp = 0;
// -1 = this < compare
// +1 = this > compare
// 0 = this ==compare

for (unsigned i = 0; i < m_ffStates.size(); ++i)
{
if (m_ffStates[i] == NULL || compare.m_ffStates[i] == NULL)
for (unsigned i = 0; i < m_ffStates.size(); ++i) {
if (m_ffStates[i] == NULL || compare.m_ffStates[i] == NULL)
comp = m_ffStates[i] - compare.m_ffStates[i];
else
else
comp = m_ffStates[i]->Compare(*compare.m_ffStates[i]);

if (comp != 0)
return comp;
if (comp != 0)
return comp;
}

return 0;
@ -154,12 +151,12 @@ void ChartHypothesis::CalcScore()
const ScoreComponentCollection &scoreBreakdown = GetCurrTargetPhrase().GetScoreBreakdown();
m_scoreBreakdown.PlusEquals(scoreBreakdown);

// compute values of stateless feature functions that were not
// compute values of stateless feature functions that were not
// cached in the translation option-- there is no principled distinction

//const vector<const StatelessFeatureFunction*>& sfs =
// m_manager.GetTranslationSystem()->GetStatelessFeatureFunctions();
// TODO!
// TODO!
//for (unsigned i = 0; i < sfs.size(); ++i) {
// sfs[i]->ChartEvaluate(m_targetPhrase, &m_scoreBreakdown);
//}
@ -167,7 +164,7 @@ void ChartHypothesis::CalcScore()
const std::vector<const StatefulFeatureFunction*>& ffs =
m_manager.GetTranslationSystem()->GetStatefulFeatureFunctions();
for (unsigned i = 0; i < ffs.size(); ++i) {
m_ffStates[i] = ffs[i]->EvaluateChart(*this,i,&m_scoreBreakdown);
m_ffStates[i] = ffs[i]->EvaluateChart(*this,i,&m_scoreBreakdown);
}

m_totalScore = m_scoreBreakdown.GetWeightedScore();
@ -258,13 +255,12 @@ std::ostream& operator<<(std::ostream& out, const ChartHypothesis& hypo)
{

out << hypo.GetId();

// recombination
if (hypo.GetWinningHypothesis() != NULL &&
hypo.GetWinningHypothesis() != &hypo)
{
out << "->" << hypo.GetWinningHypothesis()->GetId();
}

// recombination
if (hypo.GetWinningHypothesis() != NULL &&
hypo.GetWinningHypothesis() != &hypo) {
out << "->" << hypo.GetWinningHypothesis()->GetId();
}

out << " " << hypo.GetCurrTargetPhrase()
//<< " " << outPhrase

@ -55,7 +55,7 @@ protected:
const ChartTranslationOption &m_transOpt;

WordsRange m_currSourceWordsRange;
std::vector<const FFState*> m_ffStates; /*! stateful feature function states */
std::vector<const FFState*> m_ffStates; /*! stateful feature function states */
ScoreComponentCollection m_scoreBreakdown /*! detailed score break-down by components (for instance language model, word penalty, etc) */
,m_lmNGram
,m_lmPrefix;
@ -94,7 +94,9 @@ public:

~ChartHypothesis();

unsigned GetId() const { return m_id; }
unsigned GetId() const {
return m_id;
}

const ChartTranslationOption &GetTranslationOption()const {
return m_transOpt;
@ -108,15 +110,17 @@ public:
inline const ChartArcList* GetArcList() const {
return m_arcList;
}
inline const FFState* GetFFState( size_t featureID ) const {
return m_ffStates[ featureID ];
}
inline const ChartManager& GetManager() const { return m_manager; }
inline const FFState* GetFFState( size_t featureID ) const {
return m_ffStates[ featureID ];
}
inline const ChartManager& GetManager() const {
return m_manager;
}

void CreateOutputPhrase(Phrase &outPhrase) const;
Phrase GetOutputPhrase() const;

int RecombineCompare(const ChartHypothesis &compare) const;
int RecombineCompare(const ChartHypothesis &compare) const;

void CalcScore();

@ -135,17 +139,17 @@ public:
return m_prevHypos;
}

const ChartHypothesis* GetPrevHypo(size_t pos) const {
return m_prevHypos[pos];
}
const ChartHypothesis* GetPrevHypo(size_t pos) const {
return m_prevHypos[pos];
}

const Word &GetTargetLHS() const {
return GetCurrTargetPhrase().GetTargetLHS();
}

const ChartHypothesis* GetWinningHypothesis() const {
return m_winningHypo;
}
const ChartHypothesis* GetWinningHypothesis() const {
return m_winningHypo;
}

TO_STRING();

@ -101,8 +101,7 @@ bool ChartHypothesisCollection::AddHypothesis(ChartHypothesis *hypo, ChartManage
VERBOSE(3,"worse than matching hyp " << hypoExisting->GetId() << ", recombining" << std::endl)
if (m_nBestIsEnabled) {
hypoExisting->AddArc(hypo);
}
else {
} else {
ChartHypothesis::Delete(hypo);
}
return false;

@ -43,7 +43,7 @@ public:
bool operator()(const ChartHypothesis* hypoA, const ChartHypothesis* hypoB) const {
// assert in same cell
const WordsRange &rangeA = hypoA->GetCurrSourceRange()
, &rangeB = hypoB->GetCurrSourceRange();
, &rangeB = hypoB->GetCurrSourceRange();
CHECK(rangeA == rangeB);

// shouldn't be mixing hypos with different lhs
@ -113,7 +113,9 @@ public:
return m_hyposOrdered;
}

float GetBestScore() const { return m_bestScore; }
float GetBestScore() const {
return m_bestScore;
}

void GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream, const std::map<unsigned,bool> &reachable) const;

@ -231,17 +231,17 @@ void ChartManager::GetSearchGraph(long translationId, std::ostream &outputSearch
{
size_t size = m_source.GetSize();

// which hypotheses are reachable?
std::map<unsigned,bool> reachable;
WordsRange fullRange(0, size-1);
const ChartCell &lastCell = m_hypoStackColl.Get(fullRange);
// which hypotheses are reachable?
std::map<unsigned,bool> reachable;
WordsRange fullRange(0, size-1);
const ChartCell &lastCell = m_hypoStackColl.Get(fullRange);
const ChartHypothesis *hypo = lastCell.GetBestHypothesis();

if (hypo == NULL) {
// no hypothesis
return;
}
FindReachableHypotheses( hypo, reachable);
FindReachableHypotheses( hypo, reachable);

for (size_t width = 1; width <= size; ++width) {
for (size_t startPos = 0; startPos <= size-width; ++startPos) {
@ -257,42 +257,40 @@ void ChartManager::GetSearchGraph(long translationId, std::ostream &outputSearch

void ChartManager::FindReachableHypotheses( const ChartHypothesis *hypo, std::map<unsigned,bool> &reachable ) const
{
// do not recurse, if already visited
if (reachable.find(hypo->GetId()) != reachable.end())
{
return;
}
// do not recurse, if already visited
if (reachable.find(hypo->GetId()) != reachable.end()) {
return;
}

// recurse
reachable[ hypo->GetId() ] = true;
const std::vector<const ChartHypothesis*> &previous = hypo->GetPrevHypos();
for(std::vector<const ChartHypothesis*>::const_iterator i = previous.begin(); i != previous.end(); ++i)
{
FindReachableHypotheses( *i, reachable );
}
// recurse
reachable[ hypo->GetId() ] = true;
const std::vector<const ChartHypothesis*> &previous = hypo->GetPrevHypos();
for(std::vector<const ChartHypothesis*>::const_iterator i = previous.begin(); i != previous.end(); ++i) {
FindReachableHypotheses( *i, reachable );
}

// also loop over recombined hypotheses (arcs)
const ChartArcList *arcList = hypo->GetArcList();
if (arcList) {
ChartArcList::const_iterator iterArc;
for (iterArc = arcList->begin(); iterArc != arcList->end(); ++iterArc) {
const ChartHypothesis &arc = **iterArc;
FindReachableHypotheses( &arc, reachable );
}
}
// also loop over recombined hypotheses (arcs)
const ChartArcList *arcList = hypo->GetArcList();
if (arcList) {
ChartArcList::const_iterator iterArc;
for (iterArc = arcList->begin(); iterArc != arcList->end(); ++iterArc) {
const ChartHypothesis &arc = **iterArc;
FindReachableHypotheses( &arc, reachable );
}
}
}

void ChartManager::CreateDeviantPaths(
boost::shared_ptr<const ChartTrellisPath> basePath,
ChartTrellisDetourQueue &q)
boost::shared_ptr<const ChartTrellisPath> basePath,
ChartTrellisDetourQueue &q)
{
CreateDeviantPaths(basePath, basePath->GetFinalNode(), q);
}

void ChartManager::CreateDeviantPaths(
boost::shared_ptr<const ChartTrellisPath> basePath,
const ChartTrellisNode &substitutedNode,
ChartTrellisDetourQueue &queue)
boost::shared_ptr<const ChartTrellisPath> basePath,
const ChartTrellisNode &substitutedNode,
ChartTrellisDetourQueue &queue)
{
const ChartArcList *arcList = substitutedNode.GetHypothesis().GetArcList();
if (arcList) {

@ -69,7 +69,7 @@ public:
void CalcNBest(size_t count, ChartTrellisPathList &ret,bool onlyDistinct=0) const;

void GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream) const;
void FindReachableHypotheses( const ChartHypothesis *hypo, std::map<unsigned,bool> &reachable ) const; /* auxiliary function for GetSearchGraph */
void FindReachableHypotheses( const ChartHypothesis *hypo, std::map<unsigned,bool> &reachable ) const; /* auxiliary function for GetSearchGraph */

const InputType& GetSource() const {
return m_source;
@ -89,7 +89,9 @@ public:
m_sentenceStats = std::auto_ptr<SentenceStats>(new SentenceStats(source));
}

unsigned GetNextHypoId() { return m_hypothesisId++; }
unsigned GetNextHypoId() {
return m_hypothesisId++;
}
};

}

@ -77,19 +77,19 @@ void ChartRuleLookupManagerMemory::GetChartRuleCollection(
// get list of all rules that apply to spans at same starting position
DottedRuleColl &dottedRuleCol = *m_dottedRuleColls[range.GetStartPos()];
const DottedRuleList &expandableDottedRuleList = dottedRuleCol.GetExpandableDottedRuleList();


const ChartCellLabel &sourceWordLabel = GetCellCollection().Get(WordsRange(absEndPos, absEndPos)).GetSourceWordLabel();

// loop through the rules
// (note that expandableDottedRuleList can be expanded as the loop runs
// (note that expandableDottedRuleList can be expanded as the loop runs
// through calls to ExtendPartialRuleApplication())
for (size_t ind = 0; ind < expandableDottedRuleList.size(); ++ind) {
// rule we are about to extend
const DottedRuleInMemory &prevDottedRule = *expandableDottedRuleList[ind];
// we will now try to extend it, starting after where it ended
size_t startPos = prevDottedRule.IsRoot()
? range.GetStartPos()
: prevDottedRule.GetWordsRange().GetEndPos() + 1;
? range.GetStartPos()
: prevDottedRule.GetWordsRange().GetEndPos() + 1;

// search for terminal symbol
// (if only one more word position needs to be covered)
@ -102,15 +102,15 @@ void ChartRuleLookupManagerMemory::GetChartRuleCollection(

// if we found a new rule -> create it and add it to the list
if (node != NULL) {
// create the rule
// create the rule
#ifdef USE_BOOST_POOL
DottedRuleInMemory *dottedRule = m_dottedRulePool.malloc();
new (dottedRule) DottedRuleInMemory(*node, sourceWordLabel,
prevDottedRule);
#else
DottedRuleInMemory *dottedRule = new DottedRuleInMemory(*node,
sourceWordLabel,
prevDottedRule);
sourceWordLabel,
prevDottedRule);
#endif
dottedRuleCol.Add(relEndPos+1, dottedRule);
}
@ -136,9 +136,7 @@ void ChartRuleLookupManagerMemory::GetChartRuleCollection(
// word.
endPos = absEndPos - 1;
stackInd = relEndPos;
}
else
{
} else {
endPos = absEndPos;
stackInd = relEndPos + 1;
}
@ -215,7 +213,7 @@ void ChartRuleLookupManagerMemory::ExtendPartialRuleApplication(
// We'll do whichever minimises the number of lookups:
if (numCombinations <= numChildren*2) {

// loop over possible source non-terminal labels (as found in input tree)
// loop over possible source non-terminal labels (as found in input tree)
NonTerminalSet::const_iterator p = sourceNonTerms.begin();
NonTerminalSet::const_iterator sEnd = sourceNonTerms.end();
for (; p != sEnd; ++p) {
@ -242,14 +240,12 @@ void ChartRuleLookupManagerMemory::ExtendPartialRuleApplication(
new (rule) DottedRuleInMemory(*child, cellLabel, prevDottedRule);
#else
DottedRuleInMemory *rule = new DottedRuleInMemory(*child, cellLabel,
prevDottedRule);
prevDottedRule);
#endif
dottedRuleColl.Add(stackInd, rule);
}
}
}
else
{
} else {
// loop over possible expansions of the rule
PhraseDictionaryNodeSCFG::NonTerminalMap::const_iterator p;
PhraseDictionaryNodeSCFG::NonTerminalMap::const_iterator end =
@ -274,7 +270,7 @@ void ChartRuleLookupManagerMemory::ExtendPartialRuleApplication(
new (rule) DottedRuleInMemory(child, *cellLabel, prevDottedRule);
#else
DottedRuleInMemory *rule = new DottedRuleInMemory(child, *cellLabel,
prevDottedRule);
prevDottedRule);
#endif
dottedRuleColl.Add(stackInd, rule);
}

@ -30,7 +30,7 @@ namespace Moses
{

void ChartTranslationOption::CalcEstimateOfBestScore(
const ChartCellCollection &allChartCells)
const ChartCellCollection &allChartCells)
{
const TargetPhrase &targetPhrase = **(m_targetPhraseCollection.begin());
m_estimateOfBestScore = targetPhrase.GetFutureScore();

@ -37,7 +37,7 @@ class ChartCellCollection;
// of translations and provides an estimate of the best score.
class ChartTranslationOption
{
public:
public:
ChartTranslationOption(const TargetPhraseCollection &targetPhraseColl,
const DottedRule &dottedRule,
const WordsRange &wordsRange,
@ -45,16 +45,17 @@ class ChartTranslationOption
: m_dottedRule(dottedRule)
, m_targetPhraseCollection(targetPhraseColl)
, m_wordsRange(wordsRange)
, m_estimateOfBestScore(0)
{
, m_estimateOfBestScore(0) {
CalcEstimateOfBestScore(allChartCells);
}

~ChartTranslationOption() {}

const DottedRule &GetDottedRule() const { return m_dottedRule; }
const DottedRule &GetDottedRule() const {
return m_dottedRule;
}

const TargetPhraseCollection &GetTargetPhraseCollection() const {
const TargetPhraseCollection &GetTargetPhraseCollection() const {
return m_targetPhraseCollection;
}

@ -65,9 +66,11 @@ class ChartTranslationOption
// return an estimate of the best score possible with this translation option.
// the estimate is the sum of the top target phrase's estimated score plus the
// scores of the best child hypotheses.
inline float GetEstimateOfBestScore() const { return m_estimateOfBestScore; }
inline float GetEstimateOfBestScore() const {
return m_estimateOfBestScore;
}

private:
private:
// not implemented
ChartTranslationOption &operator=(const ChartTranslationOption &);

@ -106,8 +106,8 @@ void ChartTranslationOptionCollection::ProcessUnknownWord(size_t startPos, size_
return;
}

if (startPos == 0 || startPos == m_source.GetSize() - 1)
{ // don't create unknown words for <S> or </S> tags. Otherwise they can be moved. Should only be translated by glue rules
if (startPos == 0 || startPos == m_source.GetSize() - 1) {
// don't create unknown words for <S> or </S> tags. Otherwise they can be moved. Should only be translated by glue rules
return;
}

@ -74,9 +74,9 @@ protected:

public:
ChartTranslationOptionCollection(InputType const& source
, const TranslationSystem* system
, const ChartCellCollection &hypoStackColl
, const std::vector<ChartRuleLookupManager*> &ruleLookupManagers);
, const TranslationSystem* system
, const ChartCellCollection &hypoStackColl
, const std::vector<ChartRuleLookupManager*> &ruleLookupManagers);
virtual ~ChartTranslationOptionCollection();
void CreateTranslationOptionsForRange(size_t startPos
, size_t endPos);

@ -66,12 +66,11 @@ void ChartTranslationOptionList::Add(const TargetPhraseCollection &targetPhraseC
if (m_collection.size() < ruleLimit) {
// not yet filled out quota. add everything
ChartTranslationOption *option = new ChartTranslationOption(
targetPhraseCollection, dottedRule, m_range, chartCellColl);
targetPhraseCollection, dottedRule, m_range, chartCellColl);
m_collection.push_back(option);
float score = option->GetEstimateOfBestScore();
m_scoreThreshold = (score < m_scoreThreshold) ? score : m_scoreThreshold;
}
else {
} else {
// full but not bursting. add if better than worst score
ChartTranslationOption option(targetPhraseCollection, dottedRule,
m_range, chartCellColl);

@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh


This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.


This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.


You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ -27,15 +27,15 @@ namespace Moses
{

ChartTrellisDetour::ChartTrellisDetour(
boost::shared_ptr<const ChartTrellisPath> basePath,
const ChartTrellisNode &substitutedNode,
const ChartHypothesis &replacementHypo)
boost::shared_ptr<const ChartTrellisPath> basePath,
const ChartTrellisNode &substitutedNode,
const ChartHypothesis &replacementHypo)
: m_basePath(basePath)
, m_substitutedNode(substitutedNode)
, m_replacementHypo(replacementHypo)
{
float diff = replacementHypo.GetTotalScore()
- substitutedNode.GetHypothesis().GetTotalScore();
- substitutedNode.GetHypothesis().GetTotalScore();
m_totalScore = basePath->GetTotalScore() + diff;
}

@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh


This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.


This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.


You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ -29,20 +29,24 @@ class ChartTrellisPath;

class ChartTrellisDetour
{
public:
public:
ChartTrellisDetour(boost::shared_ptr<const ChartTrellisPath>,
const ChartTrellisNode &, const ChartHypothesis &);

const ChartTrellisPath &GetBasePath() const { return *m_basePath; }
const ChartTrellisPath &GetBasePath() const {
return *m_basePath;
}
const ChartTrellisNode &GetSubstitutedNode() const {
return m_substitutedNode;
}
const ChartHypothesis &GetReplacementHypo() const {
return m_replacementHypo;
}
float GetTotalScore() const { return m_totalScore; }
float GetTotalScore() const {
return m_totalScore;
}

private:
private:
boost::shared_ptr<const ChartTrellisPath> m_basePath;
const ChartTrellisNode &m_substitutedNode;
const ChartHypothesis &m_replacementHypo;

@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh


This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.


This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.


You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ -21,13 +21,16 @@

#include "Util.h"

namespace Moses {
namespace Moses
{

ChartTrellisDetourQueue::~ChartTrellisDetourQueue() {
ChartTrellisDetourQueue::~ChartTrellisDetourQueue()
{
RemoveAllInColl(m_queue);
}

void ChartTrellisDetourQueue::Push(const ChartTrellisDetour *detour) {
void ChartTrellisDetourQueue::Push(const ChartTrellisDetour *detour)
{
if (m_capacity == 0 || m_queue.size() < m_capacity) {
m_queue.insert(detour);
} else if (detour->GetTotalScore() > (*m_queue.rbegin())->GetTotalScore()) {
@ -43,7 +46,8 @@ void ChartTrellisDetourQueue::Push(const ChartTrellisDetour *detour) {
}
}

const ChartTrellisDetour *ChartTrellisDetourQueue::Pop() {
const ChartTrellisDetour *ChartTrellisDetourQueue::Pop()
{
QueueType::iterator p = m_queue.begin();
const ChartTrellisDetour *top = *p;
m_queue.erase(p);

@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh


This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.


This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.


You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ -23,19 +23,23 @@

#include <set>

namespace Moses {
namespace Moses
{

// A bounded priority queue of ChartTrellisDetour pointers. The top item is
// the best scoring detour. The queue assumes ownership of pushed items and
// relinquishes ownership when they are popped. Any remaining items at the
// time of the queue's destruction are deleted.
class ChartTrellisDetourQueue {
public:
class ChartTrellisDetourQueue
{
public:
// Create empty queue with fixed capacity of c. Capacity 0 means unbounded.
ChartTrellisDetourQueue(size_t c) : m_capacity(c) {}
~ChartTrellisDetourQueue();

bool Empty() const { return m_queue.empty(); }
bool Empty() const {
return m_queue.empty();
}

// Add the detour to the queue or delete it if the queue is full and the
// score is no better than the queue's worst score.
@ -45,7 +49,7 @@ class ChartTrellisDetourQueue {
// caller is responsible for deleting the object.
const ChartTrellisDetour *Pop();

private:
private:
struct DetourOrderer {
bool operator()(const ChartTrellisDetour* a,
const ChartTrellisDetour* b) const {

@ -31,16 +31,16 @@ namespace Moses
{

ChartTrellisNode::ChartTrellisNode(const ChartHypothesis &hypo)
: m_hypo(hypo)
: m_hypo(hypo)
{
CreateChildren();
}

ChartTrellisNode::ChartTrellisNode(const ChartTrellisDetour &detour,
ChartTrellisNode *&deviationPoint)
: m_hypo((&detour.GetBasePath().GetFinalNode() == &detour.GetSubstitutedNode())
? detour.GetReplacementHypo()
: detour.GetBasePath().GetFinalNode().GetHypothesis())
: m_hypo((&detour.GetBasePath().GetFinalNode() == &detour.GetSubstitutedNode())
? detour.GetReplacementHypo()
: detour.GetBasePath().GetFinalNode().GetHypothesis())
{
if (&m_hypo == &detour.GetReplacementHypo()) {
deviationPoint = this;
@ -56,9 +56,9 @@ ChartTrellisNode::ChartTrellisNode(const ChartTrellisNode &root,
const ChartTrellisNode &substitutedNode,
const ChartHypothesis &replacementHypo,
ChartTrellisNode *&deviationPoint)
: m_hypo((&root == &substitutedNode)
? replacementHypo
: root.GetHypothesis())
: m_hypo((&root == &substitutedNode)
? replacementHypo
: root.GetHypothesis())
{
if (&root == &substitutedNode) {
deviationPoint = this;
@ -124,8 +124,8 @@ void ChartTrellisNode::CreateChildren(const ChartTrellisNode &rootNode,
for (size_t ind = 0; ind < children.size(); ++ind) {
const ChartTrellisNode *origChild = children[ind];
ChartTrellisNode *child = new ChartTrellisNode(*origChild, substitutedNode,
replacementHypo,
deviationPoint);
replacementHypo,
deviationPoint);
m_children.push_back(child);
}
}

@ -32,7 +32,7 @@ class ChartTrellisDetour;

class ChartTrellisNode
{
public:
public:
typedef std::vector<ChartTrellisNode*> NodeChildren;

ChartTrellisNode(const ChartHypothesis &hypo);
@ -40,15 +40,21 @@ class ChartTrellisNode

~ChartTrellisNode();

const ChartHypothesis &GetHypothesis() const { return m_hypo; }
const ChartHypothesis &GetHypothesis() const {
return m_hypo;
}

const NodeChildren &GetChildren() const { return m_children; }
const NodeChildren &GetChildren() const {
return m_children;
}

const ChartTrellisNode &GetChild(size_t i) const { return *m_children[i]; }
const ChartTrellisNode &GetChild(size_t i) const {
return *m_children[i];
}

Phrase GetOutputPhrase() const;

private:
private:
ChartTrellisNode(const ChartTrellisNode &); // Not implemented
ChartTrellisNode& operator=(const ChartTrellisNode &); // Not implemented

@ -30,17 +30,17 @@ namespace Moses
{

ChartTrellisPath::ChartTrellisPath(const ChartHypothesis &hypo)
: m_finalNode(new ChartTrellisNode(hypo))
, m_deviationPoint(NULL)
, m_scoreBreakdown(hypo.GetScoreBreakdown())
, m_totalScore(hypo.GetTotalScore())
: m_finalNode(new ChartTrellisNode(hypo))
, m_deviationPoint(NULL)
, m_scoreBreakdown(hypo.GetScoreBreakdown())
, m_totalScore(hypo.GetTotalScore())
{
}

ChartTrellisPath::ChartTrellisPath(const ChartTrellisDetour &detour)
: m_finalNode(new ChartTrellisNode(detour, m_deviationPoint))
, m_scoreBreakdown(detour.GetBasePath().m_scoreBreakdown)
, m_totalScore(0)
: m_finalNode(new ChartTrellisNode(detour, m_deviationPoint))
, m_scoreBreakdown(detour.GetBasePath().m_scoreBreakdown)
, m_totalScore(0)
{
CHECK(m_deviationPoint);
ScoreComponentCollection scoreChange;

@ -36,18 +36,24 @@ class ChartTrellisNode;

class ChartTrellisPath
{
public:
public:
ChartTrellisPath(const ChartHypothesis &hypo);
ChartTrellisPath(const ChartTrellisDetour &detour);

~ChartTrellisPath();

const ChartTrellisNode &GetFinalNode() const { return *m_finalNode; }
const ChartTrellisNode &GetFinalNode() const {
return *m_finalNode;
}

const ChartTrellisNode *GetDeviationPoint() const { return m_deviationPoint; }
const ChartTrellisNode *GetDeviationPoint() const {
return m_deviationPoint;
}

//! get score for this path through the trellis
float GetTotalScore() const { return m_totalScore; }
float GetTotalScore() const {
return m_totalScore;
}

Phrase GetOutputPhrase() const;

@ -56,7 +62,7 @@ class ChartTrellisPath
return m_scoreBreakdown;
}

private:
private:
ChartTrellisPath(const ChartTrellisPath &); // Not implemented
ChartTrellisPath &operator=(const ChartTrellisPath &); // Not implemented

@ -32,26 +32,38 @@ class DottedRule
{
friend std::ostream& operator<<(std::ostream &, const DottedRule &);

public:
public:
// used only to init dot stack.
DottedRule()
: m_cellLabel(NULL)
, m_prev(NULL) {}
: m_cellLabel(NULL)
, m_prev(NULL) {}

DottedRule(const ChartCellLabel &ccl, const DottedRule &prev)
: m_cellLabel(&ccl)
, m_prev(&prev) {}
: m_cellLabel(&ccl)
, m_prev(&prev) {}

const WordsRange &GetWordsRange() const { return m_cellLabel->GetCoverage(); }
const Word &GetSourceWord() const { return m_cellLabel->GetLabel(); }
bool IsNonTerminal() const { return m_cellLabel->GetLabel().IsNonTerminal(); }
const DottedRule *GetPrev() const { return m_prev; }
bool IsRoot() const { return m_prev == NULL; }
const ChartCellLabel &GetChartCellLabel() const { return *m_cellLabel; }
const WordsRange &GetWordsRange() const {
return m_cellLabel->GetCoverage();
}
const Word &GetSourceWord() const {
return m_cellLabel->GetLabel();
}
bool IsNonTerminal() const {
return m_cellLabel->GetLabel().IsNonTerminal();
}
const DottedRule *GetPrev() const {
return m_prev;
}
bool IsRoot() const {
return m_prev == NULL;
}
const ChartCellLabel &GetChartCellLabel() const {
return *m_cellLabel;
}

private:
private:
const ChartCellLabel *m_cellLabel; // usually contains something, unless
// it's the init processed rule
// it's the init processed rule
const DottedRule *m_prev;
};

@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh


This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.


This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.


You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

@ -34,21 +34,23 @@ namespace Moses

class DottedRuleInMemory : public DottedRule
{
public:
public:
// used only to init dot stack.
explicit DottedRuleInMemory(const PhraseDictionaryNodeSCFG &node)
: DottedRule()
, m_node(node) {}
: DottedRule()
, m_node(node) {}

DottedRuleInMemory(const PhraseDictionaryNodeSCFG &node,
const ChartCellLabel &cellLabel,
const DottedRuleInMemory &prev)
: DottedRule(cellLabel, prev)
, m_node(node) {}

const PhraseDictionaryNodeSCFG &GetLastNode() const { return m_node; }
: DottedRule(cellLabel, prev)
, m_node(node) {}

private:
const PhraseDictionaryNodeSCFG &GetLastNode() const {
return m_node;
}

private:
const PhraseDictionaryNodeSCFG &m_node;
};

@ -34,26 +34,32 @@ namespace Moses
{
class DottedRuleOnDisk : public DottedRule
{
public:
public:
// used only to init dot stack.
explicit DottedRuleOnDisk(const OnDiskPt::PhraseNode &lastNode)
: DottedRule()
, m_lastNode(lastNode)
, m_done(false) {}
: DottedRule()
, m_lastNode(lastNode)
, m_done(false) {}

DottedRuleOnDisk(const OnDiskPt::PhraseNode &lastNode,
const ChartCellLabel &cellLabel,
const DottedRuleOnDisk &prev)
: DottedRule(cellLabel, prev)
, m_lastNode(lastNode)
, m_done(false) {}
: DottedRule(cellLabel, prev)
, m_lastNode(lastNode)
, m_done(false) {}

const OnDiskPt::PhraseNode &GetLastNode() const { return m_lastNode; }
const OnDiskPt::PhraseNode &GetLastNode() const {
return m_lastNode;
}

bool Done() const { return m_done; }
void Done(bool value) const { m_done = value; }
bool Done() const {
return m_done;
}
void Done(bool value) const {
m_done = value;
}

private:
private:
const OnDiskPt::PhraseNode &m_lastNode;
mutable bool m_done;
};

@ -36,9 +36,9 @@ public:
const ChartHypothesis&,
int /* featureID */,
ScoreComponentCollection*) const {
CHECK(0); // feature function not valid in chart decoder
return NULL;
}
CHECK(0); // feature function not valid in chart decoder
return NULL;
}
};

/** Doesn't do anything but provide a key into the global

@ -22,176 +22,179 @@
#include <ctime>
#include <iostream>

namespace randlm {

template<typename T>
class CacheNode {
public:
typedef std::map<wordID_t, CacheNode<T>* > childMap;
// initialise value to 'unknown' (i.e. not yet queried or cached).
CacheNode(T unknown_value) : value_(unknown_value) {}
childMap childs_; // child pointers
T value_; // value stored
const void* state_; // state pointer
};

template<typename T>
class Cache {
public:
typedef typename std::map<wordID_t, CacheNode<T>* >::iterator childPtr;
// unknown_value is used to indicate the ngram was not queried (yet)
// null_value_ indicates it was queried but not found in model
// space usage is handled by client.
Cache(T unknown_value, T null_value) :
cur_nodes_(0), unknown_value_(unknown_value), null_value_(null_value) {
root_ = newNode();
}
~Cache() {
if(clear()) {
delete root_;
root_ = NULL;
} else {
std::cerr << "Error freeing cache memory.\n";
}
}
bool setCacheNgram(const wordID_t* ngram, int len, T value, const void* state) {
// inserts full ngram into cache
CacheNode<T>* node = root_;
for (int i = len - 1; i > -1; --i) {
childPtr child = node->childs_.find(ngram[i]);
if( child != node->childs_.end() ) {
// current node is already prefix. Go to child node
node = node->childs_[ngram[i]];
} else {
// no child for prefix. set new child link in current node
CacheNode<T> * newChild = newNode(node);
node->childs_[ngram[i]] = newChild;
// go to new node
node = newChild;
}
}
node->value_ = value;
node->state_ = state;
return true;
}
bool checkCacheNgram(const wordID_t* ngram, int len, T* value, const void** state) {
// finds value for this full ngram only (returns false if full ngram not in cache)
CacheNode<T> * node = root_;
for(int i = len - 1; i > -1; --i) {
// go to deepest level node of ngram in cache
childPtr child = node->childs_.find(ngram[i]);
if( child != node->childs_.end() ) {
// switch to child node
node = node->childs_[ngram[i]];
} else {
// not cached
return false;
}
}
*value = node->value_;
if(state) *state = node->state_;
return *value != null_value_ && *value != unknown_value_;
}
int getCache2(const wordID_t* ngram, int len, T** values, int* found) {
// set values array to point to cache value nodes
CacheNode<T> * node = root_;
*found = 0;
//values[0] = &node->value_; // pointer to root node's value
bool all_found = true;
for(int i = len - 1; i > -1; --i) {
// go to deepest level node of ngram in cache
childPtr child = node->childs_.find(ngram[i]);
if( child != node->childs_.end() ) {
// switch to child node
node = node->childs_[ngram[i]];
// get pointer to value (index by length - 1)
values[i] = &node->value_;
// if null_value then assume all extensions impossible
if (node->value_ == null_value_) {
return len - 1 - i; // max length possible
}
all_found = all_found && (node->value_ != unknown_value_);
if (all_found)
++(*found);
} else {
// initialise uncached values
CacheNode<T> * newChild = newNode(node);
node->childs_[ngram[i]] = newChild;
// go to new node
node = newChild;
values[i] = &node->value_;
}
}
return len; // all possible
}
int getCache(const wordID_t* ngram, int len, T** values, int* found) {
// get pointers to values for ngram and constituents.
// returns upper bound on longest subngram in model.
// 'found' stores longest non-null and known value found.
CacheNode<T> * node = root_;
*found = 0;
values[0] = &node->value_; // pointer to root node's value
bool all_found = true;
for(int i = len - 1; i > -1; --i) {
// go to deepest level node of ngram in cache
childPtr child = node->childs_.find(ngram[i]);
if( child != node->childs_.end() ) {
// switch to child node
node = node->childs_[ngram[i]];
// get pointer to value (index by length - 1)
values[len - i] = &node->value_;
// if null_value then assume all extensions impossible
if (node->value_ == null_value_)
return len - 1 - i; // max length possible
all_found = all_found && (node->value_ != unknown_value_);
if (all_found)
++(*found);
} else {
// initialise uncached values
CacheNode<T> * newChild = newNode(node);
node->childs_[ngram[i]] = newChild;
// go to new node
node = newChild;
values[len - i] = &node->value_;
}
}
return len; // all possible
}
bool clear() {
std::cerr << "Clearing cache with " << static_cast<float>(cur_nodes_ * nodeSize())
/ static_cast<float>(1ull << 20) << "MB" << std::endl;
return clearNodes(root_);
}
int nodes() {
// returns number of nodes
return cur_nodes_;
}
int nodeSize() {
return sizeof(CacheNode<T>) + sizeof(root_->childs_);
}
private:
CacheNode<T> * root_;
count_t cur_nodes_;
T unknown_value_; // Used to initialise data at each node
T null_value_; // Indicates cached something not in model
CacheNode<T>* newNode(CacheNode<T> * node = 0) {
++cur_nodes_;
return new CacheNode<T>(unknown_value_);
}
bool clearNodes(CacheNode<T> * node) {
//delete children from this node
if(!node->childs_.empty()) {
iterate(node->childs_, itr) {
if(!clearNodes(itr->second))
std::cerr << "Error emptying cache\n";
delete itr->second;
--cur_nodes_;
}
node->childs_.clear();
}
return true;
}
|
||||
namespace randlm
|
||||
{
|
||||
|
||||
};
|
||||
template<typename T>
class CacheNode
{
public:
  typedef std::map<wordID_t, CacheNode<T>* > childMap;
  // initialise value to 'unknown' (i.e. not yet queried or cached).
  CacheNode(T unknown_value) : value_(unknown_value) {}
  childMap childs_;    // child pointers
  T value_;            // value stored
  const void* state_;  // state pointer
};

template<typename T>
class Cache
{
public:
  typedef typename std::map<wordID_t, CacheNode<T>* >::iterator childPtr;
  // unknown_value is used to indicate the ngram was not queried (yet)
  // null_value_ indicates it was queried but not found in model
  // space usage is handled by client.
  Cache(T unknown_value, T null_value) :
    cur_nodes_(0), unknown_value_(unknown_value), null_value_(null_value) {
    root_ = newNode();
  }
  ~Cache() {
    if(clear()) {
      delete root_;
      root_ = NULL;
    } else {
      std::cerr << "Error freeing cache memory.\n";
    }
  }
  bool setCacheNgram(const wordID_t* ngram, int len, T value, const void* state) {
    // inserts full ngram into cache
    CacheNode<T>* node = root_;
    for (int i = len - 1; i > -1; --i) {
      childPtr child = node->childs_.find(ngram[i]);
      if( child != node->childs_.end() ) {
        // current node is already prefix. Go to child node
        node = node->childs_[ngram[i]];
      } else {
        // no child for prefix. set new child link in current node
        CacheNode<T> * newChild = newNode(node);
        node->childs_[ngram[i]] = newChild;
        // go to new node
        node = newChild;
      }
    }
    node->value_ = value;
    node->state_ = state;
    return true;
  }
  bool checkCacheNgram(const wordID_t* ngram, int len, T* value, const void** state) {
    // finds value for this full ngram only (returns false if full ngram not in cache)
    CacheNode<T> * node = root_;
    for(int i = len - 1; i > -1; --i) {
      // go to deepest level node of ngram in cache
      childPtr child = node->childs_.find(ngram[i]);
      if( child != node->childs_.end() ) {
        // switch to child node
        node = node->childs_[ngram[i]];
      } else {
        // not cached
        return false;
      }
    }
    *value = node->value_;
    if(state) *state = node->state_;
    return *value != null_value_ && *value != unknown_value_;
  }
  int getCache2(const wordID_t* ngram, int len, T** values, int* found) {
    // set values array to point to cache value nodes
    CacheNode<T> * node = root_;
    *found = 0;
    //values[0] = &node->value_; // pointer to root node's value
    bool all_found = true;
    for(int i = len - 1; i > -1; --i) {
      // go to deepest level node of ngram in cache
      childPtr child = node->childs_.find(ngram[i]);
      if( child != node->childs_.end() ) {
        // switch to child node
        node = node->childs_[ngram[i]];
        // get pointer to value (index by length - 1)
        values[i] = &node->value_;
        // if null_value then assume all extensions impossible
        if (node->value_ == null_value_) {
          return len - 1 - i; // max length possible
        }
        all_found = all_found && (node->value_ != unknown_value_);
        if (all_found)
          ++(*found);
      } else {
        // initialise uncached values
        CacheNode<T> * newChild = newNode(node);
        node->childs_[ngram[i]] = newChild;
        // go to new node
        node = newChild;
        values[i] = &node->value_;
      }
    }
    return len; // all possible
  }
  int getCache(const wordID_t* ngram, int len, T** values, int* found) {
    // get pointers to values for ngram and constituents.
    // returns upper bound on longest subngram in model.
    // 'found' stores longest non-null and known value found.
    CacheNode<T> * node = root_;
    *found = 0;
    values[0] = &node->value_; // pointer to root node's value
    bool all_found = true;
    for(int i = len - 1; i > -1; --i) {
      // go to deepest level node of ngram in cache
      childPtr child = node->childs_.find(ngram[i]);
      if( child != node->childs_.end() ) {
        // switch to child node
        node = node->childs_[ngram[i]];
        // get pointer to value (index by length - 1)
        values[len - i] = &node->value_;
        // if null_value then assume all extensions impossible
        if (node->value_ == null_value_)
          return len - 1 - i; // max length possible
        all_found = all_found && (node->value_ != unknown_value_);
        if (all_found)
          ++(*found);
      } else {
        // initialise uncached values
        CacheNode<T> * newChild = newNode(node);
        node->childs_[ngram[i]] = newChild;
        // go to new node
        node = newChild;
        values[len - i] = &node->value_;
      }
    }
    return len; // all possible
  }
  bool clear() {
    std::cerr << "Clearing cache with " << static_cast<float>(cur_nodes_ * nodeSize())
              / static_cast<float>(1ull << 20) << "MB" << std::endl;
    return clearNodes(root_);
  }
  int nodes() {
    // returns number of nodes
    return cur_nodes_;
  }
  int nodeSize() {
    return sizeof(CacheNode<T>) + sizeof(root_->childs_);
  }
private:
  CacheNode<T> * root_;
  count_t cur_nodes_;
  T unknown_value_; // Used to initialise data at each node
  T null_value_;    // Indicates cached something not in model
  CacheNode<T>* newNode(CacheNode<T> * node = 0) {
    ++cur_nodes_;
    return new CacheNode<T>(unknown_value_);
  }
  bool clearNodes(CacheNode<T> * node) {
    //delete children from this node
    if(!node->childs_.empty()) {
      iterate(node->childs_, itr) {
        if(!clearNodes(itr->second))
          std::cerr << "Error emptying cache\n";
        delete itr->second;
        --cur_nodes_;
      }
      node->childs_.clear();
    }
    return true;
  }

};
} //end namespace
#endif //INC_RANDLM_CACHE_H
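// ---------------------------------------------------------------------------
// Added usage sketch (not part of the original diff): how this trie-shaped
// cache is driven. Ngrams are keyed right-to-left, and the unknown/null
// sentinels (here the 8888.8888/9999.9999 floats used by OnlineRLM later in
// this commit) distinguish "never queried" from "queried but not in model".
//
//   randlm::Cache<float> cache(8888.8888, 9999.9999); // unknown, null
//   wordID_t ngram[3] = {12, 7, 42};                  // hypothetical ids
//   cache.setCacheNgram(ngram, 3, -1.25f, NULL);      // cache a log-prob
//   float value = 0;
//   if (cache.checkCacheNgram(ngram, 3, &value, NULL))
//     ; // hit: value is neither unknown nor null
// ---------------------------------------------------------------------------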

@ -20,295 +20,306 @@
#include <cmath>
#include "file.h"

namespace randlm {

// Class Filter wraps a contiguous array of data. Filter and its subclasses
// implement read/write/increment functionality on arrays with arbitrary sized addresses
// (i.e. an address may not use a full number of bytes). When converting to byte-based
// representation we assume "unused" bits are to the left.
// E.g. if the underlying data is stored in units T = uint16 and the 'width' = 11,
// to read 'address' = 3 we extract bits at indices [33,43] (i.e. [11*3, 11*4 - 1])
// and store them in the low 11 bits of a uint16, with the top 5 bits
// masked out.
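// Worked example of the layout above (added for illustration; not part of
// the original header). With T = uint16 (cell_width_ = 16) and width = 11,
// reading address 3 in read() below computes:
//   data_bit   = 3 * 11 = 33;          // first bit of the address
//   data_cell  = 33 >> 4 = 2;          // the 16-bit cell holding bit 33
//   first_bit_ = 16 - (11 % 16) = 5;   // left padding inside a cell
//   offset     = (33 % 16) - 5 = -4;   // address sits left of target position
// so the offset < 0 branch returns (data_[2] >> 4) & address_mask_,
// where address_mask_ = 0xffff >> 5 = 0x07ff keeps the 11 address bits.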
template<typename T>
class Filter {
public:
  Filter(uint64_t addresses, int width) : addresses_(addresses), width_(width), data_(NULL) {
    // number of bits in T
    cell_width_ = sizeof(T) << 3;
    // current implementation has following constraints
    CHECK(cell_width_ > 0 && cell_width_ <= 64 && cell_width_ >= width);
    // used for >> division
    log_cell_width_ = static_cast<int>(floor(log(cell_width_)/log(2) + 0.000001));
    // size of underlying data in Ts
    cells_ = ((addresses * width) + cell_width_ - 1) >> log_cell_width_;
    // instantiate underlying data
    data_ = new T[cells_];
    CHECK(data_ != NULL);
    CHECK(reset());
    // 'first_bit' marks the first bit used by 'address' (left padded with zeros).
    first_bit_ = (width % cell_width_ == 0) ? 0 : cell_width_ - (width % cell_width_);
    // mask for full cell
    full_mask_ = static_cast<T>(0xffffffffffffffffull);
    // mask for bits that make up the address
    address_mask_ = full_mask_ >> first_bit_;
  }
  Filter(FileHandler* fin, bool loaddata = true) : data_(NULL) {
    CHECK(loadHeader(fin));
    if (loaddata)
      CHECK(loadData(fin));
  }
  virtual ~Filter() {
    delete[] data_;
  }
  bool reset() {
    for (uint64_t i = 0; i < cells_; ++i)
      data_[i] = 0;
    return true;
  }
  count_t size() {
    // return approx size of filter in MBs
    return cells_ * sizeof(T) >> 20;
  }
  // read / write functions
  inline bool read(uint64_t address, T* value) {
    CHECK(address <= addresses_);
    // copy address to 'value'
    uint64_t data_bit = address * width_;
    uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
    // 'offset' shows how address in 'data' and 'value' align
    int offset = (data_bit % cell_width_) - first_bit_;
    // they align so just copy across masking unneeded leading bits
    if (offset == 0) {
      *value = data_[data_cell] & address_mask_;
      return true;
    }
    // data address starts to left so shift it right
    if (offset < 0) {
      *value = (data_[data_cell] >> -offset) & address_mask_;
      return true;
    }
    // data address is to right so shift it left and look at one more cell to right
    *value = ((data_[data_cell] << offset)
              | (data_[data_cell + 1] >> (cell_width_ - offset))) & address_mask_;
    return true;
  }
  inline T read(uint64_t address) {
    CHECK(address <= addresses_);
    // return value at address
    T value = 0;
    uint64_t data_bit = address * width_;
    uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
    // 'offset' shows how address in 'data' and 'value' align
    int offset = (data_bit % cell_width_) - first_bit_;
    // they align so just copy across masking unneeded leading bits
    if (offset == 0) {
      value = data_[data_cell] & address_mask_;
    }
    // data address starts to left so shift it right
    else if (offset < 0) {
      value = (data_[data_cell] >> -offset) & address_mask_;
    }
    // data address is to right so shift it left and look at one more cell to right
    else
      value = ((data_[data_cell] << offset)
               | (data_[data_cell + 1] >> (cell_width_ - offset))) & address_mask_;
    return value;
  }
  inline bool write(uint64_t address, T value) {
    CHECK(address <= addresses_);
    CHECK(log2(value) <= width_);
    // write 'value' to address
    uint64_t data_bit = address * width_;
    uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
    // 'offset' shows how address in 'data' and 'value' align
    int offset = (data_bit % cell_width_) - first_bit_;
    // they align so just copy across masking unneeded leading zeros of value
    if (offset == 0) {
      data_[data_cell] = value | (data_[data_cell] & ~address_mask_);
      return true;
    }
    // address in data is to left so shift value left by -offset
    if (offset < 0) {
      data_[data_cell] = (value << -offset)
                         | (data_[data_cell] & ~(address_mask_ << -offset));
      return true;
    }
    // address in data is to right so shift value right by offset
    data_[data_cell] = (value >> offset) |
                       (data_[data_cell] & ~(address_mask_ >> offset));
    data_[data_cell + 1] = (value << (cell_width_ - offset)) |
                           (data_[data_cell + 1] & (full_mask_ >> offset));
    return true;
  }
  inline bool readWithFingerprint(uint64_t address, T finger, T* value) {
    // copy 'address' ^ 'finger' to 'value'
    uint64_t data_bit = address * width_;
    uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
    // 'offset' shows how address in 'data' and 'value' align
    int offset = (data_bit % cell_width_) - first_bit_;
    // they align so just copy across masking unneeded leading bits
    if (offset == 0) {
      *value = (finger ^ data_[data_cell]) & address_mask_;
      return true;
    }
    // data address starts to left so shift it right
    if (offset < 0) {
      *value = ((data_[data_cell] >> -offset) ^ finger) & address_mask_;
      return true;
    }
    // data address is to right so shift it left and look at one more cell to right
    *value = (((data_[data_cell] << offset)
               | (data_[data_cell + 1] >> (cell_width_ - offset))) ^ finger)
             & address_mask_;
    return true;
  }
  inline bool writeWithFingerprint(uint64_t address, T finger, T value) {
    // write 'value' ^ 'finger' to address
    finger &= address_mask_; // make sure fingerprint is correct size
    uint64_t data_bit = address * width_;
    uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
    // 'offset' shows how address in 'data' and 'value' align
    int offset = (data_bit % cell_width_) - first_bit_;
    // they align so just copy across masking unneeded leading zeros of value
    if (offset == 0) {
      data_[data_cell] = (finger ^ value) | (data_[data_cell] & ~address_mask_);
      return true;
    }
    // address in data is to left so shift value left by -offset
    if (offset < 0) {
      data_[data_cell] = ((finger ^ value) << -offset)
                         | (data_[data_cell] & ~(address_mask_ << -offset));
      return true;
    }
    // address in data is to right so shift value right by offset
    data_[data_cell] = ((finger ^ value) >> offset) |
                       (data_[data_cell] & ~(address_mask_ >> offset));
    data_[data_cell + 1] = ((finger ^ value) << (cell_width_ - offset)) |
                           (data_[data_cell + 1] & (full_mask_ >> offset));
    return true;
  }
  // debugging
  void printFilter(const std::string & prefix = "", uint32_t truncate = 64) {
    std::cout << prefix;
    for (uint32_t i = 0; i < cells_ && i < truncate; ++i) {
      for (int j = cell_width_ - 1; j >= 0; --j)
        if (data_[i] & (1ull << j))
          std::cout << 1;
        else
          std::cout << 0;
      std::cout << "\n";
    }
    std::cout << std::endl;
  }
  // i/o
  uint64_t getAddresses() { return addresses_; }
  int getWidth() { return width_; }
  int getCellWidth() { return cell_width_; }
  uint32_t getCells() { return cells_; }
  virtual bool save(FileHandler* out) {
    CHECK(out != NULL);
    CHECK(out->write((char*)&cells_, sizeof(cells_)));
    CHECK(out->write((char*)&cell_width_, sizeof(cell_width_)));
    CHECK(out->write((char*)&log_cell_width_, sizeof(log_cell_width_)));
    CHECK(out->write((char*)&addresses_, sizeof(addresses_)));
    CHECK(out->write((char*)&width_, sizeof(width_)));
    CHECK(out->write((char*)&first_bit_, sizeof(first_bit_)));
    CHECK(out->write((char*)&full_mask_, sizeof(full_mask_)));
    CHECK(out->write((char*)&address_mask_, sizeof(address_mask_)));
    //CHECK(out->write((char*)data_, cells_ * sizeof(T)));
    const uint64_t jump = 524288032ul; //(uint64_t)pow(2, 29);
    if((width_ == 1) || cells_ < jump)
      CHECK(out->write((char*)data_, cells_ * sizeof(T)));
    else {
      uint64_t idx(0);
      while(idx + jump < cells_) {
        CHECK(out->write((char*)&data_[idx], jump * sizeof(T)));
        idx += jump;
      }
      CHECK(out->write((char*)&data_[idx], (cells_ - idx) * sizeof(T)));
    }
    return true;
  }
protected:
  bool loadHeader(FileHandler* fin) {
    CHECK(fin != NULL);
    CHECK(fin->read((char*)&cells_, sizeof(cells_)));
    CHECK(fin->read((char*)&cell_width_, sizeof(cell_width_)));
    CHECK(cell_width_ == sizeof(T) << 3); // make sure correct underlying data type
    CHECK(fin->read((char*)&log_cell_width_, sizeof(log_cell_width_)));
    CHECK(fin->read((char*)&addresses_, sizeof(addresses_)));
    CHECK(fin->read((char*)&width_, sizeof(width_)));
    CHECK(fin->read((char*)&first_bit_, sizeof(first_bit_)));
    CHECK(fin->read((char*)&full_mask_, sizeof(full_mask_)));
    CHECK(fin->read((char*)&address_mask_, sizeof(address_mask_)));
    return true;
  }
  bool loadData(FileHandler* fin) {
    // instantiate underlying array
    data_ = new T[cells_];
    CHECK(data_ != NULL);
    CHECK(fin->read((char*)data_, cells_ * sizeof(T)));
    //CHECK(fin->read((char*)&data_[0], ceil(float(cells_) / 2.0) * sizeof(T)));
    //CHECK(fin->read((char*)&data_[cells_ / 2], (cells_ / 2) * sizeof(T)));
    return true;
  }
  uint64_t cells_;      // number of Ts making up 'data_'
  int cell_width_;      // bits per cell (i.e. sizeof(T) << 3)
  int log_cell_width_;  // log of bits used for >> division
  uint64_t addresses_;  // number of addresses in the filter
  int width_;           // width in bits of each address
  int first_bit_;       // position of first bit in initial byte
  T full_mask_;         // all 1s
  T address_mask_;      // 1s in those positions that are part of address
  T* data_;             // the raw data as bytes
};

// Extension with bit test/setter methods added
class BitFilter : public Filter<uint8_t> {
public:
  BitFilter(uint64_t bits) : Filter<uint8_t>(bits, 1) {}
  BitFilter(FileHandler* fin, bool loaddata = true)
    : Filter<uint8_t>(fin, loaddata) {
    if (loaddata)
      CHECK(load(fin));
  }
  // TODO: overload operator[]
  virtual bool testBit(uint64_t location) {
    // test bit referenced by location
    return data_[(location % addresses_) >> 3] & 1 << ((location % addresses_) % 8);
  }
  virtual bool setBit(uint64_t location) {
    // set bit referenced by location
    data_[(location % addresses_) >> 3] |= 1 << ((location % addresses_) % 8);
    return true;
  }
  virtual bool clearBit(uint64_t location) {
    // clear bit referenced by location
    // (the source's '&= 0 << ...' zeroes the whole byte; '&= ~(1 << ...)' clears one bit)
    data_[(location % addresses_) >> 3] &= ~(1 << ((location % addresses_) % 8));
    return true;
  }
  bool save(FileHandler* fout) {
    CHECK(Filter<uint8_t>::save(fout));
    std::cerr << "Saved BitFilter. Rho = " << rho() << "." << std::endl;
    return true;
  }
  float rho(uint64_t limit = 0) {
    uint64_t ones = 0;
    uint64_t range = limit > 0 ? std::min(limit,cells_) : cells_;
    for (uint64_t i = 0; i < range; ++i)
      for (int j = 0; j < 8; ++j)
        if (data_[i] & (1 << j))
          ++ones;
    return static_cast<float>((range << 3) - ones)/static_cast<float>(range << 3);
  }
protected:
  bool load(FileHandler* fin) {
    std::cerr << "Loaded BitFilter. Rho = " << rho() << "." << std::endl;
    return true;
  }
};
/*

namespace randlm
{

// Class Filter wraps a contiguous array of data. Filter and its subclasses
// implement read/write/increment functionality on arrays with arbitrary sized addresses
// (i.e. an address may not use a full number of bytes). When converting to byte-based
// representation we assume "unused" bits are to the left.
// E.g. if the underlying data is stored in units T = uint16 and the 'width' = 11,
// to read 'address' = 3 we extract bits at indices [33,43] (i.e. [11*3, 11*4 - 1])
// and store them in the low 11 bits of a uint16, with the top 5 bits
// masked out.
template<typename T>
class Filter
{
public:
  Filter(uint64_t addresses, int width) : addresses_(addresses), width_(width), data_(NULL) {
    // number of bits in T
    cell_width_ = sizeof(T) << 3;
    // current implementation has following constraints
    CHECK(cell_width_ > 0 && cell_width_ <= 64 && cell_width_ >= width);
    // used for >> division
    log_cell_width_ = static_cast<int>(floor(log(cell_width_)/log(2) + 0.000001));
    // size of underlying data in Ts
    cells_ = ((addresses * width) + cell_width_ - 1) >> log_cell_width_;
    // instantiate underlying data
    data_ = new T[cells_];
    CHECK(data_ != NULL);
    CHECK(reset());
    // 'first_bit' marks the first bit used by 'address' (left padded with zeros).
    first_bit_ = (width % cell_width_ == 0) ? 0 : cell_width_ - (width % cell_width_);
    // mask for full cell
    full_mask_ = static_cast<T>(0xffffffffffffffffull);
    // mask for bits that make up the address
    address_mask_ = full_mask_ >> first_bit_;
  }
  Filter(FileHandler* fin, bool loaddata = true) : data_(NULL) {
    CHECK(loadHeader(fin));
    if (loaddata)
      CHECK(loadData(fin));
  }
  virtual ~Filter() {
    delete[] data_;
  }
  bool reset() {
    for (uint64_t i = 0; i < cells_; ++i)
      data_[i] = 0;
    return true;
  }
  count_t size() {
    // return approx size of filter in MBs
    return cells_ * sizeof(T) >> 20;
  }
  // read / write functions
  inline bool read(uint64_t address, T* value) {
    CHECK(address <= addresses_);
    // copy address to 'value'
    uint64_t data_bit = address * width_;
    uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
    // 'offset' shows how address in 'data' and 'value' align
    int offset = (data_bit % cell_width_) - first_bit_;
    // they align so just copy across masking unneeded leading bits
    if (offset == 0) {
      *value = data_[data_cell] & address_mask_;
      return true;
    }
    // data address starts to left so shift it right
    if (offset < 0) {
      *value = (data_[data_cell] >> -offset) & address_mask_;
      return true;
    }
    // data address is to right so shift it left and look at one more cell to right
    *value = ((data_[data_cell] << offset)
              | (data_[data_cell + 1] >> (cell_width_ - offset))) & address_mask_;
    return true;
  }
  inline T read(uint64_t address) {
    CHECK(address <= addresses_);
    // return value at address
    T value = 0;
    uint64_t data_bit = address * width_;
    uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
    // 'offset' shows how address in 'data' and 'value' align
    int offset = (data_bit % cell_width_) - first_bit_;
    // they align so just copy across masking unneeded leading bits
    if (offset == 0) {
      value = data_[data_cell] & address_mask_;
    }
    // data address starts to left so shift it right
    else if (offset < 0) {
      value = (data_[data_cell] >> -offset) & address_mask_;
    }
    // data address is to right so shift it left and look at one more cell to right
    else
      value = ((data_[data_cell] << offset)
               | (data_[data_cell + 1] >> (cell_width_ - offset))) & address_mask_;
    return value;
  }
  inline bool write(uint64_t address, T value) {
    CHECK(address <= addresses_);
    CHECK(log2(value) <= width_);
    // write 'value' to address
    uint64_t data_bit = address * width_;
    uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
    // 'offset' shows how address in 'data' and 'value' align
    int offset = (data_bit % cell_width_) - first_bit_;
    // they align so just copy across masking unneeded leading zeros of value
    if (offset == 0) {
      data_[data_cell] = value | (data_[data_cell] & ~address_mask_);
      return true;
    }
    // address in data is to left so shift value left by -offset
    if (offset < 0) {
      data_[data_cell] = (value << -offset)
                         | (data_[data_cell] & ~(address_mask_ << -offset));
      return true;
    }
    // address in data is to right so shift value right by offset
    data_[data_cell] = (value >> offset) |
                       (data_[data_cell] & ~(address_mask_ >> offset));
    data_[data_cell + 1] = (value << (cell_width_ - offset)) |
                           (data_[data_cell + 1] & (full_mask_ >> offset));
    return true;
  }
  inline bool readWithFingerprint(uint64_t address, T finger, T* value) {
    // copy 'address' ^ 'finger' to 'value'
    uint64_t data_bit = address * width_;
    uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
    // 'offset' shows how address in 'data' and 'value' align
    int offset = (data_bit % cell_width_) - first_bit_;
    // they align so just copy across masking unneeded leading bits
    if (offset == 0) {
      *value = (finger ^ data_[data_cell]) & address_mask_;
      return true;
    }
    // data address starts to left so shift it right
    if (offset < 0) {
      *value = ((data_[data_cell] >> -offset) ^ finger) & address_mask_;
      return true;
    }
    // data address is to right so shift it left and look at one more cell to right
    *value = (((data_[data_cell] << offset)
               | (data_[data_cell + 1] >> (cell_width_ - offset))) ^ finger)
             & address_mask_;
    return true;
  }
  inline bool writeWithFingerprint(uint64_t address, T finger, T value) {
    // write 'value' ^ 'finger' to address
    finger &= address_mask_; // make sure fingerprint is correct size
    uint64_t data_bit = address * width_;
    uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
    // 'offset' shows how address in 'data' and 'value' align
    int offset = (data_bit % cell_width_) - first_bit_;
    // they align so just copy across masking unneeded leading zeros of value
    if (offset == 0) {
      data_[data_cell] = (finger ^ value) | (data_[data_cell] & ~address_mask_);
      return true;
    }
    // address in data is to left so shift value left by -offset
    if (offset < 0) {
      data_[data_cell] = ((finger ^ value) << -offset)
                         | (data_[data_cell] & ~(address_mask_ << -offset));
      return true;
    }
    // address in data is to right so shift value right by offset
    data_[data_cell] = ((finger ^ value) >> offset) |
                       (data_[data_cell] & ~(address_mask_ >> offset));
    data_[data_cell + 1] = ((finger ^ value) << (cell_width_ - offset)) |
                           (data_[data_cell + 1] & (full_mask_ >> offset));
    return true;
  }
  // debugging
  void printFilter(const std::string & prefix = "", uint32_t truncate = 64) {
    std::cout << prefix;
    for (uint32_t i = 0; i < cells_ && i < truncate; ++i) {
      for (int j = cell_width_ - 1; j >= 0; --j)
        if (data_[i] & (1ull << j))
          std::cout << 1;
        else
          std::cout << 0;
      std::cout << "\n";
    }
    std::cout << std::endl;
  }
  // i/o
  uint64_t getAddresses() {
    return addresses_;
  }
  int getWidth() {
    return width_;
  }
  int getCellWidth() {
    return cell_width_;
  }
  uint32_t getCells() {
    return cells_;
  }
  virtual bool save(FileHandler* out) {
    CHECK(out != NULL);
    CHECK(out->write((char*)&cells_, sizeof(cells_)));
    CHECK(out->write((char*)&cell_width_, sizeof(cell_width_)));
    CHECK(out->write((char*)&log_cell_width_, sizeof(log_cell_width_)));
    CHECK(out->write((char*)&addresses_, sizeof(addresses_)));
    CHECK(out->write((char*)&width_, sizeof(width_)));
    CHECK(out->write((char*)&first_bit_, sizeof(first_bit_)));
    CHECK(out->write((char*)&full_mask_, sizeof(full_mask_)));
    CHECK(out->write((char*)&address_mask_, sizeof(address_mask_)));
    //CHECK(out->write((char*)data_, cells_ * sizeof(T)));
    const uint64_t jump = 524288032ul; //(uint64_t)pow(2, 29);
    if((width_ == 1) || cells_ < jump)
      CHECK(out->write((char*)data_, cells_ * sizeof(T)));
    else {
      uint64_t idx(0);
      while(idx + jump < cells_) {
        CHECK(out->write((char*)&data_[idx], jump * sizeof(T)));
        idx += jump;
      }
      CHECK(out->write((char*)&data_[idx], (cells_ - idx) * sizeof(T)));
    }
    return true;
  }
protected:
  bool loadHeader(FileHandler* fin) {
    CHECK(fin != NULL);
    CHECK(fin->read((char*)&cells_, sizeof(cells_)));
    CHECK(fin->read((char*)&cell_width_, sizeof(cell_width_)));
    CHECK(cell_width_ == sizeof(T) << 3); // make sure correct underlying data type
    CHECK(fin->read((char*)&log_cell_width_, sizeof(log_cell_width_)));
    CHECK(fin->read((char*)&addresses_, sizeof(addresses_)));
    CHECK(fin->read((char*)&width_, sizeof(width_)));
    CHECK(fin->read((char*)&first_bit_, sizeof(first_bit_)));
    CHECK(fin->read((char*)&full_mask_, sizeof(full_mask_)));
    CHECK(fin->read((char*)&address_mask_, sizeof(address_mask_)));
    return true;
  }
  bool loadData(FileHandler* fin) {
    // instantiate underlying array
    data_ = new T[cells_];
    CHECK(data_ != NULL);
    CHECK(fin->read((char*)data_, cells_ * sizeof(T)));
    //CHECK(fin->read((char*)&data_[0], ceil(float(cells_) / 2.0) * sizeof(T)));
    //CHECK(fin->read((char*)&data_[cells_ / 2], (cells_ / 2) * sizeof(T)));
    return true;
  }
  uint64_t cells_;      // number of Ts making up 'data_'
  int cell_width_;      // bits per cell (i.e. sizeof(T) << 3)
  int log_cell_width_;  // log of bits used for >> division
  uint64_t addresses_;  // number of addresses in the filter
  int width_;           // width in bits of each address
  int first_bit_;       // position of first bit in initial byte
  T full_mask_;         // all 1s
  T address_mask_;      // 1s in those positions that are part of address
  T* data_;             // the raw data as bytes
};

// Extension with bit test/setter methods added
class BitFilter : public Filter<uint8_t>
{
public:
  BitFilter(uint64_t bits) : Filter<uint8_t>(bits, 1) {}
  BitFilter(FileHandler* fin, bool loaddata = true)
    : Filter<uint8_t>(fin, loaddata) {
    if (loaddata)
      CHECK(load(fin));
  }
  // TODO: overload operator[]
  virtual bool testBit(uint64_t location) {
    // test bit referenced by location
    return data_[(location % addresses_) >> 3] & 1 << ((location % addresses_) % 8);
  }
  virtual bool setBit(uint64_t location) {
    // set bit referenced by location
    data_[(location % addresses_) >> 3] |= 1 << ((location % addresses_) % 8);
    return true;
  }
  virtual bool clearBit(uint64_t location) {
    // clear bit referenced by location
    // (the source's '&= 0 << ...' zeroes the whole byte; '&= ~(1 << ...)' clears one bit)
    data_[(location % addresses_) >> 3] &= ~(1 << ((location % addresses_) % 8));
    return true;
  }
  bool save(FileHandler* fout) {
    CHECK(Filter<uint8_t>::save(fout));
    std::cerr << "Saved BitFilter. Rho = " << rho() << "." << std::endl;
    return true;
  }
  float rho(uint64_t limit = 0) {
    uint64_t ones = 0;
    uint64_t range = limit > 0 ? std::min(limit,cells_) : cells_;
    for (uint64_t i = 0; i < range; ++i)
      for (int j = 0; j < 8; ++j)
        if (data_[i] & (1 << j))
          ++ones;
    return static_cast<float>((range << 3) - ones)/static_cast<float>(range << 3);
  }
protected:
  bool load(FileHandler* fin) {
    std::cerr << "Loaded BitFilter. Rho = " << rho() << "." << std::endl;
    return true;
  }
};
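// Added usage sketch (not in the original header), following the save/load
// conventions used elsewhere in this file:
//
//   BitFilter bits(1000);   // 1000 single-bit addresses
//   bits.setBit(42);
//   CHECK(bits.testBit(42));
//   bits.clearBit(42);
//   std::cerr << "rho (fraction of zero bits) = " << bits.rho() << std::endl;
//   bits.save(fout);        // 'fout' is a FileHandler*, hypothetical here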
/*
// ResizedBitFilter deals with resizing to save memory
// whereas other filters should expect locations to be within range
// this filter will need to resize (and possibly rehash) locations

@ -380,9 +391,9 @@ namespace randlm {
carry = incrementSubCell(data_bit, this->width_, &this->data_[data_cell]);
}
// last update must not have carried
if (!carry)
  return true;
// wrapped round so check whether need to reset to max count
if (!wrap_around_)
  CHECK(this->write(address, this->address_mask_));
return false; // false to indicate that overflowed
@ -397,7 +408,7 @@ namespace randlm {
}
inline bool incrementSubCell(int bit, int len, T* cell) {
  // increment counter consisting of bits [startbit, startbit + len - 1] rest stays unchanged
  *cell = ((((*cell >> (this->cell_width_ - bit - len)) + 1)
            & (this->full_mask_ >> (this->cell_width_ - len))) << (this->cell_width_ - bit - len))
          | (*cell & ~(((this->full_mask_ >> (this->cell_width_ - len)) << (this->cell_width_ - bit - len))));
  // indicate overflow as true

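// Added worked example for incrementSubCell() above (not in the original
// source). With cell_width_ = 8, bit = 2, len = 3 and *cell = 0b00111000,
// the 3-bit counter at bit positions [2,4] (counting from the MSB) holds 111:
//   (*cell >> (8 - 2 - 3)) + 1   == 0b00000111 + 1 == 0b00001000
//   & (full_mask_ >> (8 - 3))    == 0b00001000 & 0b111 == 0   (wraps)
// so the field is written back as 000 and the wrap is reported as a carry,
// which the increment logic above either propagates or reports as overflow.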
@ -10,58 +10,66 @@ using namespace Moses;
typedef uint64_t P; // largest input range is 2^64

template <typename T>
class HashBase {
protected:
  T m_;        // range of hash output
  count_t H_;  // number of hash functions to instantiate
  virtual void initSeeds()=0;
  virtual void freeSeeds()=0;
public:
  HashBase(float m, count_t H=1):m_((T)m), H_(H) {
    //cerr << "range = (0..." << m_ << "]" << endl;
  }
  HashBase(FileHandler* fin) {
    load(fin);
  }
  virtual ~HashBase(){}
  virtual T hash(const char*s, count_t h)=0; // string hashing
  virtual T hash(const wordID_t* id, const int len, count_t h)=0; // vocab mapped hashing
  count_t size() { return H_;}
  virtual void save(FileHandler* fout) {
    CHECK(fout != 0);
    fout->write((char*)&m_, sizeof(m_));
    fout->write((char*)&H_, sizeof(H_));
  }
  virtual void load(FileHandler* fin) {
    CHECK(fin != 0);
    fin->read((char*)&m_, sizeof(m_));
    fin->read((char*)&H_, sizeof(H_));
  }
class HashBase
{
protected:
  T m_;        // range of hash output
  count_t H_;  // number of hash functions to instantiate
  virtual void initSeeds()=0;
  virtual void freeSeeds()=0;
public:
  HashBase(float m, count_t H=1):m_((T)m), H_(H) {
    //cerr << "range = (0..." << m_ << "]" << endl;
  }
  HashBase(FileHandler* fin) {
    load(fin);
  }
  virtual ~HashBase() {}
  virtual T hash(const char*s, count_t h)=0; // string hashing
  virtual T hash(const wordID_t* id, const int len, count_t h)=0; // vocab mapped hashing
  count_t size() {
    return H_;
  }
  virtual void save(FileHandler* fout) {
    CHECK(fout != 0);
    fout->write((char*)&m_, sizeof(m_));
    fout->write((char*)&H_, sizeof(H_));
  }
  virtual void load(FileHandler* fin) {
    CHECK(fin != 0);
    fin->read((char*)&m_, sizeof(m_));
    fin->read((char*)&H_, sizeof(H_));
  }
};
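// Added note (not in the original header): each subclass below implements a
// family of H_ hash functions into the range [0, m_); the index h selects a
// member of the family. A hypothetical Bloom-filter-style use:
//
//   for (count_t h = 0; h < hasher.size(); ++h)
//     bits.setBit(hasher.hash(key, h));   // 'hasher' and 'bits' are assumed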
template <typename T>
class UnivHash_linear: public HashBase<T> {
public:
  UnivHash_linear(float m, count_t H, P pr):
    HashBase<T>(m, H), pr_(pr) {
    //CHECK(isPrime(pr_));
    initSeeds();
  }
  UnivHash_linear(FileHandler* fin):
    HashBase<T>(fin) {
    load(fin);
  }
  ~UnivHash_linear() {freeSeeds();}
  T hash(const char* s, count_t h){return 0;} //not implemented
  T hash(const wordID_t* id, const int len, count_t h);
  T hash(const wordID_t id, const count_t pos,
         const T prevValue, count_t h);
  void save(FileHandler* fout);
  void load(FileHandler* fin);
private:
  T** a_, **b_;
  P pr_;
  void initSeeds();
  void freeSeeds();
class UnivHash_linear: public HashBase<T>
{
public:
  UnivHash_linear(float m, count_t H, P pr):
    HashBase<T>(m, H), pr_(pr) {
    //CHECK(isPrime(pr_));
    initSeeds();
  }
  UnivHash_linear(FileHandler* fin):
    HashBase<T>(fin) {
    load(fin);
  }
  ~UnivHash_linear() {
    freeSeeds();
  }
  T hash(const char* s, count_t h) {
    return 0; //not implemented
  }
  T hash(const wordID_t* id, const int len, count_t h);
  T hash(const wordID_t id, const count_t pos,
         const T prevValue, count_t h);
  void save(FileHandler* fout);
  void load(FileHandler* fin);
private:
  T** a_, **b_;
  P pr_;
  void initSeeds();
  void freeSeeds();
};
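// Added note (not in the original header): the (id, pos, prevValue, h)
// overload, defined further down as prevValue + a_[h][pos]*id + b_[h][pos]
// (mod m_), lets an ngram's hash be extended one word at a time. A sketch:
//
//   T v = 0;
//   for (count_t pos = 0; pos < len; ++pos)
//     v = hasher.hash(ids[pos], pos, v, h);   // hash of ids[0..pos] so far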

/* UnivHash_noPrimes:
@ -71,74 +79,89 @@ class UnivHash_linear: public HashBase<T> {
 * # of hash function = 2^(l-1)
 */
template <typename T>
class UnivHash_noPrimes: public HashBase<T> {
public:
  UnivHash_noPrimes(float k, float l):
    HashBase<T>(k, 100), d_(count_t((l-k))) {
    if(((int)l >> 3) == sizeof(P)) p_ = (P) pow(2,l) - 1;
    else p_ = (P) pow(2,l);
    initSeeds();
  }
  UnivHash_noPrimes(FileHandler* fin):
    HashBase<T>(fin) {
    load(fin);
  }
  ~UnivHash_noPrimes() {freeSeeds();}
  T hash(const char* s, count_t h);
  T hash(const wordID_t* id, const int len, count_t h);
  T hash(const P x, count_t h);
  void save(FileHandler* fout);
  void load(FileHandler* fin);
private:
  count_t d_; // l-k
  P p_, *a_;  // real-valued input range, storage
  void initSeeds();
  void freeSeeds() {delete[] a_;}
class UnivHash_noPrimes: public HashBase<T>
{
public:
  UnivHash_noPrimes(float k, float l):
    HashBase<T>(k, 100), d_(count_t((l-k))) {
    if(((int)l >> 3) == sizeof(P)) p_ = (P) pow(2,l) - 1;
    else p_ = (P) pow(2,l);
    initSeeds();
  }
  UnivHash_noPrimes(FileHandler* fin):
    HashBase<T>(fin) {
    load(fin);
  }
  ~UnivHash_noPrimes() {
    freeSeeds();
  }
  T hash(const char* s, count_t h);
  T hash(const wordID_t* id, const int len, count_t h);
  T hash(const P x, count_t h);
  void save(FileHandler* fout);
  void load(FileHandler* fin);
private:
  count_t d_; // l-k
  P p_, *a_;  // real-valued input range, storage
  void initSeeds();
  void freeSeeds() {
    delete[] a_;
  }
};

template <typename T>
class Hash_shiftAddXOR: public HashBase<T> {
public:
  Hash_shiftAddXOR(float m, count_t H=5): HashBase<T>(m,H),
    l_(5), r_(2) {
    initSeeds();
  }
  ~Hash_shiftAddXOR() {freeSeeds();}
  T hash(const char* s, count_t h);
  T hash(const wordID_t* id, const int len, count_t h) {
    return 0; // empty: not implemented for this hash family
  }
private:
  T* v_; // random seed storage
  const unsigned short l_, r_; // left-shift bits, right-shift bits
  void initSeeds();
  void freeSeeds() {delete[] v_;}
class Hash_shiftAddXOR: public HashBase<T>
{
public:
  Hash_shiftAddXOR(float m, count_t H=5): HashBase<T>(m,H),
    l_(5), r_(2) {
    initSeeds();
  }
  ~Hash_shiftAddXOR() {
    freeSeeds();
  }
  T hash(const char* s, count_t h);
  T hash(const wordID_t* id, const int len, count_t h) {
    return 0; // empty: not implemented for this hash family
  }
private:
  T* v_; // random seed storage
  const unsigned short l_, r_; // left-shift bits, right-shift bits
  void initSeeds();
  void freeSeeds() {
    delete[] v_;
  }
};

template <typename T>
class UnivHash_tableXOR: public HashBase<T> {
public:
  UnivHash_tableXOR(float m, count_t H=5): HashBase<T>(m, H),
    table_(NULL), tblLen_(255*MAX_STR_LEN) {
    initSeeds();
  }
  ~UnivHash_tableXOR() {freeSeeds();}
  T hash(const char* s, count_t h);
  T hash(const wordID_t* id, const int len, count_t h) {
    return 0; // not implemented for this hash family
  }
private:
  T** table_;      // storage for random numbers
  count_t tblLen_; // length of table
  void initSeeds();
  void freeSeeds();
class UnivHash_tableXOR: public HashBase<T>
{
public:
  UnivHash_tableXOR(float m, count_t H=5): HashBase<T>(m, H),
    table_(NULL), tblLen_(255*MAX_STR_LEN) {
    initSeeds();
  }
  ~UnivHash_tableXOR() {
    freeSeeds();
  }
  T hash(const char* s, count_t h);
  T hash(const wordID_t* id, const int len, count_t h) {
    return 0; // not implemented for this hash family
  }
private:
  T** table_;      // storage for random numbers
  count_t tblLen_; // length of table
  void initSeeds();
  void freeSeeds();
};

// ShiftAddXor
template <typename T>
void Hash_shiftAddXOR<T>::initSeeds() {
void Hash_shiftAddXOR<T>::initSeeds()
{
  v_ = new T[this->H_];
  for(count_t i=0; i < this->H_; i++)
    v_[i] = Utils::rand<T>() + 1;
}
template <typename T>
T Hash_shiftAddXOR<T>::hash(const char* s, count_t h=0) {
T Hash_shiftAddXOR<T>::hash(const char* s, count_t h=0)
{
  T value = v_[h];
  int pos(0);
  unsigned char c;
@ -150,40 +173,44 @@ T Hash_shiftAddXOR<T>::hash(const char* s, count_t h=0) {

// UnivHash_tableXOR
template <typename T>
void UnivHash_tableXOR<T>::initSeeds() {
void UnivHash_tableXOR<T>::initSeeds()
{
  // delete any values in table
  if(table_) freeSeeds();
  // instance of new table
  table_ = new T* [this->H_];
  // fill with random values
  for(count_t j=0; j < this->H_; j++) {
    table_[j] = new T[tblLen_];
    for(count_t i=0; i < tblLen_; i++) {
      table_[j][i] = Utils::rand<T>(this->m_-1);
    }
  }
}
template <typename T>
void UnivHash_tableXOR<T>::freeSeeds() {
void UnivHash_tableXOR<T>::freeSeeds()
{
  for(count_t j = 0; j < this->H_; j++)
    delete[] table_[j];
  delete[] table_;
  table_ = NULL;
}
template <typename T>
T UnivHash_tableXOR<T>::hash(const char* s, count_t h = 0) {
T UnivHash_tableXOR<T>::hash(const char* s, count_t h = 0)
{
  T value = 0;
  count_t pos = 0, idx = 0;
  unsigned char c;
  while((c = *s++) && (++pos < MAX_STR_LEN))
    value ^= table_[h][idx += c];
  CHECK(value < this->m_);
  return value;
}

// UnivHash_noPrimes
template <typename T>
void UnivHash_noPrimes<T>::initSeeds() {
void UnivHash_noPrimes<T>::initSeeds()
{
  a_ = new P[this->H_];
  for(T i=0; i < this->H_; i++) {
    a_[i] = Utils::rand<P>();
@ -191,14 +218,16 @@ void UnivHash_noPrimes<T>::initSeeds() {
  }
}
template <typename T>
T UnivHash_noPrimes<T>::hash(const P x, count_t h=0) {
T UnivHash_noPrimes<T>::hash(const P x, count_t h=0)
{
  // h_a(x) = (ax mod 2^l) div 2^(l-k)
  T value = ((a_[h] * x) % p_) >> d_;
  return value % this->m_;
}
template <typename T>
T UnivHash_noPrimes<T>::hash(const wordID_t* id, const int len,
                             count_t h=0) {
T UnivHash_noPrimes<T>::hash(const wordID_t* id, const int len,
                             count_t h=0)
{
  T value = 0;
  int pos(0);
  while(pos < len) {
@ -208,39 +237,42 @@ T UnivHash_noPrimes<T>::hash(const wordID_t* id, const int len,
  return value % this->m_;
}
template <typename T>
T UnivHash_noPrimes<T>::hash(const char* s, count_t h=0) {
T UnivHash_noPrimes<T>::hash(const char* s, count_t h=0)
{
  T value = 0;
  int pos(0);
  unsigned char c;
  while((c = *s++) && (++pos < MAX_STR_LEN)) {
    value ^= hash((P)c, h);
  }
  return value % this->m_;
}
template <typename T>
void UnivHash_noPrimes<T>::save(FileHandler* fout) {
void UnivHash_noPrimes<T>::save(FileHandler* fout)
{
  HashBase<T>::save(fout);
  fout->write((char*)&p_, sizeof(p_));
  fout->write((char*)&d_, sizeof(d_));
  for(T i=0; i < this->H_; i++) {
    fout->write((char*)&a_[i], sizeof(a_[i]));
  }
}
template <typename T>
void UnivHash_noPrimes<T>::load(FileHandler* fin) {
void UnivHash_noPrimes<T>::load(FileHandler* fin)
{
  a_ = new P[this->H_];
  // HashBase<T>::load(fin) already done in constructor
  fin->read((char*)&p_, sizeof(p_));
  fin->read((char*)&d_, sizeof(d_));
  for(T i=0; i < this->H_; i++) {
    fin->read((char*)&a_[i], sizeof(a_[i]));
  }
}

//UnivHash_linear
template <typename T>
void UnivHash_linear<T>::initSeeds() {
void UnivHash_linear<T>::initSeeds()
{
  a_ = new T*[this->H_];
  b_ = new T*[this->H_];
  for(count_t i=0; i < this->H_; i++) {
@ -253,7 +285,8 @@ void UnivHash_linear<T>::initSeeds() {
  }
}
template <typename T>
void UnivHash_linear<T>::freeSeeds() {
void UnivHash_linear<T>::freeSeeds()
{
  for(count_t i=0; i < this->H_; i++) {
    delete[] a_[i];
    delete[] b_[i];
@ -263,8 +296,9 @@ void UnivHash_linear<T>::freeSeeds() {
  a_ = b_ = NULL;
}
template <typename T>
inline T UnivHash_linear<T>::hash(const wordID_t* id, const int len,
                                  count_t h=0) {
inline T UnivHash_linear<T>::hash(const wordID_t* id, const int len,
                                  count_t h=0)
{
  CHECK(h < this->H_);
  T value = 0;
  int pos(0);
@ -276,19 +310,21 @@ inline T UnivHash_linear<T>::hash(const wordID_t* id, const int len,
}
template <typename T>
inline T UnivHash_linear<T>::hash(const wordID_t id, const count_t pos,
                                  const T prevValue, count_t h=0) {
                                  const T prevValue, count_t h=0)
{
  CHECK(h < this->H_);
  T value = prevValue + ((a_[h][pos] * id) + b_[h][pos]); // % pr_;
  return value % this->m_;
}
template <typename T>
void UnivHash_linear<T>::save(FileHandler* fout) {
void UnivHash_linear<T>::save(FileHandler* fout)
{
  // int bytes = sizeof(a_[0][0]);
  HashBase<T>::save(fout);
  fout->write((char*)&pr_, sizeof(pr_));
  for(count_t i=0; i < this->H_; i++) {
    for(count_t j=0; j < MAX_NGRAM_ORDER; j++) {
      fout->write((char*)&a_[i][j], sizeof(a_[i][j]));
      fout->write((char*)&b_[i][j], sizeof(b_[i][j]));
      //cout << "a[" << i << "][" << j << "]=" << a_[i][j] << endl;
      //cout << "b[" << i << "][" << j << "]=" << b_[i][j] << endl;
@ -296,7 +332,8 @@ void UnivHash_linear<T>::save(FileHandler* fout) {
    }
  }
template <typename T>
void UnivHash_linear<T>::load(FileHandler* fin) {
void UnivHash_linear<T>::load(FileHandler* fin)
{
  // HashBase<T>::load(fin) already done in constructor
  fin->read((char*)&pr_, sizeof(pr_));
  a_ = new T*[this->H_];
@ -305,8 +342,8 @@ void UnivHash_linear<T>::load(FileHandler* fin) {
  a_[i] = new T[MAX_NGRAM_ORDER];
  b_[i] = new T[MAX_NGRAM_ORDER];
  for(count_t j=0; j < MAX_NGRAM_ORDER; j++) {
    fin->read((char*)&a_[i][j], sizeof(a_[i][j]));
    fin->read((char*)&b_[i][j], sizeof(b_[i][j]));
    //cout << "a[" << i << "][" << j << "]=" << a_[i][j] << endl;
    //cout << "b[" << i << "][" << j << "]=" << b_[i][j] << endl;
  }

@ -16,27 +16,28 @@ using randlm::Cache;
|
||||
const bool strict_checks_ = false;
|
||||
|
||||
template<typename T>
|
||||
class OnlineRLM: public PerfectHash<T> {
|
||||
class OnlineRLM: public PerfectHash<T>
|
||||
{
|
||||
public:
|
||||
OnlineRLM(uint16_t MBs, int width, int bucketRange, count_t order,
|
||||
Vocab* v, float qBase = 8): PerfectHash<T>(MBs, width, bucketRange, qBase),
|
||||
OnlineRLM(uint16_t MBs, int width, int bucketRange, count_t order,
|
||||
Vocab* v, float qBase = 8): PerfectHash<T>(MBs, width, bucketRange, qBase),
|
||||
vocab_(v), bAdapting_(false), order_(order), corpusSize_(0), alpha_(0) {
|
||||
CHECK(vocab_ != 0);
|
||||
//instantiate quantizer class here
|
||||
cache_ = new Cache<float>(8888.8888, 9999.9999); // unknown_value, null_value
|
||||
alpha_ = new float[order_ + 1];
|
||||
for(count_t i = 0; i <= order_; ++i)
|
||||
for(count_t i = 0; i <= order_; ++i)
|
||||
alpha_[i] = i * log10(0.4);
|
||||
cerr << "Initialzing auxillary bit filters...\n";
|
||||
bPrefix_ = new BitFilter(this->cells_);
|
||||
bHit_ = new BitFilter(this->cells_);
|
||||
}
|
||||
OnlineRLM(FileHandler* fin, count_t order):
|
||||
OnlineRLM(FileHandler* fin, count_t order):
|
||||
PerfectHash<T>(fin), bAdapting_(true), order_(order), corpusSize_(0) {
|
||||
load(fin);
|
||||
cache_ = new Cache<float>(8888.8888, 9999.9999); // unknown_value, null_value
|
||||
alpha_ = new float[order_ + 1];
|
||||
for(count_t i = 0; i <= order_; ++i)
|
||||
for(count_t i = 0; i <= order_; ++i)
|
||||
alpha_[i] = i * log10(0.4);
|
||||
}
|
||||
~OnlineRLM() {
|
||||
@ -52,14 +53,18 @@ public:
|
||||
bool insert(const std::vector<string>& ngram, const int value);
|
||||
bool update(const std::vector<string>& ngram, const int value);
|
||||
int query(const wordID_t* IDs, const int len);
|
||||
int sbsqQuery(const std::vector<string>& ngram, int* len,
|
||||
bool bStrict = false);
|
||||
int sbsqQuery(const wordID_t* IDs, const int len, int* codes,
|
||||
bool bStrict = false);
|
||||
int sbsqQuery(const std::vector<string>& ngram, int* len,
|
||||
bool bStrict = false);
|
||||
int sbsqQuery(const wordID_t* IDs, const int len, int* codes,
|
||||
bool bStrict = false);
|
||||
void remove(const std::vector<string>& ngram);
|
||||
count_t heurDelete(count_t num2del, count_t order = 5);
|
||||
uint64_t corpusSize() {return corpusSize_;}
|
||||
void corpusSize(uint64_t c) {corpusSize_ = c;}
|
||||
uint64_t corpusSize() {
|
||||
return corpusSize_;
|
||||
}
|
||||
void corpusSize(uint64_t c) {
|
||||
corpusSize_ = c;
|
||||
}
|
||||
void clearCache() {
|
||||
if(cache_) cache_->clear();
|
||||
}
|
||||
@ -77,7 +82,7 @@ protected:
|
||||
void markQueried(hpdEntry_t& value);
|
||||
bool markPrefix(const wordID_t* IDs, const int len, bool bSet);
|
||||
private:
|
||||
const void* getContext(const wordID_t* ngram, int len);
|
||||
const void* getContext(const wordID_t* ngram, int len);
|
||||
const bool bAdapting_; // used to signal adaptation of model
|
||||
const count_t order_; // LM order
|
||||
uint64_t corpusSize_; // total training corpus size
|
||||
@ -87,46 +92,48 @@ private:
|
||||
BitFilter* bHit_;
|
||||
};
|
||||
template<typename T>
|
||||
bool OnlineRLM<T>::insert(const std::vector<string>& ngram, const int value) {
|
||||
bool OnlineRLM<T>::insert(const std::vector<string>& ngram, const int value)
|
||||
{
|
||||
int len = ngram.size();
|
||||
wordID_t wrdIDs[len];
|
||||
uint64_t index(this->cells_ + 1);
|
||||
for(int i = 0; i < len; ++i)
|
||||
for(int i = 0; i < len; ++i)
|
||||
wrdIDs[i] = vocab_->GetWordID(ngram[i]);
|
||||
index = PerfectHash<T>::insert(wrdIDs, len, value);
|
||||
if(value > 1 && len < order_)
|
||||
markPrefix(wrdIDs, ngram.size(), true); // mark context
|
||||
// keep track of total items from training data minus "<s>"
|
||||
if(ngram.size() == 1 && (!bAdapting_)) // hack to not change corpusSize when adapting
|
||||
if(ngram.size() == 1 && (!bAdapting_)) // hack to not change corpusSize when adapting
|
||||
corpusSize_ += (wrdIDs[0] != vocab_->GetBOSWordID()) ? value : 0;
|
||||
if(bAdapting_ && (index < this->cells_)) // mark to keep while adapting
|
||||
if(bAdapting_ && (index < this->cells_)) // mark to keep while adapting
|
||||
markQueried(index);
|
||||
return true;
|
||||
}
|
||||
template<typename T>
|
||||
bool OnlineRLM<T>::update(const std::vector<string>& ngram, const int value) {
|
||||
bool OnlineRLM<T>::update(const std::vector<string>& ngram, const int value)
|
||||
{
|
||||
int len = ngram.size();
|
||||
wordID_t wrdIDs[len];
|
||||
uint64_t index(this->cells_ + 1);
|
||||
hpdEntry_t hpdItr;
|
||||
vocab_->MakeOpen();
|
||||
for(int i = 0; i < len; ++i)
|
||||
for(int i = 0; i < len; ++i)
|
||||
wrdIDs[i] = vocab_->GetWordID(ngram[i]);
|
||||
// if updating, minimize false positives by pre-checking if context already in model
|
||||
bool bIncluded(true);
|
||||
// if updating, minimize false positives by pre-checking if context already in model
|
||||
bool bIncluded(true);
|
||||
if(value > 1 && len < (int)order_)
|
||||
bIncluded = markPrefix(wrdIDs, ngram.size(), true); // mark context
|
||||
if(bIncluded) { // if context found
|
||||
if(bIncluded) { // if context found
|
||||
bIncluded = PerfectHash<T>::update2(wrdIDs, len, value, hpdItr, index);
|
||||
if(index < this->cells_) {
|
||||
markQueried(index);
|
||||
}
|
||||
else if(hpdItr != this->dict_.end()) markQueried(hpdItr);
|
||||
} else if(hpdItr != this->dict_.end()) markQueried(hpdItr);
|
||||
}
|
||||
return bIncluded;
|
||||
}
|
||||
template<typename T>
int OnlineRLM<T>::query(const wordID_t* IDs, int len)
{
  uint64_t filterIdx = 0;
  hpdEntry_t hpdItr;
  int value(0);
@ -135,8 +142,7 @@ int OnlineRLM<T>::query(const wordID_t* IDs, int len) {
  if(hpdItr != this->dict_.end()) {
    //markQueried(hpdItr); // mark this event as "hit"
    value -= ((value & this->hitMask_) != 0) ? this->hitMask_ : 0; // check for previous hit marks
  } else {
    CHECK(filterIdx < this->cells_);
    //markQueried(filterIdx);
  }
@ -144,15 +150,16 @@ int OnlineRLM<T>::query(const wordID_t* IDs, int len) {
  return value > 0 ? value : 0;
}
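[Aside: the hitMask_ logic above packs a "queried during testing" flag into the high bit of each stored count, which query() then strips off before returning. A minimal standalone sketch of that bookkeeping — the names here are hypothetical and not part of this commit:]

#include <cstdint>
#include <cassert>

int main() {
  const uint32_t kHitMask = 1u << 31;   // high bit marks "seen in testing"
  uint32_t count = 42;                  // raw event count
  count |= kHitMask;                    // the markQueried() equivalent
  // query-time recovery: subtract the mask iff the bit is set
  uint32_t raw = count - (((count & kHitMask) != 0) ? kHitMask : 0);
  assert(raw == 42);
  return 0;
}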
template<typename T>
bool OnlineRLM<T>::markPrefix(const wordID_t* IDs, const int len, bool bSet)
{
  if(len <= 1) return true; // only do this for ngrams with context
  static Cache<int> pfCache(-1, -1); // local prefix cache
  int code(0);
  if(!pfCache.checkCacheNgram(IDs, len - 1, &code, NULL)) {
    hpdEntry_t hpdItr;
    uint64_t filterIndex(0);
    code = PerfectHash<T>::query(IDs, len - 1, hpdItr, filterIndex); // hash IDs[0..len-1]
    if(code == -1) { // encountered false positive in pipeline
      cerr << "WARNING: markPrefix(). The O-RLM is *not* well-formed.\n";
      // add all prefixes or return false;
      return false;
@ -161,10 +168,9 @@ bool OnlineRLM<T>::markPrefix(const wordID_t* IDs, const int len, bool bSet) {
      CHECK(hpdItr == this->dict_.end());
      if(bSet) bPrefix_->setBit(filterIndex); // mark index
      else bPrefix_->clearBit(filterIndex);   // unset index
    } else {
      CHECK(filterIndex == this->cells_ + 1);
      //how to handle hpd prefixes?
    }
    if(pfCache.nodes() > 10000) pfCache.clear();
    pfCache.setCacheNgram(IDs, len - 1, code, NULL);
@ -172,36 +178,40 @@ bool OnlineRLM<T>::markPrefix(const wordID_t* IDs, const int len, bool bSet) {
  return true;
}
template<typename T>
void OnlineRLM<T>::markQueried(const uint64_t& index)
{
  bHit_->setBit(index);
  //cerr << "filter[" << index << "] = " << this->filter_->read(index) << endl;
}
template<typename T>
void OnlineRLM<T>::markQueried(hpdEntry_t& value)
{
  // set high bit of counter to indicate "hit" status
  value->second |= this->hitMask_;
}
template<typename T>
void OnlineRLM<T>::remove(const std::vector<string>& ngram)
{
  wordID_t IDs[ngram.size()];
  for(count_t i = 0; i < ngram.size(); ++i)
    IDs[i] = vocab_->GetWordID(ngram[i]);
  PerfectHash<T>::remove(IDs, ngram.size());
}
template<typename T>
count_t OnlineRLM<T>::heurDelete(count_t num2del, count_t order)
{
  count_t deleted = 0;
  cout << "Deleting " << num2del << " of order " << order << endl;
  // delete from filter first
  int full = *std::max_element(this->idxTracker_, this->idxTracker_
                               + this->totBuckets_);
  for(; full > 0; --full) // delete from fullest buckets first
    for(int bk = 0; bk < this->totBuckets_; ++bk) {
      if(deleted >= num2del) break;
      if(this->idxTracker_[bk] == full) { // if full
        uint64_t first = bk * this->bucketRange_,
                 last = first + this->bucketRange_;
        for(uint64_t row = first; row < last; ++row) { // check each row
          if(!(bHit_->testBit(row) || bPrefix_->testBit(row) )) {
            if(this->filter_->read(row) != 0) {
              PerfectHash<T>::remove(row); // remove from filter
@ -220,15 +230,17 @@ count_t OnlineRLM<T>::heurDelete(count_t num2del, count_t order) {
}
template<typename T>
int OnlineRLM<T>::sbsqQuery(const std::vector<string>& ngram, int* codes,
                            bool bStrict)
{
  wordID_t IDs[ngram.size()];
  for(count_t i = 0; i < ngram.size(); ++i)
    IDs[i] = vocab_->GetWordID(ngram[i]);
  return sbsqQuery(IDs, ngram.size(), codes, bStrict);
}
template<typename T>
int OnlineRLM<T>::sbsqQuery(const wordID_t* IDs, const int len, int* codes,
                            bool bStrict)
{
  uint64_t filterIdx = 0;
  int val(0), fnd(0);
  hpdEntry_t hpdItr;
@ -240,14 +252,13 @@ int OnlineRLM<T>::sbsqQuery(const wordID_t* IDs, const int len, int* codes,
      if(hpdItr != this->dict_.end()) {
        val -= ((val & this->hitMask_) != 0) ? this->hitMask_ : 0; // account for previous hit marks
      }
    } else if(bStrict) {
      break;
    }
    // add to value array
    codes[i] = val > 0 ? val : 0;
  }
  while(bStrict && (fnd > 1)) { // do checks the other way
    val = PerfectHash<T>::query(&IDs[len - fnd], fnd - 1, hpdItr, filterIdx);
    if(val != -1) break; // if anything found
    else --fnd;          // else decrement found
@ -255,8 +266,9 @@ int OnlineRLM<T>::sbsqQuery(const wordID_t* IDs, const int len, int* codes,
  return fnd;
}
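[Aside: sbsqQuery() fills codes[] with one count per suffix of the ngram. A rough standalone illustration of that loop shape, using a plain std::map in place of the randomized filter — hypothetical stand-in, not this API:]

#include <map>
#include <string>
#include <vector>
#include <iostream>

int main() {
  std::map<std::string, int> counts = { {"c", 7}, {"b c", 3}, {"a b c", 1} };
  std::vector<std::string> ngram = {"a", "b", "c"};
  std::vector<int> codes(ngram.size(), 0);
  for (size_t i = 0; i < ngram.size(); ++i) {  // suffix starting at position i
    std::string key;
    for (size_t j = i; j < ngram.size(); ++j)
      key += (j > i ? " " : "") + ngram[j];
    std::map<std::string, int>::iterator it = counts.find(key);
    codes[i] = (it != counts.end()) ? it->second : 0;  // 0 when unseen
  }
  for (size_t i = 0; i < codes.size(); ++i)
    std::cout << "codes[" << i << "] = " << codes[i] << "\n";
  return 0;
}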
template<typename T>
float OnlineRLM<T>::getProb(const wordID_t* ngram, int len,
                            const void** state)
{
  static const float oovprob = log10(1.0 / (static_cast<float>(vocab_->Size()) - 1));
  float logprob(0);
  const void* context = (state) ? *state : 0;
@ -264,61 +276,61 @@ float OnlineRLM<T>::getProb(const wordID_t* ngram, int len,
  if(!cache_->checkCacheNgram(ngram, len, &logprob, &context)) {
    // get full prob and put in cache
    int num_fnd(0), den_val(0);
    int in[len]; // in[] keeps counts of increasing order numerator
    for(int i = 0; i < len; ++i) in[i] = 0;
    for(int i = len - 1; i >= 0; --i) {
      if(ngram[i] == vocab_->GetkOOVWordID()) break; // no need to query if OOV
      in[i] = query(&ngram[i], len - i);
      if(in[i] > 0) {
        num_fnd = len - i;
      } else if(strict_checks_) break;
    }
    while(num_fnd > 1) { // get lower order count
      // get sub-context of size one less than length found (excluding target)
      if(((den_val = query(&ngram[len - num_fnd], num_fnd - 1)) > 0) &&
          (den_val >= in[len - num_fnd]) && (in[len - num_fnd] > 0)) {
        break;
      } else --num_fnd; // else backoff to lower ngram order
    }
    if(num_fnd == 1 && (in[len - 1] < 1)) // sanity check for unigrams
      num_fnd = 0;
    switch(num_fnd) { // find prob (need to refactor into precomputation)
    case 0: // OOV
      logprob = alpha_[len] + oovprob;
      break;
    case 1: // unigram found only
      CHECK(in[len - 1] > 0);
      logprob = alpha_[len - 1] + (corpusSize_ > 0 ?
                                   log10(static_cast<float>(in[len - 1]) / static_cast<float>(corpusSize_)) : 0);
      //logprob = alpha_[len - 1] +
      //log10(static_cast<float>(in[len - 1]) / static_cast<float>(corpusSize_));
      break;
    default:
      CHECK(den_val > 0);
      //if(subgram == in[len - found]) ++subgram; // avoid returning zero probs????
      logprob = alpha_[len - num_fnd] +
                log10(static_cast<float>(in[len - num_fnd]) / static_cast<float>(den_val));
      break;
    }
    // need unique context
    context = getContext(&ngram[len - num_fnd], num_fnd);
    // put whatever was found in cache
    cache_->setCacheNgram(ngram, len, logprob, context);
  } // end checkCache
  return logprob;
}
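[Aside: getProb() above is stupid-backoff-style scoring: take the relative frequency of the longest matched ngram and add a fixed per-step penalty (alpha_) for each time the context is shortened. A compact standalone sketch of the same arithmetic, with made-up counts and penalty — illustrative only, not the commit's code:]

#include <cmath>
#include <cstdio>
#include <map>
#include <string>

int main() {
  std::map<std::string, int> counts = { {"the", 1000}, {"the cat", 20}, {"cat", 50} };
  const double backoffPenalty = std::log10(0.4); // one penalty per shortened context
  const long corpusSize = 100000;
  double logprob;
  if (counts.count("the cat"))   // bigram found: log10(c("the cat") / c("the"))
    logprob = std::log10((double)counts["the cat"] / counts["the"]);
  else if (counts.count("cat"))  // back off to the unigram estimate
    logprob = backoffPenalty + std::log10((double)counts["cat"] / corpusSize);
  else                           // OOV: roughly uniform over the vocabulary
    logprob = 2 * backoffPenalty + std::log10(1.0 / corpusSize);
  std::printf("log10 p(cat|the) ~ %f\n", logprob);
  return 0;
}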
template<typename T>
const void* OnlineRLM<T>::getContext(const wordID_t* ngram, int len)
{
  int dummy(0);
  float* addresses[len]; // only interested in addresses of cache
  CHECK(cache_->getCache2(ngram, len, &addresses[0], &dummy) == len);
  // return address of cache node
  return (const void*)addresses[0];
}
template<typename T>
void OnlineRLM<T>::randDelete(int num2del)
{
  int deleted = 0;
  for(uint64_t i = 0; i < this->cells_; i++) {
    if(this->filter_->read(i) != 0) {
@ -329,18 +341,20 @@ void OnlineRLM<T>::randDelete(int num2del) {
  }
}
template<typename T>
int OnlineRLM<T>::countHits()
{
  int hit(0);
  for(uint64_t i = 0; i < this->cells_; ++i)
    if(bHit_->testBit(i)) ++hit;
  iterate(this->dict_, itr)
  if((itr->second & this->hitMask_) != 0)
    ++hit;
  cerr << "Hit count = " << hit << endl;
  return hit;
}
template<typename T>
int OnlineRLM<T>::countPrefixes()
{
  int pfx(0);
  for(uint64_t i = 0; i < this->cells_; ++i)
    if(bPrefix_->testBit(i)) ++pfx;
@ -349,22 +363,24 @@ int OnlineRLM<T>::countPrefixes() {
  return pfx;
}
template<typename T>
int OnlineRLM<T>::cleanUpHPD()
{
  cerr << "HPD size before = " << this->dict_.size() << endl;
  std::vector<string> vDel, vtmp;
  iterate(this->dict_, itr) {
    if(((itr->second & this->hitMask_) == 0) && // if not hit during testing
        (Utils::splitToStr(itr->first, vtmp, "¬") >= 3)) { // and higher order ngram
      vDel.push_back(itr->first);
    }
  }
  iterate(vDel, vitr)
  this->dict_.erase(*vitr);
  cerr << "HPD size after = " << this->dict_.size() << endl;
  return vDel.size();
}
template<typename T>
void OnlineRLM<T>::clearMarkings()
{
  cerr << "clearing all event hits\n";
  bHit_->reset();
  count_t* value(0);
@ -374,7 +390,8 @@ void OnlineRLM<T>::clearMarkings() {
  }
}
template<typename T>
void OnlineRLM<T>::save(FileHandler* fout)
{
  cerr << "Saving ORLM...\n";
  // save vocab
  vocab_->Save(fout);
@ -387,7 +404,8 @@ void OnlineRLM<T>::save(FileHandler* fout) {
  cerr << "Finished saving ORLM." << endl;
}
template<typename T>
void OnlineRLM<T>::load(FileHandler* fin)
{
  cerr << "Loading ORLM...\n";
  // load vocab first
  vocab_ = new Vocab(fin);
@ -402,12 +420,13 @@ void OnlineRLM<T>::load(FileHandler* fin) {
  PerfectHash<T>::load(fin);
}
template<typename T>
void OnlineRLM<T>::removeNonMarked()
{
  cerr << "deleting all unused events\n";
  int deleted(0);
  for(uint64_t i = 0; i < this->cells_; ++i) {
    if(!(bHit_->testBit(i) || bPrefix_->testBit(i))
        && (this->filter_->read(i) != 0)) {
      PerfectHash<T>::remove(i);
      ++deleted;
    }
@ -429,36 +448,36 @@ float OnlineRLM<T>::getProb2(const wordID_t* ngram, int len, const void** state)
  // constrain cache queries using model assumptions
  int denom_len = cache_->getCache(ngram, len - 1, &denom_codes[0], &denom_found);
  cerr << "denom_len = " << denom_len << endl;
  int num_len = cache_->getCache(&ngram[len - denom_len - 1], denom_len + 1,
                                 &num_codes[0], &found);
  cerr << "num_len= " << num_len << endl;
  // keep reducing ngram size until both denominator and numerator are found
  // allowed to leave kUnknownCode in cache because we check for this.
  found = num_len; // guaranteed to be <= denom_len + 1
  // still check for OOV
  for (int i = len - found; i < len; ++i)
    if (ngram[i] == Vocab::kOOVWordID) {
      found = len - i - 1;
    }
  // check for relative estimator
  while(found > 1) {
    if(*denom_codes[found-1] == cache_unk_ &&
        ((*denom_codes[found-1] = query(&ngram[len-found], found-1)) == 0)) {
      //!struct_->query(&ngram[len-*found], *found-1, kMainEventIdx, denom_codes[*found-1])) {
      *num_codes[found] = cache_unk_;
    } else {
      if(*num_codes[found] != cache_unk_ ||
          ((*num_codes[found] = query(&ngram[len-found], found)) <= *denom_codes[found-1]))
        // struct_->query(&ngram[len-*found], *found, kMainEventIdx,
        //               num_codes[*found], *denom_codes[*found-1]))
        break;
    }
    --found;
  }
  // didn't find bigram numerator or unigram denominator
  if (found == 1)
    found = *num_codes[1] != cache_unk_
            || ((*num_codes[1] = query(&ngram[len - 1], 1)) != 0);
  //struct_->query(&ngram[len - 1], 1, kMainEventIdx, num_codes[1]);
  // ....
  // return estimate applying correct backoff score (precomputed)
@ -469,20 +488,20 @@ float OnlineRLM<T>::getProb2(const wordID_t* ngram, int len, const void** state)
    //log_prob = stupid_backoff_log10_[len] + uniform_log10prob_;
    break;
  case 1: // unigram over whole corpus
    log_prob = alpha_[len - 1] +
               log10(static_cast<float>(*num_codes[1]) / static_cast<float>(corpusSize_));
    //log_prob = log_quantiser_->getLog10Value(*num_codes[1]) - corpus_size_log10_
    //           + stupid_backoff_log10_[len - 1]; // precomputed
    break;
  default: // otherwise use both statistics and (possibly zero) backoff weight
    log_prob = alpha_[len - found] +
               log10(static_cast<float>(*num_codes[found]) / static_cast<float>(*denom_codes[found-1]));
    //log_prob = log_quantiser_->getLog10Value(*num_codes[*found ])
    //           - log_quantiser_->getLog10Value(*denom_codes[*found - 1])
    //           + stupid_backoff_log10_[len - *found];
  }
  context_state = (const void*)num_codes[found == len ? found - 1 : found];
  //probCache_->store(len, log_prob, context_state);
  if (state)
    *state = context_state;
  return log_prob;

@ -1,10 +1,11 @@
#include "params.h"

namespace Moses
{
// parameter constants
const std::string Parameters::kNotSetValue = "__NOT_SET__";

const int Parameters::kBoolValue = 0;
const int Parameters::kIntValue = 1;
const int Parameters::kFloatValue = 2;
const int Parameters::kStringValue = 3;
@ -13,26 +14,30 @@ const int Parameters::kUndefinedValue = -1;
const std::string Parameters::kTrueValue = "1";
const std::string Parameters::kFalseValue = "0";

Parameters::Parameters(const ParamDefs * paramdefs, const count_t paramNum)
{
  initialize(paramdefs, paramNum);
}

Parameters::Parameters(int argc, char ** argv, const ParamDefs * paramdefs,
                       const count_t paramNum)
{
  initialize(paramdefs, paramNum);
  loadParams(argc, argv);
}

void Parameters::initialize(const ParamDefs * paramdefs, const count_t paramNum)
{
  for( count_t i = 0; i < paramNum; i++ ) {
    params_[paramdefs[i].name] = paramdefs[i]; // assign name
  }
  cerr << "Default parameter values:\n";
  iterate(params_, itr)
  cerr << "\t" << itr->first << " --> " << itr->second.value << endl;
}

bool Parameters::loadParams(int argc, char ** argv)
{
  // load params from commandline args
  //if( argc < 3 ) {
  //  fprintf(stderr, "ERROR: No parameters. Use \"-config\" or \"-f\" to specify configuration file.\n");
@ -66,7 +71,7 @@ bool Parameters::loadParams(int argc, char ** argv) {
  std::string val = argv[i+1];
  Utils::trim(val);
  if( param == "config" )
    load_from_file = true;
  if(!setParamValue(param, val)) {
    std::cerr << "Invalid Param name->value " << param << "->" << val << std::endl;
    return false;
@ -80,35 +85,40 @@ bool Parameters::loadParams(int argc, char ** argv) {
  return success;
}

std::string Parameters::normaliseParamName(const std::string & name)
{
  // Map valid abbreviations to long names. Retain other names.
  if( params_.find(name) == params_.end() )
    iterate(params_, i)
    if( i->second.abbrev == name )
      return i->first;
  return name;
}

int Parameters::getValueType(const std::string& name)
{
  if(params_.find(name) != params_.end())
    return params_[name].type;
  return Parameters::kUndefinedValue;
}

bool Parameters::isValidParamName(const std::string & name)
{
  return params_.find(name) != params_.end();
}

bool Parameters::setParamValue(const std::string& name, const std::string& val)
{
  // TODO: Add basic type checking w verifyValueType()
  bool set = isValidParamName(name);
  if(set) {
    params_[name].value = val;
    std::cerr << "PARAM SET: "<< name << "=" << val << std::endl;
  }
  return( set );
}
std::string Parameters::getParamValue(const std::string& name)
{
  std::string value = Parameters::kNotSetValue;
  if(isValidParamName(name))
    if(params_.find(name) != params_.end())
@ -117,43 +127,46 @@ std::string Parameters::getParamValue(const std::string& name) {
      value = kFalseValue;
  return value;
}
std::string Parameters::getParam(const std::string& name)
{
  return getParamValue(name);
  /*void* Parameters::getParam(const std::string& name) {
    void* paramVal = 0;
    int type = getValueType(name);
    const char* sval = getParamValue(name).c_str();
    switch(type) {
    case kIntValue: {
      int ival = atoi(sval);
      paramVal = (void*)&ival;
      break;
    }
    case kFloatValue: {
      float fval = atof(sval);
      paramVal = (void*)&fval;
      break;
    }
    case kStringValue: {
      paramVal = (void*)sval;
      break;
    }
    case kBoolValue: {
      bool bval = sval == Parameters::kTrueValue ? true : false;
      paramVal = (void*)&bval;
      break;
    }
    default: // --> Parameters::kUndefinedValue
      paramVal = (void*)sval; // will set to Parameters::kNotSetValue
    }
    return paramVal;*/
}
bool Parameters::verifyValueType(const std::string& name, const std::string& val)
{
  // Implement basic type checking
  return true;
}

int Parameters::getParamCount() const
{
  return params_.size();
}

@ -161,7 +174,8 @@ int Parameters::getParamCount() const {
 * HAVE TO CHANGE loadParams() from file to not overwrite command lines but
 * override default if different*/
bool Parameters::loadParams(const std::string & file_path,
                            std::set<std::string>& setParams)
{
  // parameters loaded from file don't override cmd line parameters
  /*std::set<std::string>::iterator end = setParams.end();
  FileHandler file(file_path.c_str(), std::ios::in);

@ -10,20 +10,22 @@
#include "utils.h"
#include "types.h"

#define NumOfParams(paramArray) (sizeof(paramArray)/sizeof((paramArray)[0]))

namespace Moses
{
typedef struct ParamDefs {
  std::string name;
  std::string value;
  std::string abbrev;
  int type;
  std::string description;
} ParamDefs;
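[Aside: a ParamDefs table like the one above is meant to be declared once and handed to the Parameters class; a rough usage sketch against a simplified stand-in — the mini struct and values here are illustrative, not the real API:]

#include <iostream>
#include <map>
#include <string>

struct ParamDefsLite {
  std::string name, value, abbrev;
  int type;
  std::string description;
};

int main() {
  ParamDefsLite defs[] = {
    {"order", "5", "n", 1, "LM order"},
    {"memory", "20", "m", 1, "size in MB"}
  };
  std::map<std::string, ParamDefsLite> params;
  for (size_t i = 0; i < sizeof(defs) / sizeof(defs[0]); ++i)
    params[defs[i].name] = defs[i];        // mirrors Parameters::initialize()
  std::cout << "order --> " << params["order"].value << std::endl;
  return 0;
}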

class Parameters
{
public:
  static const std::string kNotSetValue;
  static const int kBoolValue;
  static const int kIntValue;
  static const int kFloatValue;
@ -31,15 +33,15 @@ public:
  static const int kUndefinedValue;
  static const std::string kFalseValue;
  static const std::string kTrueValue;

  Parameters(const ParamDefs * paramdefs, const count_t paramNum);
  Parameters(int argc, char** argv, const ParamDefs * paramdefs, const count_t paramNum);
  ~Parameters() {}
  bool loadParams(int argc, char ** argv);
  bool loadParams(const std::string& param_file, std::set<std::string>&);
  int getValueType(const std::string & name);
  bool setParamValue(const std::string& name, const std::string& value);
  bool verifyValueType(const std::string& name, const std::string& value);
  bool isValidParamName(const std::string & name);
  std::string getParamValue(const std::string& name);
  //void* getParam(const std::string& name);

@ -8,17 +8,18 @@
#include "RandLMFilter.h"
#include "quantizer.h"
/*
 * PerfectHash handles setting up hash functions and storage
 * for LM data.
 */
using randlm::Filter;
using randlm::BitFilter;
typedef std::map<string, count_t> hpDict_t;
typedef hpDict_t::iterator hpdEntry_t;
static count_t collisions_ = 0;
/* Based on Mortenson et al. 2006 */
template<typename T>
class PerfectHash
{
public:
  PerfectHash(uint16_t MBs, int width, int bucketRange, float qBase);
  PerfectHash(FileHandler* fin) {
@ -39,11 +40,11 @@ protected:
  uint8_t* idxTracker_;
  uint64_t insert(const wordID_t* IDs, const int len, const count_t value);
  bool update(const wordID_t* IDs, const int len, const count_t value,
              hpdEntry_t& hpdAddr, uint64_t& filterIdx);
  bool update2(const wordID_t* IDs, const int len, const count_t value,
               hpdEntry_t& hpdAddr, uint64_t& filterIdx);
  int query(const wordID_t* IDs, const int len,
            hpdEntry_t& hpdAddr, uint64_t& filterIdx);
  virtual void remove(const wordID_t* IDs, const int len);
  void remove(uint64_t index);
  void save(FileHandler* fout);
@ -52,32 +53,33 @@ protected:
  //pointer to a specific entry in a hpDict_t
  virtual void markQueried(hpdEntry_t&)=0;
private:
  T nonZeroSignature(const wordID_t* IDs, const int len, count_t bucket);
  string hpDictKeyValue(const wordID_t* IDs, const int len);
  uint64_t memBound_; // total memory bound in bytes
  uint16_t cellWidth_; // in bits
  UnivHash_linear<count_t>* bucketHash_;
  UnivHash_linear<T>* fingerHash_;
  LogQtizer* qtizer_;
};
template<typename T>
PerfectHash<T>::PerfectHash(uint16_t MBs, int width, int bucketRange,
                            float qBase): hitMask_(1 << 31), memBound_(MBs * (1ULL << 20)),
  cellWidth_(width)
{
  bucketRange_ = static_cast<uint8_t>(bucketRange);
  if(bucketRange > 255) {
    cerr << "ERROR: Max bucket range is > 2^8\n";
    exit(1);
  }
  qtizer_ = new LogQtizer(qBase);
  int valBits = (int)ceil(log2((float)qtizer_->maxcode()));
  cerr << "BITS FOR VALUES ARRAY = " << valBits << endl;
  uint64_t totalBits = memBound_ << 3;
  cells_ = (uint64_t) ceil((float)totalBits / (float)(cellWidth_ + valBits)); // upper bound on cells
  cells_ += (cells_ % bucketRange_); // make cells a multiple of the bucket range
  totBuckets_ = (cells_ / bucketRange_) - 1; // minus 1 so totBuckets * bucksize + bucksize = cells
  filter_ = new Filter<T>(cells_, cellWidth_);
  values_ = new Filter<T>(cells_, valBits);
  idxTracker_ = new uint8_t[totBuckets_];
  for(int i=0; i < totBuckets_; ++i) idxTracker_[i] = 0;
  // initialize ranges for each hash function
@ -85,7 +87,8 @@ PerfectHash<T>::PerfectHash(uint16_t MBs, int width, int bucketRange,
  fingerHash_ = new UnivHash_linear<T>(pow(2.0f, cellWidth_), MAX_HASH_FUNCS, PRIME);
}
template<typename T>
PerfectHash<T>::~PerfectHash()
{
  delete[] idxTracker_;
  delete filter_;
  filter_ = NULL;
@ -94,22 +97,22 @@ PerfectHash<T>::~PerfectHash() {
  delete qtizer_;
  delete values_;
}
template<typename T>
uint64_t PerfectHash<T>::insert(const wordID_t* IDs, const int len,
                                const count_t value)
{
  count_t bucket = (bucketHash_->size() > 1 ? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len));
  if(idxTracker_[bucket] < (int)bucketRange_) { // if empty rows
    // restriction on fprint value is non-zero
    T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
    uint64_t emptyidx = cells_ + 1;
    uint64_t index = bucket * bucketRange_, // starting bucket row
             lastrow = index + bucketRange_; // ending row
    while(index < lastrow) { // unique so check each row for "matching" signature
      T filterVal = filter_->read(index);
      if((filterVal == 0) && (emptyidx == cells_ + 1)) { // record first empty row
        emptyidx = index;
      } else if(filterVal == fp) {
        ++collisions_;
        dict_[hpDictKeyValue(IDs, len)] = value; // store exact in hpd
        return cells_ + 1; // finished
@ -122,20 +125,20 @@ uint64_t PerfectHash<T>::insert(const wordID_t* IDs, const int len,
    values_->write(emptyidx, code);
    ++idxTracker_[bucket]; // keep track of bucket size
    return emptyidx;
  } else { // bucket is full
    dict_[hpDictKeyValue(IDs, len)] = value; // add to hpd
    return cells_ + 1;
  }
}
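[Aside: insert() above stores only a short fingerprint per ngram inside a fixed bucket of rows, spilling exact strings to dict_ on a fingerprint clash or a full bucket. A toy version of just the row-probing logic over plain arrays — hypothetical layout, not the Filter<T> API:]

#include <cstdint>
#include <cstdio>

const int kBucketRange = 4;              // rows per bucket
uint8_t filter[8 * kBucketRange] = {0};  // fingerprint cells, 0 = empty

// Returns the row used, or -1 when the bucket is full or the fingerprint clashes
// (the real code then falls back to an exact-match dictionary).
int bucketInsert(int bucket, uint8_t fp) {
  int first = bucket * kBucketRange, empty = -1;
  for (int row = first; row < first + kBucketRange; ++row) {
    if (filter[row] == 0 && empty == -1) empty = row;  // remember first empty row
    else if (filter[row] == fp) return -1;             // soft collision
  }
  if (empty == -1) return -1;                          // bucket full
  filter[empty] = fp;
  return empty;
}

int main() {
  std::printf("row = %d\n", bucketInsert(2, 0x5A));
  return 0;
}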
template<typename T>
bool PerfectHash<T>::update(const wordID_t* IDs, const int len,
                            const count_t value, hpdEntry_t& hpdAddr, uint64_t& filterIdx)
{
  // check if key is in high perf. dictionary
  filterIdx = cells_ + 1;
  string skey = hpDictKeyValue(IDs, len);
  if((hpdAddr = dict_.find(skey)) != dict_.end()) {
    hpdAddr->second = value;
    return true;
  }
  // else hash ngram
@ -144,66 +147,67 @@ bool PerfectHash<T>::update(const wordID_t* IDs, const int len,
  // restriction on fprint value is non-zero
  T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
  uint64_t index = bucket * bucketRange_, // starting bucket row
           lastrow = index + bucketRange_;
  while(index < lastrow) { // must check each row for matching fp event
    T filterVal = filter_->read(index);
    if(filterVal == fp) { // found event w.h.p.
      values_->write(index, (T)qtizer_->code(value));
      filterIdx = index;
      return true;
    }
    ++index;
  }
  // could add if it gets here.
  return false;
}
template<typename T>
int PerfectHash<T>::query(const wordID_t* IDs, const int len,
                          hpdEntry_t& hpdAddr, uint64_t& filterIdx)
{
  // check if key is in high perf. dictionary
  string skey = hpDictKeyValue(IDs, len);
  if((hpdAddr = dict_.find(skey)) != dict_.end()) {
    filterIdx = cells_ + 1;
    return(hpdAddr->second); // returns copy of value
  } else { // check if key is in filter
    // get bucket
    //count_t bucket = bucketHash_->hash(IDs, len);
    count_t bucket = (bucketHash_->size() > 1 ? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len));
    // restriction on fprint value is non-zero
    T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
    // return value if ngram is in filter
    uint64_t index = bucket * bucketRange_,
             lastrow = index + bucketRange_;
    for(; index < lastrow; ++index) {
      if(filter_->read(index) == fp) {
        //cout << "fp = " << fp << "\tbucket = " << bucket << "\tfilter =" <<
        //filter_->read(index) << "\tcode = " << code << endl;
        filterIdx = index;
        hpdAddr = dict_.end();
        return (int)qtizer_->value(values_->read(index));
      }
    }
  }
  return -1;
}
template<typename T>
void PerfectHash<T>::remove(const wordID_t* IDs, const int len)
{
  // delete key if in high perf. dictionary
  string skey = hpDictKeyValue(IDs, len);
  if(dict_.find(skey) != dict_.end())
    dict_.erase(skey);
  else { // check if key is in filter
    // get small representation for ngrams
    //count_t bucket = bucketHash_->hash(IDs, len);
    count_t bucket = (bucketHash_->size() > 1? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len));
    // retrieve non zero fingerprint for ngram
    T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
    // return value if ngram is in filter
    uint64_t index = bucket * bucketRange_,
             lastrow = index + bucketRange_;
    for(; index < lastrow; ++index) {
      if(filter_->read(index) == fp) {
        filter_->write(index, 0);
        values_->write(index, 0);
        --idxTracker_[bucket]; // track bucket size reduction
@ -213,7 +217,8 @@ void PerfectHash<T>::remove(const wordID_t* IDs, const int len) {
  }
}
template<typename T> // clear filter index
void PerfectHash<T>::remove(uint64_t index)
{
  CHECK(index < cells_);
  CHECK(filter_->read(index) != 0); // slow
  filter_->write(index, 0);
@ -224,19 +229,21 @@ void PerfectHash<T>::remove(uint64_t index) {
}
template<typename T>
T PerfectHash<T>::nonZeroSignature(const wordID_t* IDs, const int len,
                                   count_t bucket)
{
  count_t h = bucket;
  T fingerprint(0);
  do {
    fingerprint = fingerHash_->hash(IDs, len, h);
    h += (h < fingerHash_->size() - 1 ? 1 : -h); // wrap around
  } while((fingerprint == 0) && (h != bucket));
  if(fingerprint == 0)
    cerr << "WARNING: Unable to find non-zero signature for ngram\n" << endl;
  return fingerprint;
}
template<typename T>
string PerfectHash<T>::hpDictKeyValue(const wordID_t* IDs, const int len)
{
  string skey(" ");
  for(int i = 0; i < len; ++i)
    skey += Utils::IntToStr(IDs[i]) + "¬";
@ -244,17 +251,20 @@ string PerfectHash<T>::hpDictKeyValue(const wordID_t* IDs, const int len) {
  return skey;
}
template<typename T>
count_t PerfectHash<T>::hpDictMemUse()
{
  // return hpDict memory usage in MBs
  return (count_t) sizeof(hpDict_t::value_type)* dict_.size() >> 20;
}
template<typename T>
count_t PerfectHash<T>::bucketsMemUse()
{
  // return bucket memory usage in MBs
  return (count_t) (filter_->size() + values_->size());
}
template<typename T>
void PerfectHash<T>::save(FileHandler* fout)
{
  CHECK(fout != 0);
  cerr << "\tSaving perfect hash parameters...\n";
  fout->write((char*)&hitMask_, sizeof(hitMask_));
@ -275,11 +285,12 @@ void PerfectHash<T>::save(FileHandler* fout) {
  count_t size = dict_.size();
  fout->write((char*)&size, sizeof(count_t));
  *fout << endl;
  iterate(dict_, t)
  *fout << t->first << "\t" << t->second << "\n";
}
template<typename T>
void PerfectHash<T>::load(FileHandler* fin)
{
  CHECK(fin != 0);
  cerr << "\tLoading perfect hash parameters...\n";
  fin->read((char*)&hitMask_, sizeof(hitMask_));
@ -315,12 +326,13 @@ void PerfectHash<T>::load(FileHandler* fin) {
  cerr << "Finished loading ORLM." << endl;
}
template<typename T>
void PerfectHash<T>::analyze()
{
  cerr << "Analyzing Dynamic Bloomier Filter...\n";
  // see how many items in each bucket
  uint8_t* bucketCnt = new uint8_t[totBuckets_];
  unsigned largestBucket = 0, totalCellsSet = 0,
           smallestBucket = bucketRange_, totalZeroes = 0;
  int curBucket = -1, fullBuckets(0);
  for(int i = 0; i < totBuckets_; ++i) bucketCnt[i] = 0;
  for(uint64_t i =0; i < cells_; ++i) {
@ -328,16 +340,14 @@ void PerfectHash<T>::analyze() {
    if(filter_->read(i) != 0) {
      ++bucketCnt[curBucket];
      ++totalCellsSet;
    } else ++totalZeroes;
  }
  count_t bi = 0, si = 0;
  for(int i = 0; i < totBuckets_; ++i) {
    if(bucketCnt[i] > largestBucket) {
      largestBucket = bucketCnt[i];
      bi = i;
    } else if(bucketCnt[i] < smallestBucket) {
      smallestBucket = bucketCnt[i];
      si = i;
    }
@ -350,8 +360,8 @@ void PerfectHash<T>::analyze() {
  }
  for(int i = 0; i < totBuckets_; ++i) {
    if(bucketCnt[i] != idxTracker_[i])
      cerr << "bucketCnt[" << i << "] = " << (int)bucketCnt[i] <<
           "\tidxTracker_[" << i << "] = " << (int)idxTracker_[i] << endl;
  }
  cerr << "total cells= " << cells_ << endl;
  cerr << "total buckets= " << totBuckets_ << endl;
@ -364,7 +374,7 @@ void PerfectHash<T>::analyze() {
  cerr << "largest bucket (" << bi << ") size= " << largestBucket << endl;
  cerr << "smallest bucket (" << si << ") size= " << smallestBucket << endl;
  cerr << "last bucket size= " << (int)bucketCnt[totBuckets_ - 1] <<
       " (idxTracker last bucket size = " << (int)idxTracker_[totBuckets_ - 1] << ")" << endl;
  cerr << "total buckets full = " << fullBuckets << endl;
  cerr << "total collision errors= " << collisions_ << endl;
  cerr << "high performance dictionary size= " << dict_.size() << endl;
@ -373,14 +383,15 @@ void PerfectHash<T>::analyze() {
  cerr << "values MBs= " << values_->size() << endl;
  delete[] bucketCnt;
}
template<typename T>
bool PerfectHash<T>::update2(const wordID_t* IDs, const int len,
                             const count_t value, hpdEntry_t& hpdAddr, uint64_t& filterIdx)
{
  // check if key is in high perf. dictionary
  filterIdx = cells_ + 1;
  string skey = hpDictKeyValue(IDs, len);
  if((hpdAddr = dict_.find(skey)) != dict_.end()) {
    hpdAddr->second += value;
    return true;
  }
  // else hash ngram
@ -389,18 +400,18 @@ bool PerfectHash<T>::update2(const wordID_t* IDs, const int len,
  // restriction on fprint value is non-zero
  T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
  uint64_t index = bucket * bucketRange_, // starting bucket row
           lastrow = index + bucketRange_;
  while(index < lastrow) { // must check each row for matching fp event
    T filterVal = filter_->read(index);
    if(filterVal == fp) { // found event w.h.p.
      int oldval = (int)qtizer_->value(values_->read(index));
      values_->write(index, (T)qtizer_->code(oldval + value));
      filterIdx = index;
      return true;
    }
    ++index;
  }
  // add if it gets here.
  insert(IDs, len, value);
  return false;
}

@ -8,7 +8,8 @@
#include "types.h"

static const float kFloatErr = 0.00001f;
class LogQtizer
{
public:
  LogQtizer(float i): base_(pow(2, 1 / i)) {
    CHECK(base_ > 1);
@ -16,8 +17,8 @@ public:
    float value = 1; // code = 1 -> value = 1 for any base
    std::vector<float> code_to_value_vec;
    while (log2(value) < 30) { // assume 2^30 is largest count
      code_to_value_vec.push_back(value);
      value = pow(base_, ++max_code_);
    }
    code_to_value_vec.push_back(value); // store max_code_ so in total [0, max_code_]
    // get valid range
@ -40,22 +41,22 @@ public:
  int code(float value) {
    // should just be: return log_b(value)
    CHECK(!(value < min_value_ || value > max_value_));
    // but binary search removes errors due to floor operator above
    int code = static_cast<int>(std::lower_bound(code_to_value_, code_to_value_+ max_code_,
                                value) - code_to_value_);
    // make sure not overestimating
    code = code_to_value_[code] > value ? code - 1 : code;
    return code;
  }
  inline float value(int code) {
    // table look up for values
    return code_to_value_[code];
  }
  inline int maxcode() {
    return max_code_;
  }
  inline float logValue(int code) {
    // table look up for log of values
    return code_to_log_value_[code];
  }
  ~LogQtizer() {
@ -69,15 +70,15 @@ public:
    fout->write((char*)&min_value_, sizeof(min_value_));
    for (int j = 0; j <= max_code_; ++j)
      fout->write((char*)&code_to_value_[j], sizeof(code_to_value_[j]));
    for (int j = 0; j <= max_code_; ++j)
      fout->write((char*)&code_to_log_value_[j], sizeof(code_to_log_value_[j]));
    std::cerr << "Saved log codebook with " << max_code_ + 1 << " codes." << std::endl;
  }
private:
  float base_;
  float* code_to_value_;
  float* code_to_log_value_;
  int max_code_;
  float max_value_;
  float min_value_;
  void load(FileHandler* fin) {
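[Aside: LogQtizer maps counts onto small integer codes along powers of a base chosen so that roughly i codes cover each doubling, trading precision for bits. A standalone sketch of the code/value round trip — same idea, simplified bounds, not the class above:]

#include <cmath>
#include <cstdio>

int main() {
  const double base = std::pow(2.0, 1.0 / 4.0);  // i = 4 -> 4 codes per power of 2
  const int count = 1000;
  // encode: code = floor(log_base(count)); decode: base^code, always <= count
  int code = (int)std::floor(std::log(count) / std::log(base));
  double approx = std::pow(base, code);
  std::printf("count=%d code=%d approx=%.1f\n", count, code, approx);
  return 0;
}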

@ -103,10 +103,11 @@ bool Vocab::Load(const std::string & vocab_path, const FactorDirection& directio
  std::cerr << "Loading vocab from " << vocab_path << std::endl;
  return Load(&vcbin, direction, factors, closed);
}
bool Vocab::Load(FileHandler* vcbin)
{
  FactorList factors;
  factors.push_back(0);
  return Load(vcbin, Input, factors);
}
bool Vocab::Load(FileHandler* vcbin, const FactorDirection& direction,
                 const FactorList& factors, bool closed)

@ -74,12 +74,12 @@ int DynSuffixArray::F_firstIdx(unsigned word)
  // return index of first row where word is found in m_F
  /*for(int i=0; i < m_F->size(); ++i) {
    if(m_F->at(i) == word) {
      return i;
    }
  }
  return -1;*/
  //NOTE: lower_bound is faster than the linear search above but may cause issues
  //      if the ordering of the vocab is not consecutive (i.e. after deletions)
  int low = std::lower_bound(m_F->begin(), m_F->end(), word) - m_F->begin();
  //cerr << "in F_firstIdx with word = " << word << " and low = " << low << " and F->size() =" << m_F->size() << endl;
  if(low >= m_F->size())
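[Aside: std::lower_bound on the sorted F column returns the first row whose symbol is >= word, so a hit must still be verified, as the bounds check above does. A minimal standalone check of that pattern, on toy data:]

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  std::vector<unsigned> F = {1, 1, 2, 4, 4, 4, 7};  // sorted first column
  unsigned word = 4;
  size_t low = std::lower_bound(F.begin(), F.end(), word) - F.begin();
  if (low < F.size() && F[low] == word)
    std::printf("first index of %u is %zu\n", word, low);  // prints 3
  else
    std::printf("%u not present\n", word);
  return 0;
}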
@ -146,8 +146,8 @@ void DynSuffixArray::Reorder(unsigned j, unsigned jprime)
{
  set<pair<unsigned, unsigned> > seen;
  while(j != jprime) {
    // this 'seenit' check added for data with many loops. will remove after double
    // checking.
    bool seenit = seen.insert(std::make_pair(j, jprime)).second;
    if(seenit) {
      for(int i=1; i < m_SA->size(); ++i) {
@ -163,9 +163,9 @@ void DynSuffixArray::Reorder(unsigned j, unsigned jprime)
    int new_j = LastFirstFunc(j);
    CHECK(j <= jprime);
    // for SA and L, the element at pos j is moved to pos j'
    m_L->insert(m_L->begin() + jprime + 1, m_L->at(j));
    m_L->erase(m_L->begin() + j);
    m_SA->insert(m_SA->begin() + jprime + 1, m_SA->at(j));
    m_SA->erase(m_SA->begin() + j);
    // all ISA values between (j...j'] decremented
    for(size_t i = 0; i < m_ISA->size(); ++i) {

@ -33,9 +33,9 @@ namespace Moses
class FactorFriend;
class FactorCollection;

/** Represents a factor (word, POS, etc).
 *
 * A Factor has a contiguous identifier and string value.
 */
class Factor
{
@ -45,17 +45,17 @@ class Factor
  friend class FactorCollection;
  friend class FactorFriend;

  // FactorCollection writes here.
  std::string m_string;
  size_t m_id;

  //! protected constructor. only friend class, FactorCollection, is allowed to create Factor objects
  Factor() {}

  // Needed for STL containers. They'll delegate through FactorFriend, which is never exposed publicly.
  Factor(const Factor &factor) : m_string(factor.m_string), m_id(factor.m_id) {}

  // Not implemented. Shouldn't be called.
  Factor &operator=(const Factor &factor);

public:

@ -33,7 +33,7 @@ FactorCollection FactorCollection::s_instance;

const Factor *FactorCollection::AddFactor(const StringPiece &factorString)
{
  // Sorry this is so complicated. Can't we just require everybody to use Boost >= 1.42? The issue is that I can't check BOOST_VERSION unless we have Boost.
#ifdef WITH_THREADS
#if BOOST_VERSION < 104200
  FactorFriend to_ins;
@ -42,7 +42,7 @@ const Factor *FactorCollection::AddFactor(const StringPiece &factorString)
  {
    boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
#if BOOST_VERSION >= 104200
    // If this line doesn't compile, upgrade your Boost.
    Set::const_iterator i = m_set.find(factorString, HashFactor(), EqualsFactor());
#else // BOOST_VERSION
    Set::const_iterator i = m_set.find(to_ins);

@ -47,7 +47,7 @@ namespace Moses
 * private and friended to FactorFriend. The STL containers can delegate
 * copying, so friending the container isn't sufficient. STL containers see
 * FactorFriend's public copy constructor and everybody else sees Factor's
 * private copy constructor.
 */
struct FactorFriend {
  Factor in;

@ -30,20 +30,24 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

using namespace std;

namespace Moses
{

LanguageModel::LanguageModel()
{
  m_enableOOVFeature = StaticData::Instance().GetLMEnableOOVFeature();
}

void LanguageModel::Init(ScoreIndexManager &scoreIndexManager)
{
  scoreIndexManager.AddScoreProducer(this);
}

LanguageModel::~LanguageModel() {}

// don't inline virtual funcs...
size_t LanguageModel::GetNumScoreComponents() const
{
  if (m_enableOOVFeature) {
    return 2;
  } else {
@ -51,13 +55,15 @@ size_t LanguageModel::GetNumScoreComponents() const {
  }
}

float LanguageModel::GetWeight() const
{
  size_t lmIndex = StaticData::Instance().GetScoreIndexManager().
                   GetBeginIndex(GetScoreBookkeepingID());
  return StaticData::Instance().GetAllWeights()[lmIndex];
}

float LanguageModel::GetOOVWeight() const
{
  if (!m_enableOOVFeature) return 0;
  size_t lmIndex = StaticData::Instance().GetScoreIndexManager().
                   GetBeginIndex(GetScoreBookkeepingID());

@ -35,7 +35,8 @@ class Phrase;
class ScoreIndexManager;

//! Abstract base class which represents a language model on a contiguous phrase
class LanguageModel : public StatefulFeatureFunction
{
protected:
  LanguageModel();

@ -43,11 +44,11 @@ protected:
  void Init(ScoreIndexManager &scoreIndexManager);

  bool m_enableOOVFeature;

public:
  virtual ~LanguageModel();

  // Make another feature without copying the underlying model data.
  virtual LanguageModel *Duplicate(ScoreIndexManager &scoreIndexManager) const = 0;

  //! see ScoreProducer.h

@ -10,10 +10,12 @@
namespace Moses
{

LanguageModelDMapLM::LanguageModelDMapLM() : m_lm(0)
{
}

LanguageModelDMapLM::~LanguageModelDMapLM()
{
  delete m_lm;
}

@ -51,8 +53,8 @@ void LanguageModelDMapLM::CreateFactor(FactorCollection& factorCollection)
}

LMResult LanguageModelDMapLM::GetValueGivenState(
  const std::vector<const Word*>& contextFactor,
  FFState& state) const
{
  DMapLMState& cast_state = static_cast<DMapLMState&>(state);
  LMResult result;
@ -65,8 +67,8 @@ LMResult LanguageModelDMapLM::GetValueGivenState(
}

LMResult LanguageModelDMapLM::GetValueForgotState(
  const std::vector<const Word*>& contextFactor,
  FFState& outState) const
{
  DMapLMState& cast_state = static_cast<DMapLMState&>(outState);
  LMResult result;
@ -78,13 +80,13 @@ LMResult LanguageModelDMapLM::GetValueForgotState(
}

float LanguageModelDMapLM::GetValue(
  const std::vector<const Word*>& contextFactor,
  size_t target_order,
  size_t* succeeding_order) const
{
  FactorType factorType = GetFactorType();
  float score;

  std::string ngram_string("");
  ngram_string.append(((*contextFactor[0])[factorType])->GetString());
  for (size_t i = 1; i < contextFactor.size(); ++i) {
@ -97,38 +99,44 @@ float LanguageModelDMapLM::GetValue(
  return score;
}
|
||||
const FFState* LanguageModelDMapLM::GetNullContextState() const {
|
||||
DMapLMState* state = new DMapLMState();
|
||||
state->m_last_succeeding_order = GetNGramOrder();
|
||||
return state;
|
||||
const FFState* LanguageModelDMapLM::GetNullContextState() const
|
||||
{
|
||||
DMapLMState* state = new DMapLMState();
|
||||
state->m_last_succeeding_order = GetNGramOrder();
|
||||
return state;
|
||||
}
|
||||
|
||||
FFState* LanguageModelDMapLM::GetNewSentenceState() const {
|
||||
DMapLMState* state = new DMapLMState();
|
||||
state->m_last_succeeding_order = GetNGramOrder();
|
||||
return state;
|
||||
FFState* LanguageModelDMapLM::GetNewSentenceState() const
|
||||
{
|
||||
DMapLMState* state = new DMapLMState();
|
||||
state->m_last_succeeding_order = GetNGramOrder();
|
||||
return state;
|
||||
}
|
||||
|
||||
const FFState* LanguageModelDMapLM::GetBeginSentenceState() const {
|
||||
DMapLMState* state = new DMapLMState();
|
||||
state->m_last_succeeding_order = GetNGramOrder();
|
||||
return state;
|
||||
const FFState* LanguageModelDMapLM::GetBeginSentenceState() const
|
||||
{
|
||||
DMapLMState* state = new DMapLMState();
|
||||
state->m_last_succeeding_order = GetNGramOrder();
|
||||
return state;
|
||||
}
|
||||
|
||||
FFState* LanguageModelDMapLM::NewState(const FFState* state) const {
|
||||
DMapLMState* new_state = new DMapLMState();
|
||||
const DMapLMState* cast_state = static_cast<const DMapLMState*>(state);
|
||||
new_state->m_last_succeeding_order = cast_state->m_last_succeeding_order;
|
||||
return new_state;
|
||||
FFState* LanguageModelDMapLM::NewState(const FFState* state) const
|
||||
{
|
||||
DMapLMState* new_state = new DMapLMState();
|
||||
const DMapLMState* cast_state = static_cast<const DMapLMState*>(state);
|
||||
new_state->m_last_succeeding_order = cast_state->m_last_succeeding_order;
|
||||
return new_state;
|
||||
}
|
||||
|
||||
void LanguageModelDMapLM::CleanUpAfterSentenceProcessing() {
|
||||
void LanguageModelDMapLM::CleanUpAfterSentenceProcessing()
|
||||
{
|
||||
m_lm->printStats();
|
||||
m_lm->resetStats();
|
||||
m_lm->clearCaches();
|
||||
}
|
||||
|
||||
void LanguageModelDMapLM::InitializeBeforeSentenceProcessing() {
|
||||
void LanguageModelDMapLM::InitializeBeforeSentenceProcessing()
|
||||
{
|
||||
}
|
||||
|
||||
} // namespace Moses
|
||||
|
@ -12,20 +12,22 @@
#include "LM/SingleFactor.h"
#include "Util.h"

namespace Moses {
namespace Moses
{

class DMapLMState : public FFState {
class DMapLMState : public FFState
{
public:
int Compare(const FFState &o) const {
const DMapLMState& cast_other = static_cast<const DMapLMState&>(o);
if (cast_other.m_last_succeeding_order < m_last_succeeding_order)
return -1;
else if (cast_other.m_last_succeeding_order > m_last_succeeding_order)
return 1;
else
return 0;
}
uint8_t m_last_succeeding_order;
int Compare(const FFState &o) const {
const DMapLMState& cast_other = static_cast<const DMapLMState&>(o);
if (cast_other.m_last_succeeding_order < m_last_succeeding_order)
return -1;
else if (cast_other.m_last_succeeding_order > m_last_succeeding_order)
return 1;
else
return 0;
}
uint8_t m_last_succeeding_order;
};

class LanguageModelDMapLM : public LanguageModelSingleFactor

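The Compare() method on DMapLMState above is what the decoder uses to decide whether two hypotheses carry interchangeable language-model state and can be recombined. A minimal self-contained sketch of that contract, with ToyState as a hypothetical stand-in that is not part of this diff:

#include <cstdint>

// Hedged sketch: a state comparable like DMapLMState above.
// Compare() must impose a consistent ordering; 0 means "equal enough
// to recombine", so states that compare equal must be interchangeable
// for all future scoring.
struct ToyState {
  uint8_t m_last_succeeding_order;

  int Compare(const ToyState &o) const {
    if (o.m_last_succeeding_order < m_last_succeeding_order) return -1;
    if (o.m_last_succeeding_order > m_last_succeeding_order) return 1;
    return 0;
  }
};

Returning 0 signals "equal for all future scoring", so only information that can affect upcoming n-gram scores should enter the comparison.
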
@ -69,7 +69,7 @@ bool LanguageModelIRST::Load(const std::string &filePath,
m_filePath = filePath;


m_lmtb = m_lmtb->CreateLanguageModel(m_filePath);
m_lmtb = m_lmtb->CreateLanguageModel(m_filePath);
m_lmtb->setMaxLoadedLevel(1000);
m_lmtb->load(m_filePath);
d=m_lmtb->getDict();
@ -140,7 +140,7 @@ int LanguageModelIRST::GetLmID( const std::string &str ) const
}

int LanguageModelIRST::GetLmID( const Factor *factor ) const
{
{
size_t factorId = factor->GetId();

if ((factorId >= m_lmIdLookup.size()) || (m_lmIdLookup[factorId] == m_empty)) {
@ -150,12 +150,12 @@ int LanguageModelIRST::GetLmID( const Factor *factor ) const

//////////
///since there is no distinction between the factorIDs of source words
///and of target words in Moses, it can happen that a target word
///and of target words in Moses, it can happen that a target word
///whose target code has not yet been computed nevertheless has
///a known factorID (and hence one smaller than m_lmIdLookup.size())
///It is therefore necessary to identify these cases where the target
///code is undetermined. Currently, this check is implemented
///by setting to m_empty all terms that have not yet
///by setting to m_empty all terms that have not yet
//received an actual target code
///////////

@ -167,7 +167,7 @@ int LanguageModelIRST::GetLmID( const Factor *factor ) const
/// AT POSITION (factorID-1) instead of at position factorID where we later read it (see case C
/// That way it works ....
/// I have a doubt about what sits in the first positions of m_lmIdLookup
/// so
/// so
/// and I find that every other entry stays empty
/// because factorID grows in steps of two (because it encodes both source and target), "emptying" position (factorID-1)
/// this causes no correctness problems, only a "waste" of memory
@ -177,10 +177,10 @@ int LanguageModelIRST::GetLmID( const Factor *factor ) const
////////////////


if (factorId >= m_lmIdLookup.size()){
//resize and fill with m_empty
//increment the array more than needed to avoid too many resizing operation.
m_lmIdLookup.resize(factorId+10, m_empty);
if (factorId >= m_lmIdLookup.size()) {
//resize and fill with m_empty
//increment the array more than needed to avoid too many resizing operation.
m_lmIdLookup.resize(factorId+10, m_empty);
}

//insert new code

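The resize in the hunk above grows the factor-id lookup table past the requested index and pre-fills the new slots with m_empty, so repeated one-element growth is avoided. A rough sketch of the same pattern, with hypothetical names (table, kEmpty) rather than the diff's own:

#include <vector>

// Hedged sketch of the lookup-growth pattern above: grow the
// factor-id -> LM-id table beyond the requested index and pre-fill
// with a sentinel, so most later insertions need no resize.
static const int kEmpty = -1;

void EnsureSlot(std::vector<int> &table, std::size_t factorId) {
  if (factorId >= table.size()) {
    // over-allocate a little to avoid resizing on every new factor
    table.resize(factorId + 10, kEmpty);
  }
}
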
@ -68,8 +68,9 @@ void LanguageModelImplementation::GetState(
GetValueForgotState(contextFactor, state);
}

// Calculate score of a phrase.
void LanguageModelImplementation::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const {
// Calculate score of a phrase.
void LanguageModelImplementation::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const
{
fullScore = 0;
ngramScore = 0;

@ -81,7 +82,7 @@ void LanguageModelImplementation::CalcScore(const Phrase &phrase, float &fullSco
vector<const Word*> contextFactor;
contextFactor.reserve(GetNGramOrder());
std::auto_ptr<FFState> state(NewState((phrase.GetWord(0) == GetSentenceStartArray()) ?
GetBeginSentenceState() : GetNullContextState()));
GetBeginSentenceState() : GetNullContextState()));
size_t currPos = 0;
while (currPos < phraseSize) {
const Word &word = phrase.GetWord(currPos);
@ -108,7 +109,7 @@ void LanguageModelImplementation::CalcScore(const Phrase &phrase, float &fullSco
fullScore += result.score;
if (contextFactor.size() == GetNGramOrder())
ngramScore += result.score;
if (result.unknown) ++oovCount;
if (result.unknown) ++oovCount;
}
}

@ -116,7 +117,8 @@ void LanguageModelImplementation::CalcScore(const Phrase &phrase, float &fullSco
}
}

FFState *LanguageModelImplementation::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out, const LanguageModel *feature) const {
FFState *LanguageModelImplementation::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out, const LanguageModel *feature) const
{
// In this function, we only compute the LM scores of n-grams that overlap a
// phrase boundary. Phrase-internal scores are taken directly from the
// translation option.
@ -178,9 +180,7 @@ FFState *LanguageModelImplementation::Evaluate(const Hypothesis &hypo, const FFS
contextFactor[i] = &hypo.GetWord((size_t)currPos);
}
lmScore += GetValueForgotState(contextFactor, *res).score;
}
else
{
} else {
if (endPos < currEndPos) {
//need to get the LM state (otherwise the last LM state is fine)
for (size_t currPos = endPos+1; currPos <= currEndPos; currPos++) {
@ -207,10 +207,11 @@ FFState *LanguageModelImplementation::Evaluate(const Hypothesis &hypo, const FFS
return res;
}

namespace {
namespace
{

// This is the FFState used by LanguageModelImplementation::EvaluateChart.
// Though svn blame goes back to heafield, don't blame me. I just moved this from LanguageModelChartState.cpp and ChartHypothesis.cpp.
// This is the FFState used by LanguageModelImplementation::EvaluateChart.
// Though svn blame goes back to heafield, don't blame me. I just moved this from LanguageModelChartState.cpp and ChartHypothesis.cpp.
class LanguageModelChartState : public FFState
{
private:
@ -223,12 +224,11 @@ private:

const ChartHypothesis &m_hypo;

/** Construct the prefix string of up to specified size
/** Construct the prefix string of up to specified size
* \param ret prefix string
* \param size maximum size (typically max lm context window)
*/
size_t CalcPrefix(const ChartHypothesis &hypo, int featureID, Phrase &ret, size_t size) const
{
size_t CalcPrefix(const ChartHypothesis &hypo, int featureID, Phrase &ret, size_t size) const {
const TargetPhrase &target = hypo.GetCurrTargetPhrase();
const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
target.GetAlignmentInfo().GetNonTermIndexMap();
@ -257,13 +257,12 @@ private:
return size;
}

/** Construct the suffix phrase of up to specified size
/** Construct the suffix phrase of up to specified size
* will always be called after the construction of prefix phrase
* \param ret suffix phrase
* \param size maximum size of suffix
*/
size_t CalcSuffix(const ChartHypothesis &hypo, int featureID, Phrase &ret, size_t size) const
{
size_t CalcSuffix(const ChartHypothesis &hypo, int featureID, Phrase &ret, size_t size) const {
CHECK(m_contextPrefix.GetSize() <= m_numTargetTerminals);

// special handling for small hypotheses
@ -292,8 +291,7 @@ private:
size_t nonTermInd = nonTermIndexMap[pos];
const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermInd);
size = static_cast<const LanguageModelChartState*>(prevHypo->GetFFState(featureID))->CalcSuffix(*prevHypo, featureID, ret, size);
}
else {
} else {
ret.PrependWord(hypo.GetCurrTargetPhrase().GetWord(pos));
size--;
}
@ -309,11 +307,10 @@ private:

public:
LanguageModelChartState(const ChartHypothesis &hypo, int featureID, size_t order)
:m_lmRightContext(NULL)
,m_contextPrefix(order - 1)
,m_contextSuffix( order - 1)
,m_hypo(hypo)
{
:m_lmRightContext(NULL)
,m_contextPrefix(order - 1)
,m_contextSuffix( order - 1)
,m_hypo(hypo) {
m_numTargetTerminals = hypo.GetCurrTargetPhrase().GetNumTerminals();

for (std::vector<const ChartHypothesis*>::const_iterator i = hypo.GetPrevHypos().begin(); i != hypo.GetPrevHypos().end(); ++i) {
@ -334,8 +331,12 @@ public:
m_lmRightContext = rightState;
}

float GetPrefixScore() const { return m_prefixScore; }
FFState* GetRightContext() const { return m_lmRightContext; }
float GetPrefixScore() const {
return m_prefixScore;
}
FFState* GetRightContext() const {
return m_lmRightContext;
}

size_t GetNumTargetTerminals() const {
return m_numTargetTerminals;
@ -353,8 +354,7 @@ public:
dynamic_cast<const LanguageModelChartState &>( o );

// prefix
if (m_hypo.GetCurrSourceRange().GetStartPos() > 0) // not for "<s> ..."
{
if (m_hypo.GetCurrSourceRange().GetStartPos() > 0) { // not for "<s> ..."
int ret = GetPrefix().Compare(other.GetPrefix());
if (ret != 0)
return ret;
@ -362,8 +362,7 @@ public:

// suffix
size_t inputSize = m_hypo.GetManager().GetSource().GetSize();
if (m_hypo.GetCurrSourceRange().GetEndPos() < inputSize - 1)// not for "... </s>"
{
if (m_hypo.GetCurrSourceRange().GetEndPos() < inputSize - 1) { // not for "... </s>"
int ret = other.GetRightContext()->Compare(*m_lmRightContext);
if (ret != 0)
return ret;
@ -374,7 +373,8 @@ public:

} // namespace

FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection* out, const LanguageModel *scorer) const {
FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection* out, const LanguageModel *scorer) const
{
LanguageModelChartState *ret = new LanguageModelChartState(hypo, featureID, GetNGramOrder());
// data structure for factored context phrase (history and predicted word)
vector<const Word*> contextFactor;
@ -394,33 +394,28 @@ FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo,
// loop over rule
for (size_t phrasePos = 0, wordPos = 0;
phrasePos < hypo.GetCurrTargetPhrase().GetSize();
phrasePos++)
{
phrasePos++) {
// consult rule for either word or non-terminal
const Word &word = hypo.GetCurrTargetPhrase().GetWord(phrasePos);

// regular word
if (!word.IsNonTerminal())
{
if (!word.IsNonTerminal()) {
ShiftOrPush(contextFactor, word);

// beginning of sentence symbol <s>? -> just update state
if (word == GetSentenceStartArray())
{
if (word == GetSentenceStartArray()) {
CHECK(phrasePos == 0);
delete lmState;
lmState = NewState( GetBeginSentenceState() );
}
// score a regular word added by the rule
else
{
else {
updateChartScore( &prefixScore, &finalizedScore, UntransformLMScore(GetValueGivenState(contextFactor, *lmState).score), ++wordPos );
}
}

// non-terminal, add phrase from underlying hypothesis
else
{
else {
// look up underlying hypothesis
size_t nonTermIndex = nonTermIndexMap[phrasePos];
const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndex);
@ -444,8 +439,7 @@ FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo,
// push suffix
int suffixPos = prevState->GetSuffix().GetSize() - (GetNGramOrder()-1);
if (suffixPos < 0) suffixPos = 0; // push all words if less than order
for(;(size_t)suffixPos < prevState->GetSuffix().GetSize(); suffixPos++)
{
for(; (size_t)suffixPos < prevState->GetSuffix().GetSize(); suffixPos++) {
const Word &word = prevState->GetSuffix().GetWord(suffixPos);
ShiftOrPush(contextFactor, word);
wordPos++;
@ -453,22 +447,19 @@ FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo,
}

// internal non-terminal
else
{
else {
// score its prefix
for(size_t prefixPos = 0;
prefixPos < GetNGramOrder()-1 // up to LM order window
&& prefixPos < subPhraseLength; // up to length
prefixPos++)
{
&& prefixPos < subPhraseLength; // up to length
prefixPos++) {
const Word &word = prevState->GetPrefix().GetWord(prefixPos);
ShiftOrPush(contextFactor, word);
updateChartScore( &prefixScore, &finalizedScore, UntransformLMScore(GetValueGivenState(contextFactor, *lmState).score), ++wordPos );
}

// check if we are dealing with a large sub-phrase
if (subPhraseLength > GetNGramOrder() - 1)
{
if (subPhraseLength > GetNGramOrder() - 1) {
// add its finalized language model score
finalizedScore +=
prevHypo->GetScoreBreakdown().GetScoresForProducer(scorer)[0] // full score
@ -503,11 +494,11 @@ FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo,
return ret;
}

void LanguageModelImplementation::updateChartScore(float *prefixScore, float *finalizedScore, float score, size_t wordPos) const {
void LanguageModelImplementation::updateChartScore(float *prefixScore, float *finalizedScore, float score, size_t wordPos) const
{
if (wordPos < GetNGramOrder()) {
*prefixScore += score;
}
else {
} else {
*finalizedScore += score;
}
}

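updateChartScore above splits each word's LM contribution in two: while fewer than the LM order of words have been scored, the n-gram context can still be extended by material added higher in the chart, so the score stays provisional. A compact sketch of that split (UpdateChartScore and the explicit order parameter are illustrative, not the diff's exact signature):

#include <cstddef>

// Hedged sketch of the prefix/finalized split in updateChartScore:
// scores of the first (order-1) words may still change when more left
// context arrives higher in the chart, so they are kept separate.
void UpdateChartScore(float *prefixScore, float *finalizedScore,
                      float score, std::size_t wordPos, std::size_t order) {
  if (wordPos < order) {
    *prefixScore += score;     // context still incomplete; provisional
  } else {
    *finalizedScore += score;  // full n-gram context seen; final
  }
}
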
@ -45,7 +45,7 @@ class Phrase;
struct LMResult {
// log probability
float score;
// Is the word unknown?
// Is the word unknown?
bool unknown;
};

@ -126,54 +126,55 @@ public:
virtual void CleanUpAfterSentenceProcessing() {};
};

class LMRefCount : public LanguageModel {
public:
LMRefCount(ScoreIndexManager &scoreIndexManager, LanguageModelImplementation *impl) : m_impl(impl) {
Init(scoreIndexManager);
}
class LMRefCount : public LanguageModel
{
public:
LMRefCount(ScoreIndexManager &scoreIndexManager, LanguageModelImplementation *impl) : m_impl(impl) {
Init(scoreIndexManager);
}

LanguageModel *Duplicate(ScoreIndexManager &scoreIndexManager) const {
return new LMRefCount(scoreIndexManager, *this);
}
LanguageModel *Duplicate(ScoreIndexManager &scoreIndexManager) const {
return new LMRefCount(scoreIndexManager, *this);
}

void InitializeBeforeSentenceProcessing() {
m_impl->InitializeBeforeSentenceProcessing();
}
void InitializeBeforeSentenceProcessing() {
m_impl->InitializeBeforeSentenceProcessing();
}

void CleanUpAfterSentenceProcessing() {
m_impl->CleanUpAfterSentenceProcessing();
}
void CleanUpAfterSentenceProcessing() {
m_impl->CleanUpAfterSentenceProcessing();
}

const FFState* EmptyHypothesisState(const InputType &/*input*/) const {
return m_impl->NewState(m_impl->GetBeginSentenceState());
}
const FFState* EmptyHypothesisState(const InputType &/*input*/) const {
return m_impl->NewState(m_impl->GetBeginSentenceState());
}

bool Useable(const Phrase &phrase) const {
return m_impl->Useable(phrase);
}
bool Useable(const Phrase &phrase) const {
return m_impl->Useable(phrase);
}

void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const {
return m_impl->CalcScore(phrase, fullScore, ngramScore, oovCount);
}
void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const {
return m_impl->CalcScore(phrase, fullScore, ngramScore, oovCount);
}

FFState* Evaluate(const Hypothesis& cur_hypo, const FFState* prev_state, ScoreComponentCollection* accumulator) const {
return m_impl->Evaluate(cur_hypo, prev_state, accumulator, this);
}
FFState* Evaluate(const Hypothesis& cur_hypo, const FFState* prev_state, ScoreComponentCollection* accumulator) const {
return m_impl->Evaluate(cur_hypo, prev_state, accumulator, this);
}

FFState* EvaluateChart(const ChartHypothesis& cur_hypo, int featureID, ScoreComponentCollection* accumulator) const {
return m_impl->EvaluateChart(cur_hypo, featureID, accumulator, this);
}
FFState* EvaluateChart(const ChartHypothesis& cur_hypo, int featureID, ScoreComponentCollection* accumulator) const {
return m_impl->EvaluateChart(cur_hypo, featureID, accumulator, this);
}

std::string GetScoreProducerDescription(unsigned int param) const {
return m_impl->GetScoreProducerDescription(param);
}
std::string GetScoreProducerDescription(unsigned int param) const {
return m_impl->GetScoreProducerDescription(param);
}

private:
LMRefCount(ScoreIndexManager &scoreIndexManager, const LMRefCount &copy_from) : m_impl(copy_from.m_impl) {
Init(scoreIndexManager);
}
private:
LMRefCount(ScoreIndexManager &scoreIndexManager, const LMRefCount &copy_from) : m_impl(copy_from.m_impl) {
Init(scoreIndexManager);
}

boost::shared_ptr<LanguageModelImplementation> m_impl;
boost::shared_ptr<LanguageModelImplementation> m_impl;
};

}

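LMRefCount above is a thin handle: Duplicate() copies the wrapper while both copies share one LanguageModelImplementation through boost::shared_ptr, so the expensive model data is loaded only once. A minimal sketch of the idea, with Impl and Wrapper as hypothetical names:

#include <boost/shared_ptr.hpp>

// Hedged sketch of the ref-counted wrapper pattern above: Duplicate()
// copies the cheap wrapper while both copies share one expensive
// implementation object through the shared_ptr's reference count.
struct Impl { /* large model data lives here */ };

class Wrapper {
public:
  explicit Wrapper(Impl *impl) : m_impl(impl) {}
  Wrapper *Duplicate() const { return new Wrapper(*this); }  // shares m_impl
private:
  boost::shared_ptr<Impl> m_impl;
};

The design lets each translation system register its own feature (with its own score indices) without duplicating gigabytes of language-model memory.
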
@ -43,8 +43,10 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

using namespace std;

namespace Moses {
namespace {
namespace Moses
{
namespace
{

struct KenLMState : public FFState {
lm::ngram::State state;
@ -59,67 +61,69 @@ struct KenLMState : public FFState {
/*
* An implementation of single factor LM using Ken's code.
*/
template <class Model> class LanguageModelKen : public LanguageModel {
public:
LanguageModelKen(const std::string &file, ScoreIndexManager &manager, FactorType factorType, bool lazy);
template <class Model> class LanguageModelKen : public LanguageModel
{
public:
LanguageModelKen(const std::string &file, ScoreIndexManager &manager, FactorType factorType, bool lazy);

LanguageModel *Duplicate(ScoreIndexManager &scoreIndexManager) const;
LanguageModel *Duplicate(ScoreIndexManager &scoreIndexManager) const;

bool Useable(const Phrase &phrase) const {
return (phrase.GetSize()>0 && phrase.GetFactor(0, m_factorType) != NULL);
}
bool Useable(const Phrase &phrase) const {
return (phrase.GetSize()>0 && phrase.GetFactor(0, m_factorType) != NULL);
}

std::string GetScoreProducerDescription(unsigned) const {
std::ostringstream oss;
oss << "LM_" << m_ngram->Order() << "gram";
return oss.str();
}
std::string GetScoreProducerDescription(unsigned) const {
std::ostringstream oss;
oss << "LM_" << m_ngram->Order() << "gram";
return oss.str();
}

const FFState *EmptyHypothesisState(const InputType &/*input*/) const {
KenLMState *ret = new KenLMState();
ret->state = m_ngram->BeginSentenceState();
return ret;
}
const FFState *EmptyHypothesisState(const InputType &/*input*/) const {
KenLMState *ret = new KenLMState();
ret->state = m_ngram->BeginSentenceState();
return ret;
}

void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const;
void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const;

FFState *Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const;
FFState *Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const;

FFState *EvaluateChart(const ChartHypothesis& cur_hypo, int featureID, ScoreComponentCollection *accumulator) const;
FFState *EvaluateChart(const ChartHypothesis& cur_hypo, int featureID, ScoreComponentCollection *accumulator) const;

private:
LanguageModelKen(ScoreIndexManager &manager, const LanguageModelKen<Model> &copy_from);
private:
LanguageModelKen(ScoreIndexManager &manager, const LanguageModelKen<Model> &copy_from);

lm::WordIndex TranslateID(const Word &word) const {
std::size_t factor = word.GetFactor(m_factorType)->GetId();
return (factor >= m_lmIdLookup.size() ? 0 : m_lmIdLookup[factor]);
}
lm::WordIndex TranslateID(const Word &word) const {
std::size_t factor = word.GetFactor(m_factorType)->GetId();
return (factor >= m_lmIdLookup.size() ? 0 : m_lmIdLookup[factor]);
}

// Convert last words of hypothesis into vocab ids, returning an end pointer.
lm::WordIndex *LastIDs(const Hypothesis &hypo, lm::WordIndex *indices) const {
lm::WordIndex *index = indices;
lm::WordIndex *end = indices + m_ngram->Order() - 1;
int position = hypo.GetCurrTargetWordsRange().GetEndPos();
for (; ; ++index, --position) {
if (position == -1) {
*index = m_ngram->GetVocabulary().BeginSentence();
return index + 1;
}
if (index == end) return index;
*index = TranslateID(hypo.GetWord(position));
// Convert last words of hypothesis into vocab ids, returning an end pointer.
lm::WordIndex *LastIDs(const Hypothesis &hypo, lm::WordIndex *indices) const {
lm::WordIndex *index = indices;
lm::WordIndex *end = indices + m_ngram->Order() - 1;
int position = hypo.GetCurrTargetWordsRange().GetEndPos();
for (; ; ++index, --position) {
if (position == -1) {
*index = m_ngram->GetVocabulary().BeginSentence();
return index + 1;
}
if (index == end) return index;
*index = TranslateID(hypo.GetWord(position));
}
}

boost::shared_ptr<Model> m_ngram;

std::vector<lm::WordIndex> m_lmIdLookup;
boost::shared_ptr<Model> m_ngram;

FactorType m_factorType;
std::vector<lm::WordIndex> m_lmIdLookup;

const Factor *m_beginSentenceFactor;
FactorType m_factorType;

const Factor *m_beginSentenceFactor;
};

class MappingBuilder : public lm::EnumerateVocab {
class MappingBuilder : public lm::EnumerateVocab
{
public:
MappingBuilder(FactorCollection &factorCollection, std::vector<lm::WordIndex> &mapping)
: m_factorCollection(factorCollection), m_mapping(mapping) {}
@ -138,11 +142,13 @@ private:
std::vector<lm::WordIndex> &m_mapping;
};

template <class Model> LanguageModelKen<Model>::LanguageModelKen(const std::string &file, ScoreIndexManager &manager, FactorType factorType, bool lazy) : m_factorType(factorType) {
template <class Model> LanguageModelKen<Model>::LanguageModelKen(const std::string &file, ScoreIndexManager &manager, FactorType factorType, bool lazy) : m_factorType(factorType)
{
lm::ngram::Config config;
IFVERBOSE(1) {
config.messages = &std::cerr;
} else {
}
else {
config.messages = NULL;
}
FactorCollection &collection = FactorCollection::Instance();
@ -156,20 +162,23 @@ template <class Model> LanguageModelKen<Model>::LanguageModelKen(const std::stri
Init(manager);
}

template <class Model> LanguageModel *LanguageModelKen<Model>::Duplicate(ScoreIndexManager &manager) const {
template <class Model> LanguageModel *LanguageModelKen<Model>::Duplicate(ScoreIndexManager &manager) const
{
return new LanguageModelKen<Model>(manager, *this);
}

template <class Model> LanguageModelKen<Model>::LanguageModelKen(ScoreIndexManager &manager, const LanguageModelKen<Model> &copy_from) :
m_ngram(copy_from.m_ngram),
// TODO: don't copy this.
m_lmIdLookup(copy_from.m_lmIdLookup),
m_factorType(copy_from.m_factorType),
m_beginSentenceFactor(copy_from.m_beginSentenceFactor) {
m_ngram(copy_from.m_ngram),
// TODO: don't copy this.
m_lmIdLookup(copy_from.m_lmIdLookup),
m_factorType(copy_from.m_factorType),
m_beginSentenceFactor(copy_from.m_beginSentenceFactor)
{
Init(manager);
}

template <class Model> void LanguageModelKen<Model>::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const {
template <class Model> void LanguageModelKen<Model>::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const
{
fullScore = 0;
ngramScore = 0;
oovCount = 0;
@ -186,13 +195,13 @@ template <class Model> void LanguageModelKen<Model>::CalcScore(const Phrase &phr
*state0 = m_ngram->NullContextState();
position = 0;
}

size_t ngramBoundary = m_ngram->Order() - 1;

for (; position < phrase.GetSize(); ++position) {
const Word &word = phrase.GetWord(position);
if (word.IsNonTerminal()) {
// If there's a non-terminal at 1 and we have a 5-gram LM, then positions 2 3 4 and 5 will be incomplete while position 6 is complete.
// If there's a non-terminal at 1 and we have a 5-gram LM, then positions 2 3 4 and 5 will be incomplete while position 6 is complete.
ngramBoundary = m_ngram->Order() + position;
*state0 = m_ngram->NullContextState();
} else {
@ -210,11 +219,12 @@ template <class Model> void LanguageModelKen<Model>::CalcScore(const Phrase &phr
}
}

template <class Model> FFState *LanguageModelKen<Model>::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const {
template <class Model> FFState *LanguageModelKen<Model>::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const
{
const lm::ngram::State &in_state = static_cast<const KenLMState&>(*ps).state;

std::auto_ptr<KenLMState> ret(new KenLMState());


if (!hypo.GetCurrTargetLength()) {
ret->state = in_state;
return ret.release();
@ -237,17 +247,17 @@ template <class Model> FFState *LanguageModelKen<Model>::Evaluate(const Hypothes
}

if (hypo.IsSourceCompleted()) {
// Score end of sentence.
// Score end of sentence.
std::vector<lm::WordIndex> indices(m_ngram->Order() - 1);
const lm::WordIndex *last = LastIDs(hypo, &indices.front());
score += m_ngram->FullScoreForgotState(&indices.front(), last, m_ngram->GetVocabulary().EndSentence(), ret->state).prob;
} else if (adjust_end < end) {
// Get state after adding a long phrase.
// Get state after adding a long phrase.
std::vector<lm::WordIndex> indices(m_ngram->Order() - 1);
const lm::WordIndex *last = LastIDs(hypo, &indices.front());
m_ngram->GetState(&indices.front(), last, ret->state);
} else if (state0 != &ret->state) {
// Short enough phrase that we can just reuse the state.
// Short enough phrase that we can just reuse the state.
ret->state = *state0;
}

@ -265,32 +275,37 @@ template <class Model> FFState *LanguageModelKen<Model>::Evaluate(const Hypothes
return ret.release();
}

class LanguageModelChartStateKenLM : public FFState {
public:
LanguageModelChartStateKenLM() {}
class LanguageModelChartStateKenLM : public FFState
{
public:
LanguageModelChartStateKenLM() {}

const lm::ngram::ChartState &GetChartState() const { return m_state; }
lm::ngram::ChartState &GetChartState() { return m_state; }
const lm::ngram::ChartState &GetChartState() const {
return m_state;
}
lm::ngram::ChartState &GetChartState() {
return m_state;
}

int Compare(const FFState& o) const
{
const LanguageModelChartStateKenLM &other = static_cast<const LanguageModelChartStateKenLM&>(o);
int ret = m_state.Compare(other.m_state);
return ret;
}
int Compare(const FFState& o) const {
const LanguageModelChartStateKenLM &other = static_cast<const LanguageModelChartStateKenLM&>(o);
int ret = m_state.Compare(other.m_state);
return ret;
}

private:
lm::ngram::ChartState m_state;
private:
lm::ngram::ChartState m_state;
};

template <class Model> FFState *LanguageModelKen<Model>::EvaluateChart(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection *accumulator) const {
template <class Model> FFState *LanguageModelKen<Model>::EvaluateChart(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection *accumulator) const
{
LanguageModelChartStateKenLM *newState = new LanguageModelChartStateKenLM();
lm::ngram::RuleScore<Model> ruleScore(*m_ngram, newState->GetChartState());
const AlignmentInfo::NonTermIndexMap &nonTermIndexMap = hypo.GetCurrTargetPhrase().GetAlignmentInfo().GetNonTermIndexMap();

const size_t size = hypo.GetCurrTargetPhrase().GetSize();
size_t phrasePos = 0;
// Special cases for first word.
// Special cases for first word.
if (size) {
const Word &word = hypo.GetCurrTargetPhrase().GetWord(0);
if (word.GetFactor(m_factorType) == m_beginSentenceFactor) {
@ -298,7 +313,7 @@ template <class Model> FFState *LanguageModelKen<Model>::EvaluateChart(const Cha
ruleScore.BeginSentence();
phrasePos++;
} else if (word.IsNonTerminal()) {
// Non-terminal is first so we can copy instead of rescoring.
// Non-terminal is first so we can copy instead of rescoring.
const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndexMap[phrasePos]);
const lm::ngram::ChartState &prevState = static_cast<const LanguageModelChartStateKenLM*>(prevHypo->GetFFState(featureID))->GetChartState();
ruleScore.BeginNonTerminal(prevState, prevHypo->GetScoreBreakdown().GetScoresForProducer(this)[0]);
@ -323,24 +338,25 @@ template <class Model> FFState *LanguageModelKen<Model>::EvaluateChart(const Cha

} // namespace

LanguageModel *ConstructKenLM(const std::string &file, ScoreIndexManager &manager, FactorType factorType, bool lazy) {
LanguageModel *ConstructKenLM(const std::string &file, ScoreIndexManager &manager, FactorType factorType, bool lazy)
{
try {
lm::ngram::ModelType model_type;
if (lm::ngram::RecognizeBinary(file.c_str(), model_type)) {
switch(model_type) {
case lm::ngram::HASH_PROBING:
return new LanguageModelKen<lm::ngram::ProbingModel>(file, manager, factorType, lazy);
case lm::ngram::TRIE_SORTED:
return new LanguageModelKen<lm::ngram::TrieModel>(file, manager, factorType, lazy);
case lm::ngram::QUANT_TRIE_SORTED:
return new LanguageModelKen<lm::ngram::QuantTrieModel>(file, manager, factorType, lazy);
case lm::ngram::ARRAY_TRIE_SORTED:
return new LanguageModelKen<lm::ngram::ArrayTrieModel>(file, manager, factorType, lazy);
case lm::ngram::QUANT_ARRAY_TRIE_SORTED:
return new LanguageModelKen<lm::ngram::QuantArrayTrieModel>(file, manager, factorType, lazy);
default:
std::cerr << "Unrecognized kenlm model type " << model_type << std::endl;
abort();
case lm::ngram::HASH_PROBING:
return new LanguageModelKen<lm::ngram::ProbingModel>(file, manager, factorType, lazy);
case lm::ngram::TRIE_SORTED:
return new LanguageModelKen<lm::ngram::TrieModel>(file, manager, factorType, lazy);
case lm::ngram::QUANT_TRIE_SORTED:
return new LanguageModelKen<lm::ngram::QuantTrieModel>(file, manager, factorType, lazy);
case lm::ngram::ARRAY_TRIE_SORTED:
return new LanguageModelKen<lm::ngram::ArrayTrieModel>(file, manager, factorType, lazy);
case lm::ngram::QUANT_ARRAY_TRIE_SORTED:
return new LanguageModelKen<lm::ngram::QuantArrayTrieModel>(file, manager, factorType, lazy);
default:
std::cerr << "Unrecognized kenlm model type " << model_type << std::endl;
abort();
}
} else {
return new LanguageModelKen<lm::ngram::ProbingModel>(file, manager, factorType, lazy);

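ConstructKenLM above reads the model type from the binary file header and picks the matching template instantiation, falling back to the probing model when the file is not a recognized binary. A stripped-down sketch of this dispatch-on-enum factory (every type name here is a stand-in, not the real KenLM classes):

#include <cstdlib>
#include <iostream>

// Hedged sketch of the factory dispatch in ConstructKenLM: one enum
// read from the file header selects the template instantiation.
struct Base { virtual ~Base() {} };
template <class M> struct LM : Base {};
struct Probing {};
struct Trie {};
enum ModelType { HASH_PROBING, TRIE_SORTED };

Base *Construct(ModelType t) {
  switch (t) {
  case HASH_PROBING: return new LM<Probing>();
  case TRIE_SORTED:  return new LM<Trie>();
  default:
    std::cerr << "Unrecognized model type " << t << std::endl;
    std::abort();
  }
}

The template keeps the hot scoring path free of virtual dispatch per data structure, while callers still receive a single polymorphic base pointer.
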
@ -26,12 +26,13 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

#include "TypeDef.h"

namespace Moses {
namespace Moses
{

class ScoreIndexManager;
class LanguageModel;

// This will also load.
// This will also load.
LanguageModel *ConstructKenLM(const std::string &file, ScoreIndexManager &manager, FactorType factorType, bool lazy);

} // namespace Moses

@ -9,10 +9,11 @@
#include "LM/ORLM.h"

using std::map;
namespace Moses
namespace Moses
{
bool LanguageModelORLM::Load(const std::string &filePath, FactorType factorType,
size_t nGramOrder)
{
bool LanguageModelORLM::Load(const std::string &filePath, FactorType factorType,
size_t nGramOrder) {
cerr << "Loading LanguageModelORLM..." << endl;
m_filePath = filePath;
m_factorType = factorType;
@ -26,13 +27,14 @@ bool LanguageModelORLM::Load(const std::string &filePath, FactorType factorType,
CreateFactors();
return true;
}
void LanguageModelORLM::CreateFactors() {
void LanguageModelORLM::CreateFactors()
{
FactorCollection &factorCollection = FactorCollection::Instance();
size_t maxFactorId = 0; // to create lookup vector later on
std::map<size_t, wordID_t> m_lmids_map; // map from factor id -> word id

for(std::map<Word, wordID_t>::const_iterator vIter = m_lm->vocab_->VocabStart();
vIter != m_lm->vocab_->VocabEnd(); vIter++){
vIter != m_lm->vocab_->VocabEnd(); vIter++) {
// get word from ORLM vocab and associate with (new) factor id
size_t factorId = factorCollection.AddFactor(Output,m_factorType,vIter->first.ToString())->GetId();
m_lmids_map[factorId] = vIter->second;
@ -50,7 +52,7 @@ void LanguageModelORLM::CreateFactors() {
maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
m_sentenceEndArray[m_factorType] = m_sentenceEnd;
// add to lookup vector in object
lm_ids_vec_.resize(maxFactorId+1);
lm_ids_vec_.resize(maxFactorId+1);
// fill with OOV code
fill(lm_ids_vec_.begin(), lm_ids_vec_.end(), m_oov_id);

@ -58,15 +60,18 @@
iter != m_lmids_map.end() ; ++iter)
lm_ids_vec_[iter->first] = iter->second;
}
wordID_t LanguageModelORLM::GetLmID(const std::string& str) const {
wordID_t LanguageModelORLM::GetLmID(const std::string& str) const
{
return m_lm->vocab_->GetWordID(str);
}
wordID_t LanguageModelORLM::GetLmID(const Factor* factor) const {
wordID_t LanguageModelORLM::GetLmID(const Factor* factor) const
{
size_t factorId = factor->GetId();
return (factorId >= lm_ids_vec_.size()) ? m_oov_id : lm_ids_vec_[factorId];
}
LMResult LanguageModelORLM::GetValue(const std::vector<const Word*> &contextFactor,
State* finalState) const {
LMResult LanguageModelORLM::GetValue(const std::vector<const Word*> &contextFactor,
State* finalState) const
{
FactorType factorType = GetFactorType();
// set up context
//std::vector<long unsigned int> factor(1,0);
@ -88,13 +93,14 @@ LMResult LanguageModelORLM::GetValue(const std::vector<const Word*> &contextFact
*/
return ret;
}
bool LanguageModelORLM::UpdateORLM(const std::vector<string>& ngram, const int value) {
bool LanguageModelORLM::UpdateORLM(const std::vector<string>& ngram, const int value)
{
/*cerr << "Inserting into ORLM: \"";
iterate(ngram, nit)
cerr << *nit << " ";
cerr << "\"\t" << value << endl; */
m_lm->vocab_->MakeOpen();
bool res = m_lm->update(ngram, value);
bool res = m_lm->update(ngram, value);
m_lm->vocab_->MakeClosed();
return res;
}

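UpdateORLM above briefly reopens the vocabulary, applies the n-gram count update, and closes it again. One possible hardening of that open/update/close sequence, sketched with hypothetical types rather than the real ORLM API, is an RAII guard so the vocabulary is re-closed on every exit path:

// Hedged sketch: an RAII guard around the MakeOpen()/MakeClosed()
// pair seen in UpdateORLM, so the vocabulary is re-closed even if
// the update in between throws. Vocab is an illustrative stand-in.
struct Vocab {
  void MakeOpen() {}
  void MakeClosed() {}
};

struct OpenVocabGuard {
  explicit OpenVocabGuard(Vocab &v) : m_v(v) { m_v.MakeOpen(); }
  ~OpenVocabGuard() { m_v.MakeClosed(); }
  Vocab &m_v;
};

With such a guard in place, the body of UpdateORLM would reduce to constructing the guard and calling update().
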
@ -15,7 +15,8 @@ namespace Moses
class Factor;
class Phrase;

class LanguageModelORLM : public LanguageModelPointerState {
class LanguageModelORLM : public LanguageModelPointerState
{
public:
typedef count_t T; // type for ORLM filter
LanguageModelORLM()
@ -30,13 +31,15 @@ public:
fout.close();
delete m_lm;
}
void CleanUpAfterSentenceProcessing() {m_lm->clearCache();} // clear caches
void CleanUpAfterSentenceProcessing() {
m_lm->clearCache(); // clear caches
}
void InitializeBeforeSentenceProcessing() { // nothing to do
//m_lm->initThreadSpecificData(); // Creates thread specific data iff
// compiled with multithreading.
// compiled with multithreading.
}
bool UpdateORLM(const std::vector<string>& ngram, const int value);
protected:
protected:
OnlineRLM<T>* m_lm;
//MultiOnlineRLM<T>* m_lm;
wordID_t m_oov_id;

@ -347,7 +347,8 @@ const FFState *LanguageModelParallelBackoff::GetBeginSentenceState() const

}

LanguageModelMultiFactor *NewParallelBackoff() {
LanguageModelMultiFactor *NewParallelBackoff()
{
return new LanguageModelParallelBackoff();
}

@ -38,7 +38,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

namespace Moses
{
namespace
namespace
{
using namespace std;

@ -57,7 +57,7 @@ public:
}
void InitializeBeforeSentenceProcessing() {
m_lm->initThreadSpecificData(); // Creates thread specific data iff
// compiled with multithreading.
// compiled with multithreading.
}
protected:
std::vector<randlm::WordID> m_randlm_ids_vec;
@ -133,7 +133,7 @@ randlm::WordID LanguageModelRandLM::GetLmID( const std::string &str ) const
}

LMResult LanguageModelRandLM::GetValue(const vector<const Word*> &contextFactor,
State* finalState) const
State* finalState) const
{
FactorType factorType = GetFactorType();
// set up context
@ -156,7 +156,8 @@ LMResult LanguageModelRandLM::GetValue(const vector<const Word*> &contextFactor,

}

LanguageModelPointerState *NewRandLM() {
LanguageModelPointerState *NewRandLM()
{
return new LanguageModelRandLM();
}

@ -46,7 +46,7 @@ void LMList::CalcScore(const Phrase &phrase, float &retFullScore, float &retNGra
const float weightLM = lm.GetWeight();
const float oovWeightLM = lm.GetOOVWeight();

float fullScore, nGramScore;
float fullScore, nGramScore;
size_t oovCount;

// do not process, if factors not defined yet (happens in partial translation options)
@ -64,7 +64,7 @@ void LMList::CalcScore(const Phrase &phrase, float &retFullScore, float &retNGra
} else {
breakdown->Assign(&lm, nGramScore); // I'm not sure why += doesn't work here- it should be 0.0 right?
}


retFullScore += fullScore * weightLM;
retNGramScore += nGramScore * weightLM;

@ -39,13 +39,13 @@ public:
virtual FFState* Evaluate(const Hypothesis& cur_hypo,
const FFState* prev_state,
ScoreComponentCollection* accumulator) const;


virtual FFState* EvaluateChart(const ChartHypothesis&,
int /* featureID */,
ScoreComponentCollection*) const {
CHECK(0); // not valid for chart decoder
return NULL;
}
ScoreComponentCollection*) const {
CHECK(0); // not valid for chart decoder
return NULL;
}

virtual const FFState* EmptyHypothesisState(const InputType &input) const;

@ -267,8 +267,9 @@ struct SGNReverseCompare {
/**
* Implements lattice sampling, as in Chatterjee & Cancedda, emnlp 2010
**/
void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {

void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const
{

vector<SearchGraphNode> searchGraph;
GetSearchGraph(searchGraph);

@ -282,15 +283,15 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
map<int,const Hypothesis*> idToHyp;
map<int,float> fscores;

//Iterating through the hypos in reverse order of id gives a reverse
//topological order. We rely on the fact that hypo ids are given out
//Iterating through the hypos in reverse order of id gives a reverse
//topological order. We rely on the fact that hypo ids are given out
//sequentially, as the search proceeds.
//NB: Could just sort by stack.
//NB: Could just sort by stack.
sort(searchGraph.begin(), searchGraph.end(), SGNReverseCompare());

//first task is to fill in the outgoing hypos and edge scores.
for (vector<SearchGraphNode>::const_iterator i = searchGraph.begin();
i != searchGraph.end(); ++i) {
i != searchGraph.end(); ++i) {
const Hypothesis* hypo = i->hypo;
idToHyp[hypo->GetId()] = hypo;
fscores[hypo->GetId()] = i->fscore;
@ -298,7 +299,7 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
//back to current
const Hypothesis* prevHypo = i->hypo->GetPrevHypo();
outgoingHyps[prevHypo].insert(hypo);
edgeScores[Edge(prevHypo->GetId(),hypo->GetId())] =
edgeScores[Edge(prevHypo->GetId(),hypo->GetId())] =
hypo->GetScore() - prevHypo->GetScore();
}
//forward from current
@ -309,7 +310,7 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
outgoingHyps[hypo].insert(nextHypo);
map<int,float>::const_iterator fscoreIter = fscores.find(nextHypo->GetId());
CHECK(fscoreIter != fscores.end());
edgeScores[Edge(hypo->GetId(),nextHypo->GetId())] =
edgeScores[Edge(hypo->GetId(),nextHypo->GetId())] =
i->fscore - fscoreIter->second;
}
}
@ -317,26 +318,26 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {

//then run through again to calculate sigmas
for (vector<SearchGraphNode>::const_iterator i = searchGraph.begin();
i != searchGraph.end(); ++i) {
i != searchGraph.end(); ++i) {

if (i->forward == -1) {
sigmas[i->hypo] = 0;
} else {
map<const Hypothesis*, set<const Hypothesis*> >::const_iterator outIter =
map<const Hypothesis*, set<const Hypothesis*> >::const_iterator outIter =
outgoingHyps.find(i->hypo);

CHECK(outIter != outgoingHyps.end());
float sigma = 0;
for (set<const Hypothesis*>::const_iterator j = outIter->second.begin();
j != outIter->second.end(); ++j) {
j != outIter->second.end(); ++j) {
map<const Hypothesis*, float>::const_iterator succIter = sigmas.find(*j);
CHECK(succIter != sigmas.end());
map<Edge,float>::const_iterator edgeScoreIter =
map<Edge,float>::const_iterator edgeScoreIter =
edgeScores.find(Edge(i->hypo->GetId(),(*j)->GetId()));
CHECK(edgeScoreIter != edgeScores.end());
float term = edgeScoreIter->second + succIter->second; // Add sigma(*j)
if (sigma == 0) {
sigma = term;
sigma = term;
} else {
sigma = log_sum(sigma,term);
}
@ -352,7 +353,7 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
vector<const Hypothesis*> path;
path.push_back(startHypo);
while(1) {
map<const Hypothesis*, set<const Hypothesis*> >::const_iterator outIter =
map<const Hypothesis*, set<const Hypothesis*> >::const_iterator outIter =
outgoingHyps.find(path.back());
if (outIter == outgoingHyps.end() || !outIter->second.size()) {
//end of the path
@ -363,7 +364,7 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
vector<float> candidateScores;
float scoreTotal = 0;
for (set<const Hypothesis*>::const_iterator j = outIter->second.begin();
j != outIter->second.end(); ++j) {
j != outIter->second.end(); ++j) {
candidates.push_back(*j);
CHECK(sigmas.find(*j) != sigmas.end());
Edge edge(path.back()->GetId(),(*j)->GetId());
@ -390,18 +391,18 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
}
//cerr << "Random: " << random << " Chose " << position-1 << endl;
const Hypothesis* chosen = candidates[position-1];
path.push_back(chosen);
path.push_back(chosen);
}
//cerr << "Path: " << endl;
//for (size_t j = 0; j < path.size(); ++j) {
// cerr << path[j]->GetId() << " " << path[j]->GetScoreBreakdown() << endl;
// cerr << path[j]->GetId() << " " << path[j]->GetScoreBreakdown() << endl;
//}
//cerr << endl;

//Convert the hypos to TrellisPath
ret.Add(new TrellisPath(path));
//cerr << ret.at(ret.GetSize()-1).GetScoreBreakdown() << endl;
}
}

}

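The sigma recursion in CalcLatticeSamples above accumulates each node's outgoing edge mass in log space via log_sum. A sketch of the usual numerically stable form of that operation (LogSum is illustrative; the decoder's own log_sum may differ in detail):

#include <algorithm>
#include <cmath>

// Hedged sketch of the log-space accumulation behind sigma:
// log(exp(a) + exp(b)) computed without overflowing either exp().
float LogSum(float a, float b) {
  const float hi = std::max(a, b);
  const float lo = std::min(a, b);
  return hi + std::log1p(std::exp(lo - hi));
}

Factoring out the larger argument keeps the exponent non-positive, which is what makes sampling proportional to path mass feasible on long lattices.
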
@ -676,17 +677,17 @@ void OutputSearchNode(long translationId, std::ostream &outputSearchGraphStream,
else
outputSearchGraphStream << " hyp=" << searchNode.hypo->GetId();

outputSearchGraphStream << " stack=" << searchNode.hypo->GetWordsBitmap().GetNumWordsCovered()
<< " back=" << prevHypo->GetId()
<< " score=" << searchNode.hypo->GetScore()
<< " transition=" << (searchNode.hypo->GetScore() - prevHypo->GetScore());
outputSearchGraphStream << " stack=" << searchNode.hypo->GetWordsBitmap().GetNumWordsCovered()
<< " back=" << prevHypo->GetId()
<< " score=" << searchNode.hypo->GetScore()
<< " transition=" << (searchNode.hypo->GetScore() - prevHypo->GetScore());

if (searchNode.recombinationHypo != NULL)
outputSearchGraphStream << " recombined=" << searchNode.recombinationHypo->GetId();
if (searchNode.recombinationHypo != NULL)
outputSearchGraphStream << " recombined=" << searchNode.recombinationHypo->GetId();

outputSearchGraphStream << " forward=" << searchNode.forward << " fscore=" << searchNode.fscore
<< " covered=" << searchNode.hypo->GetCurrSourceWordsRange().GetStartPos()
<< "-" << searchNode.hypo->GetCurrSourceWordsRange().GetEndPos();
outputSearchGraphStream << " forward=" << searchNode.forward << " fscore=" << searchNode.fscore
<< " covered=" << searchNode.hypo->GetCurrSourceWordsRange().GetStartPos()
<< "-" << searchNode.hypo->GetCurrSourceWordsRange().GetEndPos();

// Modified so that -osgx is a superset of -osg (GST Oct 2011)
ScoreComponentCollection scoreBreakdown = searchNode.hypo->GetScoreBreakdown();
@ -694,10 +695,10 @@ void OutputSearchNode(long translationId, std::ostream &outputSearchGraphStream,
outputSearchGraphStream << " scores=[ ";
StaticData::Instance().GetScoreIndexManager().PrintLabeledScores( outputSearchGraphStream, scoreBreakdown );
outputSearchGraphStream << " ]";

outputSearchGraphStream << " out=" << searchNode.hypo->GetSourcePhraseStringRep() << "|" <<
searchNode.hypo->GetCurrTargetPhrase().GetStringRep(outputFactorOrder) << endl;
searchNode.hypo->GetCurrTargetPhrase().GetStringRep(outputFactorOrder) << endl;
// outputSearchGraphStream << " out=" << searchNode.hypo->GetCurrTargetPhrase().GetStringRep(outputFactorOrder) << endl;
}

@ -36,7 +36,7 @@ namespace PCN
typedef std::pair<std::pair<std::string, std::vector<float> >, size_t> CNAlt;
typedef std::vector<CNAlt> CNCol;
typedef std::vector<CNCol> CN;


/** Given a string ((('foo',0.1,1),('bar',0.9,2)),...) representation of a
* word lattice in PCN format, return a CN object representing the lattice
*/

@ -71,10 +71,10 @@ Parameter::Parameter()
AddParam("report-all-factors-in-n-best", "Report all factors in n-best-lists. Default is false");
AddParam("report-segmentation", "t", "report phrase segmentation in the output");
#ifdef HAVE_SYNLM
AddParam("slmodel-file", "location of the syntactic language model file(s)");
AddParam("weight-slm", "slm", "weight(s) for syntactic language model");
AddParam("slmodel-factor", "factor to use with syntactic language model");
AddParam("slmodel-beam", "beam width to use with syntactic language model's parser");
AddParam("slmodel-file", "location of the syntactic language model file(s)");
AddParam("weight-slm", "slm", "weight(s) for syntactic language model");
AddParam("slmodel-factor", "factor to use with syntactic language model");
AddParam("slmodel-beam", "beam width to use with syntactic language model's parser");
#endif
AddParam("stack", "s", "maximum stack size for histogram pruning");
AddParam("stack-diversity", "sd", "minimum number of hypothesis of each coverage in stack (default 0)");
@ -277,14 +277,13 @@ bool Parameter::Validate()
PARAM_MAP::const_iterator iterParams;
for (iterParams = m_setting.begin(); iterParams != m_setting.end(); ++iterParams) {
const std::string &key = iterParams->first;

if (m_valid.find(key) == m_valid.end())
{

if (m_valid.find(key) == m_valid.end()) {
UserMessage::Add("Unknown parameter " + key);
noErrorFlag = false;
}
}


// required parameters
if (m_setting["ttable-file"].size() == 0) {
@ -307,7 +306,7 @@ bool Parameter::Validate()
}

if (m_setting["lmodel-file"].size() * (m_setting.find("lmodel-oov-feature") != m_setting.end() ? 2 : 1)
!= m_setting["weight-l"].size()) {
!= m_setting["weight-l"].size()) {
stringstream errorMsg("");
errorMsg << "Config and parameters specify "
<< static_cast<int>(m_setting["lmodel-file"].size())
@ -457,8 +456,7 @@ bool Parameter::ReadConfigFile(const string &filePath )

if (line.size() == 0) {
// blank line. do nothing.
}
else if (line[0]=='[') {
} else if (line[0]=='[') {
// new parameter
for (size_t currPos = 0 ; currPos < line.size() ; currPos++) {
if (line[currPos] == ']') {

@ -143,9 +143,9 @@ void Phrase::CreateFromString(const std::vector<FactorType> &factorOrder, const
for (util::TokenIter<util::AnyCharacter, true> word_it(phraseString, util::AnyCharacter(" \t")); word_it; ++word_it) {
Word &word = AddWord();
size_t index = 0;
for (util::TokenIter<util::MultiCharacter, false> factor_it(*word_it, util::MultiCharacter(factorDelimiter));
factor_it && (index < factorOrder.size());
++factor_it, ++index) {
for (util::TokenIter<util::MultiCharacter, false> factor_it(*word_it, util::MultiCharacter(factorDelimiter));
factor_it && (index < factorOrder.size());
++factor_it, ++index) {
word[factorOrder[index]] = factorCollection.AddFactor(*factor_it);
}
if (index != factorOrder.size()) {

@ -61,7 +61,7 @@ public:
/** Fills phrase with words from format string, typically from phrase table or sentence input
* \param factorOrder factor types of each element in 2D string vector
* \param phraseString formatted input string to parse
* \param factorDelimiter delimiter between factors.
* \param factorDelimiter delimiter between factors.
*/
void CreateFromString(const std::vector<FactorType> &factorOrder, const StringPiece &phraseString, const StringPiece &factorDelimiter);

@@ -136,7 +136,7 @@ PhraseDictionary* PhraseDictionaryFeature::LoadPhraseTable(const TranslationSyst
      m_filePath += ".gz";
      VERBOSE(2,"Using gzipped file" << std::endl);
    }

    PhraseDictionaryHiero* pdm = new PhraseDictionaryHiero(m_numScoreComponent,this);
    bool ret = pdm->Load(GetInput()
                         , GetOutput()
@@ -154,7 +154,7 @@ PhraseDictionary* PhraseDictionaryFeature::LoadPhraseTable(const TranslationSyst
      m_filePath += ".gz";
      VERBOSE(2,"Using gzipped file" << std::endl);
    }

    PhraseDictionaryALSuffixArray* pdm = new PhraseDictionaryALSuffixArray(m_numScoreComponent,this);
    bool ret = pdm->Load(GetInput()
                         , GetOutput()
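Both hunks above share the same fallback: when the plain phrase-table file is not found, ".gz" is appended and the gzipped variant is used instead. A sketch of that resolution step, with the file-existence helper as an assumption (Moses has its own check):

    #include <fstream>
    #include <string>

    // Assumed helper, not the Moses implementation.
    static bool FileExistsSketch(const std::string &path) {
      std::ifstream f(path.c_str());
      return f.good();
    }

    // Prefer the plain file; fall back to the gzipped variant if present.
    std::string ResolveTablePath(std::string path) {
      if (!FileExistsSketch(path) && FileExistsSketch(path + ".gz"))
        path += ".gz";
      return path;
    }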
@@ -255,18 +255,18 @@ PhraseDictionaryFeature::~PhraseDictionaryFeature()

std::string PhraseDictionaryFeature::GetScoreProducerDescription(unsigned idx) const
{
  if (idx < GetNumInputScores()){
  if (idx < GetNumInputScores()) {
    return "InputScore";
  }else{
  } else {
    return "PhraseModel";
  }
}

std::string PhraseDictionaryFeature::GetScoreProducerWeightShortName(unsigned idx) const
{
  if (idx < GetNumInputScores()){
  if (idx < GetNumInputScores()) {
    return "I";
  }else{
  } else {
    return "tm";
  }
}
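Both accessors rely on one index convention: feature indices below GetNumInputScores() belong to the input scores, and everything at or above that boundary belongs to the translation model. Reduced to a sketch:

    #include <string>

    // Index convention from the two functions above: input scores come first.
    std::string WeightShortNameSketch(unsigned idx, unsigned numInputScores) {
      return idx < numInputScores ? std::string("I") : std::string("tm");
    }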
@@ -16,16 +16,16 @@

using namespace std;

namespace Moses
namespace Moses
{

bool PhraseDictionaryALSuffixArray::Load(const std::vector<FactorType> &input
                    , const std::vector<FactorType> &output
                    , const std::string &filePath
                    , const std::vector<float> &weight
                    , size_t tableLimit
                    , const LMList &languageModels
                    , const WordPenaltyProducer* wpProducer)
    , const std::vector<FactorType> &output
    , const std::string &filePath
    , const std::vector<float> &weight
    , size_t tableLimit
    , const LMList &languageModels
    , const WordPenaltyProducer* wpProducer)
{
  // file path is the directory of the rules for each sentence, NOT the file of all the rules
  m_filePath = filePath;
@@ -36,7 +36,7 @@ bool PhraseDictionaryALSuffixArray::Load(const std::vector<FactorType> &input
  m_languageModels = &languageModels;
  m_wpProducer = wpProducer;
  m_weight = &weight;

  return true;
}
@@ -44,20 +44,20 @@ void PhraseDictionaryALSuffixArray::InitializeForInput(InputType const& source)
{
  // clear out rules for previous sentence
  m_collection.Clear();

  // populate with rules for this sentence
  long translationId = source.GetTranslationId();

  string grammarFile = m_filePath + "/grammar.out." + SPrint(translationId);

  // data from file
  InputFileStream inFile(grammarFile);

  std::auto_ptr<RuleTableLoader> loader =
          RuleTableLoaderFactory::Create(grammarFile);
    RuleTableLoaderFactory::Create(grammarFile);
  bool ret = loader->Load(*m_input, *m_output, inFile, *m_weight, m_tableLimit,
                          *m_languageModels, m_wpProducer, *this);

  CHECK(ret);
}
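InitializeForInput rebuilds the rule collection per sentence: it clears the previous sentence's rules, then loads grammar.out.<translationId> from the configured directory. A sketch of just the path construction used above:

    #include <sstream>
    #include <string>

    // One grammar file per input sentence, keyed by its translation id,
    // mirroring m_filePath + "/grammar.out." + SPrint(translationId) above.
    std::string GrammarFileFor(const std::string &ruleDir, long translationId) {
      std::ostringstream path;
      path << ruleDir << "/grammar.out." << translationId;
      return path.str();
    }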
@@ -11,13 +11,14 @@

#include "PhraseDictionarySCFG.h"

namespace Moses {

namespace Moses
{

class PhraseDictionaryALSuffixArray : public PhraseDictionarySCFG
{
public:
  PhraseDictionaryALSuffixArray(size_t numScoreComponent, PhraseDictionaryFeature* feature)
          : PhraseDictionarySCFG(numScoreComponent,feature) {}
    : PhraseDictionarySCFG(numScoreComponent,feature) {}

  bool Load(const std::vector<FactorType> &input
            , const std::vector<FactorType> &output

@@ -34,9 +35,9 @@ protected:
  const LMList *m_languageModels;
  const WordPenaltyProducer *m_wpProducer;
  const std::vector<float> *m_weight;

};

}
|
@ -72,7 +72,7 @@ const TargetPhraseCollection *PhraseDictionaryDynSuffixArray::GetTargetPhraseCol
|
||||
void PhraseDictionaryDynSuffixArray::insertSnt(string& source, string& target, string& alignment)
|
||||
{
|
||||
m_biSA->addSntPair(source, target, alignment); // insert sentence pair into suffix arrays
|
||||
//StaticData::Instance().ClearTransOptionCache(); // clear translation option cache
|
||||
//StaticData::Instance().ClearTransOptionCache(); // clear translation option cache
|
||||
}
|
||||
void PhraseDictionaryDynSuffixArray::deleteSnt(unsigned /* idx */, unsigned /* num2Del */)
|
||||
{
|
||||
|
@@ -15,30 +15,31 @@

using namespace std;

namespace Moses {
namespace Moses
{

bool PhraseDictionaryHiero::Load(const std::vector<FactorType> &input
                    , const std::vector<FactorType> &output
                    , const std::string &filePath
                    , const std::vector<float> &weight
                    , size_t tableLimit
                    , const LMList &languageModels
                    , const WordPenaltyProducer* wpProducer)
    , const std::vector<FactorType> &output
    , const std::string &filePath
    , const std::vector<float> &weight
    , size_t tableLimit
    , const LMList &languageModels
    , const WordPenaltyProducer* wpProducer)
{
  m_filePath = filePath;
  m_tableLimit = tableLimit;

  // data from file
  InputFileStream inFile(filePath);

  std::auto_ptr<RuleTableLoader> loader =
          RuleTableLoaderFactory::Create(filePath);
    RuleTableLoaderFactory::Create(filePath);
  bool ret = loader->Load(input, output, inFile, weight, tableLimit,
                          languageModels, wpProducer, *this);
  return ret;
}

} // namespace
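Load defers the actual parsing to whatever loader RuleTableLoaderFactory::Create selects for the given file. The sketch below shows only the factory shape; the class names and the extension-based dispatch are invented for illustration, not the real Moses selection logic:

    #include <memory>
    #include <string>

    // Illustrative loader hierarchy; names and dispatch rule are assumptions.
    class RuleLoaderSketch {
    public:
      virtual ~RuleLoaderSketch() {}
    };
    class TextRuleLoaderSketch : public RuleLoaderSketch {};
    class BinaryRuleLoaderSketch : public RuleLoaderSketch {};

    // std::auto_ptr matches the era of the surrounding code.
    std::auto_ptr<RuleLoaderSketch> CreateLoaderSketch(const std::string &path) {
      const bool binary = path.size() >= 4 &&
                          path.compare(path.size() - 4, 4, ".bin") == 0;
      if (binary)
        return std::auto_ptr<RuleLoaderSketch>(new BinaryRuleLoaderSketch());
      return std::auto_ptr<RuleLoaderSketch>(new TextRuleLoaderSketch());
    }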
@@ -11,13 +11,14 @@

#include "PhraseDictionarySCFG.h"

namespace Moses {
namespace Moses
{

class PhraseDictionaryHiero : public PhraseDictionarySCFG
{
public:
  PhraseDictionaryHiero(size_t numScoreComponent, PhraseDictionaryFeature* feature)
          : PhraseDictionarySCFG(numScoreComponent,feature) {}
    : PhraseDictionarySCFG(numScoreComponent,feature) {}

  bool Load(const std::vector<FactorType> &input
            , const std::vector<FactorType> &output